From ff2b7dab80026a7d1b9834d175eb6a901a754ce2 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 27 May 2026 16:46:43 +0800 Subject: [PATCH 1/8] Implement manifests and files system tables --- .../core/catalog/file_system_catalog_test.cpp | 53 ++- .../table/system/in_memory_system_table.cpp | 3 + .../table/system/metadata_system_tables.cpp | 426 +++++++++++++++++- .../table/system/metadata_system_tables.h | 37 ++ src/paimon/core/table/system/system_table.cpp | 18 + test/inte/read_inte_test.cpp | 145 ++++++ 6 files changed, 677 insertions(+), 5 deletions(-) diff --git a/src/paimon/core/catalog/file_system_catalog_test.cpp b/src/paimon/core/catalog/file_system_catalog_test.cpp index fce73d531..da510b089 100644 --- a/src/paimon/core/catalog/file_system_catalog_test.cpp +++ b/src/paimon/core/catalog/file_system_catalog_test.cpp @@ -300,8 +300,8 @@ TEST(FileSystemCatalogTest, TestMetadataSystemTableCatalog) { /*ignore_if_exists=*/false)); ArrowSchemaRelease(&schema); - std::vector metadata_tables = {"snapshots", "schemas", "tags", "branches", - "consumers"}; + std::vector metadata_tables = {"snapshots", "schemas", "tags", "branches", + "consumers", "manifests", "files"}; for (const auto& table_name : metadata_tables) { Identifier system_identifier("db1", "tbl1$" + table_name); ASSERT_OK_AND_ASSIGN(bool exists, catalog.TableExists(system_identifier)); @@ -363,6 +363,55 @@ TEST(FileSystemCatalogTest, TestMetadataSystemTableCatalog) { (std::vector{"consumer_id", "next_snapshot_id"})); ASSERT_FALSE(consumers_arrow_schema->field(1)->nullable()); + ASSERT_OK_AND_ASSIGN(std::shared_ptr manifests_schema, + catalog.LoadTableSchema(Identifier("db1", "tbl1$manifests"))); + ASSERT_OK_AND_ASSIGN(auto manifests_c_schema, manifests_schema->GetArrowSchema()); + auto manifests_arrow_schema = arrow::ImportSchema(manifests_c_schema.get()).ValueUnsafe(); + ASSERT_EQ(manifests_arrow_schema->field_names(), + (std::vector{"file_name", "file_size", "num_added_files", + "num_deleted_files", "schema_id", "min_partition_stats", + "max_partition_stats", "min_row_id", "max_row_id"})); + ASSERT_FALSE(manifests_arrow_schema->field(0)->nullable()); + ASSERT_EQ(manifests_arrow_schema->field(1)->type()->id(), arrow::Type::INT64); + ASSERT_FALSE(manifests_arrow_schema->field(4)->nullable()); + ASSERT_TRUE(manifests_arrow_schema->field(5)->nullable()); + ASSERT_TRUE(manifests_arrow_schema->field(8)->nullable()); + + ASSERT_OK_AND_ASSIGN(std::shared_ptr files_schema, + catalog.LoadTableSchema(Identifier("db1", "tbl1$files"))); + ASSERT_OK_AND_ASSIGN(auto files_c_schema, files_schema->GetArrowSchema()); + auto files_arrow_schema = arrow::ImportSchema(files_c_schema.get()).ValueUnsafe(); + ASSERT_EQ(files_arrow_schema->field_names(), (std::vector{"partition", + "bucket", + "file_path", + "file_format", + "schema_id", + "level", + "record_count", + "file_size_in_bytes", + "min_key", + "max_key", + "null_value_counts", + "min_value_stats", + "max_value_stats", + "min_sequence_number", + "max_sequence_number", + "creation_time", + "deleteRowCount", + "file_source", + "first_row_id", + "write_cols"})); + ASSERT_TRUE(files_arrow_schema->field(0)->nullable()); + ASSERT_FALSE(files_arrow_schema->field(1)->nullable()); + ASSERT_FALSE(files_arrow_schema->field(2)->nullable()); + ASSERT_FALSE(files_arrow_schema->field(10)->nullable()); + ASSERT_EQ(files_arrow_schema->field(15)->type()->id(), arrow::Type::TIMESTAMP); + ASSERT_EQ(files_arrow_schema->field(19)->type()->id(), arrow::Type::LIST); + auto write_cols_type = + std::dynamic_pointer_cast(files_arrow_schema->field(19)->type()); + ASSERT_TRUE(write_cols_type); + ASSERT_EQ(write_cols_type->value_type()->id(), arrow::Type::STRING); + Identifier snapshots_identifier("db1", "tbl1$snapshots"); ::ArrowSchema system_create_schema; ASSERT_TRUE(arrow::ExportSchema(*typed_schema, &system_create_schema).ok()); diff --git a/src/paimon/core/table/system/in_memory_system_table.cpp b/src/paimon/core/table/system/in_memory_system_table.cpp index fff2b0dd0..7fbd5c884 100644 --- a/src/paimon/core/table/system/in_memory_system_table.cpp +++ b/src/paimon/core/table/system/in_memory_system_table.cpp @@ -46,6 +46,9 @@ class InMemorySystemTableBatchReader : public BatchReader { emitted_ = true; PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, table_->ArrowSchema()); PAIMON_ASSIGN_OR_RAISE(std::vector rows, table_->BuildRows()); + if (rows.empty()) { + return BatchReader::MakeEofBatch(); + } PAIMON_ASSIGN_OR_RAISE(std::unique_ptr converter, GenericRowToArrowArrayConverter::Create(schema, arrow_pool_.get())); return converter->NextBatch(rows); diff --git a/src/paimon/core/table/system/metadata_system_tables.cpp b/src/paimon/core/table/system/metadata_system_tables.cpp index 23107b8f2..3fdf687ee 100644 --- a/src/paimon/core/table/system/metadata_system_tables.cpp +++ b/src/paimon/core/table/system/metadata_system_tables.cpp @@ -24,22 +24,46 @@ #include #include #include +#include #include #include +#include "fmt/format.h" +#include "fmt/ranges.h" #include "paimon/common/data/binary_string.h" +#include "paimon/common/data/data_define.h" #include "paimon/common/data/generic_row.h" +#include "paimon/common/data/internal_array.h" +#include "paimon/common/data/internal_row.h" +#include "paimon/common/table/special_fields.h" +#include "paimon/common/types/data_field.h" +#include "paimon/common/utils/binary_row_partition_computer.h" #include "paimon/common/utils/date_time_utils.h" +#include "paimon/common/utils/internal_row_utils.h" +#include "paimon/common/utils/path_util.h" #include "paimon/common/utils/rapidjson_util.h" +#include "paimon/core/core_options.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/file_entry.h" +#include "paimon/core/manifest/file_kind.h" +#include "paimon/core/manifest/manifest_entry.h" +#include "paimon/core/manifest/manifest_file.h" +#include "paimon/core/manifest/manifest_file_meta.h" +#include "paimon/core/manifest/manifest_list.h" #include "paimon/core/schema/schema_manager.h" #include "paimon/core/schema/table_schema.h" #include "paimon/core/snapshot.h" +#include "paimon/core/stats/simple_stats_evolutions.h" #include "paimon/core/tag/tag.h" #include "paimon/core/utils/branch_manager.h" #include "paimon/core/utils/consumer_manager.h" +#include "paimon/core/utils/field_mapping.h" +#include "paimon/core/utils/file_store_path_factory.h" #include "paimon/core/utils/snapshot_manager.h" #include "paimon/core/utils/tag_manager.h" +#include "paimon/data/timestamp.h" #include "paimon/fs/file_system.h" +#include "paimon/memory/memory_pool.h" #include "paimon/status.h" #include "rapidjson/document.h" #include "rapidjson/stringbuffer.h" @@ -48,6 +72,8 @@ namespace paimon { namespace { +constexpr int32_t kMaxPartitionStatsLength = 255; + template Result JsonString(const T& value) { rapidjson::Document document; @@ -140,6 +166,12 @@ Result LocalTimestampMillisValue(int64_t epoch_millis) { return TimestampMillisValue(local_timestamp.GetMillisecond()); } +Result LocalTimestampMillisValue(const Timestamp& local_timestamp) { + PAIMON_ASSIGN_OR_RAISE(Timestamp utc_timestamp, DateTimeUtils::ToUTCTimestamp(local_timestamp)); + int64_t epoch_millis = utc_timestamp.GetMillisecond(); + return LocalTimestampMillisValue(epoch_millis); +} + VariantType OptionalTimestampMillisValue(const std::optional& value) { if (!value) { return NullType(); @@ -150,12 +182,215 @@ VariantType OptionalTimestampMillisValue(const std::optional& value) { MetadataSystemTableContext CreateMetadataContext(std::shared_ptr fs, std::string table_path, std::string branch) { return { - std::move(fs), - std::move(table_path), - BranchManager::NormalizeBranch(branch), + std::move(fs), std::move(table_path), BranchManager::NormalizeBranch(branch), nullptr, {}, + }; +} + +MetadataSystemTableContext CreateMetadataContext(std::shared_ptr fs, + std::string table_path, std::string branch, + std::shared_ptr table_schema, + std::map options) { + return { + std::move(fs), std::move(table_path), BranchManager::NormalizeBranch(branch), + std::move(table_schema), std::move(options), }; } +Result CreateCoreOptions(const MetadataSystemTableContext& context) { + return CoreOptions::FromMap(context.options, context.fs); +} + +Result> CreatePathFactory( + const MetadataSystemTableContext& context, const CoreOptions& core_options, + const std::shared_ptr& pool) { + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, + core_options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + core_options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr path_factory, + FileStorePathFactory::Create( + context.table_path, arrow_schema, context.table_schema->PartitionKeys(), + core_options.GetPartitionDefaultName(), core_options.GetFileFormat()->Identifier(), + core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), + external_paths, global_index_external_path, core_options.IndexFileInDataFileDir(), + pool)); + return std::shared_ptr(std::move(path_factory)); +} + +Result> LatestSnapshot(const MetadataSystemTableContext& context) { + SnapshotManager snapshot_manager(context.fs, context.table_path, context.branch); + return snapshot_manager.LatestSnapshot(); +} + +Result> ReadDataManifests( + const MetadataSystemTableContext& context, const Snapshot& snapshot, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr manifest_list, + ManifestList::Create(context.fs, core_options.GetManifestFormat(), + core_options.GetManifestCompression(), path_factory, pool)); + std::vector manifests; + PAIMON_RETURN_NOT_OK(manifest_list->ReadDataManifests(snapshot, &manifests)); + return manifests; +} + +Result> CreateManifestFile( + const MetadataSystemTableContext& context, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool) { + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr partition_schema, + FieldMapping::GetPartitionSchema(arrow_schema, context.table_schema->PartitionKeys())); + return ManifestFile::Create(context.fs, core_options.GetManifestFormat(), + core_options.GetManifestCompression(), path_factory, + core_options.GetManifestTargetFileSize(), pool, core_options, + partition_schema); +} + +Result> ReadLatestManifestEntries( + const MetadataSystemTableContext& context, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, LatestSnapshot(context)); + if (!snapshot) { + return std::vector(); + } + PAIMON_ASSIGN_OR_RAISE( + std::vector manifests, + ReadDataManifests(context, snapshot.value(), path_factory, core_options, pool)); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr manifest_file, + CreateManifestFile(context, path_factory, core_options, pool)); + std::vector entries; + for (const auto& manifest : manifests) { + PAIMON_RETURN_NOT_OK( + manifest_file->Read(manifest.FileName(), /*filter=*/nullptr, &entries)); + } + return entries; +} + +Result> ReadLatestDataFiles( + const MetadataSystemTableContext& context, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE(std::vector entries, + ReadLatestManifestEntries(context, path_factory, core_options, pool)); + std::vector merged_entries; + PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(entries, &merged_entries)); + return merged_entries; +} + +std::optional OptionalBinaryRowString(const BinaryRow& row) { + if (row.GetFieldCount() <= 0) { + return std::nullopt; + } + return row.ToString(); +} + +Result> OptionalPartitionString( + const BinaryRow& row, const std::shared_ptr& partition_schema) { + if (row.GetFieldCount() <= 0) { + return std::optional(); + } + PAIMON_ASSIGN_OR_RAISE(std::string value, + BinaryRowPartitionComputer::PartToSimpleString( + partition_schema, row, ",", kMaxPartitionStatsLength)); + return std::optional(value); +} + +Result OptionalPartitionStringValue( + const BinaryRow& row, const std::shared_ptr& partition_schema) { + PAIMON_ASSIGN_OR_RAISE(std::optional value, + OptionalPartitionString(row, partition_schema)); + return OptionalStringValue(value); +} + +Result PartitionString(const std::shared_ptr& path_factory, + const BinaryRow& partition) { + PAIMON_ASSIGN_OR_RAISE(std::string value, path_factory->GetPartitionString(partition)); + return value; +} + +Result FilePath(const std::shared_ptr& path_factory, + const ManifestEntry& entry, const DataFileMeta& file) { + if (file.external_path) { + return file.external_path.value(); + } + PAIMON_ASSIGN_OR_RAISE(std::string bucket_path, + path_factory->BucketPath(entry.Partition(), entry.Bucket())); + return PathUtil::JoinPath(bucket_path, file.file_name); +} + +Result FieldsValueMapString(const std::vector& fields, + const InternalRow& row) { + std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields); + PAIMON_ASSIGN_OR_RAISE(std::vector getters, + InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/false)); + std::vector values; + values.reserve(fields.size()); + for (size_t i = 0; i < fields.size(); ++i) { + std::string value = "null"; + if (!row.IsNullAt(i)) { + VariantType field_value = getters[i](row); + if (std::holds_alternative(field_value)) { + value = std::string(std::get(field_value)); + } else { + value = DataDefine::VariantValueToString(field_value); + } + } + values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + } + return fmt::format("{{{}}}", fmt::join(values, ", ")); +} + +Result NullValueCountsString(const std::vector& fields, + const InternalArray& null_counts) { + std::vector values; + values.reserve(fields.size()); + for (size_t i = 0; i < fields.size(); ++i) { + std::string value = + null_counts.IsNullAt(i) ? "null" : std::to_string(null_counts.GetLong(i)); + values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + } + return fmt::format("{{{}}}", fmt::join(values, ", ")); +} + +Result> StatsFields(const std::shared_ptr& schema) { + return schema->Fields(); +} + +Result> LoadDataSchema(const MetadataSystemTableContext& context, + int64_t schema_id) { + if (schema_id == context.table_schema->Id()) { + return context.table_schema; + } + SchemaManager schema_manager(context.fs, context.table_path, context.branch); + return schema_manager.ReadSchema(schema_id); +} + +Result> ValueStatsFields(const MetadataSystemTableContext& context, + int64_t schema_id) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, + LoadDataSchema(context, schema_id)); + PAIMON_ASSIGN_OR_RAISE(std::vector fields, StatsFields(data_schema)); + return fields; +} + +Result> WriteColsValue( + const std::optional>& write_cols, + const std::shared_ptr& pool) { + if (!write_cols) { + return std::shared_ptr(); + } + return std::make_shared( + InternalRowUtils::ToNotNullStringArrayData(write_cols.value(), pool)); +} + } // namespace OptionsSystemTable::OptionsSystemTable(std::string table_path, @@ -424,4 +659,189 @@ Result> ConsumersSystemTable::BuildRows() const { return rows; } +ManifestsSystemTable::ManifestsSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch, + std::shared_ptr table_schema, + std::map options) + : InMemorySystemTable(table_path), + context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch), + std::move(table_schema), std::move(options))) {} + +std::string ManifestsSystemTable::Name() const { + return kName; +} + +Result> ManifestsSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("file_name", arrow::utf8(), /*nullable=*/false), + arrow::field("file_size", arrow::int64(), /*nullable=*/false), + arrow::field("num_added_files", arrow::int64(), /*nullable=*/false), + arrow::field("num_deleted_files", arrow::int64(), /*nullable=*/false), + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("min_partition_stats", arrow::utf8(), /*nullable=*/true), + arrow::field("max_partition_stats", arrow::utf8(), /*nullable=*/true), + arrow::field("min_row_id", arrow::int64(), /*nullable=*/true), + arrow::field("max_row_id", arrow::int64(), /*nullable=*/true), + }); +} + +Result> ManifestsSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, LatestSnapshot(context_)); + if (!snapshot) { + return std::vector(); + } + + std::shared_ptr pool = GetDefaultPool(); + PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, CreateCoreOptions(context_)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, + CreatePathFactory(context_, core_options, pool)); + PAIMON_ASSIGN_OR_RAISE( + std::vector manifests, + ReadDataManifests(context_, snapshot.value(), path_factory, core_options, pool)); + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr partition_schema, + FieldMapping::GetPartitionSchema(arrow_schema, context_.table_schema->PartitionKeys())); + + std::vector rows; + rows.reserve(manifests.size()); + for (const auto& manifest : manifests) { + GenericRow row(schema->num_fields()); + row.SetField(0, StringValue(manifest.FileName())); + row.SetField(1, manifest.FileSize()); + row.SetField(2, manifest.NumAddedFiles()); + row.SetField(3, manifest.NumDeletedFiles()); + row.SetField(4, manifest.SchemaId()); + PAIMON_ASSIGN_OR_RAISE( + VariantType min_partition, + OptionalPartitionStringValue(manifest.PartitionStats().MinValues(), partition_schema)); + PAIMON_ASSIGN_OR_RAISE( + VariantType max_partition, + OptionalPartitionStringValue(manifest.PartitionStats().MaxValues(), partition_schema)); + row.SetField(5, min_partition); + row.SetField(6, max_partition); + row.SetField(7, OptionalInt64Value(manifest.MinRowId())); + row.SetField(8, OptionalInt64Value(manifest.MaxRowId())); + rows.push_back(std::move(row)); + } + return rows; +} + +FilesSystemTable::FilesSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch, std::shared_ptr table_schema, + std::map options) + : InMemorySystemTable(table_path), + context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch), + std::move(table_schema), std::move(options))) {} + +std::string FilesSystemTable::Name() const { + return kName; +} + +Result> FilesSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("partition", arrow::utf8(), /*nullable=*/true), + arrow::field("bucket", arrow::int32(), /*nullable=*/false), + arrow::field("file_path", arrow::utf8(), /*nullable=*/false), + arrow::field("file_format", arrow::utf8(), /*nullable=*/false), + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("level", arrow::int32(), /*nullable=*/false), + arrow::field("record_count", arrow::int64(), /*nullable=*/false), + arrow::field("file_size_in_bytes", arrow::int64(), /*nullable=*/false), + arrow::field("min_key", arrow::utf8(), /*nullable=*/true), + arrow::field("max_key", arrow::utf8(), /*nullable=*/true), + arrow::field("null_value_counts", arrow::utf8(), /*nullable=*/false), + arrow::field("min_value_stats", arrow::utf8(), /*nullable=*/false), + arrow::field("max_value_stats", arrow::utf8(), /*nullable=*/false), + arrow::field("min_sequence_number", arrow::int64(), /*nullable=*/true), + arrow::field("max_sequence_number", arrow::int64(), /*nullable=*/true), + arrow::field("creation_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/true), + arrow::field("deleteRowCount", arrow::int64(), /*nullable=*/true), + arrow::field("file_source", arrow::utf8(), /*nullable=*/true), + arrow::field("first_row_id", arrow::int64(), /*nullable=*/true), + arrow::field("write_cols", arrow::list(arrow::utf8()), /*nullable=*/true), + }); +} + +Result> FilesSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + std::shared_ptr pool = GetDefaultPool(); + PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, CreateCoreOptions(context_)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, + CreatePathFactory(context_, core_options, pool)); + PAIMON_ASSIGN_OR_RAISE(std::vector entries, + ReadLatestDataFiles(context_, path_factory, core_options, pool)); + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr partition_schema, + FieldMapping::GetPartitionSchema(arrow_schema, context_.table_schema->PartitionKeys())); + + SimpleStatsEvolutions stats_evolutions(context_.table_schema, pool); + std::vector rows; + rows.reserve(entries.size()); + for (const auto& entry : entries) { + if (!(entry.Kind() == FileKind::Add())) { + continue; + } + + const std::shared_ptr& file = entry.File(); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, + LoadDataSchema(context_, file->schema_id)); + PAIMON_ASSIGN_OR_RAISE(std::vector value_stats_fields, + ValueStatsFields(context_, file->schema_id)); + std::shared_ptr stats_evolution = + stats_evolutions.GetOrCreate(data_schema); + PAIMON_ASSIGN_OR_RAISE( + SimpleStatsEvolution::EvolutionStats stats, + stats_evolution->Evolution(file->value_stats, file->row_count, file->value_stats_cols)); + + GenericRow row(schema->num_fields()); + if (context_.table_schema->PartitionKeys().empty()) { + row.SetField(0, NullType()); + } else { + PAIMON_ASSIGN_OR_RAISE(std::string partition, + PartitionString(path_factory, entry.Partition())); + row.SetField(0, StringValue(partition)); + } + row.SetField(1, entry.Bucket()); + PAIMON_ASSIGN_OR_RAISE(std::string file_path, FilePath(path_factory, entry, *file)); + row.SetField(2, StringValue(file_path)); + PAIMON_ASSIGN_OR_RAISE(std::string file_format, file->FileFormat()); + row.SetField(3, StringValue(file_format)); + row.SetField(4, file->schema_id); + row.SetField(5, file->level); + row.SetField(6, file->row_count); + row.SetField(7, file->file_size); + row.SetField(8, OptionalStringValue(OptionalBinaryRowString(file->min_key))); + row.SetField(9, OptionalStringValue(OptionalBinaryRowString(file->max_key))); + PAIMON_ASSIGN_OR_RAISE(std::string null_value_counts, + NullValueCountsString(value_stats_fields, *stats.null_counts)); + row.SetField(10, StringValue(null_value_counts)); + PAIMON_ASSIGN_OR_RAISE(std::string min_value_stats, + FieldsValueMapString(value_stats_fields, *stats.min_values)); + row.SetField(11, StringValue(min_value_stats)); + PAIMON_ASSIGN_OR_RAISE(std::string max_value_stats, + FieldsValueMapString(value_stats_fields, *stats.max_values)); + row.SetField(12, StringValue(max_value_stats)); + row.SetField(13, file->min_sequence_number); + row.SetField(14, file->max_sequence_number); + PAIMON_ASSIGN_OR_RAISE(VariantType creation_time, + LocalTimestampMillisValue(file->creation_time)); + row.SetField(15, creation_time); + row.SetField(16, OptionalInt64Value(file->delete_row_count)); + row.SetField(17, file->file_source ? StringValue(file->file_source.value().ToString()) + : VariantType(NullType())); + row.SetField(18, OptionalInt64Value(file->first_row_id)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr write_cols, + WriteColsValue(file->write_cols, pool)); + row.SetField(19, write_cols ? VariantType(write_cols) : VariantType(NullType())); + rows.push_back(std::move(row)); + } + return rows; +} + } // namespace paimon diff --git a/src/paimon/core/table/system/metadata_system_tables.h b/src/paimon/core/table/system/metadata_system_tables.h index 91948863f..389ad5a95 100644 --- a/src/paimon/core/table/system/metadata_system_tables.h +++ b/src/paimon/core/table/system/metadata_system_tables.h @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -46,6 +47,8 @@ struct MetadataSystemTableContext { std::shared_ptr fs; std::string table_path; std::string branch; + std::shared_ptr table_schema; + std::map options; }; /// System table for `T$snapshots`, exposing snapshot commit history. @@ -125,4 +128,38 @@ class ConsumersSystemTable : public InMemorySystemTable { MetadataSystemTableContext context_; }; +/// System table for `T$manifests`, exposing data manifest metadata in the latest snapshot. +class ManifestsSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "manifests"; + + ManifestsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, + std::shared_ptr table_schema, + std::map options); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + MetadataSystemTableContext context_; +}; + +/// System table for `T$files`, exposing data file metadata in the latest snapshot. +class FilesSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "files"; + + FilesSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, + std::shared_ptr table_schema, + std::map options); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + MetadataSystemTableContext context_; +}; + } // namespace paimon diff --git a/src/paimon/core/table/system/system_table.cpp b/src/paimon/core/table/system/system_table.cpp index cc176cd5e..061e6d4f1 100644 --- a/src/paimon/core/table/system/system_table.cpp +++ b/src/paimon/core/table/system/system_table.cpp @@ -126,6 +126,24 @@ const std::vector& SystemTableRegistry() { auto options = MergeOptions(table_schema, dynamic_options); return std::make_shared(fs, table_path, LoadBranch(options)); }}, + {ManifestsSystemTable::kName, + [](const std::shared_ptr& fs, const std::string& table_path, + const std::shared_ptr& table_schema, + const std::map& dynamic_options) + -> Result> { + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options), + table_schema, std::move(options)); + }}, + {FilesSystemTable::kName, + [](const std::shared_ptr& fs, const std::string& table_path, + const std::shared_ptr& table_schema, + const std::map& dynamic_options) + -> Result> { + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options), + table_schema, std::move(options)); + }}, }; return registry; } diff --git a/test/inte/read_inte_test.cpp b/test/inte/read_inte_test.cpp index 5211fdd8c..2bf5caad0 100644 --- a/test/inte/read_inte_test.cpp +++ b/test/inte/read_inte_test.cpp @@ -699,6 +699,151 @@ TEST(SystemTableReadInteTest, TestReadMetadataSystemTables) { auto branch_create_time_array = std::dynamic_pointer_cast(branches_array->field(1)); ASSERT_TRUE(branch_create_time_array); + + ASSERT_OK_AND_ASSIGN(auto manifests_result, + ReadSystemTable(table_path + "$manifests", options)); + auto manifests_array = SingleStructChunk(manifests_result); + ASSERT_EQ(StructFieldNames(manifests_array), + (std::vector{"file_name", "file_size", "num_added_files", + "num_deleted_files", "schema_id", "min_partition_stats", + "max_partition_stats", "min_row_id", "max_row_id"})); + ASSERT_GT(manifests_array->length(), 0); + auto manifest_file_name_array = + std::dynamic_pointer_cast(manifests_array->field(0)); + auto manifest_file_size_array = + std::dynamic_pointer_cast(manifests_array->field(1)); + auto manifest_num_added_files_array = + std::dynamic_pointer_cast(manifests_array->field(2)); + auto manifest_schema_id_array = + std::dynamic_pointer_cast(manifests_array->field(4)); + ASSERT_TRUE(manifest_file_name_array); + ASSERT_TRUE(manifest_file_size_array); + ASSERT_TRUE(manifest_num_added_files_array); + ASSERT_TRUE(manifest_schema_id_array); + ASSERT_TRUE(manifest_file_name_array->GetString(0).find("manifest-") == 0); + ASSERT_GT(manifest_file_size_array->Value(0), 0); + ASSERT_GE(manifest_num_added_files_array->Value(0), 1); + ASSERT_EQ(manifest_schema_id_array->Value(0), 0); + + ASSERT_OK_AND_ASSIGN(auto files_result, ReadSystemTable(table_path + "$files", options)); + auto files_array = SingleStructChunk(files_result); + ASSERT_EQ(StructFieldNames(files_array), (std::vector{"partition", + "bucket", + "file_path", + "file_format", + "schema_id", + "level", + "record_count", + "file_size_in_bytes", + "min_key", + "max_key", + "null_value_counts", + "min_value_stats", + "max_value_stats", + "min_sequence_number", + "max_sequence_number", + "creation_time", + "deleteRowCount", + "file_source", + "first_row_id", + "write_cols"})); + ASSERT_GT(files_array->length(), 0); + auto partition_array = std::dynamic_pointer_cast(files_array->field(0)); + auto bucket_array = std::dynamic_pointer_cast(files_array->field(1)); + auto file_path_array = std::dynamic_pointer_cast(files_array->field(2)); + auto file_format_array = std::dynamic_pointer_cast(files_array->field(3)); + auto file_schema_id_array = std::dynamic_pointer_cast(files_array->field(4)); + auto record_count_array = std::dynamic_pointer_cast(files_array->field(6)); + auto file_size_array = std::dynamic_pointer_cast(files_array->field(7)); + auto min_sequence_number_array = + std::dynamic_pointer_cast(files_array->field(13)); + auto max_sequence_number_array = + std::dynamic_pointer_cast(files_array->field(14)); + auto creation_time_array = + std::dynamic_pointer_cast(files_array->field(15)); + ASSERT_TRUE(partition_array); + ASSERT_TRUE(bucket_array); + ASSERT_TRUE(file_path_array); + ASSERT_TRUE(file_format_array); + ASSERT_TRUE(file_schema_id_array); + ASSERT_TRUE(record_count_array); + ASSERT_TRUE(file_size_array); + ASSERT_TRUE(min_sequence_number_array); + ASSERT_TRUE(max_sequence_number_array); + ASSERT_TRUE(creation_time_array); + ASSERT_TRUE(partition_array->IsNull(0)); + ASSERT_EQ(bucket_array->Value(0), 0); + ASSERT_NE(file_path_array->GetString(0).find("/bucket-0/"), std::string::npos); + ASSERT_EQ(file_format_array->GetString(0), "parquet"); + ASSERT_EQ(file_schema_id_array->Value(0), 0); + ASSERT_EQ(record_count_array->Value(0), 1); + ASSERT_GT(file_size_array->Value(0), 0); + ASSERT_GE(min_sequence_number_array->Value(0), 0); + ASSERT_GE(max_sequence_number_array->Value(0), min_sequence_number_array->Value(0)); + ASSERT_FALSE(creation_time_array->IsNull(0)); +} + +TEST(SystemTableReadInteTest, TestReadFilesSystemTableForPartitionedTable) { + arrow::FieldVector fields = { + arrow::field("dt", arrow::utf8()), + arrow::field("pk", arrow::utf8()), + arrow::field("v", arrow::int32()), + }; + auto schema = arrow::schema(fields); + std::map options = {{Options::FILE_SYSTEM, "local"}, + {Options::FILE_FORMAT, "orc"}, + {Options::MANIFEST_FORMAT, "orc"}, + {Options::BUCKET, "1"}}; + auto dir = UniqueTestDirectory::Create(); + ASSERT_TRUE(dir); + ASSERT_OK_AND_ASSIGN(auto helper, TestHelper::Create(dir->Str(), schema, + /*partition_keys=*/{"dt"}, + /*primary_keys=*/{"dt", "pk"}, options, + /*is_streaming_mode=*/true)); + + ASSERT_OK_AND_ASSIGN( + std::unique_ptr batch, + TestHelper::MakeRecordBatch(arrow::struct_(fields), R"([["20260527", "a", 1]])", + /*partition_map=*/{{"dt", "20260527"}}, /*bucket=*/0, {})); + ASSERT_OK(helper->WriteAndCommit(std::move(batch), /*commit_identifier=*/0, + /*expected_commit_messages=*/std::nullopt)); + + std::string table_path = PathUtil::JoinPath(dir->Str(), "foo.db/bar"); + ASSERT_OK_AND_ASSIGN(auto files_result, ReadSystemTable(table_path + "$files", options)); + auto files_array = SingleStructChunk(files_result); + ASSERT_EQ(files_array->length(), 1); + auto partition_array = std::dynamic_pointer_cast(files_array->field(0)); + auto file_path_array = std::dynamic_pointer_cast(files_array->field(2)); + ASSERT_TRUE(partition_array); + ASSERT_TRUE(file_path_array); + ASSERT_EQ(partition_array->GetString(0), "dt=20260527/"); + ASSERT_NE(file_path_array->GetString(0).find("/dt=20260527/bucket-0/"), std::string::npos); +} + +TEST(SystemTableReadInteTest, TestReadManifestAndFilesSystemTablesForEmptyTable) { + std::map options = {{Options::FILE_SYSTEM, "local"}, + {Options::FILE_FORMAT, "orc"}, + {Options::MANIFEST_FORMAT, "orc"}}; + auto dir = UniqueTestDirectory::Create(); + ASSERT_TRUE(dir); + std::string warehouse = PathUtil::JoinPath(dir->Str(), "warehouse"); + ASSERT_OK_AND_ASSIGN(auto catalog, Catalog::Create(warehouse, options)); + ASSERT_OK(catalog->CreateDatabase("db1", options, /*ignore_if_exists=*/false)); + + auto typed_schema = arrow::schema({arrow::field("f0", arrow::int32())}); + ::ArrowSchema schema; + ASSERT_TRUE(arrow::ExportSchema(*typed_schema, &schema).ok()); + ASSERT_OK(catalog->CreateTable(Identifier("db1", "tbl1"), &schema, + /*partition_keys=*/{}, /*primary_keys=*/{}, options, + /*ignore_if_exists=*/false)); + ArrowSchemaRelease(&schema); + + std::string table_path = catalog->GetTableLocation(Identifier("db1", "tbl1")); + ASSERT_OK_AND_ASSIGN(auto manifests_result, + ReadSystemTable(table_path + "$manifests", options)); + ASSERT_EQ(manifests_result.array, nullptr); + ASSERT_OK_AND_ASSIGN(auto files_result, ReadSystemTable(table_path + "$files", options)); + ASSERT_EQ(files_result.array, nullptr); } TEST(SystemTableReadInteTest, TestReadTagBranchAndConsumerSystemTables) { From fc58d0c7cff75c2e8ce9cf1f101505761a17be34 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 27 May 2026 20:22:21 +0800 Subject: [PATCH 2/8] Refactor system table implementations --- src/paimon/CMakeLists.txt | 10 +- .../table/system/branches_system_table.cpp | 67 ++ .../core/table/system/branches_system_table.h | 44 + .../table/system/consumers_system_table.cpp | 60 ++ .../table/system/consumers_system_table.h | 45 + .../core/table/system/files_system_table.cpp | 161 ++++ .../core/table/system/files_system_table.h | 48 + .../table/system/manifests_system_table.cpp | 105 +++ .../table/system/manifests_system_table.h | 48 + .../table/system/metadata_system_tables.cpp | 847 ------------------ .../table/system/metadata_system_tables.h | 165 ---- .../table/system/options_system_table.cpp | 53 ++ .../core/table/system/options_system_table.h | 43 + .../table/system/schemas_system_table.cpp | 87 ++ .../core/table/system/schemas_system_table.h | 44 + .../table/system/snapshots_system_table.cpp | 92 ++ .../table/system/snapshots_system_table.h | 45 + src/paimon/core/table/system/system_table.cpp | 69 +- .../core/table/system/system_table_utils.cpp | 373 ++++++++ .../core/table/system/system_table_utils.h | 148 +++ .../core/table/system/tags_system_table.cpp | 80 ++ .../core/table/system/tags_system_table.h | 44 + test/inte/read_inte_test.cpp | 2 +- 23 files changed, 1632 insertions(+), 1048 deletions(-) create mode 100644 src/paimon/core/table/system/branches_system_table.cpp create mode 100644 src/paimon/core/table/system/branches_system_table.h create mode 100644 src/paimon/core/table/system/consumers_system_table.cpp create mode 100644 src/paimon/core/table/system/consumers_system_table.h create mode 100644 src/paimon/core/table/system/files_system_table.cpp create mode 100644 src/paimon/core/table/system/files_system_table.h create mode 100644 src/paimon/core/table/system/manifests_system_table.cpp create mode 100644 src/paimon/core/table/system/manifests_system_table.h delete mode 100644 src/paimon/core/table/system/metadata_system_tables.cpp delete mode 100644 src/paimon/core/table/system/metadata_system_tables.h create mode 100644 src/paimon/core/table/system/options_system_table.cpp create mode 100644 src/paimon/core/table/system/options_system_table.h create mode 100644 src/paimon/core/table/system/schemas_system_table.cpp create mode 100644 src/paimon/core/table/system/schemas_system_table.h create mode 100644 src/paimon/core/table/system/snapshots_system_table.cpp create mode 100644 src/paimon/core/table/system/snapshots_system_table.h create mode 100644 src/paimon/core/table/system/system_table_utils.cpp create mode 100644 src/paimon/core/table/system/system_table_utils.h create mode 100644 src/paimon/core/table/system/tags_system_table.cpp create mode 100644 src/paimon/core/table/system/tags_system_table.h diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index a517184a4..edc9681e6 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -320,11 +320,19 @@ set(PAIMON_CORE_SRCS core/table/source/data_evolution_batch_scan.cpp core/table/system/audit_log_system_table.cpp core/table/system/binlog_system_table.cpp + core/table/system/branches_system_table.cpp + core/table/system/consumers_system_table.cpp + core/table/system/files_system_table.cpp core/table/system/in_memory_system_table.cpp - core/table/system/metadata_system_tables.cpp + core/table/system/manifests_system_table.cpp + core/table/system/options_system_table.cpp + core/table/system/schemas_system_table.cpp + core/table/system/snapshots_system_table.cpp core/table/system/system_table.cpp core/table/system/system_table_scan.cpp core/table/system/system_table_schema.cpp + core/table/system/system_table_utils.cpp + core/table/system/tags_system_table.cpp core/tag/tag.cpp core/utils/branch_manager.cpp core/utils/consumer_manager.cpp diff --git a/src/paimon/core/table/system/branches_system_table.cpp b/src/paimon/core/table/system/branches_system_table.cpp new file mode 100644 index 000000000..a99553283 --- /dev/null +++ b/src/paimon/core/table/system/branches_system_table.cpp @@ -0,0 +1,67 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/table/system/branches_system_table.h" + +#include + +#include "arrow/api.h" +#include "paimon/core/utils/branch_manager.h" +#include "paimon/fs/file_system.h" + +namespace paimon { + +BranchesSystemTable::BranchesSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch) + : InMemorySystemTable(table_path), + context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), + std::move(branch))) {} + +std::string BranchesSystemTable::Name() const { + return kName; +} + +Result> BranchesSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("branch_name", arrow::utf8(), /*nullable=*/false), + arrow::field("create_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/false), + }); +} + +Result> BranchesSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + PAIMON_ASSIGN_OR_RAISE(std::vector branches, + BranchManager::ListBranches(context_.fs, context_.table_path)); + std::vector rows; + rows.reserve(branches.size()); + + for (const auto& name : branches) { + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr branch_status, + context_.fs->GetFileStatus(BranchManager::BranchPath(context_.table_path, name))); + GenericRow row(schema->num_fields()); + row.SetField(0, SystemTableUtils::StringValue(name)); + PAIMON_ASSIGN_OR_RAISE(VariantType create_time, SystemTableUtils::LocalTimestampMillisValue( + branch_status->GetModificationTime())); + row.SetField(1, create_time); + rows.push_back(std::move(row)); + } + + return rows; +} + +} // namespace paimon diff --git a/src/paimon/core/table/system/branches_system_table.h b/src/paimon/core/table/system/branches_system_table.h new file mode 100644 index 000000000..7968b1c63 --- /dev/null +++ b/src/paimon/core/table/system/branches_system_table.h @@ -0,0 +1,44 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/core/table/system/in_memory_system_table.h" +#include "paimon/core/table/system/system_table_utils.h" + +namespace paimon { +class FileSystem; + +/// System table for `T$branches`, exposing table branches including `main`. +class BranchesSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "branches"; + + BranchesSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + SystemTableContext context_; +}; + +} // namespace paimon diff --git a/src/paimon/core/table/system/consumers_system_table.cpp b/src/paimon/core/table/system/consumers_system_table.cpp new file mode 100644 index 000000000..1c0fac998 --- /dev/null +++ b/src/paimon/core/table/system/consumers_system_table.cpp @@ -0,0 +1,60 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/table/system/consumers_system_table.h" + +#include + +#include "arrow/api.h" +#include "paimon/core/utils/consumer_manager.h" + +namespace paimon { + +ConsumersSystemTable::ConsumersSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch) + : InMemorySystemTable(table_path), + context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), + std::move(branch))) {} + +std::string ConsumersSystemTable::Name() const { + return kName; +} + +Result> ConsumersSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("consumer_id", arrow::utf8(), /*nullable=*/false), + arrow::field("next_snapshot_id", arrow::int64(), /*nullable=*/false), + }); +} + +Result> ConsumersSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + ConsumerManager consumer_manager(context_.fs, context_.table_path, context_.branch); + PAIMON_ASSIGN_OR_RAISE(auto consumers, consumer_manager.Consumers()); + std::vector rows; + rows.reserve(consumers.size()); + + for (const auto& [id, snapshot_id] : consumers) { + GenericRow row(schema->num_fields()); + row.SetField(0, SystemTableUtils::StringValue(id)); + row.SetField(1, snapshot_id); + rows.push_back(std::move(row)); + } + + return rows; +} + +} // namespace paimon diff --git a/src/paimon/core/table/system/consumers_system_table.h b/src/paimon/core/table/system/consumers_system_table.h new file mode 100644 index 000000000..13761d904 --- /dev/null +++ b/src/paimon/core/table/system/consumers_system_table.h @@ -0,0 +1,45 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/core/table/system/in_memory_system_table.h" +#include "paimon/core/table/system/system_table_utils.h" + +namespace paimon { +class FileSystem; + +/// System table for `T$consumers`, exposing persisted streaming consumer offsets. +class ConsumersSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "consumers"; + + ConsumersSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + SystemTableContext context_; +}; + +} // namespace paimon diff --git a/src/paimon/core/table/system/files_system_table.cpp b/src/paimon/core/table/system/files_system_table.cpp new file mode 100644 index 000000000..7effaec6a --- /dev/null +++ b/src/paimon/core/table/system/files_system_table.cpp @@ -0,0 +1,161 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/table/system/files_system_table.h" + +#include + +#include "arrow/api.h" +#include "paimon/common/data/data_define.h" +#include "paimon/common/data/internal_array.h" +#include "paimon/common/types/data_field.h" +#include "paimon/core/core_options.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/file_kind.h" +#include "paimon/core/manifest/manifest_entry.h" +#include "paimon/core/schema/table_schema.h" +#include "paimon/core/stats/simple_stats_evolutions.h" +#include "paimon/core/utils/field_mapping.h" +#include "paimon/core/utils/file_store_path_factory.h" +#include "paimon/memory/memory_pool.h" + +namespace paimon { + +FilesSystemTable::FilesSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch, std::shared_ptr table_schema, + std::map options) + : InMemorySystemTable(table_path), + context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), + std::move(branch), std::move(table_schema), + std::move(options))) {} + +std::string FilesSystemTable::Name() const { + return kName; +} + +Result> FilesSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("partition", arrow::utf8(), /*nullable=*/true), + arrow::field("bucket", arrow::int32(), /*nullable=*/false), + arrow::field("file_path", arrow::utf8(), /*nullable=*/false), + arrow::field("file_format", arrow::utf8(), /*nullable=*/false), + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("level", arrow::int32(), /*nullable=*/false), + arrow::field("record_count", arrow::int64(), /*nullable=*/false), + arrow::field("file_size_in_bytes", arrow::int64(), /*nullable=*/false), + arrow::field("min_key", arrow::utf8(), /*nullable=*/true), + arrow::field("max_key", arrow::utf8(), /*nullable=*/true), + arrow::field("null_value_counts", arrow::utf8(), /*nullable=*/false), + arrow::field("min_value_stats", arrow::utf8(), /*nullable=*/false), + arrow::field("max_value_stats", arrow::utf8(), /*nullable=*/false), + arrow::field("min_sequence_number", arrow::int64(), /*nullable=*/true), + arrow::field("max_sequence_number", arrow::int64(), /*nullable=*/true), + arrow::field("creation_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/true), + arrow::field("deleteRowCount", arrow::int64(), /*nullable=*/true), + arrow::field("file_source", arrow::utf8(), /*nullable=*/true), + arrow::field("first_row_id", arrow::int64(), /*nullable=*/true), + arrow::field("write_cols", arrow::list(arrow::utf8()), /*nullable=*/true), + }); +} + +Result> FilesSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + std::shared_ptr pool = GetDefaultPool(); + PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, SystemTableUtils::CreateCoreOptions(context_)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, + SystemTableUtils::CreatePathFactory(context_, core_options, pool)); + PAIMON_ASSIGN_OR_RAISE( + std::vector entries, + SystemTableUtils::ReadLatestDataFiles(context_, path_factory, core_options, pool)); + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr partition_schema, + FieldMapping::GetPartitionSchema(arrow_schema, context_.table_schema->PartitionKeys())); + + SimpleStatsEvolutions stats_evolutions(context_.table_schema, pool); + std::vector rows; + rows.reserve(entries.size()); + for (const auto& entry : entries) { + if (!(entry.Kind() == FileKind::Add())) { + continue; + } + + const std::shared_ptr& file = entry.File(); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, + SystemTableUtils::LoadDataSchema(context_, file->schema_id)); + PAIMON_ASSIGN_OR_RAISE(std::vector value_stats_fields, + SystemTableUtils::ValueStatsFields(context_, file->schema_id)); + std::shared_ptr stats_evolution = + stats_evolutions.GetOrCreate(data_schema); + PAIMON_ASSIGN_OR_RAISE( + SimpleStatsEvolution::EvolutionStats stats, + stats_evolution->Evolution(file->value_stats, file->row_count, file->value_stats_cols)); + + GenericRow row(schema->num_fields()); + if (context_.table_schema->PartitionKeys().empty()) { + row.SetField(0, NullType()); + } else { + PAIMON_ASSIGN_OR_RAISE(std::string partition, SystemTableUtils::PartitionString( + path_factory, entry.Partition())); + row.SetField(0, SystemTableUtils::StringValue(partition)); + } + row.SetField(1, entry.Bucket()); + PAIMON_ASSIGN_OR_RAISE(std::string file_path, + SystemTableUtils::FilePath(path_factory, entry, *file)); + row.SetField(2, SystemTableUtils::StringValue(file_path)); + PAIMON_ASSIGN_OR_RAISE(std::string file_format, file->FileFormat()); + row.SetField(3, SystemTableUtils::StringValue(file_format)); + row.SetField(4, file->schema_id); + row.SetField(5, file->level); + row.SetField(6, file->row_count); + row.SetField(7, file->file_size); + row.SetField(8, SystemTableUtils::OptionalStringValue( + SystemTableUtils::OptionalBinaryRowString(file->min_key))); + row.SetField(9, SystemTableUtils::OptionalStringValue( + SystemTableUtils::OptionalBinaryRowString(file->max_key))); + PAIMON_ASSIGN_OR_RAISE( + std::string null_value_counts, + SystemTableUtils::NullValueCountsString(value_stats_fields, *stats.null_counts)); + row.SetField(10, SystemTableUtils::StringValue(null_value_counts)); + PAIMON_ASSIGN_OR_RAISE( + std::string min_value_stats, + SystemTableUtils::FieldsValueMapString(value_stats_fields, *stats.min_values)); + row.SetField(11, SystemTableUtils::StringValue(min_value_stats)); + PAIMON_ASSIGN_OR_RAISE( + std::string max_value_stats, + SystemTableUtils::FieldsValueMapString(value_stats_fields, *stats.max_values)); + row.SetField(12, SystemTableUtils::StringValue(max_value_stats)); + row.SetField(13, file->min_sequence_number); + row.SetField(14, file->max_sequence_number); + PAIMON_ASSIGN_OR_RAISE(VariantType creation_time, + SystemTableUtils::LocalTimestampMillisValue(file->creation_time)); + row.SetField(15, creation_time); + row.SetField(16, SystemTableUtils::OptionalInt64Value(file->delete_row_count)); + row.SetField(17, file->file_source + ? SystemTableUtils::StringValue(file->file_source.value().ToString()) + : VariantType(NullType())); + row.SetField(18, SystemTableUtils::OptionalInt64Value(file->first_row_id)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr write_cols, + SystemTableUtils::WriteColsValue(file->write_cols, pool)); + row.SetField(19, write_cols ? VariantType(write_cols) : VariantType(NullType())); + rows.push_back(std::move(row)); + } + return rows; +} + +} // namespace paimon diff --git a/src/paimon/core/table/system/files_system_table.h b/src/paimon/core/table/system/files_system_table.h new file mode 100644 index 000000000..9a0e0ee8a --- /dev/null +++ b/src/paimon/core/table/system/files_system_table.h @@ -0,0 +1,48 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "paimon/core/table/system/in_memory_system_table.h" +#include "paimon/core/table/system/system_table_utils.h" + +namespace paimon { +class FileSystem; +class TableSchema; + +/// System table for `T$files`, exposing data file metadata in the latest snapshot. +class FilesSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "files"; + + FilesSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, + std::shared_ptr table_schema, + std::map options); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + SystemTableContext context_; +}; + +} // namespace paimon diff --git a/src/paimon/core/table/system/manifests_system_table.cpp b/src/paimon/core/table/system/manifests_system_table.cpp new file mode 100644 index 000000000..a67b5486c --- /dev/null +++ b/src/paimon/core/table/system/manifests_system_table.cpp @@ -0,0 +1,105 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/table/system/manifests_system_table.h" + +#include + +#include "arrow/api.h" +#include "paimon/common/types/data_field.h" +#include "paimon/core/core_options.h" +#include "paimon/core/manifest/manifest_file_meta.h" +#include "paimon/core/schema/table_schema.h" +#include "paimon/core/snapshot.h" +#include "paimon/core/utils/field_mapping.h" +#include "paimon/core/utils/file_store_path_factory.h" +#include "paimon/memory/memory_pool.h" + +namespace paimon { + +ManifestsSystemTable::ManifestsSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch, + std::shared_ptr table_schema, + std::map options) + : InMemorySystemTable(table_path), + context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), + std::move(branch), std::move(table_schema), + std::move(options))) {} + +std::string ManifestsSystemTable::Name() const { + return kName; +} + +Result> ManifestsSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("file_name", arrow::utf8(), /*nullable=*/false), + arrow::field("file_size", arrow::int64(), /*nullable=*/false), + arrow::field("num_added_files", arrow::int64(), /*nullable=*/false), + arrow::field("num_deleted_files", arrow::int64(), /*nullable=*/false), + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("min_partition_stats", arrow::utf8(), /*nullable=*/true), + arrow::field("max_partition_stats", arrow::utf8(), /*nullable=*/true), + arrow::field("min_row_id", arrow::int64(), /*nullable=*/true), + arrow::field("max_row_id", arrow::int64(), /*nullable=*/true), + }); +} + +Result> ManifestsSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, + SystemTableUtils::LatestSnapshot(context_)); + if (!snapshot) { + return std::vector(); + } + + std::shared_ptr pool = GetDefaultPool(); + PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, SystemTableUtils::CreateCoreOptions(context_)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, + SystemTableUtils::CreatePathFactory(context_, core_options, pool)); + PAIMON_ASSIGN_OR_RAISE(std::vector manifests, + SystemTableUtils::ReadDataManifests(context_, snapshot.value(), + path_factory, core_options, pool)); + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr partition_schema, + FieldMapping::GetPartitionSchema(arrow_schema, context_.table_schema->PartitionKeys())); + + std::vector rows; + rows.reserve(manifests.size()); + for (const auto& manifest : manifests) { + GenericRow row(schema->num_fields()); + row.SetField(0, SystemTableUtils::StringValue(manifest.FileName())); + row.SetField(1, manifest.FileSize()); + row.SetField(2, manifest.NumAddedFiles()); + row.SetField(3, manifest.NumDeletedFiles()); + row.SetField(4, manifest.SchemaId()); + PAIMON_ASSIGN_OR_RAISE(VariantType min_partition, + SystemTableUtils::OptionalPartitionStringValue( + manifest.PartitionStats().MinValues(), partition_schema)); + PAIMON_ASSIGN_OR_RAISE(VariantType max_partition, + SystemTableUtils::OptionalPartitionStringValue( + manifest.PartitionStats().MaxValues(), partition_schema)); + row.SetField(5, min_partition); + row.SetField(6, max_partition); + row.SetField(7, SystemTableUtils::OptionalInt64Value(manifest.MinRowId())); + row.SetField(8, SystemTableUtils::OptionalInt64Value(manifest.MaxRowId())); + rows.push_back(std::move(row)); + } + return rows; +} + +} // namespace paimon diff --git a/src/paimon/core/table/system/manifests_system_table.h b/src/paimon/core/table/system/manifests_system_table.h new file mode 100644 index 000000000..20f1bf189 --- /dev/null +++ b/src/paimon/core/table/system/manifests_system_table.h @@ -0,0 +1,48 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "paimon/core/table/system/in_memory_system_table.h" +#include "paimon/core/table/system/system_table_utils.h" + +namespace paimon { +class FileSystem; +class TableSchema; + +/// System table for `T$manifests`, exposing data manifest metadata in the latest snapshot. +class ManifestsSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "manifests"; + + ManifestsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, + std::shared_ptr table_schema, + std::map options); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + SystemTableContext context_; +}; + +} // namespace paimon diff --git a/src/paimon/core/table/system/metadata_system_tables.cpp b/src/paimon/core/table/system/metadata_system_tables.cpp deleted file mode 100644 index 3fdf687ee..000000000 --- a/src/paimon/core/table/system/metadata_system_tables.cpp +++ /dev/null @@ -1,847 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/core/table/system/metadata_system_tables.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "fmt/format.h" -#include "fmt/ranges.h" -#include "paimon/common/data/binary_string.h" -#include "paimon/common/data/data_define.h" -#include "paimon/common/data/generic_row.h" -#include "paimon/common/data/internal_array.h" -#include "paimon/common/data/internal_row.h" -#include "paimon/common/table/special_fields.h" -#include "paimon/common/types/data_field.h" -#include "paimon/common/utils/binary_row_partition_computer.h" -#include "paimon/common/utils/date_time_utils.h" -#include "paimon/common/utils/internal_row_utils.h" -#include "paimon/common/utils/path_util.h" -#include "paimon/common/utils/rapidjson_util.h" -#include "paimon/core/core_options.h" -#include "paimon/core/io/data_file_meta.h" -#include "paimon/core/manifest/file_entry.h" -#include "paimon/core/manifest/file_kind.h" -#include "paimon/core/manifest/manifest_entry.h" -#include "paimon/core/manifest/manifest_file.h" -#include "paimon/core/manifest/manifest_file_meta.h" -#include "paimon/core/manifest/manifest_list.h" -#include "paimon/core/schema/schema_manager.h" -#include "paimon/core/schema/table_schema.h" -#include "paimon/core/snapshot.h" -#include "paimon/core/stats/simple_stats_evolutions.h" -#include "paimon/core/tag/tag.h" -#include "paimon/core/utils/branch_manager.h" -#include "paimon/core/utils/consumer_manager.h" -#include "paimon/core/utils/field_mapping.h" -#include "paimon/core/utils/file_store_path_factory.h" -#include "paimon/core/utils/snapshot_manager.h" -#include "paimon/core/utils/tag_manager.h" -#include "paimon/data/timestamp.h" -#include "paimon/fs/file_system.h" -#include "paimon/memory/memory_pool.h" -#include "paimon/status.h" -#include "rapidjson/document.h" -#include "rapidjson/stringbuffer.h" -#include "rapidjson/writer.h" - -namespace paimon { -namespace { - -constexpr int32_t kMaxPartitionStatsLength = 255; - -template -Result JsonString(const T& value) { - rapidjson::Document document; - auto json_value = RapidJsonUtil::SerializeValue(value, &document.GetAllocator()); - rapidjson::StringBuffer buffer; - rapidjson::Writer writer(buffer); - if (!json_value.Accept(writer)) { - return Status::Invalid("failed to serialize metadata system table value"); - } - return std::string(buffer.GetString(), buffer.GetSize()); -} - -Result LocalDateTimePartsToTimestampMillis(const std::vector& parts) { - if (parts.size() < 6) { - return Status::Invalid("tag create time requires at least 6 date-time fields"); - } - - int64_t year = parts[0]; - int64_t month = parts[1]; - int64_t day = parts[2]; - int64_t hour = parts[3]; - int64_t minute = parts[4]; - int64_t second = parts[5]; - int64_t nanos = parts.size() > 6 ? parts[6] : 0; - auto is_leap_year = [](int64_t value) { - return value % 4 == 0 && (value % 100 != 0 || value % 400 == 0); - }; - int64_t days_in_month[] = {31, is_leap_year(year) ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, - 31}; - if (month < 1 || month > 12 || day < 1 || day > days_in_month[month - 1] || hour < 0 || - hour > 23 || minute < 0 || minute > 59 || second < 0 || second > 59 || nanos < 0 || - nanos > 999999999) { - return Status::Invalid("invalid tag create time fields"); - } - - year -= month <= 2 ? 1 : 0; - int64_t era = (year >= 0 ? year : year - 399) / 400; - auto year_of_era = static_cast(year - era * 400); - auto month_prime = static_cast(month + (month > 2 ? -3 : 9)); - uint32_t day_of_year = (153 * month_prime + 2) / 5 + static_cast(day) - 1; - uint32_t day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; - int64_t epoch_day = era * 146097 + static_cast(day_of_era) - 719468; - return epoch_day * DateTimeUtils::MILLIS_PER_DAY + hour * 3600000 + minute * 60000 + - second * 1000 + nanos / 1000000; -} - -Result> OptionalLocalDateTimePartsToTimestampMillis( - const std::optional>& parts) { - if (!parts) { - return std::optional(); - } - PAIMON_ASSIGN_OR_RAISE(int64_t timestamp_millis, - LocalDateTimePartsToTimestampMillis(parts.value())); - return std::optional(timestamp_millis); -} - -std::optional OptionalDoubleToString(const std::optional& value) { - if (!value) { - return std::optional(); - } - return std::to_string(value.value()); -} - -VariantType OptionalInt64Value(const std::optional& value) { - if (!value) { - return NullType(); - } - return value.value(); -} - -VariantType StringValue(const std::string& value) { - return BinaryString::FromString(value, GetDefaultPool().get()); -} - -VariantType OptionalStringValue(const std::optional& value) { - if (!value) { - return NullType(); - } - return StringValue(value.value()); -} - -VariantType TimestampMillisValue(int64_t value) { - return Timestamp::FromEpochMillis(value); -} - -Result LocalTimestampMillisValue(int64_t epoch_millis) { - PAIMON_ASSIGN_OR_RAISE( - Timestamp local_timestamp, - DateTimeUtils::ToLocalTimestamp(Timestamp::FromEpochMillis(epoch_millis))); - return TimestampMillisValue(local_timestamp.GetMillisecond()); -} - -Result LocalTimestampMillisValue(const Timestamp& local_timestamp) { - PAIMON_ASSIGN_OR_RAISE(Timestamp utc_timestamp, DateTimeUtils::ToUTCTimestamp(local_timestamp)); - int64_t epoch_millis = utc_timestamp.GetMillisecond(); - return LocalTimestampMillisValue(epoch_millis); -} - -VariantType OptionalTimestampMillisValue(const std::optional& value) { - if (!value) { - return NullType(); - } - return TimestampMillisValue(value.value()); -} - -MetadataSystemTableContext CreateMetadataContext(std::shared_ptr fs, - std::string table_path, std::string branch) { - return { - std::move(fs), std::move(table_path), BranchManager::NormalizeBranch(branch), nullptr, {}, - }; -} - -MetadataSystemTableContext CreateMetadataContext(std::shared_ptr fs, - std::string table_path, std::string branch, - std::shared_ptr table_schema, - std::map options) { - return { - std::move(fs), std::move(table_path), BranchManager::NormalizeBranch(branch), - std::move(table_schema), std::move(options), - }; -} - -Result CreateCoreOptions(const MetadataSystemTableContext& context) { - return CoreOptions::FromMap(context.options, context.fs); -} - -Result> CreatePathFactory( - const MetadataSystemTableContext& context, const CoreOptions& core_options, - const std::shared_ptr& pool) { - std::shared_ptr arrow_schema = - DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); - PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, - core_options.CreateExternalPaths()); - PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, - core_options.CreateGlobalIndexExternalPath()); - PAIMON_ASSIGN_OR_RAISE( - std::unique_ptr path_factory, - FileStorePathFactory::Create( - context.table_path, arrow_schema, context.table_schema->PartitionKeys(), - core_options.GetPartitionDefaultName(), core_options.GetFileFormat()->Identifier(), - core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), - external_paths, global_index_external_path, core_options.IndexFileInDataFileDir(), - pool)); - return std::shared_ptr(std::move(path_factory)); -} - -Result> LatestSnapshot(const MetadataSystemTableContext& context) { - SnapshotManager snapshot_manager(context.fs, context.table_path, context.branch); - return snapshot_manager.LatestSnapshot(); -} - -Result> ReadDataManifests( - const MetadataSystemTableContext& context, const Snapshot& snapshot, - const std::shared_ptr& path_factory, const CoreOptions& core_options, - const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE( - std::unique_ptr manifest_list, - ManifestList::Create(context.fs, core_options.GetManifestFormat(), - core_options.GetManifestCompression(), path_factory, pool)); - std::vector manifests; - PAIMON_RETURN_NOT_OK(manifest_list->ReadDataManifests(snapshot, &manifests)); - return manifests; -} - -Result> CreateManifestFile( - const MetadataSystemTableContext& context, - const std::shared_ptr& path_factory, const CoreOptions& core_options, - const std::shared_ptr& pool) { - std::shared_ptr arrow_schema = - DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr partition_schema, - FieldMapping::GetPartitionSchema(arrow_schema, context.table_schema->PartitionKeys())); - return ManifestFile::Create(context.fs, core_options.GetManifestFormat(), - core_options.GetManifestCompression(), path_factory, - core_options.GetManifestTargetFileSize(), pool, core_options, - partition_schema); -} - -Result> ReadLatestManifestEntries( - const MetadataSystemTableContext& context, - const std::shared_ptr& path_factory, const CoreOptions& core_options, - const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, LatestSnapshot(context)); - if (!snapshot) { - return std::vector(); - } - PAIMON_ASSIGN_OR_RAISE( - std::vector manifests, - ReadDataManifests(context, snapshot.value(), path_factory, core_options, pool)); - PAIMON_ASSIGN_OR_RAISE(std::unique_ptr manifest_file, - CreateManifestFile(context, path_factory, core_options, pool)); - std::vector entries; - for (const auto& manifest : manifests) { - PAIMON_RETURN_NOT_OK( - manifest_file->Read(manifest.FileName(), /*filter=*/nullptr, &entries)); - } - return entries; -} - -Result> ReadLatestDataFiles( - const MetadataSystemTableContext& context, - const std::shared_ptr& path_factory, const CoreOptions& core_options, - const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE(std::vector entries, - ReadLatestManifestEntries(context, path_factory, core_options, pool)); - std::vector merged_entries; - PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(entries, &merged_entries)); - return merged_entries; -} - -std::optional OptionalBinaryRowString(const BinaryRow& row) { - if (row.GetFieldCount() <= 0) { - return std::nullopt; - } - return row.ToString(); -} - -Result> OptionalPartitionString( - const BinaryRow& row, const std::shared_ptr& partition_schema) { - if (row.GetFieldCount() <= 0) { - return std::optional(); - } - PAIMON_ASSIGN_OR_RAISE(std::string value, - BinaryRowPartitionComputer::PartToSimpleString( - partition_schema, row, ",", kMaxPartitionStatsLength)); - return std::optional(value); -} - -Result OptionalPartitionStringValue( - const BinaryRow& row, const std::shared_ptr& partition_schema) { - PAIMON_ASSIGN_OR_RAISE(std::optional value, - OptionalPartitionString(row, partition_schema)); - return OptionalStringValue(value); -} - -Result PartitionString(const std::shared_ptr& path_factory, - const BinaryRow& partition) { - PAIMON_ASSIGN_OR_RAISE(std::string value, path_factory->GetPartitionString(partition)); - return value; -} - -Result FilePath(const std::shared_ptr& path_factory, - const ManifestEntry& entry, const DataFileMeta& file) { - if (file.external_path) { - return file.external_path.value(); - } - PAIMON_ASSIGN_OR_RAISE(std::string bucket_path, - path_factory->BucketPath(entry.Partition(), entry.Bucket())); - return PathUtil::JoinPath(bucket_path, file.file_name); -} - -Result FieldsValueMapString(const std::vector& fields, - const InternalRow& row) { - std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields); - PAIMON_ASSIGN_OR_RAISE(std::vector getters, - InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/false)); - std::vector values; - values.reserve(fields.size()); - for (size_t i = 0; i < fields.size(); ++i) { - std::string value = "null"; - if (!row.IsNullAt(i)) { - VariantType field_value = getters[i](row); - if (std::holds_alternative(field_value)) { - value = std::string(std::get(field_value)); - } else { - value = DataDefine::VariantValueToString(field_value); - } - } - values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); - } - return fmt::format("{{{}}}", fmt::join(values, ", ")); -} - -Result NullValueCountsString(const std::vector& fields, - const InternalArray& null_counts) { - std::vector values; - values.reserve(fields.size()); - for (size_t i = 0; i < fields.size(); ++i) { - std::string value = - null_counts.IsNullAt(i) ? "null" : std::to_string(null_counts.GetLong(i)); - values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); - } - return fmt::format("{{{}}}", fmt::join(values, ", ")); -} - -Result> StatsFields(const std::shared_ptr& schema) { - return schema->Fields(); -} - -Result> LoadDataSchema(const MetadataSystemTableContext& context, - int64_t schema_id) { - if (schema_id == context.table_schema->Id()) { - return context.table_schema; - } - SchemaManager schema_manager(context.fs, context.table_path, context.branch); - return schema_manager.ReadSchema(schema_id); -} - -Result> ValueStatsFields(const MetadataSystemTableContext& context, - int64_t schema_id) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, - LoadDataSchema(context, schema_id)); - PAIMON_ASSIGN_OR_RAISE(std::vector fields, StatsFields(data_schema)); - return fields; -} - -Result> WriteColsValue( - const std::optional>& write_cols, - const std::shared_ptr& pool) { - if (!write_cols) { - return std::shared_ptr(); - } - return std::make_shared( - InternalRowUtils::ToNotNullStringArrayData(write_cols.value(), pool)); -} - -} // namespace - -OptionsSystemTable::OptionsSystemTable(std::string table_path, - std::shared_ptr table_schema) - : InMemorySystemTable(std::move(table_path)), table_schema_(std::move(table_schema)) {} - -std::string OptionsSystemTable::Name() const { - return kName; -} - -Result> OptionsSystemTable::ArrowSchema() const { - return arrow::schema({arrow::field("key", arrow::utf8(), /*nullable=*/false), - arrow::field("value", arrow::utf8(), /*nullable=*/false)}); -} - -Result> OptionsSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - std::vector rows; - rows.reserve(table_schema_->Options().size()); - for (const auto& [key, value] : table_schema_->Options()) { - GenericRow row(schema->num_fields()); - row.SetField(0, std::string_view(key)); - row.SetField(1, std::string_view(value)); - rows.push_back(std::move(row)); - } - return rows; -} - -SnapshotsSystemTable::SnapshotsSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch) - : InMemorySystemTable(table_path), - context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch))) {} - -std::string SnapshotsSystemTable::Name() const { - return kName; -} - -Result> SnapshotsSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("snapshot_id", arrow::int64(), /*nullable=*/false), - arrow::field("schema_id", arrow::int64(), /*nullable=*/false), - arrow::field("commit_user", arrow::utf8(), /*nullable=*/false), - arrow::field("commit_identifier", arrow::int64(), /*nullable=*/false), - arrow::field("commit_kind", arrow::utf8(), /*nullable=*/false), - arrow::field("commit_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/false), - arrow::field("base_manifest_list", arrow::utf8(), /*nullable=*/false), - arrow::field("delta_manifest_list", arrow::utf8(), /*nullable=*/false), - arrow::field("changelog_manifest_list", arrow::utf8(), /*nullable=*/true), - arrow::field("total_record_count", arrow::int64(), /*nullable=*/true), - arrow::field("delta_record_count", arrow::int64(), /*nullable=*/true), - arrow::field("changelog_record_count", arrow::int64(), /*nullable=*/true), - arrow::field("watermark", arrow::int64(), /*nullable=*/true), - arrow::field("next_row_id", arrow::int64(), /*nullable=*/true), - }); -} - -Result> SnapshotsSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - SnapshotManager snapshot_manager(context_.fs, context_.table_path, context_.branch); - PAIMON_ASSIGN_OR_RAISE(std::vector snapshots, snapshot_manager.GetAllSnapshots()); - std::sort(snapshots.begin(), snapshots.end(), - [](const Snapshot& lhs, const Snapshot& rhs) { return lhs.Id() < rhs.Id(); }); - std::vector rows; - rows.reserve(snapshots.size()); - - for (const auto& snapshot : snapshots) { - GenericRow row(schema->num_fields()); - row.SetField(0, snapshot.Id()); - row.SetField(1, snapshot.SchemaId()); - row.SetField(2, StringValue(snapshot.CommitUser())); - row.SetField(3, snapshot.CommitIdentifier()); - row.SetField(4, StringValue(Snapshot::CommitKind::ToString(snapshot.GetCommitKind()))); - PAIMON_ASSIGN_OR_RAISE(VariantType commit_time, - LocalTimestampMillisValue(snapshot.TimeMillis())); - row.SetField(5, commit_time); - row.SetField(6, StringValue(snapshot.BaseManifestList())); - row.SetField(7, StringValue(snapshot.DeltaManifestList())); - row.SetField(8, OptionalStringValue(snapshot.ChangelogManifestList())); - row.SetField(9, OptionalInt64Value(snapshot.TotalRecordCount())); - row.SetField(10, OptionalInt64Value(snapshot.DeltaRecordCount())); - row.SetField(11, OptionalInt64Value(snapshot.ChangelogRecordCount())); - row.SetField(12, OptionalInt64Value(snapshot.Watermark())); - row.SetField(13, OptionalInt64Value(snapshot.NextRowId())); - rows.push_back(std::move(row)); - } - - return rows; -} - -SchemasSystemTable::SchemasSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch) - : InMemorySystemTable(table_path), - context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch))) {} - -std::string SchemasSystemTable::Name() const { - return kName; -} - -Result> SchemasSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("schema_id", arrow::int64(), /*nullable=*/false), - arrow::field("fields", arrow::utf8(), /*nullable=*/false), - arrow::field("partition_keys", arrow::utf8(), /*nullable=*/false), - arrow::field("primary_keys", arrow::utf8(), /*nullable=*/false), - arrow::field("options", arrow::utf8(), /*nullable=*/false), - arrow::field("comment", arrow::utf8(), /*nullable=*/true), - arrow::field("update_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/false), - }); -} - -Result> SchemasSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - SchemaManager schema_manager(context_.fs, context_.table_path, context_.branch); - PAIMON_ASSIGN_OR_RAISE(std::vector schema_ids, schema_manager.ListAllIds()); - std::sort(schema_ids.begin(), schema_ids.end()); - std::vector rows; - rows.reserve(schema_ids.size()); - - for (int64_t id : schema_ids) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr table_schema, - schema_manager.ReadSchema(id)); - PAIMON_ASSIGN_OR_RAISE(std::string fields_json, JsonString(table_schema->Fields())); - PAIMON_ASSIGN_OR_RAISE(std::string partition_keys_json, - JsonString(table_schema->PartitionKeys())); - PAIMON_ASSIGN_OR_RAISE(std::string primary_keys_json, - JsonString(table_schema->PrimaryKeys())); - PAIMON_ASSIGN_OR_RAISE(std::string options_json, JsonString(table_schema->Options())); - - GenericRow row(schema->num_fields()); - row.SetField(0, table_schema->Id()); - row.SetField(1, StringValue(fields_json)); - row.SetField(2, StringValue(partition_keys_json)); - row.SetField(3, StringValue(primary_keys_json)); - row.SetField(4, StringValue(options_json)); - row.SetField(5, OptionalStringValue(table_schema->Comment())); - PAIMON_ASSIGN_OR_RAISE(VariantType update_time, - LocalTimestampMillisValue(table_schema->TimeMillis())); - row.SetField(6, update_time); - rows.push_back(std::move(row)); - } - - return rows; -} - -TagsSystemTable::TagsSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch) - : InMemorySystemTable(table_path), - context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch))) {} - -std::string TagsSystemTable::Name() const { - return kName; -} - -Result> TagsSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("tag_name", arrow::utf8(), /*nullable=*/false), - arrow::field("snapshot_id", arrow::int64(), /*nullable=*/false), - arrow::field("schema_id", arrow::int64(), /*nullable=*/false), - arrow::field("commit_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/false), - arrow::field("record_count", arrow::int64(), /*nullable=*/true), - arrow::field("create_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/true), - arrow::field("time_retained", arrow::utf8(), /*nullable=*/true), - }); -} - -Result> TagsSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - TagManager tag_manager(context_.fs, context_.table_path, context_.branch); - PAIMON_ASSIGN_OR_RAISE(std::vector tag_names, tag_manager.ListTagNames()); - std::vector rows; - rows.reserve(tag_names.size()); - - for (const auto& name : tag_names) { - PAIMON_ASSIGN_OR_RAISE(Tag tag, tag_manager.GetOrThrow(name)); - PAIMON_ASSIGN_OR_RAISE(std::optional tag_create_time, - OptionalLocalDateTimePartsToTimestampMillis(tag.TagCreateTime())); - GenericRow row(schema->num_fields()); - row.SetField(0, StringValue(name)); - row.SetField(1, tag.Id()); - row.SetField(2, tag.SchemaId()); - PAIMON_ASSIGN_OR_RAISE(VariantType commit_time, - LocalTimestampMillisValue(tag.TimeMillis())); - row.SetField(3, commit_time); - row.SetField(4, OptionalInt64Value(tag.TotalRecordCount())); - row.SetField(5, OptionalTimestampMillisValue(tag_create_time)); - row.SetField(6, OptionalStringValue(OptionalDoubleToString(tag.TagTimeRetained()))); - rows.push_back(std::move(row)); - } - - return rows; -} - -BranchesSystemTable::BranchesSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch) - : InMemorySystemTable(table_path), - context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch))) {} - -std::string BranchesSystemTable::Name() const { - return kName; -} - -Result> BranchesSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("branch_name", arrow::utf8(), /*nullable=*/false), - arrow::field("create_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/false), - }); -} - -Result> BranchesSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - PAIMON_ASSIGN_OR_RAISE(std::vector branches, - BranchManager::ListBranches(context_.fs, context_.table_path)); - std::vector rows; - rows.reserve(branches.size()); - - for (const auto& name : branches) { - PAIMON_ASSIGN_OR_RAISE( - std::unique_ptr branch_status, - context_.fs->GetFileStatus(BranchManager::BranchPath(context_.table_path, name))); - GenericRow row(schema->num_fields()); - row.SetField(0, StringValue(name)); - PAIMON_ASSIGN_OR_RAISE(VariantType create_time, - LocalTimestampMillisValue(branch_status->GetModificationTime())); - row.SetField(1, create_time); - rows.push_back(std::move(row)); - } - - return rows; -} - -ConsumersSystemTable::ConsumersSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch) - : InMemorySystemTable(table_path), - context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch))) {} - -std::string ConsumersSystemTable::Name() const { - return kName; -} - -Result> ConsumersSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("consumer_id", arrow::utf8(), /*nullable=*/false), - arrow::field("next_snapshot_id", arrow::int64(), /*nullable=*/false), - }); -} - -Result> ConsumersSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - ConsumerManager consumer_manager(context_.fs, context_.table_path, context_.branch); - PAIMON_ASSIGN_OR_RAISE(auto consumers, consumer_manager.Consumers()); - std::vector rows; - rows.reserve(consumers.size()); - - for (const auto& [id, snapshot_id] : consumers) { - GenericRow row(schema->num_fields()); - row.SetField(0, StringValue(id)); - row.SetField(1, snapshot_id); - rows.push_back(std::move(row)); - } - - return rows; -} - -ManifestsSystemTable::ManifestsSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch, - std::shared_ptr table_schema, - std::map options) - : InMemorySystemTable(table_path), - context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch), - std::move(table_schema), std::move(options))) {} - -std::string ManifestsSystemTable::Name() const { - return kName; -} - -Result> ManifestsSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("file_name", arrow::utf8(), /*nullable=*/false), - arrow::field("file_size", arrow::int64(), /*nullable=*/false), - arrow::field("num_added_files", arrow::int64(), /*nullable=*/false), - arrow::field("num_deleted_files", arrow::int64(), /*nullable=*/false), - arrow::field("schema_id", arrow::int64(), /*nullable=*/false), - arrow::field("min_partition_stats", arrow::utf8(), /*nullable=*/true), - arrow::field("max_partition_stats", arrow::utf8(), /*nullable=*/true), - arrow::field("min_row_id", arrow::int64(), /*nullable=*/true), - arrow::field("max_row_id", arrow::int64(), /*nullable=*/true), - }); -} - -Result> ManifestsSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, LatestSnapshot(context_)); - if (!snapshot) { - return std::vector(); - } - - std::shared_ptr pool = GetDefaultPool(); - PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, CreateCoreOptions(context_)); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, - CreatePathFactory(context_, core_options, pool)); - PAIMON_ASSIGN_OR_RAISE( - std::vector manifests, - ReadDataManifests(context_, snapshot.value(), path_factory, core_options, pool)); - std::shared_ptr arrow_schema = - DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr partition_schema, - FieldMapping::GetPartitionSchema(arrow_schema, context_.table_schema->PartitionKeys())); - - std::vector rows; - rows.reserve(manifests.size()); - for (const auto& manifest : manifests) { - GenericRow row(schema->num_fields()); - row.SetField(0, StringValue(manifest.FileName())); - row.SetField(1, manifest.FileSize()); - row.SetField(2, manifest.NumAddedFiles()); - row.SetField(3, manifest.NumDeletedFiles()); - row.SetField(4, manifest.SchemaId()); - PAIMON_ASSIGN_OR_RAISE( - VariantType min_partition, - OptionalPartitionStringValue(manifest.PartitionStats().MinValues(), partition_schema)); - PAIMON_ASSIGN_OR_RAISE( - VariantType max_partition, - OptionalPartitionStringValue(manifest.PartitionStats().MaxValues(), partition_schema)); - row.SetField(5, min_partition); - row.SetField(6, max_partition); - row.SetField(7, OptionalInt64Value(manifest.MinRowId())); - row.SetField(8, OptionalInt64Value(manifest.MaxRowId())); - rows.push_back(std::move(row)); - } - return rows; -} - -FilesSystemTable::FilesSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch, std::shared_ptr table_schema, - std::map options) - : InMemorySystemTable(table_path), - context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch), - std::move(table_schema), std::move(options))) {} - -std::string FilesSystemTable::Name() const { - return kName; -} - -Result> FilesSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("partition", arrow::utf8(), /*nullable=*/true), - arrow::field("bucket", arrow::int32(), /*nullable=*/false), - arrow::field("file_path", arrow::utf8(), /*nullable=*/false), - arrow::field("file_format", arrow::utf8(), /*nullable=*/false), - arrow::field("schema_id", arrow::int64(), /*nullable=*/false), - arrow::field("level", arrow::int32(), /*nullable=*/false), - arrow::field("record_count", arrow::int64(), /*nullable=*/false), - arrow::field("file_size_in_bytes", arrow::int64(), /*nullable=*/false), - arrow::field("min_key", arrow::utf8(), /*nullable=*/true), - arrow::field("max_key", arrow::utf8(), /*nullable=*/true), - arrow::field("null_value_counts", arrow::utf8(), /*nullable=*/false), - arrow::field("min_value_stats", arrow::utf8(), /*nullable=*/false), - arrow::field("max_value_stats", arrow::utf8(), /*nullable=*/false), - arrow::field("min_sequence_number", arrow::int64(), /*nullable=*/true), - arrow::field("max_sequence_number", arrow::int64(), /*nullable=*/true), - arrow::field("creation_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/true), - arrow::field("deleteRowCount", arrow::int64(), /*nullable=*/true), - arrow::field("file_source", arrow::utf8(), /*nullable=*/true), - arrow::field("first_row_id", arrow::int64(), /*nullable=*/true), - arrow::field("write_cols", arrow::list(arrow::utf8()), /*nullable=*/true), - }); -} - -Result> FilesSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - std::shared_ptr pool = GetDefaultPool(); - PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, CreateCoreOptions(context_)); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, - CreatePathFactory(context_, core_options, pool)); - PAIMON_ASSIGN_OR_RAISE(std::vector entries, - ReadLatestDataFiles(context_, path_factory, core_options, pool)); - std::shared_ptr arrow_schema = - DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr partition_schema, - FieldMapping::GetPartitionSchema(arrow_schema, context_.table_schema->PartitionKeys())); - - SimpleStatsEvolutions stats_evolutions(context_.table_schema, pool); - std::vector rows; - rows.reserve(entries.size()); - for (const auto& entry : entries) { - if (!(entry.Kind() == FileKind::Add())) { - continue; - } - - const std::shared_ptr& file = entry.File(); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, - LoadDataSchema(context_, file->schema_id)); - PAIMON_ASSIGN_OR_RAISE(std::vector value_stats_fields, - ValueStatsFields(context_, file->schema_id)); - std::shared_ptr stats_evolution = - stats_evolutions.GetOrCreate(data_schema); - PAIMON_ASSIGN_OR_RAISE( - SimpleStatsEvolution::EvolutionStats stats, - stats_evolution->Evolution(file->value_stats, file->row_count, file->value_stats_cols)); - - GenericRow row(schema->num_fields()); - if (context_.table_schema->PartitionKeys().empty()) { - row.SetField(0, NullType()); - } else { - PAIMON_ASSIGN_OR_RAISE(std::string partition, - PartitionString(path_factory, entry.Partition())); - row.SetField(0, StringValue(partition)); - } - row.SetField(1, entry.Bucket()); - PAIMON_ASSIGN_OR_RAISE(std::string file_path, FilePath(path_factory, entry, *file)); - row.SetField(2, StringValue(file_path)); - PAIMON_ASSIGN_OR_RAISE(std::string file_format, file->FileFormat()); - row.SetField(3, StringValue(file_format)); - row.SetField(4, file->schema_id); - row.SetField(5, file->level); - row.SetField(6, file->row_count); - row.SetField(7, file->file_size); - row.SetField(8, OptionalStringValue(OptionalBinaryRowString(file->min_key))); - row.SetField(9, OptionalStringValue(OptionalBinaryRowString(file->max_key))); - PAIMON_ASSIGN_OR_RAISE(std::string null_value_counts, - NullValueCountsString(value_stats_fields, *stats.null_counts)); - row.SetField(10, StringValue(null_value_counts)); - PAIMON_ASSIGN_OR_RAISE(std::string min_value_stats, - FieldsValueMapString(value_stats_fields, *stats.min_values)); - row.SetField(11, StringValue(min_value_stats)); - PAIMON_ASSIGN_OR_RAISE(std::string max_value_stats, - FieldsValueMapString(value_stats_fields, *stats.max_values)); - row.SetField(12, StringValue(max_value_stats)); - row.SetField(13, file->min_sequence_number); - row.SetField(14, file->max_sequence_number); - PAIMON_ASSIGN_OR_RAISE(VariantType creation_time, - LocalTimestampMillisValue(file->creation_time)); - row.SetField(15, creation_time); - row.SetField(16, OptionalInt64Value(file->delete_row_count)); - row.SetField(17, file->file_source ? StringValue(file->file_source.value().ToString()) - : VariantType(NullType())); - row.SetField(18, OptionalInt64Value(file->first_row_id)); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr write_cols, - WriteColsValue(file->write_cols, pool)); - row.SetField(19, write_cols ? VariantType(write_cols) : VariantType(NullType())); - rows.push_back(std::move(row)); - } - return rows; -} - -} // namespace paimon diff --git a/src/paimon/core/table/system/metadata_system_tables.h b/src/paimon/core/table/system/metadata_system_tables.h deleted file mode 100644 index 389ad5a95..000000000 --- a/src/paimon/core/table/system/metadata_system_tables.h +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include "paimon/core/table/system/in_memory_system_table.h" - -namespace paimon { -class FileSystem; -class TableSchema; - -/// System table for `T$options`, exposing the latest base table options as key/value rows. -class OptionsSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "options"; - - OptionsSystemTable(std::string table_path, std::shared_ptr table_schema); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - std::shared_ptr table_schema_; -}; - -/// Shared table metadata location used by metadata system tables. -struct MetadataSystemTableContext { - std::shared_ptr fs; - std::string table_path; - std::string branch; - std::shared_ptr table_schema; - std::map options; -}; - -/// System table for `T$snapshots`, exposing snapshot commit history. -class SnapshotsSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "snapshots"; - - SnapshotsSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - MetadataSystemTableContext context_; -}; - -/// System table for `T$schemas`, exposing schema evolution history. -class SchemasSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "schemas"; - - SchemasSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - MetadataSystemTableContext context_; -}; - -/// System table for `T$tags`, exposing tags and the snapshots they reference. -class TagsSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "tags"; - - TagsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - MetadataSystemTableContext context_; -}; - -/// System table for `T$branches`, exposing table branches including `main`. -class BranchesSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "branches"; - - BranchesSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - MetadataSystemTableContext context_; -}; - -/// System table for `T$consumers`, exposing persisted streaming consumer offsets. -class ConsumersSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "consumers"; - - ConsumersSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - MetadataSystemTableContext context_; -}; - -/// System table for `T$manifests`, exposing data manifest metadata in the latest snapshot. -class ManifestsSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "manifests"; - - ManifestsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, - std::shared_ptr table_schema, - std::map options); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - MetadataSystemTableContext context_; -}; - -/// System table for `T$files`, exposing data file metadata in the latest snapshot. -class FilesSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "files"; - - FilesSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, - std::shared_ptr table_schema, - std::map options); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - MetadataSystemTableContext context_; -}; - -} // namespace paimon diff --git a/src/paimon/core/table/system/options_system_table.cpp b/src/paimon/core/table/system/options_system_table.cpp new file mode 100644 index 000000000..feebb97cf --- /dev/null +++ b/src/paimon/core/table/system/options_system_table.cpp @@ -0,0 +1,53 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/table/system/options_system_table.h" + +#include +#include + +#include "arrow/api.h" +#include "paimon/core/schema/table_schema.h" + +namespace paimon { + +OptionsSystemTable::OptionsSystemTable(std::string table_path, + std::shared_ptr table_schema) + : InMemorySystemTable(std::move(table_path)), table_schema_(std::move(table_schema)) {} + +std::string OptionsSystemTable::Name() const { + return kName; +} + +Result> OptionsSystemTable::ArrowSchema() const { + return arrow::schema({arrow::field("key", arrow::utf8(), /*nullable=*/false), + arrow::field("value", arrow::utf8(), /*nullable=*/false)}); +} + +Result> OptionsSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + std::vector rows; + rows.reserve(table_schema_->Options().size()); + for (const auto& [key, value] : table_schema_->Options()) { + GenericRow row(schema->num_fields()); + row.SetField(0, std::string_view(key)); + row.SetField(1, std::string_view(value)); + rows.push_back(std::move(row)); + } + return rows; +} + +} // namespace paimon diff --git a/src/paimon/core/table/system/options_system_table.h b/src/paimon/core/table/system/options_system_table.h new file mode 100644 index 000000000..a87fd688d --- /dev/null +++ b/src/paimon/core/table/system/options_system_table.h @@ -0,0 +1,43 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/core/table/system/in_memory_system_table.h" + +namespace paimon { +class TableSchema; + +/// System table for `T$options`, exposing the latest base table options as key/value rows. +class OptionsSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "options"; + + OptionsSystemTable(std::string table_path, std::shared_ptr table_schema); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + std::shared_ptr table_schema_; +}; + +} // namespace paimon diff --git a/src/paimon/core/table/system/schemas_system_table.cpp b/src/paimon/core/table/system/schemas_system_table.cpp new file mode 100644 index 000000000..9be51ee61 --- /dev/null +++ b/src/paimon/core/table/system/schemas_system_table.cpp @@ -0,0 +1,87 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/table/system/schemas_system_table.h" + +#include +#include + +#include "arrow/api.h" +#include "paimon/core/schema/schema_manager.h" +#include "paimon/core/schema/table_schema.h" + +namespace paimon { + +SchemasSystemTable::SchemasSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch) + : InMemorySystemTable(table_path), + context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), + std::move(branch))) {} + +std::string SchemasSystemTable::Name() const { + return kName; +} + +Result> SchemasSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("fields", arrow::utf8(), /*nullable=*/false), + arrow::field("partition_keys", arrow::utf8(), /*nullable=*/false), + arrow::field("primary_keys", arrow::utf8(), /*nullable=*/false), + arrow::field("options", arrow::utf8(), /*nullable=*/false), + arrow::field("comment", arrow::utf8(), /*nullable=*/true), + arrow::field("update_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/false), + }); +} + +Result> SchemasSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + SchemaManager schema_manager(context_.fs, context_.table_path, context_.branch); + PAIMON_ASSIGN_OR_RAISE(std::vector schema_ids, schema_manager.ListAllIds()); + std::sort(schema_ids.begin(), schema_ids.end()); + std::vector rows; + rows.reserve(schema_ids.size()); + + for (int64_t id : schema_ids) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr table_schema, + schema_manager.ReadSchema(id)); + PAIMON_ASSIGN_OR_RAISE(std::string fields_json, + SystemTableUtils::JsonString(table_schema->Fields())); + PAIMON_ASSIGN_OR_RAISE(std::string partition_keys_json, + SystemTableUtils::JsonString(table_schema->PartitionKeys())); + PAIMON_ASSIGN_OR_RAISE(std::string primary_keys_json, + SystemTableUtils::JsonString(table_schema->PrimaryKeys())); + PAIMON_ASSIGN_OR_RAISE(std::string options_json, + SystemTableUtils::JsonString(table_schema->Options())); + + GenericRow row(schema->num_fields()); + row.SetField(0, table_schema->Id()); + row.SetField(1, SystemTableUtils::StringValue(fields_json)); + row.SetField(2, SystemTableUtils::StringValue(partition_keys_json)); + row.SetField(3, SystemTableUtils::StringValue(primary_keys_json)); + row.SetField(4, SystemTableUtils::StringValue(options_json)); + row.SetField(5, SystemTableUtils::OptionalStringValue(table_schema->Comment())); + PAIMON_ASSIGN_OR_RAISE(VariantType update_time, SystemTableUtils::LocalTimestampMillisValue( + table_schema->TimeMillis())); + row.SetField(6, update_time); + rows.push_back(std::move(row)); + } + + return rows; +} + +} // namespace paimon diff --git a/src/paimon/core/table/system/schemas_system_table.h b/src/paimon/core/table/system/schemas_system_table.h new file mode 100644 index 000000000..a9a607dc8 --- /dev/null +++ b/src/paimon/core/table/system/schemas_system_table.h @@ -0,0 +1,44 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/core/table/system/in_memory_system_table.h" +#include "paimon/core/table/system/system_table_utils.h" + +namespace paimon { +class FileSystem; + +/// System table for `T$schemas`, exposing schema evolution history. +class SchemasSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "schemas"; + + SchemasSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + SystemTableContext context_; +}; + +} // namespace paimon diff --git a/src/paimon/core/table/system/snapshots_system_table.cpp b/src/paimon/core/table/system/snapshots_system_table.cpp new file mode 100644 index 000000000..4ca7cfa01 --- /dev/null +++ b/src/paimon/core/table/system/snapshots_system_table.cpp @@ -0,0 +1,92 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/table/system/snapshots_system_table.h" + +#include +#include + +#include "arrow/api.h" +#include "paimon/core/snapshot.h" +#include "paimon/core/utils/snapshot_manager.h" + +namespace paimon { + +SnapshotsSystemTable::SnapshotsSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch) + : InMemorySystemTable(table_path), + context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), + std::move(branch))) {} + +std::string SnapshotsSystemTable::Name() const { + return kName; +} + +Result> SnapshotsSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("snapshot_id", arrow::int64(), /*nullable=*/false), + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("commit_user", arrow::utf8(), /*nullable=*/false), + arrow::field("commit_identifier", arrow::int64(), /*nullable=*/false), + arrow::field("commit_kind", arrow::utf8(), /*nullable=*/false), + arrow::field("commit_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/false), + arrow::field("base_manifest_list", arrow::utf8(), /*nullable=*/false), + arrow::field("delta_manifest_list", arrow::utf8(), /*nullable=*/false), + arrow::field("changelog_manifest_list", arrow::utf8(), /*nullable=*/true), + arrow::field("total_record_count", arrow::int64(), /*nullable=*/true), + arrow::field("delta_record_count", arrow::int64(), /*nullable=*/true), + arrow::field("changelog_record_count", arrow::int64(), /*nullable=*/true), + arrow::field("watermark", arrow::int64(), /*nullable=*/true), + arrow::field("next_row_id", arrow::int64(), /*nullable=*/true), + }); +} + +Result> SnapshotsSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + SnapshotManager snapshot_manager(context_.fs, context_.table_path, context_.branch); + PAIMON_ASSIGN_OR_RAISE(std::vector snapshots, snapshot_manager.GetAllSnapshots()); + std::sort(snapshots.begin(), snapshots.end(), + [](const Snapshot& lhs, const Snapshot& rhs) { return lhs.Id() < rhs.Id(); }); + std::vector rows; + rows.reserve(snapshots.size()); + + for (const auto& snapshot : snapshots) { + GenericRow row(schema->num_fields()); + row.SetField(0, snapshot.Id()); + row.SetField(1, snapshot.SchemaId()); + row.SetField(2, SystemTableUtils::StringValue(snapshot.CommitUser())); + row.SetField(3, snapshot.CommitIdentifier()); + row.SetField(4, SystemTableUtils::StringValue( + Snapshot::CommitKind::ToString(snapshot.GetCommitKind()))); + PAIMON_ASSIGN_OR_RAISE(VariantType commit_time, + SystemTableUtils::LocalTimestampMillisValue(snapshot.TimeMillis())); + row.SetField(5, commit_time); + row.SetField(6, SystemTableUtils::StringValue(snapshot.BaseManifestList())); + row.SetField(7, SystemTableUtils::StringValue(snapshot.DeltaManifestList())); + row.SetField(8, SystemTableUtils::OptionalStringValue(snapshot.ChangelogManifestList())); + row.SetField(9, SystemTableUtils::OptionalInt64Value(snapshot.TotalRecordCount())); + row.SetField(10, SystemTableUtils::OptionalInt64Value(snapshot.DeltaRecordCount())); + row.SetField(11, SystemTableUtils::OptionalInt64Value(snapshot.ChangelogRecordCount())); + row.SetField(12, SystemTableUtils::OptionalInt64Value(snapshot.Watermark())); + row.SetField(13, SystemTableUtils::OptionalInt64Value(snapshot.NextRowId())); + rows.push_back(std::move(row)); + } + + return rows; +} + +} // namespace paimon diff --git a/src/paimon/core/table/system/snapshots_system_table.h b/src/paimon/core/table/system/snapshots_system_table.h new file mode 100644 index 000000000..2e2c48f01 --- /dev/null +++ b/src/paimon/core/table/system/snapshots_system_table.h @@ -0,0 +1,45 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/core/table/system/in_memory_system_table.h" +#include "paimon/core/table/system/system_table_utils.h" + +namespace paimon { +class FileSystem; + +/// System table for `T$snapshots`, exposing snapshot commit history. +class SnapshotsSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "snapshots"; + + SnapshotsSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + SystemTableContext context_; +}; + +} // namespace paimon diff --git a/src/paimon/core/table/system/system_table.cpp b/src/paimon/core/table/system/system_table.cpp index 061e6d4f1..35ae430b1 100644 --- a/src/paimon/core/table/system/system_table.cpp +++ b/src/paimon/core/table/system/system_table.cpp @@ -30,8 +30,15 @@ #include "paimon/core/schema/table_schema.h" #include "paimon/core/table/system/audit_log_system_table.h" #include "paimon/core/table/system/binlog_system_table.h" -#include "paimon/core/table/system/metadata_system_tables.h" -#include "paimon/core/utils/branch_manager.h" +#include "paimon/core/table/system/branches_system_table.h" +#include "paimon/core/table/system/consumers_system_table.h" +#include "paimon/core/table/system/files_system_table.h" +#include "paimon/core/table/system/manifests_system_table.h" +#include "paimon/core/table/system/options_system_table.h" +#include "paimon/core/table/system/schemas_system_table.h" +#include "paimon/core/table/system/snapshots_system_table.h" +#include "paimon/core/table/system/system_table_utils.h" +#include "paimon/core/table/system/tags_system_table.h" #include "paimon/status.h" namespace paimon { @@ -46,21 +53,6 @@ struct SystemTableRegistryEntry { SystemTableFactory factory; }; -std::map MergeOptions( - const std::shared_ptr& table_schema, - const std::map& dynamic_options) { - auto options = table_schema->Options(); - for (const auto& [key, value] : dynamic_options) { - options[key] = value; - } - return options; -} - -std::string LoadBranch(const std::map& options) { - auto branch_iter = options.find(Options::BRANCH); - return branch_iter == options.end() ? BranchManager::DEFAULT_MAIN_BRANCH : branch_iter->second; -} - const std::vector& SystemTableRegistry() { static const std::vector registry = { {OptionsSystemTable::kName, @@ -76,7 +68,8 @@ const std::vector& SystemTableRegistry() { const std::map& dynamic_options) -> Result> { return std::make_shared( - fs, table_path, table_schema, MergeOptions(table_schema, dynamic_options)); + fs, table_path, table_schema, + SystemTableUtils::MergeOptions(table_schema, dynamic_options)); }}, {BinlogSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, @@ -84,55 +77,62 @@ const std::vector& SystemTableRegistry() { const std::map& dynamic_options) -> Result> { return std::make_shared( - fs, table_path, table_schema, MergeOptions(table_schema, dynamic_options)); + fs, table_path, table_schema, + SystemTableUtils::MergeOptions(table_schema, dynamic_options)); }}, {SnapshotsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options)); + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options)); }}, {SchemasSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options)); + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options)); }}, {TagsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options)); + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options)); }}, {BranchesSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options)); + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options)); }}, {ConsumersSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options)); + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options)); }}, {ManifestsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options), + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options), table_schema, std::move(options)); }}, {FilesSystemTable::kName, @@ -140,8 +140,9 @@ const std::vector& SystemTableRegistry() { const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options), + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options), table_schema, std::move(options)); }}, }; @@ -203,7 +204,7 @@ Result> SystemTableLoader::LoadFromPath( } const auto& parsed = system_table_path.value(); SchemaManager schema_manager(fs, parsed.table_path, - parsed.branch.value_or(BranchManager::DEFAULT_MAIN_BRANCH)); + parsed.branch.value_or(SystemTableUtils::DefaultBranch())); PAIMON_ASSIGN_OR_RAISE(std::optional> latest_schema, schema_manager.Latest()); if (!latest_schema) { diff --git a/src/paimon/core/table/system/system_table_utils.cpp b/src/paimon/core/table/system/system_table_utils.cpp new file mode 100644 index 000000000..a06361588 --- /dev/null +++ b/src/paimon/core/table/system/system_table_utils.cpp @@ -0,0 +1,373 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/table/system/system_table_utils.h" + +#include +#include + +#include "fmt/format.h" +#include "fmt/ranges.h" +#include "paimon/common/data/binary_array.h" +#include "paimon/common/data/binary_row.h" +#include "paimon/common/data/binary_string.h" +#include "paimon/common/data/internal_array.h" +#include "paimon/common/data/internal_row.h" +#include "paimon/common/types/data_field.h" +#include "paimon/common/utils/binary_row_partition_computer.h" +#include "paimon/common/utils/date_time_utils.h" +#include "paimon/common/utils/internal_row_utils.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/core/core_options.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/file_entry.h" +#include "paimon/core/manifest/manifest_entry.h" +#include "paimon/core/manifest/manifest_file.h" +#include "paimon/core/manifest/manifest_file_meta.h" +#include "paimon/core/manifest/manifest_list.h" +#include "paimon/core/schema/schema_manager.h" +#include "paimon/core/schema/table_schema.h" +#include "paimon/core/snapshot.h" +#include "paimon/core/utils/branch_manager.h" +#include "paimon/core/utils/field_mapping.h" +#include "paimon/core/utils/file_store_path_factory.h" +#include "paimon/core/utils/snapshot_manager.h" +#include "paimon/fs/file_system.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/status.h" + +namespace paimon { +namespace { + +constexpr int32_t kMaxPartitionStatsLength = 255; + +} // namespace + +SystemTableContext SystemTableUtils::CreateContext(std::shared_ptr fs, + std::string table_path, std::string branch) { + return { + std::move(fs), std::move(table_path), BranchManager::NormalizeBranch(branch), nullptr, {}, + }; +} + +SystemTableContext SystemTableUtils::CreateContext(std::shared_ptr fs, + std::string table_path, std::string branch, + std::shared_ptr table_schema, + std::map options) { + return { + std::move(fs), std::move(table_path), BranchManager::NormalizeBranch(branch), + std::move(table_schema), std::move(options), + }; +} + +std::map SystemTableUtils::MergeOptions( + const std::shared_ptr& table_schema, + const std::map& dynamic_options) { + auto options = table_schema->Options(); + for (const auto& [key, value] : dynamic_options) { + options[key] = value; + } + return options; +} + +std::string SystemTableUtils::DefaultBranch() { + return BranchManager::DEFAULT_MAIN_BRANCH; +} + +std::string SystemTableUtils::LoadBranch(const std::map& options) { + auto branch_iter = options.find(Options::BRANCH); + return branch_iter == options.end() ? DefaultBranch() : branch_iter->second; +} + +Result SystemTableUtils::LocalDateTimePartsToTimestampMillis( + const std::vector& parts) { + if (parts.size() < 6) { + return Status::Invalid("tag create time requires at least 6 date-time fields"); + } + + int64_t year = parts[0]; + int64_t month = parts[1]; + int64_t day = parts[2]; + int64_t hour = parts[3]; + int64_t minute = parts[4]; + int64_t second = parts[5]; + int64_t nanos = parts.size() > 6 ? parts[6] : 0; + auto is_leap_year = [](int64_t value) { + return value % 4 == 0 && (value % 100 != 0 || value % 400 == 0); + }; + int64_t days_in_month[] = {31, is_leap_year(year) ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, + 31}; + if (month < 1 || month > 12 || day < 1 || day > days_in_month[month - 1] || hour < 0 || + hour > 23 || minute < 0 || minute > 59 || second < 0 || second > 59 || nanos < 0 || + nanos > 999999999) { + return Status::Invalid("invalid tag create time fields"); + } + + year -= month <= 2 ? 1 : 0; + int64_t era = (year >= 0 ? year : year - 399) / 400; + auto year_of_era = static_cast(year - era * 400); + auto month_prime = static_cast(month + (month > 2 ? -3 : 9)); + uint32_t day_of_year = (153 * month_prime + 2) / 5 + static_cast(day) - 1; + uint32_t day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; + int64_t epoch_day = era * 146097 + static_cast(day_of_era) - 719468; + return epoch_day * DateTimeUtils::MILLIS_PER_DAY + hour * 3600000 + minute * 60000 + + second * 1000 + nanos / 1000000; +} + +Result> SystemTableUtils::OptionalLocalDateTimePartsToTimestampMillis( + const std::optional>& parts) { + if (!parts) { + return std::optional(); + } + PAIMON_ASSIGN_OR_RAISE(int64_t timestamp_millis, + LocalDateTimePartsToTimestampMillis(parts.value())); + return std::optional(timestamp_millis); +} + +std::optional SystemTableUtils::OptionalDoubleToString( + const std::optional& value) { + if (!value) { + return std::optional(); + } + return std::to_string(value.value()); +} + +VariantType SystemTableUtils::OptionalInt64Value(const std::optional& value) { + if (!value) { + return NullType(); + } + return value.value(); +} + +VariantType SystemTableUtils::StringValue(const std::string& value) { + return BinaryString::FromString(value, GetDefaultPool().get()); +} + +VariantType SystemTableUtils::OptionalStringValue(const std::optional& value) { + if (!value) { + return NullType(); + } + return StringValue(value.value()); +} + +VariantType SystemTableUtils::TimestampMillisValue(int64_t value) { + return Timestamp::FromEpochMillis(value); +} + +Result SystemTableUtils::LocalTimestampMillisValue(int64_t epoch_millis) { + PAIMON_ASSIGN_OR_RAISE( + Timestamp local_timestamp, + DateTimeUtils::ToLocalTimestamp(Timestamp::FromEpochMillis(epoch_millis))); + return TimestampMillisValue(local_timestamp.GetMillisecond()); +} + +Result SystemTableUtils::LocalTimestampMillisValue(const Timestamp& local_timestamp) { + PAIMON_ASSIGN_OR_RAISE(Timestamp utc_timestamp, DateTimeUtils::ToUTCTimestamp(local_timestamp)); + int64_t epoch_millis = utc_timestamp.GetMillisecond(); + return LocalTimestampMillisValue(epoch_millis); +} + +VariantType SystemTableUtils::OptionalTimestampMillisValue(const std::optional& value) { + if (!value) { + return NullType(); + } + return TimestampMillisValue(value.value()); +} + +Result SystemTableUtils::CreateCoreOptions(const SystemTableContext& context) { + return CoreOptions::FromMap(context.options, context.fs); +} + +Result> SystemTableUtils::CreatePathFactory( + const SystemTableContext& context, const CoreOptions& core_options, + const std::shared_ptr& pool) { + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, + core_options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + core_options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr path_factory, + FileStorePathFactory::Create( + context.table_path, arrow_schema, context.table_schema->PartitionKeys(), + core_options.GetPartitionDefaultName(), core_options.GetFileFormat()->Identifier(), + core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), + external_paths, global_index_external_path, core_options.IndexFileInDataFileDir(), + pool)); + return std::shared_ptr(std::move(path_factory)); +} + +Result> SystemTableUtils::LatestSnapshot( + const SystemTableContext& context) { + SnapshotManager snapshot_manager(context.fs, context.table_path, context.branch); + return snapshot_manager.LatestSnapshot(); +} + +Result> SystemTableUtils::ReadDataManifests( + const SystemTableContext& context, const Snapshot& snapshot, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr manifest_list, + ManifestList::Create(context.fs, core_options.GetManifestFormat(), + core_options.GetManifestCompression(), path_factory, pool)); + std::vector manifests; + PAIMON_RETURN_NOT_OK(manifest_list->ReadDataManifests(snapshot, &manifests)); + return manifests; +} + +Result> SystemTableUtils::ReadLatestManifestEntries( + const SystemTableContext& context, const std::shared_ptr& path_factory, + const CoreOptions& core_options, const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, LatestSnapshot(context)); + if (!snapshot) { + return std::vector(); + } + PAIMON_ASSIGN_OR_RAISE( + std::vector manifests, + ReadDataManifests(context, snapshot.value(), path_factory, core_options, pool)); + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr partition_schema, + FieldMapping::GetPartitionSchema(arrow_schema, context.table_schema->PartitionKeys())); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr manifest_file, + ManifestFile::Create(context.fs, core_options.GetManifestFormat(), + core_options.GetManifestCompression(), path_factory, + core_options.GetManifestTargetFileSize(), pool, + core_options, partition_schema)); + std::vector entries; + for (const auto& manifest : manifests) { + PAIMON_RETURN_NOT_OK( + manifest_file->Read(manifest.FileName(), /*filter=*/nullptr, &entries)); + } + return entries; +} + +Result> SystemTableUtils::ReadLatestDataFiles( + const SystemTableContext& context, const std::shared_ptr& path_factory, + const CoreOptions& core_options, const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE(std::vector entries, + ReadLatestManifestEntries(context, path_factory, core_options, pool)); + std::vector merged_entries; + PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(entries, &merged_entries)); + return merged_entries; +} + +std::optional SystemTableUtils::OptionalBinaryRowString(const BinaryRow& row) { + if (row.GetFieldCount() <= 0) { + return std::nullopt; + } + return row.ToString(); +} + +Result> SystemTableUtils::OptionalPartitionString( + const BinaryRow& row, const std::shared_ptr& partition_schema) { + if (row.GetFieldCount() <= 0) { + return std::optional(); + } + PAIMON_ASSIGN_OR_RAISE(std::string value, + BinaryRowPartitionComputer::PartToSimpleString( + partition_schema, row, ",", kMaxPartitionStatsLength)); + return std::optional(value); +} + +Result SystemTableUtils::OptionalPartitionStringValue( + const BinaryRow& row, const std::shared_ptr& partition_schema) { + PAIMON_ASSIGN_OR_RAISE(std::optional value, + OptionalPartitionString(row, partition_schema)); + return OptionalStringValue(value); +} + +Result SystemTableUtils::PartitionString( + const std::shared_ptr& path_factory, const BinaryRow& partition) { + PAIMON_ASSIGN_OR_RAISE(std::string value, path_factory->GetPartitionString(partition)); + return value; +} + +Result SystemTableUtils::FilePath( + const std::shared_ptr& path_factory, const ManifestEntry& entry, + const DataFileMeta& file) { + if (file.external_path) { + return file.external_path.value(); + } + PAIMON_ASSIGN_OR_RAISE(std::string bucket_path, + path_factory->BucketPath(entry.Partition(), entry.Bucket())); + return PathUtil::JoinPath(bucket_path, file.file_name); +} + +Result SystemTableUtils::FieldsValueMapString(const std::vector& fields, + const InternalRow& row) { + std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields); + PAIMON_ASSIGN_OR_RAISE(std::vector getters, + InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/false)); + std::vector values; + values.reserve(fields.size()); + for (size_t i = 0; i < fields.size(); ++i) { + std::string value = "null"; + if (!row.IsNullAt(i)) { + VariantType field_value = getters[i](row); + if (std::holds_alternative(field_value)) { + value = std::string(std::get(field_value)); + } else { + value = DataDefine::VariantValueToString(field_value); + } + } + values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + } + return fmt::format("{{{}}}", fmt::join(values, ", ")); +} + +Result SystemTableUtils::NullValueCountsString(const std::vector& fields, + const InternalArray& null_counts) { + std::vector values; + values.reserve(fields.size()); + for (size_t i = 0; i < fields.size(); ++i) { + std::string value = + null_counts.IsNullAt(i) ? "null" : std::to_string(null_counts.GetLong(i)); + values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + } + return fmt::format("{{{}}}", fmt::join(values, ", ")); +} + +Result> SystemTableUtils::LoadDataSchema( + const SystemTableContext& context, int64_t schema_id) { + if (schema_id == context.table_schema->Id()) { + return context.table_schema; + } + SchemaManager schema_manager(context.fs, context.table_path, context.branch); + return schema_manager.ReadSchema(schema_id); +} + +Result> SystemTableUtils::ValueStatsFields(const SystemTableContext& context, + int64_t schema_id) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, + LoadDataSchema(context, schema_id)); + return data_schema->Fields(); +} + +Result> SystemTableUtils::WriteColsValue( + const std::optional>& write_cols, + const std::shared_ptr& pool) { + if (!write_cols) { + return std::shared_ptr(); + } + return std::make_shared( + InternalRowUtils::ToNotNullStringArrayData(write_cols.value(), pool)); +} + +} // namespace paimon diff --git a/src/paimon/core/table/system/system_table_utils.h b/src/paimon/core/table/system/system_table_utils.h new file mode 100644 index 000000000..437d3dbfe --- /dev/null +++ b/src/paimon/core/table/system/system_table_utils.h @@ -0,0 +1,148 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paimon/common/data/data_define.h" +#include "paimon/common/types/data_field.h" +#include "paimon/common/utils/rapidjson_util.h" +#include "paimon/core/core_options.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/manifest_entry.h" +#include "paimon/core/manifest/manifest_file_meta.h" +#include "paimon/core/snapshot.h" +#include "paimon/data/timestamp.h" +#include "paimon/result.h" +#include "paimon/status.h" +#include "rapidjson/document.h" +#include "rapidjson/stringbuffer.h" +#include "rapidjson/writer.h" + +namespace arrow { +class Schema; +} // namespace arrow + +namespace paimon { +class BinaryRow; +class FileStorePathFactory; +class FileSystem; +class InternalArray; +class InternalRow; +class MemoryPool; +class TableSchema; + +/// Shared base table metadata used by table-scoped system tables. +struct SystemTableContext { + std::shared_ptr fs; + std::string table_path; + std::string branch; + std::shared_ptr table_schema; + std::map options; +}; + +/// Utility methods shared by system table implementations. +class SystemTableUtils { + public: + SystemTableUtils() = delete; + ~SystemTableUtils() = delete; + + static SystemTableContext CreateContext(std::shared_ptr fs, std::string table_path, + std::string branch); + static SystemTableContext CreateContext(std::shared_ptr fs, std::string table_path, + std::string branch, + std::shared_ptr table_schema, + std::map options); + static std::map MergeOptions( + const std::shared_ptr& table_schema, + const std::map& dynamic_options); + static std::string DefaultBranch(); + static std::string LoadBranch(const std::map& options); + + template + static Result JsonString(const T& value) { + rapidjson::Document document; + auto json_value = RapidJsonUtil::SerializeValue(value, &document.GetAllocator()); + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + if (!json_value.Accept(writer)) { + return Status::Invalid("failed to serialize metadata system table value"); + } + return std::string(buffer.GetString(), buffer.GetSize()); + } + + static Result> OptionalLocalDateTimePartsToTimestampMillis( + const std::optional>& parts); + static std::optional OptionalDoubleToString(const std::optional& value); + static VariantType OptionalInt64Value(const std::optional& value); + static VariantType StringValue(const std::string& value); + static VariantType OptionalStringValue(const std::optional& value); + static VariantType TimestampMillisValue(int64_t value); + static Result LocalTimestampMillisValue(int64_t epoch_millis); + static Result LocalTimestampMillisValue(const Timestamp& local_timestamp); + static VariantType OptionalTimestampMillisValue(const std::optional& value); + + static Result CreateCoreOptions(const SystemTableContext& context); + static Result> CreatePathFactory( + const SystemTableContext& context, const CoreOptions& core_options, + const std::shared_ptr& pool); + static Result> LatestSnapshot(const SystemTableContext& context); + static Result> ReadDataManifests( + const SystemTableContext& context, const Snapshot& snapshot, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool); + static Result> ReadLatestDataFiles( + const SystemTableContext& context, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool); + + static std::optional OptionalBinaryRowString(const BinaryRow& row); + static Result OptionalPartitionStringValue( + const BinaryRow& row, const std::shared_ptr& partition_schema); + static Result PartitionString( + const std::shared_ptr& path_factory, const BinaryRow& partition); + static Result FilePath(const std::shared_ptr& path_factory, + const ManifestEntry& entry, const DataFileMeta& file); + static Result FieldsValueMapString(const std::vector& fields, + const InternalRow& row); + static Result NullValueCountsString(const std::vector& fields, + const InternalArray& null_counts); + static Result> LoadDataSchema(const SystemTableContext& context, + int64_t schema_id); + static Result> ValueStatsFields(const SystemTableContext& context, + int64_t schema_id); + static Result> WriteColsValue( + const std::optional>& write_cols, + const std::shared_ptr& pool); + + private: + static Result LocalDateTimePartsToTimestampMillis(const std::vector& parts); + static Result> OptionalPartitionString( + const BinaryRow& row, const std::shared_ptr& partition_schema); + static Result> ReadLatestManifestEntries( + const SystemTableContext& context, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool); +}; + +} // namespace paimon diff --git a/src/paimon/core/table/system/tags_system_table.cpp b/src/paimon/core/table/system/tags_system_table.cpp new file mode 100644 index 000000000..e40158270 --- /dev/null +++ b/src/paimon/core/table/system/tags_system_table.cpp @@ -0,0 +1,80 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/table/system/tags_system_table.h" + +#include + +#include "arrow/api.h" +#include "paimon/core/tag/tag.h" +#include "paimon/core/utils/tag_manager.h" + +namespace paimon { + +TagsSystemTable::TagsSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch) + : InMemorySystemTable(table_path), + context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), + std::move(branch))) {} + +std::string TagsSystemTable::Name() const { + return kName; +} + +Result> TagsSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("tag_name", arrow::utf8(), /*nullable=*/false), + arrow::field("snapshot_id", arrow::int64(), /*nullable=*/false), + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("commit_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/false), + arrow::field("record_count", arrow::int64(), /*nullable=*/true), + arrow::field("create_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/true), + arrow::field("time_retained", arrow::utf8(), /*nullable=*/true), + }); +} + +Result> TagsSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + TagManager tag_manager(context_.fs, context_.table_path, context_.branch); + PAIMON_ASSIGN_OR_RAISE(std::vector tag_names, tag_manager.ListTagNames()); + std::vector rows; + rows.reserve(tag_names.size()); + + for (const auto& name : tag_names) { + PAIMON_ASSIGN_OR_RAISE(Tag tag, tag_manager.GetOrThrow(name)); + PAIMON_ASSIGN_OR_RAISE( + std::optional tag_create_time, + SystemTableUtils::OptionalLocalDateTimePartsToTimestampMillis(tag.TagCreateTime())); + GenericRow row(schema->num_fields()); + row.SetField(0, SystemTableUtils::StringValue(name)); + row.SetField(1, tag.Id()); + row.SetField(2, tag.SchemaId()); + PAIMON_ASSIGN_OR_RAISE(VariantType commit_time, + SystemTableUtils::LocalTimestampMillisValue(tag.TimeMillis())); + row.SetField(3, commit_time); + row.SetField(4, SystemTableUtils::OptionalInt64Value(tag.TotalRecordCount())); + row.SetField(5, SystemTableUtils::OptionalTimestampMillisValue(tag_create_time)); + row.SetField(6, SystemTableUtils::OptionalStringValue( + SystemTableUtils::OptionalDoubleToString(tag.TagTimeRetained()))); + rows.push_back(std::move(row)); + } + + return rows; +} + +} // namespace paimon diff --git a/src/paimon/core/table/system/tags_system_table.h b/src/paimon/core/table/system/tags_system_table.h new file mode 100644 index 000000000..86a727958 --- /dev/null +++ b/src/paimon/core/table/system/tags_system_table.h @@ -0,0 +1,44 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/core/table/system/in_memory_system_table.h" +#include "paimon/core/table/system/system_table_utils.h" + +namespace paimon { +class FileSystem; + +/// System table for `T$tags`, exposing tags and the snapshots they reference. +class TagsSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "tags"; + + TagsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + SystemTableContext context_; +}; + +} // namespace paimon diff --git a/test/inte/read_inte_test.cpp b/test/inte/read_inte_test.cpp index 2bf5caad0..e1d927203 100644 --- a/test/inte/read_inte_test.cpp +++ b/test/inte/read_inte_test.cpp @@ -720,7 +720,7 @@ TEST(SystemTableReadInteTest, TestReadMetadataSystemTables) { ASSERT_TRUE(manifest_file_size_array); ASSERT_TRUE(manifest_num_added_files_array); ASSERT_TRUE(manifest_schema_id_array); - ASSERT_TRUE(manifest_file_name_array->GetString(0).find("manifest-") == 0); + ASSERT_EQ(manifest_file_name_array->GetString(0).find("manifest-"), 0); ASSERT_GT(manifest_file_size_array->Value(0), 0); ASSERT_GE(manifest_num_added_files_array->Value(0), 1); ASSERT_EQ(manifest_schema_id_array->Value(0), 0); From 866f81780832bdbfc27795f9047fb54935e732b4 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 27 May 2026 20:51:29 +0800 Subject: [PATCH 3/8] Narrow system table utilities --- .../core/table/system/files_system_table.cpp | 176 +++++++++++-- .../table/system/manifests_system_table.cpp | 37 ++- .../table/system/schemas_system_table.cpp | 30 ++- src/paimon/core/table/system/system_table.cpp | 66 +++-- .../core/table/system/system_table_utils.cpp | 246 +----------------- .../core/table/system/system_table_utils.h | 64 ----- .../core/table/system/tags_system_table.cpp | 74 +++++- 7 files changed, 320 insertions(+), 373 deletions(-) diff --git a/src/paimon/core/table/system/files_system_table.cpp b/src/paimon/core/table/system/files_system_table.cpp index 7effaec6a..e209c30f0 100644 --- a/src/paimon/core/table/system/files_system_table.cpp +++ b/src/paimon/core/table/system/files_system_table.cpp @@ -19,13 +19,23 @@ #include #include "arrow/api.h" +#include "fmt/format.h" +#include "fmt/ranges.h" +#include "paimon/common/data/binary_array.h" +#include "paimon/common/data/binary_row.h" #include "paimon/common/data/data_define.h" #include "paimon/common/data/internal_array.h" +#include "paimon/common/data/internal_row.h" #include "paimon/common/types/data_field.h" +#include "paimon/common/utils/internal_row_utils.h" +#include "paimon/common/utils/path_util.h" #include "paimon/core/core_options.h" #include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/file_entry.h" #include "paimon/core/manifest/file_kind.h" #include "paimon/core/manifest/manifest_entry.h" +#include "paimon/core/manifest/manifest_file.h" +#include "paimon/core/schema/schema_manager.h" #include "paimon/core/schema/table_schema.h" #include "paimon/core/stats/simple_stats_evolutions.h" #include "paimon/core/utils/field_mapping.h" @@ -33,6 +43,131 @@ #include "paimon/memory/memory_pool.h" namespace paimon { +namespace { + +Result> ReadLatestManifestEntries( + const SystemTableContext& context, const std::shared_ptr& path_factory, + const CoreOptions& core_options, const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, + SystemTableUtils::LatestSnapshot(context)); + if (!snapshot) { + return std::vector(); + } + PAIMON_ASSIGN_OR_RAISE(std::vector manifests, + SystemTableUtils::ReadDataManifests(context, snapshot.value(), + path_factory, core_options, pool)); + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr partition_schema, + FieldMapping::GetPartitionSchema(arrow_schema, context.table_schema->PartitionKeys())); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr manifest_file, + ManifestFile::Create(context.fs, core_options.GetManifestFormat(), + core_options.GetManifestCompression(), path_factory, + core_options.GetManifestTargetFileSize(), pool, + core_options, partition_schema)); + std::vector entries; + for (const auto& manifest : manifests) { + PAIMON_RETURN_NOT_OK( + manifest_file->Read(manifest.FileName(), /*filter=*/nullptr, &entries)); + } + return entries; +} + +Result> ReadLatestDataFiles( + const SystemTableContext& context, const std::shared_ptr& path_factory, + const CoreOptions& core_options, const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE(std::vector entries, + ReadLatestManifestEntries(context, path_factory, core_options, pool)); + std::vector merged_entries; + PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(entries, &merged_entries)); + return merged_entries; +} + +std::optional OptionalBinaryRowString(const BinaryRow& row) { + if (row.GetFieldCount() <= 0) { + return std::nullopt; + } + return row.ToString(); +} + +Result PartitionString(const std::shared_ptr& path_factory, + const BinaryRow& partition) { + PAIMON_ASSIGN_OR_RAISE(std::string value, path_factory->GetPartitionString(partition)); + return value; +} + +Result FilePath(const std::shared_ptr& path_factory, + const ManifestEntry& entry, const DataFileMeta& file) { + if (file.external_path) { + return file.external_path.value(); + } + PAIMON_ASSIGN_OR_RAISE(std::string bucket_path, + path_factory->BucketPath(entry.Partition(), entry.Bucket())); + return PathUtil::JoinPath(bucket_path, file.file_name); +} + +Result FieldsValueMapString(const std::vector& fields, + const InternalRow& row) { + std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields); + PAIMON_ASSIGN_OR_RAISE(std::vector getters, + InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/false)); + std::vector values; + values.reserve(fields.size()); + for (size_t i = 0; i < fields.size(); ++i) { + std::string value = "null"; + if (!row.IsNullAt(i)) { + VariantType field_value = getters[i](row); + if (std::holds_alternative(field_value)) { + value = std::string(std::get(field_value)); + } else { + value = DataDefine::VariantValueToString(field_value); + } + } + values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + } + return fmt::format("{{{}}}", fmt::join(values, ", ")); +} + +Result NullValueCountsString(const std::vector& fields, + const InternalArray& null_counts) { + std::vector values; + values.reserve(fields.size()); + for (size_t i = 0; i < fields.size(); ++i) { + std::string value = + null_counts.IsNullAt(i) ? "null" : std::to_string(null_counts.GetLong(i)); + values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + } + return fmt::format("{{{}}}", fmt::join(values, ", ")); +} + +Result> LoadDataSchema(const SystemTableContext& context, + int64_t schema_id) { + if (schema_id == context.table_schema->Id()) { + return context.table_schema; + } + SchemaManager schema_manager(context.fs, context.table_path, context.branch); + return schema_manager.ReadSchema(schema_id); +} + +Result> ValueStatsFields(const SystemTableContext& context, + int64_t schema_id) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, + LoadDataSchema(context, schema_id)); + return data_schema->Fields(); +} + +Result> WriteColsValue( + const std::optional>& write_cols, + const std::shared_ptr& pool) { + if (!write_cols) { + return std::shared_ptr(); + } + return std::make_shared( + InternalRowUtils::ToNotNullStringArrayData(write_cols.value(), pool)); +} + +} // namespace FilesSystemTable::FilesSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, std::shared_ptr table_schema, @@ -78,9 +213,8 @@ Result> FilesSystemTable::BuildRows() const { PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, SystemTableUtils::CreateCoreOptions(context_)); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, SystemTableUtils::CreatePathFactory(context_, core_options, pool)); - PAIMON_ASSIGN_OR_RAISE( - std::vector entries, - SystemTableUtils::ReadLatestDataFiles(context_, path_factory, core_options, pool)); + PAIMON_ASSIGN_OR_RAISE(std::vector entries, + ReadLatestDataFiles(context_, path_factory, core_options, pool)); std::shared_ptr arrow_schema = DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); PAIMON_ASSIGN_OR_RAISE( @@ -97,9 +231,9 @@ Result> FilesSystemTable::BuildRows() const { const std::shared_ptr& file = entry.File(); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, - SystemTableUtils::LoadDataSchema(context_, file->schema_id)); + LoadDataSchema(context_, file->schema_id)); PAIMON_ASSIGN_OR_RAISE(std::vector value_stats_fields, - SystemTableUtils::ValueStatsFields(context_, file->schema_id)); + ValueStatsFields(context_, file->schema_id)); std::shared_ptr stats_evolution = stats_evolutions.GetOrCreate(data_schema); PAIMON_ASSIGN_OR_RAISE( @@ -110,13 +244,12 @@ Result> FilesSystemTable::BuildRows() const { if (context_.table_schema->PartitionKeys().empty()) { row.SetField(0, NullType()); } else { - PAIMON_ASSIGN_OR_RAISE(std::string partition, SystemTableUtils::PartitionString( - path_factory, entry.Partition())); + PAIMON_ASSIGN_OR_RAISE(std::string partition, + PartitionString(path_factory, entry.Partition())); row.SetField(0, SystemTableUtils::StringValue(partition)); } row.SetField(1, entry.Bucket()); - PAIMON_ASSIGN_OR_RAISE(std::string file_path, - SystemTableUtils::FilePath(path_factory, entry, *file)); + PAIMON_ASSIGN_OR_RAISE(std::string file_path, FilePath(path_factory, entry, *file)); row.SetField(2, SystemTableUtils::StringValue(file_path)); PAIMON_ASSIGN_OR_RAISE(std::string file_format, file->FileFormat()); row.SetField(3, SystemTableUtils::StringValue(file_format)); @@ -124,21 +257,18 @@ Result> FilesSystemTable::BuildRows() const { row.SetField(5, file->level); row.SetField(6, file->row_count); row.SetField(7, file->file_size); - row.SetField(8, SystemTableUtils::OptionalStringValue( - SystemTableUtils::OptionalBinaryRowString(file->min_key))); - row.SetField(9, SystemTableUtils::OptionalStringValue( - SystemTableUtils::OptionalBinaryRowString(file->max_key))); - PAIMON_ASSIGN_OR_RAISE( - std::string null_value_counts, - SystemTableUtils::NullValueCountsString(value_stats_fields, *stats.null_counts)); + row.SetField(8, + SystemTableUtils::OptionalStringValue(OptionalBinaryRowString(file->min_key))); + row.SetField(9, + SystemTableUtils::OptionalStringValue(OptionalBinaryRowString(file->max_key))); + PAIMON_ASSIGN_OR_RAISE(std::string null_value_counts, + NullValueCountsString(value_stats_fields, *stats.null_counts)); row.SetField(10, SystemTableUtils::StringValue(null_value_counts)); - PAIMON_ASSIGN_OR_RAISE( - std::string min_value_stats, - SystemTableUtils::FieldsValueMapString(value_stats_fields, *stats.min_values)); + PAIMON_ASSIGN_OR_RAISE(std::string min_value_stats, + FieldsValueMapString(value_stats_fields, *stats.min_values)); row.SetField(11, SystemTableUtils::StringValue(min_value_stats)); - PAIMON_ASSIGN_OR_RAISE( - std::string max_value_stats, - SystemTableUtils::FieldsValueMapString(value_stats_fields, *stats.max_values)); + PAIMON_ASSIGN_OR_RAISE(std::string max_value_stats, + FieldsValueMapString(value_stats_fields, *stats.max_values)); row.SetField(12, SystemTableUtils::StringValue(max_value_stats)); row.SetField(13, file->min_sequence_number); row.SetField(14, file->max_sequence_number); @@ -151,7 +281,7 @@ Result> FilesSystemTable::BuildRows() const { : VariantType(NullType())); row.SetField(18, SystemTableUtils::OptionalInt64Value(file->first_row_id)); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr write_cols, - SystemTableUtils::WriteColsValue(file->write_cols, pool)); + WriteColsValue(file->write_cols, pool)); row.SetField(19, write_cols ? VariantType(write_cols) : VariantType(NullType())); rows.push_back(std::move(row)); } diff --git a/src/paimon/core/table/system/manifests_system_table.cpp b/src/paimon/core/table/system/manifests_system_table.cpp index a67b5486c..9ffbdcfbd 100644 --- a/src/paimon/core/table/system/manifests_system_table.cpp +++ b/src/paimon/core/table/system/manifests_system_table.cpp @@ -19,7 +19,9 @@ #include #include "arrow/api.h" +#include "paimon/common/data/binary_row.h" #include "paimon/common/types/data_field.h" +#include "paimon/common/utils/binary_row_partition_computer.h" #include "paimon/core/core_options.h" #include "paimon/core/manifest/manifest_file_meta.h" #include "paimon/core/schema/table_schema.h" @@ -29,6 +31,29 @@ #include "paimon/memory/memory_pool.h" namespace paimon { +namespace { + +constexpr int32_t kMaxPartitionStatsLength = 255; + +Result> OptionalPartitionString( + const BinaryRow& row, const std::shared_ptr& partition_schema) { + if (row.GetFieldCount() <= 0) { + return std::optional(); + } + PAIMON_ASSIGN_OR_RAISE(std::string value, + BinaryRowPartitionComputer::PartToSimpleString( + partition_schema, row, ",", kMaxPartitionStatsLength)); + return std::optional(value); +} + +Result OptionalPartitionStringValue( + const BinaryRow& row, const std::shared_ptr& partition_schema) { + PAIMON_ASSIGN_OR_RAISE(std::optional value, + OptionalPartitionString(row, partition_schema)); + return SystemTableUtils::OptionalStringValue(value); +} + +} // namespace ManifestsSystemTable::ManifestsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, @@ -87,12 +112,12 @@ Result> ManifestsSystemTable::BuildRows() const { row.SetField(2, manifest.NumAddedFiles()); row.SetField(3, manifest.NumDeletedFiles()); row.SetField(4, manifest.SchemaId()); - PAIMON_ASSIGN_OR_RAISE(VariantType min_partition, - SystemTableUtils::OptionalPartitionStringValue( - manifest.PartitionStats().MinValues(), partition_schema)); - PAIMON_ASSIGN_OR_RAISE(VariantType max_partition, - SystemTableUtils::OptionalPartitionStringValue( - manifest.PartitionStats().MaxValues(), partition_schema)); + PAIMON_ASSIGN_OR_RAISE( + VariantType min_partition, + OptionalPartitionStringValue(manifest.PartitionStats().MinValues(), partition_schema)); + PAIMON_ASSIGN_OR_RAISE( + VariantType max_partition, + OptionalPartitionStringValue(manifest.PartitionStats().MaxValues(), partition_schema)); row.SetField(5, min_partition); row.SetField(6, max_partition); row.SetField(7, SystemTableUtils::OptionalInt64Value(manifest.MinRowId())); diff --git a/src/paimon/core/table/system/schemas_system_table.cpp b/src/paimon/core/table/system/schemas_system_table.cpp index 9be51ee61..f58cdf133 100644 --- a/src/paimon/core/table/system/schemas_system_table.cpp +++ b/src/paimon/core/table/system/schemas_system_table.cpp @@ -20,10 +20,30 @@ #include #include "arrow/api.h" +#include "paimon/common/utils/rapidjson_util.h" #include "paimon/core/schema/schema_manager.h" #include "paimon/core/schema/table_schema.h" +#include "paimon/status.h" +#include "rapidjson/document.h" +#include "rapidjson/stringbuffer.h" +#include "rapidjson/writer.h" namespace paimon { +namespace { + +template +Result JsonString(const T& value) { + rapidjson::Document document; + auto json_value = RapidJsonUtil::SerializeValue(value, &document.GetAllocator()); + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + if (!json_value.Accept(writer)) { + return Status::Invalid("failed to serialize schemas system table value"); + } + return std::string(buffer.GetString(), buffer.GetSize()); +} + +} // namespace SchemasSystemTable::SchemasSystemTable(std::shared_ptr fs, std::string table_path, std::string branch) @@ -59,14 +79,12 @@ Result> SchemasSystemTable::BuildRows() const { for (int64_t id : schema_ids) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr table_schema, schema_manager.ReadSchema(id)); - PAIMON_ASSIGN_OR_RAISE(std::string fields_json, - SystemTableUtils::JsonString(table_schema->Fields())); + PAIMON_ASSIGN_OR_RAISE(std::string fields_json, JsonString(table_schema->Fields())); PAIMON_ASSIGN_OR_RAISE(std::string partition_keys_json, - SystemTableUtils::JsonString(table_schema->PartitionKeys())); + JsonString(table_schema->PartitionKeys())); PAIMON_ASSIGN_OR_RAISE(std::string primary_keys_json, - SystemTableUtils::JsonString(table_schema->PrimaryKeys())); - PAIMON_ASSIGN_OR_RAISE(std::string options_json, - SystemTableUtils::JsonString(table_schema->Options())); + JsonString(table_schema->PrimaryKeys())); + PAIMON_ASSIGN_OR_RAISE(std::string options_json, JsonString(table_schema->Options())); GenericRow row(schema->num_fields()); row.SetField(0, table_schema->Id()); diff --git a/src/paimon/core/table/system/system_table.cpp b/src/paimon/core/table/system/system_table.cpp index 35ae430b1..7c86da729 100644 --- a/src/paimon/core/table/system/system_table.cpp +++ b/src/paimon/core/table/system/system_table.cpp @@ -37,8 +37,9 @@ #include "paimon/core/table/system/options_system_table.h" #include "paimon/core/table/system/schemas_system_table.h" #include "paimon/core/table/system/snapshots_system_table.h" -#include "paimon/core/table/system/system_table_utils.h" #include "paimon/core/table/system/tags_system_table.h" +#include "paimon/core/utils/branch_manager.h" +#include "paimon/defs.h" #include "paimon/status.h" namespace paimon { @@ -53,6 +54,25 @@ struct SystemTableRegistryEntry { SystemTableFactory factory; }; +std::map MergeOptions( + const std::shared_ptr& table_schema, + const std::map& dynamic_options) { + auto options = table_schema->Options(); + for (const auto& [key, value] : dynamic_options) { + options[key] = value; + } + return options; +} + +std::string DefaultBranch() { + return BranchManager::DEFAULT_MAIN_BRANCH; +} + +std::string LoadBranch(const std::map& options) { + auto branch_iter = options.find(Options::BRANCH); + return branch_iter == options.end() ? DefaultBranch() : branch_iter->second; +} + const std::vector& SystemTableRegistry() { static const std::vector registry = { {OptionsSystemTable::kName, @@ -68,8 +88,7 @@ const std::vector& SystemTableRegistry() { const std::map& dynamic_options) -> Result> { return std::make_shared( - fs, table_path, table_schema, - SystemTableUtils::MergeOptions(table_schema, dynamic_options)); + fs, table_path, table_schema, MergeOptions(table_schema, dynamic_options)); }}, {BinlogSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, @@ -77,62 +96,55 @@ const std::vector& SystemTableRegistry() { const std::map& dynamic_options) -> Result> { return std::make_shared( - fs, table_path, table_schema, - SystemTableUtils::MergeOptions(table_schema, dynamic_options)); + fs, table_path, table_schema, MergeOptions(table_schema, dynamic_options)); }}, {SnapshotsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options)); + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options)); }}, {SchemasSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options)); + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options)); }}, {TagsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options)); + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options)); }}, {BranchesSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options)); + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options)); }}, {ConsumersSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options)); + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options)); }}, {ManifestsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options), + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options), table_schema, std::move(options)); }}, {FilesSystemTable::kName, @@ -140,9 +152,8 @@ const std::vector& SystemTableRegistry() { const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options), + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options), table_schema, std::move(options)); }}, }; @@ -203,8 +214,7 @@ Result> SystemTableLoader::LoadFromPath( return Status::Invalid("path is not a system table path: ", path); } const auto& parsed = system_table_path.value(); - SchemaManager schema_manager(fs, parsed.table_path, - parsed.branch.value_or(SystemTableUtils::DefaultBranch())); + SchemaManager schema_manager(fs, parsed.table_path, parsed.branch.value_or(DefaultBranch())); PAIMON_ASSIGN_OR_RAISE(std::optional> latest_schema, schema_manager.Latest()); if (!latest_schema) { diff --git a/src/paimon/core/table/system/system_table_utils.cpp b/src/paimon/core/table/system/system_table_utils.cpp index a06361588..d1a7634ac 100644 --- a/src/paimon/core/table/system/system_table_utils.cpp +++ b/src/paimon/core/table/system/system_table_utils.cpp @@ -16,33 +16,17 @@ #include "paimon/core/table/system/system_table_utils.h" -#include #include -#include "fmt/format.h" -#include "fmt/ranges.h" -#include "paimon/common/data/binary_array.h" -#include "paimon/common/data/binary_row.h" #include "paimon/common/data/binary_string.h" -#include "paimon/common/data/internal_array.h" -#include "paimon/common/data/internal_row.h" #include "paimon/common/types/data_field.h" -#include "paimon/common/utils/binary_row_partition_computer.h" #include "paimon/common/utils/date_time_utils.h" -#include "paimon/common/utils/internal_row_utils.h" -#include "paimon/common/utils/path_util.h" #include "paimon/core/core_options.h" -#include "paimon/core/io/data_file_meta.h" -#include "paimon/core/manifest/file_entry.h" -#include "paimon/core/manifest/manifest_entry.h" -#include "paimon/core/manifest/manifest_file.h" #include "paimon/core/manifest/manifest_file_meta.h" #include "paimon/core/manifest/manifest_list.h" -#include "paimon/core/schema/schema_manager.h" #include "paimon/core/schema/table_schema.h" #include "paimon/core/snapshot.h" #include "paimon/core/utils/branch_manager.h" -#include "paimon/core/utils/field_mapping.h" #include "paimon/core/utils/file_store_path_factory.h" #include "paimon/core/utils/snapshot_manager.h" #include "paimon/fs/file_system.h" @@ -50,11 +34,6 @@ #include "paimon/status.h" namespace paimon { -namespace { - -constexpr int32_t kMaxPartitionStatsLength = 255; - -} // namespace SystemTableContext SystemTableUtils::CreateContext(std::shared_ptr fs, std::string table_path, std::string branch) { @@ -73,78 +52,6 @@ SystemTableContext SystemTableUtils::CreateContext(std::shared_ptr f }; } -std::map SystemTableUtils::MergeOptions( - const std::shared_ptr& table_schema, - const std::map& dynamic_options) { - auto options = table_schema->Options(); - for (const auto& [key, value] : dynamic_options) { - options[key] = value; - } - return options; -} - -std::string SystemTableUtils::DefaultBranch() { - return BranchManager::DEFAULT_MAIN_BRANCH; -} - -std::string SystemTableUtils::LoadBranch(const std::map& options) { - auto branch_iter = options.find(Options::BRANCH); - return branch_iter == options.end() ? DefaultBranch() : branch_iter->second; -} - -Result SystemTableUtils::LocalDateTimePartsToTimestampMillis( - const std::vector& parts) { - if (parts.size() < 6) { - return Status::Invalid("tag create time requires at least 6 date-time fields"); - } - - int64_t year = parts[0]; - int64_t month = parts[1]; - int64_t day = parts[2]; - int64_t hour = parts[3]; - int64_t minute = parts[4]; - int64_t second = parts[5]; - int64_t nanos = parts.size() > 6 ? parts[6] : 0; - auto is_leap_year = [](int64_t value) { - return value % 4 == 0 && (value % 100 != 0 || value % 400 == 0); - }; - int64_t days_in_month[] = {31, is_leap_year(year) ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, - 31}; - if (month < 1 || month > 12 || day < 1 || day > days_in_month[month - 1] || hour < 0 || - hour > 23 || minute < 0 || minute > 59 || second < 0 || second > 59 || nanos < 0 || - nanos > 999999999) { - return Status::Invalid("invalid tag create time fields"); - } - - year -= month <= 2 ? 1 : 0; - int64_t era = (year >= 0 ? year : year - 399) / 400; - auto year_of_era = static_cast(year - era * 400); - auto month_prime = static_cast(month + (month > 2 ? -3 : 9)); - uint32_t day_of_year = (153 * month_prime + 2) / 5 + static_cast(day) - 1; - uint32_t day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; - int64_t epoch_day = era * 146097 + static_cast(day_of_era) - 719468; - return epoch_day * DateTimeUtils::MILLIS_PER_DAY + hour * 3600000 + minute * 60000 + - second * 1000 + nanos / 1000000; -} - -Result> SystemTableUtils::OptionalLocalDateTimePartsToTimestampMillis( - const std::optional>& parts) { - if (!parts) { - return std::optional(); - } - PAIMON_ASSIGN_OR_RAISE(int64_t timestamp_millis, - LocalDateTimePartsToTimestampMillis(parts.value())); - return std::optional(timestamp_millis); -} - -std::optional SystemTableUtils::OptionalDoubleToString( - const std::optional& value) { - if (!value) { - return std::optional(); - } - return std::to_string(value.value()); -} - VariantType SystemTableUtils::OptionalInt64Value(const std::optional& value) { if (!value) { return NullType(); @@ -163,10 +70,14 @@ VariantType SystemTableUtils::OptionalStringValue(const std::optional SystemTableUtils::LocalTimestampMillisValue(int64_t epoch_millis) { PAIMON_ASSIGN_OR_RAISE( Timestamp local_timestamp, @@ -180,13 +91,6 @@ Result SystemTableUtils::LocalTimestampMillisValue(const Timestamp& return LocalTimestampMillisValue(epoch_millis); } -VariantType SystemTableUtils::OptionalTimestampMillisValue(const std::optional& value) { - if (!value) { - return NullType(); - } - return TimestampMillisValue(value.value()); -} - Result SystemTableUtils::CreateCoreOptions(const SystemTableContext& context) { return CoreOptions::FromMap(context.options, context.fs); } @@ -230,144 +134,4 @@ Result> SystemTableUtils::ReadDataManifests( return manifests; } -Result> SystemTableUtils::ReadLatestManifestEntries( - const SystemTableContext& context, const std::shared_ptr& path_factory, - const CoreOptions& core_options, const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, LatestSnapshot(context)); - if (!snapshot) { - return std::vector(); - } - PAIMON_ASSIGN_OR_RAISE( - std::vector manifests, - ReadDataManifests(context, snapshot.value(), path_factory, core_options, pool)); - std::shared_ptr arrow_schema = - DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr partition_schema, - FieldMapping::GetPartitionSchema(arrow_schema, context.table_schema->PartitionKeys())); - PAIMON_ASSIGN_OR_RAISE(std::unique_ptr manifest_file, - ManifestFile::Create(context.fs, core_options.GetManifestFormat(), - core_options.GetManifestCompression(), path_factory, - core_options.GetManifestTargetFileSize(), pool, - core_options, partition_schema)); - std::vector entries; - for (const auto& manifest : manifests) { - PAIMON_RETURN_NOT_OK( - manifest_file->Read(manifest.FileName(), /*filter=*/nullptr, &entries)); - } - return entries; -} - -Result> SystemTableUtils::ReadLatestDataFiles( - const SystemTableContext& context, const std::shared_ptr& path_factory, - const CoreOptions& core_options, const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE(std::vector entries, - ReadLatestManifestEntries(context, path_factory, core_options, pool)); - std::vector merged_entries; - PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(entries, &merged_entries)); - return merged_entries; -} - -std::optional SystemTableUtils::OptionalBinaryRowString(const BinaryRow& row) { - if (row.GetFieldCount() <= 0) { - return std::nullopt; - } - return row.ToString(); -} - -Result> SystemTableUtils::OptionalPartitionString( - const BinaryRow& row, const std::shared_ptr& partition_schema) { - if (row.GetFieldCount() <= 0) { - return std::optional(); - } - PAIMON_ASSIGN_OR_RAISE(std::string value, - BinaryRowPartitionComputer::PartToSimpleString( - partition_schema, row, ",", kMaxPartitionStatsLength)); - return std::optional(value); -} - -Result SystemTableUtils::OptionalPartitionStringValue( - const BinaryRow& row, const std::shared_ptr& partition_schema) { - PAIMON_ASSIGN_OR_RAISE(std::optional value, - OptionalPartitionString(row, partition_schema)); - return OptionalStringValue(value); -} - -Result SystemTableUtils::PartitionString( - const std::shared_ptr& path_factory, const BinaryRow& partition) { - PAIMON_ASSIGN_OR_RAISE(std::string value, path_factory->GetPartitionString(partition)); - return value; -} - -Result SystemTableUtils::FilePath( - const std::shared_ptr& path_factory, const ManifestEntry& entry, - const DataFileMeta& file) { - if (file.external_path) { - return file.external_path.value(); - } - PAIMON_ASSIGN_OR_RAISE(std::string bucket_path, - path_factory->BucketPath(entry.Partition(), entry.Bucket())); - return PathUtil::JoinPath(bucket_path, file.file_name); -} - -Result SystemTableUtils::FieldsValueMapString(const std::vector& fields, - const InternalRow& row) { - std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields); - PAIMON_ASSIGN_OR_RAISE(std::vector getters, - InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/false)); - std::vector values; - values.reserve(fields.size()); - for (size_t i = 0; i < fields.size(); ++i) { - std::string value = "null"; - if (!row.IsNullAt(i)) { - VariantType field_value = getters[i](row); - if (std::holds_alternative(field_value)) { - value = std::string(std::get(field_value)); - } else { - value = DataDefine::VariantValueToString(field_value); - } - } - values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); - } - return fmt::format("{{{}}}", fmt::join(values, ", ")); -} - -Result SystemTableUtils::NullValueCountsString(const std::vector& fields, - const InternalArray& null_counts) { - std::vector values; - values.reserve(fields.size()); - for (size_t i = 0; i < fields.size(); ++i) { - std::string value = - null_counts.IsNullAt(i) ? "null" : std::to_string(null_counts.GetLong(i)); - values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); - } - return fmt::format("{{{}}}", fmt::join(values, ", ")); -} - -Result> SystemTableUtils::LoadDataSchema( - const SystemTableContext& context, int64_t schema_id) { - if (schema_id == context.table_schema->Id()) { - return context.table_schema; - } - SchemaManager schema_manager(context.fs, context.table_path, context.branch); - return schema_manager.ReadSchema(schema_id); -} - -Result> SystemTableUtils::ValueStatsFields(const SystemTableContext& context, - int64_t schema_id) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, - LoadDataSchema(context, schema_id)); - return data_schema->Fields(); -} - -Result> SystemTableUtils::WriteColsValue( - const std::optional>& write_cols, - const std::shared_ptr& pool) { - if (!write_cols) { - return std::shared_ptr(); - } - return std::make_shared( - InternalRowUtils::ToNotNullStringArrayData(write_cols.value(), pool)); -} - } // namespace paimon diff --git a/src/paimon/core/table/system/system_table_utils.h b/src/paimon/core/table/system/system_table_utils.h index 437d3dbfe..63e744e24 100644 --- a/src/paimon/core/table/system/system_table_utils.h +++ b/src/paimon/core/table/system/system_table_utils.h @@ -24,20 +24,11 @@ #include #include -#include "paimon/common/data/data_define.h" -#include "paimon/common/types/data_field.h" -#include "paimon/common/utils/rapidjson_util.h" #include "paimon/core/core_options.h" -#include "paimon/core/io/data_file_meta.h" -#include "paimon/core/manifest/manifest_entry.h" #include "paimon/core/manifest/manifest_file_meta.h" #include "paimon/core/snapshot.h" #include "paimon/data/timestamp.h" #include "paimon/result.h" -#include "paimon/status.h" -#include "rapidjson/document.h" -#include "rapidjson/stringbuffer.h" -#include "rapidjson/writer.h" namespace arrow { class Schema; @@ -73,34 +64,11 @@ class SystemTableUtils { std::string branch, std::shared_ptr table_schema, std::map options); - static std::map MergeOptions( - const std::shared_ptr& table_schema, - const std::map& dynamic_options); - static std::string DefaultBranch(); - static std::string LoadBranch(const std::map& options); - - template - static Result JsonString(const T& value) { - rapidjson::Document document; - auto json_value = RapidJsonUtil::SerializeValue(value, &document.GetAllocator()); - rapidjson::StringBuffer buffer; - rapidjson::Writer writer(buffer); - if (!json_value.Accept(writer)) { - return Status::Invalid("failed to serialize metadata system table value"); - } - return std::string(buffer.GetString(), buffer.GetSize()); - } - - static Result> OptionalLocalDateTimePartsToTimestampMillis( - const std::optional>& parts); - static std::optional OptionalDoubleToString(const std::optional& value); static VariantType OptionalInt64Value(const std::optional& value); static VariantType StringValue(const std::string& value); static VariantType OptionalStringValue(const std::optional& value); - static VariantType TimestampMillisValue(int64_t value); static Result LocalTimestampMillisValue(int64_t epoch_millis); static Result LocalTimestampMillisValue(const Timestamp& local_timestamp); - static VariantType OptionalTimestampMillisValue(const std::optional& value); static Result CreateCoreOptions(const SystemTableContext& context); static Result> CreatePathFactory( @@ -111,38 +79,6 @@ class SystemTableUtils { const SystemTableContext& context, const Snapshot& snapshot, const std::shared_ptr& path_factory, const CoreOptions& core_options, const std::shared_ptr& pool); - static Result> ReadLatestDataFiles( - const SystemTableContext& context, - const std::shared_ptr& path_factory, const CoreOptions& core_options, - const std::shared_ptr& pool); - - static std::optional OptionalBinaryRowString(const BinaryRow& row); - static Result OptionalPartitionStringValue( - const BinaryRow& row, const std::shared_ptr& partition_schema); - static Result PartitionString( - const std::shared_ptr& path_factory, const BinaryRow& partition); - static Result FilePath(const std::shared_ptr& path_factory, - const ManifestEntry& entry, const DataFileMeta& file); - static Result FieldsValueMapString(const std::vector& fields, - const InternalRow& row); - static Result NullValueCountsString(const std::vector& fields, - const InternalArray& null_counts); - static Result> LoadDataSchema(const SystemTableContext& context, - int64_t schema_id); - static Result> ValueStatsFields(const SystemTableContext& context, - int64_t schema_id); - static Result> WriteColsValue( - const std::optional>& write_cols, - const std::shared_ptr& pool); - - private: - static Result LocalDateTimePartsToTimestampMillis(const std::vector& parts); - static Result> OptionalPartitionString( - const BinaryRow& row, const std::shared_ptr& partition_schema); - static Result> ReadLatestManifestEntries( - const SystemTableContext& context, - const std::shared_ptr& path_factory, const CoreOptions& core_options, - const std::shared_ptr& pool); }; } // namespace paimon diff --git a/src/paimon/core/table/system/tags_system_table.cpp b/src/paimon/core/table/system/tags_system_table.cpp index e40158270..42d054fa8 100644 --- a/src/paimon/core/table/system/tags_system_table.cpp +++ b/src/paimon/core/table/system/tags_system_table.cpp @@ -19,10 +19,75 @@ #include #include "arrow/api.h" +#include "paimon/common/data/data_define.h" +#include "paimon/common/utils/date_time_utils.h" #include "paimon/core/tag/tag.h" #include "paimon/core/utils/tag_manager.h" +#include "paimon/data/timestamp.h" +#include "paimon/status.h" namespace paimon { +namespace { + +Result LocalDateTimePartsToTimestampMillis(const std::vector& parts) { + if (parts.size() < 6) { + return Status::Invalid("tag create time requires at least 6 date-time fields"); + } + + int64_t year = parts[0]; + int64_t month = parts[1]; + int64_t day = parts[2]; + int64_t hour = parts[3]; + int64_t minute = parts[4]; + int64_t second = parts[5]; + int64_t nanos = parts.size() > 6 ? parts[6] : 0; + auto is_leap_year = [](int64_t value) { + return value % 4 == 0 && (value % 100 != 0 || value % 400 == 0); + }; + int64_t days_in_month[] = {31, is_leap_year(year) ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, + 31}; + if (month < 1 || month > 12 || day < 1 || day > days_in_month[month - 1] || hour < 0 || + hour > 23 || minute < 0 || minute > 59 || second < 0 || second > 59 || nanos < 0 || + nanos > 999999999) { + return Status::Invalid("invalid tag create time fields"); + } + + year -= month <= 2 ? 1 : 0; + int64_t era = (year >= 0 ? year : year - 399) / 400; + auto year_of_era = static_cast(year - era * 400); + auto month_prime = static_cast(month + (month > 2 ? -3 : 9)); + uint32_t day_of_year = (153 * month_prime + 2) / 5 + static_cast(day) - 1; + uint32_t day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; + int64_t epoch_day = era * 146097 + static_cast(day_of_era) - 719468; + return epoch_day * DateTimeUtils::MILLIS_PER_DAY + hour * 3600000 + minute * 60000 + + second * 1000 + nanos / 1000000; +} + +Result> OptionalLocalDateTimePartsToTimestampMillis( + const std::optional>& parts) { + if (!parts) { + return std::optional(); + } + PAIMON_ASSIGN_OR_RAISE(int64_t timestamp_millis, + LocalDateTimePartsToTimestampMillis(parts.value())); + return std::optional(timestamp_millis); +} + +std::optional OptionalDoubleToString(const std::optional& value) { + if (!value) { + return std::optional(); + } + return std::to_string(value.value()); +} + +VariantType OptionalTimestampMillisValue(const std::optional& value) { + if (!value) { + return NullType(); + } + return Timestamp::FromEpochMillis(value.value()); +} + +} // namespace TagsSystemTable::TagsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch) @@ -57,9 +122,8 @@ Result> TagsSystemTable::BuildRows() const { for (const auto& name : tag_names) { PAIMON_ASSIGN_OR_RAISE(Tag tag, tag_manager.GetOrThrow(name)); - PAIMON_ASSIGN_OR_RAISE( - std::optional tag_create_time, - SystemTableUtils::OptionalLocalDateTimePartsToTimestampMillis(tag.TagCreateTime())); + PAIMON_ASSIGN_OR_RAISE(std::optional tag_create_time, + OptionalLocalDateTimePartsToTimestampMillis(tag.TagCreateTime())); GenericRow row(schema->num_fields()); row.SetField(0, SystemTableUtils::StringValue(name)); row.SetField(1, tag.Id()); @@ -68,9 +132,9 @@ Result> TagsSystemTable::BuildRows() const { SystemTableUtils::LocalTimestampMillisValue(tag.TimeMillis())); row.SetField(3, commit_time); row.SetField(4, SystemTableUtils::OptionalInt64Value(tag.TotalRecordCount())); - row.SetField(5, SystemTableUtils::OptionalTimestampMillisValue(tag_create_time)); + row.SetField(5, OptionalTimestampMillisValue(tag_create_time)); row.SetField(6, SystemTableUtils::OptionalStringValue( - SystemTableUtils::OptionalDoubleToString(tag.TagTimeRetained()))); + OptionalDoubleToString(tag.TagTimeRetained()))); rows.push_back(std::move(row)); } From 5d9dddb38abbcd1af4186b01bd55250e837a1d79 Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 28 May 2026 10:20:06 +0800 Subject: [PATCH 4/8] Revert "Narrow system table utilities" This reverts commit cf6e2f196ef745dac6c1419e15c7d3038caee260. --- .../core/table/system/files_system_table.cpp | 176 ++----------- .../table/system/manifests_system_table.cpp | 37 +-- .../table/system/schemas_system_table.cpp | 30 +-- src/paimon/core/table/system/system_table.cpp | 66 ++--- .../core/table/system/system_table_utils.cpp | 246 +++++++++++++++++- .../core/table/system/system_table_utils.h | 64 +++++ .../core/table/system/tags_system_table.cpp | 74 +----- 7 files changed, 373 insertions(+), 320 deletions(-) diff --git a/src/paimon/core/table/system/files_system_table.cpp b/src/paimon/core/table/system/files_system_table.cpp index e209c30f0..7effaec6a 100644 --- a/src/paimon/core/table/system/files_system_table.cpp +++ b/src/paimon/core/table/system/files_system_table.cpp @@ -19,23 +19,13 @@ #include #include "arrow/api.h" -#include "fmt/format.h" -#include "fmt/ranges.h" -#include "paimon/common/data/binary_array.h" -#include "paimon/common/data/binary_row.h" #include "paimon/common/data/data_define.h" #include "paimon/common/data/internal_array.h" -#include "paimon/common/data/internal_row.h" #include "paimon/common/types/data_field.h" -#include "paimon/common/utils/internal_row_utils.h" -#include "paimon/common/utils/path_util.h" #include "paimon/core/core_options.h" #include "paimon/core/io/data_file_meta.h" -#include "paimon/core/manifest/file_entry.h" #include "paimon/core/manifest/file_kind.h" #include "paimon/core/manifest/manifest_entry.h" -#include "paimon/core/manifest/manifest_file.h" -#include "paimon/core/schema/schema_manager.h" #include "paimon/core/schema/table_schema.h" #include "paimon/core/stats/simple_stats_evolutions.h" #include "paimon/core/utils/field_mapping.h" @@ -43,131 +33,6 @@ #include "paimon/memory/memory_pool.h" namespace paimon { -namespace { - -Result> ReadLatestManifestEntries( - const SystemTableContext& context, const std::shared_ptr& path_factory, - const CoreOptions& core_options, const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, - SystemTableUtils::LatestSnapshot(context)); - if (!snapshot) { - return std::vector(); - } - PAIMON_ASSIGN_OR_RAISE(std::vector manifests, - SystemTableUtils::ReadDataManifests(context, snapshot.value(), - path_factory, core_options, pool)); - std::shared_ptr arrow_schema = - DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr partition_schema, - FieldMapping::GetPartitionSchema(arrow_schema, context.table_schema->PartitionKeys())); - PAIMON_ASSIGN_OR_RAISE(std::unique_ptr manifest_file, - ManifestFile::Create(context.fs, core_options.GetManifestFormat(), - core_options.GetManifestCompression(), path_factory, - core_options.GetManifestTargetFileSize(), pool, - core_options, partition_schema)); - std::vector entries; - for (const auto& manifest : manifests) { - PAIMON_RETURN_NOT_OK( - manifest_file->Read(manifest.FileName(), /*filter=*/nullptr, &entries)); - } - return entries; -} - -Result> ReadLatestDataFiles( - const SystemTableContext& context, const std::shared_ptr& path_factory, - const CoreOptions& core_options, const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE(std::vector entries, - ReadLatestManifestEntries(context, path_factory, core_options, pool)); - std::vector merged_entries; - PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(entries, &merged_entries)); - return merged_entries; -} - -std::optional OptionalBinaryRowString(const BinaryRow& row) { - if (row.GetFieldCount() <= 0) { - return std::nullopt; - } - return row.ToString(); -} - -Result PartitionString(const std::shared_ptr& path_factory, - const BinaryRow& partition) { - PAIMON_ASSIGN_OR_RAISE(std::string value, path_factory->GetPartitionString(partition)); - return value; -} - -Result FilePath(const std::shared_ptr& path_factory, - const ManifestEntry& entry, const DataFileMeta& file) { - if (file.external_path) { - return file.external_path.value(); - } - PAIMON_ASSIGN_OR_RAISE(std::string bucket_path, - path_factory->BucketPath(entry.Partition(), entry.Bucket())); - return PathUtil::JoinPath(bucket_path, file.file_name); -} - -Result FieldsValueMapString(const std::vector& fields, - const InternalRow& row) { - std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields); - PAIMON_ASSIGN_OR_RAISE(std::vector getters, - InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/false)); - std::vector values; - values.reserve(fields.size()); - for (size_t i = 0; i < fields.size(); ++i) { - std::string value = "null"; - if (!row.IsNullAt(i)) { - VariantType field_value = getters[i](row); - if (std::holds_alternative(field_value)) { - value = std::string(std::get(field_value)); - } else { - value = DataDefine::VariantValueToString(field_value); - } - } - values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); - } - return fmt::format("{{{}}}", fmt::join(values, ", ")); -} - -Result NullValueCountsString(const std::vector& fields, - const InternalArray& null_counts) { - std::vector values; - values.reserve(fields.size()); - for (size_t i = 0; i < fields.size(); ++i) { - std::string value = - null_counts.IsNullAt(i) ? "null" : std::to_string(null_counts.GetLong(i)); - values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); - } - return fmt::format("{{{}}}", fmt::join(values, ", ")); -} - -Result> LoadDataSchema(const SystemTableContext& context, - int64_t schema_id) { - if (schema_id == context.table_schema->Id()) { - return context.table_schema; - } - SchemaManager schema_manager(context.fs, context.table_path, context.branch); - return schema_manager.ReadSchema(schema_id); -} - -Result> ValueStatsFields(const SystemTableContext& context, - int64_t schema_id) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, - LoadDataSchema(context, schema_id)); - return data_schema->Fields(); -} - -Result> WriteColsValue( - const std::optional>& write_cols, - const std::shared_ptr& pool) { - if (!write_cols) { - return std::shared_ptr(); - } - return std::make_shared( - InternalRowUtils::ToNotNullStringArrayData(write_cols.value(), pool)); -} - -} // namespace FilesSystemTable::FilesSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, std::shared_ptr table_schema, @@ -213,8 +78,9 @@ Result> FilesSystemTable::BuildRows() const { PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, SystemTableUtils::CreateCoreOptions(context_)); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, SystemTableUtils::CreatePathFactory(context_, core_options, pool)); - PAIMON_ASSIGN_OR_RAISE(std::vector entries, - ReadLatestDataFiles(context_, path_factory, core_options, pool)); + PAIMON_ASSIGN_OR_RAISE( + std::vector entries, + SystemTableUtils::ReadLatestDataFiles(context_, path_factory, core_options, pool)); std::shared_ptr arrow_schema = DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); PAIMON_ASSIGN_OR_RAISE( @@ -231,9 +97,9 @@ Result> FilesSystemTable::BuildRows() const { const std::shared_ptr& file = entry.File(); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, - LoadDataSchema(context_, file->schema_id)); + SystemTableUtils::LoadDataSchema(context_, file->schema_id)); PAIMON_ASSIGN_OR_RAISE(std::vector value_stats_fields, - ValueStatsFields(context_, file->schema_id)); + SystemTableUtils::ValueStatsFields(context_, file->schema_id)); std::shared_ptr stats_evolution = stats_evolutions.GetOrCreate(data_schema); PAIMON_ASSIGN_OR_RAISE( @@ -244,12 +110,13 @@ Result> FilesSystemTable::BuildRows() const { if (context_.table_schema->PartitionKeys().empty()) { row.SetField(0, NullType()); } else { - PAIMON_ASSIGN_OR_RAISE(std::string partition, - PartitionString(path_factory, entry.Partition())); + PAIMON_ASSIGN_OR_RAISE(std::string partition, SystemTableUtils::PartitionString( + path_factory, entry.Partition())); row.SetField(0, SystemTableUtils::StringValue(partition)); } row.SetField(1, entry.Bucket()); - PAIMON_ASSIGN_OR_RAISE(std::string file_path, FilePath(path_factory, entry, *file)); + PAIMON_ASSIGN_OR_RAISE(std::string file_path, + SystemTableUtils::FilePath(path_factory, entry, *file)); row.SetField(2, SystemTableUtils::StringValue(file_path)); PAIMON_ASSIGN_OR_RAISE(std::string file_format, file->FileFormat()); row.SetField(3, SystemTableUtils::StringValue(file_format)); @@ -257,18 +124,21 @@ Result> FilesSystemTable::BuildRows() const { row.SetField(5, file->level); row.SetField(6, file->row_count); row.SetField(7, file->file_size); - row.SetField(8, - SystemTableUtils::OptionalStringValue(OptionalBinaryRowString(file->min_key))); - row.SetField(9, - SystemTableUtils::OptionalStringValue(OptionalBinaryRowString(file->max_key))); - PAIMON_ASSIGN_OR_RAISE(std::string null_value_counts, - NullValueCountsString(value_stats_fields, *stats.null_counts)); + row.SetField(8, SystemTableUtils::OptionalStringValue( + SystemTableUtils::OptionalBinaryRowString(file->min_key))); + row.SetField(9, SystemTableUtils::OptionalStringValue( + SystemTableUtils::OptionalBinaryRowString(file->max_key))); + PAIMON_ASSIGN_OR_RAISE( + std::string null_value_counts, + SystemTableUtils::NullValueCountsString(value_stats_fields, *stats.null_counts)); row.SetField(10, SystemTableUtils::StringValue(null_value_counts)); - PAIMON_ASSIGN_OR_RAISE(std::string min_value_stats, - FieldsValueMapString(value_stats_fields, *stats.min_values)); + PAIMON_ASSIGN_OR_RAISE( + std::string min_value_stats, + SystemTableUtils::FieldsValueMapString(value_stats_fields, *stats.min_values)); row.SetField(11, SystemTableUtils::StringValue(min_value_stats)); - PAIMON_ASSIGN_OR_RAISE(std::string max_value_stats, - FieldsValueMapString(value_stats_fields, *stats.max_values)); + PAIMON_ASSIGN_OR_RAISE( + std::string max_value_stats, + SystemTableUtils::FieldsValueMapString(value_stats_fields, *stats.max_values)); row.SetField(12, SystemTableUtils::StringValue(max_value_stats)); row.SetField(13, file->min_sequence_number); row.SetField(14, file->max_sequence_number); @@ -281,7 +151,7 @@ Result> FilesSystemTable::BuildRows() const { : VariantType(NullType())); row.SetField(18, SystemTableUtils::OptionalInt64Value(file->first_row_id)); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr write_cols, - WriteColsValue(file->write_cols, pool)); + SystemTableUtils::WriteColsValue(file->write_cols, pool)); row.SetField(19, write_cols ? VariantType(write_cols) : VariantType(NullType())); rows.push_back(std::move(row)); } diff --git a/src/paimon/core/table/system/manifests_system_table.cpp b/src/paimon/core/table/system/manifests_system_table.cpp index 9ffbdcfbd..a67b5486c 100644 --- a/src/paimon/core/table/system/manifests_system_table.cpp +++ b/src/paimon/core/table/system/manifests_system_table.cpp @@ -19,9 +19,7 @@ #include #include "arrow/api.h" -#include "paimon/common/data/binary_row.h" #include "paimon/common/types/data_field.h" -#include "paimon/common/utils/binary_row_partition_computer.h" #include "paimon/core/core_options.h" #include "paimon/core/manifest/manifest_file_meta.h" #include "paimon/core/schema/table_schema.h" @@ -31,29 +29,6 @@ #include "paimon/memory/memory_pool.h" namespace paimon { -namespace { - -constexpr int32_t kMaxPartitionStatsLength = 255; - -Result> OptionalPartitionString( - const BinaryRow& row, const std::shared_ptr& partition_schema) { - if (row.GetFieldCount() <= 0) { - return std::optional(); - } - PAIMON_ASSIGN_OR_RAISE(std::string value, - BinaryRowPartitionComputer::PartToSimpleString( - partition_schema, row, ",", kMaxPartitionStatsLength)); - return std::optional(value); -} - -Result OptionalPartitionStringValue( - const BinaryRow& row, const std::shared_ptr& partition_schema) { - PAIMON_ASSIGN_OR_RAISE(std::optional value, - OptionalPartitionString(row, partition_schema)); - return SystemTableUtils::OptionalStringValue(value); -} - -} // namespace ManifestsSystemTable::ManifestsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, @@ -112,12 +87,12 @@ Result> ManifestsSystemTable::BuildRows() const { row.SetField(2, manifest.NumAddedFiles()); row.SetField(3, manifest.NumDeletedFiles()); row.SetField(4, manifest.SchemaId()); - PAIMON_ASSIGN_OR_RAISE( - VariantType min_partition, - OptionalPartitionStringValue(manifest.PartitionStats().MinValues(), partition_schema)); - PAIMON_ASSIGN_OR_RAISE( - VariantType max_partition, - OptionalPartitionStringValue(manifest.PartitionStats().MaxValues(), partition_schema)); + PAIMON_ASSIGN_OR_RAISE(VariantType min_partition, + SystemTableUtils::OptionalPartitionStringValue( + manifest.PartitionStats().MinValues(), partition_schema)); + PAIMON_ASSIGN_OR_RAISE(VariantType max_partition, + SystemTableUtils::OptionalPartitionStringValue( + manifest.PartitionStats().MaxValues(), partition_schema)); row.SetField(5, min_partition); row.SetField(6, max_partition); row.SetField(7, SystemTableUtils::OptionalInt64Value(manifest.MinRowId())); diff --git a/src/paimon/core/table/system/schemas_system_table.cpp b/src/paimon/core/table/system/schemas_system_table.cpp index f58cdf133..9be51ee61 100644 --- a/src/paimon/core/table/system/schemas_system_table.cpp +++ b/src/paimon/core/table/system/schemas_system_table.cpp @@ -20,30 +20,10 @@ #include #include "arrow/api.h" -#include "paimon/common/utils/rapidjson_util.h" #include "paimon/core/schema/schema_manager.h" #include "paimon/core/schema/table_schema.h" -#include "paimon/status.h" -#include "rapidjson/document.h" -#include "rapidjson/stringbuffer.h" -#include "rapidjson/writer.h" namespace paimon { -namespace { - -template -Result JsonString(const T& value) { - rapidjson::Document document; - auto json_value = RapidJsonUtil::SerializeValue(value, &document.GetAllocator()); - rapidjson::StringBuffer buffer; - rapidjson::Writer writer(buffer); - if (!json_value.Accept(writer)) { - return Status::Invalid("failed to serialize schemas system table value"); - } - return std::string(buffer.GetString(), buffer.GetSize()); -} - -} // namespace SchemasSystemTable::SchemasSystemTable(std::shared_ptr fs, std::string table_path, std::string branch) @@ -79,12 +59,14 @@ Result> SchemasSystemTable::BuildRows() const { for (int64_t id : schema_ids) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr table_schema, schema_manager.ReadSchema(id)); - PAIMON_ASSIGN_OR_RAISE(std::string fields_json, JsonString(table_schema->Fields())); + PAIMON_ASSIGN_OR_RAISE(std::string fields_json, + SystemTableUtils::JsonString(table_schema->Fields())); PAIMON_ASSIGN_OR_RAISE(std::string partition_keys_json, - JsonString(table_schema->PartitionKeys())); + SystemTableUtils::JsonString(table_schema->PartitionKeys())); PAIMON_ASSIGN_OR_RAISE(std::string primary_keys_json, - JsonString(table_schema->PrimaryKeys())); - PAIMON_ASSIGN_OR_RAISE(std::string options_json, JsonString(table_schema->Options())); + SystemTableUtils::JsonString(table_schema->PrimaryKeys())); + PAIMON_ASSIGN_OR_RAISE(std::string options_json, + SystemTableUtils::JsonString(table_schema->Options())); GenericRow row(schema->num_fields()); row.SetField(0, table_schema->Id()); diff --git a/src/paimon/core/table/system/system_table.cpp b/src/paimon/core/table/system/system_table.cpp index 7c86da729..35ae430b1 100644 --- a/src/paimon/core/table/system/system_table.cpp +++ b/src/paimon/core/table/system/system_table.cpp @@ -37,9 +37,8 @@ #include "paimon/core/table/system/options_system_table.h" #include "paimon/core/table/system/schemas_system_table.h" #include "paimon/core/table/system/snapshots_system_table.h" +#include "paimon/core/table/system/system_table_utils.h" #include "paimon/core/table/system/tags_system_table.h" -#include "paimon/core/utils/branch_manager.h" -#include "paimon/defs.h" #include "paimon/status.h" namespace paimon { @@ -54,25 +53,6 @@ struct SystemTableRegistryEntry { SystemTableFactory factory; }; -std::map MergeOptions( - const std::shared_ptr& table_schema, - const std::map& dynamic_options) { - auto options = table_schema->Options(); - for (const auto& [key, value] : dynamic_options) { - options[key] = value; - } - return options; -} - -std::string DefaultBranch() { - return BranchManager::DEFAULT_MAIN_BRANCH; -} - -std::string LoadBranch(const std::map& options) { - auto branch_iter = options.find(Options::BRANCH); - return branch_iter == options.end() ? DefaultBranch() : branch_iter->second; -} - const std::vector& SystemTableRegistry() { static const std::vector registry = { {OptionsSystemTable::kName, @@ -88,7 +68,8 @@ const std::vector& SystemTableRegistry() { const std::map& dynamic_options) -> Result> { return std::make_shared( - fs, table_path, table_schema, MergeOptions(table_schema, dynamic_options)); + fs, table_path, table_schema, + SystemTableUtils::MergeOptions(table_schema, dynamic_options)); }}, {BinlogSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, @@ -96,55 +77,62 @@ const std::vector& SystemTableRegistry() { const std::map& dynamic_options) -> Result> { return std::make_shared( - fs, table_path, table_schema, MergeOptions(table_schema, dynamic_options)); + fs, table_path, table_schema, + SystemTableUtils::MergeOptions(table_schema, dynamic_options)); }}, {SnapshotsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options)); + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options)); }}, {SchemasSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options)); + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options)); }}, {TagsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options)); + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options)); }}, {BranchesSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options)); + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options)); }}, {ConsumersSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options)); + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options)); }}, {ManifestsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options), + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options), table_schema, std::move(options)); }}, {FilesSystemTable::kName, @@ -152,8 +140,9 @@ const std::vector& SystemTableRegistry() { const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, LoadBranch(options), + auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, + SystemTableUtils::LoadBranch(options), table_schema, std::move(options)); }}, }; @@ -214,7 +203,8 @@ Result> SystemTableLoader::LoadFromPath( return Status::Invalid("path is not a system table path: ", path); } const auto& parsed = system_table_path.value(); - SchemaManager schema_manager(fs, parsed.table_path, parsed.branch.value_or(DefaultBranch())); + SchemaManager schema_manager(fs, parsed.table_path, + parsed.branch.value_or(SystemTableUtils::DefaultBranch())); PAIMON_ASSIGN_OR_RAISE(std::optional> latest_schema, schema_manager.Latest()); if (!latest_schema) { diff --git a/src/paimon/core/table/system/system_table_utils.cpp b/src/paimon/core/table/system/system_table_utils.cpp index d1a7634ac..a06361588 100644 --- a/src/paimon/core/table/system/system_table_utils.cpp +++ b/src/paimon/core/table/system/system_table_utils.cpp @@ -16,17 +16,33 @@ #include "paimon/core/table/system/system_table_utils.h" +#include #include +#include "fmt/format.h" +#include "fmt/ranges.h" +#include "paimon/common/data/binary_array.h" +#include "paimon/common/data/binary_row.h" #include "paimon/common/data/binary_string.h" +#include "paimon/common/data/internal_array.h" +#include "paimon/common/data/internal_row.h" #include "paimon/common/types/data_field.h" +#include "paimon/common/utils/binary_row_partition_computer.h" #include "paimon/common/utils/date_time_utils.h" +#include "paimon/common/utils/internal_row_utils.h" +#include "paimon/common/utils/path_util.h" #include "paimon/core/core_options.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/file_entry.h" +#include "paimon/core/manifest/manifest_entry.h" +#include "paimon/core/manifest/manifest_file.h" #include "paimon/core/manifest/manifest_file_meta.h" #include "paimon/core/manifest/manifest_list.h" +#include "paimon/core/schema/schema_manager.h" #include "paimon/core/schema/table_schema.h" #include "paimon/core/snapshot.h" #include "paimon/core/utils/branch_manager.h" +#include "paimon/core/utils/field_mapping.h" #include "paimon/core/utils/file_store_path_factory.h" #include "paimon/core/utils/snapshot_manager.h" #include "paimon/fs/file_system.h" @@ -34,6 +50,11 @@ #include "paimon/status.h" namespace paimon { +namespace { + +constexpr int32_t kMaxPartitionStatsLength = 255; + +} // namespace SystemTableContext SystemTableUtils::CreateContext(std::shared_ptr fs, std::string table_path, std::string branch) { @@ -52,6 +73,78 @@ SystemTableContext SystemTableUtils::CreateContext(std::shared_ptr f }; } +std::map SystemTableUtils::MergeOptions( + const std::shared_ptr& table_schema, + const std::map& dynamic_options) { + auto options = table_schema->Options(); + for (const auto& [key, value] : dynamic_options) { + options[key] = value; + } + return options; +} + +std::string SystemTableUtils::DefaultBranch() { + return BranchManager::DEFAULT_MAIN_BRANCH; +} + +std::string SystemTableUtils::LoadBranch(const std::map& options) { + auto branch_iter = options.find(Options::BRANCH); + return branch_iter == options.end() ? DefaultBranch() : branch_iter->second; +} + +Result SystemTableUtils::LocalDateTimePartsToTimestampMillis( + const std::vector& parts) { + if (parts.size() < 6) { + return Status::Invalid("tag create time requires at least 6 date-time fields"); + } + + int64_t year = parts[0]; + int64_t month = parts[1]; + int64_t day = parts[2]; + int64_t hour = parts[3]; + int64_t minute = parts[4]; + int64_t second = parts[5]; + int64_t nanos = parts.size() > 6 ? parts[6] : 0; + auto is_leap_year = [](int64_t value) { + return value % 4 == 0 && (value % 100 != 0 || value % 400 == 0); + }; + int64_t days_in_month[] = {31, is_leap_year(year) ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, + 31}; + if (month < 1 || month > 12 || day < 1 || day > days_in_month[month - 1] || hour < 0 || + hour > 23 || minute < 0 || minute > 59 || second < 0 || second > 59 || nanos < 0 || + nanos > 999999999) { + return Status::Invalid("invalid tag create time fields"); + } + + year -= month <= 2 ? 1 : 0; + int64_t era = (year >= 0 ? year : year - 399) / 400; + auto year_of_era = static_cast(year - era * 400); + auto month_prime = static_cast(month + (month > 2 ? -3 : 9)); + uint32_t day_of_year = (153 * month_prime + 2) / 5 + static_cast(day) - 1; + uint32_t day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; + int64_t epoch_day = era * 146097 + static_cast(day_of_era) - 719468; + return epoch_day * DateTimeUtils::MILLIS_PER_DAY + hour * 3600000 + minute * 60000 + + second * 1000 + nanos / 1000000; +} + +Result> SystemTableUtils::OptionalLocalDateTimePartsToTimestampMillis( + const std::optional>& parts) { + if (!parts) { + return std::optional(); + } + PAIMON_ASSIGN_OR_RAISE(int64_t timestamp_millis, + LocalDateTimePartsToTimestampMillis(parts.value())); + return std::optional(timestamp_millis); +} + +std::optional SystemTableUtils::OptionalDoubleToString( + const std::optional& value) { + if (!value) { + return std::optional(); + } + return std::to_string(value.value()); +} + VariantType SystemTableUtils::OptionalInt64Value(const std::optional& value) { if (!value) { return NullType(); @@ -70,14 +163,10 @@ VariantType SystemTableUtils::OptionalStringValue(const std::optional SystemTableUtils::LocalTimestampMillisValue(int64_t epoch_millis) { PAIMON_ASSIGN_OR_RAISE( Timestamp local_timestamp, @@ -91,6 +180,13 @@ Result SystemTableUtils::LocalTimestampMillisValue(const Timestamp& return LocalTimestampMillisValue(epoch_millis); } +VariantType SystemTableUtils::OptionalTimestampMillisValue(const std::optional& value) { + if (!value) { + return NullType(); + } + return TimestampMillisValue(value.value()); +} + Result SystemTableUtils::CreateCoreOptions(const SystemTableContext& context) { return CoreOptions::FromMap(context.options, context.fs); } @@ -134,4 +230,144 @@ Result> SystemTableUtils::ReadDataManifests( return manifests; } +Result> SystemTableUtils::ReadLatestManifestEntries( + const SystemTableContext& context, const std::shared_ptr& path_factory, + const CoreOptions& core_options, const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, LatestSnapshot(context)); + if (!snapshot) { + return std::vector(); + } + PAIMON_ASSIGN_OR_RAISE( + std::vector manifests, + ReadDataManifests(context, snapshot.value(), path_factory, core_options, pool)); + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr partition_schema, + FieldMapping::GetPartitionSchema(arrow_schema, context.table_schema->PartitionKeys())); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr manifest_file, + ManifestFile::Create(context.fs, core_options.GetManifestFormat(), + core_options.GetManifestCompression(), path_factory, + core_options.GetManifestTargetFileSize(), pool, + core_options, partition_schema)); + std::vector entries; + for (const auto& manifest : manifests) { + PAIMON_RETURN_NOT_OK( + manifest_file->Read(manifest.FileName(), /*filter=*/nullptr, &entries)); + } + return entries; +} + +Result> SystemTableUtils::ReadLatestDataFiles( + const SystemTableContext& context, const std::shared_ptr& path_factory, + const CoreOptions& core_options, const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE(std::vector entries, + ReadLatestManifestEntries(context, path_factory, core_options, pool)); + std::vector merged_entries; + PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(entries, &merged_entries)); + return merged_entries; +} + +std::optional SystemTableUtils::OptionalBinaryRowString(const BinaryRow& row) { + if (row.GetFieldCount() <= 0) { + return std::nullopt; + } + return row.ToString(); +} + +Result> SystemTableUtils::OptionalPartitionString( + const BinaryRow& row, const std::shared_ptr& partition_schema) { + if (row.GetFieldCount() <= 0) { + return std::optional(); + } + PAIMON_ASSIGN_OR_RAISE(std::string value, + BinaryRowPartitionComputer::PartToSimpleString( + partition_schema, row, ",", kMaxPartitionStatsLength)); + return std::optional(value); +} + +Result SystemTableUtils::OptionalPartitionStringValue( + const BinaryRow& row, const std::shared_ptr& partition_schema) { + PAIMON_ASSIGN_OR_RAISE(std::optional value, + OptionalPartitionString(row, partition_schema)); + return OptionalStringValue(value); +} + +Result SystemTableUtils::PartitionString( + const std::shared_ptr& path_factory, const BinaryRow& partition) { + PAIMON_ASSIGN_OR_RAISE(std::string value, path_factory->GetPartitionString(partition)); + return value; +} + +Result SystemTableUtils::FilePath( + const std::shared_ptr& path_factory, const ManifestEntry& entry, + const DataFileMeta& file) { + if (file.external_path) { + return file.external_path.value(); + } + PAIMON_ASSIGN_OR_RAISE(std::string bucket_path, + path_factory->BucketPath(entry.Partition(), entry.Bucket())); + return PathUtil::JoinPath(bucket_path, file.file_name); +} + +Result SystemTableUtils::FieldsValueMapString(const std::vector& fields, + const InternalRow& row) { + std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields); + PAIMON_ASSIGN_OR_RAISE(std::vector getters, + InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/false)); + std::vector values; + values.reserve(fields.size()); + for (size_t i = 0; i < fields.size(); ++i) { + std::string value = "null"; + if (!row.IsNullAt(i)) { + VariantType field_value = getters[i](row); + if (std::holds_alternative(field_value)) { + value = std::string(std::get(field_value)); + } else { + value = DataDefine::VariantValueToString(field_value); + } + } + values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + } + return fmt::format("{{{}}}", fmt::join(values, ", ")); +} + +Result SystemTableUtils::NullValueCountsString(const std::vector& fields, + const InternalArray& null_counts) { + std::vector values; + values.reserve(fields.size()); + for (size_t i = 0; i < fields.size(); ++i) { + std::string value = + null_counts.IsNullAt(i) ? "null" : std::to_string(null_counts.GetLong(i)); + values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + } + return fmt::format("{{{}}}", fmt::join(values, ", ")); +} + +Result> SystemTableUtils::LoadDataSchema( + const SystemTableContext& context, int64_t schema_id) { + if (schema_id == context.table_schema->Id()) { + return context.table_schema; + } + SchemaManager schema_manager(context.fs, context.table_path, context.branch); + return schema_manager.ReadSchema(schema_id); +} + +Result> SystemTableUtils::ValueStatsFields(const SystemTableContext& context, + int64_t schema_id) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, + LoadDataSchema(context, schema_id)); + return data_schema->Fields(); +} + +Result> SystemTableUtils::WriteColsValue( + const std::optional>& write_cols, + const std::shared_ptr& pool) { + if (!write_cols) { + return std::shared_ptr(); + } + return std::make_shared( + InternalRowUtils::ToNotNullStringArrayData(write_cols.value(), pool)); +} + } // namespace paimon diff --git a/src/paimon/core/table/system/system_table_utils.h b/src/paimon/core/table/system/system_table_utils.h index 63e744e24..437d3dbfe 100644 --- a/src/paimon/core/table/system/system_table_utils.h +++ b/src/paimon/core/table/system/system_table_utils.h @@ -24,11 +24,20 @@ #include #include +#include "paimon/common/data/data_define.h" +#include "paimon/common/types/data_field.h" +#include "paimon/common/utils/rapidjson_util.h" #include "paimon/core/core_options.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/manifest_entry.h" #include "paimon/core/manifest/manifest_file_meta.h" #include "paimon/core/snapshot.h" #include "paimon/data/timestamp.h" #include "paimon/result.h" +#include "paimon/status.h" +#include "rapidjson/document.h" +#include "rapidjson/stringbuffer.h" +#include "rapidjson/writer.h" namespace arrow { class Schema; @@ -64,11 +73,34 @@ class SystemTableUtils { std::string branch, std::shared_ptr table_schema, std::map options); + static std::map MergeOptions( + const std::shared_ptr& table_schema, + const std::map& dynamic_options); + static std::string DefaultBranch(); + static std::string LoadBranch(const std::map& options); + + template + static Result JsonString(const T& value) { + rapidjson::Document document; + auto json_value = RapidJsonUtil::SerializeValue(value, &document.GetAllocator()); + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + if (!json_value.Accept(writer)) { + return Status::Invalid("failed to serialize metadata system table value"); + } + return std::string(buffer.GetString(), buffer.GetSize()); + } + + static Result> OptionalLocalDateTimePartsToTimestampMillis( + const std::optional>& parts); + static std::optional OptionalDoubleToString(const std::optional& value); static VariantType OptionalInt64Value(const std::optional& value); static VariantType StringValue(const std::string& value); static VariantType OptionalStringValue(const std::optional& value); + static VariantType TimestampMillisValue(int64_t value); static Result LocalTimestampMillisValue(int64_t epoch_millis); static Result LocalTimestampMillisValue(const Timestamp& local_timestamp); + static VariantType OptionalTimestampMillisValue(const std::optional& value); static Result CreateCoreOptions(const SystemTableContext& context); static Result> CreatePathFactory( @@ -79,6 +111,38 @@ class SystemTableUtils { const SystemTableContext& context, const Snapshot& snapshot, const std::shared_ptr& path_factory, const CoreOptions& core_options, const std::shared_ptr& pool); + static Result> ReadLatestDataFiles( + const SystemTableContext& context, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool); + + static std::optional OptionalBinaryRowString(const BinaryRow& row); + static Result OptionalPartitionStringValue( + const BinaryRow& row, const std::shared_ptr& partition_schema); + static Result PartitionString( + const std::shared_ptr& path_factory, const BinaryRow& partition); + static Result FilePath(const std::shared_ptr& path_factory, + const ManifestEntry& entry, const DataFileMeta& file); + static Result FieldsValueMapString(const std::vector& fields, + const InternalRow& row); + static Result NullValueCountsString(const std::vector& fields, + const InternalArray& null_counts); + static Result> LoadDataSchema(const SystemTableContext& context, + int64_t schema_id); + static Result> ValueStatsFields(const SystemTableContext& context, + int64_t schema_id); + static Result> WriteColsValue( + const std::optional>& write_cols, + const std::shared_ptr& pool); + + private: + static Result LocalDateTimePartsToTimestampMillis(const std::vector& parts); + static Result> OptionalPartitionString( + const BinaryRow& row, const std::shared_ptr& partition_schema); + static Result> ReadLatestManifestEntries( + const SystemTableContext& context, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool); }; } // namespace paimon diff --git a/src/paimon/core/table/system/tags_system_table.cpp b/src/paimon/core/table/system/tags_system_table.cpp index 42d054fa8..e40158270 100644 --- a/src/paimon/core/table/system/tags_system_table.cpp +++ b/src/paimon/core/table/system/tags_system_table.cpp @@ -19,75 +19,10 @@ #include #include "arrow/api.h" -#include "paimon/common/data/data_define.h" -#include "paimon/common/utils/date_time_utils.h" #include "paimon/core/tag/tag.h" #include "paimon/core/utils/tag_manager.h" -#include "paimon/data/timestamp.h" -#include "paimon/status.h" namespace paimon { -namespace { - -Result LocalDateTimePartsToTimestampMillis(const std::vector& parts) { - if (parts.size() < 6) { - return Status::Invalid("tag create time requires at least 6 date-time fields"); - } - - int64_t year = parts[0]; - int64_t month = parts[1]; - int64_t day = parts[2]; - int64_t hour = parts[3]; - int64_t minute = parts[4]; - int64_t second = parts[5]; - int64_t nanos = parts.size() > 6 ? parts[6] : 0; - auto is_leap_year = [](int64_t value) { - return value % 4 == 0 && (value % 100 != 0 || value % 400 == 0); - }; - int64_t days_in_month[] = {31, is_leap_year(year) ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, - 31}; - if (month < 1 || month > 12 || day < 1 || day > days_in_month[month - 1] || hour < 0 || - hour > 23 || minute < 0 || minute > 59 || second < 0 || second > 59 || nanos < 0 || - nanos > 999999999) { - return Status::Invalid("invalid tag create time fields"); - } - - year -= month <= 2 ? 1 : 0; - int64_t era = (year >= 0 ? year : year - 399) / 400; - auto year_of_era = static_cast(year - era * 400); - auto month_prime = static_cast(month + (month > 2 ? -3 : 9)); - uint32_t day_of_year = (153 * month_prime + 2) / 5 + static_cast(day) - 1; - uint32_t day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; - int64_t epoch_day = era * 146097 + static_cast(day_of_era) - 719468; - return epoch_day * DateTimeUtils::MILLIS_PER_DAY + hour * 3600000 + minute * 60000 + - second * 1000 + nanos / 1000000; -} - -Result> OptionalLocalDateTimePartsToTimestampMillis( - const std::optional>& parts) { - if (!parts) { - return std::optional(); - } - PAIMON_ASSIGN_OR_RAISE(int64_t timestamp_millis, - LocalDateTimePartsToTimestampMillis(parts.value())); - return std::optional(timestamp_millis); -} - -std::optional OptionalDoubleToString(const std::optional& value) { - if (!value) { - return std::optional(); - } - return std::to_string(value.value()); -} - -VariantType OptionalTimestampMillisValue(const std::optional& value) { - if (!value) { - return NullType(); - } - return Timestamp::FromEpochMillis(value.value()); -} - -} // namespace TagsSystemTable::TagsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch) @@ -122,8 +57,9 @@ Result> TagsSystemTable::BuildRows() const { for (const auto& name : tag_names) { PAIMON_ASSIGN_OR_RAISE(Tag tag, tag_manager.GetOrThrow(name)); - PAIMON_ASSIGN_OR_RAISE(std::optional tag_create_time, - OptionalLocalDateTimePartsToTimestampMillis(tag.TagCreateTime())); + PAIMON_ASSIGN_OR_RAISE( + std::optional tag_create_time, + SystemTableUtils::OptionalLocalDateTimePartsToTimestampMillis(tag.TagCreateTime())); GenericRow row(schema->num_fields()); row.SetField(0, SystemTableUtils::StringValue(name)); row.SetField(1, tag.Id()); @@ -132,9 +68,9 @@ Result> TagsSystemTable::BuildRows() const { SystemTableUtils::LocalTimestampMillisValue(tag.TimeMillis())); row.SetField(3, commit_time); row.SetField(4, SystemTableUtils::OptionalInt64Value(tag.TotalRecordCount())); - row.SetField(5, OptionalTimestampMillisValue(tag_create_time)); + row.SetField(5, SystemTableUtils::OptionalTimestampMillisValue(tag_create_time)); row.SetField(6, SystemTableUtils::OptionalStringValue( - OptionalDoubleToString(tag.TagTimeRetained()))); + SystemTableUtils::OptionalDoubleToString(tag.TagTimeRetained()))); rows.push_back(std::move(row)); } From ff9be72d23090d4473ca786f4b7f8c0f3a2f6194 Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 28 May 2026 10:20:06 +0800 Subject: [PATCH 5/8] Revert "Refactor system table implementations" This reverts commit 97eddb9c437f596519c9d076fbd1fa706c4bdb98. --- src/paimon/CMakeLists.txt | 10 +- .../table/system/branches_system_table.cpp | 67 -- .../core/table/system/branches_system_table.h | 44 - .../table/system/consumers_system_table.cpp | 60 -- .../table/system/consumers_system_table.h | 45 - .../core/table/system/files_system_table.cpp | 161 ---- .../core/table/system/files_system_table.h | 48 - .../table/system/manifests_system_table.cpp | 105 --- .../table/system/manifests_system_table.h | 48 - .../table/system/metadata_system_tables.cpp | 847 ++++++++++++++++++ .../table/system/metadata_system_tables.h | 165 ++++ .../table/system/options_system_table.cpp | 53 -- .../core/table/system/options_system_table.h | 43 - .../table/system/schemas_system_table.cpp | 87 -- .../core/table/system/schemas_system_table.h | 44 - .../table/system/snapshots_system_table.cpp | 92 -- .../table/system/snapshots_system_table.h | 45 - src/paimon/core/table/system/system_table.cpp | 69 +- .../core/table/system/system_table_utils.cpp | 373 -------- .../core/table/system/system_table_utils.h | 148 --- .../core/table/system/tags_system_table.cpp | 80 -- .../core/table/system/tags_system_table.h | 44 - 22 files changed, 1047 insertions(+), 1631 deletions(-) delete mode 100644 src/paimon/core/table/system/branches_system_table.cpp delete mode 100644 src/paimon/core/table/system/branches_system_table.h delete mode 100644 src/paimon/core/table/system/consumers_system_table.cpp delete mode 100644 src/paimon/core/table/system/consumers_system_table.h delete mode 100644 src/paimon/core/table/system/files_system_table.cpp delete mode 100644 src/paimon/core/table/system/files_system_table.h delete mode 100644 src/paimon/core/table/system/manifests_system_table.cpp delete mode 100644 src/paimon/core/table/system/manifests_system_table.h create mode 100644 src/paimon/core/table/system/metadata_system_tables.cpp create mode 100644 src/paimon/core/table/system/metadata_system_tables.h delete mode 100644 src/paimon/core/table/system/options_system_table.cpp delete mode 100644 src/paimon/core/table/system/options_system_table.h delete mode 100644 src/paimon/core/table/system/schemas_system_table.cpp delete mode 100644 src/paimon/core/table/system/schemas_system_table.h delete mode 100644 src/paimon/core/table/system/snapshots_system_table.cpp delete mode 100644 src/paimon/core/table/system/snapshots_system_table.h delete mode 100644 src/paimon/core/table/system/system_table_utils.cpp delete mode 100644 src/paimon/core/table/system/system_table_utils.h delete mode 100644 src/paimon/core/table/system/tags_system_table.cpp delete mode 100644 src/paimon/core/table/system/tags_system_table.h diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index edc9681e6..a517184a4 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -320,19 +320,11 @@ set(PAIMON_CORE_SRCS core/table/source/data_evolution_batch_scan.cpp core/table/system/audit_log_system_table.cpp core/table/system/binlog_system_table.cpp - core/table/system/branches_system_table.cpp - core/table/system/consumers_system_table.cpp - core/table/system/files_system_table.cpp core/table/system/in_memory_system_table.cpp - core/table/system/manifests_system_table.cpp - core/table/system/options_system_table.cpp - core/table/system/schemas_system_table.cpp - core/table/system/snapshots_system_table.cpp + core/table/system/metadata_system_tables.cpp core/table/system/system_table.cpp core/table/system/system_table_scan.cpp core/table/system/system_table_schema.cpp - core/table/system/system_table_utils.cpp - core/table/system/tags_system_table.cpp core/tag/tag.cpp core/utils/branch_manager.cpp core/utils/consumer_manager.cpp diff --git a/src/paimon/core/table/system/branches_system_table.cpp b/src/paimon/core/table/system/branches_system_table.cpp deleted file mode 100644 index a99553283..000000000 --- a/src/paimon/core/table/system/branches_system_table.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/core/table/system/branches_system_table.h" - -#include - -#include "arrow/api.h" -#include "paimon/core/utils/branch_manager.h" -#include "paimon/fs/file_system.h" - -namespace paimon { - -BranchesSystemTable::BranchesSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch) - : InMemorySystemTable(table_path), - context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), - std::move(branch))) {} - -std::string BranchesSystemTable::Name() const { - return kName; -} - -Result> BranchesSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("branch_name", arrow::utf8(), /*nullable=*/false), - arrow::field("create_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/false), - }); -} - -Result> BranchesSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - PAIMON_ASSIGN_OR_RAISE(std::vector branches, - BranchManager::ListBranches(context_.fs, context_.table_path)); - std::vector rows; - rows.reserve(branches.size()); - - for (const auto& name : branches) { - PAIMON_ASSIGN_OR_RAISE( - std::unique_ptr branch_status, - context_.fs->GetFileStatus(BranchManager::BranchPath(context_.table_path, name))); - GenericRow row(schema->num_fields()); - row.SetField(0, SystemTableUtils::StringValue(name)); - PAIMON_ASSIGN_OR_RAISE(VariantType create_time, SystemTableUtils::LocalTimestampMillisValue( - branch_status->GetModificationTime())); - row.SetField(1, create_time); - rows.push_back(std::move(row)); - } - - return rows; -} - -} // namespace paimon diff --git a/src/paimon/core/table/system/branches_system_table.h b/src/paimon/core/table/system/branches_system_table.h deleted file mode 100644 index 7968b1c63..000000000 --- a/src/paimon/core/table/system/branches_system_table.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include "paimon/core/table/system/in_memory_system_table.h" -#include "paimon/core/table/system/system_table_utils.h" - -namespace paimon { -class FileSystem; - -/// System table for `T$branches`, exposing table branches including `main`. -class BranchesSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "branches"; - - BranchesSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - SystemTableContext context_; -}; - -} // namespace paimon diff --git a/src/paimon/core/table/system/consumers_system_table.cpp b/src/paimon/core/table/system/consumers_system_table.cpp deleted file mode 100644 index 1c0fac998..000000000 --- a/src/paimon/core/table/system/consumers_system_table.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/core/table/system/consumers_system_table.h" - -#include - -#include "arrow/api.h" -#include "paimon/core/utils/consumer_manager.h" - -namespace paimon { - -ConsumersSystemTable::ConsumersSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch) - : InMemorySystemTable(table_path), - context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), - std::move(branch))) {} - -std::string ConsumersSystemTable::Name() const { - return kName; -} - -Result> ConsumersSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("consumer_id", arrow::utf8(), /*nullable=*/false), - arrow::field("next_snapshot_id", arrow::int64(), /*nullable=*/false), - }); -} - -Result> ConsumersSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - ConsumerManager consumer_manager(context_.fs, context_.table_path, context_.branch); - PAIMON_ASSIGN_OR_RAISE(auto consumers, consumer_manager.Consumers()); - std::vector rows; - rows.reserve(consumers.size()); - - for (const auto& [id, snapshot_id] : consumers) { - GenericRow row(schema->num_fields()); - row.SetField(0, SystemTableUtils::StringValue(id)); - row.SetField(1, snapshot_id); - rows.push_back(std::move(row)); - } - - return rows; -} - -} // namespace paimon diff --git a/src/paimon/core/table/system/consumers_system_table.h b/src/paimon/core/table/system/consumers_system_table.h deleted file mode 100644 index 13761d904..000000000 --- a/src/paimon/core/table/system/consumers_system_table.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include "paimon/core/table/system/in_memory_system_table.h" -#include "paimon/core/table/system/system_table_utils.h" - -namespace paimon { -class FileSystem; - -/// System table for `T$consumers`, exposing persisted streaming consumer offsets. -class ConsumersSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "consumers"; - - ConsumersSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - SystemTableContext context_; -}; - -} // namespace paimon diff --git a/src/paimon/core/table/system/files_system_table.cpp b/src/paimon/core/table/system/files_system_table.cpp deleted file mode 100644 index 7effaec6a..000000000 --- a/src/paimon/core/table/system/files_system_table.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/core/table/system/files_system_table.h" - -#include - -#include "arrow/api.h" -#include "paimon/common/data/data_define.h" -#include "paimon/common/data/internal_array.h" -#include "paimon/common/types/data_field.h" -#include "paimon/core/core_options.h" -#include "paimon/core/io/data_file_meta.h" -#include "paimon/core/manifest/file_kind.h" -#include "paimon/core/manifest/manifest_entry.h" -#include "paimon/core/schema/table_schema.h" -#include "paimon/core/stats/simple_stats_evolutions.h" -#include "paimon/core/utils/field_mapping.h" -#include "paimon/core/utils/file_store_path_factory.h" -#include "paimon/memory/memory_pool.h" - -namespace paimon { - -FilesSystemTable::FilesSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch, std::shared_ptr table_schema, - std::map options) - : InMemorySystemTable(table_path), - context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), - std::move(branch), std::move(table_schema), - std::move(options))) {} - -std::string FilesSystemTable::Name() const { - return kName; -} - -Result> FilesSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("partition", arrow::utf8(), /*nullable=*/true), - arrow::field("bucket", arrow::int32(), /*nullable=*/false), - arrow::field("file_path", arrow::utf8(), /*nullable=*/false), - arrow::field("file_format", arrow::utf8(), /*nullable=*/false), - arrow::field("schema_id", arrow::int64(), /*nullable=*/false), - arrow::field("level", arrow::int32(), /*nullable=*/false), - arrow::field("record_count", arrow::int64(), /*nullable=*/false), - arrow::field("file_size_in_bytes", arrow::int64(), /*nullable=*/false), - arrow::field("min_key", arrow::utf8(), /*nullable=*/true), - arrow::field("max_key", arrow::utf8(), /*nullable=*/true), - arrow::field("null_value_counts", arrow::utf8(), /*nullable=*/false), - arrow::field("min_value_stats", arrow::utf8(), /*nullable=*/false), - arrow::field("max_value_stats", arrow::utf8(), /*nullable=*/false), - arrow::field("min_sequence_number", arrow::int64(), /*nullable=*/true), - arrow::field("max_sequence_number", arrow::int64(), /*nullable=*/true), - arrow::field("creation_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/true), - arrow::field("deleteRowCount", arrow::int64(), /*nullable=*/true), - arrow::field("file_source", arrow::utf8(), /*nullable=*/true), - arrow::field("first_row_id", arrow::int64(), /*nullable=*/true), - arrow::field("write_cols", arrow::list(arrow::utf8()), /*nullable=*/true), - }); -} - -Result> FilesSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - std::shared_ptr pool = GetDefaultPool(); - PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, SystemTableUtils::CreateCoreOptions(context_)); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, - SystemTableUtils::CreatePathFactory(context_, core_options, pool)); - PAIMON_ASSIGN_OR_RAISE( - std::vector entries, - SystemTableUtils::ReadLatestDataFiles(context_, path_factory, core_options, pool)); - std::shared_ptr arrow_schema = - DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr partition_schema, - FieldMapping::GetPartitionSchema(arrow_schema, context_.table_schema->PartitionKeys())); - - SimpleStatsEvolutions stats_evolutions(context_.table_schema, pool); - std::vector rows; - rows.reserve(entries.size()); - for (const auto& entry : entries) { - if (!(entry.Kind() == FileKind::Add())) { - continue; - } - - const std::shared_ptr& file = entry.File(); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, - SystemTableUtils::LoadDataSchema(context_, file->schema_id)); - PAIMON_ASSIGN_OR_RAISE(std::vector value_stats_fields, - SystemTableUtils::ValueStatsFields(context_, file->schema_id)); - std::shared_ptr stats_evolution = - stats_evolutions.GetOrCreate(data_schema); - PAIMON_ASSIGN_OR_RAISE( - SimpleStatsEvolution::EvolutionStats stats, - stats_evolution->Evolution(file->value_stats, file->row_count, file->value_stats_cols)); - - GenericRow row(schema->num_fields()); - if (context_.table_schema->PartitionKeys().empty()) { - row.SetField(0, NullType()); - } else { - PAIMON_ASSIGN_OR_RAISE(std::string partition, SystemTableUtils::PartitionString( - path_factory, entry.Partition())); - row.SetField(0, SystemTableUtils::StringValue(partition)); - } - row.SetField(1, entry.Bucket()); - PAIMON_ASSIGN_OR_RAISE(std::string file_path, - SystemTableUtils::FilePath(path_factory, entry, *file)); - row.SetField(2, SystemTableUtils::StringValue(file_path)); - PAIMON_ASSIGN_OR_RAISE(std::string file_format, file->FileFormat()); - row.SetField(3, SystemTableUtils::StringValue(file_format)); - row.SetField(4, file->schema_id); - row.SetField(5, file->level); - row.SetField(6, file->row_count); - row.SetField(7, file->file_size); - row.SetField(8, SystemTableUtils::OptionalStringValue( - SystemTableUtils::OptionalBinaryRowString(file->min_key))); - row.SetField(9, SystemTableUtils::OptionalStringValue( - SystemTableUtils::OptionalBinaryRowString(file->max_key))); - PAIMON_ASSIGN_OR_RAISE( - std::string null_value_counts, - SystemTableUtils::NullValueCountsString(value_stats_fields, *stats.null_counts)); - row.SetField(10, SystemTableUtils::StringValue(null_value_counts)); - PAIMON_ASSIGN_OR_RAISE( - std::string min_value_stats, - SystemTableUtils::FieldsValueMapString(value_stats_fields, *stats.min_values)); - row.SetField(11, SystemTableUtils::StringValue(min_value_stats)); - PAIMON_ASSIGN_OR_RAISE( - std::string max_value_stats, - SystemTableUtils::FieldsValueMapString(value_stats_fields, *stats.max_values)); - row.SetField(12, SystemTableUtils::StringValue(max_value_stats)); - row.SetField(13, file->min_sequence_number); - row.SetField(14, file->max_sequence_number); - PAIMON_ASSIGN_OR_RAISE(VariantType creation_time, - SystemTableUtils::LocalTimestampMillisValue(file->creation_time)); - row.SetField(15, creation_time); - row.SetField(16, SystemTableUtils::OptionalInt64Value(file->delete_row_count)); - row.SetField(17, file->file_source - ? SystemTableUtils::StringValue(file->file_source.value().ToString()) - : VariantType(NullType())); - row.SetField(18, SystemTableUtils::OptionalInt64Value(file->first_row_id)); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr write_cols, - SystemTableUtils::WriteColsValue(file->write_cols, pool)); - row.SetField(19, write_cols ? VariantType(write_cols) : VariantType(NullType())); - rows.push_back(std::move(row)); - } - return rows; -} - -} // namespace paimon diff --git a/src/paimon/core/table/system/files_system_table.h b/src/paimon/core/table/system/files_system_table.h deleted file mode 100644 index 9a0e0ee8a..000000000 --- a/src/paimon/core/table/system/files_system_table.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include "paimon/core/table/system/in_memory_system_table.h" -#include "paimon/core/table/system/system_table_utils.h" - -namespace paimon { -class FileSystem; -class TableSchema; - -/// System table for `T$files`, exposing data file metadata in the latest snapshot. -class FilesSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "files"; - - FilesSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, - std::shared_ptr table_schema, - std::map options); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - SystemTableContext context_; -}; - -} // namespace paimon diff --git a/src/paimon/core/table/system/manifests_system_table.cpp b/src/paimon/core/table/system/manifests_system_table.cpp deleted file mode 100644 index a67b5486c..000000000 --- a/src/paimon/core/table/system/manifests_system_table.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/core/table/system/manifests_system_table.h" - -#include - -#include "arrow/api.h" -#include "paimon/common/types/data_field.h" -#include "paimon/core/core_options.h" -#include "paimon/core/manifest/manifest_file_meta.h" -#include "paimon/core/schema/table_schema.h" -#include "paimon/core/snapshot.h" -#include "paimon/core/utils/field_mapping.h" -#include "paimon/core/utils/file_store_path_factory.h" -#include "paimon/memory/memory_pool.h" - -namespace paimon { - -ManifestsSystemTable::ManifestsSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch, - std::shared_ptr table_schema, - std::map options) - : InMemorySystemTable(table_path), - context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), - std::move(branch), std::move(table_schema), - std::move(options))) {} - -std::string ManifestsSystemTable::Name() const { - return kName; -} - -Result> ManifestsSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("file_name", arrow::utf8(), /*nullable=*/false), - arrow::field("file_size", arrow::int64(), /*nullable=*/false), - arrow::field("num_added_files", arrow::int64(), /*nullable=*/false), - arrow::field("num_deleted_files", arrow::int64(), /*nullable=*/false), - arrow::field("schema_id", arrow::int64(), /*nullable=*/false), - arrow::field("min_partition_stats", arrow::utf8(), /*nullable=*/true), - arrow::field("max_partition_stats", arrow::utf8(), /*nullable=*/true), - arrow::field("min_row_id", arrow::int64(), /*nullable=*/true), - arrow::field("max_row_id", arrow::int64(), /*nullable=*/true), - }); -} - -Result> ManifestsSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, - SystemTableUtils::LatestSnapshot(context_)); - if (!snapshot) { - return std::vector(); - } - - std::shared_ptr pool = GetDefaultPool(); - PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, SystemTableUtils::CreateCoreOptions(context_)); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, - SystemTableUtils::CreatePathFactory(context_, core_options, pool)); - PAIMON_ASSIGN_OR_RAISE(std::vector manifests, - SystemTableUtils::ReadDataManifests(context_, snapshot.value(), - path_factory, core_options, pool)); - std::shared_ptr arrow_schema = - DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr partition_schema, - FieldMapping::GetPartitionSchema(arrow_schema, context_.table_schema->PartitionKeys())); - - std::vector rows; - rows.reserve(manifests.size()); - for (const auto& manifest : manifests) { - GenericRow row(schema->num_fields()); - row.SetField(0, SystemTableUtils::StringValue(manifest.FileName())); - row.SetField(1, manifest.FileSize()); - row.SetField(2, manifest.NumAddedFiles()); - row.SetField(3, manifest.NumDeletedFiles()); - row.SetField(4, manifest.SchemaId()); - PAIMON_ASSIGN_OR_RAISE(VariantType min_partition, - SystemTableUtils::OptionalPartitionStringValue( - manifest.PartitionStats().MinValues(), partition_schema)); - PAIMON_ASSIGN_OR_RAISE(VariantType max_partition, - SystemTableUtils::OptionalPartitionStringValue( - manifest.PartitionStats().MaxValues(), partition_schema)); - row.SetField(5, min_partition); - row.SetField(6, max_partition); - row.SetField(7, SystemTableUtils::OptionalInt64Value(manifest.MinRowId())); - row.SetField(8, SystemTableUtils::OptionalInt64Value(manifest.MaxRowId())); - rows.push_back(std::move(row)); - } - return rows; -} - -} // namespace paimon diff --git a/src/paimon/core/table/system/manifests_system_table.h b/src/paimon/core/table/system/manifests_system_table.h deleted file mode 100644 index 20f1bf189..000000000 --- a/src/paimon/core/table/system/manifests_system_table.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include "paimon/core/table/system/in_memory_system_table.h" -#include "paimon/core/table/system/system_table_utils.h" - -namespace paimon { -class FileSystem; -class TableSchema; - -/// System table for `T$manifests`, exposing data manifest metadata in the latest snapshot. -class ManifestsSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "manifests"; - - ManifestsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, - std::shared_ptr table_schema, - std::map options); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - SystemTableContext context_; -}; - -} // namespace paimon diff --git a/src/paimon/core/table/system/metadata_system_tables.cpp b/src/paimon/core/table/system/metadata_system_tables.cpp new file mode 100644 index 000000000..3fdf687ee --- /dev/null +++ b/src/paimon/core/table/system/metadata_system_tables.cpp @@ -0,0 +1,847 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/table/system/metadata_system_tables.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fmt/format.h" +#include "fmt/ranges.h" +#include "paimon/common/data/binary_string.h" +#include "paimon/common/data/data_define.h" +#include "paimon/common/data/generic_row.h" +#include "paimon/common/data/internal_array.h" +#include "paimon/common/data/internal_row.h" +#include "paimon/common/table/special_fields.h" +#include "paimon/common/types/data_field.h" +#include "paimon/common/utils/binary_row_partition_computer.h" +#include "paimon/common/utils/date_time_utils.h" +#include "paimon/common/utils/internal_row_utils.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/common/utils/rapidjson_util.h" +#include "paimon/core/core_options.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/file_entry.h" +#include "paimon/core/manifest/file_kind.h" +#include "paimon/core/manifest/manifest_entry.h" +#include "paimon/core/manifest/manifest_file.h" +#include "paimon/core/manifest/manifest_file_meta.h" +#include "paimon/core/manifest/manifest_list.h" +#include "paimon/core/schema/schema_manager.h" +#include "paimon/core/schema/table_schema.h" +#include "paimon/core/snapshot.h" +#include "paimon/core/stats/simple_stats_evolutions.h" +#include "paimon/core/tag/tag.h" +#include "paimon/core/utils/branch_manager.h" +#include "paimon/core/utils/consumer_manager.h" +#include "paimon/core/utils/field_mapping.h" +#include "paimon/core/utils/file_store_path_factory.h" +#include "paimon/core/utils/snapshot_manager.h" +#include "paimon/core/utils/tag_manager.h" +#include "paimon/data/timestamp.h" +#include "paimon/fs/file_system.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/status.h" +#include "rapidjson/document.h" +#include "rapidjson/stringbuffer.h" +#include "rapidjson/writer.h" + +namespace paimon { +namespace { + +constexpr int32_t kMaxPartitionStatsLength = 255; + +template +Result JsonString(const T& value) { + rapidjson::Document document; + auto json_value = RapidJsonUtil::SerializeValue(value, &document.GetAllocator()); + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + if (!json_value.Accept(writer)) { + return Status::Invalid("failed to serialize metadata system table value"); + } + return std::string(buffer.GetString(), buffer.GetSize()); +} + +Result LocalDateTimePartsToTimestampMillis(const std::vector& parts) { + if (parts.size() < 6) { + return Status::Invalid("tag create time requires at least 6 date-time fields"); + } + + int64_t year = parts[0]; + int64_t month = parts[1]; + int64_t day = parts[2]; + int64_t hour = parts[3]; + int64_t minute = parts[4]; + int64_t second = parts[5]; + int64_t nanos = parts.size() > 6 ? parts[6] : 0; + auto is_leap_year = [](int64_t value) { + return value % 4 == 0 && (value % 100 != 0 || value % 400 == 0); + }; + int64_t days_in_month[] = {31, is_leap_year(year) ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, + 31}; + if (month < 1 || month > 12 || day < 1 || day > days_in_month[month - 1] || hour < 0 || + hour > 23 || minute < 0 || minute > 59 || second < 0 || second > 59 || nanos < 0 || + nanos > 999999999) { + return Status::Invalid("invalid tag create time fields"); + } + + year -= month <= 2 ? 1 : 0; + int64_t era = (year >= 0 ? year : year - 399) / 400; + auto year_of_era = static_cast(year - era * 400); + auto month_prime = static_cast(month + (month > 2 ? -3 : 9)); + uint32_t day_of_year = (153 * month_prime + 2) / 5 + static_cast(day) - 1; + uint32_t day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; + int64_t epoch_day = era * 146097 + static_cast(day_of_era) - 719468; + return epoch_day * DateTimeUtils::MILLIS_PER_DAY + hour * 3600000 + minute * 60000 + + second * 1000 + nanos / 1000000; +} + +Result> OptionalLocalDateTimePartsToTimestampMillis( + const std::optional>& parts) { + if (!parts) { + return std::optional(); + } + PAIMON_ASSIGN_OR_RAISE(int64_t timestamp_millis, + LocalDateTimePartsToTimestampMillis(parts.value())); + return std::optional(timestamp_millis); +} + +std::optional OptionalDoubleToString(const std::optional& value) { + if (!value) { + return std::optional(); + } + return std::to_string(value.value()); +} + +VariantType OptionalInt64Value(const std::optional& value) { + if (!value) { + return NullType(); + } + return value.value(); +} + +VariantType StringValue(const std::string& value) { + return BinaryString::FromString(value, GetDefaultPool().get()); +} + +VariantType OptionalStringValue(const std::optional& value) { + if (!value) { + return NullType(); + } + return StringValue(value.value()); +} + +VariantType TimestampMillisValue(int64_t value) { + return Timestamp::FromEpochMillis(value); +} + +Result LocalTimestampMillisValue(int64_t epoch_millis) { + PAIMON_ASSIGN_OR_RAISE( + Timestamp local_timestamp, + DateTimeUtils::ToLocalTimestamp(Timestamp::FromEpochMillis(epoch_millis))); + return TimestampMillisValue(local_timestamp.GetMillisecond()); +} + +Result LocalTimestampMillisValue(const Timestamp& local_timestamp) { + PAIMON_ASSIGN_OR_RAISE(Timestamp utc_timestamp, DateTimeUtils::ToUTCTimestamp(local_timestamp)); + int64_t epoch_millis = utc_timestamp.GetMillisecond(); + return LocalTimestampMillisValue(epoch_millis); +} + +VariantType OptionalTimestampMillisValue(const std::optional& value) { + if (!value) { + return NullType(); + } + return TimestampMillisValue(value.value()); +} + +MetadataSystemTableContext CreateMetadataContext(std::shared_ptr fs, + std::string table_path, std::string branch) { + return { + std::move(fs), std::move(table_path), BranchManager::NormalizeBranch(branch), nullptr, {}, + }; +} + +MetadataSystemTableContext CreateMetadataContext(std::shared_ptr fs, + std::string table_path, std::string branch, + std::shared_ptr table_schema, + std::map options) { + return { + std::move(fs), std::move(table_path), BranchManager::NormalizeBranch(branch), + std::move(table_schema), std::move(options), + }; +} + +Result CreateCoreOptions(const MetadataSystemTableContext& context) { + return CoreOptions::FromMap(context.options, context.fs); +} + +Result> CreatePathFactory( + const MetadataSystemTableContext& context, const CoreOptions& core_options, + const std::shared_ptr& pool) { + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, + core_options.CreateExternalPaths()); + PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, + core_options.CreateGlobalIndexExternalPath()); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr path_factory, + FileStorePathFactory::Create( + context.table_path, arrow_schema, context.table_schema->PartitionKeys(), + core_options.GetPartitionDefaultName(), core_options.GetFileFormat()->Identifier(), + core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), + external_paths, global_index_external_path, core_options.IndexFileInDataFileDir(), + pool)); + return std::shared_ptr(std::move(path_factory)); +} + +Result> LatestSnapshot(const MetadataSystemTableContext& context) { + SnapshotManager snapshot_manager(context.fs, context.table_path, context.branch); + return snapshot_manager.LatestSnapshot(); +} + +Result> ReadDataManifests( + const MetadataSystemTableContext& context, const Snapshot& snapshot, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr manifest_list, + ManifestList::Create(context.fs, core_options.GetManifestFormat(), + core_options.GetManifestCompression(), path_factory, pool)); + std::vector manifests; + PAIMON_RETURN_NOT_OK(manifest_list->ReadDataManifests(snapshot, &manifests)); + return manifests; +} + +Result> CreateManifestFile( + const MetadataSystemTableContext& context, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool) { + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr partition_schema, + FieldMapping::GetPartitionSchema(arrow_schema, context.table_schema->PartitionKeys())); + return ManifestFile::Create(context.fs, core_options.GetManifestFormat(), + core_options.GetManifestCompression(), path_factory, + core_options.GetManifestTargetFileSize(), pool, core_options, + partition_schema); +} + +Result> ReadLatestManifestEntries( + const MetadataSystemTableContext& context, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, LatestSnapshot(context)); + if (!snapshot) { + return std::vector(); + } + PAIMON_ASSIGN_OR_RAISE( + std::vector manifests, + ReadDataManifests(context, snapshot.value(), path_factory, core_options, pool)); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr manifest_file, + CreateManifestFile(context, path_factory, core_options, pool)); + std::vector entries; + for (const auto& manifest : manifests) { + PAIMON_RETURN_NOT_OK( + manifest_file->Read(manifest.FileName(), /*filter=*/nullptr, &entries)); + } + return entries; +} + +Result> ReadLatestDataFiles( + const MetadataSystemTableContext& context, + const std::shared_ptr& path_factory, const CoreOptions& core_options, + const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE(std::vector entries, + ReadLatestManifestEntries(context, path_factory, core_options, pool)); + std::vector merged_entries; + PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(entries, &merged_entries)); + return merged_entries; +} + +std::optional OptionalBinaryRowString(const BinaryRow& row) { + if (row.GetFieldCount() <= 0) { + return std::nullopt; + } + return row.ToString(); +} + +Result> OptionalPartitionString( + const BinaryRow& row, const std::shared_ptr& partition_schema) { + if (row.GetFieldCount() <= 0) { + return std::optional(); + } + PAIMON_ASSIGN_OR_RAISE(std::string value, + BinaryRowPartitionComputer::PartToSimpleString( + partition_schema, row, ",", kMaxPartitionStatsLength)); + return std::optional(value); +} + +Result OptionalPartitionStringValue( + const BinaryRow& row, const std::shared_ptr& partition_schema) { + PAIMON_ASSIGN_OR_RAISE(std::optional value, + OptionalPartitionString(row, partition_schema)); + return OptionalStringValue(value); +} + +Result PartitionString(const std::shared_ptr& path_factory, + const BinaryRow& partition) { + PAIMON_ASSIGN_OR_RAISE(std::string value, path_factory->GetPartitionString(partition)); + return value; +} + +Result FilePath(const std::shared_ptr& path_factory, + const ManifestEntry& entry, const DataFileMeta& file) { + if (file.external_path) { + return file.external_path.value(); + } + PAIMON_ASSIGN_OR_RAISE(std::string bucket_path, + path_factory->BucketPath(entry.Partition(), entry.Bucket())); + return PathUtil::JoinPath(bucket_path, file.file_name); +} + +Result FieldsValueMapString(const std::vector& fields, + const InternalRow& row) { + std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields); + PAIMON_ASSIGN_OR_RAISE(std::vector getters, + InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/false)); + std::vector values; + values.reserve(fields.size()); + for (size_t i = 0; i < fields.size(); ++i) { + std::string value = "null"; + if (!row.IsNullAt(i)) { + VariantType field_value = getters[i](row); + if (std::holds_alternative(field_value)) { + value = std::string(std::get(field_value)); + } else { + value = DataDefine::VariantValueToString(field_value); + } + } + values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + } + return fmt::format("{{{}}}", fmt::join(values, ", ")); +} + +Result NullValueCountsString(const std::vector& fields, + const InternalArray& null_counts) { + std::vector values; + values.reserve(fields.size()); + for (size_t i = 0; i < fields.size(); ++i) { + std::string value = + null_counts.IsNullAt(i) ? "null" : std::to_string(null_counts.GetLong(i)); + values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + } + return fmt::format("{{{}}}", fmt::join(values, ", ")); +} + +Result> StatsFields(const std::shared_ptr& schema) { + return schema->Fields(); +} + +Result> LoadDataSchema(const MetadataSystemTableContext& context, + int64_t schema_id) { + if (schema_id == context.table_schema->Id()) { + return context.table_schema; + } + SchemaManager schema_manager(context.fs, context.table_path, context.branch); + return schema_manager.ReadSchema(schema_id); +} + +Result> ValueStatsFields(const MetadataSystemTableContext& context, + int64_t schema_id) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, + LoadDataSchema(context, schema_id)); + PAIMON_ASSIGN_OR_RAISE(std::vector fields, StatsFields(data_schema)); + return fields; +} + +Result> WriteColsValue( + const std::optional>& write_cols, + const std::shared_ptr& pool) { + if (!write_cols) { + return std::shared_ptr(); + } + return std::make_shared( + InternalRowUtils::ToNotNullStringArrayData(write_cols.value(), pool)); +} + +} // namespace + +OptionsSystemTable::OptionsSystemTable(std::string table_path, + std::shared_ptr table_schema) + : InMemorySystemTable(std::move(table_path)), table_schema_(std::move(table_schema)) {} + +std::string OptionsSystemTable::Name() const { + return kName; +} + +Result> OptionsSystemTable::ArrowSchema() const { + return arrow::schema({arrow::field("key", arrow::utf8(), /*nullable=*/false), + arrow::field("value", arrow::utf8(), /*nullable=*/false)}); +} + +Result> OptionsSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + std::vector rows; + rows.reserve(table_schema_->Options().size()); + for (const auto& [key, value] : table_schema_->Options()) { + GenericRow row(schema->num_fields()); + row.SetField(0, std::string_view(key)); + row.SetField(1, std::string_view(value)); + rows.push_back(std::move(row)); + } + return rows; +} + +SnapshotsSystemTable::SnapshotsSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch) + : InMemorySystemTable(table_path), + context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch))) {} + +std::string SnapshotsSystemTable::Name() const { + return kName; +} + +Result> SnapshotsSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("snapshot_id", arrow::int64(), /*nullable=*/false), + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("commit_user", arrow::utf8(), /*nullable=*/false), + arrow::field("commit_identifier", arrow::int64(), /*nullable=*/false), + arrow::field("commit_kind", arrow::utf8(), /*nullable=*/false), + arrow::field("commit_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/false), + arrow::field("base_manifest_list", arrow::utf8(), /*nullable=*/false), + arrow::field("delta_manifest_list", arrow::utf8(), /*nullable=*/false), + arrow::field("changelog_manifest_list", arrow::utf8(), /*nullable=*/true), + arrow::field("total_record_count", arrow::int64(), /*nullable=*/true), + arrow::field("delta_record_count", arrow::int64(), /*nullable=*/true), + arrow::field("changelog_record_count", arrow::int64(), /*nullable=*/true), + arrow::field("watermark", arrow::int64(), /*nullable=*/true), + arrow::field("next_row_id", arrow::int64(), /*nullable=*/true), + }); +} + +Result> SnapshotsSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + SnapshotManager snapshot_manager(context_.fs, context_.table_path, context_.branch); + PAIMON_ASSIGN_OR_RAISE(std::vector snapshots, snapshot_manager.GetAllSnapshots()); + std::sort(snapshots.begin(), snapshots.end(), + [](const Snapshot& lhs, const Snapshot& rhs) { return lhs.Id() < rhs.Id(); }); + std::vector rows; + rows.reserve(snapshots.size()); + + for (const auto& snapshot : snapshots) { + GenericRow row(schema->num_fields()); + row.SetField(0, snapshot.Id()); + row.SetField(1, snapshot.SchemaId()); + row.SetField(2, StringValue(snapshot.CommitUser())); + row.SetField(3, snapshot.CommitIdentifier()); + row.SetField(4, StringValue(Snapshot::CommitKind::ToString(snapshot.GetCommitKind()))); + PAIMON_ASSIGN_OR_RAISE(VariantType commit_time, + LocalTimestampMillisValue(snapshot.TimeMillis())); + row.SetField(5, commit_time); + row.SetField(6, StringValue(snapshot.BaseManifestList())); + row.SetField(7, StringValue(snapshot.DeltaManifestList())); + row.SetField(8, OptionalStringValue(snapshot.ChangelogManifestList())); + row.SetField(9, OptionalInt64Value(snapshot.TotalRecordCount())); + row.SetField(10, OptionalInt64Value(snapshot.DeltaRecordCount())); + row.SetField(11, OptionalInt64Value(snapshot.ChangelogRecordCount())); + row.SetField(12, OptionalInt64Value(snapshot.Watermark())); + row.SetField(13, OptionalInt64Value(snapshot.NextRowId())); + rows.push_back(std::move(row)); + } + + return rows; +} + +SchemasSystemTable::SchemasSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch) + : InMemorySystemTable(table_path), + context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch))) {} + +std::string SchemasSystemTable::Name() const { + return kName; +} + +Result> SchemasSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("fields", arrow::utf8(), /*nullable=*/false), + arrow::field("partition_keys", arrow::utf8(), /*nullable=*/false), + arrow::field("primary_keys", arrow::utf8(), /*nullable=*/false), + arrow::field("options", arrow::utf8(), /*nullable=*/false), + arrow::field("comment", arrow::utf8(), /*nullable=*/true), + arrow::field("update_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/false), + }); +} + +Result> SchemasSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + SchemaManager schema_manager(context_.fs, context_.table_path, context_.branch); + PAIMON_ASSIGN_OR_RAISE(std::vector schema_ids, schema_manager.ListAllIds()); + std::sort(schema_ids.begin(), schema_ids.end()); + std::vector rows; + rows.reserve(schema_ids.size()); + + for (int64_t id : schema_ids) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr table_schema, + schema_manager.ReadSchema(id)); + PAIMON_ASSIGN_OR_RAISE(std::string fields_json, JsonString(table_schema->Fields())); + PAIMON_ASSIGN_OR_RAISE(std::string partition_keys_json, + JsonString(table_schema->PartitionKeys())); + PAIMON_ASSIGN_OR_RAISE(std::string primary_keys_json, + JsonString(table_schema->PrimaryKeys())); + PAIMON_ASSIGN_OR_RAISE(std::string options_json, JsonString(table_schema->Options())); + + GenericRow row(schema->num_fields()); + row.SetField(0, table_schema->Id()); + row.SetField(1, StringValue(fields_json)); + row.SetField(2, StringValue(partition_keys_json)); + row.SetField(3, StringValue(primary_keys_json)); + row.SetField(4, StringValue(options_json)); + row.SetField(5, OptionalStringValue(table_schema->Comment())); + PAIMON_ASSIGN_OR_RAISE(VariantType update_time, + LocalTimestampMillisValue(table_schema->TimeMillis())); + row.SetField(6, update_time); + rows.push_back(std::move(row)); + } + + return rows; +} + +TagsSystemTable::TagsSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch) + : InMemorySystemTable(table_path), + context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch))) {} + +std::string TagsSystemTable::Name() const { + return kName; +} + +Result> TagsSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("tag_name", arrow::utf8(), /*nullable=*/false), + arrow::field("snapshot_id", arrow::int64(), /*nullable=*/false), + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("commit_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/false), + arrow::field("record_count", arrow::int64(), /*nullable=*/true), + arrow::field("create_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/true), + arrow::field("time_retained", arrow::utf8(), /*nullable=*/true), + }); +} + +Result> TagsSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + TagManager tag_manager(context_.fs, context_.table_path, context_.branch); + PAIMON_ASSIGN_OR_RAISE(std::vector tag_names, tag_manager.ListTagNames()); + std::vector rows; + rows.reserve(tag_names.size()); + + for (const auto& name : tag_names) { + PAIMON_ASSIGN_OR_RAISE(Tag tag, tag_manager.GetOrThrow(name)); + PAIMON_ASSIGN_OR_RAISE(std::optional tag_create_time, + OptionalLocalDateTimePartsToTimestampMillis(tag.TagCreateTime())); + GenericRow row(schema->num_fields()); + row.SetField(0, StringValue(name)); + row.SetField(1, tag.Id()); + row.SetField(2, tag.SchemaId()); + PAIMON_ASSIGN_OR_RAISE(VariantType commit_time, + LocalTimestampMillisValue(tag.TimeMillis())); + row.SetField(3, commit_time); + row.SetField(4, OptionalInt64Value(tag.TotalRecordCount())); + row.SetField(5, OptionalTimestampMillisValue(tag_create_time)); + row.SetField(6, OptionalStringValue(OptionalDoubleToString(tag.TagTimeRetained()))); + rows.push_back(std::move(row)); + } + + return rows; +} + +BranchesSystemTable::BranchesSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch) + : InMemorySystemTable(table_path), + context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch))) {} + +std::string BranchesSystemTable::Name() const { + return kName; +} + +Result> BranchesSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("branch_name", arrow::utf8(), /*nullable=*/false), + arrow::field("create_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/false), + }); +} + +Result> BranchesSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + PAIMON_ASSIGN_OR_RAISE(std::vector branches, + BranchManager::ListBranches(context_.fs, context_.table_path)); + std::vector rows; + rows.reserve(branches.size()); + + for (const auto& name : branches) { + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr branch_status, + context_.fs->GetFileStatus(BranchManager::BranchPath(context_.table_path, name))); + GenericRow row(schema->num_fields()); + row.SetField(0, StringValue(name)); + PAIMON_ASSIGN_OR_RAISE(VariantType create_time, + LocalTimestampMillisValue(branch_status->GetModificationTime())); + row.SetField(1, create_time); + rows.push_back(std::move(row)); + } + + return rows; +} + +ConsumersSystemTable::ConsumersSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch) + : InMemorySystemTable(table_path), + context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch))) {} + +std::string ConsumersSystemTable::Name() const { + return kName; +} + +Result> ConsumersSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("consumer_id", arrow::utf8(), /*nullable=*/false), + arrow::field("next_snapshot_id", arrow::int64(), /*nullable=*/false), + }); +} + +Result> ConsumersSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + ConsumerManager consumer_manager(context_.fs, context_.table_path, context_.branch); + PAIMON_ASSIGN_OR_RAISE(auto consumers, consumer_manager.Consumers()); + std::vector rows; + rows.reserve(consumers.size()); + + for (const auto& [id, snapshot_id] : consumers) { + GenericRow row(schema->num_fields()); + row.SetField(0, StringValue(id)); + row.SetField(1, snapshot_id); + rows.push_back(std::move(row)); + } + + return rows; +} + +ManifestsSystemTable::ManifestsSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch, + std::shared_ptr table_schema, + std::map options) + : InMemorySystemTable(table_path), + context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch), + std::move(table_schema), std::move(options))) {} + +std::string ManifestsSystemTable::Name() const { + return kName; +} + +Result> ManifestsSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("file_name", arrow::utf8(), /*nullable=*/false), + arrow::field("file_size", arrow::int64(), /*nullable=*/false), + arrow::field("num_added_files", arrow::int64(), /*nullable=*/false), + arrow::field("num_deleted_files", arrow::int64(), /*nullable=*/false), + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("min_partition_stats", arrow::utf8(), /*nullable=*/true), + arrow::field("max_partition_stats", arrow::utf8(), /*nullable=*/true), + arrow::field("min_row_id", arrow::int64(), /*nullable=*/true), + arrow::field("max_row_id", arrow::int64(), /*nullable=*/true), + }); +} + +Result> ManifestsSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, LatestSnapshot(context_)); + if (!snapshot) { + return std::vector(); + } + + std::shared_ptr pool = GetDefaultPool(); + PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, CreateCoreOptions(context_)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, + CreatePathFactory(context_, core_options, pool)); + PAIMON_ASSIGN_OR_RAISE( + std::vector manifests, + ReadDataManifests(context_, snapshot.value(), path_factory, core_options, pool)); + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr partition_schema, + FieldMapping::GetPartitionSchema(arrow_schema, context_.table_schema->PartitionKeys())); + + std::vector rows; + rows.reserve(manifests.size()); + for (const auto& manifest : manifests) { + GenericRow row(schema->num_fields()); + row.SetField(0, StringValue(manifest.FileName())); + row.SetField(1, manifest.FileSize()); + row.SetField(2, manifest.NumAddedFiles()); + row.SetField(3, manifest.NumDeletedFiles()); + row.SetField(4, manifest.SchemaId()); + PAIMON_ASSIGN_OR_RAISE( + VariantType min_partition, + OptionalPartitionStringValue(manifest.PartitionStats().MinValues(), partition_schema)); + PAIMON_ASSIGN_OR_RAISE( + VariantType max_partition, + OptionalPartitionStringValue(manifest.PartitionStats().MaxValues(), partition_schema)); + row.SetField(5, min_partition); + row.SetField(6, max_partition); + row.SetField(7, OptionalInt64Value(manifest.MinRowId())); + row.SetField(8, OptionalInt64Value(manifest.MaxRowId())); + rows.push_back(std::move(row)); + } + return rows; +} + +FilesSystemTable::FilesSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch, std::shared_ptr table_schema, + std::map options) + : InMemorySystemTable(table_path), + context_(CreateMetadataContext(std::move(fs), std::move(table_path), std::move(branch), + std::move(table_schema), std::move(options))) {} + +std::string FilesSystemTable::Name() const { + return kName; +} + +Result> FilesSystemTable::ArrowSchema() const { + return arrow::schema({ + arrow::field("partition", arrow::utf8(), /*nullable=*/true), + arrow::field("bucket", arrow::int32(), /*nullable=*/false), + arrow::field("file_path", arrow::utf8(), /*nullable=*/false), + arrow::field("file_format", arrow::utf8(), /*nullable=*/false), + arrow::field("schema_id", arrow::int64(), /*nullable=*/false), + arrow::field("level", arrow::int32(), /*nullable=*/false), + arrow::field("record_count", arrow::int64(), /*nullable=*/false), + arrow::field("file_size_in_bytes", arrow::int64(), /*nullable=*/false), + arrow::field("min_key", arrow::utf8(), /*nullable=*/true), + arrow::field("max_key", arrow::utf8(), /*nullable=*/true), + arrow::field("null_value_counts", arrow::utf8(), /*nullable=*/false), + arrow::field("min_value_stats", arrow::utf8(), /*nullable=*/false), + arrow::field("max_value_stats", arrow::utf8(), /*nullable=*/false), + arrow::field("min_sequence_number", arrow::int64(), /*nullable=*/true), + arrow::field("max_sequence_number", arrow::int64(), /*nullable=*/true), + arrow::field("creation_time", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/true), + arrow::field("deleteRowCount", arrow::int64(), /*nullable=*/true), + arrow::field("file_source", arrow::utf8(), /*nullable=*/true), + arrow::field("first_row_id", arrow::int64(), /*nullable=*/true), + arrow::field("write_cols", arrow::list(arrow::utf8()), /*nullable=*/true), + }); +} + +Result> FilesSystemTable::BuildRows() const { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); + std::shared_ptr pool = GetDefaultPool(); + PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, CreateCoreOptions(context_)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr path_factory, + CreatePathFactory(context_, core_options, pool)); + PAIMON_ASSIGN_OR_RAISE(std::vector entries, + ReadLatestDataFiles(context_, path_factory, core_options, pool)); + std::shared_ptr arrow_schema = + DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr partition_schema, + FieldMapping::GetPartitionSchema(arrow_schema, context_.table_schema->PartitionKeys())); + + SimpleStatsEvolutions stats_evolutions(context_.table_schema, pool); + std::vector rows; + rows.reserve(entries.size()); + for (const auto& entry : entries) { + if (!(entry.Kind() == FileKind::Add())) { + continue; + } + + const std::shared_ptr& file = entry.File(); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, + LoadDataSchema(context_, file->schema_id)); + PAIMON_ASSIGN_OR_RAISE(std::vector value_stats_fields, + ValueStatsFields(context_, file->schema_id)); + std::shared_ptr stats_evolution = + stats_evolutions.GetOrCreate(data_schema); + PAIMON_ASSIGN_OR_RAISE( + SimpleStatsEvolution::EvolutionStats stats, + stats_evolution->Evolution(file->value_stats, file->row_count, file->value_stats_cols)); + + GenericRow row(schema->num_fields()); + if (context_.table_schema->PartitionKeys().empty()) { + row.SetField(0, NullType()); + } else { + PAIMON_ASSIGN_OR_RAISE(std::string partition, + PartitionString(path_factory, entry.Partition())); + row.SetField(0, StringValue(partition)); + } + row.SetField(1, entry.Bucket()); + PAIMON_ASSIGN_OR_RAISE(std::string file_path, FilePath(path_factory, entry, *file)); + row.SetField(2, StringValue(file_path)); + PAIMON_ASSIGN_OR_RAISE(std::string file_format, file->FileFormat()); + row.SetField(3, StringValue(file_format)); + row.SetField(4, file->schema_id); + row.SetField(5, file->level); + row.SetField(6, file->row_count); + row.SetField(7, file->file_size); + row.SetField(8, OptionalStringValue(OptionalBinaryRowString(file->min_key))); + row.SetField(9, OptionalStringValue(OptionalBinaryRowString(file->max_key))); + PAIMON_ASSIGN_OR_RAISE(std::string null_value_counts, + NullValueCountsString(value_stats_fields, *stats.null_counts)); + row.SetField(10, StringValue(null_value_counts)); + PAIMON_ASSIGN_OR_RAISE(std::string min_value_stats, + FieldsValueMapString(value_stats_fields, *stats.min_values)); + row.SetField(11, StringValue(min_value_stats)); + PAIMON_ASSIGN_OR_RAISE(std::string max_value_stats, + FieldsValueMapString(value_stats_fields, *stats.max_values)); + row.SetField(12, StringValue(max_value_stats)); + row.SetField(13, file->min_sequence_number); + row.SetField(14, file->max_sequence_number); + PAIMON_ASSIGN_OR_RAISE(VariantType creation_time, + LocalTimestampMillisValue(file->creation_time)); + row.SetField(15, creation_time); + row.SetField(16, OptionalInt64Value(file->delete_row_count)); + row.SetField(17, file->file_source ? StringValue(file->file_source.value().ToString()) + : VariantType(NullType())); + row.SetField(18, OptionalInt64Value(file->first_row_id)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr write_cols, + WriteColsValue(file->write_cols, pool)); + row.SetField(19, write_cols ? VariantType(write_cols) : VariantType(NullType())); + rows.push_back(std::move(row)); + } + return rows; +} + +} // namespace paimon diff --git a/src/paimon/core/table/system/metadata_system_tables.h b/src/paimon/core/table/system/metadata_system_tables.h new file mode 100644 index 000000000..389ad5a95 --- /dev/null +++ b/src/paimon/core/table/system/metadata_system_tables.h @@ -0,0 +1,165 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "paimon/core/table/system/in_memory_system_table.h" + +namespace paimon { +class FileSystem; +class TableSchema; + +/// System table for `T$options`, exposing the latest base table options as key/value rows. +class OptionsSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "options"; + + OptionsSystemTable(std::string table_path, std::shared_ptr table_schema); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + std::shared_ptr table_schema_; +}; + +/// Shared table metadata location used by metadata system tables. +struct MetadataSystemTableContext { + std::shared_ptr fs; + std::string table_path; + std::string branch; + std::shared_ptr table_schema; + std::map options; +}; + +/// System table for `T$snapshots`, exposing snapshot commit history. +class SnapshotsSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "snapshots"; + + SnapshotsSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + MetadataSystemTableContext context_; +}; + +/// System table for `T$schemas`, exposing schema evolution history. +class SchemasSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "schemas"; + + SchemasSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + MetadataSystemTableContext context_; +}; + +/// System table for `T$tags`, exposing tags and the snapshots they reference. +class TagsSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "tags"; + + TagsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + MetadataSystemTableContext context_; +}; + +/// System table for `T$branches`, exposing table branches including `main`. +class BranchesSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "branches"; + + BranchesSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + MetadataSystemTableContext context_; +}; + +/// System table for `T$consumers`, exposing persisted streaming consumer offsets. +class ConsumersSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "consumers"; + + ConsumersSystemTable(std::shared_ptr fs, std::string table_path, + std::string branch); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + MetadataSystemTableContext context_; +}; + +/// System table for `T$manifests`, exposing data manifest metadata in the latest snapshot. +class ManifestsSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "manifests"; + + ManifestsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, + std::shared_ptr table_schema, + std::map options); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + MetadataSystemTableContext context_; +}; + +/// System table for `T$files`, exposing data file metadata in the latest snapshot. +class FilesSystemTable : public InMemorySystemTable { + public: + static constexpr const char* kName = "files"; + + FilesSystemTable(std::shared_ptr fs, std::string table_path, std::string branch, + std::shared_ptr table_schema, + std::map options); + + std::string Name() const override; + Result> ArrowSchema() const override; + Result> BuildRows() const override; + + private: + MetadataSystemTableContext context_; +}; + +} // namespace paimon diff --git a/src/paimon/core/table/system/options_system_table.cpp b/src/paimon/core/table/system/options_system_table.cpp deleted file mode 100644 index feebb97cf..000000000 --- a/src/paimon/core/table/system/options_system_table.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/core/table/system/options_system_table.h" - -#include -#include - -#include "arrow/api.h" -#include "paimon/core/schema/table_schema.h" - -namespace paimon { - -OptionsSystemTable::OptionsSystemTable(std::string table_path, - std::shared_ptr table_schema) - : InMemorySystemTable(std::move(table_path)), table_schema_(std::move(table_schema)) {} - -std::string OptionsSystemTable::Name() const { - return kName; -} - -Result> OptionsSystemTable::ArrowSchema() const { - return arrow::schema({arrow::field("key", arrow::utf8(), /*nullable=*/false), - arrow::field("value", arrow::utf8(), /*nullable=*/false)}); -} - -Result> OptionsSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - std::vector rows; - rows.reserve(table_schema_->Options().size()); - for (const auto& [key, value] : table_schema_->Options()) { - GenericRow row(schema->num_fields()); - row.SetField(0, std::string_view(key)); - row.SetField(1, std::string_view(value)); - rows.push_back(std::move(row)); - } - return rows; -} - -} // namespace paimon diff --git a/src/paimon/core/table/system/options_system_table.h b/src/paimon/core/table/system/options_system_table.h deleted file mode 100644 index a87fd688d..000000000 --- a/src/paimon/core/table/system/options_system_table.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include "paimon/core/table/system/in_memory_system_table.h" - -namespace paimon { -class TableSchema; - -/// System table for `T$options`, exposing the latest base table options as key/value rows. -class OptionsSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "options"; - - OptionsSystemTable(std::string table_path, std::shared_ptr table_schema); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - std::shared_ptr table_schema_; -}; - -} // namespace paimon diff --git a/src/paimon/core/table/system/schemas_system_table.cpp b/src/paimon/core/table/system/schemas_system_table.cpp deleted file mode 100644 index 9be51ee61..000000000 --- a/src/paimon/core/table/system/schemas_system_table.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/core/table/system/schemas_system_table.h" - -#include -#include - -#include "arrow/api.h" -#include "paimon/core/schema/schema_manager.h" -#include "paimon/core/schema/table_schema.h" - -namespace paimon { - -SchemasSystemTable::SchemasSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch) - : InMemorySystemTable(table_path), - context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), - std::move(branch))) {} - -std::string SchemasSystemTable::Name() const { - return kName; -} - -Result> SchemasSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("schema_id", arrow::int64(), /*nullable=*/false), - arrow::field("fields", arrow::utf8(), /*nullable=*/false), - arrow::field("partition_keys", arrow::utf8(), /*nullable=*/false), - arrow::field("primary_keys", arrow::utf8(), /*nullable=*/false), - arrow::field("options", arrow::utf8(), /*nullable=*/false), - arrow::field("comment", arrow::utf8(), /*nullable=*/true), - arrow::field("update_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/false), - }); -} - -Result> SchemasSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - SchemaManager schema_manager(context_.fs, context_.table_path, context_.branch); - PAIMON_ASSIGN_OR_RAISE(std::vector schema_ids, schema_manager.ListAllIds()); - std::sort(schema_ids.begin(), schema_ids.end()); - std::vector rows; - rows.reserve(schema_ids.size()); - - for (int64_t id : schema_ids) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr table_schema, - schema_manager.ReadSchema(id)); - PAIMON_ASSIGN_OR_RAISE(std::string fields_json, - SystemTableUtils::JsonString(table_schema->Fields())); - PAIMON_ASSIGN_OR_RAISE(std::string partition_keys_json, - SystemTableUtils::JsonString(table_schema->PartitionKeys())); - PAIMON_ASSIGN_OR_RAISE(std::string primary_keys_json, - SystemTableUtils::JsonString(table_schema->PrimaryKeys())); - PAIMON_ASSIGN_OR_RAISE(std::string options_json, - SystemTableUtils::JsonString(table_schema->Options())); - - GenericRow row(schema->num_fields()); - row.SetField(0, table_schema->Id()); - row.SetField(1, SystemTableUtils::StringValue(fields_json)); - row.SetField(2, SystemTableUtils::StringValue(partition_keys_json)); - row.SetField(3, SystemTableUtils::StringValue(primary_keys_json)); - row.SetField(4, SystemTableUtils::StringValue(options_json)); - row.SetField(5, SystemTableUtils::OptionalStringValue(table_schema->Comment())); - PAIMON_ASSIGN_OR_RAISE(VariantType update_time, SystemTableUtils::LocalTimestampMillisValue( - table_schema->TimeMillis())); - row.SetField(6, update_time); - rows.push_back(std::move(row)); - } - - return rows; -} - -} // namespace paimon diff --git a/src/paimon/core/table/system/schemas_system_table.h b/src/paimon/core/table/system/schemas_system_table.h deleted file mode 100644 index a9a607dc8..000000000 --- a/src/paimon/core/table/system/schemas_system_table.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include "paimon/core/table/system/in_memory_system_table.h" -#include "paimon/core/table/system/system_table_utils.h" - -namespace paimon { -class FileSystem; - -/// System table for `T$schemas`, exposing schema evolution history. -class SchemasSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "schemas"; - - SchemasSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - SystemTableContext context_; -}; - -} // namespace paimon diff --git a/src/paimon/core/table/system/snapshots_system_table.cpp b/src/paimon/core/table/system/snapshots_system_table.cpp deleted file mode 100644 index 4ca7cfa01..000000000 --- a/src/paimon/core/table/system/snapshots_system_table.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/core/table/system/snapshots_system_table.h" - -#include -#include - -#include "arrow/api.h" -#include "paimon/core/snapshot.h" -#include "paimon/core/utils/snapshot_manager.h" - -namespace paimon { - -SnapshotsSystemTable::SnapshotsSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch) - : InMemorySystemTable(table_path), - context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), - std::move(branch))) {} - -std::string SnapshotsSystemTable::Name() const { - return kName; -} - -Result> SnapshotsSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("snapshot_id", arrow::int64(), /*nullable=*/false), - arrow::field("schema_id", arrow::int64(), /*nullable=*/false), - arrow::field("commit_user", arrow::utf8(), /*nullable=*/false), - arrow::field("commit_identifier", arrow::int64(), /*nullable=*/false), - arrow::field("commit_kind", arrow::utf8(), /*nullable=*/false), - arrow::field("commit_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/false), - arrow::field("base_manifest_list", arrow::utf8(), /*nullable=*/false), - arrow::field("delta_manifest_list", arrow::utf8(), /*nullable=*/false), - arrow::field("changelog_manifest_list", arrow::utf8(), /*nullable=*/true), - arrow::field("total_record_count", arrow::int64(), /*nullable=*/true), - arrow::field("delta_record_count", arrow::int64(), /*nullable=*/true), - arrow::field("changelog_record_count", arrow::int64(), /*nullable=*/true), - arrow::field("watermark", arrow::int64(), /*nullable=*/true), - arrow::field("next_row_id", arrow::int64(), /*nullable=*/true), - }); -} - -Result> SnapshotsSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - SnapshotManager snapshot_manager(context_.fs, context_.table_path, context_.branch); - PAIMON_ASSIGN_OR_RAISE(std::vector snapshots, snapshot_manager.GetAllSnapshots()); - std::sort(snapshots.begin(), snapshots.end(), - [](const Snapshot& lhs, const Snapshot& rhs) { return lhs.Id() < rhs.Id(); }); - std::vector rows; - rows.reserve(snapshots.size()); - - for (const auto& snapshot : snapshots) { - GenericRow row(schema->num_fields()); - row.SetField(0, snapshot.Id()); - row.SetField(1, snapshot.SchemaId()); - row.SetField(2, SystemTableUtils::StringValue(snapshot.CommitUser())); - row.SetField(3, snapshot.CommitIdentifier()); - row.SetField(4, SystemTableUtils::StringValue( - Snapshot::CommitKind::ToString(snapshot.GetCommitKind()))); - PAIMON_ASSIGN_OR_RAISE(VariantType commit_time, - SystemTableUtils::LocalTimestampMillisValue(snapshot.TimeMillis())); - row.SetField(5, commit_time); - row.SetField(6, SystemTableUtils::StringValue(snapshot.BaseManifestList())); - row.SetField(7, SystemTableUtils::StringValue(snapshot.DeltaManifestList())); - row.SetField(8, SystemTableUtils::OptionalStringValue(snapshot.ChangelogManifestList())); - row.SetField(9, SystemTableUtils::OptionalInt64Value(snapshot.TotalRecordCount())); - row.SetField(10, SystemTableUtils::OptionalInt64Value(snapshot.DeltaRecordCount())); - row.SetField(11, SystemTableUtils::OptionalInt64Value(snapshot.ChangelogRecordCount())); - row.SetField(12, SystemTableUtils::OptionalInt64Value(snapshot.Watermark())); - row.SetField(13, SystemTableUtils::OptionalInt64Value(snapshot.NextRowId())); - rows.push_back(std::move(row)); - } - - return rows; -} - -} // namespace paimon diff --git a/src/paimon/core/table/system/snapshots_system_table.h b/src/paimon/core/table/system/snapshots_system_table.h deleted file mode 100644 index 2e2c48f01..000000000 --- a/src/paimon/core/table/system/snapshots_system_table.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include "paimon/core/table/system/in_memory_system_table.h" -#include "paimon/core/table/system/system_table_utils.h" - -namespace paimon { -class FileSystem; - -/// System table for `T$snapshots`, exposing snapshot commit history. -class SnapshotsSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "snapshots"; - - SnapshotsSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - SystemTableContext context_; -}; - -} // namespace paimon diff --git a/src/paimon/core/table/system/system_table.cpp b/src/paimon/core/table/system/system_table.cpp index 35ae430b1..061e6d4f1 100644 --- a/src/paimon/core/table/system/system_table.cpp +++ b/src/paimon/core/table/system/system_table.cpp @@ -30,15 +30,8 @@ #include "paimon/core/schema/table_schema.h" #include "paimon/core/table/system/audit_log_system_table.h" #include "paimon/core/table/system/binlog_system_table.h" -#include "paimon/core/table/system/branches_system_table.h" -#include "paimon/core/table/system/consumers_system_table.h" -#include "paimon/core/table/system/files_system_table.h" -#include "paimon/core/table/system/manifests_system_table.h" -#include "paimon/core/table/system/options_system_table.h" -#include "paimon/core/table/system/schemas_system_table.h" -#include "paimon/core/table/system/snapshots_system_table.h" -#include "paimon/core/table/system/system_table_utils.h" -#include "paimon/core/table/system/tags_system_table.h" +#include "paimon/core/table/system/metadata_system_tables.h" +#include "paimon/core/utils/branch_manager.h" #include "paimon/status.h" namespace paimon { @@ -53,6 +46,21 @@ struct SystemTableRegistryEntry { SystemTableFactory factory; }; +std::map MergeOptions( + const std::shared_ptr& table_schema, + const std::map& dynamic_options) { + auto options = table_schema->Options(); + for (const auto& [key, value] : dynamic_options) { + options[key] = value; + } + return options; +} + +std::string LoadBranch(const std::map& options) { + auto branch_iter = options.find(Options::BRANCH); + return branch_iter == options.end() ? BranchManager::DEFAULT_MAIN_BRANCH : branch_iter->second; +} + const std::vector& SystemTableRegistry() { static const std::vector registry = { {OptionsSystemTable::kName, @@ -68,8 +76,7 @@ const std::vector& SystemTableRegistry() { const std::map& dynamic_options) -> Result> { return std::make_shared( - fs, table_path, table_schema, - SystemTableUtils::MergeOptions(table_schema, dynamic_options)); + fs, table_path, table_schema, MergeOptions(table_schema, dynamic_options)); }}, {BinlogSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, @@ -77,62 +84,55 @@ const std::vector& SystemTableRegistry() { const std::map& dynamic_options) -> Result> { return std::make_shared( - fs, table_path, table_schema, - SystemTableUtils::MergeOptions(table_schema, dynamic_options)); + fs, table_path, table_schema, MergeOptions(table_schema, dynamic_options)); }}, {SnapshotsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options)); + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options)); }}, {SchemasSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options)); + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options)); }}, {TagsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options)); + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options)); }}, {BranchesSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options)); + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options)); }}, {ConsumersSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options)); + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options)); }}, {ManifestsSystemTable::kName, [](const std::shared_ptr& fs, const std::string& table_path, const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options), + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options), table_schema, std::move(options)); }}, {FilesSystemTable::kName, @@ -140,9 +140,8 @@ const std::vector& SystemTableRegistry() { const std::shared_ptr& table_schema, const std::map& dynamic_options) -> Result> { - auto options = SystemTableUtils::MergeOptions(table_schema, dynamic_options); - return std::make_shared(fs, table_path, - SystemTableUtils::LoadBranch(options), + auto options = MergeOptions(table_schema, dynamic_options); + return std::make_shared(fs, table_path, LoadBranch(options), table_schema, std::move(options)); }}, }; @@ -204,7 +203,7 @@ Result> SystemTableLoader::LoadFromPath( } const auto& parsed = system_table_path.value(); SchemaManager schema_manager(fs, parsed.table_path, - parsed.branch.value_or(SystemTableUtils::DefaultBranch())); + parsed.branch.value_or(BranchManager::DEFAULT_MAIN_BRANCH)); PAIMON_ASSIGN_OR_RAISE(std::optional> latest_schema, schema_manager.Latest()); if (!latest_schema) { diff --git a/src/paimon/core/table/system/system_table_utils.cpp b/src/paimon/core/table/system/system_table_utils.cpp deleted file mode 100644 index a06361588..000000000 --- a/src/paimon/core/table/system/system_table_utils.cpp +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/core/table/system/system_table_utils.h" - -#include -#include - -#include "fmt/format.h" -#include "fmt/ranges.h" -#include "paimon/common/data/binary_array.h" -#include "paimon/common/data/binary_row.h" -#include "paimon/common/data/binary_string.h" -#include "paimon/common/data/internal_array.h" -#include "paimon/common/data/internal_row.h" -#include "paimon/common/types/data_field.h" -#include "paimon/common/utils/binary_row_partition_computer.h" -#include "paimon/common/utils/date_time_utils.h" -#include "paimon/common/utils/internal_row_utils.h" -#include "paimon/common/utils/path_util.h" -#include "paimon/core/core_options.h" -#include "paimon/core/io/data_file_meta.h" -#include "paimon/core/manifest/file_entry.h" -#include "paimon/core/manifest/manifest_entry.h" -#include "paimon/core/manifest/manifest_file.h" -#include "paimon/core/manifest/manifest_file_meta.h" -#include "paimon/core/manifest/manifest_list.h" -#include "paimon/core/schema/schema_manager.h" -#include "paimon/core/schema/table_schema.h" -#include "paimon/core/snapshot.h" -#include "paimon/core/utils/branch_manager.h" -#include "paimon/core/utils/field_mapping.h" -#include "paimon/core/utils/file_store_path_factory.h" -#include "paimon/core/utils/snapshot_manager.h" -#include "paimon/fs/file_system.h" -#include "paimon/memory/memory_pool.h" -#include "paimon/status.h" - -namespace paimon { -namespace { - -constexpr int32_t kMaxPartitionStatsLength = 255; - -} // namespace - -SystemTableContext SystemTableUtils::CreateContext(std::shared_ptr fs, - std::string table_path, std::string branch) { - return { - std::move(fs), std::move(table_path), BranchManager::NormalizeBranch(branch), nullptr, {}, - }; -} - -SystemTableContext SystemTableUtils::CreateContext(std::shared_ptr fs, - std::string table_path, std::string branch, - std::shared_ptr table_schema, - std::map options) { - return { - std::move(fs), std::move(table_path), BranchManager::NormalizeBranch(branch), - std::move(table_schema), std::move(options), - }; -} - -std::map SystemTableUtils::MergeOptions( - const std::shared_ptr& table_schema, - const std::map& dynamic_options) { - auto options = table_schema->Options(); - for (const auto& [key, value] : dynamic_options) { - options[key] = value; - } - return options; -} - -std::string SystemTableUtils::DefaultBranch() { - return BranchManager::DEFAULT_MAIN_BRANCH; -} - -std::string SystemTableUtils::LoadBranch(const std::map& options) { - auto branch_iter = options.find(Options::BRANCH); - return branch_iter == options.end() ? DefaultBranch() : branch_iter->second; -} - -Result SystemTableUtils::LocalDateTimePartsToTimestampMillis( - const std::vector& parts) { - if (parts.size() < 6) { - return Status::Invalid("tag create time requires at least 6 date-time fields"); - } - - int64_t year = parts[0]; - int64_t month = parts[1]; - int64_t day = parts[2]; - int64_t hour = parts[3]; - int64_t minute = parts[4]; - int64_t second = parts[5]; - int64_t nanos = parts.size() > 6 ? parts[6] : 0; - auto is_leap_year = [](int64_t value) { - return value % 4 == 0 && (value % 100 != 0 || value % 400 == 0); - }; - int64_t days_in_month[] = {31, is_leap_year(year) ? 29 : 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, - 31}; - if (month < 1 || month > 12 || day < 1 || day > days_in_month[month - 1] || hour < 0 || - hour > 23 || minute < 0 || minute > 59 || second < 0 || second > 59 || nanos < 0 || - nanos > 999999999) { - return Status::Invalid("invalid tag create time fields"); - } - - year -= month <= 2 ? 1 : 0; - int64_t era = (year >= 0 ? year : year - 399) / 400; - auto year_of_era = static_cast(year - era * 400); - auto month_prime = static_cast(month + (month > 2 ? -3 : 9)); - uint32_t day_of_year = (153 * month_prime + 2) / 5 + static_cast(day) - 1; - uint32_t day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; - int64_t epoch_day = era * 146097 + static_cast(day_of_era) - 719468; - return epoch_day * DateTimeUtils::MILLIS_PER_DAY + hour * 3600000 + minute * 60000 + - second * 1000 + nanos / 1000000; -} - -Result> SystemTableUtils::OptionalLocalDateTimePartsToTimestampMillis( - const std::optional>& parts) { - if (!parts) { - return std::optional(); - } - PAIMON_ASSIGN_OR_RAISE(int64_t timestamp_millis, - LocalDateTimePartsToTimestampMillis(parts.value())); - return std::optional(timestamp_millis); -} - -std::optional SystemTableUtils::OptionalDoubleToString( - const std::optional& value) { - if (!value) { - return std::optional(); - } - return std::to_string(value.value()); -} - -VariantType SystemTableUtils::OptionalInt64Value(const std::optional& value) { - if (!value) { - return NullType(); - } - return value.value(); -} - -VariantType SystemTableUtils::StringValue(const std::string& value) { - return BinaryString::FromString(value, GetDefaultPool().get()); -} - -VariantType SystemTableUtils::OptionalStringValue(const std::optional& value) { - if (!value) { - return NullType(); - } - return StringValue(value.value()); -} - -VariantType SystemTableUtils::TimestampMillisValue(int64_t value) { - return Timestamp::FromEpochMillis(value); -} - -Result SystemTableUtils::LocalTimestampMillisValue(int64_t epoch_millis) { - PAIMON_ASSIGN_OR_RAISE( - Timestamp local_timestamp, - DateTimeUtils::ToLocalTimestamp(Timestamp::FromEpochMillis(epoch_millis))); - return TimestampMillisValue(local_timestamp.GetMillisecond()); -} - -Result SystemTableUtils::LocalTimestampMillisValue(const Timestamp& local_timestamp) { - PAIMON_ASSIGN_OR_RAISE(Timestamp utc_timestamp, DateTimeUtils::ToUTCTimestamp(local_timestamp)); - int64_t epoch_millis = utc_timestamp.GetMillisecond(); - return LocalTimestampMillisValue(epoch_millis); -} - -VariantType SystemTableUtils::OptionalTimestampMillisValue(const std::optional& value) { - if (!value) { - return NullType(); - } - return TimestampMillisValue(value.value()); -} - -Result SystemTableUtils::CreateCoreOptions(const SystemTableContext& context) { - return CoreOptions::FromMap(context.options, context.fs); -} - -Result> SystemTableUtils::CreatePathFactory( - const SystemTableContext& context, const CoreOptions& core_options, - const std::shared_ptr& pool) { - std::shared_ptr arrow_schema = - DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); - PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, - core_options.CreateExternalPaths()); - PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, - core_options.CreateGlobalIndexExternalPath()); - PAIMON_ASSIGN_OR_RAISE( - std::unique_ptr path_factory, - FileStorePathFactory::Create( - context.table_path, arrow_schema, context.table_schema->PartitionKeys(), - core_options.GetPartitionDefaultName(), core_options.GetFileFormat()->Identifier(), - core_options.DataFilePrefix(), core_options.LegacyPartitionNameEnabled(), - external_paths, global_index_external_path, core_options.IndexFileInDataFileDir(), - pool)); - return std::shared_ptr(std::move(path_factory)); -} - -Result> SystemTableUtils::LatestSnapshot( - const SystemTableContext& context) { - SnapshotManager snapshot_manager(context.fs, context.table_path, context.branch); - return snapshot_manager.LatestSnapshot(); -} - -Result> SystemTableUtils::ReadDataManifests( - const SystemTableContext& context, const Snapshot& snapshot, - const std::shared_ptr& path_factory, const CoreOptions& core_options, - const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE( - std::unique_ptr manifest_list, - ManifestList::Create(context.fs, core_options.GetManifestFormat(), - core_options.GetManifestCompression(), path_factory, pool)); - std::vector manifests; - PAIMON_RETURN_NOT_OK(manifest_list->ReadDataManifests(snapshot, &manifests)); - return manifests; -} - -Result> SystemTableUtils::ReadLatestManifestEntries( - const SystemTableContext& context, const std::shared_ptr& path_factory, - const CoreOptions& core_options, const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE(std::optional snapshot, LatestSnapshot(context)); - if (!snapshot) { - return std::vector(); - } - PAIMON_ASSIGN_OR_RAISE( - std::vector manifests, - ReadDataManifests(context, snapshot.value(), path_factory, core_options, pool)); - std::shared_ptr arrow_schema = - DataField::ConvertDataFieldsToArrowSchema(context.table_schema->Fields()); - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr partition_schema, - FieldMapping::GetPartitionSchema(arrow_schema, context.table_schema->PartitionKeys())); - PAIMON_ASSIGN_OR_RAISE(std::unique_ptr manifest_file, - ManifestFile::Create(context.fs, core_options.GetManifestFormat(), - core_options.GetManifestCompression(), path_factory, - core_options.GetManifestTargetFileSize(), pool, - core_options, partition_schema)); - std::vector entries; - for (const auto& manifest : manifests) { - PAIMON_RETURN_NOT_OK( - manifest_file->Read(manifest.FileName(), /*filter=*/nullptr, &entries)); - } - return entries; -} - -Result> SystemTableUtils::ReadLatestDataFiles( - const SystemTableContext& context, const std::shared_ptr& path_factory, - const CoreOptions& core_options, const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE(std::vector entries, - ReadLatestManifestEntries(context, path_factory, core_options, pool)); - std::vector merged_entries; - PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(entries, &merged_entries)); - return merged_entries; -} - -std::optional SystemTableUtils::OptionalBinaryRowString(const BinaryRow& row) { - if (row.GetFieldCount() <= 0) { - return std::nullopt; - } - return row.ToString(); -} - -Result> SystemTableUtils::OptionalPartitionString( - const BinaryRow& row, const std::shared_ptr& partition_schema) { - if (row.GetFieldCount() <= 0) { - return std::optional(); - } - PAIMON_ASSIGN_OR_RAISE(std::string value, - BinaryRowPartitionComputer::PartToSimpleString( - partition_schema, row, ",", kMaxPartitionStatsLength)); - return std::optional(value); -} - -Result SystemTableUtils::OptionalPartitionStringValue( - const BinaryRow& row, const std::shared_ptr& partition_schema) { - PAIMON_ASSIGN_OR_RAISE(std::optional value, - OptionalPartitionString(row, partition_schema)); - return OptionalStringValue(value); -} - -Result SystemTableUtils::PartitionString( - const std::shared_ptr& path_factory, const BinaryRow& partition) { - PAIMON_ASSIGN_OR_RAISE(std::string value, path_factory->GetPartitionString(partition)); - return value; -} - -Result SystemTableUtils::FilePath( - const std::shared_ptr& path_factory, const ManifestEntry& entry, - const DataFileMeta& file) { - if (file.external_path) { - return file.external_path.value(); - } - PAIMON_ASSIGN_OR_RAISE(std::string bucket_path, - path_factory->BucketPath(entry.Partition(), entry.Bucket())); - return PathUtil::JoinPath(bucket_path, file.file_name); -} - -Result SystemTableUtils::FieldsValueMapString(const std::vector& fields, - const InternalRow& row) { - std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields); - PAIMON_ASSIGN_OR_RAISE(std::vector getters, - InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/false)); - std::vector values; - values.reserve(fields.size()); - for (size_t i = 0; i < fields.size(); ++i) { - std::string value = "null"; - if (!row.IsNullAt(i)) { - VariantType field_value = getters[i](row); - if (std::holds_alternative(field_value)) { - value = std::string(std::get(field_value)); - } else { - value = DataDefine::VariantValueToString(field_value); - } - } - values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); - } - return fmt::format("{{{}}}", fmt::join(values, ", ")); -} - -Result SystemTableUtils::NullValueCountsString(const std::vector& fields, - const InternalArray& null_counts) { - std::vector values; - values.reserve(fields.size()); - for (size_t i = 0; i < fields.size(); ++i) { - std::string value = - null_counts.IsNullAt(i) ? "null" : std::to_string(null_counts.GetLong(i)); - values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); - } - return fmt::format("{{{}}}", fmt::join(values, ", ")); -} - -Result> SystemTableUtils::LoadDataSchema( - const SystemTableContext& context, int64_t schema_id) { - if (schema_id == context.table_schema->Id()) { - return context.table_schema; - } - SchemaManager schema_manager(context.fs, context.table_path, context.branch); - return schema_manager.ReadSchema(schema_id); -} - -Result> SystemTableUtils::ValueStatsFields(const SystemTableContext& context, - int64_t schema_id) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, - LoadDataSchema(context, schema_id)); - return data_schema->Fields(); -} - -Result> SystemTableUtils::WriteColsValue( - const std::optional>& write_cols, - const std::shared_ptr& pool) { - if (!write_cols) { - return std::shared_ptr(); - } - return std::make_shared( - InternalRowUtils::ToNotNullStringArrayData(write_cols.value(), pool)); -} - -} // namespace paimon diff --git a/src/paimon/core/table/system/system_table_utils.h b/src/paimon/core/table/system/system_table_utils.h deleted file mode 100644 index 437d3dbfe..000000000 --- a/src/paimon/core/table/system/system_table_utils.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "paimon/common/data/data_define.h" -#include "paimon/common/types/data_field.h" -#include "paimon/common/utils/rapidjson_util.h" -#include "paimon/core/core_options.h" -#include "paimon/core/io/data_file_meta.h" -#include "paimon/core/manifest/manifest_entry.h" -#include "paimon/core/manifest/manifest_file_meta.h" -#include "paimon/core/snapshot.h" -#include "paimon/data/timestamp.h" -#include "paimon/result.h" -#include "paimon/status.h" -#include "rapidjson/document.h" -#include "rapidjson/stringbuffer.h" -#include "rapidjson/writer.h" - -namespace arrow { -class Schema; -} // namespace arrow - -namespace paimon { -class BinaryRow; -class FileStorePathFactory; -class FileSystem; -class InternalArray; -class InternalRow; -class MemoryPool; -class TableSchema; - -/// Shared base table metadata used by table-scoped system tables. -struct SystemTableContext { - std::shared_ptr fs; - std::string table_path; - std::string branch; - std::shared_ptr table_schema; - std::map options; -}; - -/// Utility methods shared by system table implementations. -class SystemTableUtils { - public: - SystemTableUtils() = delete; - ~SystemTableUtils() = delete; - - static SystemTableContext CreateContext(std::shared_ptr fs, std::string table_path, - std::string branch); - static SystemTableContext CreateContext(std::shared_ptr fs, std::string table_path, - std::string branch, - std::shared_ptr table_schema, - std::map options); - static std::map MergeOptions( - const std::shared_ptr& table_schema, - const std::map& dynamic_options); - static std::string DefaultBranch(); - static std::string LoadBranch(const std::map& options); - - template - static Result JsonString(const T& value) { - rapidjson::Document document; - auto json_value = RapidJsonUtil::SerializeValue(value, &document.GetAllocator()); - rapidjson::StringBuffer buffer; - rapidjson::Writer writer(buffer); - if (!json_value.Accept(writer)) { - return Status::Invalid("failed to serialize metadata system table value"); - } - return std::string(buffer.GetString(), buffer.GetSize()); - } - - static Result> OptionalLocalDateTimePartsToTimestampMillis( - const std::optional>& parts); - static std::optional OptionalDoubleToString(const std::optional& value); - static VariantType OptionalInt64Value(const std::optional& value); - static VariantType StringValue(const std::string& value); - static VariantType OptionalStringValue(const std::optional& value); - static VariantType TimestampMillisValue(int64_t value); - static Result LocalTimestampMillisValue(int64_t epoch_millis); - static Result LocalTimestampMillisValue(const Timestamp& local_timestamp); - static VariantType OptionalTimestampMillisValue(const std::optional& value); - - static Result CreateCoreOptions(const SystemTableContext& context); - static Result> CreatePathFactory( - const SystemTableContext& context, const CoreOptions& core_options, - const std::shared_ptr& pool); - static Result> LatestSnapshot(const SystemTableContext& context); - static Result> ReadDataManifests( - const SystemTableContext& context, const Snapshot& snapshot, - const std::shared_ptr& path_factory, const CoreOptions& core_options, - const std::shared_ptr& pool); - static Result> ReadLatestDataFiles( - const SystemTableContext& context, - const std::shared_ptr& path_factory, const CoreOptions& core_options, - const std::shared_ptr& pool); - - static std::optional OptionalBinaryRowString(const BinaryRow& row); - static Result OptionalPartitionStringValue( - const BinaryRow& row, const std::shared_ptr& partition_schema); - static Result PartitionString( - const std::shared_ptr& path_factory, const BinaryRow& partition); - static Result FilePath(const std::shared_ptr& path_factory, - const ManifestEntry& entry, const DataFileMeta& file); - static Result FieldsValueMapString(const std::vector& fields, - const InternalRow& row); - static Result NullValueCountsString(const std::vector& fields, - const InternalArray& null_counts); - static Result> LoadDataSchema(const SystemTableContext& context, - int64_t schema_id); - static Result> ValueStatsFields(const SystemTableContext& context, - int64_t schema_id); - static Result> WriteColsValue( - const std::optional>& write_cols, - const std::shared_ptr& pool); - - private: - static Result LocalDateTimePartsToTimestampMillis(const std::vector& parts); - static Result> OptionalPartitionString( - const BinaryRow& row, const std::shared_ptr& partition_schema); - static Result> ReadLatestManifestEntries( - const SystemTableContext& context, - const std::shared_ptr& path_factory, const CoreOptions& core_options, - const std::shared_ptr& pool); -}; - -} // namespace paimon diff --git a/src/paimon/core/table/system/tags_system_table.cpp b/src/paimon/core/table/system/tags_system_table.cpp deleted file mode 100644 index e40158270..000000000 --- a/src/paimon/core/table/system/tags_system_table.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "paimon/core/table/system/tags_system_table.h" - -#include - -#include "arrow/api.h" -#include "paimon/core/tag/tag.h" -#include "paimon/core/utils/tag_manager.h" - -namespace paimon { - -TagsSystemTable::TagsSystemTable(std::shared_ptr fs, std::string table_path, - std::string branch) - : InMemorySystemTable(table_path), - context_(SystemTableUtils::CreateContext(std::move(fs), std::move(table_path), - std::move(branch))) {} - -std::string TagsSystemTable::Name() const { - return kName; -} - -Result> TagsSystemTable::ArrowSchema() const { - return arrow::schema({ - arrow::field("tag_name", arrow::utf8(), /*nullable=*/false), - arrow::field("snapshot_id", arrow::int64(), /*nullable=*/false), - arrow::field("schema_id", arrow::int64(), /*nullable=*/false), - arrow::field("commit_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/false), - arrow::field("record_count", arrow::int64(), /*nullable=*/true), - arrow::field("create_time", arrow::timestamp(arrow::TimeUnit::MILLI), - /*nullable=*/true), - arrow::field("time_retained", arrow::utf8(), /*nullable=*/true), - }); -} - -Result> TagsSystemTable::BuildRows() const { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr schema, ArrowSchema()); - TagManager tag_manager(context_.fs, context_.table_path, context_.branch); - PAIMON_ASSIGN_OR_RAISE(std::vector tag_names, tag_manager.ListTagNames()); - std::vector rows; - rows.reserve(tag_names.size()); - - for (const auto& name : tag_names) { - PAIMON_ASSIGN_OR_RAISE(Tag tag, tag_manager.GetOrThrow(name)); - PAIMON_ASSIGN_OR_RAISE( - std::optional tag_create_time, - SystemTableUtils::OptionalLocalDateTimePartsToTimestampMillis(tag.TagCreateTime())); - GenericRow row(schema->num_fields()); - row.SetField(0, SystemTableUtils::StringValue(name)); - row.SetField(1, tag.Id()); - row.SetField(2, tag.SchemaId()); - PAIMON_ASSIGN_OR_RAISE(VariantType commit_time, - SystemTableUtils::LocalTimestampMillisValue(tag.TimeMillis())); - row.SetField(3, commit_time); - row.SetField(4, SystemTableUtils::OptionalInt64Value(tag.TotalRecordCount())); - row.SetField(5, SystemTableUtils::OptionalTimestampMillisValue(tag_create_time)); - row.SetField(6, SystemTableUtils::OptionalStringValue( - SystemTableUtils::OptionalDoubleToString(tag.TagTimeRetained()))); - rows.push_back(std::move(row)); - } - - return rows; -} - -} // namespace paimon diff --git a/src/paimon/core/table/system/tags_system_table.h b/src/paimon/core/table/system/tags_system_table.h deleted file mode 100644 index 86a727958..000000000 --- a/src/paimon/core/table/system/tags_system_table.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2026-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include "paimon/core/table/system/in_memory_system_table.h" -#include "paimon/core/table/system/system_table_utils.h" - -namespace paimon { -class FileSystem; - -/// System table for `T$tags`, exposing tags and the snapshots they reference. -class TagsSystemTable : public InMemorySystemTable { - public: - static constexpr const char* kName = "tags"; - - TagsSystemTable(std::shared_ptr fs, std::string table_path, std::string branch); - - std::string Name() const override; - Result> ArrowSchema() const override; - Result> BuildRows() const override; - - private: - SystemTableContext context_; -}; - -} // namespace paimon From 4d34c89abf9dc581041cfa2f463b9584bff8affb Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 1 Jun 2026 22:33:41 +0800 Subject: [PATCH 6/8] Address files system table review comments --- .../table/system/metadata_system_tables.cpp | 159 ++++++++++++------ test/inte/read_inte_test.cpp | 93 +++++++++- 2 files changed, 198 insertions(+), 54 deletions(-) diff --git a/src/paimon/core/table/system/metadata_system_tables.cpp b/src/paimon/core/table/system/metadata_system_tables.cpp index 3fdf687ee..72e2e8d38 100644 --- a/src/paimon/core/table/system/metadata_system_tables.cpp +++ b/src/paimon/core/table/system/metadata_system_tables.cpp @@ -40,6 +40,7 @@ #include "paimon/common/utils/binary_row_partition_computer.h" #include "paimon/common/utils/date_time_utils.h" #include "paimon/common/utils/internal_row_utils.h" +#include "paimon/common/utils/object_utils.h" #include "paimon/common/utils/path_util.h" #include "paimon/common/utils/rapidjson_util.h" #include "paimon/core/core_options.h" @@ -53,7 +54,7 @@ #include "paimon/core/schema/schema_manager.h" #include "paimon/core/schema/table_schema.h" #include "paimon/core/snapshot.h" -#include "paimon/core/stats/simple_stats_evolutions.h" +#include "paimon/core/stats/simple_stats_evolution.h" #include "paimon/core/tag/tag.h" #include "paimon/core/utils/branch_manager.h" #include "paimon/core/utils/consumer_manager.h" @@ -167,9 +168,7 @@ Result LocalTimestampMillisValue(int64_t epoch_millis) { } Result LocalTimestampMillisValue(const Timestamp& local_timestamp) { - PAIMON_ASSIGN_OR_RAISE(Timestamp utc_timestamp, DateTimeUtils::ToUTCTimestamp(local_timestamp)); - int64_t epoch_millis = utc_timestamp.GetMillisecond(); - return LocalTimestampMillisValue(epoch_millis); + return TimestampMillisValue(local_timestamp.GetMillisecond()); } VariantType OptionalTimestampMillisValue(const std::optional& value) { @@ -234,6 +233,8 @@ Result> ReadDataManifests( ManifestList::Create(context.fs, core_options.GetManifestFormat(), core_options.GetManifestCompression(), path_factory, pool)); std::vector manifests; + // TODO: Align Java ReadAllManifests semantics by including changelog manifests once + // paimon-cpp exposes the required manifest-list support. PAIMON_RETURN_NOT_OK(manifest_list->ReadDataManifests(snapshot, &manifests)); return manifests; } @@ -285,13 +286,6 @@ Result> ReadLatestDataFiles( return merged_entries; } -std::optional OptionalBinaryRowString(const BinaryRow& row) { - if (row.GetFieldCount() <= 0) { - return std::nullopt; - } - return row.ToString(); -} - Result> OptionalPartitionString( const BinaryRow& row, const std::shared_ptr& partition_schema) { if (row.GetFieldCount() <= 0) { @@ -310,12 +304,6 @@ Result OptionalPartitionStringValue( return OptionalStringValue(value); } -Result PartitionString(const std::shared_ptr& path_factory, - const BinaryRow& partition) { - PAIMON_ASSIGN_OR_RAISE(std::string value, path_factory->GetPartitionString(partition)); - return value; -} - Result FilePath(const std::shared_ptr& path_factory, const ManifestEntry& entry, const DataFileMeta& file) { if (file.external_path) { @@ -326,42 +314,81 @@ Result FilePath(const std::shared_ptr& path_f return PathUtil::JoinPath(bucket_path, file.file_name); } -Result FieldsValueMapString(const std::vector& fields, - const InternalRow& row) { +Result> RowValueStrings(const std::vector& fields, + const InternalRow& row) { std::shared_ptr schema = DataField::ConvertDataFieldsToArrowSchema(fields); PAIMON_ASSIGN_OR_RAISE(std::vector getters, InternalRowUtils::CreateFieldGetters(schema, /*use_view=*/false)); std::vector values; - values.reserve(fields.size()); - for (size_t i = 0; i < fields.size(); ++i) { + int32_t length = std::min(static_cast(fields.size()), row.GetFieldCount()); + values.reserve(length); + for (int32_t i = 0; i < length; ++i) { std::string value = "null"; if (!row.IsNullAt(i)) { VariantType field_value = getters[i](row); - if (std::holds_alternative(field_value)) { - value = std::string(std::get(field_value)); - } else { - value = DataDefine::VariantValueToString(field_value); - } + value = DataDefine::VariantValueToString(field_value); } - values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + values.push_back(std::move(value)); + } + return values; +} + +Result RowValuesString(const std::vector& fields, const InternalRow& row, + std::string_view left, std::string_view right) { + PAIMON_ASSIGN_OR_RAISE(std::vector values, RowValueStrings(fields, row)); + return fmt::format("{}{}{}", left, fmt::join(values, ", "), right); +} + +Result> OptionalRowValuesString(const std::vector& fields, + const InternalRow& row, + std::string_view left, + std::string_view right) { + if (row.GetFieldCount() <= 0) { + return std::optional(); + } + PAIMON_ASSIGN_OR_RAISE(std::string value, RowValuesString(fields, row, left, right)); + return std::optional(value); +} + +Result FieldsValueMapString(const std::vector& fields, + const InternalRow& row) { + PAIMON_ASSIGN_OR_RAISE(std::vector values, RowValueStrings(fields, row)); + std::vector> field_values; + size_t length = std::min(fields.size(), values.size()); + field_values.reserve(length); + for (size_t i = 0; i < length; ++i) { + field_values.emplace_back(fields[i].Name(), std::move(values[i])); } - return fmt::format("{{{}}}", fmt::join(values, ", ")); + std::sort(field_values.begin(), field_values.end(), + [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }); + + std::vector entries; + entries.reserve(field_values.size()); + for (const auto& [name, value] : field_values) { + entries.emplace_back(fmt::format("{}={}", name, value)); + } + return fmt::format("{{{}}}", fmt::join(entries, ", ")); } Result NullValueCountsString(const std::vector& fields, const InternalArray& null_counts) { - std::vector values; - values.reserve(fields.size()); - for (size_t i = 0; i < fields.size(); ++i) { + std::vector> field_values; + int32_t length = std::min(static_cast(fields.size()), null_counts.Size()); + field_values.reserve(length); + for (int32_t i = 0; i < length; ++i) { std::string value = null_counts.IsNullAt(i) ? "null" : std::to_string(null_counts.GetLong(i)); - values.emplace_back(fmt::format("{}:{}", fields[i].Name(), value)); + field_values.emplace_back(fields[i].Name(), std::move(value)); } - return fmt::format("{{{}}}", fmt::join(values, ", ")); -} + std::sort(field_values.begin(), field_values.end(), + [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }); -Result> StatsFields(const std::shared_ptr& schema) { - return schema->Fields(); + std::vector entries; + entries.reserve(field_values.size()); + for (const auto& [name, value] : field_values) { + entries.emplace_back(fmt::format("{}={}", name, value)); + } + return fmt::format("{{{}}}", fmt::join(entries, ", ")); } Result> LoadDataSchema(const MetadataSystemTableContext& context, @@ -373,11 +400,29 @@ Result> LoadDataSchema(const MetadataSystemTableCon return schema_manager.ReadSchema(schema_id); } -Result> ValueStatsFields(const MetadataSystemTableContext& context, - int64_t schema_id) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, - LoadDataSchema(context, schema_id)); - PAIMON_ASSIGN_OR_RAISE(std::vector fields, StatsFields(data_schema)); +Result> ProjectWriteFields(const std::shared_ptr& data_schema, + const DataFileMeta& file) { + if (!file.write_cols) { + return data_schema->Fields(); + } + + std::vector fields; + fields.reserve(file.write_cols->size() + data_schema->PartitionKeys().size()); + for (const auto& write_col : file.write_cols.value()) { + if (write_col == SpecialFields::RowId().Name() || + write_col == SpecialFields::SequenceNumber().Name()) { + continue; + } + PAIMON_ASSIGN_OR_RAISE(DataField field, data_schema->GetField(write_col)); + fields.push_back(std::move(field)); + } + + for (const auto& partition_key : data_schema->PartitionKeys()) { + if (!ObjectUtils::Contains(file.write_cols.value(), partition_key)) { + PAIMON_ASSIGN_OR_RAISE(DataField field, data_schema->GetField(partition_key)); + fields.push_back(std::move(field)); + } + } return fields; } @@ -774,13 +819,11 @@ Result> FilesSystemTable::BuildRows() const { CreatePathFactory(context_, core_options, pool)); PAIMON_ASSIGN_OR_RAISE(std::vector entries, ReadLatestDataFiles(context_, path_factory, core_options, pool)); - std::shared_ptr arrow_schema = - DataField::ConvertDataFieldsToArrowSchema(context_.table_schema->Fields()); PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr partition_schema, - FieldMapping::GetPartitionSchema(arrow_schema, context_.table_schema->PartitionKeys())); + std::vector partition_fields, + context_.table_schema->GetFields(context_.table_schema->PartitionKeys())); + const std::vector& value_stats_fields = context_.table_schema->Fields(); - SimpleStatsEvolutions stats_evolutions(context_.table_schema, pool); std::vector rows; rows.reserve(entries.size()); for (const auto& entry : entries) { @@ -791,10 +834,16 @@ Result> FilesSystemTable::BuildRows() const { const std::shared_ptr& file = entry.File(); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_schema, LoadDataSchema(context_, file->schema_id)); - PAIMON_ASSIGN_OR_RAISE(std::vector value_stats_fields, - ValueStatsFields(context_, file->schema_id)); - std::shared_ptr stats_evolution = - stats_evolutions.GetOrCreate(data_schema); + PAIMON_ASSIGN_OR_RAISE(std::vector data_stats_fields, + ProjectWriteFields(data_schema, *file)); + PAIMON_ASSIGN_OR_RAISE(std::vector key_fields, + data_schema->TrimmedPrimaryKeyFields()); + if (key_fields.empty()) { + key_fields = data_schema->Fields(); + } + auto stats_evolution = std::make_shared( + data_stats_fields, value_stats_fields, + data_schema->Id() != context_.table_schema->Id() || file->write_cols.has_value(), pool); PAIMON_ASSIGN_OR_RAISE( SimpleStatsEvolution::EvolutionStats stats, stats_evolution->Evolution(file->value_stats, file->row_count, file->value_stats_cols)); @@ -804,7 +853,7 @@ Result> FilesSystemTable::BuildRows() const { row.SetField(0, NullType()); } else { PAIMON_ASSIGN_OR_RAISE(std::string partition, - PartitionString(path_factory, entry.Partition())); + RowValuesString(partition_fields, entry.Partition(), "{", "}")); row.SetField(0, StringValue(partition)); } row.SetField(1, entry.Bucket()); @@ -816,8 +865,12 @@ Result> FilesSystemTable::BuildRows() const { row.SetField(5, file->level); row.SetField(6, file->row_count); row.SetField(7, file->file_size); - row.SetField(8, OptionalStringValue(OptionalBinaryRowString(file->min_key))); - row.SetField(9, OptionalStringValue(OptionalBinaryRowString(file->max_key))); + PAIMON_ASSIGN_OR_RAISE(std::optional min_key, + OptionalRowValuesString(key_fields, file->min_key, "[", "]")); + PAIMON_ASSIGN_OR_RAISE(std::optional max_key, + OptionalRowValuesString(key_fields, file->max_key, "[", "]")); + row.SetField(8, OptionalStringValue(min_key)); + row.SetField(9, OptionalStringValue(max_key)); PAIMON_ASSIGN_OR_RAISE(std::string null_value_counts, NullValueCountsString(value_stats_fields, *stats.null_counts)); row.SetField(10, StringValue(null_value_counts)); diff --git a/test/inte/read_inte_test.cpp b/test/inte/read_inte_test.cpp index e1d927203..abaa29f16 100644 --- a/test/inte/read_inte_test.cpp +++ b/test/inte/read_inte_test.cpp @@ -814,10 +814,101 @@ TEST(SystemTableReadInteTest, TestReadFilesSystemTableForPartitionedTable) { ASSERT_EQ(files_array->length(), 1); auto partition_array = std::dynamic_pointer_cast(files_array->field(0)); auto file_path_array = std::dynamic_pointer_cast(files_array->field(2)); + auto min_key_array = std::dynamic_pointer_cast(files_array->field(8)); + auto max_key_array = std::dynamic_pointer_cast(files_array->field(9)); + auto null_value_counts_array = + std::dynamic_pointer_cast(files_array->field(10)); + auto min_value_stats_array = + std::dynamic_pointer_cast(files_array->field(11)); + auto max_value_stats_array = + std::dynamic_pointer_cast(files_array->field(12)); ASSERT_TRUE(partition_array); ASSERT_TRUE(file_path_array); - ASSERT_EQ(partition_array->GetString(0), "dt=20260527/"); + ASSERT_TRUE(min_key_array); + ASSERT_TRUE(max_key_array); + ASSERT_TRUE(null_value_counts_array); + ASSERT_TRUE(min_value_stats_array); + ASSERT_TRUE(max_value_stats_array); + ASSERT_EQ(partition_array->GetString(0), "{20260527}"); ASSERT_NE(file_path_array->GetString(0).find("/dt=20260527/bucket-0/"), std::string::npos); + ASSERT_EQ(min_key_array->GetString(0), "[a]"); + ASSERT_EQ(max_key_array->GetString(0), "[a]"); + ASSERT_EQ(null_value_counts_array->GetString(0), "{dt=0, pk=0, v=0}"); + ASSERT_EQ(min_value_stats_array->GetString(0), "{dt=20260527, pk=a, v=1}"); + ASSERT_EQ(max_value_stats_array->GetString(0), "{dt=20260527, pk=a, v=1}"); +} + +TEST(SystemTableReadInteTest, TestReadFilesSystemTableWithSchemaEvolutionStats) { + std::map options = {{Options::FILE_SYSTEM, "local"}}; + std::string table_path = paimon::test::GetDataDir() + + "/orc/append_table_with_alter_table_with_dense_field.db/" + "append_table_with_alter_table_with_dense_field"; + + ASSERT_OK_AND_ASSIGN(auto files_result, ReadSystemTable(table_path + "$files", options)); + auto files_array = SingleStructChunk(files_result); + ASSERT_EQ(StructFieldNames(files_array), (std::vector{"partition", + "bucket", + "file_path", + "file_format", + "schema_id", + "level", + "record_count", + "file_size_in_bytes", + "min_key", + "max_key", + "null_value_counts", + "min_value_stats", + "max_value_stats", + "min_sequence_number", + "max_sequence_number", + "creation_time", + "deleteRowCount", + "file_source", + "first_row_id", + "write_cols"})); + ASSERT_GT(files_array->length(), 0); + + auto partition_array = std::dynamic_pointer_cast(files_array->field(0)); + auto schema_id_array = std::dynamic_pointer_cast(files_array->field(4)); + auto null_value_counts_array = + std::dynamic_pointer_cast(files_array->field(10)); + auto min_value_stats_array = + std::dynamic_pointer_cast(files_array->field(11)); + auto max_value_stats_array = + std::dynamic_pointer_cast(files_array->field(12)); + ASSERT_TRUE(partition_array); + ASSERT_TRUE(schema_id_array); + ASSERT_TRUE(null_value_counts_array); + ASSERT_TRUE(min_value_stats_array); + ASSERT_TRUE(max_value_stats_array); + + bool found_old_schema_file = false; + bool found_latest_schema_file = false; + for (int64_t i = 0; i < files_array->length(); ++i) { + std::string partition = partition_array->GetString(i); + ASSERT_TRUE(partition == "{0}" || partition == "{1}"); + + std::string null_value_counts = null_value_counts_array->GetString(i); + std::string min_value_stats = min_value_stats_array->GetString(i); + std::string max_value_stats = max_value_stats_array->GetString(i); + ASSERT_NE(null_value_counts.find("f4="), std::string::npos); + ASSERT_NE(min_value_stats.find("f4="), std::string::npos); + ASSERT_NE(max_value_stats.find("f4="), std::string::npos); + ASSERT_EQ(null_value_counts.find("f0="), std::string::npos); + ASSERT_EQ(min_value_stats.find("f0="), std::string::npos); + ASSERT_EQ(max_value_stats.find("f0="), std::string::npos); + + if (schema_id_array->Value(i) == 0) { + found_old_schema_file = true; + ASSERT_NE(null_value_counts.find("f4="), std::string::npos); + ASSERT_NE(min_value_stats.find("f4=null"), std::string::npos); + ASSERT_NE(max_value_stats.find("f4=null"), std::string::npos); + } else if (schema_id_array->Value(i) == 1) { + found_latest_schema_file = true; + } + } + ASSERT_TRUE(found_old_schema_file); + ASSERT_TRUE(found_latest_schema_file); } TEST(SystemTableReadInteTest, TestReadManifestAndFilesSystemTablesForEmptyTable) { From 1a2d85ce68e79b25befcf8038d99553440faac87 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 3 Jun 2026 10:22:12 +0800 Subject: [PATCH 7/8] Fix manifests TODO lint --- src/paimon/core/table/system/metadata_system_tables.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/paimon/core/table/system/metadata_system_tables.cpp b/src/paimon/core/table/system/metadata_system_tables.cpp index 72e2e8d38..77e0de187 100644 --- a/src/paimon/core/table/system/metadata_system_tables.cpp +++ b/src/paimon/core/table/system/metadata_system_tables.cpp @@ -233,8 +233,9 @@ Result> ReadDataManifests( ManifestList::Create(context.fs, core_options.GetManifestFormat(), core_options.GetManifestCompression(), path_factory, pool)); std::vector manifests; - // TODO: Align Java ReadAllManifests semantics by including changelog manifests once - // paimon-cpp exposes the required manifest-list support. + // TODO(suxiaogang223): Align Java ReadAllManifests semantics by including changelog + // manifests. ReadAllManifests currently delegates to ReadChangelogManifests, which returns + // NotImplemented when a snapshot has a changelog manifest list. PAIMON_RETURN_NOT_OK(manifest_list->ReadDataManifests(snapshot, &manifests)); return manifests; } From 03b61848e9ad2742c69877b54d951129a0aecf99 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 3 Jun 2026 15:33:57 +0800 Subject: [PATCH 8/8] Simplify local timestamp handling --- src/paimon/core/table/system/metadata_system_tables.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/paimon/core/table/system/metadata_system_tables.cpp b/src/paimon/core/table/system/metadata_system_tables.cpp index 77e0de187..c63d08072 100644 --- a/src/paimon/core/table/system/metadata_system_tables.cpp +++ b/src/paimon/core/table/system/metadata_system_tables.cpp @@ -167,10 +167,6 @@ Result LocalTimestampMillisValue(int64_t epoch_millis) { return TimestampMillisValue(local_timestamp.GetMillisecond()); } -Result LocalTimestampMillisValue(const Timestamp& local_timestamp) { - return TimestampMillisValue(local_timestamp.GetMillisecond()); -} - VariantType OptionalTimestampMillisValue(const std::optional& value) { if (!value) { return NullType(); @@ -883,9 +879,7 @@ Result> FilesSystemTable::BuildRows() const { row.SetField(12, StringValue(max_value_stats)); row.SetField(13, file->min_sequence_number); row.SetField(14, file->max_sequence_number); - PAIMON_ASSIGN_OR_RAISE(VariantType creation_time, - LocalTimestampMillisValue(file->creation_time)); - row.SetField(15, creation_time); + row.SetField(15, TimestampMillisValue(file->creation_time.GetMillisecond())); row.SetField(16, OptionalInt64Value(file->delete_row_count)); row.SetField(17, file->file_source ? StringValue(file->file_source.value().ToString()) : VariantType(NullType()));