From 50a7382a63ea296562a4a5c2aa3f6065d38bad25 Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Tue, 2 Jun 2026 21:24:00 +0800 Subject: [PATCH] refactor(blob): refactor BlobFileContext to honor the actual schema when classifying BLOB fields --- .../core/operation/blob_file_context.cpp | 60 +++++++-------- src/paimon/core/operation/blob_file_context.h | 16 ---- .../core/operation/blob_file_context_test.cpp | 74 +++++++++++-------- 3 files changed, 67 insertions(+), 83 deletions(-) diff --git a/src/paimon/core/operation/blob_file_context.cpp b/src/paimon/core/operation/blob_file_context.cpp index 375932382..438fd2cc0 100644 --- a/src/paimon/core/operation/blob_file_context.cpp +++ b/src/paimon/core/operation/blob_file_context.cpp @@ -39,51 +39,61 @@ BlobFileContext::BlobFileContext(std::set descriptor_fields, std::unique_ptr BlobFileContext::Create( const std::shared_ptr& schema, const CoreOptions& options) { - // Check if there are any BLOB fields in the schema - bool has_blob = false; + // Collect the BLOB field names that are present in the given schema. The schema may + // only contain a subset of the table columns (e.g. a projected read/write schema), so all + // field categories below must be derived from this set rather than from the options + // alone, which describe the full table. + std::set schema_blob_fields; for (int i = 0; i < schema->num_fields(); ++i) { - if (BlobUtils::IsBlobField(schema->field(i))) { - has_blob = true; - break; + const auto& field = schema->field(i); + if (BlobUtils::IsBlobField(field)) { + schema_blob_fields.insert(field->name()); } } - if (!has_blob) { + if (schema_blob_fields.empty()) { return nullptr; } // Populate descriptor fields std::set descriptor_fields; for (const auto& name : options.GetBlobDescriptorFields()) { - descriptor_fields.insert(name); + if (schema_blob_fields.count(name) > 0) { + descriptor_fields.insert(name); + } } // Populate view fields std::set view_fields; for (const auto& name : options.GetBlobViewFields()) { - view_fields.insert(name); + if (schema_blob_fields.count(name) > 0) { + view_fields.insert(name); + } } // Populate inline fields from options (descriptor ∪ view) std::set inline_fields; for (const auto& name : options.GetBlobInlineFields()) { - inline_fields.insert(name); + if (schema_blob_fields.count(name) > 0) { + inline_fields.insert(name); + } } // Populate external storage fields std::set external_storage_fields; for (const auto& name : options.GetBlobExternalStorageFields()) { - external_storage_fields.insert(name); + if (schema_blob_fields.count(name) > 0) { + external_storage_fields.insert(name); + } } // Populate external storage path std::optional external_storage_path = options.GetBlobExternalStoragePath(); - // Determine blob_file_fields: BLOB fields that are NOT inline + // Determine blob_file_fields: schema BLOB fields that are NOT inline std::set blob_file_fields; - for (int i = 0; i < schema->num_fields(); ++i) { - const auto& field = schema->field(i); - if (BlobUtils::IsBlobField(field) && inline_fields.count(field->name()) == 0) { - blob_file_fields.insert(field->name()); + for (const auto& name : schema_blob_fields) { + if (inline_fields.count(name) == 0) { + blob_file_fields.insert(name); } } @@ -93,26 +103,6 @@ std::unique_ptr BlobFileContext::Create( std::move(blob_file_fields), std::move(external_storage_path))); } -bool BlobFileContext::IsInlineField(const std::string& field_name) const { - return inline_fields_.count(field_name) > 0; -} - -bool BlobFileContext::IsBlobFileField(const std::string& field_name) const { - return blob_file_fields_.count(field_name) > 0; -} - -bool BlobFileContext::IsDescriptorField(const std::string& field_name) const { - return descriptor_fields_.count(field_name) > 0; -} - -bool BlobFileContext::IsViewField(const std::string& field_name) const { - return view_fields_.count(field_name) > 0; -} - -bool BlobFileContext::IsExternalStorageField(const std::string& field_name) const { - return external_storage_fields_.count(field_name) > 0; -} - bool BlobFileContext::RequireBlobFileWriter() const { return !blob_file_fields_.empty(); } diff --git a/src/paimon/core/operation/blob_file_context.h b/src/paimon/core/operation/blob_file_context.h index 2caf675a0..d11698f6f 100644 --- a/src/paimon/core/operation/blob_file_context.h +++ b/src/paimon/core/operation/blob_file_context.h @@ -46,22 +46,6 @@ class BlobFileContext { static std::unique_ptr Create(const std::shared_ptr& schema, const CoreOptions& options); - /// Returns true if the given field should be stored inline in the main data file - /// (either as descriptor bytes or view bytes). - bool IsInlineField(const std::string& field_name) const; - - /// Returns true if the given field should be written to a separate .blob file. - bool IsBlobFileField(const std::string& field_name) const; - - /// Returns true if the given field is a descriptor field. - bool IsDescriptorField(const std::string& field_name) const; - - /// Returns true if the given field is a view field. - bool IsViewField(const std::string& field_name) const; - - /// Returns true if the given field should be written to external storage. - bool IsExternalStorageField(const std::string& field_name) const; - /// Returns true if there are any BLOB fields that need a .blob file writer. bool RequireBlobFileWriter() const; diff --git a/src/paimon/core/operation/blob_file_context_test.cpp b/src/paimon/core/operation/blob_file_context_test.cpp index 70156f3bc..cabf10d62 100644 --- a/src/paimon/core/operation/blob_file_context_test.cpp +++ b/src/paimon/core/operation/blob_file_context_test.cpp @@ -91,18 +91,6 @@ TEST_F(BlobFileContextTest, MixedInlineAndBlobFile) { // blob file fields = non-inline blob fields ASSERT_EQ(context->GetBlobFileFields(), std::set({"video", "audio"})); - // Query methods - ASSERT_TRUE(context->IsInlineField("image")); - ASSERT_TRUE(context->IsDescriptorField("image")); - ASSERT_FALSE(context->IsViewField("image")); - ASSERT_FALSE(context->IsBlobFileField("image")); - - ASSERT_FALSE(context->IsInlineField("video")); - ASSERT_TRUE(context->IsBlobFileField("video")); - - ASSERT_FALSE(context->IsInlineField("audio")); - ASSERT_TRUE(context->IsBlobFileField("audio")); - // Requires blob file writer for video and audio ASSERT_TRUE(context->RequireBlobFileWriter()); ASSERT_FALSE(context->RequireExternalStorageWriter()); @@ -126,9 +114,6 @@ TEST_F(BlobFileContextTest, ExternalStorageFields) { ASSERT_EQ(context->GetExternalStoragePath(), "oss://bucket/blob/"); ASSERT_TRUE(context->GetBlobFileFields().empty()); - ASSERT_TRUE(context->IsExternalStorageField("image")); - ASSERT_FALSE(context->IsExternalStorageField("video")); - ASSERT_FALSE(context->RequireBlobFileWriter()); ASSERT_TRUE(context->RequireExternalStorageWriter()); } @@ -148,10 +133,6 @@ TEST_F(BlobFileContextTest, ViewFields) { ASSERT_EQ(context->GetInlineFields(), std::set({"ref_image"})); ASSERT_EQ(context->GetBlobFileFields(), std::set({"raw_blob"})); - ASSERT_TRUE(context->IsInlineField("ref_image")); - ASSERT_TRUE(context->IsViewField("ref_image")); - ASSERT_FALSE(context->IsDescriptorField("ref_image")); - ASSERT_TRUE(context->RequireBlobFileWriter()); ASSERT_FALSE(context->RequireExternalStorageWriter()); } @@ -176,24 +157,53 @@ TEST_F(BlobFileContextTest, DescriptorAndViewTogether) { ASSERT_EQ(context->GetExternalStoragePath(), "/tmp/ext/"); ASSERT_EQ(context->GetBlobFileFields(), std::set({"normal_blob"})); - ASSERT_TRUE(context->IsDescriptorField("desc_blob")); - ASSERT_TRUE(context->IsExternalStorageField("desc_blob")); - ASSERT_TRUE(context->IsInlineField("desc_blob")); - ASSERT_FALSE(context->IsBlobFileField("desc_blob")); + ASSERT_TRUE(context->RequireBlobFileWriter()); + ASSERT_TRUE(context->RequireExternalStorageWriter()); +} - ASSERT_TRUE(context->IsViewField("view_blob")); - ASSERT_TRUE(context->IsInlineField("view_blob")); - ASSERT_FALSE(context->IsDescriptorField("view_blob")); +TEST_F(BlobFileContextTest, PartialSchemaIgnoresAbsentFields) { + // Schema only carries "image"; "video" and "audio" are not part of this write schema. + auto schema = MakeSchema({"id"}, {"image"}); + std::map opts_map = { + {Options::BLOB_DESCRIPTOR_FIELD, "image,audio"}, + {Options::BLOB_VIEW_FIELD, "video"}, + {Options::BLOB_EXTERNAL_STORAGE_FIELD, "image,video"}, + {Options::BLOB_EXTERNAL_STORAGE_PATH, "oss://bucket/blob/"}, + }; + ASSERT_OK_AND_ASSIGN(auto options, CoreOptions::FromMap(opts_map)); + auto context = BlobFileContext::Create(schema, options); + ASSERT_TRUE(context); - ASSERT_FALSE(context->IsInlineField("normal_blob")); - ASSERT_TRUE(context->IsBlobFileField("normal_blob")); + // Only "image" survives filtering; "audio" / "video" are not in the schema. + ASSERT_EQ(context->GetDescriptorFields(), std::set({"image"})); + ASSERT_TRUE(context->GetViewFields().empty()); + ASSERT_EQ(context->GetInlineFields(), std::set({"image"})); + ASSERT_EQ(context->GetExternalStorageFields(), std::set({"image"})); - // Non-existent field - ASSERT_FALSE(context->IsInlineField("not_exist")); - ASSERT_FALSE(context->IsBlobFileField("not_exist")); + // No non-inline blob field remains in the schema. + ASSERT_TRUE(context->GetBlobFileFields().empty()); - ASSERT_TRUE(context->RequireBlobFileWriter()); + ASSERT_FALSE(context->RequireBlobFileWriter()); ASSERT_TRUE(context->RequireExternalStorageWriter()); } +TEST_F(BlobFileContextTest, PartialSchemaWithOnlyBlobFileField) { + auto schema = MakeSchema({"id"}, {"audio"}); + std::map opts_map = { + {Options::BLOB_DESCRIPTOR_FIELD, "image"}, {Options::BLOB_VIEW_FIELD, "video"}, + // "audio" is not configured as inline -> goes to .blob file + }; + ASSERT_OK_AND_ASSIGN(auto options, CoreOptions::FromMap(opts_map)); + auto context = BlobFileContext::Create(schema, options); + ASSERT_TRUE(context); + + ASSERT_TRUE(context->GetDescriptorFields().empty()); + ASSERT_TRUE(context->GetViewFields().empty()); + ASSERT_TRUE(context->GetInlineFields().empty()); + ASSERT_EQ(context->GetBlobFileFields(), std::set({"audio"})); + + ASSERT_TRUE(context->RequireBlobFileWriter()); + ASSERT_FALSE(context->RequireExternalStorageWriter()); +} + } // namespace paimon