From 2f0a0f4a6f4844842815fecb91d3c0fa4a4ee314 Mon Sep 17 00:00:00 2001 From: "jinli.zjw" Date: Mon, 1 Jun 2026 17:00:11 +0800 Subject: [PATCH 1/4] feat(fs): support uint64 file IO sizes --- include/paimon/fs/file_system.h | 12 +-- include/paimon/io/buffered_input_stream.h | 18 ++-- include/paimon/io/byte_array_input_stream.h | 10 +-- include/paimon/io/data_input_stream.h | 8 +- include/paimon/memory/bytes.h | 2 +- .../data/serializer/binary_row_serializer.cpp | 3 +- .../bloomfilter/bloom_filter_file_index.cpp | 4 +- .../bsi/bit_slice_index_bitmap_file_index.cpp | 4 +- .../rangebitmap/bit_slice_index_bitmap.cpp | 9 +- .../rangebitmap/range_bitmap_io_test.cpp | 2 +- src/paimon/common/fs/file_system.cpp | 10 +-- src/paimon/common/fs/file_system_test.cpp | 69 +++++++------- .../bitmap/bitmap_global_index_test.cpp | 1 - .../btree/btree_compatibility_test.cpp | 3 +- .../range_bitmap_global_index_test.cpp | 1 - .../wrap/file_index_writer_wrapper.h | 22 +---- .../common/io/buffered_input_stream.cpp | 44 ++++----- .../common/io/byte_array_input_stream.cpp | 29 +++--- src/paimon/common/io/cache_input_stream.h | 12 +-- .../common/io/cache_input_stream_test.cpp | 10 +-- .../io/data_input_output_stream_test.cpp | 4 +- src/paimon/common/io/data_input_stream.cpp | 24 ++--- src/paimon/common/io/data_output_stream.cpp | 10 +-- src/paimon/common/io/data_output_stream.h | 6 +- src/paimon/common/io/offset_input_stream.cpp | 56 ++++++------ src/paimon/common/io/offset_input_stream.h | 12 +-- src/paimon/common/memory/bytes.cpp | 2 +- .../common/memory/memory_segment_utils.cpp | 2 +- .../common/memory/memory_segment_utils.h | 2 +- .../prefetch_file_batch_reader_impl_test.cpp | 7 +- src/paimon/common/sst/bloom_filter_handle.h | 7 +- src/paimon/common/sst/sst_file_utils.h | 2 - src/paimon/common/sst/sst_file_writer.cpp | 10 +-- .../arrow/arrow_input_stream_adapter.cpp | 48 ++-------- .../arrow/arrow_output_stream_adapter.cpp | 16 ++-- src/paimon/common/utils/math.h | 13 +++ src/paimon/common/utils/read_ahead_cache.cpp | 11 +-- src/paimon/common/utils/roaring_bitmap32.cpp | 4 +- src/paimon/common/utils/roaring_bitmap64.cpp | 4 +- src/paimon/common/utils/serialization_utils.h | 3 +- src/paimon/common/utils/stream_utils.h | 6 +- .../bitmap_deletion_vector.cpp | 8 +- .../deletionvectors/deletion_file_writer.cpp | 16 ++-- .../deletionvectors/deletion_file_writer.h | 4 +- .../deletion_file_writer_test.cpp | 2 +- .../core/deletionvectors/deletion_vector.cpp | 4 +- .../manifest/manifest_committable_test.cpp | 2 +- .../lookup/remote_lookup_file_manager.cpp | 6 +- .../remote_lookup_file_manager_test.cpp | 9 +- src/paimon/core/utils/objects_file.h | 4 +- .../format/avro/avro_file_batch_reader.cpp | 8 +- .../format/avro/avro_input_stream_impl.cpp | 3 +- .../format/avro/avro_output_stream_impl.cpp | 4 +- src/paimon/format/blob/blob_format_writer.cpp | 25 +++--- src/paimon/format/blob/blob_format_writer.h | 4 +- .../format/blob/blob_format_writer_test.cpp | 2 +- .../format/orc/orc_input_stream_impl.cpp | 4 +- .../format/orc/orc_output_stream_impl.cpp | 6 +- src/paimon/fs/jindo/jindo_file_system.cpp | 34 ++++--- src/paimon/fs/jindo/jindo_file_system.h | 12 +-- .../fs/jindo/jindo_file_system_test.cpp | 6 +- src/paimon/fs/local/local_file.cpp | 53 +++++------ src/paimon/fs/local/local_file.h | 10 +-- src/paimon/fs/local/local_file_system.cpp | 24 ++--- src/paimon/fs/local/local_file_system.h | 12 +-- src/paimon/fs/local/local_file_test.cpp | 18 ++-- .../lucene/lucene_global_index_writer.cpp | 5 +- src/paimon/global_index/lucene/lucene_input.h | 2 +- .../lumina/lumina_file_io_test.cpp | 89 ++++++++----------- .../global_index/lumina/lumina_file_reader.h | 84 ++++------------- .../global_index/lumina/lumina_file_writer.h | 34 +++---- src/paimon/testing/mock/mock_file_system.h | 12 +-- .../testing/mock/mock_format_writer.cpp | 4 +- test/inte/read_inte_test.cpp | 8 +- test/inte/write_and_read_inte_test.cpp | 2 +- 75 files changed, 464 insertions(+), 578 deletions(-) diff --git a/include/paimon/fs/file_system.h b/include/paimon/fs/file_system.h index 7edde1558..89af108b2 100644 --- a/include/paimon/fs/file_system.h +++ b/include/paimon/fs/file_system.h @@ -65,7 +65,7 @@ class PAIMON_EXPORT InputStream : public Stream { /// @return Current position in the input stream. /// @return IOError returned if an I/O error occurred in the underlying stream. /// implementation while accessing the stream's position. - virtual Result GetPos() const = 0; + virtual Result GetPos() const = 0; /// Read data from the current position in the stream. /// @param[out] buffer Pointer to the buffer where read data will be stored. @@ -73,7 +73,7 @@ class PAIMON_EXPORT InputStream : public Stream { /// @return Result containing the actual number of bytes read on success, or an error status on /// failure. /// @note The stream position advances by the number of bytes actually read. - virtual Result Read(char* buffer, uint32_t size) = 0; + virtual Result Read(char* buffer, uint64_t size) = 0; /// Read data from given position in the stream. /// @@ -83,7 +83,7 @@ class PAIMON_EXPORT InputStream : public Stream { /// @param[out] buffer The buffer to store the read content. /// @param size The number of bytes to read. /// @param offset The position in the stream to read from. - virtual Result Read(char* buffer, uint32_t size, uint64_t offset) = 0; + virtual Result Read(char* buffer, uint64_t size, uint64_t offset) = 0; /// Asynchronously read data from the input stream. /// @@ -98,7 +98,7 @@ class PAIMON_EXPORT InputStream : public Stream { /// @param callback The callback function to be invoked upon completion of the read operation. /// The callback will receive a Status object indicating the success or failure /// of the read operation. - virtual void ReadAsync(char* buffer, uint32_t size, uint64_t offset, + virtual void ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) = 0; /// Get an identifier that uniquely identify the underlying content. @@ -121,12 +121,12 @@ class PAIMON_EXPORT OutputStream : public Stream { /// @return Result containing the actual number of bytes written on success, or an error status /// on failure. /// @note The stream position advances by the number of bytes actually written. - virtual Result Write(const char* buffer, uint32_t size) = 0; + virtual Result Write(const char* buffer, uint64_t size) = 0; /// Flush pending data to the disk. virtual Status Flush() = 0; /// Get the write position. - virtual Result GetPos() const = 0; + virtual Result GetPos() const = 0; /// Get the uri of the output stream. virtual Result GetUri() const = 0; }; diff --git a/include/paimon/io/buffered_input_stream.h b/include/paimon/io/buffered_input_stream.h index 5f6e8452a..eb452b9cf 100644 --- a/include/paimon/io/buffered_input_stream.h +++ b/include/paimon/io/buffered_input_stream.h @@ -49,13 +49,13 @@ class PAIMON_EXPORT BufferedInputStream : public InputStream { Status Seek(int64_t offset, SeekOrigin origin) override; - Result GetPos() const override; + Result GetPos() const override; - Result Read(char* buffer, uint32_t size) override; + Result Read(char* buffer, uint64_t size) override; - Result Read(char* buffer, uint32_t size, uint64_t offset) override; + Result Read(char* buffer, uint64_t size, uint64_t offset) override; - void ReadAsync(char* buffer, uint32_t size, uint64_t offset, + void ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) override; Result Length() const override; @@ -72,15 +72,15 @@ class PAIMON_EXPORT BufferedInputStream : public InputStream { /// Internal read implementation. /// @pre size > 0 - Result InnerRead(char* buffer, int32_t size); + Result InnerRead(char* buffer, uint64_t size); /// Validate that the expected number of bytes were read. - Status AssertReadLength(int32_t read_length, int32_t actual_read_length) const; + Status AssertReadLength(uint64_t read_length, uint64_t actual_read_length) const; private: - int32_t buffer_size_; - int32_t pos_ = 0; - int32_t count_ = 0; + uint64_t buffer_size_; + uint64_t pos_ = 0; + uint64_t count_ = 0; std::unique_ptr buffer_; std::shared_ptr in_; }; diff --git a/include/paimon/io/byte_array_input_stream.h b/include/paimon/io/byte_array_input_stream.h index fc7cba2fe..c387f8f97 100644 --- a/include/paimon/io/byte_array_input_stream.h +++ b/include/paimon/io/byte_array_input_stream.h @@ -37,15 +37,15 @@ class PAIMON_EXPORT ByteArrayInputStream : public InputStream { Status Seek(int64_t offset, SeekOrigin origin) override; - Result GetPos() const override { + Result GetPos() const override { return position_; } - Result Read(char* buffer, uint32_t size) override; + Result Read(char* buffer, uint64_t size) override; - Result Read(char* buffer, uint32_t size, uint64_t offset) override; + Result Read(char* buffer, uint64_t size, uint64_t offset) override; - void ReadAsync(char* buffer, uint32_t size, uint64_t offset, + void ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) override; Result Length() const override { @@ -59,6 +59,6 @@ class PAIMON_EXPORT ByteArrayInputStream : public InputStream { private: const char* buffer_; const uint64_t length_; - int64_t position_; + uint64_t position_; }; } // namespace paimon diff --git a/include/paimon/io/data_input_stream.h b/include/paimon/io/data_input_stream.h index f9ca6351f..f4e574962 100644 --- a/include/paimon/io/data_input_stream.h +++ b/include/paimon/io/data_input_stream.h @@ -55,14 +55,14 @@ class PAIMON_EXPORT DataInputStream { /// Read raw data of specified size from the stream. /// @param data Buffer to store the read data. /// @param size Number of bytes to read. - Status Read(char* data, uint32_t size) const; + Status Read(char* data, uint64_t size) const; /// Read string from the stream. /// @note First read length (int16), then read string bytes. Result ReadString() const; /// Get the current position in the underlying input stream. - Result GetPos() const; + Result GetPos() const; /// Get the total length of the underlying input stream. Result Length() const; @@ -77,11 +77,11 @@ class PAIMON_EXPORT DataInputStream { /// Validate that the expected number of bytes were read. /// @param read_length Expected number of bytes to read. /// @param actual_read_length Actual number of bytes read. - Status AssertReadLength(int32_t read_length, int32_t actual_read_length) const; + Status AssertReadLength(uint64_t read_length, uint64_t actual_read_length) const; /// Check if there are enough bytes available to read. /// @param need_length Number of bytes needed. - Status AssertBoundary(int32_t need_length) const; + Status AssertBoundary(uint64_t need_length) const; /// Determine if byte swapping is needed based on current byte order and system endianness. /// @return `true` if byte swapping is required, `false` otherwise. diff --git a/include/paimon/memory/bytes.h b/include/paimon/memory/bytes.h index 1b624496d..22de00ca7 100644 --- a/include/paimon/memory/bytes.h +++ b/include/paimon/memory/bytes.h @@ -78,7 +78,7 @@ class PAIMON_EXPORT Bytes { /// @param length Number of bytes to allocate. /// @param pool Memory pool to use for allocation. /// @return Unique pointer to the newly allocated Bytes object. - static PAIMON_UNIQUE_PTR AllocateBytes(int32_t length, MemoryPool* pool); + static PAIMON_UNIQUE_PTR AllocateBytes(uint64_t length, MemoryPool* pool); /// Allocate a new Bytes object from string data. /// diff --git a/src/paimon/common/data/serializer/binary_row_serializer.cpp b/src/paimon/common/data/serializer/binary_row_serializer.cpp index 766bc3186..eb764dafa 100644 --- a/src/paimon/common/data/serializer/binary_row_serializer.cpp +++ b/src/paimon/common/data/serializer/binary_row_serializer.cpp @@ -44,7 +44,8 @@ Status BinaryRowSerializer::SerializeWithoutLength(const BinaryRow& record, Result BinaryRowSerializer::Deserialize(DataInputStream* source) const { BinaryRow row(num_fields_); PAIMON_ASSIGN_OR_RAISE(int32_t read_length, source->ReadValue()); - std::shared_ptr bytes = Bytes::AllocateBytes(read_length, pool_.get()); + std::shared_ptr bytes = + Bytes::AllocateBytes(static_cast(read_length), pool_.get()); PAIMON_RETURN_NOT_OK(source->ReadBytes(bytes.get())); row.PointTo(MemorySegment::Wrap(bytes), 0, read_length); return row; diff --git a/src/paimon/common/file_index/bloomfilter/bloom_filter_file_index.cpp b/src/paimon/common/file_index/bloomfilter/bloom_filter_file_index.cpp index 157361f32..c029ad42f 100644 --- a/src/paimon/common/file_index/bloomfilter/bloom_filter_file_index.cpp +++ b/src/paimon/common/file_index/bloomfilter/bloom_filter_file_index.cpp @@ -45,9 +45,9 @@ Result> BloomFilterFileIndex::CreateReader( PAIMON_RETURN_NOT_OK(input_stream->Seek(start, SeekOrigin::FS_SEEK_SET)); auto bytes = std::make_shared(length, pool.get()); - PAIMON_ASSIGN_OR_RAISE(int32_t actual_read_len, + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_len, input_stream->Read(bytes->data(), bytes->size())); - if (static_cast(actual_read_len) != bytes->size()) { + if (actual_read_len != bytes->size()) { return Status::Invalid( fmt::format("create reader for BloomFilterFileIndex failed, expected read len " "{}, actual read len {}", diff --git a/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.cpp b/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.cpp index 0bbd993ec..00d86d08e 100644 --- a/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.cpp +++ b/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.cpp @@ -53,9 +53,9 @@ Result> BitSliceIndexBitmapFileIndex::CreateRea PAIMON_RETURN_NOT_OK(input_stream->Seek(start, SeekOrigin::FS_SEEK_SET)); auto bytes = std::make_unique(length, pool.get()); - PAIMON_ASSIGN_OR_RAISE(int32_t actual_read_len, + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_len, input_stream->Read(bytes->data(), bytes->size())); - if (static_cast(actual_read_len) != bytes->size()) { + if (actual_read_len != bytes->size()) { return Status::Invalid( fmt::format("create reader for BitSliceIndexBitmapFileIndex failed, expected read len " "{}, actual read len {}", diff --git a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp index 86007e5ad..48fe132d0 100644 --- a/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp +++ b/src/paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include "paimon/common/io/memory_segment_output_stream.h" @@ -43,7 +44,7 @@ Result> BitSliceIndexBitmap::Create( PAIMON_ASSIGN_OR_RAISE(int8_t slices_size, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE(int32_t ebm_size, data_in->ReadValue()); PAIMON_ASSIGN_OR_RAISE(int32_t indexes_length, data_in->ReadValue()); - auto indexes = Bytes::AllocateBytes(indexes_length, pool.get()); + auto indexes = Bytes::AllocateBytes(static_cast(indexes_length), pool.get()); PAIMON_RETURN_NOT_OK(data_in->Read(indexes->data(), indexes_length)); return std::unique_ptr(new BitSliceIndexBitmap( indexes_length, std::move(indexes), ebm_size, slices_size, input_stream, @@ -82,7 +83,7 @@ BitSliceIndexBitmap::BitSliceIndexBitmap(int32_t indexes_length, PAIMON_UNIQUE_P Result BitSliceIndexBitmap::GetExistenceBitmap() { if (!ebm_.has_value()) { PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_, FS_SEEK_SET)); - const auto bytes = Bytes::AllocateBytes(ebm_length_, pool_.get()); + const auto bytes = Bytes::AllocateBytes(static_cast(ebm_length_), pool_.get()); PAIMON_RETURN_NOT_OK(input_stream_->Read(bytes->data(), ebm_length_)); RoaringBitmap32 bitmap; PAIMON_RETURN_NOT_OK(bitmap.Deserialize(bytes->data(), ebm_length_)); @@ -118,8 +119,8 @@ Status BitSliceIndexBitmap::LoadSlices(int32_t start, int32_t end) { length += slice_length; } PAIMON_RETURN_NOT_OK(input_stream_->Seek(body_offset_ + ebm_length_ + offset, FS_SEEK_SET)); - const auto bytes = Bytes::AllocateBytes(length, pool_.get()); - PAIMON_RETURN_NOT_OK(input_stream_->Read(bytes->data(), length)); + const auto bytes = Bytes::AllocateBytes(static_cast(length), pool_.get()); + PAIMON_RETURN_NOT_OK(input_stream_->Read(bytes->data(), static_cast(length))); int32_t byte_position = 0; for (int32_t i = start; i < end; ++i) { const int32_t slice_length = lengths[i]; diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_io_test.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_io_test.cpp index 848264178..17d98060f 100644 --- a/src/paimon/common/file_index/rangebitmap/range_bitmap_io_test.cpp +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_io_test.cpp @@ -103,7 +103,7 @@ TEST_F(RangeBitmapIoTest, TestSimple) { ASSERT_OK_AND_ASSIGN(std::shared_ptr out, fs_->Create(file_path, /*overwrite=*/false)); ASSERT_OK_AND_ASSIGN( - int32_t write_len, + uint64_t write_len, out->Write(reinterpret_cast(serialized_bytes->data()), serialized_bytes->size())); ASSERT_EQ(write_len, serialized_bytes->size()); ASSERT_OK(out->Flush()); diff --git a/src/paimon/common/fs/file_system.cpp b/src/paimon/common/fs/file_system.cpp index 262973efa..e22fa1dc0 100644 --- a/src/paimon/common/fs/file_system.cpp +++ b/src/paimon/common/fs/file_system.cpp @@ -49,9 +49,9 @@ Status FileSystem::ReadFile(const std::string& path, std::string* content) { (void)s; }); PAIMON_ASSIGN_OR_RAISE(uint64_t length, in->Length()); - content->resize(length); - PAIMON_ASSIGN_OR_RAISE(int32_t read_length, in->Read(content->data(), length)); - if (read_length != static_cast(length)) { + content->resize(static_cast(length)); + PAIMON_ASSIGN_OR_RAISE(uint64_t read_length, in->Read(content->data(), length)); + if (read_length != length) { return Status::IOError(fmt::format("path {}, expect read len {}, actual read len {}", path, length, read_length)); } @@ -69,8 +69,8 @@ Status FileSystem::WriteFile(const std::string& path, const std::string& content Status s = out->Close(); (void)s; }); - int32_t length = content.size(); - PAIMON_ASSIGN_OR_RAISE(int32_t write_length, out->Write(content.data(), length)); + uint64_t length = content.size(); + PAIMON_ASSIGN_OR_RAISE(uint64_t write_length, out->Write(content.data(), length)); if (write_length != length) { return Status::IOError(fmt::format("path {}, expect write len {}, actual write len {}", path, length, write_length)); diff --git a/src/paimon/common/fs/file_system_test.cpp b/src/paimon/common/fs/file_system_test.cpp index bf177d1ab..49c6983ed 100644 --- a/src/paimon/common/fs/file_system_test.cpp +++ b/src/paimon/common/fs/file_system_test.cpp @@ -91,7 +91,7 @@ class FileSystemTest : public ::testing::Test, public ::testing::WithParamInterf fs_->Create(file, /*overwrite=*/true)); std::string input = "paimon"; char chars[8] = {1, 2, 3, 4, 5, 6, 7, 8}; - ASSERT_OK_AND_ASSIGN(int32_t size, out->Write(chars, input.size())); + ASSERT_OK_AND_ASSIGN(uint64_t size, out->Write(chars, input.size())); ASSERT_EQ(size, input.size()); ASSERT_OK(out->Flush()); ASSERT_OK(out->Close()); @@ -202,7 +202,7 @@ TEST_P(FileSystemTest, TestCreate) { ASSERT_OK_AND_ASSIGN(std::unique_ptr out, fs_->Create(path, /*overwrite=*/true)); ASSERT_TRUE(out); std::string input = "paimon"; - ASSERT_OK_AND_ASSIGN(int32_t size, out->Write(input.data(), input.size())); + ASSERT_OK_AND_ASSIGN(uint64_t size, out->Write(input.data(), input.size())); ASSERT_EQ(size, input.size()); ASSERT_OK(out->Close()); @@ -215,11 +215,11 @@ TEST_P(FileSystemTest, TestSimpleWriteAndRead) { std::string file_path = test_root_ + "/file.data"; // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN(int32_t write_len, out_stream->Write(content.data(), content.size())); + ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_EQ(write_len, content.size()); ASSERT_OK(out_stream->Flush()); - ASSERT_OK_AND_ASSIGN(int64_t pos, out_stream->GetPos()); + ASSERT_OK_AND_ASSIGN(uint64_t pos, out_stream->GetPos()); ASSERT_EQ(pos, content.size()); ASSERT_OK_AND_ASSIGN(std::string uri, out_stream->GetUri()); @@ -233,7 +233,7 @@ TEST_P(FileSystemTest, TestSimpleWriteAndRead) { // read from cur pos std::string read_content(content.size(), '\0'); - ASSERT_OK_AND_ASSIGN(int32_t read_len, + ASSERT_OK_AND_ASSIGN(uint64_t read_len, in_stream->Read(read_content.data(), read_content.size())); ASSERT_EQ(read_len, read_content.size()); ASSERT_EQ(content, read_content); @@ -260,18 +260,18 @@ TEST_P(FileSystemTest, TestWriteMultipleTimes) { // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); for (const auto& str : content_vec) { - ASSERT_OK_AND_ASSIGN(int32_t write_len, out_stream->Write(str.data(), str.size())); + ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(str.data(), str.size())); ASSERT_EQ(write_len, str.size()); } ASSERT_OK(out_stream->Flush()); - ASSERT_OK_AND_ASSIGN(int64_t pos, out_stream->GetPos()); + ASSERT_OK_AND_ASSIGN(uint64_t pos, out_stream->GetPos()); ASSERT_EQ(pos, content.size()); ASSERT_OK(out_stream->Close()); // read process ASSERT_OK_AND_ASSIGN(auto in_stream, fs_->Open(file_path)); std::string read_content(content.size(), '\0'); - ASSERT_OK_AND_ASSIGN(int32_t read_len, + ASSERT_OK_AND_ASSIGN(uint64_t read_len, in_stream->Read(read_content.data(), read_content.size())); ASSERT_EQ(read_len, read_content.size()); ASSERT_EQ(content, read_content); @@ -282,7 +282,7 @@ TEST_P(FileSystemTest, TestWriteInNotExistDir) { // write process std::string content = "abcdefghijk"; ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN([[maybe_unused]] int32_t write_len, + ASSERT_OK_AND_ASSIGN([[maybe_unused]] uint64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); @@ -290,7 +290,7 @@ TEST_P(FileSystemTest, TestWriteInNotExistDir) { // read process ASSERT_OK_AND_ASSIGN(auto in_stream, fs_->Open(file_path)); std::string read_content(content.size(), '\0'); - ASSERT_OK_AND_ASSIGN(int32_t read_len, + ASSERT_OK_AND_ASSIGN(uint64_t read_len, in_stream->Read(read_content.data(), read_content.size())); ASSERT_EQ(read_len, read_content.size()); ASSERT_EQ(content, read_content); @@ -304,7 +304,7 @@ TEST_P(FileSystemTest, TestWriteEmptyFile) { // write process std::string content = ""; ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN(int32_t write_len, out_stream->Write(content.data(), content.size())); + ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_EQ(write_len, 0); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); @@ -321,7 +321,7 @@ TEST_P(FileSystemTest, TestWriteEmptyFile) { // read process ASSERT_OK_AND_ASSIGN(auto in_stream, fs_->Open(file_path)); std::string read_content(content.size(), '\0'); - ASSERT_OK_AND_ASSIGN(int32_t read_len, + ASSERT_OK_AND_ASSIGN(uint64_t read_len, in_stream->Read(read_content.data(), read_content.size())); ASSERT_EQ(read_len, read_content.size()); ASSERT_EQ(content, read_content); @@ -332,7 +332,7 @@ TEST_P(FileSystemTest, TestWriteWithOverwrite) { std::string file_path = test_root_ + "/file.data"; // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN(int32_t write_len, out_stream->Write(content.data(), content.size())); + ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_EQ(write_len, content.size()); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); @@ -349,7 +349,7 @@ TEST_P(FileSystemTest, TestWriteWithOverwrite) { // read process ASSERT_OK_AND_ASSIGN(auto in_stream, fs_->Open(file_path)); std::string read_content(new_content.size(), '\0'); - ASSERT_OK_AND_ASSIGN(int32_t read_len, + ASSERT_OK_AND_ASSIGN(uint64_t read_len, in_stream->Read(read_content.data(), read_content.size())); ASSERT_EQ(read_len, read_content.size()); ASSERT_EQ(new_content, read_content); @@ -365,7 +365,7 @@ TEST_P(FileSystemTest, TestAsyncRead) { std::string file_path = test_root_ + "/file.data"; // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN([[maybe_unused]] int32_t write_len, + ASSERT_OK_AND_ASSIGN([[maybe_unused]] uint64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); @@ -374,8 +374,8 @@ TEST_P(FileSystemTest, TestAsyncRead) { ASSERT_OK_AND_ASSIGN(auto in_stream, fs_->Open(file_path)); std::string read_content(content.size(), '\0'); bool read_finished = false; - std::promise promise; - std::future future = promise.get_future(); + std::promise promise; + std::future future = promise.get_future(); auto callback = [&](Status status) { EXPECT_OK(status); if (status.ok()) { @@ -398,7 +398,7 @@ TEST_P(FileSystemTest, TestInvalidRead) { std::string file_path = test_root_ + "/file.data"; // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN([[maybe_unused]] int32_t write_len, + ASSERT_OK_AND_ASSIGN([[maybe_unused]] uint64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); @@ -435,7 +435,7 @@ TEST_P(FileSystemTest, TestInvalidAsyncRead) { std::string file_path = test_root_ + "/file.data"; // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN([[maybe_unused]] int32_t write_len, + ASSERT_OK_AND_ASSIGN([[maybe_unused]] uint64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); @@ -537,25 +537,25 @@ TEST_P(FileSystemTest, TestSeek) { std::string path = PathUtil::JoinPath(test_root_, "/test_file"); ASSERT_OK_AND_ASSIGN(std::unique_ptr out, fs_->Create(path, /*overwrite=*/true)); std::string input = "paimon"; - ASSERT_OK_AND_ASSIGN(int32_t size, out->Write(input.data(), input.size())); + ASSERT_OK_AND_ASSIGN(uint64_t size, out->Write(input.data(), input.size())); ASSERT_EQ(size, input.size()); ASSERT_OK(out->Close()); ASSERT_OK_AND_ASSIGN(std::unique_ptr in, fs_->Open(path)); ASSERT_OK(in->Seek(/*offset=*/1, SeekOrigin::FS_SEEK_SET)); - ASSERT_OK_AND_ASSIGN(int64_t pos, in->GetPos()); + ASSERT_OK_AND_ASSIGN(uint64_t pos, in->GetPos()); ASSERT_EQ(pos, 1); ASSERT_OK(in->Seek(/*offset=*/1, SeekOrigin::FS_SEEK_CUR)); - ASSERT_OK_AND_ASSIGN(int64_t pos2, in->GetPos()); + ASSERT_OK_AND_ASSIGN(uint64_t pos2, in->GetPos()); ASSERT_EQ(pos2, 2); ASSERT_OK(in->Seek(/*offset=*/-5, SeekOrigin::FS_SEEK_END)); - ASSERT_OK_AND_ASSIGN(int64_t pos3, in->GetPos()); + ASSERT_OK_AND_ASSIGN(uint64_t pos3, in->GetPos()); ASSERT_EQ(pos3, 1); std::string read_content(3, '\0'); - ASSERT_OK_AND_ASSIGN(int32_t read_len, in->Read(read_content.data(), read_content.size())); + ASSERT_OK_AND_ASSIGN(uint64_t read_len, in->Read(read_content.data(), read_content.size())); ASSERT_EQ(read_len, read_content.size()); ASSERT_EQ("aim", read_content); @@ -574,7 +574,7 @@ TEST_P(FileSystemTest, TestSeek2) { std::string file_path = test_root_ + "/file.data"; // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN(int32_t write_len, out_stream->Write(content.data(), content.size())); + ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_EQ(write_len, content.size()); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); @@ -599,7 +599,7 @@ TEST_P(FileSystemTest, TestSeek2) { // read from cur pos std::string read_content(3, '\0'); - ASSERT_OK_AND_ASSIGN(int32_t read_len, + ASSERT_OK_AND_ASSIGN(uint64_t read_len, in_stream->Read(read_content.data(), read_content.size())); ASSERT_EQ(read_len, read_content.size()); ASSERT_EQ("ijk", read_content); @@ -621,14 +621,14 @@ TEST_P(FileSystemTest, TestRename) { ASSERT_OK_AND_ASSIGN(std::unique_ptr out, fs_->Create(path, /*overwrite=*/true)); ASSERT_TRUE(out); std::string input = "paimon"; - ASSERT_OK_AND_ASSIGN(int32_t size, out->Write(input.data(), input.size())); + ASSERT_OK_AND_ASSIGN(uint64_t size, out->Write(input.data(), input.size())); ASSERT_EQ(size, input.size()); ASSERT_OK(out->Flush()); ASSERT_OK(out->Close()); ASSERT_OK_AND_ASSIGN(std::unique_ptr in, fs_->Open(path)); ASSERT_TRUE(in); char* data = new char[input.size() * 2]; - ASSERT_OK_AND_ASSIGN(int32_t size_read, in->Read(data, input.size(), /*offset=*/0)); + ASSERT_OK_AND_ASSIGN(uint64_t size_read, in->Read(data, input.size(), /*offset=*/0)); ASSERT_EQ(size_read, input.size()); std::string read_data(data, input.size()); ASSERT_EQ(read_data, input); @@ -985,8 +985,14 @@ TEST_P(FileSystemTest, TestMkdir) { ASSERT_OK(fs_->Mkdirs(test_root_ + "/tmp/local/f/1")); ASSERT_OK(fs_->Mkdirs(test_root_ + "/tmp1")); ASSERT_OK(fs_->Mkdirs(test_root_ + "/tmp1/f2/")); - ASSERT_OK(fs_->Mkdirs("/")); - ASSERT_NOK_WITH_MSG(fs_->Mkdirs(""), "path is an empty string."); + std::string file_system = GetParam(); + if (file_system == "jindo") { + ASSERT_NOK_WITH_MSG(fs_->Mkdirs("/"), "mkdir failed: url is not invalid"); + ASSERT_NOK_WITH_MSG(fs_->Mkdirs(""), "mkdir failed: url is NULL"); + } else { + ASSERT_OK(fs_->Mkdirs("/")); + ASSERT_NOK_WITH_MSG(fs_->Mkdirs(""), "path is an empty string."); + } } TEST_P(FileSystemTest, TestMkdir2) { @@ -1062,6 +1068,9 @@ TEST_P(FileSystemTest, TestMkdirMultiThreadWithSameName) { // test for create multi dir such as "partition1" and "partition1" (relative path) TEST_P(FileSystemTest, TestMkdirMultiThreadWithSameNameWithRelativePath) { + if (GetParam() == "jindo") { + GTEST_SKIP() << "skip jindo for relative path test"; + } uint32_t runs_count = 10; uint32_t thread_count = 10; auto executor = CreateDefaultExecutor(thread_count); diff --git a/src/paimon/common/global_index/bitmap/bitmap_global_index_test.cpp b/src/paimon/common/global_index/bitmap/bitmap_global_index_test.cpp index 1d18d1347..9dd546508 100644 --- a/src/paimon/common/global_index/bitmap/bitmap_global_index_test.cpp +++ b/src/paimon/common/global_index/bitmap/bitmap_global_index_test.cpp @@ -71,7 +71,6 @@ class BitmapGlobalIndexTest : public ::testing::Test { auto wrapper = std::dynamic_pointer_cast(global_writer); EXPECT_TRUE(wrapper); - wrapper->max_write_size_ = 128; ArrowArray c_array; PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, &c_array)); diff --git a/src/paimon/common/global_index/btree/btree_compatibility_test.cpp b/src/paimon/common/global_index/btree/btree_compatibility_test.cpp index 5079461a1..efe3df382 100644 --- a/src/paimon/common/global_index/btree/btree_compatibility_test.cpp +++ b/src/paimon/common/global_index/btree/btree_compatibility_test.cpp @@ -51,8 +51,7 @@ class BTreeCompatibilityTest : public ::testing::Test { EXPECT_OK_AND_ASSIGN(auto input, fs_->Open(path)); EXPECT_OK_AND_ASSIGN(auto length, input->Length()); std::string buffer(static_cast(length), '\0'); - EXPECT_OK_AND_ASSIGN([[maybe_unused]] auto bytes_read, - input->Read(buffer.data(), static_cast(length))); + EXPECT_OK_AND_ASSIGN([[maybe_unused]] auto bytes_read, input->Read(buffer.data(), length)); return buffer; } diff --git a/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp b/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp index 14854f189..b5d99b7bc 100644 --- a/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp +++ b/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp @@ -70,7 +70,6 @@ class RangeBitmapGlobalIndexTest : public ::testing::Test { auto wrapper = std::dynamic_pointer_cast(global_writer); EXPECT_TRUE(wrapper); - wrapper->max_write_size_ = 128; ArrowArray c_array; PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, &c_array)); diff --git a/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h b/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h index e4cc0b9f3..65c461af1 100644 --- a/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h +++ b/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h @@ -16,9 +16,7 @@ #pragma once -#include #include -#include #include #include #include @@ -60,19 +58,10 @@ class FileIndexWriterWrapper : public GlobalIndexWriter { file_manager_->NewOutputStream(file_name)); PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR bytes, writer_->SerializedBytes()); - uint64_t total_write_size = 0; - while (total_write_size < bytes->size()) { - uint64_t current_write_size = - std::min(bytes->size() - total_write_size, max_write_size_); - PAIMON_ASSIGN_OR_RAISE(int32_t actual_size, - out->Write(bytes->data() + total_write_size, - static_cast(current_write_size))); - if (static_cast(actual_size) != current_write_size) { - return Status::IOError( - fmt::format("expect write len {} mismatch actual write len {}", - current_write_size, actual_size)); - } - total_write_size += current_write_size; + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_size, out->Write(bytes->data(), bytes->size())); + if (actual_size != bytes->size()) { + return Status::IOError(fmt::format("expect write len {} mismatch actual write len {}", + bytes->size(), actual_size)); } PAIMON_RETURN_NOT_OK(out->Flush()); PAIMON_RETURN_NOT_OK(out->Close()); @@ -82,11 +71,8 @@ class FileIndexWriterWrapper : public GlobalIndexWriter { } private: - static constexpr uint64_t kMaxWriteSize = std::numeric_limits::max(); - std::string index_type_; int64_t count_ = 0; - uint64_t max_write_size_ = kMaxWriteSize; std::shared_ptr file_manager_; std::shared_ptr writer_; }; diff --git a/src/paimon/common/io/buffered_input_stream.cpp b/src/paimon/common/io/buffered_input_stream.cpp index ff9006c5a..ef23fdfe3 100644 --- a/src/paimon/common/io/buffered_input_stream.cpp +++ b/src/paimon/common/io/buffered_input_stream.cpp @@ -29,7 +29,7 @@ class MemoryPool; BufferedInputStream::BufferedInputStream(const std::shared_ptr& in, int32_t buffer_size, MemoryPool* pool) - : buffer_size_(buffer_size), in_(in) { + : buffer_size_(static_cast(buffer_size)), in_(in) { assert(buffer_size > 0); buffer_ = std::make_unique(buffer_size, pool); } @@ -54,10 +54,10 @@ Status BufferedInputStream::Seek(int64_t offset, SeekOrigin origin) { // adjust pos_ without touching the underlying stream. if (count_ > 0) { PAIMON_ASSIGN_OR_RAISE(int64_t in_pos, in_->GetPos()); - const int64_t buf_start_abs = in_pos - count_; + const int64_t buf_start_abs = in_pos - static_cast(count_); const int64_t buf_end_abs = in_pos; if (target_abs_offset >= buf_start_abs && target_abs_offset <= buf_end_abs) { - pos_ = static_cast(target_abs_offset - buf_start_abs); + pos_ = static_cast(target_abs_offset - buf_start_abs); return Status::OK(); } } @@ -70,15 +70,15 @@ Status BufferedInputStream::Seek(int64_t offset, SeekOrigin origin) { return Status::OK(); } -Result BufferedInputStream::GetPos() const { - PAIMON_ASSIGN_OR_RAISE(int64_t in_pos, in_->GetPos()); +Result BufferedInputStream::GetPos() const { + PAIMON_ASSIGN_OR_RAISE(uint64_t in_pos, in_->GetPos()); return in_pos - count_ + pos_; } -Result BufferedInputStream::Read(char* buffer, uint32_t size) { - uint32_t actual_read_len = 0; +Result BufferedInputStream::Read(char* buffer, uint64_t size) { + uint64_t actual_read_len = 0; while (actual_read_len < size) { - PAIMON_ASSIGN_OR_RAISE(int32_t nread, + PAIMON_ASSIGN_OR_RAISE(uint64_t nread, InnerRead(buffer + actual_read_len, size - actual_read_len)); assert(nread > 0); actual_read_len += nread; @@ -87,11 +87,11 @@ Result BufferedInputStream::Read(char* buffer, uint32_t size) { return actual_read_len; } -Result BufferedInputStream::Read(char* buffer, uint32_t size, uint64_t offset) { +Result BufferedInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { return Status::Invalid("BufferedInputStream does not support Read from offset"); } -void BufferedInputStream::ReadAsync(char* buffer, uint32_t size, uint64_t offset, +void BufferedInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) { callback(Status::NotImplemented("BufferedInputStream do not support ReadAsync")); } @@ -114,20 +114,20 @@ Result BufferedInputStream::GetUri() const { Status BufferedInputStream::Fill() { pos_ = 0; count_ = 0; - PAIMON_ASSIGN_OR_RAISE(int64_t in_pos, in_->GetPos()); - PAIMON_ASSIGN_OR_RAISE(int64_t length, in_->Length()); - int64_t left_to_read = std::min((length - in_pos), static_cast(buffer_size_)); - PAIMON_ASSIGN_OR_RAISE(int32_t actual_read_len, in_->Read(buffer_->data(), left_to_read)); + PAIMON_ASSIGN_OR_RAISE(uint64_t in_pos, in_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(uint64_t length, in_->Length()); + uint64_t left_to_read = std::min(length - in_pos, buffer_size_); + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_len, in_->Read(buffer_->data(), left_to_read)); PAIMON_RETURN_NOT_OK(AssertReadLength(left_to_read, actual_read_len)); count_ = actual_read_len; return Status::OK(); } -Result BufferedInputStream::InnerRead(char* buffer, int32_t size) { +Result BufferedInputStream::InnerRead(char* buffer, uint64_t size) { assert(size > 0); - int32_t avail = count_ - pos_; - if (avail <= 0) { - assert(avail == 0); + assert(pos_ <= count_); + uint64_t avail = count_ - pos_; + if (avail == 0) { /* If the requested length is at least as large as the buffer, and if there is no mark/reset activity, do not bother to copy the bytes into the local buffer. In this way buffered streams will @@ -137,21 +137,21 @@ Result BufferedInputStream::InnerRead(char* buffer, int32_t size) { } PAIMON_RETURN_NOT_OK(Fill()); avail = count_ - pos_; - if (avail <= 0) { + if (avail == 0) { return Status::Invalid(fmt::format( "InnerRead failed, after Fill(), still no bytes available (may read eof), but " "expect read {} bytes", size)); } } - int32_t copy_length = std::min(avail, size); + uint64_t copy_length = std::min(avail, size); memcpy(buffer, buffer_->data() + pos_, copy_length); pos_ += copy_length; return copy_length; } -Status BufferedInputStream::AssertReadLength(int32_t read_length, - int32_t actual_read_length) const { +Status BufferedInputStream::AssertReadLength(uint64_t read_length, + uint64_t actual_read_length) const { if (read_length != actual_read_length) { return Status::Invalid( fmt::format("assert read length failed: read length not match, read length {}, actual " diff --git a/src/paimon/common/io/byte_array_input_stream.cpp b/src/paimon/common/io/byte_array_input_stream.cpp index 7ca927e96..80d7306f6 100644 --- a/src/paimon/common/io/byte_array_input_stream.cpp +++ b/src/paimon/common/io/byte_array_input_stream.cpp @@ -33,33 +33,34 @@ const char* ByteArrayInputStream::GetRawData() const { } Status ByteArrayInputStream::Seek(int64_t offset, SeekOrigin origin) { + int64_t new_position = 0; switch (origin) { case SeekOrigin::FS_SEEK_SET: { - position_ = offset; + new_position = offset; break; } case SeekOrigin::FS_SEEK_CUR: { - position_ += offset; + new_position = static_cast(position_) + offset; break; } case SeekOrigin::FS_SEEK_END: { - PAIMON_ASSIGN_OR_RAISE(uint64_t length, Length()); - position_ = static_cast(length) + offset; + new_position = static_cast(length_) + offset; break; } default: return Status::Invalid( "invalid SeekOrigin, only support FS_SEEK_SET, FS_SEEK_CUR, and FS_SEEK_END"); } - if (position_ < 0 || position_ > static_cast(length_)) { - return Status::Invalid( - fmt::format("invalid seek, after seek, current pos {}, length {}", position_, length_)); + if (new_position < 0 || static_cast(new_position) > length_) { + return Status::Invalid(fmt::format("invalid seek, after seek, current pos {}, length {}", + new_position, length_)); } + position_ = static_cast(new_position); return Status::OK(); } -Result ByteArrayInputStream::Read(char* buffer, uint32_t size) { - if (position_ + static_cast(size) > static_cast(length_)) { +Result ByteArrayInputStream::Read(char* buffer, uint64_t size) { + if (size > length_ - position_) { return Status::Invalid( fmt::format("ByteArrayInputStream assert boundary failed: need length {}, current " "position {}, exceed length {}", @@ -70,8 +71,8 @@ Result ByteArrayInputStream::Read(char* buffer, uint32_t size) { return size; } -Result ByteArrayInputStream::Read(char* buffer, uint32_t size, uint64_t offset) { - if (offset + static_cast(size) > length_) { +Result ByteArrayInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { + if (offset + size > length_) { return Status::Invalid( fmt::format("ByteArrayInputStream assert boundary failed: need length {}, read offset " "{}, exceed length {}", @@ -81,11 +82,11 @@ Result ByteArrayInputStream::Read(char* buffer, uint32_t size, uint64_t return size; } -void ByteArrayInputStream::ReadAsync(char* buffer, uint32_t size, uint64_t offset, +void ByteArrayInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) { - Result read_size = Read(buffer, size, offset); + Result read_size = Read(buffer, size, offset); Status status = Status::OK(); - if (read_size.ok() && static_cast(read_size.value()) != size) { + if (read_size.ok() && read_size.value() != size) { status = Status::Invalid(fmt::format( "ByteArrayInputStream async read size {} != expected {}", read_size.value(), size)); } else if (!read_size.ok()) { diff --git a/src/paimon/common/io/cache_input_stream.h b/src/paimon/common/io/cache_input_stream.h index e966530ff..7d35674c6 100644 --- a/src/paimon/common/io/cache_input_stream.h +++ b/src/paimon/common/io/cache_input_stream.h @@ -34,15 +34,15 @@ class CacheInputStream : public InputStream { Status Seek(int64_t offset, SeekOrigin origin) override { return input_stream_->Seek(offset, origin); } - Result GetPos() const override { + Result GetPos() const override { return input_stream_->GetPos(); } - Result Read(char* buffer, uint32_t size) override { + Result Read(char* buffer, uint64_t size) override { return input_stream_->Read(buffer, size); } - Result Read(char* buffer, uint32_t size, uint64_t offset) override { + Result Read(char* buffer, uint64_t size, uint64_t offset) override { if (cache_) { - ByteRange range{offset, static_cast(size)}; + ByteRange range{offset, size}; PAIMON_ASSIGN_OR_RAISE(ByteSlice slice, cache_->Read(range)); if (slice.buffer) { std::memcpy(buffer, slice.buffer->data() + slice.offset, slice.length); @@ -51,10 +51,10 @@ class CacheInputStream : public InputStream { } return input_stream_->Read(buffer, size, offset); } - void ReadAsync(char* buffer, uint32_t size, uint64_t offset, + void ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) override { if (cache_) { - ByteRange range{offset, static_cast(size)}; + ByteRange range{offset, size}; Result slice = cache_->Read(range); if (!slice.ok()) { callback(slice.status()); diff --git a/src/paimon/common/io/cache_input_stream_test.cpp b/src/paimon/common/io/cache_input_stream_test.cpp index 0f197ba74..e1e90e53c 100644 --- a/src/paimon/common/io/cache_input_stream_test.cpp +++ b/src/paimon/common/io/cache_input_stream_test.cpp @@ -87,12 +87,12 @@ TEST_F(CacheInputStreamTest, TestProxyMethods) { // Seek + GetPos ASSERT_OK(stream.Seek(5, SeekOrigin::FS_SEEK_SET)); - ASSERT_OK_AND_ASSIGN(int64_t pos, stream.GetPos()); + ASSERT_OK_AND_ASSIGN(uint64_t pos, stream.GetPos()); ASSERT_EQ(pos, 5); // Read (sequential, no offset) std::string buffer(3, '\0'); - ASSERT_OK_AND_ASSIGN(int32_t bytes_read, stream.Read(buffer.data(), 3)); + ASSERT_OK_AND_ASSIGN(uint64_t bytes_read, stream.Read(buffer.data(), 3)); ASSERT_EQ(bytes_read, 3); ASSERT_EQ(buffer, "fgh"); @@ -106,7 +106,7 @@ TEST_F(CacheInputStreamTest, TestReadWithOffsetNullCache) { CacheInputStream stream(std::move(underlying), /*cache=*/nullptr); std::string buffer(5, '\0'); - ASSERT_OK_AND_ASSIGN(int32_t bytes_read, stream.Read(buffer.data(), 5, /*offset=*/2)); + ASSERT_OK_AND_ASSIGN(uint64_t bytes_read, stream.Read(buffer.data(), 5, /*offset=*/2)); ASSERT_EQ(bytes_read, 5); ASSERT_EQ(buffer, "cdefg"); } @@ -119,7 +119,7 @@ TEST_F(CacheInputStreamTest, TestReadWithOffsetCacheHit) { CacheInputStream stream(std::move(underlying), cache); std::string buffer(5, '\0'); - ASSERT_OK_AND_ASSIGN(int32_t bytes_read, stream.Read(buffer.data(), 5, /*offset=*/2)); + ASSERT_OK_AND_ASSIGN(uint64_t bytes_read, stream.Read(buffer.data(), 5, /*offset=*/2)); ASSERT_EQ(bytes_read, 5); ASSERT_EQ(buffer, "cdefg"); } @@ -132,7 +132,7 @@ TEST_F(CacheInputStreamTest, TestReadWithOffsetCacheMiss) { CacheInputStream stream(std::move(underlying), cache); std::string buffer(3, '\0'); - ASSERT_OK_AND_ASSIGN(int32_t bytes_read, stream.Read(buffer.data(), 3, /*offset=*/10)); + ASSERT_OK_AND_ASSIGN(uint64_t bytes_read, stream.Read(buffer.data(), 3, /*offset=*/10)); ASSERT_EQ(bytes_read, 3); ASSERT_EQ(buffer, "klm"); } diff --git a/src/paimon/common/io/data_input_output_stream_test.cpp b/src/paimon/common/io/data_input_output_stream_test.cpp index 888f6b59e..eb25f5e12 100644 --- a/src/paimon/common/io/data_input_output_stream_test.cpp +++ b/src/paimon/common/io/data_input_output_stream_test.cpp @@ -101,7 +101,7 @@ class DataInputOutputStreamTest : public ::testing::Test, ASSERT_OK(data_input_stream->ReadBytes(read_bytes.get())); ASSERT_EQ(*bytes, *read_bytes); // test GetPos - ASSERT_OK_AND_ASSIGN(int64_t in_pos, input_stream->GetPos()); + ASSERT_OK_AND_ASSIGN(uint64_t in_pos, input_stream->GetPos()); ASSERT_EQ(1 + 2 + 4 + 8 + 1 + 41 + 24, in_pos); // read eof, return bad status ASSERT_NOK(data_input_stream->ReadString()); @@ -132,7 +132,7 @@ TEST_P(DataInputOutputStreamTest, TestFileStream) { auto data_output_stream = std::make_unique(output_stream); data_output_stream->SetOrder(byte_order); WriteValues(data_output_stream.get()); - ASSERT_OK_AND_ASSIGN(int64_t out_pos, output_stream->GetPos()); + ASSERT_OK_AND_ASSIGN(uint64_t out_pos, output_stream->GetPos()); ASSERT_EQ(1 + 2 + 4 + 8 + 1 + 41 + 24, out_pos); ASSERT_OK(output_stream->Close()); // check file content diff --git a/src/paimon/common/io/data_input_stream.cpp b/src/paimon/common/io/data_input_stream.cpp index 44fa2f5cf..2d24c4065 100644 --- a/src/paimon/common/io/data_input_stream.cpp +++ b/src/paimon/common/io/data_input_stream.cpp @@ -39,10 +39,10 @@ Status DataInputStream::Seek(int64_t offset) const { template Result DataInputStream::ReadValue() const { static_assert(std::is_trivially_copyable_v, "T must be trivially copyable"); - int32_t read_length = sizeof(T); + uint64_t read_length = sizeof(T); PAIMON_RETURN_NOT_OK(AssertBoundary(read_length)); T value; - PAIMON_ASSIGN_OR_RAISE(int32_t actual_read_length, + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_length, input_stream_->Read(reinterpret_cast(&value), read_length)); PAIMON_RETURN_NOT_OK(AssertReadLength(read_length, actual_read_length)); if (NeedSwap()) { @@ -52,17 +52,17 @@ Result DataInputStream::ReadValue() const { } Status DataInputStream::ReadBytes(Bytes* bytes) const { - int32_t read_length = bytes->size(); + uint64_t read_length = bytes->size(); PAIMON_RETURN_NOT_OK(AssertBoundary(read_length)); - PAIMON_ASSIGN_OR_RAISE(int32_t actual_read_length, + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_length, input_stream_->Read(bytes->data(), read_length)); PAIMON_RETURN_NOT_OK(AssertReadLength(read_length, actual_read_length)); return Status::OK(); } -Status DataInputStream::Read(char* data, uint32_t size) const { +Status DataInputStream::Read(char* data, uint64_t size) const { PAIMON_RETURN_NOT_OK(AssertBoundary(size)); - PAIMON_ASSIGN_OR_RAISE(int32_t actual_read_length, input_stream_->Read(data, size)); + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_length, input_stream_->Read(data, size)); PAIMON_RETURN_NOT_OK(AssertReadLength(size, actual_read_length)); return Status::OK(); } @@ -72,13 +72,13 @@ Result DataInputStream::ReadString() const { PAIMON_ASSIGN_OR_RAISE(read_length, ReadValue()); PAIMON_RETURN_NOT_OK(AssertBoundary(read_length)); std::string value(read_length, '\0'); - PAIMON_ASSIGN_OR_RAISE(int32_t actual_read_length, + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_length, input_stream_->Read(value.data(), read_length)); PAIMON_RETURN_NOT_OK(AssertReadLength(read_length, actual_read_length)); return value; } -Result DataInputStream::GetPos() const { +Result DataInputStream::GetPos() const { return input_stream_->GetPos(); } @@ -86,7 +86,7 @@ Result DataInputStream::Length() const { return input_stream_->Length(); } -Status DataInputStream::AssertReadLength(int32_t read_length, int32_t actual_read_length) const { +Status DataInputStream::AssertReadLength(uint64_t read_length, uint64_t actual_read_length) const { if (read_length != actual_read_length) { return Status::Invalid( fmt::format("assert read length failed: read length not match, read length {}, actual " @@ -96,12 +96,12 @@ Status DataInputStream::AssertReadLength(int32_t read_length, int32_t actual_rea return Status::OK(); } -Status DataInputStream::AssertBoundary(int32_t need_length) const { +Status DataInputStream::AssertBoundary(uint64_t need_length) const { // TODO(jinli.zjw): Store current_pos and file_length as member variables to reduce the overhead // of I/O calls. - PAIMON_ASSIGN_OR_RAISE(int64_t pos, input_stream_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(uint64_t pos, input_stream_->GetPos()); PAIMON_ASSIGN_OR_RAISE(uint64_t length, input_stream_->Length()); - if (pos + need_length > static_cast(length)) { + if (pos + need_length > length) { return Status::Invalid( fmt::format("DataInputStream assert boundary failed: need length {}, current position " "{}, exceed length {}", diff --git a/src/paimon/common/io/data_output_stream.cpp b/src/paimon/common/io/data_output_stream.cpp index 6d8c82b5b..09a114ba6 100644 --- a/src/paimon/common/io/data_output_stream.cpp +++ b/src/paimon/common/io/data_output_stream.cpp @@ -27,8 +27,8 @@ DataOutputStream::DataOutputStream(const std::shared_ptr& output_s } Status DataOutputStream::WriteBytes(const std::shared_ptr& bytes) { - int32_t write_length = bytes->size(); - PAIMON_ASSIGN_OR_RAISE(int32_t actual_write_length, + uint64_t write_length = bytes->size(); + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_write_length, output_stream_->Write(bytes->data(), write_length)); PAIMON_RETURN_NOT_OK(AssertWriteLength(write_length, actual_write_length)); return Status::OK(); @@ -37,14 +37,14 @@ Status DataOutputStream::WriteBytes(const std::shared_ptr& bytes) { Status DataOutputStream::WriteString(const std::string& value) { uint16_t write_length = value.size(); PAIMON_RETURN_NOT_OK(WriteValue(write_length)); - PAIMON_ASSIGN_OR_RAISE(int32_t actual_write_length, + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_write_length, output_stream_->Write(value.data(), write_length)); PAIMON_RETURN_NOT_OK(AssertWriteLength(write_length, actual_write_length)); return Status::OK(); } -Status DataOutputStream::AssertWriteLength(int32_t write_length, - int32_t actual_write_length) const { +Status DataOutputStream::AssertWriteLength(uint64_t write_length, + uint64_t actual_write_length) const { if (write_length != actual_write_length) { return Status::Invalid(fmt::format( "assert write length failed: write length not match, write length {}, actual " diff --git a/src/paimon/common/io/data_output_stream.h b/src/paimon/common/io/data_output_stream.h index 6c22b47c5..7a4118be4 100644 --- a/src/paimon/common/io/data_output_stream.h +++ b/src/paimon/common/io/data_output_stream.h @@ -45,9 +45,9 @@ class PAIMON_EXPORT DataOutputStream { if (NeedSwap()) { write_value = EndianSwapValue(value); } - int32_t write_length = sizeof(T); + uint64_t write_length = sizeof(T); PAIMON_ASSIGN_OR_RAISE( - int32_t actual_write_length, + uint64_t actual_write_length, output_stream_->Write(reinterpret_cast(&write_value), write_length)); PAIMON_RETURN_NOT_OK(AssertWriteLength(write_length, actual_write_length)); return Status::OK(); @@ -63,7 +63,7 @@ class PAIMON_EXPORT DataOutputStream { } private: - Status AssertWriteLength(int32_t write_length, int32_t actual_write_length) const; + Status AssertWriteLength(uint64_t write_length, uint64_t actual_write_length) const; bool NeedSwap() const; diff --git a/src/paimon/common/io/offset_input_stream.cpp b/src/paimon/common/io/offset_input_stream.cpp index 1382e6a9c..69bb49084 100644 --- a/src/paimon/common/io/offset_input_stream.cpp +++ b/src/paimon/common/io/offset_input_stream.cpp @@ -56,54 +56,54 @@ OffsetInputStream::OffsetInputStream(const std::shared_ptr& wrapped : wrapped_(wrapped), length_(length), offset_(offset) {} Status OffsetInputStream::Seek(int64_t offset, SeekOrigin origin) { + int64_t new_position = 0; switch (origin) { case SeekOrigin::FS_SEEK_SET: { - inner_position_ = offset; - PAIMON_RETURN_NOT_OK(AssertBoundary(inner_position_)); - return wrapped_->Seek(offset_ + inner_position_, SeekOrigin::FS_SEEK_SET); + new_position = offset; + PAIMON_RETURN_NOT_OK(AssertBoundary(new_position)); + PAIMON_RETURN_NOT_OK(wrapped_->Seek(offset_ + new_position, SeekOrigin::FS_SEEK_SET)); + break; } case SeekOrigin::FS_SEEK_CUR: { - inner_position_ += offset; - PAIMON_RETURN_NOT_OK(AssertBoundary(inner_position_)); - return wrapped_->Seek(offset, SeekOrigin::FS_SEEK_CUR); + new_position = static_cast(inner_position_) + offset; + PAIMON_RETURN_NOT_OK(AssertBoundary(new_position)); + PAIMON_RETURN_NOT_OK(wrapped_->Seek(offset, SeekOrigin::FS_SEEK_CUR)); + break; } case SeekOrigin::FS_SEEK_END: { - inner_position_ = length_ + offset; - PAIMON_RETURN_NOT_OK(AssertBoundary(inner_position_)); - return wrapped_->Seek(offset_ + inner_position_, SeekOrigin::FS_SEEK_SET); + new_position = length_ + offset; + PAIMON_RETURN_NOT_OK(AssertBoundary(new_position)); + PAIMON_RETURN_NOT_OK(wrapped_->Seek(offset_ + new_position, SeekOrigin::FS_SEEK_SET)); + break; } default: return Status::Invalid( "invalid SeekOrigin, only support FS_SEEK_SET, FS_SEEK_CUR, and FS_SEEK_END"); } + inner_position_ = static_cast(new_position); return Status::OK(); } -Result OffsetInputStream::Read(char* buffer, uint32_t size) { - PAIMON_RETURN_NOT_OK(AssertBoundary(inner_position_ + size)); - inner_position_ += size; - return wrapped_->Read(buffer, size); +Result OffsetInputStream::Read(char* buffer, uint64_t size) { + PAIMON_RETURN_NOT_OK(AssertBoundary(static_cast(inner_position_ + size))); + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_len, wrapped_->Read(buffer, size)); + inner_position_ += actual_read_len; + return actual_read_len; } -Result OffsetInputStream::Read(char* buffer, uint32_t size, uint64_t offset) { - PAIMON_RETURN_NOT_OK(AssertBoundary(offset)); - PAIMON_RETURN_NOT_OK(AssertBoundary(offset + size)); - return wrapped_->Read(buffer, size, offset_ + offset); +Result OffsetInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { + PAIMON_RETURN_NOT_OK(AssertBoundary(static_cast(offset + size))); + return wrapped_->Read(buffer, size, static_cast(offset_) + offset); } -void OffsetInputStream::ReadAsync(char* buffer, uint32_t size, uint64_t offset, +void OffsetInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) { - auto status = AssertBoundary(offset); + auto status = AssertBoundary(static_cast(offset + size)); if (!status.ok()) { callback(status); return; } - status = AssertBoundary(offset + size); - if (!status.ok()) { - callback(status); - return; - } - wrapped_->ReadAsync(buffer, size, offset_ + offset, std::move(callback)); + wrapped_->ReadAsync(buffer, size, static_cast(offset_) + offset, std::move(callback)); } Status OffsetInputStream::Close() { @@ -114,15 +114,15 @@ Result OffsetInputStream::GetUri() const { return wrapped_->GetUri(); } -Result OffsetInputStream::GetPos() const { +Result OffsetInputStream::GetPos() const { return inner_position_; } Result OffsetInputStream::Length() const { - return length_; + return static_cast(length_); } -Status OffsetInputStream::AssertBoundary(int32_t inner_pos) const { +Status OffsetInputStream::AssertBoundary(int64_t inner_pos) const { if (inner_pos < 0 || inner_pos > length_) { return Status::Invalid( fmt::format("OffsetInputStream assert boundary failed: inner pos {} exceed length {}", diff --git a/src/paimon/common/io/offset_input_stream.h b/src/paimon/common/io/offset_input_stream.h index 6316eecbc..c296b90bb 100644 --- a/src/paimon/common/io/offset_input_stream.h +++ b/src/paimon/common/io/offset_input_stream.h @@ -36,13 +36,13 @@ class PAIMON_EXPORT OffsetInputStream : public InputStream { Status Seek(int64_t offset, SeekOrigin origin) override; - Result GetPos() const override; + Result GetPos() const override; - Result Read(char* buffer, uint32_t size) override; + Result Read(char* buffer, uint64_t size) override; - Result Read(char* buffer, uint32_t size, uint64_t offset) override; + Result Read(char* buffer, uint64_t size, uint64_t offset) override; - void ReadAsync(char* buffer, uint32_t size, uint64_t offset, + void ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) override; Result Length() const override; @@ -53,12 +53,12 @@ class PAIMON_EXPORT OffsetInputStream : public InputStream { private: OffsetInputStream(const std::shared_ptr& wrapped, int64_t length, int64_t offset); - Status AssertBoundary(int32_t inner_pos) const; + Status AssertBoundary(int64_t inner_pos) const; private: std::shared_ptr wrapped_; const int64_t length_; const int64_t offset_; - int64_t inner_position_ = 0; + uint64_t inner_position_ = 0; }; } // namespace paimon diff --git a/src/paimon/common/memory/bytes.cpp b/src/paimon/common/memory/bytes.cpp index 9cdb5ef95..58d64bb9d 100644 --- a/src/paimon/common/memory/bytes.cpp +++ b/src/paimon/common/memory/bytes.cpp @@ -30,7 +30,7 @@ const std::shared_ptr& Bytes::EmptyBytes() { return empty_bytes; } -PAIMON_UNIQUE_PTR Bytes::AllocateBytes(int32_t length, MemoryPool* pool) { +PAIMON_UNIQUE_PTR Bytes::AllocateBytes(uint64_t length, MemoryPool* pool) { return pool->AllocateUnique(length, pool); } diff --git a/src/paimon/common/memory/memory_segment_utils.cpp b/src/paimon/common/memory/memory_segment_utils.cpp index 8f58db3f0..c4bff6fef 100644 --- a/src/paimon/common/memory/memory_segment_utils.cpp +++ b/src/paimon/common/memory/memory_segment_utils.cpp @@ -21,7 +21,7 @@ #include "paimon/common/utils/murmurhash_utils.h" namespace paimon { -std::shared_ptr MemorySegmentUtils::AllocateBytes(int32_t length, MemoryPool* pool) { +std::shared_ptr MemorySegmentUtils::AllocateBytes(uint64_t length, MemoryPool* pool) { return Bytes::AllocateBytes(length, pool); } diff --git a/src/paimon/common/memory/memory_segment_utils.h b/src/paimon/common/memory/memory_segment_utils.h index 02b39a12e..233c07717 100644 --- a/src/paimon/common/memory/memory_segment_utils.h +++ b/src/paimon/common/memory/memory_segment_utils.h @@ -41,7 +41,7 @@ class PAIMON_EXPORT MemorySegmentUtils { ~MemorySegmentUtils() = delete; /// Allocate bytes in pool - static std::shared_ptr AllocateBytes(int32_t length, MemoryPool* pool); + static std::shared_ptr AllocateBytes(uint64_t length, MemoryPool* pool); /// Copy target segments from source byte[]. /// diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp index f836bcc48..c098f70cc 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp @@ -532,7 +532,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, WorkloopSetReadStatusWhenCacheInitFailed MockFormatReaderBuilder reader_builder(data_array, data_type_, batch_size); CacheConfig invalid_cache_config( /*buffer_size_limit=*/512 * 1024, - /*range_size_limit=*/static_cast(std::numeric_limits::max()) + 1, + /*range_size_limit=*/4 * 1024, /*hole_size_limit=*/8 * 1024, /*pre_buffer_limit=*/128 * 1024); @@ -548,9 +548,8 @@ TEST_F(PrefetchFileBatchReaderImplTest, WorkloopSetReadStatusWhenCacheInitFailed auto prefetch_reader = dynamic_cast(reader.get()); prefetch_reader->Workloop(); - Status status = prefetch_reader->GetReadStatus(); - ASSERT_FALSE(status.ok()); - ASSERT_TRUE(status.IsInvalid()); + ASSERT_NOK_WITH_MSG(prefetch_reader->GetReadStatus(), + "range size limit 4096 should be larger than hole size limit 8192"); } TEST_F(PrefetchFileBatchReaderImplTest, DoReadBatchReturnOkWhenShutdown) { diff --git a/src/paimon/common/sst/bloom_filter_handle.h b/src/paimon/common/sst/bloom_filter_handle.h index f90576851..0565bf7bb 100644 --- a/src/paimon/common/sst/bloom_filter_handle.h +++ b/src/paimon/common/sst/bloom_filter_handle.h @@ -16,11 +16,10 @@ #pragma once -#include +#include +#include -#include "paimon/common/memory/memory_segment.h" -#include "paimon/memory/bytes.h" -#include "paimon/result.h" +#include "paimon/visibility.h" namespace paimon { diff --git a/src/paimon/common/sst/sst_file_utils.h b/src/paimon/common/sst/sst_file_utils.h index 37e0a08b9..b4132ad5c 100644 --- a/src/paimon/common/sst/sst_file_utils.h +++ b/src/paimon/common/sst/sst_file_utils.h @@ -15,11 +15,9 @@ */ #pragma once -#include #include "fmt/format.h" #include "paimon/common/compression/block_compression_type.h" -#include "paimon/common/memory/memory_slice.h" namespace paimon { diff --git a/src/paimon/common/sst/sst_file_writer.cpp b/src/paimon/common/sst/sst_file_writer.cpp index fa9ae435e..67d0119c0 100644 --- a/src/paimon/common/sst/sst_file_writer.cpp +++ b/src/paimon/common/sst/sst_file_writer.cpp @@ -16,7 +16,6 @@ #include "paimon/common/sst/sst_file_writer.h" -#include "paimon/common/sst/sst_file_utils.h" #include "paimon/common/utils/crc32c.h" #include "paimon/common/utils/murmurhash_utils.h" @@ -73,8 +72,9 @@ Result> SstFileWriter::WriteBloomFilter() { } auto bf_slice = bloom_filter_->GetBitSet()->ToSlice(); auto data = bf_slice.ReadStringView(); - PAIMON_ASSIGN_OR_RAISE(int64_t bloom_filter_pos, out_->GetPos()); - BloomFilterHandle handle(bloom_filter_pos, data.size(), bloom_filter_->ExpectedEntries()); + PAIMON_ASSIGN_OR_RAISE(uint64_t bloom_filter_pos, out_->GetPos()); + BloomFilterHandle handle(static_cast(bloom_filter_pos), data.size(), + bloom_filter_->ExpectedEntries()); PAIMON_RETURN_NOT_OK(WriteBytes(data.data(), data.size())); @@ -115,8 +115,8 @@ Result SstFileWriter::FlushBlockWriter(BlockWriter* writer) { crc32c = CRC32C::calculate(&compression_val, 1, crc32c); auto trailer = BlockTrailer(static_cast(compression_type), crc32c); auto trailer_memory_slice = trailer.WriteBlockTrailer(pool_.get()); - PAIMON_ASSIGN_OR_RAISE(int64_t block_pos, out_->GetPos()); - BlockHandle block_handle(block_pos, view.size()); + PAIMON_ASSIGN_OR_RAISE(uint64_t block_pos, out_->GetPos()); + BlockHandle block_handle(static_cast(block_pos), view.size()); // 1. write data PAIMON_RETURN_NOT_OK(WriteBytes(view.data(), view.size())); diff --git a/src/paimon/common/utils/arrow/arrow_input_stream_adapter.cpp b/src/paimon/common/utils/arrow/arrow_input_stream_adapter.cpp index d581d8cc9..b5d8c5a4a 100644 --- a/src/paimon/common/utils/arrow/arrow_input_stream_adapter.cpp +++ b/src/paimon/common/utils/arrow/arrow_input_stream_adapter.cpp @@ -20,10 +20,7 @@ #include #include "arrow/api.h" -#include "fmt/format.h" #include "paimon/common/utils/arrow/status_utils.h" -#include "paimon/common/utils/math.h" -#include "paimon/common/utils/options_utils.h" #include "paimon/fs/file_system.h" #include "paimon/macros.h" #include "paimon/result.h" @@ -31,19 +28,6 @@ namespace paimon { -namespace { - -template -arrow::Status ValidateArrowIoRange(From value, const char* name) { - if (!InRange(value)) { - return arrow::Status::Invalid(fmt::format("{} value {} is out of bound of type {}", name, - value, OptionsUtils::GetTypeName())); - } - return arrow::Status::OK(); -} - -} // namespace - ArrowInputStreamAdapter::ArrowInputStreamAdapter( const std::shared_ptr& input_stream, const std::shared_ptr& pool, uint64_t file_size) @@ -58,13 +42,12 @@ arrow::Status ArrowInputStreamAdapter::Seek(int64_t position) { } arrow::Result ArrowInputStreamAdapter::Read(int64_t nbytes, void* out) { - ARROW_RETURN_NOT_OK(ValidateArrowIoRange(nbytes, "nbytes")); - Result read_bytes = - input_stream_->Read(static_cast(out), static_cast(nbytes)); + Result read_bytes = + input_stream_->Read(static_cast(out), static_cast(nbytes)); if (!read_bytes.ok()) { return ToArrowStatus(read_bytes.status()); } - return read_bytes.value(); + return static_cast(read_bytes.value()); } arrow::Result> ArrowInputStreamAdapter::Read(int64_t nbytes) { @@ -79,14 +62,12 @@ arrow::Result> ArrowInputStreamAdapter::Read(int6 arrow::Result ArrowInputStreamAdapter::ReadAt(int64_t position, int64_t nbytes, void* out) { - ARROW_RETURN_NOT_OK(ValidateArrowIoRange(position, "position")); - ARROW_RETURN_NOT_OK(ValidateArrowIoRange(nbytes, "nbytes")); - Result read_bytes = input_stream_->Read( - static_cast(out), static_cast(nbytes), static_cast(position)); + Result read_bytes = input_stream_->Read( + static_cast(out), static_cast(nbytes), static_cast(position)); if (!read_bytes.ok()) { return ToArrowStatus(read_bytes.status()); } - return read_bytes.value(); + return static_cast(read_bytes.value()); } arrow::Result> ArrowInputStreamAdapter::ReadAt(int64_t position, @@ -103,17 +84,6 @@ arrow::Result> ArrowInputStreamAdapter::ReadAt(in arrow::Future> ArrowInputStreamAdapter::ReadAsync( const arrow::io::IOContext& io_context, int64_t position, int64_t nbytes) { auto fut = arrow::Future>::Make(); - auto range_status = ValidateArrowIoRange(position, "position"); - if (!range_status.ok()) { - fut.MarkFinished(range_status); - return fut; - } - range_status = ValidateArrowIoRange(nbytes, "nbytes"); - if (!range_status.ok()) { - fut.MarkFinished(range_status); - return fut; - } - arrow::Result> buffer_result = arrow::AllocateResizableBuffer(nbytes, pool_.get()); if (PAIMON_UNLIKELY(!buffer_result.ok())) { @@ -122,7 +92,7 @@ arrow::Future> ArrowInputStreamAdapter::ReadAsync } std::shared_ptr buffer = std::move(buffer_result).ValueUnsafe(); input_stream_->ReadAsync(reinterpret_cast(buffer->mutable_data()), - static_cast(nbytes), static_cast(position), + static_cast(nbytes), static_cast(position), [fut, buffer](Status callback_status) mutable { if (callback_status.ok()) { fut.MarkFinished(std::move(buffer)); @@ -134,11 +104,11 @@ arrow::Future> ArrowInputStreamAdapter::ReadAsync } arrow::Result ArrowInputStreamAdapter::Tell() const { - Result position = input_stream_->GetPos(); + Result position = input_stream_->GetPos(); if (!position.ok()) { return ToArrowStatus(position.status()); } - return position.value(); + return static_cast(position.value()); } arrow::Result ArrowInputStreamAdapter::GetSize() { diff --git a/src/paimon/common/utils/arrow/arrow_output_stream_adapter.cpp b/src/paimon/common/utils/arrow/arrow_output_stream_adapter.cpp index cf651cfe9..ea325c542 100644 --- a/src/paimon/common/utils/arrow/arrow_output_stream_adapter.cpp +++ b/src/paimon/common/utils/arrow/arrow_output_stream_adapter.cpp @@ -36,11 +36,11 @@ arrow::Status ArrowOutputStreamAdapter::Close() { } arrow::Result ArrowOutputStreamAdapter::Tell() const { - paimon::Result pos = out_->GetPos(); + paimon::Result pos = out_->GetPos(); if (!pos.ok()) { return ToArrowStatus(pos.status()); } - return pos.value(); + return static_cast(pos.value()); } bool ArrowOutputStreamAdapter::closed() const { @@ -48,15 +48,15 @@ bool ArrowOutputStreamAdapter::closed() const { } arrow::Status ArrowOutputStreamAdapter::Write(const void* data, int64_t nbytes) { - if (!InRange(nbytes)) { - return arrow::Status::Invalid( - fmt::format("nbytes value {} is out of bound of uint32_t", nbytes)); - } - Result len = - out_->Write(static_cast(data), static_cast(nbytes)); + Result len = + out_->Write(static_cast(data), static_cast(nbytes)); if (!len.ok()) { return ToArrowStatus(len.status()); } + if (len.value() != static_cast(nbytes)) { + return arrow::Status::IOError( + fmt::format("expect write len {} mismatch actual write len {}", nbytes, len.value())); + } return arrow::Status::OK(); } diff --git a/src/paimon/common/utils/math.h b/src/paimon/common/utils/math.h index 24434cca2..84fb4b967 100644 --- a/src/paimon/common/utils/math.h +++ b/src/paimon/common/utils/math.h @@ -31,6 +31,10 @@ #include #include +#include "fmt/format.h" +#include "paimon/common/utils/options_utils.h" +#include "paimon/status.h" + namespace paimon { template @@ -58,6 +62,15 @@ constexpr bool InRange(From value) { } } +template +Status ValidateValueInRange(From value, const char* value_name) { + if (!InRange(value)) { + return Status::Invalid(fmt::format("{} {} is out of bound of type {}", value_name, value, + OptionsUtils::GetTypeName())); + } + return Status::OK(); +} + // Swaps between big and little endian. Can be used in combination with the // little-endian encoding/decoding functions in coding_lean.h and coding.h to // encode/decode big endian. diff --git a/src/paimon/common/utils/read_ahead_cache.cpp b/src/paimon/common/utils/read_ahead_cache.cpp index 029f9ccb1..d9f4f11c1 100644 --- a/src/paimon/common/utils/read_ahead_cache.cpp +++ b/src/paimon/common/utils/read_ahead_cache.cpp @@ -125,19 +125,10 @@ Status ReadAheadCache::Impl::Init(std::vector&& ranges) { if (is_initialized_) { return Status::Invalid("Cache has already been initialized"); } - if (config_.GetRangeSizeLimit() > static_cast(std::numeric_limits::max())) { - return Status::Invalid("CacheConfig range_size_limit exceeds uint32_t max"); - } - PAIMON_ASSIGN_OR_RAISE( std::vector pending_ranges, ByteRangeCombiner::CoalesceByteRanges(std::move(ranges), config_.GetHoleSizeLimit(), config_.GetRangeSizeLimit())); - for (const auto& pending_range : pending_ranges) { - if (pending_range.length > static_cast(std::numeric_limits::max())) { - return Status::Invalid("range length should not be larger than uint32_t max"); - } - } pending_ranges_ = pending_ranges; is_cached_ = std::vector>(pending_ranges_.size()); for (auto& is_cached : is_cached_) { @@ -228,7 +219,7 @@ std::vector ReadAheadCache::Impl::MakeCacheEntries( auto future = promise->get_future(); auto buffer = std::make_shared(range.length, memory_pool_.get()); stream_->ReadAsync( - buffer->data(), static_cast(buffer->size()), range.offset, + buffer->data(), buffer->size(), range.offset, [promise, buffer](Status status) mutable { promise->set_value(status); }); new_entries.emplace_back(range, std::move(buffer), std::move(future)); } diff --git a/src/paimon/common/utils/roaring_bitmap32.cpp b/src/paimon/common/utils/roaring_bitmap32.cpp index de8e90107..2bbc0301e 100644 --- a/src/paimon/common/utils/roaring_bitmap32.cpp +++ b/src/paimon/common/utils/roaring_bitmap32.cpp @@ -220,8 +220,8 @@ PAIMON_UNIQUE_PTR RoaringBitmap32::Serialize(MemoryPool* pool) const { Status RoaringBitmap32::Deserialize(ByteArrayInputStream* input_stream) { const char* data = input_stream->GetRawData(); - PAIMON_ASSIGN_OR_RAISE(int64_t pos, input_stream->GetPos()); - PAIMON_ASSIGN_OR_RAISE(int64_t total_length, input_stream->Length()); + PAIMON_ASSIGN_OR_RAISE(uint64_t pos, input_stream->GetPos()); + PAIMON_ASSIGN_OR_RAISE(uint64_t total_length, input_stream->Length()); try { GetRoaringBitmap(roaring_bitmap_) = roaring::Roaring::readSafe(data, /*maxbytes=*/total_length - pos); diff --git a/src/paimon/common/utils/roaring_bitmap64.cpp b/src/paimon/common/utils/roaring_bitmap64.cpp index 01542a20c..0f6fbc858 100644 --- a/src/paimon/common/utils/roaring_bitmap64.cpp +++ b/src/paimon/common/utils/roaring_bitmap64.cpp @@ -291,8 +291,8 @@ PAIMON_UNIQUE_PTR RoaringBitmap64::Serialize(MemoryPool* pool) const { Status RoaringBitmap64::Deserialize(ByteArrayInputStream* input_stream) { const char* data = input_stream->GetRawData(); - PAIMON_ASSIGN_OR_RAISE(int64_t pos, input_stream->GetPos()); - PAIMON_ASSIGN_OR_RAISE(int64_t total_length, input_stream->Length()); + PAIMON_ASSIGN_OR_RAISE(uint64_t pos, input_stream->GetPos()); + PAIMON_ASSIGN_OR_RAISE(uint64_t total_length, input_stream->Length()); try { GetRoaringBitmap(roaring_bitmap_) = roaring::Roaring64Map::readSafe(data, /*maxbytes*/ total_length - pos); diff --git a/src/paimon/common/utils/serialization_utils.h b/src/paimon/common/utils/serialization_utils.h index df7863065..a9e51d589 100644 --- a/src/paimon/common/utils/serialization_utils.h +++ b/src/paimon/common/utils/serialization_utils.h @@ -91,7 +91,8 @@ class SerializationUtils { static Result DeserializeBinaryRow(DataInputStream* input, MemoryPool* pool) { int32_t read_length = -1; PAIMON_ASSIGN_OR_RAISE(read_length, input->ReadValue()); - std::shared_ptr bytes = Bytes::AllocateBytes(read_length, pool); + std::shared_ptr bytes = + Bytes::AllocateBytes(static_cast(read_length), pool); PAIMON_RETURN_NOT_OK(input->ReadBytes(bytes.get())); return DeserializeBinaryRow(bytes); } diff --git a/src/paimon/common/utils/stream_utils.h b/src/paimon/common/utils/stream_utils.h index 972d01bd8..0c7ff8507 100644 --- a/src/paimon/common/utils/stream_utils.h +++ b/src/paimon/common/utils/stream_utils.h @@ -45,9 +45,9 @@ class StreamUtils { PAIMON_RETURN_NOT_OK(input_stream->Seek(0, FS_SEEK_SET)); PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, input_stream->Length()); PAIMON_UNIQUE_PTR content = Bytes::AllocateBytes(file_length, pool.get()); - PAIMON_ASSIGN_OR_RAISE(int32_t actual_read_len, + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_len, input_stream->Read(content->data(), content->size())); - if (static_cast(actual_read_len) != file_length) { + if (actual_read_len != file_length) { return Status::Invalid("actual read length {}, not match with expect length {}", actual_read_len, file_length); } @@ -67,7 +67,7 @@ class StreamUtils { PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, input_stream->Length()); uint64_t read_offset = 0; - uint32_t read_len = std::min(file_length, kDefaultReadChunkSize); + uint64_t read_len = std::min(file_length, kDefaultReadChunkSize); std::vector> futures; futures.reserve(file_length / kDefaultReadChunkSize + 1); while (read_len > 0) { diff --git a/src/paimon/core/deletionvectors/bitmap_deletion_vector.cpp b/src/paimon/core/deletionvectors/bitmap_deletion_vector.cpp index 6568d9ba5..2295c60a7 100644 --- a/src/paimon/core/deletionvectors/bitmap_deletion_vector.cpp +++ b/src/paimon/core/deletionvectors/bitmap_deletion_vector.cpp @@ -19,6 +19,7 @@ #include "arrow/util/crc32.h" #include "fmt/format.h" #include "paimon/common/io/memory_segment_output_stream.h" +#include "paimon/common/utils/math.h" #include "paimon/io/byte_array_input_stream.h" #include "paimon/io/data_input_stream.h" @@ -27,10 +28,9 @@ namespace paimon { Result BitmapDeletionVector::SerializeTo(const std::shared_ptr& pool, DataOutputStream* out) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data, SerializeToBytes(pool)); - int64_t size = data->size(); - if (size < 0 || size > std::numeric_limits::max()) { - return Status::Invalid("BitmapDeletionVector serialize size out of range: ", size); - } + size_t size = data->size(); + PAIMON_RETURN_NOT_OK( + ValidateValueInRange(size, "BitmapDeletionVector serialize size")); PAIMON_RETURN_NOT_OK(out->WriteValue(static_cast(size))); PAIMON_RETURN_NOT_OK(out->WriteBytes(data)); uint32_t crc32 = 0; diff --git a/src/paimon/core/deletionvectors/deletion_file_writer.cpp b/src/paimon/core/deletionvectors/deletion_file_writer.cpp index 55b218f5c..bd8a47d4a 100644 --- a/src/paimon/core/deletionvectors/deletion_file_writer.cpp +++ b/src/paimon/core/deletionvectors/deletion_file_writer.cpp @@ -17,6 +17,7 @@ #include "paimon/core/deletionvectors/deletion_file_writer.h" #include "paimon/common/io/data_output_stream.h" +#include "paimon/common/utils/path_util.h" #include "paimon/core/deletionvectors/deletion_vectors_index_file.h" namespace paimon { @@ -35,10 +36,8 @@ Result> DeletionFileWriter::Create( Status DeletionFileWriter::Write(const std::string& key, const std::shared_ptr& deletion_vector) { - PAIMON_ASSIGN_OR_RAISE(int64_t start, out_->GetPos()); - if (start < 0 || start > std::numeric_limits::max()) { - return Status::Invalid(fmt::format("Output position {} out of int32 range.", start)); - } + PAIMON_ASSIGN_OR_RAISE(uint64_t start, out_->GetPos()); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(start, "Output position")); DataOutputStream output_stream(out_); PAIMON_ASSIGN_OR_RAISE(int32_t length, deletion_vector->SerializeTo(pool_, &output_stream)); dv_metas_.insert(key, DeletionVectorMeta(key, static_cast(start), length, @@ -47,10 +46,9 @@ Status DeletionFileWriter::Write(const std::string& key, } Result> DeletionFileWriter::GetResult() const { - int64_t length = output_bytes_; - if (length < 0 || length > std::numeric_limits::max()) { + if (output_bytes_ < 0 || output_bytes_ > std::numeric_limits::max()) { return Status::Invalid( - fmt::format("Deletion file result length {} out of int32 range.", length)); + fmt::format("Deletion file result length {} out of int32 range.", output_bytes_)); } std::optional final_path; if (is_external_path_) { @@ -58,8 +56,8 @@ Result> DeletionFileWriter::GetResult() const { final_path = external_path.ToString(); } return std::make_unique(DeletionVectorsIndexFile::DELETION_VECTORS_INDEX, - PathUtil::GetName(path_), length, dv_metas_.size(), - dv_metas_, final_path); + PathUtil::GetName(path_), output_bytes_, + dv_metas_.size(), dv_metas_, final_path); } } // namespace paimon diff --git a/src/paimon/core/deletionvectors/deletion_file_writer.h b/src/paimon/core/deletionvectors/deletion_file_writer.h index 064c4aecb..10fb59d6b 100644 --- a/src/paimon/core/deletionvectors/deletion_file_writer.h +++ b/src/paimon/core/deletionvectors/deletion_file_writer.h @@ -19,9 +19,7 @@ #include #include -#include "fmt/format.h" #include "paimon/common/utils/linked_hash_map.h" -#include "paimon/common/utils/path_util.h" #include "paimon/core/deletionvectors/deletion_vector.h" #include "paimon/core/index/deletion_vector_meta.h" #include "paimon/core/index/index_path_factory.h" @@ -36,7 +34,7 @@ class DeletionFileWriter { const std::shared_ptr& path_factory, const std::shared_ptr& fs, const std::shared_ptr& pool); - Result GetPos() const { + Result GetPos() const { return out_->GetPos(); } diff --git a/src/paimon/core/deletionvectors/deletion_file_writer_test.cpp b/src/paimon/core/deletionvectors/deletion_file_writer_test.cpp index af5c7dd2a..5c0f0382f 100644 --- a/src/paimon/core/deletionvectors/deletion_file_writer_test.cpp +++ b/src/paimon/core/deletionvectors/deletion_file_writer_test.cpp @@ -92,7 +92,7 @@ TEST(DeletionFileWriterTest, GetResultWithoutCloseShouldFail) { auto pool = GetDefaultPool(); ASSERT_OK_AND_ASSIGN(auto writer, DeletionFileWriter::Create(path_factory, fs, pool)); - ASSERT_NOK_WITH_MSG(writer->GetResult(), "result length -1 out of int32 range"); + ASSERT_NOK_WITH_MSG(writer->GetResult(), "Deletion file result length -1 out of int32 range"); } TEST(DeletionFileWriterTest, ExternalPathInResult) { diff --git a/src/paimon/core/deletionvectors/deletion_vector.cpp b/src/paimon/core/deletionvectors/deletion_vector.cpp index d8217c03e..b56202ad4 100644 --- a/src/paimon/core/deletionvectors/deletion_vector.cpp +++ b/src/paimon/core/deletionvectors/deletion_vector.cpp @@ -77,7 +77,7 @@ Result> DeletionVector::Read(const FileSystem* fmt::format("Size not match, actual size: {}, expect size: {}, file path: {}", actual_length, deletion_file.length, deletion_file.path)); } - auto bytes = Bytes::AllocateBytes(deletion_file.length, pool); + auto bytes = Bytes::AllocateBytes(static_cast(deletion_file.length), pool); PAIMON_RETURN_NOT_OK(file_input_stream.ReadBytes(bytes.get())); return DeserializeFromBytes(bytes.get(), pool); } @@ -99,7 +99,7 @@ Result> DeletionVector::Read(DataInputStream* return Status::Invalid(fmt::format("Invalid bitmap length: {}", bitmap_length)); } - auto bytes = Bytes::AllocateBytes(payload_length, pool); + auto bytes = Bytes::AllocateBytes(static_cast(payload_length), pool); PAIMON_RETURN_NOT_OK(input_stream->ReadBytes(bytes.get())); // skip crc (4 bytes) PAIMON_ASSIGN_OR_RAISE([[maybe_unused]] int32_t unused_crc, diff --git a/src/paimon/core/manifest/manifest_committable_test.cpp b/src/paimon/core/manifest/manifest_committable_test.cpp index b0cad59bc..aef361794 100644 --- a/src/paimon/core/manifest/manifest_committable_test.cpp +++ b/src/paimon/core/manifest/manifest_committable_test.cpp @@ -59,7 +59,7 @@ class ManifestCommittableTest : public testing::Test { EXPECT_OK_AND_ASSIGN(auto in_stream, file_system->Open(path)); EXPECT_OK_AND_ASSIGN( - [[maybe_unused]] int32_t read_bytes, + [[maybe_unused]] uint64_t read_bytes, in_stream->Read(reinterpret_cast(buffer.data()), buffer.size())); EXPECT_OK(in_stream->Close()); diff --git a/src/paimon/core/mergetree/lookup/remote_lookup_file_manager.cpp b/src/paimon/core/mergetree/lookup/remote_lookup_file_manager.cpp index 7d7f99ab4..45d013191 100644 --- a/src/paimon/core/mergetree/lookup/remote_lookup_file_manager.cpp +++ b/src/paimon/core/mergetree/lookup/remote_lookup_file_manager.cpp @@ -108,15 +108,15 @@ Status RemoteLookupFileManager::CopyFromInputToOutput( uint64_t write_size = 0; while (write_size < total_length) { uint64_t current_read_size = std::min(total_length - write_size, kBufferSize); - PAIMON_ASSIGN_OR_RAISE(int32_t bytes_read, + PAIMON_ASSIGN_OR_RAISE(uint64_t bytes_read, input_stream->Read(buffer->data(), current_read_size)); - if (static_cast(bytes_read) != current_read_size) { + if (bytes_read != current_read_size) { return Status::Invalid( fmt::format("CopyFromInputToOutput failed: expected read {} bytes, while " "actual read {} bytes", current_read_size, bytes_read)); } - PAIMON_ASSIGN_OR_RAISE(int32_t bytes_written, + PAIMON_ASSIGN_OR_RAISE(uint64_t bytes_written, output_stream->Write(buffer->data(), bytes_read)); if (bytes_written != bytes_read) { return Status::Invalid( diff --git a/src/paimon/core/mergetree/lookup/remote_lookup_file_manager_test.cpp b/src/paimon/core/mergetree/lookup/remote_lookup_file_manager_test.cpp index 75bb98a3a..acd2b8e5c 100644 --- a/src/paimon/core/mergetree/lookup/remote_lookup_file_manager_test.cpp +++ b/src/paimon/core/mergetree/lookup/remote_lookup_file_manager_test.cpp @@ -275,9 +275,9 @@ TEST_F(RemoteLookupFileManagerTest, TryToDownloadLargeFileAcrossMultipleBuffers) for (uint64_t i = 0; i < file_size; ++i) { write_buffer[i] = static_cast(i % 251); } - ASSERT_OK_AND_ASSIGN(int32_t bytes_written, + ASSERT_OK_AND_ASSIGN(uint64_t bytes_written, output_stream->Write(write_buffer.data(), file_size)); - ASSERT_EQ(static_cast(bytes_written), file_size); + ASSERT_EQ(bytes_written, file_size); ASSERT_OK(output_stream->Flush()); ASSERT_OK(output_stream->Close()); } @@ -295,8 +295,9 @@ TEST_F(RemoteLookupFileManagerTest, TryToDownloadLargeFileAcrossMultipleBuffers) { ASSERT_OK_AND_ASSIGN(auto input_stream, fs_->Open(local_file_path)); std::vector read_buffer(file_size); - ASSERT_OK_AND_ASSIGN(int32_t bytes_read, input_stream->Read(read_buffer.data(), file_size)); - ASSERT_EQ(static_cast(bytes_read), file_size); + ASSERT_OK_AND_ASSIGN(uint64_t bytes_read, + input_stream->Read(read_buffer.data(), file_size)); + ASSERT_EQ(bytes_read, file_size); for (uint64_t i = 0; i < file_size; ++i) { ASSERT_EQ(read_buffer[i], static_cast(i % 251)) << "Data mismatch at byte offset " << i; diff --git a/src/paimon/core/utils/objects_file.h b/src/paimon/core/utils/objects_file.h index cc3242b07..ac3a7f673 100644 --- a/src/paimon/core/utils/objects_file.h +++ b/src/paimon/core/utils/objects_file.h @@ -184,10 +184,10 @@ Result> ObjectsFile::WriteWithoutRolling( PAIMON_RETURN_NOT_OK(format_writer->Flush()); PAIMON_RETURN_NOT_OK(format_writer->Finish()); PAIMON_RETURN_NOT_OK(out->Flush()); - PAIMON_ASSIGN_OR_RAISE(int64_t pos, out->GetPos()); + PAIMON_ASSIGN_OR_RAISE(uint64_t pos, out->GetPos()); PAIMON_RETURN_NOT_OK(out->Close()); guard.Release(); - return std::make_pair(PathUtil::GetName(file_path), pos); + return std::make_pair(PathUtil::GetName(file_path), static_cast(pos)); } } // namespace paimon diff --git a/src/paimon/format/avro/avro_file_batch_reader.cpp b/src/paimon/format/avro/avro_file_batch_reader.cpp index 1cccdf1aa..e856f87b4 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader.cpp @@ -16,6 +16,7 @@ #include "paimon/format/avro/avro_file_batch_reader.h" +#include #include #include @@ -182,10 +183,11 @@ Result> AvroFileBatchReader::GetFileSchema() cons Result AvroFileBatchReader::GetNumberOfRows() const { if (!total_rows_) { - PAIMON_ASSIGN_OR_RAISE(int64_t current_pos, input_stream_->GetPos()); - ScopeGuard stream_guard([this, current_pos]() -> void { + PAIMON_ASSIGN_OR_RAISE(uint64_t current_pos, input_stream_->GetPos()); + auto seek_pos = static_cast(current_pos); + ScopeGuard stream_guard([this, seek_pos]() -> void { // reset input stream position to original position - Status status = input_stream_->Seek(current_pos, SeekOrigin::FS_SEEK_SET); + Status status = input_stream_->Seek(seek_pos, SeekOrigin::FS_SEEK_SET); (void)status; }); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<::avro::DataFileReaderBase> reader, diff --git a/src/paimon/format/avro/avro_input_stream_impl.cpp b/src/paimon/format/avro/avro_input_stream_impl.cpp index 3c9743238..7687ed385 100644 --- a/src/paimon/format/avro/avro_input_stream_impl.cpp +++ b/src/paimon/format/avro/avro_input_stream_impl.cpp @@ -68,8 +68,7 @@ bool AvroInputStreamImpl::next(const uint8_t** data, size_t* len) { return false; // eof } auto read_length = - in_->Read(reinterpret_cast(buffer_), - static_cast(std::min(buffer_size_, remaining))); + in_->Read(reinterpret_cast(buffer_), std::min(buffer_size_, remaining)); if (!read_length.ok()) { throw ::avro::Exception("Read failed: {}", read_length.status().ToString()); } diff --git a/src/paimon/format/avro/avro_output_stream_impl.cpp b/src/paimon/format/avro/avro_output_stream_impl.cpp index e3c1be164..144aed458 100644 --- a/src/paimon/format/avro/avro_output_stream_impl.cpp +++ b/src/paimon/format/avro/avro_output_stream_impl.cpp @@ -62,11 +62,11 @@ void AvroOutputStreamImpl::backup(size_t len) { void AvroOutputStreamImpl::FlushBuffer() { size_t length = buffer_size_ - available_; - Result write_len = out_->Write(reinterpret_cast(buffer_), length); + Result write_len = out_->Write(reinterpret_cast(buffer_), length); if (!write_len.ok()) { throw std::runtime_error("write failed, status: " + write_len.status().ToString()); } - if (static_cast(write_len.value()) != length) { + if (write_len.value() != length) { throw std::runtime_error( fmt::format("write failed, expected length: {}, actual write length: {}", length, write_len.value())); diff --git a/src/paimon/format/blob/blob_format_writer.cpp b/src/paimon/format/blob/blob_format_writer.cpp index f41e21e90..6b36eee3b 100644 --- a/src/paimon/format/blob/blob_format_writer.cpp +++ b/src/paimon/format/blob/blob_format_writer.cpp @@ -130,7 +130,7 @@ Status BlobFormatWriter::Finish() { Status BlobFormatWriter::WriteBlob(std::string_view blob_data) { crc32_ = 0; - PAIMON_ASSIGN_OR_RAISE(int64_t previous_pos, out_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(uint64_t previous_pos, out_->GetPos()); // write magic number static PAIMON_UNIQUE_PTR kMagicNumberBytes = @@ -148,25 +148,24 @@ Status BlobFormatWriter::WriteBlob(std::string_view blob_data) { } PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, in->Length()); uint64_t total_read_length = 0; - auto read_len = static_cast(std::min(file_length, tmp_buffer_->size())); + uint64_t read_len = std::min(file_length, tmp_buffer_->size()); while (read_len > 0) { - PAIMON_ASSIGN_OR_RAISE(int32_t actual_read_len, in->Read(tmp_buffer_->data(), read_len)); - if (static_cast(actual_read_len) != read_len) { + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_len, in->Read(tmp_buffer_->data(), read_len)); + if (actual_read_len != read_len) { return Status::Invalid("actual read length {}, not match with expect length {}", actual_read_len, read_len); } PAIMON_RETURN_NOT_OK(WriteWithCrc32(tmp_buffer_->data(), actual_read_len)); total_read_length += actual_read_len; - read_len = static_cast( - std::min(file_length - total_read_length, tmp_buffer_->size())); + read_len = std::min(file_length - total_read_length, tmp_buffer_->size()); } // write bin length - PAIMON_ASSIGN_OR_RAISE(int64_t current_pos, out_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(uint64_t current_pos, out_->GetPos()); /// magic number(4) + blob content(bin length - 16) + bin length(8) + crc32(4) /// ↑ ↑ /// previous_pos current_pos - int64_t bin_length = current_pos - previous_pos + 8 + 4; + auto bin_length = static_cast(current_pos - previous_pos + 8 + 4); bin_lengths_.push_back(bin_length); PAIMON_UNIQUE_PTR bin_length_bytes = IntegerToLittleEndian(bin_length, pool_); PAIMON_RETURN_NOT_OK(WriteWithCrc32(bin_length_bytes->data(), bin_length_bytes->size())); @@ -178,8 +177,8 @@ Status BlobFormatWriter::WriteBlob(std::string_view blob_data) { return Status::OK(); } -Status BlobFormatWriter::WriteBytes(const char* data, int32_t length) { - PAIMON_ASSIGN_OR_RAISE(int32_t actual, out_->Write(data, length)); +Status BlobFormatWriter::WriteBytes(const char* data, uint64_t length) { + PAIMON_ASSIGN_OR_RAISE(uint64_t actual, out_->Write(data, length)); if (actual != length) { return Status::Invalid("not suppose actual length {} not match with expect {}", actual, length); @@ -187,14 +186,14 @@ Status BlobFormatWriter::WriteBytes(const char* data, int32_t length) { return Status::OK(); } -Status BlobFormatWriter::WriteWithCrc32(const char* data, int32_t length) { +Status BlobFormatWriter::WriteWithCrc32(const char* data, uint64_t length) { crc32_ = arrow::internal::crc32(crc32_, data, length); return WriteBytes(data, length); } Result BlobFormatWriter::ReachTargetSize(bool suggested_check, int64_t target_size) const { - PAIMON_ASSIGN_OR_RAISE(int64_t current_pos, out_->GetPos()); - return current_pos >= target_size; + PAIMON_ASSIGN_OR_RAISE(uint64_t current_pos, out_->GetPos()); + return current_pos >= static_cast(target_size); } template diff --git a/src/paimon/format/blob/blob_format_writer.h b/src/paimon/format/blob/blob_format_writer.h index c52437e8d..6821f8276 100644 --- a/src/paimon/format/blob/blob_format_writer.h +++ b/src/paimon/format/blob/blob_format_writer.h @@ -72,8 +72,8 @@ class BlobFormatWriter : public FormatWriter { Status WriteBlob(std::string_view blob_data); - Status WriteBytes(const char* data, int32_t length); - Status WriteWithCrc32(const char* data, int32_t length); + Status WriteBytes(const char* data, uint64_t length); + Status WriteWithCrc32(const char* data, uint64_t length); template static PAIMON_UNIQUE_PTR IntegerToLittleEndian(T value, diff --git a/src/paimon/format/blob/blob_format_writer_test.cpp b/src/paimon/format/blob/blob_format_writer_test.cpp index 3f3779108..480b75042 100644 --- a/src/paimon/format/blob/blob_format_writer_test.cpp +++ b/src/paimon/format/blob/blob_format_writer_test.cpp @@ -296,7 +296,7 @@ TEST_P(BlobFormatWriterTest, TestLargeBlob) { // Write data larger than TMP_BUFFER_SIZE (1MB) const size_t large_size = BlobFormatWriter::kTmpBufferSize * 2 + 1000; // ~2MB std::vector large_data(large_size, 'A'); - ASSERT_OK_AND_ASSIGN(int32_t written, large_file_stream->Write(large_data.data(), large_size)); + ASSERT_OK_AND_ASSIGN(uint64_t written, large_file_stream->Write(large_data.data(), large_size)); ASSERT_EQ(written, large_size); ASSERT_OK(large_file_stream->Flush()); ASSERT_OK(large_file_stream->Close()); diff --git a/src/paimon/format/orc/orc_input_stream_impl.cpp b/src/paimon/format/orc/orc_input_stream_impl.cpp index cdedceeff..f2ab45bd6 100644 --- a/src/paimon/format/orc/orc_input_stream_impl.cpp +++ b/src/paimon/format/orc/orc_input_stream_impl.cpp @@ -61,11 +61,11 @@ void OrcInputStreamImpl::read(void* buf, uint64_t length, uint64_t offset) { metrics_->IOCount.fetch_add(1); } - Result read_bytes = input_stream_->Read(static_cast(buf), length, offset); + Result read_bytes = input_stream_->Read(static_cast(buf), length, offset); if (!read_bytes.ok()) { throw ::orc::ParseError("read failed, status: " + read_bytes.status().ToString()); } - if (static_cast(read_bytes.value()) != length) { + if (read_bytes.value() != length) { throw ::orc::ParseError( fmt::format("read failed, expected length: {}, actual read length: {}", length, read_bytes.value())); diff --git a/src/paimon/format/orc/orc_output_stream_impl.cpp b/src/paimon/format/orc/orc_output_stream_impl.cpp index e39158cb4..23fab4915 100644 --- a/src/paimon/format/orc/orc_output_stream_impl.cpp +++ b/src/paimon/format/orc/orc_output_stream_impl.cpp @@ -38,7 +38,7 @@ OrcOutputStreamImpl::OrcOutputStreamImpl(const std::shared_ptr pos = output_stream_->GetPos(); + Result pos = output_stream_->GetPos(); if (!pos.ok()) { throw std::runtime_error(fmt::format("get length failed, file name {}, error msg {}", file_name_, pos.status().ToString())); @@ -48,11 +48,11 @@ uint64_t OrcOutputStreamImpl::getLength() const { } void OrcOutputStreamImpl::write(const void* buf, size_t length) { - Result write_len = output_stream_->Write(static_cast(buf), length); + Result write_len = output_stream_->Write(static_cast(buf), length); if (!write_len.ok()) { throw std::runtime_error("write failed, status: " + write_len.status().ToString()); } - if (static_cast(write_len.value()) != length) { + if (write_len.value() != length) { throw std::runtime_error( fmt::format("write failed, expected length: {}, actual write length: {}", length, write_len.value())); diff --git a/src/paimon/fs/jindo/jindo_file_system.cpp b/src/paimon/fs/jindo/jindo_file_system.cpp index f86d1ec72..b34b42b1c 100644 --- a/src/paimon/fs/jindo/jindo_file_system.cpp +++ b/src/paimon/fs/jindo/jindo_file_system.cpp @@ -198,10 +198,13 @@ Status JindoInputStream::Seek(int64_t offset, SeekOrigin origin) { return Status::OK(); } -Result JindoInputStream::GetPos() const { +Result JindoInputStream::GetPos() const { int64_t pos = -1; PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->tell(pos)); - return pos; + if (pos < 0) { + return Status::Invalid(fmt::format("jindo input position {} is less than 0", pos)); + } + return static_cast(pos); } Result JindoInputStream::Length() const { @@ -210,24 +213,26 @@ Result JindoInputStream::Length() const { return len; } -Result JindoInputStream::Read(char* buffer, uint32_t size) { - PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->read(size, &result_, buffer)); +Result JindoInputStream::Read(char* buffer, uint64_t size) { + PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->read(static_cast(size), &result_, buffer)); return result_.length(); } -Result JindoInputStream::Read(char* buffer, uint32_t size, uint64_t offset) { - PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->pread(offset, size, &result_, buffer)); +Result JindoInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { + PAIMON_RETURN_NOT_OK_FROM_JINDO( + reader_->pread(offset, static_cast(size), &result_, buffer)); return result_.length(); } -void JindoInputStream::ReadAsync(char* buffer, uint32_t size, uint64_t offset, +void JindoInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) { auto outer_callback = [=](JdoStatus status) { callback(status.ok() ? Status::OK() : Status::IOError(status.errMsg())); }; - auto task = reader_->preadAsync(offset, size, &result_, buffer, outer_callback); + auto task = + reader_->preadAsync(offset, static_cast(size), &result_, buffer, outer_callback); assert(task); - auto status = task->perform(); + [[maybe_unused]] auto status = task->perform(); } Status JindoInputStream::Close() { @@ -245,14 +250,17 @@ JindoOutputStream::JindoOutputStream(const std::shared_ptr& std::unique_ptr&& writer) : fs_(fs), writer_(std::move(writer)) {} -Result JindoOutputStream::GetPos() const { +Result JindoOutputStream::GetPos() const { int64_t pos = -1; PAIMON_RETURN_NOT_OK_FROM_JINDO(writer_->tell(pos)); - return pos; + if (pos < 0) { + return Status::Invalid(fmt::format("jindo output position {} is less than 0", pos)); + } + return static_cast(pos); } -Result JindoOutputStream::Write(const char* buffer, uint32_t size) { - std::string_view data(buffer, size); +Result JindoOutputStream::Write(const char* buffer, uint64_t size) { + std::string_view data(buffer, static_cast(size)); PAIMON_RETURN_NOT_OK_FROM_JINDO(writer_->write(data)); return size; } diff --git a/src/paimon/fs/jindo/jindo_file_system.h b/src/paimon/fs/jindo/jindo_file_system.h index 27e466a6e..eb4ef4bbe 100644 --- a/src/paimon/fs/jindo/jindo_file_system.h +++ b/src/paimon/fs/jindo/jindo_file_system.h @@ -66,10 +66,10 @@ class JindoInputStream : public InputStream { JindoInputStream(const std::shared_ptr& fs, std::unique_ptr&& reader); Status Seek(int64_t offset, SeekOrigin origin) override; - Result GetPos() const override; - Result Read(char* buffer, uint32_t size) override; - Result Read(char* buffer, uint32_t size, uint64_t offset) override; - void ReadAsync(char* buffer, uint32_t size, uint64_t offset, + Result GetPos() const override; + Result Read(char* buffer, uint64_t size) override; + Result Read(char* buffer, uint64_t size, uint64_t offset) override; + void ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) override; Status Close() override; Result GetUri() const override; @@ -88,8 +88,8 @@ class JindoOutputStream : public OutputStream { JindoOutputStream(const std::shared_ptr& fs, std::unique_ptr&& writer); - Result GetPos() const override; - Result Write(const char* buffer, uint32_t size) override; + Result GetPos() const override; + Result Write(const char* buffer, uint64_t size) override; Status Flush() override; Status Close() override; Result GetUri() const override; diff --git a/src/paimon/fs/jindo/jindo_file_system_test.cpp b/src/paimon/fs/jindo/jindo_file_system_test.cpp index 542a30317..62605d601 100644 --- a/src/paimon/fs/jindo/jindo_file_system_test.cpp +++ b/src/paimon/fs/jindo/jindo_file_system_test.cpp @@ -50,7 +50,7 @@ TEST_F(JindoFileSystemTest, TestLifeCycle) { // read process ASSERT_OK_AND_ASSIGN(auto in_stream, tmp_fs->Open(file_path)); std::string read_content(content.size(), '\0'); - ASSERT_OK_AND_ASSIGN(int32_t read_len, + ASSERT_OK_AND_ASSIGN(uint64_t read_len, in_stream->Read(read_content.data(), read_content.size())); ASSERT_EQ(read_len, read_content.size()); ASSERT_EQ(content, read_content); @@ -77,7 +77,7 @@ TEST_F(JindoFileSystemTest, TestSeek) { std::string file_path = test_dir_ + "file.data"; // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN(int32_t write_len, out_stream->Write(content.data(), content.size())); + ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_EQ(write_len, content.size()); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); @@ -107,7 +107,7 @@ TEST_F(JindoFileSystemTest, TestSeek) { // read from cur pos std::string read_content(3, '\0'); - ASSERT_OK_AND_ASSIGN(int32_t read_len, + ASSERT_OK_AND_ASSIGN(uint64_t read_len, in_stream->Read(read_content.data(), read_content.size())); ASSERT_EQ(read_len, read_content.size()); ASSERT_EQ("ijk", read_content); diff --git a/src/paimon/fs/local/local_file.cpp b/src/paimon/fs/local/local_file.cpp index d67be522f..c017682cd 100644 --- a/src/paimon/fs/local/local_file.cpp +++ b/src/paimon/fs/local/local_file.cpp @@ -166,20 +166,16 @@ const std::string& LocalFile::GetAbsolutePath() const { return path_; } -Result LocalFile::Read(char* buffer, uint32_t length, uint64_t offset) { +Result LocalFile::Read(char* buffer, uint64_t length, uint64_t offset) { if (file_) { CHECK_HOOK(); int32_t fd = fileno(file_); - auto more = static_cast(length); - if (more < 0) { - return Status::IOError(fmt::format( - "pread file '{}' fail, length overflow int32_t, ec: EC_BADARGS", path_)); - } + uint64_t more = length; uint64_t off = 0; - int32_t ret = 0; while (more > 0) { - ret = ::pread(fd, buffer + off, more, offset + off); + ssize_t ret = ::pread(fd, buffer + off, static_cast(more), + static_cast(offset + off)); if (ret == -1) { return Status::IOError( fmt::format("pread file '{}' fail at off {}, with error {}, ec: {}", path_, off, @@ -188,8 +184,9 @@ Result LocalFile::Read(char* buffer, uint32_t length, uint64_t offset) if (ret == 0) { break; } - more -= ret; - off += ret; + auto read_size = static_cast(ret); + more -= read_size; + off += read_size; } return off; } @@ -197,19 +194,14 @@ Result LocalFile::Read(char* buffer, uint32_t length, uint64_t offset) "read file '{}' fail, can not read file which is opened fail, ec: EBADF", path_)); } -Result LocalFile::Read(char* buffer, uint32_t length) { +Result LocalFile::Read(char* buffer, uint64_t length) { if (file_) { CHECK_HOOK(); - auto more = static_cast(length); - if (more < 0) { - return Status::IOError( - fmt::format("fileName '{}', length '{}', ec: EC_BADARGS", path_, length)); - } - int32_t ret = 0; + uint64_t more = length; uint64_t off = 0; while (more > 0) { - ret = fread(buffer + off, 1, more, file_); + size_t ret = fread(buffer + off, 1, static_cast(more), file_); if (ferror(file_) != 0) { return Status::IOError( fmt::format("read file '{}' fail at off {}, with error {}, ec: {}", path_, off, @@ -228,23 +220,22 @@ Result LocalFile::Read(char* buffer, uint32_t length) { "read file '{}' fail, can not read file which is opened fail, ec: EBADF", path_)); } -Result LocalFile::Write(const char* buffer, uint32_t length) { +Result LocalFile::Write(const char* buffer, uint64_t length) { if (file_) { CHECK_HOOK(); - auto more = static_cast(length); - if (more < 0) { - return Status::IOError(fmt::format( - "write file '{}' fail, length overflow int32_t, ec: EC_BADARGS", path_)); - } - int32_t ret = 0; + uint64_t more = length; uint64_t off = 0; while (more > 0) { - ret = fwrite(buffer + off, 1, more, file_); + size_t ret = fwrite(buffer + off, 1, static_cast(more), file_); if (ferror(file_) != 0) { - return Status::IOError(fmt::format("write file '{}' fail, with error {}, ec: {}", - path_, off, strerror(errno), - std::strerror(errno))); + return Status::IOError(fmt::format("write file '{}' fail at off {}, ec: {}", path_, + off, std::strerror(errno))); + } + if (ret == 0) { + return Status::IOError( + fmt::format("write file '{}' fail at off {}, wrote zero bytes, ec: {}", path_, + off, std::strerror(errno))); } more -= ret; off += ret; @@ -327,7 +318,7 @@ Status LocalFile::Seek(int64_t offset, int32_t seek_origin) { fmt::format("seek '{}' fail, can not read file which not opened, ec: EBADF", path_)); } -Result LocalFile::Tell() const { +Result LocalFile::Tell() const { if (file_) { CHECK_HOOK(); int64_t ret = ftell(file_); @@ -335,7 +326,7 @@ Result LocalFile::Tell() const { return Status::IOError( fmt::format("tell '{}' fail, ec: {}", path_, std::strerror(errno))); } - return ret; + return static_cast(ret); } return Status::IOError( fmt::format("tell '{}' fail, can not read file which not opened, ec: EBADF", path_)); diff --git a/src/paimon/fs/local/local_file.h b/src/paimon/fs/local/local_file.h index cb4b3532f..fbbfe674b 100644 --- a/src/paimon/fs/local/local_file.h +++ b/src/paimon/fs/local/local_file.h @@ -52,16 +52,16 @@ class LocalFile { Result Length() const; Result LastModifiedTimeMs() const; Status OpenFile(bool is_read_file); - Result Read() { + Result Read() { return Status::NotImplemented(""); } - Result Read(char* buffer, uint32_t length); - Result Read(char* buffer, uint32_t length, uint64_t offset); - Result Write(const char* buffer, uint32_t length); + Result Read(char* buffer, uint64_t length); + Result Read(char* buffer, uint64_t length, uint64_t offset); + Result Write(const char* buffer, uint64_t length); Status Flush(); Status Close(); Status Seek(int64_t offset, int32_t seek_origin); - Result Tell() const; + Result Tell() const; bool IsEmpty() const { return path_.empty(); diff --git a/src/paimon/fs/local/local_file_system.cpp b/src/paimon/fs/local/local_file_system.cpp index 07a3fe34c..2be2fd02f 100644 --- a/src/paimon/fs/local/local_file_system.cpp +++ b/src/paimon/fs/local/local_file_system.cpp @@ -259,36 +259,36 @@ Status LocalInputStream::Seek(int64_t offset, SeekOrigin origin) { return file_.Seek(offset, convert_origin(origin)); } -Result LocalInputStream::GetPos() const { +Result LocalInputStream::GetPos() const { return file_.Tell(); } -Result LocalInputStream::Read(char* buffer, uint32_t size) { - PAIMON_ASSIGN_OR_RAISE(int32_t read_length, file_.Read(buffer, size)); - if (read_length != static_cast(size)) { +Result LocalInputStream::Read(char* buffer, uint64_t size) { + PAIMON_ASSIGN_OR_RAISE(uint64_t read_length, file_.Read(buffer, size)); + if (read_length != size) { return Status::IOError(fmt::format("file '{}' read size {} != expected {}", file_.GetAbsolutePath(), read_length, size)); } return read_length; } -Result LocalInputStream::Read(char* buffer, uint32_t size, uint64_t offset) { - PAIMON_ASSIGN_OR_RAISE(int32_t read_length, file_.Read(buffer, size, offset)); - if (read_length != static_cast(size)) { +Result LocalInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { + PAIMON_ASSIGN_OR_RAISE(uint64_t read_length, file_.Read(buffer, size, offset)); + if (read_length != size) { return Status::IOError(fmt::format("file '{}' read size {} != expected {}", file_.GetAbsolutePath(), read_length, size)); } return read_length; } -void LocalInputStream::ReadAsync(char* buffer, uint32_t size, uint64_t offset, +void LocalInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) { - Result read_size = Read(buffer, size, offset); + Result read_size = Read(buffer, size, offset); Status status = Status::OK(); if (!read_size.ok()) { status = read_size.status(); } else { - assert(read_size.value() == static_cast(size)); + assert(read_size.value() == size); } callback(status); } @@ -309,10 +309,10 @@ Result> LocalOutputStream::Create(LocalFile& LocalOutputStream::LocalOutputStream(const LocalFile& file) : file_(file) {} -Result LocalOutputStream::GetPos() const { +Result LocalOutputStream::GetPos() const { return file_.Tell(); } -Result LocalOutputStream::Write(const char* buffer, uint32_t size) { +Result LocalOutputStream::Write(const char* buffer, uint64_t size) { return file_.Write(buffer, size); } Status LocalOutputStream::Flush() { diff --git a/src/paimon/fs/local/local_file_system.h b/src/paimon/fs/local/local_file_system.h index 9ab4bb1f2..28715b30c 100644 --- a/src/paimon/fs/local/local_file_system.h +++ b/src/paimon/fs/local/local_file_system.h @@ -71,10 +71,10 @@ class LocalInputStream : public InputStream { static Result> Create(LocalFile& file); Status Seek(int64_t offset, SeekOrigin origin) override; - Result GetPos() const override; - Result Read(char* buffer, uint32_t size) override; - Result Read(char* buffer, uint32_t size, uint64_t offset) override; - void ReadAsync(char* buffer, uint32_t size, uint64_t offset, + Result GetPos() const override; + Result Read(char* buffer, uint64_t size) override; + Result Read(char* buffer, uint64_t size, uint64_t offset) override; + void ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) override; Status Close() override; @@ -93,8 +93,8 @@ class LocalOutputStream : public OutputStream { public: static Result> Create(LocalFile& file); - Result GetPos() const override; - Result Write(const char* buffer, uint32_t size) override; + Result GetPos() const override; + Result Write(const char* buffer, uint64_t size) override; Status Flush() override; Status Close() override; Result GetUri() const override { diff --git a/src/paimon/fs/local/local_file_test.cpp b/src/paimon/fs/local/local_file_test.cpp index 66489456a..53c933d6e 100644 --- a/src/paimon/fs/local/local_file_test.cpp +++ b/src/paimon/fs/local/local_file_test.cpp @@ -46,8 +46,8 @@ TEST(LocalFileTest, TestReadWriteEmptyContent) { ASSERT_OK(file.OpenFile(/*is_read_file=*/false)); const char* str = ""; - const int32_t str_size = 0; - ASSERT_OK_AND_ASSIGN(int32_t write_size, file.Write(str, str_size)); + const uint64_t str_size = 0; + ASSERT_OK_AND_ASSIGN(uint64_t write_size, file.Write(str, str_size)); ASSERT_EQ(write_size, str_size); ASSERT_OK(file.Flush()); @@ -58,7 +58,7 @@ TEST(LocalFileTest, TestReadWriteEmptyContent) { LocalFile file2(path); ASSERT_OK(file2.OpenFile(/*is_read_file=*/true)); char buffer[10]; - ASSERT_OK_AND_ASSIGN(int32_t read_len, file2.Read(buffer, 10)); + ASSERT_OK_AND_ASSIGN(uint64_t read_len, file2.Read(buffer, 10)); ASSERT_EQ(0, read_len); } @@ -83,8 +83,8 @@ TEST(LocalFileTest, TestSimple) { ASSERT_OK(file.OpenFile(/*is_read_file=*/false)); const char* str = "test_data"; - const int32_t str_size = 9; - ASSERT_OK_AND_ASSIGN(int32_t write_size, file.Write(str, str_size)); + const uint64_t str_size = 9; + ASSERT_OK_AND_ASSIGN(uint64_t write_size, file.Write(str, str_size)); ASSERT_EQ(write_size, str_size); ASSERT_OK(file.Flush()); @@ -108,22 +108,22 @@ TEST(LocalFileTest, TestSimple) { ASSERT_OK(file2.OpenFile(true)); char str_read[str_size + 1]; { - ASSERT_OK_AND_ASSIGN(int32_t read_size, file2.Read(str_read, 4)); + ASSERT_OK_AND_ASSIGN(uint64_t read_size, file2.Read(str_read, 4)); ASSERT_EQ(read_size, 4); str_read[read_size] = '\0'; ASSERT_EQ(strcmp(str_read, "test"), 0); } { - ASSERT_OK_AND_ASSIGN(int64_t pos, file2.Tell()); + ASSERT_OK_AND_ASSIGN(uint64_t pos, file2.Tell()); ASSERT_EQ(pos, 4); ASSERT_OK(file2.Seek(5, SEEK_SET)); - ASSERT_OK_AND_ASSIGN(int32_t read_size, file2.Read(str_read, 4)); + ASSERT_OK_AND_ASSIGN(uint64_t read_size, file2.Read(str_read, 4)); ASSERT_EQ(read_size, 4); str_read[read_size] = '\0'; ASSERT_EQ(strcmp(str_read, "data"), 0); } { - ASSERT_OK_AND_ASSIGN(int32_t read_size, file2.Read(str_read, str_size, 0)); + ASSERT_OK_AND_ASSIGN(uint64_t read_size, file2.Read(str_read, str_size, 0)); ASSERT_EQ(read_size, str_size); str_read[read_size] = '\0'; ASSERT_EQ(strcmp(str_read, "test_data"), 0); diff --git a/src/paimon/global_index/lucene/lucene_global_index_writer.cpp b/src/paimon/global_index/lucene/lucene_global_index_writer.cpp index b2fecd8ec..015739ae5 100644 --- a/src/paimon/global_index/lucene/lucene_global_index_writer.cpp +++ b/src/paimon/global_index/lucene/lucene_global_index_writer.cpp @@ -205,9 +205,8 @@ Result LuceneGlobalIndexWriter::FlushIndexToFinal() { static_cast(kDefaultReadBufferSize)); input->readBytes(reinterpret_cast(buffer->data()), /*offset=*/0, static_cast(current_write_size)); - PAIMON_ASSIGN_OR_RAISE( - int32_t actual_write_size, - out->Write(buffer->data(), static_cast(current_write_size))); + PAIMON_ASSIGN_OR_RAISE(uint64_t actual_write_size, + out->Write(buffer->data(), current_write_size)); if (static_cast(actual_write_size) != current_write_size) { return Status::Invalid( fmt::format("invalid write, try to write {} while actual write {}", diff --git a/src/paimon/global_index/lucene/lucene_input.h b/src/paimon/global_index/lucene/lucene_input.h index a16880a2f..9582d6d69 100644 --- a/src/paimon/global_index/lucene/lucene_input.h +++ b/src/paimon/global_index/lucene/lucene_input.h @@ -65,7 +65,7 @@ class LuceneIndexInput : public Lucene::BufferedIndexInput { throw Lucene::IOException( LuceneUtils::StringToWstring(read_result.status().ToString())); } - if (read_result.value() != length) { + if (static_cast(read_result.value()) != length) { throw Lucene::IOException(L"actual read len and expect read len mismatch"); } } diff --git a/src/paimon/global_index/lumina/lumina_file_io_test.cpp b/src/paimon/global_index/lumina/lumina_file_io_test.cpp index 90f55402a..eda1c2be9 100644 --- a/src/paimon/global_index/lumina/lumina_file_io_test.cpp +++ b/src/paimon/global_index/lumina/lumina_file_io_test.cpp @@ -26,50 +26,42 @@ class LuminaFileIOTest : public ::testing::Test { }; TEST_F(LuminaFileIOTest, TestSimple) { - auto check_write_and_read = [](uint64_t max_write_size, uint64_t max_read_size) { - std::string content = "Hello World."; - auto dir = paimon::test::UniqueTestDirectory::Create("local"); - auto fs = dir->GetFileSystem(); - std::string index_path = dir->Str() + "/lumina_test.index"; - // write content - ASSERT_OK_AND_ASSIGN(std::shared_ptr out, - fs->Create(index_path, /*overwrite=*/false)); - auto writer = std::make_shared(out); - writer->max_write_size_ = max_write_size; - ASSERT_EQ(writer->GetLength().Value(), 0); - ASSERT_TRUE(writer->Write(content.data(), content.length()).IsOk()); - ASSERT_TRUE(writer->Close().IsOk()); - - // check file exist - ASSERT_OK_AND_ASSIGN(bool exist, fs->Exists(index_path)); - ASSERT_TRUE(exist); - ASSERT_OK_AND_ASSIGN(std::unique_ptr file_status, - fs->GetFileStatus(index_path)); - ASSERT_FALSE(file_status->IsDir()); - ASSERT_EQ(file_status->GetLen(), content.length()); + std::string content = "Hello World."; + auto dir = paimon::test::UniqueTestDirectory::Create("local"); + auto fs = dir->GetFileSystem(); + std::string index_path = dir->Str() + "/lumina_test.index"; + // write content + ASSERT_OK_AND_ASSIGN(std::shared_ptr out, + fs->Create(index_path, /*overwrite=*/false)); + auto writer = std::make_shared(out); + ASSERT_EQ(writer->GetLength().Value(), 0); + ASSERT_TRUE(writer->Write(content.data(), content.length()).IsOk()); + ASSERT_TRUE(writer->Close().IsOk()); - // read content - ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs->Open(index_path)); - auto reader = std::make_shared(in); - reader->max_read_size_ = max_read_size; - ASSERT_EQ(reader->GetLength().Value(), content.length()); - ASSERT_EQ(reader->GetPosition().Value(), 0); - std::string read_content(content.size(), 0); - ASSERT_TRUE(reader->Read(read_content.data(), read_content.size()).IsOk()); - ASSERT_EQ(read_content, content); - ASSERT_EQ(reader->GetPosition().Value(), content.size()); + // check file exist + ASSERT_OK_AND_ASSIGN(bool exist, fs->Exists(index_path)); + ASSERT_TRUE(exist); + ASSERT_OK_AND_ASSIGN(std::unique_ptr file_status, fs->GetFileStatus(index_path)); + ASSERT_FALSE(file_status->IsDir()); + ASSERT_EQ(file_status->GetLen(), content.length()); - // test seek - ASSERT_TRUE(reader->Seek(2).IsOk()); - std::string read_content2(3, 0); - ASSERT_TRUE(reader->Read(read_content2.data(), read_content2.size()).IsOk()); - ASSERT_EQ(read_content2, "llo"); - ASSERT_EQ(reader->GetPosition().Value(), 5); - ASSERT_TRUE(reader->Close().IsOk()); - }; + // read content + ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs->Open(index_path)); + auto reader = std::make_shared(in); + ASSERT_EQ(reader->GetLength().Value(), content.length()); + ASSERT_EQ(reader->GetPosition().Value(), 0); + std::string read_content(content.size(), 0); + ASSERT_TRUE(reader->Read(read_content.data(), read_content.size()).IsOk()); + ASSERT_EQ(read_content, content); + ASSERT_EQ(reader->GetPosition().Value(), content.size()); - check_write_and_read(LuminaFileWriter::kMaxWriteSize, LuminaFileReader::kMaxReadSize); - check_write_and_read(2, 2); + // test seek + ASSERT_TRUE(reader->Seek(2).IsOk()); + std::string read_content2(3, 0); + ASSERT_TRUE(reader->Read(read_content2.data(), read_content2.size()).IsOk()); + ASSERT_EQ(read_content2, "llo"); + ASSERT_EQ(reader->GetPosition().Value(), 5); + ASSERT_TRUE(reader->Close().IsOk()); } TEST_F(LuminaFileIOTest, TestReadAsync) { @@ -82,8 +74,7 @@ TEST_F(LuminaFileIOTest, TestReadAsync) { ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs->Open(index_path)); auto reader = std::make_shared(in); - auto check_read_result = [&](uint64_t max_read_size, std::string& read_content) { - reader->max_read_size_ = max_read_size; + auto check_read_result = [&](std::string& read_content) { bool read_finished = false; std::promise promise; std::future future = promise.get_future(); @@ -103,12 +94,10 @@ TEST_F(LuminaFileIOTest, TestReadAsync) { ASSERT_EQ(content.substr(0, read_content.size()), read_content); }; - for (auto max_read_size : {1, 2, 3, 4, 5, 10, 100}) { - std::string read_content(content.size(), '\0'); - check_read_result(max_read_size, read_content); - // test read empty - std::string empty_content; - check_read_result(max_read_size, empty_content); - } + std::string read_content(content.size(), '\0'); + check_read_result(read_content); + // test read empty + std::string empty_content; + check_read_result(empty_content); } } // namespace paimon::lumina::test diff --git a/src/paimon/global_index/lumina/lumina_file_reader.h b/src/paimon/global_index/lumina/lumina_file_reader.h index 346614455..8e36dbc40 100644 --- a/src/paimon/global_index/lumina/lumina_file_reader.h +++ b/src/paimon/global_index/lumina/lumina_file_reader.h @@ -16,10 +16,8 @@ #pragma once -#include -#include +#include #include -#include #include "fmt/format.h" #include "lumina/io/FileReader.h" @@ -41,11 +39,11 @@ class LuminaFileReader : public ::lumina::io::FileReader { } ::lumina::core::Result GetPosition() const noexcept override { - Result pos_result = in_->GetPos(); + Result pos_result = in_->GetPos(); if (!pos_result.ok()) { return ::lumina::core::Result::Err(PaimonToLuminaStatus(pos_result.status())); } - return ::lumina::core::Result::Ok(static_cast(pos_result.value())); + return ::lumina::core::Result::Ok(pos_result.value()); } ::lumina::core::Status Seek(uint64_t position) noexcept override { @@ -53,21 +51,15 @@ class LuminaFileReader : public ::lumina::io::FileReader { } ::lumina::core::Status Read(char* data, uint64_t size) noexcept override { - uint64_t total_read_size = 0; - while (total_read_size < size) { - uint64_t current_read_size = std::min(size - total_read_size, max_read_size_); - Result read_result = - in_->Read(data + total_read_size, static_cast(current_read_size)); - if (!read_result.ok()) { - return PaimonToLuminaStatus(read_result.status()); - } - if (static_cast(read_result.value()) != current_read_size) { - return ::lumina::core::Status( - ::lumina::core::ErrorCode::IoError, - fmt::format("expect read len {} mismatch actual read len {}", current_read_size, - read_result.value())); - } - total_read_size += current_read_size; + Result read_result = in_->Read(data, size); + if (!read_result.ok()) { + return PaimonToLuminaStatus(read_result.status()); + } + if (read_result.value() != size) { + return ::lumina::core::Status( + ::lumina::core::ErrorCode::IoError, + fmt::format("expect read len {} mismatch actual read len {}", size, + read_result.value())); } return ::lumina::core::Status::Ok(); } @@ -79,50 +71,10 @@ class LuminaFileReader : public ::lumina::io::FileReader { return; } - struct ReadContext { - char* current_data; - uint64_t remaining; - uint64_t current_offset; - std::function final_call_back; - std::shared_ptr in; - }; - - auto ctx = std::make_shared( - ReadContext{data, size, offset, std::move(call_back), in_}); - - // recursive lambda to read next chunk - std::function read_next; - read_next = [ctx, max_read_size = max_read_size_, &read_next]() { - if (ctx->remaining == 0) { - // all done - ctx->final_call_back(::lumina::core::Status::Ok()); - return; - } - - // determine this chunk's size - uint64_t chunk_size = std::min(ctx->remaining, max_read_size); - auto safe_size = static_cast(chunk_size); - - // issue async read for this chunk - ctx->in->ReadAsync(ctx->current_data, safe_size, ctx->current_offset, - [ctx, safe_size, read_next](const Status& status) { - if (!status.ok()) { - // propagate error immediately - ctx->final_call_back(PaimonToLuminaStatus(status)); - return; - } - // advance pointers and counters - ctx->current_data += safe_size; - ctx->current_offset += safe_size; - ctx->remaining -= safe_size; - - // continue with next chunk - read_next(); - }); - }; - - // start the first read - read_next(); + in_->ReadAsync(data, size, offset, + [call_back = std::move(call_back)](const Status& status) { + call_back(PaimonToLuminaStatus(status)); + }); } ::lumina::core::Status Close() noexcept override { @@ -130,10 +82,6 @@ class LuminaFileReader : public ::lumina::io::FileReader { } private: - static constexpr uint64_t kMaxReadSize = std::numeric_limits::max(); - - private: - uint64_t max_read_size_ = kMaxReadSize; std::shared_ptr in_; }; } // namespace paimon::lumina diff --git a/src/paimon/global_index/lumina/lumina_file_writer.h b/src/paimon/global_index/lumina/lumina_file_writer.h index 09f72634a..c787ca52c 100644 --- a/src/paimon/global_index/lumina/lumina_file_writer.h +++ b/src/paimon/global_index/lumina/lumina_file_writer.h @@ -16,8 +16,6 @@ #pragma once -#include -#include #include #include "fmt/format.h" @@ -31,29 +29,23 @@ class LuminaFileWriter : public ::lumina::io::FileWriter { ~LuminaFileWriter() override = default; ::lumina::core::Result GetLength() const noexcept override { - Result pos_result = out_->GetPos(); + Result pos_result = out_->GetPos(); if (!pos_result.ok()) { return ::lumina::core::Result::Err(PaimonToLuminaStatus(pos_result.status())); } - return ::lumina::core::Result::Ok(static_cast(pos_result.value())); + return ::lumina::core::Result::Ok(pos_result.value()); } ::lumina::core::Status Write(const char* data, uint64_t size) noexcept override { - uint64_t total_write_size = 0; - while (total_write_size < size) { - uint64_t current_write_size = std::min(size - total_write_size, max_write_size_); - Result write_result = - out_->Write(data + total_write_size, static_cast(current_write_size)); - if (!write_result.ok()) { - return PaimonToLuminaStatus(write_result.status()); - } - if (static_cast(write_result.value()) != current_write_size) { - return ::lumina::core::Status( - ::lumina::core::ErrorCode::IoError, - fmt::format("expect write len {} mismatch actual write len {}", - current_write_size, write_result.value())); - } - total_write_size += current_write_size; + Result write_result = out_->Write(data, size); + if (!write_result.ok()) { + return PaimonToLuminaStatus(write_result.status()); + } + if (write_result.value() != size) { + return ::lumina::core::Status( + ::lumina::core::ErrorCode::IoError, + fmt::format("expect write len {} mismatch actual write len {}", size, + write_result.value())); } return ::lumina::core::Status::Ok(); } @@ -67,10 +59,6 @@ class LuminaFileWriter : public ::lumina::io::FileWriter { } private: - static constexpr uint64_t kMaxWriteSize = std::numeric_limits::max(); - - private: - uint64_t max_write_size_ = kMaxWriteSize; std::shared_ptr out_; }; } // namespace paimon::lumina diff --git a/src/paimon/testing/mock/mock_file_system.h b/src/paimon/testing/mock/mock_file_system.h index dbfef49c7..50397aaea 100644 --- a/src/paimon/testing/mock/mock_file_system.h +++ b/src/paimon/testing/mock/mock_file_system.h @@ -32,16 +32,16 @@ class MockInputStream : public InputStream { Status Seek(int64_t offset, SeekOrigin origin) override { return Status::OK(); } - Result GetPos() const override { + Result GetPos() const override { return 0; } - Result Read(char* buffer, uint32_t size) override { + Result Read(char* buffer, uint64_t size) override { return 0; } - Result Read(char* buffer, uint32_t size, uint64_t offset) override { + Result Read(char* buffer, uint64_t size, uint64_t offset) override { return 0; } - void ReadAsync(char* buffer, uint32_t size, uint64_t offset, + void ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) override {} Status Close() override { @@ -60,10 +60,10 @@ class MockOutputStream : public OutputStream { MockOutputStream() = default; ~MockOutputStream() override = default; - Result GetPos() const override { + Result GetPos() const override { return 0; } - Result Write(const char* buffer, uint32_t size) override { + Result Write(const char* buffer, uint64_t size) override { return 0; } Status Flush() override { diff --git a/src/paimon/testing/mock/mock_format_writer.cpp b/src/paimon/testing/mock/mock_format_writer.cpp index 7e01f1857..28a4fbab9 100644 --- a/src/paimon/testing/mock/mock_format_writer.cpp +++ b/src/paimon/testing/mock/mock_format_writer.cpp @@ -38,8 +38,8 @@ MockFormatWriter::MockFormatWriter(const std::shared_ptr& out, Status MockFormatWriter::AddBatch(ArrowArray* batch) { ArrowArrayRelease(batch); std::string str = std::to_string(DateTimeUtils::GetCurrentUTCTimeUs()) + "\n"; - PAIMON_ASSIGN_OR_RAISE(int32_t res, out_->Write(str.data(), str.size())); - if (res != static_cast(str.size())) { + PAIMON_ASSIGN_OR_RAISE(uint64_t res, out_->Write(str.data(), str.size())); + if (res != str.size()) { return Status::IOError("write size does not match"); } counter_++; diff --git a/test/inte/read_inte_test.cpp b/test/inte/read_inte_test.cpp index 5211fdd8c..7ca302581 100644 --- a/test/inte/read_inte_test.cpp +++ b/test/inte/read_inte_test.cpp @@ -2952,18 +2952,18 @@ TEST_P(ReadInteTest, TestSpecificFs) { Status Seek(int64_t offset, SeekOrigin origin) override { return input_->Seek(offset, origin); } - Result GetPos() const override { + Result GetPos() const override { return input_->GetPos(); } - Result Read(char* buffer, uint32_t size) override { + Result Read(char* buffer, uint64_t size) override { (*io_count_)++; return input_->Read(buffer, size); } - Result Read(char* buffer, uint32_t size, uint64_t offset) override { + Result Read(char* buffer, uint64_t size, uint64_t offset) override { (*io_count_)++; return input_->Read(buffer, size, offset); } - void ReadAsync(char* buffer, uint32_t size, uint64_t offset, + void ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) override { (*io_count_)++; return input_->ReadAsync(buffer, size, offset, std::move(callback)); diff --git a/test/inte/write_and_read_inte_test.cpp b/test/inte/write_and_read_inte_test.cpp index 9923fcbcf..0eb678b70 100644 --- a/test/inte/write_and_read_inte_test.cpp +++ b/test/inte/write_and_read_inte_test.cpp @@ -855,9 +855,9 @@ TEST_P(WriteAndReadInteTest, TestWriteSamePartitionTwiceWithAllBasicTypesForPk) std::vector> GetTestValuesForWriteAndReadInteTest() { std::vector> values = {{"parquet", "local"}}; + // values.emplace_back("parquet", "jindo"); #ifdef PAIMON_ENABLE_ORC values.emplace_back("orc", "local"); - // values.emplace_back("parquet", "jindo"); #endif #ifdef PAIMON_ENABLE_LANCE values.emplace_back("lance", "local"); From c4f12bcd999744399fae97ddc6ddf1ec103c1a2d Mon Sep 17 00:00:00 2001 From: "jinli.zjw" Date: Wed, 3 Jun 2026 10:39:58 +0800 Subject: [PATCH 2/4] fix review --- include/paimon/io/buffered_input_stream.h | 4 +- src/paimon/common/data/blob.cpp | 16 +++++- .../common/io/buffered_input_stream.cpp | 4 +- src/paimon/common/io/offset_input_stream.cpp | 52 +++++++++---------- src/paimon/common/io/offset_input_stream.h | 11 ++-- src/paimon/common/utils/math_test.cpp | 6 +++ src/paimon/fs/jindo/jindo_file_system.cpp | 10 ++-- .../fs/jindo/jindo_file_system_test.cpp | 2 +- 8 files changed, 60 insertions(+), 45 deletions(-) diff --git a/include/paimon/io/buffered_input_stream.h b/include/paimon/io/buffered_input_stream.h index eb452b9cf..362c0cbc5 100644 --- a/include/paimon/io/buffered_input_stream.h +++ b/include/paimon/io/buffered_input_stream.h @@ -42,7 +42,7 @@ class PAIMON_EXPORT BufferedInputStream : public InputStream { /// @param in The underlying input stream to wrap. /// @param buffer_size Size of the internal buffer in bytes. /// @param pool Memory pool for buffer allocation. - BufferedInputStream(const std::shared_ptr& in, int32_t buffer_size, + BufferedInputStream(const std::shared_ptr& in, uint64_t buffer_size, MemoryPool* pool); ~BufferedInputStream() noexcept override; @@ -64,7 +64,7 @@ class PAIMON_EXPORT BufferedInputStream : public InputStream { Result GetUri() const override; - static constexpr int32_t DEFAULT_BUFFER_SIZE = 8192; + static constexpr uint64_t DEFAULT_BUFFER_SIZE = 8192; private: /// Fill the internal buffer from the underlying stream. diff --git a/src/paimon/common/data/blob.cpp b/src/paimon/common/data/blob.cpp index e17455f1e..ff663763a 100644 --- a/src/paimon/common/data/blob.cpp +++ b/src/paimon/common/data/blob.cpp @@ -94,8 +94,20 @@ Result> Blob::NewInputStream( PAIMON_ASSIGN_OR_RAISE(std::unique_ptr file, fs->Open(impl_->GetDescriptor()->Uri())); - return OffsetInputStream::Create(std::move(file), impl_->GetDescriptor()->Length(), - impl_->GetDescriptor()->Offset()); + int64_t blob_length = impl_->GetDescriptor()->Length(); + int64_t blob_offset = impl_->GetDescriptor()->Offset(); + + PAIMON_ASSIGN_OR_RAISE(int64_t total_length, file->Length()); + if (PAIMON_UNLIKELY(blob_offset > total_length)) { + return Status::Invalid( + fmt::format("offset {} exceed total length {}", blob_offset, total_length)); + } + if (blob_length == -1) { + // blob_length == -1 means it's dynamic length, should read to the end + blob_length = total_length - blob_offset; + } + + return OffsetInputStream::Create(std::move(file), blob_length, blob_offset); } Result> Blob::ToData(const std::shared_ptr& fs, diff --git a/src/paimon/common/io/buffered_input_stream.cpp b/src/paimon/common/io/buffered_input_stream.cpp index ef23fdfe3..d71684cba 100644 --- a/src/paimon/common/io/buffered_input_stream.cpp +++ b/src/paimon/common/io/buffered_input_stream.cpp @@ -28,8 +28,8 @@ namespace paimon { class MemoryPool; BufferedInputStream::BufferedInputStream(const std::shared_ptr& in, - int32_t buffer_size, MemoryPool* pool) - : buffer_size_(static_cast(buffer_size)), in_(in) { + uint64_t buffer_size, MemoryPool* pool) + : buffer_size_(buffer_size), in_(in) { assert(buffer_size > 0); buffer_ = std::make_unique(buffer_size, pool); } diff --git a/src/paimon/common/io/offset_input_stream.cpp b/src/paimon/common/io/offset_input_stream.cpp index 69bb49084..ae5c5b081 100644 --- a/src/paimon/common/io/offset_input_stream.cpp +++ b/src/paimon/common/io/offset_input_stream.cpp @@ -23,26 +23,22 @@ namespace paimon { Result> OffsetInputStream::Create( - const std::shared_ptr& wrapped, int64_t length, int64_t offset) { + const std::shared_ptr& wrapped, uint64_t length, uint64_t offset) { if (PAIMON_UNLIKELY(wrapped == nullptr)) { return Status::Invalid("input stream is null pointer"); } - if (PAIMON_UNLIKELY(offset < 0)) { - return Status::Invalid(fmt::format("offset {} is less than 0", offset)); + if (PAIMON_UNLIKELY(offset == 0)) { + return Status::Invalid(fmt::format("offset {} is invalid", offset)); } - if (PAIMON_UNLIKELY(length < -1)) { - return Status::Invalid(fmt::format("length {} is less than -1", length)); + if (PAIMON_UNLIKELY(length == 0)) { + return Status::Invalid(fmt::format("length {} is invalid", length)); } PAIMON_ASSIGN_OR_RAISE(uint64_t total_length, wrapped->Length()); - if (PAIMON_UNLIKELY((uint64_t)offset > total_length)) { + if (PAIMON_UNLIKELY(offset > total_length)) { return Status::Invalid( fmt::format("offset {} exceed total length {}", offset, total_length)); } - if (length == -1) { - // length == -1 means it's dynamic length, should read to the end - length = total_length - offset; - } - if (PAIMON_UNLIKELY((uint64_t)offset + (uint64_t)length > total_length)) { + if (PAIMON_UNLIKELY(length > total_length - offset)) { return Status::Invalid(fmt::format("offset {} + length {} exceed total length {}", offset, length, total_length)); } @@ -51,8 +47,8 @@ Result> OffsetInputStream::Create( new OffsetInputStream(std::move(wrapped), length, offset)); } -OffsetInputStream::OffsetInputStream(const std::shared_ptr& wrapped, int64_t length, - int64_t offset) +OffsetInputStream::OffsetInputStream(const std::shared_ptr& wrapped, uint64_t length, + uint64_t offset) : wrapped_(wrapped), length_(length), offset_(offset) {} Status OffsetInputStream::Seek(int64_t offset, SeekOrigin origin) { @@ -60,20 +56,22 @@ Status OffsetInputStream::Seek(int64_t offset, SeekOrigin origin) { switch (origin) { case SeekOrigin::FS_SEEK_SET: { new_position = offset; - PAIMON_RETURN_NOT_OK(AssertBoundary(new_position)); - PAIMON_RETURN_NOT_OK(wrapped_->Seek(offset_ + new_position, SeekOrigin::FS_SEEK_SET)); + PAIMON_RETURN_NOT_OK(AssertBoundary(static_cast(new_position))); + PAIMON_RETURN_NOT_OK(wrapped_->Seek(static_cast(offset_) + new_position, + SeekOrigin::FS_SEEK_SET)); break; } case SeekOrigin::FS_SEEK_CUR: { new_position = static_cast(inner_position_) + offset; - PAIMON_RETURN_NOT_OK(AssertBoundary(new_position)); + PAIMON_RETURN_NOT_OK(AssertBoundary(static_cast(new_position))); PAIMON_RETURN_NOT_OK(wrapped_->Seek(offset, SeekOrigin::FS_SEEK_CUR)); break; } case SeekOrigin::FS_SEEK_END: { - new_position = length_ + offset; - PAIMON_RETURN_NOT_OK(AssertBoundary(new_position)); - PAIMON_RETURN_NOT_OK(wrapped_->Seek(offset_ + new_position, SeekOrigin::FS_SEEK_SET)); + new_position = static_cast(length_) + offset; + PAIMON_RETURN_NOT_OK(AssertBoundary(static_cast(new_position))); + PAIMON_RETURN_NOT_OK(wrapped_->Seek(static_cast(offset_) + new_position, + SeekOrigin::FS_SEEK_SET)); break; } default: @@ -85,25 +83,25 @@ Status OffsetInputStream::Seek(int64_t offset, SeekOrigin origin) { } Result OffsetInputStream::Read(char* buffer, uint64_t size) { - PAIMON_RETURN_NOT_OK(AssertBoundary(static_cast(inner_position_ + size))); + PAIMON_RETURN_NOT_OK(AssertBoundary(inner_position_ + size)); PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_len, wrapped_->Read(buffer, size)); inner_position_ += actual_read_len; return actual_read_len; } Result OffsetInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { - PAIMON_RETURN_NOT_OK(AssertBoundary(static_cast(offset + size))); - return wrapped_->Read(buffer, size, static_cast(offset_) + offset); + PAIMON_RETURN_NOT_OK(AssertBoundary(offset + size)); + return wrapped_->Read(buffer, size, offset_ + offset); } void OffsetInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, std::function&& callback) { - auto status = AssertBoundary(static_cast(offset + size)); + auto status = AssertBoundary(offset + size); if (!status.ok()) { callback(status); return; } - wrapped_->ReadAsync(buffer, size, static_cast(offset_) + offset, std::move(callback)); + wrapped_->ReadAsync(buffer, size, offset_ + offset, std::move(callback)); } Status OffsetInputStream::Close() { @@ -119,11 +117,11 @@ Result OffsetInputStream::GetPos() const { } Result OffsetInputStream::Length() const { - return static_cast(length_); + return length_; } -Status OffsetInputStream::AssertBoundary(int64_t inner_pos) const { - if (inner_pos < 0 || inner_pos > length_) { +Status OffsetInputStream::AssertBoundary(uint64_t inner_pos) const { + if (inner_pos > length_) { return Status::Invalid( fmt::format("OffsetInputStream assert boundary failed: inner pos {} exceed length {}", inner_pos, length_)); diff --git a/src/paimon/common/io/offset_input_stream.h b/src/paimon/common/io/offset_input_stream.h index c296b90bb..f9e8c7eb2 100644 --- a/src/paimon/common/io/offset_input_stream.h +++ b/src/paimon/common/io/offset_input_stream.h @@ -31,7 +31,7 @@ namespace paimon { class PAIMON_EXPORT OffsetInputStream : public InputStream { public: static Result> Create( - const std::shared_ptr& wrapped, int64_t length, int64_t offset); + const std::shared_ptr& wrapped, uint64_t length, uint64_t offset); ~OffsetInputStream() override = default; Status Seek(int64_t offset, SeekOrigin origin) override; @@ -52,13 +52,14 @@ class PAIMON_EXPORT OffsetInputStream : public InputStream { Result GetUri() const override; private: - OffsetInputStream(const std::shared_ptr& wrapped, int64_t length, int64_t offset); - Status AssertBoundary(int64_t inner_pos) const; + OffsetInputStream(const std::shared_ptr& wrapped, uint64_t length, + uint64_t offset); + Status AssertBoundary(uint64_t inner_pos) const; private: std::shared_ptr wrapped_; - const int64_t length_; - const int64_t offset_; + const uint64_t length_; + const uint64_t offset_; uint64_t inner_position_ = 0; }; } // namespace paimon diff --git a/src/paimon/common/utils/math_test.cpp b/src/paimon/common/utils/math_test.cpp index 1a3751c5c..4beba6530 100644 --- a/src/paimon/common/utils/math_test.cpp +++ b/src/paimon/common/utils/math_test.cpp @@ -19,6 +19,7 @@ #include #include "gtest/gtest.h" +#include "paimon/testing/utils/testharness.h" namespace paimon::test { @@ -71,6 +72,11 @@ TEST(MathTest, InRange) { ASSERT_FALSE(InRange(std::numeric_limits::max())); ASSERT_TRUE(InRange(std::numeric_limits::max())); ASSERT_FALSE(InRange(static_cast(std::numeric_limits::max()) + 1)); + + ASSERT_OK(ValidateValueInRange( + static_cast(std::numeric_limits::lowest()), "signed value")); + ASSERT_NOK_WITH_MSG(ValidateValueInRange(-1, "negative value"), + "negative value -1 is out of bound of type"); } } // namespace paimon::test diff --git a/src/paimon/fs/jindo/jindo_file_system.cpp b/src/paimon/fs/jindo/jindo_file_system.cpp index b34b42b1c..1fc94dbb7 100644 --- a/src/paimon/fs/jindo/jindo_file_system.cpp +++ b/src/paimon/fs/jindo/jindo_file_system.cpp @@ -214,13 +214,12 @@ Result JindoInputStream::Length() const { } Result JindoInputStream::Read(char* buffer, uint64_t size) { - PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->read(static_cast(size), &result_, buffer)); + PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->read(size, &result_, buffer)); return result_.length(); } Result JindoInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { - PAIMON_RETURN_NOT_OK_FROM_JINDO( - reader_->pread(offset, static_cast(size), &result_, buffer)); + PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->pread(offset, size, &result_, buffer)); return result_.length(); } @@ -229,8 +228,7 @@ void JindoInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, auto outer_callback = [=](JdoStatus status) { callback(status.ok() ? Status::OK() : Status::IOError(status.errMsg())); }; - auto task = - reader_->preadAsync(offset, static_cast(size), &result_, buffer, outer_callback); + auto task = reader_->preadAsync(offset, size, &result_, buffer, outer_callback); assert(task); [[maybe_unused]] auto status = task->perform(); } @@ -260,7 +258,7 @@ Result JindoOutputStream::GetPos() const { } Result JindoOutputStream::Write(const char* buffer, uint64_t size) { - std::string_view data(buffer, static_cast(size)); + std::string_view data(buffer, size); PAIMON_RETURN_NOT_OK_FROM_JINDO(writer_->write(data)); return size; } diff --git a/src/paimon/fs/jindo/jindo_file_system_test.cpp b/src/paimon/fs/jindo/jindo_file_system_test.cpp index 62605d601..e85890eb5 100644 --- a/src/paimon/fs/jindo/jindo_file_system_test.cpp +++ b/src/paimon/fs/jindo/jindo_file_system_test.cpp @@ -33,7 +33,7 @@ class JindoFileSystemTest : public ::testing::Test { fs_.reset(); } - private: + protected: std::unique_ptr dir_; std::string test_dir_; std::shared_ptr fs_; From 64452121382dbf2a444780b8b652f116305d619b Mon Sep 17 00:00:00 2001 From: "jinli.zjw" Date: Wed, 3 Jun 2026 16:59:22 +0800 Subject: [PATCH 3/4] fix --- src/paimon/common/io/offset_input_stream.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/paimon/common/io/offset_input_stream.cpp b/src/paimon/common/io/offset_input_stream.cpp index ae5c5b081..302419b88 100644 --- a/src/paimon/common/io/offset_input_stream.cpp +++ b/src/paimon/common/io/offset_input_stream.cpp @@ -27,12 +27,6 @@ Result> OffsetInputStream::Create( if (PAIMON_UNLIKELY(wrapped == nullptr)) { return Status::Invalid("input stream is null pointer"); } - if (PAIMON_UNLIKELY(offset == 0)) { - return Status::Invalid(fmt::format("offset {} is invalid", offset)); - } - if (PAIMON_UNLIKELY(length == 0)) { - return Status::Invalid(fmt::format("length {} is invalid", length)); - } PAIMON_ASSIGN_OR_RAISE(uint64_t total_length, wrapped->Length()); if (PAIMON_UNLIKELY(offset > total_length)) { return Status::Invalid( From 94e7101c1a98b413046ae1a071018ef0fe691d65 Mon Sep 17 00:00:00 2001 From: "jinli.zjw" Date: Wed, 3 Jun 2026 18:50:47 +0800 Subject: [PATCH 4/4] feat(fs): use int64 file IO sizes --- include/paimon/fs/file_system.h | 16 +++---- include/paimon/io/buffered_input_stream.h | 14 +++--- include/paimon/io/byte_array_input_stream.h | 10 ++--- include/paimon/io/data_input_stream.h | 10 ++--- src/paimon/common/data/blob_test.cpp | 4 +- .../bitmap/bitmap_file_index_test.cpp | 2 +- src/paimon/common/fs/file_system.cpp | 14 ++++-- src/paimon/common/fs/file_system_test.cpp | 34 +++++++------- .../wrap/file_index_writer_wrapper.h | 2 +- .../common/io/buffered_input_stream.cpp | 39 ++++++++-------- .../common/io/byte_array_input_stream.cpp | 19 ++++---- src/paimon/common/io/cache_input_stream.h | 27 +++++++++--- .../common/io/cache_input_stream_test.cpp | 12 ++--- .../io/data_input_output_stream_test.cpp | 4 +- src/paimon/common/io/data_input_stream.cpp | 33 ++++++++------ src/paimon/common/io/data_output_stream.cpp | 12 ++--- src/paimon/common/io/data_output_stream.h | 6 +-- src/paimon/common/io/offset_input_stream.cpp | 40 ++++++++++------- src/paimon/common/io/offset_input_stream.h | 23 +++++----- src/paimon/common/sst/sst_file_reader.cpp | 2 +- src/paimon/common/sst/sst_file_writer.cpp | 4 +- .../arrow/arrow_input_stream_adapter.cpp | 34 +++++++++----- .../arrow/arrow_output_stream_adapter.cpp | 12 ++--- .../utils/arrow/arrow_stream_adapter_test.cpp | 2 +- src/paimon/common/utils/roaring_bitmap32.cpp | 4 +- src/paimon/common/utils/roaring_bitmap64.cpp | 4 +- src/paimon/common/utils/stream_utils.h | 25 ++++++----- .../deletionvectors/deletion_file_writer.cpp | 2 +- .../deletionvectors/deletion_file_writer.h | 2 +- .../lookup/remote_lookup_file_manager.cpp | 14 +++--- src/paimon/core/utils/objects_file.h | 2 +- .../format/avro/avro_file_batch_reader.cpp | 2 +- .../format/avro/avro_input_stream_impl.cpp | 14 ++++-- .../format/avro/avro_output_stream_impl.cpp | 16 ++++--- .../format/blob/blob_file_batch_reader.cpp | 2 +- src/paimon/format/blob/blob_format_writer.cpp | 33 +++++++++----- .../format/blob/blob_format_writer_test.cpp | 6 +-- .../format/orc/orc_input_stream_impl.cpp | 21 ++++++--- .../format/orc/orc_output_stream_impl.cpp | 18 ++++++-- .../parquet/column_index_filter_test.cpp | 2 +- .../parquet/file_reader_wrapper_test.cpp | 2 +- .../page_filtered_row_group_reader_test.cpp | 12 ++--- .../format/parquet/parquet_reader_builder.h | 2 +- .../parquet/parquet_stats_extractor.cpp | 2 +- .../parquet/predicate_pushdown_test.cpp | 2 +- src/paimon/fs/jindo/jindo_file_status.h | 2 +- src/paimon/fs/jindo/jindo_file_system.cpp | 20 ++++----- src/paimon/fs/jindo/jindo_file_system.h | 14 +++--- .../fs/jindo/jindo_file_system_test.cpp | 2 +- src/paimon/fs/local/local_file.cpp | 31 ++++++++----- src/paimon/fs/local/local_file.h | 12 ++--- src/paimon/fs/local/local_file_status.h | 6 +-- src/paimon/fs/local/local_file_system.cpp | 20 ++++----- src/paimon/fs/local/local_file_system.h | 14 +++--- src/paimon/fs/local/local_file_test.cpp | 12 ++--- .../global_index/lumina/lumina_file_reader.h | 44 +++++++++++++++---- .../global_index/lumina/lumina_file_writer.h | 17 +++++-- src/paimon/testing/mock/mock_file_system.h | 16 +++---- .../testing/mock/mock_format_writer.cpp | 4 +- src/paimon/testing/utils/test_helper.h | 4 +- test/inte/read_inte_test.cpp | 10 ++--- 61 files changed, 468 insertions(+), 322 deletions(-) diff --git a/include/paimon/fs/file_system.h b/include/paimon/fs/file_system.h index 89af108b2..55207a63f 100644 --- a/include/paimon/fs/file_system.h +++ b/include/paimon/fs/file_system.h @@ -65,7 +65,7 @@ class PAIMON_EXPORT InputStream : public Stream { /// @return Current position in the input stream. /// @return IOError returned if an I/O error occurred in the underlying stream. /// implementation while accessing the stream's position. - virtual Result GetPos() const = 0; + virtual Result GetPos() const = 0; /// Read data from the current position in the stream. /// @param[out] buffer Pointer to the buffer where read data will be stored. @@ -73,7 +73,7 @@ class PAIMON_EXPORT InputStream : public Stream { /// @return Result containing the actual number of bytes read on success, or an error status on /// failure. /// @note The stream position advances by the number of bytes actually read. - virtual Result Read(char* buffer, uint64_t size) = 0; + virtual Result Read(char* buffer, int64_t size) = 0; /// Read data from given position in the stream. /// @@ -83,7 +83,7 @@ class PAIMON_EXPORT InputStream : public Stream { /// @param[out] buffer The buffer to store the read content. /// @param size The number of bytes to read. /// @param offset The position in the stream to read from. - virtual Result Read(char* buffer, uint64_t size, uint64_t offset) = 0; + virtual Result Read(char* buffer, int64_t size, int64_t offset) = 0; /// Asynchronously read data from the input stream. /// @@ -98,7 +98,7 @@ class PAIMON_EXPORT InputStream : public Stream { /// @param callback The callback function to be invoked upon completion of the read operation. /// The callback will receive a Status object indicating the success or failure /// of the read operation. - virtual void ReadAsync(char* buffer, uint64_t size, uint64_t offset, + virtual void ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) = 0; /// Get an identifier that uniquely identify the underlying content. @@ -107,7 +107,7 @@ class PAIMON_EXPORT InputStream : public Stream { virtual Result GetUri() const = 0; /// Get the total length of the file in bytes. - virtual Result Length() const = 0; + virtual Result Length() const = 0; }; /// Abstract class for output stream operations. @@ -121,12 +121,12 @@ class PAIMON_EXPORT OutputStream : public Stream { /// @return Result containing the actual number of bytes written on success, or an error status /// on failure. /// @note The stream position advances by the number of bytes actually written. - virtual Result Write(const char* buffer, uint64_t size) = 0; + virtual Result Write(const char* buffer, int64_t size) = 0; /// Flush pending data to the disk. virtual Status Flush() = 0; /// Get the write position. - virtual Result GetPos() const = 0; + virtual Result GetPos() const = 0; /// Get the uri of the output stream. virtual Result GetUri() const = 0; }; @@ -160,7 +160,7 @@ class PAIMON_EXPORT FileStatus { /// Get the size of the file in bytes. /// @note For directories, this method is undefined behavior. - virtual uint64_t GetLen() const = 0; + virtual int64_t GetLen() const = 0; /// Check if this entry represents a directory. virtual bool IsDir() const = 0; diff --git a/include/paimon/io/buffered_input_stream.h b/include/paimon/io/buffered_input_stream.h index 362c0cbc5..0c6dccdb2 100644 --- a/include/paimon/io/buffered_input_stream.h +++ b/include/paimon/io/buffered_input_stream.h @@ -49,16 +49,16 @@ class PAIMON_EXPORT BufferedInputStream : public InputStream { Status Seek(int64_t offset, SeekOrigin origin) override; - Result GetPos() const override; + Result GetPos() const override; - Result Read(char* buffer, uint64_t size) override; + Result Read(char* buffer, int64_t size) override; - Result Read(char* buffer, uint64_t size, uint64_t offset) override; + Result Read(char* buffer, int64_t size, int64_t offset) override; - void ReadAsync(char* buffer, uint64_t size, uint64_t offset, + void ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) override; - Result Length() const override; + Result Length() const override; Status Close() override; @@ -72,10 +72,10 @@ class PAIMON_EXPORT BufferedInputStream : public InputStream { /// Internal read implementation. /// @pre size > 0 - Result InnerRead(char* buffer, uint64_t size); + Result InnerRead(char* buffer, int64_t size); /// Validate that the expected number of bytes were read. - Status AssertReadLength(uint64_t read_length, uint64_t actual_read_length) const; + Status AssertReadLength(int64_t read_length, int64_t actual_read_length) const; private: uint64_t buffer_size_; diff --git a/include/paimon/io/byte_array_input_stream.h b/include/paimon/io/byte_array_input_stream.h index c387f8f97..7d7b6a4b6 100644 --- a/include/paimon/io/byte_array_input_stream.h +++ b/include/paimon/io/byte_array_input_stream.h @@ -37,18 +37,18 @@ class PAIMON_EXPORT ByteArrayInputStream : public InputStream { Status Seek(int64_t offset, SeekOrigin origin) override; - Result GetPos() const override { + Result GetPos() const override { return position_; } - Result Read(char* buffer, uint64_t size) override; + Result Read(char* buffer, int64_t size) override; - Result Read(char* buffer, uint64_t size, uint64_t offset) override; + Result Read(char* buffer, int64_t size, int64_t offset) override; - void ReadAsync(char* buffer, uint64_t size, uint64_t offset, + void ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) override; - Result Length() const override { + Result Length() const override { return length_; } diff --git a/include/paimon/io/data_input_stream.h b/include/paimon/io/data_input_stream.h index f4e574962..4ecdfa1e9 100644 --- a/include/paimon/io/data_input_stream.h +++ b/include/paimon/io/data_input_stream.h @@ -55,17 +55,17 @@ class PAIMON_EXPORT DataInputStream { /// Read raw data of specified size from the stream. /// @param data Buffer to store the read data. /// @param size Number of bytes to read. - Status Read(char* data, uint64_t size) const; + Status Read(char* data, int64_t size) const; /// Read string from the stream. /// @note First read length (int16), then read string bytes. Result ReadString() const; /// Get the current position in the underlying input stream. - Result GetPos() const; + Result GetPos() const; /// Get the total length of the underlying input stream. - Result Length() const; + Result Length() const; /// Set the byte order for endianness conversion. /// @param order The byte order to use `PAIMON_BIG_ENDIAN` or `PAIMON_LITTLE_ENDIAN`. @@ -77,11 +77,11 @@ class PAIMON_EXPORT DataInputStream { /// Validate that the expected number of bytes were read. /// @param read_length Expected number of bytes to read. /// @param actual_read_length Actual number of bytes read. - Status AssertReadLength(uint64_t read_length, uint64_t actual_read_length) const; + Status AssertReadLength(int64_t read_length, int64_t actual_read_length) const; /// Check if there are enough bytes available to read. /// @param need_length Number of bytes needed. - Status AssertBoundary(uint64_t need_length) const; + Status AssertBoundary(int64_t need_length) const; /// Determine if byte swapping is needed based on current byte order and system endianness. /// @return `true` if byte swapping is required, `false` otherwise. diff --git a/src/paimon/common/data/blob_test.cpp b/src/paimon/common/data/blob_test.cpp index 9a6f709e0..c68d10152 100644 --- a/src/paimon/common/data/blob_test.cpp +++ b/src/paimon/common/data/blob_test.cpp @@ -116,7 +116,7 @@ TEST_F(BlobTest, TestNewInputStreamWithOffsetAndLength) { ASSERT_OK_AND_ASSIGN(auto input_stream, blob->NewInputStream(file_system_)); ASSERT_TRUE(input_stream); - ASSERT_OK_AND_ASSIGN(uint64_t length, input_stream->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, input_stream->Length()); ASSERT_EQ(6, length); // Test reading with offset and length applied @@ -133,7 +133,7 @@ TEST_F(BlobTest, TestNewInputStreamWithDynamicLength) { ASSERT_OK_AND_ASSIGN(auto input_stream, blob->NewInputStream(file_system_)); ASSERT_TRUE(input_stream); - ASSERT_OK_AND_ASSIGN(uint64_t length, input_stream->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, input_stream->Length()); ASSERT_EQ(12, length); // Test reading from offset to end (should read "cdefghijklmn") diff --git a/src/paimon/common/file_index/bitmap/bitmap_file_index_test.cpp b/src/paimon/common/file_index/bitmap/bitmap_file_index_test.cpp index 9af960956..f29a74be7 100644 --- a/src/paimon/common/file_index/bitmap/bitmap_file_index_test.cpp +++ b/src/paimon/common/file_index/bitmap/bitmap_file_index_test.cpp @@ -623,7 +623,7 @@ TEST_F(BitmapIndexTest, TestHighCardinalityForCompatibility) { auto file_system = std::make_unique(); ASSERT_OK_AND_ASSIGN(std::shared_ptr input_stream, file_system->Open(index_file_name)); - ASSERT_OK_AND_ASSIGN(uint64_t length, input_stream->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, input_stream->Length()); BitmapFileIndex file_index({}); ASSERT_OK_AND_ASSIGN(auto reader, file_index.CreateReader(CreateArrowSchema(type).get(), diff --git a/src/paimon/common/fs/file_system.cpp b/src/paimon/common/fs/file_system.cpp index e22fa1dc0..db1d553bc 100644 --- a/src/paimon/common/fs/file_system.cpp +++ b/src/paimon/common/fs/file_system.cpp @@ -20,6 +20,7 @@ #include #include "fmt/format.h" +#include "paimon/common/utils/math.h" #include "paimon/common/utils/path_util.h" #include "paimon/common/utils/scope_guard.h" #include "paimon/common/utils/string_utils.h" @@ -48,9 +49,13 @@ Status FileSystem::ReadFile(const std::string& path, std::string* content) { Status s = in->Close(); (void)s; }); - PAIMON_ASSIGN_OR_RAISE(uint64_t length, in->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t length, in->Length()); + if (length < 0) { + return Status::Invalid( + fmt::format("path {}, file length {} is less than 0", path, length)); + } content->resize(static_cast(length)); - PAIMON_ASSIGN_OR_RAISE(uint64_t read_length, in->Read(content->data(), length)); + PAIMON_ASSIGN_OR_RAISE(int64_t read_length, in->Read(content->data(), length)); if (read_length != length) { return Status::IOError(fmt::format("path {}, expect read len {}, actual read len {}", path, length, read_length)); @@ -69,8 +74,9 @@ Status FileSystem::WriteFile(const std::string& path, const std::string& content Status s = out->Close(); (void)s; }); - uint64_t length = content.size(); - PAIMON_ASSIGN_OR_RAISE(uint64_t write_length, out->Write(content.data(), length)); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(content.size(), "content length")); + int64_t length = static_cast(content.size()); + PAIMON_ASSIGN_OR_RAISE(int64_t write_length, out->Write(content.data(), length)); if (write_length != length) { return Status::IOError(fmt::format("path {}, expect write len {}, actual write len {}", path, length, write_length)); diff --git a/src/paimon/common/fs/file_system_test.cpp b/src/paimon/common/fs/file_system_test.cpp index 49c6983ed..1d8222ef6 100644 --- a/src/paimon/common/fs/file_system_test.cpp +++ b/src/paimon/common/fs/file_system_test.cpp @@ -91,7 +91,7 @@ class FileSystemTest : public ::testing::Test, public ::testing::WithParamInterf fs_->Create(file, /*overwrite=*/true)); std::string input = "paimon"; char chars[8] = {1, 2, 3, 4, 5, 6, 7, 8}; - ASSERT_OK_AND_ASSIGN(uint64_t size, out->Write(chars, input.size())); + ASSERT_OK_AND_ASSIGN(int64_t size, out->Write(chars, input.size())); ASSERT_EQ(size, input.size()); ASSERT_OK(out->Flush()); ASSERT_OK(out->Close()); @@ -202,7 +202,7 @@ TEST_P(FileSystemTest, TestCreate) { ASSERT_OK_AND_ASSIGN(std::unique_ptr out, fs_->Create(path, /*overwrite=*/true)); ASSERT_TRUE(out); std::string input = "paimon"; - ASSERT_OK_AND_ASSIGN(uint64_t size, out->Write(input.data(), input.size())); + ASSERT_OK_AND_ASSIGN(int64_t size, out->Write(input.data(), input.size())); ASSERT_EQ(size, input.size()); ASSERT_OK(out->Close()); @@ -215,11 +215,11 @@ TEST_P(FileSystemTest, TestSimpleWriteAndRead) { std::string file_path = test_root_ + "/file.data"; // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(content.data(), content.size())); + ASSERT_OK_AND_ASSIGN(int64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_EQ(write_len, content.size()); ASSERT_OK(out_stream->Flush()); - ASSERT_OK_AND_ASSIGN(uint64_t pos, out_stream->GetPos()); + ASSERT_OK_AND_ASSIGN(int64_t pos, out_stream->GetPos()); ASSERT_EQ(pos, content.size()); ASSERT_OK_AND_ASSIGN(std::string uri, out_stream->GetUri()); @@ -245,7 +245,7 @@ TEST_P(FileSystemTest, TestSimpleWriteAndRead) { ASSERT_OK_AND_ASSIGN(uri, in_stream->GetUri()); ASSERT_EQ(uri, file_path); - ASSERT_OK_AND_ASSIGN(uint64_t file_len, in_stream->Length()); + ASSERT_OK_AND_ASSIGN(int64_t file_len, in_stream->Length()); ASSERT_EQ(file_len, content.size()); ASSERT_OK_AND_ASSIGN(pos, in_stream->GetPos()); @@ -260,11 +260,11 @@ TEST_P(FileSystemTest, TestWriteMultipleTimes) { // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); for (const auto& str : content_vec) { - ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(str.data(), str.size())); + ASSERT_OK_AND_ASSIGN(int64_t write_len, out_stream->Write(str.data(), str.size())); ASSERT_EQ(write_len, str.size()); } ASSERT_OK(out_stream->Flush()); - ASSERT_OK_AND_ASSIGN(uint64_t pos, out_stream->GetPos()); + ASSERT_OK_AND_ASSIGN(int64_t pos, out_stream->GetPos()); ASSERT_EQ(pos, content.size()); ASSERT_OK(out_stream->Close()); @@ -304,7 +304,7 @@ TEST_P(FileSystemTest, TestWriteEmptyFile) { // write process std::string content = ""; ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(content.data(), content.size())); + ASSERT_OK_AND_ASSIGN(int64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_EQ(write_len, 0); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); @@ -332,7 +332,7 @@ TEST_P(FileSystemTest, TestWriteWithOverwrite) { std::string file_path = test_root_ + "/file.data"; // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(content.data(), content.size())); + ASSERT_OK_AND_ASSIGN(int64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_EQ(write_len, content.size()); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); @@ -537,25 +537,25 @@ TEST_P(FileSystemTest, TestSeek) { std::string path = PathUtil::JoinPath(test_root_, "/test_file"); ASSERT_OK_AND_ASSIGN(std::unique_ptr out, fs_->Create(path, /*overwrite=*/true)); std::string input = "paimon"; - ASSERT_OK_AND_ASSIGN(uint64_t size, out->Write(input.data(), input.size())); + ASSERT_OK_AND_ASSIGN(int64_t size, out->Write(input.data(), input.size())); ASSERT_EQ(size, input.size()); ASSERT_OK(out->Close()); ASSERT_OK_AND_ASSIGN(std::unique_ptr in, fs_->Open(path)); ASSERT_OK(in->Seek(/*offset=*/1, SeekOrigin::FS_SEEK_SET)); - ASSERT_OK_AND_ASSIGN(uint64_t pos, in->GetPos()); + ASSERT_OK_AND_ASSIGN(int64_t pos, in->GetPos()); ASSERT_EQ(pos, 1); ASSERT_OK(in->Seek(/*offset=*/1, SeekOrigin::FS_SEEK_CUR)); - ASSERT_OK_AND_ASSIGN(uint64_t pos2, in->GetPos()); + ASSERT_OK_AND_ASSIGN(int64_t pos2, in->GetPos()); ASSERT_EQ(pos2, 2); ASSERT_OK(in->Seek(/*offset=*/-5, SeekOrigin::FS_SEEK_END)); - ASSERT_OK_AND_ASSIGN(uint64_t pos3, in->GetPos()); + ASSERT_OK_AND_ASSIGN(int64_t pos3, in->GetPos()); ASSERT_EQ(pos3, 1); std::string read_content(3, '\0'); - ASSERT_OK_AND_ASSIGN(uint64_t read_len, in->Read(read_content.data(), read_content.size())); + ASSERT_OK_AND_ASSIGN(int64_t read_len, in->Read(read_content.data(), read_content.size())); ASSERT_EQ(read_len, read_content.size()); ASSERT_EQ("aim", read_content); @@ -574,7 +574,7 @@ TEST_P(FileSystemTest, TestSeek2) { std::string file_path = test_root_ + "/file.data"; // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(content.data(), content.size())); + ASSERT_OK_AND_ASSIGN(int64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_EQ(write_len, content.size()); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); @@ -621,14 +621,14 @@ TEST_P(FileSystemTest, TestRename) { ASSERT_OK_AND_ASSIGN(std::unique_ptr out, fs_->Create(path, /*overwrite=*/true)); ASSERT_TRUE(out); std::string input = "paimon"; - ASSERT_OK_AND_ASSIGN(uint64_t size, out->Write(input.data(), input.size())); + ASSERT_OK_AND_ASSIGN(int64_t size, out->Write(input.data(), input.size())); ASSERT_EQ(size, input.size()); ASSERT_OK(out->Flush()); ASSERT_OK(out->Close()); ASSERT_OK_AND_ASSIGN(std::unique_ptr in, fs_->Open(path)); ASSERT_TRUE(in); char* data = new char[input.size() * 2]; - ASSERT_OK_AND_ASSIGN(uint64_t size_read, in->Read(data, input.size(), /*offset=*/0)); + ASSERT_OK_AND_ASSIGN(int64_t size_read, in->Read(data, input.size(), /*offset=*/0)); ASSERT_EQ(size_read, input.size()); std::string read_data(data, input.size()); ASSERT_EQ(read_data, input); diff --git a/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h b/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h index 65c461af1..82f03d04e 100644 --- a/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h +++ b/src/paimon/common/global_index/wrap/file_index_writer_wrapper.h @@ -58,7 +58,7 @@ class FileIndexWriterWrapper : public GlobalIndexWriter { file_manager_->NewOutputStream(file_name)); PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR bytes, writer_->SerializedBytes()); - PAIMON_ASSIGN_OR_RAISE(uint64_t actual_size, out->Write(bytes->data(), bytes->size())); + PAIMON_ASSIGN_OR_RAISE(int64_t actual_size, out->Write(bytes->data(), bytes->size())); if (actual_size != bytes->size()) { return Status::IOError(fmt::format("expect write len {} mismatch actual write len {}", bytes->size(), actual_size)); diff --git a/src/paimon/common/io/buffered_input_stream.cpp b/src/paimon/common/io/buffered_input_stream.cpp index d71684cba..b21e247a6 100644 --- a/src/paimon/common/io/buffered_input_stream.cpp +++ b/src/paimon/common/io/buffered_input_stream.cpp @@ -22,6 +22,7 @@ #include #include "fmt/format.h" +#include "paimon/common/utils/math.h" #include "paimon/memory/bytes.h" namespace paimon { @@ -70,15 +71,15 @@ Status BufferedInputStream::Seek(int64_t offset, SeekOrigin origin) { return Status::OK(); } -Result BufferedInputStream::GetPos() const { - PAIMON_ASSIGN_OR_RAISE(uint64_t in_pos, in_->GetPos()); +Result BufferedInputStream::GetPos() const { + PAIMON_ASSIGN_OR_RAISE(int64_t in_pos, in_->GetPos()); return in_pos - count_ + pos_; } -Result BufferedInputStream::Read(char* buffer, uint64_t size) { - uint64_t actual_read_len = 0; +Result BufferedInputStream::Read(char* buffer, int64_t size) { + int64_t actual_read_len = 0; while (actual_read_len < size) { - PAIMON_ASSIGN_OR_RAISE(uint64_t nread, + PAIMON_ASSIGN_OR_RAISE(int64_t nread, InnerRead(buffer + actual_read_len, size - actual_read_len)); assert(nread > 0); actual_read_len += nread; @@ -87,16 +88,16 @@ Result BufferedInputStream::Read(char* buffer, uint64_t size) { return actual_read_len; } -Result BufferedInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { +Result BufferedInputStream::Read(char* buffer, int64_t size, int64_t offset) { return Status::Invalid("BufferedInputStream does not support Read from offset"); } -void BufferedInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, +void BufferedInputStream::ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) { callback(Status::NotImplemented("BufferedInputStream do not support ReadAsync")); } -Result BufferedInputStream::Length() const { +Result BufferedInputStream::Length() const { return in_->Length(); } @@ -114,16 +115,18 @@ Result BufferedInputStream::GetUri() const { Status BufferedInputStream::Fill() { pos_ = 0; count_ = 0; - PAIMON_ASSIGN_OR_RAISE(uint64_t in_pos, in_->GetPos()); - PAIMON_ASSIGN_OR_RAISE(uint64_t length, in_->Length()); - uint64_t left_to_read = std::min(length - in_pos, buffer_size_); - PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_len, in_->Read(buffer_->data(), left_to_read)); + PAIMON_ASSIGN_OR_RAISE(int64_t in_pos, in_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t length, in_->Length()); + uint64_t left_to_read = std::min(static_cast(length - in_pos), buffer_size_); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(left_to_read, "read length")); + PAIMON_ASSIGN_OR_RAISE(int64_t actual_read_len, + in_->Read(buffer_->data(), static_cast(left_to_read))); PAIMON_RETURN_NOT_OK(AssertReadLength(left_to_read, actual_read_len)); count_ = actual_read_len; return Status::OK(); } -Result BufferedInputStream::InnerRead(char* buffer, uint64_t size) { +Result BufferedInputStream::InnerRead(char* buffer, int64_t size) { assert(size > 0); assert(pos_ <= count_); uint64_t avail = count_ - pos_; @@ -132,7 +135,7 @@ Result BufferedInputStream::InnerRead(char* buffer, uint64_t size) { if there is no mark/reset activity, do not bother to copy the bytes into the local buffer. In this way buffered streams will cascade harmlessly. */ - if (size >= buffer_size_) { + if (static_cast(size) >= buffer_size_) { return in_->Read(buffer, size); } PAIMON_RETURN_NOT_OK(Fill()); @@ -144,14 +147,14 @@ Result BufferedInputStream::InnerRead(char* buffer, uint64_t size) { size)); } } - uint64_t copy_length = std::min(avail, size); + int64_t copy_length = std::min(static_cast(avail), size); memcpy(buffer, buffer_->data() + pos_, copy_length); - pos_ += copy_length; + pos_ += static_cast(copy_length); return copy_length; } -Status BufferedInputStream::AssertReadLength(uint64_t read_length, - uint64_t actual_read_length) const { +Status BufferedInputStream::AssertReadLength(int64_t read_length, + int64_t actual_read_length) const { if (read_length != actual_read_length) { return Status::Invalid( fmt::format("assert read length failed: read length not match, read length {}, actual " diff --git a/src/paimon/common/io/byte_array_input_stream.cpp b/src/paimon/common/io/byte_array_input_stream.cpp index 80d7306f6..1b8e4bc42 100644 --- a/src/paimon/common/io/byte_array_input_stream.cpp +++ b/src/paimon/common/io/byte_array_input_stream.cpp @@ -59,32 +59,33 @@ Status ByteArrayInputStream::Seek(int64_t offset, SeekOrigin origin) { return Status::OK(); } -Result ByteArrayInputStream::Read(char* buffer, uint64_t size) { - if (size > length_ - position_) { +Result ByteArrayInputStream::Read(char* buffer, int64_t size) { + if (size < 0 || static_cast(size) > length_ - position_) { return Status::Invalid( fmt::format("ByteArrayInputStream assert boundary failed: need length {}, current " "position {}, exceed length {}", size, position_, length_)); } - memcpy(buffer, buffer_ + position_, size); - position_ += size; + memcpy(buffer, buffer_ + position_, static_cast(size)); + position_ += static_cast(size); return size; } -Result ByteArrayInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { - if (offset + size > length_) { +Result ByteArrayInputStream::Read(char* buffer, int64_t size, int64_t offset) { + if (size < 0 || offset < 0 || static_cast(offset) > length_ || + static_cast(size) > length_ - static_cast(offset)) { return Status::Invalid( fmt::format("ByteArrayInputStream assert boundary failed: need length {}, read offset " "{}, exceed length {}", size, offset, length_)); } - memcpy(buffer, buffer_ + offset, size); + memcpy(buffer, buffer_ + offset, static_cast(size)); return size; } -void ByteArrayInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, +void ByteArrayInputStream::ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) { - Result read_size = Read(buffer, size, offset); + Result read_size = Read(buffer, size, offset); Status status = Status::OK(); if (read_size.ok() && read_size.value() != size) { status = Status::Invalid(fmt::format( diff --git a/src/paimon/common/io/cache_input_stream.h b/src/paimon/common/io/cache_input_stream.h index 7d35674c6..d8505556e 100644 --- a/src/paimon/common/io/cache_input_stream.h +++ b/src/paimon/common/io/cache_input_stream.h @@ -20,6 +20,7 @@ #include #include +#include "paimon/common/utils/math.h" #include "paimon/fs/file_system.h" #include "paimon/utils/read_ahead_cache.h" @@ -34,15 +35,17 @@ class CacheInputStream : public InputStream { Status Seek(int64_t offset, SeekOrigin origin) override { return input_stream_->Seek(offset, origin); } - Result GetPos() const override { + Result GetPos() const override { return input_stream_->GetPos(); } - Result Read(char* buffer, uint64_t size) override { + Result Read(char* buffer, int64_t size) override { return input_stream_->Read(buffer, size); } - Result Read(char* buffer, uint64_t size, uint64_t offset) override { + Result Read(char* buffer, int64_t size, int64_t offset) override { if (cache_) { - ByteRange range{offset, size}; + PAIMON_RETURN_NOT_OK(ValidateValueInRange(offset, "read offset")); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(size, "read size")); + ByteRange range{static_cast(offset), static_cast(size)}; PAIMON_ASSIGN_OR_RAISE(ByteSlice slice, cache_->Read(range)); if (slice.buffer) { std::memcpy(buffer, slice.buffer->data() + slice.offset, slice.length); @@ -51,10 +54,20 @@ class CacheInputStream : public InputStream { } return input_stream_->Read(buffer, size, offset); } - void ReadAsync(char* buffer, uint64_t size, uint64_t offset, + void ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) override { if (cache_) { - ByteRange range{offset, size}; + Status status = ValidateValueInRange(offset, "read offset"); + if (!status.ok()) { + callback(status); + return; + } + status = ValidateValueInRange(size, "read size"); + if (!status.ok()) { + callback(status); + return; + } + ByteRange range{static_cast(offset), static_cast(size)}; Result slice = cache_->Read(range); if (!slice.ok()) { callback(slice.status()); @@ -78,7 +91,7 @@ class CacheInputStream : public InputStream { return input_stream_->GetUri(); } - Result Length() const override { + Result Length() const override { return input_stream_->Length(); } diff --git a/src/paimon/common/io/cache_input_stream_test.cpp b/src/paimon/common/io/cache_input_stream_test.cpp index e1e90e53c..90713e8dc 100644 --- a/src/paimon/common/io/cache_input_stream_test.cpp +++ b/src/paimon/common/io/cache_input_stream_test.cpp @@ -78,7 +78,7 @@ TEST_F(CacheInputStreamTest, TestProxyMethods) { CacheInputStream stream(std::move(underlying), /*cache=*/nullptr); // Length - ASSERT_OK_AND_ASSIGN(uint64_t length, stream.Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, stream.Length()); ASSERT_EQ(length, content_.size()); // GetUri @@ -87,12 +87,12 @@ TEST_F(CacheInputStreamTest, TestProxyMethods) { // Seek + GetPos ASSERT_OK(stream.Seek(5, SeekOrigin::FS_SEEK_SET)); - ASSERT_OK_AND_ASSIGN(uint64_t pos, stream.GetPos()); + ASSERT_OK_AND_ASSIGN(int64_t pos, stream.GetPos()); ASSERT_EQ(pos, 5); // Read (sequential, no offset) std::string buffer(3, '\0'); - ASSERT_OK_AND_ASSIGN(uint64_t bytes_read, stream.Read(buffer.data(), 3)); + ASSERT_OK_AND_ASSIGN(int64_t bytes_read, stream.Read(buffer.data(), 3)); ASSERT_EQ(bytes_read, 3); ASSERT_EQ(buffer, "fgh"); @@ -106,7 +106,7 @@ TEST_F(CacheInputStreamTest, TestReadWithOffsetNullCache) { CacheInputStream stream(std::move(underlying), /*cache=*/nullptr); std::string buffer(5, '\0'); - ASSERT_OK_AND_ASSIGN(uint64_t bytes_read, stream.Read(buffer.data(), 5, /*offset=*/2)); + ASSERT_OK_AND_ASSIGN(int64_t bytes_read, stream.Read(buffer.data(), 5, /*offset=*/2)); ASSERT_EQ(bytes_read, 5); ASSERT_EQ(buffer, "cdefg"); } @@ -119,7 +119,7 @@ TEST_F(CacheInputStreamTest, TestReadWithOffsetCacheHit) { CacheInputStream stream(std::move(underlying), cache); std::string buffer(5, '\0'); - ASSERT_OK_AND_ASSIGN(uint64_t bytes_read, stream.Read(buffer.data(), 5, /*offset=*/2)); + ASSERT_OK_AND_ASSIGN(int64_t bytes_read, stream.Read(buffer.data(), 5, /*offset=*/2)); ASSERT_EQ(bytes_read, 5); ASSERT_EQ(buffer, "cdefg"); } @@ -132,7 +132,7 @@ TEST_F(CacheInputStreamTest, TestReadWithOffsetCacheMiss) { CacheInputStream stream(std::move(underlying), cache); std::string buffer(3, '\0'); - ASSERT_OK_AND_ASSIGN(uint64_t bytes_read, stream.Read(buffer.data(), 3, /*offset=*/10)); + ASSERT_OK_AND_ASSIGN(int64_t bytes_read, stream.Read(buffer.data(), 3, /*offset=*/10)); ASSERT_EQ(bytes_read, 3); ASSERT_EQ(buffer, "klm"); } diff --git a/src/paimon/common/io/data_input_output_stream_test.cpp b/src/paimon/common/io/data_input_output_stream_test.cpp index eb25f5e12..888f6b59e 100644 --- a/src/paimon/common/io/data_input_output_stream_test.cpp +++ b/src/paimon/common/io/data_input_output_stream_test.cpp @@ -101,7 +101,7 @@ class DataInputOutputStreamTest : public ::testing::Test, ASSERT_OK(data_input_stream->ReadBytes(read_bytes.get())); ASSERT_EQ(*bytes, *read_bytes); // test GetPos - ASSERT_OK_AND_ASSIGN(uint64_t in_pos, input_stream->GetPos()); + ASSERT_OK_AND_ASSIGN(int64_t in_pos, input_stream->GetPos()); ASSERT_EQ(1 + 2 + 4 + 8 + 1 + 41 + 24, in_pos); // read eof, return bad status ASSERT_NOK(data_input_stream->ReadString()); @@ -132,7 +132,7 @@ TEST_P(DataInputOutputStreamTest, TestFileStream) { auto data_output_stream = std::make_unique(output_stream); data_output_stream->SetOrder(byte_order); WriteValues(data_output_stream.get()); - ASSERT_OK_AND_ASSIGN(uint64_t out_pos, output_stream->GetPos()); + ASSERT_OK_AND_ASSIGN(int64_t out_pos, output_stream->GetPos()); ASSERT_EQ(1 + 2 + 4 + 8 + 1 + 41 + 24, out_pos); ASSERT_OK(output_stream->Close()); // check file content diff --git a/src/paimon/common/io/data_input_stream.cpp b/src/paimon/common/io/data_input_stream.cpp index 2d24c4065..26dbe56d2 100644 --- a/src/paimon/common/io/data_input_stream.cpp +++ b/src/paimon/common/io/data_input_stream.cpp @@ -39,10 +39,10 @@ Status DataInputStream::Seek(int64_t offset) const { template Result DataInputStream::ReadValue() const { static_assert(std::is_trivially_copyable_v, "T must be trivially copyable"); - uint64_t read_length = sizeof(T); + int64_t read_length = sizeof(T); PAIMON_RETURN_NOT_OK(AssertBoundary(read_length)); T value; - PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_length, + PAIMON_ASSIGN_OR_RAISE(int64_t actual_read_length, input_stream_->Read(reinterpret_cast(&value), read_length)); PAIMON_RETURN_NOT_OK(AssertReadLength(read_length, actual_read_length)); if (NeedSwap()) { @@ -52,17 +52,18 @@ Result DataInputStream::ReadValue() const { } Status DataInputStream::ReadBytes(Bytes* bytes) const { - uint64_t read_length = bytes->size(); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(bytes->size(), "bytes length")); + int64_t read_length = static_cast(bytes->size()); PAIMON_RETURN_NOT_OK(AssertBoundary(read_length)); - PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_length, + PAIMON_ASSIGN_OR_RAISE(int64_t actual_read_length, input_stream_->Read(bytes->data(), read_length)); PAIMON_RETURN_NOT_OK(AssertReadLength(read_length, actual_read_length)); return Status::OK(); } -Status DataInputStream::Read(char* data, uint64_t size) const { +Status DataInputStream::Read(char* data, int64_t size) const { PAIMON_RETURN_NOT_OK(AssertBoundary(size)); - PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_length, input_stream_->Read(data, size)); + PAIMON_ASSIGN_OR_RAISE(int64_t actual_read_length, input_stream_->Read(data, size)); PAIMON_RETURN_NOT_OK(AssertReadLength(size, actual_read_length)); return Status::OK(); } @@ -72,21 +73,21 @@ Result DataInputStream::ReadString() const { PAIMON_ASSIGN_OR_RAISE(read_length, ReadValue()); PAIMON_RETURN_NOT_OK(AssertBoundary(read_length)); std::string value(read_length, '\0'); - PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_length, + PAIMON_ASSIGN_OR_RAISE(int64_t actual_read_length, input_stream_->Read(value.data(), read_length)); PAIMON_RETURN_NOT_OK(AssertReadLength(read_length, actual_read_length)); return value; } -Result DataInputStream::GetPos() const { +Result DataInputStream::GetPos() const { return input_stream_->GetPos(); } -Result DataInputStream::Length() const { +Result DataInputStream::Length() const { return input_stream_->Length(); } -Status DataInputStream::AssertReadLength(uint64_t read_length, uint64_t actual_read_length) const { +Status DataInputStream::AssertReadLength(int64_t read_length, int64_t actual_read_length) const { if (read_length != actual_read_length) { return Status::Invalid( fmt::format("assert read length failed: read length not match, read length {}, actual " @@ -96,12 +97,16 @@ Status DataInputStream::AssertReadLength(uint64_t read_length, uint64_t actual_r return Status::OK(); } -Status DataInputStream::AssertBoundary(uint64_t need_length) const { +Status DataInputStream::AssertBoundary(int64_t need_length) const { + if (need_length < 0) { + return Status::Invalid(fmt::format( + "DataInputStream assert boundary failed: need length {} is less than 0", need_length)); + } // TODO(jinli.zjw): Store current_pos and file_length as member variables to reduce the overhead // of I/O calls. - PAIMON_ASSIGN_OR_RAISE(uint64_t pos, input_stream_->GetPos()); - PAIMON_ASSIGN_OR_RAISE(uint64_t length, input_stream_->Length()); - if (pos + need_length > length) { + PAIMON_ASSIGN_OR_RAISE(int64_t pos, input_stream_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t length, input_stream_->Length()); + if (pos > length || need_length > length - pos) { return Status::Invalid( fmt::format("DataInputStream assert boundary failed: need length {}, current position " "{}, exceed length {}", diff --git a/src/paimon/common/io/data_output_stream.cpp b/src/paimon/common/io/data_output_stream.cpp index 09a114ba6..b594958ea 100644 --- a/src/paimon/common/io/data_output_stream.cpp +++ b/src/paimon/common/io/data_output_stream.cpp @@ -17,6 +17,7 @@ #include "paimon/common/io/data_output_stream.h" #include "fmt/format.h" +#include "paimon/common/utils/math.h" #include "paimon/memory/bytes.h" #include "paimon/result.h" @@ -27,8 +28,9 @@ DataOutputStream::DataOutputStream(const std::shared_ptr& output_s } Status DataOutputStream::WriteBytes(const std::shared_ptr& bytes) { - uint64_t write_length = bytes->size(); - PAIMON_ASSIGN_OR_RAISE(uint64_t actual_write_length, + PAIMON_RETURN_NOT_OK(ValidateValueInRange(bytes->size(), "bytes length")); + int64_t write_length = static_cast(bytes->size()); + PAIMON_ASSIGN_OR_RAISE(int64_t actual_write_length, output_stream_->Write(bytes->data(), write_length)); PAIMON_RETURN_NOT_OK(AssertWriteLength(write_length, actual_write_length)); return Status::OK(); @@ -37,14 +39,14 @@ Status DataOutputStream::WriteBytes(const std::shared_ptr& bytes) { Status DataOutputStream::WriteString(const std::string& value) { uint16_t write_length = value.size(); PAIMON_RETURN_NOT_OK(WriteValue(write_length)); - PAIMON_ASSIGN_OR_RAISE(uint64_t actual_write_length, + PAIMON_ASSIGN_OR_RAISE(int64_t actual_write_length, output_stream_->Write(value.data(), write_length)); PAIMON_RETURN_NOT_OK(AssertWriteLength(write_length, actual_write_length)); return Status::OK(); } -Status DataOutputStream::AssertWriteLength(uint64_t write_length, - uint64_t actual_write_length) const { +Status DataOutputStream::AssertWriteLength(int64_t write_length, + int64_t actual_write_length) const { if (write_length != actual_write_length) { return Status::Invalid(fmt::format( "assert write length failed: write length not match, write length {}, actual " diff --git a/src/paimon/common/io/data_output_stream.h b/src/paimon/common/io/data_output_stream.h index 7a4118be4..bbeaff577 100644 --- a/src/paimon/common/io/data_output_stream.h +++ b/src/paimon/common/io/data_output_stream.h @@ -45,9 +45,9 @@ class PAIMON_EXPORT DataOutputStream { if (NeedSwap()) { write_value = EndianSwapValue(value); } - uint64_t write_length = sizeof(T); + int64_t write_length = sizeof(T); PAIMON_ASSIGN_OR_RAISE( - uint64_t actual_write_length, + int64_t actual_write_length, output_stream_->Write(reinterpret_cast(&write_value), write_length)); PAIMON_RETURN_NOT_OK(AssertWriteLength(write_length, actual_write_length)); return Status::OK(); @@ -63,7 +63,7 @@ class PAIMON_EXPORT DataOutputStream { } private: - Status AssertWriteLength(uint64_t write_length, uint64_t actual_write_length) const; + Status AssertWriteLength(int64_t write_length, int64_t actual_write_length) const; bool NeedSwap() const; diff --git a/src/paimon/common/io/offset_input_stream.cpp b/src/paimon/common/io/offset_input_stream.cpp index 302419b88..a1df3077e 100644 --- a/src/paimon/common/io/offset_input_stream.cpp +++ b/src/paimon/common/io/offset_input_stream.cpp @@ -23,11 +23,17 @@ namespace paimon { Result> OffsetInputStream::Create( - const std::shared_ptr& wrapped, uint64_t length, uint64_t offset) { + const std::shared_ptr& wrapped, int64_t length, int64_t offset) { if (PAIMON_UNLIKELY(wrapped == nullptr)) { return Status::Invalid("input stream is null pointer"); } - PAIMON_ASSIGN_OR_RAISE(uint64_t total_length, wrapped->Length()); + if (PAIMON_UNLIKELY(offset < 0)) { + return Status::Invalid(fmt::format("offset {} is less than 0", offset)); + } + if (PAIMON_UNLIKELY(length < 0)) { + return Status::Invalid(fmt::format("length {} is less than 0", length)); + } + PAIMON_ASSIGN_OR_RAISE(int64_t total_length, wrapped->Length()); if (PAIMON_UNLIKELY(offset > total_length)) { return Status::Invalid( fmt::format("offset {} exceed total length {}", offset, total_length)); @@ -41,8 +47,8 @@ Result> OffsetInputStream::Create( new OffsetInputStream(std::move(wrapped), length, offset)); } -OffsetInputStream::OffsetInputStream(const std::shared_ptr& wrapped, uint64_t length, - uint64_t offset) +OffsetInputStream::OffsetInputStream(const std::shared_ptr& wrapped, int64_t length, + int64_t offset) : wrapped_(wrapped), length_(length), offset_(offset) {} Status OffsetInputStream::Seek(int64_t offset, SeekOrigin origin) { @@ -50,20 +56,20 @@ Status OffsetInputStream::Seek(int64_t offset, SeekOrigin origin) { switch (origin) { case SeekOrigin::FS_SEEK_SET: { new_position = offset; - PAIMON_RETURN_NOT_OK(AssertBoundary(static_cast(new_position))); + PAIMON_RETURN_NOT_OK(AssertBoundary(new_position)); PAIMON_RETURN_NOT_OK(wrapped_->Seek(static_cast(offset_) + new_position, SeekOrigin::FS_SEEK_SET)); break; } case SeekOrigin::FS_SEEK_CUR: { new_position = static_cast(inner_position_) + offset; - PAIMON_RETURN_NOT_OK(AssertBoundary(static_cast(new_position))); + PAIMON_RETURN_NOT_OK(AssertBoundary(new_position)); PAIMON_RETURN_NOT_OK(wrapped_->Seek(offset, SeekOrigin::FS_SEEK_CUR)); break; } case SeekOrigin::FS_SEEK_END: { - new_position = static_cast(length_) + offset; - PAIMON_RETURN_NOT_OK(AssertBoundary(static_cast(new_position))); + new_position = length_ + offset; + PAIMON_RETURN_NOT_OK(AssertBoundary(new_position)); PAIMON_RETURN_NOT_OK(wrapped_->Seek(static_cast(offset_) + new_position, SeekOrigin::FS_SEEK_SET)); break; @@ -72,23 +78,23 @@ Status OffsetInputStream::Seek(int64_t offset, SeekOrigin origin) { return Status::Invalid( "invalid SeekOrigin, only support FS_SEEK_SET, FS_SEEK_CUR, and FS_SEEK_END"); } - inner_position_ = static_cast(new_position); + inner_position_ = new_position; return Status::OK(); } -Result OffsetInputStream::Read(char* buffer, uint64_t size) { +Result OffsetInputStream::Read(char* buffer, int64_t size) { PAIMON_RETURN_NOT_OK(AssertBoundary(inner_position_ + size)); - PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_len, wrapped_->Read(buffer, size)); + PAIMON_ASSIGN_OR_RAISE(int64_t actual_read_len, wrapped_->Read(buffer, size)); inner_position_ += actual_read_len; return actual_read_len; } -Result OffsetInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { +Result OffsetInputStream::Read(char* buffer, int64_t size, int64_t offset) { PAIMON_RETURN_NOT_OK(AssertBoundary(offset + size)); return wrapped_->Read(buffer, size, offset_ + offset); } -void OffsetInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, +void OffsetInputStream::ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) { auto status = AssertBoundary(offset + size); if (!status.ok()) { @@ -106,16 +112,16 @@ Result OffsetInputStream::GetUri() const { return wrapped_->GetUri(); } -Result OffsetInputStream::GetPos() const { +Result OffsetInputStream::GetPos() const { return inner_position_; } -Result OffsetInputStream::Length() const { +Result OffsetInputStream::Length() const { return length_; } -Status OffsetInputStream::AssertBoundary(uint64_t inner_pos) const { - if (inner_pos > length_) { +Status OffsetInputStream::AssertBoundary(int64_t inner_pos) const { + if (inner_pos < 0 || inner_pos > length_) { return Status::Invalid( fmt::format("OffsetInputStream assert boundary failed: inner pos {} exceed length {}", inner_pos, length_)); diff --git a/src/paimon/common/io/offset_input_stream.h b/src/paimon/common/io/offset_input_stream.h index f9e8c7eb2..3f8a0192e 100644 --- a/src/paimon/common/io/offset_input_stream.h +++ b/src/paimon/common/io/offset_input_stream.h @@ -31,35 +31,34 @@ namespace paimon { class PAIMON_EXPORT OffsetInputStream : public InputStream { public: static Result> Create( - const std::shared_ptr& wrapped, uint64_t length, uint64_t offset); + const std::shared_ptr& wrapped, int64_t length, int64_t offset); ~OffsetInputStream() override = default; Status Seek(int64_t offset, SeekOrigin origin) override; - Result GetPos() const override; + Result GetPos() const override; - Result Read(char* buffer, uint64_t size) override; + Result Read(char* buffer, int64_t size) override; - Result Read(char* buffer, uint64_t size, uint64_t offset) override; + Result Read(char* buffer, int64_t size, int64_t offset) override; - void ReadAsync(char* buffer, uint64_t size, uint64_t offset, + void ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) override; - Result Length() const override; + Result Length() const override; Status Close() override; Result GetUri() const override; private: - OffsetInputStream(const std::shared_ptr& wrapped, uint64_t length, - uint64_t offset); - Status AssertBoundary(uint64_t inner_pos) const; + OffsetInputStream(const std::shared_ptr& wrapped, int64_t length, int64_t offset); + Status AssertBoundary(int64_t inner_pos) const; private: std::shared_ptr wrapped_; - const uint64_t length_; - const uint64_t offset_; - uint64_t inner_position_ = 0; + const int64_t length_; + const int64_t offset_; + int64_t inner_position_ = 0; }; } // namespace paimon diff --git a/src/paimon/common/sst/sst_file_reader.cpp b/src/paimon/common/sst/sst_file_reader.cpp index 6b004c754..25aa00f48 100644 --- a/src/paimon/common/sst/sst_file_reader.cpp +++ b/src/paimon/common/sst/sst_file_reader.cpp @@ -65,7 +65,7 @@ Result> SstFileReader::Create( Result> SstFileReader::CreateForSortLookupStore( const std::shared_ptr& in, MemorySlice::SliceComparator comparator, const std::shared_ptr& block_cache, const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE(uint64_t file_len, in->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t file_len, in->Length()); PAIMON_RETURN_NOT_OK( in->Seek(file_len - SortLookupStoreFooter::ENCODED_LENGTH, SeekOrigin::FS_SEEK_SET)); auto footer_bytes = Bytes::AllocateBytes(SortLookupStoreFooter::ENCODED_LENGTH, pool.get()); diff --git a/src/paimon/common/sst/sst_file_writer.cpp b/src/paimon/common/sst/sst_file_writer.cpp index 67d0119c0..5691b8d1a 100644 --- a/src/paimon/common/sst/sst_file_writer.cpp +++ b/src/paimon/common/sst/sst_file_writer.cpp @@ -72,7 +72,7 @@ Result> SstFileWriter::WriteBloomFilter() { } auto bf_slice = bloom_filter_->GetBitSet()->ToSlice(); auto data = bf_slice.ReadStringView(); - PAIMON_ASSIGN_OR_RAISE(uint64_t bloom_filter_pos, out_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t bloom_filter_pos, out_->GetPos()); BloomFilterHandle handle(static_cast(bloom_filter_pos), data.size(), bloom_filter_->ExpectedEntries()); @@ -115,7 +115,7 @@ Result SstFileWriter::FlushBlockWriter(BlockWriter* writer) { crc32c = CRC32C::calculate(&compression_val, 1, crc32c); auto trailer = BlockTrailer(static_cast(compression_type), crc32c); auto trailer_memory_slice = trailer.WriteBlockTrailer(pool_.get()); - PAIMON_ASSIGN_OR_RAISE(uint64_t block_pos, out_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t block_pos, out_->GetPos()); BlockHandle block_handle(static_cast(block_pos), view.size()); // 1. write data diff --git a/src/paimon/common/utils/arrow/arrow_input_stream_adapter.cpp b/src/paimon/common/utils/arrow/arrow_input_stream_adapter.cpp index e3008de77..9060fcaa2 100644 --- a/src/paimon/common/utils/arrow/arrow_input_stream_adapter.cpp +++ b/src/paimon/common/utils/arrow/arrow_input_stream_adapter.cpp @@ -43,12 +43,14 @@ arrow::Status ArrowInputStreamAdapter::Seek(int64_t position) { } arrow::Result ArrowInputStreamAdapter::Read(int64_t nbytes, void* out) { - Result read_bytes = - input_stream_->Read(static_cast(out), static_cast(nbytes)); + if (nbytes < 0) { + return arrow::Status::Invalid("read size is negative"); + } + Result read_bytes = input_stream_->Read(static_cast(out), nbytes); if (!read_bytes.ok()) { return ToArrowStatus(read_bytes.status()); } - return static_cast(read_bytes.value()); + return read_bytes.value(); } arrow::Result> ArrowInputStreamAdapter::Read(int64_t nbytes) { @@ -63,12 +65,17 @@ arrow::Result> ArrowInputStreamAdapter::Read(int6 arrow::Result ArrowInputStreamAdapter::ReadAt(int64_t position, int64_t nbytes, void* out) { - Result read_bytes = input_stream_->Read( - static_cast(out), static_cast(nbytes), static_cast(position)); + if (position < 0) { + return arrow::Status::Invalid("read offset is negative"); + } + if (nbytes < 0) { + return arrow::Status::Invalid("read size is negative"); + } + Result read_bytes = input_stream_->Read(static_cast(out), nbytes, position); if (!read_bytes.ok()) { return ToArrowStatus(read_bytes.status()); } - return static_cast(read_bytes.value()); + return read_bytes.value(); } arrow::Result> ArrowInputStreamAdapter::ReadAt(int64_t position, @@ -85,6 +92,14 @@ arrow::Result> ArrowInputStreamAdapter::ReadAt(in arrow::Future> ArrowInputStreamAdapter::ReadAsync( const arrow::io::IOContext& io_context, int64_t position, int64_t nbytes) { auto fut = arrow::Future>::Make(); + if (position < 0) { + fut.MarkFinished(arrow::Status::Invalid("read offset is negative")); + return fut; + } + if (nbytes < 0) { + fut.MarkFinished(arrow::Status::Invalid("read size is negative")); + return fut; + } arrow::Result> buffer_result = arrow::AllocateResizableBuffer(nbytes, pool_.get()); if (PAIMON_UNLIKELY(!buffer_result.ok())) { @@ -92,8 +107,7 @@ arrow::Future> ArrowInputStreamAdapter::ReadAsync return fut; } std::shared_ptr buffer = std::move(buffer_result).ValueUnsafe(); - input_stream_->ReadAsync(reinterpret_cast(buffer->mutable_data()), - static_cast(nbytes), static_cast(position), + input_stream_->ReadAsync(reinterpret_cast(buffer->mutable_data()), nbytes, position, [fut, buffer](Status callback_status) mutable { if (callback_status.ok()) { fut.MarkFinished(std::move(buffer)); @@ -105,11 +119,11 @@ arrow::Future> ArrowInputStreamAdapter::ReadAsync } arrow::Result ArrowInputStreamAdapter::Tell() const { - Result position = input_stream_->GetPos(); + Result position = input_stream_->GetPos(); if (!position.ok()) { return ToArrowStatus(position.status()); } - return static_cast(position.value()); + return position.value(); } arrow::Result ArrowInputStreamAdapter::GetSize() { diff --git a/src/paimon/common/utils/arrow/arrow_output_stream_adapter.cpp b/src/paimon/common/utils/arrow/arrow_output_stream_adapter.cpp index ea325c542..69388e108 100644 --- a/src/paimon/common/utils/arrow/arrow_output_stream_adapter.cpp +++ b/src/paimon/common/utils/arrow/arrow_output_stream_adapter.cpp @@ -36,11 +36,11 @@ arrow::Status ArrowOutputStreamAdapter::Close() { } arrow::Result ArrowOutputStreamAdapter::Tell() const { - paimon::Result pos = out_->GetPos(); + paimon::Result pos = out_->GetPos(); if (!pos.ok()) { return ToArrowStatus(pos.status()); } - return static_cast(pos.value()); + return pos.value(); } bool ArrowOutputStreamAdapter::closed() const { @@ -48,12 +48,14 @@ bool ArrowOutputStreamAdapter::closed() const { } arrow::Status ArrowOutputStreamAdapter::Write(const void* data, int64_t nbytes) { - Result len = - out_->Write(static_cast(data), static_cast(nbytes)); + if (nbytes < 0) { + return arrow::Status::Invalid("write size is negative"); + } + Result len = out_->Write(static_cast(data), nbytes); if (!len.ok()) { return ToArrowStatus(len.status()); } - if (len.value() != static_cast(nbytes)) { + if (len.value() != nbytes) { return arrow::Status::IOError( fmt::format("expect write len {} mismatch actual write len {}", nbytes, len.value())); } diff --git a/src/paimon/common/utils/arrow/arrow_stream_adapter_test.cpp b/src/paimon/common/utils/arrow/arrow_stream_adapter_test.cpp index 9298ec70e..0554dc214 100644 --- a/src/paimon/common/utils/arrow/arrow_stream_adapter_test.cpp +++ b/src/paimon/common/utils/arrow/arrow_stream_adapter_test.cpp @@ -57,7 +57,7 @@ TEST(ArrowStreamAdapterTest, TestInputAndOutputStream) { // in stream ASSERT_OK_AND_ASSIGN(std::shared_ptr in, file_system->Open(file_name)); - ASSERT_OK_AND_ASSIGN(uint64_t length, in->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, in->Length()); auto in_stream = std::make_unique(in, GetArrowPool(GetDefaultPool()), length); ASSERT_EQ(in_stream->GetSize().ValueOrDie(), static_cast(data.length())); diff --git a/src/paimon/common/utils/roaring_bitmap32.cpp b/src/paimon/common/utils/roaring_bitmap32.cpp index 2bbc0301e..de8e90107 100644 --- a/src/paimon/common/utils/roaring_bitmap32.cpp +++ b/src/paimon/common/utils/roaring_bitmap32.cpp @@ -220,8 +220,8 @@ PAIMON_UNIQUE_PTR RoaringBitmap32::Serialize(MemoryPool* pool) const { Status RoaringBitmap32::Deserialize(ByteArrayInputStream* input_stream) { const char* data = input_stream->GetRawData(); - PAIMON_ASSIGN_OR_RAISE(uint64_t pos, input_stream->GetPos()); - PAIMON_ASSIGN_OR_RAISE(uint64_t total_length, input_stream->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t pos, input_stream->GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t total_length, input_stream->Length()); try { GetRoaringBitmap(roaring_bitmap_) = roaring::Roaring::readSafe(data, /*maxbytes=*/total_length - pos); diff --git a/src/paimon/common/utils/roaring_bitmap64.cpp b/src/paimon/common/utils/roaring_bitmap64.cpp index 0f6fbc858..01542a20c 100644 --- a/src/paimon/common/utils/roaring_bitmap64.cpp +++ b/src/paimon/common/utils/roaring_bitmap64.cpp @@ -291,8 +291,8 @@ PAIMON_UNIQUE_PTR RoaringBitmap64::Serialize(MemoryPool* pool) const { Status RoaringBitmap64::Deserialize(ByteArrayInputStream* input_stream) { const char* data = input_stream->GetRawData(); - PAIMON_ASSIGN_OR_RAISE(uint64_t pos, input_stream->GetPos()); - PAIMON_ASSIGN_OR_RAISE(uint64_t total_length, input_stream->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t pos, input_stream->GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t total_length, input_stream->Length()); try { GetRoaringBitmap(roaring_bitmap_) = roaring::Roaring64Map::readSafe(data, /*maxbytes*/ total_length - pos); diff --git a/src/paimon/common/utils/stream_utils.h b/src/paimon/common/utils/stream_utils.h index 0c7ff8507..f459a608a 100644 --- a/src/paimon/common/utils/stream_utils.h +++ b/src/paimon/common/utils/stream_utils.h @@ -26,6 +26,7 @@ #include "fmt/format.h" #include "paimon/common/executor/future.h" +#include "paimon/common/utils/math.h" #include "paimon/fs/file_system.h" #include "paimon/macros.h" #include "paimon/memory/bytes.h" @@ -43,10 +44,12 @@ class StreamUtils { static Result> ReadFully(std::unique_ptr input_stream, const std::shared_ptr& pool) { PAIMON_RETURN_NOT_OK(input_stream->Seek(0, FS_SEEK_SET)); - PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, input_stream->Length()); - PAIMON_UNIQUE_PTR content = Bytes::AllocateBytes(file_length, pool.get()); - PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_len, - input_stream->Read(content->data(), content->size())); + PAIMON_ASSIGN_OR_RAISE(int64_t file_length, input_stream->Length()); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(file_length, "file length")); + PAIMON_UNIQUE_PTR content = + Bytes::AllocateBytes(static_cast(file_length), pool.get()); + PAIMON_ASSIGN_OR_RAISE(int64_t actual_read_len, + input_stream->Read(content->data(), file_length)); if (actual_read_len != file_length) { return Status::Invalid("actual read length {}, not match with expect length {}", actual_read_len, file_length); @@ -56,18 +59,20 @@ class StreamUtils { static Result> ReadAsyncFully( std::unique_ptr input_stream, const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, input_stream->Length()); - PAIMON_UNIQUE_PTR content = Bytes::AllocateBytes(file_length, pool.get()); + PAIMON_ASSIGN_OR_RAISE(int64_t file_length, input_stream->Length()); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(file_length, "file length")); + PAIMON_UNIQUE_PTR content = + Bytes::AllocateBytes(static_cast(file_length), pool.get()); PAIMON_RETURN_NOT_OK(ReadAsyncFully(std::move(input_stream), content->data())); return content; } static Status ReadAsyncFully(std::unique_ptr input_stream, char* content) { PAIMON_RETURN_NOT_OK(input_stream->Seek(0, FS_SEEK_SET)); - PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, input_stream->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t file_length, input_stream->Length()); - uint64_t read_offset = 0; - uint64_t read_len = std::min(file_length, kDefaultReadChunkSize); + int64_t read_offset = 0; + int64_t read_len = std::min(file_length, kDefaultReadChunkSize); std::vector> futures; futures.reserve(file_length / kDefaultReadChunkSize + 1); while (read_len > 0) { @@ -77,7 +82,7 @@ class StreamUtils { [promise](Status status) { promise->set_value(status); }); read_offset += read_len; content += read_len; - read_len = std::min(file_length - read_offset, kDefaultReadChunkSize); + read_len = std::min(file_length - read_offset, kDefaultReadChunkSize); } for (const auto& status : CollectAll(futures)) { if (!status.ok()) { diff --git a/src/paimon/core/deletionvectors/deletion_file_writer.cpp b/src/paimon/core/deletionvectors/deletion_file_writer.cpp index bd8a47d4a..9b274d852 100644 --- a/src/paimon/core/deletionvectors/deletion_file_writer.cpp +++ b/src/paimon/core/deletionvectors/deletion_file_writer.cpp @@ -36,7 +36,7 @@ Result> DeletionFileWriter::Create( Status DeletionFileWriter::Write(const std::string& key, const std::shared_ptr& deletion_vector) { - PAIMON_ASSIGN_OR_RAISE(uint64_t start, out_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t start, out_->GetPos()); PAIMON_RETURN_NOT_OK(ValidateValueInRange(start, "Output position")); DataOutputStream output_stream(out_); PAIMON_ASSIGN_OR_RAISE(int32_t length, deletion_vector->SerializeTo(pool_, &output_stream)); diff --git a/src/paimon/core/deletionvectors/deletion_file_writer.h b/src/paimon/core/deletionvectors/deletion_file_writer.h index 10fb59d6b..d5e832961 100644 --- a/src/paimon/core/deletionvectors/deletion_file_writer.h +++ b/src/paimon/core/deletionvectors/deletion_file_writer.h @@ -34,7 +34,7 @@ class DeletionFileWriter { const std::shared_ptr& path_factory, const std::shared_ptr& fs, const std::shared_ptr& pool); - Result GetPos() const { + Result GetPos() const { return out_->GetPos(); } diff --git a/src/paimon/core/mergetree/lookup/remote_lookup_file_manager.cpp b/src/paimon/core/mergetree/lookup/remote_lookup_file_manager.cpp index 45d013191..a6c7b1c36 100644 --- a/src/paimon/core/mergetree/lookup/remote_lookup_file_manager.cpp +++ b/src/paimon/core/mergetree/lookup/remote_lookup_file_manager.cpp @@ -104,11 +104,15 @@ Status RemoteLookupFileManager::CopyFromInputToOutput( std::unique_ptr&& input_stream, std::unique_ptr&& output_stream) const { auto buffer = std::make_shared(kBufferSize, pool_.get()); - PAIMON_ASSIGN_OR_RAISE(uint64_t total_length, input_stream->Length()); - uint64_t write_size = 0; + PAIMON_ASSIGN_OR_RAISE(int64_t total_length, input_stream->Length()); + if (total_length < 0) { + return Status::Invalid(fmt::format("input stream length {} is less than 0", total_length)); + } + int64_t write_size = 0; while (write_size < total_length) { - uint64_t current_read_size = std::min(total_length - write_size, kBufferSize); - PAIMON_ASSIGN_OR_RAISE(uint64_t bytes_read, + int64_t current_read_size = + std::min(total_length - write_size, static_cast(kBufferSize)); + PAIMON_ASSIGN_OR_RAISE(int64_t bytes_read, input_stream->Read(buffer->data(), current_read_size)); if (bytes_read != current_read_size) { return Status::Invalid( @@ -116,7 +120,7 @@ Status RemoteLookupFileManager::CopyFromInputToOutput( "actual read {} bytes", current_read_size, bytes_read)); } - PAIMON_ASSIGN_OR_RAISE(uint64_t bytes_written, + PAIMON_ASSIGN_OR_RAISE(int64_t bytes_written, output_stream->Write(buffer->data(), bytes_read)); if (bytes_written != bytes_read) { return Status::Invalid( diff --git a/src/paimon/core/utils/objects_file.h b/src/paimon/core/utils/objects_file.h index ac3a7f673..89c93bf3d 100644 --- a/src/paimon/core/utils/objects_file.h +++ b/src/paimon/core/utils/objects_file.h @@ -184,7 +184,7 @@ Result> ObjectsFile::WriteWithoutRolling( PAIMON_RETURN_NOT_OK(format_writer->Flush()); PAIMON_RETURN_NOT_OK(format_writer->Finish()); PAIMON_RETURN_NOT_OK(out->Flush()); - PAIMON_ASSIGN_OR_RAISE(uint64_t pos, out->GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t pos, out->GetPos()); PAIMON_RETURN_NOT_OK(out->Close()); guard.Release(); return std::make_pair(PathUtil::GetName(file_path), static_cast(pos)); diff --git a/src/paimon/format/avro/avro_file_batch_reader.cpp b/src/paimon/format/avro/avro_file_batch_reader.cpp index e856f87b4..47f5110fd 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader.cpp @@ -183,7 +183,7 @@ Result> AvroFileBatchReader::GetFileSchema() cons Result AvroFileBatchReader::GetNumberOfRows() const { if (!total_rows_) { - PAIMON_ASSIGN_OR_RAISE(uint64_t current_pos, input_stream_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t current_pos, input_stream_->GetPos()); auto seek_pos = static_cast(current_pos); ScopeGuard stream_guard([this, seek_pos]() -> void { // reset input stream position to original position diff --git a/src/paimon/format/avro/avro_input_stream_impl.cpp b/src/paimon/format/avro/avro_input_stream_impl.cpp index 7687ed385..4132be760 100644 --- a/src/paimon/format/avro/avro_input_stream_impl.cpp +++ b/src/paimon/format/avro/avro_input_stream_impl.cpp @@ -25,6 +25,7 @@ #include #include "avro/Exception.hh" +#include "paimon/common/utils/math.h" #include "paimon/fs/file_system.h" #include "paimon/memory/memory_pool.h" #include "paimon/status.h" @@ -34,9 +35,10 @@ namespace paimon::avro { Result> AvroInputStreamImpl::Create( const std::shared_ptr& input_stream, size_t buffer_size, const std::shared_ptr& pool) { - PAIMON_ASSIGN_OR_RAISE(uint64_t length, input_stream->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t length, input_stream->Length()); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(length, "file length")); return std::unique_ptr( - new AvroInputStreamImpl(input_stream, buffer_size, length, pool)); + new AvroInputStreamImpl(input_stream, buffer_size, static_cast(length), pool)); } AvroInputStreamImpl::AvroInputStreamImpl(const std::shared_ptr& input_stream, @@ -67,8 +69,12 @@ bool AvroInputStreamImpl::next(const uint8_t** data, size_t* len) { if (remaining == 0) { return false; // eof } - auto read_length = - in_->Read(reinterpret_cast(buffer_), std::min(buffer_size_, remaining)); + uint64_t read_size = std::min(buffer_size_, remaining); + Status status = ValidateValueInRange(read_size, "read size"); + if (!status.ok()) { + throw ::avro::Exception("Read failed: {}", status.ToString()); + } + auto read_length = in_->Read(reinterpret_cast(buffer_), static_cast(read_size)); if (!read_length.ok()) { throw ::avro::Exception("Read failed: {}", read_length.status().ToString()); } diff --git a/src/paimon/format/avro/avro_output_stream_impl.cpp b/src/paimon/format/avro/avro_output_stream_impl.cpp index 144aed458..18e2700cd 100644 --- a/src/paimon/format/avro/avro_output_stream_impl.cpp +++ b/src/paimon/format/avro/avro_output_stream_impl.cpp @@ -20,6 +20,7 @@ #include #include "fmt/format.h" +#include "paimon/common/utils/math.h" #include "paimon/fs/file_system.h" #include "paimon/memory/memory_pool.h" #include "paimon/result.h" @@ -62,18 +63,23 @@ void AvroOutputStreamImpl::backup(size_t len) { void AvroOutputStreamImpl::FlushBuffer() { size_t length = buffer_size_ - available_; - Result write_len = out_->Write(reinterpret_cast(buffer_), length); + Status status = ValidateValueInRange(length, "write length"); + if (!status.ok()) { + throw std::runtime_error(status.ToString()); + } + Result write_len = + out_->Write(reinterpret_cast(buffer_), static_cast(length)); if (!write_len.ok()) { throw std::runtime_error("write failed, status: " + write_len.status().ToString()); } - if (write_len.value() != length) { + if (write_len.value() != static_cast(length)) { throw std::runtime_error( fmt::format("write failed, expected length: {}, actual write length: {}", length, write_len.value())); } - Status status = out_->Flush(); - if (!status.ok()) { - throw std::runtime_error("flush failed, status: " + status.ToString()); + Status flush_status = out_->Flush(); + if (!flush_status.ok()) { + throw std::runtime_error("flush failed, status: " + flush_status.ToString()); } next_ = buffer_; available_ = buffer_size_; diff --git a/src/paimon/format/blob/blob_file_batch_reader.cpp b/src/paimon/format/blob/blob_file_batch_reader.cpp index 82f0fa7ae..1f864ec7c 100644 --- a/src/paimon/format/blob/blob_file_batch_reader.cpp +++ b/src/paimon/format/blob/blob_file_batch_reader.cpp @@ -50,7 +50,7 @@ Result> BlobFileBatchReader::Create( batch_size)); } - PAIMON_ASSIGN_OR_RAISE(uint64_t file_size, input_stream->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t file_size, input_stream->Length()); PAIMON_RETURN_NOT_OK( input_stream->Seek(file_size - BlobDefs::kBlobFileHeaderLength, FS_SEEK_SET)); int8_t header[BlobDefs::kBlobFileHeaderLength]; diff --git a/src/paimon/format/blob/blob_format_writer.cpp b/src/paimon/format/blob/blob_format_writer.cpp index 84dbc1866..548d9524f 100644 --- a/src/paimon/format/blob/blob_format_writer.cpp +++ b/src/paimon/format/blob/blob_format_writer.cpp @@ -27,6 +27,7 @@ #include "paimon/common/metrics/metrics_impl.h" #include "paimon/common/utils/arrow/status_utils.h" #include "paimon/common/utils/delta_varint_compressor.h" +#include "paimon/common/utils/math.h" #include "paimon/data/blob.h" #include "paimon/io/byte_array_input_stream.h" @@ -156,7 +157,7 @@ Status BlobFormatWriter::Finish() { Status BlobFormatWriter::WriteBlob(std::string_view blob_data) { crc32_ = 0; - PAIMON_ASSIGN_OR_RAISE(uint64_t previous_pos, out_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t previous_pos, out_->GetPos()); // write magic number static PAIMON_UNIQUE_PTR kMagicNumberBytes = @@ -177,22 +178,28 @@ Status BlobFormatWriter::WriteBlob(std::string_view blob_data) { } else { in = std::make_unique(blob_data.data(), blob_data.size()); } - PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, in->Length()); - uint64_t total_read_length = 0; - uint64_t read_len = std::min(file_length, tmp_buffer_->size()); + PAIMON_ASSIGN_OR_RAISE(int64_t file_length, in->Length()); + if (file_length < 0) { + return Status::Invalid(fmt::format("blob input length {} is less than 0", file_length)); + } + PAIMON_RETURN_NOT_OK(ValidateValueInRange(tmp_buffer_->size(), "buffer size")); + int64_t total_read_length = 0; + int64_t read_len = + std::min(file_length, static_cast(tmp_buffer_->size())); while (read_len > 0) { - PAIMON_ASSIGN_OR_RAISE(uint64_t actual_read_len, in->Read(tmp_buffer_->data(), read_len)); + PAIMON_ASSIGN_OR_RAISE(int64_t actual_read_len, in->Read(tmp_buffer_->data(), read_len)); if (actual_read_len != read_len) { return Status::Invalid("actual read length {}, not match with expect length {}", actual_read_len, read_len); } PAIMON_RETURN_NOT_OK(WriteWithCrc32(tmp_buffer_->data(), actual_read_len)); total_read_length += actual_read_len; - read_len = std::min(file_length - total_read_length, tmp_buffer_->size()); + read_len = std::min(file_length - total_read_length, + static_cast(tmp_buffer_->size())); } // write bin length - PAIMON_ASSIGN_OR_RAISE(uint64_t current_pos, out_->GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t current_pos, out_->GetPos()); /// magic number(4) + blob content(bin length - 16) + bin length(8) + crc32(4) /// ↑ ↑ /// previous_pos current_pos @@ -209,10 +216,12 @@ Status BlobFormatWriter::WriteBlob(std::string_view blob_data) { } Status BlobFormatWriter::WriteBytes(const char* data, uint64_t length) { - PAIMON_ASSIGN_OR_RAISE(uint64_t actual, out_->Write(data, length)); - if (actual != length) { + PAIMON_RETURN_NOT_OK(ValidateValueInRange(length, "write length")); + int64_t expected = static_cast(length); + PAIMON_ASSIGN_OR_RAISE(int64_t actual, out_->Write(data, expected)); + if (actual != expected) { return Status::Invalid("not suppose actual length {} not match with expect {}", actual, - length); + expected); } return Status::OK(); } @@ -223,8 +232,8 @@ Status BlobFormatWriter::WriteWithCrc32(const char* data, uint64_t length) { } Result BlobFormatWriter::ReachTargetSize(bool suggested_check, int64_t target_size) const { - PAIMON_ASSIGN_OR_RAISE(uint64_t current_pos, out_->GetPos()); - return current_pos >= static_cast(target_size); + PAIMON_ASSIGN_OR_RAISE(int64_t current_pos, out_->GetPos()); + return current_pos >= target_size; } template diff --git a/src/paimon/format/blob/blob_format_writer_test.cpp b/src/paimon/format/blob/blob_format_writer_test.cpp index 854fdfcbb..43f187dbf 100644 --- a/src/paimon/format/blob/blob_format_writer_test.cpp +++ b/src/paimon/format/blob/blob_format_writer_test.cpp @@ -317,7 +317,7 @@ TEST_P(BlobFormatWriterTest, TestEmptyWriter) { ASSERT_OK_AND_ASSIGN(std::shared_ptr input_stream, file_system_->Open(dir_->Str() + "/file.blob")); ASSERT_TRUE(input_stream); - ASSERT_OK_AND_ASSIGN(uint64_t file_length, input_stream->Length()); + ASSERT_OK_AND_ASSIGN(int64_t file_length, input_stream->Length()); ASSERT_EQ(file_length, 5); // Should have footer even if no data std::vector buffer(file_length); ASSERT_OK_AND_ASSIGN(auto read_length, input_stream->Read(buffer.data(), buffer.size())); @@ -339,7 +339,7 @@ TEST_P(BlobFormatWriterTest, TestLargeBlob) { // Write data larger than TMP_BUFFER_SIZE (1MB) const size_t large_size = BlobFormatWriter::kTmpBufferSize * 2 + 1000; // ~2MB std::vector large_data(large_size, 'A'); - ASSERT_OK_AND_ASSIGN(uint64_t written, large_file_stream->Write(large_data.data(), large_size)); + ASSERT_OK_AND_ASSIGN(int64_t written, large_file_stream->Write(large_data.data(), large_size)); ASSERT_EQ(written, large_size); ASSERT_OK(large_file_stream->Flush()); ASSERT_OK(large_file_stream->Close()); @@ -463,7 +463,7 @@ TEST_P(BlobFormatWriterTest, TestAddBatchWithZeroLengthBlob) { ASSERT_OK_AND_ASSIGN(std::shared_ptr input_stream, file_system_->Open(dir_->Str() + "/file.blob")); ASSERT_TRUE(input_stream); - ASSERT_OK_AND_ASSIGN(uint64_t file_length, input_stream->Length()); + ASSERT_OK_AND_ASSIGN(int64_t file_length, input_stream->Length()); ASSERT_EQ(file_length, 22); std::vector buffer(file_length); ASSERT_OK_AND_ASSIGN(auto read_length, diff --git a/src/paimon/format/orc/orc_input_stream_impl.cpp b/src/paimon/format/orc/orc_input_stream_impl.cpp index f2ab45bd6..133cd2c04 100644 --- a/src/paimon/format/orc/orc_input_stream_impl.cpp +++ b/src/paimon/format/orc/orc_input_stream_impl.cpp @@ -22,6 +22,7 @@ #include "fmt/format.h" #include "orc/Exceptions.hh" #include "orc/Reader.hh" +#include "paimon/common/utils/math.h" #include "paimon/fs/file_system.h" #include "paimon/status.h" @@ -29,9 +30,10 @@ namespace paimon::orc { Result> OrcInputStreamImpl::Create( const std::shared_ptr& input_stream, uint64_t natural_read_size) { PAIMON_ASSIGN_OR_RAISE(std::string name, input_stream->GetUri()); - PAIMON_ASSIGN_OR_RAISE(uint64_t length, input_stream->Length()); - return std::unique_ptr( - new OrcInputStreamImpl(input_stream, name, length, natural_read_size)); + PAIMON_ASSIGN_OR_RAISE(int64_t length, input_stream->Length()); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(length, "file length")); + return std::unique_ptr(new OrcInputStreamImpl( + input_stream, name, static_cast(length), natural_read_size)); } OrcInputStreamImpl::OrcInputStreamImpl(const std::shared_ptr& input_stream, @@ -61,11 +63,20 @@ void OrcInputStreamImpl::read(void* buf, uint64_t length, uint64_t offset) { metrics_->IOCount.fetch_add(1); } - Result read_bytes = input_stream_->Read(static_cast(buf), length, offset); + Status status = ValidateValueInRange(length, "read length"); + if (!status.ok()) { + throw ::orc::ParseError(status.ToString()); + } + status = ValidateValueInRange(offset, "read offset"); + if (!status.ok()) { + throw ::orc::ParseError(status.ToString()); + } + Result read_bytes = input_stream_->Read( + static_cast(buf), static_cast(length), static_cast(offset)); if (!read_bytes.ok()) { throw ::orc::ParseError("read failed, status: " + read_bytes.status().ToString()); } - if (read_bytes.value() != length) { + if (read_bytes.value() != static_cast(length)) { throw ::orc::ParseError( fmt::format("read failed, expected length: {}, actual read length: {}", length, read_bytes.value())); diff --git a/src/paimon/format/orc/orc_output_stream_impl.cpp b/src/paimon/format/orc/orc_output_stream_impl.cpp index 23fab4915..ec7199c47 100644 --- a/src/paimon/format/orc/orc_output_stream_impl.cpp +++ b/src/paimon/format/orc/orc_output_stream_impl.cpp @@ -21,6 +21,7 @@ #include #include "fmt/format.h" +#include "paimon/common/utils/math.h" #include "paimon/fs/file_system.h" #include "paimon/status.h" @@ -38,21 +39,30 @@ OrcOutputStreamImpl::OrcOutputStreamImpl(const std::shared_ptr pos = output_stream_->GetPos(); + Result pos = output_stream_->GetPos(); if (!pos.ok()) { throw std::runtime_error(fmt::format("get length failed, file name {}, error msg {}", file_name_, pos.status().ToString())); } else { - return pos.value(); + Status status = ValidateValueInRange(pos.value(), "file length"); + if (!status.ok()) { + throw std::runtime_error(status.ToString()); + } + return static_cast(pos.value()); } } void OrcOutputStreamImpl::write(const void* buf, size_t length) { - Result write_len = output_stream_->Write(static_cast(buf), length); + Status status = ValidateValueInRange(length, "write length"); + if (!status.ok()) { + throw std::runtime_error(status.ToString()); + } + Result write_len = + output_stream_->Write(static_cast(buf), static_cast(length)); if (!write_len.ok()) { throw std::runtime_error("write failed, status: " + write_len.status().ToString()); } - if (write_len.value() != length) { + if (write_len.value() != static_cast(length)) { throw std::runtime_error( fmt::format("write failed, expected length: {}, actual write length: {}", length, write_len.value())); diff --git a/src/paimon/format/parquet/column_index_filter_test.cpp b/src/paimon/format/parquet/column_index_filter_test.cpp index 8249f6356..2e89d319d 100644 --- a/src/paimon/format/parquet/column_index_filter_test.cpp +++ b/src/paimon/format/parquet/column_index_filter_test.cpp @@ -248,7 +248,7 @@ class ColumnIndexFilterTest : public ::testing::Test { // Open as raw ParquetFileReader ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(file_name_)); - ASSERT_OK_AND_ASSIGN(uint64_t length, in->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, in->Length()); auto in_stream = std::make_shared(in, arrow_pool_, length); parquet_reader_ = ::parquet::ParquetFileReader::Open(in_stream); ASSERT_TRUE(parquet_reader_); diff --git a/src/paimon/format/parquet/file_reader_wrapper_test.cpp b/src/paimon/format/parquet/file_reader_wrapper_test.cpp index b4c3d5880..c7096445a 100644 --- a/src/paimon/format/parquet/file_reader_wrapper_test.cpp +++ b/src/paimon/format/parquet/file_reader_wrapper_test.cpp @@ -118,7 +118,7 @@ class FileReaderWrapperTest : public ::testing::Test { Result> PrepareReaderWrapper( const std::string& file_path, int64_t wrapper_batch_size = 0) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr in, fs_->Open(file_path)); - PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, in->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t file_length, in->Length()); auto input_stream = std::make_unique(in, arrow_pool_, file_length); ::parquet::arrow::FileReaderBuilder file_reader_builder; ::parquet::ReaderProperties reader_properties; diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp index bd693730d..b9267e59d 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp @@ -108,7 +108,7 @@ class PageFilteredRowGroupReaderTest : public ::testing::Test { std::shared_ptr* out, int32_t batch_size = 1024) { ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(file_name)); - ASSERT_OK_AND_ASSIGN(uint64_t length, in->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, in->Length()); auto in_stream = std::make_shared(in, arrow_pool_, length); std::map options; @@ -511,7 +511,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesPartialMatch) { // Open as raw ParquetFileReader ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(file_name)); - ASSERT_OK_AND_ASSIGN(uint64_t length, in->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, in->Length()); auto in_stream = std::make_shared(in, arrow_pool_, length); auto parquet_reader = ::parquet::ParquetFileReader::Open(in_stream); ASSERT_TRUE(parquet_reader); @@ -536,7 +536,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesAllMatch) { WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100); ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(file_name)); - ASSERT_OK_AND_ASSIGN(uint64_t length, in->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, in->Length()); auto in_stream = std::make_shared(in, arrow_pool_, length); auto parquet_reader = ::parquet::ParquetFileReader::Open(in_stream); @@ -562,7 +562,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesNoMatch) { WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100); ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(file_name)); - ASSERT_OK_AND_ASSIGN(uint64_t length, in->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, in->Length()); auto in_stream = std::make_shared(in, arrow_pool_, length); auto parquet_reader = ::parquet::ParquetFileReader::Open(in_stream); @@ -581,7 +581,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesMultiColumn) { WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100); ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(file_name)); - ASSERT_OK_AND_ASSIGN(uint64_t length, in->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, in->Length()); auto in_stream = std::make_shared(in, arrow_pool_, length); auto parquet_reader = ::parquet::ParquetFileReader::Open(in_stream); @@ -607,7 +607,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesMultiplePages) { WriteTestFile(file_name, data, /*write_batch_size=*/10, /*max_row_group_length=*/100); ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(file_name)); - ASSERT_OK_AND_ASSIGN(uint64_t length, in->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, in->Length()); auto in_stream = std::make_shared(in, arrow_pool_, length); auto parquet_reader = ::parquet::ParquetFileReader::Open(in_stream); diff --git a/src/paimon/format/parquet/parquet_reader_builder.h b/src/paimon/format/parquet/parquet_reader_builder.h index 0c182f6a5..8eb5e625d 100644 --- a/src/paimon/format/parquet/parquet_reader_builder.h +++ b/src/paimon/format/parquet/parquet_reader_builder.h @@ -43,7 +43,7 @@ class ParquetReaderBuilder : public ReaderBuilder { Result> Build( const std::shared_ptr& path) const override { - PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, path->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t file_length, path->Length()); std::shared_ptr arrow_pool = GetArrowPool(pool_); auto input_stream = std::make_unique(path, arrow_pool, file_length); diff --git a/src/paimon/format/parquet/parquet_stats_extractor.cpp b/src/paimon/format/parquet/parquet_stats_extractor.cpp index 1c2b4b317..8b9170c2c 100644 --- a/src/paimon/format/parquet/parquet_stats_extractor.cpp +++ b/src/paimon/format/parquet/parquet_stats_extractor.cpp @@ -273,7 +273,7 @@ ParquetStatsExtractor::ExtractWithFileInfo(const std::shared_ptr& fi const std::shared_ptr& pool) { PAIMON_ASSIGN_OR_RAISE(std::unique_ptr input_stream, file_system->Open(path)); assert(input_stream); - PAIMON_ASSIGN_OR_RAISE(uint64_t file_length, input_stream->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t file_length, input_stream->Length()); std::shared_ptr parquet_memory_pool = GetArrowPool(pool); auto parquet_input_file = std::make_shared( std::move(input_stream), parquet_memory_pool, file_length); diff --git a/src/paimon/format/parquet/predicate_pushdown_test.cpp b/src/paimon/format/parquet/predicate_pushdown_test.cpp index ff976f0b2..ef06f0627 100644 --- a/src/paimon/format/parquet/predicate_pushdown_test.cpp +++ b/src/paimon/format/parquet/predicate_pushdown_test.cpp @@ -104,7 +104,7 @@ class PredicatePushdownTest : public ::testing::Test { uint32_t predicate_node_count_limit = paimon::parquet::DEFAULT_PARQUET_READ_PREDICATE_NODE_COUNT_LIMIT) { ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(file_name_)); - ASSERT_OK_AND_ASSIGN(uint64_t length, in->Length()); + ASSERT_OK_AND_ASSIGN(int64_t length, in->Length()); auto in_stream = std::make_shared(in, arrow_pool_, length); std::map options; diff --git a/src/paimon/fs/jindo/jindo_file_status.h b/src/paimon/fs/jindo/jindo_file_status.h index 97b25a393..3ad5609c8 100644 --- a/src/paimon/fs/jindo/jindo_file_status.h +++ b/src/paimon/fs/jindo/jindo_file_status.h @@ -47,7 +47,7 @@ class JindoFileStatus : public FileStatus { return file_info_.getPath(); } - uint64_t GetLen() const override { + int64_t GetLen() const override { return file_info_.getLength(); } diff --git a/src/paimon/fs/jindo/jindo_file_system.cpp b/src/paimon/fs/jindo/jindo_file_system.cpp index 1fc94dbb7..9ee7416d8 100644 --- a/src/paimon/fs/jindo/jindo_file_system.cpp +++ b/src/paimon/fs/jindo/jindo_file_system.cpp @@ -190,7 +190,7 @@ Status JindoInputStream::Seek(int64_t offset, SeekOrigin origin) { PAIMON_ASSIGN_OR_RAISE(int64_t pos, GetPos()); PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->seek(offset + pos)); } else if (origin == FS_SEEK_END) { - PAIMON_ASSIGN_OR_RAISE(uint64_t len, Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t len, Length()); PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->seek(len + offset)); } else { return Status::Invalid("unsupported seek origin"); @@ -198,32 +198,32 @@ Status JindoInputStream::Seek(int64_t offset, SeekOrigin origin) { return Status::OK(); } -Result JindoInputStream::GetPos() const { +Result JindoInputStream::GetPos() const { int64_t pos = -1; PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->tell(pos)); if (pos < 0) { return Status::Invalid(fmt::format("jindo input position {} is less than 0", pos)); } - return static_cast(pos); + return pos; } -Result JindoInputStream::Length() const { +Result JindoInputStream::Length() const { int64_t len = -1; PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->getFileLength(len)); return len; } -Result JindoInputStream::Read(char* buffer, uint64_t size) { +Result JindoInputStream::Read(char* buffer, int64_t size) { PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->read(size, &result_, buffer)); return result_.length(); } -Result JindoInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { +Result JindoInputStream::Read(char* buffer, int64_t size, int64_t offset) { PAIMON_RETURN_NOT_OK_FROM_JINDO(reader_->pread(offset, size, &result_, buffer)); return result_.length(); } -void JindoInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, +void JindoInputStream::ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) { auto outer_callback = [=](JdoStatus status) { callback(status.ok() ? Status::OK() : Status::IOError(status.errMsg())); @@ -248,16 +248,16 @@ JindoOutputStream::JindoOutputStream(const std::shared_ptr& std::unique_ptr&& writer) : fs_(fs), writer_(std::move(writer)) {} -Result JindoOutputStream::GetPos() const { +Result JindoOutputStream::GetPos() const { int64_t pos = -1; PAIMON_RETURN_NOT_OK_FROM_JINDO(writer_->tell(pos)); if (pos < 0) { return Status::Invalid(fmt::format("jindo output position {} is less than 0", pos)); } - return static_cast(pos); + return pos; } -Result JindoOutputStream::Write(const char* buffer, uint64_t size) { +Result JindoOutputStream::Write(const char* buffer, int64_t size) { std::string_view data(buffer, size); PAIMON_RETURN_NOT_OK_FROM_JINDO(writer_->write(data)); return size; diff --git a/src/paimon/fs/jindo/jindo_file_system.h b/src/paimon/fs/jindo/jindo_file_system.h index eb4ef4bbe..3b413b3c0 100644 --- a/src/paimon/fs/jindo/jindo_file_system.h +++ b/src/paimon/fs/jindo/jindo_file_system.h @@ -66,14 +66,14 @@ class JindoInputStream : public InputStream { JindoInputStream(const std::shared_ptr& fs, std::unique_ptr&& reader); Status Seek(int64_t offset, SeekOrigin origin) override; - Result GetPos() const override; - Result Read(char* buffer, uint64_t size) override; - Result Read(char* buffer, uint64_t size, uint64_t offset) override; - void ReadAsync(char* buffer, uint64_t size, uint64_t offset, + Result GetPos() const override; + Result Read(char* buffer, int64_t size) override; + Result Read(char* buffer, int64_t size, int64_t offset) override; + void ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) override; Status Close() override; Result GetUri() const override; - Result Length() const override; + Result Length() const override; private: // The lifecycle of the fs used to create the Jindo Reader must be longer than the lifecycle of @@ -88,8 +88,8 @@ class JindoOutputStream : public OutputStream { JindoOutputStream(const std::shared_ptr& fs, std::unique_ptr&& writer); - Result GetPos() const override; - Result Write(const char* buffer, uint64_t size) override; + Result GetPos() const override; + Result Write(const char* buffer, int64_t size) override; Status Flush() override; Status Close() override; Result GetUri() const override; diff --git a/src/paimon/fs/jindo/jindo_file_system_test.cpp b/src/paimon/fs/jindo/jindo_file_system_test.cpp index e85890eb5..53d35a18e 100644 --- a/src/paimon/fs/jindo/jindo_file_system_test.cpp +++ b/src/paimon/fs/jindo/jindo_file_system_test.cpp @@ -77,7 +77,7 @@ TEST_F(JindoFileSystemTest, TestSeek) { std::string file_path = test_dir_ + "file.data"; // write process ASSERT_OK_AND_ASSIGN(auto out_stream, fs_->Create(file_path, /*overwrite=*/true)); - ASSERT_OK_AND_ASSIGN(uint64_t write_len, out_stream->Write(content.data(), content.size())); + ASSERT_OK_AND_ASSIGN(int64_t write_len, out_stream->Write(content.data(), content.size())); ASSERT_EQ(write_len, content.size()); ASSERT_OK(out_stream->Flush()); ASSERT_OK(out_stream->Close()); diff --git a/src/paimon/fs/local/local_file.cpp b/src/paimon/fs/local/local_file.cpp index c017682cd..84de3eca9 100644 --- a/src/paimon/fs/local/local_file.cpp +++ b/src/paimon/fs/local/local_file.cpp @@ -26,6 +26,7 @@ #include "fmt/format.h" #include "paimon/common/factories/io_hook.h" +#include "paimon/common/utils/math.h" #include "paimon/common/utils/path_util.h" #include "paimon/fs/local/local_file_status.h" @@ -140,7 +141,7 @@ Result> LocalFile::GetFileStatus() const { S_ISDIR(buf.st_mode)); } -Result LocalFile::Length() const { +Result LocalFile::Length() const { CHECK_HOOK(); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr file_status, GetFileStatus()); return file_status->GetLen(); @@ -166,13 +167,17 @@ const std::string& LocalFile::GetAbsolutePath() const { return path_; } -Result LocalFile::Read(char* buffer, uint64_t length, uint64_t offset) { +Result LocalFile::Read(char* buffer, int64_t length, int64_t offset) { if (file_) { CHECK_HOOK(); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(length, "read length")); + if (offset < 0) { + return Status::Invalid(fmt::format("read offset {} is less than 0", offset)); + } int32_t fd = fileno(file_); - uint64_t more = length; - uint64_t off = 0; + int64_t more = length; + int64_t off = 0; while (more > 0) { ssize_t ret = ::pread(fd, buffer + off, static_cast(more), static_cast(offset + off)); @@ -184,7 +189,7 @@ Result LocalFile::Read(char* buffer, uint64_t length, uint64_t offset) if (ret == 0) { break; } - auto read_size = static_cast(ret); + auto read_size = static_cast(ret); more -= read_size; off += read_size; } @@ -194,12 +199,13 @@ Result LocalFile::Read(char* buffer, uint64_t length, uint64_t offset) "read file '{}' fail, can not read file which is opened fail, ec: EBADF", path_)); } -Result LocalFile::Read(char* buffer, uint64_t length) { +Result LocalFile::Read(char* buffer, int64_t length) { if (file_) { CHECK_HOOK(); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(length, "read length")); - uint64_t more = length; - uint64_t off = 0; + int64_t more = length; + int64_t off = 0; while (more > 0) { size_t ret = fread(buffer + off, 1, static_cast(more), file_); if (ferror(file_) != 0) { @@ -220,12 +226,13 @@ Result LocalFile::Read(char* buffer, uint64_t length) { "read file '{}' fail, can not read file which is opened fail, ec: EBADF", path_)); } -Result LocalFile::Write(const char* buffer, uint64_t length) { +Result LocalFile::Write(const char* buffer, int64_t length) { if (file_) { CHECK_HOOK(); + PAIMON_RETURN_NOT_OK(ValidateValueInRange(length, "write length")); - uint64_t more = length; - uint64_t off = 0; + int64_t more = length; + int64_t off = 0; while (more > 0) { size_t ret = fwrite(buffer + off, 1, static_cast(more), file_); if (ferror(file_) != 0) { @@ -318,7 +325,7 @@ Status LocalFile::Seek(int64_t offset, int32_t seek_origin) { fmt::format("seek '{}' fail, can not read file which not opened, ec: EBADF", path_)); } -Result LocalFile::Tell() const { +Result LocalFile::Tell() const { if (file_) { CHECK_HOOK(); int64_t ret = ftell(file_); diff --git a/src/paimon/fs/local/local_file.h b/src/paimon/fs/local/local_file.h index fbbfe674b..93fdb1e9c 100644 --- a/src/paimon/fs/local/local_file.h +++ b/src/paimon/fs/local/local_file.h @@ -49,19 +49,19 @@ class LocalFile { LocalFile GetParentFile() const; Result Mkdir() const; Result> GetFileStatus() const; - Result Length() const; + Result Length() const; Result LastModifiedTimeMs() const; Status OpenFile(bool is_read_file); - Result Read() { + Result Read() { return Status::NotImplemented(""); } - Result Read(char* buffer, uint64_t length); - Result Read(char* buffer, uint64_t length, uint64_t offset); - Result Write(const char* buffer, uint64_t length); + Result Read(char* buffer, int64_t length); + Result Read(char* buffer, int64_t length, int64_t offset); + Result Write(const char* buffer, int64_t length); Status Flush(); Status Close(); Status Seek(int64_t offset, int32_t seek_origin); - Result Tell() const; + Result Tell() const; bool IsEmpty() const { return path_.empty(); diff --git a/src/paimon/fs/local/local_file_status.h b/src/paimon/fs/local/local_file_status.h index a35d2212a..9ad4e2b72 100644 --- a/src/paimon/fs/local/local_file_status.h +++ b/src/paimon/fs/local/local_file_status.h @@ -41,7 +41,7 @@ class LocalBasicFileStatus : public BasicFileStatus { class LocalFileStatus : public FileStatus { public: - LocalFileStatus(const std::string& path, uint64_t length, int64_t last_modification_time, + LocalFileStatus(const std::string& path, int64_t length, int64_t last_modification_time, bool is_dir) : path_(path), length_(length), @@ -52,7 +52,7 @@ class LocalFileStatus : public FileStatus { return path_; } - uint64_t GetLen() const override { + int64_t GetLen() const override { return length_; } @@ -66,7 +66,7 @@ class LocalFileStatus : public FileStatus { private: std::string path_; - uint64_t length_; + int64_t length_; int64_t last_modification_time_; bool is_dir_; }; diff --git a/src/paimon/fs/local/local_file_system.cpp b/src/paimon/fs/local/local_file_system.cpp index 2be2fd02f..e56daeaa6 100644 --- a/src/paimon/fs/local/local_file_system.cpp +++ b/src/paimon/fs/local/local_file_system.cpp @@ -259,12 +259,12 @@ Status LocalInputStream::Seek(int64_t offset, SeekOrigin origin) { return file_.Seek(offset, convert_origin(origin)); } -Result LocalInputStream::GetPos() const { +Result LocalInputStream::GetPos() const { return file_.Tell(); } -Result LocalInputStream::Read(char* buffer, uint64_t size) { - PAIMON_ASSIGN_OR_RAISE(uint64_t read_length, file_.Read(buffer, size)); +Result LocalInputStream::Read(char* buffer, int64_t size) { + PAIMON_ASSIGN_OR_RAISE(int64_t read_length, file_.Read(buffer, size)); if (read_length != size) { return Status::IOError(fmt::format("file '{}' read size {} != expected {}", file_.GetAbsolutePath(), read_length, size)); @@ -272,8 +272,8 @@ Result LocalInputStream::Read(char* buffer, uint64_t size) { return read_length; } -Result LocalInputStream::Read(char* buffer, uint64_t size, uint64_t offset) { - PAIMON_ASSIGN_OR_RAISE(uint64_t read_length, file_.Read(buffer, size, offset)); +Result LocalInputStream::Read(char* buffer, int64_t size, int64_t offset) { + PAIMON_ASSIGN_OR_RAISE(int64_t read_length, file_.Read(buffer, size, offset)); if (read_length != size) { return Status::IOError(fmt::format("file '{}' read size {} != expected {}", file_.GetAbsolutePath(), read_length, size)); @@ -281,9 +281,9 @@ Result LocalInputStream::Read(char* buffer, uint64_t size, uint64_t of return read_length; } -void LocalInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, +void LocalInputStream::ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) { - Result read_size = Read(buffer, size, offset); + Result read_size = Read(buffer, size, offset); Status status = Status::OK(); if (!read_size.ok()) { status = read_size.status(); @@ -293,7 +293,7 @@ void LocalInputStream::ReadAsync(char* buffer, uint64_t size, uint64_t offset, callback(status); } -Result LocalInputStream::Length() const { +Result LocalInputStream::Length() const { return file_.Length(); } @@ -309,10 +309,10 @@ Result> LocalOutputStream::Create(LocalFile& LocalOutputStream::LocalOutputStream(const LocalFile& file) : file_(file) {} -Result LocalOutputStream::GetPos() const { +Result LocalOutputStream::GetPos() const { return file_.Tell(); } -Result LocalOutputStream::Write(const char* buffer, uint64_t size) { +Result LocalOutputStream::Write(const char* buffer, int64_t size) { return file_.Write(buffer, size); } Status LocalOutputStream::Flush() { diff --git a/src/paimon/fs/local/local_file_system.h b/src/paimon/fs/local/local_file_system.h index 28715b30c..6cca1935e 100644 --- a/src/paimon/fs/local/local_file_system.h +++ b/src/paimon/fs/local/local_file_system.h @@ -71,17 +71,17 @@ class LocalInputStream : public InputStream { static Result> Create(LocalFile& file); Status Seek(int64_t offset, SeekOrigin origin) override; - Result GetPos() const override; - Result Read(char* buffer, uint64_t size) override; - Result Read(char* buffer, uint64_t size, uint64_t offset) override; - void ReadAsync(char* buffer, uint64_t size, uint64_t offset, + Result GetPos() const override; + Result Read(char* buffer, int64_t size) override; + Result Read(char* buffer, int64_t size, int64_t offset) override; + void ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) override; Status Close() override; Result GetUri() const override { return file_.GetAbsolutePath(); } - Result Length() const override; + Result Length() const override; private: explicit LocalInputStream(const LocalFile& file); @@ -93,8 +93,8 @@ class LocalOutputStream : public OutputStream { public: static Result> Create(LocalFile& file); - Result GetPos() const override; - Result Write(const char* buffer, uint64_t size) override; + Result GetPos() const override; + Result Write(const char* buffer, int64_t size) override; Status Flush() override; Status Close() override; Result GetUri() const override { diff --git a/src/paimon/fs/local/local_file_test.cpp b/src/paimon/fs/local/local_file_test.cpp index 53c933d6e..a20c79d3d 100644 --- a/src/paimon/fs/local/local_file_test.cpp +++ b/src/paimon/fs/local/local_file_test.cpp @@ -47,7 +47,7 @@ TEST(LocalFileTest, TestReadWriteEmptyContent) { const char* str = ""; const uint64_t str_size = 0; - ASSERT_OK_AND_ASSIGN(uint64_t write_size, file.Write(str, str_size)); + ASSERT_OK_AND_ASSIGN(int64_t write_size, file.Write(str, str_size)); ASSERT_EQ(write_size, str_size); ASSERT_OK(file.Flush()); @@ -58,7 +58,7 @@ TEST(LocalFileTest, TestReadWriteEmptyContent) { LocalFile file2(path); ASSERT_OK(file2.OpenFile(/*is_read_file=*/true)); char buffer[10]; - ASSERT_OK_AND_ASSIGN(uint64_t read_len, file2.Read(buffer, 10)); + ASSERT_OK_AND_ASSIGN(int64_t read_len, file2.Read(buffer, 10)); ASSERT_EQ(0, read_len); } @@ -84,7 +84,7 @@ TEST(LocalFileTest, TestSimple) { const char* str = "test_data"; const uint64_t str_size = 9; - ASSERT_OK_AND_ASSIGN(uint64_t write_size, file.Write(str, str_size)); + ASSERT_OK_AND_ASSIGN(int64_t write_size, file.Write(str, str_size)); ASSERT_EQ(write_size, str_size); ASSERT_OK(file.Flush()); @@ -108,7 +108,7 @@ TEST(LocalFileTest, TestSimple) { ASSERT_OK(file2.OpenFile(true)); char str_read[str_size + 1]; { - ASSERT_OK_AND_ASSIGN(uint64_t read_size, file2.Read(str_read, 4)); + ASSERT_OK_AND_ASSIGN(int64_t read_size, file2.Read(str_read, 4)); ASSERT_EQ(read_size, 4); str_read[read_size] = '\0'; ASSERT_EQ(strcmp(str_read, "test"), 0); @@ -117,13 +117,13 @@ TEST(LocalFileTest, TestSimple) { ASSERT_OK_AND_ASSIGN(uint64_t pos, file2.Tell()); ASSERT_EQ(pos, 4); ASSERT_OK(file2.Seek(5, SEEK_SET)); - ASSERT_OK_AND_ASSIGN(uint64_t read_size, file2.Read(str_read, 4)); + ASSERT_OK_AND_ASSIGN(int64_t read_size, file2.Read(str_read, 4)); ASSERT_EQ(read_size, 4); str_read[read_size] = '\0'; ASSERT_EQ(strcmp(str_read, "data"), 0); } { - ASSERT_OK_AND_ASSIGN(uint64_t read_size, file2.Read(str_read, str_size, 0)); + ASSERT_OK_AND_ASSIGN(int64_t read_size, file2.Read(str_read, str_size, 0)); ASSERT_EQ(read_size, str_size); str_read[read_size] = '\0'; ASSERT_EQ(strcmp(str_read, "test_data"), 0); diff --git a/src/paimon/global_index/lumina/lumina_file_reader.h b/src/paimon/global_index/lumina/lumina_file_reader.h index 8e36dbc40..d52134a78 100644 --- a/src/paimon/global_index/lumina/lumina_file_reader.h +++ b/src/paimon/global_index/lumina/lumina_file_reader.h @@ -21,6 +21,7 @@ #include "fmt/format.h" #include "lumina/io/FileReader.h" +#include "paimon/common/utils/math.h" #include "paimon/fs/file_system.h" #include "paimon/global_index/lumina/lumina_utils.h" namespace paimon::lumina { @@ -30,32 +31,49 @@ class LuminaFileReader : public ::lumina::io::FileReader { ~LuminaFileReader() override = default; ::lumina::core::Result GetLength() const noexcept override { - Result length_result = in_->Length(); + Result length_result = in_->Length(); if (!length_result.ok()) { return ::lumina::core::Result::Err( PaimonToLuminaStatus(length_result.status())); } - return ::lumina::core::Result::Ok(length_result.value()); + Status status = ValidateValueInRange(length_result.value(), "file length"); + if (!status.ok()) { + return ::lumina::core::Result::Err(PaimonToLuminaStatus(status)); + } + return ::lumina::core::Result::Ok(static_cast(length_result.value())); } ::lumina::core::Result GetPosition() const noexcept override { - Result pos_result = in_->GetPos(); + Result pos_result = in_->GetPos(); if (!pos_result.ok()) { return ::lumina::core::Result::Err(PaimonToLuminaStatus(pos_result.status())); } - return ::lumina::core::Result::Ok(pos_result.value()); + Status status = ValidateValueInRange(pos_result.value(), "file position"); + if (!status.ok()) { + return ::lumina::core::Result::Err(PaimonToLuminaStatus(status)); + } + return ::lumina::core::Result::Ok(static_cast(pos_result.value())); } ::lumina::core::Status Seek(uint64_t position) noexcept override { - return PaimonToLuminaStatus(in_->Seek(position, SeekOrigin::FS_SEEK_SET)); + Status status = ValidateValueInRange(position, "seek position"); + if (!status.ok()) { + return PaimonToLuminaStatus(status); + } + return PaimonToLuminaStatus( + in_->Seek(static_cast(position), SeekOrigin::FS_SEEK_SET)); } ::lumina::core::Status Read(char* data, uint64_t size) noexcept override { - Result read_result = in_->Read(data, size); + Status status = ValidateValueInRange(size, "read size"); + if (!status.ok()) { + return PaimonToLuminaStatus(status); + } + Result read_result = in_->Read(data, static_cast(size)); if (!read_result.ok()) { return PaimonToLuminaStatus(read_result.status()); } - if (read_result.value() != size) { + if (read_result.value() != static_cast(size)) { return ::lumina::core::Status( ::lumina::core::ErrorCode::IoError, fmt::format("expect read len {} mismatch actual read len {}", size, @@ -71,7 +89,17 @@ class LuminaFileReader : public ::lumina::io::FileReader { return; } - in_->ReadAsync(data, size, offset, + Status status = ValidateValueInRange(size, "read size"); + if (!status.ok()) { + call_back(PaimonToLuminaStatus(status)); + return; + } + status = ValidateValueInRange(offset, "read offset"); + if (!status.ok()) { + call_back(PaimonToLuminaStatus(status)); + return; + } + in_->ReadAsync(data, static_cast(size), static_cast(offset), [call_back = std::move(call_back)](const Status& status) { call_back(PaimonToLuminaStatus(status)); }); diff --git a/src/paimon/global_index/lumina/lumina_file_writer.h b/src/paimon/global_index/lumina/lumina_file_writer.h index c787ca52c..7d316839e 100644 --- a/src/paimon/global_index/lumina/lumina_file_writer.h +++ b/src/paimon/global_index/lumina/lumina_file_writer.h @@ -20,6 +20,7 @@ #include "fmt/format.h" #include "lumina/io/FileWriter.h" +#include "paimon/common/utils/math.h" #include "paimon/fs/file_system.h" #include "paimon/global_index/lumina/lumina_utils.h" namespace paimon::lumina { @@ -29,19 +30,27 @@ class LuminaFileWriter : public ::lumina::io::FileWriter { ~LuminaFileWriter() override = default; ::lumina::core::Result GetLength() const noexcept override { - Result pos_result = out_->GetPos(); + Result pos_result = out_->GetPos(); if (!pos_result.ok()) { return ::lumina::core::Result::Err(PaimonToLuminaStatus(pos_result.status())); } - return ::lumina::core::Result::Ok(pos_result.value()); + Status status = ValidateValueInRange(pos_result.value(), "file length"); + if (!status.ok()) { + return ::lumina::core::Result::Err(PaimonToLuminaStatus(status)); + } + return ::lumina::core::Result::Ok(static_cast(pos_result.value())); } ::lumina::core::Status Write(const char* data, uint64_t size) noexcept override { - Result write_result = out_->Write(data, size); + Status status = ValidateValueInRange(size, "write size"); + if (!status.ok()) { + return PaimonToLuminaStatus(status); + } + Result write_result = out_->Write(data, static_cast(size)); if (!write_result.ok()) { return PaimonToLuminaStatus(write_result.status()); } - if (write_result.value() != size) { + if (write_result.value() != static_cast(size)) { return ::lumina::core::Status( ::lumina::core::ErrorCode::IoError, fmt::format("expect write len {} mismatch actual write len {}", size, diff --git a/src/paimon/testing/mock/mock_file_system.h b/src/paimon/testing/mock/mock_file_system.h index 50397aaea..6cf139484 100644 --- a/src/paimon/testing/mock/mock_file_system.h +++ b/src/paimon/testing/mock/mock_file_system.h @@ -32,16 +32,16 @@ class MockInputStream : public InputStream { Status Seek(int64_t offset, SeekOrigin origin) override { return Status::OK(); } - Result GetPos() const override { + Result GetPos() const override { return 0; } - Result Read(char* buffer, uint64_t size) override { + Result Read(char* buffer, int64_t size) override { return 0; } - Result Read(char* buffer, uint64_t size, uint64_t offset) override { + Result Read(char* buffer, int64_t size, int64_t offset) override { return 0; } - void ReadAsync(char* buffer, uint64_t size, uint64_t offset, + void ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) override {} Status Close() override { @@ -50,7 +50,7 @@ class MockInputStream : public InputStream { Result GetUri() const override { return std::string(); } - Result Length() const override { + Result Length() const override { return 0; } }; @@ -60,10 +60,10 @@ class MockOutputStream : public OutputStream { MockOutputStream() = default; ~MockOutputStream() override = default; - Result GetPos() const override { + Result GetPos() const override { return 0; } - Result Write(const char* buffer, uint64_t size) override { + Result Write(const char* buffer, int64_t size) override { return 0; } Status Flush() override { @@ -85,7 +85,7 @@ class MockFileStatus : public FileStatus { std::string GetPath() const override { return ""; } - uint64_t GetLen() const override { + int64_t GetLen() const override { return 0; } int64_t GetModificationTime() const override { diff --git a/src/paimon/testing/mock/mock_format_writer.cpp b/src/paimon/testing/mock/mock_format_writer.cpp index 28a4fbab9..c0d0aea9e 100644 --- a/src/paimon/testing/mock/mock_format_writer.cpp +++ b/src/paimon/testing/mock/mock_format_writer.cpp @@ -38,8 +38,8 @@ MockFormatWriter::MockFormatWriter(const std::shared_ptr& out, Status MockFormatWriter::AddBatch(ArrowArray* batch) { ArrowArrayRelease(batch); std::string str = std::to_string(DateTimeUtils::GetCurrentUTCTimeUs()) + "\n"; - PAIMON_ASSIGN_OR_RAISE(uint64_t res, out_->Write(str.data(), str.size())); - if (res != str.size()) { + PAIMON_ASSIGN_OR_RAISE(int64_t res, out_->Write(str.data(), str.size())); + if (res != static_cast(str.size())) { return Status::IOError("write size does not match"); } counter_++; diff --git a/src/paimon/testing/utils/test_helper.h b/src/paimon/testing/utils/test_helper.h index a837ec2a5..832055a51 100644 --- a/src/paimon/testing/utils/test_helper.h +++ b/src/paimon/testing/utils/test_helper.h @@ -184,8 +184,8 @@ class TestHelper { PAIMON_ASSIGN_OR_RAISE(auto result_stream, result_blobs[i]->NewInputStream(fs)); PAIMON_ASSIGN_OR_RAISE(auto expected_stream, expected_blobs[i]->NewInputStream(fs)); - PAIMON_ASSIGN_OR_RAISE(uint64_t result_length, result_stream->Length()); - PAIMON_ASSIGN_OR_RAISE(uint64_t expected_length, expected_stream->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t result_length, result_stream->Length()); + PAIMON_ASSIGN_OR_RAISE(int64_t expected_length, expected_stream->Length()); if (result_length != expected_length) { auto result_descriptor_bytes = result_blobs[i]->ToDescriptor(GetDefaultPool()); auto expected_descriptor_bytes = expected_blobs[i]->ToDescriptor(GetDefaultPool()); diff --git a/test/inte/read_inte_test.cpp b/test/inte/read_inte_test.cpp index 7ca302581..967ec0156 100644 --- a/test/inte/read_inte_test.cpp +++ b/test/inte/read_inte_test.cpp @@ -2952,18 +2952,18 @@ TEST_P(ReadInteTest, TestSpecificFs) { Status Seek(int64_t offset, SeekOrigin origin) override { return input_->Seek(offset, origin); } - Result GetPos() const override { + Result GetPos() const override { return input_->GetPos(); } - Result Read(char* buffer, uint64_t size) override { + Result Read(char* buffer, int64_t size) override { (*io_count_)++; return input_->Read(buffer, size); } - Result Read(char* buffer, uint64_t size, uint64_t offset) override { + Result Read(char* buffer, int64_t size, int64_t offset) override { (*io_count_)++; return input_->Read(buffer, size, offset); } - void ReadAsync(char* buffer, uint64_t size, uint64_t offset, + void ReadAsync(char* buffer, int64_t size, int64_t offset, std::function&& callback) override { (*io_count_)++; return input_->ReadAsync(buffer, size, offset, std::move(callback)); @@ -2975,7 +2975,7 @@ TEST_P(ReadInteTest, TestSpecificFs) { Result GetUri() const override { return input_->GetUri(); } - Result Length() const override { + Result Length() const override { return input_->Length(); }