From 2d1c59ba4cd834e48b08aadc0807608aafc38afd Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 15 May 2026 20:55:35 +0100 Subject: [PATCH 1/6] Add Vortex file metadata segments Signed-off-by: "Nicholas Gates" --- vortex-file/public-api.lock | 12 ++ vortex-file/src/file.rs | 11 ++ vortex-file/src/footer/deserializer.rs | 60 +++++++- vortex-file/src/footer/mod.rs | 18 +++ vortex-file/src/footer/postscript.rs | 72 ++++++++++ vortex-file/src/footer/serializer.rs | 40 ++++++ vortex-file/src/lib.rs | 13 +- vortex-file/src/tests.rs | 46 ++++++ vortex-file/src/writer.rs | 60 ++++++-- .../flatbuffers/vortex-file/footer.fbs | 8 ++ vortex-flatbuffers/public-api.lock | 76 ++++++++++ vortex-flatbuffers/src/generated/footer.rs | 134 ++++++++++++++++++ 12 files changed, 533 insertions(+), 17 deletions(-) diff --git a/vortex-file/public-api.lock b/vortex-file/public-api.lock index bb2a9b00d81..6220703af51 100644 --- a/vortex-file/public-api.lock +++ b/vortex-file/public-api.lock @@ -184,6 +184,10 @@ pub fn vortex_file::Footer::into_serializer(self) -> vortex_file::FooterSerializ pub fn vortex_file::Footer::layout(&self) -> &vortex_layout::layout::LayoutRef +pub fn vortex_file::Footer::metadata_segment(&self, &str) -> core::option::Option<&vortex_buffer::ByteBuffer> + +pub fn vortex_file::Footer::metadata_segments(&self) -> &[(alloc::string::String, vortex_buffer::ByteBuffer)] + pub fn vortex_file::Footer::row_count(&self) -> u64 pub fn vortex_file::Footer::segment_map(&self) -> &alloc::sync::Arc<[vortex_file::SegmentSpec]> @@ -276,6 +280,10 @@ pub fn vortex_file::VortexFile::footer(&self) -> &vortex_file::Footer pub fn vortex_file::VortexFile::layout_reader(&self) -> vortex_error::VortexResult> +pub fn vortex_file::VortexFile::metadata_segment(&self, &str) -> core::option::Option<&vortex_buffer::ByteBuffer> + +pub fn vortex_file::VortexFile::metadata_segments(&self) -> &[(alloc::string::String, vortex_buffer::ByteBuffer)] + pub fn vortex_file::VortexFile::row_count(&self) -> u64 pub fn vortex_file::VortexFile::scan(&self) -> vortex_error::VortexResult> @@ -338,6 +346,10 @@ pub fn vortex_file::VortexWriteOptions::new(vortex_session::VortexSession) -> Se pub fn vortex_file::VortexWriteOptions::with_file_statistics(self, alloc::vec::Vec) -> Self +pub fn vortex_file::VortexWriteOptions::with_metadata_segment(self, impl core::convert::Into, impl core::convert::Into) -> Self + +pub fn vortex_file::VortexWriteOptions::with_metadata_segments(self, I) -> Self where I: core::iter::traits::collect::IntoIterator, K: core::convert::Into, B: core::convert::Into + pub fn vortex_file::VortexWriteOptions::with_strategy(self, alloc::sync::Arc) -> Self pub struct vortex_file::WriteStrategyBuilder diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index f321d774197..8dc91605bd5 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -23,6 +23,7 @@ use vortex_array::dtype::FieldPathSet; use vortex_array::expr::Expression; use vortex_array::expr::pruning::checked_pruning_expr; use vortex_array::scalar_fn::internal::row_count::substitute_row_count; +use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use vortex_layout::LayoutReader; use vortex_layout::scan::layout::LayoutReaderDataSource; @@ -76,6 +77,16 @@ impl VortexFile { self.footer.statistics() } + /// Returns the user-defined metadata segments stored in this file. + pub fn metadata_segments(&self) -> &[(String, ByteBuffer)] { + self.footer.metadata_segments() + } + + /// Returns the user-defined metadata segment for the given key. + pub fn metadata_segment(&self, key: &str) -> Option<&ByteBuffer> { + self.footer.metadata_segment(key) + } + /// Create a new segment source for reading from the file. /// /// This may spawn a background I/O driver that will exit when the returned segment source diff --git a/vortex-file/src/footer/deserializer.rs b/vortex-file/src/footer/deserializer.rs index 769b94c715b..d15eb5775bc 100644 --- a/vortex-file/src/footer/deserializer.rs +++ b/vortex-file/src/footer/deserializer.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::sync::Arc; + use flatbuffers::root; use vortex_array::dtype::DType; use vortex_buffer::ByteBuffer; @@ -19,6 +21,7 @@ use crate::MAGIC_BYTES; use crate::VERSION; use crate::footer::FileStatistics; use crate::footer::postscript::Postscript; +use crate::footer::postscript::PostscriptMetadata; use crate::footer::postscript::PostscriptSegment; /// Deserialize a footer from the end of a Vortex file or created from a @@ -119,6 +122,9 @@ impl FooterDeserializer { if let Some(stats_segment) = &postscript.statistics { read_more_offset = read_more_offset.min(stats_segment.offset); } + for metadata in &postscript.metadata { + read_more_offset = read_more_offset.min(metadata.segment.offset); + } read_more_offset = read_more_offset.min(postscript.layout.offset); read_more_offset = read_more_offset.min(postscript.footer.offset); @@ -151,14 +157,20 @@ impl FooterDeserializer { ) }) .transpose()?; + let metadata: Arc<[(String, ByteBuffer)]> = postscript + .metadata + .iter() + .map(|metadata| self.parse_metadata_segment(initial_offset, &self.buffer, metadata)) + .collect::>>()? + .into(); Ok(DeserializeStep::Done(self.parse_footer( initial_offset, &self.buffer, - &postscript.footer, - &postscript.layout, + postscript, dtype, file_stats, + metadata, )?)) } @@ -238,27 +250,65 @@ impl FooterDeserializer { FileStatistics::from_flatbuffer(&fb, dtype, session) } + /// Parse a user-defined metadata segment from the initial read buffer. + fn parse_metadata_segment( + &self, + initial_offset: u64, + initial_read: &[u8], + metadata: &PostscriptMetadata, + ) -> VortexResult<(String, ByteBuffer)> { + let offset = usize::try_from(metadata.segment.offset - initial_offset)?; + let length = metadata.segment.length as usize; + let end = offset + .checked_add(length) + .ok_or_else(|| vortex_err!("Metadata segment range overflowed usize"))?; + + if end > initial_read.len() { + vortex_bail!( + "Metadata segment {} range {}..{} out of bounds for initial read of length {}", + metadata.key, + offset, + end, + initial_read.len() + ); + } + + Ok(( + metadata.key.clone(), + ByteBuffer::copy_from(&initial_read[offset..end]).aligned(metadata.segment.alignment), + )) + } + /// Parse the rest of the footer from the initial read. fn parse_footer( &self, initial_offset: u64, initial_read: &[u8], - footer_segment: &PostscriptSegment, - layout_segment: &PostscriptSegment, + postscript: &Postscript, dtype: DType, file_stats: Option, + metadata: Arc<[(String, ByteBuffer)]>, ) -> VortexResult