From 4f4aca5f32e93729f1cc6acf6767901e47fcf4aa Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 22:01:35 +0000 Subject: [PATCH] vortex-row: add SortField and RowEncodeOptions Introduce the per-column sort-field options and the variadic-function options struct used by the upcoming RowSize / RowEncode scalar functions. `RowEncodeOptions::fields` uses a `SmallVec<[SortField; 4]>` so typical 1-4 column keys avoid a heap allocation. Includes a compact serialize / deserialize helper used later by the scalar-function metadata round-trip. Signed-off-by: Claude --- Cargo.lock | 2 + vortex-row/Cargo.toml | 2 + vortex-row/public-api.lock | 156 ++++++++++++++++++++++++++++++++++++ vortex-row/src/lib.rs | 4 + vortex-row/src/options.rs | 157 +++++++++++++++++++++++++++++++++++++ 5 files changed, 321 insertions(+) create mode 100644 vortex-row/src/options.rs diff --git a/Cargo.lock b/Cargo.lock index 63a608277d3..ce2b4e6d41b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11038,6 +11038,8 @@ dependencies = [ name = "vortex-row" version = "0.1.0" dependencies = [ + "smallvec", + "vortex-error", "vortex-session", ] diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml index 7515715392c..3e314fd7697 100644 --- a/vortex-row/Cargo.toml +++ b/vortex-row/Cargo.toml @@ -17,4 +17,6 @@ version = { workspace = true } workspace = true [dependencies] +smallvec = { workspace = true } +vortex-error = { workspace = true } vortex-session = { workspace = true } diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock index d507aa46a00..998a7712f2d 100644 --- a/vortex-row/public-api.lock +++ b/vortex-row/public-api.lock @@ -1,3 +1,159 @@ pub mod vortex_row +pub mod vortex_row::options + +pub struct vortex_row::options::RowEncodeOptions + +pub vortex_row::options::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]> + +impl vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::new(impl 
core::iter::traits::collect::IntoIterator) -> Self + +impl core::clone::Clone for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::clone(&self) -> vortex_row::options::RowEncodeOptions + +impl core::cmp::Eq for vortex_row::options::RowEncodeOptions + +impl core::cmp::PartialEq for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::eq(&self, &vortex_row::options::RowEncodeOptions) -> bool + +impl core::fmt::Debug for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&self, &mut __H) + +impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions + +pub struct vortex_row::options::SortField + +pub vortex_row::options::SortField::descending: bool + +pub vortex_row::options::SortField::nulls_first: bool + +impl vortex_row::options::SortField + +pub fn vortex_row::options::SortField::new(bool, bool) -> Self + +pub fn vortex_row::options::SortField::non_null_sentinel(&self) -> u8 + +pub fn vortex_row::options::SortField::null_sentinel(&self) -> u8 + +impl core::clone::Clone for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::clone(&self) -> vortex_row::options::SortField + +impl core::cmp::Eq for vortex_row::options::SortField + +impl core::cmp::PartialEq for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::eq(&self, &vortex_row::options::SortField) -> bool + +impl core::default::Default for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::default() -> Self + +impl core::fmt::Debug for 
vortex_row::options::SortField + +pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::hash<__H: core::hash::Hasher>(&self, &mut __H) + +impl core::marker::Copy for vortex_row::options::SortField + +impl core::marker::StructuralPartialEq for vortex_row::options::SortField + +pub const vortex_row::options::FIELDS_INLINE: usize + +pub struct vortex_row::RowEncodeOptions + +pub vortex_row::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]> + +impl vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::new(impl core::iter::traits::collect::IntoIterator) -> Self + +impl core::clone::Clone for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::clone(&self) -> vortex_row::options::RowEncodeOptions + +impl core::cmp::Eq for vortex_row::options::RowEncodeOptions + +impl core::cmp::PartialEq for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::eq(&self, &vortex_row::options::RowEncodeOptions) -> bool + +impl core::fmt::Debug for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_row::options::RowEncodeOptions + +pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&self, &mut __H) + +impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions + +pub struct 
vortex_row::SortField + +pub vortex_row::SortField::descending: bool + +pub vortex_row::SortField::nulls_first: bool + +impl vortex_row::options::SortField + +pub fn vortex_row::options::SortField::new(bool, bool) -> Self + +pub fn vortex_row::options::SortField::non_null_sentinel(&self) -> u8 + +pub fn vortex_row::options::SortField::null_sentinel(&self) -> u8 + +impl core::clone::Clone for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::clone(&self) -> vortex_row::options::SortField + +impl core::cmp::Eq for vortex_row::options::SortField + +impl core::cmp::PartialEq for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::eq(&self, &vortex_row::options::SortField) -> bool + +impl core::default::Default for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::default() -> Self + +impl core::fmt::Debug for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_row::options::SortField + +pub fn vortex_row::options::SortField::hash<__H: core::hash::Hasher>(&self, &mut __H) + +impl core::marker::Copy for vortex_row::options::SortField + +impl core::marker::StructuralPartialEq for vortex_row::options::SortField + pub fn vortex_row::initialize(&vortex_session::VortexSession) diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs index f675ca12f4d..9e62f25caf2 100644 --- a/vortex-row/src/lib.rs +++ b/vortex-row/src/lib.rs @@ -6,6 +6,10 @@ //! Subsequent commits add the encoder, decoder helpers, and per-encoding fast paths. //! This commit only establishes the crate skeleton and an `initialize` stub. 
+pub mod options;
+
+pub use options::RowEncodeOptions;
+pub use options::SortField;
 use vortex_session::VortexSession;
 
 /// Register the row-encoding scalar functions on the given session.
diff --git a/vortex-row/src/options.rs b/vortex-row/src/options.rs
new file mode 100644
index 00000000000..a9e5e2b18ab
--- /dev/null
+++ b/vortex-row/src/options.rs
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt::Display;
+use std::fmt::Formatter;
+
+use smallvec::SmallVec;
+
+/// Per-column options for the row-oriented byte encoder.
+///
+/// These options control how a single column is encoded into row bytes:
+/// - `descending`: if true, the encoded value bytes are bit-inverted so that
+///   lexicographic byte comparison reflects the reverse of the natural ordering.
+///   The null sentinel byte is NOT inverted, so nulls keep their requested
+///   position relative to non-nulls.
+/// - `nulls_first`: if true, nulls sort before non-nulls. If false, nulls sort
+///   after non-nulls. Implemented via the sentinel byte that precedes every
+///   value's encoded bytes.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct SortField {
+    /// If true, encoded value bytes are bit-inverted so lexicographic byte
+    /// comparison reflects the reverse of the natural ordering.
+    pub descending: bool,
+    /// If true, nulls sort before non-null values; otherwise nulls sort after.
+    pub nulls_first: bool,
+}
+
+impl Default for SortField {
+    fn default() -> Self {
+        Self {
+            descending: false,
+            nulls_first: true,
+        }
+    }
+}
+
+impl Display for SortField {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "descending={}, nulls_first={}",
+            self.descending, self.nulls_first
+        )
+    }
+}
+
+impl SortField {
+    /// Construct a new `SortField` with explicit options.
+    pub fn new(descending: bool, nulls_first: bool) -> Self {
+        Self {
+            descending,
+            nulls_first,
+        }
+    }
+
+    /// Returns the sentinel byte to write for a non-null value.
+    #[inline]
+    pub fn non_null_sentinel(&self) -> u8 {
+        // Non-null is always 0x01. Null choices are < or > 0x01.
+        0x01
+    }
+
+    /// Returns the sentinel byte to write for a null value.
+    #[inline]
+    pub fn null_sentinel(&self) -> u8 {
+        if self.nulls_first {
+            // Nulls before non-nulls (smaller byte sorts first).
+            0x00
+        } else {
+            // Nulls after non-nulls (larger byte sorts later).
+            0x02
+        }
+    }
+}
+
+/// Inline capacity for [`RowEncodeOptions::fields`]. Up to this many [`SortField`]s
+/// are held inline without a heap allocation; beyond, the storage spills.
+pub const FIELDS_INLINE: usize = 4;
+
+/// Options for the variadic [`RowSize`] and [`RowEncode`] scalar functions:
+/// one [`SortField`] per input column.
+///
+/// Stored in a [`SmallVec`] so that typical 1–4 column keys avoid a heap
+/// allocation; longer field lists spill to the heap transparently.
+///
+/// [`RowSize`]: super::size::RowSize
+/// [`RowEncode`]: super::encode::RowEncode
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct RowEncodeOptions {
+    /// Per-column sort fields, in left-to-right column order.
+    pub fields: SmallVec<[SortField; FIELDS_INLINE]>,
+}
+
+impl RowEncodeOptions {
+    /// Construct a new `RowEncodeOptions` from any iterator of [`SortField`]s.
+    pub fn new(fields: impl IntoIterator<Item = SortField>) -> Self {
+        Self {
+            fields: fields.into_iter().collect(),
+        }
+    }
+}
+
+impl Display for RowEncodeOptions {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[")?;
+        for (i, field) in self.fields.iter().enumerate() {
+            if i > 0 {
+                write!(f, ", ")?;
+            }
+            write!(f, "{}", field)?;
+        }
+        write!(f, "]")
+    }
+}
+
+/// Serialize a [`RowEncodeOptions`] to a compact byte vector: 4-byte LE length followed by
+/// `2 * len` bytes (descending + nulls_first booleans for each field).
+pub(crate) fn serialize_row_encode_options(opts: &RowEncodeOptions) -> Vec<u8> {
+    use vortex_error::VortexExpect;
+    let n =
+        u32::try_from(opts.fields.len()).vortex_expect("RowEncodeOptions length must fit in u32");
+    let mut out = Vec::with_capacity(4 + 2 * opts.fields.len());
+    out.extend_from_slice(&n.to_le_bytes());
+    for f in &opts.fields {
+        out.push(u8::from(f.descending));
+        out.push(u8::from(f.nulls_first));
+    }
+    out
+}
+
+/// Deserialize a [`RowEncodeOptions`] produced by [`serialize_row_encode_options`].
+pub(crate) fn deserialize_row_encode_options(
+    bytes: &[u8],
+) -> vortex_error::VortexResult<RowEncodeOptions> {
+    if bytes.len() < 4 {
+        vortex_error::vortex_bail!("RowEncodeOptions metadata must contain a 4-byte length prefix");
+    }
+    let n = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
+    // Widen to u64: `4 + 2 * n` can overflow `usize` on 32-bit targets.
+    let expected = 4 + 2 * u64::from(n);
+    if bytes.len() as u64 != expected {
+        vortex_error::vortex_bail!(
+            "RowEncodeOptions metadata wrong size: got {}, expected {}",
+            bytes.len(),
+            expected
+        );
+    }
+    // The length check above guarantees exactly `n` two-byte pairs follow the prefix.
+    let fields = bytes[4..]
+        .chunks_exact(2)
+        .map(|pair| SortField {
+            descending: pair[0] != 0,
+            nulls_first: pair[1] != 0,
+        })
+        .collect();
+    Ok(RowEncodeOptions { fields })
+}