From 3335d80d3bd8d40404a8ad0c39c6e26f3d908ddf Mon Sep 17 00:00:00 2001 From: Alfonso Subiotto Marques Date: Tue, 3 Mar 2026 22:01:25 +0100 Subject: [PATCH 1/3] feat[vortex-runend]: handle canonicalizing Ree This allows writing these kinds of types. I'm unaware of the motivation behind restricting ree values to just primitive and bool prior to this commit. Signed-off-by: Alfonso Subiotto Marques --- encodings/runend/benches/run_end_compress.rs | 33 ++++++++- encodings/runend/src/arbitrary.rs | 3 +- encodings/runend/src/array.rs | 76 +++++++++++--------- encodings/runend/src/compress.rs | 62 ++++++++++++++-- 4 files changed, 131 insertions(+), 43 deletions(-) diff --git a/encodings/runend/benches/run_end_compress.rs b/encodings/runend/benches/run_end_compress.rs index 86311beed0e..533045c7314 100644 --- a/encodings/runend/benches/run_end_compress.rs +++ b/encodings/runend/benches/run_end_compress.rs @@ -5,12 +5,12 @@ use divan::Bencher; use itertools::repeat_n; -use vortex_array::DynArray; use vortex_array::IntoArray; use vortex_array::LEGACY_SESSION; use vortex_array::RecursiveCanonical; use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinViewArray; use vortex_array::compute::warm_up_vtables; use vortex_array::dtype::IntegerPType; use vortex_array::validity::Validity; @@ -76,8 +76,12 @@ fn decompress(bencher: Bencher, (length, run_step): (usize, usi let array = run_end_array.into_array(); bencher - .with_inputs(|| &array) - .bench_refs(|array| array.to_canonical()); + .with_inputs(|| (array.clone(), LEGACY_SESSION.create_execution_ctx())) + .bench_values(|(array, mut execution_ctx)| { + array + .execute::(&mut execution_ctx) + .unwrap() + }); } #[divan::bench(args = BENCH_ARGS)] @@ -113,3 +117,26 @@ fn take_indices(bencher: Bencher, (length, run_step): (usize, usize)) { .unwrap() }); } + +#[divan::bench(args = BENCH_ARGS)] +fn decompress_utf8(bencher: Bencher, (length, run_step): (usize, usize)) { + let num_runs = length.div_ceil(run_step); + let ends = (0..num_runs) + .map(|i| ((i + 1) * run_step).min(length) as u64) + .collect::>() + .into_array(); + + let values = VarBinViewArray::from_iter_str((0..num_runs).map(|i| format!("run_value_{i}"))) + .into_array(); + + let run_end_array = RunEndArray::new(ends, values); + let array = run_end_array.into_array(); + + bencher + .with_inputs(|| (array.clone(), LEGACY_SESSION.create_execution_ctx())) + .bench_values(|(array, mut execution_ctx)| { + array + .execute::(&mut execution_ctx) + .unwrap() + }); +} diff --git a/encodings/runend/src/arbitrary.rs b/encodings/runend/src/arbitrary.rs index cb561e7a20e..f4a3a83f150 100644 --- a/encodings/runend/src/arbitrary.rs +++ b/encodings/runend/src/arbitrary.rs @@ -22,8 +22,7 @@ pub struct ArbitraryRunEndArray(pub RunEndArray); impl<'a> Arbitrary<'a> for ArbitraryRunEndArray { fn arbitrary(u: &mut Unstructured<'a>) -> Result { - // RunEnd supports Bool or Primitive types for values - // Pick a random primitive type for values + // Pick a random primitive type for values. let ptype: PType = u.arbitrary()?; let nullability: Nullability = u.arbitrary()?; let dtype = DType::Primitive(ptype, nullability); diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index 267920c81e4..a904c4c47df 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -15,6 +15,7 @@ use vortex_array::Precision; use vortex_array::ProstMetadata; use vortex_array::SerializeMetadata; use vortex_array::arrays::PrimitiveVTable; +use vortex_array::arrays::VarBinViewArray; use vortex_array::buffer::BufferHandle; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; @@ -39,6 +40,7 @@ use vortex_session::VortexSession; use crate::compress::runend_decode_bools; use crate::compress::runend_decode_primitive; +use crate::compress::runend_decode_varbinview; use crate::compress::runend_encode; use crate::kernel::PARENT_KERNELS; use crate::rules::RULES; @@ -239,12 +241,6 @@ impl RunEndArray { "run ends must be unsigned integers, was {}", ends.dtype(), ); - vortex_ensure!( - values.dtype().is_primitive() || values.dtype().is_boolean(), - "RunEnd array can only have Bool or Primitive values, {} given", - values.dtype() - ); - vortex_ensure!( ends.len() == values.len(), "run ends len != run values len, {} != {}", @@ -342,32 +338,7 @@ impl RunEndArray { /// /// # Validation /// - /// The `ends` must be non-nullable unsigned integers. The values may be `Bool` or `Primitive` - /// types. - /// - /// # Examples - /// - /// ``` - /// # use vortex_array::arrays::{BoolArray, VarBinViewArray}; - /// # use vortex_array::IntoArray; - /// # use vortex_buffer::buffer; - /// # use vortex_runend::RunEndArray; - /// - /// // Error to provide incorrectly-typed values! - /// let result = RunEndArray::try_new( - /// buffer![1u8, 2u8].into_array(), - /// VarBinViewArray::from_iter_str(["bad", "dtype"]).into_array(), - /// ); - /// assert!(result.is_err()); - /// - /// // This array is happy - /// let result = RunEndArray::try_new( - /// buffer![1u8, 2u8].into_array(), - /// BoolArray::from_iter([false, true]).into_array(), - /// ); - /// - /// assert!(result.is_ok()); - /// ``` + /// The `ends` must be non-nullable unsigned integers. pub fn try_new(ends: ArrayRef, values: ArrayRef) -> VortexResult { let length: usize = if ends.is_empty() { 0 @@ -510,6 +481,7 @@ pub(super) fn run_end_canonicalize( ctx: &mut ExecutionCtx, ) -> VortexResult { let pends = array.ends().clone().execute_as("ends", ctx)?; + Ok(match array.dtype() { DType::Bool(_) => { let bools = array.values().clone().execute_as("values", ctx)?; @@ -519,13 +491,22 @@ pub(super) fn run_end_canonicalize( let pvalues = array.values().clone().execute_as("values", ctx)?; runend_decode_primitive(pends, pvalues, array.offset(), array.len())?.into_array() } - _ => vortex_panic!("Only Primitive and Bool values are supported"), + DType::Utf8(_) | DType::Binary(_) => { + let values = array + .values() + .clone() + .execute_as::("values", ctx)?; + runend_decode_varbinview(pends, values, array.offset(), array.len())?.into_array() + } + _ => vortex_bail!("Unsupported RunEnd value type: {}", array.dtype()), }) } #[cfg(test)] mod tests { use vortex_array::IntoArray; + use vortex_array::arrays::DictArray; + use vortex_array::arrays::VarBinViewArray; use vortex_array::assert_arrays_eq; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; @@ -552,4 +533,33 @@ mod tests { let expected = buffer![1, 1, 2, 2, 2, 3, 3, 3, 3, 3].into_array(); assert_arrays_eq!(arr.into_array(), expected); } + + #[test] + fn test_runend_utf8() { + let values = VarBinViewArray::from_iter_str(["a", "b", "c"]).into_array(); + let arr = RunEndArray::new(buffer![2u32, 5, 10].into_array(), values); + assert_eq!(arr.len(), 10); + assert_eq!(arr.dtype(), &DType::Utf8(Nullability::NonNullable)); + + let expected = + VarBinViewArray::from_iter_str(["a", "a", "b", "b", "b", "c", "c", "c", "c", "c"]) + .into_array(); + assert_arrays_eq!(arr.to_array(), expected); + } + + #[test] + fn test_runend_dict() { + let dict_values = VarBinViewArray::from_iter_str(["x", "y", "z"]).into_array(); + let dict_codes = buffer![0u32, 1, 2].into_array(); + let dict = DictArray::try_new(dict_codes, dict_values).unwrap(); + + let arr = + RunEndArray::try_new(buffer![2u32, 5, 10].into_array(), dict.into_array()).unwrap(); + assert_eq!(arr.len(), 10); + + let expected = + VarBinViewArray::from_iter_str(["x", "x", "y", "y", "y", "z", "z", "z", "z", "z"]) + .into_array(); + assert_arrays_eq!(arr.to_array(), expected); + } } diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs index 94f42281f48..ff3462d961c 100644 --- a/encodings/runend/src/compress.rs +++ b/encodings/runend/src/compress.rs @@ -8,6 +8,8 @@ use vortex_array::ToCanonical; use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::buffer::BufferHandle; use vortex_array::dtype::NativePType; use vortex_array::dtype::Nullability; use vortex_array::expr::stats::Precision; @@ -204,13 +206,16 @@ pub fn runend_decode_bools( })) } -pub fn runend_decode_typed_primitive( +/// Decode a run-end encoded slice of values into a flat `Buffer` and `Validity`. +/// +/// This is the core decode loop shared by primitive and varbinview run-end decoding. +fn runend_decode_slice( run_ends: impl Iterator, values: &[T], values_validity: Mask, values_nullability: Nullability, length: usize, -) -> PrimitiveArray { +) -> (Buffer, Validity) { match values_validity { Mask::AllTrue(_) => { let mut decoded: BufferMut = BufferMut::with_capacity(length); @@ -225,9 +230,9 @@ pub fn runend_decode_typed_primitive( // We preallocate enough capacity because we know the total length unsafe { decoded.push_n_unchecked(*value, end - decoded.len()) }; } - PrimitiveArray::new(decoded, values_nullability.into()) + (decoded.into(), values_nullability.into()) } - Mask::AllFalse(_) => PrimitiveArray::new(Buffer::::zeroed(length), Validity::AllInvalid), + Mask::AllFalse(_) => (Buffer::::zeroed(length), Validity::AllInvalid), Mask::Values(mask) => { let mut decoded = BufferMut::with_capacity(length); let mut decoded_validity = BitBufferMut::with_capacity(length); @@ -258,11 +263,28 @@ pub fn runend_decode_typed_primitive( } } } - PrimitiveArray::new(decoded, Validity::from(decoded_validity.freeze())) + (decoded.into(), Validity::from(decoded_validity.freeze())) } } } +pub fn runend_decode_typed_primitive( + run_ends: impl Iterator, + values: &[T], + values_validity: Mask, + values_nullability: Nullability, + length: usize, +) -> PrimitiveArray { + let (decoded, validity) = runend_decode_slice( + run_ends, + values, + values_validity, + values_nullability, + length, + ); + PrimitiveArray::new(decoded, validity) +} + pub fn runend_decode_typed_bool( run_ends: impl Iterator, values: &BitBuffer, @@ -304,6 +326,36 @@ pub fn runend_decode_typed_bool( } } +/// Decode a run-end encoded VarBinView array by expanding views directly. +pub fn runend_decode_varbinview( + ends: PrimitiveArray, + values: VarBinViewArray, + offset: usize, + length: usize, +) -> VortexResult { + let validity_mask = values.validity_mask()?; + let views = values.views(); + + let (decoded_views, validity) = match_each_unsigned_integer_ptype!(ends.ptype(), |E| { + runend_decode_slice( + trimmed_ends_iter(ends.as_slice::(), offset, length), + views, + validity_mask, + values.dtype().nullability(), + length, + ) + }); + + let parts = values.into_parts(); + let view_handle = BufferHandle::new_host(decoded_views.into_byte_buffer()); + + // SAFETY: we are expanding views from a valid VarBinViewArray with the same + // buffers, so all buffer indices and offsets remain valid. + Ok(unsafe { + VarBinViewArray::new_handle_unchecked(view_handle, parts.buffers, parts.dtype, validity) + }) +} + #[cfg(test)] mod test { use vortex_array::ToCanonical; From 8e15a05440041017557c43e248f0893124117a38 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Fri, 6 Mar 2026 17:45:36 +0000 Subject: [PATCH 2/3] rebase Signed-off-by: Robert Kruszewski --- encodings/runend/src/array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index a904c4c47df..d36b77ffbb4 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -544,7 +544,7 @@ mod tests { let expected = VarBinViewArray::from_iter_str(["a", "a", "b", "b", "b", "c", "c", "c", "c", "c"]) .into_array(); - assert_arrays_eq!(arr.to_array(), expected); + assert_arrays_eq!(arr.into_array(), expected); } #[test] @@ -560,6 +560,6 @@ mod tests { let expected = VarBinViewArray::from_iter_str(["x", "x", "y", "y", "y", "z", "z", "z", "z", "z"]) .into_array(); - assert_arrays_eq!(arr.to_array(), expected); + assert_arrays_eq!(arr.into_array(), expected); } } From 3fbb1bc2b4097cb8a29c5d6f85fe2ded5806bd7c Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Fri, 6 Mar 2026 17:47:17 +0000 Subject: [PATCH 3/3] api Signed-off-by: Robert Kruszewski --- encodings/runend/public-api.lock | 2 ++ 1 file changed, 2 insertions(+) diff --git a/encodings/runend/public-api.lock b/encodings/runend/public-api.lock index 49307599400..3d558820fe1 100644 --- a/encodings/runend/public-api.lock +++ b/encodings/runend/public-api.lock @@ -10,6 +10,8 @@ pub fn vortex_runend::compress::runend_decode_typed_bool(run_ends: impl core::it pub fn vortex_runend::compress::runend_decode_typed_primitive(run_ends: impl core::iter::traits::iterator::Iterator, values: &[T], values_validity: vortex_mask::Mask, values_nullability: vortex_array::dtype::nullability::Nullability, length: usize) -> vortex_array::arrays::primitive::array::PrimitiveArray +pub fn vortex_runend::compress::runend_decode_varbinview(ends: vortex_array::arrays::primitive::array::PrimitiveArray, values: vortex_array::arrays::varbinview::array::VarBinViewArray, offset: usize, length: usize) -> vortex_error::VortexResult + pub fn vortex_runend::compress::runend_encode(array: &vortex_array::arrays::primitive::array::PrimitiveArray) -> (vortex_array::arrays::primitive::array::PrimitiveArray, vortex_array::array::ArrayRef) pub struct vortex_runend::RunEndArray