diff --git a/Cargo.lock b/Cargo.lock index 6f2b7bb670e..4bcfd04863a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10009,6 +10009,7 @@ dependencies = [ "vortex-array", "vortex-buffer", "vortex-error", + "vortex-fastlanes", "vortex-mask", "vortex-session", ] diff --git a/encodings/fastlanes/src/_chunked_exec.rs b/encodings/fastlanes/src/_chunked_exec.rs new file mode 100644 index 00000000000..1e241ec4667 --- /dev/null +++ b/encodings/fastlanes/src/_chunked_exec.rs @@ -0,0 +1,623 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Bit-pack–aware chunked decoders that integrate with `vortex_array::_chunked_exec`. +//! +//! Avoids the full upfront bit-unpack the canonical executor performs. For a +//! `Dict, …>` array we unpack one 1024-element code chunk at a time and +//! immediately AVX2-gather it into the output buffer. The working set is the small +//! values dictionary plus the 4 KiB chunk-of-codes, never the materialised codes column. + +use std::mem::MaybeUninit; +use std::sync::Arc; + +use fastlanes::BitPacking; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::VTable; +use vortex_array::_chunked_exec::CHUNK_LEN; +use vortex_array::_chunked_exec::Scratch; +use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernel; +use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher; +use vortex_array::_chunked_exec::primitive::PatchedProducer; +use vortex_array::_chunked_exec::primitive::PrimitiveChunkProducer; +use vortex_array::_chunked_exec::take_into_uninit; +use vortex_array::arrays::Dict; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::dict::DictArraySlotsExt; +use vortex_array::arrays::primitive::PrimitiveArrayExt as _; +use vortex_array::dtype::DType; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::dtype::UnsignedPType; +use vortex_buffer::Buffer; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::BitPacked; +use crate::BitPackedArrayExt; + +/// Chunked dict decoder where the codes child is bit-packed. +/// +/// `dict` is the materialised values buffer (small, expected L1-resident). `packed` is +/// the bit-packed codes buffer. Each chunk: bit-unpack 1024 codes into an internal +/// `[I; CHUNK_LEN]` (one stack-resident buffer reused across chunks), then AVX2-gather +/// them into the output via [`take_into_uninit`]. +pub struct BitPackedDictProducer { + dict: Buffer, + packed: ByteBuffer, + bit_width: usize, + elems_per_chunk: usize, + /// Number of *full* 1024-element chunks in the packed buffer. + full_chunks: usize, + /// Length of the trailing partial chunk in elements (0 if perfectly aligned). + trailer_len: usize, + /// Total remaining logical elements to produce. + remaining: usize, + /// Current full-chunk index. + chunk_idx: usize, + /// Scratch for one chunk of unpacked codes (4–8 KiB depending on `I`). + code_scratch: Box<[MaybeUninit; CHUNK_LEN]>, +} + +impl BitPackedDictProducer +where + T: NativePType, + I: NativePType + UnsignedPType + BitPacking, +{ + fn new( + dict: Buffer, + packed: ByteBuffer, + bit_width: usize, + full_chunks: usize, + trailer_len: usize, + ) -> Self { + let elems_per_chunk = 128 * bit_width / size_of::(); + let total = full_chunks * CHUNK_LEN + trailer_len; + Self { + dict, + packed, + bit_width, + elems_per_chunk, + full_chunks, + trailer_len, + remaining: total, + chunk_idx: 0, + code_scratch: Box::new([const { MaybeUninit::::uninit() }; CHUNK_LEN]), + } + } + + /// Bit-unpack fastlanes chunk `chunk_index` into `self.code_scratch[offset..offset+1024]`. + /// + /// # Safety + /// + /// `offset + 1024 ≤ code_scratch.len()` (the caller must ensure room for one fastlanes + /// chunk starting at `offset`). + unsafe fn unpack_chunk_into(&mut self, chunk_index: usize, offset: usize) { + let packed_bytes = self.packed.as_ref(); + // SAFETY: same alignment used by fastlanes; bit_width-derived chunk layout. + let packed_slice: &[I] = unsafe { + std::slice::from_raw_parts( + packed_bytes.as_ptr().cast::(), + packed_bytes.len() / size_of::(), + ) + }; + let chunk = &packed_slice[chunk_index * self.elems_per_chunk + ..chunk_index * self.elems_per_chunk + self.elems_per_chunk]; + // SAFETY: caller ensures offset + 1024 ≤ scratch capacity. + let dst_ptr = unsafe { self.code_scratch.as_mut_ptr().add(offset).cast::() }; + let dst_slice: &mut [I] = unsafe { std::slice::from_raw_parts_mut(dst_ptr, 1024) }; + unsafe { + BitPacking::unchecked_unpack(self.bit_width, chunk, dst_slice); + } + } +} + +impl PrimitiveChunkProducer for BitPackedDictProducer +where + T: NativePType, + I: NativePType + UnsignedPType + BitPacking, +{ + fn next_chunk<'a>( + &mut self, + scratch: &'a mut Scratch, + ) -> VortexResult> { + if self.remaining == 0 { + return Ok(None); + } + let dst = &mut scratch.as_uninit_mut()[..CHUNK_LEN]; + let dst_ptr = dst.as_ptr().cast::(); + let n = self.write_next_into(&mut dst[..])?; + match n { + Some(n) => Ok(Some(unsafe { std::slice::from_raw_parts(dst_ptr, n) })), + None => Ok(None), + } + } + + fn next_chunk_into_uninit( + &mut self, + _scratch: &mut Scratch, + dst: &mut [MaybeUninit], + ) -> VortexResult> { + self.write_next_into(dst) + } + + fn remaining(&self) -> usize { + self.remaining + } +} + +impl BitPackedDictProducer +where + T: NativePType, + I: NativePType + UnsignedPType + BitPacking, +{ + fn write_next_into(&mut self, dst: &mut [MaybeUninit]) -> VortexResult> { + if self.remaining == 0 { + return Ok(None); + } + // Unpack up to `CHUNK_LEN / FL_CHUNK = 4` fastlanes chunks into the local + // code_scratch, then issue a single AVX-gather over the full super-chunk. This + // amortises the per-chunk dispatch + gather-call overhead. + const FL_CHUNK: usize = 1024; + let super_chunk_max = CHUNK_LEN / FL_CHUNK; + debug_assert!(super_chunk_max >= 1); + let mut produced = 0usize; + let dst_cap = dst.len(); + + // Full fastlanes chunks first. + while self.chunk_idx < self.full_chunks + && produced + FL_CHUNK <= dst_cap + && produced + FL_CHUNK <= CHUNK_LEN + { + // SAFETY: code_scratch has CHUNK_LEN cells; produced + FL_CHUNK ≤ CHUNK_LEN. + unsafe { + self.unpack_chunk_into(self.chunk_idx, produced); + } + self.chunk_idx += 1; + produced += FL_CHUNK; + } + // Trailing partial chunk if there's room. + if produced < dst_cap && produced < CHUNK_LEN && self.trailer_len > 0 { + let trailer_take = self.trailer_len.min(dst_cap - produced).min(CHUNK_LEN - produced); + // Unpack the trailing fastlanes chunk in full into scratch (its prefix is what we want). + // SAFETY: code_scratch has space for CHUNK_LEN; produced + FL_CHUNK ≤ CHUNK_LEN. + unsafe { + self.unpack_chunk_into(self.chunk_idx, produced); + } + self.trailer_len -= trailer_take; + if self.trailer_len == 0 { + self.chunk_idx += 1; + } + produced += trailer_take; + } + + if produced == 0 { + return Ok(None); + } + + // Single AVX gather over the full super-chunk. + // SAFETY: code_scratch[..produced] is initialised by the unpack calls above. + let codes = unsafe { + std::slice::from_raw_parts(self.code_scratch.as_ptr().cast::(), produced) + }; + take_into_uninit::(self.dict.as_slice(), codes, &mut dst[..produced]); + self.remaining -= produced; + Ok(Some(produced)) + } +} + +/// Kernel that matches `Dict<…>` whose codes child is bit-packed. +/// +/// Falls back to the in-crate canonical `DictKernel` for non-bit-packed codes or sliced +/// arrays (those still go through the AVX2 gather in `take_into_uninit`, just with a +/// canonicalised codes buffer). +pub struct BitPackedDictKernel { + _marker: std::marker::PhantomData T>, +} + +impl BitPackedDictKernel { + /// Construct a new kernel marker. + pub fn new() -> Self { + Self { + _marker: std::marker::PhantomData, + } + } +} + +impl Default for BitPackedDictKernel { + fn default() -> Self { + Self::new() + } +} + +impl PrimitiveChunkKernel for BitPackedDictKernel { + fn build( + &self, + array: &ArrayRef, + ctx: &mut ExecutionCtx, + ) -> VortexResult>>> { + let Some(dict) = array.as_opt::() else { + return Ok(None); + }; + if !matches!(array.dtype().nullability(), Nullability::NonNullable) { + return Ok(None); + } + let codes = dict.codes(); + let values = dict.values(); + let Some(bp) = codes.as_opt::() else { + return Ok(None); + }; + // v1 fast path only handles non-sliced bit-packed inputs without patches. + if bp.offset() != 0 || bp.patches().is_some() { + return Ok(None); + } + if !matches!(codes.dtype().nullability(), Nullability::NonNullable) + || !matches!(values.dtype().nullability(), Nullability::NonNullable) + { + return Ok(None); + } + let DType::Primitive(values_ptype, _) = *values.dtype() else { + return Ok(None); + }; + if values_ptype != T::PTYPE { + return Ok(None); + } + let DType::Primitive(codes_ptype, _) = *codes.dtype() else { + return Ok(None); + }; + if !codes_ptype.is_unsigned_int() { + return Ok(None); + } + + let values_canonical = values.clone().execute::(ctx)?; + let dict_buf = values_canonical.into_buffer::(); + let len = codes.len(); + let packed = bp.packed().clone().unwrap_host(); + let bit_width = bp.bit_width() as usize; + let full_chunks = len / CHUNK_LEN; + let trailer_len = len % CHUNK_LEN; + + Ok(Some(match codes_ptype { + PType::U8 => Box::new(BitPackedDictProducer::::new( + dict_buf, + packed, + bit_width, + full_chunks, + trailer_len, + )), + PType::U16 => Box::new(BitPackedDictProducer::::new( + dict_buf, + packed, + bit_width, + full_chunks, + trailer_len, + )), + PType::U32 => Box::new(BitPackedDictProducer::::new( + dict_buf, + packed, + bit_width, + full_chunks, + trailer_len, + )), + PType::U64 => Box::new(BitPackedDictProducer::::new( + dict_buf, + packed, + bit_width, + full_chunks, + trailer_len, + )), + _ => return Ok(None), + })) + } +} + +// ---------------------------------------------------------------------------------- +// Plain BitPacked

-> Primitive chunked producer (no gather, no patches). +// ---------------------------------------------------------------------------------- + +/// Bit-unpacks a `BitPacked

` array (without internal patches) one 1024-element chunk at +/// a time into the output. Used as the *base* of a chunked +/// [`vortex_array::_chunked_exec::primitive::PatchedProducer`] so patch overlay happens +/// chunk-locally instead of scattering into a fully-materialised buffer. +/// +/// For v1 this requires `offset == 0` (non-sliced). +pub struct BitPackedPrimitiveProducer { + packed: ByteBuffer, + bit_width: usize, + elems_per_chunk: usize, + full_chunks: usize, + trailer_len: usize, + remaining: usize, + chunk_idx: usize, + scratch: Box<[MaybeUninit; CHUNK_LEN]>, +} + +impl BitPackedPrimitiveProducer { + fn new(packed: ByteBuffer, bit_width: usize, len: usize) -> Self { + let elems_per_chunk = 128 * bit_width / size_of::(); + Self { + packed, + bit_width, + elems_per_chunk, + full_chunks: len / CHUNK_LEN, + trailer_len: len % CHUNK_LEN, + remaining: len, + chunk_idx: 0, + scratch: Box::new([const { MaybeUninit::::uninit() }; CHUNK_LEN]), + } + } + + unsafe fn unpack_into(&mut self, chunk_index: usize, dst: *mut T) { + let packed_bytes = self.packed.as_ref(); + // SAFETY: fastlanes layout; bit_width-derived chunk size. + let packed_slice: &[T] = unsafe { + std::slice::from_raw_parts( + packed_bytes.as_ptr().cast::(), + packed_bytes.len() / size_of::(), + ) + }; + let chunk = &packed_slice[chunk_index * self.elems_per_chunk + ..chunk_index * self.elems_per_chunk + self.elems_per_chunk]; + let dst_slice: &mut [T] = unsafe { std::slice::from_raw_parts_mut(dst, 1024) }; + unsafe { BitPacking::unchecked_unpack(self.bit_width, chunk, dst_slice) }; + } + + fn write_next(&mut self, dst: *mut T, dst_cap: usize) -> Option { + if self.remaining == 0 { + return None; + } + if self.chunk_idx < self.full_chunks { + let n = CHUNK_LEN.min(dst_cap); + debug_assert!(n >= 1024); + // SAFETY: dst has dst_cap ≥ 1024 cells. + unsafe { self.unpack_into(self.chunk_idx, dst) }; + self.chunk_idx += 1; + self.remaining -= 1024; + Some(1024) + } else if self.trailer_len > 0 { + // Unpack the trailing fastlanes chunk into the local scratch, copy the prefix. + let scratch_ptr = self.scratch.as_mut_ptr().cast::(); + // SAFETY: scratch has CHUNK_LEN cells. + unsafe { self.unpack_into(self.chunk_idx, scratch_ptr) }; + let n = self.trailer_len.min(dst_cap); + // SAFETY: scratch[..n] just initialised; dst has n cells. + unsafe { std::ptr::copy_nonoverlapping(scratch_ptr, dst, n) }; + self.trailer_len -= n; + self.remaining -= n; + if self.trailer_len == 0 { + self.chunk_idx += 1; + } + Some(n) + } else { + None + } + } +} + +impl PrimitiveChunkProducer for BitPackedPrimitiveProducer { + fn next_chunk<'a>( + &mut self, + scratch: &'a mut Scratch, + ) -> VortexResult> { + let ptr = scratch.as_uninit_mut().as_mut_ptr().cast::(); + match self.write_next(ptr, CHUNK_LEN) { + Some(n) => Ok(Some(unsafe { std::slice::from_raw_parts(ptr, n) })), + None => Ok(None), + } + } + + fn next_chunk_into_uninit( + &mut self, + _scratch: &mut Scratch, + dst: &mut [MaybeUninit], + ) -> VortexResult> { + Ok(self.write_next(dst.as_mut_ptr().cast::(), dst.len())) + } + + fn remaining(&self) -> usize { + self.remaining + } +} + +/// Construct a [`BitPackedPrimitiveProducer`] for an unsigned, non-sliced, patch-free +/// `BitPacked` array. +pub fn build_bitpacked_primitive_producer( + bp: &vortex_array::Array, +) -> Option> { + if bp.offset() != 0 || bp.patches().is_some() { + return None; + } + let packed = bp.packed().clone().unwrap_host(); + Some(BitPackedPrimitiveProducer::::new( + packed, + bp.bit_width() as usize, + bp.as_ref().len(), + )) +} + +/// Split a non-sliced `BitPacked`-with-internal-patches into a chunked +/// [`PatchedProducer`] that overlays the (formerly internal) patches chunk-locally over +/// the bit-unpacked base. +/// +/// This re-expresses the "patches inside BitPacked" layout as the "PatchedArray on top of +/// patchless BitPacked" layout, but decoded through the chunked engine so the patch +/// overlay happens while each base chunk is still hot in L1 — instead of scattering into +/// a fully-materialised N-element buffer the way the canonical executor does. +pub fn build_chunked_patched_over_bitpacked( + bp: &vortex_array::Array, + ctx: &mut ExecutionCtx, +) -> VortexResult>> +where + T: NativePType + BitPacking + UnsignedPType, +{ + if bp.offset() != 0 { + return Ok(None); + } + let packed = bp.packed().clone().unwrap_host(); + let base = BitPackedPrimitiveProducer::::new( + packed, + bp.bit_width() as usize, + bp.as_ref().len(), + ); + + let (indices, values) = match bp.patches() { + Some(patches) => { + let off = patches.offset(); + let indices_prim = + patches.indices().clone().execute::(ctx)?; + let values_prim = patches.values().clone().execute::(ctx)?; + let idx_u32: Buffer = + vortex_array::match_each_integer_ptype!(indices_prim.ptype(), |P| { + indices_prim + .as_slice::

() + .iter() + .map(|&v| { + let p: usize = num_traits::AsPrimitive::::as_(v); + (p - off) as u32 + }) + .collect() + }); + (idx_u32, values_prim.into_buffer::()) + } + None => (Buffer::::empty(), Buffer::::empty()), + }; + + Ok(Some(PatchedProducer::::new( + Box::new(base), + indices, + values, + ))) +} + +/// Register the bit-packed chunked kernels onto `dispatcher` for every supported `T`. +pub fn register_chunk_kernels(dispatcher: &mut PrimitiveChunkKernelDispatcher) { + macro_rules! register_all_for { + ($($T:ty),*) => { + $( + // BitPackedDictKernel is registered LAST for `Dict.id()` so it tries first + // (dispatcher iterates in registration order; bit-packed match short-circuits). + dispatcher.register::<$T>(Dict.id(), Arc::new(BitPackedDictKernel::<$T>::new())); + )* + }; + } + register_all_for!(u8, u16, u32, u64, i8, i16, i32, i64, f32, f64); +} + +#[cfg(test)] +mod tests { + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher; + use vortex_array::_chunked_exec::primitive::decode_to_buffer; + use vortex_array::_chunked_exec::primitive::default_dispatcher; + use vortex_array::arrays::DictArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::session::ArraySession; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use super::register_chunk_kernels; + use crate::BitPackedData; + + fn session_dispatcher() -> (VortexSession, PrimitiveChunkKernelDispatcher) { + let s = VortexSession::empty().with::(); + crate::initialize(&s); + let mut d = default_dispatcher(); + register_chunk_kernels(&mut d); + (s, d) + } + + #[test] + fn dict_bitpacked_codes_chunked() -> VortexResult<()> { + let (session, dispatcher) = session_dispatcher(); + let mut ctx = session.create_execution_ctx(); + + let dict_values = Buffer::::from_iter((0..256).map(|i| i as i32 * 7 + 11)); + let dict = PrimitiveArray::new(dict_values.clone(), Validity::NonNullable); + + // 4096 codes — 4 full 1024-chunks. Use u16 at 8 bits (bit_width must be < type width). + let codes_vec: Vec = (0..4096u32).map(|i| (i % 256) as u16).collect(); + let codes_prim = PrimitiveArray::new( + Buffer::::from_iter(codes_vec.iter().copied()), + Validity::NonNullable, + ); + let bp_codes = BitPackedData::encode(&codes_prim.into_array(), 8, &mut ctx)?; + let dict_arr = DictArray::try_new(bp_codes.into_array(), dict.into_array())?; + + let buf = decode_to_buffer::(dict_arr.into_array(), &dispatcher, &mut ctx)?; + let expected: Vec = codes_vec + .iter() + .map(|c| dict_values.as_slice()[*c as usize]) + .collect(); + assert_eq!(buf.as_slice(), expected.as_slice()); + Ok(()) + } + + #[test] + fn dict_bitpacked_codes_trailing_partial_chunk() -> VortexResult<()> { + let (session, dispatcher) = session_dispatcher(); + let mut ctx = session.create_execution_ctx(); + let dict_values = Buffer::::from_iter([10, 20, 30, 40, 50]); + let dict = PrimitiveArray::new(dict_values.clone(), Validity::NonNullable); + // 1500 codes => 1 full chunk + 476 trailer. u16 at 3 bits fits 0..5. + let codes_vec: Vec = (0..1500u32).map(|i| (i % 5) as u16).collect(); + let codes_prim = PrimitiveArray::new( + Buffer::::from_iter(codes_vec.iter().copied()), + Validity::NonNullable, + ); + let bp_codes = BitPackedData::encode(&codes_prim.into_array(), 3, &mut ctx)?; + let dict_arr = DictArray::try_new(bp_codes.into_array(), dict.into_array())?; + let buf = decode_to_buffer::(dict_arr.into_array(), &dispatcher, &mut ctx)?; + let expected: Vec = codes_vec + .iter() + .map(|c| dict_values.as_slice()[*c as usize]) + .collect(); + assert_eq!(buf.as_slice(), expected.as_slice()); + Ok(()) + } + + #[test] + fn chunked_patched_over_bitpacked_matches_canonical() -> VortexResult<()> { + use vortex_array::_chunked_exec::Scratch; + use vortex_array::_chunked_exec::primitive::PrimitiveChunkProducer; + + use crate::BitPackedArrayExt; + + let (session, _dispatcher) = session_dispatcher(); + let mut ctx = session.create_execution_ctx(); + + // Values where most fit in 4 bits but every 13th is an exception → patches. + let n = 5000usize; + let values: Vec = (0..n) + .map(|i| if i % 13 == 0 { 1000 + i as u32 } else { (i % 16) as u32 }) + .collect(); + let prim = PrimitiveArray::new( + Buffer::::from_iter(values.iter().copied()), + Validity::NonNullable, + ); + let bp = BitPackedData::encode(&prim.into_array(), 4, &mut ctx)?; + assert!(bp.patches().is_some(), "expected exceptions to become patches"); + + // Canonical decode (Layout A: patches inside BitPacked). + let canonical = bp + .clone() + .into_array() + .execute::(&mut ctx)?; + assert_eq!(canonical.as_slice::(), values.as_slice()); + + // Chunked patched-over-bitpacked (Layout B). + let mut producer = super::build_chunked_patched_over_bitpacked::(&bp, &mut ctx)? + .expect("non-sliced"); + let mut scratch = Scratch::::new(); + let mut out = Vec::with_capacity(n); + while let Some(chunk) = producer.next_chunk(&mut scratch)? { + out.extend_from_slice(chunk); + } + assert_eq!(out, values); + Ok(()) + } +} diff --git a/encodings/fastlanes/src/lib.rs b/encodings/fastlanes/src/lib.rs index 9022b7c4e2b..62f77d78cf3 100644 --- a/encodings/fastlanes/src/lib.rs +++ b/encodings/fastlanes/src/lib.rs @@ -15,6 +15,8 @@ use vortex_buffer::Buffer; use vortex_buffer::BufferMut; use vortex_error::VortexResult; +#[doc(hidden)] +pub mod _chunked_exec; pub mod bit_transpose; mod bitpacking; mod delta; diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml index 01a5b8d7a3e..5ad10f1535b 100644 --- a/encodings/runend/Cargo.toml +++ b/encodings/runend/Cargo.toml @@ -36,6 +36,7 @@ itertools = { workspace = true } rand = { workspace = true } rstest = { workspace = true } vortex-array = { workspace = true, features = ["_test-harness"] } +vortex-fastlanes = { workspace = true } [features] arbitrary = ["dep:arbitrary", "vortex-array/arbitrary"] @@ -52,3 +53,15 @@ harness = false [[bench]] name = "run_end_decode" harness = false + +[[bench]] +name = "chunked_exec" +harness = false + +[[bench]] +name = "chunked_sinks" +harness = false + +[[bench]] +name = "chunked_patched" +harness = false diff --git a/encodings/runend/benches/chunked_exec.rs b/encodings/runend/benches/chunked_exec.rs new file mode 100644 index 00000000000..3fff01caec2 --- /dev/null +++ b/encodings/runend/benches/chunked_exec.rs @@ -0,0 +1,563 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Chunked execution engine benchmark. +//! +//! Compares the new chunked decode path (small L1-resident scratch + fused kernels) +//! against the existing canonical-by-canonical executor on several shapes: +//! +//! - `Dict` — the simplest gather-from-small-dict workload. +//! - `RunEnd` — single-encoding streaming. +//! - `Dict>` — the fused stack: the dictionary's values are themselves +//! RunEnd-encoded, so the chunked Dict kernel materializes the small RunEnd inner once +//! and then streams the gather. The legacy path does roughly the same work but pays +//! more allocation overhead in the executor. +//! - `ListView` (canonical + bit-packed offsets) — row-window streaming. +//! +//! Run with `cargo bench -p vortex-runend --bench chunked_exec`. + +use std::fmt; +use std::sync::LazyLock; + +use divan::Bencher; +use divan::black_box; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::_chunked_exec::listview::build_listview_producer_typed; +use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher; +use vortex_array::_chunked_exec::primitive::decode_to_buffer; +use vortex_array::_chunked_exec::primitive::default_dispatcher; +use vortex_array::arrays::DictArray; +use vortex_array::arrays::ListViewArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::session::ArraySession; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_fastlanes::BitPackedData; +use vortex_runend::RunEnd; +use vortex_runend::_chunked_exec::register_chunk_kernels as register_runend_chunk_kernels; +use vortex_session::VortexSession; + +fn main() { + divan::main(); +} + +static SESSION: LazyLock = LazyLock::new(|| { + let s = VortexSession::empty().with::(); + vortex_runend::initialize(&s); + vortex_fastlanes::initialize(&s); + s +}); + +static DISPATCHER: LazyLock = LazyLock::new(|| { + let mut d = default_dispatcher(); + register_runend_chunk_kernels(&mut d); + vortex_fastlanes::_chunked_exec::register_chunk_kernels(&mut d); + d +}); + +// ------------------------------------------------------------------------------------ +// Dict +// ------------------------------------------------------------------------------------ + +#[derive(Copy, Clone)] +struct DictArgs { + len: usize, + dict_size: usize, +} + +impl fmt::Display for DictArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "len={} dict={}", self.len, self.dict_size) + } +} + +const DICT_ARGS: &[DictArgs] = &[ + DictArgs { + len: 16_384, + dict_size: 64, + }, + DictArgs { + len: 65_536, + dict_size: 256, + }, + DictArgs { + len: 262_144, + dict_size: 1024, + }, + DictArgs { + len: 1_048_576, + dict_size: 256, + }, + DictArgs { + len: 1_048_576, + dict_size: 4096, + }, + // Cache-stress: codes buffer is 4*N bytes (canonical u32 codes). + // Both paths have the same data flow here (no intermediate to save), + // so chunked is predicted to stay tied across cache boundaries. + DictArgs { + len: 4_194_304, // codes 16 MB, output 16 MB + dict_size: 256, + }, + DictArgs { + len: 16_777_216, // codes 64 MB, output 64 MB + dict_size: 256, + }, +]; + +fn make_dict_i32(args: DictArgs) -> vortex_array::ArrayRef { + let dict_size = args.dict_size; + let len = args.len; + let values: Vec = (0..dict_size as i32).map(|i| i * 17 + 11).collect(); + let codes: Vec = (0..len).map(|i| (i % dict_size) as u32).collect(); + let values = PrimitiveArray::new(Buffer::::from_iter(values), Validity::NonNullable); + let codes = PrimitiveArray::new(Buffer::::from_iter(codes), Validity::NonNullable); + DictArray::try_new(codes.into_array(), values.into_array()) + .expect("dict") + .into_array() +} + +#[divan::bench(args = DICT_ARGS)] +fn dict_primitive_chunked(bencher: Bencher, args: DictArgs) { + let array = make_dict_i32(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + black_box(decode_to_buffer::(a.clone(), &DISPATCHER, &mut ctx).unwrap()) + }); +} + +#[divan::bench(args = DICT_ARGS)] +fn dict_primitive_canonical(bencher: Bencher, args: DictArgs) { + let array = make_dict_i32(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + black_box(a.clone().execute::(&mut ctx).unwrap()) + }); +} + +// ------------------------------------------------------------------------------------ +// Dict codes> — the v2 bit-pack fusion case +// ------------------------------------------------------------------------------------ + +#[derive(Copy, Clone)] +struct DictBpArgs { + len: usize, + dict_size: usize, + bit_width: u8, +} + +impl fmt::Display for DictBpArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "len={} dict={} bw={}", self.len, self.dict_size, self.bit_width) + } +} + +const DICT_BP_ARGS: &[DictBpArgs] = &[ + DictBpArgs { + len: 65_536, + dict_size: 256, + bit_width: 8, + }, + DictBpArgs { + len: 262_144, + dict_size: 256, + bit_width: 8, + }, + DictBpArgs { + len: 1_048_576, + dict_size: 256, + bit_width: 8, + }, + DictBpArgs { + len: 1_048_576, + dict_size: 1024, + bit_width: 10, + }, + DictBpArgs { + len: 1_048_576, + dict_size: 4096, + bit_width: 12, + }, + // Cache-stress shapes: intermediate codes Buffer = 2*N bytes. + // L2 on the test host is 2 MiB. Below crosses that boundary progressively. + DictBpArgs { + len: 4_194_304, // 8 MiB intermediate (4× L2) + dict_size: 256, + bit_width: 8, + }, + DictBpArgs { + len: 16_777_216, // 32 MiB intermediate (16× L2, still in L3) + dict_size: 256, + bit_width: 8, + }, + DictBpArgs { + len: 67_108_864, // 128 MiB intermediate (64× L2, half of L3) + dict_size: 256, + bit_width: 8, + }, +]; + +fn make_dict_bp_i32(args: DictBpArgs) -> vortex_array::ArrayRef { + let dict_values: Vec = (0..args.dict_size as i32).map(|i| i * 17 + 11).collect(); + let codes: Vec = (0..args.len) + .map(|i| (i % args.dict_size) as u16) + .collect(); + let dict = PrimitiveArray::new( + Buffer::::from_iter(dict_values), + Validity::NonNullable, + ); + let codes_prim = PrimitiveArray::new(Buffer::::from_iter(codes), Validity::NonNullable); + let mut ctx = SESSION.create_execution_ctx(); + let bp = BitPackedData::encode(&codes_prim.into_array(), args.bit_width, &mut ctx) + .expect("bitpack"); + DictArray::try_new(bp.into_array(), dict.into_array()) + .expect("dict") + .into_array() +} + +#[divan::bench(args = DICT_BP_ARGS)] +fn dict_bp_canonical(bencher: Bencher, args: DictBpArgs) { + let array = make_dict_bp_i32(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + black_box(a.clone().execute::(&mut ctx).unwrap()) + }); +} + +#[divan::bench(args = DICT_BP_ARGS)] +fn dict_bp_chunked(bencher: Bencher, args: DictBpArgs) { + let array = make_dict_bp_i32(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + black_box(decode_to_buffer::(a.clone(), &DISPATCHER, &mut ctx).unwrap()) + }); +} + +// ------------------------------------------------------------------------------------ +// RunEnd +// ------------------------------------------------------------------------------------ + +#[derive(Copy, Clone)] +struct RunEndArgs { + len: usize, + avg_run_len: usize, +} + +impl fmt::Display for RunEndArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "len={} run={}", self.len, self.avg_run_len) + } +} + +const RUNEND_ARGS: &[RunEndArgs] = &[ + RunEndArgs { + len: 65_536, + avg_run_len: 4, + }, + RunEndArgs { + len: 65_536, + avg_run_len: 64, + }, + RunEndArgs { + len: 1_048_576, + avg_run_len: 16, + }, + RunEndArgs { + len: 1_048_576, + avg_run_len: 256, + }, +]; + +fn make_runend_i32(args: RunEndArgs) -> vortex_array::ArrayRef { + let mut values = Vec::with_capacity(args.len); + let mut run_idx = 0i32; + let mut pos = 0; + while pos < args.len { + let run = args.avg_run_len.min(args.len - pos); + values.extend(std::iter::repeat(run_idx % 1024).take(run)); + run_idx += 1; + pos += run; + } + let prim = PrimitiveArray::new(Buffer::::from_iter(values), Validity::NonNullable); + let mut ctx = SESSION.create_execution_ctx(); + RunEnd::encode(prim.into_array(), &mut ctx).unwrap().into_array() +} + +#[divan::bench(args = RUNEND_ARGS)] +fn runend_chunked(bencher: Bencher, args: RunEndArgs) { + let array = make_runend_i32(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + black_box(decode_to_buffer::(a.clone(), &DISPATCHER, &mut ctx).unwrap()) + }); +} + +#[divan::bench(args = RUNEND_ARGS)] +fn runend_canonical(bencher: Bencher, args: RunEndArgs) { + let array = make_runend_i32(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + black_box(a.clone().execute::(&mut ctx).unwrap()) + }); +} + +// ------------------------------------------------------------------------------------ +// Dict> (fused) +// ------------------------------------------------------------------------------------ + +#[derive(Copy, Clone)] +struct DictRunEndArgs { + len: usize, + dict_size: usize, + /// Average run length *inside the dictionary's values*. + inner_run_len: usize, +} + +impl fmt::Display for DictRunEndArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "len={} dict={} inner_run={}", + self.len, self.dict_size, self.inner_run_len + ) + } +} + +const DICT_RUNEND_ARGS: &[DictRunEndArgs] = &[ + DictRunEndArgs { + len: 1_048_576, + dict_size: 256, + inner_run_len: 4, + }, + DictRunEndArgs { + len: 1_048_576, + dict_size: 4096, + inner_run_len: 16, + }, + DictRunEndArgs { + len: 4_194_304, + dict_size: 1024, + inner_run_len: 8, + }, +]; + +fn make_dict_runend_i32(args: DictRunEndArgs) -> vortex_array::ArrayRef { + // Build the inner dictionary values (RunEnd-encoded). + let mut inner_values = Vec::with_capacity(args.dict_size); + let mut run_idx = 0i32; + let mut pos = 0; + while pos < args.dict_size { + let run = args.inner_run_len.min(args.dict_size - pos); + inner_values.extend(std::iter::repeat(run_idx).take(run)); + run_idx += 1; + pos += run; + } + let inner_prim = PrimitiveArray::new(Buffer::::from_iter(inner_values), Validity::NonNullable); + let mut ctx = SESSION.create_execution_ctx(); + let inner_re = RunEnd::encode(inner_prim.into_array(), &mut ctx).unwrap(); + let codes: Vec = (0..args.len as u32).map(|i| i % args.dict_size as u32).collect(); + let codes = PrimitiveArray::new(Buffer::::from_iter(codes), Validity::NonNullable); + DictArray::try_new(codes.into_array(), inner_re.into_array()) + .unwrap() + .into_array() +} + +#[divan::bench(args = DICT_RUNEND_ARGS)] +fn dict_runend_fused_chunked(bencher: Bencher, args: DictRunEndArgs) { + let array = make_dict_runend_i32(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + black_box(decode_to_buffer::(a.clone(), &DISPATCHER, &mut ctx).unwrap()) + }); +} + +#[divan::bench(args = DICT_RUNEND_ARGS)] +fn dict_runend_canonical(bencher: Bencher, args: DictRunEndArgs) { + let array = make_dict_runend_i32(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + black_box(a.clone().execute::(&mut ctx).unwrap()) + }); +} + +// Diagnostic: how slow is *just* canonicalizing the inner RunEnd dict (small)? +#[divan::bench(args = DICT_RUNEND_ARGS)] +fn dict_runend_phase_inner_canonical(bencher: Bencher, args: DictRunEndArgs) { + let array = make_dict_runend_i32(args); + use vortex_array::arrays::dict::DictArraySlotsExt; + use vortex_array::arrays::Dict; + let inner = array.as_::().values().clone(); + bencher.with_inputs(|| inner.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + black_box(a.clone().execute::(&mut ctx).unwrap()) + }); +} + +// Diagnostic: take_primitive over the already-materialized dict + 1M codes. +#[divan::bench(args = DICT_RUNEND_ARGS)] +fn dict_runend_phase_take(bencher: Bencher, args: DictRunEndArgs) { + use vortex_array::arrays::dict::DictArraySlotsExt; + use vortex_array::arrays::Dict; + use vortex_array::builtins::ArrayBuiltins; + let array = make_dict_runend_i32(args); + let dict_view = array.as_::(); + let codes = dict_view.codes().clone(); + let values = dict_view.values().clone(); + let mut ctx = SESSION.create_execution_ctx(); + let inner = values.execute::(&mut ctx).unwrap().into_array(); + bencher + .with_inputs(|| (inner.clone(), codes.clone())) + .bench_local_refs(|(inner, codes)| { + black_box(inner.take(codes.clone()).unwrap()) + }); +} + +// ------------------------------------------------------------------------------------ +// ListView with bit-packed offsets + sizes +// ------------------------------------------------------------------------------------ + +#[derive(Copy, Clone)] +struct ListViewArgs { + rows: usize, + avg_list_len: usize, + bit_width_offsets: u8, + bit_width_sizes: u8, +} + +impl fmt::Display for ListViewArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "rows={} avg_list={} bw_off={} bw_sz={}", + self.rows, self.avg_list_len, self.bit_width_offsets, self.bit_width_sizes + ) + } +} + +const LIST_ARGS: &[ListViewArgs] = &[ + ListViewArgs { + rows: 16_384, + avg_list_len: 8, + bit_width_offsets: 18, // covers up to 16_384 * 8 == 131_072 + bit_width_sizes: 5, // up to 31 elements per list + }, + ListViewArgs { + rows: 65_536, + avg_list_len: 4, + bit_width_offsets: 18, + bit_width_sizes: 4, + }, + ListViewArgs { + rows: 262_144, + avg_list_len: 4, + bit_width_offsets: 20, + bit_width_sizes: 4, + }, +]; + +/// Build a `ListView` where: +/// - `elements` is a plain i32 buffer with `rows * avg_list_len` values. +/// - `offsets` and `sizes` are bit-packed via fastlanes (so canonicalize must unpack). +fn make_listview_bp(args: ListViewArgs) -> vortex_array::ArrayRef { + let n_elements = args.rows * args.avg_list_len; + let elements: Vec = (0..n_elements as i32).collect(); + let offsets: Vec = (0..args.rows as u32).map(|i| i * args.avg_list_len as u32).collect(); + let sizes: Vec = vec![args.avg_list_len as u32; args.rows]; + + let elements_arr = PrimitiveArray::new(Buffer::::from_iter(elements), Validity::NonNullable); + let offsets_arr = PrimitiveArray::new(Buffer::::from_iter(offsets), Validity::NonNullable); + let sizes_arr = PrimitiveArray::new(Buffer::::from_iter(sizes), Validity::NonNullable); + + let mut ctx = SESSION.create_execution_ctx(); + let bp_offsets = BitPackedData::encode( + &offsets_arr.into_array(), + args.bit_width_offsets, + &mut ctx, + ) + .expect("offsets bitpack") + .into_array(); + let bp_sizes = + BitPackedData::encode(&sizes_arr.into_array(), args.bit_width_sizes, &mut ctx) + .expect("sizes bitpack") + .into_array(); + + ListViewArray::new( + elements_arr.into_array(), + bp_offsets, + bp_sizes, + Validity::NonNullable, + ) + .into_array() +} + +/// Walk the chunked rows summing elements end-to-end. Uses the typed-callback API so +/// there is no dyn dispatch in the inner loop. +#[divan::bench(args = LIST_ARGS)] +fn listview_chunked_sum(bencher: Bencher, args: ListViewArgs) { + let array = make_listview_bp(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + let mut producer = + build_listview_producer_typed::(a.clone(), &mut ctx).unwrap(); + let mut sum: i64 = 0; + producer.for_each_chunk_typed(|offs, szs, elems| { + for i in 0..offs.len() { + let o = offs[i] as usize; + let s = szs[i] as usize; + for &v in &elems[o..o + s] { + sum = sum.wrapping_add(v as i64); + } + } + }); + black_box(sum) + }); +} + +/// Canonicalize the whole `ListView` then sum elements; the apples-to-apples baseline. +#[divan::bench(args = LIST_ARGS)] +fn listview_canonical_sum(bencher: Bencher, args: ListViewArgs) { + use vortex_array::Canonical; + use vortex_array::arrays::ListView; + use vortex_array::arrays::listview::ListViewArrayExt; + use vortex_array::arrays::primitive::PrimitiveArrayExt; + use vortex_array::dtype::NativePType; + + let array = make_listview_bp(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + let canonical = a.clone().execute::(&mut ctx).unwrap(); + // Canonical is a ListView; sum the relevant element slice via offsets/sizes. + let lv = canonical.into_array(); + let view = lv.as_::(); + let offsets = view + .offsets() + .clone() + .execute::(&mut ctx) + .unwrap(); + let sizes = view + .sizes() + .clone() + .execute::(&mut ctx) + .unwrap(); + let elements = view + .elements() + .clone() + .execute::(&mut ctx) + .unwrap(); + let elements = elements.as_slice::(); + let off = offsets.as_slice::(); + let sz = sizes.as_slice::(); + let mut sum: i64 = 0; + for i in 0..off.len() { + let o = off[i] as usize; + let s = sz[i] as usize; + for &v in &elements[o..o + s] { + sum = sum.wrapping_add(v as i64); + } + } + // Silence unused if NativePType isn't otherwise referenced. + let _: i32 = i32::PTYPE.bit_width() as i32; + black_box(sum) + }); +} diff --git a/encodings/runend/benches/chunked_patched.rs b/encodings/runend/benches/chunked_patched.rs new file mode 100644 index 00000000000..e3fb610aee8 --- /dev/null +++ b/encodings/runend/benches/chunked_patched.rs @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Patched-decode benchmark: chunk-local patch overlay vs full-buffer scatter. +//! +//! Compares decoding a bit-packed primitive column with exceptions ("patches") two ways: +//! +//! - `bitpacked_patches_canonical`: the canonical executor path for a `BitPacked` array +//! that carries its patches internally. It bit-unpacks the entire column into an +//! N-element buffer, then scatters the exception values into that buffer by index. +//! The scatter is a sequence of random writes; once N spills L2/L3 each write can miss. +//! +//! - `bitpacked_patches_chunked`: the same logical data decoded through the chunked engine +//! as a `PatchedProducer` over a `BitPackedPrimitiveProducer`. Each 1024-element base +//! chunk is bit-unpacked into the scratch, the patches that fall in that chunk's range +//! are overlaid while the chunk is still hot in L1, then flushed. The patch writes never +//! touch a cold cache line. +//! +//! Run with `cargo bench -p vortex-runend --bench chunked_patched`. + +use std::fmt; +use std::sync::LazyLock; + +use divan::Bencher; +use divan::black_box; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::_chunked_exec::Scratch; +use vortex_array::_chunked_exec::primitive::PrimitiveChunkProducer; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::session::ArraySession; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_fastlanes::BitPackedArray; +use vortex_fastlanes::BitPackedArrayExt; +use vortex_fastlanes::BitPackedData; +use vortex_fastlanes::_chunked_exec::build_chunked_patched_over_bitpacked; +use vortex_session::VortexSession; + +fn main() { + divan::main(); +} + +static SESSION: LazyLock = LazyLock::new(|| { + let s = VortexSession::empty().with::(); + vortex_fastlanes::initialize(&s); + s +}); + +#[derive(Copy, Clone)] +struct Args { + len: usize, + /// Fraction of values that are exceptions, as 1-in-`patch_stride`. + patch_stride: usize, + bit_width: u8, +} + +impl fmt::Display for Args { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let pct = 100.0 / self.patch_stride as f64; + write!( + f, + "len={} patches={:.1}% bw={}", + self.len, pct, self.bit_width + ) + } +} + +const ARGS: &[Args] = &[ + // 1M: output 4 MiB (2x L2). Patch density swept. + Args { len: 1_048_576, patch_stride: 100, bit_width: 8 }, // 1% + Args { len: 1_048_576, patch_stride: 20, bit_width: 8 }, // 5% + Args { len: 1_048_576, patch_stride: 10, bit_width: 8 }, // 10% + // 4M: output 16 MiB (8x L2, in L3). + Args { len: 4_194_304, patch_stride: 100, bit_width: 8 }, + Args { len: 4_194_304, patch_stride: 20, bit_width: 8 }, + Args { len: 4_194_304, patch_stride: 10, bit_width: 8 }, + // 16M: output 64 MiB (deep in L3). + Args { len: 16_777_216, patch_stride: 100, bit_width: 8 }, + Args { len: 16_777_216, patch_stride: 20, bit_width: 8 }, + Args { len: 16_777_216, patch_stride: 10, bit_width: 8 }, +]; + +/// Build a `BitPacked` whose values mostly fit in `bit_width` bits, with every +/// `patch_stride`-th value an exception (forcing it into the patches sidecar). +fn make_bitpacked_with_patches(args: Args) -> BitPackedArray { + let cap = (1u32 << args.bit_width) - 1; + let values: Vec = (0..args.len) + .map(|i| { + if i % args.patch_stride == 0 { + // Exception value above the bit-width ceiling. + cap + 1000 + (i as u32 & 0xffff) + } else { + (i as u32) & cap + } + }) + .collect(); + let prim = PrimitiveArray::new(Buffer::::from_iter(values), Validity::NonNullable); + let mut ctx = SESSION.create_execution_ctx(); + let bp = BitPackedData::encode(&prim.into_array(), args.bit_width, &mut ctx) + .expect("bitpack encode"); + assert!(bp.patches().is_some(), "expected patches at this density"); + bp +} + +/// Layout A: patches stored inside the BitPacked array, decoded canonically. +/// Bit-unpack the whole column, then scatter patches into the full N-element buffer. +#[divan::bench(args = ARGS)] +fn bitpacked_patches_canonical(bencher: Bencher, args: Args) { + let bp = make_bitpacked_with_patches(args); + bencher + .with_inputs(|| bp.clone().into_array()) + .bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + black_box(a.clone().execute::(&mut ctx).unwrap()) + }); +} + +/// Layout B: same data decoded as a chunked PatchedProducer over a patchless +/// BitPackedPrimitiveProducer. Patch overlay happens chunk-locally in L1. +#[divan::bench(args = ARGS)] +fn bitpacked_patches_chunked(bencher: Bencher, args: Args) { + let bp = make_bitpacked_with_patches(args); + bencher.with_inputs(|| bp.clone()).bench_local_refs(|bp| { + let mut ctx = SESSION.create_execution_ctx(); + let mut producer = build_chunked_patched_over_bitpacked::(bp, &mut ctx) + .unwrap() + .expect("non-sliced"); + let mut out = BufferMut::::with_capacity(args.len); + let mut scratch = Scratch::::new(); + let mut written = 0usize; + while let Some(chunk) = producer.next_chunk(&mut scratch).unwrap() { + // SAFETY: out has capacity args.len; chunk total never exceeds it. + unsafe { + let dst = out.spare_capacity_mut().as_mut_ptr().add(written).cast::(); + std::ptr::copy_nonoverlapping(chunk.as_ptr(), dst, chunk.len()); + } + written += chunk.len(); + } + unsafe { out.set_len(written) }; + black_box(out.freeze()) + }); +} diff --git a/encodings/runend/benches/chunked_sinks.rs b/encodings/runend/benches/chunked_sinks.rs new file mode 100644 index 00000000000..bdf994b6439 --- /dev/null +++ b/encodings/runend/benches/chunked_sinks.rs @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Sink-fusion benchmark for the chunked execution engine. +//! +//! Compares the fused-pipeline path (decode + operator in one chunked pass, no +//! intermediate `Buffer`) against the canonical two-pass equivalent +//! (`array.execute::(ctx)?` followed by a scalar loop over the materialised +//! buffer) for three common scalar operator shapes: +//! +//! - `filter(x > c)` on `Dict>` +//! - `cast(i32 → i64)` on `Dict` +//! - `scalar add` (`x + c`) on `Dict` +//! +//! Run with `cargo bench -p vortex-runend --bench chunked_sinks`. + +use std::fmt; +use std::sync::LazyLock; + +use divan::Bencher; +use divan::black_box; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher; +use vortex_array::_chunked_exec::primitive::default_dispatcher; +use vortex_array::_chunked_exec::sink::FilterSink; +use vortex_array::_chunked_exec::sink::MapSink; +use vortex_array::_chunked_exec::sink::drive_into_sink; +use vortex_array::arrays::DictArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::session::ArraySession; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_fastlanes::BitPackedData; +use vortex_session::VortexSession; + +fn main() { + divan::main(); +} + +static SESSION: LazyLock = LazyLock::new(|| { + let s = VortexSession::empty().with::(); + vortex_runend::initialize(&s); + vortex_fastlanes::initialize(&s); + s +}); + +static DISPATCHER: LazyLock = LazyLock::new(|| { + let mut d = default_dispatcher(); + vortex_runend::_chunked_exec::register_chunk_kernels(&mut d); + vortex_fastlanes::_chunked_exec::register_chunk_kernels(&mut d); + d +}); + +// ============================================================================ +// Filter — Dict> with selectivity ~50% +// ============================================================================ + +#[derive(Copy, Clone)] +struct FilterArgs { + len: usize, + dict_size: usize, + bit_width: u8, + threshold: i32, +} + +impl fmt::Display for FilterArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "len={} dict={} bw={} pred=x>{}", + self.len, self.dict_size, self.bit_width, self.threshold + ) + } +} + +const FILTER_ARGS: &[FilterArgs] = &[ + FilterArgs { + len: 1_048_576, + dict_size: 256, + bit_width: 8, + threshold: 2000, + }, + FilterArgs { + len: 4_194_304, + dict_size: 256, + bit_width: 8, + threshold: 2000, + }, + FilterArgs { + len: 16_777_216, + dict_size: 256, + bit_width: 8, + threshold: 2000, + }, +]; + +/// Build `Dict>` with dict values `[0, 17, 34, …]` so threshold=2000 +/// gives ~50% selectivity (passes if dict[code] > 2000 → roughly half the dict). +fn make_filter_input(args: FilterArgs) -> vortex_array::ArrayRef { + let dict_values: Vec = (0..args.dict_size as i32).map(|i| i * 17 + 11).collect(); + let codes: Vec = (0..args.len) + .map(|i| (i % args.dict_size) as u16) + .collect(); + let dict = PrimitiveArray::new( + Buffer::::from_iter(dict_values), + Validity::NonNullable, + ); + let codes_prim = PrimitiveArray::new(Buffer::::from_iter(codes), Validity::NonNullable); + let mut ctx = SESSION.create_execution_ctx(); + let bp = BitPackedData::encode(&codes_prim.into_array(), args.bit_width, &mut ctx) + .expect("bitpack"); + DictArray::try_new(bp.into_array(), dict.into_array()) + .expect("dict") + .into_array() +} + +#[divan::bench(args = FILTER_ARGS)] +fn filter_canonical_two_pass(bencher: Bencher, args: FilterArgs) { + let array = make_filter_input(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + // Pass 1: decode the full array into a Buffer. + let prim = a.clone().execute::(&mut ctx).unwrap(); + let slice = prim.as_slice::(); + // Pass 2: walk the buffer, collecting survivors into a new buffer. + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + if v > args.threshold { + out.push(v); + } + } + black_box(out.freeze()) + }); +} + +#[divan::bench(args = FILTER_ARGS)] +fn filter_chunked_sink(bencher: Bencher, args: FilterArgs) { + let array = make_filter_input(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + let threshold = args.threshold; + let sink = FilterSink::::with_capacity(args.len, move |v| v > threshold); + let buf: Buffer = drive_into_sink(a.clone(), &DISPATCHER, sink, &mut ctx).unwrap(); + black_box(buf) + }); +} + +// ============================================================================ +// Cast — Dict → i64 +// ============================================================================ + +#[derive(Copy, Clone)] +struct CastArgs { + len: usize, + dict_size: usize, +} + +impl fmt::Display for CastArgs { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "len={} dict={}", self.len, self.dict_size) + } +} + +const CAST_ARGS: &[CastArgs] = &[ + CastArgs { + len: 1_048_576, + dict_size: 256, + }, + CastArgs { + len: 4_194_304, + dict_size: 256, + }, + CastArgs { + len: 16_777_216, + dict_size: 256, + }, +]; + +fn make_cast_input(args: CastArgs) -> vortex_array::ArrayRef { + let dict_values: Vec = (0..args.dict_size as i32).map(|i| i * 7 + 13).collect(); + let codes: Vec = (0..args.len).map(|i| (i % args.dict_size) as u32).collect(); + let dict = PrimitiveArray::new( + Buffer::::from_iter(dict_values), + Validity::NonNullable, + ); + let codes_prim = PrimitiveArray::new(Buffer::::from_iter(codes), Validity::NonNullable); + DictArray::try_new(codes_prim.into_array(), dict.into_array()) + .expect("dict") + .into_array() +} + +#[divan::bench(args = CAST_ARGS)] +fn cast_canonical_two_pass(bencher: Bencher, args: CastArgs) { + let array = make_cast_input(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + let prim = a.clone().execute::(&mut ctx).unwrap(); + let slice = prim.as_slice::(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + out.push(v as i64); + } + black_box(out.freeze()) + }); +} + +#[divan::bench(args = CAST_ARGS)] +fn cast_chunked_sink(bencher: Bencher, args: CastArgs) { + let array = make_cast_input(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + let sink = MapSink::::with_capacity(args.len, |v| v as i64); + let buf: Buffer = drive_into_sink(a.clone(), &DISPATCHER, sink, &mut ctx).unwrap(); + black_box(buf) + }); +} + +// ============================================================================ +// Scalar add — Dict, output i32, `x + 42` +// ============================================================================ + +#[divan::bench(args = CAST_ARGS)] +fn scalar_add_canonical_two_pass(bencher: Bencher, args: CastArgs) { + let array = make_cast_input(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + let prim = a.clone().execute::(&mut ctx).unwrap(); + let slice = prim.as_slice::(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + out.push(v.wrapping_add(42)); + } + black_box(out.freeze()) + }); +} + +#[divan::bench(args = CAST_ARGS)] +fn scalar_add_chunked_sink(bencher: Bencher, args: CastArgs) { + let array = make_cast_input(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + let sink = + MapSink::::with_capacity(args.len, |v| v.wrapping_add(42)); + let buf: Buffer = drive_into_sink(a.clone(), &DISPATCHER, sink, &mut ctx).unwrap(); + black_box(buf) + }); +} + +// ============================================================================ +// Scalar mul + add (richer scalar pipeline: `x * 3 + 7`) +// ============================================================================ + +#[divan::bench(args = CAST_ARGS)] +fn scalar_mul_add_canonical_two_pass(bencher: Bencher, args: CastArgs) { + let array = make_cast_input(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + let prim = a.clone().execute::(&mut ctx).unwrap(); + let slice = prim.as_slice::(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + out.push(v.wrapping_mul(3).wrapping_add(7)); + } + black_box(out.freeze()) + }); +} + +#[divan::bench(args = CAST_ARGS)] +fn scalar_mul_add_chunked_sink(bencher: Bencher, args: CastArgs) { + let array = make_cast_input(args); + bencher.with_inputs(|| array.clone()).bench_local_refs(|a| { + let mut ctx = SESSION.create_execution_ctx(); + let sink = MapSink::::with_capacity(args.len, |v| { + v.wrapping_mul(3).wrapping_add(7) + }); + let buf: Buffer = drive_into_sink(a.clone(), &DISPATCHER, sink, &mut ctx).unwrap(); + black_box(buf) + }); +} diff --git a/encodings/runend/examples/profile_chunked.rs b/encodings/runend/examples/profile_chunked.rs new file mode 100644 index 00000000000..206f73f025d --- /dev/null +++ b/encodings/runend/examples/profile_chunked.rs @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Standalone profile binary for the chunked execution engine. +//! +//! Builds with `cargo build --release --example profile_chunked -p vortex-runend`. +//! Designed to be sampled by `samply record` or `perf record` — runs each path in a +//! tight loop with no benchmark-framework overhead, so the profile shows the +//! decompress kernels directly. +//! +//! Usage: +//! samply record ./target/release/examples/profile_chunked dict_bp 4194304 256 8 chunked 50 +//! samply record ./target/release/examples/profile_chunked dict_bp 4194304 256 8 canonical 50 + +use std::env; +use std::sync::Arc; +use std::time::Instant; + +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::_chunked_exec::primitive::decode_to_buffer; +use vortex_array::_chunked_exec::primitive::default_dispatcher; +use vortex_array::arrays::DictArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::session::ArraySession; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_fastlanes::BitPackedData; +use vortex_runend::_chunked_exec::register_chunk_kernels as register_runend_chunk_kernels; +use vortex_session::VortexSession; + +fn main() { + let args: Vec = env::args().collect(); + if args.len() < 6 { + eprintln!( + "usage: {} ", + args[0] + ); + eprintln!(" shape: dict_bp | dict_p"); + eprintln!(" path: chunked | canonical"); + std::process::exit(2); + } + let shape = &args[1]; + let len: usize = args[2].parse().unwrap(); + let dict_size: usize = args[3].parse().unwrap(); + let bit_width: u8 = args[4].parse().unwrap(); + let path = &args[5]; + let iters: usize = args.get(6).map(|s| s.parse().unwrap()).unwrap_or(20); + + let session = { + let s = VortexSession::empty().with::(); + vortex_runend::initialize(&s); + vortex_fastlanes::initialize(&s); + s + }; + let dispatcher = { + let mut d = default_dispatcher(); + register_runend_chunk_kernels(&mut d); + vortex_fastlanes::_chunked_exec::register_chunk_kernels(&mut d); + Arc::new(d) + }; + + let array = match shape.as_str() { + "dict_bp" => build_dict_bp(&session, len, dict_size, bit_width), + "dict_p" => build_dict_p(len, dict_size), + _ => panic!("unknown shape {shape}"), + }; + + // Warm caches. + for _ in 0..3 { + match path.as_str() { + "chunked" => { + let mut ctx = session.create_execution_ctx(); + let _w = decode_to_buffer::(array.clone(), &dispatcher, &mut ctx).unwrap(); + std::hint::black_box(_w); + } + "canonical" => { + let mut ctx = session.create_execution_ctx(); + let _w = array.clone().execute::(&mut ctx).unwrap(); + std::hint::black_box(_w); + } + _ => panic!("unknown path {path}"), + } + } + + // Time the loop body that the profiler will sample inside. + let t0 = Instant::now(); + for _ in 0..iters { + match path.as_str() { + "chunked" => { + let mut ctx = session.create_execution_ctx(); + let b = decode_to_buffer::(array.clone(), &dispatcher, &mut ctx).unwrap(); + std::hint::black_box(b); + } + "canonical" => { + let mut ctx = session.create_execution_ctx(); + let b = array.clone().execute::(&mut ctx).unwrap(); + std::hint::black_box(b); + } + _ => unreachable!(), + } + } + let elapsed = t0.elapsed(); + let per_iter_us = (elapsed.as_micros() as f64) / (iters as f64); + let elems_per_us = (len as f64) / per_iter_us; + println!( + "{} {} N={} dict={} bw={} iters={} per_iter={:.1}µs {:.1}M elems/sec", + shape, + path, + len, + dict_size, + bit_width, + iters, + per_iter_us, + elems_per_us + ); +} + +fn build_dict_bp( + session: &VortexSession, + len: usize, + dict_size: usize, + bit_width: u8, +) -> vortex_array::ArrayRef { + let dict_values: Vec = (0..dict_size as i32).map(|i| i * 17 + 11).collect(); + let codes: Vec = (0..len).map(|i| (i % dict_size) as u16).collect(); + let dict = PrimitiveArray::new( + Buffer::::from_iter(dict_values), + Validity::NonNullable, + ); + let codes_prim = + PrimitiveArray::new(Buffer::::from_iter(codes), Validity::NonNullable); + let mut ctx = session.create_execution_ctx(); + let bp = BitPackedData::encode(&codes_prim.into_array(), bit_width, &mut ctx).unwrap(); + DictArray::try_new(bp.into_array(), dict.into_array()) + .unwrap() + .into_array() +} + +fn build_dict_p(len: usize, dict_size: usize) -> vortex_array::ArrayRef { + let dict_values: Vec = (0..dict_size as i32).map(|i| i * 17 + 11).collect(); + let codes: Vec = (0..len).map(|i| (i % dict_size) as u32).collect(); + let dict = PrimitiveArray::new( + Buffer::::from_iter(dict_values), + Validity::NonNullable, + ); + let codes_prim = + PrimitiveArray::new(Buffer::::from_iter(codes), Validity::NonNullable); + DictArray::try_new(codes_prim.into_array(), dict.into_array()) + .unwrap() + .into_array() +} diff --git a/encodings/runend/src/_chunked_exec.rs b/encodings/runend/src/_chunked_exec.rs new file mode 100644 index 00000000000..e604e940651 --- /dev/null +++ b/encodings/runend/src/_chunked_exec.rs @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! `RunEnd` chunked decoder + registration with `vortex_array::_chunked_exec`. +//! +//! Lives in this crate because `RunEnd` is defined here; the producer itself +//! ([`vortex_array::_chunked_exec::primitive::RunEndPrimitiveProducer`]) is generic and +//! lives in `vortex-array`. The kernel here just bridges Vortex's encoding view to the +//! generic streaming primitive. + +use std::sync::Arc; + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::VTable; +use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernel; +use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher; +use vortex_array::_chunked_exec::primitive::PrimitiveChunkProducer; +use vortex_array::_chunked_exec::primitive::build_runend_producer; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::Nullability; +use vortex_error::VortexResult; + +use crate::RunEnd; +use crate::array::RunEndArrayExt as _; + +/// `RunEnd` chunked kernel. +pub struct RunEndKernel { + _marker: std::marker::PhantomData T>, +} + +impl RunEndKernel { + /// Construct an empty kernel marker. + pub fn new() -> Self { + Self { + _marker: std::marker::PhantomData, + } + } +} + +impl Default for RunEndKernel { + fn default() -> Self { + Self::new() + } +} + +impl PrimitiveChunkKernel for RunEndKernel { + fn build( + &self, + array: &ArrayRef, + ctx: &mut ExecutionCtx, + ) -> VortexResult>>> { + let Some(re) = array.as_opt::() else { + return Ok(None); + }; + if !matches!(array.dtype().nullability(), Nullability::NonNullable) { + return Ok(None); + } + let values = re.values(); + let ends = re.ends(); + if !matches!(values.dtype().nullability(), Nullability::NonNullable) + || !matches!(ends.dtype().nullability(), Nullability::NonNullable) + { + return Ok(None); + } + let DType::Primitive(values_ptype, _) = *values.dtype() else { + return Ok(None); + }; + if values_ptype != T::PTYPE { + return Ok(None); + } + let offset = re.offset(); + let len = array.len(); + let values_canonical = values.clone().execute::(ctx)?; + let ends_canonical = ends.clone().execute::(ctx)?; + Ok(Some(build_runend_producer::( + values_canonical, + ends_canonical, + offset, + len, + )?)) + } +} + +/// Register the `RunEnd` chunk kernel onto `dispatcher` for every primitive output type +/// that we currently care about. +/// +/// This is the runend-side equivalent of `vortex_array::_chunked_exec::register_defaults`; +/// downstream crates that want the fused `Dict>` path should call both. +pub fn register_chunk_kernels(dispatcher: &mut PrimitiveChunkKernelDispatcher) { + macro_rules! register_all_for { + ($($T:ty),*) => { + $( + dispatcher.register::<$T>(RunEnd.id(), Arc::new(RunEndKernel::<$T>::new())); + )* + }; + } + register_all_for!(u8, u16, u32, u64, i8, i16, i32, i64, f32, f64); +} + +#[cfg(test)] +mod tests { + use vortex_array::ExecutionCtx; + use vortex_array::IntoArray; + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; + use vortex_array::_chunked_exec::primitive::decode_to_buffer; + use vortex_array::_chunked_exec::primitive::default_dispatcher; + use vortex_array::arrays::DictArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_error::VortexResult; + + use super::register_chunk_kernels; + use crate::RunEnd; + + fn ctx() -> ExecutionCtx { + LEGACY_SESSION.create_execution_ctx() + } + + fn dispatcher() -> vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher { + let mut d = default_dispatcher(); + register_chunk_kernels(&mut d); + d + } + + /// Construct a RunEnd from an iterator of values; useful in tests. + fn make_runend_i32(values: &[i32]) -> VortexResult { + let prim = PrimitiveArray::new( + Buffer::::from_iter(values.iter().copied()), + Validity::NonNullable, + ); + let re = RunEnd::encode(prim.into_array(), &mut ctx())?; + Ok(re.into_array()) + } + + #[test] + fn runend_chunked() -> VortexResult<()> { + let mut values = Vec::with_capacity(4000); + values.extend(std::iter::repeat(1i32).take(100)); + values.extend(std::iter::repeat(2i32).take(150)); + values.extend(std::iter::repeat(3i32).take(850)); + values.extend(std::iter::repeat(4i32).take(2900)); + let re = make_runend_i32(&values)?; + let buf = decode_to_buffer::(re, &dispatcher(), &mut ctx())?; + assert_eq!(buf.as_slice(), values.as_slice()); + Ok(()) + } + + #[test] + fn fused_dict_runend_chunked() -> VortexResult<()> { + // Inner dictionary VALUES are RunEnd-encoded (so the dict has 12 logical entries). + let inner_values = vec![100i32, 100, 100, 200, 200, 200, 300, 300, 300, 400, 400, 400]; + let inner_re = make_runend_i32(&inner_values)?; + + let codes_vec: Vec = (0..8192u32).map(|i| (i % 12) as u8).collect(); + let codes_arr = PrimitiveArray::new( + Buffer::::from_iter(codes_vec.iter().copied()), + Validity::NonNullable, + ); + let dict = DictArray::try_new(codes_arr.into_array(), inner_re)?; + + let buf = decode_to_buffer::(dict.into_array(), &dispatcher(), &mut ctx())?; + let expected: Vec = codes_vec + .iter() + .map(|c| inner_values[*c as usize]) + .collect(); + assert_eq!(buf.as_slice(), expected.as_slice()); + Ok(()) + } + + #[test] + fn runend_sliced_chunked() -> VortexResult<()> { + let mut values = Vec::with_capacity(2000); + for run_idx in 0..20 { + values.extend(std::iter::repeat(run_idx as i32 + 1).take(100)); + } + let re_full = make_runend_i32(&values)?; + let sliced = re_full.slice(50..1500)?; + let buf = decode_to_buffer::(sliced, &dispatcher(), &mut ctx())?; + assert_eq!(buf.as_slice(), &values[50..1500]); + Ok(()) + } + +} diff --git a/encodings/runend/src/compute/take.rs b/encodings/runend/src/compute/take.rs index 7100faf9eac..f6f09b68c65 100644 --- a/encodings/runend/src/compute/take.rs +++ b/encodings/runend/src/compute/take.rs @@ -31,6 +31,18 @@ impl TakeExecute for RunEnd { indices: &ArrayRef, ctx: &mut ExecutionCtx, ) -> VortexResult> { + // Fall back to the canonical-then-gather path when there are many more indices + // than the RunEnd array's logical length. In that regime, canonicalizing the + // (small) RunEnd values once and doing an AVX2 gather is dramatically cheaper + // than `indices.len()` independent binary searches into the ends array. The + // pathological case is `Dict` with a small RunEnd dictionary and many + // codes: each code triggers a `search_sorted` here, blowing up to N * log K + // work plus an O(N) `Vec` allocation, when a single decode of the RunEnd + // (size = array.len()) followed by an AVX2 take would do. + if indices.len() > array.len() { + return Ok(None); + } + let primitive_indices = indices.clone().execute::(ctx)?; let checked_indices = match_each_integer_ptype!(primitive_indices.ptype(), |P| { @@ -155,6 +167,49 @@ mod tests { assert_arrays_eq!(taken, expected.into_array()); } + /// Regression test: when `indices.len()` exceeds `array.len()`, the take impl falls back + /// (returns `None`) so the canonical executor picks up the small RunEnd canonicalize + + /// AVX2 gather path. Without the fallback, `Dict.execute::` would + /// pay `N * log K` work per index against a tiny ends array. See `dict_runend_canonical` + /// in `encodings/runend/benches/chunked_exec.rs`. + #[test] + fn ree_dict_take_dense_indices() -> vortex_error::VortexResult<()> { + use std::sync::LazyLock; + + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::DictArray; + use vortex_array::session::ArraySession; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_session::VortexSession; + + static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + let mut ctx = SESSION.create_execution_ctx(); + + // dict_size=8, inner_run=2 → ends=[2,4,6,8], values=[0,1,2,3] + let dict_values = + PrimitiveArray::new(Buffer::::from_iter([0, 0, 1, 1, 2, 2, 3, 3]), Validity::NonNullable) + .into_array(); + let dict_re = RunEnd::encode(dict_values, &mut ctx)?.into_array(); + + // 32 codes (>> dict.len()=8), each `i % 8` so the result is 0,0,1,1,...,3,3 repeated 4x. + let codes_buf: Vec = (0..32u32).map(|i| i % 8).collect(); + let codes = + PrimitiveArray::new(Buffer::::from_iter(codes_buf), Validity::NonNullable).into_array(); + let dict = DictArray::try_new(codes, dict_re)?.into_array(); + + let taken = dict.execute::(&mut ctx)?; + let expected: Vec = (0..32).map(|i| (i % 8) / 2).collect(); + assert_arrays_eq!( + taken.into_array(), + PrimitiveArray::new(Buffer::::from_iter(expected), Validity::NonNullable) + .into_array() + ); + Ok(()) + } + #[rstest] #[case(ree_array())] #[case(RunEnd::encode( diff --git a/encodings/runend/src/lib.rs b/encodings/runend/src/lib.rs index 8770dbbf58e..84a96849d22 100644 --- a/encodings/runend/src/lib.rs +++ b/encodings/runend/src/lib.rs @@ -8,6 +8,8 @@ pub use arbitrary::ArbitraryRunEndArray; pub use array::*; pub use iter::trimmed_ends_iter; +#[doc(hidden)] +pub mod _chunked_exec; mod array; #[cfg(feature = "arrow")] mod arrow; diff --git a/vortex-array/src/_chunked_exec/listview.rs b/vortex-array/src/_chunked_exec/listview.rs new file mode 100644 index 00000000000..683b7e5973b --- /dev/null +++ b/vortex-array/src/_chunked_exec/listview.rs @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Chunked decode for [`ListView`] arrays with primitive elements. +//! +//! The canonical-by-canonical path for `ListView` decompresses the offsets, +//! sizes, and elements buffers up-front. When offsets and sizes are themselves +//! bit-packed (the common case), that's three full materializations before a consumer +//! sees a single row. +//! +//! The chunked path emits **row windows**: each window is a `(offsets, sizes, +//! elements_slice)` triple covering up to [`CHUNK_LEN`] rows. The producer keeps a +//! reusable [`Scratch`] for the offsets and another for the sizes (so the steady-state +//! decompress footprint is two scratches, not three full buffers), and the elements +//! slice is just a borrowed view onto the shared, already-materialized elements buffer. +//! +//! For v1 we materialize the elements buffer once up-front (since `ListView` allows +//! arbitrary, possibly-overlapping offsets we don't know in advance which slice each +//! chunk needs without a pre-pass). A future iteration can replace this with a +//! per-chunk min-offset / max-offset+size pre-pass and decompress only the referenced +//! slice — the producer API already supports that shape. + +use vortex_buffer::Buffer; +use vortex_error::VortexResult; + +use super::CHUNK_LEN; +use super::scratch::Scratch; +use crate::ArrayRef; +use crate::arrays::ListView; +use crate::arrays::PrimitiveArray; +use crate::arrays::listview::ListViewArrayExt; +use crate::arrays::primitive::PrimitiveArrayExt; +use crate::dtype::NativePType; +use crate::dtype::Nullability; +use crate::executor::ExecutionCtx; +use crate::match_each_integer_ptype; + +/// A row window emitted by [`ListChunkProducer`]. +/// +/// The lifetime is tied to the producer-owned `Scratch` buffers — the consumer must +/// finish reading the chunk before requesting the next. +pub struct ListChunk<'a, O, S, E> { + /// Offsets for this window of rows. + pub offsets: &'a [O], + /// Sizes for this window of rows. + pub sizes: &'a [S], + /// Shared reference to the (fully materialized) elements buffer. + pub elements: &'a [E], +} + +/// Streams chunks of `(offsets, sizes)` for a list-of-primitive array. +/// +/// `O` is the offset native type (`u32` or `u64`), `S` the size native type, and `E` the +/// element native type. +pub struct ListChunkProducer { + offsets: Buffer, + sizes: Buffer, + elements: Buffer, + cursor: usize, +} + +impl ListChunkProducer +where + O: NativePType, + S: NativePType, + E: NativePType, +{ + /// Construct directly from canonicalized buffers. + pub fn new(offsets: Buffer, sizes: Buffer, elements: Buffer) -> Self { + Self { + offsets, + sizes, + elements, + cursor: 0, + } + } + + /// Number of rows that remain to be emitted. + pub fn remaining(&self) -> usize { + self.offsets.as_slice().len().saturating_sub(self.cursor) + } + + /// Total number of rows. + pub fn len(&self) -> usize { + self.offsets.as_slice().len() + } + + /// Whether there are any rows at all. + pub fn is_empty(&self) -> bool { + self.offsets.as_slice().is_empty() + } + + /// Drive the producer to completion, calling `f` with each row chunk as typed + /// slices. This is the fast-path API for callers who know the offset/size types + /// at compile time — it has no dyn-dispatch in the hot loop. + pub fn for_each_chunk_typed(&mut self, mut f: F) + where + F: FnMut(&[O], &[S], &[E]), + { + let mut o_scratch = Scratch::::new(); + let mut s_scratch = Scratch::::new(); + while let Some(chunk) = self.next_chunk(&mut o_scratch, &mut s_scratch) { + f(chunk.offsets, chunk.sizes, chunk.elements); + } + } + + /// Pull the next row window. Returns slices directly out of the already-canonical + /// `offsets`/`sizes`/`elements` buffers — no memcpy, no scratch hop. + /// + /// The scratch arguments are kept for API symmetry with future chunked-bit-unpack + /// variants where the producer materialises its own offsets/sizes per chunk; for + /// the current canonical-then-chunk producer they are unused. + pub fn next_chunk<'a>( + &'a mut self, + _offset_scratch: &'a mut Scratch, + _size_scratch: &'a mut Scratch, + ) -> Option> { + let total = self.offsets.as_slice().len(); + if self.cursor >= total { + return None; + } + let take = CHUNK_LEN.min(total - self.cursor); + let offsets = &self.offsets.as_slice()[self.cursor..self.cursor + take]; + let sizes = &self.sizes.as_slice()[self.cursor..self.cursor + take]; + self.cursor += take; + Some(ListChunk { + offsets, + sizes, + elements: self.elements.as_slice(), + }) + } +} + +/// Build a typed [`ListChunkProducer`] directly. The caller commits to the offset and +/// size native types up front; the producer is callable without dyn dispatch in the hot +/// loop. +/// +/// Errors if the array's offset/size/element ptypes don't match ``. +pub fn build_listview_producer_typed( + array: ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult> +where + O: NativePType, + S: NativePType, + E: NativePType, +{ + let Some(lv) = array.as_opt::() else { + vortex_error::vortex_bail!( + "build_listview_producer_typed: expected ListView, got {}", + array.encoding_id() + ); + }; + if !matches!(array.dtype().nullability(), Nullability::NonNullable) { + vortex_error::vortex_bail!("build_listview_producer_typed: only non-nullable for v1"); + } + let offsets = lv.offsets().clone().execute::(ctx)?; + let sizes = lv.sizes().clone().execute::(ctx)?; + let elements = lv.elements().clone().execute::(ctx)?; + if O::PTYPE != offsets.ptype() + || S::PTYPE != sizes.ptype() + || E::PTYPE != elements.ptype() + { + vortex_error::vortex_bail!( + "build_listview_producer_typed: ptype mismatch ({}, {}, {}) vs requested ({}, {}, {})", + offsets.ptype(), + sizes.ptype(), + elements.ptype(), + O::PTYPE, + S::PTYPE, + E::PTYPE, + ); + } + Ok(ListChunkProducer::::new( + offsets.into_buffer::(), + sizes.into_buffer::(), + elements.into_buffer::(), + )) +} + +/// Build a [`ListChunkProducer`] for a `ListView` array. +/// +/// Canonicalizes offsets, sizes and elements once via the existing executor, then returns +/// a producer that streams row windows. The producer can be used to drive a downstream +/// consumer without ever materializing a full `ListViewArray`. +pub fn build_listview_primitive_producer( + array: ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let Some(lv) = array.as_opt::() else { + vortex_error::vortex_bail!( + "build_listview_primitive_producer: expected ListView, got {}", + array.encoding_id() + ); + }; + if !matches!(array.dtype().nullability(), Nullability::NonNullable) { + vortex_error::vortex_bail!( + "build_listview_primitive_producer: only non-nullable for v1" + ); + } + let offsets = lv.offsets().clone().execute::(ctx)?; + let sizes = lv.sizes().clone().execute::(ctx)?; + let elements = lv.elements().clone().execute::(ctx)?; + if E::PTYPE != elements.ptype() { + vortex_error::vortex_bail!( + "build_listview_primitive_producer: element type {} does not match {}", + E::PTYPE, + elements.ptype() + ); + } + let elements_buf = elements.into_buffer::(); + Ok(match_each_integer_ptype!(offsets.ptype(), |O| { + match_each_integer_ptype!(sizes.ptype(), |S| { + BoxedListChunkProducer::new::( + offsets.into_buffer::(), + sizes.into_buffer::(), + elements_buf, + ) + }) + })) +} + +/// Type-erased entry point handle that lets callers dispatch to a `ListChunkProducer` +/// without monomorphizing on every `(O, S, E)` triple. +pub struct BoxedListChunkProducer { + inner: Box>, +} + +impl BoxedListChunkProducer { + fn new(offsets: Buffer, sizes: Buffer, elements: Buffer) -> Self + where + O: NativePType + num_traits::AsPrimitive + Send + 'static, + S: NativePType + num_traits::AsPrimitive + Send + 'static, + E: NativePType + Send + 'static, + { + Self { + inner: Box::new(ErasedProducer::::new(ListChunkProducer::new( + offsets, sizes, elements, + ))), + } + } + + /// Total rows. + pub fn len(&self) -> usize { + self.inner.len_erased() + } + + /// Whether the producer has any rows. + pub fn is_empty(&self) -> bool { + self.inner.len_erased() == 0 + } + + /// Drive one chunk and call `f` with the (offsets, sizes, elements) triple. The + /// closure may borrow the chunk for the duration of the call. Returns `Some(())` if + /// a chunk was produced; `None` when exhausted. + /// + /// We hide the offset/size native types behind a `dyn` adaptor so callers don't + /// have to monomorphize over every integer combination. + pub fn for_each_chunk(&mut self, mut f: F) + where + F: FnMut(ListChunkErased<'_, E>), + { + while let Some(()) = self.inner.next_chunk_erased(&mut f) {} + } +} + +/// Type-erased chunk view passed to [`BoxedListChunkProducer::for_each_chunk`]. +pub struct ListChunkErased<'a, E: NativePType> { + /// Number of rows in this chunk. + pub n: usize, + /// Function that returns row `i`'s offset as `usize`. + pub offset_of: &'a dyn Fn(usize) -> usize, + /// Function that returns row `i`'s size as `usize`. + pub size_of: &'a dyn Fn(usize) -> usize, + /// The shared elements buffer. + pub elements: &'a [E], +} + +trait ListChunkProducerErased: Send { + fn len_erased(&self) -> usize; + fn next_chunk_erased( + &mut self, + f: &mut dyn FnMut(ListChunkErased<'_, E>), + ) -> Option<()>; +} + +struct ErasedProducer { + inner: ListChunkProducer, + /// Persisted scratches so we don't pay a heap allocation per chunk. + offset_scratch: Scratch, + size_scratch: Scratch, +} + +impl ErasedProducer { + fn new(inner: ListChunkProducer) -> Self { + Self { + inner, + offset_scratch: Scratch::::new(), + size_scratch: Scratch::::new(), + } + } +} + +impl ListChunkProducerErased for ErasedProducer +where + O: NativePType + num_traits::AsPrimitive + Send, + S: NativePType + num_traits::AsPrimitive + Send, + E: NativePType + Send, +{ + fn len_erased(&self) -> usize { + self.inner.len() + } + + fn next_chunk_erased( + &mut self, + f: &mut dyn FnMut(ListChunkErased<'_, E>), + ) -> Option<()> { + let chunk = self + .inner + .next_chunk(&mut self.offset_scratch, &mut self.size_scratch)?; + let offsets = chunk.offsets; + let sizes = chunk.sizes; + let elements = chunk.elements; + let off_fn = |i: usize| -> usize { offsets[i].as_() }; + let sz_fn = |i: usize| -> usize { sizes[i].as_() }; + f(ListChunkErased { + n: offsets.len(), + offset_of: &off_fn, + size_of: &sz_fn, + elements, + }); + Some(()) + } +} + +#[cfg(test)] +mod tests { + use vortex_buffer::buffer; + use vortex_error::VortexResult; + + use super::*; + use crate::IntoArray; + use crate::LEGACY_SESSION; + use crate::VortexSessionExecute; + use crate::arrays::ListViewArray; + use crate::validity::Validity; + + fn ctx() -> ExecutionCtx { + LEGACY_SESSION.create_execution_ctx() + } + + #[test] + fn list_chunked_round_trip() -> VortexResult<()> { + // 3 lists: [1,2], [3], [4,5,6] + let elements = buffer![1i32, 2, 3, 4, 5, 6]; + let offsets = buffer![0u32, 2, 3]; + let sizes = buffer![2u32, 1, 3]; + let lv = ListViewArray::new( + elements.into_array(), + offsets.into_array(), + sizes.into_array(), + Validity::NonNullable, + ); + let mut producer = build_listview_primitive_producer::(lv.into_array(), &mut ctx())?; + + let mut row_data: Vec> = Vec::new(); + producer.for_each_chunk(|chunk| { + for i in 0..chunk.n { + let o = (chunk.offset_of)(i); + let s = (chunk.size_of)(i); + row_data.push(chunk.elements[o..o + s].to_vec()); + } + }); + assert_eq!(row_data, vec![vec![1, 2], vec![3], vec![4, 5, 6]]); + Ok(()) + } +} diff --git a/vortex-array/src/_chunked_exec/mod.rs b/vortex-array/src/_chunked_exec/mod.rs new file mode 100644 index 00000000000..d60d6c9002c --- /dev/null +++ b/vortex-array/src/_chunked_exec/mod.rs @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Chunked execution engine — experimental. +//! +//! The default executor in [`crate::executor`] drives an array to a fully materialized +//! [`crate::Canonical`] one encoding at a time. Each intermediate is the size of the whole +//! array; for stacks like `Dict>` the working set is enormous, eviction +//! storms are inevitable, and any per-stage allocation runs at the rate of the *output* +//! size rather than the rate of useful work. +//! +//! This module models execution differently: +//! +//! 1. A **producer** yields decoded values into a small, driver-owned [`Scratch`] buffer +//! sized to fit comfortably in L1d (1024 elements, 4–8 KiB for primitives). +//! 2. The driver pulls chunks until the producer is exhausted, copying each chunk into +//! its final destination (a builder, an Arrow buffer, an aggregator, …). The scratch +//! is reused across iterations, so the steady-state memory footprint of decode is the +//! scratch size plus whatever fixed dictionaries the producer holds. +//! 3. Custom **chunk kernels** can be registered to fuse multiple encoding layers into a +//! single pass — the model rule of thumb is that an encoding's chunk kernel is allowed +//! to *materialise its own children up-front* if they are bounded in size (e.g. a +//! dictionary's `values` slot), and then stream the unbounded `codes` chunk-by-chunk. +//! This is the same pattern as [`crate::arrays::dict::TakeExecute`] — the fused take +//! kernel reads `Dict.values` once and then walks `Dict.codes` chunk-by-chunk. +//! +//! See the producer traits for the contract and [`build_primitive_producer`] for dispatch. +//! +//! ## Status +//! +//! v1 spike — covers primitive output for `Dict`, `RunEnd` and the +//! fused `Dict>` stack, plus a streaming [`listview::ListChunkProducer`] +//! over `ListView` rows with bit-packable offsets/sizes. The module is +//! `_`-prefixed so it does not leak into the public API surface yet. + +pub mod listview; +pub mod primitive; +pub mod sink; + +mod scratch; + +/// Re-export of the AVX2-aware take helper, hoisted into the chunked engine namespace so +/// out-of-crate kernels can call it without piercing private compute modules. +pub use crate::arrays::primitive::compute::take::take_into_uninit; + +pub use scratch::Scratch; + +/// Number of elements per scratch chunk. +/// +/// 1024 matches the fastlanes chunk size, keeps the scratch under 8 KiB for any +/// primitive up to `u64`/`f64`, and is small enough to leave room for one fixed +/// dictionary in L1d on every CPU we care about. Empirically larger super-chunks +/// (tested at 4096) didn't move primitive workloads and regressed RunEnd at moderate +/// run lengths — the per-chunk dispatch isn't the bottleneck the profile suggested. +pub const CHUNK_LEN: usize = 1024; + +/// Drive a producer to completion, invoking `sink` with each emitted chunk. +/// +/// This is the canonical helper for "decode the whole thing into a downstream buffer". +/// The producer's scratch is supplied by the driver, so the same allocation is reused +/// across every chunk for the lifetime of the call. +pub fn drive_primitive( + mut producer: P, + scratch: &mut Scratch, + mut sink: S, +) -> vortex_error::VortexResult<()> +where + T: crate::dtype::NativePType, + P: primitive::PrimitiveChunkProducer, + S: FnMut(&[T]), +{ + while let Some(chunk) = producer.next_chunk(scratch)? { + sink(chunk); + } + Ok(()) +} diff --git a/vortex-array/src/_chunked_exec/primitive.rs b/vortex-array/src/_chunked_exec/primitive.rs new file mode 100644 index 00000000000..8757592cbbc --- /dev/null +++ b/vortex-array/src/_chunked_exec/primitive.rs @@ -0,0 +1,854 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Chunked decode producers that emit primitive values into a [`Scratch`] buffer. +//! +//! See the module-level docs in [`super`] for the model. This file holds: +//! +//! - The [`PrimitiveChunkProducer`] trait — the producer contract. +//! - The [`PrimitiveChunkKernel`] trait + [`PrimitiveChunkKernelDispatcher`] — dyn-dispatch +//! registry of fused kernels keyed on the encoding identifier of the outermost array. +//! - Concrete producers: [`SliceProducer`] (canonical fallback), +//! [`DictPrimitiveProducer`] (chunked gather over a small materialized dict — this is +//! also the implementation that *naturally fuses* `Dict>` because [`DictKernel`] +//! materializes its values slot via the regular executor), and [`RunEndPrimitiveProducer`] +//! (standalone run-end streaming, built via [`build_runend_producer`] by encodings that +//! know how to fetch the run-end children). +//! - [`build_primitive_producer`] — the entry point that dispatches into a registered +//! fused kernel or falls back to a canonical-then-stream path. +//! +//! For v1, the fast paths require non-nullable inputs and primitive children of the +//! expected shape. Anything outside that envelope falls back to the canonical-then-stream +//! producer, which is correct but not faster than the array-by-array baseline. + +use std::any::Any; +use std::mem::MaybeUninit; +use std::sync::Arc; + +use vortex_buffer::Buffer; +use vortex_error::VortexResult; + +use super::CHUNK_LEN; +use super::scratch::Scratch; +use crate::ArrayRef; +use crate::array::ArrayId; +use crate::array::VTable; +use crate::arrays::Dict; +use crate::arrays::PrimitiveArray; +use crate::arrays::dict::DictArraySlotsExt; +use crate::arrays::primitive::PrimitiveArrayExt; +use crate::dtype::DType; +use crate::dtype::NativePType; +use crate::dtype::Nullability; +use crate::dtype::PType; +use crate::executor::ExecutionCtx; +use crate::match_each_integer_ptype; +use crate::match_each_native_ptype; +use crate::validity::Validity; + +/// A streaming producer of primitive chunks. +/// +/// Implementors write up to [`CHUNK_LEN`] elements into the supplied scratch and return +/// the initialized prefix to the driver. Returning `Ok(None)` signals end-of-stream. +/// +/// The producer is responsible for owning anything it needs across chunk calls (e.g. a +/// materialized dictionary). The scratch is supplied by the driver and reused across +/// calls. +pub trait PrimitiveChunkProducer: Send { + /// Decode the next chunk of values. + fn next_chunk<'a>( + &mut self, + scratch: &'a mut Scratch, + ) -> VortexResult>; + + /// Decode the next chunk directly into `dst` (bypassing scratch). The destination must + /// have at least [`CHUNK_LEN`] writable cells. Returns the number of elements written, + /// or `None` when the producer is exhausted. + /// + /// Default implementation routes through [`Self::next_chunk`] and `copy_nonoverlapping`. + /// Producers that can write directly to the destination should override this — e.g. + /// [`DictPrimitiveProducer`] calls the AVX2 gather kernel into `dst` directly, avoiding + /// the scratch hop entirely. + fn next_chunk_into_uninit( + &mut self, + scratch: &mut Scratch, + dst: &mut [MaybeUninit], + ) -> VortexResult> { + match self.next_chunk(scratch)? { + Some(chunk) => { + let n = chunk.len(); + assert!(dst.len() >= n); + // SAFETY: dst capacity ≥ n; chunk is `n` initialized T values. + unsafe { + std::ptr::copy_nonoverlapping( + chunk.as_ptr(), + dst.as_mut_ptr().cast::(), + n, + ); + } + Ok(Some(n)) + } + None => Ok(None), + } + } + + /// Number of elements not yet produced. + fn remaining(&self) -> usize; +} + +/// Pluggable kernel that can build a fused chunk producer for an array. +/// +/// Kernels are registered against the outermost encoding id; each kernel inspects the +/// children at build time to decide whether it can handle the array shape. Returning +/// `Ok(None)` means "this kernel doesn't apply"; the dispatcher tries the next kernel +/// (or the fallback) instead. +pub trait PrimitiveChunkKernel: Send + Sync { + /// Try to build a fused producer for `array`. Returning `Ok(None)` defers to the next + /// kernel. + fn build( + &self, + array: &ArrayRef, + ctx: &mut ExecutionCtx, + ) -> VortexResult>>>; +} + +/// A typed entry in [`PrimitiveChunkKernelDispatcher`]. +/// +/// Boxed so the registry can hold heterogeneous `T`s; the typed lookup downcasts via +/// `Any` to recover the strongly-typed kernel slice. +struct TypedKernels { + by_encoding: rustc_hash::FxHashMap>>>, +} + +impl TypedKernels { + fn new() -> Self { + Self { + by_encoding: rustc_hash::FxHashMap::default(), + } + } +} + +/// Registry of fused chunk kernels. +/// +/// In the long run this would hang off [`vortex_session::VortexSession`] and ride the same +/// session-scoped lookup as the existing [`crate::optimizer::kernels::ArrayKernels`]. For +/// the v1 spike the dispatcher is constructed per-call; see [`default_dispatcher`]. +pub struct PrimitiveChunkKernelDispatcher { + // Keyed by TypeId of T to support multiple primitive output types. + entries: rustc_hash::FxHashMap>, +} + +impl Default for PrimitiveChunkKernelDispatcher { + fn default() -> Self { + Self::new() + } +} + +impl PrimitiveChunkKernelDispatcher { + /// Empty registry. + pub fn new() -> Self { + Self { + entries: rustc_hash::FxHashMap::default(), + } + } + + /// Register `kernel` for outermost encoding `encoding`. + pub fn register( + &mut self, + encoding: ArrayId, + kernel: Arc>, + ) { + let typed = self + .entries + .entry(std::any::TypeId::of::()) + .or_insert_with(|| Box::new(TypedKernels::::new())); + let typed = typed + .downcast_mut::>() + .expect("TypedKernels matches TypeId"); + typed.by_encoding.entry(encoding).or_default().push(kernel); + } + + fn kernels_for( + &self, + encoding: ArrayId, + ) -> &[Arc>] { + let Some(typed) = self.entries.get(&std::any::TypeId::of::()) else { + return &[]; + }; + let typed = typed + .downcast_ref::>() + .expect("TypedKernels matches TypeId"); + typed + .by_encoding + .get(&encoding) + .map(Vec::as_slice) + .unwrap_or(&[]) + } +} + +/// Build the v1 default dispatcher with the in-crate kernels only (Dict). +/// +/// To register the `RunEnd` kernel (and the fused `Dict>` path that emerges from +/// it), call `vortex_runend::register_chunk_kernels(&mut dispatcher)` from the consumer +/// crate. Out-of-crate encodings layer their kernels on top in the same way. +pub fn default_dispatcher() -> PrimitiveChunkKernelDispatcher { + let mut d = PrimitiveChunkKernelDispatcher::new(); + register_defaults(&mut d); + d +} + +/// Register the in-crate v1 kernels onto `dispatcher` for every supported `T`. +pub fn register_defaults(dispatcher: &mut PrimitiveChunkKernelDispatcher) { + macro_rules! register_all_for { + ($($T:ty),*) => { + $( + dispatcher.register::<$T>(Dict.id(), Arc::new(DictKernel::<$T>::new())); + )* + }; + } + register_all_for!(u8, u16, u32, u64, i8, i16, i32, i64, f32, f64); +} + +/// Build a chunk producer for `array`, dispatching to any registered fused kernel and +/// falling back to a canonicalize-then-stream producer if none apply. +/// +/// The caller passes a dispatcher (typically obtained from [`default_dispatcher`]). For +/// session integration the caller would fetch the session-scoped registry, but the spike +/// stays out of `VortexSession` for now to keep the diff small. +pub fn build_primitive_producer( + array: ArrayRef, + dispatcher: &PrimitiveChunkKernelDispatcher, + ctx: &mut ExecutionCtx, +) -> VortexResult>> { + let encoding = array.encoding_id(); + for kernel in dispatcher.kernels_for::(encoding) { + if let Some(p) = kernel.build(&array, ctx)? { + return Ok(p); + } + } + // Fallback: canonicalize, then stream the resulting slice in fixed-size chunks. + let canonical = array.execute::(ctx)?; + Ok(Box::new(SliceProducer::::from_primitive(canonical)?)) +} + +// --------------------------------------------------------------------------------------- +// Slice producer +// --------------------------------------------------------------------------------------- + +/// Walks a primitive buffer and emits fixed-size chunks via `copy_from_slice`. +/// +/// Used as the universal fallback; also useful as a baseline in benchmarks. +pub struct SliceProducer { + buffer: Buffer, + cursor: usize, +} + +impl SliceProducer { + /// Construct a slice producer over a primitive buffer. + pub fn new(buffer: Buffer) -> Self { + Self { buffer, cursor: 0 } + } + + /// Construct a slice producer from a [`PrimitiveArray`]. + /// + /// Errors if the array has a different physical type than `T`. + pub fn from_primitive(array: PrimitiveArray) -> VortexResult { + if T::PTYPE != array.ptype() { + vortex_error::vortex_bail!( + "SliceProducer<{}> cannot be built from PrimitiveArray of {}", + T::PTYPE, + array.ptype() + ); + } + Ok(Self::new(array.into_buffer::())) + } +} + +impl PrimitiveChunkProducer for SliceProducer { + fn next_chunk<'a>( + &mut self, + scratch: &'a mut Scratch, + ) -> VortexResult> { + let len = self.buffer.as_slice().len(); + if self.cursor >= len { + return Ok(None); + } + let take = CHUNK_LEN.min(len - self.cursor); + let src = &self.buffer.as_slice()[self.cursor..self.cursor + take]; + let dst = &mut scratch.as_uninit_mut()[..take]; + // SAFETY: `MaybeUninit` has the same layout as `T`; we overwrite exactly `take`. + unsafe { + let dst_ptr = dst.as_mut_ptr().cast::(); + std::ptr::copy_nonoverlapping(src.as_ptr(), dst_ptr, take); + } + self.cursor += take; + // SAFETY: `take` elements were just initialized. + Ok(Some(unsafe { + std::slice::from_raw_parts(scratch.as_uninit_mut().as_ptr().cast::(), take) + })) + } + + fn next_chunk_into_uninit( + &mut self, + _scratch: &mut Scratch, + dst: &mut [MaybeUninit], + ) -> VortexResult> { + let len = self.buffer.as_slice().len(); + if self.cursor >= len { + return Ok(None); + } + let take = CHUNK_LEN.min(len - self.cursor).min(dst.len()); + let src = &self.buffer.as_slice()[self.cursor..self.cursor + take]; + // SAFETY: dst has at least `take` cells; src is `take` initialized T values. + unsafe { + std::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr().cast::(), take); + } + self.cursor += take; + Ok(Some(take)) + } + + fn remaining(&self) -> usize { + self.buffer.as_slice().len().saturating_sub(self.cursor) + } +} + +// --------------------------------------------------------------------------------------- +// Dict producer +// --------------------------------------------------------------------------------------- + +/// Stream a `Dict` by chunked AVX2 gather over the codes into a +/// pre-materialized values buffer. +/// +/// The values are loaded once into a [`Buffer`] (the "dictionary"); each chunk picks +/// up to [`CHUNK_LEN`] codes and gathers values into the scratch via +/// [`take_into_uninit`], which selects the AVX2 kernel when available and falls back +/// to scalar. The dictionary is expected to fit in L1d (otherwise this encoding +/// shouldn't have been chosen). +pub struct DictPrimitiveProducer { + dict: Buffer, + codes: Buffer, + cursor: usize, +} + +impl DictPrimitiveProducer { + /// Construct directly from raw dict and code buffers. + pub fn new(dict: Buffer, codes: Buffer) -> Self { + Self { + dict, + codes, + cursor: 0, + } + } +} + +impl PrimitiveChunkProducer for DictPrimitiveProducer +where + I: crate::dtype::UnsignedPType, +{ + fn next_chunk<'a>( + &mut self, + scratch: &'a mut Scratch, + ) -> VortexResult> { + let total = self.codes.as_slice().len(); + if self.cursor >= total { + return Ok(None); + } + let take = CHUNK_LEN.min(total - self.cursor); + let codes_chunk = &self.codes.as_slice()[self.cursor..self.cursor + take]; + let dst: &mut [MaybeUninit] = &mut scratch.as_uninit_mut()[..take]; + + crate::arrays::primitive::compute::take::take_into_uninit::( + self.dict.as_slice(), + codes_chunk, + dst, + ); + + self.cursor += take; + // SAFETY: `take` elements just written by take_into_uninit. + let ptr = scratch.as_uninit_mut().as_ptr().cast::(); + Ok(Some(unsafe { std::slice::from_raw_parts(ptr, take) })) + } + + fn next_chunk_into_uninit( + &mut self, + _scratch: &mut Scratch, + dst: &mut [MaybeUninit], + ) -> VortexResult> { + let total = self.codes.as_slice().len(); + if self.cursor >= total { + return Ok(None); + } + let take = CHUNK_LEN.min(total - self.cursor); + let codes_chunk = &self.codes.as_slice()[self.cursor..self.cursor + take]; + crate::arrays::primitive::compute::take::take_into_uninit::( + self.dict.as_slice(), + codes_chunk, + &mut dst[..take], + ); + self.cursor += take; + Ok(Some(take)) + } + + fn remaining(&self) -> usize { + self.codes.as_slice().len().saturating_sub(self.cursor) + } +} + +/// The `Dict` kernel — and, when values are `RunEnd`, the fused +/// `Dict>` kernel. Both compose to a [`DictPrimitiveProducer`] after +/// any necessary up-front materialization of the (bounded) values slot. +pub struct DictKernel { + _marker: std::marker::PhantomData T>, +} + +impl DictKernel { + pub fn new() -> Self { + Self { + _marker: std::marker::PhantomData, + } + } +} + +impl Default for DictKernel { + fn default() -> Self { + Self::new() + } +} + +impl PrimitiveChunkKernel for DictKernel { + fn build( + &self, + array: &ArrayRef, + ctx: &mut ExecutionCtx, + ) -> VortexResult>>> { + let Some(dict) = array.as_opt::() else { + return Ok(None); + }; + if !matches!(array.dtype().nullability(), Nullability::NonNullable) { + return Ok(None); + } + let codes = dict.codes(); + let values = dict.values(); + if !matches!(codes.dtype().nullability(), Nullability::NonNullable) + || !matches!(values.dtype().nullability(), Nullability::NonNullable) + { + return Ok(None); + } + let DType::Primitive(values_ptype, _) = *values.dtype() else { + return Ok(None); + }; + if values_ptype != T::PTYPE { + return Ok(None); + } + + // Materialize the values slot to a primitive buffer of length `dict_len`. + // Dictionary sizes are bounded by definition; this is the "up-front" step that + // makes the streaming chunked gather possible. For `Dict>` this is + // also where the RunEnd is unrolled into a flat dict, so the produced kernel + // is the *fused* implementation. + let values_canonical = values.clone().execute::(ctx)?; + let dict_buf: Buffer = values_canonical.into_buffer::(); + + let codes_canonical = codes.clone().execute::(ctx)?; + let codes_ptype = codes_canonical.ptype(); + // Fast path only fires for unsigned codes (AVX2 gather requires unsigned indices). + // For signed codes, defer to the canonical executor which handles the cast. + if !codes_ptype.is_unsigned_int() { + return Ok(None); + } + Ok(Some(dispatch_dict_producer::( + dict_buf, + codes_canonical, + codes_ptype, + ))) + } +} + +fn dispatch_dict_producer( + dict: Buffer, + codes: PrimitiveArray, + codes_ptype: PType, +) -> Box> { + crate::match_each_unsigned_integer_ptype!(codes_ptype, |I| { + Box::new(DictPrimitiveProducer::::new( + dict, + codes.into_buffer::(), + )) + }) +} + +/// Stream a `RunEnd` by walking the run-ends and replicating each run value +/// into chunk-sized strides. +pub struct RunEndPrimitiveProducer { + values: Buffer, + ends: Buffer, + /// Logical position in the decoded array (post-offset). + cursor: usize, + /// Total logical length to produce. + len: usize, + /// The slicing offset to apply to `cursor` before searching ends. + offset: usize, + /// Cached run index for `cursor`, so we avoid re-searching across chunk boundaries. + run: usize, +} + +impl RunEndPrimitiveProducer +where + E: num_traits::AsPrimitive, +{ + /// Construct from value + end buffers plus a logical (offset, len) window. + pub fn new( + values: Buffer, + ends: Buffer, + offset: usize, + len: usize, + ) -> VortexResult { + let run = find_run(ends.as_slice(), offset); + Ok(Self { + values, + ends, + cursor: 0, + len, + offset, + run, + }) + } +} + +impl RunEndPrimitiveProducer +where + E: num_traits::AsPrimitive, +{ + /// Fill `dst_ptr[..take]` with the next `take` run-end-decoded values and advance the + /// cursor. The caller has already capped `take` to the remaining length and the dst + /// capacity; this function performs only writes. + /// + /// # Safety + /// + /// `dst_ptr` must be writable for at least `take` elements. + unsafe fn fill(&mut self, take: usize, dst_ptr: *mut T) { + let ends = self.ends.as_slice(); + let values = self.values.as_slice(); + let mut written = 0usize; + let chunk_end_encoded = self.cursor + take + self.offset; + while written < take { + let run_end_encoded: usize = ends[self.run].as_(); + let pos_encoded = self.cursor + written + self.offset; + let in_run = run_end_encoded.min(chunk_end_encoded) - pos_encoded; + let value = values[self.run]; + // SAFETY: caller guarantees dst_ptr[..take] is writable; written + in_run ≤ take. + unsafe { + let slot = std::slice::from_raw_parts_mut(dst_ptr.add(written), in_run); + for s in slot.iter_mut() { + *s = value; + } + } + written += in_run; + if run_end_encoded <= chunk_end_encoded && written < take { + self.run += 1; + } else if run_end_encoded == chunk_end_encoded { + self.run += 1; + } + } + self.cursor += take; + } +} + +impl PrimitiveChunkProducer + for RunEndPrimitiveProducer +where + E: num_traits::AsPrimitive, +{ + fn next_chunk<'a>( + &mut self, + scratch: &'a mut Scratch, + ) -> VortexResult> { + if self.cursor >= self.len { + return Ok(None); + } + let take = CHUNK_LEN.min(self.len - self.cursor); + let dst_ptr = scratch.as_uninit_mut().as_mut_ptr().cast::(); + // SAFETY: scratch has CHUNK_LEN cells; take ≤ CHUNK_LEN. + unsafe { self.fill(take, dst_ptr) }; + // SAFETY: `take` elements written. + Ok(Some(unsafe { std::slice::from_raw_parts(dst_ptr, take) })) + } + + fn next_chunk_into_uninit( + &mut self, + _scratch: &mut Scratch, + dst: &mut [MaybeUninit], + ) -> VortexResult> { + if self.cursor >= self.len { + return Ok(None); + } + let take = CHUNK_LEN.min(self.len - self.cursor).min(dst.len()); + // SAFETY: dst has at least `take` cells; fill initializes them all. + unsafe { self.fill(take, dst.as_mut_ptr().cast::()) }; + Ok(Some(take)) + } + + fn remaining(&self) -> usize { + self.len.saturating_sub(self.cursor) + } +} + +fn find_run(ends: &[E], target_encoded: usize) -> usize +where + E: NativePType + num_traits::AsPrimitive, +{ + // Right binary-search: the run index is the first end > target_encoded. + let (mut lo, mut hi) = (0usize, ends.len()); + while lo < hi { + let mid = (lo + hi) / 2; + let end: usize = ends[mid].as_(); + if end <= target_encoded { + lo = mid + 1; + } else { + hi = mid; + } + } + lo.min(ends.len().saturating_sub(1)) +} + +/// Helper used by encoding crates: build a [`RunEndPrimitiveProducer`] from the canonicalized +/// values + ends buffers and a logical `(offset, len)` window. Dispatches on `ends_ptype`. +pub fn build_runend_producer( + values: PrimitiveArray, + ends: PrimitiveArray, + offset: usize, + len: usize, +) -> VortexResult>> { + let ends_ptype = ends.ptype(); + let values_buf = values.into_buffer::(); + Ok(match_each_integer_ptype!(ends_ptype, |E| { + Box::new(RunEndPrimitiveProducer::::new( + values_buf, + ends.into_buffer::(), + offset, + len, + )?) + })) +} + +// --------------------------------------------------------------------------------------- +// Patched producer — overlay sorted patches chunk-locally over any inner producer +// --------------------------------------------------------------------------------------- + +/// Wraps any inner [`PrimitiveChunkProducer`] (the base, e.g. a bit-packed primitive) and +/// overlays a sorted list of `(index, value)` patches chunk-by-chunk. +/// +/// This is the chunked answer to "decode the base fully, then scatter exceptions into the +/// whole buffer". Because patch indices are sorted and chunks are emitted in order, the +/// overlay is a monotonic merge-walk: for each base chunk the producer applies only the +/// patches that fall inside that chunk's logical range, writing them while the chunk is +/// still resident in the scratch (and therefore in L1). The canonical path scatters +/// patches into the full N-element output, so each write can miss a cold cache line once +/// N spills L2/L3. +/// +/// Patch indices are logical positions in the *decoded* array (post-offset). +pub struct PatchedProducer { + inner: Box>, + inner_scratch: Scratch, + patch_indices: Buffer, + patch_values: Buffer, + patch_cursor: usize, + pos: usize, +} + +impl PatchedProducer { + /// Build from an inner producer plus sorted patch indices/values. + /// + /// `patch_indices` must be sorted ascending and in-range for the inner producer's + /// logical length. + pub fn new( + inner: Box>, + patch_indices: Buffer, + patch_values: Buffer, + ) -> Self { + Self { + inner, + inner_scratch: Scratch::::new(), + patch_indices, + patch_values, + patch_cursor: 0, + pos: 0, + } + } +} + +impl PrimitiveChunkProducer for PatchedProducer { + fn next_chunk<'a>( + &mut self, + scratch: &'a mut Scratch, + ) -> VortexResult> { + // Decode the base chunk into the driver's scratch directly. + let n = match self + .inner + .next_chunk_into_uninit(&mut self.inner_scratch, scratch.as_uninit_mut())? + { + Some(n) => n, + None => return Ok(None), + }; + let chunk_end = self.pos + n; + let ptr = scratch.as_uninit_mut().as_mut_ptr().cast::(); + + // Overlay the patches that fall within [pos, chunk_end) — they are sorted, so we + // advance the cursor monotonically. + let idxs = self.patch_indices.as_slice(); + let vals = self.patch_values.as_slice(); + while self.patch_cursor < idxs.len() { + let gidx = idxs[self.patch_cursor] as usize; + if gidx >= chunk_end { + break; + } + // SAFETY: gidx ∈ [pos, chunk_end); chunk_end - pos = n ≤ CHUNK_LEN; the scratch + // cell at gidx - pos was just initialised by the base decode. + unsafe { ptr.add(gidx - self.pos).write(vals[self.patch_cursor]) }; + self.patch_cursor += 1; + } + self.pos = chunk_end; + // SAFETY: scratch[..n] is fully initialised (base decode + patch overlay). + Ok(Some(unsafe { std::slice::from_raw_parts(ptr, n) })) + } + + fn remaining(&self) -> usize { + self.inner.remaining() + } +} + +// --------------------------------------------------------------------------------------- +// Convenience: decode an encoded ArrayRef into a fresh `Buffer` using the chunked path. +// --------------------------------------------------------------------------------------- + +/// Decode `array` to a fresh `Buffer` by driving the chunked producer to completion. +/// +/// This is the helper used by benchmarks and tests to materialize the chunked output into +/// a comparable form against the existing executor. +pub fn decode_to_buffer( + array: ArrayRef, + dispatcher: &PrimitiveChunkKernelDispatcher, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let len = array.len(); + let mut out = vortex_buffer::BufferMut::::with_capacity(len); + debug_assert!(out.spare_capacity_mut().len() >= len); + let mut producer = build_primitive_producer::(array, dispatcher, ctx)?; + let mut scratch = Scratch::::new(); + let mut written = 0usize; + loop { + // Slice the spare-capacity head for this chunk so the producer can write directly. + // SAFETY: spare_capacity_mut().len() >= len - written, and we hand the producer at most + // CHUNK_LEN cells (it will respect that and report what it wrote). + let dst = unsafe { + let cap_ptr = out.spare_capacity_mut().as_mut_ptr().add(written); + let cap_len = len - written; + std::slice::from_raw_parts_mut( + cap_ptr.cast::>(), + cap_len.min(CHUNK_LEN), + ) + }; + match producer.next_chunk_into_uninit(&mut scratch, dst)? { + Some(n) => written += n, + None => break, + } + } + // SAFETY: we wrote exactly `written` elements into the spare capacity. + unsafe { + out.set_len(written); + } + Ok(out.freeze()) +} + +// --------------------------------------------------------------------------------------- +// Decode into a builder, for use by the existing builder execution path. +// --------------------------------------------------------------------------------------- + +/// Decode `array` to a [`PrimitiveArray`] (non-nullable) via the chunked engine, choosing +/// the right concrete output type by inspecting the array's dtype. +/// +/// Useful as a one-shot replacement for `array.execute::(ctx)` in code +/// paths where a non-nullable primitive output is expected and the chunked engine has a +/// fused kernel registered. +pub fn execute_to_primitive( + array: ArrayRef, + dispatcher: &PrimitiveChunkKernelDispatcher, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let DType::Primitive(ptype, nullability) = *array.dtype() else { + vortex_error::vortex_bail!( + "execute_to_primitive requires Primitive dtype, got {}", + array.dtype() + ); + }; + if !matches!(nullability, Nullability::NonNullable) { + // For now, fall back to the existing executor for nullable arrays. + return array.execute::(ctx); + } + Ok(match_each_native_ptype!(ptype, |T| { + let buf = decode_to_buffer::(array, dispatcher, ctx)?; + PrimitiveArray::new(buf, Validity::NonNullable) + })) +} + +#[cfg(test)] +mod tests { + use vortex_buffer::buffer; + use vortex_error::VortexResult; + + use super::*; + use crate::IntoArray; + use crate::LEGACY_SESSION; + use crate::VortexSessionExecute; + use crate::arrays::DictArray; + + fn ctx() -> ExecutionCtx { + LEGACY_SESSION.create_execution_ctx() + } + + #[test] + fn slice_producer_round_trip() -> VortexResult<()> { + let data = buffer![1i32, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + let mut p = SliceProducer::::new(data.clone()); + let mut s = Scratch::::new(); + let mut out = Vec::new(); + while let Some(c) = p.next_chunk(&mut s)? { + out.extend_from_slice(c); + } + assert_eq!(out, data.as_slice()); + Ok(()) + } + + #[test] + fn dict_primitive_chunked() -> VortexResult<()> { + // dict-encoded i32 of 4096 elements, dict size 17. + let dict_values = + buffer![10i32, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170]; + let codes: Vec = (0..4096).map(|i| (i % 17) as u8).collect(); + let codes_arr = PrimitiveArray::new( + Buffer::::from_iter(codes.iter().copied()), + Validity::NonNullable, + ); + let values_arr = PrimitiveArray::new(dict_values.clone(), Validity::NonNullable); + let dict = DictArray::try_new(codes_arr.into_array(), values_arr.into_array())?; + + let dispatcher = default_dispatcher(); + let result = decode_to_buffer::(dict.into_array(), &dispatcher, &mut ctx())?; + let expected: Vec = codes + .iter() + .map(|c| dict_values.as_slice()[*c as usize]) + .collect(); + assert_eq!(result.as_slice(), expected.as_slice()); + Ok(()) + } + + #[test] + fn fallback_to_canonicalize() -> VortexResult<()> { + // Plain primitive — no kernel registered; should go through SliceProducer fallback. + let p = PrimitiveArray::new(buffer![1i32, 2, 3], Validity::NonNullable); + let dispatcher = default_dispatcher(); + let buf = decode_to_buffer::(p.into_array(), &dispatcher, &mut ctx())?; + assert_eq!(buf.as_slice(), &[1, 2, 3]); + Ok(()) + } +} diff --git a/vortex-array/src/_chunked_exec/scratch.rs b/vortex-array/src/_chunked_exec/scratch.rs new file mode 100644 index 00000000000..67c4b627bb5 --- /dev/null +++ b/vortex-array/src/_chunked_exec/scratch.rs @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::mem::MaybeUninit; + +use super::CHUNK_LEN; + +/// A reusable L1-resident scratch buffer of [`CHUNK_LEN`] elements. +/// +/// The scratch is heap-allocated so it does not bloat caller stack frames. It is uninit by +/// default and is written to by each producer chunk; readers only ever see the initialized +/// prefix returned alongside. +pub struct Scratch { + buf: Box<[MaybeUninit; CHUNK_LEN]>, +} + +impl Scratch { + /// Construct a new uninitialized scratch buffer. + pub fn new() -> Self { + let buf: Box<[MaybeUninit; CHUNK_LEN]> = + Box::new([const { MaybeUninit::::uninit() }; CHUNK_LEN]); + Self { buf } + } + + /// Capacity in elements. + #[inline] + pub fn capacity(&self) -> usize { + CHUNK_LEN + } + + /// Borrow the underlying storage as a slice of uninitialized cells. + /// + /// Producers write into this slice and return the initialized prefix to the driver. + #[inline] + pub fn as_uninit_mut(&mut self) -> &mut [MaybeUninit; CHUNK_LEN] { + &mut self.buf + } +} + +impl Default for Scratch { + fn default() -> Self { + Self::new() + } +} diff --git a/vortex-array/src/_chunked_exec/sink.rs b/vortex-array/src/_chunked_exec/sink.rs new file mode 100644 index 00000000000..4f59dee1897 --- /dev/null +++ b/vortex-array/src/_chunked_exec/sink.rs @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Sinks for fused chunked execution. +//! +//! A sink consumes [`crate::_chunked_exec::primitive::PrimitiveChunkProducer`] chunks one +//! at a time and produces a final result. Combining a producer with a sink fuses decode +//! and the downstream operator into a single pass, with no `Buffer` materialised +//! between them. The producer's L1-resident scratch flows directly into the operator's +//! register-resident accumulators / output buffer. +//! +//! See [`drive_into_sink`] for the canonical driver. +//! +//! ## Sinks shipped here +//! +//! - [`BufferSink`]: equivalent to `decode_to_buffer`. Collects chunks into a `Buffer`. +//! Useful as the "no fusion" baseline against which other sinks compare. +//! - [`SumSink`]: accumulates `sum(x) :: i64` across chunks. No output buffer at all. +//! - [`MapSink`]: applies a per-element `FnMut(T) -> U`, writing into a `Buffer`. Used +//! to express casts (`T as U`) and unary scalar functions (`x + c`, `x * c`, …) without +//! materialising the source as a `Buffer` first. +//! - [`FilterSink`]: applies a per-element predicate; surviving elements stream into a +//! `Buffer`. The mask is never materialised — selectivity is encoded directly in +//! the output length. + +use std::marker::PhantomData; + +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_error::VortexResult; + +use super::Scratch; +use super::primitive::PrimitiveChunkKernelDispatcher; +use super::primitive::build_primitive_producer; +use crate::ArrayRef; +use crate::dtype::NativePType; +use crate::executor::ExecutionCtx; + +/// Consumes chunks of decoded primitive values and produces a final result. +/// +/// Implementors should treat each `push` call as "process this chunk in-place" — the +/// slice borrowed by `chunk` is only valid until the next `push` call, since it points +/// into the producer's scratch buffer. +pub trait PrimitiveChunkSink { + /// The value produced once all chunks have been consumed. + type Output; + + /// Process one chunk. Implementors typically aggregate, transform, or selectively + /// store into an owned output buffer. + fn push(&mut self, chunk: &[T]) -> VortexResult<()>; + + /// Finalise the sink and produce the result. Consumes the sink. + fn finish(self) -> VortexResult; +} + +/// Drive a chunked producer to completion, feeding every chunk into `sink`. +/// +/// This is the fused-pipeline entry point: the producer never materialises a full +/// `Buffer`, the sink decides what to do with each chunk, and the only memory traffic +/// outside the input bytes is whatever the sink chooses to keep. +pub fn drive_into_sink( + array: ArrayRef, + dispatcher: &PrimitiveChunkKernelDispatcher, + mut sink: S, + ctx: &mut ExecutionCtx, +) -> VortexResult +where + T: NativePType, + S: PrimitiveChunkSink, +{ + let mut producer = build_primitive_producer::(array, dispatcher, ctx)?; + let mut scratch = Scratch::::new(); + while let Some(chunk) = producer.next_chunk(&mut scratch)? { + sink.push(chunk)?; + } + sink.finish() +} + +// ---------------------------------------------------------------------------------- +// BufferSink — the "no fusion" baseline. Collects everything into a Buffer. +// ---------------------------------------------------------------------------------- + +/// Collects chunks into a flat [`Buffer`]. Equivalent to the old `decode_to_buffer`, +/// reformulated as a sink for use with [`drive_into_sink`]. +pub struct BufferSink { + out: BufferMut, +} + +impl BufferSink { + /// Construct with a pre-allocated capacity of `len` elements. + pub fn with_capacity(len: usize) -> Self { + Self { + out: BufferMut::::with_capacity(len), + } + } +} + +impl PrimitiveChunkSink for BufferSink { + type Output = Buffer; + + fn push(&mut self, chunk: &[T]) -> VortexResult<()> { + // SAFETY: BufferMut maintains capacity invariant; we never exceed its allocation. + // The caller pre-allocated with `with_capacity(array.len())`. + let n = chunk.len(); + let written = self.out.as_slice().len(); + unsafe { + let dst = self + .out + .spare_capacity_mut() + .as_mut_ptr() + .add(0) + .cast::(); + std::ptr::copy_nonoverlapping(chunk.as_ptr(), dst, n); + self.out.set_len(written + n); + } + Ok(()) + } + + fn finish(self) -> VortexResult { + Ok(self.out.freeze()) + } +} + +// ---------------------------------------------------------------------------------- +// SumSink — proof of concept. No output buffer; just an i64 accumulator. +// ---------------------------------------------------------------------------------- + +/// Accumulates `sum(chunk_i) as i64` across all chunks. Produces a scalar. +/// +/// Useful as the smallest possible demonstration of operator fusion: the only memory +/// touched outside the input bytes is the producer's 4 KiB scratch. +pub struct SumSink { + acc: i64, + _marker: PhantomData T>, +} + +impl SumSink { + /// New sink, zero-initialised. + pub fn new() -> Self { + Self { + acc: 0, + _marker: PhantomData, + } + } +} + +impl Default for SumSink { + fn default() -> Self { + Self::new() + } +} + +impl PrimitiveChunkSink for SumSink +where + T: NativePType + num_traits::AsPrimitive, +{ + type Output = i64; + + fn push(&mut self, chunk: &[T]) -> VortexResult<()> { + let mut acc = self.acc; + for &v in chunk { + acc = acc.wrapping_add(v.as_()); + } + self.acc = acc; + Ok(()) + } + + fn finish(self) -> VortexResult { + Ok(self.acc) + } +} + +// ---------------------------------------------------------------------------------- +// MapSink — generic per-element transform. Used for cast + scalar funcs. +// ---------------------------------------------------------------------------------- + +/// Applies a per-element `FnMut(T) -> U` and writes the result to a [`Buffer`]. +/// +/// Casts (e.g. `T as U`) and unary scalar functions (`x + c`, `x * c`, `x.abs()`, …) are +/// the same shape — one element in, one element out. The fused win over canonical is that +/// the source array is never materialised as a `Buffer` between decode and map. +pub struct MapSink +where + T: NativePType, + U: NativePType, + F: FnMut(T) -> U, +{ + f: F, + out: BufferMut, + _marker: PhantomData, +} + +impl MapSink +where + T: NativePType, + U: NativePType, + F: FnMut(T) -> U, +{ + /// Construct with a pre-allocated output capacity of `len` elements. + pub fn with_capacity(len: usize, f: F) -> Self { + Self { + f, + out: BufferMut::::with_capacity(len), + _marker: PhantomData, + } + } +} + +impl PrimitiveChunkSink for MapSink +where + T: NativePType, + U: NativePType, + F: FnMut(T) -> U, +{ + type Output = Buffer; + + fn push(&mut self, chunk: &[T]) -> VortexResult<()> { + let written = self.out.as_slice().len(); + let n = chunk.len(); + // SAFETY: caller-pre-allocated capacity ≥ total array length; we never exceed it. + unsafe { + let dst = self.out.spare_capacity_mut().as_mut_ptr().cast::(); + for (i, &v) in chunk.iter().enumerate() { + dst.add(i).write((self.f)(v)); + } + self.out.set_len(written + n); + } + Ok(()) + } + + fn finish(self) -> VortexResult { + Ok(self.out.freeze()) + } +} + +// ---------------------------------------------------------------------------------- +// FilterSink — per-element predicate, surviving elements stream to a Buffer. +// ---------------------------------------------------------------------------------- + +/// Applies a per-element predicate. Surviving elements stream directly into the output +/// buffer — the boolean mask is never materialised. +/// +/// Canonical two-pass `decode → mask → filter` becomes a single pass that touches the +/// compressed input once and writes only the elements that survive. +pub struct FilterSink +where + T: NativePType, + P: FnMut(T) -> bool, +{ + pred: P, + out: BufferMut, +} + +impl FilterSink +where + T: NativePType, + P: FnMut(T) -> bool, +{ + /// Construct with a pre-allocated output capacity of `len` elements (the worst-case + /// upper bound when every element passes the predicate). + pub fn with_capacity(len: usize, pred: P) -> Self { + Self { + pred, + out: BufferMut::::with_capacity(len), + } + } +} + +impl PrimitiveChunkSink for FilterSink +where + T: NativePType, + P: FnMut(T) -> bool, +{ + type Output = Buffer; + + fn push(&mut self, chunk: &[T]) -> VortexResult<()> { + // SAFETY: BufferMut::with_capacity guaranteed at construction; never push past it. + let mut written = self.out.as_slice().len(); + let dst = self.out.spare_capacity_mut(); + let dst_ptr = dst.as_mut_ptr().cast::(); + // Track how many cells we've written into the spare-capacity region this call. + let mut local = 0usize; + for &v in chunk { + if (self.pred)(v) { + // SAFETY: total written ≤ pre-allocated capacity. + unsafe { dst_ptr.add(local).write(v) }; + local += 1; + } + } + written += local; + // SAFETY: just wrote exactly `local` elements past the previous len. + unsafe { self.out.set_len(written) }; + Ok(()) + } + + fn finish(self) -> VortexResult { + Ok(self.out.freeze()) + } +} + +#[cfg(test)] +mod tests { + use vortex_buffer::buffer; + use vortex_error::VortexResult; + + use super::*; + use crate::IntoArray; + use crate::LEGACY_SESSION; + use crate::VortexSessionExecute; + use crate::_chunked_exec::primitive::default_dispatcher; + use crate::arrays::DictArray; + use crate::arrays::PrimitiveArray; + use crate::validity::Validity; + + fn ctx() -> ExecutionCtx { + LEGACY_SESSION.create_execution_ctx() + } + + fn make_dict_i32(codes: &[u32], dict: &[i32]) -> ArrayRef { + let dict_arr = PrimitiveArray::new( + Buffer::::from_iter(dict.iter().copied()), + Validity::NonNullable, + ); + let codes_arr = PrimitiveArray::new( + Buffer::::from_iter(codes.iter().copied()), + Validity::NonNullable, + ); + DictArray::try_new(codes_arr.into_array(), dict_arr.into_array()) + .unwrap() + .into_array() + } + + #[test] + fn buffer_sink_round_trip() -> VortexResult<()> { + let array = make_dict_i32(&[0, 1, 0, 2, 1], &[10, 20, 30]); + let dispatcher = default_dispatcher(); + let buf: Buffer = drive_into_sink( + array, + &dispatcher, + BufferSink::::with_capacity(5), + &mut ctx(), + )?; + assert_eq!(buf.as_slice(), &[10, 20, 10, 30, 20]); + Ok(()) + } + + #[test] + fn sum_sink_fused() -> VortexResult<()> { + let array = make_dict_i32(&[0, 1, 0, 2, 1, 2, 2, 0], &[10, 20, 30]); + // Expected: 10+20+10+30+20+30+30+10 = 160. + let dispatcher = default_dispatcher(); + let s: i64 = drive_into_sink(array, &dispatcher, SumSink::::new(), &mut ctx())?; + assert_eq!(s, 160); + Ok(()) + } + + #[test] + fn map_sink_cast_i32_to_i64() -> VortexResult<()> { + let array = make_dict_i32(&[0, 1, 2], &[100, 200, 300]); + let dispatcher = default_dispatcher(); + let buf: Buffer = drive_into_sink( + array, + &dispatcher, + MapSink::::with_capacity(3, |x| x as i64), + &mut ctx(), + )?; + assert_eq!(buf.as_slice(), &[100i64, 200, 300]); + Ok(()) + } + + #[test] + fn map_sink_scalar_add() -> VortexResult<()> { + let array = make_dict_i32(&[0, 1, 0, 2], &[1, 2, 3]); + let dispatcher = default_dispatcher(); + let buf: Buffer = drive_into_sink( + array, + &dispatcher, + MapSink::::with_capacity(4, |x| x + 100), + &mut ctx(), + )?; + assert_eq!(buf.as_slice(), &[101, 102, 101, 103]); + Ok(()) + } + + #[test] + fn filter_sink_keeps_surviving() -> VortexResult<()> { + let array = make_dict_i32(&[0, 1, 2, 1, 0, 2], &[5, 15, 25]); + // Source values: [5, 15, 25, 15, 5, 25]. Predicate: > 10. + let dispatcher = default_dispatcher(); + let buf: Buffer = drive_into_sink( + array, + &dispatcher, + FilterSink::::with_capacity(6, |x| x > 10), + &mut ctx(), + )?; + assert_eq!(buf.as_slice(), &[15, 25, 15, 25]); + Ok(()) + } + + #[test] + fn sink_works_on_canonical_primitive() -> VortexResult<()> { + // Sanity check: even plain PrimitiveArray flows through the SliceProducer + // fallback path and into the sink. + let p = PrimitiveArray::new(buffer![1i32, 2, 3, 4, 5], Validity::NonNullable); + let dispatcher = default_dispatcher(); + let s = drive_into_sink(p.into_array(), &dispatcher, SumSink::::new(), &mut ctx())?; + assert_eq!(s, 15); + Ok(()) + } +} diff --git a/vortex-array/src/arrays/primitive/compute/mod.rs b/vortex-array/src/arrays/primitive/compute/mod.rs index 867ddf69d03..0c546b99c98 100644 --- a/vortex-array/src/arrays/primitive/compute/mod.rs +++ b/vortex-array/src/arrays/primitive/compute/mod.rs @@ -7,7 +7,7 @@ mod fill_null; mod mask; pub(crate) mod rules; mod slice; -mod take; +pub(crate) mod take; #[cfg(test)] mod tests { diff --git a/vortex-array/src/arrays/primitive/compute/take/avx2.rs b/vortex-array/src/arrays/primitive/compute/take/avx2.rs index e92304dc34b..91d6896fae4 100644 --- a/vortex-array/src/arrays/primitive/compute/take/avx2.rs +++ b/vortex-array/src/arrays/primitive/compute/take/avx2.rs @@ -110,6 +110,114 @@ where // AVX2 SIMD take algorithm // --------------------------------------------------------------------------- +/// AVX2 gather into a caller-supplied uninitialized destination slice. Used by the chunked +/// execution engine to avoid the per-call [`Buffer`] allocation that [`take_avx2`] performs. +/// +/// # Safety +/// +/// `dst` must be writable for `indices.len()` elements. The `avx2` feature must be enabled +/// on the calling CPU. +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn take_avx2_into( + buffer: &[V], + indices: &[I], + dst: *mut V, +) { + macro_rules! dispatch_into { + ($indices:ty, $values:ty) => {{ + dispatch_into!($indices, $values, cast: $values); + }}; + ($indices:ty, $values:ty, cast: $cast:ty) => {{ + let indices = unsafe { std::mem::transmute::<&[I], &[$indices]>(indices) }; + let values = unsafe { std::mem::transmute::<&[V], &[$cast]>(buffer) }; + let dst = dst.cast::<$cast>(); + unsafe { + exec_take_into::<$cast, $indices, AVX2Gather>(values, indices, dst); + } + }}; + } + + if buffer.is_empty() { + // Zero-fill: caller still needs dst[..indices.len()] initialized. + for i in 0..indices.len() { + // SAFETY: caller guarantees dst has indices.len() cells. + unsafe { dst.add(i).write(V::default()) }; + } + return; + } + + match (I::PTYPE, V::PTYPE) { + (PType::U8, PType::I32) => dispatch_into!(u8, i32), + (PType::U8, PType::U32) => dispatch_into!(u8, u32), + (PType::U8, PType::I64) => dispatch_into!(u8, i64), + (PType::U8, PType::U64) => dispatch_into!(u8, u64), + (PType::U16, PType::I32) => dispatch_into!(u16, i32), + (PType::U16, PType::U32) => dispatch_into!(u16, u32), + (PType::U16, PType::I64) => dispatch_into!(u16, i64), + (PType::U16, PType::U64) => dispatch_into!(u16, u64), + (PType::U32, PType::I32) => dispatch_into!(u32, i32), + (PType::U32, PType::U32) => dispatch_into!(u32, u32), + (PType::U32, PType::I64) => dispatch_into!(u32, i64), + (PType::U32, PType::U64) => dispatch_into!(u32, u64), + (PType::U8, PType::F32) => dispatch_into!(u8, f32, cast: u32), + (PType::U16, PType::F32) => dispatch_into!(u16, f32, cast: u32), + (PType::U32, PType::F32) => dispatch_into!(u32, f32, cast: u32), + (PType::U64, PType::F32) => dispatch_into!(u64, f32, cast: u32), + (PType::U8, PType::F64) => dispatch_into!(u8, f64, cast: u64), + (PType::U16, PType::F64) => dispatch_into!(u16, f64, cast: u64), + (PType::U32, PType::F64) => dispatch_into!(u32, f64, cast: u64), + (PType::U64, PType::F64) => dispatch_into!(u64, f64, cast: u64), + _ => { + // Scalar fallback into dst. + for (i, idx) in indices.iter().enumerate() { + // SAFETY: dst has indices.len() cells; values bounds-checked. + unsafe { dst.add(i).write(buffer[(*idx).as_()]) }; + } + } + } +} + +/// Same SIMD gather loop as [`exec_take`], but writes into a caller-supplied destination +/// pointer instead of allocating. Marked `#[target_feature(enable = "avx2")]` so the AVX2 +/// gather intrinsics inside `AVX2Gather::gather` get the correct codegen context. +/// +/// # Safety +/// +/// - `dst` must point to at least `indices.len()` writable elements. +/// - The `avx2` feature must be enabled on the caller's CPU. +#[target_feature(enable = "avx2")] +unsafe fn exec_take_into( + values: &[Value], + indices: &[Idx], + dst: *mut Value, +) where + Value: Copy, + Idx: UnsignedPType, + Gather: GatherFn, +{ + let indices_len = indices.len(); + let max_index = Idx::from(values.len()).unwrap_or_else(|| Idx::max_value()); + let mut offset = 0; + while offset + Gather::STRIDE < indices_len { + // SAFETY: same as exec_take. + unsafe { + Gather::gather( + indices.as_ptr().add(offset), + max_index, + values.as_ptr(), + dst.add(offset), + ) + }; + offset += Gather::WIDTH; + } + while offset < indices_len { + // SAFETY: offset < indices_len ≤ dst capacity; indices[offset] is bounds-checked. + unsafe { dst.add(offset).write(values[indices[offset].as_()]) }; + offset += 1; + } + debug_assert_eq!(offset, indices_len); +} + /// Takes the specified indices into a new [`Buffer`] using AVX2 SIMD. /// /// # Panics @@ -121,7 +229,7 @@ where /// The caller must ensure the `avx2` feature is enabled. #[target_feature(enable = "avx2")] #[doc(hidden)] -unsafe fn take_avx2(buffer: &[V], indices: &[I]) -> Buffer { +pub(crate) unsafe fn take_avx2(buffer: &[V], indices: &[I]) -> Buffer { macro_rules! dispatch_avx2 { ($indices:ty, $values:ty) => { { let result = dispatch_avx2!($indices, $values, cast: $values); result } diff --git a/vortex-array/src/arrays/primitive/compute/take/avx512.rs b/vortex-array/src/arrays/primitive/compute/take/avx512.rs new file mode 100644 index 00000000000..f9261cc49f4 --- /dev/null +++ b/vortex-array/src/arrays/primitive/compute/take/avx512.rs @@ -0,0 +1,849 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! An AVX-512 implementation of the take operation using `vpgatherdd` / `vpgatherdq` / +//! `vpgatherqq` instructions. +//! +//! Only enabled for x86_64 hosts and gated at runtime behind feature detection to +//! ensure AVX-512F, AVX-512BW, AVX-512DQ, and AVX-512VL instructions are available. +//! +//! Mirrors `avx2.rs`. Index/value pairs that aren't implemented here fall back to the +//! AVX-2 kernel (which in turn can fall back to scalar). + +#![cfg(any(target_arch = "x86_64", target_arch = "x86"))] + +use std::arch::x86_64::__m256i; +use std::arch::x86_64::__m512i; +use std::arch::x86_64::__mmask8; +use std::arch::x86_64::__mmask16; +use std::arch::x86_64::_mm_loadu_si128; +use std::arch::x86_64::_mm256_cvtepu8_epi32; +use std::arch::x86_64::_mm256_cvtepu16_epi32; +use std::arch::x86_64::_mm256_loadu_si256; +use std::arch::x86_64::_mm256_setzero_si256; +use std::arch::x86_64::_mm256_storeu_si256; +use std::arch::x86_64::_mm512_cmple_epu32_mask; +use std::arch::x86_64::_mm512_cmple_epu64_mask; +use std::arch::x86_64::_mm512_cvtepu8_epi32; +use std::arch::x86_64::_mm512_cvtepu16_epi32; +use std::arch::x86_64::_mm512_loadu_si512; +use std::arch::x86_64::_mm512_mask_i32gather_epi32; +use std::arch::x86_64::_mm512_mask_i32gather_epi64; +use std::arch::x86_64::_mm512_mask_i64gather_epi32; +use std::arch::x86_64::_mm512_mask_i64gather_epi64; +use std::arch::x86_64::_mm512_set1_epi32; +use std::arch::x86_64::_mm512_set1_epi64; +use std::arch::x86_64::_mm512_setzero_si512; +use std::arch::x86_64::_mm512_storeu_si512; +use std::arch::x86_64::_mm512_zextsi256_si512; + +use vortex_buffer::Alignment; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_error::VortexResult; + +use crate::ArrayRef; +use crate::IntoArray; +use crate::array::ArrayView; +use crate::arrays::PrimitiveArray; +use crate::arrays::primitive::compute::take::TakeImpl; +use crate::arrays::primitive::compute::take::avx2; +use crate::arrays::primitive::vtable::Primitive; +use crate::dtype::NativePType; +use crate::dtype::PType; +use crate::dtype::UnsignedPType; +use crate::match_each_native_ptype; +use crate::match_each_unsigned_integer_ptype; +use crate::validity::Validity; + +#[allow(unused)] +pub(super) struct TakeKernelAVX512; + +impl TakeImpl for TakeKernelAVX512 { + #[inline(always)] + fn take( + &self, + values: ArrayView<'_, Primitive>, + indices: ArrayView<'_, Primitive>, + validity: Validity, + ) -> VortexResult { + assert!(indices.ptype().is_unsigned_int()); + + Ok(match_each_unsigned_integer_ptype!(indices.ptype(), |I| { + match_each_native_ptype!(values.ptype(), |V| { + // SAFETY: This kernel is only selected when the required AVX-512 cpu-features + // are detected. + unsafe { + take_primitive_avx512(values.as_slice::(), indices.as_slice::(), validity) + } + }) + }) + .into_array()) + } +} + +/// # Safety +/// +/// The caller must ensure that if the validity has a length, it is the same length as the indices, +/// and that the `avx512f`, `avx512bw`, `avx512dq`, and `avx512vl` features are enabled. +#[target_feature(enable = "avx512f,avx512bw,avx512dq,avx512vl")] +#[allow(unused)] +unsafe fn take_primitive_avx512( + values: &[V], + indices: &[I], + validity: Validity, +) -> PrimitiveArray +where + V: NativePType, + I: UnsignedPType, +{ + // SAFETY: The caller guarantees the required features are enabled. + let buffer = unsafe { take_avx512(values, indices) }; + + debug_assert!( + validity + .maybe_len() + .is_none_or(|validity_len| validity_len == buffer.len()) + ); + + // SAFETY: The caller ensures that the validity and indices have the same length, so the taken + // buffer and the validity must have the same length. + unsafe { PrimitiveArray::new_unchecked(buffer, validity) } +} + +// --------------------------------------------------------------------------- +// AVX-512 SIMD take algorithm +// --------------------------------------------------------------------------- + +/// AVX-512 gather into a caller-supplied uninitialized destination slice. Used by the +/// chunked execution engine to avoid the per-call [`Buffer`] allocation that +/// [`take_avx512`] performs. +/// +/// # Safety +/// +/// `dst` must be writable for `indices.len()` elements. The `avx512f`, `avx512bw`, +/// `avx512dq`, and `avx512vl` features must be enabled on the calling CPU. +#[target_feature(enable = "avx512f,avx512bw,avx512dq,avx512vl")] +pub(crate) unsafe fn take_avx512_into( + buffer: &[V], + indices: &[I], + dst: *mut V, +) { + macro_rules! dispatch_into { + ($indices:ty, $values:ty) => {{ + dispatch_into!($indices, $values, cast: $values); + }}; + ($indices:ty, $values:ty, cast: $cast:ty) => {{ + let indices_typed = + unsafe { std::mem::transmute::<&[I], &[$indices]>(indices) }; + let values_typed = unsafe { std::mem::transmute::<&[V], &[$cast]>(buffer) }; + let dst_typed = dst.cast::<$cast>(); + unsafe { + exec_take_into_avx512::<$cast, $indices, AVX512Gather>( + values_typed, + indices_typed, + dst_typed, + ); + } + }}; + } + + if buffer.is_empty() { + // Zero-fill: caller still needs dst[..indices.len()] initialized. + for i in 0..indices.len() { + // SAFETY: caller guarantees dst has indices.len() cells. + unsafe { dst.add(i).write(V::default()) }; + } + return; + } + + match (I::PTYPE, V::PTYPE) { + // Int value types. Only 32 and 64 bit types are supported. + (PType::U8, PType::I32) => dispatch_into!(u8, i32), + (PType::U8, PType::U32) => dispatch_into!(u8, u32), + (PType::U8, PType::I64) => dispatch_into!(u8, i64), + (PType::U8, PType::U64) => dispatch_into!(u8, u64), + (PType::U16, PType::I32) => dispatch_into!(u16, i32), + (PType::U16, PType::U32) => dispatch_into!(u16, u32), + (PType::U16, PType::I64) => dispatch_into!(u16, i64), + (PType::U16, PType::U64) => dispatch_into!(u16, u64), + (PType::U32, PType::I32) => dispatch_into!(u32, i32), + (PType::U32, PType::U32) => dispatch_into!(u32, u32), + (PType::U32, PType::I64) => dispatch_into!(u32, i64), + (PType::U32, PType::U64) => dispatch_into!(u32, u64), + (PType::U64, PType::I32) => dispatch_into!(u64, i32), + (PType::U64, PType::U32) => dispatch_into!(u64, u32), + (PType::U64, PType::I64) => dispatch_into!(u64, i64), + (PType::U64, PType::U64) => dispatch_into!(u64, u64), + + // Float value types, treat them as if they were corresponding int types. + (PType::U8, PType::F32) => dispatch_into!(u8, f32, cast: u32), + (PType::U16, PType::F32) => dispatch_into!(u16, f32, cast: u32), + (PType::U32, PType::F32) => dispatch_into!(u32, f32, cast: u32), + (PType::U64, PType::F32) => dispatch_into!(u64, f32, cast: u32), + + (PType::U8, PType::F64) => dispatch_into!(u8, f64, cast: u64), + (PType::U16, PType::F64) => dispatch_into!(u16, f64, cast: u64), + (PType::U32, PType::F64) => dispatch_into!(u32, f64, cast: u64), + (PType::U64, PType::F64) => dispatch_into!(u64, f64, cast: u64), + + // Fall back to AVX-2 (or scalar) for unsupported pairs. + _ => { + // SAFETY: AVX-2 is a strict subset of the AVX-512 features required to call + // `take_avx512_into`, so the AVX-2 entry point is valid here. + unsafe { avx2::take_avx2_into::(buffer, indices, dst) }; + } + } +} + +/// AVX-512 inner gather loop. Writes into a caller-supplied destination pointer instead of +/// allocating. Marked `#[target_feature(enable = "avx512f,...")]` so the AVX-512 gather +/// intrinsics inside `AVX512Gather::gather` get the correct codegen context. +/// +/// # Safety +/// +/// - `dst` must point to at least `indices.len()` writable elements. +/// - The `avx512f`, `avx512bw`, `avx512dq`, and `avx512vl` features must be enabled on +/// the caller's CPU. +#[target_feature(enable = "avx512f,avx512bw,avx512dq,avx512vl")] +unsafe fn exec_take_into_avx512( + values: &[Value], + indices: &[Idx], + dst: *mut Value, +) where + Value: Copy, + Idx: UnsignedPType, + Gather: GatherFn, +{ + let indices_len = indices.len(); + let max_index = Idx::from(values.len()).unwrap_or_else(|| Idx::max_value()); + let mut offset = 0; + while offset + Gather::STRIDE < indices_len { + // SAFETY: same as exec_take_avx512. + unsafe { + Gather::gather( + indices.as_ptr().add(offset), + max_index, + values.as_ptr(), + dst.add(offset), + ) + }; + offset += Gather::WIDTH; + } + while offset < indices_len { + // SAFETY: offset < indices_len ≤ dst capacity; indices[offset] is bounds-checked. + unsafe { dst.add(offset).write(values[indices[offset].as_()]) }; + offset += 1; + } + debug_assert_eq!(offset, indices_len); +} + +/// Takes the specified indices into a new [`Buffer`] using AVX-512 SIMD. +/// +/// # Panics +/// +/// This function panics if any of the provided `indices` are out of bounds for `values`. +/// +/// # Safety +/// +/// The caller must ensure the `avx512f`, `avx512bw`, `avx512dq`, and `avx512vl` features +/// are enabled. +#[target_feature(enable = "avx512f,avx512bw,avx512dq,avx512vl")] +#[doc(hidden)] +pub(crate) unsafe fn take_avx512( + buffer: &[V], + indices: &[I], +) -> Buffer { + macro_rules! dispatch_avx512 { + ($indices:ty, $values:ty) => { + { let result = dispatch_avx512!($indices, $values, cast: $values); result } + }; + ($indices:ty, $values:ty, cast: $cast:ty) => {{ + let indices_typed = + unsafe { std::mem::transmute::<&[I], &[$indices]>(indices) }; + let values_typed = unsafe { std::mem::transmute::<&[V], &[$cast]>(buffer) }; + + let result = unsafe { + exec_take_avx512::<$cast, $indices, AVX512Gather>(values_typed, indices_typed) + }; + unsafe { result.transmute::() } + }}; + } + + if buffer.is_empty() { + return Buffer::zeroed(indices.len()); + } + + match (I::PTYPE, V::PTYPE) { + // Int value types. Only 32 and 64 bit types are supported. + (PType::U8, PType::I32) => dispatch_avx512!(u8, i32), + (PType::U8, PType::U32) => dispatch_avx512!(u8, u32), + (PType::U8, PType::I64) => dispatch_avx512!(u8, i64), + (PType::U8, PType::U64) => dispatch_avx512!(u8, u64), + (PType::U16, PType::I32) => dispatch_avx512!(u16, i32), + (PType::U16, PType::U32) => dispatch_avx512!(u16, u32), + (PType::U16, PType::I64) => dispatch_avx512!(u16, i64), + (PType::U16, PType::U64) => dispatch_avx512!(u16, u64), + (PType::U32, PType::I32) => dispatch_avx512!(u32, i32), + (PType::U32, PType::U32) => dispatch_avx512!(u32, u32), + (PType::U32, PType::I64) => dispatch_avx512!(u32, i64), + (PType::U32, PType::U64) => dispatch_avx512!(u32, u64), + (PType::U64, PType::I32) => dispatch_avx512!(u64, i32), + (PType::U64, PType::U32) => dispatch_avx512!(u64, u32), + (PType::U64, PType::I64) => dispatch_avx512!(u64, i64), + (PType::U64, PType::U64) => dispatch_avx512!(u64, u64), + + // Float value types, treat them as if they were corresponding int types. + (PType::U8, PType::F32) => dispatch_avx512!(u8, f32, cast: u32), + (PType::U16, PType::F32) => dispatch_avx512!(u16, f32, cast: u32), + (PType::U32, PType::F32) => dispatch_avx512!(u32, f32, cast: u32), + (PType::U64, PType::F32) => dispatch_avx512!(u64, f32, cast: u32), + + (PType::U8, PType::F64) => dispatch_avx512!(u8, f64, cast: u64), + (PType::U16, PType::F64) => dispatch_avx512!(u16, f64, cast: u64), + (PType::U32, PType::F64) => dispatch_avx512!(u32, f64, cast: u64), + (PType::U64, PType::F64) => dispatch_avx512!(u64, f64, cast: u64), + + // Fall back to AVX-2 (which itself falls back to scalar) for unsupported pairs. + _ => { + tracing::trace!( + "take AVX-512 kernel missing for indices {} values {}, falling back to AVX-2", + I::PTYPE, + V::PTYPE + ); + // SAFETY: AVX-2 is a strict subset of the AVX-512 features required to call + // `take_avx512`, so the AVX-2 entry point is valid here. + unsafe { avx2::take_avx2(buffer, indices) } + } + } +} + +/// The main gather function that is used by the inner loop kernel for AVX-512 gather. +trait GatherFn { + /// The number of data elements that are written to the `dst` on each loop iteration. + const WIDTH: usize; + /// The number of indices read from `indices` on each loop iteration. + const STRIDE: usize = Self::WIDTH; + + /// Gather values from `src` into the `dst` using the `indices`. + /// + /// # Safety + /// + /// This function can read up to `STRIDE` elements through `indices`, and read/write up to + /// `WIDTH` elements through `src` and `dst` respectively. + unsafe fn gather(indices: *const Idx, max_idx: Idx, src: *const Values, dst: *mut Values); +} + +/// AVX-512 version of [`GatherFn`] defined for 32- and 64-bit value types. +enum AVX512Gather {} + +// --------------------------------------------------------------------------- +// 16-lane (`__m512i` of i32 indices) gathers — output __m512i of 32-bit values. +// --------------------------------------------------------------------------- + +macro_rules! impl_gather_i32x16 { + ($idx:ty, $value:ty, load_idx: $load_idx:ident, extend_idx: $extend_idx:expr, WIDTH = $WIDTH:literal, STRIDE = $STRIDE:literal) => { + impl GatherFn<$idx, $value> for AVX512Gather { + const WIDTH: usize = $WIDTH; + const STRIDE: usize = $STRIDE; + + #[allow(unused_unsafe, clippy::cast_possible_truncation)] + #[inline(always)] + unsafe fn gather( + indices: *const $idx, + max_idx: $idx, + src: *const $value, + dst: *mut $value, + ) { + const { + assert!( + $WIDTH <= $STRIDE, + "dst cannot advance by more than the stride" + ); + assert!($WIDTH == 16); + } + + const SCALE: i32 = size_of::<$value>() as i32; + + // Load and zero-extend `indices` into 16 i32 lanes. + let raw = unsafe { $load_idx(indices.cast()) }; + let indices_vec: __m512i = unsafe { $extend_idx(raw) }; + + // Compute `valid_mask = idx < max_idx` (i.e. in-bounds positions). + let max_idx_vec = unsafe { _mm512_set1_epi32(max_idx as i32) }; + let valid_mask: __mmask16 = + unsafe { _mm512_cmple_epu32_mask(indices_vec, max_idx_vec) }; + + let zero_vec = unsafe { _mm512_setzero_si512() }; + + // Masked gather: gather where valid_mask=1, leave zero otherwise. + let values_vec = unsafe { + _mm512_mask_i32gather_epi32::( + zero_vec, + valid_mask, + indices_vec, + src.cast(), + ) + }; + + // Write the vec out to dst. + unsafe { _mm512_storeu_si512(dst.cast(), values_vec) }; + } + } + }; +} + +// kernels for u8 indices into 32-bit values +impl_gather_i32x16!( + u8, u32, + load_idx: _mm_loadu_si128, + extend_idx: _mm512_cvtepu8_epi32, + WIDTH = 16, STRIDE = 16 +); +impl_gather_i32x16!( + u8, i32, + load_idx: _mm_loadu_si128, + extend_idx: _mm512_cvtepu8_epi32, + WIDTH = 16, STRIDE = 16 +); + +// kernels for u16 indices into 32-bit values +impl_gather_i32x16!( + u16, u32, + load_idx: _mm256_loadu_si256, + extend_idx: _mm512_cvtepu16_epi32, + WIDTH = 16, STRIDE = 16 +); +impl_gather_i32x16!( + u16, i32, + load_idx: _mm256_loadu_si256, + extend_idx: _mm512_cvtepu16_epi32, + WIDTH = 16, STRIDE = 16 +); + +// kernels for u32 indices into 32-bit values — pass-through, no extension needed. +impl GatherFn for AVX512Gather { + const WIDTH: usize = 16; + const STRIDE: usize = 16; + + #[allow(clippy::cast_possible_truncation)] + #[inline(always)] + unsafe fn gather(indices: *const u32, max_idx: u32, src: *const u32, dst: *mut u32) { + const SCALE: i32 = size_of::() as i32; + + let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) }; + let max_idx_vec = unsafe { _mm512_set1_epi32(max_idx as i32) }; + let valid_mask = unsafe { _mm512_cmple_epu32_mask(indices_vec, max_idx_vec) }; + let zero_vec = unsafe { _mm512_setzero_si512() }; + let values_vec = unsafe { + _mm512_mask_i32gather_epi32::(zero_vec, valid_mask, indices_vec, src.cast()) + }; + unsafe { _mm512_storeu_si512(dst.cast(), values_vec) }; + } +} + +impl GatherFn for AVX512Gather { + const WIDTH: usize = 16; + const STRIDE: usize = 16; + + #[allow(clippy::cast_possible_truncation)] + #[inline(always)] + unsafe fn gather(indices: *const u32, max_idx: u32, src: *const i32, dst: *mut i32) { + const SCALE: i32 = size_of::() as i32; + + let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) }; + let max_idx_vec = unsafe { _mm512_set1_epi32(max_idx as i32) }; + let valid_mask = unsafe { _mm512_cmple_epu32_mask(indices_vec, max_idx_vec) }; + let zero_vec = unsafe { _mm512_setzero_si512() }; + let values_vec = unsafe { + _mm512_mask_i32gather_epi32::(zero_vec, valid_mask, indices_vec, src.cast()) + }; + unsafe { _mm512_storeu_si512(dst.cast(), values_vec) }; + } +} + +// --------------------------------------------------------------------------- +// 8-lane (`__m256i` of i32 indices) gathers — output __m512i of 64-bit values. +// --------------------------------------------------------------------------- + +macro_rules! impl_gather_i32_to_i64x8 { + ($idx:ty, $value:ty, load_idx: $load_idx:ident, extend_idx: $extend_idx:expr, WIDTH = $WIDTH:literal, STRIDE = $STRIDE:literal) => { + impl GatherFn<$idx, $value> for AVX512Gather { + const WIDTH: usize = $WIDTH; + const STRIDE: usize = $STRIDE; + + #[allow(unused_unsafe, clippy::cast_possible_truncation)] + #[inline(always)] + unsafe fn gather( + indices: *const $idx, + max_idx: $idx, + src: *const $value, + dst: *mut $value, + ) { + const { + assert!( + $WIDTH <= $STRIDE, + "dst cannot advance by more than the stride" + ); + assert!($WIDTH == 8); + } + + const SCALE: i32 = size_of::<$value>() as i32; + + // Load `STRIDE` indices, zero-extended to i32x8 (the upper lanes of the + // 256-bit vector are unused by the gather). + let raw = unsafe { $load_idx(indices.cast()) }; + let indices_vec: __m256i = unsafe { $extend_idx(raw) }; + + // Build a 256-bit max_idx vector and compare unsigned. We borrow the AVX-512VL + // 256-bit comparison helpers via the 512-bit zero-extended path: zero-extend + // both sides to a 512i and use `cmplt_epu32_mask`. + let max_idx_512 = unsafe { _mm512_set1_epi32(max_idx as i32) }; + let indices_512: __m512i = unsafe { _mm512_zextsi256_si512(indices_vec) }; + let valid_mask_16: __mmask16 = + unsafe { _mm512_cmple_epu32_mask(indices_512, max_idx_512) }; + // Only the low 8 lanes are meaningful for the i32->i64 gather. + let valid_mask: __mmask8 = (valid_mask_16 as u16 & 0x00FFu16) as __mmask8; + + let zero_vec = unsafe { _mm512_setzero_si512() }; + + let values_vec = unsafe { + _mm512_mask_i32gather_epi64::( + zero_vec, + valid_mask, + indices_vec, + src.cast(), + ) + }; + + unsafe { _mm512_storeu_si512(dst.cast(), values_vec) }; + } + } + }; +} + +// u8 → 64-bit values (load 8 u8 bytes into a __m128i, zero-extend to __m256i of 8 i32s). +// We need `_mm256_cvtepu8_epi32` from AVX2 for the index zero-extension. +impl GatherFn for AVX512Gather { + const WIDTH: usize = 8; + const STRIDE: usize = 16; + + #[allow(clippy::cast_possible_truncation)] + #[inline(always)] + unsafe fn gather(indices: *const u8, max_idx: u8, src: *const u64, dst: *mut u64) { + const SCALE: i32 = size_of::() as i32; + + // Load 16 u8s but only the low 8 are used (STRIDE=16 stays compatible with AVX-2). + let raw = unsafe { _mm_loadu_si128(indices.cast()) }; + let indices_vec: __m256i = unsafe { _mm256_cvtepu8_epi32(raw) }; + + let max_idx_512 = unsafe { _mm512_set1_epi32(max_idx as i32) }; + let indices_512 = unsafe { _mm512_zextsi256_si512(indices_vec) }; + let valid_mask_16 = unsafe { _mm512_cmple_epu32_mask(indices_512, max_idx_512) }; + let valid_mask: __mmask8 = (valid_mask_16 as u16 & 0x00FFu16) as __mmask8; + + let zero_vec = unsafe { _mm512_setzero_si512() }; + let values_vec = unsafe { + _mm512_mask_i32gather_epi64::(zero_vec, valid_mask, indices_vec, src.cast()) + }; + unsafe { _mm512_storeu_si512(dst.cast(), values_vec) }; + } +} + +impl GatherFn for AVX512Gather { + const WIDTH: usize = 8; + const STRIDE: usize = 16; + + #[allow(clippy::cast_possible_truncation)] + #[inline(always)] + unsafe fn gather(indices: *const u8, max_idx: u8, src: *const i64, dst: *mut i64) { + const SCALE: i32 = size_of::() as i32; + + let raw = unsafe { _mm_loadu_si128(indices.cast()) }; + let indices_vec: __m256i = unsafe { _mm256_cvtepu8_epi32(raw) }; + + let max_idx_512 = unsafe { _mm512_set1_epi32(max_idx as i32) }; + let indices_512 = unsafe { _mm512_zextsi256_si512(indices_vec) }; + let valid_mask_16 = unsafe { _mm512_cmple_epu32_mask(indices_512, max_idx_512) }; + let valid_mask: __mmask8 = (valid_mask_16 as u16 & 0x00FFu16) as __mmask8; + + let zero_vec = unsafe { _mm512_setzero_si512() }; + let values_vec = unsafe { + _mm512_mask_i32gather_epi64::(zero_vec, valid_mask, indices_vec, src.cast()) + }; + unsafe { _mm512_storeu_si512(dst.cast(), values_vec) }; + } +} + +// u16 → 64-bit values +impl_gather_i32_to_i64x8!( + u16, u64, + load_idx: _mm_loadu_si128, + extend_idx: _mm256_cvtepu16_epi32, + WIDTH = 8, STRIDE = 8 +); +impl_gather_i32_to_i64x8!( + u16, i64, + load_idx: _mm_loadu_si128, + extend_idx: _mm256_cvtepu16_epi32, + WIDTH = 8, STRIDE = 8 +); + +// u32 → 64-bit values (load 8 u32 into a __m256i, gather via i32->i64). +impl_gather_i32_to_i64x8!( + u32, u64, + load_idx: _mm256_loadu_si256, + extend_idx: identity_m256, + WIDTH = 8, STRIDE = 8 +); +impl_gather_i32_to_i64x8!( + u32, i64, + load_idx: _mm256_loadu_si256, + extend_idx: identity_m256, + WIDTH = 8, STRIDE = 8 +); + +#[inline(always)] +unsafe fn identity_m256(x: __m256i) -> __m256i { + x +} + +// --------------------------------------------------------------------------- +// 8-lane (`__m512i` of i64 indices) gathers — output __m512i of 64-bit values. +// --------------------------------------------------------------------------- + +impl GatherFn for AVX512Gather { + const WIDTH: usize = 8; + const STRIDE: usize = 8; + + #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)] + #[inline(always)] + unsafe fn gather(indices: *const u64, max_idx: u64, src: *const u64, dst: *mut u64) { + const SCALE: i32 = size_of::() as i32; + + let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) }; + let max_idx_vec = unsafe { _mm512_set1_epi64(max_idx as i64) }; + let valid_mask = unsafe { _mm512_cmple_epu64_mask(indices_vec, max_idx_vec) }; + let zero_vec = unsafe { _mm512_setzero_si512() }; + let values_vec = unsafe { + _mm512_mask_i64gather_epi64::(zero_vec, valid_mask, indices_vec, src.cast()) + }; + unsafe { _mm512_storeu_si512(dst.cast(), values_vec) }; + } +} + +impl GatherFn for AVX512Gather { + const WIDTH: usize = 8; + const STRIDE: usize = 8; + + #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)] + #[inline(always)] + unsafe fn gather(indices: *const u64, max_idx: u64, src: *const i64, dst: *mut i64) { + const SCALE: i32 = size_of::() as i32; + + let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) }; + let max_idx_vec = unsafe { _mm512_set1_epi64(max_idx as i64) }; + let valid_mask = unsafe { _mm512_cmple_epu64_mask(indices_vec, max_idx_vec) }; + let zero_vec = unsafe { _mm512_setzero_si512() }; + let values_vec = unsafe { + _mm512_mask_i64gather_epi64::(zero_vec, valid_mask, indices_vec, src.cast()) + }; + unsafe { _mm512_storeu_si512(dst.cast(), values_vec) }; + } +} + +// --------------------------------------------------------------------------- +// 8-lane (`__m512i` of i64 indices) gathers — output __m256i of 32-bit values. +// --------------------------------------------------------------------------- + +impl GatherFn for AVX512Gather { + const WIDTH: usize = 8; + const STRIDE: usize = 8; + + #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)] + #[inline(always)] + unsafe fn gather(indices: *const u64, max_idx: u64, src: *const u32, dst: *mut u32) { + const SCALE: i32 = size_of::() as i32; + + let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) }; + let max_idx_vec = unsafe { _mm512_set1_epi64(max_idx as i64) }; + let valid_mask = unsafe { _mm512_cmple_epu64_mask(indices_vec, max_idx_vec) }; + let zero_vec_256 = unsafe { _mm256_setzero_si256() }; + let values_vec = unsafe { + _mm512_mask_i64gather_epi32::(zero_vec_256, valid_mask, indices_vec, src.cast()) + }; + unsafe { _mm256_storeu_si256(dst.cast(), values_vec) }; + } +} + +impl GatherFn for AVX512Gather { + const WIDTH: usize = 8; + const STRIDE: usize = 8; + + #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)] + #[inline(always)] + unsafe fn gather(indices: *const u64, max_idx: u64, src: *const i32, dst: *mut i32) { + const SCALE: i32 = size_of::() as i32; + + let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) }; + let max_idx_vec = unsafe { _mm512_set1_epi64(max_idx as i64) }; + let valid_mask = unsafe { _mm512_cmple_epu64_mask(indices_vec, max_idx_vec) }; + let zero_vec_256 = unsafe { _mm256_setzero_si256() }; + let values_vec = unsafe { + _mm512_mask_i64gather_epi32::(zero_vec_256, valid_mask, indices_vec, src.cast()) + }; + unsafe { _mm256_storeu_si256(dst.cast(), values_vec) }; + } +} + +/// AVX-512 core inner loop for a specific `Idx` / `Value` type pair. +#[inline(always)] +unsafe fn exec_take_avx512(values: &[Value], indices: &[Idx]) -> Buffer +where + Value: Copy, + Idx: UnsignedPType, + Gather: GatherFn, +{ + let indices_len = indices.len(); + let max_index = Idx::from(values.len()).unwrap_or_else(|| Idx::max_value()); + let mut buffer = + BufferMut::::with_capacity_aligned(indices_len, Alignment::of::<__m512i>()); + let buf_uninit = buffer.spare_capacity_mut(); + + let mut offset = 0; + // Loop terminates STRIDE elements before the end of the indices array. + while offset + Gather::STRIDE < indices_len { + // SAFETY: `gather` reads at most STRIDE indices and writes WIDTH values, and we have + // STRIDE more indices available and WIDTH+ destination cells available. + unsafe { + Gather::gather( + indices.as_ptr().add(offset), + max_index, + values.as_ptr(), + buf_uninit.as_mut_ptr().add(offset).cast(), + ) + }; + offset += Gather::WIDTH; + } + + while offset < indices_len { + buf_uninit[offset].write(values[indices[offset].as_()]); + offset += 1; + } + + assert_eq!(offset, indices_len); + + // SAFETY: All elements have been initialized. + unsafe { buffer.set_len(indices_len) }; + + // Reset the buffer alignment to the Value type so downstream slicing works at value + // boundaries. + buffer = buffer.aligned(Alignment::of::()); + + buffer.freeze() +} + +#[cfg(test)] +#[cfg_attr(miri, ignore)] +#[cfg(target_arch = "x86_64")] +mod avx512_tests { + use super::*; + + fn host_has_avx512() -> bool { + is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("avx512bw") + && is_x86_feature_detected!("avx512dq") + && is_x86_feature_detected!("avx512vl") + } + + macro_rules! test_cases { + (index_type => $IDX:ty, value_types => $($VAL:ty),+) => { + paste::paste! { + $( + #[test] + #[allow(clippy::cast_possible_truncation)] + fn []() { + if !host_has_avx512() { + return; + } + let values: Vec<$VAL> = (1..=127).map(|x| x as $VAL).collect(); + let indices: Vec<$IDX> = (0..127).collect(); + + let result = unsafe { take_avx512(&values, &indices) }; + assert_eq!(&values, result.as_slice()); + } + + #[test] + #[should_panic] + #[allow(clippy::cast_possible_truncation)] + fn []() { + if !host_has_avx512() { + // Force the test to "panic" so the should_panic harness is satisfied + // on hosts without AVX-512. + panic!("avx512 not available; skipping"); + } + let values: Vec<$VAL> = vec![]; + let indices: Vec<$IDX> = (0..127).collect(); + let result = unsafe { take_avx512(&values, &indices) }; + assert!(result.is_empty()); + } + + #[test] + #[should_panic] + #[allow(clippy::cast_possible_truncation)] + fn []() { + if !host_has_avx512() { + panic!("avx512 not available; skipping"); + } + let values: Vec<$VAL> = (1..=127).map(|x| x as $VAL).collect(); + let indices: Vec<$IDX> = (127..=254).collect(); + + let result = unsafe { take_avx512(&values, &indices) }; + assert_eq!(&[0 as $VAL; 127], result.as_slice()); + } + )+ + } + }; + } + + test_cases!( + index_type => u8, + value_types => u32, i32, u64, i64, f32, f64 + ); + test_cases!( + index_type => u16, + value_types => u32, i32, u64, i64, f32, f64 + ); + test_cases!( + index_type => u32, + value_types => u32, i32, u64, i64, f32, f64 + ); + test_cases!( + index_type => u64, + value_types => u32, i32, u64, i64, f32, f64 + ); + + #[test] + fn test_avx512_take_last_valid_index_u8() { + if !host_has_avx512() { + return; + } + let values: Vec = (0..(255 + 1)).collect(); + let indices: Vec = vec![255; 20]; + + let result = unsafe { take_avx512(&values, &indices) }; + assert_eq!(&vec![255; indices.len()], result.as_slice()); + } + + #[test] + fn test_avx512_take_last_valid_index_u16() { + if !host_has_avx512() { + return; + } + let values: Vec = (0..(65535 + 1)).collect(); + let indices: Vec = vec![65535; 20]; + + let result = unsafe { take_avx512(&values, &indices) }; + assert_eq!(&vec![65535; indices.len()], result.as_slice()); + } +} diff --git a/vortex-array/src/arrays/primitive/compute/take/mod.rs b/vortex-array/src/arrays/primitive/compute/take/mod.rs index 4023991c65d..acbc1506ba3 100644 --- a/vortex-array/src/arrays/primitive/compute/take/mod.rs +++ b/vortex-array/src/arrays/primitive/compute/take/mod.rs @@ -3,6 +3,8 @@ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] mod avx2; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +mod avx512; use std::sync::LazyLock; @@ -31,7 +33,13 @@ use crate::validity::Validity; static PRIMITIVE_TAKE_KERNEL: LazyLock<&'static dyn TakeImpl> = LazyLock::new(|| { #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if is_x86_feature_detected!("avx2") { + if is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("avx512bw") + && is_x86_feature_detected!("avx512dq") + && is_x86_feature_detected!("avx512vl") + { + &avx512::TakeKernelAVX512 + } else if is_x86_feature_detected!("avx2") { &avx2::TakeKernelAVX2 } else { &TakeKernelScalar @@ -44,6 +52,67 @@ static PRIMITIVE_TAKE_KERNEL: LazyLock<&'static dyn TakeImpl> = LazyLock::new(|| } }); +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +static AVX2_AVAILABLE: LazyLock = LazyLock::new(|| is_x86_feature_detected!("avx2")); + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +static AVX512_AVAILABLE: LazyLock = LazyLock::new(|| { + is_x86_feature_detected!("avx512f") + && is_x86_feature_detected!("avx512bw") + && is_x86_feature_detected!("avx512dq") + && is_x86_feature_detected!("avx512vl") +}); + +/// Take `indices` from `values` into a caller-supplied uninitialized destination slice, +/// selecting the AVX-512 gather kernel when available, then AVX-2, and falling back to +/// scalar otherwise. +/// +/// `dst` must hold at least `indices.len()` cells; this function initializes exactly +/// `indices.len()` cells starting from `dst[0]`. +/// +/// Used by the chunked execution engine to call into the same SIMD gather as the +/// canonical executor without allocating a heap [`Buffer`] per invocation. +pub fn take_into_uninit( + values: &[V], + indices: &[I], + dst: &mut [std::mem::MaybeUninit], +) { + assert!(dst.len() >= indices.len()); + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + { + if *AVX512_AVAILABLE { + // SAFETY: AVX-512 features confirmed at runtime; dst capacity ≥ indices.len(). + unsafe { + avx512::take_avx512_into::(values, indices, dst.as_mut_ptr().cast::()); + } + return; + } + if *AVX2_AVAILABLE { + // SAFETY: AVX2 feature confirmed at runtime; dst capacity ≥ indices.len(). + unsafe { + avx2::take_avx2_into::(values, indices, dst.as_mut_ptr().cast::()); + } + return; + } + } + take_into_uninit_scalar::(values, indices, dst); +} + +fn take_into_uninit_scalar( + values: &[V], + indices: &[I], + dst: &mut [std::mem::MaybeUninit], +) { + let ptr = dst.as_mut_ptr(); + for (i, idx) in indices.iter().enumerate() { + // SAFETY: dst capacity asserted by caller; values bounds-checked. + unsafe { + ptr.add(i) + .write(std::mem::MaybeUninit::new(values[(*idx).as_()])); + } + } +} + trait TakeImpl: Send + Sync { fn take( &self, diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index 0a9c5969ecc..7a84cb92095 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -29,6 +29,8 @@ use vortex_session::registry::Context; use crate::session::ArraySession; +#[doc(hidden)] +pub mod _chunked_exec; pub mod accessor; pub mod aggregate_fn; #[doc(hidden)]