diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock index 4f0ce3df18c..7af1f658017 100644 --- a/encodings/fastlanes/public-api.lock +++ b/encodings/fastlanes/public-api.lock @@ -190,6 +190,14 @@ impl vortex_array::arrays::slice::SliceReduce for vortex_fastlanes::BitPacked pub fn vortex_fastlanes::BitPacked::slice(vortex_array::array::view::ArrayView<'_, Self>, core::ops::range::Range) -> vortex_error::VortexResult> +impl vortex_array::scalar_fn::fns::between::kernel::BetweenKernel for vortex_fastlanes::BitPacked + +pub fn vortex_fastlanes::BitPacked::between(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, &vortex_array::array::erased::ArrayRef, &vortex_array::scalar_fn::fns::between::BetweenOptions, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::binary::compare::CompareKernel for vortex_fastlanes::BitPacked + +pub fn vortex_fastlanes::BitPacked::compare(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::operators::CompareOperator, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + impl vortex_array::scalar_fn::fns::cast::kernel::CastKernel for vortex_fastlanes::BitPacked pub fn vortex_fastlanes::BitPacked::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> diff --git a/encodings/fastlanes/src/bitpacking/compute/between.rs b/encodings/fastlanes/src/bitpacking/compute/between.rs new file mode 100644 index 00000000000..66ceaa9f5b1 --- /dev/null +++ b/encodings/fastlanes/src/bitpacking/compute/between.rs @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Block-streaming between kernel for [`BitPackedArray`] against constant bounds. +//! +//! Reuses the same single-block scratch buffer as the compare kernel and folds a +//! `lower op_l v op_u upper` predicate per element, so the full primitive never +//! materialises. + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::Nullability; +use vortex_array::match_each_integer_ptype; +use vortex_array::scalar_fn::fns::between::BetweenKernel; +use vortex_array::scalar_fn::fns::between::BetweenOptions; +use vortex_array::scalar_fn::fns::between::StrictComparison; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use crate::BitPacked; +use crate::bitpacking::compute::stream_predicate::stream_predicate; + +impl BetweenKernel for BitPacked { + fn between( + array: ArrayView<'_, Self>, + lower: &ArrayRef, + upper: &ArrayRef, + options: &BetweenOptions, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + // Only accelerate constant-bounds between; vary-by-row bounds fall through to the + // default `compare + and` pipeline. + let (Some(lower_const), Some(upper_const)) = (lower.as_constant(), upper.as_constant()) + else { + return Ok(None); + }; + let (Some(lower_prim), Some(upper_prim)) = ( + lower_const.as_primitive_opt(), + upper_const.as_primitive_opt(), + ) else { + return Ok(None); + }; + + let nullability = + array.dtype().nullability() | lower.dtype().nullability() | upper.dtype().nullability(); + let arr_ptype = array.dtype().as_ptype(); + if lower_prim.ptype() != arr_ptype || upper_prim.ptype() != arr_ptype { + return Ok(None); + } + + let result = match_each_integer_ptype!(arr_ptype, |T| { + let lo: T = lower_prim + .typed_value::() + .vortex_expect("between precondition strips null lower"); + let up: T = upper_prim + .typed_value::() + .vortex_expect("between precondition strips null upper"); + between_constant_typed::(array, lo, up, options, nullability, ctx)? + }); + Ok(Some(result)) + } +} + +fn between_constant_typed( + array: ArrayView<'_, BitPacked>, + lower: T, + upper: T, + options: &BetweenOptions, + nullability: Nullability, + ctx: &mut ExecutionCtx, +) -> VortexResult +where + T: NativePType + Copy + crate::unpack_iter::BitPacked, +{ + // Branch on strictness once at the top so each call into `between_impl` monomorphises + // a single tight predicate — same shape as `Primitive::between` in `vortex-array`. + match (options.lower_strict, options.upper_strict) { + (StrictComparison::Strict, StrictComparison::Strict) => between_impl( + array, + lower, + NativePType::is_lt, + upper, + NativePType::is_lt, + nullability, + ctx, + ), + (StrictComparison::Strict, StrictComparison::NonStrict) => between_impl( + array, + lower, + NativePType::is_lt, + upper, + NativePType::is_le, + nullability, + ctx, + ), + (StrictComparison::NonStrict, StrictComparison::Strict) => between_impl( + array, + lower, + NativePType::is_le, + upper, + NativePType::is_lt, + nullability, + ctx, + ), + (StrictComparison::NonStrict, StrictComparison::NonStrict) => between_impl( + array, + lower, + NativePType::is_le, + upper, + NativePType::is_le, + nullability, + ctx, + ), + } +} + +fn between_impl( + array: ArrayView<'_, BitPacked>, + lower: T, + lower_fn: Lo, + upper: T, + upper_fn: Up, + nullability: Nullability, + ctx: &mut ExecutionCtx, +) -> VortexResult +where + T: NativePType + Copy + crate::unpack_iter::BitPacked, + Lo: Fn(T, T) -> bool, + Up: Fn(T, T) -> bool, +{ + stream_predicate::( + array, + nullability, + |v| lower_fn(lower, v) && upper_fn(v, upper), + ctx, + ) +} + +#[cfg(test)] +mod tests { + use rstest::rstest; + use vortex_array::IntoArray; + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::ConstantArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::builtins::ArrayBuiltins; + use vortex_array::scalar_fn::fns::between::BetweenOptions; + use vortex_array::scalar_fn::fns::between::StrictComparison; + use vortex_array::validity::Validity; + use vortex_buffer::BufferMut; + use vortex_error::VortexResult; + + use crate::BitPackedArrayExt; + use crate::BitPackedData; + + fn opts(lower: StrictComparison, upper: StrictComparison) -> BetweenOptions { + BetweenOptions { + lower_strict: lower, + upper_strict: upper, + } + } + + #[rstest] + #[case(StrictComparison::NonStrict, StrictComparison::NonStrict)] + #[case(StrictComparison::Strict, StrictComparison::NonStrict)] + #[case(StrictComparison::NonStrict, StrictComparison::Strict)] + #[case(StrictComparison::Strict, StrictComparison::Strict)] + fn multi_chunk_against_primitive_baseline( + #[case] lower_strict: StrictComparison, + #[case] upper_strict: StrictComparison, + ) -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values: BufferMut = (0..3000u32).map(|i| i % 257).collect(); + let prim = PrimitiveArray::new(values.freeze(), Validity::NonNullable); + let packed = BitPackedData::encode(&prim.clone().into_array(), 9, &mut ctx)?; + + let lower = ConstantArray::new(40u32, prim.len()).into_array(); + let upper = ConstantArray::new(200u32, prim.len()).into_array(); + let options = opts(lower_strict, upper_strict); + + let expected = prim + .into_array() + .between(lower.clone(), upper.clone(), options.clone())? + .execute::(&mut ctx)?; + let actual = packed + .into_array() + .between(lower, upper, options)? + .execute::(&mut ctx)?; + + assert_arrays_eq!(actual, expected); + Ok(()) + } + + #[test] + fn signed_with_patches_against_primitive_baseline() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values: Vec = (0..1500) + .map(|i| if i % 73 == 0 { 100_000 + i } else { i % 100 }) + .collect(); + let prim = PrimitiveArray::from_iter(values); + let packed = BitPackedData::encode(&prim.clone().into_array(), 7, &mut ctx)?; + assert!(packed.patches().is_some(), "test setup expects patches"); + + let lower = ConstantArray::new(20i32, prim.len()).into_array(); + let upper = ConstantArray::new(80i32, prim.len()).into_array(); + let options = opts(StrictComparison::NonStrict, StrictComparison::NonStrict); + + let expected = prim + .into_array() + .between(lower.clone(), upper.clone(), options.clone())? + .execute::(&mut ctx)?; + let actual = packed + .into_array() + .between(lower, upper, options)? + .execute::(&mut ctx)?; + + assert_arrays_eq!(actual, expected); + Ok(()) + } + + #[test] + fn nullable_propagates_validity() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let prim = + PrimitiveArray::from_option_iter([Some(1u32), None, Some(3), Some(4), None, Some(6)]); + let packed = BitPackedData::encode(&prim.clone().into_array(), 3, &mut ctx)?; + + let lower = ConstantArray::new(2u32, packed.len()).into_array(); + let upper = ConstantArray::new(5u32, packed.len()).into_array(); + let options = opts(StrictComparison::NonStrict, StrictComparison::NonStrict); + + let actual = packed + .into_array() + .between(lower.clone(), upper.clone(), options.clone())? + .execute::(&mut ctx)?; + let expected = prim + .into_array() + .between(lower, upper, options)? + .execute::(&mut ctx)?; + assert_arrays_eq!(actual, expected); + Ok(()) + } +} diff --git a/encodings/fastlanes/src/bitpacking/compute/compare.rs b/encodings/fastlanes/src/bitpacking/compute/compare.rs new file mode 100644 index 00000000000..44a8fce2abb --- /dev/null +++ b/encodings/fastlanes/src/bitpacking/compute/compare.rs @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Block-streaming compare kernel for [`BitPackedArray`] against a constant. +//! +//! Avoids materialising the full primitive: the array is walked one 1024-element FastLanes +//! block at a time through a reusable scratch buffer, and a per-element bool is folded into +//! a [`BitBuffer`]. Patches are re-applied at the end by overwriting bits at the patched +//! indices with `predicate(patch_value)`. + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::Nullability; +use vortex_array::match_each_integer_ptype; +use vortex_array::scalar_fn::fns::binary::CompareKernel; +use vortex_array::scalar_fn::fns::operators::CompareOperator; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use crate::BitPacked; +use crate::bitpacking::compute::stream_predicate::stream_predicate; + +impl CompareKernel for BitPacked { + fn compare( + lhs: ArrayView<'_, Self>, + rhs: &ArrayRef, + operator: CompareOperator, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + // Only accelerate compare-against-constant. + let Some(constant) = rhs.as_constant() else { + return Ok(None); + }; + let Some(constant_prim) = constant.as_primitive_opt() else { + return Ok(None); + }; + + // Adaptor strips null-constant RHS, and the binary scalar-fn coerce_args step has + // already promoted both sides to a common ptype. + let nullability = lhs.dtype().nullability() | rhs.dtype().nullability(); + let lhs_ptype = lhs.dtype().as_ptype(); + if constant_prim.ptype() != lhs_ptype { + return Ok(None); + } + + let result = match_each_integer_ptype!(lhs_ptype, |T| { + let rhs: T = constant_prim + .typed_value::() + .vortex_expect("compare adaptor strips null constants"); + compare_constant_typed::(lhs, rhs, operator, nullability, ctx)? + }); + Ok(Some(result)) + } +} + +fn compare_constant_typed( + lhs: ArrayView<'_, BitPacked>, + rhs: T, + operator: CompareOperator, + nullability: Nullability, + ctx: &mut ExecutionCtx, +) -> VortexResult +where + T: NativePType + Copy + crate::unpack_iter::BitPacked, +{ + // `NativePType::is_eq` / `is_lt` etc. provide total comparison (matching the primitive + // between kernel's dispatch shape). `NotEq` has no direct method, so use `!is_eq`. + match operator { + CompareOperator::Eq => stream_predicate::(lhs, nullability, |v| v.is_eq(rhs), ctx), + CompareOperator::NotEq => { + stream_predicate::(lhs, nullability, |v| !v.is_eq(rhs), ctx) + } + CompareOperator::Lt => stream_predicate::(lhs, nullability, |v| v.is_lt(rhs), ctx), + CompareOperator::Lte => stream_predicate::(lhs, nullability, |v| v.is_le(rhs), ctx), + CompareOperator::Gt => stream_predicate::(lhs, nullability, |v| v.is_gt(rhs), ctx), + CompareOperator::Gte => stream_predicate::(lhs, nullability, |v| v.is_ge(rhs), ctx), + } +} + +#[cfg(test)] +mod tests { + use rstest::rstest; + use vortex_array::IntoArray; + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::ConstantArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::builtins::ArrayBuiltins; + use vortex_array::scalar_fn::fns::operators::Operator; + use vortex_array::validity::Validity; + use vortex_buffer::BufferMut; + use vortex_error::VortexResult; + + use crate::BitPackedArrayExt; + use crate::BitPackedData; + + /// All six operators on a small in-range input. + #[rstest] + #[case(Operator::Eq, vec![false, false, false, true, false, false, true])] + #[case(Operator::NotEq, vec![true, true, true, false, true, true, false])] + #[case(Operator::Lt, vec![true, true, true, false, false, false, false])] + #[case(Operator::Lte, vec![true, true, true, true, false, false, true])] + #[case(Operator::Gt, vec![false, false, false, false, true, true, false])] + #[case(Operator::Gte, vec![false, false, false, true, true, true, true])] + fn small(#[case] op: Operator, #[case] expected: Vec) { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values = PrimitiveArray::from_iter([0u32, 1, 2, 3, 4, 5, 3]); + let packed = BitPackedData::encode(&values.into_array(), 3, &mut ctx).unwrap(); + let rhs = ConstantArray::new(3u32, packed.len()).into_array(); + let result = packed + .into_array() + .binary(rhs, op) + .unwrap() + .execute::(&mut ctx) + .unwrap(); + assert_arrays_eq!(result, BoolArray::from_iter(expected)); + } + + /// Multi-block streaming path (3500 u32 elems span 4 FastLanes blocks including a + /// partial trailer). Compare against the Primitive fallback on the same inputs. + #[test] + fn multi_chunk_matches_primitive() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values: BufferMut = (0..3500u32).map(|i| i % 257).collect(); + let prim = PrimitiveArray::new(values.freeze(), Validity::NonNullable); + let packed = BitPackedData::encode(&prim.clone().into_array(), 9, &mut ctx)?; + let rhs = ConstantArray::new(123u32, prim.len()).into_array(); + let expected = prim + .into_array() + .binary(rhs.clone(), Operator::Lt)? + .execute::(&mut ctx)?; + let actual = packed + .into_array() + .binary(rhs, Operator::Lt)? + .execute::(&mut ctx)?; + assert_arrays_eq!(actual, expected); + Ok(()) + } + + /// Inline-patch path: encode signed i32 values that exceed the bit-width range so they + /// end up in `Patches`. The streaming kernel must splice the patches in before the + /// predicate runs. + #[test] + fn signed_with_patches_matches_primitive() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let values: Vec = (0..1500) + .map(|i| if i % 73 == 0 { 100_000 + i } else { i % 100 }) + .collect(); + let prim = PrimitiveArray::from_iter(values); + let packed = BitPackedData::encode(&prim.clone().into_array(), 7, &mut ctx)?; + assert!(packed.patches().is_some(), "test setup expects patches"); + let rhs = ConstantArray::new(50i32, prim.len()).into_array(); + let expected = prim + .into_array() + .binary(rhs.clone(), Operator::Eq)? + .execute::(&mut ctx)?; + let actual = packed + .into_array() + .binary(rhs, Operator::Eq)? + .execute::(&mut ctx)?; + assert_arrays_eq!(actual, expected); + Ok(()) + } + + /// Nullable input — the result must carry the array's validity. + #[test] + fn nullable_propagates_validity() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let prim = PrimitiveArray::from_option_iter([Some(1u32), None, Some(3), Some(4), None]); + let packed = BitPackedData::encode(&prim.clone().into_array(), 3, &mut ctx)?; + let rhs = ConstantArray::new(3u32, packed.len()).into_array(); + let actual = packed + .into_array() + .binary(rhs.clone(), Operator::Eq)? + .execute::(&mut ctx)?; + let expected = prim + .into_array() + .binary(rhs, Operator::Eq)? + .execute::(&mut ctx)?; + assert_arrays_eq!(actual, expected); + Ok(()) + } +} diff --git a/encodings/fastlanes/src/bitpacking/compute/mod.rs b/encodings/fastlanes/src/bitpacking/compute/mod.rs index 2501d952356..518f8319eb1 100644 --- a/encodings/fastlanes/src/bitpacking/compute/mod.rs +++ b/encodings/fastlanes/src/bitpacking/compute/mod.rs @@ -1,10 +1,13 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +mod between; mod cast; +mod compare; mod filter; pub(crate) mod is_constant; mod slice; +mod stream_predicate; mod take; // TODO(connor): This is duplicated in `encodings/fastlanes/src/bitpacking/kernels/mod.rs`. diff --git a/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs b/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs new file mode 100644 index 00000000000..5e0cb99fe13 --- /dev/null +++ b/encodings/fastlanes/src/bitpacking/compute/stream_predicate.rs @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Streaming, cache-reusable predicate evaluation over a [`BitPackedArray`]. +//! +//! Walks the encoded array one 1024-element FastLanes block at a time through a single +//! reusable scratch buffer, splices any [`crate::patches::Patches`] into the unpacked block +//! in place via a sorted-index cursor, then folds a `Fn(T) -> bool` predicate over the +//! block. The fold matches the canonical [`vortex_buffer::BitBuffer::collect_bool`] shape +//! (pack 64 bools into a `u64` in a tight auto-vectorisable inner loop) and writes the +//! resulting words straight into the output bit buffer, so the materialised primitive +//! never appears anywhere. + +use lending_iterator::LendingIterator; +use num_traits::AsPrimitive; +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::Nullability; +use vortex_array::match_each_unsigned_integer_ptype; +use vortex_buffer::BitBufferMut; +use vortex_buffer::BufferMut; +use vortex_error::VortexResult; + +use crate::BitPacked; +use crate::BitPackedArrayExt; +use crate::unpack_iter::BitPacked as BitPackedIter; +use crate::unpack_iter::BitUnpackedChunks; + +/// Stream `predicate` over the unpacked values of a [`BitPackedArray`], one FastLanes +/// block at a time, producing a [`BoolArray`]. +pub(super) fn stream_predicate( + array: ArrayView<'_, BitPacked>, + nullability: Nullability, + predicate: P, + ctx: &mut ExecutionCtx, +) -> VortexResult +where + T: BitPackedIter + NativePType + Copy, + P: Fn(T) -> bool, +{ + let len = array.len(); + let num_words = len.div_ceil(64); + let mut words: BufferMut = BufferMut::zeroed(num_words); + let mut cursor: usize = 0; + + if len > 0 { + let mut chunks = array.unpacked_chunks::()?; + let words = words.as_mut_slice(); + + if let Some(p) = array.patches() { + let p_idx_arr = p.indices().clone().execute::(ctx)?; + let p_val_arr = p.values().clone().execute::(ctx)?; + let p_off = p.offset(); + match_each_unsigned_integer_ptype!(p_idx_arr.ptype(), |I| { + let p_idx = p_idx_arr.as_slice::(); + let p_val = p_val_arr.as_slice::(); + let mut p_cur: usize = 0; + cursor = walk_blocks(&mut chunks, len, cursor, |block, c| { + splice_patches::(block, c, &mut p_cur, p_idx, p_val, p_off); + write_block(words, c, block, &predicate, len) + }); + }); + } else { + cursor = walk_blocks(&mut chunks, len, cursor, |block, c| { + write_block(words, c, block, &predicate, len) + }); + } + } + + debug_assert_eq!(cursor, len); + let bits = BitBufferMut::from_buffer(words.into_byte_buffer(), 0, len); + let validity = array.validity()?.union_nullability(nullability); + Ok(BoolArray::new(bits.freeze(), validity).into_array()) +} + +/// Walk every unpacked block (initial / full / trailer) in order, invoking `f` once per +/// block. `f` receives the block and the current bit cursor and returns the new cursor. +/// The internal scratch buffer is reused between calls, so `f` must consume the block +/// before returning. +fn walk_blocks( + chunks: &mut BitUnpackedChunks, + len: usize, + start_cursor: usize, + mut f: F, +) -> usize +where + T: BitPackedIter, + F: FnMut(&mut [T], usize) -> usize, +{ + let mut cursor = start_cursor; + if let Some(initial) = chunks.initial() { + cursor = f(initial, cursor); + } + // When `num_chunks == 1` and not sliced at the tail, `initial` already consumed the + // whole array and `full_chunks` would re-yield the same data. Guard with the cursor. + if cursor < len { + let mut iter = chunks.full_chunks(); + while let Some(chunk) = iter.next() { + cursor = f(chunk, cursor); + if cursor >= len { + break; + } + } + } + if cursor < len + && let Some(trailer) = chunks.trailer() + { + cursor = f(trailer, cursor); + } + cursor +} + +/// Overwrite the unpacked block in place with any patches falling in +/// `[chunk_start, chunk_start + block.len())`, then advance `cursor` past them. Sorted +/// indices mean the cursor only moves forward across the whole walk. +#[inline] +fn splice_patches( + block: &mut [T], + chunk_start: usize, + cursor: &mut usize, + indices: &[I], + values: &[T], + patch_offset: usize, +) where + T: Copy, + I: AsPrimitive, +{ + let end = chunk_start + block.len(); + while *cursor < indices.len() { + let global: usize = indices[*cursor].as_(); + let local = global - patch_offset; + if local >= end { + break; + } + debug_assert!(local >= chunk_start); + block[local - chunk_start] = values[*cursor]; + *cursor += 1; + } +} + +/// Fold `predicate` over `block`, packing 64 bools into a `u64` per inner-loop pass and +/// writing the words directly into `words` at `start_bit`. Auto-vectorises into the same +/// `pcmpeq + psllq + por` shape that arrow-ord's `apply_op` lowers to. +#[inline] +fn write_block( + words: &mut [u64], + start_bit: usize, + block: &[T], + predicate: &P, + total_len: usize, +) -> usize +where + T: Copy, + P: Fn(T) -> bool, +{ + let end_bit = (start_bit + block.len()).min(total_len); + let active_len = end_bit - start_bit; + if active_len == 0 { + return start_bit; + } + + if start_bit.is_multiple_of(64) { + let mut word_idx = start_bit / 64; + let full_words = active_len / 64; + for w in 0..full_words { + let mut packed = 0u64; + for b in 0..64 { + // SAFETY: w * 64 + b < full_words * 64 <= active_len <= block.len(). + let v = unsafe { *block.get_unchecked(w * 64 + b) }; + packed |= (predicate(v) as u64) << b; + } + // SAFETY: word_idx < num_words = total_len.div_ceil(64) by construction. + unsafe { + *words.get_unchecked_mut(word_idx) = packed; + } + word_idx += 1; + } + let tail = active_len % 64; + if tail > 0 { + let base = full_words * 64; + let mut packed = 0u64; + for b in 0..tail { + // SAFETY: base + b < active_len <= block.len(). + let v = unsafe { *block.get_unchecked(base + b) }; + packed |= (predicate(v) as u64) << b; + } + unsafe { + *words.get_unchecked_mut(word_idx) = packed; + } + } + } else { + // Unaligned cursor — array sliced at a non-64-aligned offset. Per-bit OR. + for b in 0..active_len { + // SAFETY: b < active_len <= block.len(). + let v = unsafe { *block.get_unchecked(b) }; + if predicate(v) { + let bit_pos = start_bit + b; + unsafe { + *words.get_unchecked_mut(bit_pos / 64) |= 1u64 << (bit_pos % 64); + } + } + } + } + + end_bit +} diff --git a/encodings/fastlanes/src/bitpacking/vtable/kernels.rs b/encodings/fastlanes/src/bitpacking/vtable/kernels.rs index cb020dc2ce9..87332f736a7 100644 --- a/encodings/fastlanes/src/bitpacking/vtable/kernels.rs +++ b/encodings/fastlanes/src/bitpacking/vtable/kernels.rs @@ -5,12 +5,16 @@ use vortex_array::arrays::dict::TakeExecuteAdaptor; use vortex_array::arrays::filter::FilterExecuteAdaptor; use vortex_array::arrays::slice::SliceExecuteAdaptor; use vortex_array::kernel::ParentKernelSet; +use vortex_array::scalar_fn::fns::between::BetweenExecuteAdaptor; +use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor; use vortex_array::scalar_fn::fns::cast::CastExecuteAdaptor; use crate::BitPacked; pub(crate) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ + ParentKernelSet::lift(&BetweenExecuteAdaptor(BitPacked)), ParentKernelSet::lift(&CastExecuteAdaptor(BitPacked)), + ParentKernelSet::lift(&CompareExecuteAdaptor(BitPacked)), ParentKernelSet::lift(&FilterExecuteAdaptor(BitPacked)), ParentKernelSet::lift(&SliceExecuteAdaptor(BitPacked)), ParentKernelSet::lift(&TakeExecuteAdaptor(BitPacked)),