diff --git a/Cargo.lock b/Cargo.lock
index 6f2b7bb670e..4bcfd04863a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10009,6 +10009,7 @@ dependencies = [
  "vortex-array",
  "vortex-buffer",
  "vortex-error",
+ "vortex-fastlanes",
  "vortex-mask",
  "vortex-session",
 ]
diff --git a/encodings/fastlanes/src/_chunked_exec.rs b/encodings/fastlanes/src/_chunked_exec.rs
new file mode 100644
index 00000000000..1e241ec4667
--- /dev/null
+++ b/encodings/fastlanes/src/_chunked_exec.rs
@@ -0,0 +1,623 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Bit-pack–aware chunked decoders that integrate with `vortex_array::_chunked_exec`.
+//!
+//! Avoids the full upfront bit-unpack the canonical executor performs. For a
+//! `Dict<BitPacked<P>, …>` array we unpack one 1024-element code chunk at a time and
+//! immediately AVX2-gather it into the output buffer. The working set is the small
+//! values dictionary plus the 4 KiB chunk-of-codes, never the materialised codes column.
+
+use std::mem::MaybeUninit;
+use std::sync::Arc;
+
+use fastlanes::BitPacking;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::VTable;
+use vortex_array::_chunked_exec::CHUNK_LEN;
+use vortex_array::_chunked_exec::Scratch;
+use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernel;
+use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher;
+use vortex_array::_chunked_exec::primitive::PatchedProducer;
+use vortex_array::_chunked_exec::primitive::PrimitiveChunkProducer;
+use vortex_array::_chunked_exec::take_into_uninit;
+use vortex_array::arrays::Dict;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::dict::DictArraySlotsExt;
+use vortex_array::arrays::primitive::PrimitiveArrayExt as _;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::Nullability;
+use vortex_array::dtype::PType;
+use vortex_array::dtype::UnsignedPType;
+use vortex_buffer::Buffer;
+use vortex_buffer::ByteBuffer;
+use vortex_error::VortexResult;
+
+use crate::BitPacked;
+use crate::BitPackedArrayExt;
+
+/// Chunked dict decoder where the codes child is bit-packed.
+///
+/// `dict` is the materialised values buffer (small, expected L1-resident). `packed` is
+/// the bit-packed codes buffer. Each chunk: bit-unpack 1024 codes into an internal
+/// `[I; CHUNK_LEN]` (one stack-resident buffer reused across chunks), then AVX2-gather
+/// them into the output via [`take_into_uninit`].
+pub struct BitPackedDictProducer<T: NativePType, I: NativePType + UnsignedPType + BitPacking> {
+    dict: Buffer<T>,
+    packed: ByteBuffer,
+    bit_width: usize,
+    elems_per_chunk: usize,
+    /// Number of *full* 1024-element chunks in the packed buffer.
+    full_chunks: usize,
+    /// Length of the trailing partial chunk in elements (0 if perfectly aligned).
+    trailer_len: usize,
+    /// Total remaining logical elements to produce.
+    remaining: usize,
+    /// Current full-chunk index.
+    chunk_idx: usize,
+    /// Scratch for one chunk of unpacked codes (4–8 KiB depending on `I`).
+    code_scratch: Box<[MaybeUninit<I>; CHUNK_LEN]>,
+}
+
+impl<T, I> BitPackedDictProducer<T, I>
+where
+    T: NativePType,
+    I: NativePType + UnsignedPType + BitPacking,
+{
+    fn new(
+        dict: Buffer<T>,
+        packed: ByteBuffer,
+        bit_width: usize,
+        full_chunks: usize,
+        trailer_len: usize,
+    ) -> Self {
+        let elems_per_chunk = 128 * bit_width / size_of::<I>();
+        let total = full_chunks * CHUNK_LEN + trailer_len;
+        Self {
+            dict,
+            packed,
+            bit_width,
+            elems_per_chunk,
+            full_chunks,
+            trailer_len,
+            remaining: total,
+            chunk_idx: 0,
+            code_scratch: Box::new([const { MaybeUninit::<I>::uninit() }; CHUNK_LEN]),
+        }
+    }
+
+    /// Bit-unpack fastlanes chunk `chunk_index` into `self.code_scratch[offset..offset+1024]`.
+    ///
+    /// # Safety
+    ///
+    /// `offset + 1024 ≤ code_scratch.len()` (the caller must ensure room for one fastlanes
+    /// chunk starting at `offset`).
+    unsafe fn unpack_chunk_into(&mut self, chunk_index: usize, offset: usize) {
+        let packed_bytes = self.packed.as_ref();
+        // SAFETY: same alignment used by fastlanes; bit_width-derived chunk layout.
+        let packed_slice: &[I] = unsafe {
+            std::slice::from_raw_parts(
+                packed_bytes.as_ptr().cast::<I>(),
+                packed_bytes.len() / size_of::<I>(),
+            )
+        };
+        let chunk = &packed_slice[chunk_index * self.elems_per_chunk
+            ..chunk_index * self.elems_per_chunk + self.elems_per_chunk];
+        // SAFETY: caller ensures offset + 1024 ≤ scratch capacity.
+        let dst_ptr = unsafe { self.code_scratch.as_mut_ptr().add(offset).cast::<I>() };
+        let dst_slice: &mut [I] = unsafe { std::slice::from_raw_parts_mut(dst_ptr, 1024) };
+        unsafe {
+            BitPacking::unchecked_unpack(self.bit_width, chunk, dst_slice);
+        }
+    }
+}
+
+impl<T, I> PrimitiveChunkProducer<T> for BitPackedDictProducer<T, I>
+where
+    T: NativePType,
+    I: NativePType + UnsignedPType + BitPacking,
+{
+    fn next_chunk<'a>(
+        &mut self,
+        scratch: &'a mut Scratch<T>,
+    ) -> VortexResult<Option<&'a [T]>> {
+        if self.remaining == 0 {
+            return Ok(None);
+        }
+        let dst = &mut scratch.as_uninit_mut()[..CHUNK_LEN];
+        let dst_ptr = dst.as_ptr().cast::<T>();
+        let n = self.write_next_into(&mut dst[..])?;
+        match n {
+            Some(n) => Ok(Some(unsafe { std::slice::from_raw_parts(dst_ptr, n) })),
+            None => Ok(None),
+        }
+    }
+
+    fn next_chunk_into_uninit(
+        &mut self,
+        _scratch: &mut Scratch<T>,
+        dst: &mut [MaybeUninit<T>],
+    ) -> VortexResult<Option<usize>> {
+        self.write_next_into(dst)
+    }
+
+    fn remaining(&self) -> usize {
+        self.remaining
+    }
+}
+
+impl<T, I> BitPackedDictProducer<T, I>
+where
+    T: NativePType,
+    I: NativePType + UnsignedPType + BitPacking,
+{
+    fn write_next_into(&mut self, dst: &mut [MaybeUninit<T>]) -> VortexResult<Option<usize>> {
+        if self.remaining == 0 {
+            return Ok(None);
+        }
+        // Unpack up to `CHUNK_LEN / FL_CHUNK = 4` fastlanes chunks into the local
+        // code_scratch, then issue a single AVX-gather over the full super-chunk. This
+        // amortises the per-chunk dispatch + gather-call overhead.
+        const FL_CHUNK: usize = 1024;
+        let super_chunk_max = CHUNK_LEN / FL_CHUNK;
+        debug_assert!(super_chunk_max >= 1);
+        let mut produced = 0usize;
+        let dst_cap = dst.len();
+
+        // Full fastlanes chunks first.
+        while self.chunk_idx < self.full_chunks
+            && produced + FL_CHUNK <= dst_cap
+            && produced + FL_CHUNK <= CHUNK_LEN
+        {
+            // SAFETY: code_scratch has CHUNK_LEN cells; produced + FL_CHUNK ≤ CHUNK_LEN.
+            unsafe {
+                self.unpack_chunk_into(self.chunk_idx, produced);
+            }
+            self.chunk_idx += 1;
+            produced += FL_CHUNK;
+        }
+        // Trailing partial chunk if there's room.
+        if produced < dst_cap && produced < CHUNK_LEN && self.trailer_len > 0 {
+            let trailer_take = self.trailer_len.min(dst_cap - produced).min(CHUNK_LEN - produced);
+            // Unpack the trailing fastlanes chunk in full into scratch (its prefix is what we want).
+            // SAFETY: code_scratch has space for CHUNK_LEN; produced + FL_CHUNK ≤ CHUNK_LEN.
+            unsafe {
+                self.unpack_chunk_into(self.chunk_idx, produced);
+            }
+            self.trailer_len -= trailer_take;
+            if self.trailer_len == 0 {
+                self.chunk_idx += 1;
+            }
+            produced += trailer_take;
+        }
+
+        if produced == 0 {
+            return Ok(None);
+        }
+
+        // Single AVX gather over the full super-chunk.
+        // SAFETY: code_scratch[..produced] is initialised by the unpack calls above.
+        let codes = unsafe {
+            std::slice::from_raw_parts(self.code_scratch.as_ptr().cast::<I>(), produced)
+        };
+        take_into_uninit::<T, I>(self.dict.as_slice(), codes, &mut dst[..produced]);
+        self.remaining -= produced;
+        Ok(Some(produced))
+    }
+}
+
+/// Kernel that matches `Dict<…>` whose codes child is bit-packed.
+///
+/// Falls back to the in-crate canonical `DictKernel` for non-bit-packed codes or sliced
+/// arrays (those still go through the AVX2 gather in `take_into_uninit`, just with a
+/// canonicalised codes buffer).
+pub struct BitPackedDictKernel<T: NativePType> {
+    _marker: std::marker::PhantomData<fn() -> T>,
+}
+
+impl<T: NativePType> BitPackedDictKernel<T> {
+    /// Construct a new kernel marker.
+    pub fn new() -> Self {
+        Self {
+            _marker: std::marker::PhantomData,
+        }
+    }
+}
+
+impl<T: NativePType> Default for BitPackedDictKernel<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<T: NativePType> PrimitiveChunkKernel<T> for BitPackedDictKernel<T> {
+    fn build(
+        &self,
+        array: &ArrayRef,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<Box<dyn PrimitiveChunkProducer<T>>>> {
+        let Some(dict) = array.as_opt::<Dict>() else {
+            return Ok(None);
+        };
+        if !matches!(array.dtype().nullability(), Nullability::NonNullable) {
+            return Ok(None);
+        }
+        let codes = dict.codes();
+        let values = dict.values();
+        let Some(bp) = codes.as_opt::<BitPacked>() else {
+            return Ok(None);
+        };
+        // v1 fast path only handles non-sliced bit-packed inputs without patches.
+        if bp.offset() != 0 || bp.patches().is_some() {
+            return Ok(None);
+        }
+        if !matches!(codes.dtype().nullability(), Nullability::NonNullable)
+            || !matches!(values.dtype().nullability(), Nullability::NonNullable)
+        {
+            return Ok(None);
+        }
+        let DType::Primitive(values_ptype, _) = *values.dtype() else {
+            return Ok(None);
+        };
+        if values_ptype != T::PTYPE {
+            return Ok(None);
+        }
+        let DType::Primitive(codes_ptype, _) = *codes.dtype() else {
+            return Ok(None);
+        };
+        if !codes_ptype.is_unsigned_int() {
+            return Ok(None);
+        }
+
+        let values_canonical = values.clone().execute::<PrimitiveArray>(ctx)?;
+        let dict_buf = values_canonical.into_buffer::<T>();
+        let len = codes.len();
+        let packed = bp.packed().clone().unwrap_host();
+        let bit_width = bp.bit_width() as usize;
+        let full_chunks = len / CHUNK_LEN;
+        let trailer_len = len % CHUNK_LEN;
+
+        Ok(Some(match codes_ptype {
+            PType::U8 => Box::new(BitPackedDictProducer::<T, u8>::new(
+                dict_buf,
+                packed,
+                bit_width,
+                full_chunks,
+                trailer_len,
+            )),
+            PType::U16 => Box::new(BitPackedDictProducer::<T, u16>::new(
+                dict_buf,
+                packed,
+                bit_width,
+                full_chunks,
+                trailer_len,
+            )),
+            PType::U32 => Box::new(BitPackedDictProducer::<T, u32>::new(
+                dict_buf,
+                packed,
+                bit_width,
+                full_chunks,
+                trailer_len,
+            )),
+            PType::U64 => Box::new(BitPackedDictProducer::<T, u64>::new(
+                dict_buf,
+                packed,
+                bit_width,
+                full_chunks,
+                trailer_len,
+            )),
+            _ => return Ok(None),
+        }))
+    }
+}
+
+// ----------------------------------------------------------------------------------
+// Plain BitPacked<P> -> Primitive chunked producer (no gather, no patches).
+// ----------------------------------------------------------------------------------
+
+/// Bit-unpacks a `BitPacked<P>` array (without internal patches) one 1024-element chunk at
+/// a time into the output. Used as the *base* of a chunked
+/// [`vortex_array::_chunked_exec::primitive::PatchedProducer`] so patch overlay happens
+/// chunk-locally instead of scattering into a fully-materialised buffer.
+///
+/// For v1 this requires `offset == 0` (non-sliced).
+pub struct BitPackedPrimitiveProducer<T: NativePType + BitPacking> {
+    packed: ByteBuffer,
+    bit_width: usize,
+    elems_per_chunk: usize,
+    full_chunks: usize,
+    trailer_len: usize,
+    remaining: usize,
+    chunk_idx: usize,
+    scratch: Box<[MaybeUninit<T>; CHUNK_LEN]>,
+}
+
+impl<T: NativePType + BitPacking> BitPackedPrimitiveProducer<T> {
+    fn new(packed: ByteBuffer, bit_width: usize, len: usize) -> Self {
+        let elems_per_chunk = 128 * bit_width / size_of::<T>();
+        Self {
+            packed,
+            bit_width,
+            elems_per_chunk,
+            full_chunks: len / CHUNK_LEN,
+            trailer_len: len % CHUNK_LEN,
+            remaining: len,
+            chunk_idx: 0,
+            scratch: Box::new([const { MaybeUninit::<T>::uninit() }; CHUNK_LEN]),
+        }
+    }
+
+    unsafe fn unpack_into(&mut self, chunk_index: usize, dst: *mut T) {
+        let packed_bytes = self.packed.as_ref();
+        // SAFETY: fastlanes layout; bit_width-derived chunk size.
+        let packed_slice: &[T] = unsafe {
+            std::slice::from_raw_parts(
+                packed_bytes.as_ptr().cast::<T>(),
+                packed_bytes.len() / size_of::<T>(),
+            )
+        };
+        let chunk = &packed_slice[chunk_index * self.elems_per_chunk
+            ..chunk_index * self.elems_per_chunk + self.elems_per_chunk];
+        let dst_slice: &mut [T] = unsafe { std::slice::from_raw_parts_mut(dst, 1024) };
+        unsafe { BitPacking::unchecked_unpack(self.bit_width, chunk, dst_slice) };
+    }
+
+    fn write_next(&mut self, dst: *mut T, dst_cap: usize) -> Option<usize> {
+        if self.remaining == 0 {
+            return None;
+        }
+        if self.chunk_idx < self.full_chunks {
+            let n = CHUNK_LEN.min(dst_cap);
+            debug_assert!(n >= 1024);
+            // SAFETY: dst has dst_cap ≥ 1024 cells.
+            unsafe { self.unpack_into(self.chunk_idx, dst) };
+            self.chunk_idx += 1;
+            self.remaining -= 1024;
+            Some(1024)
+        } else if self.trailer_len > 0 {
+            // Unpack the trailing fastlanes chunk into the local scratch, copy the prefix.
+            let scratch_ptr = self.scratch.as_mut_ptr().cast::<T>();
+            // SAFETY: scratch has CHUNK_LEN cells.
+            unsafe { self.unpack_into(self.chunk_idx, scratch_ptr) };
+            let n = self.trailer_len.min(dst_cap);
+            // SAFETY: scratch[..n] just initialised; dst has n cells.
+            unsafe { std::ptr::copy_nonoverlapping(scratch_ptr, dst, n) };
+            self.trailer_len -= n;
+            self.remaining -= n;
+            if self.trailer_len == 0 {
+                self.chunk_idx += 1;
+            }
+            Some(n)
+        } else {
+            None
+        }
+    }
+}
+
+impl<T: NativePType + BitPacking> PrimitiveChunkProducer<T> for BitPackedPrimitiveProducer<T> {
+    fn next_chunk<'a>(
+        &mut self,
+        scratch: &'a mut Scratch<T>,
+    ) -> VortexResult<Option<&'a [T]>> {
+        let ptr = scratch.as_uninit_mut().as_mut_ptr().cast::<T>();
+        match self.write_next(ptr, CHUNK_LEN) {
+            Some(n) => Ok(Some(unsafe { std::slice::from_raw_parts(ptr, n) })),
+            None => Ok(None),
+        }
+    }
+
+    fn next_chunk_into_uninit(
+        &mut self,
+        _scratch: &mut Scratch<T>,
+        dst: &mut [MaybeUninit<T>],
+    ) -> VortexResult<Option<usize>> {
+        Ok(self.write_next(dst.as_mut_ptr().cast::<T>(), dst.len()))
+    }
+
+    fn remaining(&self) -> usize {
+        self.remaining
+    }
+}
+
+/// Construct a [`BitPackedPrimitiveProducer`] for an unsigned, non-sliced, patch-free
+/// `BitPacked<T>` array.
+pub fn build_bitpacked_primitive_producer<T: NativePType + BitPacking + UnsignedPType>(
+    bp: &vortex_array::Array<BitPacked>,
+) -> Option<BitPackedPrimitiveProducer<T>> {
+    if bp.offset() != 0 || bp.patches().is_some() {
+        return None;
+    }
+    let packed = bp.packed().clone().unwrap_host();
+    Some(BitPackedPrimitiveProducer::<T>::new(
+        packed,
+        bp.bit_width() as usize,
+        bp.as_ref().len(),
+    ))
+}
+
+/// Split a non-sliced `BitPacked<T>`-with-internal-patches into a chunked
+/// [`PatchedProducer`] that overlays the (formerly internal) patches chunk-locally over
+/// the bit-unpacked base.
+///
+/// This re-expresses the "patches inside BitPacked" layout as the "PatchedArray on top of
+/// patchless BitPacked" layout, but decoded through the chunked engine so the patch
+/// overlay happens while each base chunk is still hot in L1 — instead of scattering into
+/// a fully-materialised N-element buffer the way the canonical executor does.
+pub fn build_chunked_patched_over_bitpacked<T>(
+    bp: &vortex_array::Array<BitPacked>,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<PatchedProducer<T>>>
+where
+    T: NativePType + BitPacking + UnsignedPType,
+{
+    if bp.offset() != 0 {
+        return Ok(None);
+    }
+    let packed = bp.packed().clone().unwrap_host();
+    let base = BitPackedPrimitiveProducer::<T>::new(
+        packed,
+        bp.bit_width() as usize,
+        bp.as_ref().len(),
+    );
+
+    let (indices, values) = match bp.patches() {
+        Some(patches) => {
+            let off = patches.offset();
+            let indices_prim =
+                patches.indices().clone().execute::<PrimitiveArray>(ctx)?;
+            let values_prim = patches.values().clone().execute::<PrimitiveArray>(ctx)?;
+            let idx_u32: Buffer<u32> =
+                vortex_array::match_each_integer_ptype!(indices_prim.ptype(), |P| {
+                    indices_prim
+                        .as_slice::<P>()
+                        .iter()
+                        .map(|&v| {
+                            let p: usize = num_traits::AsPrimitive::<usize>::as_(v);
+                            (p - off) as u32
+                        })
+                        .collect()
+                });
+            (idx_u32, values_prim.into_buffer::<T>())
+        }
+        None => (Buffer::<u32>::empty(), Buffer::<T>::empty()),
+    };
+
+    Ok(Some(PatchedProducer::<T>::new(
+        Box::new(base),
+        indices,
+        values,
+    )))
+}
+
+/// Register the bit-packed chunked kernels onto `dispatcher` for every supported `T`.
+pub fn register_chunk_kernels(dispatcher: &mut PrimitiveChunkKernelDispatcher) {
+    macro_rules! register_all_for {
+        ($($T:ty),*) => {
+            $(
+                // BitPackedDictKernel is registered LAST for `Dict.id()` so it tries first
+                // (dispatcher iterates in registration order; bit-packed match short-circuits).
+                dispatcher.register::<$T>(Dict.id(), Arc::new(BitPackedDictKernel::<$T>::new()));
+            )*
+        };
+    }
+    register_all_for!(u8, u16, u32, u64, i8, i16, i32, i64, f32, f64);
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher;
+    use vortex_array::_chunked_exec::primitive::decode_to_buffer;
+    use vortex_array::_chunked_exec::primitive::default_dispatcher;
+    use vortex_array::arrays::DictArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::session::ArraySession;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::Buffer;
+    use vortex_error::VortexResult;
+    use vortex_session::VortexSession;
+
+    use super::register_chunk_kernels;
+    use crate::BitPackedData;
+
+    fn session_dispatcher() -> (VortexSession, PrimitiveChunkKernelDispatcher) {
+        let s = VortexSession::empty().with::<ArraySession>();
+        crate::initialize(&s);
+        let mut d = default_dispatcher();
+        register_chunk_kernels(&mut d);
+        (s, d)
+    }
+
+    #[test]
+    fn dict_bitpacked_codes_chunked() -> VortexResult<()> {
+        let (session, dispatcher) = session_dispatcher();
+        let mut ctx = session.create_execution_ctx();
+
+        let dict_values = Buffer::<i32>::from_iter((0..256).map(|i| i as i32 * 7 + 11));
+        let dict = PrimitiveArray::new(dict_values.clone(), Validity::NonNullable);
+
+        // 4096 codes — 4 full 1024-chunks. Use u16 at 8 bits (bit_width must be < type width).
+        let codes_vec: Vec<u16> = (0..4096u32).map(|i| (i % 256) as u16).collect();
+        let codes_prim = PrimitiveArray::new(
+            Buffer::<u16>::from_iter(codes_vec.iter().copied()),
+            Validity::NonNullable,
+        );
+        let bp_codes = BitPackedData::encode(&codes_prim.into_array(), 8, &mut ctx)?;
+        let dict_arr = DictArray::try_new(bp_codes.into_array(), dict.into_array())?;
+
+        let buf = decode_to_buffer::<i32>(dict_arr.into_array(), &dispatcher, &mut ctx)?;
+        let expected: Vec<i32> = codes_vec
+            .iter()
+            .map(|c| dict_values.as_slice()[*c as usize])
+            .collect();
+        assert_eq!(buf.as_slice(), expected.as_slice());
+        Ok(())
+    }
+
+    #[test]
+    fn dict_bitpacked_codes_trailing_partial_chunk() -> VortexResult<()> {
+        let (session, dispatcher) = session_dispatcher();
+        let mut ctx = session.create_execution_ctx();
+        let dict_values = Buffer::<i32>::from_iter([10, 20, 30, 40, 50]);
+        let dict = PrimitiveArray::new(dict_values.clone(), Validity::NonNullable);
+        // 1500 codes => 1 full chunk + 476 trailer. u16 at 3 bits fits 0..5.
+        let codes_vec: Vec<u16> = (0..1500u32).map(|i| (i % 5) as u16).collect();
+        let codes_prim = PrimitiveArray::new(
+            Buffer::<u16>::from_iter(codes_vec.iter().copied()),
+            Validity::NonNullable,
+        );
+        let bp_codes = BitPackedData::encode(&codes_prim.into_array(), 3, &mut ctx)?;
+        let dict_arr = DictArray::try_new(bp_codes.into_array(), dict.into_array())?;
+        let buf = decode_to_buffer::<i32>(dict_arr.into_array(), &dispatcher, &mut ctx)?;
+        let expected: Vec<i32> = codes_vec
+            .iter()
+            .map(|c| dict_values.as_slice()[*c as usize])
+            .collect();
+        assert_eq!(buf.as_slice(), expected.as_slice());
+        Ok(())
+    }
+
+    #[test]
+    fn chunked_patched_over_bitpacked_matches_canonical() -> VortexResult<()> {
+        use vortex_array::_chunked_exec::Scratch;
+        use vortex_array::_chunked_exec::primitive::PrimitiveChunkProducer;
+
+        use crate::BitPackedArrayExt;
+
+        let (session, _dispatcher) = session_dispatcher();
+        let mut ctx = session.create_execution_ctx();
+
+        // Values where most fit in 4 bits but every 13th is an exception → patches.
+        let n = 5000usize;
+        let values: Vec<u32> = (0..n)
+            .map(|i| if i % 13 == 0 { 1000 + i as u32 } else { (i % 16) as u32 })
+            .collect();
+        let prim = PrimitiveArray::new(
+            Buffer::<u32>::from_iter(values.iter().copied()),
+            Validity::NonNullable,
+        );
+        let bp = BitPackedData::encode(&prim.into_array(), 4, &mut ctx)?;
+        assert!(bp.patches().is_some(), "expected exceptions to become patches");
+
+        // Canonical decode (Layout A: patches inside BitPacked).
+        let canonical = bp
+            .clone()
+            .into_array()
+            .execute::<PrimitiveArray>(&mut ctx)?;
+        assert_eq!(canonical.as_slice::<u32>(), values.as_slice());
+
+        // Chunked patched-over-bitpacked (Layout B).
+        let mut producer = super::build_chunked_patched_over_bitpacked::<u32>(&bp, &mut ctx)?
+            .expect("non-sliced");
+        let mut scratch = Scratch::<u32>::new();
+        let mut out = Vec::with_capacity(n);
+        while let Some(chunk) = producer.next_chunk(&mut scratch)? {
+            out.extend_from_slice(chunk);
+        }
+        assert_eq!(out, values);
+        Ok(())
+    }
+}
diff --git a/encodings/fastlanes/src/lib.rs b/encodings/fastlanes/src/lib.rs
index 9022b7c4e2b..62f77d78cf3 100644
--- a/encodings/fastlanes/src/lib.rs
+++ b/encodings/fastlanes/src/lib.rs
@@ -15,6 +15,8 @@ use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
 use vortex_error::VortexResult;
 
+#[doc(hidden)]
+pub mod _chunked_exec;
 pub mod bit_transpose;
 mod bitpacking;
 mod delta;
diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml
index 01a5b8d7a3e..5ad10f1535b 100644
--- a/encodings/runend/Cargo.toml
+++ b/encodings/runend/Cargo.toml
@@ -36,6 +36,7 @@ itertools = { workspace = true }
 rand = { workspace = true }
 rstest = { workspace = true }
 vortex-array = { workspace = true, features = ["_test-harness"] }
+vortex-fastlanes = { workspace = true }
 
 [features]
 arbitrary = ["dep:arbitrary", "vortex-array/arbitrary"]
@@ -52,3 +53,15 @@ harness = false
 [[bench]]
 name = "run_end_decode"
 harness = false
+
+[[bench]]
+name = "chunked_exec"
+harness = false
+
+[[bench]]
+name = "chunked_sinks"
+harness = false
+
+[[bench]]
+name = "chunked_patched"
+harness = false
diff --git a/encodings/runend/benches/chunked_exec.rs b/encodings/runend/benches/chunked_exec.rs
new file mode 100644
index 00000000000..3fff01caec2
--- /dev/null
+++ b/encodings/runend/benches/chunked_exec.rs
@@ -0,0 +1,563 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Chunked execution engine benchmark.
+//!
+//! Compares the new chunked decode path (small L1-resident scratch + fused kernels)
+//! against the existing canonical-by-canonical executor on several shapes:
+//!
+//! - `Dict<Primitive>` — the simplest gather-from-small-dict workload.
+//! - `RunEnd<Primitive>` — single-encoding streaming.
+//! - `Dict<RunEnd<Primitive>>` — the fused stack: the dictionary's values are themselves
+//!   RunEnd-encoded, so the chunked Dict kernel materializes the small RunEnd inner once
+//!   and then streams the gather. The legacy path does roughly the same work but pays
+//!   more allocation overhead in the executor.
+//! - `ListView<Primitive>` (canonical + bit-packed offsets) — row-window streaming.
+//!
+//! Run with `cargo bench -p vortex-runend --bench chunked_exec`.
+
+use std::fmt;
+use std::sync::LazyLock;
+
+use divan::Bencher;
+use divan::black_box;
+use vortex_array::IntoArray;
+use vortex_array::VortexSessionExecute;
+use vortex_array::_chunked_exec::listview::build_listview_producer_typed;
+use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher;
+use vortex_array::_chunked_exec::primitive::decode_to_buffer;
+use vortex_array::_chunked_exec::primitive::default_dispatcher;
+use vortex_array::arrays::DictArray;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::session::ArraySession;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_fastlanes::BitPackedData;
+use vortex_runend::RunEnd;
+use vortex_runend::_chunked_exec::register_chunk_kernels as register_runend_chunk_kernels;
+use vortex_session::VortexSession;
+
+fn main() {
+    divan::main();
+}
+
+static SESSION: LazyLock<VortexSession> = LazyLock::new(|| {
+    let s = VortexSession::empty().with::<ArraySession>();
+    vortex_runend::initialize(&s);
+    vortex_fastlanes::initialize(&s);
+    s
+});
+
+static DISPATCHER: LazyLock<PrimitiveChunkKernelDispatcher> = LazyLock::new(|| {
+    let mut d = default_dispatcher();
+    register_runend_chunk_kernels(&mut d);
+    vortex_fastlanes::_chunked_exec::register_chunk_kernels(&mut d);
+    d
+});
+
+// ------------------------------------------------------------------------------------
+// Dict<Primitive>
+// ------------------------------------------------------------------------------------
+
+#[derive(Copy, Clone)]
+struct DictArgs {
+    len: usize,
+    dict_size: usize,
+}
+
+impl fmt::Display for DictArgs {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "len={} dict={}", self.len, self.dict_size)
+    }
+}
+
+const DICT_ARGS: &[DictArgs] = &[
+    DictArgs {
+        len: 16_384,
+        dict_size: 64,
+    },
+    DictArgs {
+        len: 65_536,
+        dict_size: 256,
+    },
+    DictArgs {
+        len: 262_144,
+        dict_size: 1024,
+    },
+    DictArgs {
+        len: 1_048_576,
+        dict_size: 256,
+    },
+    DictArgs {
+        len: 1_048_576,
+        dict_size: 4096,
+    },
+    // Cache-stress: codes buffer is 4*N bytes (canonical u32 codes).
+    // Both paths have the same data flow here (no intermediate to save),
+    // so chunked is predicted to stay tied across cache boundaries.
+    DictArgs {
+        len: 4_194_304, // codes 16 MB, output 16 MB
+        dict_size: 256,
+    },
+    DictArgs {
+        len: 16_777_216, // codes 64 MB, output 64 MB
+        dict_size: 256,
+    },
+];
+
+fn make_dict_i32(args: DictArgs) -> vortex_array::ArrayRef {
+    let dict_size = args.dict_size;
+    let len = args.len;
+    let values: Vec<i32> = (0..dict_size as i32).map(|i| i * 17 + 11).collect();
+    let codes: Vec<u32> = (0..len).map(|i| (i % dict_size) as u32).collect();
+    let values = PrimitiveArray::new(Buffer::<i32>::from_iter(values), Validity::NonNullable);
+    let codes = PrimitiveArray::new(Buffer::<u32>::from_iter(codes), Validity::NonNullable);
+    DictArray::try_new(codes.into_array(), values.into_array())
+        .expect("dict")
+        .into_array()
+}
+
+#[divan::bench(args = DICT_ARGS)]
+fn dict_primitive_chunked(bencher: Bencher, args: DictArgs) {
+    let array = make_dict_i32(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        black_box(decode_to_buffer::<i32>(a.clone(), &DISPATCHER, &mut ctx).unwrap())
+    });
+}
+
+#[divan::bench(args = DICT_ARGS)]
+fn dict_primitive_canonical(bencher: Bencher, args: DictArgs) {
+    let array = make_dict_i32(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        black_box(a.clone().execute::<PrimitiveArray>(&mut ctx).unwrap())
+    });
+}
+
+// ------------------------------------------------------------------------------------
+// Dict<BitPacked<u16> codes> — the v2 bit-pack fusion case
+// ------------------------------------------------------------------------------------
+
+#[derive(Copy, Clone)]
+struct DictBpArgs {
+    len: usize,
+    dict_size: usize,
+    bit_width: u8,
+}
+
+impl fmt::Display for DictBpArgs {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "len={} dict={} bw={}", self.len, self.dict_size, self.bit_width)
+    }
+}
+
+const DICT_BP_ARGS: &[DictBpArgs] = &[
+    DictBpArgs {
+        len: 65_536,
+        dict_size: 256,
+        bit_width: 8,
+    },
+    DictBpArgs {
+        len: 262_144,
+        dict_size: 256,
+        bit_width: 8,
+    },
+    DictBpArgs {
+        len: 1_048_576,
+        dict_size: 256,
+        bit_width: 8,
+    },
+    DictBpArgs {
+        len: 1_048_576,
+        dict_size: 1024,
+        bit_width: 10,
+    },
+    DictBpArgs {
+        len: 1_048_576,
+        dict_size: 4096,
+        bit_width: 12,
+    },
+    // Cache-stress shapes: intermediate codes Buffer<u16> = 2*N bytes.
+    // L2 on the test host is 2 MiB. Below crosses that boundary progressively.
+    DictBpArgs {
+        len: 4_194_304, // 8 MiB intermediate (4× L2)
+        dict_size: 256,
+        bit_width: 8,
+    },
+    DictBpArgs {
+        len: 16_777_216, // 32 MiB intermediate (16× L2, still in L3)
+        dict_size: 256,
+        bit_width: 8,
+    },
+    DictBpArgs {
+        len: 67_108_864, // 128 MiB intermediate (64× L2, half of L3)
+        dict_size: 256,
+        bit_width: 8,
+    },
+];
+
+fn make_dict_bp_i32(args: DictBpArgs) -> vortex_array::ArrayRef {
+    let dict_values: Vec<i32> = (0..args.dict_size as i32).map(|i| i * 17 + 11).collect();
+    let codes: Vec<u16> = (0..args.len)
+        .map(|i| (i % args.dict_size) as u16)
+        .collect();
+    let dict = PrimitiveArray::new(
+        Buffer::<i32>::from_iter(dict_values),
+        Validity::NonNullable,
+    );
+    let codes_prim = PrimitiveArray::new(Buffer::<u16>::from_iter(codes), Validity::NonNullable);
+    let mut ctx = SESSION.create_execution_ctx();
+    let bp = BitPackedData::encode(&codes_prim.into_array(), args.bit_width, &mut ctx)
+        .expect("bitpack");
+    DictArray::try_new(bp.into_array(), dict.into_array())
+        .expect("dict")
+        .into_array()
+}
+
+#[divan::bench(args = DICT_BP_ARGS)]
+fn dict_bp_canonical(bencher: Bencher, args: DictBpArgs) {
+    let array = make_dict_bp_i32(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        black_box(a.clone().execute::<PrimitiveArray>(&mut ctx).unwrap())
+    });
+}
+
+#[divan::bench(args = DICT_BP_ARGS)]
+fn dict_bp_chunked(bencher: Bencher, args: DictBpArgs) {
+    let array = make_dict_bp_i32(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        black_box(decode_to_buffer::<i32>(a.clone(), &DISPATCHER, &mut ctx).unwrap())
+    });
+}
+
+// ------------------------------------------------------------------------------------
+// RunEnd<Primitive>
+// ------------------------------------------------------------------------------------
+
+#[derive(Copy, Clone)]
+struct RunEndArgs {
+    len: usize,
+    avg_run_len: usize,
+}
+
+impl fmt::Display for RunEndArgs {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "len={} run={}", self.len, self.avg_run_len)
+    }
+}
+
+const RUNEND_ARGS: &[RunEndArgs] = &[
+    RunEndArgs {
+        len: 65_536,
+        avg_run_len: 4,
+    },
+    RunEndArgs {
+        len: 65_536,
+        avg_run_len: 64,
+    },
+    RunEndArgs {
+        len: 1_048_576,
+        avg_run_len: 16,
+    },
+    RunEndArgs {
+        len: 1_048_576,
+        avg_run_len: 256,
+    },
+];
+
+fn make_runend_i32(args: RunEndArgs) -> vortex_array::ArrayRef {
+    let mut values = Vec::with_capacity(args.len);
+    let mut run_idx = 0i32;
+    let mut pos = 0;
+    while pos < args.len {
+        let run = args.avg_run_len.min(args.len - pos);
+        values.extend(std::iter::repeat(run_idx % 1024).take(run));
+        run_idx += 1;
+        pos += run;
+    }
+    let prim = PrimitiveArray::new(Buffer::<i32>::from_iter(values), Validity::NonNullable);
+    let mut ctx = SESSION.create_execution_ctx();
+    RunEnd::encode(prim.into_array(), &mut ctx).unwrap().into_array()
+}
+
+#[divan::bench(args = RUNEND_ARGS)]
+fn runend_chunked(bencher: Bencher, args: RunEndArgs) {
+    let array = make_runend_i32(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        black_box(decode_to_buffer::<i32>(a.clone(), &DISPATCHER, &mut ctx).unwrap())
+    });
+}
+
+#[divan::bench(args = RUNEND_ARGS)]
+fn runend_canonical(bencher: Bencher, args: RunEndArgs) {
+    let array = make_runend_i32(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        black_box(a.clone().execute::<PrimitiveArray>(&mut ctx).unwrap())
+    });
+}
+
+// ------------------------------------------------------------------------------------
+// Dict<RunEnd<Primitive>> (fused)
+// ------------------------------------------------------------------------------------
+
+#[derive(Copy, Clone)]
+struct DictRunEndArgs {
+    len: usize,
+    dict_size: usize,
+    /// Average run length *inside the dictionary's values*.
+    inner_run_len: usize,
+}
+
+impl fmt::Display for DictRunEndArgs {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "len={} dict={} inner_run={}",
+            self.len, self.dict_size, self.inner_run_len
+        )
+    }
+}
+
+const DICT_RUNEND_ARGS: &[DictRunEndArgs] = &[
+    DictRunEndArgs {
+        len: 1_048_576,
+        dict_size: 256,
+        inner_run_len: 4,
+    },
+    DictRunEndArgs {
+        len: 1_048_576,
+        dict_size: 4096,
+        inner_run_len: 16,
+    },
+    DictRunEndArgs {
+        len: 4_194_304,
+        dict_size: 1024,
+        inner_run_len: 8,
+    },
+];
+
+fn make_dict_runend_i32(args: DictRunEndArgs) -> vortex_array::ArrayRef {
+    // Build the inner dictionary values (RunEnd-encoded).
+    let mut inner_values = Vec::with_capacity(args.dict_size);
+    let mut run_idx = 0i32;
+    let mut pos = 0;
+    while pos < args.dict_size {
+        let run = args.inner_run_len.min(args.dict_size - pos);
+        inner_values.extend(std::iter::repeat(run_idx).take(run));
+        run_idx += 1;
+        pos += run;
+    }
+    let inner_prim = PrimitiveArray::new(Buffer::<i32>::from_iter(inner_values), Validity::NonNullable);
+    let mut ctx = SESSION.create_execution_ctx();
+    let inner_re = RunEnd::encode(inner_prim.into_array(), &mut ctx).unwrap();
+    let codes: Vec<u32> = (0..args.len as u32).map(|i| i % args.dict_size as u32).collect();
+    let codes = PrimitiveArray::new(Buffer::<u32>::from_iter(codes), Validity::NonNullable);
+    DictArray::try_new(codes.into_array(), inner_re.into_array())
+        .unwrap()
+        .into_array()
+}
+
+#[divan::bench(args = DICT_RUNEND_ARGS)]
+fn dict_runend_fused_chunked(bencher: Bencher, args: DictRunEndArgs) {
+    let array = make_dict_runend_i32(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        black_box(decode_to_buffer::<i32>(a.clone(), &DISPATCHER, &mut ctx).unwrap())
+    });
+}
+
+#[divan::bench(args = DICT_RUNEND_ARGS)]
+fn dict_runend_canonical(bencher: Bencher, args: DictRunEndArgs) {
+    let array = make_dict_runend_i32(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        black_box(a.clone().execute::<PrimitiveArray>(&mut ctx).unwrap())
+    });
+}
+
+// Diagnostic: how slow is *just* canonicalizing the inner RunEnd dict (small)?
+#[divan::bench(args = DICT_RUNEND_ARGS)]
+fn dict_runend_phase_inner_canonical(bencher: Bencher, args: DictRunEndArgs) {
+    let array = make_dict_runend_i32(args);
+    use vortex_array::arrays::dict::DictArraySlotsExt;
+    use vortex_array::arrays::Dict;
+    let inner = array.as_::<Dict>().values().clone();
+    bencher.with_inputs(|| inner.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        black_box(a.clone().execute::<PrimitiveArray>(&mut ctx).unwrap())
+    });
+}
+
+// Diagnostic: take_primitive over the already-materialized dict + 1M codes.
+#[divan::bench(args = DICT_RUNEND_ARGS)]
+fn dict_runend_phase_take(bencher: Bencher, args: DictRunEndArgs) {
+    use vortex_array::arrays::dict::DictArraySlotsExt;
+    use vortex_array::arrays::Dict;
+    use vortex_array::builtins::ArrayBuiltins;
+    let array = make_dict_runend_i32(args);
+    let dict_view = array.as_::<Dict>();
+    let codes = dict_view.codes().clone();
+    let values = dict_view.values().clone();
+    let mut ctx = SESSION.create_execution_ctx();
+    let inner = values.execute::<PrimitiveArray>(&mut ctx).unwrap().into_array();
+    bencher
+        .with_inputs(|| (inner.clone(), codes.clone()))
+        .bench_local_refs(|(inner, codes)| {
+            black_box(inner.take(codes.clone()).unwrap())
+        });
+}
+
+// ------------------------------------------------------------------------------------
+// ListView<Primitive> with bit-packed offsets + sizes
+// ------------------------------------------------------------------------------------
+
+#[derive(Copy, Clone)]
+struct ListViewArgs {
+    rows: usize,
+    avg_list_len: usize,
+    bit_width_offsets: u8,
+    bit_width_sizes: u8,
+}
+
+impl fmt::Display for ListViewArgs {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "rows={} avg_list={} bw_off={} bw_sz={}",
+            self.rows, self.avg_list_len, self.bit_width_offsets, self.bit_width_sizes
+        )
+    }
+}
+
+const LIST_ARGS: &[ListViewArgs] = &[
+    ListViewArgs {
+        rows: 16_384,
+        avg_list_len: 8,
+        bit_width_offsets: 18, // covers up to 16_384 * 8 == 131_072
+        bit_width_sizes: 5,    // up to 31 elements per list
+    },
+    ListViewArgs {
+        rows: 65_536,
+        avg_list_len: 4,
+        bit_width_offsets: 18,
+        bit_width_sizes: 4,
+    },
+    ListViewArgs {
+        rows: 262_144,
+        avg_list_len: 4,
+        bit_width_offsets: 20,
+        bit_width_sizes: 4,
+    },
+];
+
+/// Build a `ListView<i32>` where:
+/// - `elements` is a plain i32 buffer with `rows * avg_list_len` values.
+/// - `offsets` and `sizes` are bit-packed via fastlanes (so canonicalize must unpack).
+fn make_listview_bp(args: ListViewArgs) -> vortex_array::ArrayRef {
+    let n_elements = args.rows * args.avg_list_len;
+    let elements: Vec<i32> = (0..n_elements as i32).collect();
+    let offsets: Vec<u32> = (0..args.rows as u32).map(|i| i * args.avg_list_len as u32).collect();
+    let sizes: Vec<u32> = vec![args.avg_list_len as u32; args.rows];
+
+    let elements_arr = PrimitiveArray::new(Buffer::<i32>::from_iter(elements), Validity::NonNullable);
+    let offsets_arr = PrimitiveArray::new(Buffer::<u32>::from_iter(offsets), Validity::NonNullable);
+    let sizes_arr = PrimitiveArray::new(Buffer::<u32>::from_iter(sizes), Validity::NonNullable);
+
+    let mut ctx = SESSION.create_execution_ctx();
+    let bp_offsets = BitPackedData::encode(
+        &offsets_arr.into_array(),
+        args.bit_width_offsets,
+        &mut ctx,
+    )
+    .expect("offsets bitpack")
+    .into_array();
+    let bp_sizes =
+        BitPackedData::encode(&sizes_arr.into_array(), args.bit_width_sizes, &mut ctx)
+            .expect("sizes bitpack")
+            .into_array();
+
+    ListViewArray::new(
+        elements_arr.into_array(),
+        bp_offsets,
+        bp_sizes,
+        Validity::NonNullable,
+    )
+    .into_array()
+}
+
+/// Walk the chunked rows summing elements end-to-end. Uses the typed-callback API so
+/// there is no dyn dispatch in the inner loop.
+#[divan::bench(args = LIST_ARGS)]
+fn listview_chunked_sum(bencher: Bencher, args: ListViewArgs) {
+    let array = make_listview_bp(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let mut producer =
+            build_listview_producer_typed::<u32, u32, i32>(a.clone(), &mut ctx).unwrap();
+        let mut sum: i64 = 0;
+        producer.for_each_chunk_typed(|offs, szs, elems| {
+            for i in 0..offs.len() {
+                let o = offs[i] as usize;
+                let s = szs[i] as usize;
+                for &v in &elems[o..o + s] {
+                    sum = sum.wrapping_add(v as i64);
+                }
+            }
+        });
+        black_box(sum)
+    });
+}
+
+/// Canonicalize the whole `ListView` then sum elements; the apples-to-apples baseline.
+#[divan::bench(args = LIST_ARGS)]
+fn listview_canonical_sum(bencher: Bencher, args: ListViewArgs) {
+    use vortex_array::Canonical;
+    use vortex_array::arrays::ListView;
+    use vortex_array::arrays::listview::ListViewArrayExt;
+    use vortex_array::arrays::primitive::PrimitiveArrayExt;
+    use vortex_array::dtype::NativePType;
+
+    let array = make_listview_bp(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let canonical = a.clone().execute::<Canonical>(&mut ctx).unwrap();
+        // Canonical is a ListView; sum the relevant element slice via offsets/sizes.
+        let lv = canonical.into_array();
+        let view = lv.as_::<ListView>();
+        let offsets = view
+            .offsets()
+            .clone()
+            .execute::<PrimitiveArray>(&mut ctx)
+            .unwrap();
+        let sizes = view
+            .sizes()
+            .clone()
+            .execute::<PrimitiveArray>(&mut ctx)
+            .unwrap();
+        let elements = view
+            .elements()
+            .clone()
+            .execute::<PrimitiveArray>(&mut ctx)
+            .unwrap();
+        let elements = elements.as_slice::<i32>();
+        let off = offsets.as_slice::<u32>();
+        let sz = sizes.as_slice::<u32>();
+        let mut sum: i64 = 0;
+        for i in 0..off.len() {
+            let o = off[i] as usize;
+            let s = sz[i] as usize;
+            for &v in &elements[o..o + s] {
+                sum = sum.wrapping_add(v as i64);
+            }
+        }
+        // Silence unused if NativePType isn't otherwise referenced.
+        let _: i32 = i32::PTYPE.bit_width() as i32;
+        black_box(sum)
+    });
+}
diff --git a/encodings/runend/benches/chunked_patched.rs b/encodings/runend/benches/chunked_patched.rs
new file mode 100644
index 00000000000..e3fb610aee8
--- /dev/null
+++ b/encodings/runend/benches/chunked_patched.rs
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Patched-decode benchmark: chunk-local patch overlay vs full-buffer scatter.
+//!
+//! Compares decoding a bit-packed primitive column with exceptions ("patches") two ways:
+//!
+//! - `bitpacked_patches_canonical`: the canonical executor path for a `BitPacked` array
+//!   that carries its patches internally. It bit-unpacks the entire column into an
+//!   N-element buffer, then scatters the exception values into that buffer by index.
+//!   The scatter is a sequence of random writes; once N spills L2/L3 each write can miss.
+//!
+//! - `bitpacked_patches_chunked`: the same logical data decoded through the chunked engine
+//!   as a `PatchedProducer` over a `BitPackedPrimitiveProducer`. Each 1024-element base
+//!   chunk is bit-unpacked into the scratch, the patches that fall in that chunk's range
+//!   are overlaid while the chunk is still hot in L1, then flushed. The patch writes never
+//!   touch a cold cache line.
+//!
+//! Run with `cargo bench -p vortex-runend --bench chunked_patched`.
+
+use std::fmt;
+use std::sync::LazyLock;
+
+use divan::Bencher;
+use divan::black_box;
+use vortex_array::IntoArray;
+use vortex_array::VortexSessionExecute;
+use vortex_array::_chunked_exec::Scratch;
+use vortex_array::_chunked_exec::primitive::PrimitiveChunkProducer;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::session::ArraySession;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_buffer::BufferMut;
+use vortex_fastlanes::BitPackedArray;
+use vortex_fastlanes::BitPackedArrayExt;
+use vortex_fastlanes::BitPackedData;
+use vortex_fastlanes::_chunked_exec::build_chunked_patched_over_bitpacked;
+use vortex_session::VortexSession;
+
+fn main() {
+    divan::main();
+}
+
+static SESSION: LazyLock<VortexSession> = LazyLock::new(|| {
+    let s = VortexSession::empty().with::<ArraySession>();
+    vortex_fastlanes::initialize(&s);
+    s
+});
+
+#[derive(Copy, Clone)]
+struct Args {
+    len: usize,
+    /// Fraction of values that are exceptions, as 1-in-`patch_stride`.
+    patch_stride: usize,
+    bit_width: u8,
+}
+
+impl fmt::Display for Args {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let pct = 100.0 / self.patch_stride as f64;
+        write!(
+            f,
+            "len={} patches={:.1}% bw={}",
+            self.len, pct, self.bit_width
+        )
+    }
+}
+
+const ARGS: &[Args] = &[
+    // 1M: output 4 MiB (2x L2). Patch density swept.
+    Args { len: 1_048_576, patch_stride: 100, bit_width: 8 }, // 1%
+    Args { len: 1_048_576, patch_stride: 20, bit_width: 8 },  // 5%
+    Args { len: 1_048_576, patch_stride: 10, bit_width: 8 },  // 10%
+    // 4M: output 16 MiB (8x L2, in L3).
+    Args { len: 4_194_304, patch_stride: 100, bit_width: 8 },
+    Args { len: 4_194_304, patch_stride: 20, bit_width: 8 },
+    Args { len: 4_194_304, patch_stride: 10, bit_width: 8 },
+    // 16M: output 64 MiB (deep in L3).
+    Args { len: 16_777_216, patch_stride: 100, bit_width: 8 },
+    Args { len: 16_777_216, patch_stride: 20, bit_width: 8 },
+    Args { len: 16_777_216, patch_stride: 10, bit_width: 8 },
+];
+
+/// Build a `BitPacked<u32>` whose values mostly fit in `bit_width` bits, with every
+/// `patch_stride`-th value an exception (forcing it into the patches sidecar).
+fn make_bitpacked_with_patches(args: Args) -> BitPackedArray {
+    let cap = (1u32 << args.bit_width) - 1;
+    let values: Vec<u32> = (0..args.len)
+        .map(|i| {
+            if i % args.patch_stride == 0 {
+                // Exception value above the bit-width ceiling.
+                cap + 1000 + (i as u32 & 0xffff)
+            } else {
+                (i as u32) & cap
+            }
+        })
+        .collect();
+    let prim = PrimitiveArray::new(Buffer::<u32>::from_iter(values), Validity::NonNullable);
+    let mut ctx = SESSION.create_execution_ctx();
+    let bp = BitPackedData::encode(&prim.into_array(), args.bit_width, &mut ctx)
+        .expect("bitpack encode");
+    assert!(bp.patches().is_some(), "expected patches at this density");
+    bp
+}
+
+/// Layout A: patches stored inside the BitPacked array, decoded canonically.
+/// Bit-unpack the whole column, then scatter patches into the full N-element buffer.
+#[divan::bench(args = ARGS)]
+fn bitpacked_patches_canonical(bencher: Bencher, args: Args) {
+    let bp = make_bitpacked_with_patches(args);
+    bencher
+        .with_inputs(|| bp.clone().into_array())
+        .bench_local_refs(|a| {
+            let mut ctx = SESSION.create_execution_ctx();
+            black_box(a.clone().execute::<PrimitiveArray>(&mut ctx).unwrap())
+        });
+}
+
+/// Layout B: same data decoded as a chunked PatchedProducer over a patchless
+/// BitPackedPrimitiveProducer. Patch overlay happens chunk-locally in L1.
+#[divan::bench(args = ARGS)]
+fn bitpacked_patches_chunked(bencher: Bencher, args: Args) {
+    let bp = make_bitpacked_with_patches(args);
+    bencher.with_inputs(|| bp.clone()).bench_local_refs(|bp| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let mut producer = build_chunked_patched_over_bitpacked::<u32>(bp, &mut ctx)
+            .unwrap()
+            .expect("non-sliced");
+        let mut out = BufferMut::<u32>::with_capacity(args.len);
+        let mut scratch = Scratch::<u32>::new();
+        let mut written = 0usize;
+        while let Some(chunk) = producer.next_chunk(&mut scratch).unwrap() {
+            // SAFETY: out has capacity args.len; chunk total never exceeds it.
+            unsafe {
+                let dst = out.spare_capacity_mut().as_mut_ptr().add(written).cast::<u32>();
+                std::ptr::copy_nonoverlapping(chunk.as_ptr(), dst, chunk.len());
+            }
+            written += chunk.len();
+        }
+        unsafe { out.set_len(written) };
+        black_box(out.freeze())
+    });
+}
diff --git a/encodings/runend/benches/chunked_sinks.rs b/encodings/runend/benches/chunked_sinks.rs
new file mode 100644
index 00000000000..bdf994b6439
--- /dev/null
+++ b/encodings/runend/benches/chunked_sinks.rs
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Sink-fusion benchmark for the chunked execution engine.
+//!
+//! Compares the fused-pipeline path (decode + operator in one chunked pass, no
+//! intermediate `Buffer<T>`) against the canonical two-pass equivalent
+//! (`array.execute::<PrimitiveArray>(ctx)?` followed by a scalar loop over the materialised
+//! buffer) for three common scalar operator shapes:
+//!
+//! - `filter(x > c)` on `Dict<BitPacked<u16>>`
+//! - `cast(i32 → i64)` on `Dict<i32>`
+//! - `scalar add` (`x + c`) on `Dict<i32>`
+//!
+//! Run with `cargo bench -p vortex-runend --bench chunked_sinks`.
+
+use std::fmt;
+use std::sync::LazyLock;
+
+use divan::Bencher;
+use divan::black_box;
+use vortex_array::IntoArray;
+use vortex_array::VortexSessionExecute;
+use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher;
+use vortex_array::_chunked_exec::primitive::default_dispatcher;
+use vortex_array::_chunked_exec::sink::FilterSink;
+use vortex_array::_chunked_exec::sink::MapSink;
+use vortex_array::_chunked_exec::sink::drive_into_sink;
+use vortex_array::arrays::DictArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::session::ArraySession;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_buffer::BufferMut;
+use vortex_fastlanes::BitPackedData;
+use vortex_session::VortexSession;
+
+fn main() {
+    divan::main();
+}
+
+static SESSION: LazyLock<VortexSession> = LazyLock::new(|| {
+    let s = VortexSession::empty().with::<ArraySession>();
+    vortex_runend::initialize(&s);
+    vortex_fastlanes::initialize(&s);
+    s
+});
+
+static DISPATCHER: LazyLock<PrimitiveChunkKernelDispatcher> = LazyLock::new(|| {
+    let mut d = default_dispatcher();
+    vortex_runend::_chunked_exec::register_chunk_kernels(&mut d);
+    vortex_fastlanes::_chunked_exec::register_chunk_kernels(&mut d);
+    d
+});
+
+// ============================================================================
+// Filter — Dict<BitPacked<u16>> with selectivity ~50%
+// ============================================================================
+
+#[derive(Copy, Clone)]
+struct FilterArgs {
+    len: usize,
+    dict_size: usize,
+    bit_width: u8,
+    threshold: i32,
+}
+
+impl fmt::Display for FilterArgs {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "len={} dict={} bw={} pred=x>{}",
+            self.len, self.dict_size, self.bit_width, self.threshold
+        )
+    }
+}
+
+const FILTER_ARGS: &[FilterArgs] = &[
+    FilterArgs {
+        len: 1_048_576,
+        dict_size: 256,
+        bit_width: 8,
+        threshold: 2000,
+    },
+    FilterArgs {
+        len: 4_194_304,
+        dict_size: 256,
+        bit_width: 8,
+        threshold: 2000,
+    },
+    FilterArgs {
+        len: 16_777_216,
+        dict_size: 256,
+        bit_width: 8,
+        threshold: 2000,
+    },
+];
+
+/// Build `Dict<BitPacked<u16>>` with dict values `[0, 17, 34, …]` so threshold=2000
+/// gives ~50% selectivity (passes if dict[code] > 2000 → roughly half the dict).
+fn make_filter_input(args: FilterArgs) -> vortex_array::ArrayRef {
+    let dict_values: Vec<i32> = (0..args.dict_size as i32).map(|i| i * 17 + 11).collect();
+    let codes: Vec<u16> = (0..args.len)
+        .map(|i| (i % args.dict_size) as u16)
+        .collect();
+    let dict = PrimitiveArray::new(
+        Buffer::<i32>::from_iter(dict_values),
+        Validity::NonNullable,
+    );
+    let codes_prim = PrimitiveArray::new(Buffer::<u16>::from_iter(codes), Validity::NonNullable);
+    let mut ctx = SESSION.create_execution_ctx();
+    let bp = BitPackedData::encode(&codes_prim.into_array(), args.bit_width, &mut ctx)
+        .expect("bitpack");
+    DictArray::try_new(bp.into_array(), dict.into_array())
+        .expect("dict")
+        .into_array()
+}
+
+#[divan::bench(args = FILTER_ARGS)]
+fn filter_canonical_two_pass(bencher: Bencher, args: FilterArgs) {
+    let array = make_filter_input(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        // Pass 1: decode the full array into a Buffer<i32>.
+        let prim = a.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
+        let slice = prim.as_slice::<i32>();
+        // Pass 2: walk the buffer, collecting survivors into a new buffer.
+        let mut out = BufferMut::<i32>::with_capacity(slice.len());
+        for &v in slice {
+            if v > args.threshold {
+                out.push(v);
+            }
+        }
+        black_box(out.freeze())
+    });
+}
+
+#[divan::bench(args = FILTER_ARGS)]
+fn filter_chunked_sink(bencher: Bencher, args: FilterArgs) {
+    let array = make_filter_input(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let threshold = args.threshold;
+        let sink = FilterSink::<i32, _>::with_capacity(args.len, move |v| v > threshold);
+        let buf: Buffer<i32> = drive_into_sink(a.clone(), &DISPATCHER, sink, &mut ctx).unwrap();
+        black_box(buf)
+    });
+}
+
+// ============================================================================
+// Cast — Dict<i32> → i64
+// ============================================================================
+
+#[derive(Copy, Clone)]
+struct CastArgs {
+    len: usize,
+    dict_size: usize,
+}
+
+impl fmt::Display for CastArgs {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "len={} dict={}", self.len, self.dict_size)
+    }
+}
+
+const CAST_ARGS: &[CastArgs] = &[
+    CastArgs {
+        len: 1_048_576,
+        dict_size: 256,
+    },
+    CastArgs {
+        len: 4_194_304,
+        dict_size: 256,
+    },
+    CastArgs {
+        len: 16_777_216,
+        dict_size: 256,
+    },
+];
+
+fn make_cast_input(args: CastArgs) -> vortex_array::ArrayRef {
+    let dict_values: Vec<i32> = (0..args.dict_size as i32).map(|i| i * 7 + 13).collect();
+    let codes: Vec<u32> = (0..args.len).map(|i| (i % args.dict_size) as u32).collect();
+    let dict = PrimitiveArray::new(
+        Buffer::<i32>::from_iter(dict_values),
+        Validity::NonNullable,
+    );
+    let codes_prim = PrimitiveArray::new(Buffer::<u32>::from_iter(codes), Validity::NonNullable);
+    DictArray::try_new(codes_prim.into_array(), dict.into_array())
+        .expect("dict")
+        .into_array()
+}
+
+#[divan::bench(args = CAST_ARGS)]
+fn cast_canonical_two_pass(bencher: Bencher, args: CastArgs) {
+    let array = make_cast_input(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let prim = a.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
+        let slice = prim.as_slice::<i32>();
+        let mut out = BufferMut::<i64>::with_capacity(slice.len());
+        for &v in slice {
+            out.push(v as i64);
+        }
+        black_box(out.freeze())
+    });
+}
+
+#[divan::bench(args = CAST_ARGS)]
+fn cast_chunked_sink(bencher: Bencher, args: CastArgs) {
+    let array = make_cast_input(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let sink = MapSink::<i32, i64, _>::with_capacity(args.len, |v| v as i64);
+        let buf: Buffer<i64> = drive_into_sink(a.clone(), &DISPATCHER, sink, &mut ctx).unwrap();
+        black_box(buf)
+    });
+}
+
+// ============================================================================
+// Scalar add — Dict<i32>, output i32, `x + 42`
+// ============================================================================
+
+#[divan::bench(args = CAST_ARGS)]
+fn scalar_add_canonical_two_pass(bencher: Bencher, args: CastArgs) {
+    let array = make_cast_input(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let prim = a.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
+        let slice = prim.as_slice::<i32>();
+        let mut out = BufferMut::<i32>::with_capacity(slice.len());
+        for &v in slice {
+            out.push(v.wrapping_add(42));
+        }
+        black_box(out.freeze())
+    });
+}
+
+#[divan::bench(args = CAST_ARGS)]
+fn scalar_add_chunked_sink(bencher: Bencher, args: CastArgs) {
+    let array = make_cast_input(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let sink =
+            MapSink::<i32, i32, _>::with_capacity(args.len, |v| v.wrapping_add(42));
+        let buf: Buffer<i32> = drive_into_sink(a.clone(), &DISPATCHER, sink, &mut ctx).unwrap();
+        black_box(buf)
+    });
+}
+
+// ============================================================================
+// Scalar mul + add (richer scalar pipeline: `x * 3 + 7`)
+// ============================================================================
+
+#[divan::bench(args = CAST_ARGS)]
+fn scalar_mul_add_canonical_two_pass(bencher: Bencher, args: CastArgs) {
+    let array = make_cast_input(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let prim = a.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
+        let slice = prim.as_slice::<i32>();
+        let mut out = BufferMut::<i32>::with_capacity(slice.len());
+        for &v in slice {
+            out.push(v.wrapping_mul(3).wrapping_add(7));
+        }
+        black_box(out.freeze())
+    });
+}
+
+#[divan::bench(args = CAST_ARGS)]
+fn scalar_mul_add_chunked_sink(bencher: Bencher, args: CastArgs) {
+    let array = make_cast_input(args);
+    bencher.with_inputs(|| array.clone()).bench_local_refs(|a| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let sink = MapSink::<i32, i32, _>::with_capacity(args.len, |v| {
+            v.wrapping_mul(3).wrapping_add(7)
+        });
+        let buf: Buffer<i32> = drive_into_sink(a.clone(), &DISPATCHER, sink, &mut ctx).unwrap();
+        black_box(buf)
+    });
+}
diff --git a/encodings/runend/examples/profile_chunked.rs b/encodings/runend/examples/profile_chunked.rs
new file mode 100644
index 00000000000..206f73f025d
--- /dev/null
+++ b/encodings/runend/examples/profile_chunked.rs
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Standalone profile binary for the chunked execution engine.
+//!
+//! Builds with `cargo build --release --example profile_chunked -p vortex-runend`.
+//! Designed to be sampled by `samply record` or `perf record` — runs each path in a
+//! tight loop with no benchmark-framework overhead, so the profile shows the
+//! decompress kernels directly.
+//!
+//! Usage:
+//!   samply record ./target/release/examples/profile_chunked dict_bp 4194304 256 8 chunked 50
+//!   samply record ./target/release/examples/profile_chunked dict_bp 4194304 256 8 canonical 50
+
+use std::env;
+use std::sync::Arc;
+use std::time::Instant;
+
+use vortex_array::IntoArray;
+use vortex_array::VortexSessionExecute;
+use vortex_array::_chunked_exec::primitive::decode_to_buffer;
+use vortex_array::_chunked_exec::primitive::default_dispatcher;
+use vortex_array::arrays::DictArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::session::ArraySession;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_fastlanes::BitPackedData;
+use vortex_runend::_chunked_exec::register_chunk_kernels as register_runend_chunk_kernels;
+use vortex_session::VortexSession;
+
+fn main() {
+    let args: Vec<String> = env::args().collect();
+    if args.len() < 6 {
+        eprintln!(
+            "usage: {} <shape> <len> <dict_size> <bit_width> <path> <iters>",
+            args[0]
+        );
+        eprintln!("  shape: dict_bp | dict_p");
+        eprintln!("  path: chunked | canonical");
+        std::process::exit(2);
+    }
+    let shape = &args[1];
+    let len: usize = args[2].parse().unwrap();
+    let dict_size: usize = args[3].parse().unwrap();
+    let bit_width: u8 = args[4].parse().unwrap();
+    let path = &args[5];
+    let iters: usize = args.get(6).map(|s| s.parse().unwrap()).unwrap_or(20);
+
+    let session = {
+        let s = VortexSession::empty().with::<ArraySession>();
+        vortex_runend::initialize(&s);
+        vortex_fastlanes::initialize(&s);
+        s
+    };
+    let dispatcher = {
+        let mut d = default_dispatcher();
+        register_runend_chunk_kernels(&mut d);
+        vortex_fastlanes::_chunked_exec::register_chunk_kernels(&mut d);
+        Arc::new(d)
+    };
+
+    let array = match shape.as_str() {
+        "dict_bp" => build_dict_bp(&session, len, dict_size, bit_width),
+        "dict_p" => build_dict_p(len, dict_size),
+        _ => panic!("unknown shape {shape}"),
+    };
+
+    // Warm caches.
+    for _ in 0..3 {
+        match path.as_str() {
+            "chunked" => {
+                let mut ctx = session.create_execution_ctx();
+                let _w = decode_to_buffer::<i32>(array.clone(), &dispatcher, &mut ctx).unwrap();
+                std::hint::black_box(_w);
+            }
+            "canonical" => {
+                let mut ctx = session.create_execution_ctx();
+                let _w = array.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
+                std::hint::black_box(_w);
+            }
+            _ => panic!("unknown path {path}"),
+        }
+    }
+
+    // Time the loop body that the profiler will sample inside.
+    let t0 = Instant::now();
+    for _ in 0..iters {
+        match path.as_str() {
+            "chunked" => {
+                let mut ctx = session.create_execution_ctx();
+                let b = decode_to_buffer::<i32>(array.clone(), &dispatcher, &mut ctx).unwrap();
+                std::hint::black_box(b);
+            }
+            "canonical" => {
+                let mut ctx = session.create_execution_ctx();
+                let b = array.clone().execute::<PrimitiveArray>(&mut ctx).unwrap();
+                std::hint::black_box(b);
+            }
+            _ => unreachable!(),
+        }
+    }
+    let elapsed = t0.elapsed();
+    let per_iter_us = (elapsed.as_micros() as f64) / (iters as f64);
+    let elems_per_us = (len as f64) / per_iter_us;
+    println!(
+        "{} {} N={} dict={} bw={} iters={}  per_iter={:.1}µs  {:.1}M elems/sec",
+        shape,
+        path,
+        len,
+        dict_size,
+        bit_width,
+        iters,
+        per_iter_us,
+        elems_per_us
+    );
+}
+
+fn build_dict_bp(
+    session: &VortexSession,
+    len: usize,
+    dict_size: usize,
+    bit_width: u8,
+) -> vortex_array::ArrayRef {
+    let dict_values: Vec<i32> = (0..dict_size as i32).map(|i| i * 17 + 11).collect();
+    let codes: Vec<u16> = (0..len).map(|i| (i % dict_size) as u16).collect();
+    let dict = PrimitiveArray::new(
+        Buffer::<i32>::from_iter(dict_values),
+        Validity::NonNullable,
+    );
+    let codes_prim =
+        PrimitiveArray::new(Buffer::<u16>::from_iter(codes), Validity::NonNullable);
+    let mut ctx = session.create_execution_ctx();
+    let bp = BitPackedData::encode(&codes_prim.into_array(), bit_width, &mut ctx).unwrap();
+    DictArray::try_new(bp.into_array(), dict.into_array())
+        .unwrap()
+        .into_array()
+}
+
+fn build_dict_p(len: usize, dict_size: usize) -> vortex_array::ArrayRef {
+    let dict_values: Vec<i32> = (0..dict_size as i32).map(|i| i * 17 + 11).collect();
+    let codes: Vec<u32> = (0..len).map(|i| (i % dict_size) as u32).collect();
+    let dict = PrimitiveArray::new(
+        Buffer::<i32>::from_iter(dict_values),
+        Validity::NonNullable,
+    );
+    let codes_prim =
+        PrimitiveArray::new(Buffer::<u32>::from_iter(codes), Validity::NonNullable);
+    DictArray::try_new(codes_prim.into_array(), dict.into_array())
+        .unwrap()
+        .into_array()
+}
diff --git a/encodings/runend/src/_chunked_exec.rs b/encodings/runend/src/_chunked_exec.rs
new file mode 100644
index 00000000000..e604e940651
--- /dev/null
+++ b/encodings/runend/src/_chunked_exec.rs
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! `RunEnd<Primitive>` chunked decoder + registration with `vortex_array::_chunked_exec`.
+//!
+//! Lives in this crate because `RunEnd` is defined here; the producer itself
+//! ([`vortex_array::_chunked_exec::primitive::RunEndPrimitiveProducer`]) is generic and
+//! lives in `vortex-array`. The kernel here just bridges Vortex's encoding view to the
+//! generic streaming primitive.
+
+use std::sync::Arc;
+
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::VTable;
+use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernel;
+use vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher;
+use vortex_array::_chunked_exec::primitive::PrimitiveChunkProducer;
+use vortex_array::_chunked_exec::primitive::build_runend_producer;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::Nullability;
+use vortex_error::VortexResult;
+
+use crate::RunEnd;
+use crate::array::RunEndArrayExt as _;
+
+/// `RunEnd<Primitive>` chunked kernel.
+pub struct RunEndKernel<T: NativePType> {
+    _marker: std::marker::PhantomData<fn() -> T>,
+}
+
+impl<T: NativePType> RunEndKernel<T> {
+    /// Construct an empty kernel marker.
+    pub fn new() -> Self {
+        Self {
+            _marker: std::marker::PhantomData,
+        }
+    }
+}
+
+impl<T: NativePType> Default for RunEndKernel<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<T: NativePType> PrimitiveChunkKernel<T> for RunEndKernel<T> {
+    fn build(
+        &self,
+        array: &ArrayRef,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<Box<dyn PrimitiveChunkProducer<T>>>> {
+        let Some(re) = array.as_opt::<RunEnd>() else {
+            return Ok(None);
+        };
+        if !matches!(array.dtype().nullability(), Nullability::NonNullable) {
+            return Ok(None);
+        }
+        let values = re.values();
+        let ends = re.ends();
+        if !matches!(values.dtype().nullability(), Nullability::NonNullable)
+            || !matches!(ends.dtype().nullability(), Nullability::NonNullable)
+        {
+            return Ok(None);
+        }
+        let DType::Primitive(values_ptype, _) = *values.dtype() else {
+            return Ok(None);
+        };
+        if values_ptype != T::PTYPE {
+            return Ok(None);
+        }
+        let offset = re.offset();
+        let len = array.len();
+        let values_canonical = values.clone().execute::<PrimitiveArray>(ctx)?;
+        let ends_canonical = ends.clone().execute::<PrimitiveArray>(ctx)?;
+        Ok(Some(build_runend_producer::<T>(
+            values_canonical,
+            ends_canonical,
+            offset,
+            len,
+        )?))
+    }
+}
+
+/// Register the `RunEnd` chunk kernel onto `dispatcher` for every primitive output type
+/// that we currently care about.
+///
+/// This is the runend-side equivalent of `vortex_array::_chunked_exec::register_defaults`;
+/// downstream crates that want the fused `Dict<RunEnd<P>>` path should call both.
+pub fn register_chunk_kernels(dispatcher: &mut PrimitiveChunkKernelDispatcher) {
+    macro_rules! register_all_for {
+        ($($T:ty),*) => {
+            $(
+                dispatcher.register::<$T>(RunEnd.id(), Arc::new(RunEndKernel::<$T>::new()));
+            )*
+        };
+    }
+    register_all_for!(u8, u16, u32, u64, i8, i16, i32, i64, f32, f64);
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::ExecutionCtx;
+    use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::_chunked_exec::primitive::decode_to_buffer;
+    use vortex_array::_chunked_exec::primitive::default_dispatcher;
+    use vortex_array::arrays::DictArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::Buffer;
+    use vortex_error::VortexResult;
+
+    use super::register_chunk_kernels;
+    use crate::RunEnd;
+
+    fn ctx() -> ExecutionCtx {
+        LEGACY_SESSION.create_execution_ctx()
+    }
+
+    fn dispatcher() -> vortex_array::_chunked_exec::primitive::PrimitiveChunkKernelDispatcher {
+        let mut d = default_dispatcher();
+        register_chunk_kernels(&mut d);
+        d
+    }
+
+    /// Construct a RunEnd<Primitive> from an iterator of values; useful in tests.
+    fn make_runend_i32(values: &[i32]) -> VortexResult<vortex_array::ArrayRef> {
+        let prim = PrimitiveArray::new(
+            Buffer::<i32>::from_iter(values.iter().copied()),
+            Validity::NonNullable,
+        );
+        let re = RunEnd::encode(prim.into_array(), &mut ctx())?;
+        Ok(re.into_array())
+    }
+
+    #[test]
+    fn runend_chunked() -> VortexResult<()> {
+        let mut values = Vec::with_capacity(4000);
+        values.extend(std::iter::repeat(1i32).take(100));
+        values.extend(std::iter::repeat(2i32).take(150));
+        values.extend(std::iter::repeat(3i32).take(850));
+        values.extend(std::iter::repeat(4i32).take(2900));
+        let re = make_runend_i32(&values)?;
+        let buf = decode_to_buffer::<i32>(re, &dispatcher(), &mut ctx())?;
+        assert_eq!(buf.as_slice(), values.as_slice());
+        Ok(())
+    }
+
+    #[test]
+    fn fused_dict_runend_chunked() -> VortexResult<()> {
+        // Inner dictionary VALUES are RunEnd-encoded (so the dict has 12 logical entries).
+        let inner_values = vec![100i32, 100, 100, 200, 200, 200, 300, 300, 300, 400, 400, 400];
+        let inner_re = make_runend_i32(&inner_values)?;
+
+        let codes_vec: Vec<u8> = (0..8192u32).map(|i| (i % 12) as u8).collect();
+        let codes_arr = PrimitiveArray::new(
+            Buffer::<u8>::from_iter(codes_vec.iter().copied()),
+            Validity::NonNullable,
+        );
+        let dict = DictArray::try_new(codes_arr.into_array(), inner_re)?;
+
+        let buf = decode_to_buffer::<i32>(dict.into_array(), &dispatcher(), &mut ctx())?;
+        let expected: Vec<i32> = codes_vec
+            .iter()
+            .map(|c| inner_values[*c as usize])
+            .collect();
+        assert_eq!(buf.as_slice(), expected.as_slice());
+        Ok(())
+    }
+
+    #[test]
+    fn runend_sliced_chunked() -> VortexResult<()> {
+        let mut values = Vec::with_capacity(2000);
+        for run_idx in 0..20 {
+            values.extend(std::iter::repeat(run_idx as i32 + 1).take(100));
+        }
+        let re_full = make_runend_i32(&values)?;
+        let sliced = re_full.slice(50..1500)?;
+        let buf = decode_to_buffer::<i32>(sliced, &dispatcher(), &mut ctx())?;
+        assert_eq!(buf.as_slice(), &values[50..1500]);
+        Ok(())
+    }
+
+}
diff --git a/encodings/runend/src/compute/take.rs b/encodings/runend/src/compute/take.rs
index 7100faf9eac..f6f09b68c65 100644
--- a/encodings/runend/src/compute/take.rs
+++ b/encodings/runend/src/compute/take.rs
@@ -31,6 +31,18 @@ impl TakeExecute for RunEnd {
         indices: &ArrayRef,
         ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<ArrayRef>> {
+        // Fall back to the canonical-then-gather path when there are many more indices
+        // than the RunEnd array's logical length. In that regime, canonicalizing the
+        // (small) RunEnd values once and doing an AVX2 gather is dramatically cheaper
+        // than `indices.len()` independent binary searches into the ends array. The
+        // pathological case is `Dict<RunEnd>` with a small RunEnd dictionary and many
+        // codes: each code triggers a `search_sorted` here, blowing up to N * log K
+        // work plus an O(N) `Vec<u64>` allocation, when a single decode of the RunEnd
+        // (size = array.len()) followed by an AVX2 take would do.
+        if indices.len() > array.len() {
+            return Ok(None);
+        }
+
         let primitive_indices = indices.clone().execute::<PrimitiveArray>(ctx)?;
 
         let checked_indices = match_each_integer_ptype!(primitive_indices.ptype(), |P| {
@@ -155,6 +167,49 @@ mod tests {
         assert_arrays_eq!(taken, expected.into_array());
     }
 
+    /// Regression test: when `indices.len()` exceeds `array.len()`, the take impl falls back
+    /// (returns `None`) so the canonical executor picks up the small RunEnd canonicalize +
+    /// AVX2 gather path. Without the fallback, `Dict<RunEnd>.execute::<PrimitiveArray>` would
+    /// pay `N * log K` work per index against a tiny ends array. See `dict_runend_canonical`
+    /// in `encodings/runend/benches/chunked_exec.rs`.
+    #[test]
+    fn ree_dict_take_dense_indices() -> vortex_error::VortexResult<()> {
+        use std::sync::LazyLock;
+
+        use vortex_array::IntoArray;
+        use vortex_array::VortexSessionExecute;
+        use vortex_array::arrays::DictArray;
+        use vortex_array::session::ArraySession;
+        use vortex_array::validity::Validity;
+        use vortex_buffer::Buffer;
+        use vortex_session::VortexSession;
+
+        static SESSION: LazyLock<VortexSession> =
+            LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
+        let mut ctx = SESSION.create_execution_ctx();
+
+        // dict_size=8, inner_run=2 → ends=[2,4,6,8], values=[0,1,2,3]
+        let dict_values =
+            PrimitiveArray::new(Buffer::<i32>::from_iter([0, 0, 1, 1, 2, 2, 3, 3]), Validity::NonNullable)
+                .into_array();
+        let dict_re = RunEnd::encode(dict_values, &mut ctx)?.into_array();
+
+        // 32 codes (>> dict.len()=8), each `i % 8` so the result is 0,0,1,1,...,3,3 repeated 4x.
+        let codes_buf: Vec<u32> = (0..32u32).map(|i| i % 8).collect();
+        let codes =
+            PrimitiveArray::new(Buffer::<u32>::from_iter(codes_buf), Validity::NonNullable).into_array();
+        let dict = DictArray::try_new(codes, dict_re)?.into_array();
+
+        let taken = dict.execute::<PrimitiveArray>(&mut ctx)?;
+        let expected: Vec<i32> = (0..32).map(|i| (i % 8) / 2).collect();
+        assert_arrays_eq!(
+            taken.into_array(),
+            PrimitiveArray::new(Buffer::<i32>::from_iter(expected), Validity::NonNullable)
+                .into_array()
+        );
+        Ok(())
+    }
+
     #[rstest]
     #[case(ree_array())]
     #[case(RunEnd::encode(
diff --git a/encodings/runend/src/lib.rs b/encodings/runend/src/lib.rs
index 8770dbbf58e..84a96849d22 100644
--- a/encodings/runend/src/lib.rs
+++ b/encodings/runend/src/lib.rs
@@ -8,6 +8,8 @@ pub use arbitrary::ArbitraryRunEndArray;
 pub use array::*;
 pub use iter::trimmed_ends_iter;
 
+#[doc(hidden)]
+pub mod _chunked_exec;
 mod array;
 #[cfg(feature = "arrow")]
 mod arrow;
diff --git a/vortex-array/src/_chunked_exec/listview.rs b/vortex-array/src/_chunked_exec/listview.rs
new file mode 100644
index 00000000000..683b7e5973b
--- /dev/null
+++ b/vortex-array/src/_chunked_exec/listview.rs
@@ -0,0 +1,377 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Chunked decode for [`ListView`] arrays with primitive elements.
+//!
+//! The canonical-by-canonical path for `ListView<Primitive>` decompresses the offsets,
+//! sizes, and elements buffers up-front. When offsets and sizes are themselves
+//! bit-packed (the common case), that's three full materializations before a consumer
+//! sees a single row.
+//!
+//! The chunked path emits **row windows**: each window is a `(offsets, sizes,
+//! elements_slice)` triple covering up to [`CHUNK_LEN`] rows. The producer keeps a
+//! reusable [`Scratch`] for the offsets and another for the sizes (so the steady-state
+//! decompress footprint is two scratches, not three full buffers), and the elements
+//! slice is just a borrowed view onto the shared, already-materialized elements buffer.
+//!
+//! For v1 we materialize the elements buffer once up-front (since `ListView` allows
+//! arbitrary, possibly-overlapping offsets we don't know in advance which slice each
+//! chunk needs without a pre-pass). A future iteration can replace this with a
+//! per-chunk min-offset / max-offset+size pre-pass and decompress only the referenced
+//! slice — the producer API already supports that shape.
+
+use vortex_buffer::Buffer;
+use vortex_error::VortexResult;
+
+use super::CHUNK_LEN;
+use super::scratch::Scratch;
+use crate::ArrayRef;
+use crate::arrays::ListView;
+use crate::arrays::PrimitiveArray;
+use crate::arrays::listview::ListViewArrayExt;
+use crate::arrays::primitive::PrimitiveArrayExt;
+use crate::dtype::NativePType;
+use crate::dtype::Nullability;
+use crate::executor::ExecutionCtx;
+use crate::match_each_integer_ptype;
+
+/// A row window emitted by [`ListChunkProducer`].
+///
+/// The lifetime is tied to the producer-owned `Scratch` buffers — the consumer must
+/// finish reading the chunk before requesting the next.
+pub struct ListChunk<'a, O, S, E> {
+    /// Offsets for this window of rows.
+    pub offsets: &'a [O],
+    /// Sizes for this window of rows.
+    pub sizes: &'a [S],
+    /// Shared reference to the (fully materialized) elements buffer.
+    pub elements: &'a [E],
+}
+
+/// Streams chunks of `(offsets, sizes)` for a list-of-primitive array.
+///
+/// `O` is the offset native type (`u32` or `u64`), `S` the size native type, and `E` the
+/// element native type.
+pub struct ListChunkProducer<O: NativePType, S: NativePType, E: NativePType> {
+    offsets: Buffer<O>,
+    sizes: Buffer<S>,
+    elements: Buffer<E>,
+    cursor: usize,
+}
+
+impl<O, S, E> ListChunkProducer<O, S, E>
+where
+    O: NativePType,
+    S: NativePType,
+    E: NativePType,
+{
+    /// Construct directly from canonicalized buffers.
+    pub fn new(offsets: Buffer<O>, sizes: Buffer<S>, elements: Buffer<E>) -> Self {
+        Self {
+            offsets,
+            sizes,
+            elements,
+            cursor: 0,
+        }
+    }
+
+    /// Number of rows that remain to be emitted.
+    pub fn remaining(&self) -> usize {
+        self.offsets.as_slice().len().saturating_sub(self.cursor)
+    }
+
+    /// Total number of rows.
+    pub fn len(&self) -> usize {
+        self.offsets.as_slice().len()
+    }
+
+    /// Whether there are any rows at all.
+    pub fn is_empty(&self) -> bool {
+        self.offsets.as_slice().is_empty()
+    }
+
+    /// Drive the producer to completion, calling `f` with each row chunk as typed
+    /// slices. This is the fast-path API for callers who know the offset/size types
+    /// at compile time — it has no dyn-dispatch in the hot loop.
+    pub fn for_each_chunk_typed<F>(&mut self, mut f: F)
+    where
+        F: FnMut(&[O], &[S], &[E]),
+    {
+        let mut o_scratch = Scratch::<O>::new();
+        let mut s_scratch = Scratch::<S>::new();
+        while let Some(chunk) = self.next_chunk(&mut o_scratch, &mut s_scratch) {
+            f(chunk.offsets, chunk.sizes, chunk.elements);
+        }
+    }
+
+    /// Pull the next row window. Returns slices directly out of the already-canonical
+    /// `offsets`/`sizes`/`elements` buffers — no memcpy, no scratch hop.
+    ///
+    /// The scratch arguments are kept for API symmetry with future chunked-bit-unpack
+    /// variants where the producer materialises its own offsets/sizes per chunk; for
+    /// the current canonical-then-chunk producer they are unused.
+    pub fn next_chunk<'a>(
+        &'a mut self,
+        _offset_scratch: &'a mut Scratch<O>,
+        _size_scratch: &'a mut Scratch<S>,
+    ) -> Option<ListChunk<'a, O, S, E>> {
+        let total = self.offsets.as_slice().len();
+        if self.cursor >= total {
+            return None;
+        }
+        let take = CHUNK_LEN.min(total - self.cursor);
+        let offsets = &self.offsets.as_slice()[self.cursor..self.cursor + take];
+        let sizes = &self.sizes.as_slice()[self.cursor..self.cursor + take];
+        self.cursor += take;
+        Some(ListChunk {
+            offsets,
+            sizes,
+            elements: self.elements.as_slice(),
+        })
+    }
+}
+
+/// Build a typed [`ListChunkProducer`] directly. The caller commits to the offset and
+/// size native types up front; the producer is callable without dyn dispatch in the hot
+/// loop.
+///
+/// Errors if the array's offset/size/element ptypes don't match `<O, S, E>`.
+pub fn build_listview_producer_typed<O, S, E>(
+    array: ArrayRef,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ListChunkProducer<O, S, E>>
+where
+    O: NativePType,
+    S: NativePType,
+    E: NativePType,
+{
+    let Some(lv) = array.as_opt::<ListView>() else {
+        vortex_error::vortex_bail!(
+            "build_listview_producer_typed: expected ListView, got {}",
+            array.encoding_id()
+        );
+    };
+    if !matches!(array.dtype().nullability(), Nullability::NonNullable) {
+        vortex_error::vortex_bail!("build_listview_producer_typed: only non-nullable for v1");
+    }
+    let offsets = lv.offsets().clone().execute::<PrimitiveArray>(ctx)?;
+    let sizes = lv.sizes().clone().execute::<PrimitiveArray>(ctx)?;
+    let elements = lv.elements().clone().execute::<PrimitiveArray>(ctx)?;
+    if O::PTYPE != offsets.ptype()
+        || S::PTYPE != sizes.ptype()
+        || E::PTYPE != elements.ptype()
+    {
+        vortex_error::vortex_bail!(
+            "build_listview_producer_typed: ptype mismatch ({}, {}, {}) vs requested ({}, {}, {})",
+            offsets.ptype(),
+            sizes.ptype(),
+            elements.ptype(),
+            O::PTYPE,
+            S::PTYPE,
+            E::PTYPE,
+        );
+    }
+    Ok(ListChunkProducer::<O, S, E>::new(
+        offsets.into_buffer::<O>(),
+        sizes.into_buffer::<S>(),
+        elements.into_buffer::<E>(),
+    ))
+}
+
+/// Build a [`ListChunkProducer`] for a `ListView<Primitive>` array.
+///
+/// Canonicalizes offsets, sizes and elements once via the existing executor, then returns
+/// a producer that streams row windows. The producer can be used to drive a downstream
+/// consumer without ever materializing a full `ListViewArray`.
+pub fn build_listview_primitive_producer<E: NativePType>(
+    array: ArrayRef,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<BoxedListChunkProducer<E>> {
+    let Some(lv) = array.as_opt::<ListView>() else {
+        vortex_error::vortex_bail!(
+            "build_listview_primitive_producer: expected ListView, got {}",
+            array.encoding_id()
+        );
+    };
+    if !matches!(array.dtype().nullability(), Nullability::NonNullable) {
+        vortex_error::vortex_bail!(
+            "build_listview_primitive_producer: only non-nullable for v1"
+        );
+    }
+    let offsets = lv.offsets().clone().execute::<PrimitiveArray>(ctx)?;
+    let sizes = lv.sizes().clone().execute::<PrimitiveArray>(ctx)?;
+    let elements = lv.elements().clone().execute::<PrimitiveArray>(ctx)?;
+    if E::PTYPE != elements.ptype() {
+        vortex_error::vortex_bail!(
+            "build_listview_primitive_producer: element type {} does not match {}",
+            E::PTYPE,
+            elements.ptype()
+        );
+    }
+    let elements_buf = elements.into_buffer::<E>();
+    Ok(match_each_integer_ptype!(offsets.ptype(), |O| {
+        match_each_integer_ptype!(sizes.ptype(), |S| {
+            BoxedListChunkProducer::new::<O, S>(
+                offsets.into_buffer::<O>(),
+                sizes.into_buffer::<S>(),
+                elements_buf,
+            )
+        })
+    }))
+}
+
+/// Type-erased entry point handle that lets callers dispatch to a `ListChunkProducer`
+/// without monomorphizing on every `(O, S, E)` triple.
+pub struct BoxedListChunkProducer<E: NativePType> {
+    inner: Box<dyn ListChunkProducerErased<E>>,
+}
+
+impl<E: NativePType> BoxedListChunkProducer<E> {
+    fn new<O, S>(offsets: Buffer<O>, sizes: Buffer<S>, elements: Buffer<E>) -> Self
+    where
+        O: NativePType + num_traits::AsPrimitive<usize> + Send + 'static,
+        S: NativePType + num_traits::AsPrimitive<usize> + Send + 'static,
+        E: NativePType + Send + 'static,
+    {
+        Self {
+            inner: Box::new(ErasedProducer::<O, S, E>::new(ListChunkProducer::new(
+                offsets, sizes, elements,
+            ))),
+        }
+    }
+
+    /// Total rows.
+    pub fn len(&self) -> usize {
+        self.inner.len_erased()
+    }
+
+    /// Whether the producer has any rows.
+    pub fn is_empty(&self) -> bool {
+        self.inner.len_erased() == 0
+    }
+
+    /// Drive one chunk and call `f` with the (offsets, sizes, elements) triple. The
+    /// closure may borrow the chunk for the duration of the call. Returns `Some(())` if
+    /// a chunk was produced; `None` when exhausted.
+    ///
+    /// We hide the offset/size native types behind a `dyn` adaptor so callers don't
+    /// have to monomorphize over every integer combination.
+    pub fn for_each_chunk<F>(&mut self, mut f: F)
+    where
+        F: FnMut(ListChunkErased<'_, E>),
+    {
+        while let Some(()) = self.inner.next_chunk_erased(&mut f) {}
+    }
+}
+
+/// Type-erased chunk view passed to [`BoxedListChunkProducer::for_each_chunk`].
+pub struct ListChunkErased<'a, E: NativePType> {
+    /// Number of rows in this chunk.
+    pub n: usize,
+    /// Function that returns row `i`'s offset as `usize`.
+    pub offset_of: &'a dyn Fn(usize) -> usize,
+    /// Function that returns row `i`'s size as `usize`.
+    pub size_of: &'a dyn Fn(usize) -> usize,
+    /// The shared elements buffer.
+    pub elements: &'a [E],
+}
+
+trait ListChunkProducerErased<E: NativePType>: Send {
+    fn len_erased(&self) -> usize;
+    fn next_chunk_erased(
+        &mut self,
+        f: &mut dyn FnMut(ListChunkErased<'_, E>),
+    ) -> Option<()>;
+}
+
+struct ErasedProducer<O: NativePType, S: NativePType, E: NativePType> {
+    inner: ListChunkProducer<O, S, E>,
+    /// Persisted scratches so we don't pay a heap allocation per chunk.
+    offset_scratch: Scratch<O>,
+    size_scratch: Scratch<S>,
+}
+
+impl<O: NativePType, S: NativePType, E: NativePType> ErasedProducer<O, S, E> {
+    fn new(inner: ListChunkProducer<O, S, E>) -> Self {
+        Self {
+            inner,
+            offset_scratch: Scratch::<O>::new(),
+            size_scratch: Scratch::<S>::new(),
+        }
+    }
+}
+
+impl<O, S, E> ListChunkProducerErased<E> for ErasedProducer<O, S, E>
+where
+    O: NativePType + num_traits::AsPrimitive<usize> + Send,
+    S: NativePType + num_traits::AsPrimitive<usize> + Send,
+    E: NativePType + Send,
+{
+    fn len_erased(&self) -> usize {
+        self.inner.len()
+    }
+
+    fn next_chunk_erased(
+        &mut self,
+        f: &mut dyn FnMut(ListChunkErased<'_, E>),
+    ) -> Option<()> {
+        let chunk = self
+            .inner
+            .next_chunk(&mut self.offset_scratch, &mut self.size_scratch)?;
+        let offsets = chunk.offsets;
+        let sizes = chunk.sizes;
+        let elements = chunk.elements;
+        let off_fn = |i: usize| -> usize { offsets[i].as_() };
+        let sz_fn = |i: usize| -> usize { sizes[i].as_() };
+        f(ListChunkErased {
+            n: offsets.len(),
+            offset_of: &off_fn,
+            size_of: &sz_fn,
+            elements,
+        });
+        Some(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+
+    use super::*;
+    use crate::IntoArray;
+    use crate::LEGACY_SESSION;
+    use crate::VortexSessionExecute;
+    use crate::arrays::ListViewArray;
+    use crate::validity::Validity;
+
+    fn ctx() -> ExecutionCtx {
+        LEGACY_SESSION.create_execution_ctx()
+    }
+
+    #[test]
+    fn list_chunked_round_trip() -> VortexResult<()> {
+        // 3 lists: [1,2], [3], [4,5,6]
+        let elements = buffer![1i32, 2, 3, 4, 5, 6];
+        let offsets = buffer![0u32, 2, 3];
+        let sizes = buffer![2u32, 1, 3];
+        let lv = ListViewArray::new(
+            elements.into_array(),
+            offsets.into_array(),
+            sizes.into_array(),
+            Validity::NonNullable,
+        );
+        let mut producer = build_listview_primitive_producer::<i32>(lv.into_array(), &mut ctx())?;
+
+        let mut row_data: Vec<Vec<i32>> = Vec::new();
+        producer.for_each_chunk(|chunk| {
+            for i in 0..chunk.n {
+                let o = (chunk.offset_of)(i);
+                let s = (chunk.size_of)(i);
+                row_data.push(chunk.elements[o..o + s].to_vec());
+            }
+        });
+        assert_eq!(row_data, vec![vec![1, 2], vec![3], vec![4, 5, 6]]);
+        Ok(())
+    }
+}
diff --git a/vortex-array/src/_chunked_exec/mod.rs b/vortex-array/src/_chunked_exec/mod.rs
new file mode 100644
index 00000000000..d60d6c9002c
--- /dev/null
+++ b/vortex-array/src/_chunked_exec/mod.rs
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Chunked execution engine — experimental.
+//!
+//! The default executor in [`crate::executor`] drives an array to a fully materialized
+//! [`crate::Canonical`] one encoding at a time. Each intermediate is the size of the whole
+//! array; for stacks like `Dict<RunEnd<Primitive>>` the working set is enormous, eviction
+//! storms are inevitable, and any per-stage allocation runs at the rate of the *output*
+//! size rather than the rate of useful work.
+//!
+//! This module models execution differently:
+//!
+//! 1. A **producer** yields decoded values into a small, driver-owned [`Scratch`] buffer
+//!    sized to fit comfortably in L1d (1024 elements, 4–8 KiB for primitives).
+//! 2. The driver pulls chunks until the producer is exhausted, copying each chunk into
+//!    its final destination (a builder, an Arrow buffer, an aggregator, …). The scratch
+//!    is reused across iterations, so the steady-state memory footprint of decode is the
+//!    scratch size plus whatever fixed dictionaries the producer holds.
+//! 3. Custom **chunk kernels** can be registered to fuse multiple encoding layers into a
+//!    single pass — the model rule of thumb is that an encoding's chunk kernel is allowed
+//!    to *materialise its own children up-front* if they are bounded in size (e.g. a
+//!    dictionary's `values` slot), and then stream the unbounded `codes` chunk-by-chunk.
+//!    This is the same pattern as [`crate::arrays::dict::TakeExecute`] — the fused take
+//!    kernel reads `Dict.values` once and then walks `Dict.codes` chunk-by-chunk.
+//!
+//! See the producer traits for the contract and [`build_primitive_producer`] for dispatch.
+//!
+//! ## Status
+//!
+//! v1 spike — covers primitive output for `Dict<Primitive>`, `RunEnd<Primitive>` and the
+//! fused `Dict<RunEnd<Primitive>>` stack, plus a streaming [`listview::ListChunkProducer`]
+//! over `ListView<Primitive>` rows with bit-packable offsets/sizes. The module is
+//! `_`-prefixed so it does not leak into the public API surface yet.
+
+pub mod listview;
+pub mod primitive;
+pub mod sink;
+
+mod scratch;
+
+/// Re-export of the AVX2-aware take helper, hoisted into the chunked engine namespace so
+/// out-of-crate kernels can call it without piercing private compute modules.
+pub use crate::arrays::primitive::compute::take::take_into_uninit;
+
+pub use scratch::Scratch;
+
+/// Number of elements per scratch chunk.
+///
+/// 1024 matches the fastlanes chunk size, keeps the scratch under 8 KiB for any
+/// primitive up to `u64`/`f64`, and is small enough to leave room for one fixed
+/// dictionary in L1d on every CPU we care about. Empirically larger super-chunks
+/// (tested at 4096) didn't move primitive workloads and regressed RunEnd at moderate
+/// run lengths — the per-chunk dispatch isn't the bottleneck the profile suggested.
+pub const CHUNK_LEN: usize = 1024;
+
+/// Drive a producer to completion, invoking `sink` with each emitted chunk.
+///
+/// This is the canonical helper for "decode the whole thing into a downstream buffer".
+/// The producer's scratch is supplied by the driver, so the same allocation is reused
+/// across every chunk for the lifetime of the call.
+pub fn drive_primitive<T, P, S>(
+    mut producer: P,
+    scratch: &mut Scratch<T>,
+    mut sink: S,
+) -> vortex_error::VortexResult<()>
+where
+    T: crate::dtype::NativePType,
+    P: primitive::PrimitiveChunkProducer<T>,
+    S: FnMut(&[T]),
+{
+    while let Some(chunk) = producer.next_chunk(scratch)? {
+        sink(chunk);
+    }
+    Ok(())
+}
diff --git a/vortex-array/src/_chunked_exec/primitive.rs b/vortex-array/src/_chunked_exec/primitive.rs
new file mode 100644
index 00000000000..8757592cbbc
--- /dev/null
+++ b/vortex-array/src/_chunked_exec/primitive.rs
@@ -0,0 +1,854 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Chunked decode producers that emit primitive values into a [`Scratch`] buffer.
+//!
+//! See the module-level docs in [`super`] for the model. This file holds:
+//!
+//! - The [`PrimitiveChunkProducer`] trait — the producer contract.
+//! - The [`PrimitiveChunkKernel`] trait + [`PrimitiveChunkKernelDispatcher`] — dyn-dispatch
+//!   registry of fused kernels keyed on the encoding identifier of the outermost array.
+//! - Concrete producers: [`SliceProducer`] (canonical fallback),
+//!   [`DictPrimitiveProducer`] (chunked gather over a small materialized dict — this is
+//!   also the implementation that *naturally fuses* `Dict<RunEnd<P>>` because [`DictKernel`]
+//!   materializes its values slot via the regular executor), and [`RunEndPrimitiveProducer`]
+//!   (standalone run-end streaming, built via [`build_runend_producer`] by encodings that
+//!   know how to fetch the run-end children).
+//! - [`build_primitive_producer`] — the entry point that dispatches into a registered
+//!   fused kernel or falls back to a canonical-then-stream path.
+//!
+//! For v1, the fast paths require non-nullable inputs and primitive children of the
+//! expected shape. Anything outside that envelope falls back to the canonical-then-stream
+//! producer, which is correct but not faster than the array-by-array baseline.
+
+use std::any::Any;
+use std::mem::MaybeUninit;
+use std::sync::Arc;
+
+use vortex_buffer::Buffer;
+use vortex_error::VortexResult;
+
+use super::CHUNK_LEN;
+use super::scratch::Scratch;
+use crate::ArrayRef;
+use crate::array::ArrayId;
+use crate::array::VTable;
+use crate::arrays::Dict;
+use crate::arrays::PrimitiveArray;
+use crate::arrays::dict::DictArraySlotsExt;
+use crate::arrays::primitive::PrimitiveArrayExt;
+use crate::dtype::DType;
+use crate::dtype::NativePType;
+use crate::dtype::Nullability;
+use crate::dtype::PType;
+use crate::executor::ExecutionCtx;
+use crate::match_each_integer_ptype;
+use crate::match_each_native_ptype;
+use crate::validity::Validity;
+
+/// A streaming producer of primitive chunks.
+///
+/// Implementors write up to [`CHUNK_LEN`] elements into the supplied scratch and return
+/// the initialized prefix to the driver. Returning `Ok(None)` signals end-of-stream.
+///
+/// The producer is responsible for owning anything it needs across chunk calls (e.g. a
+/// materialized dictionary). The scratch is supplied by the driver and reused across
+/// calls.
+pub trait PrimitiveChunkProducer<T: NativePType>: Send {
+    /// Decode the next chunk of values.
+    fn next_chunk<'a>(
+        &mut self,
+        scratch: &'a mut Scratch<T>,
+    ) -> VortexResult<Option<&'a [T]>>;
+
+    /// Decode the next chunk directly into `dst` (bypassing scratch). The destination must
+    /// have at least [`CHUNK_LEN`] writable cells. Returns the number of elements written,
+    /// or `None` when the producer is exhausted.
+    ///
+    /// Default implementation routes through [`Self::next_chunk`] and `copy_nonoverlapping`.
+    /// Producers that can write directly to the destination should override this — e.g.
+    /// [`DictPrimitiveProducer`] calls the AVX2 gather kernel into `dst` directly, avoiding
+    /// the scratch hop entirely.
+    fn next_chunk_into_uninit(
+        &mut self,
+        scratch: &mut Scratch<T>,
+        dst: &mut [MaybeUninit<T>],
+    ) -> VortexResult<Option<usize>> {
+        match self.next_chunk(scratch)? {
+            Some(chunk) => {
+                let n = chunk.len();
+                assert!(dst.len() >= n);
+                // SAFETY: dst capacity ≥ n; chunk is `n` initialized T values.
+                unsafe {
+                    std::ptr::copy_nonoverlapping(
+                        chunk.as_ptr(),
+                        dst.as_mut_ptr().cast::<T>(),
+                        n,
+                    );
+                }
+                Ok(Some(n))
+            }
+            None => Ok(None),
+        }
+    }
+
+    /// Number of elements not yet produced.
+    fn remaining(&self) -> usize;
+}
+
+/// Pluggable kernel that can build a fused chunk producer for an array.
+///
+/// Kernels are registered against the outermost encoding id; each kernel inspects the
+/// children at build time to decide whether it can handle the array shape. Returning
+/// `Ok(None)` means "this kernel doesn't apply"; the dispatcher tries the next kernel
+/// (or the fallback) instead.
+pub trait PrimitiveChunkKernel<T: NativePType>: Send + Sync {
+    /// Try to build a fused producer for `array`. Returning `Ok(None)` defers to the next
+    /// kernel.
+    fn build(
+        &self,
+        array: &ArrayRef,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<Box<dyn PrimitiveChunkProducer<T>>>>;
+}
+
+/// A typed entry in [`PrimitiveChunkKernelDispatcher`].
+///
+/// Boxed so the registry can hold heterogeneous `T`s; the typed lookup downcasts via
+/// `Any` to recover the strongly-typed kernel slice.
+struct TypedKernels<T: NativePType> {
+    by_encoding: rustc_hash::FxHashMap<ArrayId, Vec<Arc<dyn PrimitiveChunkKernel<T>>>>,
+}
+
+impl<T: NativePType> TypedKernels<T> {
+    fn new() -> Self {
+        Self {
+            by_encoding: rustc_hash::FxHashMap::default(),
+        }
+    }
+}
+
+/// Registry of fused chunk kernels.
+///
+/// In the long run this would hang off [`vortex_session::VortexSession`] and ride the same
+/// session-scoped lookup as the existing [`crate::optimizer::kernels::ArrayKernels`]. For
+/// the v1 spike the dispatcher is constructed per-call; see [`default_dispatcher`].
+pub struct PrimitiveChunkKernelDispatcher {
+    // Keyed by TypeId of T to support multiple primitive output types.
+    entries: rustc_hash::FxHashMap<std::any::TypeId, Box<dyn Any + Send + Sync>>,
+}
+
+impl Default for PrimitiveChunkKernelDispatcher {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl PrimitiveChunkKernelDispatcher {
+    /// Empty registry.
+    pub fn new() -> Self {
+        Self {
+            entries: rustc_hash::FxHashMap::default(),
+        }
+    }
+
+    /// Register `kernel` for outermost encoding `encoding`.
+    pub fn register<T: NativePType>(
+        &mut self,
+        encoding: ArrayId,
+        kernel: Arc<dyn PrimitiveChunkKernel<T>>,
+    ) {
+        let typed = self
+            .entries
+            .entry(std::any::TypeId::of::<T>())
+            .or_insert_with(|| Box::new(TypedKernels::<T>::new()));
+        let typed = typed
+            .downcast_mut::<TypedKernels<T>>()
+            .expect("TypedKernels<T> matches TypeId<T>");
+        typed.by_encoding.entry(encoding).or_default().push(kernel);
+    }
+
+    fn kernels_for<T: NativePType>(
+        &self,
+        encoding: ArrayId,
+    ) -> &[Arc<dyn PrimitiveChunkKernel<T>>] {
+        let Some(typed) = self.entries.get(&std::any::TypeId::of::<T>()) else {
+            return &[];
+        };
+        let typed = typed
+            .downcast_ref::<TypedKernels<T>>()
+            .expect("TypedKernels<T> matches TypeId<T>");
+        typed
+            .by_encoding
+            .get(&encoding)
+            .map(Vec::as_slice)
+            .unwrap_or(&[])
+    }
+}
+
+/// Build the v1 default dispatcher with the in-crate kernels only (Dict).
+///
+/// To register the `RunEnd` kernel (and the fused `Dict<RunEnd<P>>` path that emerges from
+/// it), call `vortex_runend::register_chunk_kernels(&mut dispatcher)` from the consumer
+/// crate. Out-of-crate encodings layer their kernels on top in the same way.
+pub fn default_dispatcher() -> PrimitiveChunkKernelDispatcher {
+    let mut d = PrimitiveChunkKernelDispatcher::new();
+    register_defaults(&mut d);
+    d
+}
+
+/// Register the in-crate v1 kernels onto `dispatcher` for every supported `T`.
+pub fn register_defaults(dispatcher: &mut PrimitiveChunkKernelDispatcher) {
+    macro_rules! register_all_for {
+        ($($T:ty),*) => {
+            $(
+                dispatcher.register::<$T>(Dict.id(), Arc::new(DictKernel::<$T>::new()));
+            )*
+        };
+    }
+    register_all_for!(u8, u16, u32, u64, i8, i16, i32, i64, f32, f64);
+}
+
+/// Build a chunk producer for `array`, dispatching to any registered fused kernel and
+/// falling back to a canonicalize-then-stream producer if none apply.
+///
+/// The caller passes a dispatcher (typically obtained from [`default_dispatcher`]). For
+/// session integration the caller would fetch the session-scoped registry, but the spike
+/// stays out of `VortexSession` for now to keep the diff small.
+pub fn build_primitive_producer<T: NativePType>(
+    array: ArrayRef,
+    dispatcher: &PrimitiveChunkKernelDispatcher,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Box<dyn PrimitiveChunkProducer<T>>> {
+    let encoding = array.encoding_id();
+    for kernel in dispatcher.kernels_for::<T>(encoding) {
+        if let Some(p) = kernel.build(&array, ctx)? {
+            return Ok(p);
+        }
+    }
+    // Fallback: canonicalize, then stream the resulting slice in fixed-size chunks.
+    let canonical = array.execute::<PrimitiveArray>(ctx)?;
+    Ok(Box::new(SliceProducer::<T>::from_primitive(canonical)?))
+}
+
+// ---------------------------------------------------------------------------------------
+// Slice producer
+// ---------------------------------------------------------------------------------------
+
+/// Walks a primitive buffer and emits fixed-size chunks via `copy_from_slice`.
+///
+/// Used as the universal fallback; also useful as a baseline in benchmarks.
+pub struct SliceProducer<T: NativePType> {
+    buffer: Buffer<T>,
+    cursor: usize,
+}
+
+impl<T: NativePType> SliceProducer<T> {
+    /// Construct a slice producer over a primitive buffer.
+    pub fn new(buffer: Buffer<T>) -> Self {
+        Self { buffer, cursor: 0 }
+    }
+
+    /// Construct a slice producer from a [`PrimitiveArray`].
+    ///
+    /// Errors if the array has a different physical type than `T`.
+    pub fn from_primitive(array: PrimitiveArray) -> VortexResult<Self> {
+        if T::PTYPE != array.ptype() {
+            vortex_error::vortex_bail!(
+                "SliceProducer<{}> cannot be built from PrimitiveArray of {}",
+                T::PTYPE,
+                array.ptype()
+            );
+        }
+        Ok(Self::new(array.into_buffer::<T>()))
+    }
+}
+
+impl<T: NativePType> PrimitiveChunkProducer<T> for SliceProducer<T> {
+    fn next_chunk<'a>(
+        &mut self,
+        scratch: &'a mut Scratch<T>,
+    ) -> VortexResult<Option<&'a [T]>> {
+        let len = self.buffer.as_slice().len();
+        if self.cursor >= len {
+            return Ok(None);
+        }
+        let take = CHUNK_LEN.min(len - self.cursor);
+        let src = &self.buffer.as_slice()[self.cursor..self.cursor + take];
+        let dst = &mut scratch.as_uninit_mut()[..take];
+        // SAFETY: `MaybeUninit<T>` has the same layout as `T`; we overwrite exactly `take`.
+        unsafe {
+            let dst_ptr = dst.as_mut_ptr().cast::<T>();
+            std::ptr::copy_nonoverlapping(src.as_ptr(), dst_ptr, take);
+        }
+        self.cursor += take;
+        // SAFETY: `take` elements were just initialized.
+        Ok(Some(unsafe {
+            std::slice::from_raw_parts(scratch.as_uninit_mut().as_ptr().cast::<T>(), take)
+        }))
+    }
+
+    fn next_chunk_into_uninit(
+        &mut self,
+        _scratch: &mut Scratch<T>,
+        dst: &mut [MaybeUninit<T>],
+    ) -> VortexResult<Option<usize>> {
+        let len = self.buffer.as_slice().len();
+        if self.cursor >= len {
+            return Ok(None);
+        }
+        let take = CHUNK_LEN.min(len - self.cursor).min(dst.len());
+        let src = &self.buffer.as_slice()[self.cursor..self.cursor + take];
+        // SAFETY: dst has at least `take` cells; src is `take` initialized T values.
+        unsafe {
+            std::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr().cast::<T>(), take);
+        }
+        self.cursor += take;
+        Ok(Some(take))
+    }
+
+    fn remaining(&self) -> usize {
+        self.buffer.as_slice().len().saturating_sub(self.cursor)
+    }
+}
+
+// ---------------------------------------------------------------------------------------
+// Dict<Primitive> producer
+// ---------------------------------------------------------------------------------------
+
+/// Stream a `Dict<Primitive>` by chunked AVX2 gather over the codes into a
+/// pre-materialized values buffer.
+///
+/// The values are loaded once into a [`Buffer<T>`] (the "dictionary"); each chunk picks
+/// up to [`CHUNK_LEN`] codes and gathers values into the scratch via
+/// [`take_into_uninit`], which selects the AVX2 kernel when available and falls back
+/// to scalar. The dictionary is expected to fit in L1d (otherwise this encoding
+/// shouldn't have been chosen).
+pub struct DictPrimitiveProducer<T: NativePType, I: NativePType> {
+    dict: Buffer<T>,
+    codes: Buffer<I>,
+    cursor: usize,
+}
+
+impl<T: NativePType, I: NativePType> DictPrimitiveProducer<T, I> {
+    /// Construct directly from raw dict and code buffers.
+    pub fn new(dict: Buffer<T>, codes: Buffer<I>) -> Self {
+        Self {
+            dict,
+            codes,
+            cursor: 0,
+        }
+    }
+}
+
+impl<T: NativePType, I> PrimitiveChunkProducer<T> for DictPrimitiveProducer<T, I>
+where
+    I: crate::dtype::UnsignedPType,
+{
+    fn next_chunk<'a>(
+        &mut self,
+        scratch: &'a mut Scratch<T>,
+    ) -> VortexResult<Option<&'a [T]>> {
+        let total = self.codes.as_slice().len();
+        if self.cursor >= total {
+            return Ok(None);
+        }
+        let take = CHUNK_LEN.min(total - self.cursor);
+        let codes_chunk = &self.codes.as_slice()[self.cursor..self.cursor + take];
+        let dst: &mut [MaybeUninit<T>] = &mut scratch.as_uninit_mut()[..take];
+
+        crate::arrays::primitive::compute::take::take_into_uninit::<T, I>(
+            self.dict.as_slice(),
+            codes_chunk,
+            dst,
+        );
+
+        self.cursor += take;
+        // SAFETY: `take` elements just written by take_into_uninit.
+        let ptr = scratch.as_uninit_mut().as_ptr().cast::<T>();
+        Ok(Some(unsafe { std::slice::from_raw_parts(ptr, take) }))
+    }
+
+    fn next_chunk_into_uninit(
+        &mut self,
+        _scratch: &mut Scratch<T>,
+        dst: &mut [MaybeUninit<T>],
+    ) -> VortexResult<Option<usize>> {
+        let total = self.codes.as_slice().len();
+        if self.cursor >= total {
+            return Ok(None);
+        }
+        let take = CHUNK_LEN.min(total - self.cursor);
+        let codes_chunk = &self.codes.as_slice()[self.cursor..self.cursor + take];
+        crate::arrays::primitive::compute::take::take_into_uninit::<T, I>(
+            self.dict.as_slice(),
+            codes_chunk,
+            &mut dst[..take],
+        );
+        self.cursor += take;
+        Ok(Some(take))
+    }
+
+    fn remaining(&self) -> usize {
+        self.codes.as_slice().len().saturating_sub(self.cursor)
+    }
+}
+
+/// The `Dict<Primitive>` kernel — and, when values are `RunEnd<Primitive>`, the fused
+/// `Dict<RunEnd<Primitive>>` kernel. Both compose to a [`DictPrimitiveProducer`] after
+/// any necessary up-front materialization of the (bounded) values slot.
+pub struct DictKernel<T: NativePType> {
+    _marker: std::marker::PhantomData<fn() -> T>,
+}
+
+impl<T: NativePType> DictKernel<T> {
+    pub fn new() -> Self {
+        Self {
+            _marker: std::marker::PhantomData,
+        }
+    }
+}
+
+impl<T: NativePType> Default for DictKernel<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<T: NativePType> PrimitiveChunkKernel<T> for DictKernel<T> {
+    fn build(
+        &self,
+        array: &ArrayRef,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<Box<dyn PrimitiveChunkProducer<T>>>> {
+        let Some(dict) = array.as_opt::<Dict>() else {
+            return Ok(None);
+        };
+        if !matches!(array.dtype().nullability(), Nullability::NonNullable) {
+            return Ok(None);
+        }
+        let codes = dict.codes();
+        let values = dict.values();
+        if !matches!(codes.dtype().nullability(), Nullability::NonNullable)
+            || !matches!(values.dtype().nullability(), Nullability::NonNullable)
+        {
+            return Ok(None);
+        }
+        let DType::Primitive(values_ptype, _) = *values.dtype() else {
+            return Ok(None);
+        };
+        if values_ptype != T::PTYPE {
+            return Ok(None);
+        }
+
+        // Materialize the values slot to a primitive buffer of length `dict_len`.
+        // Dictionary sizes are bounded by definition; this is the "up-front" step that
+        // makes the streaming chunked gather possible. For `Dict<RunEnd<P>>` this is
+        // also where the RunEnd is unrolled into a flat dict, so the produced kernel
+        // is the *fused* implementation.
+        let values_canonical = values.clone().execute::<PrimitiveArray>(ctx)?;
+        let dict_buf: Buffer<T> = values_canonical.into_buffer::<T>();
+
+        let codes_canonical = codes.clone().execute::<PrimitiveArray>(ctx)?;
+        let codes_ptype = codes_canonical.ptype();
+        // Fast path only fires for unsigned codes (AVX2 gather requires unsigned indices).
+        // For signed codes, defer to the canonical executor which handles the cast.
+        if !codes_ptype.is_unsigned_int() {
+            return Ok(None);
+        }
+        Ok(Some(dispatch_dict_producer::<T>(
+            dict_buf,
+            codes_canonical,
+            codes_ptype,
+        )))
+    }
+}
+
+fn dispatch_dict_producer<T: NativePType>(
+    dict: Buffer<T>,
+    codes: PrimitiveArray,
+    codes_ptype: PType,
+) -> Box<dyn PrimitiveChunkProducer<T>> {
+    crate::match_each_unsigned_integer_ptype!(codes_ptype, |I| {
+        Box::new(DictPrimitiveProducer::<T, I>::new(
+            dict,
+            codes.into_buffer::<I>(),
+        ))
+    })
+}
+
+/// Stream a `RunEnd<Primitive>` by walking the run-ends and replicating each run value
+/// into chunk-sized strides.
+pub struct RunEndPrimitiveProducer<T: NativePType, E: NativePType> {
+    values: Buffer<T>,
+    ends: Buffer<E>,
+    /// Logical position in the decoded array (post-offset).
+    cursor: usize,
+    /// Total logical length to produce.
+    len: usize,
+    /// The slicing offset to apply to `cursor` before searching ends.
+    offset: usize,
+    /// Cached run index for `cursor`, so we avoid re-searching across chunk boundaries.
+    run: usize,
+}
+
+impl<T: NativePType, E: NativePType> RunEndPrimitiveProducer<T, E>
+where
+    E: num_traits::AsPrimitive<usize>,
+{
+    /// Construct from value + end buffers plus a logical (offset, len) window.
+    pub fn new(
+        values: Buffer<T>,
+        ends: Buffer<E>,
+        offset: usize,
+        len: usize,
+    ) -> VortexResult<Self> {
+        let run = find_run(ends.as_slice(), offset);
+        Ok(Self {
+            values,
+            ends,
+            cursor: 0,
+            len,
+            offset,
+            run,
+        })
+    }
+}
+
+impl<T: NativePType, E: NativePType> RunEndPrimitiveProducer<T, E>
+where
+    E: num_traits::AsPrimitive<usize>,
+{
+    /// Fill `dst_ptr[..take]` with the next `take` run-end-decoded values and advance the
+    /// cursor. The caller has already capped `take` to the remaining length and the dst
+    /// capacity; this function performs only writes.
+    ///
+    /// # Safety
+    ///
+    /// `dst_ptr` must be writable for at least `take` elements.
+    unsafe fn fill(&mut self, take: usize, dst_ptr: *mut T) {
+        let ends = self.ends.as_slice();
+        let values = self.values.as_slice();
+        let mut written = 0usize;
+        let chunk_end_encoded = self.cursor + take + self.offset;
+        while written < take {
+            let run_end_encoded: usize = ends[self.run].as_();
+            let pos_encoded = self.cursor + written + self.offset;
+            let in_run = run_end_encoded.min(chunk_end_encoded) - pos_encoded;
+            let value = values[self.run];
+            // SAFETY: caller guarantees dst_ptr[..take] is writable; written + in_run ≤ take.
+            unsafe {
+                let slot = std::slice::from_raw_parts_mut(dst_ptr.add(written), in_run);
+                for s in slot.iter_mut() {
+                    *s = value;
+                }
+            }
+            written += in_run;
+            if run_end_encoded <= chunk_end_encoded && written < take {
+                self.run += 1;
+            } else if run_end_encoded == chunk_end_encoded {
+                self.run += 1;
+            }
+        }
+        self.cursor += take;
+    }
+}
+
+impl<T: NativePType, E: NativePType> PrimitiveChunkProducer<T>
+    for RunEndPrimitiveProducer<T, E>
+where
+    E: num_traits::AsPrimitive<usize>,
+{
+    fn next_chunk<'a>(
+        &mut self,
+        scratch: &'a mut Scratch<T>,
+    ) -> VortexResult<Option<&'a [T]>> {
+        if self.cursor >= self.len {
+            return Ok(None);
+        }
+        let take = CHUNK_LEN.min(self.len - self.cursor);
+        let dst_ptr = scratch.as_uninit_mut().as_mut_ptr().cast::<T>();
+        // SAFETY: scratch has CHUNK_LEN cells; take ≤ CHUNK_LEN.
+        unsafe { self.fill(take, dst_ptr) };
+        // SAFETY: `take` elements written.
+        Ok(Some(unsafe { std::slice::from_raw_parts(dst_ptr, take) }))
+    }
+
+    fn next_chunk_into_uninit(
+        &mut self,
+        _scratch: &mut Scratch<T>,
+        dst: &mut [MaybeUninit<T>],
+    ) -> VortexResult<Option<usize>> {
+        if self.cursor >= self.len {
+            return Ok(None);
+        }
+        let take = CHUNK_LEN.min(self.len - self.cursor).min(dst.len());
+        // SAFETY: dst has at least `take` cells; fill initializes them all.
+        unsafe { self.fill(take, dst.as_mut_ptr().cast::<T>()) };
+        Ok(Some(take))
+    }
+
+    fn remaining(&self) -> usize {
+        self.len.saturating_sub(self.cursor)
+    }
+}
+
+fn find_run<E>(ends: &[E], target_encoded: usize) -> usize
+where
+    E: NativePType + num_traits::AsPrimitive<usize>,
+{
+    // Right binary-search: the run index is the first end > target_encoded.
+    let (mut lo, mut hi) = (0usize, ends.len());
+    while lo < hi {
+        let mid = (lo + hi) / 2;
+        let end: usize = ends[mid].as_();
+        if end <= target_encoded {
+            lo = mid + 1;
+        } else {
+            hi = mid;
+        }
+    }
+    lo.min(ends.len().saturating_sub(1))
+}
+
+/// Helper used by encoding crates: build a [`RunEndPrimitiveProducer`] from the canonicalized
+/// values + ends buffers and a logical `(offset, len)` window. Dispatches on `ends_ptype`.
+pub fn build_runend_producer<T: NativePType>(
+    values: PrimitiveArray,
+    ends: PrimitiveArray,
+    offset: usize,
+    len: usize,
+) -> VortexResult<Box<dyn PrimitiveChunkProducer<T>>> {
+    let ends_ptype = ends.ptype();
+    let values_buf = values.into_buffer::<T>();
+    Ok(match_each_integer_ptype!(ends_ptype, |E| {
+        Box::new(RunEndPrimitiveProducer::<T, E>::new(
+            values_buf,
+            ends.into_buffer::<E>(),
+            offset,
+            len,
+        )?)
+    }))
+}
+
+// ---------------------------------------------------------------------------------------
+// Patched producer — overlay sorted patches chunk-locally over any inner producer
+// ---------------------------------------------------------------------------------------
+
+/// Wraps any inner [`PrimitiveChunkProducer`] (the base, e.g. a bit-packed primitive) and
+/// overlays a sorted list of `(index, value)` patches chunk-by-chunk.
+///
+/// This is the chunked answer to "decode the base fully, then scatter exceptions into the
+/// whole buffer". Because patch indices are sorted and chunks are emitted in order, the
+/// overlay is a monotonic merge-walk: for each base chunk the producer applies only the
+/// patches that fall inside that chunk's logical range, writing them while the chunk is
+/// still resident in the scratch (and therefore in L1). The canonical path scatters
+/// patches into the full N-element output, so each write can miss a cold cache line once
+/// N spills L2/L3.
+///
+/// Patch indices are logical positions in the *decoded* array (post-offset).
+pub struct PatchedProducer<T: NativePType> {
+    inner: Box<dyn PrimitiveChunkProducer<T>>,
+    inner_scratch: Scratch<T>,
+    patch_indices: Buffer<u32>,
+    patch_values: Buffer<T>,
+    patch_cursor: usize,
+    pos: usize,
+}
+
+impl<T: NativePType> PatchedProducer<T> {
+    /// Build from an inner producer plus sorted patch indices/values.
+    ///
+    /// `patch_indices` must be sorted ascending and in-range for the inner producer's
+    /// logical length.
+    pub fn new(
+        inner: Box<dyn PrimitiveChunkProducer<T>>,
+        patch_indices: Buffer<u32>,
+        patch_values: Buffer<T>,
+    ) -> Self {
+        Self {
+            inner,
+            inner_scratch: Scratch::<T>::new(),
+            patch_indices,
+            patch_values,
+            patch_cursor: 0,
+            pos: 0,
+        }
+    }
+}
+
+impl<T: NativePType> PrimitiveChunkProducer<T> for PatchedProducer<T> {
+    fn next_chunk<'a>(
+        &mut self,
+        scratch: &'a mut Scratch<T>,
+    ) -> VortexResult<Option<&'a [T]>> {
+        // Decode the base chunk into the driver's scratch directly.
+        let n = match self
+            .inner
+            .next_chunk_into_uninit(&mut self.inner_scratch, scratch.as_uninit_mut())?
+        {
+            Some(n) => n,
+            None => return Ok(None),
+        };
+        let chunk_end = self.pos + n;
+        let ptr = scratch.as_uninit_mut().as_mut_ptr().cast::<T>();
+
+        // Overlay the patches that fall within [pos, chunk_end) — they are sorted, so we
+        // advance the cursor monotonically.
+        let idxs = self.patch_indices.as_slice();
+        let vals = self.patch_values.as_slice();
+        while self.patch_cursor < idxs.len() {
+            let gidx = idxs[self.patch_cursor] as usize;
+            if gidx >= chunk_end {
+                break;
+            }
+            // SAFETY: gidx ∈ [pos, chunk_end); chunk_end - pos = n ≤ CHUNK_LEN; the scratch
+            // cell at gidx - pos was just initialised by the base decode.
+            unsafe { ptr.add(gidx - self.pos).write(vals[self.patch_cursor]) };
+            self.patch_cursor += 1;
+        }
+        self.pos = chunk_end;
+        // SAFETY: scratch[..n] is fully initialised (base decode + patch overlay).
+        Ok(Some(unsafe { std::slice::from_raw_parts(ptr, n) }))
+    }
+
+    fn remaining(&self) -> usize {
+        self.inner.remaining()
+    }
+}
+
+// ---------------------------------------------------------------------------------------
+// Convenience: decode an encoded ArrayRef into a fresh `Buffer<T>` using the chunked path.
+// ---------------------------------------------------------------------------------------
+
+/// Decode `array` to a fresh `Buffer<T>` by driving the chunked producer to completion.
+///
+/// This is the helper used by benchmarks and tests to materialize the chunked output into
+/// a comparable form against the existing executor.
+pub fn decode_to_buffer<T: NativePType>(
+    array: ArrayRef,
+    dispatcher: &PrimitiveChunkKernelDispatcher,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Buffer<T>> {
+    let len = array.len();
+    let mut out = vortex_buffer::BufferMut::<T>::with_capacity(len);
+    debug_assert!(out.spare_capacity_mut().len() >= len);
+    let mut producer = build_primitive_producer::<T>(array, dispatcher, ctx)?;
+    let mut scratch = Scratch::<T>::new();
+    let mut written = 0usize;
+    loop {
+        // Slice the spare-capacity head for this chunk so the producer can write directly.
+        // SAFETY: spare_capacity_mut().len() >= len - written, and we hand the producer at most
+        // CHUNK_LEN cells (it will respect that and report what it wrote).
+        let dst = unsafe {
+            let cap_ptr = out.spare_capacity_mut().as_mut_ptr().add(written);
+            let cap_len = len - written;
+            std::slice::from_raw_parts_mut(
+                cap_ptr.cast::<MaybeUninit<T>>(),
+                cap_len.min(CHUNK_LEN),
+            )
+        };
+        match producer.next_chunk_into_uninit(&mut scratch, dst)? {
+            Some(n) => written += n,
+            None => break,
+        }
+    }
+    // SAFETY: we wrote exactly `written` elements into the spare capacity.
+    unsafe {
+        out.set_len(written);
+    }
+    Ok(out.freeze())
+}
+
+// ---------------------------------------------------------------------------------------
+// Decode into a builder, for use by the existing builder execution path.
+// ---------------------------------------------------------------------------------------
+
+/// Decode `array` to a [`PrimitiveArray`] (non-nullable) via the chunked engine, choosing
+/// the right concrete output type by inspecting the array's dtype.
+///
+/// Useful as a one-shot replacement for `array.execute::<PrimitiveArray>(ctx)` in code
+/// paths where a non-nullable primitive output is expected and the chunked engine has a
+/// fused kernel registered.
+pub fn execute_to_primitive(
+    array: ArrayRef,
+    dispatcher: &PrimitiveChunkKernelDispatcher,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<PrimitiveArray> {
+    let DType::Primitive(ptype, nullability) = *array.dtype() else {
+        vortex_error::vortex_bail!(
+            "execute_to_primitive requires Primitive dtype, got {}",
+            array.dtype()
+        );
+    };
+    if !matches!(nullability, Nullability::NonNullable) {
+        // For now, fall back to the existing executor for nullable arrays.
+        return array.execute::<PrimitiveArray>(ctx);
+    }
+    Ok(match_each_native_ptype!(ptype, |T| {
+        let buf = decode_to_buffer::<T>(array, dispatcher, ctx)?;
+        PrimitiveArray::new(buf, Validity::NonNullable)
+    }))
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+
+    use super::*;
+    use crate::IntoArray;
+    use crate::LEGACY_SESSION;
+    use crate::VortexSessionExecute;
+    use crate::arrays::DictArray;
+
+    fn ctx() -> ExecutionCtx {
+        LEGACY_SESSION.create_execution_ctx()
+    }
+
+    #[test]
+    fn slice_producer_round_trip() -> VortexResult<()> {
+        let data = buffer![1i32, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+        let mut p = SliceProducer::<i32>::new(data.clone());
+        let mut s = Scratch::<i32>::new();
+        let mut out = Vec::new();
+        while let Some(c) = p.next_chunk(&mut s)? {
+            out.extend_from_slice(c);
+        }
+        assert_eq!(out, data.as_slice());
+        Ok(())
+    }
+
+    #[test]
+    fn dict_primitive_chunked() -> VortexResult<()> {
+        // dict-encoded i32 of 4096 elements, dict size 17.
+        let dict_values =
+            buffer![10i32, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170];
+        let codes: Vec<u8> = (0..4096).map(|i| (i % 17) as u8).collect();
+        let codes_arr = PrimitiveArray::new(
+            Buffer::<u8>::from_iter(codes.iter().copied()),
+            Validity::NonNullable,
+        );
+        let values_arr = PrimitiveArray::new(dict_values.clone(), Validity::NonNullable);
+        let dict = DictArray::try_new(codes_arr.into_array(), values_arr.into_array())?;
+
+        let dispatcher = default_dispatcher();
+        let result = decode_to_buffer::<i32>(dict.into_array(), &dispatcher, &mut ctx())?;
+        let expected: Vec<i32> = codes
+            .iter()
+            .map(|c| dict_values.as_slice()[*c as usize])
+            .collect();
+        assert_eq!(result.as_slice(), expected.as_slice());
+        Ok(())
+    }
+
+    #[test]
+    fn fallback_to_canonicalize() -> VortexResult<()> {
+        // Plain primitive — no kernel registered; should go through SliceProducer fallback.
+        let p = PrimitiveArray::new(buffer![1i32, 2, 3], Validity::NonNullable);
+        let dispatcher = default_dispatcher();
+        let buf = decode_to_buffer::<i32>(p.into_array(), &dispatcher, &mut ctx())?;
+        assert_eq!(buf.as_slice(), &[1, 2, 3]);
+        Ok(())
+    }
+}
diff --git a/vortex-array/src/_chunked_exec/scratch.rs b/vortex-array/src/_chunked_exec/scratch.rs
new file mode 100644
index 00000000000..67c4b627bb5
--- /dev/null
+++ b/vortex-array/src/_chunked_exec/scratch.rs
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::mem::MaybeUninit;
+
+use super::CHUNK_LEN;
+
+/// A reusable L1-resident scratch buffer of [`CHUNK_LEN`] elements.
+///
+/// The scratch is heap-allocated so it does not bloat caller stack frames. It is uninit by
+/// default and is written to by each producer chunk; readers only ever see the initialized
+/// prefix returned alongside.
+pub struct Scratch<T> {
+    buf: Box<[MaybeUninit<T>; CHUNK_LEN]>,
+}
+
+impl<T> Scratch<T> {
+    /// Construct a new uninitialized scratch buffer.
+    pub fn new() -> Self {
+        let buf: Box<[MaybeUninit<T>; CHUNK_LEN]> =
+            Box::new([const { MaybeUninit::<T>::uninit() }; CHUNK_LEN]);
+        Self { buf }
+    }
+
+    /// Capacity in elements.
+    #[inline]
+    pub fn capacity(&self) -> usize {
+        CHUNK_LEN
+    }
+
+    /// Borrow the underlying storage as a slice of uninitialized cells.
+    ///
+    /// Producers write into this slice and return the initialized prefix to the driver.
+    #[inline]
+    pub fn as_uninit_mut(&mut self) -> &mut [MaybeUninit<T>; CHUNK_LEN] {
+        &mut self.buf
+    }
+}
+
+impl<T> Default for Scratch<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/vortex-array/src/_chunked_exec/sink.rs b/vortex-array/src/_chunked_exec/sink.rs
new file mode 100644
index 00000000000..4f59dee1897
--- /dev/null
+++ b/vortex-array/src/_chunked_exec/sink.rs
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Sinks for fused chunked execution.
+//!
+//! A sink consumes [`crate::_chunked_exec::primitive::PrimitiveChunkProducer`] chunks one
+//! at a time and produces a final result. Combining a producer with a sink fuses decode
+//! and the downstream operator into a single pass, with no `Buffer<T>` materialised
+//! between them. The producer's L1-resident scratch flows directly into the operator's
+//! register-resident accumulators / output buffer.
+//!
+//! See [`drive_into_sink`] for the canonical driver.
+//!
+//! ## Sinks shipped here
+//!
+//! - [`BufferSink`]: equivalent to `decode_to_buffer`. Collects chunks into a `Buffer<T>`.
+//!   Useful as the "no fusion" baseline against which other sinks compare.
+//! - [`SumSink`]: accumulates `sum(x) :: i64` across chunks. No output buffer at all.
+//! - [`MapSink`]: applies a per-element `FnMut(T) -> U`, writing into a `Buffer<U>`. Used
+//!   to express casts (`T as U`) and unary scalar functions (`x + c`, `x * c`, …) without
+//!   materialising the source as a `Buffer<T>` first.
+//! - [`FilterSink`]: applies a per-element predicate; surviving elements stream into a
+//!   `Buffer<T>`. The mask is never materialised — selectivity is encoded directly in
+//!   the output length.
+
+use std::marker::PhantomData;
+
+use vortex_buffer::Buffer;
+use vortex_buffer::BufferMut;
+use vortex_error::VortexResult;
+
+use super::Scratch;
+use super::primitive::PrimitiveChunkKernelDispatcher;
+use super::primitive::build_primitive_producer;
+use crate::ArrayRef;
+use crate::dtype::NativePType;
+use crate::executor::ExecutionCtx;
+
+/// Consumes chunks of decoded primitive values and produces a final result.
+///
+/// Implementors should treat each `push` call as "process this chunk in-place" — the
+/// slice borrowed by `chunk` is only valid until the next `push` call, since it points
+/// into the producer's scratch buffer.
+pub trait PrimitiveChunkSink<T: NativePType> {
+    /// The value produced once all chunks have been consumed.
+    type Output;
+
+    /// Process one chunk. Implementors typically aggregate, transform, or selectively
+    /// store into an owned output buffer.
+    fn push(&mut self, chunk: &[T]) -> VortexResult<()>;
+
+    /// Finalise the sink and produce the result. Consumes the sink.
+    fn finish(self) -> VortexResult<Self::Output>;
+}
+
+/// Drive a chunked producer to completion, feeding every chunk into `sink`.
+///
+/// This is the fused-pipeline entry point: the producer never materialises a full
+/// `Buffer<T>`, the sink decides what to do with each chunk, and the only memory traffic
+/// outside the input bytes is whatever the sink chooses to keep.
+pub fn drive_into_sink<T, S>(
+    array: ArrayRef,
+    dispatcher: &PrimitiveChunkKernelDispatcher,
+    mut sink: S,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<S::Output>
+where
+    T: NativePType,
+    S: PrimitiveChunkSink<T>,
+{
+    let mut producer = build_primitive_producer::<T>(array, dispatcher, ctx)?;
+    let mut scratch = Scratch::<T>::new();
+    while let Some(chunk) = producer.next_chunk(&mut scratch)? {
+        sink.push(chunk)?;
+    }
+    sink.finish()
+}
+
+// ----------------------------------------------------------------------------------
+// BufferSink — the "no fusion" baseline. Collects everything into a Buffer<T>.
+// ----------------------------------------------------------------------------------
+
+/// Collects chunks into a flat [`Buffer<T>`]. Equivalent to the old `decode_to_buffer`,
+/// reformulated as a sink for use with [`drive_into_sink`].
+pub struct BufferSink<T: NativePType> {
+    out: BufferMut<T>,
+}
+
+impl<T: NativePType> BufferSink<T> {
+    /// Construct with a pre-allocated capacity of `len` elements.
+    pub fn with_capacity(len: usize) -> Self {
+        Self {
+            out: BufferMut::<T>::with_capacity(len),
+        }
+    }
+}
+
+impl<T: NativePType> PrimitiveChunkSink<T> for BufferSink<T> {
+    type Output = Buffer<T>;
+
+    fn push(&mut self, chunk: &[T]) -> VortexResult<()> {
+        // SAFETY: BufferMut maintains capacity invariant; we never exceed its allocation.
+        // The caller pre-allocated with `with_capacity(array.len())`.
+        let n = chunk.len();
+        let written = self.out.as_slice().len();
+        unsafe {
+            let dst = self
+                .out
+                .spare_capacity_mut()
+                .as_mut_ptr()
+                .add(0)
+                .cast::<T>();
+            std::ptr::copy_nonoverlapping(chunk.as_ptr(), dst, n);
+            self.out.set_len(written + n);
+        }
+        Ok(())
+    }
+
+    fn finish(self) -> VortexResult<Self::Output> {
+        Ok(self.out.freeze())
+    }
+}
+
+// ----------------------------------------------------------------------------------
+// SumSink — proof of concept. No output buffer; just an i64 accumulator.
+// ----------------------------------------------------------------------------------
+
+/// Accumulates `sum(chunk_i) as i64` across all chunks. Produces a scalar.
+///
+/// Useful as the smallest possible demonstration of operator fusion: the only memory
+/// touched outside the input bytes is the producer's 4 KiB scratch.
+pub struct SumSink<T: NativePType> {
+    acc: i64,
+    _marker: PhantomData<fn() -> T>,
+}
+
+impl<T: NativePType> SumSink<T> {
+    /// New sink, zero-initialised.
+    pub fn new() -> Self {
+        Self {
+            acc: 0,
+            _marker: PhantomData,
+        }
+    }
+}
+
+impl<T: NativePType> Default for SumSink<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<T> PrimitiveChunkSink<T> for SumSink<T>
+where
+    T: NativePType + num_traits::AsPrimitive<i64>,
+{
+    type Output = i64;
+
+    fn push(&mut self, chunk: &[T]) -> VortexResult<()> {
+        let mut acc = self.acc;
+        for &v in chunk {
+            acc = acc.wrapping_add(v.as_());
+        }
+        self.acc = acc;
+        Ok(())
+    }
+
+    fn finish(self) -> VortexResult<Self::Output> {
+        Ok(self.acc)
+    }
+}
+
+// ----------------------------------------------------------------------------------
+// MapSink — generic per-element transform. Used for cast + scalar funcs.
+// ----------------------------------------------------------------------------------
+
+/// Applies a per-element `FnMut(T) -> U` and writes the result to a [`Buffer<U>`].
+///
+/// Casts (e.g. `T as U`) and unary scalar functions (`x + c`, `x * c`, `x.abs()`, …) are
+/// the same shape — one element in, one element out. The fused win over canonical is that
+/// the source array is never materialised as a `Buffer<T>` between decode and map.
+pub struct MapSink<T, U, F>
+where
+    T: NativePType,
+    U: NativePType,
+    F: FnMut(T) -> U,
+{
+    f: F,
+    out: BufferMut<U>,
+    _marker: PhantomData<fn(T)>,
+}
+
+impl<T, U, F> MapSink<T, U, F>
+where
+    T: NativePType,
+    U: NativePType,
+    F: FnMut(T) -> U,
+{
+    /// Construct with a pre-allocated output capacity of `len` elements.
+    pub fn with_capacity(len: usize, f: F) -> Self {
+        Self {
+            f,
+            out: BufferMut::<U>::with_capacity(len),
+            _marker: PhantomData,
+        }
+    }
+}
+
+impl<T, U, F> PrimitiveChunkSink<T> for MapSink<T, U, F>
+where
+    T: NativePType,
+    U: NativePType,
+    F: FnMut(T) -> U,
+{
+    type Output = Buffer<U>;
+
+    fn push(&mut self, chunk: &[T]) -> VortexResult<()> {
+        let written = self.out.as_slice().len();
+        let n = chunk.len();
+        // SAFETY: caller-pre-allocated capacity ≥ total array length; we never exceed it.
+        unsafe {
+            let dst = self.out.spare_capacity_mut().as_mut_ptr().cast::<U>();
+            for (i, &v) in chunk.iter().enumerate() {
+                dst.add(i).write((self.f)(v));
+            }
+            self.out.set_len(written + n);
+        }
+        Ok(())
+    }
+
+    fn finish(self) -> VortexResult<Self::Output> {
+        Ok(self.out.freeze())
+    }
+}
+
+// ----------------------------------------------------------------------------------
+// FilterSink — per-element predicate, surviving elements stream to a Buffer<T>.
+// ----------------------------------------------------------------------------------
+
+/// Applies a per-element predicate. Surviving elements stream directly into the output
+/// buffer — the boolean mask is never materialised.
+///
+/// Canonical two-pass `decode → mask → filter` becomes a single pass that touches the
+/// compressed input once and writes only the elements that survive.
+pub struct FilterSink<T, P>
+where
+    T: NativePType,
+    P: FnMut(T) -> bool,
+{
+    pred: P,
+    out: BufferMut<T>,
+}
+
+impl<T, P> FilterSink<T, P>
+where
+    T: NativePType,
+    P: FnMut(T) -> bool,
+{
+    /// Construct with a pre-allocated output capacity of `len` elements (the worst-case
+    /// upper bound when every element passes the predicate).
+    pub fn with_capacity(len: usize, pred: P) -> Self {
+        Self {
+            pred,
+            out: BufferMut::<T>::with_capacity(len),
+        }
+    }
+}
+
+impl<T, P> PrimitiveChunkSink<T> for FilterSink<T, P>
+where
+    T: NativePType,
+    P: FnMut(T) -> bool,
+{
+    type Output = Buffer<T>;
+
+    fn push(&mut self, chunk: &[T]) -> VortexResult<()> {
+        // SAFETY: BufferMut::with_capacity guaranteed at construction; never push past it.
+        let mut written = self.out.as_slice().len();
+        let dst = self.out.spare_capacity_mut();
+        let dst_ptr = dst.as_mut_ptr().cast::<T>();
+        // Track how many cells we've written into the spare-capacity region this call.
+        let mut local = 0usize;
+        for &v in chunk {
+            if (self.pred)(v) {
+                // SAFETY: total written ≤ pre-allocated capacity.
+                unsafe { dst_ptr.add(local).write(v) };
+                local += 1;
+            }
+        }
+        written += local;
+        // SAFETY: just wrote exactly `local` elements past the previous len.
+        unsafe { self.out.set_len(written) };
+        Ok(())
+    }
+
+    fn finish(self) -> VortexResult<Self::Output> {
+        Ok(self.out.freeze())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+
+    use super::*;
+    use crate::IntoArray;
+    use crate::LEGACY_SESSION;
+    use crate::VortexSessionExecute;
+    use crate::_chunked_exec::primitive::default_dispatcher;
+    use crate::arrays::DictArray;
+    use crate::arrays::PrimitiveArray;
+    use crate::validity::Validity;
+
+    fn ctx() -> ExecutionCtx {
+        LEGACY_SESSION.create_execution_ctx()
+    }
+
+    fn make_dict_i32(codes: &[u32], dict: &[i32]) -> ArrayRef {
+        let dict_arr = PrimitiveArray::new(
+            Buffer::<i32>::from_iter(dict.iter().copied()),
+            Validity::NonNullable,
+        );
+        let codes_arr = PrimitiveArray::new(
+            Buffer::<u32>::from_iter(codes.iter().copied()),
+            Validity::NonNullable,
+        );
+        DictArray::try_new(codes_arr.into_array(), dict_arr.into_array())
+            .unwrap()
+            .into_array()
+    }
+
+    #[test]
+    fn buffer_sink_round_trip() -> VortexResult<()> {
+        let array = make_dict_i32(&[0, 1, 0, 2, 1], &[10, 20, 30]);
+        let dispatcher = default_dispatcher();
+        let buf: Buffer<i32> = drive_into_sink(
+            array,
+            &dispatcher,
+            BufferSink::<i32>::with_capacity(5),
+            &mut ctx(),
+        )?;
+        assert_eq!(buf.as_slice(), &[10, 20, 10, 30, 20]);
+        Ok(())
+    }
+
+    #[test]
+    fn sum_sink_fused() -> VortexResult<()> {
+        let array = make_dict_i32(&[0, 1, 0, 2, 1, 2, 2, 0], &[10, 20, 30]);
+        // Expected: 10+20+10+30+20+30+30+10 = 160.
+        let dispatcher = default_dispatcher();
+        let s: i64 = drive_into_sink(array, &dispatcher, SumSink::<i32>::new(), &mut ctx())?;
+        assert_eq!(s, 160);
+        Ok(())
+    }
+
+    #[test]
+    fn map_sink_cast_i32_to_i64() -> VortexResult<()> {
+        let array = make_dict_i32(&[0, 1, 2], &[100, 200, 300]);
+        let dispatcher = default_dispatcher();
+        let buf: Buffer<i64> = drive_into_sink(
+            array,
+            &dispatcher,
+            MapSink::<i32, i64, _>::with_capacity(3, |x| x as i64),
+            &mut ctx(),
+        )?;
+        assert_eq!(buf.as_slice(), &[100i64, 200, 300]);
+        Ok(())
+    }
+
+    #[test]
+    fn map_sink_scalar_add() -> VortexResult<()> {
+        let array = make_dict_i32(&[0, 1, 0, 2], &[1, 2, 3]);
+        let dispatcher = default_dispatcher();
+        let buf: Buffer<i32> = drive_into_sink(
+            array,
+            &dispatcher,
+            MapSink::<i32, i32, _>::with_capacity(4, |x| x + 100),
+            &mut ctx(),
+        )?;
+        assert_eq!(buf.as_slice(), &[101, 102, 101, 103]);
+        Ok(())
+    }
+
+    #[test]
+    fn filter_sink_keeps_surviving() -> VortexResult<()> {
+        let array = make_dict_i32(&[0, 1, 2, 1, 0, 2], &[5, 15, 25]);
+        // Source values: [5, 15, 25, 15, 5, 25]. Predicate: > 10.
+        let dispatcher = default_dispatcher();
+        let buf: Buffer<i32> = drive_into_sink(
+            array,
+            &dispatcher,
+            FilterSink::<i32, _>::with_capacity(6, |x| x > 10),
+            &mut ctx(),
+        )?;
+        assert_eq!(buf.as_slice(), &[15, 25, 15, 25]);
+        Ok(())
+    }
+
+    #[test]
+    fn sink_works_on_canonical_primitive() -> VortexResult<()> {
+        // Sanity check: even plain PrimitiveArray flows through the SliceProducer
+        // fallback path and into the sink.
+        let p = PrimitiveArray::new(buffer![1i32, 2, 3, 4, 5], Validity::NonNullable);
+        let dispatcher = default_dispatcher();
+        let s = drive_into_sink(p.into_array(), &dispatcher, SumSink::<i32>::new(), &mut ctx())?;
+        assert_eq!(s, 15);
+        Ok(())
+    }
+}
diff --git a/vortex-array/src/arrays/primitive/compute/mod.rs b/vortex-array/src/arrays/primitive/compute/mod.rs
index 867ddf69d03..0c546b99c98 100644
--- a/vortex-array/src/arrays/primitive/compute/mod.rs
+++ b/vortex-array/src/arrays/primitive/compute/mod.rs
@@ -7,7 +7,7 @@ mod fill_null;
 mod mask;
 pub(crate) mod rules;
 mod slice;
-mod take;
+pub(crate) mod take;
 
 #[cfg(test)]
 mod tests {
diff --git a/vortex-array/src/arrays/primitive/compute/take/avx2.rs b/vortex-array/src/arrays/primitive/compute/take/avx2.rs
index e92304dc34b..91d6896fae4 100644
--- a/vortex-array/src/arrays/primitive/compute/take/avx2.rs
+++ b/vortex-array/src/arrays/primitive/compute/take/avx2.rs
@@ -110,6 +110,114 @@ where
 // AVX2 SIMD take algorithm
 // ---------------------------------------------------------------------------
 
+/// AVX2 gather into a caller-supplied uninitialized destination slice. Used by the chunked
+/// execution engine to avoid the per-call [`Buffer`] allocation that [`take_avx2`] performs.
+///
+/// # Safety
+///
+/// `dst` must be writable for `indices.len()` elements. The `avx2` feature must be enabled
+/// on the calling CPU.
+#[target_feature(enable = "avx2")]
+pub(crate) unsafe fn take_avx2_into<V: NativePType, I: UnsignedPType>(
+    buffer: &[V],
+    indices: &[I],
+    dst: *mut V,
+) {
+    macro_rules! dispatch_into {
+        ($indices:ty, $values:ty) => {{
+            dispatch_into!($indices, $values, cast: $values);
+        }};
+        ($indices:ty, $values:ty, cast: $cast:ty) => {{
+            let indices = unsafe { std::mem::transmute::<&[I], &[$indices]>(indices) };
+            let values = unsafe { std::mem::transmute::<&[V], &[$cast]>(buffer) };
+            let dst = dst.cast::<$cast>();
+            unsafe {
+                exec_take_into::<$cast, $indices, AVX2Gather>(values, indices, dst);
+            }
+        }};
+    }
+
+    if buffer.is_empty() {
+        // Zero-fill: caller still needs dst[..indices.len()] initialized.
+        for i in 0..indices.len() {
+            // SAFETY: caller guarantees dst has indices.len() cells.
+            unsafe { dst.add(i).write(V::default()) };
+        }
+        return;
+    }
+
+    match (I::PTYPE, V::PTYPE) {
+        (PType::U8, PType::I32) => dispatch_into!(u8, i32),
+        (PType::U8, PType::U32) => dispatch_into!(u8, u32),
+        (PType::U8, PType::I64) => dispatch_into!(u8, i64),
+        (PType::U8, PType::U64) => dispatch_into!(u8, u64),
+        (PType::U16, PType::I32) => dispatch_into!(u16, i32),
+        (PType::U16, PType::U32) => dispatch_into!(u16, u32),
+        (PType::U16, PType::I64) => dispatch_into!(u16, i64),
+        (PType::U16, PType::U64) => dispatch_into!(u16, u64),
+        (PType::U32, PType::I32) => dispatch_into!(u32, i32),
+        (PType::U32, PType::U32) => dispatch_into!(u32, u32),
+        (PType::U32, PType::I64) => dispatch_into!(u32, i64),
+        (PType::U32, PType::U64) => dispatch_into!(u32, u64),
+        (PType::U8, PType::F32) => dispatch_into!(u8, f32, cast: u32),
+        (PType::U16, PType::F32) => dispatch_into!(u16, f32, cast: u32),
+        (PType::U32, PType::F32) => dispatch_into!(u32, f32, cast: u32),
+        (PType::U64, PType::F32) => dispatch_into!(u64, f32, cast: u32),
+        (PType::U8, PType::F64) => dispatch_into!(u8, f64, cast: u64),
+        (PType::U16, PType::F64) => dispatch_into!(u16, f64, cast: u64),
+        (PType::U32, PType::F64) => dispatch_into!(u32, f64, cast: u64),
+        (PType::U64, PType::F64) => dispatch_into!(u64, f64, cast: u64),
+        _ => {
+            // Scalar fallback into dst.
+            for (i, idx) in indices.iter().enumerate() {
+                // SAFETY: dst has indices.len() cells; values bounds-checked.
+                unsafe { dst.add(i).write(buffer[(*idx).as_()]) };
+            }
+        }
+    }
+}
+
+/// Same SIMD gather loop as [`exec_take`], but writes into a caller-supplied destination
+/// pointer instead of allocating. Marked `#[target_feature(enable = "avx2")]` so the AVX2
+/// gather intrinsics inside `AVX2Gather::gather` get the correct codegen context.
+///
+/// # Safety
+///
+/// - `dst` must point to at least `indices.len()` writable elements.
+/// - The `avx2` feature must be enabled on the caller's CPU.
+#[target_feature(enable = "avx2")]
+unsafe fn exec_take_into<Value, Idx, Gather>(
+    values: &[Value],
+    indices: &[Idx],
+    dst: *mut Value,
+) where
+    Value: Copy,
+    Idx: UnsignedPType,
+    Gather: GatherFn<Idx, Value>,
+{
+    let indices_len = indices.len();
+    let max_index = Idx::from(values.len()).unwrap_or_else(|| Idx::max_value());
+    let mut offset = 0;
+    while offset + Gather::STRIDE < indices_len {
+        // SAFETY: same as exec_take.
+        unsafe {
+            Gather::gather(
+                indices.as_ptr().add(offset),
+                max_index,
+                values.as_ptr(),
+                dst.add(offset),
+            )
+        };
+        offset += Gather::WIDTH;
+    }
+    while offset < indices_len {
+        // SAFETY: offset < indices_len ≤ dst capacity; indices[offset] is bounds-checked.
+        unsafe { dst.add(offset).write(values[indices[offset].as_()]) };
+        offset += 1;
+    }
+    debug_assert_eq!(offset, indices_len);
+}
+
 /// Takes the specified indices into a new [`Buffer`] using AVX2 SIMD.
 ///
 /// # Panics
@@ -121,7 +229,7 @@ where
 /// The caller must ensure the `avx2` feature is enabled.
 #[target_feature(enable = "avx2")]
 #[doc(hidden)]
-unsafe fn take_avx2<V: NativePType, I: UnsignedPType>(buffer: &[V], indices: &[I]) -> Buffer<V> {
+pub(crate) unsafe fn take_avx2<V: NativePType, I: UnsignedPType>(buffer: &[V], indices: &[I]) -> Buffer<V> {
     macro_rules! dispatch_avx2 {
         ($indices:ty, $values:ty) => {
             { let result = dispatch_avx2!($indices, $values, cast: $values); result }
diff --git a/vortex-array/src/arrays/primitive/compute/take/avx512.rs b/vortex-array/src/arrays/primitive/compute/take/avx512.rs
new file mode 100644
index 00000000000..f9261cc49f4
--- /dev/null
+++ b/vortex-array/src/arrays/primitive/compute/take/avx512.rs
@@ -0,0 +1,849 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! An AVX-512 implementation of the take operation using `vpgatherdd` / `vpgatherdq` /
+//! `vpgatherqq` instructions.
+//!
+//! Only enabled for x86_64 hosts and gated at runtime behind feature detection to
+//! ensure AVX-512F, AVX-512BW, AVX-512DQ, and AVX-512VL instructions are available.
+//!
+//! Mirrors `avx2.rs`. Index/value pairs that aren't implemented here fall back to the
+//! AVX-2 kernel (which in turn can fall back to scalar).
+
+#![cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+
+use std::arch::x86_64::__m256i;
+use std::arch::x86_64::__m512i;
+use std::arch::x86_64::__mmask8;
+use std::arch::x86_64::__mmask16;
+use std::arch::x86_64::_mm_loadu_si128;
+use std::arch::x86_64::_mm256_cvtepu8_epi32;
+use std::arch::x86_64::_mm256_cvtepu16_epi32;
+use std::arch::x86_64::_mm256_loadu_si256;
+use std::arch::x86_64::_mm256_setzero_si256;
+use std::arch::x86_64::_mm256_storeu_si256;
+use std::arch::x86_64::_mm512_cmple_epu32_mask;
+use std::arch::x86_64::_mm512_cmple_epu64_mask;
+use std::arch::x86_64::_mm512_cvtepu8_epi32;
+use std::arch::x86_64::_mm512_cvtepu16_epi32;
+use std::arch::x86_64::_mm512_loadu_si512;
+use std::arch::x86_64::_mm512_mask_i32gather_epi32;
+use std::arch::x86_64::_mm512_mask_i32gather_epi64;
+use std::arch::x86_64::_mm512_mask_i64gather_epi32;
+use std::arch::x86_64::_mm512_mask_i64gather_epi64;
+use std::arch::x86_64::_mm512_set1_epi32;
+use std::arch::x86_64::_mm512_set1_epi64;
+use std::arch::x86_64::_mm512_setzero_si512;
+use std::arch::x86_64::_mm512_storeu_si512;
+use std::arch::x86_64::_mm512_zextsi256_si512;
+
+use vortex_buffer::Alignment;
+use vortex_buffer::Buffer;
+use vortex_buffer::BufferMut;
+use vortex_error::VortexResult;
+
+use crate::ArrayRef;
+use crate::IntoArray;
+use crate::array::ArrayView;
+use crate::arrays::PrimitiveArray;
+use crate::arrays::primitive::compute::take::TakeImpl;
+use crate::arrays::primitive::compute::take::avx2;
+use crate::arrays::primitive::vtable::Primitive;
+use crate::dtype::NativePType;
+use crate::dtype::PType;
+use crate::dtype::UnsignedPType;
+use crate::match_each_native_ptype;
+use crate::match_each_unsigned_integer_ptype;
+use crate::validity::Validity;
+
+#[allow(unused)]
+pub(super) struct TakeKernelAVX512;
+
+impl TakeImpl for TakeKernelAVX512 {
+    #[inline(always)]
+    fn take(
+        &self,
+        values: ArrayView<'_, Primitive>,
+        indices: ArrayView<'_, Primitive>,
+        validity: Validity,
+    ) -> VortexResult<ArrayRef> {
+        assert!(indices.ptype().is_unsigned_int());
+
+        Ok(match_each_unsigned_integer_ptype!(indices.ptype(), |I| {
+            match_each_native_ptype!(values.ptype(), |V| {
+                // SAFETY: This kernel is only selected when the required AVX-512 cpu-features
+                // are detected.
+                unsafe {
+                    take_primitive_avx512(values.as_slice::<V>(), indices.as_slice::<I>(), validity)
+                }
+            })
+        })
+        .into_array())
+    }
+}
+
+/// # Safety
+///
+/// The caller must ensure that if the validity has a length, it is the same length as the indices,
+/// and that the `avx512f`, `avx512bw`, `avx512dq`, and `avx512vl` features are enabled.
+#[target_feature(enable = "avx512f,avx512bw,avx512dq,avx512vl")]
+#[allow(unused)]
+unsafe fn take_primitive_avx512<V, I>(
+    values: &[V],
+    indices: &[I],
+    validity: Validity,
+) -> PrimitiveArray
+where
+    V: NativePType,
+    I: UnsignedPType,
+{
+    // SAFETY: The caller guarantees the required features are enabled.
+    let buffer = unsafe { take_avx512(values, indices) };
+
+    debug_assert!(
+        validity
+            .maybe_len()
+            .is_none_or(|validity_len| validity_len == buffer.len())
+    );
+
+    // SAFETY: The caller ensures that the validity and indices have the same length, so the taken
+    // buffer and the validity must have the same length.
+    unsafe { PrimitiveArray::new_unchecked(buffer, validity) }
+}
+
+// ---------------------------------------------------------------------------
+// AVX-512 SIMD take algorithm
+// ---------------------------------------------------------------------------
+
+/// AVX-512 gather into a caller-supplied uninitialized destination slice. Used by the
+/// chunked execution engine to avoid the per-call [`Buffer`] allocation that
+/// [`take_avx512`] performs.
+///
+/// # Safety
+///
+/// `dst` must be writable for `indices.len()` elements. The `avx512f`, `avx512bw`,
+/// `avx512dq`, and `avx512vl` features must be enabled on the calling CPU.
+#[target_feature(enable = "avx512f,avx512bw,avx512dq,avx512vl")]
+pub(crate) unsafe fn take_avx512_into<V: NativePType, I: UnsignedPType>(
+    buffer: &[V],
+    indices: &[I],
+    dst: *mut V,
+) {
+    macro_rules! dispatch_into {
+        ($indices:ty, $values:ty) => {{
+            dispatch_into!($indices, $values, cast: $values);
+        }};
+        ($indices:ty, $values:ty, cast: $cast:ty) => {{
+            let indices_typed =
+                unsafe { std::mem::transmute::<&[I], &[$indices]>(indices) };
+            let values_typed = unsafe { std::mem::transmute::<&[V], &[$cast]>(buffer) };
+            let dst_typed = dst.cast::<$cast>();
+            unsafe {
+                exec_take_into_avx512::<$cast, $indices, AVX512Gather>(
+                    values_typed,
+                    indices_typed,
+                    dst_typed,
+                );
+            }
+        }};
+    }
+
+    if buffer.is_empty() {
+        // Zero-fill: caller still needs dst[..indices.len()] initialized.
+        for i in 0..indices.len() {
+            // SAFETY: caller guarantees dst has indices.len() cells.
+            unsafe { dst.add(i).write(V::default()) };
+        }
+        return;
+    }
+
+    match (I::PTYPE, V::PTYPE) {
+        // Int value types. Only 32 and 64 bit types are supported.
+        (PType::U8, PType::I32) => dispatch_into!(u8, i32),
+        (PType::U8, PType::U32) => dispatch_into!(u8, u32),
+        (PType::U8, PType::I64) => dispatch_into!(u8, i64),
+        (PType::U8, PType::U64) => dispatch_into!(u8, u64),
+        (PType::U16, PType::I32) => dispatch_into!(u16, i32),
+        (PType::U16, PType::U32) => dispatch_into!(u16, u32),
+        (PType::U16, PType::I64) => dispatch_into!(u16, i64),
+        (PType::U16, PType::U64) => dispatch_into!(u16, u64),
+        (PType::U32, PType::I32) => dispatch_into!(u32, i32),
+        (PType::U32, PType::U32) => dispatch_into!(u32, u32),
+        (PType::U32, PType::I64) => dispatch_into!(u32, i64),
+        (PType::U32, PType::U64) => dispatch_into!(u32, u64),
+        (PType::U64, PType::I32) => dispatch_into!(u64, i32),
+        (PType::U64, PType::U32) => dispatch_into!(u64, u32),
+        (PType::U64, PType::I64) => dispatch_into!(u64, i64),
+        (PType::U64, PType::U64) => dispatch_into!(u64, u64),
+
+        // Float value types, treat them as if they were corresponding int types.
+        (PType::U8, PType::F32) => dispatch_into!(u8, f32, cast: u32),
+        (PType::U16, PType::F32) => dispatch_into!(u16, f32, cast: u32),
+        (PType::U32, PType::F32) => dispatch_into!(u32, f32, cast: u32),
+        (PType::U64, PType::F32) => dispatch_into!(u64, f32, cast: u32),
+
+        (PType::U8, PType::F64) => dispatch_into!(u8, f64, cast: u64),
+        (PType::U16, PType::F64) => dispatch_into!(u16, f64, cast: u64),
+        (PType::U32, PType::F64) => dispatch_into!(u32, f64, cast: u64),
+        (PType::U64, PType::F64) => dispatch_into!(u64, f64, cast: u64),
+
+        // Fall back to AVX-2 (or scalar) for unsupported pairs.
+        _ => {
+            // SAFETY: AVX-2 is a strict subset of the AVX-512 features required to call
+            // `take_avx512_into`, so the AVX-2 entry point is valid here.
+            unsafe { avx2::take_avx2_into::<V, I>(buffer, indices, dst) };
+        }
+    }
+}
+
+/// AVX-512 inner gather loop. Writes into a caller-supplied destination pointer instead of
+/// allocating. Marked `#[target_feature(enable = "avx512f,...")]` so the AVX-512 gather
+/// intrinsics inside `AVX512Gather::gather` get the correct codegen context.
+///
+/// # Safety
+///
+/// - `dst` must point to at least `indices.len()` writable elements.
+/// - The `avx512f`, `avx512bw`, `avx512dq`, and `avx512vl` features must be enabled on
+///   the caller's CPU.
+#[target_feature(enable = "avx512f,avx512bw,avx512dq,avx512vl")]
+unsafe fn exec_take_into_avx512<Value, Idx, Gather>(
+    values: &[Value],
+    indices: &[Idx],
+    dst: *mut Value,
+) where
+    Value: Copy,
+    Idx: UnsignedPType,
+    Gather: GatherFn<Idx, Value>,
+{
+    let indices_len = indices.len();
+    let max_index = Idx::from(values.len()).unwrap_or_else(|| Idx::max_value());
+    let mut offset = 0;
+    while offset + Gather::STRIDE < indices_len {
+        // SAFETY: same as exec_take_avx512.
+        unsafe {
+            Gather::gather(
+                indices.as_ptr().add(offset),
+                max_index,
+                values.as_ptr(),
+                dst.add(offset),
+            )
+        };
+        offset += Gather::WIDTH;
+    }
+    while offset < indices_len {
+        // SAFETY: offset < indices_len ≤ dst capacity; indices[offset] is bounds-checked.
+        unsafe { dst.add(offset).write(values[indices[offset].as_()]) };
+        offset += 1;
+    }
+    debug_assert_eq!(offset, indices_len);
+}
+
+/// Takes the specified indices into a new [`Buffer`] using AVX-512 SIMD.
+///
+/// # Panics
+///
+/// This function panics if any of the provided `indices` are out of bounds for `values`.
+///
+/// # Safety
+///
+/// The caller must ensure the `avx512f`, `avx512bw`, `avx512dq`, and `avx512vl` features
+/// are enabled.
+#[target_feature(enable = "avx512f,avx512bw,avx512dq,avx512vl")]
+#[doc(hidden)]
+pub(crate) unsafe fn take_avx512<V: NativePType, I: UnsignedPType>(
+    buffer: &[V],
+    indices: &[I],
+) -> Buffer<V> {
+    macro_rules! dispatch_avx512 {
+        ($indices:ty, $values:ty) => {
+            { let result = dispatch_avx512!($indices, $values, cast: $values); result }
+        };
+        ($indices:ty, $values:ty, cast: $cast:ty) => {{
+            let indices_typed =
+                unsafe { std::mem::transmute::<&[I], &[$indices]>(indices) };
+            let values_typed = unsafe { std::mem::transmute::<&[V], &[$cast]>(buffer) };
+
+            let result = unsafe {
+                exec_take_avx512::<$cast, $indices, AVX512Gather>(values_typed, indices_typed)
+            };
+            unsafe { result.transmute::<V>() }
+        }};
+    }
+
+    if buffer.is_empty() {
+        return Buffer::zeroed(indices.len());
+    }
+
+    match (I::PTYPE, V::PTYPE) {
+        // Int value types. Only 32 and 64 bit types are supported.
+        (PType::U8, PType::I32) => dispatch_avx512!(u8, i32),
+        (PType::U8, PType::U32) => dispatch_avx512!(u8, u32),
+        (PType::U8, PType::I64) => dispatch_avx512!(u8, i64),
+        (PType::U8, PType::U64) => dispatch_avx512!(u8, u64),
+        (PType::U16, PType::I32) => dispatch_avx512!(u16, i32),
+        (PType::U16, PType::U32) => dispatch_avx512!(u16, u32),
+        (PType::U16, PType::I64) => dispatch_avx512!(u16, i64),
+        (PType::U16, PType::U64) => dispatch_avx512!(u16, u64),
+        (PType::U32, PType::I32) => dispatch_avx512!(u32, i32),
+        (PType::U32, PType::U32) => dispatch_avx512!(u32, u32),
+        (PType::U32, PType::I64) => dispatch_avx512!(u32, i64),
+        (PType::U32, PType::U64) => dispatch_avx512!(u32, u64),
+        (PType::U64, PType::I32) => dispatch_avx512!(u64, i32),
+        (PType::U64, PType::U32) => dispatch_avx512!(u64, u32),
+        (PType::U64, PType::I64) => dispatch_avx512!(u64, i64),
+        (PType::U64, PType::U64) => dispatch_avx512!(u64, u64),
+
+        // Float value types, treat them as if they were corresponding int types.
+        (PType::U8, PType::F32) => dispatch_avx512!(u8, f32, cast: u32),
+        (PType::U16, PType::F32) => dispatch_avx512!(u16, f32, cast: u32),
+        (PType::U32, PType::F32) => dispatch_avx512!(u32, f32, cast: u32),
+        (PType::U64, PType::F32) => dispatch_avx512!(u64, f32, cast: u32),
+
+        (PType::U8, PType::F64) => dispatch_avx512!(u8, f64, cast: u64),
+        (PType::U16, PType::F64) => dispatch_avx512!(u16, f64, cast: u64),
+        (PType::U32, PType::F64) => dispatch_avx512!(u32, f64, cast: u64),
+        (PType::U64, PType::F64) => dispatch_avx512!(u64, f64, cast: u64),
+
+        // Fall back to AVX-2 (which itself falls back to scalar) for unsupported pairs.
+        _ => {
+            tracing::trace!(
+                "take AVX-512 kernel missing for indices {} values {}, falling back to AVX-2",
+                I::PTYPE,
+                V::PTYPE
+            );
+            // SAFETY: AVX-2 is a strict subset of the AVX-512 features required to call
+            // `take_avx512`, so the AVX-2 entry point is valid here.
+            unsafe { avx2::take_avx2(buffer, indices) }
+        }
+    }
+}
+
+/// The main gather function that is used by the inner loop kernel for AVX-512 gather.
+trait GatherFn<Idx, Values> {
+    /// The number of data elements that are written to the `dst` on each loop iteration.
+    const WIDTH: usize;
+    /// The number of indices read from `indices` on each loop iteration.
+    const STRIDE: usize = Self::WIDTH;
+
+    /// Gather values from `src` into the `dst` using the `indices`.
+    ///
+    /// # Safety
+    ///
+    /// This function can read up to `STRIDE` elements through `indices`, and read/write up to
+    /// `WIDTH` elements through `src` and `dst` respectively.
+    unsafe fn gather(indices: *const Idx, max_idx: Idx, src: *const Values, dst: *mut Values);
+}
+
+/// AVX-512 version of [`GatherFn`] defined for 32- and 64-bit value types.
+enum AVX512Gather {}
+
+// ---------------------------------------------------------------------------
+// 16-lane (`__m512i` of i32 indices) gathers — output __m512i of 32-bit values.
+// ---------------------------------------------------------------------------
+
+macro_rules! impl_gather_i32x16 {
+    ($idx:ty, $value:ty, load_idx: $load_idx:ident, extend_idx: $extend_idx:expr, WIDTH = $WIDTH:literal, STRIDE = $STRIDE:literal) => {
+        impl GatherFn<$idx, $value> for AVX512Gather {
+            const WIDTH: usize = $WIDTH;
+            const STRIDE: usize = $STRIDE;
+
+            #[allow(unused_unsafe, clippy::cast_possible_truncation)]
+            #[inline(always)]
+            unsafe fn gather(
+                indices: *const $idx,
+                max_idx: $idx,
+                src: *const $value,
+                dst: *mut $value,
+            ) {
+                const {
+                    assert!(
+                        $WIDTH <= $STRIDE,
+                        "dst cannot advance by more than the stride"
+                    );
+                    assert!($WIDTH == 16);
+                }
+
+                const SCALE: i32 = size_of::<$value>() as i32;
+
+                // Load and zero-extend `indices` into 16 i32 lanes.
+                let raw = unsafe { $load_idx(indices.cast()) };
+                let indices_vec: __m512i = unsafe { $extend_idx(raw) };
+
+                // Compute `valid_mask = idx < max_idx` (i.e. in-bounds positions).
+                let max_idx_vec = unsafe { _mm512_set1_epi32(max_idx as i32) };
+                let valid_mask: __mmask16 =
+                    unsafe { _mm512_cmple_epu32_mask(indices_vec, max_idx_vec) };
+
+                let zero_vec = unsafe { _mm512_setzero_si512() };
+
+                // Masked gather: gather where valid_mask=1, leave zero otherwise.
+                let values_vec = unsafe {
+                    _mm512_mask_i32gather_epi32::<SCALE>(
+                        zero_vec,
+                        valid_mask,
+                        indices_vec,
+                        src.cast(),
+                    )
+                };
+
+                // Write the vec out to dst.
+                unsafe { _mm512_storeu_si512(dst.cast(), values_vec) };
+            }
+        }
+    };
+}
+
+// kernels for u8 indices into 32-bit values
+impl_gather_i32x16!(
+    u8, u32,
+    load_idx: _mm_loadu_si128,
+    extend_idx: _mm512_cvtepu8_epi32,
+    WIDTH = 16, STRIDE = 16
+);
+impl_gather_i32x16!(
+    u8, i32,
+    load_idx: _mm_loadu_si128,
+    extend_idx: _mm512_cvtepu8_epi32,
+    WIDTH = 16, STRIDE = 16
+);
+
+// kernels for u16 indices into 32-bit values
+impl_gather_i32x16!(
+    u16, u32,
+    load_idx: _mm256_loadu_si256,
+    extend_idx: _mm512_cvtepu16_epi32,
+    WIDTH = 16, STRIDE = 16
+);
+impl_gather_i32x16!(
+    u16, i32,
+    load_idx: _mm256_loadu_si256,
+    extend_idx: _mm512_cvtepu16_epi32,
+    WIDTH = 16, STRIDE = 16
+);
+
+// kernels for u32 indices into 32-bit values — pass-through, no extension needed.
+impl GatherFn<u32, u32> for AVX512Gather {
+    const WIDTH: usize = 16;
+    const STRIDE: usize = 16;
+
+    #[allow(clippy::cast_possible_truncation)]
+    #[inline(always)]
+    unsafe fn gather(indices: *const u32, max_idx: u32, src: *const u32, dst: *mut u32) {
+        const SCALE: i32 = size_of::<u32>() as i32;
+
+        let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) };
+        let max_idx_vec = unsafe { _mm512_set1_epi32(max_idx as i32) };
+        let valid_mask = unsafe { _mm512_cmple_epu32_mask(indices_vec, max_idx_vec) };
+        let zero_vec = unsafe { _mm512_setzero_si512() };
+        let values_vec = unsafe {
+            _mm512_mask_i32gather_epi32::<SCALE>(zero_vec, valid_mask, indices_vec, src.cast())
+        };
+        unsafe { _mm512_storeu_si512(dst.cast(), values_vec) };
+    }
+}
+
+impl GatherFn<u32, i32> for AVX512Gather {
+    const WIDTH: usize = 16;
+    const STRIDE: usize = 16;
+
+    #[allow(clippy::cast_possible_truncation)]
+    #[inline(always)]
+    unsafe fn gather(indices: *const u32, max_idx: u32, src: *const i32, dst: *mut i32) {
+        const SCALE: i32 = size_of::<i32>() as i32;
+
+        let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) };
+        let max_idx_vec = unsafe { _mm512_set1_epi32(max_idx as i32) };
+        let valid_mask = unsafe { _mm512_cmple_epu32_mask(indices_vec, max_idx_vec) };
+        let zero_vec = unsafe { _mm512_setzero_si512() };
+        let values_vec = unsafe {
+            _mm512_mask_i32gather_epi32::<SCALE>(zero_vec, valid_mask, indices_vec, src.cast())
+        };
+        unsafe { _mm512_storeu_si512(dst.cast(), values_vec) };
+    }
+}
+
+// ---------------------------------------------------------------------------
+// 8-lane (`__m256i` of i32 indices) gathers — output __m512i of 64-bit values.
+// ---------------------------------------------------------------------------
+
+macro_rules! impl_gather_i32_to_i64x8 {
+    ($idx:ty, $value:ty, load_idx: $load_idx:ident, extend_idx: $extend_idx:expr, WIDTH = $WIDTH:literal, STRIDE = $STRIDE:literal) => {
+        impl GatherFn<$idx, $value> for AVX512Gather {
+            const WIDTH: usize = $WIDTH;
+            const STRIDE: usize = $STRIDE;
+
+            #[allow(unused_unsafe, clippy::cast_possible_truncation)]
+            #[inline(always)]
+            unsafe fn gather(
+                indices: *const $idx,
+                max_idx: $idx,
+                src: *const $value,
+                dst: *mut $value,
+            ) {
+                const {
+                    assert!(
+                        $WIDTH <= $STRIDE,
+                        "dst cannot advance by more than the stride"
+                    );
+                    assert!($WIDTH == 8);
+                }
+
+                const SCALE: i32 = size_of::<$value>() as i32;
+
+                // Load `STRIDE` indices, zero-extended to i32x8 (the upper lanes of the
+                // 256-bit vector are unused by the gather).
+                let raw = unsafe { $load_idx(indices.cast()) };
+                let indices_vec: __m256i = unsafe { $extend_idx(raw) };
+
+                // Build a 256-bit max_idx vector and compare unsigned. We borrow the AVX-512VL
+                // 256-bit comparison helpers via the 512-bit zero-extended path: zero-extend
+                // both sides to a 512i and use `cmplt_epu32_mask`.
+                let max_idx_512 = unsafe { _mm512_set1_epi32(max_idx as i32) };
+                let indices_512: __m512i = unsafe { _mm512_zextsi256_si512(indices_vec) };
+                let valid_mask_16: __mmask16 =
+                    unsafe { _mm512_cmple_epu32_mask(indices_512, max_idx_512) };
+                // Only the low 8 lanes are meaningful for the i32->i64 gather.
+                let valid_mask: __mmask8 = (valid_mask_16 as u16 & 0x00FFu16) as __mmask8;
+
+                let zero_vec = unsafe { _mm512_setzero_si512() };
+
+                let values_vec = unsafe {
+                    _mm512_mask_i32gather_epi64::<SCALE>(
+                        zero_vec,
+                        valid_mask,
+                        indices_vec,
+                        src.cast(),
+                    )
+                };
+
+                unsafe { _mm512_storeu_si512(dst.cast(), values_vec) };
+            }
+        }
+    };
+}
+
+// u8 → 64-bit values (load 8 u8 bytes into a __m128i, zero-extend to __m256i of 8 i32s).
+// We need `_mm256_cvtepu8_epi32` from AVX2 for the index zero-extension.
+impl GatherFn<u8, u64> for AVX512Gather {
+    const WIDTH: usize = 8;
+    const STRIDE: usize = 16;
+
+    #[allow(clippy::cast_possible_truncation)]
+    #[inline(always)]
+    unsafe fn gather(indices: *const u8, max_idx: u8, src: *const u64, dst: *mut u64) {
+        const SCALE: i32 = size_of::<u64>() as i32;
+
+        // Load 16 u8s but only the low 8 are used (STRIDE=16 stays compatible with AVX-2).
+        let raw = unsafe { _mm_loadu_si128(indices.cast()) };
+        let indices_vec: __m256i = unsafe { _mm256_cvtepu8_epi32(raw) };
+
+        let max_idx_512 = unsafe { _mm512_set1_epi32(max_idx as i32) };
+        let indices_512 = unsafe { _mm512_zextsi256_si512(indices_vec) };
+        let valid_mask_16 = unsafe { _mm512_cmple_epu32_mask(indices_512, max_idx_512) };
+        let valid_mask: __mmask8 = (valid_mask_16 as u16 & 0x00FFu16) as __mmask8;
+
+        let zero_vec = unsafe { _mm512_setzero_si512() };
+        let values_vec = unsafe {
+            _mm512_mask_i32gather_epi64::<SCALE>(zero_vec, valid_mask, indices_vec, src.cast())
+        };
+        unsafe { _mm512_storeu_si512(dst.cast(), values_vec) };
+    }
+}
+
+impl GatherFn<u8, i64> for AVX512Gather {
+    const WIDTH: usize = 8;
+    const STRIDE: usize = 16;
+
+    #[allow(clippy::cast_possible_truncation)]
+    #[inline(always)]
+    unsafe fn gather(indices: *const u8, max_idx: u8, src: *const i64, dst: *mut i64) {
+        const SCALE: i32 = size_of::<i64>() as i32;
+
+        let raw = unsafe { _mm_loadu_si128(indices.cast()) };
+        let indices_vec: __m256i = unsafe { _mm256_cvtepu8_epi32(raw) };
+
+        let max_idx_512 = unsafe { _mm512_set1_epi32(max_idx as i32) };
+        let indices_512 = unsafe { _mm512_zextsi256_si512(indices_vec) };
+        let valid_mask_16 = unsafe { _mm512_cmple_epu32_mask(indices_512, max_idx_512) };
+        let valid_mask: __mmask8 = (valid_mask_16 as u16 & 0x00FFu16) as __mmask8;
+
+        let zero_vec = unsafe { _mm512_setzero_si512() };
+        let values_vec = unsafe {
+            _mm512_mask_i32gather_epi64::<SCALE>(zero_vec, valid_mask, indices_vec, src.cast())
+        };
+        unsafe { _mm512_storeu_si512(dst.cast(), values_vec) };
+    }
+}
+
+// u16 → 64-bit values
+impl_gather_i32_to_i64x8!(
+    u16, u64,
+    load_idx: _mm_loadu_si128,
+    extend_idx: _mm256_cvtepu16_epi32,
+    WIDTH = 8, STRIDE = 8
+);
+impl_gather_i32_to_i64x8!(
+    u16, i64,
+    load_idx: _mm_loadu_si128,
+    extend_idx: _mm256_cvtepu16_epi32,
+    WIDTH = 8, STRIDE = 8
+);
+
+// u32 → 64-bit values (load 8 u32 into a __m256i, gather via i32->i64).
+impl_gather_i32_to_i64x8!(
+    u32, u64,
+    load_idx: _mm256_loadu_si256,
+    extend_idx: identity_m256,
+    WIDTH = 8, STRIDE = 8
+);
+impl_gather_i32_to_i64x8!(
+    u32, i64,
+    load_idx: _mm256_loadu_si256,
+    extend_idx: identity_m256,
+    WIDTH = 8, STRIDE = 8
+);
+
+#[inline(always)]
+unsafe fn identity_m256(x: __m256i) -> __m256i {
+    x
+}
+
+// ---------------------------------------------------------------------------
+// 8-lane (`__m512i` of i64 indices) gathers — output __m512i of 64-bit values.
+// ---------------------------------------------------------------------------
+
+impl GatherFn<u64, u64> for AVX512Gather {
+    const WIDTH: usize = 8;
+    const STRIDE: usize = 8;
+
+    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
+    #[inline(always)]
+    unsafe fn gather(indices: *const u64, max_idx: u64, src: *const u64, dst: *mut u64) {
+        const SCALE: i32 = size_of::<u64>() as i32;
+
+        let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) };
+        let max_idx_vec = unsafe { _mm512_set1_epi64(max_idx as i64) };
+        let valid_mask = unsafe { _mm512_cmple_epu64_mask(indices_vec, max_idx_vec) };
+        let zero_vec = unsafe { _mm512_setzero_si512() };
+        let values_vec = unsafe {
+            _mm512_mask_i64gather_epi64::<SCALE>(zero_vec, valid_mask, indices_vec, src.cast())
+        };
+        unsafe { _mm512_storeu_si512(dst.cast(), values_vec) };
+    }
+}
+
+impl GatherFn<u64, i64> for AVX512Gather {
+    const WIDTH: usize = 8;
+    const STRIDE: usize = 8;
+
+    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
+    #[inline(always)]
+    unsafe fn gather(indices: *const u64, max_idx: u64, src: *const i64, dst: *mut i64) {
+        const SCALE: i32 = size_of::<i64>() as i32;
+
+        let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) };
+        let max_idx_vec = unsafe { _mm512_set1_epi64(max_idx as i64) };
+        let valid_mask = unsafe { _mm512_cmple_epu64_mask(indices_vec, max_idx_vec) };
+        let zero_vec = unsafe { _mm512_setzero_si512() };
+        let values_vec = unsafe {
+            _mm512_mask_i64gather_epi64::<SCALE>(zero_vec, valid_mask, indices_vec, src.cast())
+        };
+        unsafe { _mm512_storeu_si512(dst.cast(), values_vec) };
+    }
+}
+
+// ---------------------------------------------------------------------------
+// 8-lane (`__m512i` of i64 indices) gathers — output __m256i of 32-bit values.
+// ---------------------------------------------------------------------------
+
+impl GatherFn<u64, u32> for AVX512Gather {
+    const WIDTH: usize = 8;
+    const STRIDE: usize = 8;
+
+    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
+    #[inline(always)]
+    unsafe fn gather(indices: *const u64, max_idx: u64, src: *const u32, dst: *mut u32) {
+        const SCALE: i32 = size_of::<u32>() as i32;
+
+        let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) };
+        let max_idx_vec = unsafe { _mm512_set1_epi64(max_idx as i64) };
+        let valid_mask = unsafe { _mm512_cmple_epu64_mask(indices_vec, max_idx_vec) };
+        let zero_vec_256 = unsafe { _mm256_setzero_si256() };
+        let values_vec = unsafe {
+            _mm512_mask_i64gather_epi32::<SCALE>(zero_vec_256, valid_mask, indices_vec, src.cast())
+        };
+        unsafe { _mm256_storeu_si256(dst.cast(), values_vec) };
+    }
+}
+
+impl GatherFn<u64, i32> for AVX512Gather {
+    const WIDTH: usize = 8;
+    const STRIDE: usize = 8;
+
+    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
+    #[inline(always)]
+    unsafe fn gather(indices: *const u64, max_idx: u64, src: *const i32, dst: *mut i32) {
+        const SCALE: i32 = size_of::<i32>() as i32;
+
+        let indices_vec = unsafe { _mm512_loadu_si512(indices.cast()) };
+        let max_idx_vec = unsafe { _mm512_set1_epi64(max_idx as i64) };
+        let valid_mask = unsafe { _mm512_cmple_epu64_mask(indices_vec, max_idx_vec) };
+        let zero_vec_256 = unsafe { _mm256_setzero_si256() };
+        let values_vec = unsafe {
+            _mm512_mask_i64gather_epi32::<SCALE>(zero_vec_256, valid_mask, indices_vec, src.cast())
+        };
+        unsafe { _mm256_storeu_si256(dst.cast(), values_vec) };
+    }
+}
+
+/// AVX-512 core inner loop for a specific `Idx` / `Value` type pair.
+#[inline(always)]
+unsafe fn exec_take_avx512<Value, Idx, Gather>(values: &[Value], indices: &[Idx]) -> Buffer<Value>
+where
+    Value: Copy,
+    Idx: UnsignedPType,
+    Gather: GatherFn<Idx, Value>,
+{
+    let indices_len = indices.len();
+    let max_index = Idx::from(values.len()).unwrap_or_else(|| Idx::max_value());
+    let mut buffer =
+        BufferMut::<Value>::with_capacity_aligned(indices_len, Alignment::of::<__m512i>());
+    let buf_uninit = buffer.spare_capacity_mut();
+
+    let mut offset = 0;
+    // Loop terminates STRIDE elements before the end of the indices array.
+    while offset + Gather::STRIDE < indices_len {
+        // SAFETY: `gather` reads at most STRIDE indices and writes WIDTH values, and we have
+        // STRIDE more indices available and WIDTH+ destination cells available.
+        unsafe {
+            Gather::gather(
+                indices.as_ptr().add(offset),
+                max_index,
+                values.as_ptr(),
+                buf_uninit.as_mut_ptr().add(offset).cast(),
+            )
+        };
+        offset += Gather::WIDTH;
+    }
+
+    while offset < indices_len {
+        buf_uninit[offset].write(values[indices[offset].as_()]);
+        offset += 1;
+    }
+
+    assert_eq!(offset, indices_len);
+
+    // SAFETY: All elements have been initialized.
+    unsafe { buffer.set_len(indices_len) };
+
+    // Reset the buffer alignment to the Value type so downstream slicing works at value
+    // boundaries.
+    buffer = buffer.aligned(Alignment::of::<Value>());
+
+    buffer.freeze()
+}
+
+#[cfg(test)]
+#[cfg_attr(miri, ignore)]
+#[cfg(target_arch = "x86_64")]
+mod avx512_tests {
+    use super::*;
+
+    fn host_has_avx512() -> bool {
+        is_x86_feature_detected!("avx512f")
+            && is_x86_feature_detected!("avx512bw")
+            && is_x86_feature_detected!("avx512dq")
+            && is_x86_feature_detected!("avx512vl")
+    }
+
+    macro_rules! test_cases {
+        (index_type => $IDX:ty, value_types => $($VAL:ty),+) => {
+            paste::paste! {
+                $(
+                    #[test]
+                    #[allow(clippy::cast_possible_truncation)]
+                    fn [<test_avx512_take_simple_ $IDX _ $VAL>]() {
+                        if !host_has_avx512() {
+                            return;
+                        }
+                        let values: Vec<$VAL> = (1..=127).map(|x| x as $VAL).collect();
+                        let indices: Vec<$IDX> = (0..127).collect();
+
+                        let result = unsafe { take_avx512(&values, &indices) };
+                        assert_eq!(&values, result.as_slice());
+                    }
+
+                    #[test]
+                    #[should_panic]
+                    #[allow(clippy::cast_possible_truncation)]
+                    fn [<test_avx512_take_empty_ $IDX _ $VAL>]() {
+                        if !host_has_avx512() {
+                            // Force the test to "panic" so the should_panic harness is satisfied
+                            // on hosts without AVX-512.
+                            panic!("avx512 not available; skipping");
+                        }
+                        let values: Vec<$VAL> = vec![];
+                        let indices: Vec<$IDX> = (0..127).collect();
+                        let result = unsafe { take_avx512(&values, &indices) };
+                        assert!(result.is_empty());
+                    }
+
+                    #[test]
+                    #[should_panic]
+                    #[allow(clippy::cast_possible_truncation)]
+                    fn [<test_avx512_take_invalid_ $IDX _ $VAL>]() {
+                        if !host_has_avx512() {
+                            panic!("avx512 not available; skipping");
+                        }
+                        let values: Vec<$VAL> = (1..=127).map(|x| x as $VAL).collect();
+                        let indices: Vec<$IDX> = (127..=254).collect();
+
+                        let result = unsafe { take_avx512(&values, &indices) };
+                        assert_eq!(&[0 as $VAL; 127], result.as_slice());
+                    }
+                )+
+            }
+        };
+    }
+
+    test_cases!(
+        index_type => u8,
+        value_types => u32, i32, u64, i64, f32, f64
+    );
+    test_cases!(
+        index_type => u16,
+        value_types => u32, i32, u64, i64, f32, f64
+    );
+    test_cases!(
+        index_type => u32,
+        value_types => u32, i32, u64, i64, f32, f64
+    );
+    test_cases!(
+        index_type => u64,
+        value_types => u32, i32, u64, i64, f32, f64
+    );
+
+    #[test]
+    fn test_avx512_take_last_valid_index_u8() {
+        if !host_has_avx512() {
+            return;
+        }
+        let values: Vec<i64> = (0..(255 + 1)).collect();
+        let indices: Vec<u8> = vec![255; 20];
+
+        let result = unsafe { take_avx512(&values, &indices) };
+        assert_eq!(&vec![255; indices.len()], result.as_slice());
+    }
+
+    #[test]
+    fn test_avx512_take_last_valid_index_u16() {
+        if !host_has_avx512() {
+            return;
+        }
+        let values: Vec<i64> = (0..(65535 + 1)).collect();
+        let indices: Vec<u16> = vec![65535; 20];
+
+        let result = unsafe { take_avx512(&values, &indices) };
+        assert_eq!(&vec![65535; indices.len()], result.as_slice());
+    }
+}
diff --git a/vortex-array/src/arrays/primitive/compute/take/mod.rs b/vortex-array/src/arrays/primitive/compute/take/mod.rs
index 4023991c65d..acbc1506ba3 100644
--- a/vortex-array/src/arrays/primitive/compute/take/mod.rs
+++ b/vortex-array/src/arrays/primitive/compute/take/mod.rs
@@ -3,6 +3,8 @@
 
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 mod avx2;
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+mod avx512;
 
 use std::sync::LazyLock;
 
@@ -31,7 +33,13 @@ use crate::validity::Validity;
 static PRIMITIVE_TAKE_KERNEL: LazyLock<&'static dyn TakeImpl> = LazyLock::new(|| {
     #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
     {
-        if is_x86_feature_detected!("avx2") {
+        if is_x86_feature_detected!("avx512f")
+            && is_x86_feature_detected!("avx512bw")
+            && is_x86_feature_detected!("avx512dq")
+            && is_x86_feature_detected!("avx512vl")
+        {
+            &avx512::TakeKernelAVX512
+        } else if is_x86_feature_detected!("avx2") {
             &avx2::TakeKernelAVX2
         } else {
             &TakeKernelScalar
@@ -44,6 +52,67 @@ static PRIMITIVE_TAKE_KERNEL: LazyLock<&'static dyn TakeImpl> = LazyLock::new(||
     }
 });
 
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+static AVX2_AVAILABLE: LazyLock<bool> = LazyLock::new(|| is_x86_feature_detected!("avx2"));
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+static AVX512_AVAILABLE: LazyLock<bool> = LazyLock::new(|| {
+    is_x86_feature_detected!("avx512f")
+        && is_x86_feature_detected!("avx512bw")
+        && is_x86_feature_detected!("avx512dq")
+        && is_x86_feature_detected!("avx512vl")
+});
+
+/// Take `indices` from `values` into a caller-supplied uninitialized destination slice,
+/// selecting the AVX-512 gather kernel when available, then AVX-2, and falling back to
+/// scalar otherwise.
+///
+/// `dst` must hold at least `indices.len()` cells; this function initializes exactly
+/// `indices.len()` cells starting from `dst[0]`.
+///
+/// Used by the chunked execution engine to call into the same SIMD gather as the
+/// canonical executor without allocating a heap [`Buffer`] per invocation.
+pub fn take_into_uninit<V: NativePType, I: crate::dtype::UnsignedPType>(
+    values: &[V],
+    indices: &[I],
+    dst: &mut [std::mem::MaybeUninit<V>],
+) {
+    assert!(dst.len() >= indices.len());
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    {
+        if *AVX512_AVAILABLE {
+            // SAFETY: AVX-512 features confirmed at runtime; dst capacity ≥ indices.len().
+            unsafe {
+                avx512::take_avx512_into::<V, I>(values, indices, dst.as_mut_ptr().cast::<V>());
+            }
+            return;
+        }
+        if *AVX2_AVAILABLE {
+            // SAFETY: AVX2 feature confirmed at runtime; dst capacity ≥ indices.len().
+            unsafe {
+                avx2::take_avx2_into::<V, I>(values, indices, dst.as_mut_ptr().cast::<V>());
+            }
+            return;
+        }
+    }
+    take_into_uninit_scalar::<V, I>(values, indices, dst);
+}
+
+fn take_into_uninit_scalar<V: NativePType, I: IntegerPType>(
+    values: &[V],
+    indices: &[I],
+    dst: &mut [std::mem::MaybeUninit<V>],
+) {
+    let ptr = dst.as_mut_ptr();
+    for (i, idx) in indices.iter().enumerate() {
+        // SAFETY: dst capacity asserted by caller; values bounds-checked.
+        unsafe {
+            ptr.add(i)
+                .write(std::mem::MaybeUninit::new(values[(*idx).as_()]));
+        }
+    }
+}
+
 trait TakeImpl: Send + Sync {
     fn take(
         &self,
diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs
index 0a9c5969ecc..7a84cb92095 100644
--- a/vortex-array/src/lib.rs
+++ b/vortex-array/src/lib.rs
@@ -29,6 +29,8 @@ use vortex_session::registry::Context;
 
 use crate::session::ArraySession;
 
+#[doc(hidden)]
+pub mod _chunked_exec;
 pub mod accessor;
 pub mod aggregate_fn;
 #[doc(hidden)]