diff --git a/Cargo.lock b/Cargo.lock
index 2819f2bacd0..949a9594359 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10687,6 +10687,7 @@ version = "0.1.0"
 dependencies = [
  "codspeed-divan-compat",
  "fastlanes",
+ "inventory",
  "itertools 0.14.0",
  "lending-iterator",
  "num-traits",
@@ -10699,6 +10700,7 @@ dependencies = [
  "vortex-error",
  "vortex-fastlanes",
  "vortex-mask",
+ "vortex-row",
  "vortex-session",
 ]
 
@@ -11034,6 +11036,29 @@ dependencies = [
  "vortex-tui",
 ]
 
+[[package]]
+name = "vortex-row"
+version = "0.1.0"
+dependencies = [
+ "arrow-array 58.2.0",
+ "arrow-row 58.2.0",
+ "arrow-schema 58.2.0",
+ "bytes",
+ "codspeed-divan-compat",
+ "inventory",
+ "mimalloc",
+ "rand 0.10.1",
+ "rstest",
+ "smallvec",
+ "vortex-array",
+ "vortex-buffer",
+ "vortex-error",
+ "vortex-fastlanes",
+ "vortex-mask",
+ "vortex-session",
+ "vortex-utils",
+]
+
 [[package]]
 name = "vortex-runend"
 version = "0.1.0"
@@ -11042,6 +11067,7 @@ dependencies = [
  "arrow-array 58.2.0",
  "arrow-schema 58.2.0",
  "codspeed-divan-compat",
+ "inventory",
  "itertools 0.14.0",
  "num-traits",
  "prost 0.14.3",
@@ -11051,6 +11077,7 @@ dependencies = [
  "vortex-buffer",
  "vortex-error",
  "vortex-mask",
+ "vortex-row",
  "vortex-session",
 ]
 
diff --git a/Cargo.toml b/Cargo.toml
index fb87a953154..cdf28137563 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,7 @@ members = [
     "vortex-io",
     "vortex-proto",
     "vortex-array",
+    "vortex-row",
     "vortex-tensor",
     "vortex-turboquant",
     "vortex-compressor",
@@ -102,6 +103,7 @@ arrow-cast = "58"
 arrow-data = "58"
 arrow-ipc = "58"
 arrow-ord = "58"
+arrow-row = "58"
 arrow-schema = "58"
 arrow-select = "58"
 arrow-string = "58"
@@ -291,6 +293,7 @@ vortex-mask = { version = "0.1.0", path = "./vortex-mask", default-features = fa
 vortex-metrics = { version = "0.1.0", path = "./vortex-metrics", default-features = false }
 vortex-pco = { version = "0.1.0", path = "./encodings/pco", default-features = false }
 vortex-proto = { version = "0.1.0", path = "./vortex-proto", default-features = false }
+vortex-row = { version = "0.1.0", path = "./vortex-row", default-features = false }
 vortex-runend = { version = "0.1.0", path = "./encodings/runend", default-features = false }
 vortex-scan = { version = "0.1.0", path = "./vortex-scan", default-features = false }
 vortex-sequence = { version = "0.1.0", path = "encodings/sequence", default-features = false }
diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml
index a14e19389bc..3a255310ac5 100644
--- a/encodings/fastlanes/Cargo.toml
+++ b/encodings/fastlanes/Cargo.toml
@@ -18,6 +18,7 @@ workspace = true
 
 [dependencies]
 fastlanes = { workspace = true }
+inventory = { workspace = true }
 itertools = { workspace = true }
 lending-iterator = { workspace = true }
 num-traits = { workspace = true }
@@ -27,6 +28,7 @@ vortex-array = { workspace = true }
 vortex-buffer = { workspace = true }
 vortex-error = { workspace = true }
 vortex-mask = { workspace = true }
+vortex-row = { workspace = true }
 vortex-session = { workspace = true }
 
 [dev-dependencies]
diff --git a/encodings/fastlanes/src/bitpacking/compute/mod.rs b/encodings/fastlanes/src/bitpacking/compute/mod.rs
index 2501d952356..a260d0e0ffd 100644
--- a/encodings/fastlanes/src/bitpacking/compute/mod.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/mod.rs
@@ -4,6 +4,7 @@
 mod cast;
 mod filter;
 pub(crate) mod is_constant;
+pub(crate) mod row_encode;
 mod slice;
 mod take;
 
diff --git a/encodings/fastlanes/src/bitpacking/compute/row_encode.rs b/encodings/fastlanes/src/bitpacking/compute/row_encode.rs
new file mode 100644
index 00000000000..412993dd166
--- /dev/null
+++ b/encodings/fastlanes/src/bitpacking/compute/row_encode.rs
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernel for `BitPackedArray`.
+//!
+//! Walks the bit-packed storage in 1024-element chunks, unpacks each chunk into a
+//! stack-local buffer, and writes the row-encoded bytes in one pass. Avoids
+//! materializing a canonical `PrimitiveArray` first.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers and bit-packed widths are small"
+)]
+#![allow(
+    unused_imports,
+    reason = "Item is consumed by the #[gat(Item)] macro expansion"
+)]
+
+use lending_iterator::gat;
+#[allow(unused_imports)]
+use lending_iterator::prelude::Item;
+#[gat(Item)]
+use lending_iterator::prelude::LendingIterator;
+use vortex_array::ArrayId;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::PType;
+use vortex_array::match_each_integer_ptype;
+use vortex_array::validity::Validity;
+use vortex_error::VortexResult;
+use vortex_row::RowEncodeRegistration;
+use vortex_row::options::SortField;
+
+use crate::BitPacked;
+use crate::BitPackedArrayExt;
+use crate::row_encode_common::PrimRowEncode;
+use crate::row_encode_common::encode_primitive_chunk;
+use crate::row_encode_common::encoded_size_for_ptype;
+use crate::unpack_iter::BitPacked as BitPackedUnpack;
+
+/// Adds the per-row encoded width of a `BitPacked` column to the first `len` entries of
+/// `sizes`. Returns `Ok(None)`, leaving `sizes` as it was, if `column` is not `BitPacked`.
+fn bitpacked_size_contribution(
+    column: &ArrayRef,
+    _field: SortField,
+    sizes: &mut [u32],
+    _ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<BitPacked>() else {
+        return Ok(None);
+    };
+    let width = encoded_size_for_ptype(view.dtype().as_ptype());
+    let rows = view.as_ref().len();
+    sizes.iter_mut().take(rows).for_each(|slot| *slot += width);
+    Ok(Some(()))
+}
+
+/// Row-encodes a `BitPacked` column into `out`.
+///
+/// Returns `Ok(None)` when `column` is not a `BitPacked` array or its values are not of
+/// an integer type; otherwise validity and patches are materialized once, the typed
+/// encoder is run, and `Ok(Some(()))` is returned.
+fn bitpacked_encode_into(
+    column: &ArrayRef,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<BitPacked>() else {
+        return Ok(None);
+    };
+    let ptype = view.dtype().as_ptype();
+    match ptype {
+        PType::I8 | PType::I16 | PType::I32 | PType::I64 | PType::U8 | PType::U16
+        | PType::U32 | PType::U64 => {}
+        // Not an integer type: decline, exactly like the non-`BitPacked` case above.
+        _ => return Ok(None),
+    }
+
+    // Resolve validity up front. `BitPackedArrayExt::validity` is named explicitly because
+    // it yields a plain `Validity`, whereas the inherent `validity()` on `ArrayView` yields
+    // a `VortexResult<Validity>`. No mask is built when every row is known to be valid.
+    let validity = BitPackedArrayExt::validity(&view);
+    let all_valid = matches!(validity, Validity::NonNullable | Validity::AllValid);
+    let mask = if all_valid {
+        None
+    } else {
+        Some(validity.execute_mask(view.as_ref().len(), ctx)?)
+    };
+
+    // Patches are rare; when present their indices and values are materialized once here
+    // rather than inside the per-chunk loop.
+    let patch_pairs = match view.patches() {
+        Some(p) => Some((
+            p.indices().clone().execute::<PrimitiveArray>(ctx)?,
+            p.values().clone().execute::<PrimitiveArray>(ctx)?,
+            p.offset(),
+        )),
+        None => None,
+    };
+
+    match_each_integer_ptype!(ptype, |T| {
+        encode_bitpacked_typed::<T>(
+            view,
+            field,
+            offsets,
+            cursors,
+            out,
+            mask.as_ref(),
+            patch_pairs.as_ref(),
+        )?;
+    });
+    Ok(Some(()))
+}
+
+/// Row-encodes every value of a `BitPacked` array whose integer type is `T`.
+///
+/// The array is unpacked piecewise: an optional sliced first chunk, the full chunks, then
+/// an optional sliced trailer. Each piece has its patches applied in place and is then
+/// passed to `write_chunk_rows` together with the index of its first row.
+#[allow(clippy::too_many_arguments)]
+fn encode_bitpacked_typed<T>(
+    arr_view: vortex_array::ArrayView<'_, BitPacked>,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    mask: Option<&vortex_mask::Mask>,
+    patch_pairs: Option<&(PrimitiveArray, PrimitiveArray, usize)>,
+) -> VortexResult<()>
+where
+    T: BitPackedUnpack + NativePType + PrimRowEncode,
+{
+    let row_count = arr_view.as_ref().len();
+    let is_descending = field.descending;
+    let valid_marker = field.non_null_sentinel();
+    let null_marker = field.null_sentinel();
+    let width = size_of::<T>();
+    let row_stride = (1 + width) as u32;
+
+    let mut next_row: usize = 0;
+    let mut unpacked = arr_view.unpacked_chunks::<T>()?;
+
+    // Leading sliced chunk, if the unpacker reports one.
+    if let Some(head) = unpacked.initial() {
+        let head_end = next_row + head.len();
+        apply_patches_in_range::<T>(head, patch_pairs, next_row, head_end);
+        write_chunk_rows::<T>(
+            head,
+            next_row,
+            offsets,
+            cursors,
+            out,
+            mask,
+            valid_marker,
+            null_marker,
+            is_descending,
+            width,
+            row_stride,
+        );
+        next_row = head_end;
+    }
+
+    let mut full = unpacked.full_chunks();
+    while let Some(chunk) = full.next() {
+        // A full chunk is 1024 values wide; clamp to the rows that remain.
+        let take = 1024.min(row_count - next_row);
+        let chunk_end = next_row + take;
+        let rows = &mut chunk[..take];
+        apply_patches_in_range::<T>(rows, patch_pairs, next_row, chunk_end);
+        write_chunk_rows::<T>(
+            rows,
+            next_row,
+            offsets,
+            cursors,
+            out,
+            mask,
+            valid_marker,
+            null_marker,
+            is_descending,
+            width,
+            row_stride,
+        );
+        next_row = chunk_end;
+    }
+
+    if let Some(tail) = unpacked.trailer() {
+        let tail_end = next_row + tail.len();
+        apply_patches_in_range::<T>(tail, patch_pairs, next_row, tail_end);
+        write_chunk_rows::<T>(
+            tail,
+            next_row,
+            offsets,
+            cursors,
+            out,
+            mask,
+            valid_marker,
+            null_marker,
+            is_descending,
+            width,
+            row_stride,
+        );
+        next_row = tail_end;
+    }
+
+    debug_assert_eq!(next_row, row_count);
+    Ok(())
+}
+
+/// Overwrites entries of `chunk` with the patch values whose row falls inside the chunk.
+///
+/// `chunk` covers logical rows `[chunk_start, chunk_end)`. `patch_pairs` carries the
+/// materialized patch indices, patch values and patch offset; with `None` the chunk is
+/// left untouched. Patch indices are compared after adding the patch offset to the chunk
+/// bounds, so an index `i` lands on `chunk[i - (chunk_start + offset)]`.
+///
+/// # Panics
+///
+/// Panics if the patch indices are not `u8`, `u16`, `u32` or `u64`. Skipping the patches
+/// instead would silently produce wrong row bytes.
+fn apply_patches_in_range<T: NativePType>(
+    chunk: &mut [T],
+    patch_pairs: Option<&(PrimitiveArray, PrimitiveArray, usize)>,
+    chunk_start: usize,
+    chunk_end: usize,
+) {
+    let Some((indices_p, values_p, patch_offset)) = patch_pairs else {
+        return;
+    };
+    let values: &[T] = values_p.as_slice();
+    let lo = chunk_start + *patch_offset;
+    let hi = chunk_end + *patch_offset;
+    match indices_p.ptype() {
+        PType::U8 => patch_window(chunk, indices_p.as_slice::<u8>(), values, lo, hi),
+        PType::U16 => patch_window(chunk, indices_p.as_slice::<u16>(), values, lo, hi),
+        PType::U32 => patch_window(chunk, indices_p.as_slice::<u32>(), values, lo, hi),
+        PType::U64 => patch_window(chunk, indices_p.as_slice::<u64>(), values, lo, hi),
+        other => unreachable!("patch indices must be unsigned integers, got {other:?}"),
+    }
+}
+
+/// Applies the patches whose index lies in `[lo, hi)`, writing `values[k]` to
+/// `chunk[indices[k] - lo]`.
+///
+/// Assumes `indices` is sorted ascending, so the affected run is located by binary search
+/// rather than by rescanning every earlier patch for each chunk.
+fn patch_window<T: Copy, I: Copy + Into<u64>>(
+    chunk: &mut [T],
+    indices: &[I],
+    values: &[T],
+    lo: usize,
+    hi: usize,
+) {
+    let position = |index: I| -> usize {
+        let wide: u64 = index.into();
+        wide as usize
+    };
+    let first = indices.partition_point(|&index| position(index) < lo);
+    let last = first + indices[first..].partition_point(|&index| position(index) < hi);
+    for (&index, &value) in indices[first..last].iter().zip(&values[first..last]) {
+        chunk[position(index) - lo] = value;
+    }
+}
+
+#[allow(clippy::too_many_arguments)] // parameter list mirrors `encode_primitive_chunk`
+fn write_chunk_rows<T: NativePType + PrimRowEncode>(
+    chunk: &[T], // unpacked values of one chunk, patches already applied by the caller
+    row_start: usize, // logical index of the chunk's first row
+    offsets: &[u32], // forwarded as-is
+    cursors: &mut [u32], // forwarded as-is
+    out: &mut [u8], // destination byte buffer, forwarded as-is
+    mask: Option<&vortex_mask::Mask>, // validity mask; `None` when every row is valid
+    non_null: u8, // sentinel taken from `field.non_null_sentinel()` by the caller
+    null: u8, // sentinel taken from `field.null_sentinel()` by the caller
+    descending: bool, // `field.descending` of the sort field
+    value_bytes: usize, // `size_of::<T>()`
+    stride: u32, // `1 + value_bytes`
+) {
+    encode_primitive_chunk::<T>( // pure forwarding: every argument is passed on unchanged
+        chunk,
+        row_start,
+        offsets,
+        cursors,
+        out,
+        mask,
+        non_null,
+        null,
+        descending,
+        value_bytes,
+        stride,
+    );
+}
+
+fn bitpacked_array_id() -> ArrayId {
+    static ID: vortex_session::registry::CachedId =
+        vortex_session::registry::CachedId::new("fastlanes.bitpacked");
+    *ID // copies the id out of the cached static
+}
+
+inventory::submit! { // submits the `BitPacked` row-encode kernel to `inventory`
+    RowEncodeRegistration {
+        id: bitpacked_array_id, // callback returning the "fastlanes.bitpacked" array id
+        size: bitpacked_size_contribution, // adds this column's per-row byte count
+        encode: bitpacked_encode_into, // writes this column's row bytes
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::ListViewArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::listview::ListViewArrayExt;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+    use vortex_row::SortField;
+    use vortex_row::convert_columns;
+
+    use crate::BitPackedArrayExt;
+    use crate::BitPackedData;
+
+    fn collect_rows(arr: &ListViewArray) -> Vec<Vec<u8>> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        // Execute each list element to a primitive `u8` array and copy its bytes out.
+        let mut rows = Vec::with_capacity(arr.len());
+        for row in 0..arr.len() {
+            let elements = arr.list_elements_at(row).unwrap();
+            let bytes = elements.execute::<PrimitiveArray>(&mut ctx).unwrap();
+            rows.push(bytes.as_slice::<u8>().to_vec());
+        }
+        rows
+    }
+
+    #[test]
+    fn bitpacked_row_encode_matches_canonical() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let plain = buffer![1u32, 2, 3, 4, 5, 6, 7, 8, 9].into_array();
+        let packed = BitPackedData::encode(&plain, 4, &mut ctx)?.into_array();
+        // The bit-packed kernel must produce exactly the rows of the plain column.
+        let expected = convert_columns(&[plain], &[SortField::default()], &mut ctx)?;
+        let actual = convert_columns(&[packed], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&expected), collect_rows(&actual));
+        Ok(())
+    }
+
+    #[test]
+    fn bitpacked_row_encode_with_patches() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let values: Vec<u32> = (0..200) // every 30th value does not fit in 4 bits
+            .map(|i| match i % 30 { 0 => 5000 + i, _ => i % 16 })
+            .collect();
+        let plain = PrimitiveArray::from_iter(values).into_array();
+        let packed = BitPackedData::encode(&plain, 4, &mut ctx)?.into_array();
+        assert!(packed.as_opt::<crate::BitPacked>().unwrap().patches().is_some());
+        let expected = convert_columns(&[plain], &[SortField::default()], &mut ctx)?;
+        let actual = convert_columns(&[packed], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&expected), collect_rows(&actual));
+        Ok(())
+    }
+
+    #[test]
+    fn bitpacked_row_encode_multi_chunk() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let values: Vec<u32> = (0..3000).map(|i| i % 64).collect(); // more than two 1024-chunks
+        let plain = PrimitiveArray::from_iter(values).into_array();
+        let packed = BitPackedData::encode(&plain, 6, &mut ctx)?.into_array();
+        let expected = convert_columns(&[plain], &[SortField::default()], &mut ctx)?;
+        let actual = convert_columns(&[packed], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&expected), collect_rows(&actual));
+        Ok(())
+    }
+}
diff --git a/encodings/fastlanes/src/delta/compute/mod.rs b/encodings/fastlanes/src/delta/compute/mod.rs
index fa79c62a596..61fb9ff15e7 100644
--- a/encodings/fastlanes/src/delta/compute/mod.rs
+++ b/encodings/fastlanes/src/delta/compute/mod.rs
@@ -2,3 +2,4 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 mod cast;
+pub(crate) mod row_encode;
diff --git a/encodings/fastlanes/src/delta/compute/row_encode.rs b/encodings/fastlanes/src/delta/compute/row_encode.rs
new file mode 100644
index 00000000000..706bb7247b0
--- /dev/null
+++ b/encodings/fastlanes/src/delta/compute/row_encode.rs
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernel for `DeltaArray`.
+//!
+//! Decompresses the deltas directly into a temporary primitive buffer (avoiding the
+//! `PrimitiveArray` wrapper allocation and validity attachment) and then walks the buffer
+//! once to produce row-encoded bytes.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers"
+)]
+
+use fastlanes::Delta as DeltaTrait;
+use fastlanes::FastLanes;
+use fastlanes::Transpose;
+use vortex_array::ArrayId;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::primitive::PrimitiveArrayExt;
+use vortex_array::dtype::NativePType;
+use vortex_array::match_each_unsigned_integer_ptype;
+use vortex_array::validity::Validity;
+use vortex_error::VortexResult;
+use vortex_row::RowEncodeRegistration;
+use vortex_row::options::SortField;
+
+use crate::Delta;
+use crate::bit_transpose::untranspose_validity;
+use crate::delta::array::DeltaArrayExt;
+use crate::delta::array::delta_decompress::decompress_primitive;
+use crate::row_encode_common::PrimRowEncode;
+use crate::row_encode_common::encode_primitive_chunk;
+use crate::row_encode_common::encoded_size_for_ptype;
+
+/// Adds the per-row encoded width of a `Delta` column to the first `len` entries of
+/// `sizes`. Returns `Ok(None)`, leaving `sizes` as it was, if `column` is not `Delta`.
+fn delta_size_contribution(
+    column: &ArrayRef,
+    _field: SortField,
+    sizes: &mut [u32],
+    _ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<Delta>() else {
+        return Ok(None);
+    };
+    let width = encoded_size_for_ptype(view.as_ref().dtype().as_ptype());
+    let rows = view.as_ref().len();
+    sizes.iter_mut().take(rows).for_each(|slot| *slot += width);
+    Ok(Some(()))
+}
+
+/// Row-encodes a `Delta` column into `out`; returns `Ok(None)` if `column` is not `Delta`.
+fn delta_encode_into(
+    column: &ArrayRef,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<Delta>() else {
+        return Ok(None); // not a `Delta` array: decline
+    };
+
+    // Execute bases and deltas into `PrimitiveArray`s so their raw slices can be read.
+    let bases = view.bases().clone().execute::<PrimitiveArray>(ctx)?;
+    let deltas = view.deltas().clone().execute::<PrimitiveArray>(ctx)?;
+    let start = view.offset(); // first row of this array inside the decompressed buffer
+    let total_len = view.as_ref().len();
+    let end = start + total_len; // exclusive end of the `[start, end)` window used below
+
+    // Following delta_decompress: validity is transposed on the deltas; untranspose, then window it.
+    let validity = untranspose_validity(&deltas.validity()?, ctx)?;
+    let validity = validity.slice(start..end)?;
+
+    let descending = field.descending;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let original_ptype = deltas.ptype(); // may be signed; decompression below works unsigned
+    let value_bytes = original_ptype.byte_width();
+    let stride = (1 + value_bytes) as u32; // value bytes plus one byte (presumably the sentinel)
+
+    let mask = match &validity {
+        Validity::NonNullable | Validity::AllValid => None, // every row valid: no mask needed
+        _ => Some(validity.execute_mask(total_len, ctx)?),
+    };
+
+    // Operate on the unsigned reinterpretation (matches `delta_decompress`).
+    let bases_u = bases.reinterpret_cast(original_ptype.to_unsigned());
+    let deltas_u = deltas.reinterpret_cast(original_ptype.to_unsigned());
+    let is_signed = original_ptype.is_signed_int(); // selects the signed or unsigned encoder
+
+    match_each_unsigned_integer_ptype!(deltas_u.ptype(), |T| {
+        const LANES: usize = T::LANES;
+        let buffer = decompress_primitive::<T, LANES>(bases_u.as_slice(), deltas_u.as_slice());
+        let slice = &buffer.as_slice()[start..end]; // keep only this array's rows
+        if is_signed {
+            // Reinterpret each unsigned element as its signed counterpart for encoding.
+            // SAFETY: same-width integers share size and alignment; `signed` borrows `buffer`.
+            let signed: &[<T as ToSigned>::Signed] = unsafe {
+                std::slice::from_raw_parts(
+                    slice.as_ptr().cast::<<T as ToSigned>::Signed>(),
+                    slice.len(),
+                )
+            };
+            encode_primitive_chunk::<<T as ToSigned>::Signed>(
+                signed,
+                0, // row_start: `signed` already begins at this array's first row
+                offsets,
+                cursors,
+                out,
+                mask.as_ref(),
+                non_null,
+                null,
+                descending,
+                value_bytes,
+                stride,
+            );
+        } else {
+            encode_primitive_chunk::<T>(
+                slice,
+                0, // row_start: `slice` already begins at this array's first row
+                offsets,
+                cursors,
+                out,
+                mask.as_ref(),
+                non_null,
+                null,
+                descending,
+                value_bytes,
+                stride,
+            );
+        }
+    });
+
+    Ok(Some(()))
+}
+
+/// Associates each unsigned integer type with the signed type of the same width, so signed
+/// values can be encoded without losing the sign-bit-flip semantics of `PrimRowEncode`.
+trait ToSigned: Copy {
+    type Signed: Copy + NativePType + PrimRowEncode;
+}
+
+/// Implements [`ToSigned`] for each `unsigned => signed` pair.
+macro_rules! impl_to_signed {
+    ($($unsigned:ty => $signed:ty),+ $(,)?) => {
+        $(
+            impl ToSigned for $unsigned {
+                type Signed = $signed;
+            }
+        )+
+    };
+}
+impl_to_signed!(u8 => i8, u16 => i16, u32 => i32, u64 => i64);
+
+fn delta_array_id() -> ArrayId {
+    static ID: vortex_session::registry::CachedId =
+        vortex_session::registry::CachedId::new("fastlanes.delta");
+    *ID // copies the id out of the cached static
+}
+
+inventory::submit! { // submits the `Delta` row-encode kernel to `inventory`
+    RowEncodeRegistration {
+        id: delta_array_id, // callback returning the "fastlanes.delta" array id
+        size: delta_size_contribution, // adds this column's per-row byte count
+        encode: delta_encode_into, // writes this column's row bytes
+    }
+}
+
+// Never called. Naming `DeltaTrait`, `Transpose` and `FastLanes` in a bound keeps those
+// imports from being reported as unused. NOTE(review): `T::LANES` may already use `FastLanes`.
+#[allow(dead_code)]
+const fn _trait_dep<T: DeltaTrait + Transpose + FastLanes>() {}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::ListViewArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::listview::ListViewArrayExt;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+    use vortex_row::SortField;
+    use vortex_row::convert_columns;
+
+    use crate::Delta;
+
+    fn collect_rows(arr: &ListViewArray) -> Vec<Vec<u8>> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        // Execute each list element to a primitive `u8` array and copy its bytes out.
+        let mut rows = Vec::with_capacity(arr.len());
+        for row in 0..arr.len() {
+            let elements = arr.list_elements_at(row).unwrap();
+            let bytes = elements.execute::<PrimitiveArray>(&mut ctx).unwrap();
+            rows.push(bytes.as_slice::<u8>().to_vec());
+        }
+        rows
+    }
+
+    #[test]
+    fn delta_row_encode_matches_canonical_u64() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let plain = buffer![1u64, 2, 3, 5, 10, 11, 20].into_array();
+        let source = PrimitiveArray::from_iter([1u64, 2, 3, 5, 10, 11, 20]);
+        let encoded = Delta::try_from_primitive_array(&source, &mut ctx)?.into_array();
+        // The delta kernel must produce exactly the rows of the plain column.
+        let expected = convert_columns(&[plain], &[SortField::default()], &mut ctx)?;
+        let actual = convert_columns(&[encoded], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&expected), collect_rows(&actual));
+        Ok(())
+    }
+
+    #[test]
+    fn delta_row_encode_matches_canonical_i64() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let plain = buffer![-3i64, -2, -1, 0, 1, 2].into_array();
+        let source = PrimitiveArray::from_iter([-3i64, -2, -1, 0, 1, 2]);
+        let encoded = Delta::try_from_primitive_array(&source, &mut ctx)?.into_array();
+        // Signed input goes through the signed branch of the kernel.
+        let expected = convert_columns(&[plain], &[SortField::default()], &mut ctx)?;
+        let actual = convert_columns(&[encoded], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&expected), collect_rows(&actual));
+        Ok(())
+    }
+
+    #[test]
+    fn delta_row_encode_multi_chunk_i64() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let values: Vec<i64> = (0..3000).map(|i| 1000 + i as i64 * 3).collect();
+        let plain = PrimitiveArray::from_iter(values.clone()).into_array();
+        let source = PrimitiveArray::from_iter(values);
+        let encoded = Delta::try_from_primitive_array(&source, &mut ctx)?.into_array();
+        // 3000 rows, so the input is longer than a single 1024-value chunk.
+        let expected = convert_columns(&[plain], &[SortField::default()], &mut ctx)?;
+        let actual = convert_columns(&[encoded], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&expected), collect_rows(&actual));
+        Ok(())
+    }
+}
diff --git a/encodings/fastlanes/src/for/compute/mod.rs b/encodings/fastlanes/src/for/compute/mod.rs
index 7307b50d776..aec27e74000 100644
--- a/encodings/fastlanes/src/for/compute/mod.rs
+++ b/encodings/fastlanes/src/for/compute/mod.rs
@@ -5,6 +5,7 @@ mod cast;
 mod compare;
 pub(crate) mod is_constant;
 pub(crate) mod is_sorted;
+pub(crate) mod row_encode;
 
 use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
diff --git a/encodings/fastlanes/src/for/compute/row_encode.rs b/encodings/fastlanes/src/for/compute/row_encode.rs
new file mode 100644
index 00000000000..6dd8e580e62
--- /dev/null
+++ b/encodings/fastlanes/src/for/compute/row_encode.rs
@@ -0,0 +1,394 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernel for `FoRArray`.
+//!
+//! For the common fused path (`encoded` is a `BitPacked` with unsigned encoded values), this
+//! walks the bit-packed storage in 1024-element chunks, applies the FoR base inline while
+//! unpacking, and writes the row-encoded bytes in one pass. A plain `Primitive` `encoded`
+//! child is encoded value by value; all other shapes fall through to the canonicalize path.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers"
+)]
+use std::mem::MaybeUninit;
+
+use fastlanes::FoR as FoRTrait;
+use num_traits::WrappingAdd;
+use vortex_array::ArrayId;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::Primitive;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::PType;
+use vortex_array::dtype::PhysicalPType;
+use vortex_array::dtype::UnsignedPType;
+use vortex_array::match_each_integer_ptype;
+use vortex_array::match_each_unsigned_integer_ptype;
+use vortex_array::validity::Validity;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_row::RowEncodeRegistration;
+use vortex_row::options::SortField;
+
+use crate::BitPacked;
+use crate::BitPackedArrayExt;
+use crate::FoR;
+use crate::r#for::array::FoRArrayExt;
+use crate::row_encode_common::PrimRowEncode;
+use crate::row_encode_common::encode_primitive_chunk;
+use crate::row_encode_common::encoded_size_for_ptype;
+use crate::unpack_iter::BitPacked as BitPackedUnpack;
+
+/// Adds the per-row width of a `FoR` column (sentinel byte plus value bytes) to `sizes`.
+fn for_size_contribution(
+    column: &ArrayRef,
+    _field: SortField,
+    sizes: &mut [u32],
+    _ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<FoR>() else {
+        return Ok(None);
+    };
+    // The width depends only on the primitive type, so it is the same for every row.
+    let width = encoded_size_for_ptype(view.as_ref().dtype().as_ptype());
+    let rows = view.as_ref().len();
+    sizes.iter_mut().take(rows).for_each(|slot| *slot += width);
+    Ok(Some(()))
+}
+
+/// Row-encodes a `FoR` column into `out`.
+///
+/// Returns `Ok(None)` when `column` is not `FoR` or its `encoded` child has an unhandled shape.
+fn for_encode_into(
+    column: &ArrayRef,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<FoR>() else {
+        return Ok(None);
+    };
+    let ptype = view.as_ref().dtype().as_ptype();
+    let unsigned_reference = view.reference_scalar().dtype().is_unsigned_int();
+
+    // Fused path: an unsigned reference over a `BitPacked` child (the shape produced by the
+    // standard compressor). Unpack, base-add and row-write happen in a single pass.
+    if unsigned_reference && let Some(bp) = view.encoded().as_opt::<BitPacked>() {
+        match_each_unsigned_integer_ptype!(ptype, |T| {
+            encode_for_bitpacked::<T>(view, bp, field, offsets, cursors, out, ctx)?;
+        });
+        return Ok(Some(()));
+    }
+
+    // Unfused path: only a plain `Primitive` child is handled here, by adding the base to
+    // each stored value. Every other child encoding is declined so that the default
+    // canonicalization path takes over.
+    if view.encoded().as_opt::<Primitive>().is_none() {
+        return Ok(None);
+    }
+    match_each_integer_ptype!(ptype, |T| {
+        encode_for_primitive::<T>(view, field, offsets, cursors, out, ctx)?;
+    });
+    Ok(Some(()))
+}
+
+/// Fused row-encode for `FoR` over `BitPacked`: for each 1024-value chunk, unpack with the
+/// FoR reference added inline, overlay any patches, then hand the chunk's rows to
+/// `encode_primitive_chunk`, which writes row `i` at `offsets[i] + cursors[i]` and advances
+/// `cursors[i]` by one sentinel byte plus `size_of::<T>()` value bytes.
+#[allow(clippy::too_many_arguments)]
+fn encode_for_bitpacked<T>(
+    for_view: vortex_array::ArrayView<'_, FoR>,
+    bp_view: vortex_array::ArrayView<'_, BitPacked>,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()>
+where
+    T: BitPackedUnpack
+        + PhysicalPType<Physical = T>
+        + UnsignedPType
+        + FoRTrait
+        + WrappingAdd
+        + NativePType
+        + PrimRowEncode,
+{
+    let ref_value: T = for_view
+        .reference_scalar()
+        .as_primitive()
+        .as_::<T>()
+        .vortex_expect("FoR reference cannot be null");
+
+    let total_len = bp_view.as_ref().len();
+    let descending = field.descending;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let value_bytes = size_of::<T>();
+    let stride = (1 + value_bytes) as u32;
+
+    // Materialize validity once; `None` means every row is written as valid.
+    let validity = BitPackedArrayExt::validity(&bp_view);
+    let mask = match &validity {
+        Validity::NonNullable | Validity::AllValid => None,
+        _ => Some(validity.execute_mask(total_len, ctx)?),
+    };
+
+    // Canonicalize the (rare) patches once up front. Like the values unpacked from the
+    // bit-packed storage, a patch value only becomes the final value once the FoR reference
+    // has been added, which `apply_patches_in_range_for` does when it overlays the patch.
+    let patches = bp_view.patches();
+    let patch_pairs = if let Some(p) = patches {
+        let indices = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
+        let values = p.values().clone().execute::<PrimitiveArray>(ctx)?;
+        Some((indices, values, p.offset()))
+    } else {
+        None
+    };
+
+    // Walk chunks directly: 1024 elements per chunk, FoR base added inline.
+    let bit_width = bp_view.bit_width() as usize;
+    let offset = bp_view.offset() as usize;
+    let packed_bytes = bp_view.packed().as_host();
+    // SAFETY: packed bytes are aligned as `T` per FastLanes layout invariants.
+    let packed_slice: &[T] = unsafe {
+        std::slice::from_raw_parts(
+            packed_bytes.as_ptr().cast::<T>(),
+            packed_bytes.len() / size_of::<T>(),
+        )
+    };
+    let elems_per_chunk = 128 * bit_width / size_of::<T>();
+    let num_chunks = (offset + total_len).div_ceil(1024);
+
+    let mut buf: [MaybeUninit<T>; 1024] = [const { MaybeUninit::<T>::uninit() }; 1024];
+    let mut local_idx: usize = 0;
+    for chunk_idx in 0..num_chunks {
+        // Packed words belonging to this chunk; the range is bounds-checked by the slicing.
+        let chunk = &packed_slice[chunk_idx * elems_per_chunk..][..elems_per_chunk];
+        // SAFETY: `chunk` holds the `elems_per_chunk` packed words of one chunk and `buf` has
+        // exactly 1024 slots, which the unpack call is relied upon to fully initialize.
+        let unpacked: &mut [T; 1024] = unsafe {
+            let dst = std::mem::transmute::<&mut [MaybeUninit<T>; 1024], &mut [T; 1024]>(&mut buf);
+            FoRTrait::unchecked_unfor_pack(bit_width, chunk, ref_value, &mut *dst);
+            dst
+        };
+
+        // Rows covered by this chunk: skip the slice offset in the first chunk and stop at the
+        // array's logical end in the last one.
+        let chunk_base = chunk_idx * 1024;
+        let chunk_offset = if chunk_idx == 0 { offset } else { 0 };
+        let chunk_logical_end = (chunk_base + 1024).min(offset + total_len);
+        let usable = &mut unpacked[chunk_offset..chunk_logical_end - chunk_base];
+
+        // Overlay patches whose row falls inside this chunk.
+        apply_patches_in_range_for::<T>(
+            usable,
+            patch_pairs.as_ref(),
+            local_idx,
+            local_idx + usable.len(),
+            ref_value,
+        );
+
+        encode_primitive_chunk::<T>(
+            usable,
+            local_idx,
+            offsets,
+            cursors,
+            out,
+            mask.as_ref(),
+            non_null,
+            null,
+            descending,
+            value_bytes,
+            stride,
+        );
+        local_idx += usable.len();
+    }
+
+    debug_assert_eq!(local_idx, total_len);
+    Ok(())
+}
+
+/// Overlays patches onto `chunk`, which holds logical rows `chunk_start..chunk_end`.
+///
+/// A patch index `p` targets row `p - patch_offset`; its value is wrapping-added to `ref_value`
+/// before being stored. Indices of any type other than `u8`..`u64` are silently ignored.
+fn apply_patches_in_range_for<T>(
+    chunk: &mut [T],
+    patch_pairs: Option<&(PrimitiveArray, PrimitiveArray, usize)>,
+    chunk_start: usize,
+    chunk_end: usize,
+    ref_value: T,
+) where
+    T: NativePType + WrappingAdd,
+{
+    let Some((indices_p, values_p, patch_offset)) = patch_pairs else {
+        return;
+    };
+    let values: &[T] = values_p.as_slice();
+    let logical_start = chunk_start + *patch_offset;
+    let logical_end = chunk_end + *patch_offset;
+    macro_rules! walk {
+        ($idx_ty:ty) => {{
+            let idx: &[$idx_ty] = indices_p.as_slice();
+            // Patch indices are taken to be ascending (the previous early `break` relied on it
+            // too), so binary-search this chunk's patches instead of rescanning from index 0.
+            let first = idx.partition_point(|&p| (p as usize) < logical_start);
+            let last = idx.partition_point(|&p| (p as usize) < logical_end);
+            for (&raw_idx, value) in idx[first..last].iter().zip(&values[first..last]) {
+                chunk[raw_idx as usize - logical_start] = value.wrapping_add(&ref_value);
+            }
+        }};
+    }
+    match indices_p.ptype() {
+        PType::U64 => walk!(u64),
+        PType::U32 => walk!(u32),
+        PType::U16 => walk!(u16),
+        PType::U8 => walk!(u8),
+        _ => {}
+    }
+}
+
+/// Row-encodes a `FoR` column whose `encoded` child is executed to a `PrimitiveArray`.
+///
+/// Row `i` is written at `offsets[i] + cursors[i]` as a sentinel byte followed by the stored
+/// value wrapping-added to the reference; `cursors[i]` then advances past those bytes.
+fn encode_for_primitive<T>(
+    for_view: vortex_array::ArrayView<'_, FoR>,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()>
+where
+    T: NativePType + PrimRowEncode + WrappingAdd,
+{
+    let encoded = for_view.encoded().clone().execute::<PrimitiveArray>(ctx)?;
+    let rows = encoded.len();
+    let reference: T = for_view
+        .reference_scalar()
+        .as_primitive()
+        .as_::<T>()
+        .vortex_expect("FoR reference cannot be null");
+    let width = size_of::<T>();
+    let stride = (1 + width) as u32;
+    let descending = field.descending;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+
+    let validity = encoded.validity()?;
+    let mask = match &validity {
+        Validity::NonNullable | Validity::AllValid => None,
+        _ => Some(validity.execute_mask(rows, ctx)?),
+    };
+
+    let stored: &[T] = encoded.as_slice();
+    if let Some(m) = mask {
+        for (row, &raw) in stored.iter().enumerate() {
+            let pos = (offsets[row] + cursors[row]) as usize;
+            let payload = pos + 1..pos + 1 + width;
+            if m.value(row) {
+                out[pos] = non_null;
+                raw.wrapping_add(&reference).row_encode_to(&mut out[payload], descending);
+            } else {
+                // Null rows carry the null sentinel followed by zeroed value bytes.
+                out[pos] = null;
+                out[payload].fill(0);
+            }
+            cursors[row] += stride;
+        }
+    } else {
+        for (row, &raw) in stored.iter().enumerate() {
+            let pos = (offsets[row] + cursors[row]) as usize;
+            let val = raw.wrapping_add(&reference);
+            out[pos] = non_null;
+            val.row_encode_to(&mut out[pos + 1..pos + 1 + width], descending);
+            cursors[row] += stride;
+        }
+    }
+    Ok(())
+}
+
+fn for_array_id() -> ArrayId {
+    use vortex_session::registry::CachedId;
+    static ID: CachedId = CachedId::new("fastlanes.for");
+    *ID // NOTE(review): the literal above presumably has to equal `FoR`'s own id - confirm
+}
+
+inventory::submit! {
+    RowEncodeRegistration {
+        id: for_array_id, // encoding id this registration is keyed by
+        size: for_size_contribution, // adds each row's encoded width to `sizes`
+        encode: for_encode_into, // writes the row bytes, or declines with `Ok(None)`
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::ListViewArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::listview::ListViewArrayExt;
+    use vortex_array::scalar::Scalar;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+    use vortex_row::SortField;
+    use vortex_row::convert_columns;
+
+    use crate::BitPackedData;
+    use crate::FoR;
+
+    /// Copies every row of `arr` out as an owned byte vector.
+    fn collect_rows(arr: &ListViewArray) -> Vec<Vec<u8>> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let mut rows = Vec::with_capacity(arr.len());
+        for i in 0..arr.len() {
+            let elements = arr.list_elements_at(i).unwrap();
+            let bytes = elements.execute::<PrimitiveArray>(&mut ctx).unwrap();
+            rows.push(bytes.as_slice::<u8>().to_vec());
+        }
+        rows
+    }
+
+    #[test]
+    fn for_row_encode_matches_canonical_primitive_encoded() -> VortexResult<()> {
+        // A plain primitive `encoded` child (no BitPacked underneath) with reference 100.
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let residuals = buffer![5i64, 6, 7, 8, 9].into_array();
+        let for_column = FoR::try_new(residuals, Scalar::from(100i64))?.into_array();
+        let plain = buffer![105i64, 106, 107, 108, 109].into_array();
+
+        let expected = convert_columns(&[plain], &[SortField::default()], &mut ctx)?;
+        let actual = convert_columns(&[for_column], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&expected), collect_rows(&actual));
+        Ok(())
+    }
+
+    #[test]
+    fn for_row_encode_matches_canonical_bitpacked_encoded() -> VortexResult<()> {
+        // 1500 unsigned rows bit-packed at width 11 under reference 10_000: more than one
+        // 1024-element chunk, with an unsigned reference over a `BitPacked` child.
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let rows: u32 = 1500;
+        let shifted: Vec<u32> = (0..rows).map(|i| 10_000 + i).collect();
+        let plain = PrimitiveArray::from_iter(shifted).into_array();
+        let residuals = PrimitiveArray::from_iter((0..rows).collect::<Vec<u32>>()).into_array();
+        let packed = BitPackedData::encode(&residuals, 11, &mut ctx)?.into_array();
+        let for_column = FoR::try_new(packed, Scalar::from(10_000u32))?.into_array();
+        let expected = convert_columns(&[plain], &[SortField::default()], &mut ctx)?;
+        let actual = convert_columns(&[for_column], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&expected), collect_rows(&actual));
+        Ok(())
+    }
+}
diff --git a/encodings/fastlanes/src/lib.rs b/encodings/fastlanes/src/lib.rs
index 9022b7c4e2b..613a4784356 100644
--- a/encodings/fastlanes/src/lib.rs
+++ b/encodings/fastlanes/src/lib.rs
@@ -20,6 +20,7 @@ mod bitpacking;
 mod delta;
 mod r#for;
 mod rle;
+mod row_encode_common;
 
 pub(crate) const FL_CHUNK_SIZE: usize = 1024;
 
diff --git a/encodings/fastlanes/src/row_encode_common.rs b/encodings/fastlanes/src/row_encode_common.rs
new file mode 100644
index 00000000000..7f44ebc7463
--- /dev/null
+++ b/encodings/fastlanes/src/row_encode_common.rs
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Shared helpers for the FastLanes row-encode kernels (BitPacked, FoR, Delta).
+//!
+//! Each kernel walks the compressed storage in 1024-element chunks, unpacks each chunk into
+//! a stack-local buffer, and writes the row-encoded bytes in one pass. This module defines
+//! the per-row write primitive used after a chunk has been unpacked.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    reason = "row encoding indexes into u32-sized buffers"
+)]
+
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::PType;
+
+/// Trait implemented by primitive types that can be written into a row-encoded byte slot.
+///
+/// Mirrors `vortex_row::codec::RowEncode` for the integer types that show up as the output
+/// of BitPacked/FoR/Delta.
+pub trait PrimRowEncode: Copy {
+    /// Encode this value into `out` (`size_of::<Self>()` bytes), inverted for descending order.
+    fn row_encode_to(self, out: &mut [u8], descending: bool);
+}
+
+macro_rules! impl_unsigned {
+    ($t:ty) => {
+        impl PrimRowEncode for $t {
+            #[inline]
+            fn row_encode_to(self, out: &mut [u8], descending: bool) {
+                // Big-endian bytes of an unsigned integer already compare in value order.
+                let mut bytes = self.to_be_bytes();
+                if descending {
+                    // Complementing every byte reverses that order.
+                    bytes.iter_mut().for_each(|b| *b = !*b);
+                }
+                // Panics unless `out` is exactly `size_of::<$t>()` bytes, in both directions.
+                out.copy_from_slice(&bytes);
+            }
+        }
+    };
+}
+
+macro_rules! impl_signed {
+    ($t:ty) => {
+        impl PrimRowEncode for $t {
+            #[inline]
+            fn row_encode_to(self, out: &mut [u8], descending: bool) {
+                let mut bytes = self.to_be_bytes();
+                // Flip the sign bit so negative values order below non-negative ones bytewise.
+                bytes[0] ^= 0x80;
+                if descending {
+                    // Complementing every byte reverses the order.
+                    bytes.iter_mut().for_each(|b| *b = !*b);
+                }
+                // Panics unless `out` is exactly `size_of::<$t>()` bytes, in both directions.
+                out.copy_from_slice(&bytes);
+            }
+        }
+    };
+}
+
+impl_unsigned!(u8); // unsigned widths: big-endian bytes written as-is
+impl_unsigned!(u16);
+impl_unsigned!(u32);
+impl_unsigned!(u64);
+impl_signed!(i8); // signed widths: top bit of the first byte flipped first
+impl_signed!(i16);
+impl_signed!(i32);
+impl_signed!(i64);
+
+/// Bytes one row occupies for `ptype`: a one-byte sentinel followed by the value bytes.
+#[inline]
+pub fn encoded_size_for_ptype(ptype: PType) -> u32 {
+    ptype.byte_width() as u32 + 1
+}
+
+/// Row-encodes one unpacked chunk: `chunk[j]` belongs to logical row `row_start + j`.
+///
+/// Row `i` is written at `offsets[i] + cursors[i]` as a sentinel byte plus `value_bytes` value
+/// bytes, after which `cursors[i]` is advanced by `stride`. `mask`, when present, marks nulls.
+#[allow(clippy::too_many_arguments)]
+#[inline]
+pub fn encode_primitive_chunk<T: NativePType + PrimRowEncode>(
+    chunk: &[T],
+    row_start: usize,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    mask: Option<&vortex_mask::Mask>,
+    non_null: u8,
+    null: u8,
+    descending: bool,
+    value_bytes: usize,
+    stride: u32,
+) {
+    match mask {
+        Some(validity) => {
+            for (&value, row) in chunk.iter().zip(row_start..) {
+                let pos = (offsets[row] + cursors[row]) as usize;
+                let payload = pos + 1..pos + 1 + value_bytes;
+                if validity.value(row) {
+                    out[pos] = non_null;
+                    value.row_encode_to(&mut out[payload], descending);
+                } else {
+                    // Null rows carry the null sentinel followed by zeroed value bytes.
+                    out[pos] = null;
+                    out[payload].fill(0);
+                }
+                cursors[row] += stride;
+            }
+        }
+        None => {
+            // Without a mask every row is written as valid, so the per-row validity lookup
+            // is skipped entirely.
+            for (&value, row) in chunk.iter().zip(row_start..) {
+                let pos = (offsets[row] + cursors[row]) as usize;
+                out[pos] = non_null;
+                value.row_encode_to(&mut out[pos + 1..pos + 1 + value_bytes], descending);
+                cursors[row] += stride;
+            }
+        }
+    }
+}
diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml
index 01a5b8d7a3e..0ec0302a4a8 100644
--- a/encodings/runend/Cargo.toml
+++ b/encodings/runend/Cargo.toml
@@ -16,6 +16,7 @@ version = { workspace = true }
 [dependencies]
 arbitrary = { workspace = true, optional = true }
 arrow-array = { workspace = true, optional = true }
+inventory = { workspace = true }
 itertools = { workspace = true }
 num-traits = { workspace = true }
 prost = { workspace = true }
@@ -23,6 +24,7 @@ vortex-array = { workspace = true }
 vortex-buffer = { workspace = true }
 vortex-error = { workspace = true }
 vortex-mask = { workspace = true }
+vortex-row = { workspace = true }
 vortex-session = { workspace = true }
 
 [lints]
diff --git a/encodings/runend/src/compute/mod.rs b/encodings/runend/src/compute/mod.rs
index 9bdb6d67c00..2b9ba9092ad 100644
--- a/encodings/runend/src/compute/mod.rs
+++ b/encodings/runend/src/compute/mod.rs
@@ -8,6 +8,7 @@ pub(crate) mod filter;
 pub(crate) mod is_constant;
 pub(crate) mod is_sorted;
 pub(crate) mod min_max;
+mod row_encode;
 pub(crate) mod take;
 pub(crate) mod take_from;
 
diff --git a/encodings/runend/src/compute/row_encode.rs b/encodings/runend/src/compute/row_encode.rs
new file mode 100644
index 00000000000..aa24fd9eea7
--- /dev/null
+++ b/encodings/runend/src/compute/row_encode.rs
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernels for `RunEndArray`.
+//!
+//! Like `Dict`, the per-row size and per-row encoded bytes are determined by the column's
+//! *values*, so we encode each run-value once and broadcast it across the indices in that
+//! run. The per-unique-value cost is amortized over the number of runs rather than the
+//! row count.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers; ends are non-negative"
+)]
+
+use num_traits::AsPrimitive;
+use vortex_array::ArrayId;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::NativePType;
+use vortex_array::match_each_integer_ptype;
+use vortex_error::VortexResult;
+use vortex_row::RowEncodeRegistration;
+use vortex_row::encode::dispatch_encode;
+use vortex_row::options::SortField;
+use vortex_row::size::dispatch_size;
+
+use crate::RunEnd;
+use crate::RunEndArrayExt;
+
+/// Size kernel registered for `RunEnd`: sizes each run value once, then adds that size to
+/// every row of the run.
+fn run_end_size_contribution(
+    column: &ArrayRef,
+    field: SortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<RunEnd>() else {
+        return Ok(None);
+    };
+    // Decline when there are more runs than rows; presumably per-run work would then cost
+    // more than the per-row default path.
+    if view.ends().len() > view.len() {
+        return Ok(None);
+    }
+
+    let mut per_value = vec![0u32; view.values().len()];
+    dispatch_size(view.values(), field, &mut per_value, ctx)?;
+
+    let offset = view.offset() as u64;
+    let len = view.len();
+    let ends_prim = view.ends().clone().execute::<PrimitiveArray>(ctx)?;
+
+    match_each_integer_ptype!(ends_prim.ptype(), |E| {
+        let ends = ends_prim.as_slice::<E>();
+        walk_runs::<E>(ends, offset, len, |run_idx, start, stop| {
+            let add = per_value[run_idx];
+            // Zero-width values leave `sizes` untouched.
+            if add != 0 {
+                sizes[start..stop].iter_mut().for_each(|slot| *slot += add);
+            }
+        });
+    });
+    Ok(Some(()))
+}
+
+/// Encode kernel registered for `RunEnd`: encodes each run value once into a scratch buffer,
+/// then copies those bytes into every row of the run.
+fn run_end_encode_into(
+    column: &ArrayRef,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<RunEnd>() else {
+        return Ok(None);
+    };
+    if view.ends().len() > view.len() {
+        return Ok(None);
+    }
+
+    let n_values = view.values().len();
+    let mut value_sizes = vec![0u32; n_values];
+    dispatch_size(view.values(), field, &mut value_sizes, ctx)?;
+
+    // Pack the encoded values back to back; `value_offsets[k]` is where value `k` starts.
+    let mut value_offsets = Vec::with_capacity(n_values);
+    let mut total: u64 = 0;
+    for &size in &value_sizes {
+        value_offsets.push(total as u32);
+        total += u64::from(size);
+    }
+    let mut value_buf = vec![0u8; total as usize];
+    // Row offsets are all zero, so each cursor alone is the write position of its value.
+    let zero_offsets = vec![0u32; n_values];
+    let mut inner_cursors = value_offsets.clone();
+    dispatch_encode(
+        view.values(),
+        field,
+        &zero_offsets,
+        &mut inner_cursors,
+        &mut value_buf,
+        ctx,
+    )?;
+
+    let offset = view.offset() as u64;
+    let len = view.len();
+    let ends_prim = view.ends().clone().execute::<PrimitiveArray>(ctx)?;
+
+    match_each_integer_ptype!(ends_prim.ptype(), |E| {
+        let ends = ends_prim.as_slice::<E>();
+        walk_runs::<E>(ends, offset, len, |run_idx, start, stop| {
+            let size = value_sizes[run_idx];
+            if size == 0 {
+                return;
+            }
+            let begin = value_offsets[run_idx] as usize;
+            let encoded = &value_buf[begin..begin + size as usize];
+            for row in start..stop {
+                let pos = (offsets[row] + cursors[row]) as usize;
+                out[pos..pos + encoded.len()].copy_from_slice(encoded);
+                cursors[row] += size;
+            }
+        });
+    });
+    Ok(Some(()))
+}
+
+/// Calls `f(run_idx, start, stop)` for every run with a non-empty logical range, where
+/// `[start, stop)` is `[max(prev_end - offset, 0), min(curr_end - offset, len))`.
+#[inline]
+fn walk_runs<E>(ends: &[E], offset: u64, len: usize, mut f: impl FnMut(usize, usize, usize))
+where
+    E: NativePType + AsPrimitive<u64>,
+{
+    // End of the previous run, in the same un-offset coordinates as `ends`.
+    let mut prev_end: u64 = offset;
+    for (run_idx, &raw_end) in ends.iter().enumerate() {
+        let end: u64 = raw_end.as_();
+        // Runs ending at or before `offset` only update `prev_end`.
+        if end > offset {
+            let start = prev_end.saturating_sub(offset) as usize;
+            let stop = ((end - offset) as usize).min(len);
+            if start < stop {
+                f(run_idx, start, stop);
+            }
+            // This run reaches the end of the slice; stop scanning.
+            if stop >= len {
+                break;
+            }
+        }
+        prev_end = end;
+    }
+}
+
+fn run_end_array_id() -> ArrayId {
+    use vortex_session::registry::CachedId;
+    static ID: CachedId = CachedId::new("vortex.runend");
+    *ID // NOTE(review): the literal above presumably has to equal `RunEnd`'s own id - confirm
+}
+
+inventory::submit! {
+    RowEncodeRegistration {
+        id: run_end_array_id, // encoding id this registration is keyed by
+        size: run_end_size_contribution, // adds each row's encoded width to `sizes`
+        encode: run_end_encode_into, // writes the row bytes, or declines with `Ok(None)`
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::ListViewArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::listview::ListViewArrayExt;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+    use vortex_row::SortField;
+    use vortex_row::convert_columns;
+
+    use crate::RunEnd;
+
+    /// Copies every row of `arr` out as an owned byte vector.
+    fn collect_rows(arr: &ListViewArray) -> Vec<Vec<u8>> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let mut rows = Vec::with_capacity(arr.len());
+        for i in 0..arr.len() {
+            let elements = arr.list_elements_at(i).unwrap();
+            let bytes = elements.execute::<PrimitiveArray>(&mut ctx).unwrap();
+            rows.push(bytes.as_slice::<u8>().to_vec());
+        }
+        rows
+    }
+
+    #[test]
+    fn runend_row_encode_matches_canonical() -> VortexResult<()> {
+        // Three runs (1 x3, 2 x2, 3 x4) must encode exactly like the expanded column.
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let plain = buffer![1i32, 1, 1, 2, 2, 3, 3, 3, 3].into_array();
+        let run_end = RunEnd::encode(plain.clone(), &mut ctx)?.into_array();
+
+        let expected = convert_columns(&[plain], &[SortField::default()], &mut ctx)?;
+        let actual = convert_columns(&[run_end], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&expected), collect_rows(&actual));
+        Ok(())
+    }
+}
diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml
new file mode 100644
index 00000000000..ca60a791f98
--- /dev/null
+++ b/vortex-row/Cargo.toml
@@ -0,0 +1,46 @@
+[package]
+name = "vortex-row"
+authors = { workspace = true }
+categories = { workspace = true }
+description = "Row-oriented byte encoder for Vortex arrays, analogous to arrow-row."
+edition = { workspace = true }
+homepage = { workspace = true }
+include = { workspace = true }
+keywords = { workspace = true }
+license = { workspace = true }
+readme = { workspace = true }
+repository = { workspace = true }
+rust-version = { workspace = true }
+version = { workspace = true }
+
+[lints]
+workspace = true
+
+[dependencies]
+bytes = { workspace = true }
+inventory = { workspace = true }
+smallvec = { workspace = true }
+vortex-array = { workspace = true }
+vortex-buffer = { workspace = true }
+vortex-error = { workspace = true }
+vortex-mask = { workspace = true }
+vortex-session = { workspace = true }
+vortex-utils = { workspace = true, features = ["dyn-traits"] }
+
+[dev-dependencies]
+arrow-array = { workspace = true }
+arrow-row = { workspace = true }
+arrow-schema = { workspace = true }
+divan = { workspace = true }
+mimalloc = { workspace = true }
+rand = { workspace = true }
+rstest = { workspace = true }
+vortex-array = { workspace = true, features = ["_test-harness"] }
+# Path-only (no version) so cargo publish doesn't try to look up vortex-fastlanes on
+# crates.io. vortex-fastlanes itself depends on vortex-row, so a versioned dev-dep here
+# would create a publish-time chicken-and-egg. The bench file is the only consumer.
+vortex-fastlanes = { path = "../encodings/fastlanes" }
+
+[[bench]]
+name = "row_encode"
+harness = false
diff --git a/vortex-row/benches/row_encode.rs b/vortex-row/benches/row_encode.rs
new file mode 100644
index 00000000000..5dba51938a0
--- /dev/null
+++ b/vortex-row/benches/row_encode.rs
@@ -0,0 +1,568 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![expect(
+    clippy::unwrap_used,
+    clippy::clone_on_ref_ptr,
+    clippy::cloned_ref_to_slice_refs,
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    clippy::redundant_clone
+)]
+
+//! Row-encode throughput benchmarks comparing `arrow-row` against vortex's `convert_columns`.
+//! Canonical scenarios: a primitive i64 column, a Utf8 column, and a mixed-field struct.
+//! Per-encoding groups (Constant, Dict, Patched, BitPacked, FoR, Delta) each pair an
+//! `arrow-row` baseline with vortex runs on the encoded array and on its canonical form.
+
+use std::sync::Arc;
+
+use arrow_array::DictionaryArray;
+use arrow_array::Int32Array;
+use arrow_array::Int64Array;
+use arrow_array::PrimitiveArray as ArrowPrimitiveArray;
+use arrow_array::StringArray;
+use arrow_array::StructArray as ArrowStructArray;
+use arrow_array::types::Int32Type;
+use arrow_row::RowConverter;
+use arrow_row::SortField as ArrowSortField;
+use arrow_schema::DataType;
+use arrow_schema::Field;
+use divan::counter::BytesCount;
+use mimalloc::MiMalloc;
+use rand::RngExt;
+use rand::SeedableRng;
+use rand::distr::Alphanumeric;
+use rand::rngs::StdRng;
+use vortex_array::Canonical;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::Patched;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
+use vortex_array::arrays::VarBinViewArray;
+use vortex_array::builders::dict::dict_encode;
+use vortex_array::patches::Patches;
+use vortex_array::scalar::Scalar;
+use vortex_fastlanes::BitPackedData;
+use vortex_fastlanes::Delta;
+use vortex_fastlanes::FoR;
+use vortex_row::SortField;
+use vortex_row::convert_columns;
+
+#[global_allocator]
+static GLOBAL: MiMalloc = MiMalloc;
+
+const N: usize = 100_000;
+
+fn main() {
+    divan::main();
+}
+
+fn gen_i64(n: usize, seed: u64) -> Vec<i64> {
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..n)
+        .map(|_| rng.random_range(i64::MIN..i64::MAX))
+        .collect()
+}
+
+fn gen_words(n: usize, mean_len: usize, seed: u64) -> Vec<String> {
+    let rng = &mut StdRng::seed_from_u64(seed);
+    (0..n)
+        .map(|_| {
+            let len = rng.random_range(mean_len.saturating_sub(4)..=mean_len + 4);
+            rng.sample_iter(&Alphanumeric)
+                .take(len)
+                .map(char::from)
+                .collect::<String>()
+        })
+        .collect()
+}
+
+// ---------- primitive_i64 ----------
+
+#[divan::bench]
+fn primitive_i64_arrow_row(bencher: divan::Bencher) {
+    let v = gen_i64(N, 0);
+    let arr = Arc::new(Int64Array::from(v.clone())) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int64)]).unwrap();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher
+        .counter(BytesCount::new(bytes))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn primitive_i64_vortex(bencher: divan::Bencher) {
+    let v = gen_i64(N, 0);
+    let col = PrimitiveArray::from_iter(v.clone()).into_array();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[col.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+// ---------- utf8 ----------
+
+#[divan::bench]
+fn utf8_arrow_row(bencher: divan::Bencher) {
+    let words = gen_words(N, 16, 7);
+    let total: u64 = words
+        .iter()
+        .map(|w| 1 + (w.len().div_ceil(32) * 33) as u64)
+        .sum();
+    let arr = Arc::new(StringArray::from(words.clone())) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Utf8)]).unwrap();
+    bencher
+        .counter(BytesCount::new(total))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn utf8_vortex(bencher: divan::Bencher) {
+    let words = gen_words(N, 16, 7);
+    let total: u64 = words
+        .iter()
+        .map(|w| 1 + (w.len().div_ceil(32) * 33) as u64)
+        .sum();
+    let col = VarBinViewArray::from_iter_str(words.iter().map(String::as_str)).into_array();
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[col.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+// ---------- struct_mixed ----------
+
+fn struct_mixed_inputs() -> (Vec<i64>, Vec<String>, u64) {
+    let ids = gen_i64(N, 1);
+    let names = gen_words(N, 16, 2);
+    // sentinel (1) + i64 (1+8=9) + utf8-name (1 + ceil(len/32)*33)
+    let total: u64 = (0..N)
+        .map(|i| {
+            let name_bytes = 1 + (names[i].len().div_ceil(32) * 33) as u64;
+            1u64 + 9u64 + name_bytes
+        })
+        .sum();
+    (ids, names, total)
+}
+
+#[divan::bench]
+fn struct_mixed_arrow_row(bencher: divan::Bencher) {
+    let (ids, names, total) = struct_mixed_inputs();
+    let id_arr = Arc::new(Int64Array::from(ids)) as arrow_array::ArrayRef;
+    let name_arr = Arc::new(StringArray::from(names)) as arrow_array::ArrayRef;
+    let arrow_struct = Arc::new(ArrowStructArray::from(vec![
+        (Arc::new(Field::new("id", DataType::Int64, false)), id_arr),
+        (
+            Arc::new(Field::new("name", DataType::Utf8, false)),
+            name_arr,
+        ),
+    ])) as arrow_array::ArrayRef;
+    let struct_fields = vec![
+        Arc::new(Field::new("id", DataType::Int64, false)),
+        Arc::new(Field::new("name", DataType::Utf8, false)),
+    ];
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Struct(
+        struct_fields.into(),
+    ))])
+    .unwrap();
+    bencher
+        .counter(BytesCount::new(total))
+        .bench_local(|| conv.convert_columns(&[arrow_struct.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn struct_mixed_vortex(bencher: divan::Bencher) {
+    let (ids, names, total) = struct_mixed_inputs();
+    let id_arr = PrimitiveArray::from_iter(ids).into_array();
+    let name_arr = VarBinViewArray::from_iter_str(names.iter().map(String::as_str)).into_array();
+    let struct_arr = StructArray::from_fields(&[("id", id_arr), ("name", name_arr)])
+        .unwrap()
+        .into_array();
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[struct_arr.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+// ---------- constant_i64 ----------
+
+#[divan::bench]
+fn constant_i64_arrow_row(bencher: divan::Bencher) {
+    let arr = Arc::new(Int64Array::from(vec![42i64; N])) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int64)]).unwrap();
+    let total = (N * (1 + 8)) as u64;
+    bencher
+        .counter(BytesCount::new(total))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn constant_i64_vortex_with_kernel(bencher: divan::Bencher) {
+    let arr = ConstantArray::new(42i64, N).into_array();
+    let total = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[arr.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn constant_i64_vortex_without_kernel(bencher: divan::Bencher) {
+    let arr = ConstantArray::new(42i64, N).into_array();
+    let total = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = arr
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+// ---------- dict_utf8 ----------
+
+fn dict_utf8_inputs() -> (Vec<String>, Vec<String>, Vec<i32>, u64) {
+    let n_unique = 1024usize;
+    let unique = gen_words(n_unique, 16, 13);
+    let mut rng = StdRng::seed_from_u64(17);
+    let codes: Vec<i32> = (0..N)
+        .map(|_| rng.random_range(0..n_unique) as i32)
+        .collect();
+    let strings: Vec<String> = codes.iter().map(|&c| unique[c as usize].clone()).collect();
+    let bytes: u64 = strings
+        .iter()
+        .map(|w| 1 + (w.len().div_ceil(32) * 33) as u64)
+        .sum();
+    (unique, strings, codes, bytes)
+}
+
+#[divan::bench]
+fn dict_utf8_arrow_dict(bencher: divan::Bencher) {
+    let (unique, _, codes, total) = dict_utf8_inputs();
+    let values: Arc<dyn arrow_array::Array> = Arc::new(StringArray::from(unique.clone()));
+    let dict_arr: DictionaryArray<Int32Type> =
+        DictionaryArray::new(ArrowPrimitiveArray::from(codes), values);
+    let arr = Arc::new(dict_arr) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Dictionary(
+        Box::new(DataType::Int32),
+        Box::new(DataType::Utf8),
+    ))])
+    .unwrap();
+    bencher
+        .counter(BytesCount::new(total))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn dict_utf8_arrow_canonical(bencher: divan::Bencher) {
+    let (_, strings, _, total) = dict_utf8_inputs();
+    let arr = Arc::new(StringArray::from(strings.clone())) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Utf8)]).unwrap();
+    bencher
+        .counter(BytesCount::new(total))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn dict_utf8_vortex_with_kernel(bencher: divan::Bencher) {
+    let (_, strings, _, total) = dict_utf8_inputs();
+    let raw = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array();
+    let dict = dict_encode(&raw).unwrap().into_array();
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[dict.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn dict_utf8_vortex_without_kernel(bencher: divan::Bencher) {
+    let (_, strings, _, total) = dict_utf8_inputs();
+    let raw = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array();
+    let dict = dict_encode(&raw).unwrap().into_array();
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = dict
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+// ---------- patched_i32 ----------
+
+/// Dense i32 input for the arrow baseline: zeros with ~5% of slots overwritten (seed 400).
+fn gen_patched_i32_inputs() -> (Vec<i32>, Vec<i32>, u64) {
+    let mut rng = StdRng::seed_from_u64(400);
+    let mut inner = vec![0i32; N];
+    let mut values = Vec::new();
+    for slot in &mut inner {
+        if rng.random_range(0u32..100) < 5 {
+            let v = rng.random_range(1i32..1_000_000);
+            *slot = v;
+            values.push(v);
+        }
+    }
+    let bytes = (N * (1 + 4)) as u64;
+    (inner, values, bytes)
+}
+
+#[divan::bench]
+fn patched_i32_arrow_row(bencher: divan::Bencher) {
+    let (inner, _, bytes) = gen_patched_i32_inputs();
+    let arr = Arc::new(Int32Array::from(inner)) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int32)]).unwrap();
+    bencher
+        .counter(BytesCount::new(bytes))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+/// Builds the `Patched` benchmark input and the byte count (`N * (1 + 4)`) fed to divan.
+///
+/// Draws from the same seed, in the same order, as `gen_patched_i32_inputs`.
+fn patched_i32_array() -> (vortex_array::ArrayRef, u64) {
+    let mut rng = StdRng::seed_from_u64(400);
+    let mut indices: Vec<u32> = Vec::new();
+    let mut values: Vec<i32> = Vec::new();
+    for i in 0..N {
+        if rng.random_range(0u32..100) < 5 {
+            let v = rng.random_range(1i32..1_000_000);
+            indices.push(i as u32);
+            values.push(v);
+        }
+    }
+    let inner_arr = PrimitiveArray::from_iter(vec![0i32; N]).into_array();
+    let patches = Patches::new(
+        N,
+        0,
+        PrimitiveArray::from_iter(indices).into_array(),
+        PrimitiveArray::from_iter(values).into_array(),
+        None,
+    )
+    .unwrap();
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let patched = Patched::from_array_and_patches(inner_arr, &patches, &mut setup_ctx)
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 4)) as u64;
+    (patched, bytes)
+}
+
+#[divan::bench]
+fn patched_i32_with_kernel(bencher: divan::Bencher) {
+    let (arr, bytes) = patched_i32_array();
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[arr.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn patched_i32_without_kernel(bencher: divan::Bencher) {
+    let (arr, bytes) = patched_i32_array();
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = arr
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+// ---------- bitpacked_i32 ----------
+
+fn gen_bitpacked_i32_values(n: usize, seed: u64) -> Vec<i32> {
+    // Small non-negative integers in 0..=255 so they bit-pack to 8 bits without patches.
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..n).map(|_| rng.random_range(0i32..256)).collect()
+}
+
+#[divan::bench]
+fn bitpacked_i32_arrow_row(bencher: divan::Bencher) {
+    let v = gen_bitpacked_i32_values(N, 100);
+    let arr = Arc::new(Int32Array::from(v.clone())) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int32)]).unwrap();
+    let bytes = (N * (1 + 4)) as u64;
+    bencher
+        .counter(BytesCount::new(bytes))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn bitpacked_i32_with_kernel(bencher: divan::Bencher) {
+    let v = gen_bitpacked_i32_values(N, 100);
+    let raw = PrimitiveArray::from_iter(v.clone()).into_array();
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let bp = BitPackedData::encode(&raw, 8, &mut setup_ctx)
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 4)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[bp.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn bitpacked_i32_without_kernel(bencher: divan::Bencher) {
+    let v = gen_bitpacked_i32_values(N, 100);
+    let raw = PrimitiveArray::from_iter(v.clone()).into_array();
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let bp = BitPackedData::encode(&raw, 8, &mut setup_ctx)
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 4)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = bp
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+// ---------- for_i64 ----------
+
+fn gen_for_i64_values(n: usize, seed: u64) -> Vec<i64> {
+    // Tightly clustered values around a base — FoR will store small deltas.
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..n)
+        .map(|_| 1_000_000_000i64 + rng.random_range(0i64..65536))
+        .collect()
+}
+
+#[divan::bench]
+fn for_i64_arrow_row(bencher: divan::Bencher) {
+    let v = gen_for_i64_values(N, 200);
+    let arr = Arc::new(Int64Array::from(v)) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int64)]).unwrap();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher
+        .counter(BytesCount::new(bytes))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+/// Benchmarks `convert_columns` directly on a FoR-over-BitPacked array (no canonicalize step).
+#[divan::bench]
+fn for_i64_with_kernel(bencher: divan::Bencher) {
+    let v = gen_for_i64_values(N, 200);
+    // Build a FoR around a BitPacked array. The encoded form is `value - reference`,
+    // bit-packed at width 17 (16 would already cover the 0..65536 delta range); the same
+    // width is used by `for_i64_without_kernel` so both benches see an identical array.
+    let deltas: Vec<u64> = v.iter().map(|&x| (x - 1_000_000_000) as u64).collect();
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let bp = BitPackedData::encode(
+        &PrimitiveArray::from_iter(deltas).into_array(),
+        17,
+        &mut setup_ctx,
+    )
+    .unwrap()
+    .into_array();
+    let arr = FoR::try_new(bp, Scalar::from(1_000_000_000u64))
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[arr.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn for_i64_without_kernel(bencher: divan::Bencher) {
+    let v = gen_for_i64_values(N, 200);
+    let deltas: Vec<u64> = v.iter().map(|&x| (x - 1_000_000_000) as u64).collect();
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let bp = BitPackedData::encode(
+        &PrimitiveArray::from_iter(deltas).into_array(),
+        17,
+        &mut setup_ctx,
+    )
+    .unwrap()
+    .into_array();
+    let arr = FoR::try_new(bp, Scalar::from(1_000_000_000u64))
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = arr
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+// ---------- delta_i64 ----------
+
+fn gen_delta_i64_values(n: usize, seed: u64) -> Vec<i64> {
+    // Monotonic-ish sequence with small step sizes (a common shape for sorted IDs / timestamps).
+    let mut rng = StdRng::seed_from_u64(seed);
+    let mut v = Vec::with_capacity(n);
+    let mut cur: i64 = 1_700_000_000_000;
+    for _ in 0..n {
+        cur = cur.wrapping_add(rng.random_range(1i64..100));
+        v.push(cur);
+    }
+    v
+}
+
+#[divan::bench]
+fn delta_i64_arrow_row(bencher: divan::Bencher) {
+    let v = gen_delta_i64_values(N, 300);
+    let arr = Arc::new(Int64Array::from(v)) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int64)]).unwrap();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher
+        .counter(BytesCount::new(bytes))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn delta_i64_with_kernel(bencher: divan::Bencher) {
+    let v = gen_delta_i64_values(N, 300);
+    let p = PrimitiveArray::from_iter(v.clone());
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let arr = Delta::try_from_primitive_array(&p, &mut setup_ctx)
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[arr.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn delta_i64_without_kernel(bencher: divan::Bencher) {
+    let v = gen_delta_i64_values(N, 300);
+    let p = PrimitiveArray::from_iter(v);
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let arr = Delta::try_from_primitive_array(&p, &mut setup_ctx)
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = arr
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock
new file mode 100644
index 00000000000..ecc834cca19
--- /dev/null
+++ b/vortex-row/public-api.lock
@@ -0,0 +1,501 @@
+pub mod vortex_row
+
+pub mod vortex_row::codec
+
+pub enum vortex_row::codec::RowWidth
+
+pub vortex_row::codec::RowWidth::Fixed(u32)
+
+pub vortex_row::codec::RowWidth::Variable
+
+impl core::clone::Clone for vortex_row::codec::RowWidth
+
+pub fn vortex_row::codec::RowWidth::clone(&self) -> vortex_row::codec::RowWidth
+
+impl core::cmp::Eq for vortex_row::codec::RowWidth
+
+impl core::cmp::PartialEq for vortex_row::codec::RowWidth
+
+pub fn vortex_row::codec::RowWidth::eq(&self, &vortex_row::codec::RowWidth) -> bool
+
+impl core::fmt::Debug for vortex_row::codec::RowWidth
+
+pub fn vortex_row::codec::RowWidth::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::marker::Copy for vortex_row::codec::RowWidth
+
+impl core::marker::StructuralPartialEq for vortex_row::codec::RowWidth
+
+pub const vortex_row::codec::BOOL_ENCODED_SIZE: u32
+
+pub const vortex_row::codec::VARLEN_BLOCK_SIZE: usize
+
+pub const vortex_row::codec::VARLEN_BLOCK_TOTAL: usize
+
+pub trait vortex_row::codec::RowEncode: core::marker::Copy
+
+pub fn vortex_row::codec::RowEncode::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for f32
+
+pub fn f32::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for f64
+
+pub fn f64::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for half::binary16::f16
+
+pub fn half::binary16::f16::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for i128
+
+pub fn i128::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for i16
+
+pub fn i16::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for i32
+
+pub fn i32::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for i64
+
+pub fn i64::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for i8
+
+pub fn i8::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for u16
+
+pub fn u16::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for u32
+
+pub fn u32::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for u64
+
+pub fn u64::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for u8
+
+pub fn u8::encode_to(self, &mut [u8], bool)
+
+pub fn vortex_row::codec::encode_scalar(&vortex_array::scalar::Scalar, vortex_row::options::SortField) -> vortex_error::VortexResult<bytes::bytes::Bytes>
+
+pub fn vortex_row::codec::encode_scalar_bool(core::option::Option<bool>, vortex_row::options::SortField, &mut vortex_buffer::ByteBufferMut)
+
+pub fn vortex_row::codec::encode_scalar_null(vortex_row::options::SortField, bool, &mut vortex_buffer::ByteBufferMut)
+
+pub fn vortex_row::codec::encode_scalar_primitive(vortex_array::dtype::ptype::PType, vortex_array::scalar::typed_view::primitive::pvalue::PValue, vortex_row::options::SortField, bool, &mut vortex_buffer::ByteBufferMut) -> vortex_error::VortexResult<()>
+
+pub fn vortex_row::codec::encode_scalar_varlen(core::option::Option<&[u8]>, vortex_row::options::SortField, &mut vortex_buffer::ByteBufferMut)
+
+pub fn vortex_row::codec::encoded_size_for_scalar(&vortex_array::scalar::Scalar, vortex_row::options::SortField) -> vortex_error::VortexResult<u32>
+
+pub fn vortex_row::codec::field_encode(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
+
+pub fn vortex_row::codec::field_encode_fixed_arithmetic(&vortex_array::canonical::Canonical, vortex_row::options::SortField, u32, u32, core::option::Option<&[u32]>, u32, &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
+
+pub fn vortex_row::codec::field_size(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
+
+pub fn vortex_row::codec::row_width_for_dtype(&vortex_array::dtype::DType) -> vortex_error::VortexResult<vortex_row::codec::RowWidth>
+
+pub mod vortex_row::convert
+
+pub fn vortex_row::convert::compute_row_sizes(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::convert::convert_columns(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::arrays::listview::vtable::ListViewArray>
+
+pub mod vortex_row::encode
+
+pub struct vortex_row::encode::RowEncode
+
+impl core::clone::Clone for vortex_row::encode::RowEncode
+
+pub fn vortex_row::encode::RowEncode::clone(&self) -> vortex_row::encode::RowEncode
+
+impl core::fmt::Debug for vortex_row::encode::RowEncode
+
+pub fn vortex_row::encode::RowEncode::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::encode::RowEncode
+
+pub type vortex_row::encode::RowEncode::Options = vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::encode::RowEncode::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity
+
+pub fn vortex_row::encode::RowEncode::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName
+
+pub fn vortex_row::encode::RowEncode::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult<Self::Options>
+
+pub fn vortex_row::encode::RowEncode::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::encode::RowEncode::id(&self) -> vortex_array::scalar_fn::ScalarFnId
+
+pub fn vortex_row::encode::RowEncode::is_fallible(&self, &Self::Options) -> bool
+
+pub fn vortex_row::encode::RowEncode::is_null_sensitive(&self, &Self::Options) -> bool
+
+pub fn vortex_row::encode::RowEncode::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<vortex_array::dtype::DType>
+
+pub fn vortex_row::encode::RowEncode::serialize(&self, &Self::Options) -> vortex_error::VortexResult<core::option::Option<alloc::vec::Vec<u8>>>
+
+pub trait vortex_row::encode::RowEncodeKernel: vortex_array::array::vtable::VTable
+
+pub fn vortex_row::encode::RowEncodeKernel::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::constant::vtable::Constant
+
+pub fn vortex_array::arrays::constant::vtable::Constant::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::dict::vtable::Dict
+
+pub fn vortex_array::arrays::dict::vtable::Dict::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::patched::vtable::Patched
+
+pub fn vortex_array::arrays::patched::vtable::Patched::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+pub fn vortex_row::encode::dispatch_encode(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
+
+pub mod vortex_row::options
+
+pub struct vortex_row::options::RowEncodeOptions
+
+pub vortex_row::options::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]>
+
+impl vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::new(impl core::iter::traits::collect::IntoIterator<Item = vortex_row::options::SortField>) -> Self
+
+impl core::clone::Clone for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::clone(&self) -> vortex_row::options::RowEncodeOptions
+
+impl core::cmp::Eq for vortex_row::options::RowEncodeOptions
+
+impl core::cmp::PartialEq for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::eq(&self, &vortex_row::options::RowEncodeOptions) -> bool
+
+impl core::fmt::Debug for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::fmt::Display for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::hash::Hash for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&self, &mut __H)
+
+impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions
+
+pub struct vortex_row::options::SortField
+
+pub vortex_row::options::SortField::descending: bool
+
+pub vortex_row::options::SortField::nulls_first: bool
+
+impl vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::new(bool, bool) -> Self
+
+pub fn vortex_row::options::SortField::non_null_sentinel(&self) -> u8
+
+pub fn vortex_row::options::SortField::null_sentinel(&self) -> u8
+
+impl core::clone::Clone for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::clone(&self) -> vortex_row::options::SortField
+
+impl core::cmp::Eq for vortex_row::options::SortField
+
+impl core::cmp::PartialEq for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::eq(&self, &vortex_row::options::SortField) -> bool
+
+impl core::default::Default for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::default() -> Self
+
+impl core::fmt::Debug for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::fmt::Display for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::hash::Hash for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::hash<__H: core::hash::Hasher>(&self, &mut __H)
+
+impl core::marker::Copy for vortex_row::options::SortField
+
+impl core::marker::StructuralPartialEq for vortex_row::options::SortField
+
+pub const vortex_row::options::FIELDS_INLINE: usize
+
+pub mod vortex_row::registry
+
+pub struct vortex_row::registry::RowEncodeRegistration
+
+pub vortex_row::registry::RowEncodeRegistration::encode: vortex_row::registry::DynEncodeFn
+
+pub vortex_row::registry::RowEncodeRegistration::id: fn() -> vortex_array::array::ArrayId
+
+pub vortex_row::registry::RowEncodeRegistration::size: vortex_row::registry::DynSizeFn
+
+impl inventory::Collect for vortex_row::registry::RowEncodeRegistration
+
+pub fn vortex_row::registry::lookup(&vortex_array::array::ArrayId) -> core::option::Option<(vortex_row::registry::DynSizeFn, vortex_row::registry::DynEncodeFn)>
+
+pub type vortex_row::registry::DynEncodeFn = fn(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+pub type vortex_row::registry::DynSizeFn = fn(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+pub mod vortex_row::size
+
+pub struct vortex_row::size::RowSize
+
+impl core::clone::Clone for vortex_row::size::RowSize
+
+pub fn vortex_row::size::RowSize::clone(&self) -> vortex_row::size::RowSize
+
+impl core::fmt::Debug for vortex_row::size::RowSize
+
+pub fn vortex_row::size::RowSize::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::size::RowSize
+
+pub type vortex_row::size::RowSize::Options = vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::size::RowSize::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity
+
+pub fn vortex_row::size::RowSize::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName
+
+pub fn vortex_row::size::RowSize::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult<Self::Options>
+
+pub fn vortex_row::size::RowSize::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::size::RowSize::id(&self) -> vortex_array::scalar_fn::ScalarFnId
+
+pub fn vortex_row::size::RowSize::is_fallible(&self, &Self::Options) -> bool
+
+pub fn vortex_row::size::RowSize::is_null_sensitive(&self, &Self::Options) -> bool
+
+pub fn vortex_row::size::RowSize::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<vortex_array::dtype::DType>
+
+pub fn vortex_row::size::RowSize::serialize(&self, &Self::Options) -> vortex_error::VortexResult<core::option::Option<alloc::vec::Vec<u8>>>
+
+pub trait vortex_row::size::RowSizeKernel: vortex_array::array::vtable::VTable
+
+pub fn vortex_row::size::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::constant::vtable::Constant
+
+pub fn vortex_array::arrays::constant::vtable::Constant::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::dict::vtable::Dict
+
+pub fn vortex_array::arrays::dict::vtable::Dict::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::patched::vtable::Patched
+
+pub fn vortex_array::arrays::patched::vtable::Patched::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+pub fn vortex_row::size::dispatch_size(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
+
+pub struct vortex_row::RowEncode
+
+impl core::clone::Clone for vortex_row::encode::RowEncode
+
+pub fn vortex_row::encode::RowEncode::clone(&self) -> vortex_row::encode::RowEncode
+
+impl core::fmt::Debug for vortex_row::encode::RowEncode
+
+pub fn vortex_row::encode::RowEncode::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::encode::RowEncode
+
+pub type vortex_row::encode::RowEncode::Options = vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::encode::RowEncode::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity
+
+pub fn vortex_row::encode::RowEncode::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName
+
+pub fn vortex_row::encode::RowEncode::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult<Self::Options>
+
+pub fn vortex_row::encode::RowEncode::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::encode::RowEncode::id(&self) -> vortex_array::scalar_fn::ScalarFnId
+
+pub fn vortex_row::encode::RowEncode::is_fallible(&self, &Self::Options) -> bool
+
+pub fn vortex_row::encode::RowEncode::is_null_sensitive(&self, &Self::Options) -> bool
+
+pub fn vortex_row::encode::RowEncode::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<vortex_array::dtype::DType>
+
+pub fn vortex_row::encode::RowEncode::serialize(&self, &Self::Options) -> vortex_error::VortexResult<core::option::Option<alloc::vec::Vec<u8>>>
+
+pub struct vortex_row::RowEncodeOptions
+
+pub vortex_row::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]>
+
+impl vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::new(impl core::iter::traits::collect::IntoIterator<Item = vortex_row::options::SortField>) -> Self
+
+impl core::clone::Clone for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::clone(&self) -> vortex_row::options::RowEncodeOptions
+
+impl core::cmp::Eq for vortex_row::options::RowEncodeOptions
+
+impl core::cmp::PartialEq for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::eq(&self, &vortex_row::options::RowEncodeOptions) -> bool
+
+impl core::fmt::Debug for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::fmt::Display for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::hash::Hash for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&self, &mut __H)
+
+impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions
+
+pub struct vortex_row::RowEncodeRegistration
+
+pub vortex_row::RowEncodeRegistration::encode: vortex_row::registry::DynEncodeFn
+
+pub vortex_row::RowEncodeRegistration::id: fn() -> vortex_array::array::ArrayId
+
+pub vortex_row::RowEncodeRegistration::size: vortex_row::registry::DynSizeFn
+
+impl inventory::Collect for vortex_row::registry::RowEncodeRegistration
+
+pub struct vortex_row::RowSize
+
+impl core::clone::Clone for vortex_row::size::RowSize
+
+pub fn vortex_row::size::RowSize::clone(&self) -> vortex_row::size::RowSize
+
+impl core::fmt::Debug for vortex_row::size::RowSize
+
+pub fn vortex_row::size::RowSize::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::size::RowSize
+
+pub type vortex_row::size::RowSize::Options = vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::size::RowSize::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity
+
+pub fn vortex_row::size::RowSize::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName
+
+pub fn vortex_row::size::RowSize::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult<Self::Options>
+
+pub fn vortex_row::size::RowSize::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::size::RowSize::id(&self) -> vortex_array::scalar_fn::ScalarFnId
+
+pub fn vortex_row::size::RowSize::is_fallible(&self, &Self::Options) -> bool
+
+pub fn vortex_row::size::RowSize::is_null_sensitive(&self, &Self::Options) -> bool
+
+pub fn vortex_row::size::RowSize::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<vortex_array::dtype::DType>
+
+pub fn vortex_row::size::RowSize::serialize(&self, &Self::Options) -> vortex_error::VortexResult<core::option::Option<alloc::vec::Vec<u8>>>
+
+pub struct vortex_row::SortField
+
+pub vortex_row::SortField::descending: bool
+
+pub vortex_row::SortField::nulls_first: bool
+
+impl vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::new(bool, bool) -> Self
+
+pub fn vortex_row::options::SortField::non_null_sentinel(&self) -> u8
+
+pub fn vortex_row::options::SortField::null_sentinel(&self) -> u8
+
+impl core::clone::Clone for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::clone(&self) -> vortex_row::options::SortField
+
+impl core::cmp::Eq for vortex_row::options::SortField
+
+impl core::cmp::PartialEq for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::eq(&self, &vortex_row::options::SortField) -> bool
+
+impl core::default::Default for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::default() -> Self
+
+impl core::fmt::Debug for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::fmt::Display for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::hash::Hash for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::hash<__H: core::hash::Hasher>(&self, &mut __H)
+
+impl core::marker::Copy for vortex_row::options::SortField
+
+impl core::marker::StructuralPartialEq for vortex_row::options::SortField
+
+pub trait vortex_row::RowEncodeKernel: vortex_array::array::vtable::VTable
+
+pub fn vortex_row::RowEncodeKernel::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::constant::vtable::Constant
+
+pub fn vortex_array::arrays::constant::vtable::Constant::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::dict::vtable::Dict
+
+pub fn vortex_array::arrays::dict::vtable::Dict::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::patched::vtable::Patched
+
+pub fn vortex_array::arrays::patched::vtable::Patched::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+pub trait vortex_row::RowSizeKernel: vortex_array::array::vtable::VTable
+
+pub fn vortex_row::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::constant::vtable::Constant
+
+pub fn vortex_array::arrays::constant::vtable::Constant::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::dict::vtable::Dict
+
+pub fn vortex_array::arrays::dict::vtable::Dict::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::patched::vtable::Patched
+
+pub fn vortex_array::arrays::patched::vtable::Patched::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+pub fn vortex_row::compute_row_sizes(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::convert_columns(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::arrays::listview::vtable::ListViewArray>
+
+pub fn vortex_row::initialize(&vortex_session::VortexSession)
diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
new file mode 100644
index 00000000000..4138a0d55b8
--- /dev/null
+++ b/vortex-row/src/codec.rs
@@ -0,0 +1,1535 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::expect_used,
+    reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32 elsewhere"
+)]
+
+//! Pure byte-encoding kernels for row-oriented output, operating on `Canonical` variants.
+//!
+//! The encoded byte format produces a lexicographically byte-comparable representation:
+//! comparing the byte slices of two encoded rows yields the same ordering as the
+//! original logical (tuple) comparison of their values, modulo nulls placement and
+//! descending-ness as configured by [`SortField`].
+//!
+//! Conventions:
+//! - Every value is preceded by a 1-byte sentinel that orders nulls relative to non-nulls.
+//! - For `descending`, only the **value** bytes are bit-inverted (XOR with 0xFF), not the
+//!   sentinel.
+//! - Fixed-width integers are big-endian, with the sign bit flipped for signed types.
+//! - Floats are bit-pattern big-endian with sign-aware mask: non-negative flips the top
+//!   bit; negative flips all bits.
+//!
+//! This module covers the Null, Bool, Primitive, Decimal, VarBinView (Utf8/Binary), Struct,
+//! FixedSizeList and Extension canonical variants; List is not yet supported.
+
+use vortex_array::Canonical;
+use vortex_array::ExecutionCtx;
+use vortex_array::accessor::ArrayAccessor;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::DecimalArray;
+use vortex_array::arrays::ExtensionArray;
+use vortex_array::arrays::FixedSizeListArray;
+use vortex_array::arrays::NullArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
+use vortex_array::arrays::VarBinViewArray;
+use vortex_array::arrays::extension::ExtensionArrayExt;
+use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt;
+use vortex_array::arrays::struct_::StructArrayExt;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::DecimalType;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::PType;
+use vortex_array::dtype::half::f16;
+use vortex_array::match_each_native_ptype;
+use vortex_array::validity::Validity;
+use vortex_buffer::ByteBufferMut;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+
+use crate::options::SortField;
+
+/// Encoded size in bytes of one bool row: 1 sentinel byte followed by 1 content byte.
+pub const BOOL_ENCODED_SIZE: u32 = 2;
+
+/// Number of payload bytes per block in the variable-length encoding.
+pub const VARLEN_BLOCK_SIZE: usize = 32;
+/// Total bytes per varlen block including the trailing continuation marker.
+pub const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1;
+
+/// Encoded size in bytes of a variable-length value whose payload is `len` bytes long.
+#[inline]
+fn encoded_size_for_varlen(len: usize) -> u32 {
+    // Layout: 1 sentinel byte, then either 1 zero terminator byte (empty value) or
+    // ceil(len / VARLEN_BLOCK_SIZE) blocks of VARLEN_BLOCK_TOTAL bytes each.
+    let body = match len {
+        0 => 1,
+        _ => (len.div_ceil(VARLEN_BLOCK_SIZE) as u32) * (VARLEN_BLOCK_TOTAL as u32),
+    };
+    1 + body
+}
+
+/// Constant per-row encoded size of a fixed-width value: `value_bytes` plus 1 sentinel byte.
+#[inline]
+const fn encoded_size_for_fixed(value_bytes: u32) -> u32 {
+    value_bytes + 1
+}
+
+/// Pre-resolved per-row validity for the row encoders, produced by [`resolve_validity`].
+///
+/// Encoders match on this once, outside their per-row loop: the `AllValid` arm needs no
+/// per-row `mask.value(i)` check, and the `Mask` arm reuses one materialized mask for
+/// every row.
+pub(crate) enum ValidityKind {
+    /// The column's validity was `Validity::NonNullable` or `Validity::AllValid`; no mask
+    /// is materialized.
+    AllValid,
+    /// Any other validity; holds the per-row mask (`mask.value(i)` is true for valid rows).
+    Mask(vortex_mask::Mask),
+}
+
+/// Resolve a [`Validity`] into a [`ValidityKind`]. The per-row mask is only materialized
+/// (via `execute_mask`) when the validity is neither `NonNullable` nor `AllValid`.
+#[inline]
+pub(crate) fn resolve_validity(
+    validity: Validity,
+    len: usize,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ValidityKind> {
+    if matches!(validity, Validity::NonNullable | Validity::AllValid) {
+        return Ok(ValidityKind::AllValid);
+    }
+    Ok(ValidityKind::Mask(validity.execute_mask(len, ctx)?))
+}
+
+/// Per-row width classification for a column, as returned by [`row_width_for_dtype`].
+///
+/// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless
+/// of null-ness or value. `Variable` means per-row sizes depend on the data (Utf8/Binary,
+/// List, or any composite that recurses through a variable-width field).
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum RowWidth {
+    /// Every row of the column encodes to this many bytes, sentinel included.
+    Fixed(u32),
+    /// Per-row width is data-dependent.
+    Variable,
+}
+
+/// Classify a column's per-row encoded width by inspecting only its [`DType`].
+///
+/// Returns `Fixed(w)` when every row encodes to exactly `w` bytes (sentinel + value),
+/// regardless of null-ness or value. Returns `Variable` when per-row sizes depend on the
+/// data: Utf8/Binary, List, or a FixedSizeList/Struct that contains a variable-width type.
+/// Fixed widths are accumulated with saturating arithmetic, so they cap at `u32::MAX`.
+///
+/// Classification does not depend on the [`SortField`]: null-vs-non-null encoding width is
+/// the same for fixed-width types (the sentinel byte plus zero-fill for nulls).
+///
+/// NOTE(review): Decimal is sized from the dtype's smallest value type, whereas
+/// `add_size_decimal` uses the array's `values_type()` — confirm the two always agree.
+///
+/// # Errors
+///
+/// Returns an error for dtypes the row encoder does not support: Variant (no well-defined
+/// ordering) and Union.
+pub fn row_width_for_dtype(dtype: &DType) -> VortexResult<RowWidth> {
+    let width = match dtype {
+        DType::Null => RowWidth::Fixed(1),
+        DType::Bool(_) => RowWidth::Fixed(BOOL_ENCODED_SIZE),
+        DType::Primitive(ptype, _) => {
+            RowWidth::Fixed(encoded_size_for_fixed(ptype.byte_width() as u32))
+        }
+        DType::Decimal(dt, _) => {
+            let value_type = DecimalType::smallest_decimal_value_type(dt);
+            RowWidth::Fixed(encoded_size_for_fixed(value_type.byte_width() as u32))
+        }
+        // Per-row size depends on the data.
+        DType::Utf8(_) | DType::Binary(_) | DType::List(..) => RowWidth::Variable,
+        // FSL is fixed iff its element type is fixed: one sentinel byte for the list
+        // itself plus `n` copies of the element width.
+        DType::FixedSizeList(elem, n, _) => match row_width_for_dtype(elem)? {
+            RowWidth::Fixed(w) => RowWidth::Fixed(w.saturating_mul(*n).saturating_add(1)),
+            RowWidth::Variable => RowWidth::Variable,
+        },
+        DType::Struct(fields, _) => {
+            // Struct is fixed iff all its fields are fixed: outer sentinel plus field widths.
+            let mut total: u32 = 1;
+            for field_dtype in fields.fields() {
+                let RowWidth::Fixed(w) = row_width_for_dtype(&field_dtype)? else {
+                    return Ok(RowWidth::Variable);
+                };
+                total = total.saturating_add(w);
+            }
+            RowWidth::Fixed(total)
+        }
+        // Extension types take the width of their storage dtype.
+        DType::Extension(ext) => return row_width_for_dtype(ext.storage_dtype()),
+        DType::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+        DType::Union(_) => vortex_bail!("row encoding does not support Union arrays"),
+    };
+    Ok(width)
+}
+
+/// Compute the per-row size in bytes for the given canonical view, adding into `sizes`.
+///
+/// `sizes` is expected to be initialized (typically zeroed). This function *adds* the
+/// per-row size to each entry so multiple columns can accumulate into the same buffer.
+///
+/// # Errors
+///
+/// Returns an error for `List` (not yet supported) and `Variant` (no well-defined ordering)
+/// canonical variants, and propagates any error from the per-variant sizing helpers.
+pub fn field_size(
+    canonical: &Canonical,
+    field: SortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match canonical {
+        Canonical::Null(arr) => add_size_null(arr, sizes),
+        Canonical::Bool(_) => add_size_const(sizes, BOOL_ENCODED_SIZE),
+        Canonical::Primitive(arr) => add_size_primitive(arr, sizes),
+        Canonical::Decimal(arr) => add_size_decimal(arr, sizes),
+        Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?,
+        Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?,
+        Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?,
+        Canonical::Extension(arr) => add_size_extension(arr, field, sizes, ctx)?,
+        Canonical::List(_) => vortex_bail!(
+            "row encoding does not yet support canonical type {:?}",
+            canonical.dtype()
+        ),
+        Canonical::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+    }
+    Ok(())
+}
+
+/// Encode each row's bytes for the given canonical view at arithmetic offsets, without
+/// reading or writing any per-row cursor. This is the fast path for fixed-width columns
+/// that appear before any varlen column in the row.
+///
+/// For row `i`, the column's bytes are written at `out[i * row_stride + var_prefix[i] +
+/// col_prefix ..]`, where `var_prefix` is the exclusive prefix sum of varlen contributions
+/// (`None` when the row layout has no varlen columns). `canonical` must be a fixed-width
+/// type whose per-column slot has exactly `width` bytes per row (sentinel + value).
+///
+/// # Errors
+///
+/// Returns an error if `canonical` is not a supported fixed-width canonical variant.
+#[allow(clippy::too_many_arguments)]
+pub fn field_encode_fixed_arithmetic(
+    canonical: &Canonical,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match canonical {
+        // Variants without a fixed-width encoding cannot take the arithmetic path.
+        Canonical::VarBinView(_) | Canonical::List(_) | Canonical::Variant(_) => {
+            vortex_bail!(
+                "field_encode_fixed_arithmetic called on non-fixed canonical type {:?}",
+                canonical.dtype()
+            )
+        }
+        // Extension arrays are encoded through their canonicalized storage array.
+        Canonical::Extension(arr) => {
+            let storage = arr.storage_array().clone().execute::<Canonical>(ctx)?;
+            field_encode_fixed_arithmetic(
+                &storage, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+            )?;
+        }
+        Canonical::Null(arr) => {
+            encode_null_arith(arr, field, col_prefix, row_stride, var_prefix, out)
+        }
+        Canonical::Bool(arr) => {
+            encode_bool_arith(arr, field, col_prefix, row_stride, var_prefix, out, ctx)?
+        }
+        Canonical::Primitive(arr) => encode_primitive_arith(
+            arr, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+        )?,
+        Canonical::Decimal(arr) => encode_decimal_arith(
+            arr, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+        )?,
+        Canonical::Struct(arr) => encode_struct_arith(
+            arr, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+        )?,
+        Canonical::FixedSizeList(arr) => encode_fsl_arith(
+            arr, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+        )?,
+    }
+    Ok(())
+}
+
+/// Write this column's encoded bytes for every row into `out` via per-row cursors.
+///
+/// Row `i` is written starting at `offsets[i] + cursors[i]`. After a successful call,
+/// `cursors[i]` has advanced by the number of bytes written, which is exactly the per-row
+/// contribution [`field_size`] computes for the same column. Returns an error for `List`
+/// (not yet supported) and `Variant` (no well-defined ordering) canonical variants.
+pub fn field_encode(
+    canonical: &Canonical,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match canonical {
+        Canonical::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+        Canonical::List(_) => vortex_bail!(
+            "row encoding does not yet support canonical type {:?}",
+            canonical.dtype()
+        ),
+        Canonical::Null(arr) => encode_null(arr, field, offsets, cursors, out),
+        Canonical::Bool(arr) => encode_bool(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Struct(arr) => encode_struct(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Extension(arr) => encode_extension(arr, field, offsets, cursors, out, ctx)?,
+    }
+    Ok(())
+}
+
+/// Add the constant `add` to every entry of `sizes` (one entry per row).
+/// Used for columns whose per-row encoded size does not depend on the data.
+fn add_size_const(sizes: &mut [u32], add: u32) {
+    sizes.iter_mut().for_each(|slot| *slot += add);
+}
+
+/// Add the size of a null column to `sizes`: each row is a single sentinel byte.
+fn add_size_null(arr: &NullArray, sizes: &mut [u32]) {
+    debug_assert_eq!(arr.len(), sizes.len());
+    sizes.iter_mut().for_each(|slot| {
+        *slot += 1;
+    });
+}
+
+/// Add the fixed per-row size of a primitive column (sentinel + ptype byte width) to `sizes`.
+fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) {
+    add_size_const(sizes, encoded_size_for_fixed(arr.ptype().byte_width() as u32));
+}
+
+/// Add the fixed per-row size of a decimal column (sentinel + `values_type` width) to `sizes`.
+fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) {
+    add_size_const(sizes, encoded_size_for_fixed(arr.values_type().byte_width() as u32));
+}
+
+/// Add each row's variable-length encoded size to `sizes`: valid rows contribute
+/// `encoded_size_for_varlen(len)`, null rows only the 1-byte sentinel.
+fn add_size_varbinview(
+    arr: &VarBinViewArray,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let views = arr.views();
+    let validity = resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)?;
+    match validity {
+        ValidityKind::Mask(mask) => {
+            for (row, view) in views.iter().enumerate() {
+                let len = view.len() as usize;
+                sizes[row] += if mask.value(row) { encoded_size_for_varlen(len) } else { 1 };
+            }
+        }
+        ValidityKind::AllValid => {
+            for (row, view) in views.iter().enumerate() {
+                sizes[row] += encoded_size_for_varlen(view.len() as usize);
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Add a struct column's per-row size to `sizes`: one sentinel byte for the struct itself
+/// plus the size of every child yielded by `iter_unmasked_fields()`, using the same `field`.
+fn add_size_struct(
+    arr: &StructArray,
+    field: SortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // The struct's own sentinel: one byte per row.
+    sizes.iter_mut().for_each(|slot| *slot += 1);
+    // Every child field then accumulates its own per-row size into the same buffer.
+    for column in arr.iter_unmasked_fields() {
+        let column = column.clone().execute::<Canonical>(ctx)?;
+        field_size(&column, field, sizes, ctx)?;
+    }
+    Ok(())
+}
+
+/// Add a fixed-size-list column's per-row size to `sizes`: one sentinel byte plus the
+/// sizes of the row's `list_size` elements (summed with saturating arithmetic).
+fn add_size_fsl(
+    arr: &FixedSizeListArray,
+    field: SortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let rows = arr.len();
+    debug_assert_eq!(rows, sizes.len());
+    let per_row = arr.list_size() as usize;
+    let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+    debug_assert_eq!(elements.len(), rows * per_row);
+    // Size all elements into a flat scratch buffer, then reduce it row by row.
+    let mut scratch = vec![0u32; rows * per_row];
+    field_size(&elements, field, &mut scratch, ctx)?;
+    for row in 0..rows {
+        let start = row * per_row;
+        let row_total = scratch[start..start + per_row]
+            .iter()
+            .fold(1u32, |acc, &elem| acc.saturating_add(elem)); // 1 = list sentinel
+        sizes[row] += row_total;
+    }
+    Ok(())
+}
+
+/// Size an extension column by delegating to its canonicalized storage array.
+fn add_size_extension(
+    arr: &ExtensionArray,
+    field: SortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    field_size(&arr.storage_array().clone().execute::<Canonical>(ctx)?, field, sizes, ctx)
+}
+
+/// Write the null sentinel for every row and advance that row's cursor in `col_offset` by 1.
+fn encode_null(
+    arr: &NullArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+) {
+    let null_byte = field.null_sentinel();
+    (0..arr.len()).for_each(|row| {
+        out[(row_offsets[row] + col_offset[row]) as usize] = null_byte;
+        col_offset[row] += 1;
+    });
+}
+
+/// Encode a bool column: each row gets a sentinel byte followed by one content byte (`0x01`
+/// for false, `0x02` for true so false < true; bit-inverted when descending; `0` for nulls).
+fn encode_bool(
+    arr: &BoolArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let bits = arr.clone().into_bit_buffer();
+    let valid_sentinel = field.non_null_sentinel();
+    let flip: u8 = if field.descending { 0xFF } else { 0x00 };
+    let content = |bit: bool| (u8::from(bit) + 1) ^ flip;
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::Mask(mask) => {
+            let null_sentinel = field.null_sentinel();
+            for row in 0..bits.len() {
+                let at = (row_offsets[row] + col_offset[row]) as usize;
+                let (head, body) = if mask.value(row) {
+                    (valid_sentinel, content(bits.value(row)))
+                } else {
+                    (null_sentinel, 0)
+                };
+                out[at] = head;
+                out[at + 1] = body;
+                col_offset[row] += BOOL_ENCODED_SIZE;
+            }
+        }
+        ValidityKind::AllValid => {
+            for row in 0..bits.len() {
+                let at = (row_offsets[row] + col_offset[row]) as usize;
+                out[at] = valid_sentinel;
+                out[at + 1] = content(bits.value(row));
+                col_offset[row] += BOOL_ENCODED_SIZE;
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Row-encodes a primitive column by dispatching on its `PType`.
+///
+/// The work is done by `encode_primitive_typed::<T>` for the native type `T` selected
+/// from `arr.ptype()`; its result is returned unchanged.
+fn encode_primitive(
+    arr: &PrimitiveArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match_each_native_ptype!(arr.ptype(), |T| {
+        encode_primitive_typed::<T>(arr, field, row_offsets, col_offset, out, ctx)
+    })
+}
+
+/// Row-encodes a primitive column whose native element type is `T`.
+///
+/// Every row occupies one sentinel byte plus `size_of::<T>()` payload bytes. Valid rows
+/// carry the non-null sentinel and the `RowEncode` bytes of the value; null rows carry
+/// the null sentinel and a zeroed payload. Each row's cursor in `col_offset` is advanced
+/// by `encoded_size_for_fixed(size_of::<T>())`.
+///
+/// # Errors
+/// Propagates failures from reading or resolving the array's validity.
+fn encode_primitive_typed<T: NativePType + RowEncode>(
+    arr: &PrimitiveArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let values: &[T] = arr.as_slice();
+    let width = size_of::<T>();
+    let advance = encoded_size_for_fixed(width as u32);
+    let valid_byte = field.non_null_sentinel();
+
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for (row, value) in values.iter().copied().enumerate() {
+                let tag_at = (row_offsets[row] + col_offset[row]) as usize;
+                out[tag_at] = valid_byte;
+                value.encode_to(&mut out[tag_at + 1..tag_at + 1 + width], field.descending);
+                col_offset[row] += advance;
+            }
+        }
+        ValidityKind::Mask(validity) => {
+            let null_byte = field.null_sentinel();
+            for (row, value) in values.iter().copied().enumerate() {
+                let tag_at = (row_offsets[row] + col_offset[row]) as usize;
+                let payload = tag_at + 1..tag_at + 1 + width;
+                if validity.value(row) {
+                    out[tag_at] = valid_byte;
+                    value.encode_to(&mut out[payload], field.descending);
+                } else {
+                    // Null rows keep the fixed width; their payload is all zeros.
+                    out[tag_at] = null_byte;
+                    out[payload].fill(0);
+                }
+                col_offset[row] += advance;
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Row-encodes a decimal column by dispatching on its physical value type.
+///
+/// The validity mask is materialized once and shared with `encode_decimal_typed`, which
+/// is instantiated for the integer type backing the array's values.
+///
+/// # Errors
+/// Fails if the validity cannot be resolved, or if the values are stored as 256-bit
+/// integers, which this encoder does not handle.
+fn encode_decimal(
+    arr: &DecimalArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let validity = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+
+    // Every supported width makes the same call; only the native type differs.
+    macro_rules! dispatch {
+        ($native:ty) => {
+            encode_decimal_typed::<$native>(arr, &validity, field, row_offsets, col_offset, out)
+        };
+    }
+
+    match arr.values_type() {
+        DecimalType::I8 => dispatch!(i8),
+        DecimalType::I16 => dispatch!(i16),
+        DecimalType::I32 => dispatch!(i32),
+        DecimalType::I64 => dispatch!(i64),
+        DecimalType::I128 => dispatch!(i128),
+        DecimalType::I256 => vortex_bail!("row encoding for Decimal256 is not yet implemented"),
+    }
+    Ok(())
+}
+
+/// Row-encodes decimal values stored as the native integer type `T`.
+///
+/// Each row takes one sentinel byte plus `size_of::<T>()` payload bytes: the
+/// `RowEncode` bytes of the value when `mask` marks the row valid, zeros otherwise.
+/// The row's cursor in `col_offset` is advanced by `encoded_size_for_fixed` of that width.
+fn encode_decimal_typed<T>(
+    arr: &DecimalArray,
+    mask: &vortex_mask::Mask,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+) where
+    T: vortex_array::dtype::NativeDecimalType + RowEncode,
+{
+    let valid_byte = field.non_null_sentinel();
+    let null_byte = field.null_sentinel();
+    let width = size_of::<T>();
+    let advance = encoded_size_for_fixed(width as u32);
+    let values = arr.buffer::<T>();
+
+    for row in 0..values.len() {
+        let tag_at = (row_offsets[row] + col_offset[row]) as usize;
+        let payload = tag_at + 1..tag_at + 1 + width;
+        if mask.value(row) {
+            out[tag_at] = valid_byte;
+            values[row].encode_to(&mut out[payload], field.descending);
+        } else {
+            // Null rows keep the fixed width; their payload is all zeros.
+            out[tag_at] = null_byte;
+            out[payload].fill(0);
+        }
+        col_offset[row] += advance;
+    }
+}
+
+/// Row-encodes a variable-length (`VarBinView`) column.
+///
+/// For every row a sentinel byte is written at `row_offsets[i] + col_offset[i]`. Valid
+/// rows are followed by the output of `encode_varlen_value` for the row's bytes and the
+/// cursor advances by `1 + written`; null rows consist of the null sentinel only and the
+/// cursor advances by one byte.
+///
+/// # Errors
+/// Propagates failures from reading or resolving the array's validity.
+fn encode_varbinview(
+    arr: &VarBinViewArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let non_null = field.non_null_sentinel();
+    let descending = field.descending;
+    let views = arr.views();
+    let n_buffers = arr.data_buffers().len();
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            // Cache data-buffer slices once. For inlined views (len <= 12), bytes live
+            // inside the view itself.
+            let buffers: smallvec::SmallVec<[&[u8]; 4]> =
+                (0..n_buffers).map(|i| arr.buffer(i).as_slice()).collect();
+            for (i, view) in views.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                out[pos] = non_null;
+                let len = view.len() as usize;
+                // NOTE: no `unsafe` is involved here; both reads below are ordinary
+                // bounds-checked accesses. Inlined views keep their bytes inside the
+                // view (read through `as_inlined().value()`); other views reference
+                // `offset..offset + len` of the data buffer selected by `buffer_index`.
+                // NOTE(review): the inlined value is presumably exactly `len` bytes
+                // long; confirm against the `BinaryView` API.
+                let bytes: &[u8] = if view.is_inlined() {
+                    view.as_inlined().value()
+                } else {
+                    let r = view.as_view();
+                    let off = r.offset as usize;
+                    &buffers[r.buffer_index as usize][off..off + len]
+                };
+                // The value body starts right after the sentinel byte.
+                let written = encode_varlen_value(bytes, &mut out[pos + 1..], descending);
+                col_offset[i] += 1 + written;
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            let null = field.null_sentinel();
+            arr.with_iterator(|iter| {
+                for (i, maybe) in iter.enumerate() {
+                    let pos = (row_offsets[i] + col_offset[i]) as usize;
+                    if !mask.value(i) {
+                        // Null rows are the sentinel alone: no value body follows.
+                        out[pos] = null;
+                        col_offset[i] += 1;
+                        continue;
+                    }
+                    // A row the mask calls valid but the iterator yields as `None` is
+                    // encoded like an empty value.
+                    let bytes: &[u8] = maybe.unwrap_or(&[]);
+                    out[pos] = non_null;
+                    let written = encode_varlen_value(bytes, &mut out[pos + 1..], descending);
+                    col_offset[i] += 1 + written;
+                }
+            });
+        }
+    }
+    Ok(())
+}
+
+/// Row-encodes a struct column: one sentinel byte per row, then every child field.
+///
+/// The encoding proceeds in three passes:
+///   1. write the struct-level sentinel (null or non-null) and step each row's cursor
+///      past it, remembering where the row's body begins;
+///   2. canonicalize each child and let `field_encode` append its bytes for every row,
+///      using the child's own validity and ignoring the struct's;
+///   3. overwrite the body of every null struct row with zeros, so a null row's bytes
+///      depend only on its sentinel and the length of its body.
+///
+/// # Errors
+/// Propagates failures from resolving validity, canonicalizing a child, or encoding it.
+fn encode_struct(
+    arr: &StructArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let rows = arr.len();
+    let validity = arr.as_ref().validity()?.execute_mask(rows, ctx)?;
+    let valid_byte = field.non_null_sentinel();
+    let null_byte = field.null_sentinel();
+
+    // Pass 1: sentinels. `body_start` is each row's cursor just past its sentinel.
+    let body_start: Vec<u32> = col_offset[..rows].iter().map(|&cursor| cursor + 1).collect();
+    for row in 0..rows {
+        let tag_at = (row_offsets[row] + col_offset[row]) as usize;
+        out[tag_at] = if validity.value(row) { valid_byte } else { null_byte };
+        col_offset[row] += 1;
+    }
+
+    // Pass 2: children append to every row, null struct rows included.
+    for child in arr.iter_unmasked_fields() {
+        let canonical = child.clone().execute::<Canonical>(ctx)?;
+        field_encode(&canonical, field, row_offsets, col_offset, out, ctx)?;
+    }
+
+    // Pass 3: blank out whatever the children wrote for null struct rows.
+    for row in (0..rows).filter(|&row| !validity.value(row)) {
+        let from = (row_offsets[row] + body_start[row]) as usize;
+        let to = (row_offsets[row] + col_offset[row]) as usize;
+        out[from..to].fill(0);
+    }
+
+    Ok(())
+}
+
+/// Row-encodes a fixed-size-list column: one sentinel byte per row, then the row's
+/// `list_size` elements back to back.
+///
+/// The flattened elements are encoded in a single `field_encode` call. To make that
+/// possible every element is treated as its own "row": its absolute start offset is
+/// derived from the parent row's cursor plus the sizes (from `field_size`) of the
+/// elements before it, and it gets a private cursor starting at zero. Afterwards the
+/// parent cursors are advanced by the per-row element byte totals and the bodies of
+/// null list rows are zeroed.
+///
+/// # Errors
+/// Propagates failures from resolving validity, canonicalizing the elements, sizing
+/// them, or encoding them.
+fn encode_fsl(
+    arr: &FixedSizeListArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let rows = arr.len();
+    let list_size = arr.list_size() as usize;
+    let validity = arr.as_ref().validity()?.execute_mask(rows, ctx)?;
+    let valid_byte = field.non_null_sentinel();
+    let null_byte = field.null_sentinel();
+    let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+    debug_assert_eq!(elements.len(), rows * list_size);
+
+    // Sentinels first; `body_start` remembers where each row's body begins.
+    let body_start: Vec<u32> = col_offset[..rows].iter().map(|&cursor| cursor + 1).collect();
+    for row in 0..rows {
+        let tag_at = (row_offsets[row] + col_offset[row]) as usize;
+        out[tag_at] = if validity.value(row) { valid_byte } else { null_byte };
+        col_offset[row] += 1;
+    }
+
+    // One size slot per flattened element.
+    let mut elem_sizes = vec![0u32; rows * list_size];
+    field_size(&elements, field, &mut elem_sizes, ctx)?;
+
+    // Lay the elements of each row out sequentially from the row's current cursor.
+    let mut elem_offsets = vec![0u32; rows * list_size];
+    for row in 0..rows {
+        let slots = row * list_size..(row + 1) * list_size;
+        let mut next = row_offsets[row] + col_offset[row];
+        for (offset, &size) in elem_offsets[slots.clone()].iter_mut().zip(&elem_sizes[slots]) {
+            *offset = next;
+            next = next.saturating_add(size);
+        }
+    }
+    let mut elem_cursors = vec![0u32; rows * list_size];
+    field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?;
+
+    // Fold the element bytes back into the parent cursors.
+    for row in 0..rows {
+        let consumed = elem_sizes[row * list_size..(row + 1) * list_size]
+            .iter()
+            .fold(0u32, |total, &size| total.saturating_add(size));
+        col_offset[row] = col_offset[row].saturating_add(consumed);
+    }
+
+    // Null list rows keep their width but carry an all-zero body.
+    for row in (0..rows).filter(|&row| !validity.value(row)) {
+        let from = (row_offsets[row] + body_start[row]) as usize;
+        let to = (row_offsets[row] + col_offset[row]) as usize;
+        out[from..to].fill(0);
+    }
+
+    Ok(())
+}
+
+/// Row-encodes an extension column by encoding its storage array.
+///
+/// The storage is canonicalized and passed to `field_encode` with the caller's offsets,
+/// cursors and output buffer, so the extension type adds no bytes of its own.
+///
+/// # Errors
+/// Propagates any failure from canonicalizing the storage or from `field_encode`.
+fn encode_extension(
+    arr: &ExtensionArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let storage_array = arr.storage_array().clone();
+    let canonical_storage = storage_array.execute::<Canonical>(ctx)?;
+    field_encode(&canonical_storage, field, row_offsets, col_offset, out, ctx)
+}
+
+/// Encode a variable-length byte slice into `out` as fixed-size blocks, each followed
+/// by a one-byte continuation marker. Returns the number of bytes written.
+///
+/// Layout (ascending):
+/// - an empty value is the single byte `0x00`;
+/// - otherwise the value is split into blocks of `VARLEN_BLOCK_SIZE` bytes. Every block
+///   except the last is followed by the marker `0xFF`; the last block is zero-padded to
+///   `VARLEN_BLOCK_SIZE` bytes and followed by the number of value bytes it holds
+///   (`1..=VARLEN_BLOCK_SIZE`).
+///
+/// For descending order every byte written (value bytes, padding and markers) is
+/// inverted with `0xFF`.
+///
+/// For the ascending path the hot loop is a `copy_nonoverlapping` of one block plus one
+/// stamped continuation byte, with no per-byte work. For the descending path the hot
+/// loop XORs a block at a time via `xor_copy_block`.
+///
+/// NOTE(review): the pointer arithmetic assumes `VARLEN_BLOCK_TOTAL ==
+/// VARLEN_BLOCK_SIZE + 1` and `VARLEN_BLOCK_SIZE <= 255`; confirm at the constants.
+///
+/// # Panics
+/// Panics if `out` is too short to hold the encoded value. This check is what makes the
+/// raw-pointer writes below sound, so it is enforced in release builds as well.
+fn encode_varlen_value(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 {
+    if bytes.is_empty() {
+        // Single zero terminator (descending flips it to 0xFF).
+        out[0] = if descending { 0xFF } else { 0 };
+        return 1;
+    }
+    let len = bytes.len();
+    let full_blocks = len / VARLEN_BLOCK_SIZE;
+    let partial = len % VARLEN_BLOCK_SIZE;
+    let (full_to_write, partial_block_len) = if partial == 0 {
+        // Length is an exact multiple of the block size. The spec emits
+        // (full_blocks - 1) full blocks with 0xFF continuation, plus a final block whose
+        // continuation byte is the block size itself.
+        (full_blocks - 1, VARLEN_BLOCK_SIZE)
+    } else {
+        (full_blocks, partial)
+    };
+    let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL;
+    // A hard assert, not `debug_assert!`: this is a safe function, so a short `out`
+    // must panic rather than let the unchecked writes below run past the buffer.
+    assert!(
+        out.len() >= total,
+        "row-encoding output too small: need {total} bytes, have {}",
+        out.len()
+    );
+
+    // SAFETY: `out` holds at least `total` bytes (asserted above) and every write below
+    // stays within `(full_to_write + 1)` consecutive `VARLEN_BLOCK_TOTAL`-byte slots.
+    // Reads from `bytes` cover `full_to_write * VARLEN_BLOCK_SIZE + partial_block_len`
+    // bytes, which equals `bytes.len()`. `bytes` and `out` cannot overlap because `out`
+    // is an exclusive borrow.
+    unsafe {
+        let mut src = bytes.as_ptr();
+        let mut dst = out.as_mut_ptr();
+
+        if !descending {
+            // Ascending fast path: full blocks are memcpy + a single 0xFF stamp.
+            for _ in 0..full_to_write {
+                std::ptr::copy_nonoverlapping(src, dst, VARLEN_BLOCK_SIZE);
+                *dst.add(VARLEN_BLOCK_SIZE) = 0xFF;
+                src = src.add(VARLEN_BLOCK_SIZE);
+                dst = dst.add(VARLEN_BLOCK_TOTAL);
+            }
+            // Final block: copy the partial data, zero-pad the tail, write the
+            // length byte as the continuation marker.
+            std::ptr::copy_nonoverlapping(src, dst, partial_block_len);
+            std::ptr::write_bytes(
+                dst.add(partial_block_len),
+                0,
+                VARLEN_BLOCK_SIZE - partial_block_len,
+            );
+            *dst.add(VARLEN_BLOCK_SIZE) = partial_block_len as u8;
+        } else {
+            // Descending: invert all value bytes. u64-stride XOR gives LLVM a
+            // vectorizable inner loop; the tail handles the partial block.
+            for _ in 0..full_to_write {
+                xor_copy_block(src, dst);
+                *dst.add(VARLEN_BLOCK_SIZE) = 0x00; // descending counterpart of 0xFF
+                src = src.add(VARLEN_BLOCK_SIZE);
+                dst = dst.add(VARLEN_BLOCK_TOTAL);
+            }
+            // Final block: XOR-copy the partial data, fill the tail with 0xFF
+            // (which is 0x00 XOR 0xFF), then write the inverted length byte.
+            for i in 0..partial_block_len {
+                *dst.add(i) = *src.add(i) ^ 0xFF;
+            }
+            std::ptr::write_bytes(
+                dst.add(partial_block_len),
+                0xFF,
+                VARLEN_BLOCK_SIZE - partial_block_len,
+            );
+            *dst.add(VARLEN_BLOCK_SIZE) = (partial_block_len as u8) ^ 0xFF;
+        }
+    }
+    total as u32
+}
+
+/// Copies 32 bytes from `src` to `dst`, inverting every bit on the way.
+///
+/// The block is processed as four unaligned 64-bit words, a shape the original author
+/// reports LLVM turns into SIMD on x86.
+///
+/// # Safety
+/// `src` must be valid for 32 bytes of reads and `dst` for 32 bytes of writes, and the
+/// two regions must not overlap. Neither pointer needs any particular alignment.
+#[inline(always)]
+unsafe fn xor_copy_block(src: *const u8, dst: *mut u8) {
+    let src_words = src.cast::<u64>();
+    let dst_words = dst.cast::<u64>();
+    for lane in 0..4 {
+        // SAFETY: lanes 0..4 of 8 bytes each stay inside the 32 bytes the caller
+        // guarantees for both pointers; unaligned accessors are used throughout.
+        unsafe {
+            let word = src_words.add(lane).read_unaligned();
+            dst_words.add(lane).write_unaligned(!word);
+        }
+    }
+}
+
+/// Internal trait for encoding a fixed-width native value into byte slots.
+///
+/// Implementations must produce a sequence of `size_of::<Self>()` bytes that is
+/// lexicographically byte-comparable according to the natural ordering of the type.
+///
+/// The implementations in this file write big-endian bytes, flipping the sign bit for
+/// signed integers and remapping the bit pattern for floats so that unsigned byte-wise
+/// comparison matches numeric order.
+pub trait RowEncode: Copy {
+    /// Encode this value into `out`, inverting the bytes for descending order.
+    ///
+    /// `out` is expected to be exactly `size_of::<Self>()` bytes long: most code paths
+    /// of the implementations in this file use `copy_from_slice`, which panics on any
+    /// length mismatch.
+    fn encode_to(self, out: &mut [u8], descending: bool);
+}
+
+/// Implements `RowEncode` for an unsigned integer type.
+///
+/// Big-endian bytes of an unsigned integer already compare like the number itself, so
+/// they are written as-is for ascending order and bitwise-inverted for descending order.
+macro_rules! impl_row_encode_unsigned {
+    ($t:ty) => {
+        impl RowEncode for $t {
+            #[inline]
+            fn encode_to(self, out: &mut [u8], descending: bool) {
+                let be = self.to_be_bytes();
+                if !descending {
+                    out.copy_from_slice(&be);
+                    return;
+                }
+                // Descending: write the complement of each byte into the leading slots.
+                for (slot, byte) in out[..be.len()].iter_mut().zip(be.iter()) {
+                    *slot = byte ^ 0xFF;
+                }
+            }
+        }
+    };
+}
+
+/// Implements `RowEncode` for a signed integer type.
+///
+/// XORing with the type's `MIN` flips only the sign bit, which maps the signed range
+/// onto an unsigned one in the same order (negatives below non-negatives). The result
+/// is written big-endian, bitwise-inverted when the order is descending.
+macro_rules! impl_row_encode_signed {
+    ($t:ty) => {
+        impl RowEncode for $t {
+            #[inline]
+            fn encode_to(self, out: &mut [u8], descending: bool) {
+                let be = (self ^ <$t>::MIN).to_be_bytes();
+                if !descending {
+                    out.copy_from_slice(&be);
+                    return;
+                }
+                // Descending: write the complement of each byte into the leading slots.
+                for (slot, byte) in out[..be.len()].iter_mut().zip(be.iter()) {
+                    *slot = byte ^ 0xFF;
+                }
+            }
+        }
+    };
+}
+
+// `RowEncode` for the integer types. `i128` has no unsigned counterpart in this list;
+// it is the widest type `encode_decimal` instantiates `encode_decimal_typed` with.
+impl_row_encode_unsigned!(u8);
+impl_row_encode_unsigned!(u16);
+impl_row_encode_unsigned!(u32);
+impl_row_encode_unsigned!(u64);
+impl_row_encode_signed!(i8);
+impl_row_encode_signed!(i16);
+impl_row_encode_signed!(i32);
+impl_row_encode_signed!(i64);
+impl_row_encode_signed!(i128);
+
+/// Order-preserving encoding of `f32` bit patterns.
+///
+/// Non-negative values (sign bit clear) get their sign bit set; negative values have
+/// every bit inverted. Read as a big-endian unsigned integer, the result orders
+/// negatives below positives and larger magnitudes of negatives first.
+impl RowEncode for f32 {
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        // The arithmetic shift smears the sign bit across the word: all ones for
+        // negatives, zero otherwise. OR-ing in the top bit yields the XOR mask.
+        let sign_fill = ((bits as i32) >> 31) as u32;
+        let ordered = bits ^ (sign_fill | 0x8000_0000);
+        let keyed = if descending { !ordered } else { ordered };
+        out.copy_from_slice(&keyed.to_be_bytes());
+    }
+}
+
+/// Order-preserving encoding of `f64` bit patterns.
+///
+/// Non-negative values (sign bit clear) get their sign bit set; negative values have
+/// every bit inverted. Read as a big-endian unsigned integer, the result orders
+/// negatives below positives and larger magnitudes of negatives first.
+impl RowEncode for f64 {
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        // The arithmetic shift smears the sign bit across the word: all ones for
+        // negatives, zero otherwise. OR-ing in the top bit yields the XOR mask.
+        let sign_fill = ((bits as i64) >> 63) as u64;
+        let ordered = bits ^ (sign_fill | 0x8000_0000_0000_0000);
+        let keyed = if descending { !ordered } else { ordered };
+        out.copy_from_slice(&keyed.to_be_bytes());
+    }
+}
+
+/// Order-preserving encoding of `f16` bit patterns.
+///
+/// Non-negative values (sign bit clear) get their sign bit set; negative values have
+/// every bit inverted. Read as a big-endian unsigned integer, the result orders
+/// negatives below positives and larger magnitudes of negatives first.
+impl RowEncode for f16 {
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        // The arithmetic shift smears the sign bit across the word: all ones for
+        // negatives, zero otherwise. OR-ing in the top bit yields the XOR mask.
+        let sign_fill = ((bits as i16) >> 15) as u16;
+        let ordered = bits ^ (sign_fill | 0x8000);
+        let keyed = if descending { !ordered } else { ordered };
+        out.copy_from_slice(&keyed.to_be_bytes());
+    }
+}
+
+/// Encode a single scalar primitive value of a known `PType` into a buffer slot.
+///
+/// Appends one sentinel byte followed by `ptype.byte_width()` payload bytes to `out`:
+/// the `RowEncode` bytes of `value` for a non-null scalar, zeros for a null one. Null
+/// slots are padded so that they occupy the same fixed width as non-null slots, which
+/// is the layout `encode_scalar`, `encode_scalar_bool` and `encoded_size_for_scalar`
+/// use for nulls of fixed-width types.
+///
+/// `value` is ignored when `is_null` is true.
+///
+/// # Errors
+/// Fails if `value` cannot be converted to the native type selected by `ptype`. In
+/// that case nothing is appended to `out`.
+pub fn encode_scalar_primitive(
+    ptype: PType,
+    value: vortex_array::scalar::PValue,
+    field: SortField,
+    is_null: bool,
+    out: &mut ByteBufferMut,
+) -> VortexResult<()> {
+    let width = ptype.byte_width();
+    if is_null {
+        out.push(field.null_sentinel());
+        // Keep the slot fixed-width: a null is the sentinel plus a zeroed payload.
+        for _ in 0..width {
+            out.push(0);
+        }
+        return Ok(());
+    }
+    // Encode into scratch space first so a failed conversion leaves `out` untouched.
+    let mut tmp = [0u8; 16];
+    let buf = &mut tmp[..width];
+    match_each_native_ptype!(
+        ptype,
+        integral: |T| {
+            let v: T = T::try_from(value)?;
+            v.encode_to(buf, field.descending);
+        },
+        floating: |T| {
+            let v: T = T::try_from(value)?;
+            v.encode_to(buf, field.descending);
+        }
+    );
+    out.push(field.non_null_sentinel());
+    out.extend_from_slice(buf);
+    Ok(())
+}
+
+/// Appends the row encoding of one optional variable-length value to `out`.
+///
+/// `None` becomes the null sentinel alone. `Some(bytes)` becomes the non-null sentinel
+/// followed by the block encoding produced by `encode_varlen_value`: a single byte for
+/// an empty value, otherwise one `VARLEN_BLOCK_TOTAL`-byte block per started
+/// `VARLEN_BLOCK_SIZE` bytes of input.
+pub fn encode_scalar_varlen(value: Option<&[u8]>, field: SortField, out: &mut ByteBufferMut) {
+    let Some(bytes) = value else {
+        out.push(field.null_sentinel());
+        return;
+    };
+    out.push(field.non_null_sentinel());
+
+    // Reserve the exact encoded length as zeros, then let the encoder fill it in place.
+    let blocks = bytes.len().div_ceil(VARLEN_BLOCK_SIZE);
+    let needed = if blocks == 0 {
+        1
+    } else {
+        blocks * VARLEN_BLOCK_TOTAL
+    };
+    let body_start = out.len();
+    (0..needed).for_each(|_| out.push(0));
+
+    let written = encode_varlen_value(bytes, &mut out[body_start..], field.descending);
+    debug_assert_eq!(written as usize, needed);
+}
+
+/// Appends the two-byte row encoding of one optional boolean to `out`.
+///
+/// The first byte is the sentinel. The second is `0x01` for `false` and `0x02` for
+/// `true`, inverted with `0xFF` for descending fields, or `0` for a null.
+pub fn encode_scalar_bool(value: Option<bool>, field: SortField, out: &mut ByteBufferMut) {
+    let (tag, payload) = match value {
+        None => (field.null_sentinel(), 0u8),
+        Some(flag) => {
+            let tag = field.non_null_sentinel();
+            let ordered = if flag { 0x02u8 } else { 0x01u8 };
+            let flip = if field.descending { 0xFFu8 } else { 0 };
+            (tag, ordered ^ flip)
+        }
+    };
+    out.push(tag);
+    out.push(payload);
+}
+
+/// Appends the one-byte row encoding of a null-typed value: just the sentinel, chosen
+/// by `is_null`.
+pub fn encode_scalar_null(field: SortField, is_null: bool, out: &mut ByteBufferMut) {
+    let sentinel = if is_null {
+        field.null_sentinel()
+    } else {
+        field.non_null_sentinel()
+    };
+    out.push(sentinel);
+}
+
+/// Returns the number of bytes one row-encoded copy of `scalar` occupies (used for the
+/// Constant fast path).
+///
+/// Null, boolean and primitive scalars have the same size whether or not they are
+/// null. Decimals are sized by the smallest value type for the dtype when null and by
+/// the stored value's type otherwise (falling back to `I128` if no value is present).
+/// Null strings/binaries take a single byte; non-null ones are sized by
+/// `encoded_size_for_varlen` of their byte length.
+///
+/// # Errors
+/// Fails for any other dtype.
+pub fn encoded_size_for_scalar(
+    scalar: &vortex_array::scalar::Scalar,
+    _field: SortField,
+) -> VortexResult<u32> {
+    let is_null = scalar.is_null();
+    let size = match scalar.dtype() {
+        DType::Null => 1,
+        DType::Bool(_) => BOOL_ENCODED_SIZE,
+        DType::Primitive(ptype, _) => encoded_size_for_fixed(ptype.byte_width() as u32),
+        DType::Decimal(dt, _) if is_null => {
+            let vt = DecimalType::smallest_decimal_value_type(dt);
+            encoded_size_for_fixed(vt.byte_width() as u32)
+        }
+        DType::Decimal(..) => {
+            let dec = scalar.as_decimal();
+            let vt = dec
+                .decimal_value()
+                .map(|v| v.decimal_type())
+                .unwrap_or(DecimalType::I128);
+            encoded_size_for_fixed(vt.byte_width() as u32)
+        }
+        // A null string/binary is the sentinel byte alone.
+        DType::Utf8(_) | DType::Binary(_) if is_null => 1,
+        DType::Utf8(_) => {
+            let byte_len = scalar
+                .as_utf8()
+                .value()
+                .map(|s| s.as_str().len())
+                .unwrap_or(0);
+            encoded_size_for_varlen(byte_len)
+        }
+        DType::Binary(_) => {
+            let byte_len = scalar.as_binary().value().map(|b| b.len()).unwrap_or(0);
+            encoded_size_for_varlen(byte_len)
+        }
+        _ => vortex_bail!(
+            "unsupported scalar dtype for row encoding: {}",
+            scalar.dtype()
+        ),
+    };
+    Ok(size)
+}
+
+/// Encode a single scalar value into a fresh `Bytes` buffer.
+///
+/// The buffer is pre-sized with `encoded_size_for_scalar`. Null scalars produce the null
+/// sentinel, followed by zero padding for the fixed-width types (bool, primitive,
+/// decimal). Non-null scalars produce the non-null sentinel followed by the value's
+/// encoding: `encode_scalar_bool`, `encode_scalar_primitive`, `RowEncode` bytes of the
+/// decimal's stored integer, or `encode_scalar_varlen` for strings and binaries.
+///
+/// # Errors
+/// Fails for unsupported dtypes, for 256-bit decimal values, and when a non-null
+/// primitive or decimal scalar carries no value.
+pub fn encode_scalar(
+    scalar: &vortex_array::scalar::Scalar,
+    field: SortField,
+) -> VortexResult<bytes::Bytes> {
+    use vortex_array::scalar::DecimalValue;
+    use vortex_array::scalar::PValue;
+
+    let capacity = encoded_size_for_scalar(scalar, field)? as usize;
+    let mut out = ByteBufferMut::with_capacity(capacity);
+
+    // Encodes a fixed-width integer into a stack scratch array and appends it.
+    macro_rules! push_fixed {
+        ($value:expr, $width:literal) => {{
+            let mut scratch = [0u8; $width];
+            $value.encode_to(&mut scratch, field.descending);
+            out.extend_from_slice(&scratch);
+        }};
+    }
+
+    if scalar.is_null() {
+        match scalar.dtype() {
+            // Sentinel only: these types have no fixed-width payload to pad.
+            DType::Null | DType::Utf8(_) | DType::Binary(_) => out.push(field.null_sentinel()),
+            DType::Bool(_) => {
+                out.push(field.null_sentinel());
+                out.push(0);
+            }
+            DType::Primitive(ptype, _) => {
+                out.push(field.null_sentinel());
+                (0..ptype.byte_width()).for_each(|_| out.push(0));
+            }
+            DType::Decimal(dt, _) => {
+                out.push(field.null_sentinel());
+                let vt = DecimalType::smallest_decimal_value_type(dt);
+                (0..vt.byte_width()).for_each(|_| out.push(0));
+            }
+            _ => vortex_bail!(
+                "unsupported scalar dtype for row encoding: {}",
+                scalar.dtype()
+            ),
+        }
+        return Ok(out.freeze().into_inner());
+    }
+
+    match scalar.dtype() {
+        DType::Null => out.push(field.non_null_sentinel()),
+        DType::Bool(_) => {
+            let flag = scalar.as_bool().value().unwrap_or(false);
+            encode_scalar_bool(Some(flag), field, &mut out);
+        }
+        DType::Primitive(ptype, _) => {
+            let pvalue: PValue = scalar
+                .as_primitive()
+                .pvalue()
+                .ok_or_else(|| vortex_error::vortex_err!("missing primitive value"))?;
+            encode_scalar_primitive(*ptype, pvalue, field, false, &mut out)?;
+        }
+        DType::Decimal(..) => {
+            let dec = scalar.as_decimal();
+            out.push(field.non_null_sentinel());
+            let stored = dec
+                .decimal_value()
+                .ok_or_else(|| vortex_error::vortex_err!("missing decimal value"))?;
+            match stored {
+                DecimalValue::I8(v) => push_fixed!(v, 1),
+                DecimalValue::I16(v) => push_fixed!(v, 2),
+                DecimalValue::I32(v) => push_fixed!(v, 4),
+                DecimalValue::I64(v) => push_fixed!(v, 8),
+                DecimalValue::I128(v) => push_fixed!(v, 16),
+                DecimalValue::I256(_) => {
+                    vortex_bail!("row encoding for Decimal256 is not yet implemented")
+                }
+            }
+        }
+        DType::Utf8(_) => {
+            let text = scalar.as_utf8();
+            let bytes = text.value().map(|s| s.as_str().as_bytes()).unwrap_or(&[]);
+            encode_scalar_varlen(Some(bytes), field, &mut out);
+        }
+        DType::Binary(_) => {
+            let blob = scalar.as_binary();
+            let bytes = blob.value().map(|b| b.as_slice()).unwrap_or(&[]);
+            encode_scalar_varlen(Some(bytes), field, &mut out);
+        }
+        _ => vortex_bail!(
+            "unsupported scalar dtype for row encoding: {}",
+            scalar.dtype()
+        ),
+    }
+    Ok(out.freeze().into_inner())
+}
+
+/// Per-row write start offset for the fixed-arithmetic encode path.
+///
+/// Returns `i * row_stride + col_prefix`, plus `var_prefix[i]` when a per-row variable
+/// prefix is supplied. The arithmetic is carried out in `usize` so that a position past
+/// `u32::MAX` is reported faithfully (and then caught by the caller's bounds check)
+/// instead of overflowing 32-bit intermediate results.
+///
+/// # Panics
+/// Panics if `var_prefix` is `Some` and has no entry for row `i`.
+#[inline]
+fn arith_pos(i: usize, col_prefix: u32, row_stride: u32, var_prefix: Option<&[u32]>) -> usize {
+    let fixed = i * row_stride as usize + col_prefix as usize;
+    fixed + var_prefix.map_or(0, |vp| vp[i] as usize)
+}
+
+/// Writes the null sentinel for every row of a `NullArray` on the fixed-arithmetic path.
+///
+/// Row positions come from `arith_pos` rather than from per-row offset and cursor
+/// arrays; exactly one byte is written per row.
+fn encode_null_arith(
+    arr: &NullArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    out: &mut [u8],
+) {
+    let null_byte = field.null_sentinel();
+    (0..arr.len())
+        .map(|row| arith_pos(row, col_prefix, row_stride, var_prefix))
+        .for_each(|at| out[at] = null_byte);
+}
+
+/// Row-encodes a boolean column on the fixed-arithmetic path.
+///
+/// Each row's two bytes (sentinel, value) are written at the position given by
+/// `arith_pos`. Valid rows store `0x01` for `false` and `0x02` for `true`, inverted
+/// with `0xFF` for descending fields; null rows store the null sentinel and a zero byte.
+///
+/// # Errors
+/// Propagates failures from resolving the array's validity mask.
+fn encode_bool_arith(
+    arr: &BoolArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let validity = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let values = arr.clone().into_bit_buffer();
+    let valid_byte = field.non_null_sentinel();
+    let null_byte = field.null_sentinel();
+    let flip: u8 = if field.descending { 0xFF } else { 0x00 };
+
+    for row in 0..values.len() {
+        let at = arith_pos(row, col_prefix, row_stride, var_prefix);
+        let (tag, payload) = if validity.value(row) {
+            let ordered = if values.value(row) { 0x02u8 } else { 0x01u8 };
+            (valid_byte, ordered ^ flip)
+        } else {
+            (null_byte, 0)
+        };
+        out[at] = tag;
+        out[at + 1] = payload;
+    }
+    Ok(())
+}
+
+/// Row-encodes a primitive column on the fixed-arithmetic path by dispatching on its
+/// `PType` to `encode_primitive_arith_typed::<T>`.
+///
+/// `_width` is accepted but not used here; the typed encoder is not passed it.
+// The argument list mirrors the other `*_arith` encoders plus `_width`, hence the allow.
+#[allow(clippy::too_many_arguments)]
+fn encode_primitive_arith(
+    arr: &PrimitiveArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    _width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match_each_native_ptype!(arr.ptype(), |T| {
+        encode_primitive_arith_typed::<T>(arr, field, col_prefix, row_stride, var_prefix, out, ctx)
+    })
+}
+
+/// Arithmetic-path (cursor-free) encoder for a primitive column of native type `T`.
+/// Row `i`'s slot — one sentinel byte plus `size_of::<T>()` value bytes — starts at
+/// `i * row_stride + col_prefix` (+ `var_prefix[i]` if given); null rows get zero bytes.
+/// Panics if `var_prefix` is `None` and a slot does not fit inside one `row_stride` row.
+#[inline]
+fn encode_primitive_arith_typed<T: NativePType + RowEncode>(
+    arr: &PrimitiveArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let slice: &[T] = arr.as_slice();
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let value_bytes = size_of::<T>();
+    let stride = row_stride as usize;
+    let prefix = col_prefix as usize;
+    let descending = field.descending;
+    // `slot_size` = sentinel + value bytes. This is a compile-time constant per T.
+    let slot_size = 1 + value_bytes;
+
+    if var_prefix.is_none() {
+        // Checked once per column: both fixed-stride loops below rely on this for soundness.
+        assert!(prefix + slot_size <= stride, "row slot exceeds row stride");
+    }
+    // Hot path: contiguous all-valid, no varlen prefix. The chunked write isolates each
+    // row's slot into a fixed-length view, which lets the compiler fold the bounds check
+    // on the inner write — matching `arrow-row::encode_not_null`.
+    if var_prefix.is_none() && mask.all_true() {
+        let chunks = out.chunks_exact_mut(stride);
+        for (chunk, &v) in chunks.zip(slice.iter()) {
+            // SAFETY: `prefix + slot_size <= stride == chunk.len()` was asserted above.
+            let slot_ptr = unsafe { chunk.as_mut_ptr().add(prefix) };
+            // SAFETY: the sentinel byte at `slot_ptr` is inside `chunk` (see above).
+            unsafe { slot_ptr.write(non_null) };
+            // SAFETY: the `value_bytes` bytes after the sentinel are inside `chunk` too.
+            let val_slice =
+                unsafe { std::slice::from_raw_parts_mut(slot_ptr.add(1), value_bytes) };
+            v.encode_to(val_slice, descending);
+        }
+        return Ok(());
+    }
+    if var_prefix.is_none() {
+        let chunks = out.chunks_exact_mut(stride);
+        for (i, (chunk, &v)) in chunks.zip(slice.iter()).enumerate() {
+            // SAFETY: `prefix + slot_size <= stride == chunk.len()` was asserted above.
+            let slot: &mut [u8] =
+                unsafe { chunk.get_unchecked_mut(prefix..prefix + slot_size) };
+            if mask.value(i) {
+                slot[0] = non_null;
+                v.encode_to(&mut slot[1..], descending);
+            } else {
+                slot[0] = null;
+                slot[1..].fill(0);
+            }
+        }
+        return Ok(());
+    }
+    // Mixed (fixed-before-varlen with varlen prefix array).
+    let vp = var_prefix.unwrap_or_else(|| unreachable!());
+    for (i, &v) in slice.iter().enumerate() {
+        let pos = ((i as u32) * row_stride + col_prefix + vp[i]) as usize;
+        if mask.value(i) {
+            out[pos] = non_null;
+            v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], descending);
+        } else {
+            out[pos] = null;
+            out[pos + 1..pos + 1 + value_bytes].fill(0);
+        }
+    }
+    Ok(())
+}
+
+/// Arithmetic-path encoder for a decimal column: materializes the validity mask, then
+/// dispatches on the storage width. 256-bit decimal storage is rejected with an error.
+#[allow(clippy::too_many_arguments)]
+fn encode_decimal_arith(
+    arr: &DecimalArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    _width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let validity = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    match arr.values_type() {
+        DecimalType::I8 => encode_decimal_arith_typed::<i8>(
+            arr, &validity, field, col_prefix, row_stride, var_prefix, out,
+        ),
+        DecimalType::I16 => encode_decimal_arith_typed::<i16>(
+            arr, &validity, field, col_prefix, row_stride, var_prefix, out,
+        ),
+        DecimalType::I32 => encode_decimal_arith_typed::<i32>(
+            arr, &validity, field, col_prefix, row_stride, var_prefix, out,
+        ),
+        DecimalType::I64 => encode_decimal_arith_typed::<i64>(
+            arr, &validity, field, col_prefix, row_stride, var_prefix, out,
+        ),
+        DecimalType::I128 => encode_decimal_arith_typed::<i128>(
+            arr, &validity, field, col_prefix, row_stride, var_prefix, out,
+        ),
+        DecimalType::I256 => vortex_bail!("row encoding for Decimal256 is not yet implemented"),
+    }
+    Ok(())
+}
+
+/// Arithmetic-path encoder for a decimal column whose values are stored as `T`: valid rows
+/// get the non-null sentinel plus the encoded value, null rows the null sentinel plus zeros.
+fn encode_decimal_arith_typed<T>(
+    arr: &DecimalArray,
+    mask: &vortex_mask::Mask,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    out: &mut [u8],
+) where
+    T: vortex_array::dtype::NativeDecimalType + RowEncode,
+{
+    let values = arr.buffer::<T>();
+    let body_len = size_of::<T>();
+    let (valid_tag, null_tag) = (field.non_null_sentinel(), field.null_sentinel());
+    for row in 0..values.len() {
+        let start = arith_pos(row, col_prefix, row_stride, var_prefix);
+        let body = start + 1..start + 1 + body_len;
+        if mask.value(row) {
+            out[start] = valid_tag;
+            values[row].encode_to(&mut out[body], field.descending);
+        } else {
+            out[start] = null_tag;
+            out[body].fill(0);
+        }
+    }
+}
+
+/// Arithmetic-path encoder for a struct column whose fields are all fixed-width.
+///
+/// Each row is one struct-level sentinel byte followed by the fields' fixed-width
+/// encodings back to back. Fields are encoded for every row first; the body of each
+/// null struct row is then overwritten with zeros.
+#[allow(clippy::too_many_arguments)]
+fn encode_struct_arith(
+    arr: &StructArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let rows = arr.len();
+    let validity = arr.as_ref().validity()?.execute_mask(rows, ctx)?;
+    let (valid_tag, null_tag) = (field.non_null_sentinel(), field.null_sentinel());
+
+    // Struct-level sentinel, one per row.
+    for row in 0..rows {
+        let pos = arith_pos(row, col_prefix, row_stride, var_prefix);
+        out[pos] = if validity.value(row) { valid_tag } else { null_tag };
+    }
+
+    // Fields follow the sentinel; each one starts where the previous one ended.
+    let mut field_prefix = col_prefix.saturating_add(1);
+    for child in arr.iter_unmasked_fields() {
+        // Only structs whose fields are all fixed-width are expected on this path.
+        let RowWidth::Fixed(child_width) = row_width_for_dtype(child.dtype())? else {
+            unreachable!("encode_struct_arith called on non-fixed struct")
+        };
+        let canonical = child.clone().execute::<Canonical>(ctx)?;
+        field_encode_fixed_arithmetic(
+            &canonical,
+            field,
+            field_prefix,
+            row_stride,
+            var_prefix,
+            child_width,
+            out,
+            ctx,
+        )?;
+        field_prefix = field_prefix.saturating_add(child_width);
+    }
+
+    // Null struct rows: zero everything after the sentinel.
+    let body_len = width.saturating_sub(1) as usize;
+    if body_len == 0 {
+        return Ok(());
+    }
+    for row in (0..rows).filter(|&row| !validity.value(row)) {
+        let start = arith_pos(row, col_prefix, row_stride, var_prefix) + 1;
+        out[start..start + body_len].fill(0);
+    }
+
+    Ok(())
+}
+
+/// Arithmetic-path encoder for a fixed-size-list column with fixed-width elements.
+///
+/// Each row is one list-level sentinel byte followed by `list_size` element encodings of
+/// `elem_width` bytes each. Elements are written by the cursor-based `field_encode`
+/// using one precomputed absolute offset per element; the body of each null list row is
+/// then overwritten with zeros.
+#[allow(clippy::too_many_arguments)]
+fn encode_fsl_arith(
+    arr: &FixedSizeListArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let rows = arr.len();
+    let per_row = arr.list_size() as usize;
+    let validity = arr.as_ref().validity()?.execute_mask(rows, ctx)?;
+    let (valid_tag, null_tag) = (field.non_null_sentinel(), field.null_sentinel());
+    let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+    debug_assert_eq!(elements.len(), rows * per_row);
+
+    // Only lists whose element type is fixed-width are expected on this path.
+    let RowWidth::Fixed(elem_width) = row_width_for_dtype(elements.dtype())? else {
+        unreachable!("encode_fsl_arith called on non-fixed FSL")
+    };
+
+    // List-level sentinel, one per row.
+    for row in 0..rows {
+        let pos = arith_pos(row, col_prefix, row_stride, var_prefix);
+        out[pos] = if validity.value(row) { valid_tag } else { null_tag };
+    }
+
+    // Give every element its own absolute offset: a row's elements sit right after its
+    // sentinel, `elem_width` bytes apart. This reuses the cursor-based encoder instead of
+    // threading arithmetic offsets through the recursion.
+    let mut elem_offsets = Vec::with_capacity(rows * per_row);
+    for row in 0..rows {
+        let first_elem = arith_pos(row, col_prefix, row_stride, var_prefix) as u32 + 1;
+        elem_offsets.extend((0..per_row).map(|j| first_elem + (j as u32) * elem_width));
+    }
+    // Cursors start at zero: each element is written exactly at its own offset.
+    let mut elem_cursors = vec![0u32; rows * per_row];
+    field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?;
+
+    // Null list rows: zero everything after the sentinel.
+    let body_len = width.saturating_sub(1) as usize;
+    if body_len == 0 {
+        return Ok(());
+    }
+    for row in (0..rows).filter(|&row| !validity.value(row)) {
+        let start = arith_pos(row, col_prefix, row_stride, var_prefix) + 1;
+        out[start..start + body_len].fill(0);
+    }
+
+    Ok(())
+}
diff --git a/vortex-row/src/convert.rs b/vortex-row/src/convert.rs
new file mode 100644
index 00000000000..c3b06d92748
--- /dev/null
+++ b/vortex-row/src/convert.rs
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! User-facing entry point: turn N columnar arrays into one row-encoded `ListView<u8>`.
+
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::scalar_fn::ScalarFnVTable;
+use vortex_array::scalar_fn::VecExecutionArgs;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+
+use crate::encode::RowEncode;
+use crate::options::RowEncodeOptions;
+use crate::options::SortField;
+use crate::size::RowSize;
+
+/// Shared input validation for the public entry points; returns the common row count.
+/// `caller` is the entry point's name and becomes the error-message prefix.
+fn checked_nrows(caller: &str, cols: &[ArrayRef], fields: &[SortField]) -> VortexResult<usize> {
+    if cols.len() != fields.len() {
+        vortex_bail!(
+            "{}: cols.len() ({}) does not match fields.len() ({})",
+            caller,
+            cols.len(),
+            fields.len()
+        );
+    }
+    let Some(first) = cols.first() else {
+        vortex_bail!("{}: at least one column is required", caller);
+    };
+    let nrows = first.len();
+    if let Some((i, col)) = cols.iter().enumerate().find(|(_, col)| col.len() != nrows) {
+        vortex_bail!(
+            "{}: column {} has length {} but expected {}",
+            caller,
+            i,
+            col.len(),
+            nrows
+        );
+    }
+    Ok(nrows)
+}
+
+/// Convert N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose
+/// bytes are lexicographically comparable in the same order as a tuple comparison of the
+/// input values according to `fields`.
+///
+/// Fails if `cols` is empty, differs in length from `fields`, or has unequal column lengths.
+pub fn convert_columns(
+    cols: &[ArrayRef],
+    fields: &[SortField],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ListViewArray> {
+    let nrows = checked_nrows("convert_columns", cols, fields)?;
+    let options = RowEncodeOptions::new(fields.iter().copied());
+    let args = VecExecutionArgs::new(cols.to_vec(), nrows);
+    let result = RowEncode.execute(&options, &args, ctx)?;
+    result.execute::<ListViewArray>(ctx)
+}
+
+/// Compute only the per-row sizes (in bytes) of the row-encoded form for N columns.
+///
+/// Fails if `cols` is empty, differs in length from `fields`, or has unequal column lengths.
+pub fn compute_row_sizes(
+    cols: &[ArrayRef],
+    fields: &[SortField],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ArrayRef> {
+    let nrows = checked_nrows("compute_row_sizes", cols, fields)?;
+    let options = RowEncodeOptions::new(fields.iter().copied());
+    let args = VecExecutionArgs::new(cols.to_vec(), nrows);
+    RowSize.execute(&options, &args, ctx)
+}
diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
new file mode 100644
index 00000000000..82bec7a0f47
--- /dev/null
+++ b/vortex-row/src/encode.rs
@@ -0,0 +1,520 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![allow(
+    clippy::cast_possible_truncation,
+    reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32"
+)]
+
+//! `RowEncode` variadic scalar function: encode N input columns into a single `ListView<u8>`.
+//!
+//! The output's `(elements, offsets, sizes)` triple is built up in a single left-to-right
+//! pass over the input columns. The `sizes` array doubles as the per-row write cursor, so
+//! when the last column finishes encoding, the accumulator is the final array - no separate
+//! conversion step is needed.
+
+use std::sync::Arc;
+
+use vortex_array::ArrayRef;
+use vortex_array::ArrayView;
+use vortex_array::Canonical;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::VTable;
+use vortex_array::arrays::Constant;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::arrays::Primitive;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::dict::Dict;
+use vortex_array::arrays::patched::Patched;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::Nullability;
+use vortex_array::dtype::PType;
+use vortex_array::scalar_fn::Arity;
+use vortex_array::scalar_fn::ChildName;
+use vortex_array::scalar_fn::ExecutionArgs;
+use vortex_array::scalar_fn::ScalarFnId;
+use vortex_array::scalar_fn::ScalarFnVTable;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_buffer::BufferMut;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+use vortex_session::VortexSession;
+
+use crate::codec;
+use crate::options::RowEncodeOptions;
+use crate::options::SortField;
+use crate::options::deserialize_row_encode_options;
+use crate::options::serialize_row_encode_options;
+use crate::registry;
+use crate::size::ColKind;
+use crate::size::compute_sizes;
+
+/// Variadic scalar function that row-encodes N input columns into one `List<u8>`
+/// [`ListViewArray`]; row `i` holds the bytes of `cols[0][i], cols[1][i], ...` in order.
+#[derive(Clone, Debug)]
+pub struct RowEncode;
+
+impl ScalarFnVTable for RowEncode {
+    type Options = RowEncodeOptions;
+
+    fn id(&self) -> ScalarFnId {
+        ScalarFnId::from("vortex.row_encode")
+    }
+
+    fn serialize(&self, options: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
+        Ok(Some(serialize_row_encode_options(options)))
+    }
+
+    fn deserialize(
+        &self,
+        metadata: &[u8],
+        _session: &VortexSession,
+    ) -> VortexResult<Self::Options> {
+        deserialize_row_encode_options(metadata)
+    }
+
+    fn arity(&self, _options: &Self::Options) -> Arity {
+        Arity::Variadic { min: 1, max: None }
+    }
+
+    fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName {
+        ChildName::from(Arc::from(format!("col_{}", child_idx)))
+    }
+
+    fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult<DType> {
+        Ok(DType::List(
+            Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)),
+            Nullability::NonNullable,
+        ))
+    }
+
+    fn execute(
+        &self,
+        options: &Self::Options,
+        args: &dyn ExecutionArgs,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<ArrayRef> {
+        execute_row_encode(options, args, ctx)
+    }
+
+    fn is_null_sensitive(&self, _options: &Self::Options) -> bool {
+        true
+    }
+
+    fn is_fallible(&self, _options: &Self::Options) -> bool {
+        // `execute` returns an error when the encoded output would exceed `u32::MAX` bytes.
+        true
+    }
+}
+
+fn execute_row_encode(
+    options: &RowEncodeOptions,
+    args: &dyn ExecutionArgs,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ArrayRef> {
+    let nrows = args.row_count();
+
+    // ===== Phase 1 of 5: classify + size pass =====
+    let crate::size::SizePassResult {
+        fixed_per_row,
+        var_lengths,
+        col_kinds,
+        first_varlen_idx,
+        columns,
+    } = compute_sizes(options, args, ctx, "RowEncode")?;
+
+    // ===== Phase 2 of 5: totals + buffer =====
+    let var_total: u64 = var_lengths
+        .as_ref()
+        .map_or(0, |v| v.iter().map(|&x| u64::from(x)).sum());
+    let total: u64 = (nrows as u64)
+        .checked_mul(u64::from(fixed_per_row))
+        .and_then(|t| t.checked_add(var_total))
+        .vortex_expect("row-encoded total bytes overflow");
+    if total > u32::MAX as u64 {
+        vortex_bail!("row-encoded output size {} bytes exceeds u32::MAX", total);
+    }
+    let total_len = total as usize;
+
+    let mut out_buf: BufferMut<u8> = BufferMut::with_capacity(total_len);
+    // Every encoder writes every byte in its row range: non-null values are written
+    // directly; null fixed-width slots are sentinel + explicit zero-fill; varlen partial
+    // blocks zero-pad via the encoder's own loop; null struct/FSL bodies are zero-filled
+    // after the child encoders run. So the pre-zero-init of the buffer is redundant;
+    // skipping it saves a memset of `total_len` bytes per call (significant for
+    // varlen-heavy inputs where total_len reaches multiple MB).
+    //
+    // SAFETY: we just allocated `total_len` capacity. By the size-pass + encoder
+    // contract every byte in [0, total_len) is written before the buffer is read out.
+    unsafe { out_buf.set_len(total_len) };
+
+    // ===== Phase 3 of 5: per-row offsets =====
+    // listview_offsets[i] is the absolute byte offset where row `i` begins.
+    // For pure-fixed: i * fixed_per_row.
+    // For mixed: i * fixed_per_row + exclusive prefix sum of var_lengths.
+    //
+    // When fixed-before-varlen columns exist alongside a varlen column, we also build
+    // `var_prefix_for_arith[i] = exclusive cumsum of var_lengths[..i]` and pass it to
+    // the arithmetic encoders so they can compute per-row positions without a cursor.
+    let need_arith_prefix = first_varlen_idx.is_some()
+        && col_kinds.iter().any(|k| {
+            matches!(
+                k,
+                ColKind::Fixed {
+                    before_varlen: true,
+                    ..
+                }
+            )
+        });
+
+    let mut listview_offsets: Vec<u32> = Vec::with_capacity(nrows);
+    let mut var_prefix_for_arith: Option<Vec<u32>> = None;
+    match var_lengths.as_ref() {
+        None => {
+            // Pure-fixed: offsets[i] = i * fixed_per_row. Materialize via a tight
+            // pointer-write loop that LLVM auto-vectorizes; we already validated total
+            // fits in u32 above so the multiplications can't overflow.
+            // SAFETY: capacity is `nrows`; every index in [0, nrows) is written before `set_len`.
+            unsafe {
+                let ptr = listview_offsets.as_mut_ptr();
+                for i in 0..nrows {
+                    ptr.add(i).write((i as u32) * fixed_per_row);
+                }
+                listview_offsets.set_len(nrows);
+            }
+        }
+        Some(v) => {
+            // Mixed path: offsets[i] = i * fixed_per_row + var_prefix[i] where
+            // var_prefix is the exclusive cumsum of varlen lengths. Same raw-pointer
+            // write loop as the pure-fixed branch; the total was validated to fit in
+            // u32 above, so the wrapping arithmetic below never actually wraps.
+            let mut vp: Option<Vec<u32>> = need_arith_prefix.then(|| Vec::with_capacity(nrows));
+            // SAFETY: both vectors reserved `nrows`; the loop writes indices [0, v.len()).
+            // NOTE(review): `set_len(nrows)` assumes `v.len() == nrows` — confirm in the size pass.
+            unsafe {
+                let off_ptr = listview_offsets.as_mut_ptr();
+                let vp_ptr = vp.as_mut().map(|p| p.as_mut_ptr());
+                let mut acc: u32 = 0;
+                for (i, &l) in v.iter().enumerate() {
+                    if let Some(p) = vp_ptr {
+                        p.add(i).write(acc);
+                    }
+                    off_ptr
+                        .add(i)
+                        .write((i as u32).wrapping_mul(fixed_per_row).wrapping_add(acc));
+                    acc = acc.wrapping_add(l);
+                }
+                listview_offsets.set_len(nrows);
+                if let Some(p) = vp.as_mut() {
+                    p.set_len(nrows);
+                }
+            }
+            var_prefix_for_arith = vp;
+        }
+    }
+
+    // Per-row write cursor (also doubles as the ListView `sizes` slot when done).
+    //
+    // The cursor path starts at the first varlen column's `fixed_prefix`, so that
+    // `listview_offsets[i] + cursors[i]` lands at the position of the first cursor-path
+    // column (i.e. after the bytes written by the arithmetic path for the
+    // fixed-before-varlen columns).
+    //
+    // When there are no varlen columns at all, the cursors are seeded with
+    // `fixed_per_row`, which is already the final per-row size in that case.
+    let initial_cursor: u32 = match first_varlen_idx {
+        Some(idx) => match col_kinds[idx] {
+            ColKind::Variable { fixed_prefix } => fixed_prefix,
+            ColKind::Fixed { .. } => unreachable!("first_varlen_idx points to a varlen column"),
+        },
+        None => fixed_per_row,
+    };
+    let mut row_cursors = vec![initial_cursor; nrows];
+
+    // ===== Phase 4 of 5: encode columns =====
+    // Fixed-before-varlen columns take the arithmetic write path (no cursor mutation).
+    // Fixed-after-varlen and varlen columns take the cursor path via `dispatch_encode`,
+    // which advances `row_cursors` as it writes.
+    for (i, col) in columns.iter().enumerate() {
+        match col_kinds[i] {
+            ColKind::Fixed {
+                width,
+                prefix,
+                before_varlen: true,
+            } => {
+                dispatch_encode_fixed_arith(
+                    col,
+                    options.fields[i],
+                    prefix,
+                    fixed_per_row,
+                    var_prefix_for_arith.as_deref(),
+                    width,
+                    &mut out_buf,
+                    ctx,
+                )?;
+            }
+            ColKind::Fixed { .. } | ColKind::Variable { .. } => {
+                dispatch_encode(
+                    col,
+                    options.fields[i],
+                    &listview_offsets,
+                    &mut row_cursors,
+                    &mut out_buf,
+                    ctx,
+                )?;
+            }
+        }
+    }
+
+    // ===== Phase 5 of 5: build ListView output =====
+    let elements = PrimitiveArray::new(out_buf.freeze(), Validity::NonNullable).into_array();
+    let offsets_arr = PrimitiveArray::new(
+        Buffer::<u32>::copy_from(&listview_offsets),
+        Validity::NonNullable,
+    )
+    .into_array();
+    let sizes_arr = PrimitiveArray::new(
+        Buffer::<u32>::copy_from(&row_cursors),
+        Validity::NonNullable,
+    )
+    .into_array();
+    // SAFETY: The encoder constructs `elements`, `offsets_arr`, and `sizes_arr` itself.
+    // - `elements` is a `PrimitiveArray<u8>` of length `total_len`.
+    // - `offsets[i]` is `i * fixed_per_row + var_prefix[i]`, monotonically increasing,
+    //   each value in `0..total_len`.
+    // - `sizes[i]` is the per-row size; `offsets[i] + sizes[i] <= total_len` by
+    //   construction of the buffer.
+    // - Each row's slice is disjoint from every other row's slice.
+    // The constructor's `validate` re-walks every row to verify these invariants; we know
+    // they hold by construction, so we skip that walk.
+    Ok(unsafe {
+        ListViewArray::new_unchecked(elements, offsets_arr, sizes_arr, Validity::NonNullable)
+    }
+    .into_array())
+}
+
+/// Encode one column through the arithmetic (cursor-free) fast path.
+///
+/// Used for fixed-width columns that come before any variable-length column in the row
+/// layout. Their within-row position is `col_prefix + var_prefix[i]` (just `col_prefix`
+/// when there is no varlen prefix), so no per-row cursor has to be read or advanced.
+#[allow(clippy::too_many_arguments)]
+fn dispatch_encode_fixed_arith(
+    col: &ArrayRef,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // Constant columns never need canonicalizing: the one scalar is encoded once and
+    // copied into every row's slot.
+    if let Some(view) = col.as_opt::<Constant>() {
+        return encode_constant_arith(view, field, col_prefix, row_stride, var_prefix, width, out);
+    }
+    // Everything else goes through the codec on a canonical array. A column that is
+    // already a `PrimitiveArray` is wrapped as-is, skipping the `execute` machinery; any
+    // other encoding is canonicalized first.
+    let canonical = if col.as_opt::<Primitive>().is_some()
+        && let Ok(parr) = col.clone().try_downcast::<Primitive>()
+    {
+        Canonical::Primitive(parr)
+    } else {
+        col.clone().execute::<Canonical>(ctx)?
+    };
+    codec::field_encode_fixed_arithmetic(
+        &canonical, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+    )
+}
+
+/// Constant-specific arithmetic writer. Encodes the scalar bytes once, then writes the same
+/// bytes into each per-row slot via direct register-sized stores for the common small
+/// lengths (2/5/9/17), or `copy_nonoverlapping` as a fallback. Empty input is a no-op.
+fn encode_constant_arith(
+    view: ArrayView<'_, Constant>,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    _width: u32,
+    out: &mut [u8],
+) -> VortexResult<()> {
+    let bytes = codec::encode_scalar(view.scalar(), field)?;
+    let len = bytes.len();
+    let n = view.len();
+    if len == 0 || n == 0 {
+        return Ok(());
+    }
+    let stride = row_stride as usize;
+    // Row positions never decrease with `i` (the caller passes an exclusive running sum as
+    // `var_prefix`), so the last row's slot bounds every write below. Checked in debug builds.
+    let last_var = var_prefix.map_or(0, |vp| vp[n - 1] as usize);
+    debug_assert!((n - 1) * stride + col_prefix as usize + last_var + len <= out.len());
+    // SAFETY: the encoded scalar length matches the per-row width this column contributed
+    // to the size pass, so every `pos + len <= out.len()` by buffer construction and each
+    // `add(..)` below stays inside `out`; no pointer is formed beyond a row's own slot.
+    // For the common small lengths (bool=2, i32=5, i64=9, i128=17) the encoded bytes are
+    // hoisted into register-sized loads before the loop and stored with `write_unaligned`,
+    // which avoids a real memcpy call per row for tiny `len`.
+    unsafe {
+        let src = bytes.as_ptr();
+        let base = out.as_mut_ptr();
+        match (var_prefix, len) {
+            // i64-typical: 1 sentinel + 8 value bytes = 9 bytes, no varlen prefix.
+            (None, 9) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u64);
+                let v_hi = *src.add(8);
+                let first = base.add(col_prefix as usize);
+                for i in 0..n {
+                    let dst = first.add(i * stride);
+                    std::ptr::write_unaligned(dst as *mut u64, v_lo);
+                    *dst.add(8) = v_hi;
+                }
+            }
+            // i32-typical: 1 sentinel + 4 value bytes = 5 bytes, no varlen prefix.
+            (None, 5) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u32);
+                let v_hi = *src.add(4);
+                let first = base.add(col_prefix as usize);
+                for i in 0..n {
+                    let dst = first.add(i * stride);
+                    std::ptr::write_unaligned(dst as *mut u32, v_lo);
+                    *dst.add(4) = v_hi;
+                }
+            }
+            // bool / i8: 1 sentinel + 1 value byte = 2 bytes, no varlen prefix.
+            (None, 2) => {
+                let v = std::ptr::read_unaligned(src as *const u16);
+                let first = base.add(col_prefix as usize);
+                for i in 0..n {
+                    let dst = first.add(i * stride);
+                    std::ptr::write_unaligned(dst as *mut u16, v);
+                }
+            }
+            // i128: 1 sentinel + 16 value bytes = 17 bytes, no varlen prefix.
+            (None, 17) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u128);
+                let v_hi = *src.add(16);
+                let first = base.add(col_prefix as usize);
+                for i in 0..n {
+                    let dst = first.add(i * stride);
+                    std::ptr::write_unaligned(dst as *mut u128, v_lo);
+                    *dst.add(16) = v_hi;
+                }
+            }
+            // General fallback for other lengths.
+            (None, _) => {
+                let first = base.add(col_prefix as usize);
+                for i in 0..n {
+                    let dst = first.add(i * stride);
+                    std::ptr::copy_nonoverlapping(src, dst, len);
+                }
+            }
+            (Some(vp), 9) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u64);
+                let v_hi = *src.add(8);
+                for i in 0..n {
+                    let pos = (i as u32) * row_stride + col_prefix + vp[i];
+                    let dst = base.add(pos as usize);
+                    std::ptr::write_unaligned(dst as *mut u64, v_lo);
+                    *dst.add(8) = v_hi;
+                }
+            }
+            (Some(vp), 5) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u32);
+                let v_hi = *src.add(4);
+                for i in 0..n {
+                    let pos = (i as u32) * row_stride + col_prefix + vp[i];
+                    let dst = base.add(pos as usize);
+                    std::ptr::write_unaligned(dst as *mut u32, v_lo);
+                    *dst.add(4) = v_hi;
+                }
+            }
+            (Some(vp), 2) => {
+                let v = std::ptr::read_unaligned(src as *const u16);
+                for i in 0..n {
+                    let pos = (i as u32) * row_stride + col_prefix + vp[i];
+                    std::ptr::write_unaligned(base.add(pos as usize) as *mut u16, v);
+                }
+            }
+            (Some(vp), 17) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u128);
+                let v_hi = *src.add(16);
+                for i in 0..n {
+                    let pos = (i as u32) * row_stride + col_prefix + vp[i];
+                    let dst = base.add(pos as usize);
+                    std::ptr::write_unaligned(dst as *mut u128, v_lo);
+                    *dst.add(16) = v_hi;
+                }
+            }
+            (Some(vp), _) => {
+                for i in 0..n {
+                    let pos = (i as u32) * row_stride + col_prefix + vp[i];
+                    std::ptr::copy_nonoverlapping(src, base.add(pos as usize), len);
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Dispatch a single column's encoding into the shared `out` buffer (cursor path).
+///
+/// Tries, in order: the in-crate `Constant`, `Dict` and `Patched` kernels, then a kernel
+/// registered for the column's encoding id. A kernel that returns `Ok(None)` declines and
+/// the next option is tried; if none handles the column it is canonicalized and encoded.
+pub fn dispatch_encode(
+    col: &ArrayRef,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    if let Some(view) = col.as_opt::<Constant>()
+        && Constant::row_encode_into(view, field, offsets, cursors, out, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    if let Some(view) = col.as_opt::<Dict>()
+        && Dict::row_encode_into(view, field, offsets, cursors, out, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    if let Some(view) = col.as_opt::<Patched>()
+        && Patched::row_encode_into(view, field, offsets, cursors, out, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    if let Some((_, encode_fn)) = registry::lookup(&col.encoding_id())
+        && encode_fn(col, field, offsets, cursors, out, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    let canonical = col.clone().execute::<Canonical>(ctx)?;
+    codec::field_encode(&canonical, field, offsets, cursors, out, ctx)
+}
+
+/// Mutate-buffer kernel: write this column's per-row bytes into `out` at
+/// `offsets[i] + cursors[i]`, advancing `cursors[i]` by the bytes written.
+///
+/// Return `Ok(None)` to decline and fall back to the canonical path.
+///
+/// [`dispatch_encode`] tries the `Constant`, `Dict` and `Patched` impls before canonicalizing.
+pub trait RowEncodeKernel: VTable {
+    /// Write this column's per-row bytes into `out` at `offsets[i] + cursors[i]`, advancing
+    /// `cursors[i]` by the bytes written.
+    fn row_encode_into(
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        offsets: &[u32],
+        cursors: &mut [u32],
+        out: &mut [u8],
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>>;
+}
diff --git a/vortex-row/src/kernels/constant.rs b/vortex-row/src/kernels/constant.rs
new file mode 100644
index 00000000000..2c8a87b5ffc
--- /dev/null
+++ b/vortex-row/src/kernels/constant.rs
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernels for `ConstantArray`.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32"
+)]
+
+use vortex_array::ArrayView;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::Constant;
+use vortex_error::VortexResult;
+
+use crate::codec;
+use crate::encode::RowEncodeKernel;
+use crate::options::SortField;
+use crate::size::RowSizeKernel;
+
+impl RowSizeKernel for Constant {
+    /// Add the encoded size of the constant scalar to each of the first `column.len()`
+    /// entries of `sizes`.
+    ///
+    /// The size is computed once from the scalar, since every row holds the same value.
+    /// This kernel never declines.
+    fn row_size_contribution(
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        sizes: &mut [u32],
+        _ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        let per_row = codec::encoded_size_for_scalar(column.scalar(), field)?;
+        let rows = column.len();
+        sizes
+            .iter_mut()
+            .take(rows)
+            .for_each(|size| *size += per_row);
+        Ok(Some(()))
+    }
+}
+
+impl RowEncodeKernel for Constant {
+    /// Encode the constant scalar once and copy the same bytes into every row's slot at
+    /// `offsets[i] + cursors[i]`, advancing each cursor by the encoded length.
+    ///
+    /// This kernel never declines. Writes go through checked slice copies (as in the
+    /// `Dict` and `Patched` kernels) so that a mis-sized `out` buffer passed to this safe
+    /// function panics instead of causing an out-of-bounds write.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the scalar cannot be encoded, or if its encoding is longer than
+    /// `u32::MAX` bytes and therefore cannot be tracked by a `u32` cursor.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `offsets` or `cursors` has fewer than `column.len()` entries, or if a
+    /// row's slot does not fit inside `out`.
+    fn row_encode_into(
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        offsets: &[u32],
+        cursors: &mut [u32],
+        out: &mut [u8],
+        _ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        let bytes = codec::encode_scalar(column.scalar(), field)?;
+        let src: &[u8] = &bytes[..];
+        if src.is_empty() {
+            return Ok(Some(()));
+        }
+        let Ok(advance) = u32::try_from(src.len()) else {
+            vortex_error::vortex_bail!(
+                "row-encoded constant of {} bytes does not fit in a u32 cursor",
+                src.len()
+            );
+        };
+
+        // Slice once up front so the per-row loop carries no index checks on the
+        // offset/cursor arrays; this panics early if either is shorter than the column.
+        let n = column.len();
+        let row_offsets = &offsets[..n];
+        let row_cursors = &mut cursors[..n];
+        for (&offset, cursor) in row_offsets.iter().zip(row_cursors.iter_mut()) {
+            // Widen before adding so the position cannot wrap around in u32.
+            let pos = offset as usize + *cursor as usize;
+            out[pos..pos + src.len()].copy_from_slice(src);
+            *cursor += advance;
+        }
+        Ok(Some(()))
+    }
+}
diff --git a/vortex-row/src/kernels/dict.rs b/vortex-row/src/kernels/dict.rs
new file mode 100644
index 00000000000..79928b2c421
--- /dev/null
+++ b/vortex-row/src/kernels/dict.rs
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernels for `DictArray`.
+//!
+//! These kernels skip canonicalization by encoding each *unique value* once into a small
+//! per-value buffer keyed by code, then materializing the per-row contribution via the codes
+//! array. The per-unique-value cost is amortized over the dictionary cardinality rather than
+//! the row count.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers; codes are non-negative indices into the values array"
+)]
+
+use vortex_array::ArrayView;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::dict::Dict;
+use vortex_array::arrays::dict::DictArraySlotsExt;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::PType;
+use vortex_array::match_each_integer_ptype;
+use vortex_error::VortexResult;
+
+use crate::encode::RowEncodeKernel;
+use crate::encode::dispatch_encode;
+use crate::options::SortField;
+use crate::size::RowSizeKernel;
+use crate::size::dispatch_size;
+
+impl RowSizeKernel for Dict {
+    /// Size each dictionary value once, then add the size of the referenced value to every
+    /// row via the codes.
+    ///
+    /// Declines (`Ok(None)`) when the dictionary holds more values than there are codes,
+    /// since sizing every value would then cost more than sizing the rows directly.
+    fn row_size_contribution(
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        sizes: &mut [u32],
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        let values = column.values();
+        let codes = column.codes();
+        if codes.len() < values.len() {
+            return Ok(None);
+        }
+
+        // Per-value sizes, indexed by dictionary code.
+        let mut per_value = vec![0u32; values.len()];
+        dispatch_size(values, field, &mut per_value, ctx)?;
+
+        let codes = codes.clone().execute::<PrimitiveArray>(ctx)?;
+        let ptype = codes.ptype();
+        match_each_integer_ptype!(ptype, |T| {
+            add_codes_sizes::<T>(&codes, &per_value, sizes);
+        });
+        Ok(Some(()))
+    }
+}
+
+impl RowEncodeKernel for Dict {
+    /// Encode each dictionary value once into a scratch buffer, then copy the bytes of the
+    /// referenced value into every row's slot via the codes.
+    ///
+    /// Declines (`Ok(None)`) before touching `cursors` or `out` when the dictionary holds
+    /// more values than there are codes.
+    ///
+    /// NOTE(review): the validity of the codes is not consulted here — confirm that null
+    /// codes cannot reach this kernel, or are handled by the caller.
+    ///
+    /// # Errors
+    ///
+    /// Fails if sizing or encoding the values fails, or if the encoded dictionary would
+    /// exceed `u32::MAX` bytes (per-value offsets are tracked as `u32`).
+    fn row_encode_into(
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        offsets: &[u32],
+        cursors: &mut [u32],
+        out: &mut [u8],
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        if column.values().len() > column.codes().len() {
+            return Ok(None);
+        }
+
+        let n_values = column.values().len();
+        let mut value_sizes = vec![0u32; n_values];
+        dispatch_size(column.values(), field, &mut value_sizes, ctx)?;
+
+        // Sum in u64 and reject totals that do not fit in u32: the per-value offsets below
+        // are u32, and a silently wrapped offset would copy the wrong bytes into rows.
+        let total: u64 = value_sizes.iter().map(|&size| u64::from(size)).sum();
+        if total > u64::from(u32::MAX) {
+            vortex_error::vortex_bail!(
+                "row-encoded dictionary values need {} bytes, which exceeds the u32 offset range",
+                total
+            );
+        }
+
+        // Build per-value offsets (prefix sums, with a trailing end offset). The running
+        // sum cannot overflow because the total was checked above.
+        let mut value_offsets = Vec::with_capacity(n_values + 1);
+        let mut running = 0u32;
+        for &size in &value_sizes {
+            value_offsets.push(running);
+            running += size;
+        }
+        value_offsets.push(running);
+
+        let mut value_buf = vec![0u8; total as usize];
+        // Inner dispatch uses zero base offsets (small buffer) with per-value start cursors.
+        let zero_offsets = vec![0u32; n_values];
+        let mut inner_cursors = value_offsets[..n_values].to_vec();
+        dispatch_encode(
+            column.values(),
+            field,
+            &zero_offsets,
+            &mut inner_cursors,
+            &mut value_buf,
+            ctx,
+        )?;
+
+        let codes_prim = column.codes().clone().execute::<PrimitiveArray>(ctx)?;
+        let ptype = codes_prim.ptype();
+        match_each_integer_ptype!(ptype, |T| {
+            copy_codes::<T>(
+                &codes_prim,
+                &value_buf,
+                &value_offsets,
+                &value_sizes,
+                offsets,
+                cursors,
+                out,
+            );
+        });
+        Ok(Some(()))
+    }
+}
+
+/// Add `value_sizes[code]` to `sizes[row]` for every code in `codes`.
+///
+/// `u8` codes take a dedicated loop that indexes with the byte directly; all other code
+/// types are converted to `usize` per row and panic if the conversion fails.
+#[inline]
+fn add_codes_sizes<T>(codes: &PrimitiveArray, value_sizes: &[u32], sizes: &mut [u32])
+where
+    T: NativePType + Copy + TryInto<usize>,
+{
+    let code_slice: &[T] = codes.as_slice();
+    debug_assert_eq!(code_slice.len(), sizes.len());
+
+    if T::PTYPE != PType::U8 {
+        for (row, &code) in code_slice.iter().enumerate() {
+            let Ok(value_idx) = TryInto::<usize>::try_into(code) else {
+                vortex_error::vortex_panic!("dict code does not fit in usize")
+            };
+            sizes[row] += value_sizes[value_idx];
+        }
+        return;
+    }
+
+    // SAFETY: T == u8
+    let byte_codes =
+        unsafe { std::slice::from_raw_parts(code_slice.as_ptr().cast::<u8>(), code_slice.len()) };
+    for (row, &code) in byte_codes.iter().enumerate() {
+        sizes[row] += value_sizes[usize::from(code)];
+    }
+}
+
+/// For every row, copy the pre-encoded bytes of the value its code refers to from
+/// `value_buf` into `out` at `offsets[row] + cursors[row]`, then advance the row's cursor
+/// by that value's size.
+///
+/// Panics if a code does not fit in `usize` or if any index falls outside its slice.
+#[inline]
+#[allow(clippy::too_many_arguments)]
+fn copy_codes<T>(
+    codes: &PrimitiveArray,
+    value_buf: &[u8],
+    value_offsets: &[u32],
+    value_sizes: &[u32],
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+) where
+    T: NativePType + Copy + TryInto<usize>,
+{
+    let code_slice: &[T] = codes.as_slice();
+    debug_assert_eq!(code_slice.len(), cursors.len());
+    for (row, &code) in code_slice.iter().enumerate() {
+        let Ok(value_idx) = TryInto::<usize>::try_into(code) else {
+            vortex_error::vortex_panic!("dict code does not fit in usize")
+        };
+        let src_start = value_offsets[value_idx] as usize;
+        let width = value_sizes[value_idx];
+        let src_end = src_start + width as usize;
+        let dst_start = (offsets[row] + cursors[row]) as usize;
+        let dst_end = dst_start + width as usize;
+        out[dst_start..dst_end].copy_from_slice(&value_buf[src_start..src_end]);
+        cursors[row] += width;
+    }
+}
diff --git a/vortex-row/src/kernels/mod.rs b/vortex-row/src/kernels/mod.rs
new file mode 100644
index 00000000000..492f619ef0d
--- /dev/null
+++ b/vortex-row/src/kernels/mod.rs
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Per-encoding fast-path implementations of [`RowSizeKernel`] and [`RowEncodeKernel`] for
+//! encodings defined in `vortex-array`.
+//!
+//! Each impl in this module lives here (rather than under the corresponding encoding's
+//! `compute` module in `vortex-array`) so the orphan rule is satisfied: the trait is
+//! defined in `vortex-row` and the impl is also in `vortex-row`, while the array type
+//! (`Constant`, `Dict`, `Patched`) remains in `vortex-array`.
+//!
+//! [`RowSizeKernel`]: crate::size::RowSizeKernel
+//! [`RowEncodeKernel`]: crate::encode::RowEncodeKernel
+
+mod constant;
+mod dict;
+mod patched;
diff --git a/vortex-row/src/kernels/patched.rs b/vortex-row/src/kernels/patched.rs
new file mode 100644
index 00000000000..ecd5d65d556
--- /dev/null
+++ b/vortex-row/src/kernels/patched.rs
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernels for `Patched`.
+//!
+//! Row size is identical to the underlying `inner` array (patches don't change dtype). For
+//! row encoding, we first delegate to the inner array's row-encode path, then overlay each
+//! patched row's value directly into the output, overwriting the few bytes that the inner
+//! encoder wrote at that row's slot.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32"
+)]
+
+use vortex_array::ArrayView;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::patched::Patched;
+use vortex_array::arrays::patched::PatchedArrayExt;
+use vortex_array::arrays::patched::PatchedArraySlotsExt;
+use vortex_array::dtype::DType;
+use vortex_array::match_each_native_ptype;
+use vortex_error::VortexResult;
+
+use crate::codec::RowEncode;
+use crate::encode::RowEncodeKernel;
+use crate::encode::dispatch_encode;
+use crate::options::SortField;
+use crate::size::RowSizeKernel;
+use crate::size::dispatch_size;
+
+impl RowSizeKernel for Patched {
+    /// Size the column by delegating to the inner array's size dispatch.
+    ///
+    /// This kernel never declines: it always returns `Ok(Some(()))` once the inner
+    /// dispatch succeeds.
+    fn row_size_contribution(
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        sizes: &mut [u32],
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        // Per-row size matches the inner array; patches share its dtype.
+        dispatch_size(column.inner(), field, sizes, ctx)?;
+        Ok(Some(()))
+    }
+}
+
+impl RowEncodeKernel for Patched {
+    /// Encode the inner array through the regular dispatch, then overwrite the slots of
+    /// patched rows with the encoded patch values.
+    ///
+    /// Declines (`Ok(None)`) for non-primitive dtypes, before anything is written.
+    fn row_encode_into(
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        offsets: &[u32],
+        cursors: &mut [u32],
+        out: &mut [u8],
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        let DType::Primitive(ptype, _) = *column.as_ref().dtype() else {
+            return Ok(None);
+        };
+
+        // The inner encoder advances `cursors`, so remember where each row's slot began;
+        // the overlay needs those start positions to find the bytes to replace.
+        let slot_starts = cursors.to_vec();
+        dispatch_encode(column.inner(), field, offsets, cursors, out, ctx)?;
+
+        overlay_patches(
+            column,
+            ptype,
+            ptype.byte_width(),
+            field,
+            offsets,
+            &slot_starts,
+            out,
+            ctx,
+        )
+        .map(|()| Some(()))
+    }
+}
+
+/// Overlay patch values onto rows whose inner-encoded bytes need to be replaced.
+///
+/// Materializes the patch indices, patch values, and lane offsets as primitive arrays and
+/// hands them to [`overlay_chunks`], monomorphized over the column's primitive type.
+/// Returns early, without touching `out`, when there are no patch indices.
+///
+/// `pre_cursors` holds each row's cursor as it was before the inner array was encoded, so
+/// `offsets[row] + pre_cursors[row]` is the start of that row's slot for this column.
+#[allow(clippy::too_many_arguments)]
+fn overlay_patches(
+    column: ArrayView<'_, Patched>,
+    ptype: vortex_array::dtype::PType,
+    value_bytes: usize,
+    field: SortField,
+    offsets: &[u32],
+    pre_cursors: &[u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let patch_indices: PrimitiveArray = column
+        .patch_indices()
+        .clone()
+        .execute::<PrimitiveArray>(ctx)?;
+    if patch_indices.is_empty() {
+        return Ok(());
+    }
+    let patch_values: PrimitiveArray = column
+        .patch_values()
+        .clone()
+        .execute::<PrimitiveArray>(ctx)?;
+    let lane_offsets: PrimitiveArray = column
+        .lane_offsets()
+        .clone()
+        .execute::<PrimitiveArray>(ctx)?;
+    // Patch indices are read as u16 and lane offsets as u32.
+    // NOTE(review): presumably guaranteed by the `Patched` layout — confirm.
+    let patch_indices_slice: &[u16] = patch_indices.as_slice();
+    let lane_offsets_slice: &[u32] = lane_offsets.as_slice();
+    let n_lanes = column.n_lanes();
+    let patched_offset = column.offset();
+    let array_len = column.as_ref().len();
+    // Chunks are 1024 logical positions wide; the array starts `patched_offset` positions
+    // into the first chunk.
+    let n_chunks = (array_len + patched_offset).div_ceil(1024);
+    let non_null = field.non_null_sentinel();
+    let descending = field.descending;
+
+    match_each_native_ptype!(ptype, |T| {
+        let values_slice: &[T] = patch_values.as_slice();
+        overlay_chunks::<T>(
+            values_slice,
+            patch_indices_slice,
+            lane_offsets_slice,
+            n_lanes,
+            patched_offset,
+            array_len,
+            n_chunks,
+            offsets,
+            pre_cursors,
+            out,
+            value_bytes,
+            non_null,
+            descending,
+        );
+    });
+    Ok(())
+}
+
+/// Walk every (chunk, lane) slot of the patch layout and rewrite the row slot of each
+/// patch that falls inside the array.
+///
+/// For slot `chunk * n_lanes + lane`, the patches are the entries `k` in
+/// `lane_offsets_slice[slot]..lane_offsets_slice[slot + 1]`; `patch_indices_slice[k]` is
+/// the patch's position within its 1024-wide chunk and `values_slice[k]` is its value.
+/// Each in-range patch overwrites its row with the non-null sentinel byte followed by
+/// `value_bytes` bytes of the encoded value.
+#[allow(clippy::too_many_arguments)]
+fn overlay_chunks<T: Copy + RowEncode>(
+    values_slice: &[T],
+    patch_indices_slice: &[u16],
+    lane_offsets_slice: &[u32],
+    n_lanes: usize,
+    patched_offset: usize,
+    array_len: usize,
+    n_chunks: usize,
+    offsets: &[u32],
+    pre_cursors: &[u32],
+    out: &mut [u8],
+    value_bytes: usize,
+    non_null: u8,
+    descending: bool,
+) {
+    for chunk in 0..n_chunks {
+        for lane in 0..n_lanes {
+            let slot = chunk * n_lanes + lane;
+            // Stop once the lane-offset table has no end offset for this slot.
+            if slot + 1 >= lane_offsets_slice.len() {
+                break;
+            }
+            let start = lane_offsets_slice[slot] as usize;
+            let stop = lane_offsets_slice[slot + 1] as usize;
+            for k in start..stop {
+                let chunk_local = patch_indices_slice[k] as usize;
+                let logical_idx = chunk * 1024 + chunk_local;
+                // Skip patches that land before the array's start offset...
+                if logical_idx < patched_offset {
+                    continue;
+                }
+                let row = logical_idx - patched_offset;
+                // ...or past its end.
+                if row >= array_len {
+                    continue;
+                }
+                // Overwrite what the inner encoder wrote: sentinel byte, then the value.
+                let slot_start = (offsets[row] + pre_cursors[row]) as usize;
+                out[slot_start] = non_null;
+                values_slice[k].encode_to(
+                    &mut out[slot_start + 1..slot_start + 1 + value_bytes],
+                    descending,
+                );
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::ListViewArray;
+    use vortex_array::arrays::Patched;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::listview::ListViewArrayExt;
+    use vortex_array::patches::Patches;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+
+    use crate::SortField;
+    use crate::convert_columns;
+
+    /// Materialize every row of a row-encoded `ListViewArray` as an owned byte vector so
+    /// two encodings can be compared with `assert_eq!`.
+    fn collect_rows(arr: &ListViewArray) -> Vec<Vec<u8>> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let n = arr.len();
+        (0..n)
+            .map(|i| {
+                let slice = arr.list_elements_at(i).unwrap();
+                let p = slice.execute::<PrimitiveArray>(&mut ctx).unwrap();
+                p.as_slice::<u8>().to_vec()
+            })
+            .collect()
+    }
+
+    /// A patched array and the equivalent plain primitive array must row-encode to the
+    /// same bytes (32 rows, three patches).
+    #[test]
+    fn patched_row_encode_matches_canonical() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let inner = buffer![0u32; 32].into_array();
+        let patches = Patches::new(
+            32,
+            0,
+            buffer![1u32, 2, 3].into_array(),
+            buffer![100u32, 200, 300].into_array(),
+            None,
+        )?;
+        let patched = Patched::from_array_and_patches(inner, &patches, &mut ctx)?.into_array();
+
+        // Reference column: the same values with the patches already applied.
+        let mut canonical_vals = vec![0u32; 32];
+        canonical_vals[1] = 100;
+        canonical_vals[2] = 200;
+        canonical_vals[3] = 300;
+        let canonical = PrimitiveArray::from_iter(canonical_vals).into_array();
+
+        let by_canonical = convert_columns(&[canonical], &[SortField::default()], &mut ctx)?;
+        let by_patched = convert_columns(&[patched], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_canonical), collect_rows(&by_patched));
+        Ok(())
+    }
+
+    /// Same comparison over 4096 rows with a patch every 503 rows, so patches are spread
+    /// across several 1024-row chunks.
+    #[test]
+    fn patched_row_encode_multi_chunk() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let n: usize = 4096;
+        let inner = PrimitiveArray::from_iter(vec![0u32; n]).into_array();
+        let indices: Vec<u32> = (0..n as u32).step_by(503).collect();
+        let values: Vec<u32> = indices.iter().map(|i| i + 1000).collect();
+        let patches = Patches::new(
+            n,
+            0,
+            PrimitiveArray::from_iter(indices.clone()).into_array(),
+            PrimitiveArray::from_iter(values.clone()).into_array(),
+            None,
+        )?;
+        let patched = Patched::from_array_and_patches(inner, &patches, &mut ctx)?.into_array();
+
+        // Reference column: the same values with the patches already applied.
+        let mut canonical_vals = vec![0u32; n];
+        for (idx, &i) in indices.iter().enumerate() {
+            canonical_vals[i as usize] = values[idx];
+        }
+        let canonical = PrimitiveArray::from_iter(canonical_vals).into_array();
+
+        let by_canonical = convert_columns(&[canonical], &[SortField::default()], &mut ctx)?;
+        let by_patched = convert_columns(&[patched], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_canonical), collect_rows(&by_patched));
+        Ok(())
+    }
+}
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
new file mode 100644
index 00000000000..bd583bf77ad
--- /dev/null
+++ b/vortex-row/src/lib.rs
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-oriented byte encoder, analogous to Apache Arrow's `arrow-row` crate.
+//!
+//! The encoder converts N columnar arrays into a single `List<u8>` array where each row's
+//! bytes are lexicographically comparable in the same order as a tuple comparison of the
+//! original values. This is useful for sorting, hashing into row containers, and other
+//! operations that benefit from a sort-friendly opaque byte representation of a multi-column
+//! key.
+//!
+//! Two variadic scalar functions drive the implementation:
+//! - [`RowSize`] computes per-row byte sizes across all N input columns.
+//! - [`RowEncode`] writes the row-encoded bytes into a single `ListView<u8>` accumulator
+//!   in one left-to-right pass.
+//!
+//! Each scalar function exposes a per-encoding fast-path trait
+//! ([`RowSizeKernel`] / [`RowEncodeKernel`]) for encodings to plug into. In-crate impls
+//! for `Constant`, `Dict`, and `Patched` live in the private `kernels` module, and the
+//! [`registry`] module provides an inventory-based registry for external encodings.
+//!
+//! The user-facing entry point is [`convert_columns`].
+//!
+//! Row-encoding scalar functions are not registered in the default
+//! [`VortexSession`]. Call [`initialize`] on a session to make `RowSize` and `RowEncode`
+//! available via the expression layer.
+
+pub mod codec;
+pub mod convert;
+pub mod encode;
+mod kernels;
+pub mod options;
+pub mod registry;
+pub mod size;
+
+#[cfg(test)]
+mod tests;
+
+pub use convert::compute_row_sizes;
+pub use convert::convert_columns;
+pub use encode::RowEncode;
+pub use encode::RowEncodeKernel;
+pub use options::RowEncodeOptions;
+pub use options::SortField;
+pub use registry::RowEncodeRegistration;
+pub use size::RowSize;
+pub use size::RowSizeKernel;
+use vortex_array::scalar_fn::session::ScalarFnSessionExt;
+use vortex_session::VortexSession;
+
+/// Register the row-encoding scalar functions ([`RowSize`] and [`RowEncode`]) on the given
+/// session.
+///
+/// These functions are not registered in the default session, so this must be called
+/// explicitly. Call once on session construction if you want row encoding available via
+/// the expression layer or via [`convert_columns`].
+pub fn initialize(session: &VortexSession) {
+    // Both functions go into the session's scalar-function registry.
+    session.scalar_fns().register(RowSize);
+    session.scalar_fns().register(RowEncode);
+}
diff --git a/vortex-row/src/options.rs b/vortex-row/src/options.rs
new file mode 100644
index 00000000000..a9e5e2b18ab
--- /dev/null
+++ b/vortex-row/src/options.rs
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt::Display;
+use std::fmt::Formatter;
+
+use smallvec::SmallVec;
+
+/// Per-column options for the row-oriented byte encoder.
+///
+/// These options control how a single column is encoded into row bytes:
+/// - `descending`: if true, the encoded value bytes are bit-inverted so that
+///   lexicographic byte comparison reflects the reverse of the natural ordering.
+///   The null sentinel byte is NOT inverted, so nulls keep their requested
+///   position relative to non-nulls.
+/// - `nulls_first`: if true, nulls sort before non-nulls. If false, nulls sort
+///   after non-nulls. Implemented via the sentinel byte that precedes every
+///   value's encoded bytes: non-null values use `0x01`, nulls use `0x00`
+///   (nulls first) or `0x02` (nulls last).
+///
+/// The [`Default`] is ascending with nulls first.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct SortField {
+    /// If true, encoded value bytes are bit-inverted so lexicographic byte
+    /// comparison reflects the reverse of the natural ordering.
+    pub descending: bool,
+    /// If true, nulls sort before non-null values; otherwise nulls sort after.
+    pub nulls_first: bool,
+}
+
+impl Default for SortField {
+    /// Ascending order with nulls sorted first.
+    fn default() -> Self {
+        let descending = false;
+        let nulls_first = true;
+        Self {
+            descending,
+            nulls_first,
+        }
+    }
+}
+
+impl Display for SortField {
+    /// Render as `descending=<bool>, nulls_first=<bool>`.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let Self {
+            descending,
+            nulls_first,
+        } = self;
+        write!(f, "descending={descending}, nulls_first={nulls_first}")
+    }
+}
+
+impl SortField {
+    /// Build a `SortField` from explicit `descending` and `nulls_first` flags.
+    pub fn new(descending: bool, nulls_first: bool) -> Self {
+        Self {
+            descending,
+            nulls_first,
+        }
+    }
+
+    /// Sentinel byte written in front of a non-null value.
+    ///
+    /// Always `0x01`; the null sentinels sit on either side of it.
+    #[inline]
+    pub fn non_null_sentinel(&self) -> u8 {
+        const NON_NULL: u8 = 0x01;
+        NON_NULL
+    }
+
+    /// Sentinel byte written for a null value.
+    ///
+    /// `0x00` sorts nulls before non-nulls (`nulls_first`); `0x02` sorts them after.
+    #[inline]
+    pub fn null_sentinel(&self) -> u8 {
+        const NULL_BEFORE: u8 = 0x00;
+        const NULL_AFTER: u8 = 0x02;
+        if self.nulls_first {
+            NULL_BEFORE
+        } else {
+            NULL_AFTER
+        }
+    }
+}
+
+/// Inline capacity for [`RowEncodeOptions::fields`]. Up to this many [`SortField`]s
+/// are held inline without a heap allocation; beyond that, the storage spills to the heap.
+pub const FIELDS_INLINE: usize = 4;
+
+/// Options for the variadic [`RowSize`] and [`RowEncode`] scalar functions:
+/// one [`SortField`] per input column.
+///
+/// Stored in a [`SmallVec`] so that typical 1–4 column keys avoid a heap
+/// allocation; longer field lists spill to the heap transparently. The inline
+/// capacity is [`FIELDS_INLINE`].
+///
+/// [`RowSize`]: super::size::RowSize
+/// [`RowEncode`]: super::encode::RowEncode
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct RowEncodeOptions {
+    /// Per-column sort fields, in left-to-right column order.
+    pub fields: SmallVec<[SortField; FIELDS_INLINE]>,
+}
+
+impl RowEncodeOptions {
+    /// Collect the given [`SortField`]s, in iteration order, into a new `RowEncodeOptions`.
+    pub fn new(fields: impl IntoIterator<Item = SortField>) -> Self {
+        let fields = fields.into_iter().collect();
+        Self { fields }
+    }
+}
+
+impl Display for RowEncodeOptions {
+    /// Render the fields as a bracketed, comma-separated list, e.g.
+    /// `[descending=false, nulls_first=true, descending=true, nulls_first=false]`.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[")?;
+        let mut remaining = self.fields.iter();
+        if let Some(first) = remaining.next() {
+            write!(f, "{first}")?;
+            for field in remaining {
+                write!(f, ", ")?;
+                write!(f, "{field}")?;
+            }
+        }
+        write!(f, "]")
+    }
+}
+
+/// Serialize a [`RowEncodeOptions`] into a compact byte vector.
+///
+/// Layout: the field count as a 4-byte little-endian `u32`, followed by two bytes per
+/// field — `descending` then `nulls_first`, each written as `0` or `1`.
+///
+/// Panics if the number of fields does not fit in a `u32`.
+pub(crate) fn serialize_row_encode_options(opts: &RowEncodeOptions) -> Vec<u8> {
+    use vortex_error::VortexExpect;
+    let field_count = opts.fields.len();
+    let prefix =
+        u32::try_from(field_count).vortex_expect("RowEncodeOptions length must fit in u32");
+    let mut encoded = Vec::with_capacity(4 + 2 * field_count);
+    encoded.extend_from_slice(&prefix.to_le_bytes());
+    encoded.extend(
+        opts.fields
+            .iter()
+            .flat_map(|field| [u8::from(field.descending), u8::from(field.nulls_first)]),
+    );
+    encoded
+}
+
+/// Deserialize a [`RowEncodeOptions`] produced by [`serialize_row_encode_options`].
+///
+/// Expects a 4-byte little-endian field count followed by exactly two bytes per field
+/// (`descending`, then `nulls_first`). Any non-zero byte is read as `true`.
+///
+/// # Errors
+///
+/// Fails if `bytes` is shorter than the length prefix, if the declared field count is too
+/// large to size on this target, or if the total length does not match the declared count.
+pub(crate) fn deserialize_row_encode_options(
+    bytes: &[u8],
+) -> vortex_error::VortexResult<RowEncodeOptions> {
+    let Some((prefix, payload)) = bytes.split_first_chunk::<4>() else {
+        vortex_error::vortex_bail!("RowEncodeOptions metadata must contain a 4-byte length prefix");
+    };
+    let n = u32::from_le_bytes(*prefix) as usize;
+    // Checked arithmetic: on 32-bit targets `4 + 2 * n` can overflow `usize`, which would
+    // let a crafted prefix slip past the size check below.
+    let Some(expected) = n.checked_mul(2).and_then(|body| body.checked_add(4)) else {
+        vortex_error::vortex_bail!(
+            "RowEncodeOptions metadata declares too many fields: {}",
+            n
+        );
+    };
+    if bytes.len() != expected {
+        vortex_error::vortex_bail!(
+            "RowEncodeOptions metadata wrong size: got {}, expected {}",
+            bytes.len(),
+            expected
+        );
+    }
+    // The payload is exactly `2 * n` bytes here, so every chunk is a full field.
+    let fields = payload
+        .chunks_exact(2)
+        .map(|pair| SortField {
+            descending: pair[0] != 0,
+            nulls_first: pair[1] != 0,
+        })
+        .collect();
+    Ok(RowEncodeOptions { fields })
+}
diff --git a/vortex-row/src/registry.rs b/vortex-row/src/registry.rs
new file mode 100644
index 00000000000..d806d090e0a
--- /dev/null
+++ b/vortex-row/src/registry.rs
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Registry for per-encoding row-encode fast paths from downstream crates.
+//!
+//! Encodings that live outside `vortex-array` (such as `RunEnd` in `encodings/runend`) cannot
+//! be directly downcast from inside the variadic [`RowSize`] / [`RowEncode`] dispatch loops.
+//! Instead, they submit a [`RowEncodeRegistration`] via the `inventory` crate, and the
+//! dispatch loop looks them up by [`ArrayId`].
+//!
+//! [`RowSize`]: super::size::RowSize
+//! [`RowEncode`]: super::encode::RowEncode
+
+use std::sync::OnceLock;
+
+use vortex_array::ArrayId;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_error::VortexResult;
+use vortex_utils::aliases::hash_map::HashMap;
+
+use crate::options::SortField;
+
+/// Function pointer signature for an encoding's per-row size contribution.
+///
+/// Arguments are the column, its [`SortField`], the per-row size accumulator, and the
+/// execution context.
+/// NOTE(review): presumably `Ok(None)` declines, as with [`DynEncodeFn`] — the size
+/// dispatch is not shown here; confirm.
+pub type DynSizeFn =
+    fn(&ArrayRef, SortField, &mut [u32], &mut ExecutionCtx) -> VortexResult<Option<()>>;
+
+/// Function pointer signature for an encoding's per-row byte encoding.
+///
+/// Arguments are, in order: the column, its [`SortField`], the per-row base offsets, the
+/// per-row cursors, the output buffer, and the execution context. Returning `Ok(None)`
+/// declines the column, and the encode dispatch falls back to the canonical path.
+pub type DynEncodeFn = fn(
+    &ArrayRef,
+    SortField,
+    &[u32],
+    &mut [u32],
+    &mut [u8],
+    &mut ExecutionCtx,
+) -> VortexResult<Option<()>>;
+
+/// A registration submitted by an encoding crate to plug into the row encoder.
+///
+/// Because [`ArrayId`] requires runtime string interning, the encoding id is passed as a
+/// function pointer that is called once at registry initialization time.
+///
+/// Registrations are collected with `inventory` and indexed by id on the first call to
+/// [`lookup`].
+pub struct RowEncodeRegistration {
+    /// Returns the [`ArrayId`] of the encoding this registration applies to.
+    pub id: fn() -> ArrayId,
+    /// Per-row size contribution function.
+    pub size: DynSizeFn,
+    /// Per-row encoding function.
+    pub encode: DynEncodeFn,
+}
+
+inventory::collect!(RowEncodeRegistration);
+
+/// Find the registered (size, encode) function pair for `id`, if any.
+///
+/// The lookup table is built once, on first use, from every [`RowEncodeRegistration`]
+/// collected by `inventory`; if several registrations share an id, the one inserted
+/// last wins.
+pub fn lookup(id: &ArrayId) -> Option<(DynSizeFn, DynEncodeFn)> {
+    static MAP: OnceLock<HashMap<ArrayId, (DynSizeFn, DynEncodeFn)>> = OnceLock::new();
+    let registrations = MAP.get_or_init(|| {
+        let mut table = HashMap::default();
+        for registration in inventory::iter::<RowEncodeRegistration> {
+            table.insert((registration.id)(), (registration.size, registration.encode));
+        }
+        table
+    });
+    registrations.get(id).copied()
+}
diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
new file mode 100644
index 00000000000..0b54f19de4e
--- /dev/null
+++ b/vortex-row/src/size.rs
@@ -0,0 +1,308 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! `RowSize` variadic scalar function: aggregate per-row byte sizes for N input columns.
+
+use std::sync::Arc;
+
+use vortex_array::ArrayRef;
+use vortex_array::ArrayView;
+use vortex_array::Canonical;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::VTable;
+use vortex_array::arrays::Constant;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
+use vortex_array::arrays::dict::Dict;
+use vortex_array::arrays::patched::Patched;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::FieldName;
+use vortex_array::dtype::FieldNames;
+use vortex_array::dtype::Nullability;
+use vortex_array::dtype::PType;
+use vortex_array::dtype::StructFields;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::Arity;
+use vortex_array::scalar_fn::ChildName;
+use vortex_array::scalar_fn::ExecutionArgs;
+use vortex_array::scalar_fn::ScalarFnId;
+use vortex_array::scalar_fn::ScalarFnVTable;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+use vortex_session::VortexSession;
+
+use crate::codec;
+use crate::codec::RowWidth;
+use crate::options::RowEncodeOptions;
+use crate::options::SortField;
+use crate::options::deserialize_row_encode_options;
+use crate::options::serialize_row_encode_options;
+use crate::registry;
+
+/// Classification of a single input column for the size pass.
+///
+/// Records, for each column, the combined width of the fixed-width columns preceding it
+/// and, for fixed columns, whether any variable-length column has appeared yet — the
+/// encode pass uses this to choose between the arithmetic-write fast path (no varlen
+/// before this column, so the within-row position is constant) and the cursor-write
+/// path.
+#[derive(Clone, Copy, Debug)]
+pub(crate) enum ColKind {
+    /// Column has fixed width `width`. `prefix` is the summed width of all preceding fixed
+    /// columns. If `before_varlen` is true, no variable-length column precedes this one, so
+    /// `prefix` is also the within-row byte offset of this column's first byte in every row.
+    Fixed {
+        width: u32,
+        prefix: u32,
+        before_varlen: bool,
+    },
+    /// Column has variable per-row width. `fixed_prefix` is the sum of widths of all
+    /// preceding fixed columns; the varlen contribution from earlier varlen columns is
+    /// added per row.
+    Variable { fixed_prefix: u32 },
+}
+
+/// Result of the size pass: enough information for both [`RowSize::execute`] and the
+/// downstream [`RowEncode`](super::encode::RowEncode) pipeline.
+pub(crate) struct SizePassResult {
+    pub fixed_per_row: u32, // summed width of all fixed-width columns
+    pub var_lengths: Option<Vec<u32>>, // per-row varlen byte totals; `None` if no varlen column
+    pub col_kinds: Vec<ColKind>, // one entry per input column, in input order
+    pub first_varlen_idx: Option<usize>, // index of the first variable-length column, if any
+    pub columns: Vec<ArrayRef>, // the input columns, in input order
+}
+
+/// Single pass over the N input columns: classify each one as fixed-width or
+/// variable-length and tally its contribution to the per-row byte size.
+///
+/// A fixed-width column only bumps the scalar `fixed_per_row`; it never touches
+/// `var_lengths`. A variable-length column adds its per-row byte counts into
+/// `var_lengths`, which is allocated on first use, by way of [`dispatch_size`].
+///
+/// Both [`RowSize::execute`] (which wraps the sizes into a `Struct { fixed, var }`) and
+/// the [`RowEncode`](super::encode::RowEncode) pipeline (which consumes the whole
+/// result, `col_kinds` included, to drive its encode pass) build on this function.
+pub(crate) fn compute_sizes(
+    options: &RowEncodeOptions,
+    args: &dyn ExecutionArgs,
+    ctx: &mut ExecutionCtx,
+    op_name: &'static str,
+) -> VortexResult<SizePassResult> {
+    let num_cols = args.num_inputs();
+    if num_cols == 0 {
+        vortex_bail!("{} requires at least one input column", op_name);
+    }
+    if options.fields.len() != num_cols {
+        vortex_bail!(
+            "{} options.fields.len()={} does not match num_inputs={}",
+            op_name,
+            options.fields.len(),
+            num_cols
+        );
+    }
+    let row_count = args.row_count();
+
+    let mut columns: Vec<ArrayRef> = Vec::with_capacity(num_cols);
+    let mut col_kinds: Vec<ColKind> = Vec::with_capacity(num_cols);
+    let mut var_lengths: Option<Vec<u32>> = None;
+    let mut first_varlen_idx: Option<usize> = None;
+    // Sum of the widths of the fixed-width columns seen so far. It only changes in the
+    // fixed arm below, so it serves both as the final `fixed_per_row` and as the
+    // `prefix` / `fixed_prefix` recorded for each column.
+    let mut fixed_per_row: u32 = 0;
+
+    for idx in 0..num_cols {
+        let column = args.get(idx)?;
+        if column.len() != row_count {
+            vortex_bail!(
+                "{}: column {} has length {} but expected {}",
+                op_name,
+                idx,
+                column.len(),
+                row_count
+            );
+        }
+        let kind = match codec::row_width_for_dtype(column.dtype())? {
+            RowWidth::Fixed(width) => {
+                let prefix = fixed_per_row;
+                fixed_per_row = fixed_per_row
+                    .checked_add(width)
+                    .vortex_expect("row width overflow");
+                ColKind::Fixed {
+                    width,
+                    prefix,
+                    before_varlen: first_varlen_idx.is_none(),
+                }
+            }
+            RowWidth::Variable => {
+                // Only the first variable-length column is remembered.
+                first_varlen_idx = first_varlen_idx.or(Some(idx));
+                let lengths = var_lengths.get_or_insert_with(|| vec![0u32; row_count]);
+                dispatch_size(&column, options.fields[idx], lengths, ctx)?;
+                ColKind::Variable {
+                    fixed_prefix: fixed_per_row,
+                }
+            }
+        };
+        col_kinds.push(kind);
+        columns.push(column);
+    }
+
+    Ok(SizePassResult {
+        fixed_per_row,
+        var_lengths,
+        col_kinds,
+        first_varlen_idx,
+        columns,
+    })
+}
+
+/// Variadic scalar function that, given N input columns and one [`SortField`] per column,
+/// returns a non-nullable `Struct { fixed: U32, var: U32 }` array of per-row byte sizes
+/// for the row-oriented encoding produced by [`RowEncode`](super::encode::RowEncode).
+///
+/// `fixed` is always a [`ConstantArray`] holding the summed widths of the fixed-width
+/// inputs (sentinel + value bytes). `var` is a `ConstantArray(0)` when no input is
+/// variable-length, and a [`PrimitiveArray<u32>`] of per-row varlen-byte sums otherwise.
+/// The total per-row byte size is `fixed + var`.
+///
+/// Execution errors if there are no inputs or the option count differs from the input count.
+#[derive(Clone, Debug)]
+pub struct RowSize;
+
+/// Names of the two [`RowSize`] output fields, in order: `fixed`, then `var`.
+pub(crate) fn row_size_field_names() -> FieldNames {
+    FieldNames::from(["fixed", "var"].map(FieldName::from))
+}
+
+/// Output [`DType`] of [`RowSize`].
+///
+/// The result is a non-nullable struct. Its two fields take their names from
+/// [`row_size_field_names`] and are both non-nullable `u32`s.
+pub(crate) fn row_size_struct_dtype() -> DType {
+    // Both fields share one primitive dtype; the closure builds a fresh value for each
+    // slot of the field list.
+    let size_dtype = || DType::Primitive(PType::U32, Nullability::NonNullable);
+    let names = row_size_field_names();
+    let dtypes = vec![size_dtype(), size_dtype()];
+    let fields = StructFields::new(names, dtypes);
+    DType::Struct(fields, Nullability::NonNullable)
+}
+
+impl ScalarFnVTable for RowSize {
+    type Options = RowEncodeOptions;
+
+    fn id(&self) -> ScalarFnId {
+        ScalarFnId::from("vortex.row_size")
+    }
+
+    fn serialize(&self, options: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
+        Ok(Some(serialize_row_encode_options(options)))
+    }
+
+    fn deserialize(
+        &self,
+        metadata: &[u8],
+        _session: &VortexSession,
+    ) -> VortexResult<Self::Options> {
+        deserialize_row_encode_options(metadata)
+    }
+
+    fn arity(&self, _options: &Self::Options) -> Arity {
+        Arity::Variadic { min: 1, max: None }
+    }
+
+    fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName {
+        ChildName::from(Arc::from(format!("col_{}", child_idx)))
+    }
+
+    fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult<DType> {
+        Ok(row_size_struct_dtype())
+    }
+
+    fn execute(
+        &self,
+        options: &Self::Options,
+        args: &dyn ExecutionArgs,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<ArrayRef> {
+        let len = args.row_count();
+        let sizes = compute_sizes(options, args, ctx, "RowSize")?;
+        let fixed = ConstantArray::new(Scalar::from(sizes.fixed_per_row), len).into_array();
+        // Per-row sums exist only when at least one variable-length column was present.
+        let var = match sizes.var_lengths {
+            None => ConstantArray::new(Scalar::from(0u32), len).into_array(),
+            Some(raw) => PrimitiveArray::new(Buffer::<u32>::copy_from(&raw), Validity::NonNullable)
+                .into_array(),
+        };
+        Ok(StructArray::try_new(
+            row_size_field_names(),
+            vec![fixed, var],
+            len,
+            Validity::NonNullable,
+        )?
+        .into_array())
+    }
+
+    fn is_null_sensitive(&self, _options: &Self::Options) -> bool {
+        true
+    }
+
+    fn is_fallible(&self, _options: &Self::Options) -> bool {
+        false
+    }
+}
+
+/// Dispatch a single column's per-row size contribution.
+///
+/// Tries the in-crate `Constant`, `Dict` and `Patched` kernels first, then whatever kernel
+/// [`registry::lookup`] returns for the column's encoding id. A kernel declines by returning
+/// `Ok(None)`; if none accepts, the column is canonicalized and sized by `codec::field_size`.
+pub fn dispatch_size(
+    col: &ArrayRef,
+    field: SortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    if let Some(view) = col.as_opt::<Constant>()
+        && Constant::row_size_contribution(view, field, sizes, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    if let Some(view) = col.as_opt::<Dict>()
+        && Dict::row_size_contribution(view, field, sizes, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    if let Some(view) = col.as_opt::<Patched>()
+        && Patched::row_size_contribution(view, field, sizes, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    if let Some((size_fn, _)) = registry::lookup(&col.encoding_id())
+        && size_fn(col, field, sizes, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    let canonical = col.clone().execute::<Canonical>(ctx)?;
+    codec::field_size(&canonical, field, sizes, ctx)
+}
+
+/// Mutate-buffer kernel: add this column's per-row byte contribution into the shared
+/// `sizes` slice. Return `Ok(None)` to decline and fall back to the canonical path.
+///
+/// [`dispatch_size`] calls `row_size_contribution` on `Constant`, `Dict` and `Patched`.
+pub trait RowSizeKernel: VTable {
+    /// Add this column's per-row byte contribution into `sizes`.
+    fn row_size_contribution(
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        sizes: &mut [u32],
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>>;
+}
diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs
new file mode 100644
index 00000000000..33ecdc699f8
--- /dev/null
+++ b/vortex-row/src/tests.rs
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![allow(
+    clippy::approx_constant,
+    clippy::cloned_ref_to_slice_refs,
+    clippy::redundant_clone,
+    reason = "tests value clarity over micro-optimization"
+)]
+
+//! Tests for the row encoder.
+
+use rstest::rstest;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::VarBinViewArray;
+use vortex_array::arrays::listview::ListViewArrayExt;
+use vortex_array::builders::dict::dict_encode;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::Nullability;
+use vortex_error::VortexResult;
+
+use crate::SortField;
+use crate::convert_columns;
+
+/// Materialize every row of a row-encoded [`ListViewArray`] as an owned byte vector.
+fn collect_row_bytes(array: &ListViewArray) -> Vec<Vec<u8>> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let mut rows = Vec::with_capacity(array.len());
+    for row in 0..array.len() {
+        let elements = array.list_elements_at(row).unwrap();
+        let bytes = elements.execute::<PrimitiveArray>(&mut ctx).unwrap();
+        rows.push(bytes.as_slice::<u8>().to_vec());
+    }
+    rows
+}
+
+/// Row-encode `values` as a single column, byte-sort the encoded rows, and check that the
+/// result is the rows rearranged into the natural (or reversed) order of `values`.
+fn assert_sort_order_i64(values: Vec<i64>, descending: bool) -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let field = SortField {
+        descending,
+        nulls_first: true,
+    };
+    let column = PrimitiveArray::from_iter(values.clone()).into_array();
+    let rows = collect_row_bytes(&convert_columns(&[column], &[field], &mut ctx)?);
+
+    // Reference order: row indices stably sorted by source value, flipped when descending.
+    let mut order: Vec<usize> = (0..values.len()).collect();
+    order.sort_by(|&a, &b| {
+        if descending {
+            values[b].cmp(&values[a])
+        } else {
+            values[a].cmp(&values[b])
+        }
+    });
+    let expected: Vec<Vec<u8>> = order.into_iter().map(|i| rows[i].clone()).collect();
+    let mut actual = rows.clone();
+    actual.sort();
+    assert_eq!(
+        actual, expected,
+        "Row-encoded bytes do not match natural sort order"
+    );
+    Ok(())
+}
+
+#[rstest]
+#[case::ascending(false)]
+#[case::descending(true)]
+fn primitive_i64_roundtrip(#[case] descending: bool) -> VortexResult<()> {
+    // Negative, zero and positive values plus both `i64` extremes.
+    assert_sort_order_i64(vec![-5, 0, 5, i64::MIN, i64::MAX, 7, -7, 1], descending)
+}
+
+#[test]
+fn primitive_u32_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec<u32> = vec![0, 1, 100, u32::MAX, 42, 17];
+    let column = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[column], &[SortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Expected: the encoded rows laid out in ascending order of their source values.
+    let mut order: Vec<usize> = (0..values.len()).collect();
+    order.sort_by_key(|&i| values[i]);
+    let expected: Vec<Vec<u8>> = order.into_iter().map(|i| rows[i].clone()).collect();
+    let mut actual = rows.clone();
+    actual.sort();
+    assert_eq!(actual, expected);
+    Ok(())
+}
+
+#[test]
+fn primitive_f64_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // The byte encoding follows IEEE total ordering, where -0.0 < +0.0 (as in `arrow-row`).
+    // `partial_cmp` treats -0.0 and 0.0 as equal, so -0.0 is left out of the inputs to keep
+    // the reference order unambiguous.
+    let values: Vec<f64> = vec![-1.5, 0.0, 1.5, f64::INFINITY, f64::NEG_INFINITY, 3.14];
+    let column = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[column], &[SortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut order: Vec<usize> = (0..values.len()).collect();
+    order.sort_by(|&a, &b| values[a].partial_cmp(&values[b]).unwrap());
+    let expected: Vec<Vec<u8>> = order.into_iter().map(|i| rows[i].clone()).collect();
+
+    let mut actual = rows.clone();
+    actual.sort();
+    assert_eq!(actual, expected);
+    Ok(())
+}
+
+#[test]
+fn bool_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let column = BoolArray::from_iter([true, false, true, false]).into_array();
+    let encoded = convert_columns(&[column], &[SortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut ordered = rows.clone();
+    ordered.sort();
+    // Sorted position -> source row: both `false` rows (inputs 1 and 3) must come ahead
+    // of both `true` rows (inputs 0 and 2).
+    for (position, source) in [1usize, 3, 0, 2].into_iter().enumerate() {
+        assert_eq!(ordered[position], rows[source]);
+    }
+    Ok(())
+}
+
+#[test]
+fn utf8_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values = vec![
+        "banana",
+        "apple",
+        "",
+        "cherry",
+        "ban",
+        "banana_loaf_for_test",
+    ];
+    let column = VarBinViewArray::from_iter_str(values.clone()).into_array();
+    let encoded = convert_columns(&[column], &[SortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Expected: the encoded rows laid out in the `str` ordering of their source values.
+    let mut order: Vec<usize> = (0..values.len()).collect();
+    order.sort_by_key(|&i| values[i]);
+    let expected: Vec<Vec<u8>> = order.into_iter().map(|i| rows[i].clone()).collect();
+    let mut actual = rows.clone();
+    actual.sort();
+    assert_eq!(actual, expected);
+    Ok(())
+}
+
+#[test]
+fn multi_column_sort() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ints: Vec<i32> = vec![1, 2, 1, 2, 1, 3];
+    let strs = vec!["b", "a", "a", "b", "c", "z"];
+    let columns = [
+        PrimitiveArray::from_iter(ints.clone()).into_array(),
+        VarBinViewArray::from_iter_str(strs.clone()).into_array(),
+    ];
+    let fields = [SortField::default(), SortField::default()];
+    let rows = collect_row_bytes(&convert_columns(&columns, &fields, &mut ctx)?);
+
+    // Expected: rows ordered by the integer column, ties broken by the string column.
+    let mut order: Vec<usize> = (0..ints.len()).collect();
+    order.sort_by_key(|&i| (ints[i], strs[i]));
+    let expected: Vec<Vec<u8>> = order.into_iter().map(|i| rows[i].clone()).collect();
+
+    let mut actual = rows.clone();
+    actual.sort();
+    assert_eq!(actual, expected);
+    Ok(())
+}
+
+#[test]
+fn nulls_first_and_last() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec<Option<i32>> = vec![Some(5), None, Some(1), None, Some(3)];
+    let null_count = values.iter().filter(|v| v.is_none()).count();
+    let column = PrimitiveArray::from_option_iter(values).into_array();
+
+    // Case 1: nulls_first=true. After a bytewise sort the leading `null_count` rows
+    // should be the null rows.
+    let nulls_leading = convert_columns(
+        &[column.clone()],
+        &[SortField {
+            descending: false,
+            nulls_first: true,
+        }],
+        &mut ctx,
+    )?;
+    let mut ordered = collect_row_bytes(&nulls_leading);
+    ordered.sort();
+    for row in &ordered[..null_count] {
+        // A null row encoded with nulls_first=true begins with 0x00.
+        assert_eq!(row[0], 0x00);
+    }
+
+    // Case 2: nulls_first=false. After a bytewise sort the trailing `null_count` rows
+    // should be the null rows.
+    let nulls_trailing = convert_columns(
+        &[column],
+        &[SortField {
+            descending: false,
+            nulls_first: false,
+        }],
+        &mut ctx,
+    )?;
+    let mut ordered = collect_row_bytes(&nulls_trailing);
+    ordered.sort();
+    let first_null = ordered.len() - null_count;
+    for row in &ordered[first_null..] {
+        // A null row encoded with nulls_first=false begins with 0x02.
+        assert_eq!(row[0], 0x02);
+    }
+    Ok(())
+}
+
+#[test]
+fn dict_path_matches_canonical() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let field = SortField::default();
+    let strings = vec![Some("a"), Some("bb"), Some("a"), Some("ccc"), Some("bb")];
+    let plain =
+        VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)).into_array();
+    let dictionary = dict_encode(&plain)?.into_array();
+
+    // The same logical values must produce identical encoded rows whether the column
+    // arrives as a plain `VarBinViewArray` or as the dictionary-encoded array built
+    // from it.
+    let plain_encoded = convert_columns(&[plain], &[field], &mut ctx)?;
+    let dict_encoded = convert_columns(&[dictionary], &[field], &mut ctx)?;
+    let plain_rows = collect_row_bytes(&plain_encoded);
+    let dict_rows = collect_row_bytes(&dict_encoded);
+    assert_eq!(plain_rows, dict_rows);
+    Ok(())
+}
+
+#[test]
+fn constant_path_matches_canonical() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let field = SortField::default();
+    let len = 8usize;
+    let constant = ConstantArray::new(42i64, len).into_array();
+    let expanded = PrimitiveArray::from_iter(vec![42i64; len]).into_array();
+
+    // A constant column must produce exactly the rows that the same value repeated
+    // `len` times in a primitive array produces.
+    let from_constant = collect_row_bytes(&convert_columns(&[constant], &[field], &mut ctx)?);
+    let from_expanded = collect_row_bytes(&convert_columns(&[expanded], &[field], &mut ctx)?);
+    assert_eq!(from_constant, from_expanded);
+    Ok(())
+}
+
+#[test]
+fn struct_sort_order() -> VortexResult<()> {
+    use vortex_array::arrays::StructArray;
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ids: Vec<i64> = vec![3, 1, 3, 1, 2];
+    let names = vec!["b", "a", "a", "b", "z"];
+    let id_col = PrimitiveArray::from_iter(ids.clone()).into_array();
+    let name_col = VarBinViewArray::from_iter_str(names.clone()).into_array();
+    let column = StructArray::from_fields(&[("id", id_col), ("name", name_col)])?.into_array();
+    let encoded = convert_columns(&[column], &[SortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Expected: struct rows ordered field by field — `id` first, then `name`.
+    let mut order: Vec<usize> = (0..ids.len()).collect();
+    order.sort_by_key(|&i| (ids[i], names[i]));
+    let expected: Vec<Vec<u8>> = order.into_iter().map(|i| rows[i].clone()).collect();
+    let mut actual = rows.clone();
+    actual.sort();
+    assert_eq!(actual, expected);
+    Ok(())
+}
+
+#[test]
+fn row_size_struct_shape() -> VortexResult<()> {
+    use vortex_array::arrays::Constant;
+    use vortex_array::arrays::StructArray;
+    use vortex_array::arrays::struct_::StructArrayExt;
+
+    use crate::compute_row_sizes;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ints: Vec<i32> = vec![1, 2, 3, 4, 5];
+    let strings = vec!["a", "bb", "ccc", "", "eeeee"];
+    let columns = [
+        PrimitiveArray::from_iter(ints).into_array(),
+        VarBinViewArray::from_iter_str(strings).into_array(),
+    ];
+    let fields = [SortField::default(), SortField::default()];
+
+    // The result must be a two-field struct: `fixed` followed by `var`.
+    let sizes = compute_row_sizes(&columns, &fields, &mut ctx)?;
+    let sizes = sizes.execute::<StructArray>(&mut ctx)?;
+    assert_eq!(sizes.struct_fields().nfields(), 2);
+    let fixed = sizes.unmasked_field(0);
+    let var = sizes.unmasked_field(1);
+
+    // `fixed` must stay a ConstantArray whose value is the encoded i32 width,
+    // i.e. 1 + 4 = 5 bytes.
+    let fixed_view = fixed
+        .as_opt::<Constant>()
+        .expect("fixed field should be a ConstantArray");
+    assert_eq!(
+        fixed_view.scalar(),
+        &vortex_array::scalar::Scalar::from(5u32),
+        "fixed scalar should be encoded primitive i32 width"
+    );
+
+    // A variable-length input is present, so `var` must hold one u32 per row: 2 bytes
+    // for the empty string (sentinel + 1 byte) and 34 for every non-empty string
+    // (sentinel + a single 33-byte block).
+    let var_values = var.clone().execute::<PrimitiveArray>(&mut ctx)?;
+    let lengths: &[u32] = var_values.as_slice();
+    assert_eq!(lengths.len(), 5);
+    assert_eq!(lengths, [34u32, 34, 34, 2, 34].as_slice());
+    Ok(())
+}
+
+#[test]
+fn single_buffer_invariant() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // Every row pairs an i64 with a string of at least 18 characters.
+    // NOTE(review): the earlier rationale here (rows > 12 bytes force a "Ref-view path"
+    // into a shared data buffer) reads like VarBinView terminology, while the output of
+    // `convert_columns` is a ListView — confirm that rationale still applies.
+    let row_count = 64usize;
+    let ints: Vec<i64> = (0..row_count as i64).collect();
+    let texts: Vec<String> = (0..row_count)
+        .map(|i| format!("row_{}_with_padding", i))
+        .collect();
+    let columns = [
+        PrimitiveArray::from_iter(ints).into_array(),
+        VarBinViewArray::from_iter_str(texts.iter().map(String::as_str)).into_array(),
+    ];
+    let fields = [SortField::default(), SortField::default()];
+    let encoded = convert_columns(&columns, &fields, &mut ctx)?;
+
+    // All encoded rows are expected to sit back to back in one shared buffer: the
+    // ListView's elements array is a single contiguous primitive (u8) array, so its
+    // length must equal the summed per-row lengths. A per-row allocation strategy
+    // would instead produce N separate elements arrays or a sparse one, and the
+    // lengths would not line up.
+    let rows = collect_row_bytes(&encoded);
+    let total_row_bytes: usize = rows.iter().map(Vec::len).sum();
+    let elements_len = encoded.elements().len();
+    assert_eq!(
+        elements_len, total_row_bytes,
+        "elements buffer size mismatch"
+    );
+    Ok(())
+}