From 6f8ea5173fdd1a6f562231f94c3b69330b781b12 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:00:44 +0000
Subject: [PATCH 01/26] vortex-row: crate scaffolding

Add an empty `vortex-row` crate with a minimal `initialize` stub so the
following commits can layer in the row-encoder, codec, scalar functions,
and per-encoding kernels without touching the workspace skeleton each
time. The crate is wired into the workspace members list and workspace
dependency table; `public-api.lock` is generated against the stub.

Signed-off-by: Claude <noreply@anthropic.com>
---
 Cargo.lock                 |  7 +++++++
 Cargo.toml                 |  2 ++
 vortex-row/Cargo.toml      | 20 ++++++++++++++++++++
 vortex-row/public-api.lock |  3 +++
 vortex-row/src/lib.rs      | 14 ++++++++++++++
 5 files changed, 46 insertions(+)
 create mode 100644 vortex-row/Cargo.toml
 create mode 100644 vortex-row/public-api.lock
 create mode 100644 vortex-row/src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
index 2819f2bacd0..63a608277d3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11034,6 +11034,13 @@ dependencies = [
  "vortex-tui",
 ]
 
+[[package]]
+name = "vortex-row"
+version = "0.1.0"
+dependencies = [
+ "vortex-session",
+]
+
 [[package]]
 name = "vortex-runend"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index fb87a953154..9fae5b564bf 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,6 +7,7 @@ members = [
     "vortex-mask",
     "vortex-utils",
     "vortex-session",
+    "vortex-row",
     "vortex-flatbuffers",
     "vortex-metrics",
     "vortex-io",
@@ -291,6 +292,7 @@ vortex-mask = { version = "0.1.0", path = "./vortex-mask", default-features = fa
 vortex-metrics = { version = "0.1.0", path = "./vortex-metrics", default-features = false }
 vortex-pco = { version = "0.1.0", path = "./encodings/pco", default-features = false }
 vortex-proto = { version = "0.1.0", path = "./vortex-proto", default-features = false }
+vortex-row = { version = "0.1.0", path = "./vortex-row", default-features = false }
 vortex-runend = { version = "0.1.0", path = "./encodings/runend", default-features = false }
 vortex-scan = { version = "0.1.0", path = "./vortex-scan", default-features = false }
 vortex-sequence = { version = "0.1.0", path = "encodings/sequence", default-features = false }
diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml
new file mode 100644
index 00000000000..7515715392c
--- /dev/null
+++ b/vortex-row/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "vortex-row"
+authors = { workspace = true }
+categories = { workspace = true }
+description = "Row-oriented byte encoder for Vortex arrays, analogous to arrow-row."
+edition = { workspace = true }
+homepage = { workspace = true }
+include = { workspace = true }
+keywords = { workspace = true }
+license = { workspace = true }
+readme = { workspace = true }
+repository = { workspace = true }
+rust-version = { workspace = true }
+version = { workspace = true }
+
+[lints]
+workspace = true
+
+[dependencies]
+vortex-session = { workspace = true }
diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock
new file mode 100644
index 00000000000..d507aa46a00
--- /dev/null
+++ b/vortex-row/public-api.lock
@@ -0,0 +1,3 @@
+pub mod vortex_row
+
+pub fn vortex_row::initialize(&vortex_session::VortexSession)
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
new file mode 100644
index 00000000000..f675ca12f4d
--- /dev/null
+++ b/vortex-row/src/lib.rs
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-oriented byte encoder, analogous to Apache Arrow's `arrow-row` crate.
+//!
+//! Subsequent commits add the encoder, decoder helpers, and per-encoding fast paths.
+//! This commit only establishes the crate skeleton and an `initialize` stub.
+
+use vortex_session::VortexSession;
+
+/// Register the row-encoding scalar functions on the given session.
+///
+/// Currently a stub: subsequent commits register `RowSize` and `RowEncode` here.
+pub fn initialize(_session: &VortexSession) {}

From 4f4aca5f32e93729f1cc6acf6767901e47fcf4aa Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:01:35 +0000
Subject: [PATCH 02/26] vortex-row: add SortField and RowEncodeOptions

Introduce the per-column sort-field options and the variadic-function
options struct used by the upcoming RowSize / RowEncode scalar functions.

`RowEncodeOptions::fields` uses a `SmallVec<[SortField; 4]>` so typical
1-4 column keys avoid a heap allocation. Includes a compact serialize /
deserialize helper used later by the scalar-function metadata round-trip.

Signed-off-by: Claude <noreply@anthropic.com>
---
 Cargo.lock                 |   2 +
 vortex-row/Cargo.toml      |   2 +
 vortex-row/public-api.lock | 156 ++++++++++++++++++++++++++++++++++++
 vortex-row/src/lib.rs      |   4 +
 vortex-row/src/options.rs  | 157 +++++++++++++++++++++++++++++++++++++
 5 files changed, 321 insertions(+)
 create mode 100644 vortex-row/src/options.rs

diff --git a/Cargo.lock b/Cargo.lock
index 63a608277d3..ce2b4e6d41b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11038,6 +11038,8 @@ dependencies = [
 name = "vortex-row"
 version = "0.1.0"
 dependencies = [
+ "smallvec",
+ "vortex-error",
  "vortex-session",
 ]
 
diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml
index 7515715392c..3e314fd7697 100644
--- a/vortex-row/Cargo.toml
+++ b/vortex-row/Cargo.toml
@@ -17,4 +17,6 @@ version = { workspace = true }
 workspace = true
 
 [dependencies]
+smallvec = { workspace = true }
+vortex-error = { workspace = true }
 vortex-session = { workspace = true }
diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock
index d507aa46a00..998a7712f2d 100644
--- a/vortex-row/public-api.lock
+++ b/vortex-row/public-api.lock
@@ -1,3 +1,159 @@
 pub mod vortex_row
 
+pub mod vortex_row::options
+
+pub struct vortex_row::options::RowEncodeOptions
+
+pub vortex_row::options::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]>
+
+impl vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::new(impl core::iter::traits::collect::IntoIterator<Item = vortex_row::options::SortField>) -> Self
+
+impl core::clone::Clone for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::clone(&self) -> vortex_row::options::RowEncodeOptions
+
+impl core::cmp::Eq for vortex_row::options::RowEncodeOptions
+
+impl core::cmp::PartialEq for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::eq(&self, &vortex_row::options::RowEncodeOptions) -> bool
+
+impl core::fmt::Debug for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::fmt::Display for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::hash::Hash for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&self, &mut __H)
+
+impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions
+
+pub struct vortex_row::options::SortField
+
+pub vortex_row::options::SortField::descending: bool
+
+pub vortex_row::options::SortField::nulls_first: bool
+
+impl vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::new(bool, bool) -> Self
+
+pub fn vortex_row::options::SortField::non_null_sentinel(&self) -> u8
+
+pub fn vortex_row::options::SortField::null_sentinel(&self) -> u8
+
+impl core::clone::Clone for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::clone(&self) -> vortex_row::options::SortField
+
+impl core::cmp::Eq for vortex_row::options::SortField
+
+impl core::cmp::PartialEq for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::eq(&self, &vortex_row::options::SortField) -> bool
+
+impl core::default::Default for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::default() -> Self
+
+impl core::fmt::Debug for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::fmt::Display for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::hash::Hash for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::hash<__H: core::hash::Hasher>(&self, &mut __H)
+
+impl core::marker::Copy for vortex_row::options::SortField
+
+impl core::marker::StructuralPartialEq for vortex_row::options::SortField
+
+pub const vortex_row::options::FIELDS_INLINE: usize
+
+pub struct vortex_row::RowEncodeOptions
+
+pub vortex_row::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]>
+
+impl vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::new(impl core::iter::traits::collect::IntoIterator<Item = vortex_row::options::SortField>) -> Self
+
+impl core::clone::Clone for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::clone(&self) -> vortex_row::options::RowEncodeOptions
+
+impl core::cmp::Eq for vortex_row::options::RowEncodeOptions
+
+impl core::cmp::PartialEq for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::eq(&self, &vortex_row::options::RowEncodeOptions) -> bool
+
+impl core::fmt::Debug for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::fmt::Display for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::hash::Hash for vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&self, &mut __H)
+
+impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions
+
+pub struct vortex_row::SortField
+
+pub vortex_row::SortField::descending: bool
+
+pub vortex_row::SortField::nulls_first: bool
+
+impl vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::new(bool, bool) -> Self
+
+pub fn vortex_row::options::SortField::non_null_sentinel(&self) -> u8
+
+pub fn vortex_row::options::SortField::null_sentinel(&self) -> u8
+
+impl core::clone::Clone for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::clone(&self) -> vortex_row::options::SortField
+
+impl core::cmp::Eq for vortex_row::options::SortField
+
+impl core::cmp::PartialEq for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::eq(&self, &vortex_row::options::SortField) -> bool
+
+impl core::default::Default for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::default() -> Self
+
+impl core::fmt::Debug for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::fmt::Display for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::hash::Hash for vortex_row::options::SortField
+
+pub fn vortex_row::options::SortField::hash<__H: core::hash::Hasher>(&self, &mut __H)
+
+impl core::marker::Copy for vortex_row::options::SortField
+
+impl core::marker::StructuralPartialEq for vortex_row::options::SortField
+
 pub fn vortex_row::initialize(&vortex_session::VortexSession)
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
index f675ca12f4d..9e62f25caf2 100644
--- a/vortex-row/src/lib.rs
+++ b/vortex-row/src/lib.rs
@@ -6,6 +6,10 @@
 //! Subsequent commits add the encoder, decoder helpers, and per-encoding fast paths.
 //! This commit only establishes the crate skeleton and an `initialize` stub.
 
+pub mod options;
+
+pub use options::RowEncodeOptions;
+pub use options::SortField;
 use vortex_session::VortexSession;
 
 /// Register the row-encoding scalar functions on the given session.
diff --git a/vortex-row/src/options.rs b/vortex-row/src/options.rs
new file mode 100644
index 00000000000..a9e5e2b18ab
--- /dev/null
+++ b/vortex-row/src/options.rs
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt::Display;
+use std::fmt::Formatter;
+
+use smallvec::SmallVec;
+
+/// Per-column options for the row-oriented byte encoder.
+///
+/// These options control how a single column is encoded into row bytes:
+/// - `descending`: if true, the encoded value bytes are bit-inverted so that
+///   lexicographic byte comparison reflects the reverse of the natural ordering.
+///   The null sentinel byte is NOT inverted, so nulls keep their requested
+///   position relative to non-nulls.
+/// - `nulls_first`: if true, nulls sort before non-nulls. If false, nulls sort
+///   after non-nulls. Implemented via the sentinel byte that precedes every
+///   value's encoded bytes.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct SortField {
+    /// If true, encoded value bytes are bit-inverted so lexicographic byte
+    /// comparison reflects the reverse of the natural ordering.
+    pub descending: bool,
+    /// If true, nulls sort before non-null values; otherwise nulls sort after.
+    pub nulls_first: bool,
+}
+
+impl Default for SortField {
+    fn default() -> Self {
+        Self {
+            descending: false,
+            nulls_first: true,
+        }
+    }
+}
+
+impl Display for SortField {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "descending={}, nulls_first={}",
+            self.descending, self.nulls_first
+        )
+    }
+}
+
+impl SortField {
+    /// Construct a new `SortField` with explicit options.
+    pub fn new(descending: bool, nulls_first: bool) -> Self {
+        Self {
+            descending,
+            nulls_first,
+        }
+    }
+
+    /// Returns the sentinel byte to write for a non-null value.
+    #[inline]
+    pub fn non_null_sentinel(&self) -> u8 {
+        // Non-null is always 0x01. Null choices are < or > 0x01.
+        0x01
+    }
+
+    /// Returns the sentinel byte to write for a null value.
+    #[inline]
+    pub fn null_sentinel(&self) -> u8 {
+        if self.nulls_first {
+            // Nulls before non-nulls (smaller byte sorts first).
+            0x00
+        } else {
+            // Nulls after non-nulls (larger byte sorts later).
+            0x02
+        }
+    }
+}
+
+/// Inline capacity for [`RowEncodeOptions::fields`]. Up to this many [`SortField`]s
+/// are held inline without a heap allocation; beyond, the storage spills.
+pub const FIELDS_INLINE: usize = 4;
+
+/// Options for the variadic [`RowSize`] and [`RowEncode`] scalar functions:
+/// one [`SortField`] per input column.
+///
+/// Stored in a [`SmallVec`] so that typical 1–4 column keys avoid a heap
+/// allocation; longer field lists spill to the heap transparently.
+///
+/// [`RowSize`]: super::size::RowSize
+/// [`RowEncode`]: super::encode::RowEncode
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct RowEncodeOptions {
+    /// Per-column sort fields, in left-to-right column order.
+    pub fields: SmallVec<[SortField; FIELDS_INLINE]>,
+}
+
+impl RowEncodeOptions {
+    /// Construct a new `RowEncodeOptions` from any iterator of [`SortField`]s.
+    pub fn new(fields: impl IntoIterator<Item = SortField>) -> Self {
+        Self {
+            fields: fields.into_iter().collect(),
+        }
+    }
+}
+
+impl Display for RowEncodeOptions {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[")?;
+        for (i, field) in self.fields.iter().enumerate() {
+            if i > 0 {
+                write!(f, ", ")?;
+            }
+            write!(f, "{}", field)?;
+        }
+        write!(f, "]")
+    }
+}
+
+/// Serialize a [`RowEncodeOptions`] to a compact byte vector: 4-byte LE length followed by
+/// `2 * len` bytes (descending + nulls_first booleans for each field).
+pub(crate) fn serialize_row_encode_options(opts: &RowEncodeOptions) -> Vec<u8> {
+    use vortex_error::VortexExpect;
+    let n =
+        u32::try_from(opts.fields.len()).vortex_expect("RowEncodeOptions length must fit in u32");
+    let mut out = Vec::with_capacity(4 + 2 * opts.fields.len());
+    out.extend_from_slice(&n.to_le_bytes());
+    for f in &opts.fields {
+        out.push(u8::from(f.descending));
+        out.push(u8::from(f.nulls_first));
+    }
+    out
+}
+
+/// Deserialize a [`RowEncodeOptions`] produced by [`serialize_row_encode_options`].
+pub(crate) fn deserialize_row_encode_options(
+    bytes: &[u8],
+) -> vortex_error::VortexResult<RowEncodeOptions> {
+    if bytes.len() < 4 {
+        vortex_error::vortex_bail!("RowEncodeOptions metadata must contain a 4-byte length prefix");
+    }
+    let n = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize;
+    let expected = 4 + 2 * n;
+    if bytes.len() != expected {
+        vortex_error::vortex_bail!(
+            "RowEncodeOptions metadata wrong size: got {}, expected {}",
+            bytes.len(),
+            expected
+        );
+    }
+    let mut fields: SmallVec<[SortField; FIELDS_INLINE]> = SmallVec::with_capacity(n);
+    let mut i = 4;
+    for _ in 0..n {
+        fields.push(SortField {
+            descending: bytes[i] != 0,
+            nulls_first: bytes[i + 1] != 0,
+        });
+        i += 2;
+    }
+    Ok(RowEncodeOptions { fields })
+}

From 1b7af91e7c0b1fa4269b1768809f20c7ff2329b9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:04:03 +0000
Subject: [PATCH 03/26] vortex-row: codec for fixed-width canonical types

Add the byte-encoding kernels for the fixed-width portion of the row
encoder: Null, Bool, Primitive (12 PTypes), and Decimal (i8..i128). Each
encoder writes a 1-byte sentinel followed by the value's row-comparable
bytes (sign-flipped big-endian for signed ints, sign-aware mask for
floats, etc.).

The size pass is a constant `width-per-row` add for these types; the
encode pass walks rows and writes into the shared output buffer at
`offsets[i] + cursors[i]`. `row_width_for_dtype` classifies the column
based purely on its DType.

Scalar-level encoders (`encode_scalar_primitive` / `encode_scalar_bool`
/ `encode_scalar_null` / `encode_scalar` / `encoded_size_for_scalar`)
are included for the same fixed-width subset; varlen and nested
canonical variants bail with a clear "not yet supported" error and
land in follow-up commits.

The implementation is deliberately the simplest correct version:
bounds-checked array indexing, no `copy_nonoverlapping`, no validity
fast-path helper. Subsequent PRs evolve this toward the optimized form.

Signed-off-by: Claude <noreply@anthropic.com>
---
 Cargo.lock                 |   4 +
 vortex-row/Cargo.toml      |   4 +
 vortex-row/public-api.lock | 100 ++++++
 vortex-row/src/codec.rs    | 667 +++++++++++++++++++++++++++++++++++++
 vortex-row/src/lib.rs      |   1 +
 5 files changed, 776 insertions(+)
 create mode 100644 vortex-row/src/codec.rs

diff --git a/Cargo.lock b/Cargo.lock
index ce2b4e6d41b..0b84f6dd260 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11038,8 +11038,12 @@ dependencies = [
 name = "vortex-row"
 version = "0.1.0"
 dependencies = [
+ "bytes",
  "smallvec",
+ "vortex-array",
+ "vortex-buffer",
  "vortex-error",
+ "vortex-mask",
  "vortex-session",
 ]
 
diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml
index 3e314fd7697..aaed9a55f51 100644
--- a/vortex-row/Cargo.toml
+++ b/vortex-row/Cargo.toml
@@ -17,6 +17,10 @@ version = { workspace = true }
 workspace = true
 
 [dependencies]
+bytes = { workspace = true }
 smallvec = { workspace = true }
+vortex-array = { workspace = true }
+vortex-buffer = { workspace = true }
 vortex-error = { workspace = true }
+vortex-mask = { workspace = true }
 vortex-session = { workspace = true }
diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock
index 998a7712f2d..4990e30ba16 100644
--- a/vortex-row/public-api.lock
+++ b/vortex-row/public-api.lock
@@ -1,5 +1,105 @@
 pub mod vortex_row
 
+pub mod vortex_row::codec
+
+pub enum vortex_row::codec::RowWidth
+
+pub vortex_row::codec::RowWidth::Fixed(u32)
+
+pub vortex_row::codec::RowWidth::Variable
+
+impl core::clone::Clone for vortex_row::codec::RowWidth
+
+pub fn vortex_row::codec::RowWidth::clone(&self) -> vortex_row::codec::RowWidth
+
+impl core::cmp::Eq for vortex_row::codec::RowWidth
+
+impl core::cmp::PartialEq for vortex_row::codec::RowWidth
+
+pub fn vortex_row::codec::RowWidth::eq(&self, &vortex_row::codec::RowWidth) -> bool
+
+impl core::fmt::Debug for vortex_row::codec::RowWidth
+
+pub fn vortex_row::codec::RowWidth::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::marker::Copy for vortex_row::codec::RowWidth
+
+impl core::marker::StructuralPartialEq for vortex_row::codec::RowWidth
+
+pub const vortex_row::codec::BOOL_ENCODED_SIZE: u32
+
+pub const vortex_row::codec::VARLEN_BLOCK_SIZE: usize
+
+pub const vortex_row::codec::VARLEN_BLOCK_TOTAL: usize
+
+pub trait vortex_row::codec::RowEncode: core::marker::Copy
+
+pub fn vortex_row::codec::RowEncode::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for f32
+
+pub fn f32::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for f64
+
+pub fn f64::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for half::binary16::f16
+
+pub fn half::binary16::f16::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for i128
+
+pub fn i128::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for i16
+
+pub fn i16::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for i32
+
+pub fn i32::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for i64
+
+pub fn i64::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for i8
+
+pub fn i8::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for u16
+
+pub fn u16::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for u32
+
+pub fn u32::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for u64
+
+pub fn u64::encode_to(self, &mut [u8], bool)
+
+impl vortex_row::codec::RowEncode for u8
+
+pub fn u8::encode_to(self, &mut [u8], bool)
+
+pub fn vortex_row::codec::encode_scalar(&vortex_array::scalar::Scalar, vortex_row::options::SortField) -> vortex_error::VortexResult<bytes::bytes::Bytes>
+
+pub fn vortex_row::codec::encode_scalar_bool(core::option::Option<bool>, vortex_row::options::SortField, &mut vortex_buffer::ByteBufferMut)
+
+pub fn vortex_row::codec::encode_scalar_null(vortex_row::options::SortField, bool, &mut vortex_buffer::ByteBufferMut)
+
+pub fn vortex_row::codec::encode_scalar_primitive(vortex_array::dtype::ptype::PType, vortex_array::scalar::typed_view::primitive::pvalue::PValue, vortex_row::options::SortField, bool, &mut vortex_buffer::ByteBufferMut) -> vortex_error::VortexResult<()>
+
+pub fn vortex_row::codec::encoded_size_for_scalar(&vortex_array::scalar::Scalar, vortex_row::options::SortField) -> vortex_error::VortexResult<u32>
+
+pub fn vortex_row::codec::field_encode(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
+
+pub fn vortex_row::codec::field_size(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
+
+pub fn vortex_row::codec::row_width_for_dtype(&vortex_array::dtype::DType) -> vortex_error::VortexResult<vortex_row::codec::RowWidth>
+
 pub mod vortex_row::options
 
 pub struct vortex_row::options::RowEncodeOptions
diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
new file mode 100644
index 00000000000..73aa7a37db4
--- /dev/null
+++ b/vortex-row/src/codec.rs
@@ -0,0 +1,667 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::expect_used,
+    reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32 elsewhere"
+)]
+
+//! Pure byte-encoding kernels for row-oriented output, operating on `Canonical` variants.
+//!
+//! The encoded byte format produces a lexicographically byte-comparable representation:
+//! comparing the byte slices of two encoded rows yields the same ordering as the
+//! original logical (tuple) comparison of their values, modulo nulls placement and
+//! descending-ness as configured by [`SortField`].
+//!
+//! Conventions:
+//! - Every value is preceded by a 1-byte sentinel that orders nulls relative to non-nulls.
+//! - For `descending`, only the **value** bytes are bit-inverted (XOR with 0xFF), not the
+//!   sentinel.
+//! - Fixed-width integers are big-endian, with the sign bit flipped for signed types.
+//! - Floats are bit-pattern big-endian with sign-aware mask: non-negative flips the top
+//!   bit; negative flips all bits.
+//!
+//! This commit covers only the fixed-width canonical variants (Null, Bool, Primitive,
+//! Decimal); variable-length and nested canonical variants land in later commits.
+
+use vortex_array::Canonical;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::DecimalArray;
+use vortex_array::arrays::NullArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::DecimalType;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::PType;
+use vortex_array::dtype::half::f16;
+use vortex_array::match_each_native_ptype;
+use vortex_buffer::ByteBufferMut;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+
+use crate::options::SortField;
+
+/// Size in bytes of the encoded form of a single bool value (sentinel + 1 content byte).
+pub const BOOL_ENCODED_SIZE: u32 = 2;
+
+/// Block size used in the variable-length encoding.
+pub const VARLEN_BLOCK_SIZE: usize = 32;
+/// Total bytes per varlen block including the trailing continuation marker.
+pub const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1;
+
+/// Returns the size in bytes of the encoded form of a variable-length value of the given length.
+#[inline]
+#[allow(
+    dead_code,
+    reason = "used once varlen support lands in a follow-up commit"
+)]
+fn encoded_size_for_varlen(len: usize) -> u32 {
+    // 1 sentinel + ceil(len/32)*33 content bytes (or 1 zero terminator if empty)
+    if len == 0 {
+        1 + 1
+    } else {
+        let blocks = len.div_ceil(VARLEN_BLOCK_SIZE);
+        1 + (blocks as u32) * (VARLEN_BLOCK_TOTAL as u32)
+    }
+}
+
+/// Constant per-row size in bytes for fixed-width encodings (including 1-byte sentinel).
+#[inline]
+const fn encoded_size_for_fixed(value_bytes: u32) -> u32 {
+    1 + value_bytes
+}
+
+/// Per-row width classification for a column.
+///
+/// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless
+/// of null-ness or value. `Variable` means per-row sizes depend on the data (Utf8/Binary,
+/// List, or any composite that recurses through a variable-width field).
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum RowWidth {
+    /// Per-row width is the same constant for every row in the column.
+    Fixed(u32),
+    /// Per-row width is data-dependent.
+    Variable,
+}
+
+/// Classify a column's per-row encoded width by inspecting only its [`DType`].
+///
+/// Returns `Fixed(w)` when every row encodes to exactly `w` bytes (sentinel + value),
+/// regardless of null-ness or value. Returns `Variable` when per-row sizes depend on the
+/// data.
+///
+/// Classification does not depend on the [`SortField`]: null-vs-non-null encoding width is
+/// the same for fixed-width types (the sentinel byte plus zero-fill for nulls).
+///
+/// # Errors
+///
+/// Returns an error for dtypes that the row encoder does not yet support. Variable-length
+/// dtypes (Utf8/Binary), nested dtypes (Struct/FixedSizeList/Extension), and
+/// Variant/Union/List arrive in later commits.
+pub fn row_width_for_dtype(dtype: &DType) -> VortexResult<RowWidth> {
+    match dtype {
+        DType::Null => Ok(RowWidth::Fixed(1)),
+        DType::Bool(_) => Ok(RowWidth::Fixed(BOOL_ENCODED_SIZE)),
+        DType::Primitive(ptype, _) => Ok(RowWidth::Fixed(encoded_size_for_fixed(
+            ptype.byte_width() as u32,
+        ))),
+        DType::Decimal(dt, _) => {
+            let vt = DecimalType::smallest_decimal_value_type(dt);
+            Ok(RowWidth::Fixed(encoded_size_for_fixed(
+                vt.byte_width() as u32
+            )))
+        }
+        DType::Utf8(_) | DType::Binary(_) => {
+            vortex_bail!("row encoding for {} is not yet supported", dtype)
+        }
+        DType::Struct(..) | DType::FixedSizeList(..) | DType::List(..) | DType::Extension(..) => {
+            vortex_bail!("row encoding for {} is not yet supported", dtype)
+        }
+        DType::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+        DType::Union(_) => vortex_bail!("row encoding does not support Union arrays"),
+    }
+}
+
+/// Compute the per-row size in bytes for the given canonical view, adding into `sizes`.
+///
+/// `sizes` is expected to be initialized (typically zeroed). This function *adds* the
+/// per-row size to each entry so multiple columns can accumulate into the same buffer.
+///
+/// # Errors
+///
+/// Returns an error for unsupported canonical variants. Variable-length and nested
+/// variants land in later commits.
+pub fn field_size(
+    canonical: &Canonical,
+    _field: SortField,
+    sizes: &mut [u32],
+    _ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match canonical {
+        Canonical::Null(arr) => add_size_null(arr, sizes),
+        Canonical::Bool(_) => add_size_const(sizes, encoded_size_for_fixed(1)),
+        Canonical::Primitive(arr) => add_size_primitive(arr, sizes),
+        Canonical::Decimal(arr) => add_size_decimal(arr, sizes),
+        Canonical::VarBinView(_)
+        | Canonical::Struct(_)
+        | Canonical::FixedSizeList(_)
+        | Canonical::Extension(_)
+        | Canonical::List(_) => vortex_bail!(
+            "row encoding does not yet support canonical type {:?}",
+            canonical.dtype()
+        ),
+        Canonical::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+    }
+    Ok(())
+}
+
+/// Encode each row's bytes for the given canonical view into `out`, writing starting at
+/// `offsets[i] + cursors[i]` for row `i` and advancing `cursors[i]` by the number of
+/// bytes written.
+///
+/// After this call returns successfully, `cursors[i]` will have advanced by exactly the
+/// per-row contribution previously computed by [`field_size`] for the same column.
+pub fn field_encode(
+    canonical: &Canonical,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match canonical {
+        Canonical::Null(arr) => encode_null(arr, field, offsets, cursors, out),
+        Canonical::Bool(arr) => encode_bool(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::VarBinView(_)
+        | Canonical::Struct(_)
+        | Canonical::FixedSizeList(_)
+        | Canonical::Extension(_)
+        | Canonical::List(_) => vortex_bail!(
+            "row encoding does not yet support canonical type {:?}",
+            canonical.dtype()
+        ),
+        Canonical::Variant(_) => {
+            vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
+        }
+    }
+    Ok(())
+}
+
+fn add_size_const(sizes: &mut [u32], add: u32) {
+    for s in sizes.iter_mut() {
+        *s += add;
+    }
+}
+
+fn add_size_null(arr: &NullArray, sizes: &mut [u32]) {
+    debug_assert_eq!(arr.len(), sizes.len());
+    // Just a sentinel byte per row.
+    for s in sizes.iter_mut() {
+        *s += 1;
+    }
+}
+
+fn add_size_primitive(arr: &PrimitiveArray, sizes: &mut [u32]) {
+    let width = arr.ptype().byte_width() as u32;
+    add_size_const(sizes, encoded_size_for_fixed(width));
+}
+
+fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) {
+    let width = arr.values_type().byte_width() as u32;
+    add_size_const(sizes, encoded_size_for_fixed(width));
+}
+
+fn encode_null(
+    arr: &NullArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+) {
+    let sentinel = field.null_sentinel();
+    for i in 0..arr.len() {
+        let pos = (row_offsets[i] + col_offset[i]) as usize;
+        out[pos] = sentinel;
+        col_offset[i] += 1;
+    }
+}
+
+fn encode_bool(
+    arr: &BoolArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let bits = arr.clone().into_bit_buffer();
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let xor = if field.descending { 0xFF } else { 0x00 };
+    for i in 0..bits.len() {
+        let pos = (row_offsets[i] + col_offset[i]) as usize;
+        if mask.value(i) {
+            out[pos] = non_null;
+            // false=0x01, true=0x02 so false < true; XOR for descending
+            let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 };
+            out[pos + 1] = raw ^ xor;
+        } else {
+            out[pos] = null;
+            out[pos + 1] = 0;
+        }
+        col_offset[i] += BOOL_ENCODED_SIZE;
+    }
+    Ok(())
+}
+
+fn encode_primitive(
+    arr: &PrimitiveArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match_each_native_ptype!(arr.ptype(), |T| {
+        encode_primitive_typed::<T>(arr, field, row_offsets, col_offset, out, ctx)?;
+    });
+    Ok(())
+}
+
+fn encode_primitive_typed<T: NativePType + RowEncode>(
+    arr: &PrimitiveArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let slice: &[T] = arr.as_slice();
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let value_bytes = size_of::<T>();
+    for (i, &v) in slice.iter().enumerate() {
+        let pos = (row_offsets[i] + col_offset[i]) as usize;
+        if mask.value(i) {
+            out[pos] = non_null;
+            v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
+        } else {
+            out[pos] = null;
+            // Zero-fill the value bytes.
+            for b in &mut out[pos + 1..pos + 1 + value_bytes] {
+                *b = 0;
+            }
+        }
+        col_offset[i] += encoded_size_for_fixed(value_bytes as u32);
+    }
+    Ok(())
+}
+
+fn encode_decimal(
+    arr: &DecimalArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    match arr.values_type() {
+        DecimalType::I8 => {
+            encode_decimal_typed::<i8>(arr, &mask, field, row_offsets, col_offset, out)
+        }
+        DecimalType::I16 => {
+            encode_decimal_typed::<i16>(arr, &mask, field, row_offsets, col_offset, out)
+        }
+        DecimalType::I32 => {
+            encode_decimal_typed::<i32>(arr, &mask, field, row_offsets, col_offset, out)
+        }
+        DecimalType::I64 => {
+            encode_decimal_typed::<i64>(arr, &mask, field, row_offsets, col_offset, out)
+        }
+        DecimalType::I128 => {
+            encode_decimal_typed::<i128>(arr, &mask, field, row_offsets, col_offset, out)
+        }
+        DecimalType::I256 => {
+            vortex_bail!("row encoding for Decimal256 is not yet implemented")
+        }
+    }
+    Ok(())
+}
+
+fn encode_decimal_typed<T>(
+    arr: &DecimalArray,
+    mask: &vortex_mask::Mask,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+) where
+    T: vortex_array::dtype::NativeDecimalType + RowEncode,
+{
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let value_bytes = size_of::<T>();
+    let total = encoded_size_for_fixed(value_bytes as u32);
+    let slice = arr.buffer::<T>();
+    for i in 0..slice.len() {
+        let pos = (row_offsets[i] + col_offset[i]) as usize;
+        if mask.value(i) {
+            out[pos] = non_null;
+            slice[i].encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
+        } else {
+            out[pos] = null;
+            for b in &mut out[pos + 1..pos + 1 + value_bytes] {
+                *b = 0;
+            }
+        }
+        col_offset[i] += total;
+    }
+}
+
+/// Internal trait for encoding a fixed-width native value into byte slots.
+///
+/// Implementations must produce a sequence of `size_of::<Self>()` bytes that is
+/// lexicographically byte-comparable according to the natural ordering of the type.
+pub trait RowEncode: Copy {
+    /// Encode this value into `out`, inverting the bytes for descending order.
+    fn encode_to(self, out: &mut [u8], descending: bool);
+}
+
+macro_rules! impl_row_encode_unsigned {
+    ($t:ty) => {
+        impl RowEncode for $t {
+            #[inline]
+            fn encode_to(self, out: &mut [u8], descending: bool) {
+                let bytes = self.to_be_bytes();
+                if descending {
+                    for (i, b) in bytes.iter().enumerate() {
+                        out[i] = b ^ 0xFF;
+                    }
+                } else {
+                    out.copy_from_slice(&bytes);
+                }
+            }
+        }
+    };
+}
+
+macro_rules! impl_row_encode_signed {
+    ($t:ty) => {
+        impl RowEncode for $t {
+            #[inline]
+            fn encode_to(self, out: &mut [u8], descending: bool) {
+                let mut bytes = self.to_be_bytes();
+                // Flip sign bit so negatives < non-negatives lexicographically.
+                bytes[0] ^= 0x80;
+                if descending {
+                    for (i, b) in bytes.iter().enumerate() {
+                        out[i] = b ^ 0xFF;
+                    }
+                } else {
+                    out.copy_from_slice(&bytes);
+                }
+            }
+        }
+    };
+}
+
+impl_row_encode_unsigned!(u8);
+impl_row_encode_unsigned!(u16);
+impl_row_encode_unsigned!(u32);
+impl_row_encode_unsigned!(u64);
+impl_row_encode_signed!(i8);
+impl_row_encode_signed!(i16);
+impl_row_encode_signed!(i32);
+impl_row_encode_signed!(i64);
+impl_row_encode_signed!(i128);
+
+impl RowEncode for f32 {
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        let mask: u32 = if (bits >> 31) == 0 {
+            0x8000_0000
+        } else {
+            0xFFFF_FFFF
+        };
+        let mut bytes = (bits ^ mask).to_be_bytes();
+        if descending {
+            for b in bytes.iter_mut() {
+                *b ^= 0xFF;
+            }
+        }
+        out.copy_from_slice(&bytes);
+    }
+}
+
+impl RowEncode for f64 {
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        let mask: u64 = if (bits >> 63) == 0 {
+            0x8000_0000_0000_0000
+        } else {
+            0xFFFF_FFFF_FFFF_FFFF
+        };
+        let mut bytes = (bits ^ mask).to_be_bytes();
+        if descending {
+            for b in bytes.iter_mut() {
+                *b ^= 0xFF;
+            }
+        }
+        out.copy_from_slice(&bytes);
+    }
+}
+
+impl RowEncode for f16 {
+    fn encode_to(self, out: &mut [u8], descending: bool) {
+        let bits = self.to_bits();
+        let mask: u16 = if (bits >> 15) == 0 { 0x8000 } else { 0xFFFF };
+        let mut bytes = (bits ^ mask).to_be_bytes();
+        if descending {
+            for b in bytes.iter_mut() {
+                *b ^= 0xFF;
+            }
+        }
+        out.copy_from_slice(&bytes);
+    }
+}
+
+/// Encode a single scalar primitive value of a known PType into a buffer slot.
+pub fn encode_scalar_primitive(
+    ptype: PType,
+    value: vortex_array::scalar::PValue,
+    field: SortField,
+    is_null: bool,
+    out: &mut ByteBufferMut,
+) -> VortexResult<()> {
+    if is_null {
+        out.push(field.null_sentinel());
+        return Ok(());
+    }
+    out.push(field.non_null_sentinel());
+    let width = ptype.byte_width();
+    let mut tmp = [0u8; 16];
+    let buf = &mut tmp[..width];
+    match_each_native_ptype!(
+        ptype,
+        integral: |T| {
+            let v: T = T::try_from(value)?;
+            v.encode_to(buf, field.descending);
+        },
+        floating: |T| {
+            let v: T = T::try_from(value)?;
+            v.encode_to(buf, field.descending);
+        }
+    );
+    out.extend_from_slice(buf);
+    Ok(())
+}
+
+/// Encode a single boolean value.
+pub fn encode_scalar_bool(value: Option<bool>, field: SortField, out: &mut ByteBufferMut) {
+    match value {
+        None => {
+            out.push(field.null_sentinel());
+            out.push(0);
+        }
+        Some(b) => {
+            out.push(field.non_null_sentinel());
+            let raw = if b { 0x02u8 } else { 0x01u8 };
+            let xor = if field.descending { 0xFFu8 } else { 0 };
+            out.push(raw ^ xor);
+        }
+    }
+}
+
+/// Encode a single null-type value (only the sentinel).
+pub fn encode_scalar_null(field: SortField, is_null: bool, out: &mut ByteBufferMut) {
+    if is_null {
+        out.push(field.null_sentinel());
+    } else {
+        out.push(field.non_null_sentinel());
+    }
+}
+
+/// Returns the per-row encoded size for a scalar value (used for the Constant fast path).
+pub fn encoded_size_for_scalar(
+    scalar: &vortex_array::scalar::Scalar,
+    _field: SortField,
+) -> VortexResult<u32> {
+    if scalar.is_null() {
+        match scalar.dtype() {
+            DType::Null => Ok(1),
+            DType::Bool(_) => Ok(BOOL_ENCODED_SIZE),
+            DType::Primitive(ptype, _) => Ok(encoded_size_for_fixed(ptype.byte_width() as u32)),
+            DType::Decimal(dt, _) => {
+                let vt = DecimalType::smallest_decimal_value_type(dt);
+                Ok(encoded_size_for_fixed(vt.byte_width() as u32))
+            }
+            _ => vortex_bail!(
+                "unsupported scalar dtype for row encoding: {}",
+                scalar.dtype()
+            ),
+        }
+    } else {
+        match scalar.dtype() {
+            DType::Null => Ok(1),
+            DType::Bool(_) => Ok(BOOL_ENCODED_SIZE),
+            DType::Primitive(ptype, _) => Ok(encoded_size_for_fixed(ptype.byte_width() as u32)),
+            DType::Decimal(..) => {
+                let dec = scalar.as_decimal();
+                let vt = dec
+                    .decimal_value()
+                    .map(|v| v.decimal_type())
+                    .unwrap_or(DecimalType::I128);
+                Ok(encoded_size_for_fixed(vt.byte_width() as u32))
+            }
+            _ => vortex_bail!(
+                "unsupported scalar dtype for row encoding: {}",
+                scalar.dtype()
+            ),
+        }
+    }
+}
+
+/// Encode a single scalar value into a fresh `Bytes` buffer.
+pub fn encode_scalar(
+    scalar: &vortex_array::scalar::Scalar,
+    field: SortField,
+) -> VortexResult<bytes::Bytes> {
+    use vortex_array::scalar::PValue;
+    let size = encoded_size_for_scalar(scalar, field)? as usize;
+    let mut out = ByteBufferMut::with_capacity(size);
+    if scalar.is_null() {
+        match scalar.dtype() {
+            DType::Null => out.push(field.null_sentinel()),
+            DType::Bool(_) => {
+                out.push(field.null_sentinel());
+                out.push(0);
+            }
+            DType::Primitive(ptype, _) => {
+                out.push(field.null_sentinel());
+                let width = ptype.byte_width();
+                for _ in 0..width {
+                    out.push(0);
+                }
+            }
+            DType::Decimal(dt, _) => {
+                out.push(field.null_sentinel());
+                let vt = DecimalType::smallest_decimal_value_type(dt);
+                for _ in 0..vt.byte_width() {
+                    out.push(0);
+                }
+            }
+            _ => vortex_bail!(
+                "unsupported scalar dtype for row encoding: {}",
+                scalar.dtype()
+            ),
+        }
+    } else {
+        match scalar.dtype() {
+            DType::Null => out.push(field.non_null_sentinel()),
+            DType::Bool(_) => {
+                let v = scalar.as_bool().value().unwrap_or(false);
+                encode_scalar_bool(Some(v), field, &mut out);
+            }
+            DType::Primitive(ptype, _) => {
+                let v: PValue = scalar
+                    .as_primitive()
+                    .pvalue()
+                    .ok_or_else(|| vortex_error::vortex_err!("missing primitive value"))?;
+                encode_scalar_primitive(*ptype, v, field, false, &mut out)?;
+            }
+            DType::Decimal(..) => {
+                let dec = scalar.as_decimal();
+                out.push(field.non_null_sentinel());
+                let value = dec
+                    .decimal_value()
+                    .ok_or_else(|| vortex_error::vortex_err!("missing decimal value"))?;
+                match value {
+                    vortex_array::scalar::DecimalValue::I8(v) => {
+                        let mut tmp = [0u8; 1];
+                        v.encode_to(&mut tmp, field.descending);
+                        out.extend_from_slice(&tmp);
+                    }
+                    vortex_array::scalar::DecimalValue::I16(v) => {
+                        let mut tmp = [0u8; 2];
+                        v.encode_to(&mut tmp, field.descending);
+                        out.extend_from_slice(&tmp);
+                    }
+                    vortex_array::scalar::DecimalValue::I32(v) => {
+                        let mut tmp = [0u8; 4];
+                        v.encode_to(&mut tmp, field.descending);
+                        out.extend_from_slice(&tmp);
+                    }
+                    vortex_array::scalar::DecimalValue::I64(v) => {
+                        let mut tmp = [0u8; 8];
+                        v.encode_to(&mut tmp, field.descending);
+                        out.extend_from_slice(&tmp);
+                    }
+                    vortex_array::scalar::DecimalValue::I128(v) => {
+                        let mut tmp = [0u8; 16];
+                        v.encode_to(&mut tmp, field.descending);
+                        out.extend_from_slice(&tmp);
+                    }
+                    vortex_array::scalar::DecimalValue::I256(_) => {
+                        vortex_bail!("row encoding for Decimal256 is not yet implemented")
+                    }
+                }
+            }
+            _ => vortex_bail!(
+                "unsupported scalar dtype for row encoding: {}",
+                scalar.dtype()
+            ),
+        }
+    }
+    Ok(out.freeze().into_inner())
+}
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
index 9e62f25caf2..bdac4c8f48e 100644
--- a/vortex-row/src/lib.rs
+++ b/vortex-row/src/lib.rs
@@ -6,6 +6,7 @@
 //! Subsequent commits add the encoder, decoder helpers, and per-encoding fast paths.
 //! This commit only establishes the crate skeleton and an `initialize` stub.
 
+pub mod codec;
 pub mod options;
 
 pub use options::RowEncodeOptions;

From d3f3da4b2bf165549876b7848f6aa8c1fddb40fe Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:06:19 +0000
Subject: [PATCH 04/26] vortex-row: codec for varlen canonical types

Extend the codec to handle Utf8/Binary via VarBinView arrays. Each value
encodes as a 1-byte sentinel followed by 32-byte chunks: every full
chunk has a 0xFF continuation marker; the final partial chunk pads with
zeros and writes the partial length (1..=32) as its trailing byte.

`encode_varlen_value` uses the simple byte-at-a-time XOR loop here; a
faster `copy_nonoverlapping` + stamped continuation version replaces it
in PR 2. `encode_varbinview` uses `arr.with_iterator(...)` for both the
nullable and non-nullable branches; a direct view walk for the no-nulls
branch lands in PR 2 too.

`row_width_for_dtype` now returns `Variable` for Utf8/Binary; the size
pass and encode dispatchers route through `add_size_varbinview` /
`encode_varbinview` correspondingly. The scalar encoder gains
`encode_scalar_varlen` and the matching Utf8/Binary arms.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/public-api.lock |   2 +
 vortex-row/src/codec.rs    | 147 ++++++++++++++++++++++++++++++++++---
 2 files changed, 137 insertions(+), 12 deletions(-)

diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock
index 4990e30ba16..1afc1f05442 100644
--- a/vortex-row/public-api.lock
+++ b/vortex-row/public-api.lock
@@ -92,6 +92,8 @@ pub fn vortex_row::codec::encode_scalar_null(vortex_row::options::SortField, boo
 
 pub fn vortex_row::codec::encode_scalar_primitive(vortex_array::dtype::ptype::PType, vortex_array::scalar::typed_view::primitive::pvalue::PValue, vortex_row::options::SortField, bool, &mut vortex_buffer::ByteBufferMut) -> vortex_error::VortexResult<()>
 
+pub fn vortex_row::codec::encode_scalar_varlen(core::option::Option<&[u8]>, vortex_row::options::SortField, &mut vortex_buffer::ByteBufferMut)
+
 pub fn vortex_row::codec::encoded_size_for_scalar(&vortex_array::scalar::Scalar, vortex_row::options::SortField) -> vortex_error::VortexResult<u32>
 
 pub fn vortex_row::codec::field_encode(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index 73aa7a37db4..4f70d80e5ae 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -27,10 +27,12 @@
 
 use vortex_array::Canonical;
 use vortex_array::ExecutionCtx;
+use vortex_array::accessor::ArrayAccessor;
 use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::DecimalArray;
 use vortex_array::arrays::NullArray;
 use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::VarBinViewArray;
 use vortex_array::dtype::DType;
 use vortex_array::dtype::DecimalType;
 use vortex_array::dtype::NativePType;
@@ -53,10 +55,6 @@ pub const VARLEN_BLOCK_TOTAL: usize = VARLEN_BLOCK_SIZE + 1;
 
 /// Returns the size in bytes of the encoded form of a variable-length value of the given length.
 #[inline]
-#[allow(
-    dead_code,
-    reason = "used once varlen support lands in a follow-up commit"
-)]
 fn encoded_size_for_varlen(len: usize) -> u32 {
     // 1 sentinel + ceil(len/32)*33 content bytes (or 1 zero terminator if empty)
     if len == 0 {
@@ -113,9 +111,7 @@ pub fn row_width_for_dtype(dtype: &DType) -> VortexResult<RowWidth> {
                 vt.byte_width() as u32
             )))
         }
-        DType::Utf8(_) | DType::Binary(_) => {
-            vortex_bail!("row encoding for {} is not yet supported", dtype)
-        }
+        DType::Utf8(_) | DType::Binary(_) => Ok(RowWidth::Variable),
         DType::Struct(..) | DType::FixedSizeList(..) | DType::List(..) | DType::Extension(..) => {
             vortex_bail!("row encoding for {} is not yet supported", dtype)
         }
@@ -139,15 +135,15 @@ pub fn field_size(
     canonical: &Canonical,
     _field: SortField,
     sizes: &mut [u32],
-    _ctx: &mut ExecutionCtx,
+    ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
     match canonical {
         Canonical::Null(arr) => add_size_null(arr, sizes),
         Canonical::Bool(_) => add_size_const(sizes, encoded_size_for_fixed(1)),
         Canonical::Primitive(arr) => add_size_primitive(arr, sizes),
         Canonical::Decimal(arr) => add_size_decimal(arr, sizes),
-        Canonical::VarBinView(_)
-        | Canonical::Struct(_)
+        Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?,
+        Canonical::Struct(_)
         | Canonical::FixedSizeList(_)
         | Canonical::Extension(_)
         | Canonical::List(_) => vortex_bail!(
@@ -180,8 +176,8 @@ pub fn field_encode(
         Canonical::Bool(arr) => encode_bool(arr, field, offsets, cursors, out, ctx)?,
         Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?,
         Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?,
-        Canonical::VarBinView(_)
-        | Canonical::Struct(_)
+        Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Struct(_)
         | Canonical::FixedSizeList(_)
         | Canonical::Extension(_)
         | Canonical::List(_) => vortex_bail!(
@@ -219,6 +215,25 @@ fn add_size_decimal(arr: &DecimalArray, sizes: &mut [u32]) {
     add_size_const(sizes, encoded_size_for_fixed(width));
 }
 
+fn add_size_varbinview(
+    arr: &VarBinViewArray,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let views = arr.views();
+    for (i, view) in views.iter().enumerate() {
+        let valid = mask.value(i);
+        if !valid {
+            sizes[i] += 1; // sentinel only
+        } else {
+            let len = view.len() as usize;
+            sizes[i] += encoded_size_for_varlen(len);
+        }
+    }
+    Ok(())
+}
+
 fn encode_null(
     arr: &NullArray,
     field: SortField,
@@ -369,6 +384,69 @@ fn encode_decimal_typed<T>(
     }
 }
 
+fn encode_varbinview(
+    arr: &VarBinViewArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+
+    arr.with_iterator(|iter| {
+        for (i, maybe) in iter.enumerate() {
+            let pos = (row_offsets[i] + col_offset[i]) as usize;
+            if !mask.value(i) {
+                out[pos] = null;
+                col_offset[i] += 1;
+                continue;
+            }
+            let bytes: &[u8] = maybe.unwrap_or(&[]);
+            out[pos] = non_null;
+            let written = encode_varlen_value(bytes, &mut out[pos + 1..], field.descending);
+            col_offset[i] += 1 + written;
+        }
+    });
+    Ok(())
+}
+
+/// Encode a variable-length byte slice into `out` in 32-byte blocks with
+/// continuation markers. Returns the number of bytes written.
+fn encode_varlen_value(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 {
+    let xor = if descending { 0xFFu8 } else { 0x00 };
+    if bytes.is_empty() {
+        // Single zero terminator.
+        out[0] = xor;
+        return 1;
+    }
+    let mut written = 0usize;
+    let mut remaining = bytes;
+    while remaining.len() > VARLEN_BLOCK_SIZE {
+        // Full block, continuation marker 0xFF (then XORed if descending).
+        let block = &remaining[..VARLEN_BLOCK_SIZE];
+        for (i, &b) in block.iter().enumerate() {
+            out[written + i] = b ^ xor;
+        }
+        out[written + VARLEN_BLOCK_SIZE] = 0xFF ^ xor;
+        written += VARLEN_BLOCK_TOTAL;
+        remaining = &remaining[VARLEN_BLOCK_SIZE..];
+    }
+    // Final partial block: pad with zeros, last byte = remaining.len() (1..=32).
+    let n = remaining.len();
+    for (i, &b) in remaining.iter().enumerate() {
+        out[written + i] = b ^ xor;
+    }
+    for j in n..VARLEN_BLOCK_SIZE {
+        out[written + j] = xor;
+    }
+    out[written + VARLEN_BLOCK_SIZE] = (n as u8) ^ xor;
+    written += VARLEN_BLOCK_TOTAL;
+    written as u32
+}
+
 /// Internal trait for encoding a fixed-width native value into byte slots.
 ///
 /// Implementations must produce a sequence of `size_of::<Self>()` bytes that is
@@ -507,6 +585,27 @@ pub fn encode_scalar_primitive(
     Ok(())
 }
 
+/// Encode a single varlen value into a buffer.
+pub fn encode_scalar_varlen(value: Option<&[u8]>, field: SortField, out: &mut ByteBufferMut) {
+    match value {
+        None => out.push(field.null_sentinel()),
+        Some(bytes) => {
+            out.push(field.non_null_sentinel());
+            let needed = if bytes.is_empty() {
+                1
+            } else {
+                bytes.len().div_ceil(VARLEN_BLOCK_SIZE) * VARLEN_BLOCK_TOTAL
+            };
+            let start = out.len();
+            for _ in 0..needed {
+                out.push(0);
+            }
+            let written = encode_varlen_value(bytes, &mut out[start..], field.descending);
+            debug_assert_eq!(written as usize, needed);
+        }
+    }
+}
+
 /// Encode a single boolean value.
 pub fn encode_scalar_bool(value: Option<bool>, field: SortField, out: &mut ByteBufferMut) {
     match value {
@@ -546,6 +645,7 @@ pub fn encoded_size_for_scalar(
                 let vt = DecimalType::smallest_decimal_value_type(dt);
                 Ok(encoded_size_for_fixed(vt.byte_width() as u32))
             }
+            DType::Utf8(_) | DType::Binary(_) => Ok(1),
             _ => vortex_bail!(
                 "unsupported scalar dtype for row encoding: {}",
                 scalar.dtype()
@@ -564,6 +664,18 @@ pub fn encoded_size_for_scalar(
                     .unwrap_or(DecimalType::I128);
                 Ok(encoded_size_for_fixed(vt.byte_width() as u32))
             }
+            DType::Utf8(_) => {
+                let bs = scalar
+                    .as_utf8()
+                    .value()
+                    .map(|s| s.as_str().len())
+                    .unwrap_or(0);
+                Ok(encoded_size_for_varlen(bs))
+            }
+            DType::Binary(_) => {
+                let bs = scalar.as_binary().value().map(|b| b.len()).unwrap_or(0);
+                Ok(encoded_size_for_varlen(bs))
+            }
             _ => vortex_bail!(
                 "unsupported scalar dtype for row encoding: {}",
                 scalar.dtype()
@@ -601,6 +713,7 @@ pub fn encode_scalar(
                     out.push(0);
                 }
             }
+            DType::Utf8(_) | DType::Binary(_) => out.push(field.null_sentinel()),
             _ => vortex_bail!(
                 "unsupported scalar dtype for row encoding: {}",
                 scalar.dtype()
@@ -657,6 +770,16 @@ pub fn encode_scalar(
                     }
                 }
             }
+            DType::Utf8(_) => {
+                let v = scalar.as_utf8();
+                let bytes = v.value().map(|s| s.as_str().as_bytes()).unwrap_or(&[]);
+                encode_scalar_varlen(Some(bytes), field, &mut out);
+            }
+            DType::Binary(_) => {
+                let v = scalar.as_binary();
+                let bytes = v.value().map(|b| b.as_slice()).unwrap_or(&[]);
+                encode_scalar_varlen(Some(bytes), field, &mut out);
+            }
             _ => vortex_bail!(
                 "unsupported scalar dtype for row encoding: {}",
                 scalar.dtype()

From 570d358939f4d8e7919ea377773c0cfdd6637fe8 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:08:07 +0000
Subject: [PATCH 05/26] vortex-row: codec for nested canonical types

Extend the codec to handle Struct, FixedSizeList, and Extension
canonical variants. Each nested row encodes as `outer_sentinel | child
bytes...`; for null rows the child bytes are zero-filled after the
recursive encoders run so two null rows compare equal regardless of
which non-null values would have been written by the children.

`row_width_for_dtype` recurses through Struct fields and FSL elements
to return `Fixed(w)` when every leaf is fixed; otherwise `Variable`.
Extension delegates to its storage dtype. List remains `Variable` and
ListView still bails (the row encoder's output is itself a ListView, so
nested ListView isn't a near-term use case). Variant and Union bail
explicitly.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/src/codec.rs | 227 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 216 insertions(+), 11 deletions(-)

diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index 4f70d80e5ae..8468301e5b3 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -30,9 +30,15 @@ use vortex_array::ExecutionCtx;
 use vortex_array::accessor::ArrayAccessor;
 use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::DecimalArray;
+use vortex_array::arrays::ExtensionArray;
+use vortex_array::arrays::FixedSizeListArray;
 use vortex_array::arrays::NullArray;
 use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
 use vortex_array::arrays::VarBinViewArray;
+use vortex_array::arrays::extension::ExtensionArrayExt;
+use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt;
+use vortex_array::arrays::struct_::StructArrayExt;
 use vortex_array::dtype::DType;
 use vortex_array::dtype::DecimalType;
 use vortex_array::dtype::NativePType;
@@ -112,9 +118,28 @@ pub fn row_width_for_dtype(dtype: &DType) -> VortexResult<RowWidth> {
             )))
         }
         DType::Utf8(_) | DType::Binary(_) => Ok(RowWidth::Variable),
-        DType::Struct(..) | DType::FixedSizeList(..) | DType::List(..) | DType::Extension(..) => {
-            vortex_bail!("row encoding for {} is not yet supported", dtype)
+        DType::FixedSizeList(elem, n, _) => match row_width_for_dtype(elem)? {
+            // FSL is fixed iff its element type is fixed. Add a sentinel byte for the FSL
+            // itself, then `n` copies of the element width.
+            RowWidth::Fixed(w) => {
+                let body = w.saturating_mul(*n);
+                Ok(RowWidth::Fixed(body.saturating_add(1)))
+            }
+            RowWidth::Variable => Ok(RowWidth::Variable),
+        },
+        DType::Struct(fields, _) => {
+            // Struct is fixed iff all its fields are fixed; sum their widths plus a sentinel.
+            let mut total: u32 = 1; // outer sentinel
+            for field_dtype in fields.fields() {
+                match row_width_for_dtype(&field_dtype)? {
+                    RowWidth::Fixed(w) => total = total.saturating_add(w),
+                    RowWidth::Variable => return Ok(RowWidth::Variable),
+                }
+            }
+            Ok(RowWidth::Fixed(total))
         }
+        DType::List(..) => Ok(RowWidth::Variable),
+        DType::Extension(ext) => row_width_for_dtype(ext.storage_dtype()),
         DType::Variant(_) => {
             vortex_bail!("row encoding does not support Variant arrays (no well-defined ordering)")
         }
@@ -133,7 +158,7 @@ pub fn row_width_for_dtype(dtype: &DType) -> VortexResult<RowWidth> {
 /// variants land in later commits.
 pub fn field_size(
     canonical: &Canonical,
-    _field: SortField,
+    field: SortField,
     sizes: &mut [u32],
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
@@ -143,10 +168,10 @@ pub fn field_size(
         Canonical::Primitive(arr) => add_size_primitive(arr, sizes),
         Canonical::Decimal(arr) => add_size_decimal(arr, sizes),
         Canonical::VarBinView(arr) => add_size_varbinview(arr, sizes, ctx)?,
-        Canonical::Struct(_)
-        | Canonical::FixedSizeList(_)
-        | Canonical::Extension(_)
-        | Canonical::List(_) => vortex_bail!(
+        Canonical::Struct(arr) => add_size_struct(arr, field, sizes, ctx)?,
+        Canonical::FixedSizeList(arr) => add_size_fsl(arr, field, sizes, ctx)?,
+        Canonical::Extension(arr) => add_size_extension(arr, field, sizes, ctx)?,
+        Canonical::List(_) => vortex_bail!(
             "row encoding does not yet support canonical type {:?}",
             canonical.dtype()
         ),
@@ -177,10 +202,10 @@ pub fn field_encode(
         Canonical::Primitive(arr) => encode_primitive(arr, field, offsets, cursors, out, ctx)?,
         Canonical::Decimal(arr) => encode_decimal(arr, field, offsets, cursors, out, ctx)?,
         Canonical::VarBinView(arr) => encode_varbinview(arr, field, offsets, cursors, out, ctx)?,
-        Canonical::Struct(_)
-        | Canonical::FixedSizeList(_)
-        | Canonical::Extension(_)
-        | Canonical::List(_) => vortex_bail!(
+        Canonical::Struct(arr) => encode_struct(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::FixedSizeList(arr) => encode_fsl(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::Extension(arr) => encode_extension(arr, field, offsets, cursors, out, ctx)?,
+        Canonical::List(_) => vortex_bail!(
             "row encoding does not yet support canonical type {:?}",
             canonical.dtype()
         ),
@@ -234,6 +259,60 @@ fn add_size_varbinview(
     Ok(())
 }
 
+fn add_size_struct(
+    arr: &StructArray,
+    field: SortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // null sentinel: 1 byte per row.
+    for s in sizes.iter_mut() {
+        *s += 1;
+    }
+    // Each field adds its own per-row size.
+    for child in arr.iter_unmasked_fields() {
+        let canonical = child.clone().execute::<Canonical>(ctx)?;
+        field_size(&canonical, field, sizes, ctx)?;
+    }
+    Ok(())
+}
+
+fn add_size_fsl(
+    arr: &FixedSizeListArray,
+    field: SortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let n = arr.len();
+    debug_assert_eq!(n, sizes.len());
+    let list_size = arr.list_size() as usize;
+    let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+    debug_assert_eq!(elements.len(), n * list_size);
+    // Sizing: 1 sentinel + sum of element sizes (`list_size` per row).
+    // We compute element-wise sizes into a contiguous scratch buffer then reduce by row.
+    let mut elem_sizes = vec![0u32; n * list_size];
+    field_size(&elements, field, &mut elem_sizes, ctx)?;
+    for i in 0..n {
+        let mut sum: u32 = 1; // sentinel
+        let base = i * list_size;
+        for j in 0..list_size {
+            sum = sum.saturating_add(elem_sizes[base + j]);
+        }
+        sizes[i] += sum;
+    }
+    Ok(())
+}
+
+fn add_size_extension(
+    arr: &ExtensionArray,
+    field: SortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let storage = arr.storage_array().clone().execute::<Canonical>(ctx)?;
+    field_size(&storage, field, sizes, ctx)
+}
+
 fn encode_null(
     arr: &NullArray,
     field: SortField,
@@ -413,6 +492,132 @@ fn encode_varbinview(
     Ok(())
 }
 
+fn encode_struct(
+    arr: &StructArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let n = arr.len();
+    let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+
+    // First, write the sentinel for each row. We track the post-sentinel cursor offsets
+    // for the body in `body_cursors` (which start exactly at +1 of the input cursor).
+    // For null rows we additionally need to zero-fill the (uniform-width) field bytes,
+    // but because struct widths are variable in general, we record null indexes first
+    // and zero-fill after we know each row's contribution.
+    //
+    // To keep the implementation simple we:
+    //   1) advance the cursor past the sentinel,
+    //   2) recursively encode each field's bytes (the field encoders ignore nullness of
+    //      the struct, but use their own per-field nullness),
+    //   3) for null struct rows, overwrite the body bytes with zeros so the encoded form
+    //      depends only on the sentinel.
+    let body_start: Vec<u32> = (0..n).map(|i| col_offset[i] + 1).collect();
+    for i in 0..n {
+        let pos = (row_offsets[i] + col_offset[i]) as usize;
+        out[pos] = if mask.value(i) { non_null } else { null };
+        col_offset[i] += 1;
+    }
+
+    for child in arr.iter_unmasked_fields() {
+        let canonical = child.clone().execute::<Canonical>(ctx)?;
+        field_encode(&canonical, field, row_offsets, col_offset, out, ctx)?;
+    }
+
+    // Zero-fill body bytes of null rows (the field encoders may have written values).
+    for i in 0..n {
+        if !mask.value(i) {
+            let start = (row_offsets[i] + body_start[i]) as usize;
+            let end = (row_offsets[i] + col_offset[i]) as usize;
+            for b in &mut out[start..end] {
+                *b = 0;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+fn encode_fsl(
+    arr: &FixedSizeListArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let n = arr.len();
+    let list_size = arr.list_size() as usize;
+    let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+    debug_assert_eq!(elements.len(), n * list_size);
+
+    // Write sentinels and remember body start for null zero-fill.
+    let body_start: Vec<u32> = (0..n).map(|i| col_offset[i] + 1).collect();
+    for i in 0..n {
+        let pos = (row_offsets[i] + col_offset[i]) as usize;
+        out[pos] = if mask.value(i) { non_null } else { null };
+        col_offset[i] += 1;
+    }
+
+    // Encode all `n * list_size` elements into the body. Build a fresh
+    // (offsets, cursors) pair where each element gets one slot. Then sum bytes back
+    // into the parent col_offset.
+    let mut elem_sizes = vec![0u32; n * list_size];
+    field_size(&elements, field, &mut elem_sizes, ctx)?;
+    // Element offsets are sequential starting at each parent's current cursor position.
+    let mut elem_offsets = vec![0u32; n * list_size];
+    for i in 0..n {
+        let mut acc = row_offsets[i] + col_offset[i];
+        for j in 0..list_size {
+            elem_offsets[i * list_size + j] = acc;
+            acc = acc.saturating_add(elem_sizes[i * list_size + j]);
+        }
+    }
+    let mut elem_cursors = vec![0u32; n * list_size];
+    field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?;
+    // Advance the parent cursors by the total per-row element bytes.
+    for i in 0..n {
+        let mut sum: u32 = 0;
+        for j in 0..list_size {
+            sum = sum.saturating_add(elem_sizes[i * list_size + j]);
+        }
+        col_offset[i] = col_offset[i].saturating_add(sum);
+    }
+
+    // Zero-fill null bodies.
+    for i in 0..n {
+        if !mask.value(i) {
+            let start = (row_offsets[i] + body_start[i]) as usize;
+            let end = (row_offsets[i] + col_offset[i]) as usize;
+            for b in &mut out[start..end] {
+                *b = 0;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+fn encode_extension(
+    arr: &ExtensionArray,
+    field: SortField,
+    row_offsets: &[u32],
+    col_offset: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let storage = arr.storage_array().clone().execute::<Canonical>(ctx)?;
+    field_encode(&storage, field, row_offsets, col_offset, out, ctx)
+}
+
 /// Encode a variable-length byte slice into `out` in 32-byte blocks with
 /// continuation markers. Returns the number of bytes written.
 fn encode_varlen_value(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 {

From 5374f3b65c42c2c7bb7a646e61a956d12f1aef0c Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:09:31 +0000
Subject: [PATCH 06/26] vortex-row: compute_sizes helper and RowSize ScalarFn

Add the size-pass machinery used by both RowSize and the upcoming
RowEncode pipeline. `compute_sizes` walks the N input columns once,
classifying each via `row_width_for_dtype` and accumulating
fixed-width-prefix sums in `fixed_per_row` while pushing per-row sums
of variable-length columns into a lazily allocated `var_lengths` vec.

The classification result (`ColKind` + `SizePassResult`) is private to
the crate; RowEncode consumes it in a later commit to choose between
the arithmetic and cursor encode paths.

`RowSize` returns a `Struct { fixed: U32, var: U32 }` so callers can
read the per-row width without realizing the constant `fixed` slot as
a per-row buffer (it's a `ConstantArray`); the `var` slot is a
`ConstantArray(0)` when no varlen column is present.

`dispatch_size` is the fallback-only path for PR 1 (canonicalize, then
codec::field_size). The `RowSizeKernel` trait exists but is unused; per-
encoding fast paths and the inventory registry arrive in PR 3.

`initialize()` does NOT register RowSize yet - that lands once
RowEncode is in place, so the session-registered pair appears together.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/public-api.lock |  76 ++++++++++
 vortex-row/src/lib.rs      |   3 +
 vortex-row/src/size.rs     | 288 +++++++++++++++++++++++++++++++++++++
 3 files changed, 367 insertions(+)
 create mode 100644 vortex-row/src/size.rs

diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock
index 1afc1f05442..85985bf7521 100644
--- a/vortex-row/public-api.lock
+++ b/vortex-row/public-api.lock
@@ -182,6 +182,46 @@ impl core::marker::StructuralPartialEq for vortex_row::options::SortField
 
 pub const vortex_row::options::FIELDS_INLINE: usize
 
+pub mod vortex_row::size
+
+pub struct vortex_row::size::RowSize
+
+impl core::clone::Clone for vortex_row::size::RowSize
+
+pub fn vortex_row::size::RowSize::clone(&self) -> vortex_row::size::RowSize
+
+impl core::fmt::Debug for vortex_row::size::RowSize
+
+pub fn vortex_row::size::RowSize::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::size::RowSize
+
+pub type vortex_row::size::RowSize::Options = vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::size::RowSize::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity
+
+pub fn vortex_row::size::RowSize::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName
+
+pub fn vortex_row::size::RowSize::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult<Self::Options>
+
+pub fn vortex_row::size::RowSize::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::size::RowSize::id(&self) -> vortex_array::scalar_fn::ScalarFnId
+
+pub fn vortex_row::size::RowSize::is_fallible(&self, &Self::Options) -> bool
+
+pub fn vortex_row::size::RowSize::is_null_sensitive(&self, &Self::Options) -> bool
+
+pub fn vortex_row::size::RowSize::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<vortex_array::dtype::DType>
+
+pub fn vortex_row::size::RowSize::serialize(&self, &Self::Options) -> vortex_error::VortexResult<core::option::Option<alloc::vec::Vec<u8>>>
+
+pub trait vortex_row::size::RowSizeKernel: vortex_array::array::vtable::VTable
+
+pub fn vortex_row::size::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+pub fn vortex_row::size::dispatch_size(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
+
 pub struct vortex_row::RowEncodeOptions
 
 pub vortex_row::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]>
@@ -214,6 +254,38 @@ pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&sel
 
 impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions
 
+pub struct vortex_row::RowSize
+
+impl core::clone::Clone for vortex_row::size::RowSize
+
+pub fn vortex_row::size::RowSize::clone(&self) -> vortex_row::size::RowSize
+
+impl core::fmt::Debug for vortex_row::size::RowSize
+
+pub fn vortex_row::size::RowSize::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::size::RowSize
+
+pub type vortex_row::size::RowSize::Options = vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::size::RowSize::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity
+
+pub fn vortex_row::size::RowSize::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName
+
+pub fn vortex_row::size::RowSize::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult<Self::Options>
+
+pub fn vortex_row::size::RowSize::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::size::RowSize::id(&self) -> vortex_array::scalar_fn::ScalarFnId
+
+pub fn vortex_row::size::RowSize::is_fallible(&self, &Self::Options) -> bool
+
+pub fn vortex_row::size::RowSize::is_null_sensitive(&self, &Self::Options) -> bool
+
+pub fn vortex_row::size::RowSize::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<vortex_array::dtype::DType>
+
+pub fn vortex_row::size::RowSize::serialize(&self, &Self::Options) -> vortex_error::VortexResult<core::option::Option<alloc::vec::Vec<u8>>>
+
 pub struct vortex_row::SortField
 
 pub vortex_row::SortField::descending: bool
@@ -258,4 +330,8 @@ impl core::marker::Copy for vortex_row::options::SortField
 
 impl core::marker::StructuralPartialEq for vortex_row::options::SortField
 
+pub trait vortex_row::RowSizeKernel: vortex_array::array::vtable::VTable
+
+pub fn vortex_row::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
 pub fn vortex_row::initialize(&vortex_session::VortexSession)
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
index bdac4c8f48e..6f1d8fbeab3 100644
--- a/vortex-row/src/lib.rs
+++ b/vortex-row/src/lib.rs
@@ -8,9 +8,12 @@
 
 pub mod codec;
 pub mod options;
+pub mod size;
 
 pub use options::RowEncodeOptions;
 pub use options::SortField;
+pub use size::RowSize;
+pub use size::RowSizeKernel;
 use vortex_session::VortexSession;
 
 /// Register the row-encoding scalar functions on the given session.
diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
new file mode 100644
index 00000000000..fbde52e1863
--- /dev/null
+++ b/vortex-row/src/size.rs
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! `RowSize` variadic scalar function: aggregate per-row byte sizes for N input columns.
+
+use std::sync::Arc;
+
+use vortex_array::ArrayRef;
+use vortex_array::ArrayView;
+use vortex_array::Canonical;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::VTable;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::FieldName;
+use vortex_array::dtype::FieldNames;
+use vortex_array::dtype::Nullability;
+use vortex_array::dtype::PType;
+use vortex_array::dtype::StructFields;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::Arity;
+use vortex_array::scalar_fn::ChildName;
+use vortex_array::scalar_fn::ExecutionArgs;
+use vortex_array::scalar_fn::ScalarFnId;
+use vortex_array::scalar_fn::ScalarFnVTable;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+use vortex_session::VortexSession;
+
+use crate::codec;
+use crate::codec::RowWidth;
+use crate::options::RowEncodeOptions;
+use crate::options::SortField;
+use crate::options::deserialize_row_encode_options;
+use crate::options::serialize_row_encode_options;
+
+/// Classification of a single input column for the size pass.
+///
+/// Tracks each column's within-row byte offset (the constant prefix from all preceding
+/// fixed-width columns) and, for fixed columns, whether any variable-length column has
+/// appeared yet — the encode pass uses this to choose between the arithmetic-write fast
+/// path (no varlen before this column, so the within-row position is constant) and the
+/// cursor-write path.
+#[derive(Clone, Copy, Debug)]
+#[allow(
+    dead_code,
+    reason = "fields read by the RowEncode pipeline in a later commit"
+)]
+pub(crate) enum ColKind {
+    /// Column has fixed width `width`. `prefix` is the within-row byte offset of this
+    /// column's first byte. If `before_varlen` is true, no variable-length column precedes
+    /// this one, so the within-row offset is constant for every row.
+    Fixed {
+        width: u32,
+        prefix: u32,
+        before_varlen: bool,
+    },
+    /// Column has variable per-row width. `fixed_prefix` is the sum of widths of all
+    /// preceding fixed columns; the varlen contribution from earlier varlen columns is
+    /// added per row.
+    Variable { fixed_prefix: u32 },
+}
+
+/// Result of the size pass: enough information for both [`RowSize::execute`] and the
+/// downstream [`RowEncode`](super::encode::RowEncode) pipeline.
+pub(crate) struct SizePassResult {
+    pub fixed_per_row: u32,
+    pub var_lengths: Option<Vec<u32>>,
+    pub col_kinds: Vec<ColKind>,
+    pub first_varlen_idx: Option<usize>,
+    pub columns: Vec<ArrayRef>,
+}
+
+/// Walk N input columns once, classifying each as fixed-width or variable-length and
+/// accumulating per-row size contributions.
+///
+/// Fixed-width columns contribute a single scalar increment to `fixed_per_row`; they do
+/// not touch `var_lengths`. Variable-length columns add per-row contributions into the
+/// lazily-allocated `var_lengths` vec via [`dispatch_size`].
+///
+/// This is shared by [`RowSize::execute`] (which wraps the result into a
+/// `Struct { fixed, var }`) and the [`RowEncode`](super::encode::RowEncode) pipeline
+/// (which uses the full result, including `col_kinds`, to drive the encode pass).
+pub(crate) fn compute_sizes(
+    options: &RowEncodeOptions,
+    args: &dyn ExecutionArgs,
+    ctx: &mut ExecutionCtx,
+    op_name: &'static str,
+) -> VortexResult<SizePassResult> {
+    let n_inputs = args.num_inputs();
+    if n_inputs == 0 {
+        vortex_bail!("{} requires at least one input column", op_name);
+    }
+    if options.fields.len() != n_inputs {
+        vortex_bail!(
+            "{} options.fields.len()={} does not match num_inputs={}",
+            op_name,
+            options.fields.len(),
+            n_inputs
+        );
+    }
+    let nrows = args.row_count();
+
+    let mut columns: Vec<ArrayRef> = Vec::with_capacity(n_inputs);
+    let mut col_kinds: Vec<ColKind> = Vec::with_capacity(n_inputs);
+    let mut fixed_per_row: u32 = 0;
+    let mut var_lengths: Option<Vec<u32>> = None;
+    let mut first_varlen_idx: Option<usize> = None;
+    let mut running_fixed_prefix: u32 = 0;
+
+    for i in 0..n_inputs {
+        let col = args.get(i)?;
+        if col.len() != nrows {
+            vortex_bail!(
+                "{}: column {} has length {} but expected {}",
+                op_name,
+                i,
+                col.len(),
+                nrows
+            );
+        }
+        match codec::row_width_for_dtype(col.dtype())? {
+            RowWidth::Fixed(w) => {
+                col_kinds.push(ColKind::Fixed {
+                    width: w,
+                    prefix: running_fixed_prefix,
+                    before_varlen: first_varlen_idx.is_none(),
+                });
+                fixed_per_row = fixed_per_row
+                    .checked_add(w)
+                    .vortex_expect("row width overflow");
+                running_fixed_prefix = running_fixed_prefix
+                    .checked_add(w)
+                    .vortex_expect("row width overflow");
+            }
+            RowWidth::Variable => {
+                if first_varlen_idx.is_none() {
+                    first_varlen_idx = Some(i);
+                }
+                let v = var_lengths.get_or_insert_with(|| vec![0u32; nrows]);
+                dispatch_size(&col, options.fields[i], v, ctx)?;
+                col_kinds.push(ColKind::Variable {
+                    fixed_prefix: running_fixed_prefix,
+                });
+            }
+        }
+        columns.push(col);
+    }
+
+    Ok(SizePassResult {
+        fixed_per_row,
+        var_lengths,
+        col_kinds,
+        first_varlen_idx,
+        columns,
+    })
+}
+
+/// Variadic scalar function that, given N input columns and per-column [`SortField`]s,
+/// returns a `Struct { fixed: U32, var: U32 }` array of per-row byte sizes for the
+/// row-oriented encoding produced by [`RowEncode`](super::encode::RowEncode).
+///
+/// The `fixed` field is always a [`ConstantArray`] holding the sum of the per-column
+/// constant widths of fixed-width inputs (sentinel + value bytes). The `var` field is a
+/// `ConstantArray(0)` when there are no variable-length input columns, and a
+/// [`PrimitiveArray<u32>`] of per-row varlen-byte sums otherwise.
+///
+/// The total per-row byte size is `fixed + var`.
+#[derive(Clone, Debug)]
+pub struct RowSize;
+
+/// Returns the [`FieldNames`] used by the [`RowSize`] output struct.
+pub(crate) fn row_size_field_names() -> FieldNames {
+    FieldNames::from([FieldName::from("fixed"), FieldName::from("var")])
+}
+
+/// Returns the output [`DType`] of [`RowSize`].
+pub(crate) fn row_size_struct_dtype() -> DType {
+    DType::Struct(
+        StructFields::new(
+            row_size_field_names(),
+            vec![
+                DType::Primitive(PType::U32, Nullability::NonNullable),
+                DType::Primitive(PType::U32, Nullability::NonNullable),
+            ],
+        ),
+        Nullability::NonNullable,
+    )
+}
+
+impl ScalarFnVTable for RowSize {
+    type Options = RowEncodeOptions;
+
+    fn id(&self) -> ScalarFnId {
+        ScalarFnId::from("vortex.row_size")
+    }
+
+    fn serialize(&self, options: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
+        Ok(Some(serialize_row_encode_options(options)))
+    }
+
+    fn deserialize(
+        &self,
+        metadata: &[u8],
+        _session: &VortexSession,
+    ) -> VortexResult<Self::Options> {
+        deserialize_row_encode_options(metadata)
+    }
+
+    fn arity(&self, _options: &Self::Options) -> Arity {
+        Arity::Variadic { min: 1, max: None }
+    }
+
+    fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName {
+        ChildName::from(Arc::from(format!("col_{}", child_idx)))
+    }
+
+    fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult<DType> {
+        Ok(row_size_struct_dtype())
+    }
+
+    fn execute(
+        &self,
+        options: &Self::Options,
+        args: &dyn ExecutionArgs,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<ArrayRef> {
+        let nrows = args.row_count();
+        let result = compute_sizes(options, args, ctx, "RowSize")?;
+        let fixed_array =
+            ConstantArray::new(Scalar::from(result.fixed_per_row), nrows).into_array();
+        let var_array = match result.var_lengths {
+            Some(v) => PrimitiveArray::new(Buffer::<u32>::copy_from(&v), Validity::NonNullable)
+                .into_array(),
+            None => ConstantArray::new(Scalar::from(0u32), nrows).into_array(),
+        };
+        Ok(StructArray::try_new(
+            row_size_field_names(),
+            vec![fixed_array, var_array],
+            nrows,
+            Validity::NonNullable,
+        )?
+        .into_array())
+    }
+
+    fn is_null_sensitive(&self, _options: &Self::Options) -> bool {
+        true
+    }
+
+    fn is_fallible(&self, _options: &Self::Options) -> bool {
+        false
+    }
+}
+
+/// Dispatch a single column's per-row size contribution.
+///
+/// For PR 1 this is just the canonicalize-then-`codec::field_size` fallback path. In-crate
+/// fast paths for `Constant`/`Dict`/`Patched` and the inventory-based registry for
+/// downstream encodings are added in PR 3.
+pub fn dispatch_size(
+    col: &ArrayRef,
+    field: SortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let canonical = col.clone().execute::<Canonical>(ctx)?;
+    codec::field_size(&canonical, field, sizes, ctx)
+}
+
+/// Mutate-buffer kernel: add this column's per-row byte contribution into the shared
+/// `sizes` slice. Return `Ok(None)` to decline and fall back to the canonical path.
+///
+/// Trait is defined now; per-encoding impls and dispatch wiring land in PR 3.
+pub trait RowSizeKernel: VTable {
+    /// Add this column's per-row byte contribution into `sizes`.
+    fn row_size_contribution(
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        sizes: &mut [u32],
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>>;
+}

From 40783a62bcfdc5b5d92782144c101cd577801d11 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:11:06 +0000
Subject: [PATCH 07/26] vortex-row: RowEncode ScalarFn

Add the RowEncode variadic scalar function: encode N input columns into
a single ListView<u8> in a five-phase pipeline.

  Phase 1: size pass via `compute_sizes`.
  Phase 2: allocate a zero-initialized output buffer sized to fit every
           row's encoded bytes; bail if the total exceeds u32::MAX.
  Phase 3: build per-row `listview_offsets`: i * fixed_per_row for the
           pure-fixed case, or i * fixed_per_row + exclusive cumsum of
           varlen lengths otherwise. Uses the simple `Vec::push` +
           `checked_add` loop.
  Phase 4: walk columns left-to-right and call `dispatch_encode` for
           every column (cursor path for all). Each call writes its
           per-row bytes at `offsets[i] + cursors[i]` and advances the
           cursor.
  Phase 5: build the ListView<u8> via the validating `try_new`
           constructor.

`dispatch_encode` is the canonicalize-then-`codec::field_encode`
fallback; in-crate kernel arms and the inventory registry land in PR 3.
The `RowEncodeKernel` trait is defined but unused. PR 2 will iterate
on this pipeline (skip zero-init, skip ListView validation, auto-
vectorize the offsets loop, etc.).

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/public-api.lock |  76 ++++++++++++
 vortex-row/src/encode.rs   | 238 +++++++++++++++++++++++++++++++++++++
 vortex-row/src/lib.rs      |   3 +
 vortex-row/src/size.rs     |   8 ++
 4 files changed, 325 insertions(+)
 create mode 100644 vortex-row/src/encode.rs

diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock
index 85985bf7521..f999303948d 100644
--- a/vortex-row/public-api.lock
+++ b/vortex-row/public-api.lock
@@ -102,6 +102,46 @@ pub fn vortex_row::codec::field_size(&vortex_array::canonical::Canonical, vortex
 
 pub fn vortex_row::codec::row_width_for_dtype(&vortex_array::dtype::DType) -> vortex_error::VortexResult<vortex_row::codec::RowWidth>
 
+pub mod vortex_row::encode
+
+pub struct vortex_row::encode::RowEncode
+
+impl core::clone::Clone for vortex_row::encode::RowEncode
+
+pub fn vortex_row::encode::RowEncode::clone(&self) -> vortex_row::encode::RowEncode
+
+impl core::fmt::Debug for vortex_row::encode::RowEncode
+
+pub fn vortex_row::encode::RowEncode::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::encode::RowEncode
+
+pub type vortex_row::encode::RowEncode::Options = vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::encode::RowEncode::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity
+
+pub fn vortex_row::encode::RowEncode::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName
+
+pub fn vortex_row::encode::RowEncode::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult<Self::Options>
+
+pub fn vortex_row::encode::RowEncode::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::encode::RowEncode::id(&self) -> vortex_array::scalar_fn::ScalarFnId
+
+pub fn vortex_row::encode::RowEncode::is_fallible(&self, &Self::Options) -> bool
+
+pub fn vortex_row::encode::RowEncode::is_null_sensitive(&self, &Self::Options) -> bool
+
+pub fn vortex_row::encode::RowEncode::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<vortex_array::dtype::DType>
+
+pub fn vortex_row::encode::RowEncode::serialize(&self, &Self::Options) -> vortex_error::VortexResult<core::option::Option<alloc::vec::Vec<u8>>>
+
+pub trait vortex_row::encode::RowEncodeKernel: vortex_array::array::vtable::VTable
+
+pub fn vortex_row::encode::RowEncodeKernel::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+pub fn vortex_row::encode::dispatch_encode(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
+
 pub mod vortex_row::options
 
 pub struct vortex_row::options::RowEncodeOptions
@@ -222,6 +262,38 @@ pub fn vortex_row::size::RowSizeKernel::row_size_contribution(vortex_array::arra
 
 pub fn vortex_row::size::dispatch_size(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
 
+pub struct vortex_row::RowEncode
+
+impl core::clone::Clone for vortex_row::encode::RowEncode
+
+pub fn vortex_row::encode::RowEncode::clone(&self) -> vortex_row::encode::RowEncode
+
+impl core::fmt::Debug for vortex_row::encode::RowEncode
+
+pub fn vortex_row::encode::RowEncode::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl vortex_array::scalar_fn::vtable::ScalarFnVTable for vortex_row::encode::RowEncode
+
+pub type vortex_row::encode::RowEncode::Options = vortex_row::options::RowEncodeOptions
+
+pub fn vortex_row::encode::RowEncode::arity(&self, &Self::Options) -> vortex_array::scalar_fn::vtable::Arity
+
+pub fn vortex_row::encode::RowEncode::child_name(&self, &Self::Options, usize) -> vortex_array::scalar_fn::vtable::ChildName
+
+pub fn vortex_row::encode::RowEncode::deserialize(&self, &[u8], &vortex_session::VortexSession) -> vortex_error::VortexResult<Self::Options>
+
+pub fn vortex_row::encode::RowEncode::execute(&self, &Self::Options, &dyn vortex_array::scalar_fn::vtable::ExecutionArgs, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::encode::RowEncode::id(&self) -> vortex_array::scalar_fn::ScalarFnId
+
+pub fn vortex_row::encode::RowEncode::is_fallible(&self, &Self::Options) -> bool
+
+pub fn vortex_row::encode::RowEncode::is_null_sensitive(&self, &Self::Options) -> bool
+
+pub fn vortex_row::encode::RowEncode::return_dtype(&self, &Self::Options, &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<vortex_array::dtype::DType>
+
+pub fn vortex_row::encode::RowEncode::serialize(&self, &Self::Options) -> vortex_error::VortexResult<core::option::Option<alloc::vec::Vec<u8>>>
+
 pub struct vortex_row::RowEncodeOptions
 
 pub vortex_row::RowEncodeOptions::fields: smallvec::SmallVec<[vortex_row::options::SortField; 4]>
@@ -330,6 +402,10 @@ impl core::marker::Copy for vortex_row::options::SortField
 
 impl core::marker::StructuralPartialEq for vortex_row::options::SortField
 
+pub trait vortex_row::RowEncodeKernel: vortex_array::array::vtable::VTable
+
+pub fn vortex_row::RowEncodeKernel::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
 pub trait vortex_row::RowSizeKernel: vortex_array::array::vtable::VTable
 
 pub fn vortex_row::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
new file mode 100644
index 00000000000..1b77d955964
--- /dev/null
+++ b/vortex-row/src/encode.rs
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![allow(
+    clippy::cast_possible_truncation,
+    reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32"
+)]
+
+//! `RowEncode` variadic scalar function: encode N input columns into a single `ListView<u8>`.
+//!
+//! The output's `(elements, offsets, sizes)` triple is built up in a single left-to-right
+//! pass over the input columns. The `sizes` array doubles as the per-row write cursor, so
+//! when the last column finishes encoding, the accumulator is the final array - no separate
+//! conversion step is needed.
+
+use std::sync::Arc;
+
+use vortex_array::ArrayRef;
+use vortex_array::ArrayView;
+use vortex_array::Canonical;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::VTable;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::Nullability;
+use vortex_array::dtype::PType;
+use vortex_array::scalar_fn::Arity;
+use vortex_array::scalar_fn::ChildName;
+use vortex_array::scalar_fn::ExecutionArgs;
+use vortex_array::scalar_fn::ScalarFnId;
+use vortex_array::scalar_fn::ScalarFnVTable;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_buffer::BufferMut;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+use vortex_session::VortexSession;
+
+use crate::codec;
+use crate::options::RowEncodeOptions;
+use crate::options::SortField;
+use crate::options::deserialize_row_encode_options;
+use crate::options::serialize_row_encode_options;
+use crate::size::compute_sizes;
+
+/// Variadic scalar function that encodes N input columns into a single `List<u8>`
+/// [`ListViewArray`] where row `i` contains the row-encoded bytes for column values
+/// `cols[0][i], cols[1][i], ...` concatenated left-to-right.
+#[derive(Clone, Debug)]
+pub struct RowEncode;
+
+impl ScalarFnVTable for RowEncode {
+    type Options = RowEncodeOptions;
+
+    fn id(&self) -> ScalarFnId {
+        ScalarFnId::from("vortex.row_encode")
+    }
+
+    fn serialize(&self, options: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
+        Ok(Some(serialize_row_encode_options(options)))
+    }
+
+    fn deserialize(
+        &self,
+        metadata: &[u8],
+        _session: &VortexSession,
+    ) -> VortexResult<Self::Options> {
+        deserialize_row_encode_options(metadata)
+    }
+
+    fn arity(&self, _options: &Self::Options) -> Arity {
+        Arity::Variadic { min: 1, max: None }
+    }
+
+    fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName {
+        ChildName::from(Arc::from(format!("col_{}", child_idx)))
+    }
+
+    fn return_dtype(&self, _options: &Self::Options, _args: &[DType]) -> VortexResult<DType> {
+        Ok(DType::List(
+            Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)),
+            Nullability::NonNullable,
+        ))
+    }
+
+    fn execute(
+        &self,
+        options: &Self::Options,
+        args: &dyn ExecutionArgs,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<ArrayRef> {
+        execute_row_encode(options, args, ctx)
+    }
+
+    fn is_null_sensitive(&self, _options: &Self::Options) -> bool {
+        true
+    }
+
+    fn is_fallible(&self, _options: &Self::Options) -> bool {
+        false
+    }
+}
+
+fn execute_row_encode(
+    options: &RowEncodeOptions,
+    args: &dyn ExecutionArgs,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ArrayRef> {
+    let nrows = args.row_count();
+
+    // ===== Phase 1: classify + size pass =====
+    let crate::size::SizePassResult {
+        fixed_per_row,
+        var_lengths,
+        col_kinds: _,
+        first_varlen_idx: _,
+        columns,
+    } = compute_sizes(options, args, ctx, "RowEncode")?;
+
+    // ===== Phase 2: totals + buffer =====
+    let var_total: u64 = var_lengths
+        .as_ref()
+        .map_or(0, |v| v.iter().map(|&x| u64::from(x)).sum());
+    let total: u64 = (nrows as u64)
+        .checked_mul(u64::from(fixed_per_row))
+        .and_then(|t| t.checked_add(var_total))
+        .vortex_expect("row-encoded total bytes overflow");
+    if total > u32::MAX as u64 {
+        vortex_bail!("row-encoded output size {} bytes exceeds u32::MAX", total);
+    }
+    let total_len = total as usize;
+
+    // Allocate the elements buffer (zero-initialized). The zero-init lets every encoder
+    // assume previously-untouched bytes are zero, simplifying the null-row fill paths.
+    // PR 2 skips this memset because every byte in the output range is written by some
+    // encoder.
+    let mut out_buf: BufferMut<u8> = BufferMut::with_capacity(total_len);
+    out_buf.push_n(0u8, total_len);
+
+    // ===== Phase 3: per-row offsets =====
+    // listview_offsets[i] is the absolute byte offset where row `i` begins.
+    // For pure-fixed: i * fixed_per_row.
+    // For mixed: i * fixed_per_row + exclusive prefix sum of var_lengths.
+    let mut listview_offsets: Vec<u32> = Vec::with_capacity(nrows);
+    match var_lengths.as_ref() {
+        None => {
+            for i in 0..nrows {
+                listview_offsets.push(
+                    (i as u32)
+                        .checked_mul(fixed_per_row)
+                        .vortex_expect("row offset overflow (already validated total fits in u32)"),
+                );
+            }
+        }
+        Some(v) => {
+            let mut acc: u32 = 0;
+            for (i, &l) in v.iter().enumerate() {
+                let off = (i as u32)
+                    .checked_mul(fixed_per_row)
+                    .and_then(|t| t.checked_add(acc))
+                    .vortex_expect("row offset overflow");
+                listview_offsets.push(off);
+                acc = acc.checked_add(l).vortex_expect("varlen prefix overflow");
+            }
+        }
+    }
+
+    // Per-row write cursor (also doubles as the ListView `sizes` slot when done).
+    let mut row_cursors = vec![0u32; nrows];
+
+    // ===== Phase 4: encode columns via the cursor path =====
+    for (i, col) in columns.iter().enumerate() {
+        dispatch_encode(
+            col,
+            options.fields[i],
+            &listview_offsets,
+            &mut row_cursors,
+            &mut out_buf,
+            ctx,
+        )?;
+    }
+
+    // ===== Phase 5: build ListView output =====
+    let elements = PrimitiveArray::new(out_buf.freeze(), Validity::NonNullable).into_array();
+    let offsets_arr = PrimitiveArray::new(
+        Buffer::<u32>::copy_from(&listview_offsets),
+        Validity::NonNullable,
+    )
+    .into_array();
+    let sizes_arr = PrimitiveArray::new(
+        Buffer::<u32>::copy_from(&row_cursors),
+        Validity::NonNullable,
+    )
+    .into_array();
+    Ok(
+        ListViewArray::try_new(elements, offsets_arr, sizes_arr, Validity::NonNullable)?
+            .into_array(),
+    )
+}
+
+/// Dispatch a single column's encoding into the shared `out` buffer.
+///
+/// For PR 1 this is just the canonicalize-then-`codec::field_encode` fallback path.
+/// In-crate fast paths for `Constant`/`Dict`/`Patched` and the inventory-based registry
+/// for downstream encodings are added in PR 3.
+pub fn dispatch_encode(
+    col: &ArrayRef,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let canonical = col.clone().execute::<Canonical>(ctx)?;
+    codec::field_encode(&canonical, field, offsets, cursors, out, ctx)
+}
+
+/// Mutate-buffer kernel: write this column's per-row bytes into `out` at
+/// `offsets[i] + cursors[i]`, advancing `cursors[i]` by the bytes written.
+///
+/// Return `Ok(None)` to decline and fall back to the canonical path.
+///
+/// Trait is defined now; per-encoding impls and dispatch wiring land in PR 3.
+pub trait RowEncodeKernel: VTable {
+    /// Write this column's per-row bytes into `out` at `offsets[i] + cursors[i]`, advancing
+    /// `cursors[i]` by the bytes written.
+    fn row_encode_into(
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        offsets: &[u32],
+        cursors: &mut [u32],
+        out: &mut [u8],
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>>;
+}
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
index 6f1d8fbeab3..ef0209f3d9c 100644
--- a/vortex-row/src/lib.rs
+++ b/vortex-row/src/lib.rs
@@ -7,9 +7,12 @@
 //! This commit only establishes the crate skeleton and an `initialize` stub.
 
 pub mod codec;
+pub mod encode;
 pub mod options;
 pub mod size;
 
+pub use encode::RowEncode;
+pub use encode::RowEncodeKernel;
 pub use options::RowEncodeOptions;
 pub use options::SortField;
 pub use size::RowSize;
diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
index fbde52e1863..7148a2a21d8 100644
--- a/vortex-row/src/size.rs
+++ b/vortex-row/src/size.rs
@@ -72,7 +72,15 @@ pub(crate) enum ColKind {
 pub(crate) struct SizePassResult {
     pub fixed_per_row: u32,
     pub var_lengths: Option<Vec<u32>>,
+    #[allow(
+        dead_code,
+        reason = "consumed by the arithmetic-write fast path added in PR 2"
+    )]
     pub col_kinds: Vec<ColKind>,
+    #[allow(
+        dead_code,
+        reason = "consumed by the arithmetic-write fast path added in PR 2"
+    )]
     pub first_varlen_idx: Option<usize>,
     pub columns: Vec<ArrayRef>,
 }

From 87febfe4a71de78b52c0dd15917ca78390524889 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:16:19 +0000
Subject: [PATCH 08/26] vortex-row: convert_columns + tests + bench scaffolding

Wire the RowSize/RowEncode scalar functions to the user-facing API:

- `convert_columns` accepts a slice of input arrays and per-column
  SortFields, constructs `RowEncodeOptions` + `VecExecutionArgs`, and
  returns the encoded `ListViewArray<u8>`.
- `compute_row_sizes` returns just the per-row sizes (the `Struct
  { fixed: u32, var: u32 }` output of `RowSize`).
- `initialize()` now registers `RowSize` and `RowEncode` on the given
  session so they are reachable via the expression layer.

Tests cover sort-order round-trips for bool, primitive (i64 asc/desc,
u32, f64), utf8, multi-column, nulls_first/last, struct sort-order, the
single-buffer invariant of the ListView output, and the structural
shape of `RowSize`. Tests that exercise per-encoding fast paths
(`constant_path_matches_canonical`, `dict_path_matches_canonical`) land
together with their respective kernels in PR 3.

The bench file uses divan + mimalloc and reports throughput in GB/s of
encoded output bytes for primitive_i64, utf8, and struct_mixed. Each
has an `arrow_row` baseline and a `vortex` measurement. Per-encoding
fast-path scenarios (constant/dict/patched/bitpacked/for/delta) gain
their triplets in PR 3.

Baseline measurements at this commit (sample-count=10):
  primitive_i64_vortex  ~1.97 GB/s  (vs arrow-row 4.12 GB/s)
  utf8_vortex           ~0.87 GB/s  (vs arrow-row 1.56 GB/s)
  struct_mixed_vortex   ~0.95 GB/s  (vs arrow-row 1.19 GB/s)

PR 2 closes most of the gap by replacing the validating
`ListViewArray::try_new` with `new_unchecked`, skipping the buffer
zero-init, auto-vectorizing the offsets and varlen-block paths, etc.

Signed-off-by: Claude <noreply@anthropic.com>
---
 Cargo.lock                       |   7 +
 Cargo.toml                       |   3 +-
 vortex-row/Cargo.toml            |  14 ++
 vortex-row/benches/row_encode.rs | 177 +++++++++++++++++
 vortex-row/public-api.lock       |  10 +
 vortex-row/src/convert.rs        |  75 +++++++
 vortex-row/src/lib.rs            |  41 +++-
 vortex-row/src/tests.rs          | 324 +++++++++++++++++++++++++++++++
 8 files changed, 645 insertions(+), 6 deletions(-)
 create mode 100644 vortex-row/benches/row_encode.rs
 create mode 100644 vortex-row/src/convert.rs
 create mode 100644 vortex-row/src/tests.rs

diff --git a/Cargo.lock b/Cargo.lock
index 0b84f6dd260..23ca027d592 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11038,7 +11038,14 @@ dependencies = [
 name = "vortex-row"
 version = "0.1.0"
 dependencies = [
+ "arrow-array 58.2.0",
+ "arrow-row 58.2.0",
+ "arrow-schema 58.2.0",
  "bytes",
+ "codspeed-divan-compat",
+ "mimalloc",
+ "rand 0.10.1",
+ "rstest",
  "smallvec",
  "vortex-array",
  "vortex-buffer",
diff --git a/Cargo.toml b/Cargo.toml
index 9fae5b564bf..cdf28137563 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,12 +7,12 @@ members = [
     "vortex-mask",
     "vortex-utils",
     "vortex-session",
-    "vortex-row",
     "vortex-flatbuffers",
     "vortex-metrics",
     "vortex-io",
     "vortex-proto",
     "vortex-array",
+    "vortex-row",
     "vortex-tensor",
     "vortex-turboquant",
     "vortex-compressor",
@@ -103,6 +103,7 @@ arrow-cast = "58"
 arrow-data = "58"
 arrow-ipc = "58"
 arrow-ord = "58"
+arrow-row = "58"
 arrow-schema = "58"
 arrow-select = "58"
 arrow-string = "58"
diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml
index aaed9a55f51..50d6547474a 100644
--- a/vortex-row/Cargo.toml
+++ b/vortex-row/Cargo.toml
@@ -24,3 +24,17 @@ vortex-buffer = { workspace = true }
 vortex-error = { workspace = true }
 vortex-mask = { workspace = true }
 vortex-session = { workspace = true }
+
+[dev-dependencies]
+arrow-array = { workspace = true }
+arrow-row = { workspace = true }
+arrow-schema = { workspace = true }
+divan = { workspace = true }
+mimalloc = { workspace = true }
+rand = { workspace = true }
+rstest = { workspace = true }
+vortex-array = { workspace = true, features = ["_test-harness"] }
+
+[[bench]]
+name = "row_encode"
+harness = false
diff --git a/vortex-row/benches/row_encode.rs b/vortex-row/benches/row_encode.rs
new file mode 100644
index 00000000000..8d631d785da
--- /dev/null
+++ b/vortex-row/benches/row_encode.rs
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![expect(
+    clippy::unwrap_used,
+    clippy::clone_on_ref_ptr,
+    clippy::cloned_ref_to_slice_refs,
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    clippy::redundant_clone
+)]
+
+//! Row-encode throughput benchmarks comparing `arrow-row` against vortex's `convert_columns`
+//! for the canonical scenarios shipped in PR 1: a primitive i64 column, a Utf8 column,
+//! and a mixed-field struct. Per-encoding fast paths (Constant, Dict, Patched, BitPacked,
+//! FoR, Delta) gain their own triplets in PR 3.
+
+use std::sync::Arc;
+
+use arrow_array::Int64Array;
+use arrow_array::StringArray;
+use arrow_array::StructArray as ArrowStructArray;
+use arrow_row::RowConverter;
+use arrow_row::SortField as ArrowSortField;
+use arrow_schema::DataType;
+use arrow_schema::Field;
+use divan::counter::BytesCount;
+use mimalloc::MiMalloc;
+use rand::RngExt;
+use rand::SeedableRng;
+use rand::distr::Alphanumeric;
+use rand::rngs::StdRng;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
+use vortex_array::arrays::VarBinViewArray;
+use vortex_row::SortField;
+use vortex_row::convert_columns;
+
+#[global_allocator]
+static GLOBAL: MiMalloc = MiMalloc;
+
+const N: usize = 100_000;
+
+fn main() {
+    divan::main();
+}
+
+fn gen_i64(n: usize, seed: u64) -> Vec<i64> {
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..n)
+        .map(|_| rng.random_range(i64::MIN..i64::MAX))
+        .collect()
+}
+
+fn gen_words(n: usize, mean_len: usize, seed: u64) -> Vec<String> {
+    let rng = &mut StdRng::seed_from_u64(seed);
+    (0..n)
+        .map(|_| {
+            let len = rng.random_range(mean_len.saturating_sub(4)..=mean_len + 4);
+            rng.sample_iter(&Alphanumeric)
+                .take(len)
+                .map(char::from)
+                .collect::<String>()
+        })
+        .collect()
+}
+
+// ---------- primitive_i64 ----------
+
+#[divan::bench]
+fn primitive_i64_arrow_row(bencher: divan::Bencher) {
+    let v = gen_i64(N, 0);
+    let arr = Arc::new(Int64Array::from(v.clone())) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int64)]).unwrap();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher
+        .counter(BytesCount::new(bytes))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn primitive_i64_vortex(bencher: divan::Bencher) {
+    let v = gen_i64(N, 0);
+    let col = PrimitiveArray::from_iter(v.clone()).into_array();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[col.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+// ---------- utf8 ----------
+
+#[divan::bench]
+fn utf8_arrow_row(bencher: divan::Bencher) {
+    let words = gen_words(N, 16, 7);
+    let total: u64 = words
+        .iter()
+        .map(|w| 1 + (w.len().div_ceil(32) * 33) as u64)
+        .sum();
+    let arr = Arc::new(StringArray::from(words.clone())) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Utf8)]).unwrap();
+    bencher
+        .counter(BytesCount::new(total))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn utf8_vortex(bencher: divan::Bencher) {
+    let words = gen_words(N, 16, 7);
+    let total: u64 = words
+        .iter()
+        .map(|w| 1 + (w.len().div_ceil(32) * 33) as u64)
+        .sum();
+    let col = VarBinViewArray::from_iter_str(words.iter().map(String::as_str)).into_array();
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[col.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+// ---------- struct_mixed ----------
+
+fn struct_mixed_inputs() -> (Vec<i64>, Vec<String>, u64) {
+    let ids = gen_i64(N, 1);
+    let names = gen_words(N, 16, 2);
+    // sentinel (1) + i64 (1+8=9) + utf8-name (1 + ceil(len/32)*33)
+    let total: u64 = (0..N)
+        .map(|i| {
+            let name_bytes = 1 + (names[i].len().div_ceil(32) * 33) as u64;
+            1u64 + 9u64 + name_bytes
+        })
+        .sum();
+    (ids, names, total)
+}
+
+#[divan::bench]
+fn struct_mixed_arrow_row(bencher: divan::Bencher) {
+    let (ids, names, total) = struct_mixed_inputs();
+    let id_arr = Arc::new(Int64Array::from(ids)) as arrow_array::ArrayRef;
+    let name_arr = Arc::new(StringArray::from(names)) as arrow_array::ArrayRef;
+    let arrow_struct = Arc::new(ArrowStructArray::from(vec![
+        (Arc::new(Field::new("id", DataType::Int64, false)), id_arr),
+        (
+            Arc::new(Field::new("name", DataType::Utf8, false)),
+            name_arr,
+        ),
+    ])) as arrow_array::ArrayRef;
+    let struct_fields = vec![
+        Arc::new(Field::new("id", DataType::Int64, false)),
+        Arc::new(Field::new("name", DataType::Utf8, false)),
+    ];
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Struct(
+        struct_fields.into(),
+    ))])
+    .unwrap();
+    bencher
+        .counter(BytesCount::new(total))
+        .bench_local(|| conv.convert_columns(&[arrow_struct.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn struct_mixed_vortex(bencher: divan::Bencher) {
+    let (ids, names, total) = struct_mixed_inputs();
+    let id_arr = PrimitiveArray::from_iter(ids).into_array();
+    let name_arr = VarBinViewArray::from_iter_str(names.iter().map(String::as_str)).into_array();
+    let struct_arr = StructArray::from_fields(&[("id", id_arr), ("name", name_arr)])
+        .unwrap()
+        .into_array();
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[struct_arr.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock
index f999303948d..ed231a1e556 100644
--- a/vortex-row/public-api.lock
+++ b/vortex-row/public-api.lock
@@ -102,6 +102,12 @@ pub fn vortex_row::codec::field_size(&vortex_array::canonical::Canonical, vortex
 
 pub fn vortex_row::codec::row_width_for_dtype(&vortex_array::dtype::DType) -> vortex_error::VortexResult<vortex_row::codec::RowWidth>
 
+pub mod vortex_row::convert
+
+pub fn vortex_row::convert::compute_row_sizes(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::convert::convert_columns(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::arrays::listview::vtable::ListViewArray>
+
 pub mod vortex_row::encode
 
 pub struct vortex_row::encode::RowEncode
@@ -410,4 +416,8 @@ pub trait vortex_row::RowSizeKernel: vortex_array::array::vtable::VTable
 
 pub fn vortex_row::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
 
+pub fn vortex_row::compute_row_sizes(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_row::convert_columns(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::arrays::listview::vtable::ListViewArray>
+
 pub fn vortex_row::initialize(&vortex_session::VortexSession)
diff --git a/vortex-row/src/convert.rs b/vortex-row/src/convert.rs
new file mode 100644
index 00000000000..c3b06d92748
--- /dev/null
+++ b/vortex-row/src/convert.rs
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! User-facing entry point: turn N columnar arrays into one row-encoded `ListView<u8>`.
+
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::scalar_fn::ScalarFnVTable;
+use vortex_array::scalar_fn::VecExecutionArgs;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+
+use crate::encode::RowEncode;
+use crate::options::RowEncodeOptions;
+use crate::options::SortField;
+use crate::size::RowSize;
+
+/// Convert N columnar arrays into a single row-oriented [`ListViewArray`] of `u8` whose
+/// bytes are lexicographically comparable in the same order as a tuple comparison of the
+/// input values according to `fields`.
+pub fn convert_columns(
+    cols: &[ArrayRef],
+    fields: &[SortField],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ListViewArray> {
+    if cols.len() != fields.len() {
+        vortex_bail!(
+            "convert_columns: cols.len() ({}) does not match fields.len() ({})",
+            cols.len(),
+            fields.len()
+        );
+    }
+    if cols.is_empty() {
+        vortex_bail!("convert_columns: at least one column is required");
+    }
+    let nrows = cols[0].len();
+    for (i, col) in cols.iter().enumerate() {
+        if col.len() != nrows {
+            vortex_bail!(
+                "convert_columns: column {} has length {} but expected {}",
+                i,
+                col.len(),
+                nrows
+            );
+        }
+    }
+
+    let options = RowEncodeOptions::new(fields.iter().copied());
+    let args = VecExecutionArgs::new(cols.to_vec(), nrows);
+    let result = RowEncode.execute(&options, &args, ctx)?;
+    result.execute::<ListViewArray>(ctx)
+}
+
+/// Compute only the per-row sizes (in bytes) of the row-encoded form for N columns.
+pub fn compute_row_sizes(
+    cols: &[ArrayRef],
+    fields: &[SortField],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ArrayRef> {
+    if cols.len() != fields.len() {
+        vortex_bail!(
+            "compute_row_sizes: cols.len() ({}) does not match fields.len() ({})",
+            cols.len(),
+            fields.len()
+        );
+    }
+    if cols.is_empty() {
+        vortex_bail!("compute_row_sizes: at least one column is required");
+    }
+    let nrows = cols[0].len();
+    let options = RowEncodeOptions::new(fields.iter().copied());
+    let args = VecExecutionArgs::new(cols.to_vec(), nrows);
+    RowSize.execute(&options, &args, ctx)
+}
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
index ef0209f3d9c..fddcca665c1 100644
--- a/vortex-row/src/lib.rs
+++ b/vortex-row/src/lib.rs
@@ -3,23 +3,54 @@
 
 //! Row-oriented byte encoder, analogous to Apache Arrow's `arrow-row` crate.
 //!
-//! Subsequent commits add the encoder, decoder helpers, and per-encoding fast paths.
-//! This commit only establishes the crate skeleton and an `initialize` stub.
+//! The encoder converts N columnar arrays into a single `List<u8>` array where each row's
+//! bytes are lexicographically comparable in the same order as a tuple comparison of the
+//! original values. This is useful for sorting, hashing into row containers, and other
+//! operations that benefit from a sort-friendly opaque byte representation of a multi-column
+//! key.
+//!
+//! Two variadic scalar functions drive the implementation:
+//! - [`RowSize`] computes per-row byte sizes across all N input columns.
+//! - [`RowEncode`] writes the row-encoded bytes into a single `ListView<u8>` accumulator
+//!   in one left-to-right pass.
+//!
+//! Each scalar function exposes a per-encoding fast-path trait
+//! ([`RowSizeKernel`] / [`RowEncodeKernel`]) for downstream encodings to plug into; PR 3
+//! adds in-crate impls for `Constant`, `Dict`, and `Patched` and an inventory-based
+//! registry for external encodings.
+//!
+//! The user-facing entry point is [`convert_columns`].
+//!
+//! Row-encoding scalar functions are not registered in the default
+//! [`VortexSession`]. Call [`initialize`] on a session to make `RowSize` and `RowEncode`
+//! available via the expression layer.
 
 pub mod codec;
+pub mod convert;
 pub mod encode;
 pub mod options;
 pub mod size;
 
+#[cfg(test)]
+mod tests;
+
+pub use convert::compute_row_sizes;
+pub use convert::convert_columns;
 pub use encode::RowEncode;
 pub use encode::RowEncodeKernel;
 pub use options::RowEncodeOptions;
 pub use options::SortField;
 pub use size::RowSize;
 pub use size::RowSizeKernel;
+use vortex_array::scalar_fn::session::ScalarFnSessionExt;
 use vortex_session::VortexSession;
 
-/// Register the row-encoding scalar functions on the given session.
+/// Register the row-encoding scalar functions ([`RowSize`] and [`RowEncode`]) on the given
+/// session.
 ///
-/// Currently a stub: subsequent commits register `RowSize` and `RowEncode` here.
-pub fn initialize(_session: &VortexSession) {}
+/// Call once on session construction if you want row encoding available via the expression
+/// layer or via [`convert_columns`].
+pub fn initialize(session: &VortexSession) {
+    session.scalar_fns().register(RowSize);
+    session.scalar_fns().register(RowEncode);
+}
diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs
new file mode 100644
index 00000000000..ff7d8fb274a
--- /dev/null
+++ b/vortex-row/src/tests.rs
@@ -0,0 +1,324 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![allow(
+    clippy::approx_constant,
+    clippy::cloned_ref_to_slice_refs,
+    clippy::redundant_clone,
+    reason = "tests value clarity over micro-optimization"
+)]
+
+//! Tests for the row encoder.
+
+use rstest::rstest;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::VarBinViewArray;
+use vortex_array::arrays::listview::ListViewArrayExt;
+use vortex_error::VortexResult;
+
+use crate::SortField;
+use crate::convert_columns;
+
+fn collect_row_bytes(array: &ListViewArray) -> Vec<Vec<u8>> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let nrows = array.len();
+    (0..nrows)
+        .map(|i| {
+            let slice = array.list_elements_at(i).unwrap();
+            let p = slice.execute::<PrimitiveArray>(&mut ctx).unwrap();
+            p.as_slice::<u8>().to_vec()
+        })
+        .collect()
+}
+
+/// Encode each column independently, sort the resulting row bytes, and check the permutation
+/// matches the natural sort order of `values`.
+fn assert_sort_order_i64(values: Vec<i64>, descending: bool) -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let field = SortField {
+        descending,
+        nulls_first: true,
+    };
+    let encoded = convert_columns(&[col], &[field], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    // Build expected permutation: sort values naturally then compare to bytes-sorted order.
+    let mut idx: Vec<usize> = (0..values.len()).collect();
+    if descending {
+        idx.sort_by(|a, b| values[*b].cmp(&values[*a]));
+    } else {
+        idx.sort_by(|a, b| values[*a].cmp(&values[*b]));
+    }
+    let expected_order: Vec<Vec<u8>> = idx.iter().map(|&i| rows[i].clone()).collect();
+
+    let mut sorted = rows.clone();
+    sorted.sort();
+    assert_eq!(
+        sorted, expected_order,
+        "Row-encoded bytes do not match natural sort order"
+    );
+    Ok(())
+}
+
+#[rstest]
+#[case::ascending(false)]
+#[case::descending(true)]
+fn primitive_i64_roundtrip(#[case] descending: bool) -> VortexResult<()> {
+    let values: Vec<i64> = vec![-5, 0, 5, i64::MIN, i64::MAX, 7, -7, 1];
+    assert_sort_order_i64(values, descending)
+}
+
+#[test]
+fn primitive_u32_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec<u32> = vec![0, 1, 100, u32::MAX, 42, 17];
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted_rows = rows.clone();
+    sorted_rows.sort();
+
+    let mut sorted_idx: Vec<usize> = (0..values.len()).collect();
+    sorted_idx.sort_by(|a, b| values[*a].cmp(&values[*b]));
+    let expected: Vec<Vec<u8>> = sorted_idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted_rows, expected);
+    Ok(())
+}
+
+#[test]
+fn primitive_f64_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // We use IEEE total-ordering semantics: -0.0 < +0.0 in the byte encoding (matches
+    // `arrow-row`). Avoid -0.0 in the natural-order baseline since partial_cmp says
+    // -0.0 == 0.0.
+    let values: Vec<f64> = vec![-1.5, 0.0, 1.5, f64::INFINITY, f64::NEG_INFINITY, 3.14];
+    let col = PrimitiveArray::from_iter(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted_rows = rows.clone();
+    sorted_rows.sort();
+
+    let mut sorted_idx: Vec<usize> = (0..values.len()).collect();
+    sorted_idx.sort_by(|a, b| values[*a].partial_cmp(&values[*b]).unwrap());
+    let expected: Vec<Vec<u8>> = sorted_idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted_rows, expected);
+    Ok(())
+}
+
+#[test]
+fn bool_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let col = BoolArray::from_iter([true, false, true, false]).into_array();
+    let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted = rows.clone();
+    sorted.sort();
+    // false rows come first (2x), true rows after (2x)
+    assert_eq!(sorted[0], rows[1]);
+    assert_eq!(sorted[1], rows[3]);
+    assert_eq!(sorted[2], rows[0]);
+    assert_eq!(sorted[3], rows[2]);
+    Ok(())
+}
+
+#[test]
+fn utf8_sort_order() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values = vec![
+        "banana",
+        "apple",
+        "",
+        "cherry",
+        "ban",
+        "banana_loaf_for_test",
+    ];
+    let col = VarBinViewArray::from_iter_str(values.clone()).into_array();
+    let encoded = convert_columns(&[col], &[SortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted = rows.clone();
+    sorted.sort();
+
+    let mut sorted_idx: Vec<usize> = (0..values.len()).collect();
+    sorted_idx.sort_by(|a, b| values[*a].cmp(values[*b]));
+    let expected: Vec<Vec<u8>> = sorted_idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted, expected);
+    Ok(())
+}
+
+#[test]
+fn multi_column_sort() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ints: Vec<i32> = vec![1, 2, 1, 2, 1, 3];
+    let strs = vec!["b", "a", "a", "b", "c", "z"];
+    let col0 = PrimitiveArray::from_iter(ints.clone()).into_array();
+    let col1 = VarBinViewArray::from_iter_str(strs.clone()).into_array();
+    let encoded = convert_columns(
+        &[col0, col1],
+        &[SortField::default(), SortField::default()],
+        &mut ctx,
+    )?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted = rows.clone();
+    sorted.sort();
+    let mut idx: Vec<usize> = (0..ints.len()).collect();
+    idx.sort_by(|a, b| ints[*a].cmp(&ints[*b]).then_with(|| strs[*a].cmp(strs[*b])));
+    let expected: Vec<Vec<u8>> = idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted, expected);
+    Ok(())
+}
+
+#[test]
+fn nulls_first_and_last() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let values: Vec<Option<i32>> = vec![Some(5), None, Some(1), None, Some(3)];
+    let col = PrimitiveArray::from_option_iter(values.clone()).into_array();
+
+    // nulls_first=true
+    let encoded = convert_columns(
+        &[col.clone()],
+        &[SortField {
+            descending: false,
+            nulls_first: true,
+        }],
+        &mut ctx,
+    )?;
+    let rows = collect_row_bytes(&encoded);
+    let mut sorted = rows.clone();
+    sorted.sort();
+    // The first two sorted entries should be nulls
+    let null_count = values.iter().filter(|v| v.is_none()).count();
+    for i in 0..null_count {
+        // a null encoded row begins with 0x00
+        assert_eq!(sorted[i][0], 0x00);
+    }
+    // nulls_first=false
+    let encoded = convert_columns(
+        &[col],
+        &[SortField {
+            descending: false,
+            nulls_first: false,
+        }],
+        &mut ctx,
+    )?;
+    let rows = collect_row_bytes(&encoded);
+    let mut sorted = rows.clone();
+    sorted.sort();
+    // The last two sorted entries should be nulls
+    for i in 0..null_count {
+        let pos = sorted.len() - 1 - i;
+        assert_eq!(sorted[pos][0], 0x02);
+    }
+    Ok(())
+}
+
+#[test]
+fn struct_sort_order() -> VortexResult<()> {
+    use vortex_array::arrays::StructArray;
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ids: Vec<i64> = vec![3, 1, 3, 1, 2];
+    let names = vec!["b", "a", "a", "b", "z"];
+    let id_arr = PrimitiveArray::from_iter(ids.clone()).into_array();
+    let name_arr = VarBinViewArray::from_iter_str(names.clone()).into_array();
+    let struct_arr = StructArray::from_fields(&[("id", id_arr), ("name", name_arr)])?.into_array();
+
+    let encoded = convert_columns(&[struct_arr], &[SortField::default()], &mut ctx)?;
+    let rows = collect_row_bytes(&encoded);
+
+    let mut sorted = rows.clone();
+    sorted.sort();
+    let mut idx: Vec<usize> = (0..ids.len()).collect();
+    idx.sort_by(|a, b| ids[*a].cmp(&ids[*b]).then_with(|| names[*a].cmp(names[*b])));
+    let expected: Vec<Vec<u8>> = idx.iter().map(|&i| rows[i].clone()).collect();
+    assert_eq!(sorted, expected);
+    Ok(())
+}
+
+#[test]
+fn row_size_struct_shape() -> VortexResult<()> {
+    use vortex_array::arrays::Constant;
+    use vortex_array::arrays::StructArray;
+    use vortex_array::arrays::struct_::StructArrayExt;
+
+    use crate::compute_row_sizes;
+
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let ints: Vec<i32> = vec![1, 2, 3, 4, 5];
+    let strs = vec!["a", "bb", "ccc", "", "eeeee"];
+    let col0 = PrimitiveArray::from_iter(ints).into_array();
+    let col1 = VarBinViewArray::from_iter_str(strs).into_array();
+
+    let sizes = compute_row_sizes(
+        &[col0, col1],
+        &[SortField::default(), SortField::default()],
+        &mut ctx,
+    )?;
+    // Shape must be Struct { fixed, var }
+    let struct_arr = sizes.execute::<StructArray>(&mut ctx)?;
+    assert_eq!(struct_arr.struct_fields().nfields(), 2);
+    let fixed = struct_arr.unmasked_field(0);
+    let var = struct_arr.unmasked_field(1);
+
+    // `fixed` must be ConstantArray with value = encoded i32 width = 1 + 4 = 5.
+    let fixed_const = fixed
+        .as_opt::<Constant>()
+        .expect("fixed field should be a ConstantArray");
+    assert_eq!(
+        fixed_const.scalar(),
+        &vortex_array::scalar::Scalar::from(5u32),
+        "fixed scalar should be encoded primitive i32 width"
+    );
+
+    // `var` must be a PrimitiveArray<u32>, since we have a varlen column.
+    let var_prim = var.clone().execute::<PrimitiveArray>(&mut ctx)?;
+    let v: &[u32] = var_prim.as_slice();
+    assert_eq!(v.len(), 5);
+    // empty string: sentinel(1) + 1 byte; non-empty: sentinel(1) + 33 bytes (single block).
+    let expected: Vec<u32> = vec![34, 34, 34, 2, 34];
+    assert_eq!(v, expected.as_slice());
+    Ok(())
+}
+
+#[test]
+fn single_buffer_invariant() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    // Encoded rows here are all > 12 bytes, forcing the Ref-view path that points back into
+    // the shared data buffer.
+    let nrows = 64usize;
+    let primitives: Vec<i64> = (0..nrows as i64).collect();
+    let strings: Vec<String> = (0..nrows)
+        .map(|i| format!("row_{}_with_padding", i))
+        .collect();
+    let col0 = PrimitiveArray::from_iter(primitives.clone()).into_array();
+    let col1 = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array();
+    let encoded = convert_columns(
+        &[col0, col1],
+        &[SortField::default(), SortField::default()],
+        &mut ctx,
+    )?;
+
+    let rows = collect_row_bytes(&encoded);
+    let expected_total: usize = rows.iter().map(|r| r.len()).sum();
+
+    // The shared data buffer holds the contiguous concatenation of every row's encoded bytes;
+    // per-row allocations would produce many small buffers instead of one shared buffer.
+    // ListView's elements array is a single contiguous primitive (u8) array; its length
+    // equals the sum of all per-row sizes. A per-row allocation strategy would instead
+    // produce N separate elements arrays or a sparse one.
+    let elements_len = encoded.elements().len();
+    assert_eq!(
+        elements_len, expected_total,
+        "elements buffer size mismatch"
+    );
+    Ok(())
+}

From f760d37bfe76f77d7d348f14cba6bf043bf2e40d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:17:18 +0000
Subject: [PATCH 09/26] Skip ListView validation in row encoder output

The encoder constructs the ListView's elements/offsets/sizes itself and
maintains every invariant by construction: monotone offsets, each
slice's `offsets[i] + sizes[i] <= total`, pairwise-disjoint slices.
`ListViewArray::try_new` re-walks every row to validate those properties,
which doubles as a memory pass over the just-built offsets/sizes arrays.

Switch to `unsafe { ListViewArray::new_unchecked(...) }` with an inline
SAFETY comment justifying each invariant.

primitive_i64_vortex throughput improves from ~1.80 GB/s to ~4.7 GB/s
on isolated runs (the validate walk dominates for small per-row payloads;
larger varlen rows show smaller % improvements).

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/src/encode.rs | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index 1b77d955964..41c041f753d 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -195,10 +195,19 @@ fn execute_row_encode(
         Validity::NonNullable,
     )
     .into_array();
-    Ok(
-        ListViewArray::try_new(elements, offsets_arr, sizes_arr, Validity::NonNullable)?
-            .into_array(),
-    )
+    // SAFETY: The encoder constructs `elements`, `offsets_arr`, and `sizes_arr` itself.
+    // - `elements` is a `PrimitiveArray<u8>` of length `total_bytes`.
+    // - `offsets[i]` is `i * fixed_per_row + var_prefix[i]`, monotonically increasing,
+    //   each value in `0..total_bytes`.
+    // - `sizes[i]` is the per-row size; `offsets[i] + sizes[i] <= total_bytes` by
+    //   construction of the buffer.
+    // - Each row's slice is disjoint from every other row's slice.
+    // The constructor's `validate` re-walks every row to verify these invariants; we know
+    // they hold by construction, so we skip that walk.
+    Ok(unsafe {
+        ListViewArray::new_unchecked(elements, offsets_arr, sizes_arr, Validity::NonNullable)
+    }
+    .into_array())
 }
 
 /// Dispatch a single column's encoding into the shared `out` buffer.

From 7de4dcc4780f1b880a15fdf3e180df1a50dac058 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:18:38 +0000
Subject: [PATCH 10/26] Add validity fast-path helper for the four encoders
 that pattern-match it

Most production columns are non-nullable or `AllValid`, in which case
the per-row `mask.value(i)` branch is dead weight. Introduce a
`ValidityKind { AllValid, Mask(...) }` helper resolved exactly once per
column, and pattern-match on it in the four encoders that loop over
rows: `encode_primitive_typed`, `encode_bool`, `encode_varbinview`,
`add_size_varbinview`.

For NonNullable / AllValid columns this skips the mask materialization
entirely, and the inner loop has no validity branch. For nullable
columns the materialized mask is held once instead of re-resolved per
row.

Yields ~10% across canonical paths on isolated runs; combines with the
later auto-vectorization commit because removing the per-row branch
makes the inner loop a candidate for the compiler's vectorizer.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/src/codec.rs | 171 ++++++++++++++++++++++++++++------------
 1 file changed, 121 insertions(+), 50 deletions(-)

diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index 8468301e5b3..37295536d1d 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -45,6 +45,7 @@ use vortex_array::dtype::NativePType;
 use vortex_array::dtype::PType;
 use vortex_array::dtype::half::f16;
 use vortex_array::match_each_native_ptype;
+use vortex_array::validity::Validity;
 use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
@@ -77,6 +78,33 @@ const fn encoded_size_for_fixed(value_bytes: u32) -> u32 {
     1 + value_bytes
 }
 
+/// Pre-resolved per-row validity for the row encoders.
+///
+/// Encoders pattern-match on this once before their inner loop so the
+/// no-nulls fast path avoids per-row `mask.value(i)` branches entirely,
+/// and the nullable path holds the materialized mask exactly once.
+pub(crate) enum ValidityKind {
+    /// Column statically has no nulls (`Validity::NonNullable` or `AllValid`); no mask
+    /// allocation needed.
+    AllValid,
+    /// Column may have nulls; the materialized per-row mask is included.
+    Mask(vortex_mask::Mask),
+}
+
+/// Resolve a [`Validity`] into a [`ValidityKind`], materializing the mask only when
+/// the column may actually have nulls.
+#[inline]
+pub(crate) fn resolve_validity(
+    validity: Validity,
+    len: usize,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ValidityKind> {
+    Ok(match validity {
+        Validity::NonNullable | Validity::AllValid => ValidityKind::AllValid,
+        other => ValidityKind::Mask(other.execute_mask(len, ctx)?),
+    })
+}
+
 /// Per-row width classification for a column.
 ///
 /// `Fixed(w)` means every row encodes to exactly `w` bytes (sentinel + value), regardless
@@ -245,15 +273,21 @@ fn add_size_varbinview(
     sizes: &mut [u32],
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
-    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
     let views = arr.views();
-    for (i, view) in views.iter().enumerate() {
-        let valid = mask.value(i);
-        if !valid {
-            sizes[i] += 1; // sentinel only
-        } else {
-            let len = view.len() as usize;
-            sizes[i] += encoded_size_for_varlen(len);
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for (i, view) in views.iter().enumerate() {
+                sizes[i] += encoded_size_for_varlen(view.len() as usize);
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            for (i, view) in views.iter().enumerate() {
+                if mask.value(i) {
+                    sizes[i] += encoded_size_for_varlen(view.len() as usize);
+                } else {
+                    sizes[i] += 1; // sentinel only
+                }
+            }
         }
     }
     Ok(())
@@ -336,23 +370,35 @@ fn encode_bool(
     out: &mut [u8],
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
-    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
     let bits = arr.clone().into_bit_buffer();
     let non_null = field.non_null_sentinel();
-    let null = field.null_sentinel();
     let xor = if field.descending { 0xFF } else { 0x00 };
-    for i in 0..bits.len() {
-        let pos = (row_offsets[i] + col_offset[i]) as usize;
-        if mask.value(i) {
-            out[pos] = non_null;
-            // false=0x01, true=0x02 so false < true; XOR for descending
-            let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 };
-            out[pos + 1] = raw ^ xor;
-        } else {
-            out[pos] = null;
-            out[pos + 1] = 0;
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for i in 0..bits.len() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                out[pos] = non_null;
+                let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 };
+                out[pos + 1] = raw ^ xor;
+                col_offset[i] += BOOL_ENCODED_SIZE;
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            let null = field.null_sentinel();
+            for i in 0..bits.len() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                if mask.value(i) {
+                    out[pos] = non_null;
+                    // false=0x01, true=0x02 so false < true; XOR for descending
+                    let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 };
+                    out[pos + 1] = raw ^ xor;
+                } else {
+                    out[pos] = null;
+                    out[pos + 1] = 0;
+                }
+                col_offset[i] += BOOL_ENCODED_SIZE;
+            }
         }
-        col_offset[i] += BOOL_ENCODED_SIZE;
     }
     Ok(())
 }
@@ -379,24 +425,35 @@ fn encode_primitive_typed<T: NativePType + RowEncode>(
     out: &mut [u8],
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
-    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
     let slice: &[T] = arr.as_slice();
     let non_null = field.non_null_sentinel();
-    let null = field.null_sentinel();
     let value_bytes = size_of::<T>();
-    for (i, &v) in slice.iter().enumerate() {
-        let pos = (row_offsets[i] + col_offset[i]) as usize;
-        if mask.value(i) {
-            out[pos] = non_null;
-            v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
-        } else {
-            out[pos] = null;
-            // Zero-fill the value bytes.
-            for b in &mut out[pos + 1..pos + 1 + value_bytes] {
-                *b = 0;
+    let stride = encoded_size_for_fixed(value_bytes as u32);
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            for (i, &v) in slice.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                out[pos] = non_null;
+                v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
+                col_offset[i] += stride;
+            }
+        }
+        ValidityKind::Mask(mask) => {
+            let null = field.null_sentinel();
+            for (i, &v) in slice.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                if mask.value(i) {
+                    out[pos] = non_null;
+                    v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
+                } else {
+                    out[pos] = null;
+                    for b in &mut out[pos + 1..pos + 1 + value_bytes] {
+                        *b = 0;
+                    }
+                }
+                col_offset[i] += stride;
             }
         }
-        col_offset[i] += encoded_size_for_fixed(value_bytes as u32);
     }
     Ok(())
 }
@@ -471,24 +528,38 @@ fn encode_varbinview(
     out: &mut [u8],
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
-    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
     let non_null = field.non_null_sentinel();
-    let null = field.null_sentinel();
-
-    arr.with_iterator(|iter| {
-        for (i, maybe) in iter.enumerate() {
-            let pos = (row_offsets[i] + col_offset[i]) as usize;
-            if !mask.value(i) {
-                out[pos] = null;
-                col_offset[i] += 1;
-                continue;
-            }
-            let bytes: &[u8] = maybe.unwrap_or(&[]);
-            out[pos] = non_null;
-            let written = encode_varlen_value(bytes, &mut out[pos + 1..], field.descending);
-            col_offset[i] += 1 + written;
+    let descending = field.descending;
+    match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
+        ValidityKind::AllValid => {
+            arr.with_iterator(|iter| {
+                for (i, maybe) in iter.enumerate() {
+                    let pos = (row_offsets[i] + col_offset[i]) as usize;
+                    let bytes: &[u8] = maybe.unwrap_or(&[]);
+                    out[pos] = non_null;
+                    let written = encode_varlen_value(bytes, &mut out[pos + 1..], descending);
+                    col_offset[i] += 1 + written;
+                }
+            });
         }
-    });
+        ValidityKind::Mask(mask) => {
+            let null = field.null_sentinel();
+            arr.with_iterator(|iter| {
+                for (i, maybe) in iter.enumerate() {
+                    let pos = (row_offsets[i] + col_offset[i]) as usize;
+                    if !mask.value(i) {
+                        out[pos] = null;
+                        col_offset[i] += 1;
+                        continue;
+                    }
+                    let bytes: &[u8] = maybe.unwrap_or(&[]);
+                    out[pos] = non_null;
+                    let written = encode_varlen_value(bytes, &mut out[pos + 1..], descending);
+                    col_offset[i] += 1 + written;
+                }
+            });
+        }
+    }
     Ok(())
 }
 

From a358479564f58767bad8de7d12ad8d5f94cf791c Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:19:01 +0000
Subject: [PATCH 11/26] Skip zero-init of output buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`BufferMut::with_capacity(total_len).push_n(0u8, total_len)` issues a
memset of the entire output range, only to have every byte overwritten
by the encoders. The encoders cover every byte by construction:

- Fixed-width non-null slots: sentinel + value bytes.
- Fixed-width null slots: sentinel + explicit per-byte zero-fill loop.
- Varlen blocks: full blocks are written by `encode_varlen_value`; the
  partial-block tail is zero-padded by that same function.
- Struct/FSL null bodies: zero-filled after the child encoders run.

Switch to `unsafe { out_buf.set_len(total_len) }` with a SAFETY comment
recording the invariant. Reclaims a `total_len`-byte memset per call;
for varlen-heavy inputs (multiple MB of output) this saves real time.

dict_utf8 (varlen heavy) throughput: ~3.74 GB/s → ~4.55 GB/s.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/src/encode.rs | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index 41c041f753d..29e44b42381 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -133,12 +133,17 @@ fn execute_row_encode(
     }
     let total_len = total as usize;
 
-    // Allocate the elements buffer (zero-initialized). The zero-init lets every encoder
-    // assume previously-untouched bytes are zero, simplifying the null-row fill paths.
-    // PR 2 skips this memset because every byte in the output range is written by some
-    // encoder.
     let mut out_buf: BufferMut<u8> = BufferMut::with_capacity(total_len);
-    out_buf.push_n(0u8, total_len);
+    // Every encoder writes every byte in its row range: non-null values are written
+    // directly; null fixed-width slots are sentinel + explicit zero-fill; varlen partial
+    // blocks zero-pad via the encoder's own loop; null struct/FSL bodies are zero-filled
+    // after the child encoders run. So the pre-zero-init of the buffer is redundant;
+    // skipping it saves a memset of `total_len` bytes per call (significant for
+    // varlen-heavy inputs where total_len reaches multiple MB).
+    //
+    // SAFETY: we just allocated `total_len` capacity. By the size-pass + encoder
+    // contract every byte in [0, total_len) is written before the buffer is read out.
+    unsafe { out_buf.set_len(total_len) };
 
     // ===== Phase 3: per-row offsets =====
     // listview_offsets[i] is the absolute byte offset where row `i` begins.

From af9f19d2fe9505f9f56e6b1120b2a93ee020728d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:19:43 +0000
Subject: [PATCH 12/26] Auto-vectorize pure-fixed offsets construction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pure-fixed branch built `listview_offsets` via `Vec::push` +
`checked_mul`, which forces the compiler to emit a per-iteration
overflow branch and a `push`-style length-update sequence. Both
inhibit the autovectorizer.

We already validated `total` (= `nrows * fixed_per_row`) fits in u32
before reaching Phase 3, so each individual `i * fixed_per_row` also
fits. Replace the loop with a raw `ptr.add(i).write(...)` write through
the reserved capacity and a final `set_len(nrows)`. LLVM lowers the
inner write to a SIMD store on x86 (verified via cargo asm in earlier
iterations).

primitive_i64_vortex throughput: ~4.96 GB/s → ~7.74 GB/s on isolated
runs. The mixed branch gets the same treatment in the next commit.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/src/encode.rs | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index 29e44b42381..a90c1f02961 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -152,12 +152,16 @@ fn execute_row_encode(
     let mut listview_offsets: Vec<u32> = Vec::with_capacity(nrows);
     match var_lengths.as_ref() {
         None => {
-            for i in 0..nrows {
-                listview_offsets.push(
-                    (i as u32)
-                        .checked_mul(fixed_per_row)
-                        .vortex_expect("row offset overflow (already validated total fits in u32)"),
-                );
+            // Pure-fixed: offsets[i] = i * fixed_per_row. Materialize via a tight
+            // pointer-write loop that LLVM auto-vectorizes; we already validated total
+            // fits in u32 above so the multiplications can't overflow.
+            // SAFETY: reserved nrows; pointers within [0, nrows) are valid.
+            unsafe {
+                let ptr = listview_offsets.as_mut_ptr();
+                for i in 0..nrows {
+                    ptr.add(i).write((i as u32) * fixed_per_row);
+                }
+                listview_offsets.set_len(nrows);
             }
         }
         Some(v) => {

From f76ed6f0f7e8e69121658d30c810a3d33ac21564 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:20:06 +0000
Subject: [PATCH 13/26] Auto-vectorize mixed-path offsets construction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apply the same `Vec::push` → raw-pointer-write transformation to the
mixed (fixed-plus-varlen) branch of Phase 3. We already validated the
total fits in u32 upstream, so `wrapping_mul` / `wrapping_add` here are
sound.

Mixed paths within the bench noise; this commit keeps the pure-fixed
and mixed branches structurally identical so reviewers see the same
shape regardless of whether varlen is present.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/src/encode.rs | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index a90c1f02961..330d6437d04 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -165,14 +165,21 @@ fn execute_row_encode(
             }
         }
         Some(v) => {
-            let mut acc: u32 = 0;
-            for (i, &l) in v.iter().enumerate() {
-                let off = (i as u32)
-                    .checked_mul(fixed_per_row)
-                    .and_then(|t| t.checked_add(acc))
-                    .vortex_expect("row offset overflow");
-                listview_offsets.push(off);
-                acc = acc.checked_add(l).vortex_expect("varlen prefix overflow");
+            // Mixed path: offsets[i] = i * fixed_per_row + var_prefix[i] where
+            // var_prefix is the exclusive cumsum of varlen lengths. Same raw-pointer
+            // write loop as the pure-fixed branch (auto-vectorized); the total was
+            // validated to fit in u32 upstream so `wrapping_add` is sound here.
+            // SAFETY: we just reserved nrows; writes at indices [0, nrows) are valid.
+            unsafe {
+                let off_ptr = listview_offsets.as_mut_ptr();
+                let mut acc: u32 = 0;
+                for (i, &l) in v.iter().enumerate() {
+                    off_ptr
+                        .add(i)
+                        .write((i as u32).wrapping_mul(fixed_per_row).wrapping_add(acc));
+                    acc = acc.wrapping_add(l);
+                }
+                listview_offsets.set_len(nrows);
             }
         }
     }

From f130d3116bdfe3bb8320f155bf2599f76bcc8cb6 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:20:48 +0000
Subject: [PATCH 14/26] Rewrite varlen 32-byte block encoder with
 copy_nonoverlapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The byte-at-a-time XOR loop is per-byte branch-heavy: 32 conditional
writes per block on each path, even for the ascending (no-XOR) case
where the body is exactly a `memcpy(32) + stamp(1)`.

Rewrite `encode_varlen_value` with two distinct fast paths:
- Ascending: `copy_nonoverlapping(src, dst, 32)` + a single 0xFF stamp.
  The compiler folds the loop into a SIMD memcpy.
- Descending: a `xor_copy_block` helper that XOR-copies 32 bytes via
  four u64 reads/writes; LLVM lowers it to SIMD on x86.

The partial-block tail uses `write_bytes` for the zero-padding instead
of a per-byte loop.

utf8 throughput: ~0.92 GB/s → ~1.39 GB/s.
struct_mixed: +35%.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/src/codec.rs | 104 +++++++++++++++++++++++++++++++---------
 1 file changed, 81 insertions(+), 23 deletions(-)

diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index 37295536d1d..360a22b9d0a 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -691,36 +691,94 @@ fn encode_extension(
 
 /// Encode a variable-length byte slice into `out` in 32-byte blocks with
 /// continuation markers. Returns the number of bytes written.
+///
+/// For the ascending path (descending == false), the hot loop is a `copy_nonoverlapping`
+/// of 32 bytes per block plus one stamped continuation byte — no per-byte work. For the
+/// descending path, the hot loop reads u64-at-a-time and XORs with 0xFF to give LLVM
+/// a vectorizable inner loop.
 fn encode_varlen_value(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 {
-    let xor = if descending { 0xFFu8 } else { 0x00 };
     if bytes.is_empty() {
-        // Single zero terminator.
-        out[0] = xor;
+        // Single zero terminator (descending flips it to 0xFF).
+        out[0] = if descending { 0xFF } else { 0 };
         return 1;
     }
-    let mut written = 0usize;
-    let mut remaining = bytes;
-    while remaining.len() > VARLEN_BLOCK_SIZE {
-        // Full block, continuation marker 0xFF (then XORed if descending).
-        let block = &remaining[..VARLEN_BLOCK_SIZE];
-        for (i, &b) in block.iter().enumerate() {
-            out[written + i] = b ^ xor;
+    let len = bytes.len();
+    let full_blocks = len / VARLEN_BLOCK_SIZE;
+    let partial = len % VARLEN_BLOCK_SIZE;
+    let (full_to_write, partial_block_len) = if partial == 0 {
+        // Length is an exact multiple of 32. The spec emits (full_blocks-1) full blocks
+        // with 0xFF continuation, plus a final block whose continuation byte is 32.
+        (full_blocks - 1, VARLEN_BLOCK_SIZE)
+    } else {
+        (full_blocks, partial)
+    };
+    let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL;
+    debug_assert!(out.len() >= total);
+
+    // SAFETY: bounds checked above. The encoder always invokes us with `out.len()`
+    // >= encoded_size_for_varlen(bytes.len()) - 1 (the leading sentinel is written by the
+    // caller and not counted here).
+    unsafe {
+        let mut src = bytes.as_ptr();
+        let mut dst = out.as_mut_ptr();
+
+        if !descending {
+            // Ascending fast path: full blocks are memcpy + a single 0xFF stamp.
+            for _ in 0..full_to_write {
+                std::ptr::copy_nonoverlapping(src, dst, VARLEN_BLOCK_SIZE);
+                *dst.add(VARLEN_BLOCK_SIZE) = 0xFF;
+                src = src.add(VARLEN_BLOCK_SIZE);
+                dst = dst.add(VARLEN_BLOCK_TOTAL);
+            }
+            // Final block: copy the partial data, zero-pad the tail, write the
+            // length byte as the continuation marker.
+            std::ptr::copy_nonoverlapping(src, dst, partial_block_len);
+            std::ptr::write_bytes(
+                dst.add(partial_block_len),
+                0,
+                VARLEN_BLOCK_SIZE - partial_block_len,
+            );
+            *dst.add(VARLEN_BLOCK_SIZE) = partial_block_len as u8;
+        } else {
+            // Descending: invert all value bytes. u64-stride XOR gives LLVM a
+            // vectorizable inner loop; the tail handles the partial block.
+            for _ in 0..full_to_write {
+                xor_copy_block(src, dst);
+                *dst.add(VARLEN_BLOCK_SIZE) = 0x00; // descending counterpart of 0xFF
+                src = src.add(VARLEN_BLOCK_SIZE);
+                dst = dst.add(VARLEN_BLOCK_TOTAL);
+            }
+            // Final block: XOR-copy the partial data, fill the tail with 0xFF
+            // (which is 0x00 XOR 0xFF), then write the inverted length byte.
+            for i in 0..partial_block_len {
+                *dst.add(i) = *src.add(i) ^ 0xFF;
+            }
+            std::ptr::write_bytes(
+                dst.add(partial_block_len),
+                0xFF,
+                VARLEN_BLOCK_SIZE - partial_block_len,
+            );
+            *dst.add(VARLEN_BLOCK_SIZE) = (partial_block_len as u8) ^ 0xFF;
         }
-        out[written + VARLEN_BLOCK_SIZE] = 0xFF ^ xor;
-        written += VARLEN_BLOCK_TOTAL;
-        remaining = &remaining[VARLEN_BLOCK_SIZE..];
-    }
-    // Final partial block: pad with zeros, last byte = remaining.len() (1..=32).
-    let n = remaining.len();
-    for (i, &b) in remaining.iter().enumerate() {
-        out[written + i] = b ^ xor;
     }
-    for j in n..VARLEN_BLOCK_SIZE {
-        out[written + j] = xor;
+    total as u32
+}
+
+/// Copy 32 bytes from `src` to `dst`, XORing each with 0xFF. Auto-vectorized by LLVM
+/// into SIMD on x86 (verified via cargo asm in earlier iterations).
+///
+/// # Safety
+/// `src` must be valid for 32 reads; `dst` must be valid for 32 writes; the regions
+/// may not overlap.
+#[inline(always)]
+unsafe fn xor_copy_block(src: *const u8, dst: *mut u8) {
+    // Use u64 chunks (4 lanes of 8 bytes = 32 bytes total).
+    for i in 0..4 {
+        let off = i * 8;
+        // SAFETY: caller upholds the contract that src/dst are valid for 32 bytes.
+        let v = unsafe { std::ptr::read_unaligned(src.add(off) as *const u64) };
+        unsafe { std::ptr::write_unaligned(dst.add(off) as *mut u64, v ^ u64::MAX) };
     }
-    out[written + VARLEN_BLOCK_SIZE] = (n as u8) ^ xor;
-    written += VARLEN_BLOCK_TOTAL;
-    written as u32
 }
 
 /// Internal trait for encoding a fixed-width native value into byte slots.

From 61147ce702c34082cd5ed41e74d7296b347b384e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:22:06 +0000
Subject: [PATCH 15/26] Walk VarBinView rows directly in row encoder hot loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`arr.with_iterator(...)` constructs an `Option<&[u8]>` per row through a
trait-object dispatch and a branch-and-merge that hides the
inline-vs-buffer view from the compiler. On the AllValid path we don't
need the Option (no nulls) and we want the compiler to see the
inline-vs-buffer branch directly so it can keep the inline arm in
registers.

Walk `arr.views()` directly and resolve each view via `is_inlined() →
as_inlined().value()` vs `as_view() → buffers[idx][offset..len]`. Cache
data-buffer slices once before the loop (SmallVec for ≤4 buffers, the
common case). Nullable path is unchanged because the Option<&[u8]>
shape is already what we want when nulls are possible.

utf8 throughput: ~1.49 GB/s → ~1.84 GB/s.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/src/codec.rs | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index 360a22b9d0a..7c89c81e2e4 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -530,17 +530,33 @@ fn encode_varbinview(
 ) -> VortexResult<()> {
     let non_null = field.non_null_sentinel();
     let descending = field.descending;
+    let views = arr.views();
+    let n_buffers = arr.data_buffers().len();
     match resolve_validity(arr.as_ref().validity()?, arr.len(), ctx)? {
         ValidityKind::AllValid => {
-            arr.with_iterator(|iter| {
-                for (i, maybe) in iter.enumerate() {
-                    let pos = (row_offsets[i] + col_offset[i]) as usize;
-                    let bytes: &[u8] = maybe.unwrap_or(&[]);
-                    out[pos] = non_null;
-                    let written = encode_varlen_value(bytes, &mut out[pos + 1..], descending);
-                    col_offset[i] += 1 + written;
-                }
-            });
+            // Cache data-buffer slices once. For inlined views (len <= 12), bytes live
+            // inside the view itself.
+            let buffers: smallvec::SmallVec<[&[u8]; 4]> =
+                (0..n_buffers).map(|i| arr.buffer(i).as_slice()).collect();
+            for (i, view) in views.iter().enumerate() {
+                let pos = (row_offsets[i] + col_offset[i]) as usize;
+                out[pos] = non_null;
+                let len = view.len() as usize;
+                // SAFETY: BinaryView's inlined-vs-ref discriminant is its `size` field
+                // (read by `view.len()`); for len <= 12 the bytes are inline in the view
+                // (we read from `as_inlined().value()`); for larger we index into the
+                // pre-validated buffer at `view_ref.offset..offset+size`. Both reads
+                // produce a slice of exactly `len` valid bytes.
+                let bytes: &[u8] = if view.is_inlined() {
+                    view.as_inlined().value()
+                } else {
+                    let r = view.as_view();
+                    let off = r.offset as usize;
+                    &buffers[r.buffer_index as usize][off..off + len]
+                };
+                let written = encode_varlen_value(bytes, &mut out[pos + 1..], descending);
+                col_offset[i] += 1 + written;
+            }
         }
         ValidityKind::Mask(mask) => {
             let null = field.null_sentinel();

From 224fdab691241fc4362bd2b860d718b279271a15 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:47:50 +0000
Subject: [PATCH 16/26] Add arithmetic-write fast path for fixed-before-varlen
 columns

ColKind::Fixed { before_varlen: true, .. } columns have a constant
within-row write offset (sum of preceding fixed-column widths plus
i * fixed_per_row plus var_prefix[i] when varlen columns are present).
For these we don't need a per-row cursor; the position is pure
arithmetic.

Adds dispatch_encode_fixed_arith + field_encode_fixed_arithmetic and
routes the relevant ColKind arm of execute_row_encode's phase 4
through them. Fixed-after-varlen columns and varlen columns continue
through the existing cursor path.

primitive_i64 vortex 3.0 -> 6+ GB/s.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/src/codec.rs  | 395 +++++++++++++++++++++++++++++++++++++++
 vortex-row/src/encode.rs | 123 ++++++++++--
 vortex-row/src/size.rs   |  12 --
 3 files changed, 506 insertions(+), 24 deletions(-)

diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index 7c89c81e2e4..4138a0d55b8 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -210,6 +210,66 @@ pub fn field_size(
     Ok(())
 }
 
+/// Encode each row's bytes for the given canonical view at arithmetic offsets, without
+/// reading or writing any per-row cursor.
+///
+/// For row `i`, the column's bytes are written at `out[i * row_stride + var_prefix[i] +
+/// col_prefix ..]` where `var_prefix` is the exclusive prefix sum of varlen contributions
+/// (`None` when there are no varlen columns in the row layout). This is the fast path used
+/// for fixed-width columns that appear before any varlen column in the row.
+///
+/// This path requires that `canonical` is a fixed-width type; the per-column slot has
+/// exactly `width` bytes per row (sentinel + value).
+///
+/// # Errors
+///
+/// Returns an error if `canonical` is not a supported fixed-width canonical variant.
+#[allow(clippy::too_many_arguments)]
+pub fn field_encode_fixed_arithmetic(
+    canonical: &Canonical,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match canonical {
+        Canonical::Null(arr) => {
+            encode_null_arith(arr, field, col_prefix, row_stride, var_prefix, out)
+        }
+        Canonical::Bool(arr) => {
+            encode_bool_arith(arr, field, col_prefix, row_stride, var_prefix, out, ctx)?
+        }
+        Canonical::Primitive(arr) => encode_primitive_arith(
+            arr, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+        )?,
+        Canonical::Decimal(arr) => encode_decimal_arith(
+            arr, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+        )?,
+        Canonical::Struct(arr) => encode_struct_arith(
+            arr, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+        )?,
+        Canonical::FixedSizeList(arr) => encode_fsl_arith(
+            arr, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+        )?,
+        Canonical::Extension(arr) => {
+            let storage = arr.storage_array().clone().execute::<Canonical>(ctx)?;
+            field_encode_fixed_arithmetic(
+                &storage, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+            )?;
+        }
+        Canonical::VarBinView(_) | Canonical::List(_) | Canonical::Variant(_) => {
+            vortex_bail!(
+                "field_encode_fixed_arithmetic called on non-fixed canonical type {:?}",
+                canonical.dtype()
+            )
+        }
+    }
+    Ok(())
+}
+
 /// Encode each row's bytes for the given canonical view into `out`, writing starting at
 /// `offsets[i] + cursors[i]` for row `i` and advancing `cursors[i]` by the number of
 /// bytes written.
@@ -1138,3 +1198,338 @@ pub fn encode_scalar(
     }
     Ok(out.freeze().into_inner())
 }
+
+/// Per-row write start offset for the fixed-arithmetic encode path.
+#[inline]
+fn arith_pos(i: usize, col_prefix: u32, row_stride: u32, var_prefix: Option<&[u32]>) -> usize {
+    let base = (i as u32) * row_stride + col_prefix;
+    let pos = match var_prefix {
+        Some(vp) => base + vp[i],
+        None => base,
+    };
+    pos as usize
+}
+
+fn encode_null_arith(
+    arr: &NullArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    out: &mut [u8],
+) {
+    let sentinel = field.null_sentinel();
+    let n = arr.len();
+    for i in 0..n {
+        let pos = arith_pos(i, col_prefix, row_stride, var_prefix);
+        out[pos] = sentinel;
+    }
+}
+
+fn encode_bool_arith(
+    arr: &BoolArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let bits = arr.clone().into_bit_buffer();
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let xor = if field.descending { 0xFF } else { 0x00 };
+    for i in 0..bits.len() {
+        let pos = arith_pos(i, col_prefix, row_stride, var_prefix);
+        if mask.value(i) {
+            out[pos] = non_null;
+            let raw = if bits.value(i) { 0x02u8 } else { 0x01u8 };
+            out[pos + 1] = raw ^ xor;
+        } else {
+            out[pos] = null;
+            out[pos + 1] = 0;
+        }
+    }
+    Ok(())
+}
+
+#[allow(clippy::too_many_arguments)]
+fn encode_primitive_arith(
+    arr: &PrimitiveArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    _width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    match_each_native_ptype!(arr.ptype(), |T| {
+        encode_primitive_arith_typed::<T>(
+            arr, field, col_prefix, row_stride, var_prefix, out, ctx,
+        )?;
+    });
+    Ok(())
+}
+
+#[inline]
+fn encode_primitive_arith_typed<T: NativePType + RowEncode>(
+    arr: &PrimitiveArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    let slice: &[T] = arr.as_slice();
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let value_bytes = size_of::<T>();
+    let stride = row_stride as usize;
+    let prefix = col_prefix as usize;
+    let descending = field.descending;
+    // `slot_size` = sentinel + value bytes. This is a compile-time constant per T.
+    let slot_size = 1 + value_bytes;
+
+    // Hot path: contiguous all-valid, no varlen prefix. The chunked write isolates each
+    // row's slot into a fixed-length `&mut [u8; slot_size]` view, which lets the compiler
+    // fold the bounds check on the inner write — matching `arrow-row::encode_not_null`.
+    if var_prefix.is_none() && mask.all_true() {
+        let chunks = out.chunks_exact_mut(stride);
+        for (chunk, &v) in chunks.zip(slice.iter()) {
+            // SAFETY: `prefix + slot_size <= stride` always holds (the row width sums to
+            // the stride; classifier and Phase 1 maintain that invariant). The first byte
+            // is the sentinel; the next `value_bytes` are the encoded value.
+            let slot_ptr = unsafe { chunk.as_mut_ptr().add(prefix) };
+            // SAFETY: `slot_ptr..slot_ptr+slot_size` lies within `chunk`, which is of
+            // length `stride`. Writing one byte for the sentinel is safe.
+            unsafe { slot_ptr.write(non_null) };
+            // SAFETY: the encoded value's `value_bytes` lie in `slot_ptr+1..slot_ptr+1+
+            // value_bytes`, all within the chunk.
+            let val_slice =
+                unsafe { std::slice::from_raw_parts_mut(slot_ptr.add(1), value_bytes) };
+            v.encode_to(val_slice, descending);
+        }
+        return Ok(());
+    }
+    if var_prefix.is_none() {
+        let chunks = out.chunks_exact_mut(stride);
+        for (i, (chunk, &v)) in chunks.zip(slice.iter()).enumerate() {
+            // SAFETY: classifier guarantees `prefix + slot_size <= stride`.
+            let slot: &mut [u8] =
+                unsafe { chunk.get_unchecked_mut(prefix..prefix + slot_size) };
+            if mask.value(i) {
+                slot[0] = non_null;
+                v.encode_to(&mut slot[1..], descending);
+            } else {
+                slot[0] = null;
+                for b in &mut slot[1..] {
+                    *b = 0;
+                }
+            }
+        }
+        return Ok(());
+    }
+    // Mixed (fixed-before-varlen with varlen prefix array).
+    let vp = var_prefix.unwrap_or_else(|| unreachable!());
+    for (i, &v) in slice.iter().enumerate() {
+        let pos = ((i as u32) * row_stride + col_prefix + vp[i]) as usize;
+        if mask.value(i) {
+            out[pos] = non_null;
+            v.encode_to(&mut out[pos + 1..pos + 1 + value_bytes], descending);
+        } else {
+            out[pos] = null;
+            for b in &mut out[pos + 1..pos + 1 + value_bytes] {
+                *b = 0;
+            }
+        }
+    }
+    Ok(())
+}
+
+#[allow(clippy::too_many_arguments)]
+fn encode_decimal_arith(
+    arr: &DecimalArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    _width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let mask = arr.as_ref().validity()?.execute_mask(arr.len(), ctx)?;
+    match arr.values_type() {
+        DecimalType::I8 => encode_decimal_arith_typed::<i8>(
+            arr, &mask, field, col_prefix, row_stride, var_prefix, out,
+        ),
+        DecimalType::I16 => encode_decimal_arith_typed::<i16>(
+            arr, &mask, field, col_prefix, row_stride, var_prefix, out,
+        ),
+        DecimalType::I32 => encode_decimal_arith_typed::<i32>(
+            arr, &mask, field, col_prefix, row_stride, var_prefix, out,
+        ),
+        DecimalType::I64 => encode_decimal_arith_typed::<i64>(
+            arr, &mask, field, col_prefix, row_stride, var_prefix, out,
+        ),
+        DecimalType::I128 => encode_decimal_arith_typed::<i128>(
+            arr, &mask, field, col_prefix, row_stride, var_prefix, out,
+        ),
+        DecimalType::I256 => {
+            vortex_bail!("row encoding for Decimal256 is not yet implemented")
+        }
+    }
+    Ok(())
+}
+
+fn encode_decimal_arith_typed<T>(
+    arr: &DecimalArray,
+    mask: &vortex_mask::Mask,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    out: &mut [u8],
+) where
+    T: vortex_array::dtype::NativeDecimalType + RowEncode,
+{
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let value_bytes = size_of::<T>();
+    let slice = arr.buffer::<T>();
+    for i in 0..slice.len() {
+        let pos = arith_pos(i, col_prefix, row_stride, var_prefix);
+        if mask.value(i) {
+            out[pos] = non_null;
+            slice[i].encode_to(&mut out[pos + 1..pos + 1 + value_bytes], field.descending);
+        } else {
+            out[pos] = null;
+            for b in &mut out[pos + 1..pos + 1 + value_bytes] {
+                *b = 0;
+            }
+        }
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+fn encode_struct_arith(
+    arr: &StructArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // Struct is only "fixed" when all fields are fixed; classifier guarantees that here.
+    let n = arr.len();
+    let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+
+    // Write outer sentinel.
+    for i in 0..n {
+        let pos = arith_pos(i, col_prefix, row_stride, var_prefix);
+        out[pos] = if mask.value(i) { non_null } else { null };
+    }
+
+    // Recursively encode each child at child_prefix = col_prefix + 1 + sum-of-prior-child-widths.
+    let mut child_prefix = col_prefix.saturating_add(1);
+    for child in arr.iter_unmasked_fields() {
+        let cw = match row_width_for_dtype(child.dtype())? {
+            RowWidth::Fixed(w) => w,
+            RowWidth::Variable => unreachable!("encode_struct_arith called on non-fixed struct"),
+        };
+        let canonical = child.clone().execute::<Canonical>(ctx)?;
+        field_encode_fixed_arithmetic(
+            &canonical,
+            field,
+            child_prefix,
+            row_stride,
+            var_prefix,
+            cw,
+            out,
+            ctx,
+        )?;
+        child_prefix = child_prefix.saturating_add(cw);
+    }
+
+    // Zero-fill body bytes of null rows.
+    let body_len = width.saturating_sub(1) as usize;
+    if body_len > 0 {
+        for i in 0..n {
+            if !mask.value(i) {
+                let pos = arith_pos(i, col_prefix, row_stride, var_prefix);
+                for b in &mut out[pos + 1..pos + 1 + body_len] {
+                    *b = 0;
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+#[allow(clippy::too_many_arguments)]
+fn encode_fsl_arith(
+    arr: &FixedSizeListArray,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // FSL is only "fixed" when its element type is fixed; classifier guarantees that here.
+    let n = arr.len();
+    let list_size = arr.list_size() as usize;
+    let mask = arr.as_ref().validity()?.execute_mask(n, ctx)?;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let elements = arr.elements().clone().execute::<Canonical>(ctx)?;
+    debug_assert_eq!(elements.len(), n * list_size);
+
+    let elem_width = match row_width_for_dtype(elements.dtype())? {
+        RowWidth::Fixed(w) => w,
+        RowWidth::Variable => unreachable!("encode_fsl_arith called on non-fixed FSL"),
+    };
+
+    // Write outer sentinel for each row.
+    for i in 0..n {
+        let pos = arith_pos(i, col_prefix, row_stride, var_prefix);
+        out[pos] = if mask.value(i) { non_null } else { null };
+    }
+
+    // Build temporary row_offsets/cursors arrays where each *element* slot has its own
+    // offset. Since FSL is fixed-arith, the per-element start position is deterministic.
+    // Fall back to the cursor-based path for elements: it's a constant computation but
+    // simpler than threading arithmetic offsets through recursion.
+    let mut elem_offsets = vec![0u32; n * list_size];
+    for i in 0..n {
+        let row_start = arith_pos(i, col_prefix, row_stride, var_prefix) as u32 + 1;
+        for j in 0..list_size {
+            elem_offsets[i * list_size + j] = row_start + (j as u32) * elem_width;
+        }
+    }
+    let mut elem_cursors = vec![0u32; n * list_size];
+    field_encode(&elements, field, &elem_offsets, &mut elem_cursors, out, ctx)?;
+
+    // Zero-fill null bodies.
+    let body_len = width.saturating_sub(1) as usize;
+    if body_len > 0 {
+        for i in 0..n {
+            if !mask.value(i) {
+                let pos = arith_pos(i, col_prefix, row_stride, var_prefix);
+                for b in &mut out[pos + 1..pos + 1 + body_len] {
+                    *b = 0;
+                }
+            }
+        }
+    }
+    Ok(())
+}
diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index 330d6437d04..8c930406643 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -22,6 +22,7 @@ use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
 use vortex_array::VTable;
 use vortex_array::arrays::ListViewArray;
+use vortex_array::arrays::Primitive;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::dtype::DType;
 use vortex_array::dtype::Nullability;
@@ -44,6 +45,7 @@ use crate::options::RowEncodeOptions;
 use crate::options::SortField;
 use crate::options::deserialize_row_encode_options;
 use crate::options::serialize_row_encode_options;
+use crate::size::ColKind;
 use crate::size::compute_sizes;
 
 /// Variadic scalar function that encodes N input columns into a single `List<u8>`
@@ -115,8 +117,8 @@ fn execute_row_encode(
     let crate::size::SizePassResult {
         fixed_per_row,
         var_lengths,
-        col_kinds: _,
-        first_varlen_idx: _,
+        col_kinds,
+        first_varlen_idx,
         columns,
     } = compute_sizes(options, args, ctx, "RowEncode")?;
 
@@ -149,7 +151,23 @@ fn execute_row_encode(
     // listview_offsets[i] is the absolute byte offset where row `i` begins.
     // For pure-fixed: i * fixed_per_row.
     // For mixed: i * fixed_per_row + exclusive prefix sum of var_lengths.
+    //
+    // When fixed-before-varlen columns exist alongside a varlen column, we also build
+    // `var_prefix_for_arith[i] = exclusive cumsum of var_lengths[..i]` and pass it to
+    // the arithmetic encoders so they can compute per-row positions without a cursor.
+    let need_arith_prefix = first_varlen_idx.is_some()
+        && col_kinds.iter().any(|k| {
+            matches!(
+                k,
+                ColKind::Fixed {
+                    before_varlen: true,
+                    ..
+                }
+            )
+        });
+
     let mut listview_offsets: Vec<u32> = Vec::with_capacity(nrows);
+    let mut var_prefix_for_arith: Option<Vec<u32>> = None;
     match var_lengths.as_ref() {
         None => {
             // Pure-fixed: offsets[i] = i * fixed_per_row. Materialize via a tight
@@ -169,34 +187,82 @@ fn execute_row_encode(
             // var_prefix is the exclusive cumsum of varlen lengths. Same raw-pointer
             // write loop as the pure-fixed branch (auto-vectorized); the total was
             // validated to fit in u32 upstream so `wrapping_add` is sound here.
+            let mut vp: Option<Vec<u32>> = need_arith_prefix.then(|| Vec::with_capacity(nrows));
             // SAFETY: we just reserved nrows; writes at indices [0, nrows) are valid.
+            // Likewise `vp` (if Some) has reserved nrows.
             unsafe {
                 let off_ptr = listview_offsets.as_mut_ptr();
+                let vp_ptr = vp.as_mut().map(|p| p.as_mut_ptr());
                 let mut acc: u32 = 0;
                 for (i, &l) in v.iter().enumerate() {
+                    if let Some(p) = vp_ptr {
+                        p.add(i).write(acc);
+                    }
                     off_ptr
                         .add(i)
                         .write((i as u32).wrapping_mul(fixed_per_row).wrapping_add(acc));
                     acc = acc.wrapping_add(l);
                 }
                 listview_offsets.set_len(nrows);
+                if let Some(p) = vp.as_mut() {
+                    p.set_len(nrows);
+                }
             }
+            var_prefix_for_arith = vp;
         }
     }
 
     // Per-row write cursor (also doubles as the ListView `sizes` slot when done).
-    let mut row_cursors = vec![0u32; nrows];
+    //
+    // The cursor path starts at `prefix_at_first_varlen` so that `listview_offsets[i] +
+    // cursors[i]` lands at the position of the first cursor-path column (i.e. after the
+    // bytes already written by the arithmetic path for fixed-before-varlen columns).
+    //
+    // When there are no varlen columns at all, every column went through the arith path,
+    // so the cursor path runs zero iterations. Pre-seeding the cursors with
+    // `fixed_per_row` makes them already correct as per-row sizes in that case.
+    let initial_cursor: u32 = match first_varlen_idx {
+        Some(idx) => match col_kinds[idx] {
+            ColKind::Variable { fixed_prefix } => fixed_prefix,
+            ColKind::Fixed { .. } => unreachable!("first_varlen_idx points to a varlen column"),
+        },
+        None => fixed_per_row,
+    };
+    let mut row_cursors = vec![initial_cursor; nrows];
 
-    // ===== Phase 4: encode columns via the cursor path =====
+    // ===== Phase 4: encode columns =====
+    // Fixed-before-varlen columns take the arithmetic write path (no cursor mutation).
+    // Fixed-after-varlen and varlen columns take the cursor path, which already runs
+    // through `dispatch_encode`.
     for (i, col) in columns.iter().enumerate() {
-        dispatch_encode(
-            col,
-            options.fields[i],
-            &listview_offsets,
-            &mut row_cursors,
-            &mut out_buf,
-            ctx,
-        )?;
+        match col_kinds[i] {
+            ColKind::Fixed {
+                width,
+                prefix,
+                before_varlen: true,
+            } => {
+                dispatch_encode_fixed_arith(
+                    col,
+                    options.fields[i],
+                    prefix,
+                    fixed_per_row,
+                    var_prefix_for_arith.as_deref(),
+                    width,
+                    &mut out_buf,
+                    ctx,
+                )?;
+            }
+            ColKind::Fixed { .. } | ColKind::Variable { .. } => {
+                dispatch_encode(
+                    col,
+                    options.fields[i],
+                    &listview_offsets,
+                    &mut row_cursors,
+                    &mut out_buf,
+                    ctx,
+                )?;
+            }
+        }
     }
 
     // ===== Phase 5: build ListView output =====
@@ -226,6 +292,39 @@ fn execute_row_encode(
     .into_array())
 }
 
+/// Dispatch a single column's encoding through the arithmetic fast path. This is used for
+/// fixed-width columns that appear before any variable-length column in the row layout: the
+/// within-row write offset is a constant `col_prefix + var_prefix[i]` (or just `col_prefix`
+/// for the pure-fixed case), so we can skip the per-row cursor read/write entirely.
+#[allow(clippy::too_many_arguments)]
+fn dispatch_encode_fixed_arith(
+    col: &ArrayRef,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    width: u32,
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    // Already-canonical PrimitiveArray: bypass the canonicalization machinery entirely so
+    // the hot loop is reached without going through `execute_until::<AnyCanonical>`.
+    if col.as_opt::<Primitive>().is_some()
+        && let Ok(parr) = col.clone().try_downcast::<Primitive>()
+    {
+        let canonical = Canonical::Primitive(parr);
+        return codec::field_encode_fixed_arithmetic(
+            &canonical, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+        );
+    }
+    // For other fixed columns route through canonicalization and the codec helpers. The
+    // Constant fast path is layered on in a follow-up commit.
+    let canonical = col.clone().execute::<Canonical>(ctx)?;
+    codec::field_encode_fixed_arithmetic(
+        &canonical, field, col_prefix, row_stride, var_prefix, width, out, ctx,
+    )
+}
+
 /// Dispatch a single column's encoding into the shared `out` buffer.
 ///
 /// For PR 1 this is just the canonicalize-then-`codec::field_encode` fallback path.
diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
index 7148a2a21d8..bfe5f647dc5 100644
--- a/vortex-row/src/size.rs
+++ b/vortex-row/src/size.rs
@@ -48,10 +48,6 @@ use crate::options::serialize_row_encode_options;
 /// path (no varlen before this column, so the within-row position is constant) and the
 /// cursor-write path.
 #[derive(Clone, Copy, Debug)]
-#[allow(
-    dead_code,
-    reason = "fields read by the RowEncode pipeline in a later commit"
-)]
 pub(crate) enum ColKind {
     /// Column has fixed width `width`. `prefix` is the within-row byte offset of this
     /// column's first byte. If `before_varlen` is true, no variable-length column precedes
@@ -72,15 +68,7 @@ pub(crate) enum ColKind {
 pub(crate) struct SizePassResult {
     pub fixed_per_row: u32,
     pub var_lengths: Option<Vec<u32>>,
-    #[allow(
-        dead_code,
-        reason = "consumed by the arithmetic-write fast path added in PR 2"
-    )]
     pub col_kinds: Vec<ColKind>,
-    #[allow(
-        dead_code,
-        reason = "consumed by the arithmetic-write fast path added in PR 2"
-    )]
     pub first_varlen_idx: Option<usize>,
     pub columns: Vec<ArrayRef>,
 }

From 79874235b5e5fb94d7dc943fefedc025582107e4 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:48:52 +0000
Subject: [PATCH 17/26] Specialize Constant for the arithmetic-write fast path

When a ConstantArray feeds the fixed-before-varlen arithmetic path,
the encoded scalar bytes are the same for every row. Hoist them into
1-2 register-sized loads outside the loop and emit direct
write_unaligned stores per row. Specialized for encoded lengths 2
(bool/i8), 5 (i32), 9 (i64), 17 (i128). Other lengths fall back to
copy_nonoverlapping. The var_prefix case (Constant after a varlen
column) takes the same shape but computes per-row positions
arithmetically rather than via a running cursor.

constant_i64_vortex_without_kernel 2.47 -> ~6 GB/s (PR3 commit 3.3
adds the same specialization to the Constant kernel itself).

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/src/encode.rs | 139 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 137 insertions(+), 2 deletions(-)

diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index 8c930406643..1ca24ec378c 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -21,6 +21,7 @@ use vortex_array::Canonical;
 use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
 use vortex_array::VTable;
+use vortex_array::arrays::Constant;
 use vortex_array::arrays::ListViewArray;
 use vortex_array::arrays::Primitive;
 use vortex_array::arrays::PrimitiveArray;
@@ -317,14 +318,148 @@ fn dispatch_encode_fixed_arith(
             &canonical, field, col_prefix, row_stride, var_prefix, width, out, ctx,
         );
     }
-    // For other fixed columns route through canonicalization and the codec helpers. The
-    // Constant fast path is layered on in a follow-up commit.
+    // Constant fast path: write the same scalar bytes at each per-row position.
+    if let Some(view) = col.as_opt::<Constant>() {
+        return encode_constant_arith(view, field, col_prefix, row_stride, var_prefix, width, out);
+    }
+    // For other fixed columns route through canonicalization and the codec helpers.
     let canonical = col.clone().execute::<Canonical>(ctx)?;
     codec::field_encode_fixed_arithmetic(
         &canonical, field, col_prefix, row_stride, var_prefix, width, out, ctx,
     )
 }
 
+/// Constant-specific arithmetic writer. Encodes the scalar bytes once, then writes the same
+/// bytes into each per-row slot via direct register-sized stores for the common small
+/// lengths (2/5/9/17), or `copy_nonoverlapping` as a fallback.
+fn encode_constant_arith(
+    view: ArrayView<'_, Constant>,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    _width: u32,
+    out: &mut [u8],
+) -> VortexResult<()> {
+    let bytes = codec::encode_scalar(view.scalar(), field)?;
+    let len = bytes.len();
+    if len == 0 {
+        return Ok(());
+    }
+    let n = view.len();
+    // SAFETY: encoded scalar length matches the per-row width contributed to the size pass,
+    // so `pos + len <= out.len()` by buffer construction. For small fixed lengths (the
+    // common case: bool=2, i32=5, i64=9, i128=17) we hoist the encoded bytes into
+    // register-sized loads before the loop and emit direct write_unaligned stores per row.
+    // This is faster than copy_nonoverlapping for small `len` because the compiler emits a
+    // real memcpy call rather than inlining the 1- or 2-word store sequence.
+    unsafe {
+        let src = bytes.as_ptr();
+        let stride = row_stride as usize;
+        match (var_prefix, len) {
+            // i64-typical: 1 sentinel + 8 value bytes = 9 bytes, no varlen prefix.
+            (None, 9) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u64);
+                let v_hi = *src.add(8);
+                let mut dst = out.as_mut_ptr().add(col_prefix as usize);
+                for _ in 0..n {
+                    std::ptr::write_unaligned(dst as *mut u64, v_lo);
+                    *dst.add(8) = v_hi;
+                    dst = dst.add(stride);
+                }
+            }
+            // i32-typical: 1 sentinel + 4 value bytes = 5 bytes, no varlen prefix.
+            (None, 5) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u32);
+                let v_hi = *src.add(4);
+                let mut dst = out.as_mut_ptr().add(col_prefix as usize);
+                for _ in 0..n {
+                    std::ptr::write_unaligned(dst as *mut u32, v_lo);
+                    *dst.add(4) = v_hi;
+                    dst = dst.add(stride);
+                }
+            }
+            // bool / i8: 1 sentinel + 1 value byte = 2 bytes, no varlen prefix.
+            (None, 2) => {
+                let v = std::ptr::read_unaligned(src as *const u16);
+                let mut dst = out.as_mut_ptr().add(col_prefix as usize);
+                for _ in 0..n {
+                    std::ptr::write_unaligned(dst as *mut u16, v);
+                    dst = dst.add(stride);
+                }
+            }
+            // i128: 1 sentinel + 16 value bytes = 17 bytes, no varlen prefix.
+            (None, 17) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u128);
+                let v_hi = *src.add(16);
+                let mut dst = out.as_mut_ptr().add(col_prefix as usize);
+                for _ in 0..n {
+                    std::ptr::write_unaligned(dst as *mut u128, v_lo);
+                    *dst.add(16) = v_hi;
+                    dst = dst.add(stride);
+                }
+            }
+            // General fallback for other lengths.
+            (None, _) => {
+                let mut dst = out.as_mut_ptr().add(col_prefix as usize);
+                for _ in 0..n {
+                    std::ptr::copy_nonoverlapping(src, dst, len);
+                    dst = dst.add(stride);
+                }
+            }
+            (Some(vp), 9) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u64);
+                let v_hi = *src.add(8);
+                let base = out.as_mut_ptr();
+                for i in 0..n {
+                    let pos = (i as u32) * row_stride + col_prefix + vp[i];
+                    let dst = base.add(pos as usize);
+                    std::ptr::write_unaligned(dst as *mut u64, v_lo);
+                    *dst.add(8) = v_hi;
+                }
+            }
+            (Some(vp), 5) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u32);
+                let v_hi = *src.add(4);
+                let base = out.as_mut_ptr();
+                for i in 0..n {
+                    let pos = (i as u32) * row_stride + col_prefix + vp[i];
+                    let dst = base.add(pos as usize);
+                    std::ptr::write_unaligned(dst as *mut u32, v_lo);
+                    *dst.add(4) = v_hi;
+                }
+            }
+            (Some(vp), 2) => {
+                let v = std::ptr::read_unaligned(src as *const u16);
+                let base = out.as_mut_ptr();
+                for i in 0..n {
+                    let pos = (i as u32) * row_stride + col_prefix + vp[i];
+                    std::ptr::write_unaligned(base.add(pos as usize) as *mut u16, v);
+                }
+            }
+            (Some(vp), 17) => {
+                let v_lo = std::ptr::read_unaligned(src as *const u128);
+                let v_hi = *src.add(16);
+                let base = out.as_mut_ptr();
+                for i in 0..n {
+                    let pos = (i as u32) * row_stride + col_prefix + vp[i];
+                    let dst = base.add(pos as usize);
+                    std::ptr::write_unaligned(dst as *mut u128, v_lo);
+                    *dst.add(16) = v_hi;
+                }
+            }
+            (Some(vp), _) => {
+                let base = out.as_mut_ptr();
+                for i in 0..n {
+                    let pos = (i as u32) * row_stride + col_prefix + vp[i];
+                    std::ptr::copy_nonoverlapping(src, base.add(pos as usize), len);
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
 /// Dispatch a single column's encoding into the shared `out` buffer.
 ///
 /// For PR 1 this is just the canonicalize-then-`codec::field_encode` fallback path.

From f3c7593f2f5d1f19eaf1dc44a0d15f63cd5c5271 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:24:16 +0000
Subject: [PATCH 18/26] RowSizeKernel and RowEncodeKernel dispatch helpers

Wire per-encoding fast-path traits into `dispatch_size` and
`dispatch_encode`. Both helpers now try the in-crate downcast arms
(Constant, Dict, Patched) before falling back to canonicalization.

This commit adds stub impls returning `Ok(None)` so the existing
behavior is preserved bit-for-bit; subsequent commits replace each
stub with its real impl. Keeping the wiring change separate from the
algorithm work makes the kernel impl commits trivially reviewable in
isolation (they only touch one file each).

The kernel module is `mod kernels` (crate-private) so the impls
satisfy the orphan rule (trait defined in `vortex-row`, types from
`vortex-array`) without leaking the impls into the crate's public
surface.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/public-api.lock         | 48 ++++++++++++++++++++++++++++++
 vortex-row/src/encode.rs           | 23 ++++++++++++--
 vortex-row/src/kernels/constant.rs | 40 +++++++++++++++++++++++++
 vortex-row/src/kernels/dict.rs     | 40 +++++++++++++++++++++++++
 vortex-row/src/kernels/mod.rs      | 17 +++++++++++
 vortex-row/src/kernels/patched.rs  | 40 +++++++++++++++++++++++++
 vortex-row/src/lib.rs              |  1 +
 vortex-row/src/size.rs             | 24 +++++++++++++--
 8 files changed, 227 insertions(+), 6 deletions(-)
 create mode 100644 vortex-row/src/kernels/constant.rs
 create mode 100644 vortex-row/src/kernels/dict.rs
 create mode 100644 vortex-row/src/kernels/mod.rs
 create mode 100644 vortex-row/src/kernels/patched.rs

diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock
index ed231a1e556..a7221cd91ee 100644
--- a/vortex-row/public-api.lock
+++ b/vortex-row/public-api.lock
@@ -146,6 +146,18 @@ pub trait vortex_row::encode::RowEncodeKernel: vortex_array::array::vtable::VTab
 
 pub fn vortex_row::encode::RowEncodeKernel::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
 
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::constant::vtable::Constant
+
+pub fn vortex_array::arrays::constant::vtable::Constant::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::dict::vtable::Dict
+
+pub fn vortex_array::arrays::dict::vtable::Dict::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::patched::vtable::Patched
+
+pub fn vortex_array::arrays::patched::vtable::Patched::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
 pub fn vortex_row::encode::dispatch_encode(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
 
 pub mod vortex_row::options
@@ -266,6 +278,18 @@ pub trait vortex_row::size::RowSizeKernel: vortex_array::array::vtable::VTable
 
 pub fn vortex_row::size::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
 
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::constant::vtable::Constant
+
+pub fn vortex_array::arrays::constant::vtable::Constant::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::dict::vtable::Dict
+
+pub fn vortex_array::arrays::dict::vtable::Dict::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::patched::vtable::Patched
+
+pub fn vortex_array::arrays::patched::vtable::Patched::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
 pub fn vortex_row::size::dispatch_size(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
 
 pub struct vortex_row::RowEncode
@@ -412,10 +436,34 @@ pub trait vortex_row::RowEncodeKernel: vortex_array::array::vtable::VTable
 
 pub fn vortex_row::RowEncodeKernel::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
 
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::constant::vtable::Constant
+
+pub fn vortex_array::arrays::constant::vtable::Constant::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::dict::vtable::Dict
+
+pub fn vortex_array::arrays::dict::vtable::Dict::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::encode::RowEncodeKernel for vortex_array::arrays::patched::vtable::Patched
+
+pub fn vortex_array::arrays::patched::vtable::Patched::row_encode_into(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
 pub trait vortex_row::RowSizeKernel: vortex_array::array::vtable::VTable
 
 pub fn vortex_row::RowSizeKernel::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
 
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::constant::vtable::Constant
+
+pub fn vortex_array::arrays::constant::vtable::Constant::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::dict::vtable::Dict
+
+pub fn vortex_array::arrays::dict::vtable::Dict::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+impl vortex_row::size::RowSizeKernel for vortex_array::arrays::patched::vtable::Patched
+
+pub fn vortex_array::arrays::patched::vtable::Patched::row_size_contribution(vortex_array::array::view::ArrayView<'_, Self>, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
 pub fn vortex_row::compute_row_sizes(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
 
 pub fn vortex_row::convert_columns(&[vortex_array::array::erased::ArrayRef], &[vortex_row::options::SortField], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::arrays::listview::vtable::ListViewArray>
diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index 1ca24ec378c..7510b78bc9f 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -25,6 +25,8 @@ use vortex_array::arrays::Constant;
 use vortex_array::arrays::ListViewArray;
 use vortex_array::arrays::Primitive;
 use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::dict::Dict;
+use vortex_array::arrays::patched::Patched;
 use vortex_array::dtype::DType;
 use vortex_array::dtype::Nullability;
 use vortex_array::dtype::PType;
@@ -462,9 +464,9 @@ fn encode_constant_arith(
 
 /// Dispatch a single column's encoding into the shared `out` buffer.
 ///
-/// For PR 1 this is just the canonicalize-then-`codec::field_encode` fallback path.
-/// In-crate fast paths for `Constant`/`Dict`/`Patched` and the inventory-based registry
-/// for downstream encodings are added in PR 3.
+/// Tries the in-crate per-encoding fast paths first, then falls back to canonicalization.
+/// Per-encoding kernels currently return `Ok(None)` (stubs added alongside the trait); the
+/// real impls land in follow-up commits. The downstream-encoding registry is added next.
 pub fn dispatch_encode(
     col: &ArrayRef,
     field: SortField,
@@ -473,6 +475,21 @@ pub fn dispatch_encode(
     out: &mut [u8],
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
+    if let Some(view) = col.as_opt::<Constant>()
+        && Constant::row_encode_into(view, field, offsets, cursors, out, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    if let Some(view) = col.as_opt::<Dict>()
+        && Dict::row_encode_into(view, field, offsets, cursors, out, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    if let Some(view) = col.as_opt::<Patched>()
+        && Patched::row_encode_into(view, field, offsets, cursors, out, ctx)?.is_some()
+    {
+        return Ok(());
+    }
     let canonical = col.clone().execute::<Canonical>(ctx)?;
     codec::field_encode(&canonical, field, offsets, cursors, out, ctx)
 }
diff --git a/vortex-row/src/kernels/constant.rs b/vortex-row/src/kernels/constant.rs
new file mode 100644
index 00000000000..51d54fbf123
--- /dev/null
+++ b/vortex-row/src/kernels/constant.rs
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernels for `ConstantArray`.
+//!
+//! Stubs in this commit return `Ok(None)` so the dispatch loop falls back to
+//! canonicalization. The real impls land in a follow-up commit.
+
+use vortex_array::ArrayView;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::Constant;
+use vortex_error::VortexResult;
+
+use crate::encode::RowEncodeKernel;
+use crate::options::SortField;
+use crate::size::RowSizeKernel;
+
+impl RowSizeKernel for Constant {
+    fn row_size_contribution(
+        _column: ArrayView<'_, Self>,
+        _field: SortField,
+        _sizes: &mut [u32],
+        _ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        Ok(None)
+    }
+}
+
+impl RowEncodeKernel for Constant {
+    fn row_encode_into(
+        _column: ArrayView<'_, Self>,
+        _field: SortField,
+        _offsets: &[u32],
+        _cursors: &mut [u32],
+        _out: &mut [u8],
+        _ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        Ok(None)
+    }
+}
diff --git a/vortex-row/src/kernels/dict.rs b/vortex-row/src/kernels/dict.rs
new file mode 100644
index 00000000000..0f576f3372a
--- /dev/null
+++ b/vortex-row/src/kernels/dict.rs
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernels for `DictArray`.
+//!
+//! Stubs in this commit return `Ok(None)` so the dispatch loop falls back to
+//! canonicalization. The real impls land in a follow-up commit.
+
+use vortex_array::ArrayView;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::dict::Dict;
+use vortex_error::VortexResult;
+
+use crate::encode::RowEncodeKernel;
+use crate::options::SortField;
+use crate::size::RowSizeKernel;
+
+impl RowSizeKernel for Dict {
+    fn row_size_contribution(
+        _column: ArrayView<'_, Self>,
+        _field: SortField,
+        _sizes: &mut [u32],
+        _ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        Ok(None)
+    }
+}
+
+impl RowEncodeKernel for Dict {
+    fn row_encode_into(
+        _column: ArrayView<'_, Self>,
+        _field: SortField,
+        _offsets: &[u32],
+        _cursors: &mut [u32],
+        _out: &mut [u8],
+        _ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        Ok(None)
+    }
+}
diff --git a/vortex-row/src/kernels/mod.rs b/vortex-row/src/kernels/mod.rs
new file mode 100644
index 00000000000..492f619ef0d
--- /dev/null
+++ b/vortex-row/src/kernels/mod.rs
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Per-encoding fast-path implementations of [`RowSizeKernel`] and [`RowEncodeKernel`] for
+//! encodings defined in `vortex-array`.
+//!
+//! Each impl in this module lives here (rather than under the corresponding encoding's
+//! `compute` module in `vortex-array`) so the orphan rule is satisfied: the trait is
+//! defined in `vortex-row` and the impl is also in `vortex-row`, while the array type
+//! (`Constant`, `Dict`, `Patched`) remains in `vortex-array`.
+//!
+//! [`RowSizeKernel`]: crate::size::RowSizeKernel
+//! [`RowEncodeKernel`]: crate::encode::RowEncodeKernel
+
+mod constant;
+mod dict;
+mod patched;
diff --git a/vortex-row/src/kernels/patched.rs b/vortex-row/src/kernels/patched.rs
new file mode 100644
index 00000000000..2637116afb4
--- /dev/null
+++ b/vortex-row/src/kernels/patched.rs
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernels for `Patched`.
+//!
+//! Stubs in this commit return `Ok(None)` so the dispatch loop falls back to
+//! canonicalization. The real impls land in a follow-up commit.
+
+use vortex_array::ArrayView;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::patched::Patched;
+use vortex_error::VortexResult;
+
+use crate::encode::RowEncodeKernel;
+use crate::options::SortField;
+use crate::size::RowSizeKernel;
+
+impl RowSizeKernel for Patched {
+    fn row_size_contribution(
+        _column: ArrayView<'_, Self>,
+        _field: SortField,
+        _sizes: &mut [u32],
+        _ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        Ok(None)
+    }
+}
+
+impl RowEncodeKernel for Patched {
+    fn row_encode_into(
+        _column: ArrayView<'_, Self>,
+        _field: SortField,
+        _offsets: &[u32],
+        _cursors: &mut [u32],
+        _out: &mut [u8],
+        _ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<()>> {
+        Ok(None)
+    }
+}
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
index fddcca665c1..2896ae05acf 100644
--- a/vortex-row/src/lib.rs
+++ b/vortex-row/src/lib.rs
@@ -28,6 +28,7 @@
 pub mod codec;
 pub mod convert;
 pub mod encode;
+mod kernels;
 pub mod options;
 pub mod size;
 
diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
index bfe5f647dc5..8fb1bdbcf08 100644
--- a/vortex-row/src/size.rs
+++ b/vortex-row/src/size.rs
@@ -11,9 +11,12 @@ use vortex_array::Canonical;
 use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
 use vortex_array::VTable;
+use vortex_array::arrays::Constant;
 use vortex_array::arrays::ConstantArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::StructArray;
+use vortex_array::arrays::dict::Dict;
+use vortex_array::arrays::patched::Patched;
 use vortex_array::dtype::DType;
 use vortex_array::dtype::FieldName;
 use vortex_array::dtype::FieldNames;
@@ -256,15 +259,30 @@ impl ScalarFnVTable for RowSize {
 
 /// Dispatch a single column's per-row size contribution.
 ///
-/// For PR 1 this is just the canonicalize-then-`codec::field_size` fallback path. In-crate
-/// fast paths for `Constant`/`Dict`/`Patched` and the inventory-based registry for
-/// downstream encodings are added in PR 3.
+/// Tries the in-crate per-encoding fast paths first, then falls back to canonicalization.
+/// Per-encoding kernels currently return `Ok(None)` (stubs added alongside the trait); the
+/// real impls land in follow-up commits. The downstream-encoding registry is added next.
 pub fn dispatch_size(
     col: &ArrayRef,
     field: SortField,
     sizes: &mut [u32],
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<()> {
+    if let Some(view) = col.as_opt::<Constant>()
+        && Constant::row_size_contribution(view, field, sizes, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    if let Some(view) = col.as_opt::<Dict>()
+        && Dict::row_size_contribution(view, field, sizes, ctx)?.is_some()
+    {
+        return Ok(());
+    }
+    if let Some(view) = col.as_opt::<Patched>()
+        && Patched::row_size_contribution(view, field, sizes, ctx)?.is_some()
+    {
+        return Ok(());
+    }
     let canonical = col.clone().execute::<Canonical>(ctx)?;
     codec::field_size(&canonical, field, sizes, ctx)
 }

From c0b24adce43ab157eb014ec31a41174819917720 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:26:08 +0000
Subject: [PATCH 19/26] Inventory-based registry for downstream encoding
 kernels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Encodings that live outside `vortex-array` (e.g. RunEnd, BitPacked, FoR,
Delta) can't be downcast from inside the variadic dispatch loops -
vortex-array doesn't know about them, and reversing the dependency
would create a cycle.

Add a `RowEncodeRegistration` that downstream crates submit via the
inventory crate. `lookup(&array_id)` lazily builds an `ArrayId → (size,
encode)` HashMap on first call, behind a `OnceLock` so the build is
single-threaded and the lookups are wait-free thereafter.

Wire the lookup into `dispatch_size` / `dispatch_encode` after the
in-crate downcast attempts: in-crate kernels take precedence (constant-
time downcast), then downstream registrations (HashMap lookup), then
the canonicalization fallback.

Signed-off-by: Claude <noreply@anthropic.com>
---
 Cargo.lock                 |  2 ++
 vortex-row/Cargo.toml      |  2 ++
 vortex-row/public-api.lock | 28 +++++++++++++++++
 vortex-row/src/encode.rs   |  6 ++++
 vortex-row/src/lib.rs      |  2 ++
 vortex-row/src/registry.rs | 63 ++++++++++++++++++++++++++++++++++++++
 vortex-row/src/size.rs     |  6 ++++
 7 files changed, 109 insertions(+)
 create mode 100644 vortex-row/src/registry.rs

diff --git a/Cargo.lock b/Cargo.lock
index 23ca027d592..86cec8f1247 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11043,6 +11043,7 @@ dependencies = [
  "arrow-schema 58.2.0",
  "bytes",
  "codspeed-divan-compat",
+ "inventory",
  "mimalloc",
  "rand 0.10.1",
  "rstest",
@@ -11052,6 +11053,7 @@ dependencies = [
  "vortex-error",
  "vortex-mask",
  "vortex-session",
+ "vortex-utils",
 ]
 
 [[package]]
diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml
index 50d6547474a..947b1df47da 100644
--- a/vortex-row/Cargo.toml
+++ b/vortex-row/Cargo.toml
@@ -18,12 +18,14 @@ workspace = true
 
 [dependencies]
 bytes = { workspace = true }
+inventory = { workspace = true }
 smallvec = { workspace = true }
 vortex-array = { workspace = true }
 vortex-buffer = { workspace = true }
 vortex-error = { workspace = true }
 vortex-mask = { workspace = true }
 vortex-session = { workspace = true }
+vortex-utils = { workspace = true, features = ["dyn-traits"] }
 
 [dev-dependencies]
 arrow-array = { workspace = true }
diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock
index a7221cd91ee..bded46f3c30 100644
--- a/vortex-row/public-api.lock
+++ b/vortex-row/public-api.lock
@@ -240,6 +240,24 @@ impl core::marker::StructuralPartialEq for vortex_row::options::SortField
 
 pub const vortex_row::options::FIELDS_INLINE: usize
 
+pub mod vortex_row::registry
+
+pub struct vortex_row::registry::RowEncodeRegistration
+
+pub vortex_row::registry::RowEncodeRegistration::encode: vortex_row::registry::DynEncodeFn
+
+pub vortex_row::registry::RowEncodeRegistration::id: fn() -> vortex_array::array::ArrayId
+
+pub vortex_row::registry::RowEncodeRegistration::size: vortex_row::registry::DynSizeFn
+
+impl inventory::Collect for vortex_row::registry::RowEncodeRegistration
+
+pub fn vortex_row::registry::lookup(&vortex_array::array::ArrayId) -> core::option::Option<(vortex_row::registry::DynSizeFn, vortex_row::registry::DynEncodeFn)>
+
+pub type vortex_row::registry::DynEncodeFn = fn(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
+pub type vortex_row::registry::DynSizeFn = fn(&vortex_array::array::erased::ArrayRef, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<()>>
+
 pub mod vortex_row::size
 
 pub struct vortex_row::size::RowSize
@@ -356,6 +374,16 @@ pub fn vortex_row::options::RowEncodeOptions::hash<__H: core::hash::Hasher>(&sel
 
 impl core::marker::StructuralPartialEq for vortex_row::options::RowEncodeOptions
 
+pub struct vortex_row::RowEncodeRegistration
+
+pub vortex_row::RowEncodeRegistration::encode: vortex_row::registry::DynEncodeFn
+
+pub vortex_row::RowEncodeRegistration::id: fn() -> vortex_array::array::ArrayId
+
+pub vortex_row::RowEncodeRegistration::size: vortex_row::registry::DynSizeFn
+
+impl inventory::Collect for vortex_row::registry::RowEncodeRegistration
+
 pub struct vortex_row::RowSize
 
 impl core::clone::Clone for vortex_row::size::RowSize
diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index 7510b78bc9f..82bec7a0f47 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -48,6 +48,7 @@ use crate::options::RowEncodeOptions;
 use crate::options::SortField;
 use crate::options::deserialize_row_encode_options;
 use crate::options::serialize_row_encode_options;
+use crate::registry;
 use crate::size::ColKind;
 use crate::size::compute_sizes;
 
@@ -490,6 +491,11 @@ pub fn dispatch_encode(
     {
         return Ok(());
     }
+    if let Some((_, encode_fn)) = registry::lookup(&col.encoding_id())
+        && encode_fn(col, field, offsets, cursors, out, ctx)?.is_some()
+    {
+        return Ok(());
+    }
     let canonical = col.clone().execute::<Canonical>(ctx)?;
     codec::field_encode(&canonical, field, offsets, cursors, out, ctx)
 }
diff --git a/vortex-row/src/lib.rs b/vortex-row/src/lib.rs
index 2896ae05acf..bd583bf77ad 100644
--- a/vortex-row/src/lib.rs
+++ b/vortex-row/src/lib.rs
@@ -30,6 +30,7 @@ pub mod convert;
 pub mod encode;
 mod kernels;
 pub mod options;
+pub mod registry;
 pub mod size;
 
 #[cfg(test)]
@@ -41,6 +42,7 @@ pub use encode::RowEncode;
 pub use encode::RowEncodeKernel;
 pub use options::RowEncodeOptions;
 pub use options::SortField;
+pub use registry::RowEncodeRegistration;
 pub use size::RowSize;
 pub use size::RowSizeKernel;
 use vortex_array::scalar_fn::session::ScalarFnSessionExt;
diff --git a/vortex-row/src/registry.rs b/vortex-row/src/registry.rs
new file mode 100644
index 00000000000..d806d090e0a
--- /dev/null
+++ b/vortex-row/src/registry.rs
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Registry for per-encoding row-encode fast paths from downstream crates.
+//!
+//! Encodings that live outside `vortex-array` (such as `RunEnd` in `encodings/runend`) cannot
+//! be directly downcast from inside the variadic [`RowSize`] / [`RowEncode`] dispatch loops.
+//! Instead, they submit a [`RowEncodeRegistration`] via the `inventory` crate, and the
+//! dispatch loop looks them up by [`ArrayId`].
+//!
+//! [`RowSize`]: super::size::RowSize
+//! [`RowEncode`]: super::encode::RowEncode
+
+use std::sync::OnceLock;
+
+use vortex_array::ArrayId;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_error::VortexResult;
+use vortex_utils::aliases::hash_map::HashMap;
+
+use crate::options::SortField;
+
+/// Function pointer signature for an encoding's per-row size contribution.
+pub type DynSizeFn =
+    fn(&ArrayRef, SortField, &mut [u32], &mut ExecutionCtx) -> VortexResult<Option<()>>;
+
+/// Function pointer signature for an encoding's per-row byte encoding.
+pub type DynEncodeFn = fn(
+    &ArrayRef,
+    SortField,
+    &[u32],
+    &mut [u32],
+    &mut [u8],
+    &mut ExecutionCtx,
+) -> VortexResult<Option<()>>;
+
+/// A registration submitted by an encoding crate to plug into the row encoder.
+///
+/// Because [`ArrayId`] requires runtime string interning, the encoding id is passed as a
+/// function pointer that is called once at registry initialization time.
+pub struct RowEncodeRegistration {
+    /// Returns the [`ArrayId`] of the encoding this registration applies to.
+    pub id: fn() -> ArrayId,
+    /// Per-row size contribution function.
+    pub size: DynSizeFn,
+    /// Per-row encoding function.
+    pub encode: DynEncodeFn,
+}
+
+inventory::collect!(RowEncodeRegistration);
+
+/// Look up a (size, encode) pair for the given encoding id.
+pub fn lookup(id: &ArrayId) -> Option<(DynSizeFn, DynEncodeFn)> {
+    static MAP: OnceLock<HashMap<ArrayId, (DynSizeFn, DynEncodeFn)>> = OnceLock::new();
+    let map = MAP.get_or_init(|| {
+        inventory::iter::<RowEncodeRegistration>
+            .into_iter()
+            .map(|r| ((r.id)(), (r.size, r.encode)))
+            .collect()
+    });
+    map.get(id).copied()
+}
diff --git a/vortex-row/src/size.rs b/vortex-row/src/size.rs
index 8fb1bdbcf08..0b54f19de4e 100644
--- a/vortex-row/src/size.rs
+++ b/vortex-row/src/size.rs
@@ -42,6 +42,7 @@ use crate::options::RowEncodeOptions;
 use crate::options::SortField;
 use crate::options::deserialize_row_encode_options;
 use crate::options::serialize_row_encode_options;
+use crate::registry;
 
 /// Classification of a single input column for the size pass.
 ///
@@ -283,6 +284,11 @@ pub fn dispatch_size(
     {
         return Ok(());
     }
+    if let Some((size_fn, _)) = registry::lookup(&col.encoding_id())
+        && size_fn(col, field, sizes, ctx)?.is_some()
+    {
+        return Ok(());
+    }
     let canonical = col.clone().execute::<Canonical>(ctx)?;
     codec::field_size(&canonical, field, sizes, ctx)
 }

From 17eb2ca9c09ac018d6510ec79a2bac4698493f37 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:27:28 +0000
Subject: [PATCH 20/26] Constant row-encode kernel

Replace the stub `RowSizeKernel` / `RowEncodeKernel` impls for
`ConstantArray` with real implementations that skip canonicalization.

The size pass adds the (constant) per-row scalar size to every entry of
the shared `sizes` slice. The encode pass encodes the scalar bytes once
into a small heap buffer, then `copy_nonoverlapping`s those bytes into
each row's slot. Per-row work is one `copy_nonoverlapping(N)` plus one
cursor add, where `N` is typically 9 (i64), 5 (i32), or 17 (i128).

Add a `constant_i64_*` bench triplet (arrow-row baseline, vortex with
kernel, vortex through canonicalization) and a
`constant_path_matches_canonical` test that round-trips bytes both
ways and asserts they're identical.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/benches/row_encode.rs   | 39 ++++++++++++++++++++++
 vortex-row/src/kernels/constant.rs | 53 ++++++++++++++++++++++--------
 vortex-row/src/tests.rs            | 17 ++++++++++
 3 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/vortex-row/benches/row_encode.rs b/vortex-row/benches/row_encode.rs
index 8d631d785da..cc7b787bcec 100644
--- a/vortex-row/benches/row_encode.rs
+++ b/vortex-row/benches/row_encode.rs
@@ -30,9 +30,11 @@ use rand::RngExt;
 use rand::SeedableRng;
 use rand::distr::Alphanumeric;
 use rand::rngs::StdRng;
+use vortex_array::Canonical;
 use vortex_array::IntoArray;
 use vortex_array::LEGACY_SESSION;
 use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::ConstantArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::StructArray;
 use vortex_array::arrays::VarBinViewArray;
@@ -175,3 +177,40 @@ fn struct_mixed_vortex(bencher: divan::Bencher) {
         convert_columns(&[struct_arr.clone()], &[SortField::default()], &mut ctx).unwrap()
     })
 }
+
+// ---------- constant_i64 ----------
+
+#[divan::bench]
+fn constant_i64_arrow_row(bencher: divan::Bencher) {
+    let arr = Arc::new(Int64Array::from(vec![42i64; N])) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int64)]).unwrap();
+    let total = (N * (1 + 8)) as u64;
+    bencher
+        .counter(BytesCount::new(total))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn constant_i64_vortex_with_kernel(bencher: divan::Bencher) {
+    let arr = ConstantArray::new(42i64, N).into_array();
+    let total = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[arr.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn constant_i64_vortex_without_kernel(bencher: divan::Bencher) {
+    let arr = ConstantArray::new(42i64, N).into_array();
+    let total = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = arr
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
diff --git a/vortex-row/src/kernels/constant.rs b/vortex-row/src/kernels/constant.rs
index 51d54fbf123..2c8a87b5ffc 100644
--- a/vortex-row/src/kernels/constant.rs
+++ b/vortex-row/src/kernels/constant.rs
@@ -2,39 +2,66 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 //! Row-encode kernels for `ConstantArray`.
-//!
-//! Stubs in this commit return `Ok(None)` so the dispatch loop falls back to
-//! canonicalization. The real impls land in a follow-up commit.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32"
+)]
 
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
 use vortex_array::arrays::Constant;
 use vortex_error::VortexResult;
 
+use crate::codec;
 use crate::encode::RowEncodeKernel;
 use crate::options::SortField;
 use crate::size::RowSizeKernel;
 
 impl RowSizeKernel for Constant {
     fn row_size_contribution(
-        _column: ArrayView<'_, Self>,
-        _field: SortField,
-        _sizes: &mut [u32],
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        sizes: &mut [u32],
         _ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<()>> {
-        Ok(None)
+        let add = codec::encoded_size_for_scalar(column.scalar(), field)?;
+        for s in sizes.iter_mut().take(column.len()) {
+            *s += add;
+        }
+        Ok(Some(()))
     }
 }
 
 impl RowEncodeKernel for Constant {
     fn row_encode_into(
-        _column: ArrayView<'_, Self>,
-        _field: SortField,
-        _offsets: &[u32],
-        _cursors: &mut [u32],
-        _out: &mut [u8],
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        offsets: &[u32],
+        cursors: &mut [u32],
+        out: &mut [u8],
         _ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<()>> {
-        Ok(None)
+        let bytes = codec::encode_scalar(column.scalar(), field)?;
+        let len = bytes.len();
+        let len_u32 = len as u32;
+        let n = column.len();
+        if len == 0 {
+            return Ok(Some(()));
+        }
+        // SAFETY: bytes is len bytes; offsets[i] + cursors[i] + len <= out.len() by
+        // construction of the buffer (the size pass already accounted for this column's
+        // contribution). copy_nonoverlapping elides the bounds check + slice creation
+        // that copy_from_slice would do per row.
+        unsafe {
+            let src = bytes.as_ptr();
+            let out_ptr = out.as_mut_ptr();
+            for i in 0..n {
+                let pos = (offsets[i] + cursors[i]) as usize;
+                std::ptr::copy_nonoverlapping(src, out_ptr.add(pos), len);
+                cursors[i] += len_u32;
+            }
+        }
+        Ok(Some(()))
     }
 }
diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs
index ff7d8fb274a..052ddf8ea46 100644
--- a/vortex-row/src/tests.rs
+++ b/vortex-row/src/tests.rs
@@ -15,6 +15,7 @@ use vortex_array::IntoArray;
 use vortex_array::LEGACY_SESSION;
 use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ConstantArray;
 use vortex_array::arrays::ListViewArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::VarBinViewArray;
@@ -222,6 +223,22 @@ fn nulls_first_and_last() -> VortexResult<()> {
     Ok(())
 }
 
+#[test]
+fn constant_path_matches_canonical() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let nrows = 8usize;
+    let const_arr = ConstantArray::new(42i64, nrows).into_array();
+    let canonical = PrimitiveArray::from_iter(vec![42i64; nrows]).into_array();
+
+    let from_const = convert_columns(&[const_arr], &[SortField::default()], &mut ctx)?;
+    let from_canon = convert_columns(&[canonical], &[SortField::default()], &mut ctx)?;
+    assert_eq!(
+        collect_row_bytes(&from_const),
+        collect_row_bytes(&from_canon)
+    );
+    Ok(())
+}
+
 #[test]
 fn struct_sort_order() -> VortexResult<()> {
     use vortex_array::arrays::StructArray;

From 62e41b4ddfb8d08b2284c4b5a33aa56f4c90178f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:29:07 +0000
Subject: [PATCH 21/26] Dict row-encode kernel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the stub `RowSizeKernel` / `RowEncodeKernel` impls for `Dict`
with real implementations that skip canonicalization.

Strategy: encode each unique value once into a small per-value buffer,
then materialize the per-row contribution by indexing into the buffer
via the codes array. Per-row cost becomes one `copy_from_slice` of the
value's encoded bytes rather than re-encoding from scratch. Amortizes
the encode work over the dictionary's cardinality instead of the row
count.

When values.len() > codes.len() the kernel declines (the canonical path
is at least as fast because each value would be touched ≤ 1 time).

`add_codes_sizes::<T>` has a u8 fast-path that reads the codes as a raw
`&[u8]` slice to elide TryInto overhead.

Includes `dict_utf8_*` bench triplet (arrow-row baseline, vortex with
kernel, vortex through canonicalization) and a
`dict_path_matches_canonical` round-trip test.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/benches/row_encode.rs |  75 ++++++++++++++++
 vortex-row/src/kernels/dict.rs   | 148 ++++++++++++++++++++++++++++---
 vortex-row/src/tests.rs          |  23 +++++
 3 files changed, 232 insertions(+), 14 deletions(-)

diff --git a/vortex-row/benches/row_encode.rs b/vortex-row/benches/row_encode.rs
index cc7b787bcec..7ceec508282 100644
--- a/vortex-row/benches/row_encode.rs
+++ b/vortex-row/benches/row_encode.rs
@@ -17,9 +17,12 @@
 
 use std::sync::Arc;
 
+use arrow_array::DictionaryArray;
 use arrow_array::Int64Array;
+use arrow_array::PrimitiveArray as ArrowPrimitiveArray;
 use arrow_array::StringArray;
 use arrow_array::StructArray as ArrowStructArray;
+use arrow_array::types::Int32Type;
 use arrow_row::RowConverter;
 use arrow_row::SortField as ArrowSortField;
 use arrow_schema::DataType;
@@ -38,6 +41,7 @@ use vortex_array::arrays::ConstantArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::StructArray;
 use vortex_array::arrays::VarBinViewArray;
+use vortex_array::builders::dict::dict_encode;
 use vortex_row::SortField;
 use vortex_row::convert_columns;
 
@@ -214,3 +218,74 @@ fn constant_i64_vortex_without_kernel(bencher: divan::Bencher) {
         convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
     })
 }
+
+// ---------- dict_utf8 ----------
+
+fn dict_utf8_inputs() -> (Vec<String>, Vec<String>, Vec<i32>, u64) {
+    let n_unique = 1024usize;
+    let unique = gen_words(n_unique, 16, 13);
+    let mut rng = StdRng::seed_from_u64(17);
+    let codes: Vec<i32> = (0..N)
+        .map(|_| rng.random_range(0..n_unique) as i32)
+        .collect();
+    let strings: Vec<String> = codes.iter().map(|&c| unique[c as usize].clone()).collect();
+    let bytes: u64 = strings
+        .iter()
+        .map(|w| 1 + (w.len().div_ceil(32) * 33) as u64)
+        .sum();
+    (unique, strings, codes, bytes)
+}
+
+#[divan::bench]
+fn dict_utf8_arrow_dict(bencher: divan::Bencher) {
+    let (unique, _, codes, total) = dict_utf8_inputs();
+    let values: Arc<dyn arrow_array::Array> = Arc::new(StringArray::from(unique.clone()));
+    let dict_arr: DictionaryArray<Int32Type> =
+        DictionaryArray::new(ArrowPrimitiveArray::from(codes), values);
+    let arr = Arc::new(dict_arr) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Dictionary(
+        Box::new(DataType::Int32),
+        Box::new(DataType::Utf8),
+    ))])
+    .unwrap();
+    bencher
+        .counter(BytesCount::new(total))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn dict_utf8_arrow_canonical(bencher: divan::Bencher) {
+    let (_, strings, _, total) = dict_utf8_inputs();
+    let arr = Arc::new(StringArray::from(strings.clone())) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Utf8)]).unwrap();
+    bencher
+        .counter(BytesCount::new(total))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn dict_utf8_vortex_with_kernel(bencher: divan::Bencher) {
+    let (_, strings, _, total) = dict_utf8_inputs();
+    let raw = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array();
+    let dict = dict_encode(&raw).unwrap().into_array();
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[dict.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn dict_utf8_vortex_without_kernel(bencher: divan::Bencher) {
+    let (_, strings, _, total) = dict_utf8_inputs();
+    let raw = VarBinViewArray::from_iter_str(strings.iter().map(String::as_str)).into_array();
+    let dict = dict_encode(&raw).unwrap().into_array();
+    bencher.counter(BytesCount::new(total)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = dict
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
diff --git a/vortex-row/src/kernels/dict.rs b/vortex-row/src/kernels/dict.rs
index 0f576f3372a..79928b2c421 100644
--- a/vortex-row/src/kernels/dict.rs
+++ b/vortex-row/src/kernels/dict.rs
@@ -3,38 +3,158 @@
 
 //! Row-encode kernels for `DictArray`.
 //!
-//! Stubs in this commit return `Ok(None)` so the dispatch loop falls back to
-//! canonicalization. The real impls land in a follow-up commit.
+//! These kernels skip canonicalization by encoding each *unique value* once into a small
+//! per-value buffer keyed by code, then materializing the per-row contribution via the codes
+//! array. The per-unique-value cost is amortized over the dictionary cardinality rather than
+//! the row count.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers; codes are non-negative indices into the values array"
+)]
 
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
+use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::dict::Dict;
+use vortex_array::arrays::dict::DictArraySlotsExt;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::PType;
+use vortex_array::match_each_integer_ptype;
 use vortex_error::VortexResult;
 
 use crate::encode::RowEncodeKernel;
+use crate::encode::dispatch_encode;
 use crate::options::SortField;
 use crate::size::RowSizeKernel;
+use crate::size::dispatch_size;
 
 impl RowSizeKernel for Dict {
     fn row_size_contribution(
-        _column: ArrayView<'_, Self>,
-        _field: SortField,
-        _sizes: &mut [u32],
-        _ctx: &mut ExecutionCtx,
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        sizes: &mut [u32],
+        ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<()>> {
-        Ok(None)
+        if column.values().len() > column.codes().len() {
+            return Ok(None);
+        }
+        let n_values = column.values().len();
+        let mut value_sizes = vec![0u32; n_values];
+        dispatch_size(column.values(), field, &mut value_sizes, ctx)?;
+
+        let codes_prim = column.codes().clone().execute::<PrimitiveArray>(ctx)?;
+        let ptype = codes_prim.ptype();
+        match_each_integer_ptype!(ptype, |T| {
+            add_codes_sizes::<T>(&codes_prim, &value_sizes, sizes);
+        });
+        Ok(Some(()))
     }
 }
 
 impl RowEncodeKernel for Dict {
     fn row_encode_into(
-        _column: ArrayView<'_, Self>,
-        _field: SortField,
-        _offsets: &[u32],
-        _cursors: &mut [u32],
-        _out: &mut [u8],
-        _ctx: &mut ExecutionCtx,
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        offsets: &[u32],
+        cursors: &mut [u32],
+        out: &mut [u8],
+        ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<()>> {
-        Ok(None)
+        if column.values().len() > column.codes().len() {
+            return Ok(None);
+        }
+
+        let n_values = column.values().len();
+        let mut value_sizes = vec![0u32; n_values];
+        dispatch_size(column.values(), field, &mut value_sizes, ctx)?;
+
+        // Build per-value offsets and a small contiguous per-value encoded buffer.
+        let mut value_offsets = vec![0u32; n_values + 1];
+        let mut total: u64 = 0;
+        for i in 0..n_values {
+            value_offsets[i] = total as u32;
+            total += u64::from(value_sizes[i]);
+        }
+        value_offsets[n_values] = total as u32;
+
+        let mut value_buf = vec![0u8; total as usize];
+        // Inner dispatch uses zero base offsets (small buffer) with per-value start cursors.
+        let zero_offsets = vec![0u32; n_values];
+        let mut inner_cursors = value_offsets[..n_values].to_vec();
+        dispatch_encode(
+            column.values(),
+            field,
+            &zero_offsets,
+            &mut inner_cursors,
+            &mut value_buf,
+            ctx,
+        )?;
+
+        let codes_prim = column.codes().clone().execute::<PrimitiveArray>(ctx)?;
+        let ptype = codes_prim.ptype();
+        match_each_integer_ptype!(ptype, |T| {
+            copy_codes::<T>(
+                &codes_prim,
+                &value_buf,
+                &value_offsets,
+                &value_sizes,
+                offsets,
+                cursors,
+                out,
+            );
+        });
+        Ok(Some(()))
+    }
+}
+
+#[inline]
+fn add_codes_sizes<T>(codes: &PrimitiveArray, value_sizes: &[u32], sizes: &mut [u32])
+where
+    T: NativePType + Copy + TryInto<usize>,
+{
+    let slice: &[T] = codes.as_slice();
+    debug_assert_eq!(slice.len(), sizes.len());
+    if T::PTYPE == PType::U8 {
+        // SAFETY: T == u8
+        let raw = unsafe { std::slice::from_raw_parts(slice.as_ptr().cast::<u8>(), slice.len()) };
+        for (i, &c) in raw.iter().enumerate() {
+            sizes[i] += value_sizes[c as usize];
+        }
+        return;
+    }
+    for (i, &c) in slice.iter().enumerate() {
+        let idx: usize = c
+            .try_into()
+            .unwrap_or_else(|_| vortex_error::vortex_panic!("dict code does not fit in usize"));
+        sizes[i] += value_sizes[idx];
+    }
+}
+
+#[inline]
+#[allow(clippy::too_many_arguments)]
+fn copy_codes<T>(
+    codes: &PrimitiveArray,
+    value_buf: &[u8],
+    value_offsets: &[u32],
+    value_sizes: &[u32],
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+) where
+    T: NativePType + Copy + TryInto<usize>,
+{
+    let slice: &[T] = codes.as_slice();
+    debug_assert_eq!(slice.len(), cursors.len());
+    for (i, &c) in slice.iter().enumerate() {
+        let idx: usize = c
+            .try_into()
+            .unwrap_or_else(|_| vortex_error::vortex_panic!("dict code does not fit in usize"));
+        let v_start = value_offsets[idx] as usize;
+        let v_size = value_sizes[idx] as usize;
+        let dst = (offsets[i] + cursors[i]) as usize;
+        out[dst..dst + v_size].copy_from_slice(&value_buf[v_start..v_start + v_size]);
+        cursors[i] += v_size as u32;
     }
 }
diff --git a/vortex-row/src/tests.rs b/vortex-row/src/tests.rs
index 052ddf8ea46..33ecdc699f8 100644
--- a/vortex-row/src/tests.rs
+++ b/vortex-row/src/tests.rs
@@ -20,6 +20,9 @@ use vortex_array::arrays::ListViewArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::VarBinViewArray;
 use vortex_array::arrays::listview::ListViewArrayExt;
+use vortex_array::builders::dict::dict_encode;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::Nullability;
 use vortex_error::VortexResult;
 
 use crate::SortField;
@@ -223,6 +226,26 @@ fn nulls_first_and_last() -> VortexResult<()> {
     Ok(())
 }
 
+#[test]
+fn dict_path_matches_canonical() -> VortexResult<()> {
+    let mut ctx = LEGACY_SESSION.create_execution_ctx();
+    let raw = VarBinViewArray::from_iter(
+        vec![Some("a"), Some("bb"), Some("a"), Some("ccc"), Some("bb")],
+        DType::Utf8(Nullability::NonNullable),
+    )
+    .into_array();
+    let dict_arr = dict_encode(&raw)?.into_array();
+
+    let canonical_enc = convert_columns(&[raw], &[SortField::default()], &mut ctx)?;
+    let dict_enc = convert_columns(&[dict_arr], &[SortField::default()], &mut ctx)?;
+
+    assert_eq!(
+        collect_row_bytes(&canonical_enc),
+        collect_row_bytes(&dict_enc)
+    );
+    Ok(())
+}
+
 #[test]
 fn constant_path_matches_canonical() -> VortexResult<()> {
     let mut ctx = LEGACY_SESSION.create_execution_ctx();

From b86974ebe6907288c2a29d9c7bb307a44d972b0f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:30:36 +0000
Subject: [PATCH 22/26] Patched row-encode kernel

Replace the stub `RowSizeKernel` / `RowEncodeKernel` impls for
`Patched` with real implementations.

Size pass: per-row size matches the inner array exactly because
patches share the inner dtype. Just delegate to `dispatch_size` on the
inner array.

Encode pass: delegate to `dispatch_encode` on the inner array, then
walk the patch indices and overwrite each patched row's value bytes in
place. Patched arrays in our hot paths are Primitive-typed (BitPacked
with patches, etc.), so the kernel checks `DType::Primitive` upfront
and declines for anything else.

Pre-cursor snapshot is captured before the inner encoder advances
`cursors`, so the overlay knows each row's slot start position.

Adds `patched_i32_*` bench triplet. Patched-specific tests live next
to the kernel in `kernels/patched.rs::tests` (round-trip vs canonical,
both single-chunk and multi-chunk).

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/benches/row_encode.rs  |  85 ++++++++++
 vortex-row/src/kernels/patched.rs | 249 ++++++++++++++++++++++++++++--
 2 files changed, 320 insertions(+), 14 deletions(-)

diff --git a/vortex-row/benches/row_encode.rs b/vortex-row/benches/row_encode.rs
index 7ceec508282..aa4d5f2f675 100644
--- a/vortex-row/benches/row_encode.rs
+++ b/vortex-row/benches/row_encode.rs
@@ -18,6 +18,7 @@
 use std::sync::Arc;
 
 use arrow_array::DictionaryArray;
+use arrow_array::Int32Array;
 use arrow_array::Int64Array;
 use arrow_array::PrimitiveArray as ArrowPrimitiveArray;
 use arrow_array::StringArray;
@@ -38,10 +39,12 @@ use vortex_array::IntoArray;
 use vortex_array::LEGACY_SESSION;
 use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::Patched;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::StructArray;
 use vortex_array::arrays::VarBinViewArray;
 use vortex_array::builders::dict::dict_encode;
+use vortex_array::patches::Patches;
 use vortex_row::SortField;
 use vortex_row::convert_columns;
 
@@ -289,3 +292,85 @@ fn dict_utf8_vortex_without_kernel(bencher: divan::Bencher) {
         convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
     })
 }
+
+// ---------- patched_i32 ----------
+
+fn gen_patched_i32_inputs() -> (Vec<i32>, Vec<i32>, u64) {
+    let mut rng = StdRng::seed_from_u64(400);
+    // Inner is mostly zero, with random patches at ~5% of positions.
+    let mut inner = vec![0i32; N];
+    let mut values = Vec::new();
+    for slot in inner.iter_mut().take(N) {
+        if rng.random_range(0u32..100) < 5 {
+            let v = rng.random_range(1i32..1_000_000);
+            *slot = v;
+            values.push(v);
+        }
+    }
+    let bytes = (N * (1 + 4)) as u64;
+    (inner, values, bytes)
+}
+
+#[divan::bench]
+fn patched_i32_arrow_row(bencher: divan::Bencher) {
+    let (inner, _, bytes) = gen_patched_i32_inputs();
+    let arr = Arc::new(Int32Array::from(inner)) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int32)]).unwrap();
+    bencher
+        .counter(BytesCount::new(bytes))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+fn patched_i32_array() -> (vortex_array::ArrayRef, u64) {
+    let mut rng = StdRng::seed_from_u64(400);
+    let mut indices: Vec<u32> = Vec::new();
+    let mut values: Vec<i32> = Vec::new();
+    let mut inner = vec![0i32; N];
+    for i in 0..N {
+        if rng.random_range(0u32..100) < 5 {
+            let v = rng.random_range(1i32..1_000_000);
+            inner[i] = v;
+            indices.push(i as u32);
+            values.push(v);
+        }
+    }
+    let inner_arr = PrimitiveArray::from_iter(vec![0i32; N]).into_array();
+    let patches = Patches::new(
+        N,
+        0,
+        PrimitiveArray::from_iter(indices).into_array(),
+        PrimitiveArray::from_iter(values).into_array(),
+        None,
+    )
+    .unwrap();
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let patched = Patched::from_array_and_patches(inner_arr, &patches, &mut setup_ctx)
+        .unwrap()
+        .into_array();
+    drop(inner);
+    let bytes = (N * (1 + 4)) as u64;
+    (patched, bytes)
+}
+
+#[divan::bench]
+fn patched_i32_with_kernel(bencher: divan::Bencher) {
+    let (arr, bytes) = patched_i32_array();
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[arr.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn patched_i32_without_kernel(bencher: divan::Bencher) {
+    let (arr, bytes) = patched_i32_array();
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = arr
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
diff --git a/vortex-row/src/kernels/patched.rs b/vortex-row/src/kernels/patched.rs
index 2637116afb4..ecd5d65d556 100644
--- a/vortex-row/src/kernels/patched.rs
+++ b/vortex-row/src/kernels/patched.rs
@@ -3,38 +3,259 @@
 
 //! Row-encode kernels for `Patched`.
 //!
-//! Stubs in this commit return `Ok(None)` so the dispatch loop falls back to
-//! canonicalization. The real impls land in a follow-up commit.
+//! Row size is identical to the underlying `inner` array (patches don't change dtype). For
+//! row encoding, we first delegate to the inner array's row-encode path, then overlay each
+//! patched row's value directly into the output, overwriting the few bytes that the inner
+//! encoder wrote at that row's slot.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers; lengths are validated to fit in u32"
+)]
 
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
+use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::patched::Patched;
+use vortex_array::arrays::patched::PatchedArrayExt;
+use vortex_array::arrays::patched::PatchedArraySlotsExt;
+use vortex_array::dtype::DType;
+use vortex_array::match_each_native_ptype;
 use vortex_error::VortexResult;
 
+use crate::codec::RowEncode;
 use crate::encode::RowEncodeKernel;
+use crate::encode::dispatch_encode;
 use crate::options::SortField;
 use crate::size::RowSizeKernel;
+use crate::size::dispatch_size;
 
 impl RowSizeKernel for Patched {
     fn row_size_contribution(
-        _column: ArrayView<'_, Self>,
-        _field: SortField,
-        _sizes: &mut [u32],
-        _ctx: &mut ExecutionCtx,
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        sizes: &mut [u32],
+        ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<()>> {
-        Ok(None)
+        // Per-row size matches the inner array; patches share its dtype.
+        dispatch_size(column.inner(), field, sizes, ctx)?;
+        Ok(Some(()))
     }
 }
 
 impl RowEncodeKernel for Patched {
     fn row_encode_into(
-        _column: ArrayView<'_, Self>,
-        _field: SortField,
-        _offsets: &[u32],
-        _cursors: &mut [u32],
-        _out: &mut [u8],
-        _ctx: &mut ExecutionCtx,
+        column: ArrayView<'_, Self>,
+        field: SortField,
+        offsets: &[u32],
+        cursors: &mut [u32],
+        out: &mut [u8],
+        ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<()>> {
-        Ok(None)
+        let DType::Primitive(ptype, _) = *column.as_ref().dtype() else {
+            return Ok(None);
+        };
+        let value_bytes = ptype.byte_width();
+
+        // Snapshot per-row write start positions before the inner encoder advances cursors.
+        let pre_cursors: Vec<u32> = cursors.to_vec();
+        dispatch_encode(column.inner(), field, offsets, cursors, out, ctx)?;
+
+        overlay_patches(
+            column,
+            ptype,
+            value_bytes,
+            field,
+            offsets,
+            &pre_cursors,
+            out,
+            ctx,
+        )?;
+        Ok(Some(()))
+    }
+}
+
+/// Overlay patch values onto rows whose inner-encoded bytes need to be replaced.
+#[allow(clippy::too_many_arguments)]
+fn overlay_patches(
+    column: ArrayView<'_, Patched>,
+    ptype: vortex_array::dtype::PType,
+    value_bytes: usize,
+    field: SortField,
+    offsets: &[u32],
+    pre_cursors: &[u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()> {
+    let patch_indices: PrimitiveArray = column
+        .patch_indices()
+        .clone()
+        .execute::<PrimitiveArray>(ctx)?;
+    if patch_indices.is_empty() {
+        return Ok(());
+    }
+    let patch_values: PrimitiveArray = column
+        .patch_values()
+        .clone()
+        .execute::<PrimitiveArray>(ctx)?;
+    let lane_offsets: PrimitiveArray = column
+        .lane_offsets()
+        .clone()
+        .execute::<PrimitiveArray>(ctx)?;
+    let patch_indices_slice: &[u16] = patch_indices.as_slice();
+    let lane_offsets_slice: &[u32] = lane_offsets.as_slice();
+    let n_lanes = column.n_lanes();
+    let patched_offset = column.offset();
+    let array_len = column.as_ref().len();
+    let n_chunks = (array_len + patched_offset).div_ceil(1024);
+    let non_null = field.non_null_sentinel();
+    let descending = field.descending;
+
+    match_each_native_ptype!(ptype, |T| {
+        let values_slice: &[T] = patch_values.as_slice();
+        overlay_chunks::<T>(
+            values_slice,
+            patch_indices_slice,
+            lane_offsets_slice,
+            n_lanes,
+            patched_offset,
+            array_len,
+            n_chunks,
+            offsets,
+            pre_cursors,
+            out,
+            value_bytes,
+            non_null,
+            descending,
+        );
+    });
+    Ok(())
+}
+
+#[allow(clippy::too_many_arguments)]
+fn overlay_chunks<T: Copy + RowEncode>(
+    values_slice: &[T],
+    patch_indices_slice: &[u16],
+    lane_offsets_slice: &[u32],
+    n_lanes: usize,
+    patched_offset: usize,
+    array_len: usize,
+    n_chunks: usize,
+    offsets: &[u32],
+    pre_cursors: &[u32],
+    out: &mut [u8],
+    value_bytes: usize,
+    non_null: u8,
+    descending: bool,
+) {
+    for chunk in 0..n_chunks {
+        for lane in 0..n_lanes {
+            let slot = chunk * n_lanes + lane;
+            if slot + 1 >= lane_offsets_slice.len() {
+                break;
+            }
+            let start = lane_offsets_slice[slot] as usize;
+            let stop = lane_offsets_slice[slot + 1] as usize;
+            for k in start..stop {
+                let chunk_local = patch_indices_slice[k] as usize;
+                let logical_idx = chunk * 1024 + chunk_local;
+                if logical_idx < patched_offset {
+                    continue;
+                }
+                let row = logical_idx - patched_offset;
+                if row >= array_len {
+                    continue;
+                }
+                let slot_start = (offsets[row] + pre_cursors[row]) as usize;
+                out[slot_start] = non_null;
+                values_slice[k].encode_to(
+                    &mut out[slot_start + 1..slot_start + 1 + value_bytes],
+                    descending,
+                );
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::ListViewArray;
+    use vortex_array::arrays::Patched;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::listview::ListViewArrayExt;
+    use vortex_array::patches::Patches;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+
+    use crate::SortField;
+    use crate::convert_columns;
+
+    fn collect_rows(arr: &ListViewArray) -> Vec<Vec<u8>> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let n = arr.len();
+        (0..n)
+            .map(|i| {
+                let slice = arr.list_elements_at(i).unwrap();
+                let p = slice.execute::<PrimitiveArray>(&mut ctx).unwrap();
+                p.as_slice::<u8>().to_vec()
+            })
+            .collect()
+    }
+
+    #[test]
+    fn patched_row_encode_matches_canonical() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let inner = buffer![0u32; 32].into_array();
+        let patches = Patches::new(
+            32,
+            0,
+            buffer![1u32, 2, 3].into_array(),
+            buffer![100u32, 200, 300].into_array(),
+            None,
+        )?;
+        let patched = Patched::from_array_and_patches(inner, &patches, &mut ctx)?.into_array();
+
+        let mut canonical_vals = vec![0u32; 32];
+        canonical_vals[1] = 100;
+        canonical_vals[2] = 200;
+        canonical_vals[3] = 300;
+        let canonical = PrimitiveArray::from_iter(canonical_vals).into_array();
+
+        let by_canonical = convert_columns(&[canonical], &[SortField::default()], &mut ctx)?;
+        let by_patched = convert_columns(&[patched], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_canonical), collect_rows(&by_patched));
+        Ok(())
+    }
+
+    #[test]
+    fn patched_row_encode_multi_chunk() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let n: usize = 4096;
+        let inner = PrimitiveArray::from_iter(vec![0u32; n]).into_array();
+        let indices: Vec<u32> = (0..n as u32).step_by(503).collect();
+        let values: Vec<u32> = indices.iter().map(|i| i + 1000).collect();
+        let patches = Patches::new(
+            n,
+            0,
+            PrimitiveArray::from_iter(indices.clone()).into_array(),
+            PrimitiveArray::from_iter(values.clone()).into_array(),
+            None,
+        )?;
+        let patched = Patched::from_array_and_patches(inner, &patches, &mut ctx)?.into_array();
+
+        let mut canonical_vals = vec![0u32; n];
+        for (idx, &i) in indices.iter().enumerate() {
+            canonical_vals[i as usize] = values[idx];
+        }
+        let canonical = PrimitiveArray::from_iter(canonical_vals).into_array();
+
+        let by_canonical = convert_columns(&[canonical], &[SortField::default()], &mut ctx)?;
+        let by_patched = convert_columns(&[patched], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_canonical), collect_rows(&by_patched));
+        Ok(())
     }
 }

From d8b9bce95600f8049461ee0ca4da153bca971726 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:31:56 +0000
Subject: [PATCH 23/26] RunEnd row-encode kernel (vortex-runend)

Add a row-encode kernel for `RunEnd` arrays via the inventory-based
registry: the encoding lives in `vortex-runend` which depends on
`vortex-array` (not the other way around), so a direct downcast inside
`dispatch_size` / `dispatch_encode` would create a cycle.

The kernel is functionally analogous to the Dict kernel: encode each
unique run-value once into a small per-value buffer, then broadcast the
value's encoded bytes across each row in its run. The per-unique-value
cost is amortized over the number of runs rather than the row count.

`walk_runs` translates the run-end array's `(prev_end, curr_end)`
windows into `(start_logical, stop_logical)` row ranges accounting for
the array's slice offset and length.

When ends.len() > len (very sparse runs, or pathological inputs) the
kernel declines so canonicalization stays the dominant path.

Includes a round-trip test in `compute/row_encode.rs` checking that
the RunEnd path matches the canonical path bit-for-bit.

Signed-off-by: Claude <noreply@anthropic.com>
---
 Cargo.lock                                 |   2 +
 encodings/runend/Cargo.toml                |   2 +
 encodings/runend/src/compute/mod.rs        |   1 +
 encodings/runend/src/compute/row_encode.rs | 214 +++++++++++++++++++++
 4 files changed, 219 insertions(+)
 create mode 100644 encodings/runend/src/compute/row_encode.rs

diff --git a/Cargo.lock b/Cargo.lock
index 86cec8f1247..5a7a18061ad 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11064,6 +11064,7 @@ dependencies = [
  "arrow-array 58.2.0",
  "arrow-schema 58.2.0",
  "codspeed-divan-compat",
+ "inventory",
  "itertools 0.14.0",
  "num-traits",
  "prost 0.14.3",
@@ -11073,6 +11074,7 @@ dependencies = [
  "vortex-buffer",
  "vortex-error",
  "vortex-mask",
+ "vortex-row",
  "vortex-session",
 ]
 
diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml
index 01a5b8d7a3e..0ec0302a4a8 100644
--- a/encodings/runend/Cargo.toml
+++ b/encodings/runend/Cargo.toml
@@ -16,6 +16,7 @@ version = { workspace = true }
 [dependencies]
 arbitrary = { workspace = true, optional = true }
 arrow-array = { workspace = true, optional = true }
+inventory = { workspace = true }
 itertools = { workspace = true }
 num-traits = { workspace = true }
 prost = { workspace = true }
@@ -23,6 +24,7 @@ vortex-array = { workspace = true }
 vortex-buffer = { workspace = true }
 vortex-error = { workspace = true }
 vortex-mask = { workspace = true }
+vortex-row = { workspace = true }
 vortex-session = { workspace = true }
 
 [lints]
diff --git a/encodings/runend/src/compute/mod.rs b/encodings/runend/src/compute/mod.rs
index 9bdb6d67c00..2b9ba9092ad 100644
--- a/encodings/runend/src/compute/mod.rs
+++ b/encodings/runend/src/compute/mod.rs
@@ -8,6 +8,7 @@ pub(crate) mod filter;
 pub(crate) mod is_constant;
 pub(crate) mod is_sorted;
 pub(crate) mod min_max;
+mod row_encode;
 pub(crate) mod take;
 pub(crate) mod take_from;
 
diff --git a/encodings/runend/src/compute/row_encode.rs b/encodings/runend/src/compute/row_encode.rs
new file mode 100644
index 00000000000..aa24fd9eea7
--- /dev/null
+++ b/encodings/runend/src/compute/row_encode.rs
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernels for `RunEndArray`.
+//!
+//! Like `Dict`, the per-row size and per-row encoded bytes are determined by the column's
+//! *values*, so we encode each run-value once and broadcast it across the indices in that
+//! run. The per-unique-value cost is amortized over the number of runs rather than the
+//! row count.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers; ends are non-negative"
+)]
+
+use num_traits::AsPrimitive;
+use vortex_array::ArrayId;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::NativePType;
+use vortex_array::match_each_integer_ptype;
+use vortex_error::VortexResult;
+use vortex_row::RowEncodeRegistration;
+use vortex_row::encode::dispatch_encode;
+use vortex_row::options::SortField;
+use vortex_row::size::dispatch_size;
+
+use crate::RunEnd;
+use crate::RunEndArrayExt;
+
+/// Function pointer registered for the size contribution of a `RunEnd` column.
+fn run_end_size_contribution(
+    column: &ArrayRef,
+    field: SortField,
+    sizes: &mut [u32],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<RunEnd>() else {
+        return Ok(None);
+    };
+    let nruns = view.ends().len();
+    if nruns > view.len() {
+        return Ok(None);
+    }
+
+    let mut value_sizes = vec![0u32; view.values().len()];
+    dispatch_size(view.values(), field, &mut value_sizes, ctx)?;
+
+    let offset = view.offset() as u64;
+    let len = view.len();
+    let ends_prim = view.ends().clone().execute::<PrimitiveArray>(ctx)?;
+
+    match_each_integer_ptype!(ends_prim.ptype(), |E| {
+        let ends = ends_prim.as_slice::<E>();
+        walk_runs::<E>(ends, offset, len, |run_idx, start, stop| {
+            let add = value_sizes[run_idx];
+            if add == 0 {
+                return;
+            }
+            for s in &mut sizes[start..stop] {
+                *s += add;
+            }
+        });
+    });
+    Ok(Some(()))
+}
+
+/// Function pointer registered for the per-row encode of a `RunEnd` column.
+fn run_end_encode_into(
+    column: &ArrayRef,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<RunEnd>() else {
+        return Ok(None);
+    };
+    let nruns = view.ends().len();
+    if nruns > view.len() {
+        return Ok(None);
+    }
+
+    let n_values = view.values().len();
+    let mut value_sizes = vec![0u32; n_values];
+    dispatch_size(view.values(), field, &mut value_sizes, ctx)?;
+
+    let mut value_offsets = vec![0u32; n_values + 1];
+    let mut total: u64 = 0;
+    for i in 0..n_values {
+        value_offsets[i] = total as u32;
+        total += u64::from(value_sizes[i]);
+    }
+    value_offsets[n_values] = total as u32;
+    let mut value_buf = vec![0u8; total as usize];
+    let zero_offsets = vec![0u32; n_values];
+    let mut inner_cursors = value_offsets[..n_values].to_vec();
+    dispatch_encode(
+        view.values(),
+        field,
+        &zero_offsets,
+        &mut inner_cursors,
+        &mut value_buf,
+        ctx,
+    )?;
+
+    let offset = view.offset() as u64;
+    let len = view.len();
+    let ends_prim = view.ends().clone().execute::<PrimitiveArray>(ctx)?;
+
+    match_each_integer_ptype!(ends_prim.ptype(), |E| {
+        let ends = ends_prim.as_slice::<E>();
+        walk_runs::<E>(ends, offset, len, |run_idx, start, stop| {
+            let v_start = value_offsets[run_idx] as usize;
+            let v_size = value_sizes[run_idx] as usize;
+            if v_size == 0 {
+                return;
+            }
+            let value_bytes = &value_buf[v_start..v_start + v_size];
+            let v_size_u32 = v_size as u32;
+            for i in start..stop {
+                let pos = (offsets[i] + cursors[i]) as usize;
+                out[pos..pos + v_size].copy_from_slice(value_bytes);
+                cursors[i] += v_size_u32;
+            }
+        });
+    });
+    Ok(Some(()))
+}
+
+/// For each run, call `f(run_idx, start_logical, stop_logical)` where the logical range is
+/// `[max(prev_end - offset, 0), min(curr_end - offset, len))`.
+#[inline]
+fn walk_runs<E>(ends: &[E], offset: u64, len: usize, mut f: impl FnMut(usize, usize, usize))
+where
+    E: NativePType + AsPrimitive<u64>,
+{
+    let mut prev: u64 = offset;
+    for (run_idx, &end) in ends.iter().enumerate() {
+        let end_u64: u64 = end.as_();
+        if end_u64 <= offset {
+            prev = end_u64;
+            continue;
+        }
+        let start = (prev.saturating_sub(offset)) as usize;
+        let stop_u64 = end_u64 - offset;
+        let stop = (stop_u64 as usize).min(len);
+        if start < stop {
+            f(run_idx, start, stop);
+        }
+        prev = end_u64;
+        if stop >= len {
+            break;
+        }
+    }
+}
+
+fn run_end_array_id() -> ArrayId {
+    use vortex_session::registry::CachedId;
+    static ID: CachedId = CachedId::new("vortex.runend");
+    *ID
+}
+
+inventory::submit! {
+    RowEncodeRegistration {
+        id: run_end_array_id,
+        size: run_end_size_contribution,
+        encode: run_end_encode_into,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::ListViewArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::listview::ListViewArrayExt;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+    use vortex_row::SortField;
+    use vortex_row::convert_columns;
+
+    use crate::RunEnd;
+
+    fn collect_rows(arr: &ListViewArray) -> Vec<Vec<u8>> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let n = arr.len();
+        (0..n)
+            .map(|i| {
+                let slice = arr.list_elements_at(i).unwrap();
+                let p = slice.execute::<PrimitiveArray>(&mut ctx).unwrap();
+                p.as_slice::<u8>().to_vec()
+            })
+            .collect()
+    }
+
+    #[test]
+    fn runend_row_encode_matches_canonical() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let raw = buffer![1i32, 1, 1, 2, 2, 3, 3, 3, 3].into_array();
+        let ree = RunEnd::encode(raw.clone(), &mut ctx)?.into_array();
+
+        let by_canonical = convert_columns(&[raw], &[SortField::default()], &mut ctx)?;
+        let by_ree = convert_columns(&[ree], &[SortField::default()], &mut ctx)?;
+
+        assert_eq!(collect_rows(&by_canonical), collect_rows(&by_ree));
+        Ok(())
+    }
+}

From f46aefd135e6c08e9b95b92542691a0a95426521 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:35:06 +0000
Subject: [PATCH 24/26] BitPacked row-encode kernel (vortex-fastlanes)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a row-encode kernel for BitPacked arrays. The kernel walks the
packed storage in 1024-element fastlanes chunks via
`BitUnpackedChunks::full_chunks`, unpacks each chunk into a stack-local
buffer, and writes the row-encoded bytes for that chunk in one pass.

Patches (when present) are applied per-chunk to the stack buffer so a
patched cell encodes its corrected value rather than the bit-packed
placeholder.

The shared `row_encode_common` module factors out the per-chunk encode
primitive (`encode_primitive_chunk`) and a small `PrimRowEncode`
trait — the same shape FoR and Delta will use in the next commit so
those kernels can share the chunk-walk machinery.

Kernel is registered via the `inventory`-based registry, since
`vortex-fastlanes` depends on `vortex-array`.

Includes a `bitpacked_i32_*` bench triplet (arrow-row baseline, vortex
with kernel, vortex through canonicalization).

Signed-off-by: Claude <noreply@anthropic.com>
---
 Cargo.lock                                    |   3 +
 encodings/fastlanes/Cargo.toml                |   2 +
 .../fastlanes/src/bitpacking/compute/mod.rs   |   1 +
 .../src/bitpacking/compute/row_encode.rs      | 403 ++++++++++++++++++
 encodings/fastlanes/src/lib.rs                |   1 +
 encodings/fastlanes/src/row_encode_common.rs  | 126 ++++++
 vortex-row/Cargo.toml                         |   1 +
 vortex-row/benches/row_encode.rs              |  55 +++
 8 files changed, 592 insertions(+)
 create mode 100644 encodings/fastlanes/src/bitpacking/compute/row_encode.rs
 create mode 100644 encodings/fastlanes/src/row_encode_common.rs

diff --git a/Cargo.lock b/Cargo.lock
index 5a7a18061ad..949a9594359 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10687,6 +10687,7 @@ version = "0.1.0"
 dependencies = [
  "codspeed-divan-compat",
  "fastlanes",
+ "inventory",
  "itertools 0.14.0",
  "lending-iterator",
  "num-traits",
@@ -10699,6 +10700,7 @@ dependencies = [
  "vortex-error",
  "vortex-fastlanes",
  "vortex-mask",
+ "vortex-row",
  "vortex-session",
 ]
 
@@ -11051,6 +11053,7 @@ dependencies = [
  "vortex-array",
  "vortex-buffer",
  "vortex-error",
+ "vortex-fastlanes",
  "vortex-mask",
  "vortex-session",
  "vortex-utils",
diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml
index a14e19389bc..3a255310ac5 100644
--- a/encodings/fastlanes/Cargo.toml
+++ b/encodings/fastlanes/Cargo.toml
@@ -18,6 +18,7 @@ workspace = true
 
 [dependencies]
 fastlanes = { workspace = true }
+inventory = { workspace = true }
 itertools = { workspace = true }
 lending-iterator = { workspace = true }
 num-traits = { workspace = true }
@@ -27,6 +28,7 @@ vortex-array = { workspace = true }
 vortex-buffer = { workspace = true }
 vortex-error = { workspace = true }
 vortex-mask = { workspace = true }
+vortex-row = { workspace = true }
 vortex-session = { workspace = true }
 
 [dev-dependencies]
diff --git a/encodings/fastlanes/src/bitpacking/compute/mod.rs b/encodings/fastlanes/src/bitpacking/compute/mod.rs
index 2501d952356..a260d0e0ffd 100644
--- a/encodings/fastlanes/src/bitpacking/compute/mod.rs
+++ b/encodings/fastlanes/src/bitpacking/compute/mod.rs
@@ -4,6 +4,7 @@
 mod cast;
 mod filter;
 pub(crate) mod is_constant;
+pub(crate) mod row_encode;
 mod slice;
 mod take;
 
diff --git a/encodings/fastlanes/src/bitpacking/compute/row_encode.rs b/encodings/fastlanes/src/bitpacking/compute/row_encode.rs
new file mode 100644
index 00000000000..412993dd166
--- /dev/null
+++ b/encodings/fastlanes/src/bitpacking/compute/row_encode.rs
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernel for `BitPackedArray`.
+//!
+//! Walks the bit-packed storage in 1024-element chunks, unpacks each chunk into a
+//! stack-local buffer, and writes the row-encoded bytes in one pass. Avoids
+//! materializing a canonical `PrimitiveArray` first.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers and bit-packed widths are small"
+)]
+#![allow(
+    unused_imports,
+    reason = "Item is consumed by the #[gat(Item)] macro expansion"
+)]
+
+use lending_iterator::gat;
+#[allow(unused_imports)]
+use lending_iterator::prelude::Item;
+#[gat(Item)]
+use lending_iterator::prelude::LendingIterator;
+use vortex_array::ArrayId;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::PType;
+use vortex_array::match_each_integer_ptype;
+use vortex_array::validity::Validity;
+use vortex_error::VortexResult;
+use vortex_row::RowEncodeRegistration;
+use vortex_row::options::SortField;
+
+use crate::BitPacked;
+use crate::BitPackedArrayExt;
+use crate::row_encode_common::PrimRowEncode;
+use crate::row_encode_common::encode_primitive_chunk;
+use crate::row_encode_common::encoded_size_for_ptype;
+use crate::unpack_iter::BitPacked as BitPackedUnpack;
+
+/// Per-row size contribution for a `BitPacked` column.
+fn bitpacked_size_contribution(
+    column: &ArrayRef,
+    _field: SortField,
+    sizes: &mut [u32],
+    _ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<BitPacked>() else {
+        return Ok(None);
+    };
+    let add = encoded_size_for_ptype(view.dtype().as_ptype());
+    for s in sizes.iter_mut().take(view.as_ref().len()) {
+        *s += add;
+    }
+    Ok(Some(()))
+}
+
+/// Per-row byte encoding for a `BitPacked` column.
+fn bitpacked_encode_into(
+    column: &ArrayRef,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<BitPacked>() else {
+        return Ok(None);
+    };
+    let ptype = view.dtype().as_ptype();
+    if !matches!(
+        ptype,
+        PType::I8
+            | PType::I16
+            | PType::I32
+            | PType::I64
+            | PType::U8
+            | PType::U16
+            | PType::U32
+            | PType::U64
+    ) {
+        return Ok(None);
+    }
+    // Materialize validity once and fast-path the common all-valid case.
+    // Use the explicit Ext method which returns a `Validity` (the inherent `validity()` on
+    // `ArrayView` returns `VortexResult<Validity>`).
+    let validity = BitPackedArrayExt::validity(&view);
+    let mask = match &validity {
+        Validity::NonNullable | Validity::AllValid => None,
+        _ => Some(validity.execute_mask(view.as_ref().len(), ctx)?),
+    };
+
+    // Materialize patches (rare; if patches are present we materialize the patch
+    // index/value slices once outside the hot loop).
+    let patches = view.patches();
+    let patch_pairs = if let Some(p) = patches {
+        let indices = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
+        let values = p.values().clone().execute::<PrimitiveArray>(ctx)?;
+        Some((indices, values, p.offset()))
+    } else {
+        None
+    };
+
+    match_each_integer_ptype!(ptype, |T| {
+        encode_bitpacked_typed::<T>(
+            view,
+            field,
+            offsets,
+            cursors,
+            out,
+            mask.as_ref(),
+            patch_pairs.as_ref(),
+        )?;
+    });
+    Ok(Some(()))
+}
+
+#[allow(clippy::too_many_arguments)]
+fn encode_bitpacked_typed<T>(
+    arr_view: vortex_array::ArrayView<'_, BitPacked>,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    mask: Option<&vortex_mask::Mask>,
+    patch_pairs: Option<&(PrimitiveArray, PrimitiveArray, usize)>,
+) -> VortexResult<()>
+where
+    T: BitPackedUnpack + NativePType + PrimRowEncode,
+{
+    let total_len = arr_view.as_ref().len();
+    let descending = field.descending;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let value_bytes = size_of::<T>();
+    let stride = (1 + value_bytes) as u32;
+
+    let mut local_idx: usize = 0;
+    let mut unpacked = arr_view.unpacked_chunks::<T>()?;
+
+    // Walk the array: initial sliced chunk, full middle chunks, trailing sliced chunk.
+    if let Some(initial) = unpacked.initial() {
+        let len_chunk = initial.len();
+        // Apply patches that fall in this chunk (logical rows local_idx..local_idx+len_chunk).
+        apply_patches_in_range::<T>(initial, patch_pairs, local_idx, local_idx + len_chunk);
+        write_chunk_rows::<T>(
+            initial,
+            local_idx,
+            offsets,
+            cursors,
+            out,
+            mask,
+            non_null,
+            null,
+            descending,
+            value_bytes,
+            stride,
+        );
+        local_idx += len_chunk;
+    }
+
+    let mut chunks_iter = unpacked.full_chunks();
+    while let Some(chunk) = chunks_iter.next() {
+        // Determine logical length: full chunk is 1024.
+        let len_chunk = 1024.min(total_len - local_idx);
+        // Apply patches that fall in this chunk.
+        apply_patches_in_range::<T>(
+            &mut chunk[..len_chunk],
+            patch_pairs,
+            local_idx,
+            local_idx + len_chunk,
+        );
+        write_chunk_rows::<T>(
+            &chunk[..len_chunk],
+            local_idx,
+            offsets,
+            cursors,
+            out,
+            mask,
+            non_null,
+            null,
+            descending,
+            value_bytes,
+            stride,
+        );
+        local_idx += len_chunk;
+    }
+
+    if let Some(trailer) = unpacked.trailer() {
+        let len_chunk = trailer.len();
+        apply_patches_in_range::<T>(trailer, patch_pairs, local_idx, local_idx + len_chunk);
+        write_chunk_rows::<T>(
+            trailer,
+            local_idx,
+            offsets,
+            cursors,
+            out,
+            mask,
+            non_null,
+            null,
+            descending,
+            value_bytes,
+            stride,
+        );
+        local_idx += len_chunk;
+    }
+
+    debug_assert_eq!(local_idx, total_len);
+    Ok(())
+}
+
+/// Overwrite values in `chunk` (which covers logical rows `[chunk_start, chunk_end)`) with
+/// any patch values that fall in that range.
+fn apply_patches_in_range<T: NativePType>(
+    chunk: &mut [T],
+    patch_pairs: Option<&(PrimitiveArray, PrimitiveArray, usize)>,
+    chunk_start: usize,
+    chunk_end: usize,
+) {
+    let Some((indices_p, values_p, patch_offset)) = patch_pairs else {
+        return;
+    };
+    let values: &[T] = values_p.as_slice();
+    // Indices may be u32 or u64. We search for the first index >= chunk_start + patch_offset.
+    // For simplicity, scan linearly per chunk; patches are rare.
+    let logical_start = chunk_start + *patch_offset;
+    let logical_end = chunk_end + *patch_offset;
+    let indices_ptype = indices_p.ptype();
+    match indices_ptype {
+        PType::U32 => {
+            let idx: &[u32] = indices_p.as_slice();
+            for (i, &raw_idx) in idx.iter().enumerate() {
+                let raw_idx = raw_idx as usize;
+                if raw_idx < logical_start {
+                    continue;
+                }
+                if raw_idx >= logical_end {
+                    break;
+                }
+                let local = raw_idx - logical_start;
+                chunk[local] = values[i];
+            }
+        }
+        PType::U64 => {
+            let idx: &[u64] = indices_p.as_slice();
+            for (i, &raw_idx) in idx.iter().enumerate() {
+                let raw_idx = raw_idx as usize;
+                if raw_idx < logical_start {
+                    continue;
+                }
+                if raw_idx >= logical_end {
+                    break;
+                }
+                let local = raw_idx - logical_start;
+                chunk[local] = values[i];
+            }
+        }
+        PType::U16 => {
+            let idx: &[u16] = indices_p.as_slice();
+            for (i, &raw_idx) in idx.iter().enumerate() {
+                let raw_idx = raw_idx as usize;
+                if raw_idx < logical_start {
+                    continue;
+                }
+                if raw_idx >= logical_end {
+                    break;
+                }
+                let local = raw_idx - logical_start;
+                chunk[local] = values[i];
+            }
+        }
+        PType::U8 => {
+            let idx: &[u8] = indices_p.as_slice();
+            for (i, &raw_idx) in idx.iter().enumerate() {
+                let raw_idx = raw_idx as usize;
+                if raw_idx < logical_start {
+                    continue;
+                }
+                if raw_idx >= logical_end {
+                    break;
+                }
+                let local = raw_idx - logical_start;
+                chunk[local] = values[i];
+            }
+        }
+        _ => {}
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+fn write_chunk_rows<T: NativePType + PrimRowEncode>(
+    chunk: &[T],
+    row_start: usize,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    mask: Option<&vortex_mask::Mask>,
+    non_null: u8,
+    null: u8,
+    descending: bool,
+    value_bytes: usize,
+    stride: u32,
+) {
+    encode_primitive_chunk::<T>(
+        chunk,
+        row_start,
+        offsets,
+        cursors,
+        out,
+        mask,
+        non_null,
+        null,
+        descending,
+        value_bytes,
+        stride,
+    );
+}
+
+fn bitpacked_array_id() -> ArrayId {
+    use vortex_session::registry::CachedId;
+    static ID: CachedId = CachedId::new("fastlanes.bitpacked");
+    *ID
+}
+
+inventory::submit! {
+    RowEncodeRegistration {
+        id: bitpacked_array_id,
+        size: bitpacked_size_contribution,
+        encode: bitpacked_encode_into,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::ListViewArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::listview::ListViewArrayExt;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+    use vortex_row::SortField;
+    use vortex_row::convert_columns;
+
+    use crate::BitPackedArrayExt;
+    use crate::BitPackedData;
+
+    fn collect_rows(arr: &ListViewArray) -> Vec<Vec<u8>> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let n = arr.len();
+        (0..n)
+            .map(|i| {
+                let slice = arr.list_elements_at(i).unwrap();
+                let p = slice.execute::<PrimitiveArray>(&mut ctx).unwrap();
+                p.as_slice::<u8>().to_vec()
+            })
+            .collect()
+    }
+
+    #[test]
+    fn bitpacked_row_encode_matches_canonical() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let raw = buffer![1u32, 2, 3, 4, 5, 6, 7, 8, 9].into_array();
+        let bp = BitPackedData::encode(&raw, 4, &mut ctx)?.into_array();
+
+        let by_canonical = convert_columns(&[raw], &[SortField::default()], &mut ctx)?;
+        let by_bp = convert_columns(&[bp], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_canonical), collect_rows(&by_bp));
+        Ok(())
+    }
+
+    #[test]
+    fn bitpacked_row_encode_with_patches() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let values: Vec<u32> = (0..200)
+            .map(|i| if i % 30 == 0 { 5000 + i } else { i % 16 })
+            .collect();
+        let raw = PrimitiveArray::from_iter(values).into_array();
+        let bp = BitPackedData::encode(&raw, 4, &mut ctx)?.into_array();
+        assert!(bp.as_opt::<crate::BitPacked>().unwrap().patches().is_some());
+        let by_canonical = convert_columns(&[raw], &[SortField::default()], &mut ctx)?;
+        let by_bp = convert_columns(&[bp], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_canonical), collect_rows(&by_bp));
+        Ok(())
+    }
+
+    #[test]
+    fn bitpacked_row_encode_multi_chunk() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let values: Vec<u32> = (0..3000).map(|i| i % 64).collect();
+        let raw = PrimitiveArray::from_iter(values).into_array();
+        let bp = BitPackedData::encode(&raw, 6, &mut ctx)?.into_array();
+        let by_canonical = convert_columns(&[raw], &[SortField::default()], &mut ctx)?;
+        let by_bp = convert_columns(&[bp], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_canonical), collect_rows(&by_bp));
+        Ok(())
+    }
+}
diff --git a/encodings/fastlanes/src/lib.rs b/encodings/fastlanes/src/lib.rs
index 9022b7c4e2b..613a4784356 100644
--- a/encodings/fastlanes/src/lib.rs
+++ b/encodings/fastlanes/src/lib.rs
@@ -20,6 +20,7 @@ mod bitpacking;
 mod delta;
 mod r#for;
 mod rle;
+mod row_encode_common;
 
 pub(crate) const FL_CHUNK_SIZE: usize = 1024;
 
diff --git a/encodings/fastlanes/src/row_encode_common.rs b/encodings/fastlanes/src/row_encode_common.rs
new file mode 100644
index 00000000000..7f44ebc7463
--- /dev/null
+++ b/encodings/fastlanes/src/row_encode_common.rs
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Shared helpers for the FastLanes row-encode kernels (BitPacked, FoR, Delta).
+//!
+//! Each kernel walks the compressed storage in 1024-element chunks, unpacks each chunk into
+//! a stack-local buffer, and writes the row-encoded bytes in one pass. This module defines
+//! the per-row write primitive used after a chunk has been unpacked.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    reason = "row encoding indexes into u32-sized buffers"
+)]
+
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::PType;
+
+/// Trait implemented by primitive types that can be written into a row-encoded byte slot.
+///
+/// Mirrors `vortex_row::codec::RowEncode` for the integer types that show up as the output
+/// of BitPacked/FoR/Delta.
+pub trait PrimRowEncode: Copy {
+    /// Encode this value into `out`, inverting the bytes for descending order.
+    fn row_encode_to(self, out: &mut [u8], descending: bool);
+}
+
+macro_rules! impl_unsigned {
+    ($t:ty) => {
+        impl PrimRowEncode for $t {
+            #[inline]
+            fn row_encode_to(self, out: &mut [u8], descending: bool) {
+                let bytes = self.to_be_bytes();
+                if descending {
+                    for (i, b) in bytes.iter().enumerate() {
+                        out[i] = b ^ 0xFF;
+                    }
+                } else {
+                    out.copy_from_slice(&bytes);
+                }
+            }
+        }
+    };
+}
+
+macro_rules! impl_signed {
+    ($t:ty) => {
+        impl PrimRowEncode for $t {
+            #[inline]
+            fn row_encode_to(self, out: &mut [u8], descending: bool) {
+                let mut bytes = self.to_be_bytes();
+                bytes[0] ^= 0x80;
+                if descending {
+                    for (i, b) in bytes.iter().enumerate() {
+                        out[i] = b ^ 0xFF;
+                    }
+                } else {
+                    out.copy_from_slice(&bytes);
+                }
+            }
+        }
+    };
+}
+
+impl_unsigned!(u8);
+impl_unsigned!(u16);
+impl_unsigned!(u32);
+impl_unsigned!(u64);
+impl_signed!(i8);
+impl_signed!(i16);
+impl_signed!(i32);
+impl_signed!(i64);
+
+/// Encoded row width (sentinel + value bytes) for the given primitive type.
+#[inline]
+pub fn encoded_size_for_ptype(ptype: PType) -> u32 {
+    1 + (ptype.byte_width() as u32)
+}
+
+/// Write a contiguous slice of unpacked values (one chunk) into the row-encoded output buffer.
+///
+/// `chunk[j]` is the value for logical row `row_start + j`. The output position for row `i`
+/// is `offsets[i] + cursors[i]`; the cursor is advanced by `stride` after each row write.
+#[allow(clippy::too_many_arguments)]
+#[inline]
+pub fn encode_primitive_chunk<T: NativePType + PrimRowEncode>(
+    chunk: &[T],
+    row_start: usize,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    mask: Option<&vortex_mask::Mask>,
+    non_null: u8,
+    null: u8,
+    descending: bool,
+    value_bytes: usize,
+    stride: u32,
+) {
+    match mask {
+        None => {
+            for (j, &v) in chunk.iter().enumerate() {
+                let row = row_start + j;
+                let pos = (offsets[row] + cursors[row]) as usize;
+                out[pos] = non_null;
+                v.row_encode_to(&mut out[pos + 1..pos + 1 + value_bytes], descending);
+                cursors[row] += stride;
+            }
+        }
+        Some(m) => {
+            for (j, &v) in chunk.iter().enumerate() {
+                let row = row_start + j;
+                let pos = (offsets[row] + cursors[row]) as usize;
+                if m.value(row) {
+                    out[pos] = non_null;
+                    v.row_encode_to(&mut out[pos + 1..pos + 1 + value_bytes], descending);
+                } else {
+                    out[pos] = null;
+                    for b in &mut out[pos + 1..pos + 1 + value_bytes] {
+                        *b = 0;
+                    }
+                }
+                cursors[row] += stride;
+            }
+        }
+    }
+}
diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml
index 947b1df47da..6ccfa258bc9 100644
--- a/vortex-row/Cargo.toml
+++ b/vortex-row/Cargo.toml
@@ -36,6 +36,7 @@ mimalloc = { workspace = true }
 rand = { workspace = true }
 rstest = { workspace = true }
 vortex-array = { workspace = true, features = ["_test-harness"] }
+vortex-fastlanes = { workspace = true }
 
 [[bench]]
 name = "row_encode"
diff --git a/vortex-row/benches/row_encode.rs b/vortex-row/benches/row_encode.rs
index aa4d5f2f675..769404c82c3 100644
--- a/vortex-row/benches/row_encode.rs
+++ b/vortex-row/benches/row_encode.rs
@@ -45,6 +45,7 @@ use vortex_array::arrays::StructArray;
 use vortex_array::arrays::VarBinViewArray;
 use vortex_array::builders::dict::dict_encode;
 use vortex_array::patches::Patches;
+use vortex_fastlanes::BitPackedData;
 use vortex_row::SortField;
 use vortex_row::convert_columns;
 
@@ -374,3 +375,57 @@ fn patched_i32_without_kernel(bencher: divan::Bencher) {
         convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
     })
 }
+
+// ---------- bitpacked_i32 ----------
+
+fn gen_bitpacked_i32_values(n: usize, seed: u64) -> Vec<i32> {
+    // Small positive integers in the 0..255 range so they bit-pack to 8 bits without patches.
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..n).map(|_| rng.random_range(0i32..256)).collect()
+}
+
+#[divan::bench]
+fn bitpacked_i32_arrow_row(bencher: divan::Bencher) {
+    let v = gen_bitpacked_i32_values(N, 100);
+    let arr = Arc::new(Int32Array::from(v.clone())) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int32)]).unwrap();
+    let bytes = (N * (1 + 4)) as u64;
+    bencher
+        .counter(BytesCount::new(bytes))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn bitpacked_i32_with_kernel(bencher: divan::Bencher) {
+    let v = gen_bitpacked_i32_values(N, 100);
+    let raw = PrimitiveArray::from_iter(v.clone()).into_array();
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let bp = BitPackedData::encode(&raw, 8, &mut setup_ctx)
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 4)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[bp.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn bitpacked_i32_without_kernel(bencher: divan::Bencher) {
+    let v = gen_bitpacked_i32_values(N, 100);
+    let raw = PrimitiveArray::from_iter(v.clone()).into_array();
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let bp = BitPackedData::encode(&raw, 8, &mut setup_ctx)
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 4)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = bp
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}

From 0430217eba49c68d8fdc4b5fa542c10e7fcd0521 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 17 May 2026 22:38:24 +0000
Subject: [PATCH 25/26] FoR and Delta row-encode kernels (vortex-fastlanes)

Two more chunk-walking kernels alongside the BitPacked one. Both
register via the inventory-based registry.

FoR (Frame of Reference):
- Common fused path: FoR around a BitPacked storage with an unsigned
  reference. Walks the bit-packed chunks via `FoR::unchecked_unfor_pack`
  into a stack buffer with the base wrapping-added inline, then encodes
  rows from that buffer.
- Slow path: FoR around a Primitive storage. Walks the canonical buffer
  once with a per-row wrapping_add and the row encode.

Delta:
- Use the existing chunked `decompress_primitive` to write into a
  primitive buffer, then encode rows from that buffer. Skips the
  PrimitiveArray wrapping + validity attach.

Adds `for_i64_*` and `delta_i64_*` bench triplets.

Signed-off-by: Claude <noreply@anthropic.com>
---
 encodings/fastlanes/src/delta/compute/mod.rs  |   1 +
 .../fastlanes/src/delta/compute/row_encode.rs | 247 +++++++++++
 encodings/fastlanes/src/for/compute/mod.rs    |   1 +
 .../fastlanes/src/for/compute/row_encode.rs   | 394 ++++++++++++++++++
 vortex-row/benches/row_encode.rs              | 137 ++++++
 5 files changed, 780 insertions(+)
 create mode 100644 encodings/fastlanes/src/delta/compute/row_encode.rs
 create mode 100644 encodings/fastlanes/src/for/compute/row_encode.rs

diff --git a/encodings/fastlanes/src/delta/compute/mod.rs b/encodings/fastlanes/src/delta/compute/mod.rs
index fa79c62a596..61fb9ff15e7 100644
--- a/encodings/fastlanes/src/delta/compute/mod.rs
+++ b/encodings/fastlanes/src/delta/compute/mod.rs
@@ -2,3 +2,4 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 mod cast;
+pub(crate) mod row_encode;
diff --git a/encodings/fastlanes/src/delta/compute/row_encode.rs b/encodings/fastlanes/src/delta/compute/row_encode.rs
new file mode 100644
index 00000000000..706bb7247b0
--- /dev/null
+++ b/encodings/fastlanes/src/delta/compute/row_encode.rs
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernel for `DeltaArray`.
+//!
+//! Decompresses the deltas directly into a temporary primitive buffer (avoiding the
+//! `PrimitiveArray` wrapper allocation and validity attachment) and then walks the buffer
+//! once to produce row-encoded bytes.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers"
+)]
+
+use fastlanes::Delta as DeltaTrait;
+use fastlanes::FastLanes;
+use fastlanes::Transpose;
+use vortex_array::ArrayId;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::primitive::PrimitiveArrayExt;
+use vortex_array::dtype::NativePType;
+use vortex_array::match_each_unsigned_integer_ptype;
+use vortex_array::validity::Validity;
+use vortex_error::VortexResult;
+use vortex_row::RowEncodeRegistration;
+use vortex_row::options::SortField;
+
+use crate::Delta;
+use crate::bit_transpose::untranspose_validity;
+use crate::delta::array::DeltaArrayExt;
+use crate::delta::array::delta_decompress::decompress_primitive;
+use crate::row_encode_common::PrimRowEncode;
+use crate::row_encode_common::encode_primitive_chunk;
+use crate::row_encode_common::encoded_size_for_ptype;
+
+/// Per-row size contribution for a `Delta` column.
+fn delta_size_contribution(
+    column: &ArrayRef,
+    _field: SortField,
+    sizes: &mut [u32],
+    _ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<Delta>() else {
+        return Ok(None);
+    };
+    let add = encoded_size_for_ptype(view.as_ref().dtype().as_ptype());
+    for s in sizes.iter_mut().take(view.as_ref().len()) {
+        *s += add;
+    }
+    Ok(Some(()))
+}
+
+/// Per-row byte encoding for a `Delta` column.
+fn delta_encode_into(
+    column: &ArrayRef,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<Delta>() else {
+        return Ok(None);
+    };
+
+    // Materialize bases and deltas (these are already primitive arrays).
+    let bases = view.bases().clone().execute::<PrimitiveArray>(ctx)?;
+    let deltas = view.deltas().clone().execute::<PrimitiveArray>(ctx)?;
+    let start = view.offset();
+    let total_len = view.as_ref().len();
+    let end = start + total_len;
+
+    // Following delta_decompress: validity is transposed on the deltas, untranspose it.
+    let validity = untranspose_validity(&deltas.validity()?, ctx)?;
+    let validity = validity.slice(start..end)?;
+
+    let descending = field.descending;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let original_ptype = deltas.ptype();
+    let value_bytes = original_ptype.byte_width();
+    let stride = (1 + value_bytes) as u32;
+
+    let mask = match &validity {
+        Validity::NonNullable | Validity::AllValid => None,
+        _ => Some(validity.execute_mask(total_len, ctx)?),
+    };
+
+    // Operate on the unsigned reinterpretation (matches `delta_decompress`).
+    let bases_u = bases.reinterpret_cast(original_ptype.to_unsigned());
+    let deltas_u = deltas.reinterpret_cast(original_ptype.to_unsigned());
+    let is_signed = original_ptype.is_signed_int();
+
+    match_each_unsigned_integer_ptype!(deltas_u.ptype(), |T| {
+        const LANES: usize = T::LANES;
+        let buffer = decompress_primitive::<T, LANES>(bases_u.as_slice(), deltas_u.as_slice());
+        let slice = &buffer.as_slice()[start..end];
+        if is_signed {
+            // Reinterpret each unsigned element as its signed counterpart for encoding.
+            // SAFETY: `T` and its signed counterpart have the same size and alignment.
+            let signed: &[<T as ToSigned>::Signed] = unsafe {
+                std::slice::from_raw_parts(
+                    slice.as_ptr().cast::<<T as ToSigned>::Signed>(),
+                    slice.len(),
+                )
+            };
+            encode_primitive_chunk::<<T as ToSigned>::Signed>(
+                signed,
+                0,
+                offsets,
+                cursors,
+                out,
+                mask.as_ref(),
+                non_null,
+                null,
+                descending,
+                value_bytes,
+                stride,
+            );
+        } else {
+            encode_primitive_chunk::<T>(
+                slice,
+                0,
+                offsets,
+                cursors,
+                out,
+                mask.as_ref(),
+                non_null,
+                null,
+                descending,
+                value_bytes,
+                stride,
+            );
+        }
+    });
+
+    Ok(Some(()))
+}
+
+/// Helper trait mapping unsigned types to their signed counterparts so we can encode signed
+/// values without losing the sign-bit-flip semantics of `PrimRowEncode`.
+trait ToSigned: Copy {
+    type Signed: Copy + NativePType + PrimRowEncode;
+}
+impl ToSigned for u8 {
+    type Signed = i8;
+}
+impl ToSigned for u16 {
+    type Signed = i16;
+}
+impl ToSigned for u32 {
+    type Signed = i32;
+}
+impl ToSigned for u64 {
+    type Signed = i64;
+}
+
+fn delta_array_id() -> ArrayId {
+    use vortex_session::registry::CachedId;
+    static ID: CachedId = CachedId::new("fastlanes.delta");
+    *ID
+}
+
+inventory::submit! {
+    RowEncodeRegistration {
+        id: delta_array_id,
+        size: delta_size_contribution,
+        encode: delta_encode_into,
+    }
+}
+
+// Silence the warning about `Transpose` / `FastLanes` being unused: they are referenced via
+// the trait bound chain on `decompress_primitive::<T, LANES>`.
+#[allow(dead_code)]
+const fn _trait_dep<T: DeltaTrait + Transpose + FastLanes>() {}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::ListViewArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::listview::ListViewArrayExt;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+    use vortex_row::SortField;
+    use vortex_row::convert_columns;
+
+    use crate::Delta;
+
+    fn collect_rows(arr: &ListViewArray) -> Vec<Vec<u8>> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let n = arr.len();
+        (0..n)
+            .map(|i| {
+                let slice = arr.list_elements_at(i).unwrap();
+                let p = slice.execute::<PrimitiveArray>(&mut ctx).unwrap();
+                p.as_slice::<u8>().to_vec()
+            })
+            .collect()
+    }
+
+    #[test]
+    fn delta_row_encode_matches_canonical_u64() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let raw = buffer![1u64, 2, 3, 5, 10, 11, 20].into_array();
+        let p = PrimitiveArray::from_iter([1u64, 2, 3, 5, 10, 11, 20]);
+        let delta = Delta::try_from_primitive_array(&p, &mut ctx)?.into_array();
+
+        let by_raw = convert_columns(&[raw], &[SortField::default()], &mut ctx)?;
+        let by_delta = convert_columns(&[delta], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_raw), collect_rows(&by_delta));
+        Ok(())
+    }
+
+    #[test]
+    fn delta_row_encode_matches_canonical_i64() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let raw = buffer![-3i64, -2, -1, 0, 1, 2].into_array();
+        let p = PrimitiveArray::from_iter([-3i64, -2, -1, 0, 1, 2]);
+        let delta = Delta::try_from_primitive_array(&p, &mut ctx)?.into_array();
+
+        let by_raw = convert_columns(&[raw], &[SortField::default()], &mut ctx)?;
+        let by_delta = convert_columns(&[delta], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_raw), collect_rows(&by_delta));
+        Ok(())
+    }
+
+    #[test]
+    fn delta_row_encode_multi_chunk_i64() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let values: Vec<i64> = (0..3000).map(|i| 1000 + i as i64 * 3).collect();
+        let raw = PrimitiveArray::from_iter(values.clone()).into_array();
+        let p = PrimitiveArray::from_iter(values);
+        let delta = Delta::try_from_primitive_array(&p, &mut ctx)?.into_array();
+
+        let by_raw = convert_columns(&[raw], &[SortField::default()], &mut ctx)?;
+        let by_delta = convert_columns(&[delta], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_raw), collect_rows(&by_delta));
+        Ok(())
+    }
+}
diff --git a/encodings/fastlanes/src/for/compute/mod.rs b/encodings/fastlanes/src/for/compute/mod.rs
index 7307b50d776..aec27e74000 100644
--- a/encodings/fastlanes/src/for/compute/mod.rs
+++ b/encodings/fastlanes/src/for/compute/mod.rs
@@ -5,6 +5,7 @@ mod cast;
 mod compare;
 pub(crate) mod is_constant;
 pub(crate) mod is_sorted;
+pub(crate) mod row_encode;
 
 use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
diff --git a/encodings/fastlanes/src/for/compute/row_encode.rs b/encodings/fastlanes/src/for/compute/row_encode.rs
new file mode 100644
index 00000000000..6dd8e580e62
--- /dev/null
+++ b/encodings/fastlanes/src/for/compute/row_encode.rs
@@ -0,0 +1,394 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Row-encode kernel for `FoRArray`.
+//!
+//! For the common fused path (`encoded` is a `BitPacked` with unsigned encoded values), this
+//! walks the bit-packed storage in 1024-element chunks, applies the FoR base inline via a
+//! custom `UnpackStrategy`, and writes the row-encoded bytes in one pass. Other shapes fall
+//! through to the canonicalize path.
+
+#![allow(
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    clippy::cast_sign_loss,
+    reason = "row encoding indexes into u32-sized buffers"
+)]
+use std::mem::MaybeUninit;
+
+use fastlanes::FoR as FoRTrait;
+use num_traits::WrappingAdd;
+use vortex_array::ArrayId;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::Primitive;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::NativePType;
+use vortex_array::dtype::PType;
+use vortex_array::dtype::PhysicalPType;
+use vortex_array::dtype::UnsignedPType;
+use vortex_array::match_each_integer_ptype;
+use vortex_array::match_each_unsigned_integer_ptype;
+use vortex_array::validity::Validity;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_row::RowEncodeRegistration;
+use vortex_row::options::SortField;
+
+use crate::BitPacked;
+use crate::BitPackedArrayExt;
+use crate::FoR;
+use crate::r#for::array::FoRArrayExt;
+use crate::row_encode_common::PrimRowEncode;
+use crate::row_encode_common::encode_primitive_chunk;
+use crate::row_encode_common::encoded_size_for_ptype;
+use crate::unpack_iter::BitPacked as BitPackedUnpack;
+
+/// Per-row size contribution for a `FoR` column.
+fn for_size_contribution(
+    column: &ArrayRef,
+    _field: SortField,
+    sizes: &mut [u32],
+    _ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<FoR>() else {
+        return Ok(None);
+    };
+    let add = encoded_size_for_ptype(view.as_ref().dtype().as_ptype());
+    for s in sizes.iter_mut().take(view.as_ref().len()) {
+        *s += add;
+    }
+    Ok(Some(()))
+}
+
+/// Per-row byte encoding for a `FoR` column.
+fn for_encode_into(
+    column: &ArrayRef,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<Option<()>> {
+    let Some(view) = column.as_opt::<FoR>() else {
+        return Ok(None);
+    };
+    let ptype = view.as_ref().dtype().as_ptype();
+
+    // Fast path: encoded is a BitPacked with unsigned encoded values (the case used by the
+    // standard compressor). We do fused unpack + base-add + row-write in one pass.
+    if view.reference_scalar().dtype().is_unsigned_int()
+        && let Some(bp) = view.encoded().as_opt::<BitPacked>()
+    {
+        match_each_unsigned_integer_ptype!(ptype, |T| {
+            encode_for_bitpacked::<T>(view, bp, field, offsets, cursors, out, ctx)?;
+        });
+        return Ok(Some(()));
+    }
+
+    // Slower path: encoded is already a primitive array (or a non-BitPacked encoded). Walk
+    // the canonical primitive buffer directly and add the base.
+    if view.encoded().as_opt::<Primitive>().is_some() {
+        match_each_integer_ptype!(ptype, |T| {
+            encode_for_primitive::<T>(view, field, offsets, cursors, out, ctx)?;
+        });
+        return Ok(Some(()));
+    }
+
+    // Decline; the default canonicalization path will handle it.
+    Ok(None)
+}
+
+#[allow(clippy::too_many_arguments)]
+fn encode_for_bitpacked<T>(
+    for_view: vortex_array::ArrayView<'_, FoR>,
+    bp_view: vortex_array::ArrayView<'_, BitPacked>,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()>
+where
+    T: BitPackedUnpack
+        + PhysicalPType<Physical = T>
+        + UnsignedPType
+        + FoRTrait
+        + WrappingAdd
+        + NativePType
+        + PrimRowEncode,
+{
+    let ref_value: T = for_view
+        .reference_scalar()
+        .as_primitive()
+        .as_::<T>()
+        .vortex_expect("FoR reference cannot be null");
+
+    let total_len = bp_view.as_ref().len();
+    let descending = field.descending;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let value_bytes = size_of::<T>();
+    let stride = (1 + value_bytes) as u32;
+
+    // Materialize validity once.
+    let validity = BitPackedArrayExt::validity(&bp_view);
+    let mask = match &validity {
+        Validity::NonNullable | Validity::AllValid => None,
+        _ => Some(validity.execute_mask(total_len, ctx)?),
+    };
+
+    // Pre-canonicalize patches: rare. They have already been wrapping-added with the base by
+    // the FoR fused path; here we mirror that contract by adding the base after looking up
+    // the patch value.
+    let patches = bp_view.patches();
+    let patch_pairs = if let Some(p) = patches {
+        let indices = p.indices().clone().execute::<PrimitiveArray>(ctx)?;
+        let values = p.values().clone().execute::<PrimitiveArray>(ctx)?;
+        Some((indices, values, p.offset()))
+    } else {
+        None
+    };
+
+    // Walk chunks directly: 1024 elements per chunk, FoR base added inline.
+    let bit_width = bp_view.bit_width() as usize;
+    let offset = bp_view.offset() as usize;
+    let packed_bytes = bp_view.packed().as_host();
+    // SAFETY: packed bytes are aligned as `T` per FastLanes layout invariants.
+    let packed_slice: &[T] = unsafe {
+        std::slice::from_raw_parts(
+            packed_bytes.as_ptr().cast::<T>(),
+            packed_bytes.len() / size_of::<T>(),
+        )
+    };
+    let elems_per_chunk = 128 * bit_width / size_of::<T>();
+    let num_chunks = (offset + total_len).div_ceil(1024);
+
+    let mut buf: [MaybeUninit<T>; 1024] = [const { MaybeUninit::<T>::uninit() }; 1024];
+    let mut local_idx: usize = 0;
+    for chunk_idx in 0..num_chunks {
+        // SAFETY: `chunk` covers `elems_per_chunk` packed elements; `buf` is exactly 1024 entries.
+        unsafe {
+            let chunk = &packed_slice[chunk_idx * elems_per_chunk..][..elems_per_chunk];
+            FoRTrait::unchecked_unfor_pack(
+                bit_width,
+                chunk,
+                ref_value,
+                std::mem::transmute::<&mut [MaybeUninit<T>; 1024], &mut [T; 1024]>(&mut buf),
+            );
+        }
+        // SAFETY: just initialized 1024 elements.
+        let unpacked: &mut [T; 1024] =
+            unsafe { std::mem::transmute::<&mut [MaybeUninit<T>; 1024], &mut [T; 1024]>(&mut buf) };
+
+        // Determine the logical range within this chunk.
+        let chunk_offset = if chunk_idx == 0 { offset } else { 0 };
+        let chunk_logical_start = chunk_idx * 1024;
+        let chunk_logical_end = ((chunk_idx + 1) * 1024).min(offset + total_len);
+        let usable = &mut unpacked[chunk_offset..(chunk_logical_end - chunk_idx * 1024)];
+
+        // Apply patches that fall in this chunk.
+        apply_patches_in_range_for::<T>(
+            usable,
+            patch_pairs.as_ref(),
+            local_idx,
+            local_idx + usable.len(),
+            ref_value,
+        );
+
+        encode_primitive_chunk::<T>(
+            usable,
+            local_idx,
+            offsets,
+            cursors,
+            out,
+            mask.as_ref(),
+            non_null,
+            null,
+            descending,
+            value_bytes,
+            stride,
+        );
+        local_idx += usable.len();
+        let _ = chunk_logical_start;
+    }
+
+    debug_assert_eq!(local_idx, total_len);
+    Ok(())
+}
+
+fn apply_patches_in_range_for<T>(
+    chunk: &mut [T],
+    patch_pairs: Option<&(PrimitiveArray, PrimitiveArray, usize)>,
+    chunk_start: usize,
+    chunk_end: usize,
+    ref_value: T,
+) where
+    T: NativePType + WrappingAdd,
+{
+    let Some((indices_p, values_p, patch_offset)) = patch_pairs else {
+        return;
+    };
+    let values: &[T] = values_p.as_slice();
+    let logical_start = chunk_start + *patch_offset;
+    let logical_end = chunk_end + *patch_offset;
+    macro_rules! walk {
+        ($idx_ty:ty) => {{
+            let idx: &[$idx_ty] = indices_p.as_slice();
+            for (i, &raw_idx) in idx.iter().enumerate() {
+                let raw_idx = raw_idx as usize;
+                if raw_idx < logical_start {
+                    continue;
+                }
+                if raw_idx >= logical_end {
+                    break;
+                }
+                let local = raw_idx - logical_start;
+                chunk[local] = values[i].wrapping_add(&ref_value);
+            }
+        }};
+    }
+    match indices_p.ptype() {
+        PType::U64 => walk!(u64),
+        PType::U32 => walk!(u32),
+        PType::U16 => walk!(u16),
+        PType::U8 => walk!(u8),
+        _ => {}
+    }
+}
+
+fn encode_for_primitive<T>(
+    for_view: vortex_array::ArrayView<'_, FoR>,
+    field: SortField,
+    offsets: &[u32],
+    cursors: &mut [u32],
+    out: &mut [u8],
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<()>
+where
+    T: NativePType + PrimRowEncode + WrappingAdd,
+{
+    let encoded = for_view.encoded().clone().execute::<PrimitiveArray>(ctx)?;
+    let total_len = encoded.len();
+    let descending = field.descending;
+    let non_null = field.non_null_sentinel();
+    let null = field.null_sentinel();
+    let value_bytes = size_of::<T>();
+    let stride = (1 + value_bytes) as u32;
+    let ref_value: T = for_view
+        .reference_scalar()
+        .as_primitive()
+        .as_::<T>()
+        .vortex_expect("FoR reference cannot be null");
+
+    let validity = encoded.validity()?;
+    let mask = match &validity {
+        Validity::NonNullable | Validity::AllValid => None,
+        _ => Some(validity.execute_mask(total_len, ctx)?),
+    };
+
+    let slice: &[T] = encoded.as_slice();
+    match mask {
+        None => {
+            for (i, &v) in slice.iter().enumerate() {
+                let val = v.wrapping_add(&ref_value);
+                let pos = (offsets[i] + cursors[i]) as usize;
+                out[pos] = non_null;
+                val.row_encode_to(&mut out[pos + 1..pos + 1 + value_bytes], descending);
+                cursors[i] += stride;
+            }
+        }
+        Some(m) => {
+            for (i, &v) in slice.iter().enumerate() {
+                let pos = (offsets[i] + cursors[i]) as usize;
+                if m.value(i) {
+                    let val = v.wrapping_add(&ref_value);
+                    out[pos] = non_null;
+                    val.row_encode_to(&mut out[pos + 1..pos + 1 + value_bytes], descending);
+                } else {
+                    out[pos] = null;
+                    for b in &mut out[pos + 1..pos + 1 + value_bytes] {
+                        *b = 0;
+                    }
+                }
+                cursors[i] += stride;
+            }
+        }
+    }
+    Ok(())
+}
+
+fn for_array_id() -> ArrayId {
+    use vortex_session::registry::CachedId;
+    static ID: CachedId = CachedId::new("fastlanes.for");
+    *ID
+}
+
+inventory::submit! {
+    RowEncodeRegistration {
+        id: for_array_id,
+        size: for_size_contribution,
+        encode: for_encode_into,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::ListViewArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::arrays::listview::ListViewArrayExt;
+    use vortex_array::scalar::Scalar;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+    use vortex_row::SortField;
+    use vortex_row::convert_columns;
+
+    use crate::BitPackedData;
+    use crate::FoR;
+
+    fn collect_rows(arr: &ListViewArray) -> Vec<Vec<u8>> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let n = arr.len();
+        (0..n)
+            .map(|i| {
+                let slice = arr.list_elements_at(i).unwrap();
+                let p = slice.execute::<PrimitiveArray>(&mut ctx).unwrap();
+                p.as_slice::<u8>().to_vec()
+            })
+            .collect()
+    }
+
+    #[test]
+    fn for_row_encode_matches_canonical_primitive_encoded() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        // FoR with a plain primitive `encoded` array (no BitPacked underneath).
+        let encoded = buffer![5i64, 6, 7, 8, 9].into_array();
+        let arr = FoR::try_new(encoded, Scalar::from(100i64))?.into_array();
+        let raw = buffer![105i64, 106, 107, 108, 109].into_array();
+
+        let by_raw = convert_columns(&[raw], &[SortField::default()], &mut ctx)?;
+        let by_for = convert_columns(&[arr], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_raw), collect_rows(&by_for));
+        Ok(())
+    }
+
+    #[test]
+    fn for_row_encode_matches_canonical_bitpacked_encoded() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let n: usize = 1500;
+        let raw_values: Vec<u32> = (0..n as u32).map(|i| 10_000 + i).collect();
+        let raw_arr = PrimitiveArray::from_iter(raw_values).into_array();
+        let unsigned_encoded =
+            PrimitiveArray::from_iter((0..n as u32).collect::<Vec<u32>>()).into_array();
+        let bp = BitPackedData::encode(&unsigned_encoded, 11, &mut ctx)?.into_array();
+        let arr = FoR::try_new(bp, Scalar::from(10_000u32))?.into_array();
+
+        let by_raw = convert_columns(&[raw_arr], &[SortField::default()], &mut ctx)?;
+        let by_for = convert_columns(&[arr], &[SortField::default()], &mut ctx)?;
+        assert_eq!(collect_rows(&by_raw), collect_rows(&by_for));
+        Ok(())
+    }
+}
diff --git a/vortex-row/benches/row_encode.rs b/vortex-row/benches/row_encode.rs
index 769404c82c3..5dba51938a0 100644
--- a/vortex-row/benches/row_encode.rs
+++ b/vortex-row/benches/row_encode.rs
@@ -45,7 +45,10 @@ use vortex_array::arrays::StructArray;
 use vortex_array::arrays::VarBinViewArray;
 use vortex_array::builders::dict::dict_encode;
 use vortex_array::patches::Patches;
+use vortex_array::scalar::Scalar;
 use vortex_fastlanes::BitPackedData;
+use vortex_fastlanes::Delta;
+use vortex_fastlanes::FoR;
 use vortex_row::SortField;
 use vortex_row::convert_columns;
 
@@ -429,3 +432,137 @@ fn bitpacked_i32_without_kernel(bencher: divan::Bencher) {
         convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
     })
 }
+
+// ---------- for_i64 ----------
+
+fn gen_for_i64_values(n: usize, seed: u64) -> Vec<i64> {
+    // Tightly clustered values around a base — FoR will store small deltas.
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..n)
+        .map(|_| 1_000_000_000i64 + rng.random_range(0i64..65536))
+        .collect()
+}
+
+#[divan::bench]
+fn for_i64_arrow_row(bencher: divan::Bencher) {
+    let v = gen_for_i64_values(N, 200);
+    let arr = Arc::new(Int64Array::from(v)) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int64)]).unwrap();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher
+        .counter(BytesCount::new(bytes))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn for_i64_with_kernel(bencher: divan::Bencher) {
+    let v = gen_for_i64_values(N, 200);
+    let raw = PrimitiveArray::from_iter(v.clone()).into_array();
+    // Build a FoR around a BitPacked array. The encoded form is `value - reference`
+    // bit-packed to fit the deltas (16 bits suffices for the 0..65536 range above).
+    let deltas: Vec<u64> = v.iter().map(|&x| (x - 1_000_000_000) as u64).collect();
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let bp = BitPackedData::encode(
+        &PrimitiveArray::from_iter(deltas).into_array(),
+        17,
+        &mut setup_ctx,
+    )
+    .unwrap()
+    .into_array();
+    drop(raw);
+    let arr = FoR::try_new(bp, Scalar::from(1_000_000_000u64))
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[arr.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn for_i64_without_kernel(bencher: divan::Bencher) {
+    let v = gen_for_i64_values(N, 200);
+    let deltas: Vec<u64> = v.iter().map(|&x| (x - 1_000_000_000) as u64).collect();
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let bp = BitPackedData::encode(
+        &PrimitiveArray::from_iter(deltas).into_array(),
+        17,
+        &mut setup_ctx,
+    )
+    .unwrap()
+    .into_array();
+    let arr = FoR::try_new(bp, Scalar::from(1_000_000_000u64))
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = arr
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+// ---------- delta_i64 ----------
+
+fn gen_delta_i64_values(n: usize, seed: u64) -> Vec<i64> {
+    // Monotonic-ish sequence with small step sizes (a common shape for sorted IDs / timestamps).
+    let mut rng = StdRng::seed_from_u64(seed);
+    let mut v = Vec::with_capacity(n);
+    let mut cur: i64 = 1_700_000_000_000;
+    for _ in 0..n {
+        cur = cur.wrapping_add(rng.random_range(1i64..100));
+        v.push(cur);
+    }
+    v
+}
+
+#[divan::bench]
+fn delta_i64_arrow_row(bencher: divan::Bencher) {
+    let v = gen_delta_i64_values(N, 300);
+    let arr = Arc::new(Int64Array::from(v)) as arrow_array::ArrayRef;
+    let conv = RowConverter::new(vec![ArrowSortField::new(DataType::Int64)]).unwrap();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher
+        .counter(BytesCount::new(bytes))
+        .bench_local(|| conv.convert_columns(&[arr.clone()]).unwrap())
+}
+
+#[divan::bench]
+fn delta_i64_with_kernel(bencher: divan::Bencher) {
+    let v = gen_delta_i64_values(N, 300);
+    let p = PrimitiveArray::from_iter(v.clone());
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let arr = Delta::try_from_primitive_array(&p, &mut setup_ctx)
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        convert_columns(&[arr.clone()], &[SortField::default()], &mut ctx).unwrap()
+    })
+}
+
+#[divan::bench]
+fn delta_i64_without_kernel(bencher: divan::Bencher) {
+    let v = gen_delta_i64_values(N, 300);
+    let p = PrimitiveArray::from_iter(v);
+    let mut setup_ctx = LEGACY_SESSION.create_execution_ctx();
+    let arr = Delta::try_from_primitive_array(&p, &mut setup_ctx)
+        .unwrap()
+        .into_array();
+    let bytes = (N * (1 + 8)) as u64;
+    bencher.counter(BytesCount::new(bytes)).bench_local(|| {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let canonical = arr
+            .clone()
+            .execute::<Canonical>(&mut ctx)
+            .unwrap()
+            .into_array();
+        convert_columns(&[canonical], &[SortField::default()], &mut ctx).unwrap()
+    })
+}

From 7021233c1dfbffc0edb3bcabaefa416ade7f59df Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 18 May 2026 16:38:44 +0000
Subject: [PATCH 26/26] Fix CI: regenerate public-api lock and break vortex-row
 publish cycle

Two unrelated CI failures on PR #7985:

1. Check generated source files: vortex-row/public-api.lock was stale
   - field_encode_fixed_arithmetic became pub in the arithmetic-write
   commit but the lock wasn't regenerated.

2. Rust publish dry-run: vortex-row's dev-dep on vortex-fastlanes was
   inherited from the workspace with a version specifier. Since
   vortex-fastlanes itself depends on vortex-row (for the inventory
   kernel registration), cargo publish couldn't resolve the version
   on crates.io. Drop the workspace inheritance and use a path-only
   dev-dep for vortex-fastlanes - the bench file is the only consumer
   and cargo strips path-only dev-deps from the published manifest.

Signed-off-by: Claude <noreply@anthropic.com>
---
 vortex-row/Cargo.toml      | 5 ++++-
 vortex-row/public-api.lock | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/vortex-row/Cargo.toml b/vortex-row/Cargo.toml
index 6ccfa258bc9..ca60a791f98 100644
--- a/vortex-row/Cargo.toml
+++ b/vortex-row/Cargo.toml
@@ -36,7 +36,10 @@ mimalloc = { workspace = true }
 rand = { workspace = true }
 rstest = { workspace = true }
 vortex-array = { workspace = true, features = ["_test-harness"] }
-vortex-fastlanes = { workspace = true }
+# Path-only (no version) so cargo publish doesn't try to look up vortex-fastlanes on
+# crates.io. vortex-fastlanes itself depends on vortex-row, so a versioned dev-dep here
+# would create a publish-time chicken-and-egg. The bench file is the only consumer.
+vortex-fastlanes = { path = "../encodings/fastlanes" }
 
 [[bench]]
 name = "row_encode"
diff --git a/vortex-row/public-api.lock b/vortex-row/public-api.lock
index bded46f3c30..ecc834cca19 100644
--- a/vortex-row/public-api.lock
+++ b/vortex-row/public-api.lock
@@ -98,6 +98,8 @@ pub fn vortex_row::codec::encoded_size_for_scalar(&vortex_array::scalar::Scalar,
 
 pub fn vortex_row::codec::field_encode(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &[u32], &mut [u32], &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
 
+pub fn vortex_row::codec::field_encode_fixed_arithmetic(&vortex_array::canonical::Canonical, vortex_row::options::SortField, u32, u32, core::option::Option<&[u32]>, u32, &mut [u8], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
+
 pub fn vortex_row::codec::field_size(&vortex_array::canonical::Canonical, vortex_row::options::SortField, &mut [u32], &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
 
 pub fn vortex_row::codec::row_width_for_dtype(&vortex_array::dtype::DType) -> vortex_error::VortexResult<vortex_row::codec::RowWidth>