diff --git a/vortex-row/src/codec.rs b/vortex-row/src/codec.rs
index 37295536d1d..360a22b9d0a 100644
--- a/vortex-row/src/codec.rs
+++ b/vortex-row/src/codec.rs
@@ -691,36 +691,108 @@ fn encode_extension(
 
 /// Encode a variable-length byte slice into `out` in 32-byte blocks with
 /// continuation markers. Returns the number of bytes written.
+///
+/// For the ascending path (`descending == false`), the hot loop is a `copy_nonoverlapping`
+/// of 32 bytes per block plus one stamped continuation byte — no per-byte work. For the
+/// descending path, the hot loop reads u64-at-a-time and XORs with 0xFF to give LLVM
+/// a vectorizable inner loop.
+///
+/// # Panics
+/// Panics if `out` is shorter than the encoded length. The length check is a hard
+/// `assert!` (not `debug_assert!`) because the raw-pointer writes below rely on it.
 fn encode_varlen_value(bytes: &[u8], out: &mut [u8], descending: bool) -> u32 {
-    let xor = if descending { 0xFFu8 } else { 0x00 };
     if bytes.is_empty() {
-        // Single zero terminator.
-        out[0] = xor;
+        // Single zero terminator (descending flips it to 0xFF).
+        out[0] = if descending { 0xFF } else { 0 };
         return 1;
     }
-    let mut written = 0usize;
-    let mut remaining = bytes;
-    while remaining.len() > VARLEN_BLOCK_SIZE {
-        // Full block, continuation marker 0xFF (then XORed if descending).
-        let block = &remaining[..VARLEN_BLOCK_SIZE];
-        for (i, &b) in block.iter().enumerate() {
-            out[written + i] = b ^ xor;
+    let len = bytes.len();
+    let full_blocks = len / VARLEN_BLOCK_SIZE;
+    let partial = len % VARLEN_BLOCK_SIZE;
+    let (full_to_write, partial_block_len) = if partial == 0 {
+        // Length is an exact multiple of 32. The spec emits (full_blocks-1) full blocks
+        // with 0xFF continuation, plus a final block whose continuation byte is 32.
+        (full_blocks - 1, VARLEN_BLOCK_SIZE)
+    } else {
+        (full_blocks, partial)
+    };
+    let total = (full_to_write + 1) * VARLEN_BLOCK_TOTAL;
+    // Hard `assert!` rather than `debug_assert!`: the raw-pointer writes below are
+    // only sound if `out` is large enough, and that must hold in release builds too.
+    assert!(
+        out.len() >= total,
+        "encode_varlen_value: output buffer too small ({} < {})",
+        out.len(),
+        total
+    );
+
+    // SAFETY: `out` holds at least `total` bytes (asserted above). Writes touch
+    // `full_to_write + 1` blocks of VARLEN_BLOCK_TOTAL bytes each and so stay within
+    // `total` bytes; this relies on VARLEN_BLOCK_TOTAL > VARLEN_BLOCK_SIZE so the marker
+    // byte at offset VARLEN_BLOCK_SIZE stays inside its block. Reads cover
+    // `full_to_write * VARLEN_BLOCK_SIZE + partial_block_len == bytes.len()` bytes.
+    // `bytes` and `out` cannot overlap because `out` is an exclusive borrow.
+    unsafe {
+        let mut src = bytes.as_ptr();
+        let mut dst = out.as_mut_ptr();
+
+        if !descending {
+            // Ascending fast path: full blocks are memcpy + a single 0xFF stamp.
+            for _ in 0..full_to_write {
+                std::ptr::copy_nonoverlapping(src, dst, VARLEN_BLOCK_SIZE);
+                *dst.add(VARLEN_BLOCK_SIZE) = 0xFF;
+                src = src.add(VARLEN_BLOCK_SIZE);
+                dst = dst.add(VARLEN_BLOCK_TOTAL);
+            }
+            // Final block: copy the partial data, zero-pad the tail, write the
+            // length byte as the continuation marker.
+            std::ptr::copy_nonoverlapping(src, dst, partial_block_len);
+            std::ptr::write_bytes(
+                dst.add(partial_block_len),
+                0,
+                VARLEN_BLOCK_SIZE - partial_block_len,
+            );
+            *dst.add(VARLEN_BLOCK_SIZE) = partial_block_len as u8;
+        } else {
+            // Descending: invert all value bytes. u64-stride XOR gives LLVM a
+            // vectorizable inner loop; the tail handles the partial block.
+            for _ in 0..full_to_write {
+                xor_copy_block(src, dst);
+                *dst.add(VARLEN_BLOCK_SIZE) = 0x00; // descending counterpart of 0xFF
+                src = src.add(VARLEN_BLOCK_SIZE);
+                dst = dst.add(VARLEN_BLOCK_TOTAL);
+            }
+            // Final block: XOR-copy the partial data, fill the tail with 0xFF
+            // (which is 0x00 XOR 0xFF), then write the inverted length byte.
+            for i in 0..partial_block_len {
+                *dst.add(i) = *src.add(i) ^ 0xFF;
+            }
+            std::ptr::write_bytes(
+                dst.add(partial_block_len),
+                0xFF,
+                VARLEN_BLOCK_SIZE - partial_block_len,
+            );
+            *dst.add(VARLEN_BLOCK_SIZE) = (partial_block_len as u8) ^ 0xFF;
         }
-        out[written + VARLEN_BLOCK_SIZE] = 0xFF ^ xor;
-        written += VARLEN_BLOCK_TOTAL;
-        remaining = &remaining[VARLEN_BLOCK_SIZE..];
-    }
-    // Final partial block: pad with zeros, last byte = remaining.len() (1..=32).
-    let n = remaining.len();
-    for (i, &b) in remaining.iter().enumerate() {
-        out[written + i] = b ^ xor;
     }
-    for j in n..VARLEN_BLOCK_SIZE {
-        out[written + j] = xor;
+    total as u32
+}
+
+/// Copy 32 bytes from `src` to `dst`, XORing each byte with 0xFF. Done as four unaligned
+/// u64 loads/stores so LLVM can vectorize it (reportedly verified with cargo asm on x86).
+///
+/// # Safety
+/// `src` must be valid for reads of 32 bytes and `dst` valid for writes of 32 bytes;
+/// the two regions must not overlap. Neither pointer needs to be aligned.
+#[inline(always)]
+unsafe fn xor_copy_block(src: *const u8, dst: *mut u8) {
+    // Four u64 lanes of 8 bytes each = 32 bytes total.
+    for i in 0..4 {
+        let off = i * 8;
+        // SAFETY (read and write): off + 8 <= 32 and the caller guarantees 32 valid bytes.
+        let v = unsafe { std::ptr::read_unaligned(src.add(off) as *const u64) };
+        unsafe { std::ptr::write_unaligned(dst.add(off) as *mut u64, v ^ u64::MAX) };
     }
-    out[written + VARLEN_BLOCK_SIZE] = (n as u8) ^ xor;
-    written += VARLEN_BLOCK_TOTAL;
-    written as u32
 }
 
 /// Internal trait for encoding a fixed-width native value into byte slots.