diff --git a/vortex-row/src/encode.rs b/vortex-row/src/encode.rs
index 8c930406643..1ca24ec378c 100644
--- a/vortex-row/src/encode.rs
+++ b/vortex-row/src/encode.rs
@@ -21,6 +21,7 @@ use vortex_array::Canonical;
 use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
 use vortex_array::VTable;
+use vortex_array::arrays::Constant;
 use vortex_array::arrays::ListViewArray;
 use vortex_array::arrays::Primitive;
 use vortex_array::arrays::PrimitiveArray;
@@ -317,14 +318,150 @@ fn dispatch_encode_fixed_arith(
             &canonical, field, col_prefix, row_stride, var_prefix, width, out, ctx,
         );
     }
-    // For other fixed columns route through canonicalization and the codec helpers. The
-    // Constant fast path is layered on in a follow-up commit.
+    // Constant fast path: write the same scalar bytes at each per-row position.
+    if let Some(view) = col.as_opt::<Constant>() {
+        return encode_constant_arith(view, field, col_prefix, row_stride, var_prefix, width, out);
+    }
+    // For other fixed columns route through canonicalization and the codec helpers.
     let canonical = col.clone().execute::<Canonical>(ctx)?;
     codec::field_encode_fixed_arithmetic(
         &canonical, field, col_prefix, row_stride, var_prefix, width, out, ctx,
     )
 }
 
+/// Constant-specific arithmetic writer.
+///
+/// Encodes the constant's scalar once via `codec::encode_scalar`, then stores those bytes
+/// into the slot of every row in `out`. Row `i` is written at byte offset
+/// `i * row_stride + col_prefix`, plus `var_prefix[i]` when a prefix table is supplied.
+/// Offsets are computed in `usize`, so they cannot wrap the way `u32` arithmetic would on
+/// outputs larger than 4 GiB. Nothing is written when the encoded scalar or the array is
+/// empty. `_width` is not read: the per-row byte count is the encoded scalar's length.
+///
+/// # Errors
+///
+/// Propagates any error returned by `codec::encode_scalar`.
+///
+/// # Panics
+///
+/// Panics if `var_prefix` is `Some` and holds fewer than `view.len()` entries.
+fn encode_constant_arith(
+    view: ArrayView<'_, Constant>,
+    field: SortField,
+    col_prefix: u32,
+    row_stride: u32,
+    var_prefix: Option<&[u32]>,
+    _width: u32,
+    out: &mut [u8],
+) -> VortexResult<()> {
+    let bytes = codec::encode_scalar(view.scalar(), field)?;
+    let len = bytes.len();
+    let n = view.len();
+    if len == 0 || n == 0 {
+        return Ok(());
+    }
+    let src = bytes.as_ptr();
+    let stride = row_stride as usize;
+    let col_prefix = col_prefix as usize;
+    // SAFETY: `src` addresses the `len` encoded bytes owned by `bytes`, which lives until the
+    // end of this function and is a separate buffer from `out`. The in-bounds requirement
+    // (`pos + len <= out.len()` for every row) is the caller's sizing contract; it is not
+    // re-checked here in release builds, only debug-asserted inside `write_constant_rows`.
+    unsafe {
+        match var_prefix {
+            None => {
+                let positions = (0..n).map(|row| row * stride + col_prefix);
+                write_constant_rows(src, len, out, positions);
+            }
+            Some(vp) => {
+                // Slice once so a too-short prefix table panics before any byte is written.
+                let positions = vp[..n]
+                    .iter()
+                    .enumerate()
+                    .map(|(row, &extra)| row * stride + col_prefix + extra as usize);
+                write_constant_rows(src, len, out, positions);
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Stores the `len` bytes at `src` at every byte offset of `out` yielded by `positions`.
+///
+/// For the lengths 2, 5, 9 and 17 the value is loaded into locals once before the loop and
+/// each row gets fixed-size unaligned stores, avoiding a per-row `memcpy` call for tiny
+/// copies; any other length is copied per row with `copy_nonoverlapping`.
+///
+/// # Safety
+///
+/// `src` must be valid for reads of `len` bytes and must not overlap `out`, and every
+/// yielded `pos` must satisfy `pos + len <= out.len()`.
+unsafe fn write_constant_rows(
+    src: *const u8,
+    len: usize,
+    out: &mut [u8],
+    positions: impl Iterator<Item = usize>,
+) {
+    let out_len = out.len();
+    let base = out.as_mut_ptr();
+    // Maps a row offset to its destination pointer. The bounds check is compiled out when
+    // debug assertions are disabled, so it adds nothing to the release loop.
+    let slot = |pos: usize| {
+        debug_assert!(pos <= out_len && len <= out_len - pos, "constant row write out of bounds");
+        // SAFETY: the caller guarantees `pos + len <= out.len()`.
+        unsafe { base.add(pos) }
+    };
+    // SAFETY: each arm reads exactly `len` bytes from `src` and writes exactly `len` bytes at
+    // `slot(pos)`, both of which the caller guarantees to be in bounds.
+    unsafe {
+        match len {
+            // i64-typical: 1 sentinel + 8 value bytes.
+            9 => {
+                let lo = src.cast::<u64>().read_unaligned();
+                let hi = *src.add(8);
+                for pos in positions {
+                    let dst = slot(pos);
+                    dst.cast::<u64>().write_unaligned(lo);
+                    *dst.add(8) = hi;
+                }
+            }
+            // i32-typical: 1 sentinel + 4 value bytes.
+            5 => {
+                let lo = src.cast::<u32>().read_unaligned();
+                let hi = *src.add(4);
+                for pos in positions {
+                    let dst = slot(pos);
+                    dst.cast::<u32>().write_unaligned(lo);
+                    *dst.add(4) = hi;
+                }
+            }
+            // bool / i8: 1 sentinel + 1 value byte.
+            2 => {
+                let value = src.cast::<u16>().read_unaligned();
+                for pos in positions {
+                    slot(pos).cast::<u16>().write_unaligned(value);
+                }
+            }
+            // i128: 1 sentinel + 16 value bytes.
+            17 => {
+                let lo = src.cast::<u128>().read_unaligned();
+                let hi = *src.add(16);
+                for pos in positions {
+                    let dst = slot(pos);
+                    dst.cast::<u128>().write_unaligned(lo);
+                    *dst.add(16) = hi;
+                }
+            }
+            // General fallback for other lengths.
+            _ => {
+                for pos in positions {
+                    std::ptr::copy_nonoverlapping(src, slot(pos), len);
+                }
+            }
+        }
+    }
+}
+
 /// Dispatch a single column's encoding into the shared `out` buffer.
 ///
 /// For PR 1 this is just the canonicalize-then-`codec::field_encode` fallback path.