rust-seq · RagnarGrootKoerkamp · Sep 26, 2025 · Sep 16, 2025 · Sep 14, 2025 · Sep 16, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## 4.0.0
+- **Breaking**: Make `delay` passed into `par_iter_bp_delayed` a strong type `Delay(pub usize)` to
+  reduce potential for bugs.
+- **Breaking**: Encapsulate parallel iterators in new `PaddedIt { it, padding: usize }` type with `.map`, `.advance`, and `.collect_into` functions.
+- Make `intrinsics::transpose` public for use in `collect_and_dedup` in `simd_minimizers`.
+
 ## 3.2.1
 - Add `Seq::read_{revcomp}_kmer_u128` with more tests
 - Fix bug in `revcomp_u128`

diff --git a/Cargo.toml b/Cargo.toml
@@ -26,3 +26,6 @@ pyo3 = { version = "0.25", features = ["extension-module"], optional = true }
 [features]
 # Also needed for tests.
 default = ["rand"]
+
+# Silences the `compile_error!` emitted when neither AVX2 nor NEON is enabled.
+scalar = []
diff --git a/README.md b/README.md
@@ -26,8 +26,12 @@ crate was developed:
 This library supports AVX2 and NEON instruction sets.
 Make sure to set `RUSTFLAGS="-C target-cpu=native"` when compiling to use the instruction sets available on your architecture.
 
-    RUSTFLAGS="-C target-cpu=native" cargo run --release
+``` sh
+RUSTFLAGS="-C target-cpu=native" cargo run --release
+```
 
+Enable the `scalar` feature (`-F scalar`) to fall back to a scalar implementation with
+reduced performance.
 
 ## Usage example
 

diff --git a/src/ascii.rs b/src/ascii.rs
@@ -1,4 +1,4 @@
-use crate::{intrinsics::transpose, packed_seq::read_slice};
+use crate::{intrinsics::transpose, packed_seq::read_slice, padded_it::ChunkIt};
 
 use super::*;
 
@@ -71,13 +71,13 @@ impl Seq<'_> for &[u8] {
 
     /// Iter the ASCII characters.
     #[inline(always)]
-    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> + Clone {
+    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> {
         self.iter().copied()
     }
 
     /// Iter the ASCII characters in parallel.
     #[inline(always)]
-    fn par_iter_bp(self, context: usize) -> (impl ExactSizeIterator<Item = S> + Clone, usize) {
+    fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<u32x8>> {
         let num_kmers = self.len().saturating_sub(context - 1);
         let n = num_kmers.div_ceil(L);
         let padding = L * n - num_kmers;
@@ -112,15 +112,15 @@ impl Seq<'_> for &[u8] {
             },
         );
 
-        (it, padding)
+        PaddedIt { it, padding }
     }
 
     #[inline(always)]
     fn par_iter_bp_delayed(
         self,
         context: usize,
-        delay: usize,
-    ) -> (impl ExactSizeIterator<Item = (S, S)> + Clone, usize) {
+        Delay(delay): Delay,
+    ) -> PaddedIt<impl ChunkIt<(u32x8, u32x8)>> {
         assert!(
             delay < usize::MAX / 2,
             "Delay={} should be >=0.",
@@ -185,16 +185,16 @@ impl Seq<'_> for &[u8] {
             },
         );
 
-        (it, padding)
+        PaddedIt { it, padding }
     }
 
     #[inline(always)]
     fn par_iter_bp_delayed_2(
         self,
         context: usize,
-        delay1: usize,
-        delay2: usize,
-    ) -> (impl ExactSizeIterator<Item = (S, S, S)> + Clone, usize) {
+        Delay(delay1): Delay,
+        Delay(delay2): Delay,
+    ) -> PaddedIt<impl ChunkIt<(u32x8, u32x8, u32x8)>> {
         assert!(delay1 <= delay2, "Delay1 must be at most delay2.");
 
         let num_kmers = self.len().saturating_sub(context - 1);
@@ -266,7 +266,7 @@ impl Seq<'_> for &[u8] {
             },
         );
 
-        (it, padding)
+        PaddedIt { it, padding }
     }
 
     // TODO: This is not very optimized.

diff --git a/src/ascii_seq.rs b/src/ascii_seq.rs
@@ -1,4 +1,4 @@
-use crate::{intrinsics::transpose, packed_seq::read_slice};
+use crate::{intrinsics::transpose, packed_seq::read_slice, padded_it::ChunkIt};
 
 use super::*;
 
@@ -240,7 +240,7 @@ impl<'s> Seq<'s> for AsciiSeq<'s> {
     ///
     /// NOTE: This is only efficient on x86_64 with `BMI2` support for `pext`.
     #[inline(always)]
-    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> + Clone {
+    fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> {
         #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
         {
             let mut cache = 0;
@@ -274,7 +274,7 @@ impl<'s> Seq<'s> for AsciiSeq<'s> {
 
     /// Iterate the basepairs in the sequence in 8 parallel streams, assuming values in `0..4`.
     #[inline(always)]
-    fn par_iter_bp(self, context: usize) -> (impl ExactSizeIterator<Item = S> + Clone, usize) {
+    fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<S>> {
         let num_kmers = self.len().saturating_sub(context - 1);
         let n = num_kmers.div_ceil(L);
         let padding = L * n - num_kmers;
@@ -312,15 +312,15 @@ impl<'s> Seq<'s> for AsciiSeq<'s> {
             },
         );
 
-        (it, padding)
+        PaddedIt { it, padding }
     }
 
     #[inline(always)]
     fn par_iter_bp_delayed(
         self,
         context: usize,
-        delay: usize,
-    ) -> (impl ExactSizeIterator<Item = (S, S)> + Clone, usize) {
+        Delay(delay): Delay,
+    ) -> PaddedIt<impl ChunkIt<(S, S)>> {
         assert!(
             delay < usize::MAX / 2,
             "Delay={} should be >=0.",
@@ -388,16 +388,16 @@ impl<'s> Seq<'s> for AsciiSeq<'s> {
             },
         );
 
-        (it, padding)
+        PaddedIt { it, padding }
     }
 
     #[inline(always)]
     fn par_iter_bp_delayed_2(
         self,
         context: usize,
-        delay1: usize,
-        delay2: usize,
-    ) -> (impl ExactSizeIterator<Item = (S, S, S)> + Clone, usize) {
+        Delay(delay1): Delay,
+        Delay(delay2): Delay,
+    ) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
         assert!(delay1 <= delay2, "Delay1 must be at most delay2.");
 
         let num_kmers = self.len().saturating_sub(context - 1);
@@ -472,7 +472,7 @@ impl<'s> Seq<'s> for AsciiSeq<'s> {
             },
         );
 
-        (it, padding)
+        PaddedIt { it, padding }
     }
 
     // TODO: This is not very optimized.

diff --git a/src/intrinsics/transpose.rs b/src/intrinsics/transpose.rs
@@ -3,7 +3,7 @@
 use wide::u32x4;
 use wide::u32x8 as S;
 
-/// Transpose a matrix of 8 SIMD vectors.
+/// Transpose an 8x8 matrix of `u32` values stored as 8 `u32x8` SIMD vectors.
 /// <https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2>
 // TODO: Investigate other transpose functions mentioned there?
 pub fn transpose(m: [S; 8]) -> [S; 8] {

diff --git a/src/lib.rs b/src/lib.rs
@@ -52,6 +52,10 @@
 //! But instead of just returning a single character, they also return a second (and third) character, that is `delay` positions _behind_ the new character (at index `idx - delay`).
 //! This way, k-mers can be enumerated by setting `delay=k` and then mapping e.g. `|(add, remove)| kmer = (kmer<<2) ^ add ^ (remove << (2*k))`.
 //!
+//! #### Collect
+//!
+//! Use [`PaddedIt::collect`] and [`PaddedIt::collect_into`] to collect the values returned by a parallel iterator over `u32x8` into a flat `Vec<u32>`.
+//!
 //! ## Example
 //!
 //! ```
@@ -78,19 +82,42 @@
 //! // Iterate over 8 chunks at the same time.
 //! let seq = b"AAAACCTTGGTTACTG"; // plain ASCII sequence
 //! // chunks:  ^ ^ ^ ^ ^ ^ ^ ^
-//! let (par_iter, padding) = seq.as_slice().par_iter_bp(1);
-//! let mut par_iter_u8 = par_iter.map(|x| x.as_array_ref().map(|c| c as u8));
+//! // the `1` argument indicates a 'context' length of 1,
+//! // since we're just iterating single characters.
+//! let par_iter = seq.as_slice().par_iter_bp(1);
+//! let mut par_iter_u8 = par_iter.it.map(|x| x.as_array_ref().map(|c| c as u8));
 //! assert_eq!(par_iter_u8.next(), Some(*b"AACTGTAT"));
 //! assert_eq!(par_iter_u8.next(), Some(*b"AACTGTCG"));
 //! assert_eq!(par_iter_u8.next(), None);
+//!
+//! let bases: Vec<u32> = seq.as_slice().par_iter_bp(1).collect();
+//! let bases: Vec<u8> = bases.into_iter().map(|x| x as u8).collect();
+//! assert_eq!(bases, seq);
+//!
+//! // With context=3, the chunks overlap by 2 characters,
+//! // which can be skipped using `advance`.
+//! let bases: Vec<u32> = seq.as_slice().par_iter_bp(3).advance(2).collect();
+//! let bases: Vec<u8> = bases.into_iter().map(|x| x as u8).collect();
+//! assert_eq!(bases, &seq[2..]);
 //! ```
 //!
 //! ## Feature flags
 //! - `epserde` enables `derive(epserde::Epserde)` for `PackedSeqVec` and `AsciiSeqVec`, and adds its `SerializeInner` and `DeserializeInner` traits to `SeqVec`.
 //! - `pyo3` enables `derive(pyo3::pyclass)` for `PackedSeqVec` and `AsciiSeqVec`.
 
+#[cfg(not(any(
+    doc,
+    debug_assertions,
+    target_feature = "avx2",
+    target_feature = "neon",
+    feature = "scalar"
+)))]
+compile_error!(
+    "Packed-seq uses AVX2 or NEON SIMD instructions. Compile using `-C target-cpu=native` to get the expected performance. Silence this error using the `scalar` feature."
+);
+
 /// Functions with architecture-specific implementations.
-mod intrinsics {
+pub mod intrinsics {
     mod transpose;
     pub use transpose::transpose;
 }
@@ -100,6 +127,7 @@ mod traits;
 mod ascii;
 mod ascii_seq;
 mod packed_seq;
+mod padded_it;
 
 #[cfg(test)]
 mod test;
@@ -114,7 +142,8 @@ pub use packed_seq::{
     complement_base, complement_base_simd, complement_char, pack_char, unpack_base,
 };
 pub use packed_seq::{PackedSeq, PackedSeqVec};
-pub use traits::{Seq, SeqVec};
+pub use padded_it::{Advance, ChunkIt, PaddedIt};
+pub use traits::{Delay, Seq, SeqVec};
 
 // For internal use only.
 use core::array::from_fn;