Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## 4.0.0
- **Breaking**: Make `delay` passed into `par_iter_bp_delayed` a strong type `Delay(pub usize)` to
reduce potential for bugs.
- **Breaking**: Encapsulate parallel iterators in new `PaddedIt { it, padding: usize }` type with `.map`, `.advance`, and `.collect_into` functions.
- Make `intrinsics::transpose` public for use in `collect_and_dedup` in `simd_minimizers`.

## 3.2.1
- Add `Seq::read_{revcomp}_kmer_u128` with more tests
- Fix bug in `revcomp_u128`
Expand Down
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ pyo3 = { version = "0.25", features = ["extension-module"], optional = true }
[features]
# Also needed for tests.
default = ["rand"]

# Silences the compile error that is raised when neither AVX2 nor NEON is detected.
scalar = []
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@ crate was developed:
This library supports AVX2 and NEON instruction sets.
Make sure to set `RUSTFLAGS="-C target-cpu=native"` when compiling to use the instruction sets available on your architecture.

RUSTFLAGS="-C target-cpu=native" cargo run --release
``` sh
RUSTFLAGS="-C target-cpu=native" cargo run --release
```

Enable the `scalar` feature (e.g. `cargo run --release -F scalar`) to fall back to a
scalar implementation with reduced performance.

## Usage example

Expand Down
22 changes: 11 additions & 11 deletions src/ascii.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{intrinsics::transpose, packed_seq::read_slice};
use crate::{intrinsics::transpose, packed_seq::read_slice, padded_it::ChunkIt};

use super::*;

Expand Down Expand Up @@ -71,13 +71,13 @@ impl Seq<'_> for &[u8] {

/// Iter the ASCII characters.
#[inline(always)]
fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> + Clone {
fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> {
self.iter().copied()
}

/// Iter the ASCII characters in parallel.
#[inline(always)]
fn par_iter_bp(self, context: usize) -> (impl ExactSizeIterator<Item = S> + Clone, usize) {
fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<u32x8>> {
let num_kmers = self.len().saturating_sub(context - 1);
let n = num_kmers.div_ceil(L);
let padding = L * n - num_kmers;
Expand Down Expand Up @@ -112,15 +112,15 @@ impl Seq<'_> for &[u8] {
},
);

(it, padding)
PaddedIt { it, padding }
}

#[inline(always)]
fn par_iter_bp_delayed(
self,
context: usize,
delay: usize,
) -> (impl ExactSizeIterator<Item = (S, S)> + Clone, usize) {
Delay(delay): Delay,
) -> PaddedIt<impl ChunkIt<(u32x8, u32x8)>> {
assert!(
delay < usize::MAX / 2,
"Delay={} should be >=0.",
Expand Down Expand Up @@ -185,16 +185,16 @@ impl Seq<'_> for &[u8] {
},
);

(it, padding)
PaddedIt { it, padding }
}

#[inline(always)]
fn par_iter_bp_delayed_2(
self,
context: usize,
delay1: usize,
delay2: usize,
) -> (impl ExactSizeIterator<Item = (S, S, S)> + Clone, usize) {
Delay(delay1): Delay,
Delay(delay2): Delay,
) -> PaddedIt<impl ChunkIt<(u32x8, u32x8, u32x8)>> {
assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

let num_kmers = self.len().saturating_sub(context - 1);
Expand Down Expand Up @@ -266,7 +266,7 @@ impl Seq<'_> for &[u8] {
},
);

(it, padding)
PaddedIt { it, padding }
}

// TODO: This is not very optimized.
Expand Down
22 changes: 11 additions & 11 deletions src/ascii_seq.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{intrinsics::transpose, packed_seq::read_slice};
use crate::{intrinsics::transpose, packed_seq::read_slice, padded_it::ChunkIt};

use super::*;

Expand Down Expand Up @@ -240,7 +240,7 @@ impl<'s> Seq<'s> for AsciiSeq<'s> {
///
/// NOTE: This is only efficient on x86_64 with `BMI2` support for `pext`.
#[inline(always)]
fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> + Clone {
fn iter_bp(self) -> impl ExactSizeIterator<Item = u8> {
#[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
{
let mut cache = 0;
Expand Down Expand Up @@ -274,7 +274,7 @@ impl<'s> Seq<'s> for AsciiSeq<'s> {

/// Iterate the basepairs in the sequence in 8 parallel streams, assuming values in `0..4`.
#[inline(always)]
fn par_iter_bp(self, context: usize) -> (impl ExactSizeIterator<Item = S> + Clone, usize) {
fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<S>> {
let num_kmers = self.len().saturating_sub(context - 1);
let n = num_kmers.div_ceil(L);
let padding = L * n - num_kmers;
Expand Down Expand Up @@ -312,15 +312,15 @@ impl<'s> Seq<'s> for AsciiSeq<'s> {
},
);

(it, padding)
PaddedIt { it, padding }
}

#[inline(always)]
fn par_iter_bp_delayed(
self,
context: usize,
delay: usize,
) -> (impl ExactSizeIterator<Item = (S, S)> + Clone, usize) {
Delay(delay): Delay,
) -> PaddedIt<impl ChunkIt<(S, S)>> {
assert!(
delay < usize::MAX / 2,
"Delay={} should be >=0.",
Expand Down Expand Up @@ -388,16 +388,16 @@ impl<'s> Seq<'s> for AsciiSeq<'s> {
},
);

(it, padding)
PaddedIt { it, padding }
}

#[inline(always)]
fn par_iter_bp_delayed_2(
self,
context: usize,
delay1: usize,
delay2: usize,
) -> (impl ExactSizeIterator<Item = (S, S, S)> + Clone, usize) {
Delay(delay1): Delay,
Delay(delay2): Delay,
) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
assert!(delay1 <= delay2, "Delay1 must be at most delay2.");

let num_kmers = self.len().saturating_sub(context - 1);
Expand Down Expand Up @@ -472,7 +472,7 @@ impl<'s> Seq<'s> for AsciiSeq<'s> {
},
);

(it, padding)
PaddedIt { it, padding }
}

// TODO: This is not very optimized.
Expand Down
2 changes: 1 addition & 1 deletion src/intrinsics/transpose.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use wide::u32x4;
use wide::u32x8 as S;

/// Transpose a matrix of 8 SIMD vectors.
/// Transpose an 8x8 matrix of `u32` values, stored as 8 `u32x8` SIMD vectors.
/// <https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2>
// TODO: Investigate other transpose functions mentioned there?
pub fn transpose(m: [S; 8]) -> [S; 8] {
Expand Down
37 changes: 33 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@
//! But rather than returning only a single character, they additionally return a second (and third) character that is `delay` positions _behind_ the new character (at index `idx - delay`).
//! This way, k-mers can be enumerated by setting `delay=k` and then mapping e.g. `|(add, remove)| kmer = (kmer<<2) ^ add ^ (remove << (2*k))`.
//!
//! #### Collect
//!
//! Use [`PaddedIt::collect`] and [`PaddedIt::collect_into`] to collect the values returned by a parallel iterator over `u32x8` into a flat `Vec<u32>`.
//!
//! ## Example
//!
//! ```
Expand All @@ -78,19 +82,42 @@
//! // Iterate over 8 chunks at the same time.
//! let seq = b"AAAACCTTGGTTACTG"; // plain ASCII sequence
//! // chunks: ^ ^ ^ ^ ^ ^ ^ ^
//! let (par_iter, padding) = seq.as_slice().par_iter_bp(1);
//! let mut par_iter_u8 = par_iter.map(|x| x.as_array_ref().map(|c| c as u8));
//! // the `1` argument indicates a 'context' length of 1,
//! // since we're just iterating single characters.
//! let par_iter = seq.as_slice().par_iter_bp(1);
//! let mut par_iter_u8 = par_iter.it.map(|x| x.as_array_ref().map(|c| c as u8));
//! assert_eq!(par_iter_u8.next(), Some(*b"AACTGTAT"));
//! assert_eq!(par_iter_u8.next(), Some(*b"AACTGTCG"));
//! assert_eq!(par_iter_u8.next(), None);
//!
//! let bases: Vec<u32> = seq.as_slice().par_iter_bp(1).collect();
//! let bases: Vec<u8> = bases.into_iter().map(|x| x as u8).collect();
//! assert_eq!(bases, seq);
//!
//! // With context=3, the chunks overlap by 2 characters,
//! // which can be skipped using `advance`.
//! let bases: Vec<u32> = seq.as_slice().par_iter_bp(3).advance(2).collect();
//! let bases: Vec<u8> = bases.into_iter().map(|x| x as u8).collect();
//! assert_eq!(bases, &seq[2..]);
//! ```
//!
//! ## Feature flags
//! - `epserde` enables `derive(epserde::Epserde)` for `PackedSeqVec` and `AsciiSeqVec`, and adds its `SerializeInner` and `DeserializeInner` traits to `SeqVec`.
//! - `pyo3` enables `derive(pyo3::pyclass)` for `PackedSeqVec` and `AsciiSeqVec`.

#[cfg(not(any(
doc,
debug_assertions,
target_feature = "avx2",
target_feature = "neon",
feature = "scalar"
)))]
compile_error!(
"Packed-seq uses AVX2 or NEON SIMD instructions. Compile using `-C target-cpu=native` to get the expected performance. Silence this error using the `scalar` feature."
);

/// Functions with architecture-specific implementations.
mod intrinsics {
pub mod intrinsics {
mod transpose;
pub use transpose::transpose;
}
Expand All @@ -100,6 +127,7 @@ mod traits;
mod ascii;
mod ascii_seq;
mod packed_seq;
mod padded_it;

#[cfg(test)]
mod test;
Expand All @@ -114,7 +142,8 @@ pub use packed_seq::{
complement_base, complement_base_simd, complement_char, pack_char, unpack_base,
};
pub use packed_seq::{PackedSeq, PackedSeqVec};
pub use traits::{Seq, SeqVec};
pub use padded_it::{Advance, ChunkIt, PaddedIt};
pub use traits::{Delay, Seq, SeqVec};

// For internal use only.
use core::array::from_fn;
Expand Down
Loading