Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions rust/bioscript-core/src/variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ pub struct VariantSpec {
pub grch38_assembly_ref: Option<String>,
pub reference: Option<String>,
pub alternate: Option<String>,
pub observed_alternates: Vec<String>,
pub kind: Option<VariantKind>,
pub deletion_length: Option<usize>,
pub motifs: Vec<String>,
Expand Down
2 changes: 1 addition & 1 deletion rust/bioscript-formats/src/alignment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ pub fn for_each_raw_cram_record_with_reader<R, F>(
) -> Result<(), RuntimeError>
where
R: Read + Seek,
F: FnMut(cram::Record<'_>) -> Result<bool, RuntimeError>,
F: FnMut(&cram::Record<'_>) -> Result<bool, RuntimeError>,
{
for_each_raw_cram_record_with_reader_inner(reader, label, locus, true, on_record)
}
Expand Down
10 changes: 5 additions & 5 deletions rust/bioscript-formats/src/alignment/cram_stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ pub(crate) fn for_each_raw_cram_record_with_reader_inner<R, F>(
) -> Result<(), RuntimeError>
where
R: Read + Seek,
F: FnMut(cram::Record<'_>) -> Result<bool, RuntimeError>,
F: FnMut(&cram::Record<'_>) -> Result<bool, RuntimeError>,
{
// Re-seeks to position 0 before reading the header so this helper is
// idempotent across repeated calls on the same indexed reader (e.g. a
Expand Down Expand Up @@ -178,7 +178,7 @@ where
selected_containers,
allow_reference_md5_mismatch,
&mut |record| {
let alignment_record = build_alignment_record_from_cram(label, &record)?;
let alignment_record = build_alignment_record_from_cram(label, record)?;
on_record(alignment_record)
},
)
Expand All @@ -196,7 +196,7 @@ fn stream_selected_cram_records<R, F>(
) -> Result<(), RuntimeError>
where
R: Read + Seek,
F: FnMut(cram::Record<'_>) -> Result<bool, RuntimeError>,
F: FnMut(&cram::Record<'_>) -> Result<bool, RuntimeError>,
{
let interval = region.interval();

Expand Down Expand Up @@ -325,7 +325,7 @@ fn handle_decoded_cram_record<F>(
on_record: &mut F,
) -> bool
where
F: FnMut(cram::Record<'_>) -> Result<bool, RuntimeError>,
F: FnMut(&cram::Record<'_>) -> Result<bool, RuntimeError>,
{
let alignment_record = match build_alignment_record_from_cram(label, record) {
Ok(record) => record,
Expand All @@ -344,7 +344,7 @@ where
return true;
}

match on_record(record.clone()) {
match on_record(record) {
Ok(true) => true,
Ok(false) => {
*stop = true;
Expand Down
2 changes: 2 additions & 0 deletions rust/bioscript-formats/src/genotype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@ mod tests {
grch38_assembly_ref: None,
reference: Some("A".to_owned()),
alternate: Some("G".to_owned()),
observed_alternates: vec!["G".to_owned()],
kind: Some(VariantKind::Snp),
deletion_length: None,
motifs: Vec::new(),
Expand Down Expand Up @@ -1530,6 +1531,7 @@ mod tests {
&locus("chr_test", 1000, 1000),
"A",
"AT",
&[2],
Some("mini_indel".to_owned()),
Some(Assembly::Grch38),
)
Expand Down
17 changes: 17 additions & 0 deletions rust/bioscript-formats/src/genotype/alignment_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,14 @@ impl AlignmentBytesBackend {
VariantKind::Insertion | VariantKind::Indel => {
let reference = variant.reference.clone().unwrap_or_else(|| "I".to_owned());
let alternate = variant.alternate.clone().unwrap_or_else(|| "D".to_owned());
let alternate_lengths = indel_alternate_lengths(variant, &alternate);
observe_cram_indel_with_reader(
reader,
LABEL,
&locus,
&reference,
&alternate,
&alternate_lengths,
matched_rsid,
Some(assembly),
)
Expand Down Expand Up @@ -185,6 +187,21 @@ impl AlignmentBytesBackend {
}
}

fn indel_alternate_lengths(variant: &VariantSpec, fallback_alternate: &str) -> Vec<usize> {
let mut lengths = variant
.observed_alternates
.iter()
.map(String::len)
.filter(|len| *len > 0)
.collect::<Vec<_>>();
if lengths.is_empty() {
lengths.push(fallback_alternate.len());
}
lengths.sort_unstable();
lengths.dedup();
lengths
}

fn first_base(value: Option<&str>) -> Option<char> {
value.and_then(|s| s.chars().next())
}
134 changes: 87 additions & 47 deletions rust/bioscript-formats/src/genotype/bam_backend.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use std::{
collections::BTreeMap,
collections::{BTreeMap, BTreeSet},
fs::File,
io::{Read, Seek},
};
Expand Down Expand Up @@ -113,6 +113,7 @@ pub fn observe_bam_variant<R: Read + Seek>(
&locus,
ref_char,
alt_char,
&variant.observed_alternates,
variant.rsids.first().cloned(),
assembly,
)
Expand All @@ -133,12 +134,14 @@ pub fn observe_bam_variant<R: Read + Seek>(
variant.rsids.first().map_or("variant", String::as_str)
))
})?;
let alternate_lengths = indel_alternate_lengths(variant, alternate);
observe_bam_indel_with_reader(
reader,
label,
&locus,
reference,
alternate,
&alternate_lengths,
variant.rsids.first().cloned(),
assembly,
)
Expand All @@ -156,7 +159,8 @@ fn observe_bam_snp_with_reader<R: Read + Seek>(
label: &str,
locus: &GenomicLocus,
reference: char,
alternate: char,
mut alternate: char,
observed_alternates: &[String],
matched_rsid: Option<String>,
assembly: Option<Assembly>,
) -> Result<VariantObservation, RuntimeError> {
Expand Down Expand Up @@ -242,6 +246,14 @@ fn observe_bam_snp_with_reader<R: Read + Seek>(
}
}

alternate = select_observed_snp_alternate(
reference,
alternate,
observed_alternates,
&counts.filtered_base_counts,
&counts.raw_base_counts,
);
recount_bam_snp_counts(&mut counts, reference, alternate);
let ref_count = counts.filtered_ref_count;
let alt_count = counts.filtered_alt_count;
let depth = counts.filtered_depth;
Expand All @@ -265,6 +277,53 @@ fn observe_bam_snp_with_reader<R: Read + Seek>(
})
}

fn select_observed_snp_alternate(
reference: char,
preferred_alternate: char,
observed_alternates: &[String],
filtered_base_counts: &BTreeMap<String, u32>,
raw_base_counts: &BTreeMap<String, u32>,
) -> char {
let preferred_alternate = preferred_alternate.to_ascii_uppercase();
let reference = reference.to_ascii_uppercase();
let mut candidates = BTreeSet::from([preferred_alternate]);
candidates.extend(
observed_alternates
.iter()
.filter_map(|alt| alt.trim().chars().next())
.map(|alt| alt.to_ascii_uppercase())
.filter(|alt| *alt != reference),
);
candidates
.into_iter()
.max_by_key(|candidate| {
let key = candidate.to_string();
(
filtered_base_counts.get(&key).copied().unwrap_or(0),
raw_base_counts.get(&key).copied().unwrap_or(0),
u8::from(*candidate == preferred_alternate),
)
})
.unwrap_or(preferred_alternate)
}

fn recount_bam_snp_counts(counts: &mut BamSnpPileupCounts, reference: char, alternate: char) {
let reference = reference.to_ascii_uppercase().to_string();
let alternate = alternate.to_ascii_uppercase().to_string();
counts.filtered_ref_count = counts
.filtered_base_counts
.get(&reference)
.copied()
.unwrap_or(0);
counts.filtered_alt_count = counts
.filtered_base_counts
.get(&alternate)
.copied()
.unwrap_or(0);
counts.raw_ref_count = counts.raw_base_counts.get(&reference).copied().unwrap_or(0);
counts.raw_alt_count = counts.raw_base_counts.get(&alternate).copied().unwrap_or(0);
}

fn observe_bam_deletion_with_reader<R: Read + Seek>(
reader: &mut noodles::bam::io::indexed_reader::IndexedReader<noodles::bgzf::io::Reader<R>>,
label: &str,
Expand Down Expand Up @@ -334,6 +393,7 @@ fn observe_bam_indel_with_reader<R: Read + Seek>(
locus: &GenomicLocus,
reference: &str,
alternate: &str,
alternate_lengths: &[usize],
matched_rsid: Option<String>,
assembly: Option<Assembly>,
) -> Result<VariantObservation, RuntimeError> {
Expand All @@ -354,8 +414,12 @@ fn observe_bam_indel_with_reader<R: Read + Seek>(
if alignment_record.is_unmapped || !record_overlaps_locus(&alignment_record, locus) {
continue;
}
let classification =
classify_expected_indel(&alignment_record, locus, reference.len(), alternate)?;
let classification = classify_expected_indel_lengths(
&alignment_record,
locus,
reference.len(),
alternate_lengths,
)?;
if !classification.covering {
continue;
}
Expand Down Expand Up @@ -397,54 +461,30 @@ fn observe_bam_indel_with_reader<R: Read + Seek>(
})
}

fn read_bam_header<R: Read + Seek>(
reader: &mut noodles::bam::io::indexed_reader::IndexedReader<noodles::bgzf::io::Reader<R>>,
label: &str,
) -> Result<noodles::sam::Header, RuntimeError> {
reader
.get_mut()
.seek(noodles::bgzf::VirtualPosition::MIN)
.map_err(|err| RuntimeError::Io(format!("failed to rewind BAM {label}: {err}")))?;
reader
.read_header()
.map_err(|err| RuntimeError::Io(format!("failed to read BAM header {label}: {err}")))
}

fn bam_region(
header: &noodles::sam::Header,
locus: &GenomicLocus,
) -> Result<noodles::core::Region, RuntimeError> {
let chrom = resolve_bam_reference_name(header, &locus.chrom).ok_or_else(|| {
RuntimeError::Unsupported(format!(
"indexed BAM does not contain contig {} for {}:{}-{}",
locus.chrom, locus.chrom, locus.start, locus.end
))
})?;
format!("{chrom}:{}-{}", locus.start, locus.end)
.parse()
.map_err(|err| RuntimeError::Io(format!("invalid BAM query region: {err}")))
}

fn resolve_bam_reference_name(header: &noodles::sam::Header, chrom: &str) -> Option<String> {
let candidates = [
chrom.to_owned(),
format!("chr{chrom}"),
chrom.trim_start_matches("chr").to_owned(),
];
candidates.into_iter().find(|candidate| {
header.reference_sequences().iter().any(|(name, _)| {
let name_bytes: &[u8] = name.as_ref();
name_bytes == candidate.as_bytes()
})
})
fn indel_alternate_lengths(variant: &VariantSpec, fallback_alternate: &str) -> Vec<usize> {
let mut lengths = variant
.observed_alternates
.iter()
.map(String::len)
.filter(|len| *len > 0)
.collect::<Vec<_>>();
if lengths.is_empty() {
lengths.push(fallback_alternate.len());
}
lengths.sort_unstable();
lengths.dedup();
lengths
}

#[path = "bam_backend/pileup.rs"]
mod pileup;
#[path = "bam_backend/query.rs"]
mod query;

use pileup::{
BamSnpPileupCounts, bam_alignment_record, bam_base_quality_at_reference_position,
classify_expected_indel, describe_copy_number_decision_rule, describe_snp_decision_rule,
indel_at_anchor, infer_copy_number_genotype, infer_snp_genotype, normalize_pileup_base,
record_overlaps_locus, spans_position,
classify_expected_indel_lengths, describe_copy_number_decision_rule,
describe_snp_decision_rule, indel_at_anchor, infer_copy_number_genotype, infer_snp_genotype,
normalize_pileup_base, record_overlaps_locus, spans_position,
};
use query::{bam_region, read_bam_header};
13 changes: 11 additions & 2 deletions rust/bioscript-formats/src/genotype/bam_backend/pileup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,13 +193,22 @@ pub(super) fn indel_at_anchor(
None
}

#[allow(dead_code)]
pub(super) fn classify_expected_indel(
record: &AlignmentRecord,
locus: &GenomicLocus,
reference_len: usize,
alternate: &str,
) -> Result<IndelClassification, RuntimeError> {
let alt_len = alternate.len();
classify_expected_indel_lengths(record, locus, reference_len, &[alternate.len()])
}

pub(super) fn classify_expected_indel_lengths(
record: &AlignmentRecord,
locus: &GenomicLocus,
reference_len: usize,
alternate_lengths: &[usize],
) -> Result<IndelClassification, RuntimeError> {
let anchor_start = locus.start.saturating_sub(1);
let anchor_end = locus.end;

Expand All @@ -226,7 +235,7 @@ pub(super) fn classify_expected_indel(
return Ok(IndelClassification {
covering: true,
reference_like: false,
matches_alt: observed_len == alt_len,
matches_alt: alternate_lengths.contains(&observed_len),
observed_len,
});
}
Expand Down
Loading
Loading