diff --git a/noodles b/noodles index 51b904d..5868f00 160000 --- a/noodles +++ b/noodles @@ -1 +1 @@ -Subproject commit 51b904dd4ec4c462755ee14266f65f554a4e6d64 +Subproject commit 5868f00a1f4fa6a0e0c32b685819fd3ef67b6473 diff --git a/rust/bioscript-core/src/variant.rs b/rust/bioscript-core/src/variant.rs index 2ae9022..66c56fe 100644 --- a/rust/bioscript-core/src/variant.rs +++ b/rust/bioscript-core/src/variant.rs @@ -31,6 +31,7 @@ pub struct VariantSpec { pub grch38_assembly_ref: Option, pub reference: Option, pub alternate: Option, + pub observed_alternates: Vec, pub kind: Option, pub deletion_length: Option, pub motifs: Vec, diff --git a/rust/bioscript-formats/src/alignment.rs b/rust/bioscript-formats/src/alignment.rs index 60ac7fb..a7d2cdb 100644 --- a/rust/bioscript-formats/src/alignment.rs +++ b/rust/bioscript-formats/src/alignment.rs @@ -140,7 +140,7 @@ pub fn for_each_raw_cram_record_with_reader( ) -> Result<(), RuntimeError> where R: Read + Seek, - F: FnMut(cram::Record<'_>) -> Result, + F: FnMut(&cram::Record<'_>) -> Result, { for_each_raw_cram_record_with_reader_inner(reader, label, locus, true, on_record) } diff --git a/rust/bioscript-formats/src/alignment/cram_stream.rs b/rust/bioscript-formats/src/alignment/cram_stream.rs index da4cc16..ce7343e 100644 --- a/rust/bioscript-formats/src/alignment/cram_stream.rs +++ b/rust/bioscript-formats/src/alignment/cram_stream.rs @@ -74,7 +74,7 @@ pub(crate) fn for_each_raw_cram_record_with_reader_inner( ) -> Result<(), RuntimeError> where R: Read + Seek, - F: FnMut(cram::Record<'_>) -> Result, + F: FnMut(&cram::Record<'_>) -> Result, { // Re-seeks to position 0 before reading the header so this helper is // idempotent across repeated calls on the same indexed reader (e.g. a @@ -178,7 +178,7 @@ where selected_containers, allow_reference_md5_mismatch, &mut |record| { - let alignment_record = build_alignment_record_from_cram(label, &record)?; + let alignment_record = build_alignment_record_from_cram(label, record)?; on_record(alignment_record) }, ) @@ -196,7 +196,7 @@ fn stream_selected_cram_records( ) -> Result<(), RuntimeError> where R: Read + Seek, - F: FnMut(cram::Record<'_>) -> Result, + F: FnMut(&cram::Record<'_>) -> Result, { let interval = region.interval(); @@ -325,7 +325,7 @@ fn handle_decoded_cram_record( on_record: &mut F, ) -> bool where - F: FnMut(cram::Record<'_>) -> Result, + F: FnMut(&cram::Record<'_>) -> Result, { let alignment_record = match build_alignment_record_from_cram(label, record) { Ok(record) => record, @@ -344,7 +344,7 @@ where return true; } - match on_record(record.clone()) { + match on_record(record) { Ok(true) => true, Ok(false) => { *stop = true; diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index ccad4a4..3da2a39 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -396,6 +396,7 @@ mod tests { grch38_assembly_ref: None, reference: Some("A".to_owned()), alternate: Some("G".to_owned()), + observed_alternates: vec!["G".to_owned()], kind: Some(VariantKind::Snp), deletion_length: None, motifs: Vec::new(), @@ -1530,6 +1531,7 @@ mod tests { &locus("chr_test", 1000, 1000), "A", "AT", + &[2], Some("mini_indel".to_owned()), Some(Assembly::Grch38), ) diff --git a/rust/bioscript-formats/src/genotype/alignment_bytes.rs b/rust/bioscript-formats/src/genotype/alignment_bytes.rs index 2b11bc5..9926768 100644 --- a/rust/bioscript-formats/src/genotype/alignment_bytes.rs +++ b/rust/bioscript-formats/src/genotype/alignment_bytes.rs @@ -138,12 +138,14 @@ impl AlignmentBytesBackend { VariantKind::Insertion | VariantKind::Indel => { let reference = variant.reference.clone().unwrap_or_else(|| "I".to_owned()); let alternate = variant.alternate.clone().unwrap_or_else(|| "D".to_owned()); + let alternate_lengths = indel_alternate_lengths(variant, &alternate); observe_cram_indel_with_reader( reader, LABEL, &locus, &reference, &alternate, + &alternate_lengths, matched_rsid, Some(assembly), ) @@ -185,6 +187,21 @@ impl AlignmentBytesBackend { } } +fn indel_alternate_lengths(variant: &VariantSpec, fallback_alternate: &str) -> Vec { + let mut lengths = variant + .observed_alternates + .iter() + .map(String::len) + .filter(|len| *len > 0) + .collect::>(); + if lengths.is_empty() { + lengths.push(fallback_alternate.len()); + } + lengths.sort_unstable(); + lengths.dedup(); + lengths +} + fn first_base(value: Option<&str>) -> Option { value.and_then(|s| s.chars().next()) } diff --git a/rust/bioscript-formats/src/genotype/bam_backend.rs b/rust/bioscript-formats/src/genotype/bam_backend.rs index 37a5ab3..9b202f6 100644 --- a/rust/bioscript-formats/src/genotype/bam_backend.rs +++ b/rust/bioscript-formats/src/genotype/bam_backend.rs @@ -1,5 +1,5 @@ use std::{ - collections::BTreeMap, + collections::{BTreeMap, BTreeSet}, fs::File, io::{Read, Seek}, }; @@ -113,6 +113,7 @@ pub fn observe_bam_variant( &locus, ref_char, alt_char, + &variant.observed_alternates, variant.rsids.first().cloned(), assembly, ) @@ -133,12 +134,14 @@ pub fn observe_bam_variant( variant.rsids.first().map_or("variant", String::as_str) )) })?; + let alternate_lengths = indel_alternate_lengths(variant, alternate); observe_bam_indel_with_reader( reader, label, &locus, reference, alternate, + &alternate_lengths, variant.rsids.first().cloned(), assembly, ) @@ -156,7 +159,8 @@ fn observe_bam_snp_with_reader( label: &str, locus: &GenomicLocus, reference: char, - alternate: char, + mut alternate: char, + observed_alternates: &[String], matched_rsid: Option, assembly: Option, ) -> Result { @@ -242,6 +246,14 @@ fn observe_bam_snp_with_reader( } } + alternate = select_observed_snp_alternate( + reference, + alternate, + observed_alternates, + &counts.filtered_base_counts, + &counts.raw_base_counts, + ); + recount_bam_snp_counts(&mut counts, reference, alternate); let ref_count = counts.filtered_ref_count; let alt_count = counts.filtered_alt_count; let depth = counts.filtered_depth; @@ -265,6 +277,53 @@ fn observe_bam_snp_with_reader( }) } +fn select_observed_snp_alternate( + reference: char, + preferred_alternate: char, + observed_alternates: &[String], + filtered_base_counts: &BTreeMap, + raw_base_counts: &BTreeMap, +) -> char { + let preferred_alternate = preferred_alternate.to_ascii_uppercase(); + let reference = reference.to_ascii_uppercase(); + let mut candidates = BTreeSet::from([preferred_alternate]); + candidates.extend( + observed_alternates + .iter() + .filter_map(|alt| alt.trim().chars().next()) + .map(|alt| alt.to_ascii_uppercase()) + .filter(|alt| *alt != reference), + ); + candidates + .into_iter() + .max_by_key(|candidate| { + let key = candidate.to_string(); + ( + filtered_base_counts.get(&key).copied().unwrap_or(0), + raw_base_counts.get(&key).copied().unwrap_or(0), + u8::from(*candidate == preferred_alternate), + ) + }) + .unwrap_or(preferred_alternate) +} + +fn recount_bam_snp_counts(counts: &mut BamSnpPileupCounts, reference: char, alternate: char) { + let reference = reference.to_ascii_uppercase().to_string(); + let alternate = alternate.to_ascii_uppercase().to_string(); + counts.filtered_ref_count = counts + .filtered_base_counts + .get(&reference) + .copied() + .unwrap_or(0); + counts.filtered_alt_count = counts + .filtered_base_counts + .get(&alternate) + .copied() + .unwrap_or(0); + counts.raw_ref_count = counts.raw_base_counts.get(&reference).copied().unwrap_or(0); + counts.raw_alt_count = counts.raw_base_counts.get(&alternate).copied().unwrap_or(0); +} + fn observe_bam_deletion_with_reader( reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, label: &str, @@ -334,6 +393,7 @@ fn observe_bam_indel_with_reader( locus: &GenomicLocus, reference: &str, alternate: &str, + alternate_lengths: &[usize], matched_rsid: Option, assembly: Option, ) -> Result { @@ -354,8 +414,12 @@ fn observe_bam_indel_with_reader( if alignment_record.is_unmapped || !record_overlaps_locus(&alignment_record, locus) { continue; } - let classification = - classify_expected_indel(&alignment_record, locus, reference.len(), alternate)?; + let classification = classify_expected_indel_lengths( + &alignment_record, + locus, + reference.len(), + alternate_lengths, + )?; if !classification.covering { continue; } @@ -397,54 +461,30 @@ fn observe_bam_indel_with_reader( }) } -fn read_bam_header( - reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, - label: &str, -) -> Result { - reader - .get_mut() - .seek(noodles::bgzf::VirtualPosition::MIN) - .map_err(|err| RuntimeError::Io(format!("failed to rewind BAM {label}: {err}")))?; - reader - .read_header() - .map_err(|err| RuntimeError::Io(format!("failed to read BAM header {label}: {err}"))) -} - -fn bam_region( - header: &noodles::sam::Header, - locus: &GenomicLocus, -) -> Result { - let chrom = resolve_bam_reference_name(header, &locus.chrom).ok_or_else(|| { - RuntimeError::Unsupported(format!( - "indexed BAM does not contain contig {} for {}:{}-{}", - locus.chrom, locus.chrom, locus.start, locus.end - )) - })?; - format!("{chrom}:{}-{}", locus.start, locus.end) - .parse() - .map_err(|err| RuntimeError::Io(format!("invalid BAM query region: {err}"))) -} - -fn resolve_bam_reference_name(header: &noodles::sam::Header, chrom: &str) -> Option { - let candidates = [ - chrom.to_owned(), - format!("chr{chrom}"), - chrom.trim_start_matches("chr").to_owned(), - ]; - candidates.into_iter().find(|candidate| { - header.reference_sequences().iter().any(|(name, _)| { - let name_bytes: &[u8] = name.as_ref(); - name_bytes == candidate.as_bytes() - }) - }) +fn indel_alternate_lengths(variant: &VariantSpec, fallback_alternate: &str) -> Vec { + let mut lengths = variant + .observed_alternates + .iter() + .map(String::len) + .filter(|len| *len > 0) + .collect::>(); + if lengths.is_empty() { + lengths.push(fallback_alternate.len()); + } + lengths.sort_unstable(); + lengths.dedup(); + lengths } #[path = "bam_backend/pileup.rs"] mod pileup; +#[path = "bam_backend/query.rs"] +mod query; use pileup::{ BamSnpPileupCounts, bam_alignment_record, bam_base_quality_at_reference_position, - classify_expected_indel, describe_copy_number_decision_rule, describe_snp_decision_rule, - indel_at_anchor, infer_copy_number_genotype, infer_snp_genotype, normalize_pileup_base, - record_overlaps_locus, spans_position, + classify_expected_indel_lengths, describe_copy_number_decision_rule, + describe_snp_decision_rule, indel_at_anchor, infer_copy_number_genotype, infer_snp_genotype, + normalize_pileup_base, record_overlaps_locus, spans_position, }; +use query::{bam_region, read_bam_header}; diff --git a/rust/bioscript-formats/src/genotype/bam_backend/pileup.rs b/rust/bioscript-formats/src/genotype/bam_backend/pileup.rs index 89b91a1..ae177b6 100644 --- a/rust/bioscript-formats/src/genotype/bam_backend/pileup.rs +++ b/rust/bioscript-formats/src/genotype/bam_backend/pileup.rs @@ -193,13 +193,22 @@ pub(super) fn indel_at_anchor( None } +#[allow(dead_code)] pub(super) fn classify_expected_indel( record: &AlignmentRecord, locus: &GenomicLocus, reference_len: usize, alternate: &str, ) -> Result { - let alt_len = alternate.len(); + classify_expected_indel_lengths(record, locus, reference_len, &[alternate.len()]) +} + +pub(super) fn classify_expected_indel_lengths( + record: &AlignmentRecord, + locus: &GenomicLocus, + reference_len: usize, + alternate_lengths: &[usize], +) -> Result { let anchor_start = locus.start.saturating_sub(1); let anchor_end = locus.end; @@ -226,7 +235,7 @@ pub(super) fn classify_expected_indel( return Ok(IndelClassification { covering: true, reference_like: false, - matches_alt: observed_len == alt_len, + matches_alt: alternate_lengths.contains(&observed_len), observed_len, }); } diff --git a/rust/bioscript-formats/src/genotype/bam_backend/query.rs b/rust/bioscript-formats/src/genotype/bam_backend/query.rs new file mode 100644 index 0000000..a37939e --- /dev/null +++ b/rust/bioscript-formats/src/genotype/bam_backend/query.rs @@ -0,0 +1,45 @@ +use std::io::{Read, Seek}; + +use bioscript_core::{GenomicLocus, RuntimeError}; + +pub(super) fn read_bam_header( + reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, + label: &str, +) -> Result { + reader + .get_mut() + .seek(noodles::bgzf::VirtualPosition::MIN) + .map_err(|err| RuntimeError::Io(format!("failed to rewind BAM {label}: {err}")))?; + reader + .read_header() + .map_err(|err| RuntimeError::Io(format!("failed to read BAM header {label}: {err}"))) +} + +pub(super) fn bam_region( + header: &noodles::sam::Header, + locus: &GenomicLocus, +) -> Result { + let chrom = resolve_bam_reference_name(header, &locus.chrom).ok_or_else(|| { + RuntimeError::Unsupported(format!( + "indexed BAM does not contain contig {} for {}:{}-{}", + locus.chrom, locus.chrom, locus.start, locus.end + )) + })?; + format!("{chrom}:{}-{}", locus.start, locus.end) + .parse() + .map_err(|err| RuntimeError::Io(format!("invalid BAM query region: {err}"))) +} + +fn resolve_bam_reference_name(header: &noodles::sam::Header, chrom: &str) -> Option { + let candidates = [ + chrom.to_owned(), + format!("chr{chrom}"), + chrom.trim_start_matches("chr").to_owned(), + ]; + candidates.into_iter().find(|candidate| { + header.reference_sequences().iter().any(|(name, _)| { + let name_bytes: &[u8] = name.as_ref(); + name_bytes == candidate.as_bytes() + }) + }) +} diff --git a/rust/bioscript-formats/src/genotype/cram_backend.rs b/rust/bioscript-formats/src/genotype/cram_backend.rs index 228926d..448d4c3 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend.rs @@ -1,5 +1,5 @@ use std::{ - collections::BTreeMap, + collections::{BTreeMap, BTreeSet}, io::{Read, Seek}, path::Path, }; @@ -20,9 +20,9 @@ mod reader; mod store; #[cfg(test)] -pub(crate) use indel::len_as_i64; +pub(crate) use indel::{classify_expected_indel, len_as_i64}; pub(crate) use indel::{ - classify_expected_indel, indel_at_anchor, record_overlaps_locus, spans_position, + classify_expected_indel_lengths, indel_at_anchor, record_overlaps_locus, spans_position, }; pub use reader::{ observe_cram_deletion_with_reader, observe_cram_indel_with_reader, observe_cram_snp_with_reader, @@ -265,6 +265,56 @@ impl SnpPileupCounts { } } +pub(crate) fn select_observed_snp_alternate( + reference: char, + preferred_alternate: char, + observed_alternates: &[String], + filtered_base_counts: &BTreeMap, + raw_base_counts: &BTreeMap, +) -> char { + let preferred_alternate = preferred_alternate.to_ascii_uppercase(); + let reference = reference.to_ascii_uppercase(); + let mut candidates = BTreeSet::from([preferred_alternate]); + candidates.extend( + observed_alternates + .iter() + .filter_map(|alt| first_base(alt)) + .filter(|alt| *alt != reference), + ); + candidates + .into_iter() + .max_by_key(|candidate| { + let key = candidate.to_string(); + ( + filtered_base_counts.get(&key).copied().unwrap_or(0), + raw_base_counts.get(&key).copied().unwrap_or(0), + u8::from(*candidate == preferred_alternate), + ) + }) + .unwrap_or(preferred_alternate) +} + +pub(crate) fn recount_snp_pileup_counts( + counts: &mut SnpPileupCounts, + reference: char, + alternate: char, +) { + let reference = reference.to_ascii_uppercase().to_string(); + let alternate = alternate.to_ascii_uppercase().to_string(); + counts.filtered_ref_count = counts + .filtered_base_counts + .get(&reference) + .copied() + .unwrap_or(0); + counts.filtered_alt_count = counts + .filtered_base_counts + .get(&alternate) + .copied() + .unwrap_or(0); + counts.raw_ref_count = counts.raw_base_counts.get(&reference).copied().unwrap_or(0); + counts.raw_alt_count = counts.raw_base_counts.get(&alternate).copied().unwrap_or(0); +} + fn observe_snp_pileup( cram_path: &Path, options: &GenotypeLoadOptions, @@ -333,7 +383,7 @@ pub(super) fn snp_pileup_with_reader( } let Some((base, base_quality)) = - cram_base_quality_at_reference_position(&record, target_position, reference_base)? + cram_base_quality_at_reference_position(record, target_position, reference_base)? else { return Ok(true); }; diff --git a/rust/bioscript-formats/src/genotype/cram_backend/indel.rs b/rust/bioscript-formats/src/genotype/cram_backend/indel.rs index 2d8752e..27fad5b 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend/indel.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend/indel.rs @@ -56,13 +56,22 @@ pub(crate) fn indel_at_anchor( None } +#[allow(dead_code)] pub(crate) fn classify_expected_indel( record: &AlignmentRecord, locus: &GenomicLocus, reference_len: usize, alternate: &str, ) -> Result { - let alt_len = alternate.len(); + classify_expected_indel_lengths(record, locus, reference_len, &[alternate.len()]) +} + +pub(crate) fn classify_expected_indel_lengths( + record: &AlignmentRecord, + locus: &GenomicLocus, + reference_len: usize, + alternate_lengths: &[usize], +) -> Result { let anchor_start = locus.start.saturating_sub(1); let anchor_end = locus.end; @@ -89,7 +98,7 @@ pub(crate) fn classify_expected_indel( return Ok(IndelClassification { covering: true, reference_like: false, - matches_alt: observed_len == alt_len, + matches_alt: alternate_lengths.contains(&observed_len), observed_len, }); } diff --git a/rust/bioscript-formats/src/genotype/cram_backend/observation.rs b/rust/bioscript-formats/src/genotype/cram_backend/observation.rs index 33f78c6..2b8d9e2 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend/observation.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend/observation.rs @@ -12,9 +12,10 @@ use crate::alignment::{self, AlignmentOpKind}; use crate::genotype::types::CramBackend; use super::{ - anchor_window, classify_expected_indel, describe_copy_number_decision_rule, describe_locus, - describe_snp_decision_rule, first_base, indel_at_anchor, infer_copy_number_genotype, - infer_snp_genotype, observe_snp_pileup, record_overlaps_locus, snp_pileup_with_reader, + anchor_window, classify_expected_indel_lengths, describe_copy_number_decision_rule, + describe_locus, describe_snp_decision_rule, first_base, indel_at_anchor, + infer_copy_number_genotype, infer_snp_genotype, observe_snp_pileup, record_overlaps_locus, + recount_snp_pileup_counts, select_observed_snp_alternate, snp_pileup_with_reader, spans_position, }; @@ -69,7 +70,7 @@ impl CramBackend { })?; let target_pos = locus.start; - let pileup = observe_snp_pileup( + let mut pileup = observe_snp_pileup( &self.path, &self.options, reference_file, @@ -77,6 +78,14 @@ impl CramBackend { reference, alternate, )?; + let alternate = select_observed_snp_alternate( + reference, + alternate, + &variant.observed_alternates, + &pileup.filtered_base_counts, + &pileup.raw_base_counts, + ); + recount_snp_pileup_counts(&mut pileup, reference, alternate); let ref_count = pileup.filtered_ref_count; let alt_count = pileup.filtered_alt_count; let depth = pileup.filtered_depth; @@ -122,7 +131,7 @@ impl CramBackend { RuntimeError::InvalidArguments("SNP variant requires alt/alternate".to_owned()) })?; - let pileup = snp_pileup_with_reader( + let mut pileup = snp_pileup_with_reader( reader, label, locus, @@ -130,6 +139,14 @@ impl CramBackend { alternate, self.options.allow_reference_md5_mismatch, )?; + let alternate = select_observed_snp_alternate( + reference, + alternate, + &variant.observed_alternates, + &pileup.filtered_base_counts, + &pileup.raw_base_counts, + ); + recount_snp_pileup_counts(&mut pileup, reference, alternate); let ref_count = pileup.filtered_ref_count; let alt_count = pileup.filtered_alt_count; let depth = pileup.filtered_depth; @@ -282,6 +299,7 @@ impl CramBackend { let alternate = variant.alternate.clone().ok_or_else(|| { RuntimeError::InvalidArguments("indel variant requires alt/alternate".to_owned()) })?; + let alternate_lengths = indel_alternate_lengths(variant, &alternate); let records = alignment::query_cram_records(&self.path, &self.options, reference_file, locus)?; @@ -297,8 +315,12 @@ impl CramBackend { if !record_overlaps_locus(&record, locus) { continue; } - let classification = - classify_expected_indel(&record, locus, reference.len(), &alternate)?; + let classification = classify_expected_indel_lengths( + &record, + locus, + reference.len(), + &alternate_lengths, + )?; if !classification.covering { continue; } @@ -360,6 +382,7 @@ impl CramBackend { let alternate = variant.alternate.clone().ok_or_else(|| { RuntimeError::InvalidArguments("indel variant requires alt/alternate".to_owned()) })?; + let alternate_lengths = indel_alternate_lengths(variant, &alternate); super::observe_cram_indel_with_reader( reader, @@ -367,12 +390,28 @@ impl CramBackend { locus, &reference, &alternate, + &alternate_lengths, variant.rsids.first().cloned(), Some(assembly), ) } } +fn indel_alternate_lengths(variant: &VariantSpec, fallback_alternate: &str) -> Vec { + let mut lengths = variant + .observed_alternates + .iter() + .map(String::len) + .filter(|len| *len > 0) + .collect::>(); + if lengths.is_empty() { + lengths.push(fallback_alternate.len()); + } + lengths.sort_unstable(); + lengths.dedup(); + lengths +} + #[cfg(test)] mod tests { use std::{fs, path::PathBuf}; diff --git a/rust/bioscript-formats/src/genotype/cram_backend/reader.rs b/rust/bioscript-formats/src/genotype/cram_backend/reader.rs index 60a66df..24f4121 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend/reader.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend/reader.rs @@ -10,9 +10,9 @@ use bioscript_core::{Assembly, GenomicLocus, RuntimeError, VariantObservation}; use crate::alignment::{self, AlignmentOpKind}; use super::{ - anchor_window, classify_expected_indel, describe_copy_number_decision_rule, describe_locus, - describe_snp_decision_rule, indel_at_anchor, infer_copy_number_genotype, infer_snp_genotype, - record_overlaps_locus, snp_pileup_with_reader, spans_position, + anchor_window, classify_expected_indel_lengths, describe_copy_number_decision_rule, + describe_locus, describe_snp_decision_rule, indel_at_anchor, infer_copy_number_genotype, + infer_snp_genotype, record_overlaps_locus, snp_pileup_with_reader, spans_position, }; /// Observe a SNP at `locus` over an already-built CRAM `IndexedReader` and @@ -116,6 +116,7 @@ pub fn observe_cram_indel_with_reader( locus: &GenomicLocus, reference: &str, alternate: &str, + alternate_lengths: &[usize], matched_rsid: Option, assembly: Option, ) -> Result { @@ -128,7 +129,8 @@ pub fn observe_cram_indel_with_reader( if record.is_unmapped || !record_overlaps_locus(&record, locus) { return Ok(true); } - let classification = classify_expected_indel(&record, locus, reference.len(), alternate)?; + let classification = + classify_expected_indel_lengths(&record, locus, reference.len(), alternate_lengths)?; if !classification.covering { return Ok(true); } diff --git a/rust/bioscript-formats/src/genotype/cram_backend/store.rs b/rust/bioscript-formats/src/genotype/cram_backend/store.rs index 0b5fa25..002d11c 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend/store.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend/store.rs @@ -211,7 +211,7 @@ impl CramBackend { } } -const DEFAULT_MAX_CRAM_WORKERS: usize = 16; +const DEFAULT_MAX_CRAM_WORKERS: usize = 2; fn cram_lookup_worker_count(job_count: usize) -> usize { if job_count <= 1 { diff --git a/rust/bioscript-reporting/src/html.rs b/rust/bioscript-reporting/src/html.rs index 5328c8a..d322318 100644 --- a/rust/bioscript-reporting/src/html.rs +++ b/rust/bioscript-reporting/src/html.rs @@ -23,10 +23,10 @@ pub fn render_app_html_document( reports: &[serde_json::Value], ) -> Result { let mut out = String::from( - r#"BioScript report
"#, + r#"BioScript report
"#, ); out.push_str( - r"", + r"", ); let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); @@ -179,6 +179,8 @@ mod tests { "version": "1.0", "name": "pgx-panel", "label": "PGx Panel", + "summary": "Panel-level summary text.", + "summary_url": "https://example.test/panel-summary", "tags": ["pgx", "demo"], "members": [ {"kind": "variant", "path": "variants/rs123.yaml", "version": "1.0"}, @@ -256,6 +258,8 @@ mod tests { let html = render_app_html_document(&observations, &reports).unwrap(); assert!(html.contains("PGx Panel")); + assert!(html.contains("Panel-level summary text.")); + assert!(html.contains("https://example.test/panel-summary")); assert!(html.contains("participant-filter")); assert!(html.contains("observations-table")); assert!(html.contains("analysis-table-0")); diff --git a/rust/bioscript-reporting/src/html/analysis.rs b/rust/bioscript-reporting/src/html/analysis.rs index 171dcca..d9d2f65 100644 --- a/rust/bioscript-reporting/src/html/analysis.rs +++ b/rust/bioscript-reporting/src/html/analysis.rs @@ -49,6 +49,13 @@ pub(super) fn render_analysis_tables( continue; } out.push_str("

Results

"); + if should_render_analysis_narrative_rows(&headers) { + render_analysis_narrative_rows(out, analysis, &rows); + render_analysis_notes(out, ¬es); + render_weak_indel_analysis_note(out, weak_indel_dependency); + out.push_str("
"); + continue; + } let header_refs = headers.iter().map(String::as_str).collect::>(); render_table_start(out, &table_id, &header_refs); let participant = value_str(analysis, "participant_id"); @@ -70,6 +77,31 @@ pub(super) fn render_analysis_tables( } } +fn should_render_analysis_narrative_rows(headers: &[String]) -> bool { + headers.len() == 2 && headers[0] == "rsid" && headers[1] == "info" +} + +fn render_analysis_narrative_rows( + out: &mut String, + analysis: &serde_json::Value, + rows: &[serde_json::Value], +) { + out.push_str("
"); + let participant = value_str(analysis, "participant_id"); + for row in rows { + let rsid = json_field_as_tsv(row.get("rsid")); + let info = json_field_as_tsv(row.get("info")); + let _ = write!( + out, + "
{}

{}

", + html_escape(participant), + html_escape(&rsid), + render_analysis_value("info", &info) + ); + } + out.push_str("
"); +} + pub(super) fn analysis_title(analysis: &serde_json::Value) -> String { let label = value_str(analysis, "analysis_label"); if label.is_empty() { @@ -305,3 +337,34 @@ pub(super) fn render_analysis_logic(out: &mut String, analysis: &serde_json::Val } out.push_str("
"); } + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn renders_rsid_info_analysis_as_narrative_rows() { + let analysis = json!({ + "analysis_id": "glp1_nature_findings", + "analysis_label": "GLP-1 medication response findings", + "participant_id": "P001", + "emits": [ + {"key": "rsid", "label": "Variant"}, + {"key": "info", "label": "Finding and your result"} + ], + "rows": [ + {"participant_id": "P001", "rsid": "rs10305420", "info": "Genotype: CC. Finding: Study text. Your result: You do not have T."}, + {"participant_id": "P001", "rsid": "rs11760106", "info": "Genotype: GT. Finding: Study text. Your result: You have T."} + ] + }); + let mut html = String::new(); + + render_analysis_tables(&mut html, &[analysis], &[], false); + + assert!(html.contains("analysis-narrative-list")); + assert!(html.contains("
rs10305420
")); + assert!(html.contains("Genotype: CC.")); + assert!(!html.contains("analysis-table-0")); + } +} diff --git a/rust/bioscript-reporting/src/html/helpers.rs b/rust/bioscript-reporting/src/html/helpers.rs index 6001d50..e6a3998 100644 --- a/rust/bioscript-reporting/src/html/helpers.rs +++ b/rust/bioscript-reporting/src/html/helpers.rs @@ -1,4 +1,14 @@ use std::fmt::Write as _; + +#[path = "helpers/repeats.rs"] +mod repeats; + +pub(super) use repeats::{ + genotype_repeat_notation, genotype_repeat_notation_html, padded_homopolymer_repeat_notation, + padded_homopolymer_repeat_notation_html, paired_reference_repeat_notation, + paired_reference_repeat_notation_html, paired_repeat_notation, paired_repeat_notation_html, + repeat_notation, repeat_notation_html, same_homopolymer_base, +}; pub(super) fn render_table_start(out: &mut String, table_id: &str, headers: &[&str]) { let escaped_id = html_escape(table_id); let _ = write!( diff --git a/rust/bioscript-reporting/src/html/helpers/repeats.rs b/rust/bioscript-reporting/src/html/helpers/repeats.rs new file mode 100644 index 0000000..32e166c --- /dev/null +++ b/rust/bioscript-reporting/src/html/helpers/repeats.rs @@ -0,0 +1,396 @@ +use std::{cmp::Ordering, fmt::Write as _}; + +use super::html_escape; + +pub(crate) fn repeat_notation(value: &str) -> String { + if let Some((unit, repeat_count)) = repeat_unit_count(value) { + return format!("{unit}[{repeat_count}]"); + } + value.to_owned() +} + +pub(crate) fn padded_homopolymer_repeat_notation(value: &str) -> String { + if let Some((unit, repeat_count)) = padded_homopolymer_repeat_unit_count(value) { + return format!("{unit}[{repeat_count}]"); + } + repeat_notation(value) +} + +#[cfg(test)] +pub(super) fn allele_list_repeat_notation(value: &str) -> String { + value + .split(',') + .map(repeat_notation) + .collect::>() + .join(", ") +} + +pub(crate) fn repeat_notation_html(value: &str, reference: Option<&str>) -> String { + let notation = repeat_notation(value); + let class_name = repeat_delta_class(value, reference); + if class_name.is_empty() || class_name == "repeat-same" { + return html_escape(¬ation); + } + format!( + "{}", + html_escape(¬ation) + ) +} + +pub(crate) fn paired_repeat_notation(reference: &str, alternate: &str) -> String { + if let Some(display) = paired_repeat_display(reference, alternate) { + return display.alternate; + } + repeat_notation(alternate) +} + +pub(crate) fn paired_repeat_notation_html(reference: &str, alternate: &str) -> String { + if let Some(display) = paired_repeat_display(reference, alternate) { + if display.alt_class.is_empty() || display.alt_class == "repeat-same" { + return html_escape(&display.alternate); + } + return format!( + "{}", + display.alt_class, + html_escape(&display.alternate) + ); + } + repeat_notation_html(alternate, Some(reference)) +} + +pub(crate) fn paired_reference_repeat_notation(reference: &str, alternate: &str) -> String { + paired_repeat_display(reference, alternate) + .map_or_else(|| repeat_notation(reference), |display| display.reference) +} + +pub(crate) fn paired_reference_repeat_notation_html(reference: &str, alternate: &str) -> String { + paired_repeat_display(reference, alternate).map_or_else( + || repeat_notation_html(reference, None), + |display| html_escape(&display.reference), + ) +} + +pub(crate) fn padded_homopolymer_repeat_notation_html(value: &str, reference: &str) -> String { + let notation = padded_homopolymer_repeat_notation(value); + let class_name = padded_homopolymer_repeat_delta_class(value, reference); + if class_name.is_empty() || class_name == "repeat-same" { + return html_escape(¬ation); + } + format!( + "{}", + html_escape(¬ation) + ) +} + +struct PairedRepeatDisplay { + reference: String, + alternate: String, + alt_class: &'static str, +} + +fn paired_repeat_display(reference: &str, alternate: &str) -> Option { + if reference.is_empty() || !reference.chars().all(is_sequence_char) { + return None; + } + if let Some((unit, ref_count)) = exact_repeat_unit_count(reference, 2) + && let Some(alt_count) = repeat_count_for_unit(alternate, &unit) + { + return Some(PairedRepeatDisplay { + reference: repeat_token(&unit, ref_count), + alternate: repeat_token(&unit, alt_count), + alt_class: repeat_count_delta_class(alt_count, ref_count), + }); + } + anchored_suffix_repeat_display(reference, alternate) +} + +fn anchored_suffix_repeat_display(reference: &str, alternate: &str) -> Option { + if alternate.is_empty() || !reference.ends_with(alternate) || reference.len() <= alternate.len() + { + return None; + } + let repeat_part = &reference[..reference.len() - alternate.len()]; + let (unit, ref_count) = exact_repeat_unit_count(repeat_part, 2)?; + Some(PairedRepeatDisplay { + reference: format!("{}{}", repeat_token(&unit, ref_count), alternate), + alternate: alternate.to_owned(), + alt_class: "repeat-less", + }) +} + +pub(crate) fn genotype_repeat_notation(value: &str, ref_allele: &str, alt_allele: &str) -> String { + if value.is_empty() { + return String::new(); + } + if value.contains('/') || value.contains('|') { + return value + .split_inclusive(['/', '|']) + .fold(String::new(), |mut formatted, token| { + let separator = token + .chars() + .last() + .filter(|ch| matches!(ch, '/' | '|')) + .map(|ch| ch.to_string()) + .unwrap_or_default(); + let allele = token.trim_end_matches(['/', '|']); + let _ = write!( + formatted, + "{}{}", + genotype_allele_repeat_notation(allele, ref_allele, alt_allele), + separator + ); + formatted + }); + } + if ref_allele.is_empty() + || alt_allele.is_empty() + || (ref_allele.len() == 1 && alt_allele.len() == 1) + { + return repeat_notation(value); + } + for (left, right) in [ + (ref_allele, ref_allele), + (alt_allele, alt_allele), + (ref_allele, alt_allele), + (alt_allele, ref_allele), + ] { + if value.eq_ignore_ascii_case(&format!("{left}{right}")) { + return format!("{}/{}", repeat_notation(left), repeat_notation(right)); + } + } + repeat_notation(value) +} + +pub(crate) fn genotype_repeat_notation_html( + value: &str, + ref_allele: &str, + alt_allele: &str, +) -> String { + if value.is_empty() { + return String::new(); + } + if value.contains('/') || value.contains('|') { + return value + .split_inclusive(['/', '|']) + .fold(String::new(), |mut formatted, token| { + let separator = token + .chars() + .last() + .filter(|ch| matches!(ch, '/' | '|')) + .map(|ch| ch.to_string()) + .unwrap_or_default(); + let allele = token.trim_end_matches(['/', '|']); + let _ = write!( + formatted, + "{}{}", + genotype_allele_repeat_notation_html(allele, ref_allele, alt_allele), + html_escape(&separator) + ); + formatted + }); + } + if ref_allele.is_empty() + || alt_allele.is_empty() + || (ref_allele.len() == 1 && alt_allele.len() == 1) + { + return repeat_notation_html(value, Some(ref_allele)); + } + for (left, right) in [ + (ref_allele, ref_allele), + (alt_allele, alt_allele), + (ref_allele, alt_allele), + (alt_allele, ref_allele), + ] { + if value.eq_ignore_ascii_case(&format!("{left}{right}")) { + return format!( + "{}/{}", + genotype_allele_repeat_notation_html(left, ref_allele, alt_allele), + genotype_allele_repeat_notation_html(right, ref_allele, alt_allele) + ); + } + } + repeat_notation_html(value, Some(ref_allele)) +} + +fn genotype_allele_repeat_notation(allele: &str, ref_allele: &str, alt_allele: &str) -> String { + if same_homopolymer_base(ref_allele, alt_allele).is_some() + && padded_homopolymer_repeat_unit_count(allele).is_some() + { + return padded_homopolymer_repeat_notation(allele); + } + if allele.eq_ignore_ascii_case(ref_allele) + && let Some(display) = paired_repeat_display(ref_allele, alt_allele) + { + return display.reference; + } + if let Some(display) = paired_repeat_display(ref_allele, allele) { + return display.alternate; + } + repeat_notation(allele) +} + +fn genotype_allele_repeat_notation_html( + allele: &str, + ref_allele: &str, + alt_allele: &str, +) -> String { + if same_homopolymer_base(ref_allele, alt_allele).is_some() + && padded_homopolymer_repeat_unit_count(allele).is_some() + { + return padded_homopolymer_repeat_notation_html(allele, ref_allele); + } + if allele.eq_ignore_ascii_case(ref_allele) + && let Some(display) = paired_repeat_display(ref_allele, alt_allele) + { + return html_escape(&display.reference); + } + if paired_repeat_display(ref_allele, allele).is_some() { + return paired_repeat_notation_html(ref_allele, allele); + } + repeat_notation_html(allele, Some(ref_allele)) +} + +fn is_sequence_char(ch: char) -> bool { + matches!(ch.to_ascii_uppercase(), 'A' | 'C' | 'G' | 'T' | 'N') +} + +fn repeat_unit_count(value: &str) -> Option<(String, usize)> { + if !value.chars().all(is_sequence_char) { + return None; + } + exact_repeat_unit_count(value, 3) +} + +fn exact_repeat_unit_count(value: &str, min_count: usize) -> Option<(String, usize)> { + if value.is_empty() || !value.chars().all(is_sequence_char) { + return None; + } + let chars = value.chars().collect::>(); + for unit_len in 1..=chars.len().min(32) { + if chars.len() % unit_len != 0 { + continue; + } + let repeat_count = chars.len() / unit_len; + if repeat_count < min_count { + continue; + } + if chars + .iter() + .enumerate() + .all(|(idx, ch)| ch.eq_ignore_ascii_case(&chars[idx % unit_len])) + { + let unit = chars[..unit_len] + .iter() + .map(char::to_ascii_uppercase) + .collect::(); + return Some((unit, repeat_count)); + } + } + None +} + +fn repeat_count_for_unit(value: &str, unit: &str) -> Option { + if value.is_empty() { + return Some(0); + } + if unit.is_empty() || !value.chars().all(is_sequence_char) { + return None; + } + let value_chars = value.chars().collect::>(); + let unit_chars = unit.chars().collect::>(); + if value_chars.len() % unit_chars.len() != 0 { + return None; + } + let repeat_count = value_chars.len() / unit_chars.len(); + if value_chars + .iter() + .enumerate() + .all(|(idx, ch)| ch.eq_ignore_ascii_case(&unit_chars[idx % unit_chars.len()])) + { + Some(repeat_count) + } else { + None + } +} + +fn repeat_token(unit: &str, count: usize) -> String { + let unit = unit.to_ascii_uppercase(); + if unit.chars().count() == 1 { + format!("{unit}[{count}]") + } else { + format!("({unit})[{count}]") + } +} + +fn repeat_count_delta_class(count: usize, reference_count: usize) -> &'static str { + match count.cmp(&reference_count) { + Ordering::Less => "repeat-less", + Ordering::Equal => "repeat-same", + Ordering::Greater => "repeat-more", + } +} + +fn padded_homopolymer_repeat_unit_count(value: &str) -> Option<(String, usize)> { + let chars = value.chars().collect::>(); + if !(3..=6).contains(&chars.len()) || !chars.iter().all(|ch| is_sequence_char(*ch)) { + return None; + } + let first = chars.first()?.to_ascii_uppercase(); + if chars.iter().all(|ch| ch.to_ascii_uppercase() == first) { + return Some((first.to_string(), chars.len() - 1)); + } + None +} + +pub(crate) fn same_homopolymer_base(reference: &str, alternate: &str) -> Option { + let reference_base = homopolymer_base(reference)?; + let alternate_base = homopolymer_base(alternate)?; + if reference_base == alternate_base { + Some(reference_base) + } else { + None + } +} + +fn homopolymer_base(value: &str) -> Option { + let mut chars = value.chars(); + let first = chars.next()?.to_ascii_uppercase(); + if !is_sequence_char(first) { + return None; + } + if chars.all(|ch| ch.to_ascii_uppercase() == first) { + Some(first) + } else { + None + } +} + +fn repeat_delta_class(value: &str, reference: Option<&str>) -> &'static str { + let Some(reference) = reference else { + return ""; + }; + let Some((unit, count)) = repeat_unit_count(value) else { + return ""; + }; + let Some((reference_unit, reference_count)) = repeat_unit_count(reference) else { + return ""; + }; + if unit != reference_unit { + return ""; + } + repeat_count_delta_class(count, reference_count) +} + +fn padded_homopolymer_repeat_delta_class(value: &str, reference: &str) -> &'static str { + let Some((unit, count)) = padded_homopolymer_repeat_unit_count(value) else { + return ""; + }; + let Some((reference_unit, reference_count)) = padded_homopolymer_repeat_unit_count(reference) + else { + return ""; + }; + if unit != reference_unit { + return ""; + } + repeat_count_delta_class(count, reference_count) +} diff --git a/rust/bioscript-reporting/src/html/observations.rs b/rust/bioscript-reporting/src/html/observations.rs index 1c9ef86..35ec072 100644 --- a/rust/bioscript-reporting/src/html/observations.rs +++ b/rust/bioscript-reporting/src/html/observations.rs @@ -1,5 +1,9 @@ use super::helpers::{ - class_cell, html_escape, json_field_as_tsv, render_table_start, table_column_class, value_str, + class_cell, genotype_repeat_notation, genotype_repeat_notation_html, html_escape, + json_field_as_tsv, padded_homopolymer_repeat_notation, padded_homopolymer_repeat_notation_html, + paired_reference_repeat_notation, paired_reference_repeat_notation_html, + paired_repeat_notation, paired_repeat_notation_html, render_table_start, repeat_notation, + repeat_notation_html, same_homopolymer_base, table_column_class, value_str, }; use std::fmt::Write as _; pub(super) fn render_observation_table( @@ -207,20 +211,45 @@ pub(super) fn render_observation_cell( .get("outcome") .and_then(serde_json::Value::as_str) .unwrap_or_default(); - let value = json_field_as_tsv(observation.get(header)); + let raw_value = json_field_as_tsv(observation.get(header)); + let ref_allele = observation + .get("ref") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let alt_allele = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let value = genotype_repeat_notation(&raw_value, ref_allele, alt_allele); + let html_value = genotype_repeat_notation_html(&raw_value, ref_allele, alt_allele); if matches!(outcome, "variant" | "observed_alt" | "unknown_alt") { let alt = observation .get("alt") .and_then(serde_json::Value::as_str) .unwrap_or_default(); + let escaped_value = html_escape(&value); + let cell_html = if html_value == escaped_value { + highlight_allele(&value, alt) + } else { + html_value + }; let _ = write!( out, - "{}", + "{}", cell_class, - highlight_allele(&value, alt) + html_escape(&raw_value), + cell_html ); return; } + let _ = write!( + out, + "{}", + cell_class, + html_escape(&raw_value), + html_value + ); + return; } let _ = write!( out, @@ -232,10 +261,13 @@ pub(super) fn render_observation_cell( fn ref_alt_cell(out: &mut String, observation: &serde_json::Value) { let value = observation_ref_alt(observation); - let escaped_value = html_escape(&value); + let html_value = observation_ref_alt_html(observation); + let raw_value = observation_ref_alt_raw(observation); + let escaped_raw_value = html_escape(&raw_value); let _ = write!( out, - "{escaped_value}" + "{html_value}", + html_escape(&value) ); } @@ -260,17 +292,124 @@ pub(super) fn observation_ref_alt(observation: &serde_json::Value) -> String { .get("ref") .and_then(serde_json::Value::as_str) .unwrap_or_default(); - let alt_allele = observation - .get("alt") + let alt_alleles = observation_alt_alleles(observation); + if ref_allele.is_empty() && alt_alleles.is_empty() { + String::new() + } else if observation_uses_padded_homopolymer_repeat(observation, ref_allele, &alt_alleles) { + format!( + "{} > {}", + padded_homopolymer_repeat_notation(ref_allele), + alt_alleles + .iter() + .map(|allele| padded_homopolymer_repeat_notation(allele)) + .collect::>() + .join(", ") + ) + } else { + let reference_display = alt_alleles.first().map_or_else( + || repeat_notation(ref_allele), + |alt| paired_reference_repeat_notation(ref_allele, alt), + ); + format!( + "{} > {}", + reference_display, + alt_alleles + .iter() + .map(|allele| paired_repeat_notation(ref_allele, allele)) + .collect::>() + .join(", ") + ) + } +} + +fn observation_ref_alt_html(observation: &serde_json::Value) -> String { + let ref_allele = observation + .get("ref") .and_then(serde_json::Value::as_str) .unwrap_or_default(); - if ref_allele.is_empty() && alt_allele.is_empty() { + let alt_alleles = observation_alt_alleles(observation); + if ref_allele.is_empty() && alt_alleles.is_empty() { String::new() + } else if observation_uses_padded_homopolymer_repeat(observation, ref_allele, &alt_alleles) { + format!( + "{} > {}", + padded_homopolymer_repeat_notation(ref_allele), + alt_alleles + .iter() + .map(|allele| padded_homopolymer_repeat_notation_html(allele, ref_allele)) + .collect::>() + .join(", ") + ) } else { - format!("{ref_allele}->{}", alt_allele.replace(',', "/")) + let reference_display = alt_alleles.first().map_or_else( + || repeat_notation_html(ref_allele, None), + |alt| paired_reference_repeat_notation_html(ref_allele, alt), + ); + format!( + "{} > {}", + reference_display, + alt_alleles + .iter() + .map(|allele| paired_repeat_notation_html(ref_allele, allele)) + .collect::>() + .join(", ") + ) } } +fn observation_ref_alt_raw(observation: &serde_json::Value) -> String { + let ref_allele = observation + .get("ref") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let alt_alleles = observation_alt_alleles(observation); + if ref_allele.is_empty() && alt_alleles.is_empty() { + String::new() + } else { + format!("{ref_allele} > {}", alt_alleles.join(", ")) + } +} + +fn observation_alt_alleles(observation: &serde_json::Value) -> Vec { + let alts = observation + .get("alts") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + .filter_map(serde_json::Value::as_str) + .filter(|value| !value.is_empty()) + .map(ToOwned::to_owned) + .collect::>(); + if !alts.is_empty() { + return alts; + } + observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + .split(',') + .filter(|value| !value.is_empty()) + .map(ToOwned::to_owned) + .collect() +} + +fn observation_uses_padded_homopolymer_repeat( + observation: &serde_json::Value, + ref_allele: &str, + alt_alleles: &[String], +) -> bool { + let kind = observation + .get("kind") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + matches!(kind, "indel" | "deletion" | "insertion") + && !ref_allele.is_empty() + && !alt_alleles.is_empty() + && alt_alleles + .iter() + .all(|alt| same_homopolymer_base(ref_allele, alt).is_some()) +} + pub(super) fn highlight_allele(value: &str, allele: &str) -> String { if value.is_empty() || allele.is_empty() { return html_escape(value); diff --git a/rust/bioscript-reporting/src/html/pgx.rs b/rust/bioscript-reporting/src/html/pgx.rs index e23618d..21d24cf 100644 --- a/rust/bioscript-reporting/src/html/pgx.rs +++ b/rust/bioscript-reporting/src/html/pgx.rs @@ -1,6 +1,6 @@ use super::helpers::{ class_cell, html_escape, join_drugs, join_string_array, link_cell, render_table_end, - render_table_start, table_cell, value_str, + render_table_start, repeat_notation, table_cell, value_str, }; use super::observations::highlight_allele; use std::fmt::Write as _; @@ -338,16 +338,43 @@ pub(super) fn matched_ref_alt(finding: &serde_json::Value) -> String { .get("ref") .and_then(serde_json::Value::as_str) .unwrap_or_default(); - let alt_allele = observation - .get("alt") - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - if ref_allele.is_empty() && alt_allele.is_empty() { + let alt_alleles = matched_observation_alt_alleles(observation); + if ref_allele.is_empty() && alt_alleles.is_empty() { String::new() } else { - let alt_display = alt_allele.replace(',', "/"); - format!("{ref_allele}->{alt_display}") + format!( + "{} > {}", + repeat_notation(ref_allele), + alt_alleles + .iter() + .map(|allele| repeat_notation(allele)) + .collect::>() + .join(", ") + ) + } +} + +fn matched_observation_alt_alleles(observation: &serde_json::Value) -> Vec { + let alts = observation + .get("alts") + .and_then(serde_json::Value::as_array) + .into_iter() + .flatten() + .filter_map(serde_json::Value::as_str) + .filter(|value| !value.is_empty()) + .map(ToOwned::to_owned) + .collect::>(); + if !alts.is_empty() { + return alts; } + observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + .split(',') + .filter(|value| !value.is_empty()) + .map(ToOwned::to_owned) + .collect() } pub(super) fn evidence_level_group(level: &str) -> String { diff --git a/rust/bioscript-reporting/src/html/sections.rs b/rust/bioscript-reporting/src/html/sections.rs index 8d265de..877f122 100644 --- a/rust/bioscript-reporting/src/html/sections.rs +++ b/rust/bioscript-reporting/src/html/sections.rs @@ -47,6 +47,22 @@ pub(super) fn render_report_manifest_header(out: &mut String, reports: &[serde_j title }; let _ = write!(out, "

{}

", html_escape(title)); + let summary = value_str(manifest, "summary"); + let summary_url = value_str(manifest, "summary_url"); + if !summary.is_empty() || !summary_url.is_empty() { + out.push_str("
"); + if !summary.is_empty() { + let _ = write!(out, "

{}

", html_escape(summary)); + } + if !summary_url.is_empty() { + let _ = write!( + out, + "

Learn more about the GLP-1 medication response study

", + html_escape(summary_url) + ); + } + out.push_str("
"); + } out.push_str("

Disclaimer: This is not medical or clinical advice, only for research purposes. Always consult a licensed professional to interpret medical information.

This report was generated offline on your system.

For more information see https://app.biovault.net

"); } @@ -60,6 +76,8 @@ pub(super) fn render_report_source_section(out: &mut String, reports: &[serde_js report_manifest_kv(out, "Version", value_str(manifest, "version")); report_manifest_kv(out, "Name", value_str(manifest, "name")); report_manifest_kv(out, "Label", value_str(manifest, "label")); + report_manifest_kv(out, "Summary", value_str(manifest, "summary")); + report_manifest_kv(out, "Summary URL", value_str(manifest, "summary_url")); report_manifest_kv(out, "Tags", &manifest_tags(manifest)); report_manifest_kv(out, "Members", &manifest_member_summary(manifest)); out.push_str(""); diff --git a/rust/bioscript-reporting/src/manifest.rs b/rust/bioscript-reporting/src/manifest.rs index 30cecb9..91ab999 100644 --- a/rust/bioscript-reporting/src/manifest.rs +++ b/rust/bioscript-reporting/src/manifest.rs @@ -147,6 +147,8 @@ pub fn report_manifest_metadata( "version": yaml_string(&value, "version"), "name": yaml_string(&value, "name"), "label": yaml_string(&value, "label").or_else(|| yaml_string(&value, "name")), + "summary": yaml_string(&value, "summary"), + "summary_url": yaml_string(&value, "summary_url"), "tags": yaml_string_sequence(&value, "tags"), "members": members, })) @@ -475,10 +477,10 @@ mod tests { ExecutableAssayMember, ExecutablePanelMember, ManifestWorkspace, ReportManifestKind, assay_executable_member, assay_executable_member_path, collect_analysis_manifest_tasks, collect_variant_manifest_tasks, load_manifest_findings, load_report_manifest_context, - matches_analysis_path_filters, matches_variant_manifest_filters, panel_executable_member, - panel_executable_member_path, report_assay_id, report_manifest_kind, - report_manifest_metadata, report_manifest_schema, resolve_filesystem_manifest_path, - traversable_manifest_member_paths, + load_variant_manifest_task_by_path, matches_analysis_path_filters, + matches_variant_manifest_filters, panel_executable_member, panel_executable_member_path, + report_assay_id, report_manifest_kind, report_manifest_metadata, report_manifest_schema, + resolve_filesystem_manifest_path, traversable_manifest_member_paths, }; struct InlineWorkspace { @@ -623,6 +625,8 @@ alleles: schema: bioscript:panel:1.0 version: "1.0" name: pgx-panel +summary: Panel summary text. +summary_url: https://example.test/panel-summary tags: [pgx, cardiology, 7] members: - kind: variant @@ -636,6 +640,11 @@ members: let metadata = report_manifest_metadata(&workspace, "panel.yaml").unwrap(); assert_eq!(metadata["schema"], "bioscript:panel:1.0"); assert_eq!(metadata["label"], "pgx-panel"); + assert_eq!(metadata["summary"], "Panel summary text."); + assert_eq!( + metadata["summary_url"], + "https://example.test/panel-summary" + ); assert_eq!(metadata["tags"], serde_json::json!(["pgx", "cardiology"])); assert_eq!(metadata["members"][0]["kind"], "variant"); assert_eq!(metadata["members"][0]["path"], "rs1.yaml"); @@ -868,6 +877,38 @@ rs1 rs1 rs1 GENE A G snp 1 123 assert_eq!(tasks[0].manifest_path, "catalogue.yaml#rs1"); } + #[test] + fn variant_catalogue_tasks_preserve_observed_alts_separately_from_reportable_alts() { + let workspace = MapWorkspace { + files: BTreeMap::from([ + ( + "catalogue.yaml".to_owned(), + r#" +schema: bioscript:variant-catalogue:1.0 +version: "1.0" +name: catalogue +variants: + source: variants.tsv +"# + .to_owned(), + ), + ( + "variants.tsv".to_owned(), + "variant_id\tname\trsid\tgene\tref\talts\tobserved_alts\tkind\tgrch38_chrom\tgrch38_pos\nrs1\trs1\trs1\tGENE\tA\tC\tC|G|T\tsnp\t1\t123\n" + .to_owned(), + ), + ]), + }; + + let task = load_variant_manifest_task_by_path(&workspace, "catalogue.yaml#rs1").unwrap(); + + assert_eq!(task.manifest.spec.alternate.as_deref(), Some("C")); + assert_eq!( + task.manifest.spec.observed_alternates, + vec!["C".to_owned(), "G".to_owned(), "T".to_owned()] + ); + } + #[test] fn manifest_context_and_findings_follow_includes_members_and_inherited_bindings() { let workspace = MapWorkspace { diff --git a/rust/bioscript-reporting/src/manifest_catalogue.rs b/rust/bioscript-reporting/src/manifest_catalogue.rs index e7db82f..8fa5820 100644 --- a/rust/bioscript-reporting/src/manifest_catalogue.rs +++ b/rust/bioscript-reporting/src/manifest_catalogue.rs @@ -89,17 +89,28 @@ fn catalogue_row_task( )); rsids.sort(); rsids.dedup(); + let alternates = split_list( + columns.value(row, "alleles.alts"), + columns.separator("alleles.alts"), + ); + let observed_alternates = { + let observed = split_list( + columns.value(row, "alleles.observed_alts"), + columns.separator("alleles.observed_alts"), + ); + if observed.is_empty() { + alternates.clone() + } else { + observed + } + }; let spec = VariantSpec { rsids, grch37: catalogue_locus(columns, row, "grch37")?, grch38: catalogue_locus(columns, row, "grch38")?, reference: columns.value(row, "alleles.ref").map(ToOwned::to_owned), - alternate: split_list( - columns.value(row, "alleles.alts"), - columns.separator("alleles.alts"), - ) - .into_iter() - .next(), + alternate: alternates.first().cloned(), + observed_alternates, kind: columns .value(row, "alleles.kind") .map(catalogue_variant_kind), @@ -279,6 +290,7 @@ impl CatalogueColumns { columns.insert_default("alleles.kind", "kind"); columns.insert_default("alleles.ref", "ref"); columns.insert_default("alleles.alts", "alts"); + columns.insert_default("alleles.observed_alts", "observed_alts"); columns.insert_default("coordinates.grch37.chrom", "grch37_chrom"); columns.insert_default("coordinates.grch37.pos", "grch37_pos"); columns.insert_default("coordinates.grch37.start", "grch37_start"); diff --git a/rust/bioscript-reporting/src/observation.rs b/rust/bioscript-reporting/src/observation.rs index 1341143..115762e 100644 --- a/rust/bioscript-reporting/src/observation.rs +++ b/rust/bioscript-reporting/src/observation.rs @@ -23,6 +23,7 @@ pub struct AppObservationInput<'a> { pub manifest: VariantManifest, pub gene: String, pub source: serde_json::Value, + pub alt_alleles: Vec, pub observed_alt_alleles: Vec, pub inferred_sex: Option<&'a SexInference>, pub fallback_assembly: Option, @@ -31,6 +32,7 @@ pub struct AppObservationInput<'a> { struct AppObservationJson { allele_balance: Option, alt_count: Option, + alt_alleles: Vec, assay_id: String, assembly: String, call: ObservationCallValues, @@ -69,12 +71,13 @@ pub fn app_observation_from_manifest_row(input: AppObservationInput<'_>) -> serd manifest, gene, source, + alt_alleles, observed_alt_alleles, inferred_sex, fallback_assembly, } = input; let ref_allele = manifest.spec.reference.clone().unwrap_or_default(); - let reportable_alt = manifest.spec.alternate.clone().unwrap_or_default(); + let mut reportable_alt = manifest.spec.alternate.clone().unwrap_or_default(); let mut genotype_display = row .get("genotype") .filter(|value| !value.is_empty()) @@ -82,35 +85,33 @@ pub fn app_observation_from_manifest_row(input: AppObservationInput<'_>) -> serd .or_else(|| genotype_display_from_raw_counts(row.get("raw_counts")?)) .unwrap_or_default(); let depth = parse_optional_u32(row.get("depth")); - let ref_count = parse_optional_u32(row.get("ref_count")); - let alt_count = parse_optional_u32(row.get("alt_count")); + let mut ref_count = parse_optional_u32(row.get("ref_count")); + let mut alt_count = parse_optional_u32(row.get("alt_count")); if let Some(normalized_display) = deletion_copy_number_display(row, &manifest, depth, alt_count) { genotype_display = normalized_display; } let weak_indel_match = is_weak_delimited_indel_match(row, &manifest, &genotype_display); + let (assembly, locus) = observation_assembly_and_locus(row, &manifest, fallback_assembly); + let chrom = locus.map_or(String::new(), |locus| locus.chrom.clone()); + reportable_alt = select_observed_reportable_alt( + &genotype_display, + &ref_allele, + &reportable_alt, + &observed_alt_alleles, + ); + if let Some((raw_ref_count, raw_alt_count)) = + raw_snv_counts_for_selected_alleles(row, &manifest, &ref_allele, &reportable_alt) + { + ref_count = Some(raw_ref_count); + alt_count = Some(raw_alt_count); + } let allele_balance = match (alt_count, depth) { (Some(alt_count), Some(depth)) if depth > 0 => { Some(f64::from(alt_count) / f64::from(depth)) } _ => None, }; - let assembly = row - .get("assembly") - .filter(|value| !value.is_empty()) - .cloned() - .or_else(|| fallback_assembly.map(assembly_row_value)) - .unwrap_or_default(); - let locus = if assembly.eq_ignore_ascii_case("grch37") { - manifest.spec.grch37.as_ref() - } else { - manifest - .spec - .grch38 - .as_ref() - .or(manifest.spec.grch37.as_ref()) - }; - let chrom = locus.map_or(String::new(), |locus| locus.chrom.clone()); let (genotype, zygosity) = normalize_app_genotype( &genotype_display, &ref_allele, @@ -139,6 +140,7 @@ pub fn app_observation_from_manifest_row(input: AppObservationInput<'_>) -> serd render_app_observation_json(AppObservationJson { allele_balance, alt_count, + alt_alleles, assay_id: assay_id.to_owned(), assembly, call, @@ -164,6 +166,102 @@ pub fn app_observation_from_manifest_row(input: AppObservationInput<'_>) -> serd }) } +fn observation_assembly_and_locus<'a>( + row: &BTreeMap, + manifest: &'a VariantManifest, + fallback_assembly: Option, +) -> (String, Option<&'a GenomicLocus>) { + let assembly = row + .get("assembly") + .filter(|value| !value.is_empty()) + .cloned() + .or_else(|| fallback_assembly.map(assembly_row_value)) + .unwrap_or_default(); + let locus = if assembly.eq_ignore_ascii_case("grch37") { + manifest.spec.grch37.as_ref() + } else { + manifest + .spec + .grch38 + .as_ref() + .or(manifest.spec.grch37.as_ref()) + }; + (assembly, locus) +} + +fn select_observed_reportable_alt( + genotype_display: &str, + ref_allele: &str, + reportable_alt: &str, + observed_alt_alleles: &[String], +) -> String { + if genotype_display.is_empty() + || ref_allele.len() != 1 + || reportable_alt.len() != 1 + || observed_alt_alleles.is_empty() + { + return reportable_alt.to_owned(); + } + let genotype_alleles = genotype_display + .chars() + .filter(char::is_ascii_alphabetic) + .map(|ch| ch.to_ascii_uppercase()) + .collect::>(); + if genotype_alleles.is_empty() { + return reportable_alt.to_owned(); + } + let reportable_ch = reportable_alt + .chars() + .next() + .unwrap_or_default() + .to_ascii_uppercase(); + if genotype_alleles.contains(&reportable_ch) { + return reportable_alt.to_owned(); + } + let ref_ch = ref_allele + .chars() + .next() + .unwrap_or_default() + .to_ascii_uppercase(); + observed_alt_alleles + .iter() + .find(|alt| { + alt.len() == 1 + && alt.chars().next().is_some_and(|alt_ch| { + let alt_ch = alt_ch.to_ascii_uppercase(); + alt_ch != ref_ch && genotype_alleles.contains(&alt_ch) + }) + }) + .cloned() + .unwrap_or_else(|| reportable_alt.to_owned()) +} + +fn raw_snv_counts_for_selected_alleles( + row: &BTreeMap, + manifest: &VariantManifest, + ref_allele: &str, + reportable_alt: &str, +) -> Option<(u32, u32)> { + if !matches!(manifest.spec.kind, Some(bioscript_core::VariantKind::Snp)) + || ref_allele.len() != 1 + || reportable_alt.len() != 1 + || !matches!(row.get("backend").map(String::as_str), Some("cram" | "bam")) + { + return None; + } + let counts: serde_json::Map = + serde_json::from_str(row.get("raw_counts")?).ok()?; + let count_for = |allele: &str| -> u32 { + let allele = allele.to_ascii_uppercase(); + counts + .get(&allele) + .and_then(serde_json::Value::as_u64) + .and_then(|value| u32::try_from(value).ok()) + .unwrap_or(0) + }; + Some((count_for(ref_allele), count_for(reportable_alt))) +} + fn observation_call_values( depth: Option, non_reportable_status: Option<&'static str>, @@ -211,6 +309,7 @@ fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value { let AppObservationJson { allele_balance, alt_count, + alt_alleles, assay_id, assembly, call, @@ -258,6 +357,7 @@ fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value { "pos_end": locus.as_ref().map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), "ref": ref_allele, "alt": reportable_alt, + "alts": alt_alleles, "kind": kind, "match_status": if row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !genotype_display.is_empty() { "found" } else { "not_found" }, "coverage_status": depth.map_or("covered", |depth| if depth > 0 { "covered" } else { "not_covered" }), @@ -403,6 +503,50 @@ mod tests { ); } + #[test] + fn multi_alt_observations_use_the_alt_present_in_the_genotype() { + assert_eq!( + select_observed_reportable_alt("CT", "C", "G", &["G".to_owned(), "T".to_owned()]), + "T" + ); + assert_eq!( + select_observed_reportable_alt("CG", "C", "G", &["G".to_owned(), "T".to_owned()]), + "G" + ); + } + + #[test] + fn cram_multi_alt_observations_recount_against_the_selected_alt() { + let row = BTreeMap::from([ + ("participant_id".to_owned(), "p1".to_owned()), + ("matched_rsid".to_owned(), "rs1".to_owned()), + ("backend".to_owned(), "cram".to_owned()), + ("raw_counts".to_owned(), r#"{"T":45}"#.to_owned()), + ("ref_count".to_owned(), "0".to_owned()), + ("alt_count".to_owned(), "0".to_owned()), + ("depth".to_owned(), "45".to_owned()), + ]); + let observation = app_observation_from_manifest_row(AppObservationInput { + row: &row, + row_path: "variants/rs1.yaml", + assay_id: "assay", + manifest: manifest(VariantKind::Snp, "6", "C", "G"), + gene: "FOXO3".to_owned(), + source: serde_json::Value::Null, + alt_alleles: vec!["G".to_owned(), "T".to_owned()], + observed_alt_alleles: vec!["G".to_owned(), "T".to_owned()], + inferred_sex: None, + fallback_assembly: None, + }); + + assert_eq!(observation["alt"], "T"); + assert_eq!(observation["ref_count"], 0); + assert_eq!(observation["alt_count"], 45); + assert_eq!(observation["allele_balance"], 1.0); + assert_eq!(observation["genotype"], "1/1"); + assert_eq!(observation["outcome"], "variant"); + } + #[test] fn single_allele_sex_chromosome_calls_are_treated_as_hemizygous() { assert_eq!( @@ -495,6 +639,7 @@ mod tests { manifest: manifest(VariantKind::Snp, "1", "A", "G"), gene: "ABC".to_owned(), source: serde_json::json!({"kind": "database"}), + alt_alleles: vec!["G".to_owned()], observed_alt_alleles: Vec::new(), inferred_sex: None, fallback_assembly: None, @@ -526,6 +671,7 @@ mod tests { manifest: manifest(VariantKind::Snp, "1", "A", "G"), gene: "ABC".to_owned(), source: serde_json::Value::Null, + alt_alleles: vec!["G".to_owned()], observed_alt_alleles: Vec::new(), inferred_sex: None, fallback_assembly: Some(Assembly::Grch37), @@ -546,6 +692,7 @@ mod tests { manifest: manifest(VariantKind::Snp, "1", "A", "G"), gene: "ABC".to_owned(), source: serde_json::Value::Null, + alt_alleles: vec!["G".to_owned()], observed_alt_alleles: Vec::new(), inferred_sex: None, fallback_assembly: Some(Assembly::Grch38), @@ -570,14 +717,18 @@ mod tests { manifest: manifest(VariantKind::Snp, "X", "A", "G"), gene: "ABC".to_owned(), source: serde_json::Value::Null, + alt_alleles: vec!["G".to_owned(), "T".to_owned()], observed_alt_alleles: vec!["T".to_owned()], inferred_sex: Some(&inferred_sex), fallback_assembly: None, }); - assert_eq!(observation["outcome"], "observed_alt"); - assert_eq!(observation["call_status"], "observed_alt"); - assert_eq!(observation["facets"], "observed_alt;known_observed_alts=T"); + assert_eq!(observation["alt"], "T"); + assert_eq!(observation["genotype"], "1"); + assert_eq!(observation["zygosity"], "hem_alt"); + assert_eq!(observation["outcome"], "variant"); + assert_eq!(observation["call_status"], "called"); + assert_eq!(observation["facets"], serde_json::Value::Null); assert!( observation["evidence_raw"] .as_str() @@ -603,6 +754,7 @@ mod tests { manifest: manifest(VariantKind::Deletion, "22", "TTATAA", ""), gene: "APOL1".to_owned(), source: serde_json::Value::Null, + alt_alleles: vec!["".to_owned()], observed_alt_alleles: Vec::new(), inferred_sex: None, fallback_assembly: Some(Assembly::Grch38), diff --git a/rust/bioscript-reporting/src/observation/genotype_display.rs b/rust/bioscript-reporting/src/observation/genotype_display.rs index c944d2e..4ca95f8 100644 --- a/rust/bioscript-reporting/src/observation/genotype_display.rs +++ b/rust/bioscript-reporting/src/observation/genotype_display.rs @@ -67,6 +67,9 @@ pub(super) fn normalize_app_genotype( { return normalize_app_genotype(display, "I", "D", None, chrom, inferred_sex); } + if let Some(normalized) = normalize_long_allele_genotype(display, ref_allele, alt_allele) { + return normalized; + } let alleles: Vec = display.chars().filter(char::is_ascii_alphabetic).collect(); if ref_allele.len() != 1 || alt_allele.len() != 1 { return (display.to_owned(), "unknown".to_owned()); @@ -106,6 +109,42 @@ pub(super) fn normalize_app_genotype( } } +fn normalize_long_allele_genotype( + display: &str, + ref_allele: &str, + alt_allele: &str, +) -> Option<(String, String)> { + if ref_allele.is_empty() + || alt_allele.is_empty() + || (ref_allele.len() == 1 && alt_allele.len() == 1) + { + return None; + } + let display = normalize_sequence_token(display); + let ref_allele = normalize_sequence_token(ref_allele); + let alt_allele = normalize_sequence_token(alt_allele); + if display == format!("{ref_allele}{ref_allele}") { + return Some(("0/0".to_owned(), "hom_ref".to_owned())); + } + if display == format!("{alt_allele}{alt_allele}") { + return Some(("1/1".to_owned(), "hom_alt".to_owned())); + } + if display == format!("{ref_allele}{alt_allele}") + || display == format!("{alt_allele}{ref_allele}") + { + return Some(("0/1".to_owned(), "het".to_owned())); + } + None +} + +fn normalize_sequence_token(value: &str) -> String { + value + .chars() + .filter(|ch| !matches!(ch, '/' | '|' | ' ' | '-' | '.')) + .map(|ch| ch.to_ascii_uppercase()) + .collect() +} + fn is_confident_male_sex_chromosome(chrom: &str, inferred_sex: Option<&SexInference>) -> bool { is_haploid_sex_chromosome(chrom) && inferred_sex.is_some_and(|sex| { diff --git a/rust/bioscript-reporting/src/runner.rs b/rust/bioscript-reporting/src/runner.rs index 89324c8..56d42df 100644 --- a/rust/bioscript-reporting/src/runner.rs +++ b/rust/bioscript-reporting/src/runner.rs @@ -167,13 +167,23 @@ impl ReportWorkspace for FilesystemManifestWorkspace { fallback_assembly: Option, ) -> Result { let row_path = row.get("path").cloned().unwrap_or_default(); - let (manifest, gene, source, observed_alt_alleles) = if row_path.contains('#') { + let (manifest, gene, source, alt_alleles, observed_alt_alleles) = if row_path.contains('#') + { let task = crate::load_variant_manifest_task_by_path(self, &row_path)?; + let alt_alleles = task + .manifest + .spec + .alternate + .clone() + .into_iter() + .collect::>(); + let observed_alt_alleles = task.manifest.spec.observed_alternates.clone(); ( task.manifest, String::new(), serde_json::Value::Null, - Vec::new(), + alt_alleles, + observed_alt_alleles, ) } else { let text = self.load_text(&row_path)?; @@ -183,6 +193,7 @@ impl ReportWorkspace for FilesystemManifestWorkspace { manifest, yaml_string(&value, "gene").unwrap_or_default(), variant_primary_source_from_yaml(&value)?, + variant_alt_alleles_from_yaml(&value), variant_observed_alt_alleles_from_yaml(&value), ) }; @@ -194,6 +205,7 @@ impl ReportWorkspace for FilesystemManifestWorkspace { manifest, gene, source, + alt_alleles, observed_alt_alleles, inferred_sex, fallback_assembly, @@ -252,6 +264,19 @@ fn variant_observed_alt_alleles_from_yaml(value: &serde_yaml::Value) -> Vec Vec { + value + .get("alleles") + .and_then(serde_yaml::Value::as_mapping) + .and_then(|mapping| mapping.get(serde_yaml::Value::String("alts".to_owned()))) + .and_then(serde_yaml::Value::as_sequence) + .into_iter() + .flatten() + .filter_map(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) + .collect() +} + fn yaml_string(value: &serde_yaml::Value, key: &str) -> Option { value .get(key) diff --git a/rust/bioscript-schema/src/validator/spec.rs b/rust/bioscript-schema/src/validator/spec.rs index a778953..752a56d 100644 --- a/rust/bioscript-schema/src/validator/spec.rs +++ b/rust/bioscript-schema/src/validator/spec.rs @@ -8,6 +8,9 @@ pub(crate) fn variant_spec_from_root(root: &Value) -> Result Result Result { let grch37 = locus_from_root(root, "grch37")?; let grch38 = locus_from_root(root, "grch38")?; let reference = scalar_at(root, &["alleles", "ref"]); - let alternate = seq_of_strings(root, &["alleles", "alts"]).and_then(|alts| alts.first().cloned()); + let alts = seq_of_strings(root, &["alleles", "alts"]).unwrap_or_default(); + let observed_alternates = + seq_of_strings(root, &["alleles", "observed_alts"]).unwrap_or_else(|| alts.clone()); + let alternate = alts.first().cloned(); let deletion_length = value_at(root, &["alleles", "deletion_length"]) .and_then(Value::as_u64) .and_then(|value| usize::try_from(value).ok()); @@ -463,6 +466,7 @@ fn variant_spec_from_root(root: &Value) -> Result { grch38_assembly_ref: scalar_at(root, &["coordinates", "grch38", "assembly_ref"]), reference, alternate, + observed_alternates, kind, deletion_length, motifs, diff --git a/rust/bioscript-wasm/src/lookup_api.rs b/rust/bioscript-wasm/src/lookup_api.rs index 4c2f91f..d59c55e 100644 --- a/rust/bioscript-wasm/src/lookup_api.rs +++ b/rust/bioscript-wasm/src/lookup_api.rs @@ -27,6 +27,8 @@ struct VariantInput { #[serde(rename = "alt")] alt_base: String, #[serde(default)] + observed_alts: Vec, + #[serde(default)] rsid: Option, #[serde(default)] assembly: Option, @@ -138,15 +140,19 @@ pub fn lookup_cram_variants( assembly, ) } - VariantKind::Insertion | VariantKind::Indel => observe_cram_indel_with_reader( - &mut indexed, - &variant.name, - &locus, - &variant.ref_base, - &variant.alt_base, - variant.rsid.clone(), - assembly, - ), + VariantKind::Insertion | VariantKind::Indel => { + let alternate_lengths = indel_alternate_lengths(&variant); + observe_cram_indel_with_reader( + &mut indexed, + &variant.name, + &locus, + &variant.ref_base, + &variant.alt_base, + &alternate_lengths, + variant.rsid.clone(), + assembly, + ) + } other => { return Err(JsError::new(&format!( "variant {} has unsupported kind {:?} for web CRAM lookup", @@ -343,12 +349,32 @@ fn variant_input_to_spec(variant: &VariantInput) -> Result grch38_assembly_ref: None, reference: Some(variant.ref_base.clone()), alternate: Some(variant.alt_base.clone()), + observed_alternates: if variant.observed_alts.is_empty() { + vec![variant.alt_base.clone()] + } else { + variant.observed_alts.clone() + }, kind, deletion_length: variant.deletion_length, motifs: Vec::new(), }) } +fn indel_alternate_lengths(variant: &VariantInput) -> Vec { + let mut lengths = variant + .observed_alts + .iter() + .map(String::len) + .filter(|len| *len > 0) + .collect::>(); + if lengths.is_empty() { + lengths.push(variant.alt_base.len()); + } + lengths.sort_unstable(); + lengths.dedup(); + lengths +} + fn parse_variant_kind(kind: Option<&str>) -> Option { match kind.unwrap_or("").to_ascii_lowercase().as_str() { "snp" | "snv" | "variant" | "" => Some(VariantKind::Snp), diff --git a/rust/bioscript-wasm/src/report_lookup/alignment.rs b/rust/bioscript-wasm/src/report_lookup/alignment.rs index e78fb8b..8558e8f 100644 --- a/rust/bioscript-wasm/src/report_lookup/alignment.rs +++ b/rust/bioscript-wasm/src/report_lookup/alignment.rs @@ -144,12 +144,14 @@ fn observe_cram_variant( .unwrap_or("variant") )) })?; + let alternate_lengths = indel_alternate_lengths(variant, alternate); bioscript_formats::observe_cram_indel_with_reader( reader, label, &locus, reference, alternate, + &alternate_lengths, variant.rsids.first().cloned(), assembly, ) @@ -165,3 +167,18 @@ fn observe_cram_variant( ))), } } + +fn indel_alternate_lengths(variant: &VariantSpec, fallback_alternate: &str) -> Vec { + let mut lengths = variant + .observed_alternates + .iter() + .map(String::len) + .filter(|len| *len > 0) + .collect::>(); + if lengths.is_empty() { + lengths.push(fallback_alternate.len()); + } + lengths.sort_unstable(); + lengths.dedup(); + lengths +} diff --git a/rust/bioscript-wasm/src/report_lookup/alignment/bam.rs b/rust/bioscript-wasm/src/report_lookup/alignment/bam.rs index 16923d2..ff11c2f 100644 --- a/rust/bioscript-wasm/src/report_lookup/alignment/bam.rs +++ b/rust/bioscript-wasm/src/report_lookup/alignment/bam.rs @@ -153,12 +153,14 @@ fn observe_bam_variant( .unwrap_or("variant") )) })?; + let alternate_lengths = indel_alternate_lengths(variant, alternate); observe_bam_indel_with_reader( reader, label, &locus, reference, alternate, + &alternate_lengths, variant.rsids.first().cloned(), assembly, ) @@ -360,6 +362,7 @@ fn observe_bam_indel_with_reader( locus: &GenomicLocus, reference: &str, alternate: &str, + alternate_lengths: &[usize], matched_rsid: Option, assembly: Option, ) -> Result { @@ -380,8 +383,12 @@ fn observe_bam_indel_with_reader( if alignment_record.is_unmapped || !record_overlaps_locus(&alignment_record, locus) { continue; } - let classification = - classify_expected_indel(&alignment_record, locus, reference.len(), alternate)?; + let classification = classify_expected_indel_lengths( + &alignment_record, + locus, + reference.len(), + alternate_lengths, + )?; if !classification.covering { continue; } @@ -423,6 +430,21 @@ fn observe_bam_indel_with_reader( }) } +fn indel_alternate_lengths(variant: &VariantSpec, fallback_alternate: &str) -> Vec { + let mut lengths = variant + .observed_alternates + .iter() + .map(String::len) + .filter(|len| *len > 0) + .collect::>(); + if lengths.is_empty() { + lengths.push(fallback_alternate.len()); + } + lengths.sort_unstable(); + lengths.dedup(); + lengths +} + fn read_bam_header( reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, label: &str, @@ -470,7 +492,7 @@ mod pileup; use pileup::{ BamSnpPileupCounts, bam_alignment_record, bam_base_quality_at_reference_position, - classify_expected_indel, describe_copy_number_decision_rule, describe_snp_decision_rule, - indel_at_anchor, infer_copy_number_genotype, infer_snp_genotype, normalize_pileup_base, - record_overlaps_locus, spans_position, + classify_expected_indel_lengths, describe_copy_number_decision_rule, + describe_snp_decision_rule, indel_at_anchor, infer_copy_number_genotype, infer_snp_genotype, + normalize_pileup_base, record_overlaps_locus, spans_position, }; diff --git a/rust/bioscript-wasm/src/report_lookup/alignment/bam/pileup.rs b/rust/bioscript-wasm/src/report_lookup/alignment/bam/pileup.rs index bf0fbac..aed14af 100644 --- a/rust/bioscript-wasm/src/report_lookup/alignment/bam/pileup.rs +++ b/rust/bioscript-wasm/src/report_lookup/alignment/bam/pileup.rs @@ -207,13 +207,22 @@ pub(super) fn indel_at_anchor( None } +#[allow(dead_code)] pub(super) fn classify_expected_indel( record: &bioscript_formats::alignment::AlignmentRecord, locus: &GenomicLocus, reference_len: usize, alternate: &str, ) -> Result { - let alt_len = alternate.len(); + classify_expected_indel_lengths(record, locus, reference_len, &[alternate.len()]) +} + +pub(super) fn classify_expected_indel_lengths( + record: &bioscript_formats::alignment::AlignmentRecord, + locus: &GenomicLocus, + reference_len: usize, + alternate_lengths: &[usize], +) -> Result { let anchor_start = locus.start.saturating_sub(1); let anchor_end = locus.end; @@ -242,7 +251,7 @@ pub(super) fn classify_expected_indel( return Ok(IndelClassification { covering: true, reference_like: false, - matches_alt: observed_len == alt_len, + matches_alt: alternate_lengths.contains(&observed_len), observed_len, }); }