diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 3da2a39..ca8617b 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -712,6 +712,15 @@ mod tests { Some("--") ); assert_eq!(genotype_from_vcf_gt("2/2", "A", &["G"]), None); + assert_eq!( + genotype_from_vcf_gt("0/1", "TTTTTTTTTTTTT", &["TTTTTTTTTTTT"]).as_deref(), + Some("TTTTTTTTTTTTT/TTTTTTTTTTTT") + ); + assert_eq!( + genotype_from_vcf_gt("0/2", "TTTTTTTTTTTTT", &["TTTTTTTTTTTT", "TTTTTTTTTTTTTT"]) + .as_deref(), + Some("TTTTTTTTTTTTT/TTTTTTTTTTTTTT") + ); assert_eq!(vcf_reference_token("AT", &["A"]), "I"); assert_eq!(vcf_reference_token("A", &["AT"]), "D"); assert_eq!(vcf_reference_token("A", &[""]), "A"); diff --git a/rust/bioscript-formats/src/genotype/vcf.rs b/rust/bioscript-formats/src/genotype/vcf.rs index 33a587c..18ad60e 100644 --- a/rust/bioscript-formats/src/genotype/vcf.rs +++ b/rust/bioscript-formats/src/genotype/vcf.rs @@ -309,7 +309,10 @@ fn resolve_vcf_row( backend: backend.backend_name().to_owned(), matched_rsid: Some(rsid.clone()), assembly: targets.detected_assembly, - genotype: Some(row.genotype.clone()), + genotype: Some(matching::vcf_row_genotype_for_variant( + row, + &targets.variants[target_idx], + )), evidence: vec![ format!("resolved by rsid {rsid}"), format!("source line: {}", row.raw_line), @@ -339,7 +342,10 @@ fn resolve_vcf_row( backend: backend.backend_name().to_owned(), matched_rsid: row.rsid.clone(), assembly: targets.detected_assembly, - genotype: Some(row.genotype.clone()), + genotype: Some(matching::vcf_row_genotype_for_variant( + row, + &targets.variants[target_idx], + )), evidence: vec![ format!("resolved by locus {}:{}", row.chrom, row.position), format!("source line: {}", row.raw_line), diff --git a/rust/bioscript-formats/src/genotype/vcf/matching.rs b/rust/bioscript-formats/src/genotype/vcf/matching.rs index a0bbddc..9496a4c 100644 --- a/rust/bioscript-formats/src/genotype/vcf/matching.rs +++ b/rust/bioscript-formats/src/genotype/vcf/matching.rs @@ -142,6 +142,51 @@ pub(crate) fn vcf_row_matches_variant( } } +pub(crate) fn vcf_row_genotype_for_variant(row: &ParsedVcfRow, variant: &VariantSpec) -> String { + if !matches!( + variant.kind, + Some(VariantKind::Deletion | VariantKind::Insertion | VariantKind::Indel) + ) { + return row.genotype.clone(); + } + + let parts: Vec<&str> = row.genotype.split('/').collect(); + if parts.len() <= 1 { + return row.genotype.clone(); + } + + let alternates: Vec<&str> = row.alternates.iter().map(String::as_str).collect(); + let mut tokens = Vec::with_capacity(parts.len()); + for part in parts { + if part.eq_ignore_ascii_case(&row.reference) { + tokens.push(super::super::vcf_tokens::vcf_reference_token( + &row.reference, + &alternates, + )); + } else if let Some(alternate) = row + .alternates + .iter() + .find(|alternate| alternate.eq_ignore_ascii_case(part)) + { + tokens.push(super::super::vcf_tokens::vcf_alt_token( + &row.reference, + alternate, + )); + } else { + return row.genotype.clone(); + } + } + + if tokens + .iter() + .all(|token| token.chars().count() == 1 && token != "--") + { + return super::super::normalize_genotype(&tokens.join("")); + } + + row.genotype.clone() +} + fn snp_row_has_catalog_allele(row: &ParsedVcfRow, variant: &VariantSpec) -> bool { let Some(alternate) = variant.alternate.as_ref() else { return true; diff --git a/rust/bioscript-formats/src/genotype/vcf/reader.rs b/rust/bioscript-formats/src/genotype/vcf/reader.rs index 6a2c1b1..2a18f14 100644 --- a/rust/bioscript-formats/src/genotype/vcf/reader.rs +++ b/rust/bioscript-formats/src/genotype/vcf/reader.rs @@ -7,7 +7,7 @@ use noodles::tabix; use bioscript_core::{Assembly, GenomicLocus, RuntimeError, VariantObservation, VariantSpec}; -use super::{parse_vcf_record, vcf_row_matches_variant}; +use super::{matching::vcf_row_genotype_for_variant, parse_vcf_record, vcf_row_matches_variant}; /// Observe a SNP at `locus` over an already-built tabix-indexed bgzipped VCF /// reader. Caller builds `csi::io::IndexedReader::new(reader, tabix_index)` @@ -208,7 +208,7 @@ where backend: "vcf".to_owned(), matched_rsid: matched_rsid.or_else(|| row.rsid.clone()), assembly, - genotype: Some(row.genotype.clone()), + genotype: Some(vcf_row_genotype_for_variant(&row, variant)), evidence: vec![format!("{label}: resolved by indexed locus {locus_label}")], ..VariantObservation::default() }); diff --git a/rust/bioscript-formats/src/genotype/vcf_tokens.rs b/rust/bioscript-formats/src/genotype/vcf_tokens.rs index c1aae58..9accdd0 100644 --- a/rust/bioscript-formats/src/genotype/vcf_tokens.rs +++ b/rust/bioscript-formats/src/genotype/vcf_tokens.rs @@ -15,21 +15,40 @@ pub(crate) fn genotype_from_vcf_gt( return Some("--".to_owned()); } - let ref_token = vcf_reference_token(reference, alternates); - let mut out = String::new(); + let mut indexes = Vec::with_capacity(parts.len()); for part in parts { let Ok(idx) = part.parse::() else { return Some("--".to_owned()); }; + indexes.push(idx); + } + + let alternate_only = indexes.iter().all(|idx| *idx > 0); + let mut tokens = Vec::with_capacity(indexes.len()); + for idx in indexes { if idx == 0 { - out.push_str(&ref_token); + tokens.push(normalize_sequence_token(reference)); } else { let alt = alternates.get(idx - 1)?; - out.push_str(&vcf_alt_token(reference, alt)); + if is_symbolic_vcf_alt(alt) { + return Some("--".to_owned()); + } + if alternate_only { + tokens.push(vcf_alt_token(reference, alt)); + } else { + tokens.push(normalize_sequence_token(alt)); + } } } - Some(normalize_genotype(&out)) + if tokens + .iter() + .all(|token| token.chars().count() == 1 && token != "--") + { + return Some(normalize_genotype(&tokens.join(""))); + } + + Some(tokens.join("/")) } pub(crate) fn vcf_reference_token(reference: &str, alternates: &[&str]) -> String { diff --git a/rust/bioscript-reporting/src/manifest_catalogue.rs b/rust/bioscript-reporting/src/manifest_catalogue.rs index 8fa5820..1b2d5b8 100644 --- a/rust/bioscript-reporting/src/manifest_catalogue.rs +++ b/rust/bioscript-reporting/src/manifest_catalogue.rs @@ -87,8 +87,13 @@ fn catalogue_row_task( columns.value(row, "identifier.aliases"), columns.separator("identifier.aliases"), )); - rsids.sort(); - rsids.dedup(); + let mut deduped_rsids = Vec::with_capacity(rsids.len()); + for rsid in rsids { + if !deduped_rsids.iter().any(|seen| seen == &rsid) { + deduped_rsids.push(rsid); + } + } + let rsids = deduped_rsids; let alternates = split_list( columns.value(row, "alleles.alts"), columns.separator("alleles.alts"), diff --git a/rust/bioscript-reporting/src/observation.rs b/rust/bioscript-reporting/src/observation.rs index 115762e..2d57846 100644 --- a/rust/bioscript-reporting/src/observation.rs +++ b/rust/bioscript-reporting/src/observation.rs @@ -338,6 +338,12 @@ fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value { } else { gene }; + let canonical_rsid = manifest.spec.rsids.first().filter(|rsid| !rsid.is_empty()); + let matched_rsid = row.get("matched_rsid").filter(|rsid| !rsid.is_empty()); + let alias_match_note = matched_rsid + .filter(|matched| canonical_rsid.is_some_and(|canonical| canonical != *matched)) + .filter(|matched| manifest.spec.rsids.iter().any(|rsid| rsid == *matched)) + .map(|matched| format!("matched alias: {matched}")); let source = if source.is_null() { manifest_default_source(&row, &manifest) } else { @@ -349,7 +355,7 @@ fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value { "assay_version": "1.0", "variant_key": manifest.name, "variant_path": row_path, - "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()), + "rsid": canonical_rsid.cloned().or_else(|| matched_rsid.cloned()), "gene": gene, "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, "chrom": chrom, @@ -377,6 +383,8 @@ fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value { "match_quality": if weak_indel_match { serde_json::Value::String("weak".to_owned()) } else { serde_json::Value::Null }, "match_notes": if weak_indel_match { serde_json::Value::String("consumer genotype file reported an insertion/deletion token at the marker, not sequence-resolved evidence for the exact deletion allele".to_owned()) + } else if let Some(note) = alias_match_note { + serde_json::Value::String(note) } else { serde_json::Value::Null }, @@ -397,10 +405,10 @@ fn manifest_default_source( manifest: &VariantManifest, ) -> serde_json::Value { let rsid = row - .get("matched_rsid") + .get("rsid") .filter(|rsid| !rsid.is_empty()) - .or_else(|| row.get("rsid").filter(|rsid| !rsid.is_empty())) - .or_else(|| manifest.spec.rsids.first().filter(|rsid| !rsid.is_empty())); + .or_else(|| manifest.spec.rsids.first().filter(|rsid| !rsid.is_empty())) + .or_else(|| row.get("matched_rsid").filter(|rsid| !rsid.is_empty())); let Some(rsid) = rsid else { return serde_json::Value::Null; }; @@ -445,6 +453,32 @@ mod tests { ); } + #[test] + fn repeat_indel_insertion_deletion_tokens_remain_ambiguous_without_sequence_alleles() { + assert_eq!( + normalize_app_genotype( + "II", + "TTTTTTTTTTTTT", + "TTTTTTTTTTTT", + Some(VariantKind::Indel), + "19", + None, + ), + ("II".to_owned(), "unknown".to_owned()) + ); + assert_eq!( + normalize_app_genotype( + "ID", + "TTTTTTTTTTTTT", + "TTTTTTTTTTTT", + Some(VariantKind::Indel), + "19", + None, + ), + ("ID".to_owned(), "unknown".to_owned()) + ); + } + #[test] fn displays_cram_long_deletion_copy_number_as_insertion_deletion_tokens() { let manifest = VariantManifest { diff --git a/rust/bioscript-reporting/src/observation/genotype_display.rs b/rust/bioscript-reporting/src/observation/genotype_display.rs index 4ca95f8..723fa0f 100644 --- a/rust/bioscript-reporting/src/observation/genotype_display.rs +++ b/rust/bioscript-reporting/src/observation/genotype_display.rs @@ -58,14 +58,17 @@ pub(super) fn normalize_app_genotype( if display.is_empty() { return ("./.".to_owned(), "unknown".to_owned()); } - if matches!(kind, Some(VariantKind::Deletion)) - && ref_allele.len() != 1 - && display - .chars() - .filter(char::is_ascii_alphabetic) - .all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D')) + if let Some((reference_token, alternate_token)) = + indel_display_tokens(display, ref_allele, alt_allele, kind) { - return normalize_app_genotype(display, "I", "D", None, chrom, inferred_sex); + return normalize_app_genotype( + display, + &reference_token, + &alternate_token, + None, + chrom, + inferred_sex, + ); } if let Some(normalized) = normalize_long_allele_genotype(display, ref_allele, alt_allele) { return normalized; @@ -109,6 +112,33 @@ pub(super) fn normalize_app_genotype( } } +fn indel_display_tokens( + display: &str, + ref_allele: &str, + _alt_allele: &str, + kind: Option, +) -> Option<(String, String)> { + if ref_allele.len() <= 1 + || !matches!(kind, Some(VariantKind::Deletion | VariantKind::Insertion)) + { + return None; + } + if !display + .chars() + .filter(char::is_ascii_alphabetic) + .all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D')) + { + return None; + } + if matches!(kind, Some(VariantKind::Deletion)) { + return Some(("I".to_owned(), "D".to_owned())); + } + if matches!(kind, Some(VariantKind::Insertion)) { + return Some(("D".to_owned(), "I".to_owned())); + } + None +} + fn normalize_long_allele_genotype( display: &str, ref_allele: &str,