Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions rust/bioscript-formats/src/genotype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,15 @@ mod tests {
Some("--")
);
assert_eq!(genotype_from_vcf_gt("2/2", "A", &["G"]), None);
assert_eq!(
genotype_from_vcf_gt("0/1", "TTTTTTTTTTTTT", &["TTTTTTTTTTTT"]).as_deref(),
Some("TTTTTTTTTTTTT/TTTTTTTTTTTT")
);
assert_eq!(
genotype_from_vcf_gt("0/2", "TTTTTTTTTTTTT", &["TTTTTTTTTTTT", "TTTTTTTTTTTTTT"])
.as_deref(),
Some("TTTTTTTTTTTTT/TTTTTTTTTTTTTT")
);
assert_eq!(vcf_reference_token("AT", &["A"]), "I");
assert_eq!(vcf_reference_token("A", &["AT"]), "D");
assert_eq!(vcf_reference_token("A", &["<NON_REF>"]), "A");
Expand Down
10 changes: 8 additions & 2 deletions rust/bioscript-formats/src/genotype/vcf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,10 @@ fn resolve_vcf_row(
backend: backend.backend_name().to_owned(),
matched_rsid: Some(rsid.clone()),
assembly: targets.detected_assembly,
genotype: Some(row.genotype.clone()),
genotype: Some(matching::vcf_row_genotype_for_variant(
row,
&targets.variants[target_idx],
)),
evidence: vec![
format!("resolved by rsid {rsid}"),
format!("source line: {}", row.raw_line),
Expand Down Expand Up @@ -339,7 +342,10 @@ fn resolve_vcf_row(
backend: backend.backend_name().to_owned(),
matched_rsid: row.rsid.clone(),
assembly: targets.detected_assembly,
genotype: Some(row.genotype.clone()),
genotype: Some(matching::vcf_row_genotype_for_variant(
row,
&targets.variants[target_idx],
)),
evidence: vec![
format!("resolved by locus {}:{}", row.chrom, row.position),
format!("source line: {}", row.raw_line),
Expand Down
45 changes: 45 additions & 0 deletions rust/bioscript-formats/src/genotype/vcf/matching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,51 @@ pub(crate) fn vcf_row_matches_variant(
}
}

pub(crate) fn vcf_row_genotype_for_variant(row: &ParsedVcfRow, variant: &VariantSpec) -> String {
if !matches!(
variant.kind,
Some(VariantKind::Deletion | VariantKind::Insertion | VariantKind::Indel)
) {
return row.genotype.clone();
}

let parts: Vec<&str> = row.genotype.split('/').collect();
if parts.len() <= 1 {
return row.genotype.clone();
}

let alternates: Vec<&str> = row.alternates.iter().map(String::as_str).collect();
let mut tokens = Vec::with_capacity(parts.len());
for part in parts {
if part.eq_ignore_ascii_case(&row.reference) {
tokens.push(super::super::vcf_tokens::vcf_reference_token(
&row.reference,
&alternates,
));
} else if let Some(alternate) = row
.alternates
.iter()
.find(|alternate| alternate.eq_ignore_ascii_case(part))
{
tokens.push(super::super::vcf_tokens::vcf_alt_token(
&row.reference,
alternate,
));
} else {
return row.genotype.clone();
}
}

if tokens
.iter()
.all(|token| token.chars().count() == 1 && token != "--")
{
return super::super::normalize_genotype(&tokens.join(""));
}

row.genotype.clone()
}

fn snp_row_has_catalog_allele(row: &ParsedVcfRow, variant: &VariantSpec) -> bool {
let Some(alternate) = variant.alternate.as_ref() else {
return true;
Expand Down
4 changes: 2 additions & 2 deletions rust/bioscript-formats/src/genotype/vcf/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use noodles::tabix;

use bioscript_core::{Assembly, GenomicLocus, RuntimeError, VariantObservation, VariantSpec};

use super::{parse_vcf_record, vcf_row_matches_variant};
use super::{matching::vcf_row_genotype_for_variant, parse_vcf_record, vcf_row_matches_variant};

/// Observe a SNP at `locus` over an already-built tabix-indexed bgzipped VCF
/// reader. Caller builds `csi::io::IndexedReader::new(reader, tabix_index)`
Expand Down Expand Up @@ -208,7 +208,7 @@ where
backend: "vcf".to_owned(),
matched_rsid: matched_rsid.or_else(|| row.rsid.clone()),
assembly,
genotype: Some(row.genotype.clone()),
genotype: Some(vcf_row_genotype_for_variant(&row, variant)),
evidence: vec![format!("{label}: resolved by indexed locus {locus_label}")],
..VariantObservation::default()
});
Expand Down
29 changes: 24 additions & 5 deletions rust/bioscript-formats/src/genotype/vcf_tokens.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,40 @@ pub(crate) fn genotype_from_vcf_gt(
return Some("--".to_owned());
}

let ref_token = vcf_reference_token(reference, alternates);
let mut out = String::new();
let mut indexes = Vec::with_capacity(parts.len());
for part in parts {
let Ok(idx) = part.parse::<usize>() else {
return Some("--".to_owned());
};
indexes.push(idx);
}

let alternate_only = indexes.iter().all(|idx| *idx > 0);
let mut tokens = Vec::with_capacity(indexes.len());
for idx in indexes {
if idx == 0 {
out.push_str(&ref_token);
tokens.push(normalize_sequence_token(reference));
} else {
let alt = alternates.get(idx - 1)?;
out.push_str(&vcf_alt_token(reference, alt));
if is_symbolic_vcf_alt(alt) {
return Some("--".to_owned());
}
if alternate_only {
tokens.push(vcf_alt_token(reference, alt));
} else {
tokens.push(normalize_sequence_token(alt));
}
}
}

Some(normalize_genotype(&out))
if tokens
.iter()
.all(|token| token.chars().count() == 1 && token != "--")
{
return Some(normalize_genotype(&tokens.join("")));
}

Some(tokens.join("/"))
}

pub(crate) fn vcf_reference_token(reference: &str, alternates: &[&str]) -> String {
Expand Down
9 changes: 7 additions & 2 deletions rust/bioscript-reporting/src/manifest_catalogue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,13 @@ fn catalogue_row_task(
columns.value(row, "identifier.aliases"),
columns.separator("identifier.aliases"),
));
rsids.sort();
rsids.dedup();
let mut deduped_rsids = Vec::with_capacity(rsids.len());
for rsid in rsids {
if !deduped_rsids.iter().any(|seen| seen == &rsid) {
deduped_rsids.push(rsid);
}
}
let rsids = deduped_rsids;
let alternates = split_list(
columns.value(row, "alleles.alts"),
columns.separator("alleles.alts"),
Expand Down
42 changes: 38 additions & 4 deletions rust/bioscript-reporting/src/observation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,12 @@ fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value {
} else {
gene
};
let canonical_rsid = manifest.spec.rsids.first().filter(|rsid| !rsid.is_empty());
let matched_rsid = row.get("matched_rsid").filter(|rsid| !rsid.is_empty());
let alias_match_note = matched_rsid
.filter(|matched| canonical_rsid.is_some_and(|canonical| canonical != *matched))
.filter(|matched| manifest.spec.rsids.iter().any(|rsid| rsid == *matched))
.map(|matched| format!("matched alias: {matched}"));
let source = if source.is_null() {
manifest_default_source(&row, &manifest)
} else {
Expand All @@ -349,7 +355,7 @@ fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value {
"assay_version": "1.0",
"variant_key": manifest.name,
"variant_path": row_path,
"rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()),
"rsid": canonical_rsid.cloned().or_else(|| matched_rsid.cloned()),
"gene": gene,
"assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) },
"chrom": chrom,
Expand Down Expand Up @@ -377,6 +383,8 @@ fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value {
"match_quality": if weak_indel_match { serde_json::Value::String("weak".to_owned()) } else { serde_json::Value::Null },
"match_notes": if weak_indel_match {
serde_json::Value::String("consumer genotype file reported an insertion/deletion token at the marker, not sequence-resolved evidence for the exact deletion allele".to_owned())
} else if let Some(note) = alias_match_note {
serde_json::Value::String(note)
} else {
serde_json::Value::Null
},
Expand All @@ -397,10 +405,10 @@ fn manifest_default_source(
manifest: &VariantManifest,
) -> serde_json::Value {
let rsid = row
.get("matched_rsid")
.get("rsid")
.filter(|rsid| !rsid.is_empty())
.or_else(|| row.get("rsid").filter(|rsid| !rsid.is_empty()))
.or_else(|| manifest.spec.rsids.first().filter(|rsid| !rsid.is_empty()));
.or_else(|| manifest.spec.rsids.first().filter(|rsid| !rsid.is_empty()))
.or_else(|| row.get("matched_rsid").filter(|rsid| !rsid.is_empty()));
let Some(rsid) = rsid else {
return serde_json::Value::Null;
};
Expand Down Expand Up @@ -445,6 +453,32 @@ mod tests {
);
}

#[test]
fn repeat_indel_insertion_deletion_tokens_remain_ambiguous_without_sequence_alleles() {
assert_eq!(
normalize_app_genotype(
"II",
"TTTTTTTTTTTTT",
"TTTTTTTTTTTT",
Some(VariantKind::Indel),
"19",
None,
),
("II".to_owned(), "unknown".to_owned())
);
assert_eq!(
normalize_app_genotype(
"ID",
"TTTTTTTTTTTTT",
"TTTTTTTTTTTT",
Some(VariantKind::Indel),
"19",
None,
),
("ID".to_owned(), "unknown".to_owned())
);
}

#[test]
fn displays_cram_long_deletion_copy_number_as_insertion_deletion_tokens() {
let manifest = VariantManifest {
Expand Down
44 changes: 37 additions & 7 deletions rust/bioscript-reporting/src/observation/genotype_display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,17 @@ pub(super) fn normalize_app_genotype(
if display.is_empty() {
return ("./.".to_owned(), "unknown".to_owned());
}
if matches!(kind, Some(VariantKind::Deletion))
&& ref_allele.len() != 1
&& display
.chars()
.filter(char::is_ascii_alphabetic)
.all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D'))
if let Some((reference_token, alternate_token)) =
indel_display_tokens(display, ref_allele, alt_allele, kind)
{
return normalize_app_genotype(display, "I", "D", None, chrom, inferred_sex);
return normalize_app_genotype(
display,
&reference_token,
&alternate_token,
None,
chrom,
inferred_sex,
);
}
if let Some(normalized) = normalize_long_allele_genotype(display, ref_allele, alt_allele) {
return normalized;
Expand Down Expand Up @@ -109,6 +112,33 @@ pub(super) fn normalize_app_genotype(
}
}

fn indel_display_tokens(
display: &str,
ref_allele: &str,
_alt_allele: &str,
kind: Option<VariantKind>,
) -> Option<(String, String)> {
if ref_allele.len() <= 1
|| !matches!(kind, Some(VariantKind::Deletion | VariantKind::Insertion))
{
return None;
}
if !display
.chars()
.filter(char::is_ascii_alphabetic)
.all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D'))
{
return None;
}
if matches!(kind, Some(VariantKind::Deletion)) {
return Some(("I".to_owned(), "D".to_owned()));
}
if matches!(kind, Some(VariantKind::Insertion)) {
return Some(("D".to_owned(), "I".to_owned()));
}
None
}

fn normalize_long_allele_genotype(
display: &str,
ref_allele: &str,
Expand Down
Loading