diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 7e35e9b..8b5cbaf 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -1158,6 +1158,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34e7ed524a472dbd0dc69cc7b63408f37552e48460028af44a3024f2eb77f036" dependencies = [ "noodles-bam", + "noodles-bcf", "noodles-bgzf", "noodles-core", "noodles-cram", @@ -1181,6 +1182,18 @@ dependencies = [ "noodles-sam", ] +[[package]] +name = "noodles-bcf" +version = "0.86.0" +dependencies = [ + "indexmap", + "memchr", + "noodles-bgzf", + "noodles-core", + "noodles-csi", + "noodles-vcf", +] + [[package]] name = "noodles-bgzf" version = "0.47.0" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 79214a0..67ca3a6 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -22,6 +22,7 @@ codegen-units = 1 [patch.crates-io] lexical-util = { path = "vendor/lexical-util" } noodles-bam = { path = "../noodles/noodles-bam" } +noodles-bcf = { path = "../noodles/noodles-bcf" } noodles-bgzf = { path = "../noodles/noodles-bgzf" } noodles-core = { path = "../noodles/noodles-core" } noodles-cram = { path = "../noodles/noodles-cram" } diff --git a/rust/bioscript-cli/src/cli_bootstrap.rs b/rust/bioscript-cli/src/cli_bootstrap.rs index f27e560..80dd0ac 100644 --- a/rust/bioscript-cli/src/cli_bootstrap.rs +++ b/rust/bioscript-cli/src/cli_bootstrap.rs @@ -11,7 +11,7 @@ use std::{ use bioscript_formats::{ GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, InferredSex, InspectOptions, PrepareRequest, SexDetectionConfidence, SexInference, inspect_file, prepare_indexes, - shell_flags, + shell_flags, convert_23andme_grch37_to_grch38, }; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; use bioscript_schema::{ @@ -57,7 +57,7 @@ fn run_cli() -> Result<(), String> { Ok(()) } -const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--observations-file ] [--asset id=path] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] --output-dir [--html] [--open] [--root ] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--allow-md5-mismatch] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review --cases --output-dir [--html] [--root ] [--filter key=value]\n bioscript import-package [--root ] [--output-dir ]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ] [--detect-sex]"; +const USAGE: &str = "usage: bioscript [--root ] [--input-file ] [--output-file ] [--observations-file ] [--asset id=path] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|bcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript report --input-file [--input-file ...] --output-dir [--html] [--open] [--root ] [--input-format auto|text|zip|vcf|bcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--allow-md5-mismatch] [--detect-sex] [--sample-sex male|female|unknown] [--analysis-max-duration-ms N]\n bioscript review --cases --output-dir [--html] [--root ] [--filter key=value]\n bioscript import-package [--root ] [--output-dir ]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript validate-assays [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|bcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ] [--detect-sex]\n bioscript liftover-23andme [--unmapped ]"; struct CliOptions { script_path: Option, @@ -90,6 +90,7 @@ fn dispatch_subcommand(args: &[String]) -> Result { "validate-assays" => run_validate_assays(rest).map(|()| true), "prepare" => run_prepare(rest).map(|()| true), "inspect" => run_inspect(rest).map(|()| true), + "liftover-23andme" => run_liftover_23andme(rest).map(|()| true), _ => Ok(false), } } diff --git a/rust/bioscript-cli/src/cli_commands.rs b/rust/bioscript-cli/src/cli_commands.rs index 68898ca..e12ea8e 100644 --- a/rust/bioscript-cli/src/cli_commands.rs +++ b/rust/bioscript-cli/src/cli_commands.rs @@ -112,6 +112,53 @@ fn run_inspect(args: Vec) -> Result<(), String> { Ok(()) } +fn run_liftover_23andme(args: Vec) -> Result<(), String> { + let mut input: Option = None; + let mut output: Option = None; + let mut unmapped: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--unmapped" => { + unmapped = Some(PathBuf::from( + iter.next().ok_or("--unmapped requires a path")?, + )); + } + other if input.is_none() => input = Some(PathBuf::from(other)), + other if output.is_none() => output = Some(PathBuf::from(other)), + other => return Err(format!("unexpected argument: {other}")), + } + } + + let Some(input) = input else { + return Err( + "usage: bioscript liftover-23andme [--unmapped ]" + .to_owned(), + ); + }; + let Some(output) = output else { + return Err( + "usage: bioscript liftover-23andme [--unmapped ]" + .to_owned(), + ); + }; + let unmapped = unmapped.unwrap_or_else(|| output.with_extension("unmapped.tsv")); + + let stats = convert_23andme_grch37_to_grch38(&input, &output, &unmapped) + .map_err(|err| err.to_string())?; + println!("total_markers={}", stats.total_markers); + println!("mapped={}", stats.mapped); + println!("unmapped={}", stats.unmapped); + println!( + "reverse_strand_genotypes={}", + stats.reverse_strand_genotypes + ); + println!("output={}", output.display()); + println!("unmapped_report={}", unmapped.display()); + Ok(()) +} + fn run_validate_variants(args: Vec) -> Result<(), String> { let mut path: Option = None; let mut report_path: Option = None; @@ -321,6 +368,20 @@ alleles: assert!(run_inspect(vec!["sample.cram".to_owned(), "--input-index".to_owned()]) .unwrap_err() .contains("--input-index requires")); + + assert!(run_liftover_23andme(Vec::new()) + .unwrap_err() + .contains("usage")); + assert!(run_liftover_23andme(vec!["input.txt".to_owned()]) + .unwrap_err() + .contains("usage")); + assert!(run_liftover_23andme(vec![ + "input.txt".to_owned(), + "output.txt".to_owned(), + "--unmapped".to_owned(), + ]) + .unwrap_err() + .contains("--unmapped requires")); } #[test] @@ -365,6 +426,35 @@ alleles: fs::remove_dir_all(dir).unwrap(); } + #[test] + fn liftover_23andme_command_uses_bundled_chain() { + let dir = temp_dir("liftover"); + let input = dir.join("genome.txt"); + let output = dir.join("genome.grch38.txt"); + let unmapped = dir.join("unmapped.tsv"); + fs::write( + &input, + "# We are using reference human assembly build 37\n\ +rs1800437\t19\t46181392\tCG\n", + ) + .unwrap(); + + run_liftover_23andme(vec![ + input.display().to_string(), + output.display().to_string(), + "--unmapped".to_owned(), + unmapped.display().to_string(), + ]) + .unwrap(); + + let lifted = fs::read_to_string(&output).unwrap(); + assert!(lifted.contains("# Coordinates lifted")); + assert!(lifted.contains("rs1800437\t19\t45678134\tCG")); + assert!(fs::read_to_string(&unmapped).unwrap().contains("reason")); + + fs::remove_dir_all(dir).unwrap(); + } + #[test] fn validate_panels_and_assays_cover_report_and_error_paths() { let dir = temp_dir("panels-assays"); diff --git a/rust/bioscript-cli/src/report_options.rs b/rust/bioscript-cli/src/report_options.rs index a3248bf..0147363 100644 --- a/rust/bioscript-cli/src/report_options.rs +++ b/rust/bioscript-cli/src/report_options.rs @@ -312,7 +312,11 @@ fn loader_with_inspection( let mut loader = base.clone(); if loader.format.is_none() { loader.format = if inspection.container == bioscript_formats::FileContainer::Zip { - Some(GenotypeSourceFormat::Zip) + if inspection.detected_kind == bioscript_formats::DetectedKind::Bcf { + Some(GenotypeSourceFormat::Bcf) + } else { + Some(GenotypeSourceFormat::Zip) + } } else { match inspection.detected_kind { bioscript_formats::DetectedKind::AlignmentBam => { @@ -322,6 +326,7 @@ fn loader_with_inspection( Some(GenotypeSourceFormat::Cram) } bioscript_formats::DetectedKind::Vcf => Some(GenotypeSourceFormat::Vcf), + bioscript_formats::DetectedKind::Bcf => Some(GenotypeSourceFormat::Bcf), bioscript_formats::DetectedKind::GenotypeText => Some(GenotypeSourceFormat::Text), _ => None, } diff --git a/rust/bioscript-formats/Cargo.toml b/rust/bioscript-formats/Cargo.toml index 52db7fc..e48ea2a 100644 --- a/rust/bioscript-formats/Cargo.toml +++ b/rust/bioscript-formats/Cargo.toml @@ -9,7 +9,7 @@ crate-type = ["rlib"] [dependencies] bioscript-core = { path = "../bioscript-core" } flate2 = "1.1.9" -noodles = { version = "0.110.0", features = ["bam", "bgzf", "core", "cram", "csi", "fasta", "sam", "tabix", "vcf"] } +noodles = { version = "0.110.0", features = ["bam", "bcf", "bgzf", "core", "cram", "csi", "fasta", "sam", "tabix", "vcf"] } zip = { version = "2.2.0", default-features = false, features = ["deflate"] } [lints.clippy] diff --git a/rust/bioscript-formats/assets/liftover/hg19ToHg38.over.chain.gz b/rust/bioscript-formats/assets/liftover/hg19ToHg38.over.chain.gz new file mode 100644 index 0000000..6da318c Binary files /dev/null and b/rust/bioscript-formats/assets/liftover/hg19ToHg38.over.chain.gz differ diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index ca8617b..8324772 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -12,6 +12,7 @@ use bioscript_core::{RuntimeError, VariantObservation}; mod alignment_bytes; mod backends; mod bam_backend; +mod bcf; mod cache; mod common; mod cram_backend; @@ -39,8 +40,8 @@ use io::{ select_zip_entry, }; use types::{ - AlignmentBytesBackend, BamBackend, CramBackend, DelimitedBackend, QueryBackend, RsidMapBackend, - VcfBackend, + AlignmentBytesBackend, BamBackend, BcfBackend, BcfSource, CramBackend, DelimitedBackend, + QueryBackend, RsidMapBackend, VcfBackend, }; pub use types::{ BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, @@ -123,6 +124,17 @@ impl GenotypeStore { )), GenotypeSourceFormat::Zip => Self::from_zip_file(path, options), GenotypeSourceFormat::Vcf => Ok(Self::from_vcf_file(path, options)), + GenotypeSourceFormat::Bcf => { + if path + .to_string_lossy() + .to_ascii_lowercase() + .ends_with(".zip") + { + Self::from_zip_file(path, options) + } else { + Ok(Self::from_bcf_file(path, options)) + } + } GenotypeSourceFormat::Cram => Self::from_cram_file(path, options), GenotypeSourceFormat::Bam => Ok(Self::from_bam_file(path, options)), } @@ -137,6 +149,13 @@ impl GenotypeStore { if lower.ends_with(".zip") || bytes_look_like_zip(bytes) { return Self::from_zip_bytes(name, bytes); } + if lower.ends_with(".bcf") { + return Ok(Self::from_bcf_bytes( + name, + bytes, + &GenotypeLoadOptions::default(), + )); + } let reader = BufReader::new(Cursor::new(bytes)); if lower.ends_with(".vcf") || bytes_look_like_vcf(bytes) { return Self::from_vcf_reader(reader, name); @@ -148,6 +167,33 @@ impl GenotypeStore { let mut archive = ZipArchive::new(Cursor::new(bytes)).map_err(|err| { RuntimeError::Io(format!("failed to read genotype zip {name}: {err}")) })?; + let bcf_entries = collect_bcf_zip_entries_from_archive(&mut archive, name)?; + if !bcf_entries.is_empty() { + let mut entries = Vec::with_capacity(bcf_entries.len()); + for entry_name in bcf_entries { + let mut entry = archive.by_name(&entry_name).map_err(|err| { + RuntimeError::Io(format!( + "failed to open genotype entry {entry_name} in {name}: {err}" + )) + })?; + let mut data = Vec::new(); + std::io::Read::read_to_end(&mut entry, &mut data).map_err(|err| { + RuntimeError::Io(format!( + "failed to read genotype entry {entry_name} in {name}: {err}" + )) + })?; + entries.push((entry_name, data)); + } + return Ok(Self { + backend: QueryBackend::Bcf(BcfBackend { + source: BcfSource::ZipBytes { + name: name.to_owned(), + entries, + }, + options: GenotypeLoadOptions::default(), + }), + }); + } let mut selected = None; for idx in 0..archive.len() { let entry = archive.by_index(idx).map_err(|err| { @@ -159,6 +205,7 @@ impl GenotypeStore { let entry_name = entry.name().to_owned(); let lower = entry_name.to_ascii_lowercase(); if lower.ends_with(".vcf") + || lower.ends_with(".bcf") || lower.ends_with(".txt") || lower.ends_with(".tsv") || lower.ends_with(".csv") @@ -187,6 +234,20 @@ impl GenotypeStore { if selected.to_ascii_lowercase().ends_with(".vcf") { return Self::from_vcf_reader(reader, &label); } + if selected.to_ascii_lowercase().ends_with(".bcf") { + let mut entry = reader.into_inner(); + let mut data = Vec::new(); + std::io::Read::read_to_end(&mut entry, &mut data).map_err(|err| { + RuntimeError::Io(format!( + "failed to read genotype entry {selected} in {name}: {err}" + )) + })?; + return Ok(Self::from_bcf_bytes( + &label, + &data, + &GenotypeLoadOptions::default(), + )); + } Self::from_delimited_reader(GenotypeSourceFormat::Zip, reader, &label) } @@ -199,7 +260,40 @@ impl GenotypeStore { } } + fn from_bcf_file(path: &Path, options: &GenotypeLoadOptions) -> Self { + Self { + backend: QueryBackend::Bcf(BcfBackend { + source: BcfSource::File(path.to_path_buf()), + options: options.clone(), + }), + } + } + + fn from_bcf_bytes(name: &str, bytes: &[u8], options: &GenotypeLoadOptions) -> Self { + Self { + backend: QueryBackend::Bcf(BcfBackend { + source: BcfSource::Bytes { + name: name.to_owned(), + data: bytes.to_vec(), + }, + options: options.clone(), + }), + } + } + fn from_zip_file(path: &Path, options: &GenotypeLoadOptions) -> Result { + let bcf_entries = collect_bcf_zip_entries(path)?; + if !bcf_entries.is_empty() { + return Ok(Self { + backend: QueryBackend::Bcf(BcfBackend { + source: BcfSource::ZipFile { + path: path.to_path_buf(), + entries: bcf_entries, + }, + options: options.clone(), + }), + }); + } let selected = select_zip_entry(path)?; let lower = selected.to_ascii_lowercase(); if lower.ends_with(".vcf") || lower.ends_with(".vcf.gz") { @@ -308,6 +402,43 @@ impl GenotypeStore { } } +fn collect_bcf_zip_entries(path: &Path) -> Result, RuntimeError> { + let file = File::open(path).map_err(|err| { + RuntimeError::Io(format!( + "failed to open genotype zip {}: {err}", + path.display() + )) + })?; + let mut archive = ZipArchive::new(file).map_err(|err| { + RuntimeError::Io(format!( + "failed to read genotype zip {}: {err}", + path.display() + )) + })?; + collect_bcf_zip_entries_from_archive(&mut archive, &path.display().to_string()) +} + +fn collect_bcf_zip_entries_from_archive( + archive: &mut ZipArchive, + label: &str, +) -> Result, RuntimeError> { + let mut entries = Vec::new(); + for idx in 0..archive.len() { + let entry = archive.by_index(idx).map_err(|err| { + RuntimeError::Io(format!("failed to inspect genotype zip {label}: {err}")) + })?; + if entry.is_dir() { + continue; + } + let name = entry.name().to_owned(); + if name.to_ascii_lowercase().ends_with(".bcf") { + entries.push(name); + } + } + entries.sort(); + Ok(entries) +} + fn bytes_look_like_zip(bytes: &[u8]) -> bool { bytes.starts_with(b"PK\x03\x04") || bytes.starts_with(b"PK\x05\x06") diff --git a/rust/bioscript-formats/src/genotype/backends.rs b/rust/bioscript-formats/src/genotype/backends.rs index 584c18a..58b8168 100644 --- a/rust/bioscript-formats/src/genotype/backends.rs +++ b/rust/bioscript-formats/src/genotype/backends.rs @@ -1,8 +1,9 @@ use bioscript_core::{Assembly, GenomicLocus, RuntimeError, VariantObservation, VariantSpec}; use super::{ + bcf::scan_bcf_variants, describe_query, lookup_indexed_vcf_variants, scan_delimited_variants, scan_vcf_variants, - types::{DelimitedBackend, GenotypeSourceFormat, RsidMapBackend, VcfBackend}, + types::{BcfBackend, DelimitedBackend, GenotypeSourceFormat, RsidMapBackend, VcfBackend}, }; impl RsidMapBackend { @@ -11,6 +12,7 @@ impl RsidMapBackend { GenotypeSourceFormat::Text => "text", GenotypeSourceFormat::Zip => "zip", GenotypeSourceFormat::Vcf => "vcf", + GenotypeSourceFormat::Bcf => "bcf", GenotypeSourceFormat::Cram => "cram", GenotypeSourceFormat::Bam => "bam", } @@ -104,6 +106,7 @@ impl DelimitedBackend { GenotypeSourceFormat::Text => "text", GenotypeSourceFormat::Zip => "zip", GenotypeSourceFormat::Vcf => "vcf", + GenotypeSourceFormat::Bcf => "bcf", GenotypeSourceFormat::Cram => "cram", GenotypeSourceFormat::Bam => "bam", } @@ -164,3 +167,32 @@ impl VcfBackend { scan_vcf_variants(self, variants) } } + +impl BcfBackend { + pub(super) fn backend_name(&self) -> &'static str { + "bcf" + } + + pub(super) fn get(&self, rsid: &str) -> Result, RuntimeError> { + let results = self.lookup_variants(&[VariantSpec { + rsids: vec![rsid.to_owned()], + ..VariantSpec::default() + }])?; + Ok(results.into_iter().next().and_then(|obs| obs.genotype)) + } + + pub(super) fn lookup_variant( + &self, + variant: &VariantSpec, + ) -> Result { + let mut results = self.lookup_variants(std::slice::from_ref(variant))?; + Ok(results.pop().unwrap_or_default()) + } + + pub(super) fn lookup_variants( + &self, + variants: &[VariantSpec], + ) -> Result, RuntimeError> { + scan_bcf_variants(self, variants) + } +} diff --git a/rust/bioscript-formats/src/genotype/bcf.rs b/rust/bioscript-formats/src/genotype/bcf.rs new file mode 100644 index 0000000..3b644ae --- /dev/null +++ b/rust/bioscript-formats/src/genotype/bcf.rs @@ -0,0 +1,378 @@ +use std::{ + collections::{HashMap, HashSet}, + fs::File, + io::{Cursor, Read}, +}; + +use bioscript_core::{Assembly, RuntimeError, VariantKind, VariantObservation, VariantSpec}; +use noodles::bcf; +use zip::ZipArchive; + +mod records; + +use records::{bcf_record_to_vcf_row, read_bcf_header_lenient}; + +use super::{ + common::variant_sort_key, + types::{BcfBackend, BcfSource}, + vcf::{ + ParsedVcfRow, choose_variant_locus_for_assembly, imputed_reference_observation, + normalize_chromosome_name, vcf_row_genotype_for_variant, vcf_row_matches_variant, + }, +}; + +pub(crate) fn scan_bcf_variants( + backend: &BcfBackend, + variants: &[VariantSpec], +) -> Result, RuntimeError> { + let assembly = backend.options.assembly.or(Some(Assembly::Grch38)); + let mut indexed: Vec<(usize, &VariantSpec)> = variants.iter().enumerate().collect(); + indexed.sort_by_cached_key(|(_, variant)| variant_sort_key(variant)); + + let targets = BcfTargets::new(variants, assembly); + let mut results = vec![VariantObservation::default(); variants.len()]; + let mut unresolved = variants.len(); + + for shard in backend.shards()? { + if unresolved == 0 { + break; + } + if !targets.should_scan_shard(&shard.name) { + continue; + } + scan_bcf_shard( + backend, + &shard, + &targets, + &mut results, + &mut unresolved, + assembly, + )?; + } + + for (idx, variant) in indexed { + if results[idx].genotype.is_none() { + let label = backend.label(); + results[idx] = if backend.options.impute_vcf_missing_as_reference { + choose_variant_locus_for_assembly(variant, assembly) + .and_then(|locus| { + imputed_reference_observation( + backend.backend_name(), + &label, + variant, + &locus, + assembly, + backend.options.inferred_sex, + "no matching rsid or locus found in BCF", + ) + }) + .unwrap_or_else(|| missing_observation(backend, assembly)) + } else { + missing_observation(backend, assembly) + }; + } + } + + Ok(results) +} + +struct BcfTargets<'a> { + variants: &'a [VariantSpec], + assembly: Option, + rsid_targets: HashMap>, + coord_targets: HashMap<(String, i64), Vec>, + chroms: HashSet, +} + +impl<'a> BcfTargets<'a> { + fn new(variants: &'a [VariantSpec], assembly: Option) -> Self { + let mut rsid_targets: HashMap> = HashMap::new(); + let mut coord_targets: HashMap<(String, i64), Vec> = HashMap::new(); + let mut chroms = HashSet::new(); + for (idx, variant) in variants.iter().enumerate() { + for rsid in &variant.rsids { + rsid_targets.entry(rsid.clone()).or_default().push(idx); + } + if let Some(locus) = choose_variant_locus_for_assembly(variant, assembly) { + let chrom = normalize_chromosome_name(&locus.chrom); + chroms.insert(chrom.clone()); + coord_targets + .entry((chrom.clone(), locus.start)) + .or_default() + .push(idx); + if matches!( + variant.kind, + Some(VariantKind::Deletion | VariantKind::Insertion | VariantKind::Indel) + ) { + coord_targets + .entry((chrom, locus.start.saturating_sub(1))) + .or_default() + .push(idx); + } + } + } + Self { + variants, + assembly, + rsid_targets, + coord_targets, + chroms, + } + } + + fn should_scan_shard(&self, name: &str) -> bool { + if self.chroms.is_empty() { + return true; + } + let Some(chrom) = chrom_from_bcf_shard_name(name) else { + return true; + }; + self.chroms.contains(&chrom) + } +} + +struct BcfShard { + name: String, + data: BcfShardData, +} + +enum BcfShardData { + File(std::path::PathBuf), + Bytes(Vec), + ZipEntry { + zip_path: std::path::PathBuf, + entry_name: String, + }, +} + +impl BcfBackend { + fn label(&self) -> String { + match &self.source { + BcfSource::File(path) | BcfSource::ZipFile { path, .. } => path.display().to_string(), + BcfSource::Bytes { name, .. } | BcfSource::ZipBytes { name, .. } => name.clone(), + } + } + + fn shards(&self) -> Result, RuntimeError> { + match &self.source { + BcfSource::File(path) => Ok(vec![BcfShard { + name: path.display().to_string(), + data: BcfShardData::File(path.clone()), + }]), + BcfSource::Bytes { name, data } => Ok(vec![BcfShard { + name: name.clone(), + data: BcfShardData::Bytes(data.clone()), + }]), + BcfSource::ZipFile { path, entries } => Ok(entries + .iter() + .map(|entry_name| BcfShard { + name: entry_name.clone(), + data: BcfShardData::ZipEntry { + zip_path: path.clone(), + entry_name: entry_name.clone(), + }, + }) + .collect()), + BcfSource::ZipBytes { entries, .. } => Ok(entries + .iter() + .map(|(name, data)| BcfShard { + name: name.clone(), + data: BcfShardData::Bytes(data.clone()), + }) + .collect()), + } + } +} + +fn scan_bcf_shard( + backend: &BcfBackend, + shard: &BcfShard, + targets: &BcfTargets<'_>, + results: &mut [VariantObservation], + unresolved: &mut usize, + assembly: Option, +) -> Result<(), RuntimeError> { + match &shard.data { + BcfShardData::File(path) => { + let file = File::open(path).map_err(|err| { + RuntimeError::Io(format!("failed to open BCF file {}: {err}", path.display())) + })?; + scan_bcf_reader( + backend, + shard, + bcf::io::Reader::new(file), + targets, + results, + unresolved, + assembly, + ) + } + BcfShardData::Bytes(data) => scan_bcf_reader( + backend, + shard, + bcf::io::Reader::new(Cursor::new(data)), + targets, + results, + unresolved, + assembly, + ), + BcfShardData::ZipEntry { + zip_path, + entry_name, + } => { + let file = File::open(zip_path).map_err(|err| { + RuntimeError::Io(format!( + "failed to open genotype zip {}: {err}", + zip_path.display() + )) + })?; + let mut archive = ZipArchive::new(file).map_err(|err| { + RuntimeError::Io(format!( + "failed to read genotype zip {}: {err}", + zip_path.display() + )) + })?; + let mut entry = archive.by_name(entry_name).map_err(|err| { + RuntimeError::Io(format!( + "failed to open genotype entry {entry_name} in {}: {err}", + zip_path.display() + )) + })?; + let mut data = Vec::new(); + entry.read_to_end(&mut data).map_err(|err| { + RuntimeError::Io(format!( + "failed to read genotype entry {entry_name} in {}: {err}", + zip_path.display() + )) + })?; + scan_bcf_reader( + backend, + shard, + bcf::io::Reader::new(Cursor::new(data)), + targets, + results, + unresolved, + assembly, + ) + } + } +} + +fn scan_bcf_reader( + backend: &BcfBackend, + shard: &BcfShard, + mut reader: bcf::io::Reader, + targets: &BcfTargets<'_>, + results: &mut [VariantObservation], + unresolved: &mut usize, + assembly: Option, +) -> Result<(), RuntimeError> { + let header = read_bcf_header_lenient(&mut reader, &shard.name)?; + let string_maps = header.string_maps().clone(); + + for record_result in reader.records() { + if *unresolved == 0 { + break; + } + let record = record_result.map_err(|err| { + RuntimeError::Io(format!("failed to read BCF record {}: {err}", shard.name)) + })?; + let row = bcf_record_to_vcf_row(&header, &string_maps, &record, &shard.name)?; + resolve_bcf_row(backend, &row, targets, results, unresolved, assembly); + } + Ok(()) +} + +fn resolve_bcf_row( + backend: &BcfBackend, + row: &ParsedVcfRow, + targets: &BcfTargets<'_>, + results: &mut [VariantObservation], + unresolved: &mut usize, + assembly: Option, +) { + if let Some(rsid) = row.rsid.as_ref() + && let Some(target_indexes) = targets.rsid_targets.get(rsid) + { + for &target_idx in target_indexes { + if results[target_idx].genotype.is_none() { + results[target_idx] = observation_from_row( + backend, + row, + target_idx, + targets, + assembly, + Some(format!("resolved by rsid {rsid}")), + ); + *unresolved = (*unresolved).saturating_sub(1); + } + } + } + + if *unresolved == 0 { + return; + } + + let key = (normalize_chromosome_name(&row.chrom), row.position); + if let Some(target_indexes) = targets.coord_targets.get(&key) { + for &target_idx in target_indexes { + if results[target_idx].genotype.is_none() + && vcf_row_matches_variant(row, &targets.variants[target_idx], targets.assembly) + { + results[target_idx] = observation_from_row( + backend, + row, + target_idx, + targets, + assembly, + Some(format!("resolved by locus {}:{}", row.chrom, row.position)), + ); + *unresolved = (*unresolved).saturating_sub(1); + } + } + } +} + +fn observation_from_row( + backend: &BcfBackend, + row: &ParsedVcfRow, + target_idx: usize, + targets: &BcfTargets<'_>, + assembly: Option, + resolved: Option, +) -> VariantObservation { + let resolved = resolved.unwrap_or_else(|| "resolved by BCF record".to_owned()); + VariantObservation { + backend: backend.backend_name().to_owned(), + matched_rsid: row.rsid.clone(), + assembly, + genotype: Some(vcf_row_genotype_for_variant( + row, + &targets.variants[target_idx], + )), + evidence: vec![resolved, format!("source record: {}", row.raw_line)], + ..VariantObservation::default() + } +} + +fn missing_observation(backend: &BcfBackend, assembly: Option) -> VariantObservation { + VariantObservation { + backend: backend.backend_name().to_owned(), + assembly, + evidence: vec!["no matching rsid or locus found in BCF".to_owned()], + ..VariantObservation::default() + } +} + +fn chrom_from_bcf_shard_name(name: &str) -> Option { + let file = name.rsplit('/').next().unwrap_or(name); + let lower = file.to_ascii_lowercase(); + let chrom = lower.strip_suffix(".bcf").unwrap_or(&lower); + let chrom = chrom + .strip_prefix("chr") + .unwrap_or(chrom) + .split('.') + .next() + .unwrap_or(chrom); + (!chrom.is_empty()).then(|| chrom.to_owned()) +} diff --git a/rust/bioscript-formats/src/genotype/bcf/records.rs b/rust/bioscript-formats/src/genotype/bcf/records.rs new file mode 100644 index 0000000..351cb52 --- /dev/null +++ b/rust/bioscript-formats/src/genotype/bcf/records.rs @@ -0,0 +1,309 @@ +use std::io::Read; + +use bioscript_core::RuntimeError; +use noodles::{ + bcf, vcf, + vcf::variant::record::{ + AlternateBases as _, Ids as _, ReferenceBases as _, + samples::Sample as _, + samples::series::{Value as SampleValue, value::Array}, + }, +}; + +use super::{super::vcf::ParsedVcfRow, chrom_from_bcf_shard_name}; + +pub(super) fn read_bcf_header_lenient( + reader: &mut bcf::io::Reader, + label: &str, +) -> Result { + let mut header_reader = reader.header_reader(); + header_reader + .read_magic_number() + .map_err(|err| RuntimeError::Io(format!("failed to read BCF magic {label}: {err}")))?; + header_reader + .read_format_version() + .map_err(|err| RuntimeError::Io(format!("failed to read BCF version {label}: {err}")))?; + let mut raw_reader = header_reader + .raw_vcf_header_reader() + .map_err(|err| RuntimeError::Io(format!("failed to open BCF VCF header {label}: {err}")))?; + let mut raw = String::new(); + raw_reader + .read_to_string(&mut raw) + .map_err(|err| RuntimeError::Io(format!("failed to read BCF VCF header {label}: {err}")))?; + raw_reader.discard_to_end().map_err(|err| { + RuntimeError::Io(format!( + "failed to discard BCF VCF header padding {label}: {err}" + )) + })?; + + let sanitized = sanitize_bcf_vcf_header(&raw); + sanitized + .parse::() + .map_err(|err| RuntimeError::Io(format!("failed to parse BCF VCF header {label}: {err}"))) +} + +pub(super) fn bcf_record_to_vcf_row( + header: &vcf::Header, + string_maps: &vcf::header::StringMaps, + record: &bcf::Record, + label: &str, +) -> Result { + let chrom = record + .reference_sequence_name(string_maps) + .map(str::to_owned) + .or_else(|_| { + chrom_from_bcf_shard_name(label) + .map(|chrom| { + if chrom == "mt" { + "chrM".to_owned() + } else { + format!("chr{chrom}") + } + }) + .ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + "missing reference sequence name in contig string map", + ) + }) + }) + .map_err(|err| { + RuntimeError::Io(format!("{label}: failed to read BCF chromosome: {err}")) + })?; + let position = i64::try_from( + record + .variant_start() + .transpose() + .map_err(|err| { + RuntimeError::Io(format!("{label}: failed to read BCF position: {err}")) + })? + .ok_or_else(|| RuntimeError::Io(format!("{label}: BCF record missing position")))? + .get(), + ) + .map_err(|_| RuntimeError::Io(format!("{label}: BCF position exceeds i64 range")))?; + let ids_buf = record.ids(); + let ids: Vec<&str> = ids_buf.iter().collect(); + let rsid = ids + .iter() + .find(|id| !id.is_empty() && **id != ".") + .map(|id| (*id).to_owned()); + let reference = String::from_utf8( + record + .reference_bases() + .iter() + .collect::, _>>() + .map_err(|err| { + RuntimeError::Io(format!( + "{label}: failed to read BCF reference bases: {err}" + )) + })?, + ) + .map_err(|err| RuntimeError::Io(format!("{label}: invalid BCF reference bases: {err}")))?; + let alternates = record + .alternate_bases() + .iter() + .map(|result| result.map(str::to_owned)) + .collect::, _>>() + .map_err(|err| { + RuntimeError::Io(format!( + "{label}: failed to read BCF alternate bases: {err}" + )) + })?; + let hds = extract_hds(header, record, label)?; + let genotype = genotype_from_hds(&reference, alternates.first().map(String::as_str), &hds) + .unwrap_or_else(|| "--".to_owned()); + let raw_line = format!( + "{} {} {} {} {} FORMAT=HDS HDS={}", + chrom, + position, + rsid.as_deref().unwrap_or("."), + reference, + alternates.join(","), + hds.iter() + .map(|value| format_hds_value(*value)) + .collect::>() + .join(",") + ); + + Ok(ParsedVcfRow { + rsid, + chrom, + position, + reference, + alternates, + genotype, + raw_line, + }) +} + +fn sanitize_bcf_vcf_header(raw: &str) -> String { + let is_23andme_imputed = raw.contains("##DISCLAIMER=") && raw.contains("23andMe"); + let mut inserted_23andme_contigs = false; + raw.lines() + .filter_map(|line| { + if is_23andme_imputed && line.starts_with("##contig=<") { + if inserted_23andme_contigs { + return None; + } + inserted_23andme_contigs = true; + return Some( + (1..=22) + .map(|chrom| format!("##contig=")) + .chain(std::iter::once("##contig=".to_owned())) + .collect::>() + .join("\n"), + ); + } + if line.starts_with("##FORMAT=<") + && line.contains("ID=HDS") + && !line.contains("Description=") + { + Some( + "##FORMAT=" + .to_owned(), + ) + } else { + Some(line.to_owned()) + } + }) + .collect::>() + .join("\n") +} + +fn extract_hds( + header: &vcf::Header, + record: &bcf::Record, + label: &str, +) -> Result, RuntimeError> { + let samples = record + .samples() + .map_err(|err| RuntimeError::Io(format!("{label}: failed to read BCF samples: {err}")))?; + if let Some(values) = parse_first_bcf_float_series(samples.as_ref()) { + return Ok(values); + } + let Some(sample) = samples.get_index(0) else { + return Ok(Vec::new()); + }; + let Some(value) = sample + .get(header, "HDS") + .transpose() + .map_err(|err| RuntimeError::Io(format!("{label}: failed to read BCF HDS: {err}")))? + .flatten() + else { + return Ok(Vec::new()); + }; + match value { + SampleValue::Array(Array::Float(values)) => values + .iter() + .filter_map(|result| { + result + .map_err(|err| { + RuntimeError::Io(format!("{label}: failed to read HDS value: {err}")) + }) + .transpose() + }) + .collect(), + SampleValue::Float(value) => Ok(vec![value]), + _ => Ok(Vec::new()), + } +} + +fn parse_first_bcf_float_series(src: &[u8]) -> Option> { + let mut offset = 0usize; + skip_bcf_typed_value(src, &mut offset)?; + let descriptor = *src.get(offset)?; + offset += 1; + let (len, ty) = bcf_descriptor_len_ty(src, descriptor, &mut offset)?; + if ty != 5 { + return None; + } + let mut values = Vec::with_capacity(len); + for _ in 0..len { + let bytes: [u8; 4] = src.get(offset..offset + 4)?.try_into().ok()?; + offset += 4; + let value = f32::from_le_bytes(bytes); + if !value.is_nan() { + values.push(value); + } + } + Some(values) +} + +fn skip_bcf_typed_value(src: &[u8], offset: &mut usize) -> Option<()> { + let descriptor = *src.get(*offset)?; + *offset += 1; + let (len, ty) = bcf_descriptor_len_ty(src, descriptor, offset)?; + let width = match ty { + 1 | 7 => 1, + 2 => 2, + 3 | 5 => 4, + _ => return None, + }; + *offset = offset.checked_add(len.checked_mul(width)?)?; + (*offset <= src.len()).then_some(()) +} + +fn bcf_descriptor_len_ty(src: &[u8], descriptor: u8, offset: &mut usize) -> Option<(usize, u8)> { + let ty = descriptor & 0x0f; + let mut len = usize::from(descriptor >> 4); + if len == 15 { + let len_descriptor = *src.get(*offset)?; + *offset += 1; + let (len_len, len_ty) = bcf_descriptor_len_ty(src, len_descriptor, offset)?; + if len_len != 1 { + return None; + } + len = match len_ty { + 1 => usize::from(*src.get(*offset)?), + 2 => { + let bytes: [u8; 2] = src.get(*offset..*offset + 2)?.try_into().ok()?; + usize::try_from(i16::from_le_bytes(bytes)).ok()? + } + 3 => { + let bytes: [u8; 4] = src.get(*offset..*offset + 4)?.try_into().ok()?; + usize::try_from(i32::from_le_bytes(bytes)).ok()? + } + _ => return None, + }; + *offset += match len_ty { + 1 => 1, + 2 => 2, + 3 => 4, + _ => return None, + }; + } + Some((len, ty)) +} + +fn genotype_from_hds(reference: &str, alternate: Option<&str>, hds: &[f32]) -> Option { + let alternate = alternate?; + if reference.is_empty() || alternate.is_empty() || hds.is_empty() { + return None; + } + let mut alleles = Vec::with_capacity(hds.len().max(2)); + for dosage in hds { + if *dosage >= 0.5 { + alleles.push(alternate.to_owned()); + } else { + alleles.push(reference.to_owned()); + } + } + if alleles.len() == 1 { + alleles.push(reference.to_owned()); + } + if alleles.iter().any(|allele| allele.len() > 1) { + return Some(alleles.join("/")); + } + Some(super::super::normalize_genotype(&alleles.join("/"))) +} + +fn format_hds_value(value: f32) -> String { + if (value.fract()).abs() < f32::EPSILON { + format!("{value:.0}") + } else { + format!("{value:.3}") + .trim_end_matches('0') + .trim_end_matches('.') + .to_owned() + } +} diff --git a/rust/bioscript-formats/src/genotype/io.rs b/rust/bioscript-formats/src/genotype/io.rs index 8d69e55..8b35553 100644 --- a/rust/bioscript-formats/src/genotype/io.rs +++ b/rust/bioscript-formats/src/genotype/io.rs @@ -58,6 +58,7 @@ pub(crate) fn select_zip_entry(path: &Path) -> Result { || lower.ends_with(".tsv") || lower.ends_with(".vcf") || lower.ends_with(".vcf.gz") + || lower.ends_with(".bcf") { return Ok(name); } @@ -117,6 +118,9 @@ pub(crate) fn detect_source_format( if lower.ends_with(".vcf") || lower.ends_with(".vcf.gz") { return Ok(GenotypeSourceFormat::Vcf); } + if lower.ends_with(".bcf") { + return Ok(GenotypeSourceFormat::Bcf); + } let lines = read_plain_lines(path)?; if looks_like_vcf_lines(&lines) { diff --git a/rust/bioscript-formats/src/genotype/query.rs b/rust/bioscript-formats/src/genotype/query.rs index 0a375e1..745b879 100644 --- a/rust/bioscript-formats/src/genotype/query.rs +++ b/rust/bioscript-formats/src/genotype/query.rs @@ -13,10 +13,12 @@ impl GenotypeStore { rsid_lookup: true, locus_lookup: false, }, - QueryBackend::Delimited(_) | QueryBackend::Vcf(_) => BackendCapabilities { - rsid_lookup: true, - locus_lookup: true, - }, + QueryBackend::Delimited(_) | QueryBackend::Vcf(_) | QueryBackend::Bcf(_) => { + BackendCapabilities { + rsid_lookup: true, + locus_lookup: true, + } + } QueryBackend::Cram(_) | QueryBackend::Bam(_) | QueryBackend::AlignmentBytes(_) => { BackendCapabilities { rsid_lookup: false, @@ -48,6 +50,7 @@ impl GenotypeStore { QueryBackend::RsidMap(map) => map.backend_name(), QueryBackend::Delimited(backend) => backend.backend_name(), QueryBackend::Vcf(backend) => backend.backend_name(), + QueryBackend::Bcf(backend) => backend.backend_name(), QueryBackend::Cram(backend) => backend.backend_name(), QueryBackend::Bam(backend) => backend.backend_name(), QueryBackend::AlignmentBytes(backend) => backend.backend_name(), @@ -60,6 +63,7 @@ impl GenotypeStore { QueryBackend::RsidMap(map) => Ok(map.values.get(rsid).cloned()), QueryBackend::Delimited(backend) => backend.get(rsid), QueryBackend::Vcf(backend) => backend.get(rsid), + QueryBackend::Bcf(backend) => backend.get(rsid), QueryBackend::Cram(backend) => backend .lookup_variant(&VariantSpec { rsids: vec![rsid.to_owned()], @@ -111,6 +115,7 @@ impl GenotypeStore { QueryBackend::RsidMap(map) => map.lookup_variant(variant), QueryBackend::Delimited(backend) => backend.lookup_variant(variant), QueryBackend::Vcf(backend) => backend.lookup_variant(variant), + QueryBackend::Bcf(backend) => backend.lookup_variant(variant), QueryBackend::Cram(backend) => backend.lookup_variant(variant), QueryBackend::Bam(backend) => backend.lookup_variant(variant), QueryBackend::AlignmentBytes(backend) => backend.lookup_variant(variant), @@ -177,6 +182,9 @@ impl GenotypeStore { if let QueryBackend::Vcf(backend) = &self.backend { return backend.lookup_variants(variants); } + if let QueryBackend::Bcf(backend) = &self.backend { + return backend.lookup_variants(variants); + } if let QueryBackend::Cram(backend) = &self.backend { return backend.lookup_variants(variants); } diff --git a/rust/bioscript-formats/src/genotype/types.rs b/rust/bioscript-formats/src/genotype/types.rs index 65a7f7d..288116c 100644 --- a/rust/bioscript-formats/src/genotype/types.rs +++ b/rust/bioscript-formats/src/genotype/types.rs @@ -14,6 +14,7 @@ pub(crate) enum QueryBackend { RsidMap(RsidMapBackend), Delimited(DelimitedBackend), Vcf(VcfBackend), + Bcf(BcfBackend), Cram(CramBackend), Bam(BamBackend), /// In-memory CRAM/BAM: alignment + index (+ reference for CRAM) held as @@ -63,6 +64,29 @@ pub(crate) struct VcfBackend { pub(crate) options: GenotypeLoadOptions, } +#[derive(Debug, Clone)] +pub(crate) struct BcfBackend { + pub(crate) source: BcfSource, + pub(crate) options: GenotypeLoadOptions, +} + +#[derive(Debug, Clone)] +pub(crate) enum BcfSource { + File(PathBuf), + ZipFile { + path: PathBuf, + entries: Vec, + }, + Bytes { + name: String, + data: Vec, + }, + ZipBytes { + name: String, + entries: Vec<(String, Vec)>, + }, +} + #[derive(Debug, Clone)] pub(crate) struct CramBackend { pub(crate) path: PathBuf, @@ -107,6 +131,7 @@ pub enum GenotypeSourceFormat { Text, Zip, Vcf, + Bcf, Cram, Bam, } @@ -119,6 +144,7 @@ impl FromStr for GenotypeSourceFormat { "txt" | "text" | "genotype" => Ok(Self::Text), "zip" => Ok(Self::Zip), "vcf" => Ok(Self::Vcf), + "bcf" => Ok(Self::Bcf), "cram" => Ok(Self::Cram), "bam" => Ok(Self::Bam), other => Err(format!("unsupported input format: {other}")), diff --git a/rust/bioscript-formats/src/genotype/vcf.rs b/rust/bioscript-formats/src/genotype/vcf.rs index 18ad60e..f672ad9 100644 --- a/rust/bioscript-formats/src/genotype/vcf.rs +++ b/rust/bioscript-formats/src/genotype/vcf.rs @@ -21,7 +21,9 @@ mod matching; mod reader; pub use matching::{choose_variant_locus_for_assembly, imputed_reference_observation}; -pub(crate) use matching::{normalize_chromosome_name, vcf_row_matches_variant}; +pub(crate) use matching::{ + normalize_chromosome_name, vcf_row_genotype_for_variant, vcf_row_matches_variant, +}; pub use reader::{observe_vcf_snp_with_reader, observe_vcf_variant_with_reader}; #[derive(Debug, Clone)] diff --git a/rust/bioscript-formats/src/inspect.rs b/rust/bioscript-formats/src/inspect.rs index 9cc7520..27f052b 100644 --- a/rust/bioscript-formats/src/inspect.rs +++ b/rust/bioscript-formats/src/inspect.rs @@ -57,6 +57,7 @@ pub enum FileContainer { pub enum DetectedKind { GenotypeText, Vcf, + Bcf, AlignmentCram, AlignmentBam, ReferenceFasta, @@ -126,6 +127,19 @@ pub fn inspect_bytes( if lower.ends_with(".zip") { let selected_entry = select_zip_entry_from_bytes(bytes)?; + if selected_entry.to_ascii_lowercase().ends_with(".bcf") { + let mut inspection = inspect_from_bcf( + path, + FileContainer::Zip, + Some(selected_entry), + options, + started.elapsed().as_millis(), + ); + inspection + .evidence + .push("selected BCF zip entry".to_owned()); + return Ok(inspection); + } let sample_lines = read_zip_sample_lines_from_bytes(bytes, &selected_entry)?; let mut inspection = inspect_from_textual_sample( path, @@ -151,6 +165,9 @@ pub fn inspect_bytes( } else if lower.ends_with(".bam") { evidence.push("extension .bam".to_owned()); DetectedKind::AlignmentBam + } else if lower.ends_with(".bcf") { + evidence.push("extension .bcf".to_owned()); + DetectedKind::Bcf } else if is_reference_path(path) { evidence.push("reference fasta extension".to_owned()); DetectedKind::ReferenceFasta @@ -177,6 +194,7 @@ pub fn inspect_bytes( DetectedKind::AlignmentCram | DetectedKind::AlignmentBam | DetectedKind::ReferenceFasta => { Vec::new() } + DetectedKind::Bcf => Vec::new(), _ => read_plain_sample_lines_from_bytes(&lower, bytes)?, }; let inspection_context = inspect_context_name(&lower, options); @@ -239,6 +257,19 @@ pub fn inspect_file(path: &Path, options: &InspectOptions) -> Result Result Result { Vec::new() } + DetectedKind::Bcf => Vec::new(), _ => read_plain_sample_lines(path)?, }; let inspection_context = inspect_context_name(&lower, options); @@ -401,6 +436,33 @@ fn inspect_from_textual_sample( } } +fn inspect_from_bcf( + path: &Path, + container: FileContainer, + selected_entry: Option, + options: &InspectOptions, + duration_ms: u128, +) -> FileInspection { + let (has_index, index_path) = detect_index(path, DetectedKind::Bcf, options); + FileInspection { + path: path.to_path_buf(), + container, + detected_kind: DetectedKind::Bcf, + confidence: DetectionConfidence::Authoritative, + source: None, + assembly: Some(Assembly::Grch38), + phased: None, + selected_entry, + has_index, + index_path, + reference_matches: None, + inferred_sex: None, + evidence: vec!["BCF input".to_owned()], + warnings: Vec::new(), + duration_ms, + } +} + fn inspect_context_name(lower_name: &str, options: &InspectOptions) -> String { let mut context = lower_name.to_owned(); for path in [ diff --git a/rust/bioscript-formats/src/inspect/heuristics.rs b/rust/bioscript-formats/src/inspect/heuristics.rs index 1c03a2b..0cac9ca 100644 --- a/rust/bioscript-formats/src/inspect/heuristics.rs +++ b/rust/bioscript-formats/src/inspect/heuristics.rs @@ -422,9 +422,10 @@ pub(crate) fn classify_confidence( DetectedKind::Vcf if looks_like_vcf_lines(sample_lines) => { DetectionConfidence::Authoritative } - DetectedKind::AlignmentCram | DetectedKind::AlignmentBam | DetectedKind::ReferenceFasta => { - DetectionConfidence::Authoritative - } + DetectedKind::Bcf + | DetectedKind::AlignmentCram + | DetectedKind::AlignmentBam + | DetectedKind::ReferenceFasta => DetectionConfidence::Authoritative, DetectedKind::GenotypeText if source.is_some() => DetectionConfidence::StrongHeuristic, DetectedKind::GenotypeText => DetectionConfidence::WeakHeuristic, DetectedKind::Unknown => DetectionConfidence::Unknown, diff --git a/rust/bioscript-formats/src/inspect/io.rs b/rust/bioscript-formats/src/inspect/io.rs index a6f06a2..efca6c0 100644 --- a/rust/bioscript-formats/src/inspect/io.rs +++ b/rust/bioscript-formats/src/inspect/io.rs @@ -63,6 +63,7 @@ pub(crate) fn select_zip_entry_from_bytes(bytes: &[u8]) -> Result Result { let lower = name.to_ascii_lowercase(); if lower.ends_with(".vcf") || lower.ends_with(".vcf.gz") + || lower.ends_with(".bcf") || lower.ends_with(".txt") || lower.ends_with(".tsv") || lower.ends_with(".csv") diff --git a/rust/bioscript-formats/src/inspect/render.rs b/rust/bioscript-formats/src/inspect/render.rs index 13180f6..89240f0 100644 --- a/rust/bioscript-formats/src/inspect/render.rs +++ b/rust/bioscript-formats/src/inspect/render.rs @@ -89,6 +89,7 @@ pub(crate) fn render_kind(value: DetectedKind) -> &'static str { match value { DetectedKind::GenotypeText => "genotype_text", DetectedKind::Vcf => "vcf", + DetectedKind::Bcf => "bcf", DetectedKind::AlignmentCram => "alignment_cram", DetectedKind::AlignmentBam => "alignment_bam", DetectedKind::ReferenceFasta => "reference_fasta", diff --git a/rust/bioscript-formats/src/lib.rs b/rust/bioscript-formats/src/lib.rs index e8ae348..cfda399 100644 --- a/rust/bioscript-formats/src/lib.rs +++ b/rust/bioscript-formats/src/lib.rs @@ -11,6 +11,7 @@ pub mod alignment; mod genotype; mod inspect; +mod liftover; mod prepare; pub use genotype::{ @@ -24,4 +25,8 @@ pub use inspect::{ SexDetectionConfidence, SexInference, SourceMetadata, infer_sex_from_alignment_reader, infer_sex_from_named_reader, infer_sex_from_text_lines, inspect_bytes, inspect_file, }; +pub use liftover::{ + ChainIndex, LiftOverOptions, LiftedLocus, LiftoverStats, convert_23andme_grch37_to_grch38, + grch37_to_grch38_chain, +}; pub use prepare::{PrepareRequest, PreparedPaths, prepare_indexes, shell_flags}; diff --git a/rust/bioscript-formats/src/liftover.rs b/rust/bioscript-formats/src/liftover.rs new file mode 100644 index 0000000..e4b9116 --- /dev/null +++ b/rust/bioscript-formats/src/liftover.rs @@ -0,0 +1,533 @@ +use std::{ + cmp::Ordering, + collections::HashMap, + fs::File, + io::{self, BufRead, BufReader, Cursor, Write}, + path::Path, +}; + +use bioscript_core::{Assembly, RuntimeError}; +use flate2::read::GzDecoder; + +const HG19_TO_HG38_CHAIN_GZ: &[u8] = include_bytes!("../assets/liftover/hg19ToHg38.over.chain.gz"); + +const PRIMARY_CHROMS: [&str; 25] = [ + "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", + "18", "19", "20", "21", "22", "X", "Y", "MT", +]; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct LiftOverOptions { + pub primary_only: bool, +} + +impl Default for LiftOverOptions { + fn default() -> Self { + Self { primary_only: true } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct LiftedLocus { + pub chrom: String, + pub pos: i64, + pub strand: char, + pub score: i64, +} + +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct LiftoverStats { + pub total_markers: u64, + pub mapped: u64, + pub unmapped: u64, + pub reverse_strand_genotypes: u64, +} + +#[derive(Debug, Clone)] +pub struct ChainIndex { + from: Assembly, + to: Assembly, + blocks_by_chrom: HashMap, +} + +#[derive(Debug, Clone)] +struct ChromBlocks { + starts: Vec, + blocks: Vec, +} + +#[derive(Debug, Clone)] +struct ChainBlock { + source_start: i64, + source_end: i64, + target_start: i64, + target_name: String, + target_size: i64, + target_strand: char, + score: i64, + index: usize, +} + +#[derive(Debug, Clone)] +struct ChainHeader { + score: i64, + source_name: String, + source_pos: i64, + target_name: String, + target_size: i64, + target_strand: char, + target_pos: i64, + index: usize, +} + +impl ChainIndex { + pub fn from_reader( + from: Assembly, + to: Assembly, + reader: R, + options: LiftOverOptions, + ) -> Result { + if (from, to) != (Assembly::Grch37, Assembly::Grch38) { + return Err(RuntimeError::Unsupported( + "liftover currently supports only GRCh37 to GRCh38".to_owned(), + )); + } + + let mut blocks_by_chrom: HashMap> = HashMap::new(); + let mut current: Option = None; + let mut chain_index = 0usize; + + for raw in reader.lines() { + let raw = + raw.map_err(|err| RuntimeError::Io(format!("failed to read chain: {err}")))?; + let line = raw.trim(); + if line.is_empty() { + current = None; + continue; + } + + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.first() == Some(&"chain") { + current = parse_chain_header(&parts, chain_index, options.primary_only)?; + if current.is_some() { + chain_index += 1; + } + continue; + } + + let Some(header) = current.as_mut() else { + continue; + }; + let size = parse_i64(parts[0], "chain block size")?; + let source_start = header.source_pos; + let target_start = header.target_pos; + let source_end = source_start + size; + + blocks_by_chrom + .entry(header.source_name.clone()) + .or_default() + .push(ChainBlock { + source_start, + source_end, + target_start, + target_name: header.target_name.clone(), + target_size: header.target_size, + target_strand: header.target_strand, + score: header.score, + index: header.index, + }); + + if parts.len() == 3 { + header.source_pos = source_end + parse_i64(parts[1], "chain dt")?; + header.target_pos = target_start + size + parse_i64(parts[2], "chain dq")?; + } else { + current = None; + } + } + + let blocks_by_chrom = blocks_by_chrom + .into_iter() + .map(|(chrom, mut blocks)| { + blocks.sort_by(|a, b| { + a.source_start + .cmp(&b.source_start) + .then_with(|| b.score.cmp(&a.score)) + .then_with(|| a.index.cmp(&b.index)) + }); + let starts = blocks.iter().map(|block| block.source_start).collect(); + (chrom, ChromBlocks { starts, blocks }) + }) + .collect(); + + Ok(Self { + from, + to, + blocks_by_chrom, + }) + } + + pub fn lookup( + &self, + from: Assembly, + to: Assembly, + chrom: &str, + pos1: i64, + ) -> Option { + if from != self.from || to != self.to || pos1 <= 0 { + return None; + } + + let source_chrom = chain_chrom(chrom); + let chrom_blocks = self + .blocks_by_chrom + .get(&source_chrom) + .or_else(|| self.blocks_by_chrom.get(chrom))?; + let pos0 = pos1 - 1; + let mut i = chrom_blocks.starts.partition_point(|start| *start <= pos0); + if i == 0 { + return None; + } + i -= 1; + + let mut best: Option<&ChainBlock> = None; + loop { + let block = &chrom_blocks.blocks[i]; + if block.source_start <= pos0 + && pos0 < block.source_end + && best.is_none_or(|candidate| compare_blocks(block, candidate).is_gt()) + { + best = Some(block); + } + + if let Some(candidate) = best + && chrom_blocks.starts[i] < candidate.source_start - 1_000_000 + { + break; + } + if best.is_none() && pos0 - chrom_blocks.starts[i] > 10_000_000 { + break; + } + if i == 0 { + break; + } + i -= 1; + } + + let best = best?; + let offset = pos0 - best.source_start; + let target_pos0 = if best.target_strand == '+' { + best.target_start + offset + } else { + best.target_size - (best.target_start + offset) - 1 + }; + + Some(LiftedLocus { + chrom: canon_chrom(&best.target_name), + pos: target_pos0 + 1, + strand: best.target_strand, + score: best.score, + }) + } +} + +pub fn grch37_to_grch38_chain() -> Result { + let reader = BufReader::new(GzDecoder::new(Cursor::new(HG19_TO_HG38_CHAIN_GZ))); + ChainIndex::from_reader( + Assembly::Grch37, + Assembly::Grch38, + reader, + LiftOverOptions::default(), + ) +} + +pub fn convert_23andme_grch37_to_grch38( + input_path: &Path, + output_path: &Path, + unmapped_path: &Path, +) -> Result { + let chain = grch37_to_grch38_chain()?; + convert_23andme_with_chain(input_path, output_path, unmapped_path, &chain) +} + +pub fn convert_23andme_with_chain( + input_path: &Path, + output_path: &Path, + unmapped_path: &Path, + chain: &ChainIndex, +) -> Result { + let input = File::open(input_path).map_err(|err| { + RuntimeError::Io(format!("failed to open {}: {err}", input_path.display())) + })?; + let mut output = File::create(output_path).map_err(|err| { + RuntimeError::Io(format!("failed to create {}: {err}", output_path.display())) + })?; + let mut unmapped = File::create(unmapped_path).map_err(|err| { + RuntimeError::Io(format!( + "failed to create unmapped report {}: {err}", + unmapped_path.display() + )) + })?; + + convert_23andme_reader_with_chain(BufReader::new(input), &mut output, &mut unmapped, chain) +} + +pub fn convert_23andme_reader_with_chain( + reader: R, + output: &mut W, + unmapped: &mut U, + chain: &ChainIndex, +) -> Result { + let mut stats = LiftoverStats::default(); + let mut added_note = false; + writeln!(unmapped, "rsid\tchromosome\tposition\tgenotype\treason") + .map_err(write_error("unmapped report"))?; + + for raw in reader.lines() { + let raw = + raw.map_err(|err| RuntimeError::Io(format!("failed to read 23andMe txt: {err}")))?; + let line = raw.trim_end_matches(['\n', '\r']); + if line.starts_with('#') { + if !added_note && line.contains("reference human assembly build 37") { + writeln!( + output, + "# Coordinates lifted from reference human assembly build 37 to build 38 / GRCh38." + ) + .map_err(write_error("lifted 23andMe output"))?; + writeln!( + output, + "# Genotype bases were reverse-complemented for markers mapping through reverse-strand chains." + ) + .map_err(write_error("lifted 23andMe output"))?; + added_note = true; + } + writeln!( + output, + "{}", + line.replace("build 37", "build 38 / GRCh38") + .replace("Annotation Release 104", "GRCh38") + ) + .map_err(write_error("lifted 23andMe output"))?; + continue; + } + + if line.is_empty() { + writeln!(output).map_err(write_error("lifted 23andMe output"))?; + continue; + } + + stats.total_markers += 1; + let mut fields: Vec = line.split('\t').map(ToOwned::to_owned).collect(); + if fields.len() < 4 { + stats.unmapped += 1; + writeln!(unmapped, "{line}\tmalformed").map_err(write_error("unmapped report"))?; + continue; + } + + let Ok(pos) = fields[2].parse::() else { + stats.unmapped += 1; + writeln!( + unmapped, + "{}\t{}\t{}\t{}\tnon_integer_position", + fields[0], fields[1], fields[2], fields[3] + ) + .map_err(write_error("unmapped report"))?; + continue; + }; + + let Some(lifted) = chain.lookup(Assembly::Grch37, Assembly::Grch38, &fields[1], pos) else { + stats.unmapped += 1; + writeln!( + unmapped, + "{}\t{}\t{}\t{}\tno_primary_mapping", + fields[0], fields[1], fields[2], fields[3] + ) + .map_err(write_error("unmapped report"))?; + continue; + }; + + if lifted.strand == '-' { + fields[3] = revcomp_genotype(&fields[3]); + stats.reverse_strand_genotypes += 1; + } + fields[1] = lifted.chrom; + fields[2] = lifted.pos.to_string(); + writeln!(output, "{}", fields.join("\t")).map_err(write_error("lifted 23andMe output"))?; + stats.mapped += 1; + } + + Ok(stats) +} + +fn parse_chain_header( + parts: &[&str], + index: usize, + primary_only: bool, +) -> Result, RuntimeError> { + if parts.len() < 13 { + return Err(RuntimeError::Io("malformed chain header".to_owned())); + } + let source_name = parts[2].to_owned(); + let source_strand = parts[4]; + let source_pos = parse_i64(parts[5], "chain tStart")?; + let target_name = parts[7].to_owned(); + let target_size = parse_i64(parts[8], "chain qSize")?; + let target_strand = parts[9].chars().next().unwrap_or('+'); + let target_pos = parse_i64(parts[10], "chain qStart")?; + + if source_strand != "+" { + return Ok(None); + } + if primary_only + && (!is_primary_chrom(&canon_chrom(&source_name)) + || !is_primary_chrom(&canon_chrom(&target_name))) + { + return Ok(None); + } + + Ok(Some(ChainHeader { + score: parse_i64(parts[1], "chain score")?, + source_name, + source_pos, + target_name, + target_size, + target_strand, + target_pos, + index, + })) +} + +fn compare_blocks(a: &ChainBlock, b: &ChainBlock) -> Ordering { + a.score.cmp(&b.score).then_with(|| b.index.cmp(&a.index)) +} + +fn parse_i64(value: &str, label: &str) -> Result { + value + .parse::() + .map_err(|err| RuntimeError::Io(format!("failed to parse {label} '{value}': {err}"))) +} + +fn canon_chrom(name: &str) -> String { + let stripped = name.strip_prefix("chr").unwrap_or(name); + if stripped == "M" { + "MT".to_owned() + } else { + stripped.to_owned() + } +} + +fn chain_chrom(name: &str) -> String { + let canon = canon_chrom(name); + if canon == "MT" { + "chrM".to_owned() + } else { + format!("chr{canon}") + } +} + +fn is_primary_chrom(chrom: &str) -> bool { + PRIMARY_CHROMS.contains(&chrom) +} + +fn revcomp_genotype(genotype: &str) -> String { + genotype + .chars() + .map(|ch| match ch { + 'A' => 'T', + 'C' => 'G', + 'G' => 'C', + 'T' => 'A', + 'a' => 't', + 'c' => 'g', + 'g' => 'c', + 't' => 'a', + other => other, + }) + .collect() +} + +fn write_error(label: &'static str) -> impl FnOnce(io::Error) -> RuntimeError { + move |err| RuntimeError::Io(format!("failed to write {label}: {err}")) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn tiny_chain() -> ChainIndex { + let chain = "\ +chain 100 chr1 1000 + 9 30 chr1 1000 + 99 120 1\n\ +10 5 5\n\ +10\n\ +\n\ +chain 90 chr2 1000 + 49 60 chr2 1000 - 100 111 2\n\ +10\n\ +"; + ChainIndex::from_reader( + Assembly::Grch37, + Assembly::Grch38, + BufReader::new(chain.as_bytes()), + LiftOverOptions::default(), + ) + .unwrap() + } + + #[test] + fn bundled_chain_maps_known_23andme_marker() { + let chain = grch37_to_grch38_chain().unwrap(); + let lifted = chain + .lookup(Assembly::Grch37, Assembly::Grch38, "19", 46_181_392) + .unwrap(); + assert_eq!(lifted.chrom, "19"); + assert_eq!(lifted.pos, 45_678_134); + assert_eq!(lifted.strand, '+'); + } + + #[test] + fn chain_lookup_handles_forward_and_reverse_blocks() { + let chain = tiny_chain(); + let forward = chain + .lookup(Assembly::Grch37, Assembly::Grch38, "1", 11) + .unwrap(); + assert_eq!(forward.chrom, "1"); + assert_eq!(forward.pos, 101); + assert_eq!(forward.strand, '+'); + + let reverse = chain + .lookup(Assembly::Grch37, Assembly::Grch38, "2", 50) + .unwrap(); + assert_eq!(reverse.chrom, "2"); + assert_eq!(reverse.pos, 900); + assert_eq!(reverse.strand, '-'); + } + + #[test] + fn convert_23andme_lifts_positions_and_reverse_complements() { + let chain = tiny_chain(); + let input = b"# We are using reference human assembly build 37\n\ +rsForward\t1\t11\tAG\n\ +rsReverse\t2\t50\tAC\n\ +rsMissing\t3\t10\tTT\n"; + let mut out = Vec::new(); + let mut unmapped = Vec::new(); + let stats = convert_23andme_reader_with_chain( + BufReader::new(&input[..]), + &mut out, + &mut unmapped, + &chain, + ) + .unwrap(); + + assert_eq!(stats.total_markers, 3); + assert_eq!(stats.mapped, 2); + assert_eq!(stats.unmapped, 1); + assert_eq!(stats.reverse_strand_genotypes, 1); + + let out = String::from_utf8(out).unwrap(); + assert!(out.contains("# Coordinates lifted")); + assert!(out.contains("rsForward\t1\t101\tAG")); + assert!(out.contains("rsReverse\t2\t900\tTG")); + + let unmapped = String::from_utf8(unmapped).unwrap(); + assert!(unmapped.contains("rsMissing\t3\t10\tTT\tno_primary_mapping")); + } +} diff --git a/rust/bioscript-formats/tests/file_formats.rs b/rust/bioscript-formats/tests/file_formats.rs index a561b34..4c51a16 100644 --- a/rust/bioscript-formats/tests/file_formats.rs +++ b/rust/bioscript-formats/tests/file_formats.rs @@ -76,6 +76,8 @@ fn zip_bytes(entry_name: &str, contents: &[u8]) -> Vec { mod alignment_tests; #[path = "file_formats/basic.rs"] mod basic; +#[path = "file_formats/bcf.rs"] +mod bcf; #[path = "file_formats/cram.rs"] mod cram; #[path = "file_formats/delimited.rs"] diff --git a/rust/bioscript-formats/tests/file_formats/bcf.rs b/rust/bioscript-formats/tests/file_formats/bcf.rs new file mode 100644 index 0000000..038d4c5 --- /dev/null +++ b/rust/bioscript-formats/tests/file_formats/bcf.rs @@ -0,0 +1,427 @@ +use super::*; + +use bioscript_core::GenomicLocus; +use bioscript_formats::{ + DetectedKind, FileContainer, InspectOptions, convert_23andme_grch37_to_grch38, inspect_file, +}; +use noodles::core::Position; +use noodles::vcf::{ + self, + header::record::value::{ + Map, + map::{ + Contig, Filter, Format, + format::{Number, Type}, + }, + }, + variant::{ + io::Write as _, + record_buf::{ + Samples, + samples::{Keys, sample::Value}, + }, + }, +}; +use std::{ + io::{BufRead, BufReader}, + path::Path, +}; + +type BcfRecord<'a> = (&'a str, i64, &'a str, &'a str, &'a str, &'a [f32]); + +struct LiftedVariants { + txt_genotypes: Vec, + variants: Vec, +} + +struct Concordance { + queried: usize, + found: usize, + matched: usize, + mismatched: usize, + examples: Vec, +} + +fn locus(chrom: &str, start: i64, end: i64) -> GenomicLocus { + GenomicLocus { + chrom: chrom.to_owned(), + start, + end, + } +} + +fn bcf_bytes(records: &[BcfRecord<'_>]) -> Vec { + let header = vcf::Header::builder() + .add_filter("PASS", Map::::pass()) + .add_contig("chr6", Map::::new()) + .add_contig("chr19", Map::::new()) + .add_format( + "HDS", + Map::::new(Number::Unknown, Type::Float, "haploid alternate dosage"), + ) + .add_sample_name("1") + .build(); + + let mut data = Vec::new(); + let mut writer = noodles::bcf::io::Writer::new(&mut data); + writer.write_header(&header).unwrap(); + + let keys: Keys = ["HDS".to_owned()].into_iter().collect(); + for (chrom, pos, id, reference, alternate, hds) in records { + let samples = Samples::new( + keys.clone(), + vec![vec![Some(Value::from( + hds.iter().copied().map(Some).collect::>(), + ))]], + ); + let mut builder = vcf::variant::RecordBuf::builder() + .set_reference_sequence_name(*chrom) + .set_variant_start(Position::try_from(usize::try_from(*pos).unwrap()).unwrap()) + .set_reference_bases(*reference) + .set_alternate_bases(vec![(*alternate).to_owned()].into()) + .set_samples(samples); + if *id != "." { + builder = builder.set_ids([(*id).to_owned()].into_iter().collect()); + } + let record = builder.build(); + writer.write_variant_record(&header, &record).unwrap(); + } + writer.try_finish().unwrap(); + drop(writer); + data +} + +fn lifted_text_variants(lifted: &Path) -> LiftedVariants { + let mut txt_genotypes = Vec::new(); + let mut variants = Vec::new(); + let file = fs::File::open(lifted).unwrap(); + for line in BufReader::new(file).lines() { + let line = line.unwrap(); + if line.is_empty() || line.starts_with('#') { + continue; + } + let fields: Vec<&str> = line.split('\t').collect(); + if fields.len() < 4 { + continue; + } + let chrom = fields[1]; + if matches!(chrom, "Y" | "MT" | "M") { + continue; + } + let genotype = fields[3].trim().to_ascii_uppercase(); + if !is_acgt_genotype(&genotype) { + continue; + } + let Ok(pos) = fields[2].parse::() else { + continue; + }; + txt_genotypes.push(normalized_acgt_genotype(&genotype)); + variants.push(VariantSpec { + rsids: vec![fields[0].to_owned()], + grch38: Some(locus(chrom, pos, pos)), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }); + } + + LiftedVariants { + txt_genotypes, + variants, + } +} + +fn bcf_concordance( + bcf_zip: &Path, + variants: &[VariantSpec], + txt_genotypes: &[String], +) -> Concordance { + let bcf_store = GenotypeStore::from_file(bcf_zip).unwrap(); + let observations = bcf_store.lookup_variants(variants).unwrap(); + let mut concordance = Concordance { + queried: 0, + found: 0, + matched: 0, + mismatched: 0, + examples: Vec::new(), + }; + + for (idx, observation) in observations.iter().enumerate() { + concordance.queried += 1; + let Some(bcf_genotype) = observation.genotype.as_deref() else { + continue; + }; + concordance.found += 1; + if normalized_acgt_genotype(bcf_genotype) == txt_genotypes[idx] { + concordance.matched += 1; + } else { + concordance.mismatched += 1; + if concordance.examples.len() < 8 { + let locus = variants[idx].grch38.as_ref().unwrap(); + concordance.examples.push(format!( + "{} {}:{} txt={} bcf={}", + variants[idx].rsids[0], + locus.chrom, + locus.start, + txt_genotypes[idx], + bcf_genotype + )); + } + } + } + + concordance +} + +#[test] +fn individual_bcf_hds_records_are_readable_by_locus() { + let dir = temp_dir("bcf-hds"); + let path = dir.join("chr6.bcf"); + fs::write( + &path, + bcf_bytes(&[ + ("chr6", 39_011_352, ".", "A", "G", &[0.0, 0.0]), + ("chr6", 39_048_860, ".", "C", "T", &[1.0, 1.0]), + ("chr6", 39_051_898, ".", "G", "T", &[0.0, 1.0]), + ]), + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + assert_eq!(store.backend_name(), "bcf"); + + let observations = store + .lookup_variants(&[ + VariantSpec { + grch38: Some(locus("6", 39_011_352, 39_011_352)), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(locus("6", 39_048_860, 39_048_860)), + reference: Some("C".to_owned()), + alternate: Some("T".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(locus("6", 39_051_898, 39_051_898)), + reference: Some("G".to_owned()), + alternate: Some("T".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + ]) + .unwrap(); + + assert_eq!(observations[0].genotype.as_deref(), Some("AA")); + assert_eq!(observations[1].genotype.as_deref(), Some("TT")); + assert_eq!(observations[2].genotype.as_deref(), Some("GT")); + assert_eq!(observations[0].assembly, Some(Assembly::Grch38)); + assert!( + observations[2].evidence[1].contains("HDS=0,1"), + "{:?}", + observations[2].evidence + ); +} + +#[test] +fn zip_of_bcf_shards_is_treated_as_one_logical_store() { + let dir = temp_dir("bcf-zip"); + let zip_path = dir.join("imputed_genotype_data_r6_sample.zip"); + let chr6 = bcf_bytes(&[("chr6", 39_051_898, ".", "G", "T", &[0.0, 1.0])]); + let chr19 = bcf_bytes(&[("chr19", 45_678_134, ".", "G", "C", &[1.0, 0.0])]); + + let file = fs::File::create(&zip_path).unwrap(); + let mut writer = zip::ZipWriter::new(file); + writer + .start_file("fbbdf42da6/chr6.bcf", SimpleFileOptions::default()) + .unwrap(); + writer.write_all(&chr6).unwrap(); + writer + .start_file("fbbdf42da6/chr19.bcf", SimpleFileOptions::default()) + .unwrap(); + writer.write_all(&chr19).unwrap(); + writer.finish().unwrap(); + + let store = GenotypeStore::from_file(&zip_path).unwrap(); + assert_eq!(store.backend_name(), "bcf"); + + let observations = store + .lookup_variants(&[ + VariantSpec { + grch38: Some(locus("19", 45_678_134, 45_678_134)), + reference: Some("G".to_owned()), + alternate: Some("C".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + VariantSpec { + grch38: Some(locus("6", 39_051_898, 39_051_898)), + reference: Some("G".to_owned()), + alternate: Some("T".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }, + ]) + .unwrap(); + + assert_eq!(observations[0].genotype.as_deref(), Some("CG")); + assert_eq!(observations[1].genotype.as_deref(), Some("GT")); + + let inspection = inspect_file(&zip_path, &InspectOptions::default()).unwrap(); + assert_eq!(inspection.container, FileContainer::Zip); + assert_eq!(inspection.detected_kind, DetectedKind::Bcf); + assert_eq!(inspection.assembly, Some(Assembly::Grch38)); +} + +#[test] +fn bcf_left_anchored_indel_matches_23andme_style_catalog_variant() { + let dir = temp_dir("bcf-indel"); + let path = dir.join("chr19.bcf"); + fs::write( + &path, + bcf_bytes(&[ + ("chr19", 45_679_773, ".", "A", "AT", &[1.0, 0.0]), + ("chr19", 45_679_773, ".", "A", "ATT", &[0.0, 0.0]), + ]), + ) + .unwrap(); + + let store = GenotypeStore::from_file(&path).unwrap(); + let observation = store + .lookup_variant(&VariantSpec { + rsids: vec!["rs71338792".to_owned()], + grch38: Some(locus("19", 45_679_774, 45_679_786)), + reference: Some("TTTTTTTTTTTTT".to_owned()), + alternate: Some("TTTTTTTTTTTTTT".to_owned()), + kind: Some(VariantKind::Indel), + ..VariantSpec::default() + }) + .unwrap(); + + assert_eq!(observation.genotype.as_deref(), Some("DI")); + assert!( + observation.evidence[0].contains("resolved by locus chr19:45679773"), + "{:?}", + observation.evidence + ); +} + +#[test] +fn local_23andme_lifted_text_matches_real_bcf_zip_for_known_variant_when_enabled() { + if std::env::var_os("BIOSCRIPT_RUN_LOCAL_23ANDME_BCF").is_none() { + eprintln!("skipping local 23andMe BCF concordance; set BIOSCRIPT_RUN_LOCAL_23ANDME_BCF=1"); + return; + } + + let workspace = repo_root().parent().expect("workspace root").to_path_buf(); + let raw_txt = workspace.join("genome_Madhava_Jay_v4_Full_20250613052552.txt"); + let bcf_zip = workspace + .join("exvitae") + .join("imputed_genotype_data_r6_Madhava_Jay.zip"); + if !raw_txt.exists() || !bcf_zip.exists() { + eprintln!( + "skipping local 23andMe BCF concordance; missing {} or {}", + raw_txt.display(), + bcf_zip.display() + ); + return; + } + + let dir = temp_dir("real-23andme-bcf"); + let lifted = dir.join("genome.grch38.txt"); + let unmapped = dir.join("unmapped.tsv"); + convert_23andme_grch37_to_grch38(&raw_txt, &lifted, &unmapped).unwrap(); + + let lifted_store = GenotypeStore::from_file(&lifted).unwrap(); + let bcf_store = GenotypeStore::from_file(&bcf_zip).unwrap(); + let variant = VariantSpec { + rsids: vec!["rs1800437".to_owned()], + grch38: Some(locus("19", 45_678_134, 45_678_134)), + reference: Some("G".to_owned()), + alternate: Some("C".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }; + + let lifted_observation = lifted_store.lookup_variant(&variant).unwrap(); + let bcf_observation = bcf_store.lookup_variant(&variant).unwrap(); + assert_eq!(lifted_observation.genotype.as_deref(), Some("CG")); + assert_eq!(bcf_observation.genotype.as_deref(), Some("CG")); + assert_eq!(lifted_observation.genotype, bcf_observation.genotype); +} + +#[test] +fn local_23andme_lifted_text_broadly_matches_real_bcf_zip_when_enabled() { + if std::env::var_os("BIOSCRIPT_RUN_LOCAL_23ANDME_BCF").is_none() { + eprintln!( + "skipping local 23andMe BCF broad concordance; set BIOSCRIPT_RUN_LOCAL_23ANDME_BCF=1" + ); + return; + } + + let workspace = repo_root().parent().expect("workspace root").to_path_buf(); + let raw_txt = workspace.join("genome_Madhava_Jay_v4_Full_20250613052552.txt"); + let bcf_zip = workspace + .join("exvitae") + .join("imputed_genotype_data_r6_Madhava_Jay.zip"); + if !raw_txt.exists() || !bcf_zip.exists() { + eprintln!( + "skipping local 23andMe BCF broad concordance; missing {} or {}", + raw_txt.display(), + bcf_zip.display() + ); + return; + } + + let dir = temp_dir("real-23andme-bcf-broad"); + let lifted = dir.join("genome.grch38.txt"); + let unmapped = dir.join("unmapped.tsv"); + convert_23andme_grch37_to_grch38(&raw_txt, &lifted, &unmapped).unwrap(); + + let lifted_variants = lifted_text_variants(&lifted); + let concordance = bcf_concordance( + &bcf_zip, + &lifted_variants.variants, + &lifted_variants.txt_genotypes, + ); + + eprintln!( + "local 23andMe BCF broad concordance: queried={queried} found={found} matched={matched} mismatched={mismatched} missing={missing}", + queried = concordance.queried, + found = concordance.found, + matched = concordance.matched, + mismatched = concordance.mismatched, + missing = concordance.queried - concordance.found + ); + if !concordance.examples.is_empty() { + eprintln!("mismatch examples: {}", concordance.examples.join("; ")); + } + assert!( + concordance.queried > 500_000, + "expected broad 23andMe SNP comparison" + ); + assert!( + concordance.found > 500_000, + "expected most lifted SNPs to be present in BCF" + ); +} + +fn is_acgt_genotype(genotype: &str) -> bool { + !genotype.is_empty() + && genotype != "--" + && genotype + .chars() + .all(|base| matches!(base, 'A' | 'C' | 'G' | 'T')) +} + +fn normalized_acgt_genotype(genotype: &str) -> String { + let mut chars: Vec = genotype + .chars() + .filter(|base| matches!(base, 'A' | 'C' | 'G' | 'T')) + .collect(); + chars.sort_unstable(); + chars.into_iter().collect() +} diff --git a/rust/bioscript-reporting/src/report_json.rs b/rust/bioscript-reporting/src/report_json.rs index 1652809..3a88e34 100644 --- a/rust/bioscript-reporting/src/report_json.rs +++ b/rust/bioscript-reporting/src/report_json.rs @@ -141,6 +141,7 @@ fn detected_kind_name(value: bioscript_formats::DetectedKind) -> &'static str { match value { bioscript_formats::DetectedKind::GenotypeText => "genotype_text", bioscript_formats::DetectedKind::Vcf => "vcf", + bioscript_formats::DetectedKind::Bcf => "bcf", bioscript_formats::DetectedKind::AlignmentCram => "alignment_cram", bioscript_formats::DetectedKind::AlignmentBam => "alignment_bam", bioscript_formats::DetectedKind::ReferenceFasta => "reference_fasta", diff --git a/rust/bioscript-wasm/src/inspect_api.rs b/rust/bioscript-wasm/src/inspect_api.rs index 429e102..dcc4016 100644 --- a/rust/bioscript-wasm/src/inspect_api.rs +++ b/rust/bioscript-wasm/src/inspect_api.rs @@ -138,6 +138,7 @@ fn render_kind(k: DetectedKind) -> &'static str { match k { DetectedKind::GenotypeText => "genotype_text", DetectedKind::Vcf => "vcf", + DetectedKind::Bcf => "bcf", DetectedKind::AlignmentCram => "alignment_cram", DetectedKind::AlignmentBam => "alignment_bam", DetectedKind::ReferenceFasta => "reference_fasta",