From 346f794df34133a41458298ebbf9cecbe577eef9 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 15:56:05 -0600 Subject: [PATCH 01/26] fix(fpkm): update imports for zFPKM calculation improvements Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 60457fb9..a7e44e20 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -1,19 +1,23 @@ from __future__ import annotations import asyncio +import csv import functools +import io import json import re import sys +from collections.abc import Sequence from dataclasses import asdict, dataclass, field -from io import TextIOWrapper from itertools import chain from pathlib import Path from typing import Final, Literal, cast -import aiofiles import numpy as np +import numpy.typing as npt import pandas as pd +import pandera.pandas as pa +import pandera.typing.pandas as pat from fast_bioservices.biothings.mygene import MyGene from fast_bioservices.pipeline import gene_symbol_to_ensembl_and_gene_id from loguru import logger From 985c6f23ec033a84cf01b5caa8f2af677ab9a7d3 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:01:36 -0600 Subject: [PATCH 02/26] fix(fpkm): use Salmon quantification instead of STAR quantification Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 428 ++++++++++++++++++--------------- 1 file changed, 230 insertions(+), 198 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index a7e44e20..6835423e 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -27,71 +27,45 @@ @dataclass -class _STARinformation: - num_unmapped: list[int] - num_multimapping: list[int] - num_no_feature: list[int] - num_ambiguous: list[int] +class _QuantInformation: gene_names: list[str] count_matrix: pd.DataFrame - - @property - def num_genes(self) -> int: - return len(self.count_matrix) + sample_name: str + filepath: Path @classmethod - async def build_from_tab(cls, filepath: Path) -> _STARinformation: - if filepath.suffix != ".tab": + def build_from_sf(cls, filepath: Path) -> _QuantInformation: + if filepath.suffix != ".sf": _log_and_raise_error( - f"Building STAR information requires a '.tab' file; received: '{filepath}'", + f"Building quantification information requires a '.sf' file; received: '{filepath}'", error=ValueError, level=LogLevel.ERROR, ) if not filepath.exists(): _log_and_raise_error( - f"Unable to find the .tab file '{filepath}'", + f"Unable to find the .sf file: {filepath}", error=FileNotFoundError, level=LogLevel.ERROR, ) - async with aiofiles.open(filepath) as i_stream: - # Cannot use `asyncio.gather()` here because the order of execution is not guaranteed - unmapped = await i_stream.readline() - multimapping = await i_stream.readline() - no_feature = await i_stream.readline() - ambiguous = await i_stream.readline() - - num_unmapped = [int(i) for i in unmapped.removesuffix("\n").split("\t")[1:]] - num_multimapping = [int(i) for i in multimapping.removesuffix("\n").split("\t")[1:]] - num_no_feature = [int(i) for i in no_feature.removesuffix("\n").split("\t")[1:]] - num_ambiguous = [int(i) for i in ambiguous.removesuffix("\n").split("\t")[1:]] - - df = await _read_file( - path=filepath, + sample_name = filepath.stem.removesuffix("_quant.genes.sf") + df = pd.read_csv( + io.StringIO(filepath.read_text()), sep="\t", - header=None, - skiprows=4, - 
names=[ - "ensembl_gene_id", - "unstranded_rna_counts", - "first_read_transcription_strand", - "second_read_transcription_strand", - ], + names=["ensembl_gene_id", "length", "effective_length", "tpm", sample_name], ) - return _STARinformation( - num_unmapped=num_unmapped, - num_multimapping=num_multimapping, - num_no_feature=num_no_feature, - num_ambiguous=num_ambiguous, - gene_names=df["ensembl_gene_id"].values.tolist(), + return cls( + gene_names=df["ensembl_gene_id"].to_list(), count_matrix=df, + sample_name=sample_name, + filepath=filepath, ) @dataclass class _StudyMetrics: study_name: str - count_files: list[Path] + quant_files: list[Path] strand_files: list[Path] __sample_names: list[str] = field(default_factory=list) __num_samples: int = 0 @@ -105,24 +79,24 @@ def num_samples(self): return self.__num_samples def __post_init__(self): - self.__num_samples = len(self.count_files) - self.__sample_names = [f.stem for f in self.count_files] + self.__num_samples = len(self.quant_files) + self.__sample_names = [f.stem for f in self.quant_files] - if len(self.count_files) != len(self.strand_files): + if len(self.quant_files) != len(self.strand_files): _log_and_raise_error( ( f"Unequal number of count files and strand files for study '{self.study_name}'. " - f"Found {len(self.count_files)} count files and {len(self.strand_files)} strand files." + f"Found {len(self.quant_files)} count files and {len(self.strand_files)} strand files." ), error=ValueError, level=LogLevel.ERROR, ) - if self.num_samples != len(self.count_files): + if self.num_samples != len(self.quant_files): _log_and_raise_error( ( f"Unequal number of samples and count files for study '{self.study_name}'. " - f"Found {self.num_samples} samples and {len(self.count_files)} count files." + f"Found {self.num_samples} samples and {len(self.quant_files)} count files." ), error=ValueError, level=LogLevel.ERROR, @@ -145,7 +119,7 @@ def __post_init__(self): level=LogLevel.ERROR, ) - self.count_files.sort() + self.quant_files.sort() self.strand_files.sort() self.__sample_names.sort() @@ -153,39 +127,68 @@ def __post_init__(self): @dataclass(slots=True) class SampleConfiguration: sample_name: str - fragment_length: float + effective_lengths: pd.DataFrame + mean_effective_length: float layout: str strand: str study: str library_prep: str + def __post_init__(self): + if len(self.effective_lengths.columns) > 2: + _log_and_raise_error( + message=f"Effective lengths dataframe for sample '{self.sample_name}' has more than 2 columns, expected 'name' and 'effective_length'", + error=ValueError, + level=LogLevel.ERROR, + ) + if "name" not in self.effective_lengths.columns: + _log_and_raise_error( + message=f"Effective lengths dataframe for sample '{self.sample_name}' is missing 'name' column", + error=ValueError, + level=LogLevel.ERROR, + ) + if "effective_length" not in self.effective_lengths.columns: + _log_and_raise_error( + message=f"Effective lengths dataframe for sample '{self.sample_name}' is missing 'effective_length' column", + error=ValueError, + level=LogLevel.ERROR, + ) -async def _read_text(path: Path | None, *, default: str, lower: bool = False) -> str: - if path is None: - return default - async with aiofiles.open(path) as f: - txt = (await f.read()).strip() - return txt.lower() if lower else txt + @classmethod + def to_dataframe(cls, samples: list[SampleConfiguration]) -> tuple[pd.DataFrame, pd.DataFrame]: + """Convert a list of SampleConfiguration to a dataframe. 
+ + :param samples: The list of SampleConfiguration objects to convert. + :return: A tuple of dataframes: + [0]: The sample configuration as a dataframe + [1]: The effective lengths as a separate data frame with `same_name` as columns + """ + config = pd.DataFrame( + columns=["sample_name", "mean_effective_length", "layout", "strand", "study", "library_prep"] + ) + genes = set() + for s in samples: + genes.update(s.effective_lengths["name"].to_list()) -def _sample_name_from_filepath(file: Path) -> str: - return re.search(r".+_S\d+R\d+(r\d+)?", file.stem).group() + lengths = pd.DataFrame(data=np.float64(0.0), columns=[s.sample_name for s in samples], index=list(genes)) + for sample in samples: + ids: list[str] = sample.effective_lengths["name"].to_list() + data: npt.NDArray[np.floating] = sample.effective_lengths["effective_length"].to_numpy(dtype=np.float64) + lengths.loc[ids, sample.sample_name] = data + return config, lengths -def _require_one(paths: list[Path], kind: Literal["layout", "strand", "preparation", "fragment"], label: str) -> Path | None: - if len(paths) == 1: - return paths[0] - if len(paths) == 0: - return None - _log_and_raise_error( - f"Multiple matching {kind} files for {label}, make sure there is only one copy for each replicate in COMO_input", - error=ValueError, - level=LogLevel.ERROR, - ) - return None # explicit return None to satisfy type-check + +def _sample_name_from_filepath(file: Path) -> str: + return re.search(r".+_S\d+R\d+(r\d+)?", file.stem).group() -def _require_one(paths: list[Path], kind: Literal["layout", "strand", "preparation", "fragment"], label: str) -> Path | None: +def _require_one( + paths: list[Path], + kind: Literal["layout", "strand", "preparation", "fragment"], + label: str, +) -> Path | None: if len(paths) == 1: return paths[0] if len(paths) == 0: @@ -195,22 +198,28 @@ def _require_one(paths: list[Path], kind: Literal["layout", "strand", "preparati error=ValueError, level=LogLevel.ERROR, ) - return None # explicit return None to satisfy type-check def _organize_gene_counts_files(data_dir: Path) -> list[_StudyMetrics]: - gene_count_dir = Path(data_dir, "geneCounts") + quant_dir = Path(data_dir, "quantification") strand_dir = Path(data_dir, "strandedness") - gene_counts_directories: list[Path] = sorted([p for p in gene_count_dir.glob("*") if not p.name.startswith(".")]) + if not quant_dir.exists(): + raise FileNotFoundError(f"Quantification directory not found: {quant_dir}") + + if not strand_dir.exists(): + raise FileNotFoundError(f"Strandedness directory not found: {strand_dir}") + + quantification_directories: list[Path] = sorted([p for p in quant_dir.glob("*") if not p.name.startswith(".")]) strandedness_directories: list[Path] = sorted([p for p in strand_dir.glob("*") if not p.name.startswith(".")]) - if len(gene_counts_directories) != len(strandedness_directories): + if len(quantification_directories) != len(strandedness_directories): _log_and_raise_error( ( - f"Unequal number of gene count directories and strandedness directories. " - f"Found {len(gene_counts_directories)} gene count directories and {len(strandedness_directories)} strandedness directories." - f"\nGene count directory: {gene_count_dir}\nStrandedness directory: {strand_dir}" + f"Unequal number of quantification directories and strandedness directories. " + f"Found {len(quantification_directories)} quantification directories and " + f"{len(strandedness_directories)} strandedness directories." 
+                f"\nQuantification directory: {quant_dir}\nStrandedness directory: {strand_dir}"
             ),
             error=ValueError,
             level=LogLevel.ERROR,
@@ -218,49 +227,34 @@ def _organize_gene_counts_files(data_dir: Path) -> list[_StudyMetrics]:
 
     # For each study, collect gene count files, fragment files, insert size files, layouts, and strandedness information
     study_metrics: list[_StudyMetrics] = []
-    for gene_dir, strand_dir in zip(gene_counts_directories, strandedness_directories, strict=True):
-        count_files = list(gene_dir.glob("*.tab"))
+    for quant, strand_dir in zip(quantification_directories, strandedness_directories, strict=True):
+        quant_files = list(quant.glob("*_quant.genes.sf"))
         strand_files = list(strand_dir.glob("*.txt"))
-        if len(count_files) == 0:
-            _log_and_raise_error(f"No count files found for study '{gene_dir.stem}'.", error=ValueError, level=LogLevel.ERROR)
+        if len(quant_files) == 0:
+            _log_and_raise_error(f"No quant files found for study '{quant.stem}'.", error=ValueError, level=LogLevel.ERROR)
         if len(strand_files) == 0:
             _log_and_raise_error(
-                f"No strandedness files found for study '{gene_dir.stem}'.",
+                f"No strandedness files found for study '{quant.stem}'.",
                 error=ValueError,
                 level=LogLevel.ERROR,
             )
 
         study_metrics.append(
             _StudyMetrics(
-                study_name=gene_dir.stem,
-                count_files=count_files,
+                study_name=quant.stem,
+                quant_files=quant_files,
                 strand_files=strand_files,
             )
         )
     return study_metrics
 
 
-async def _process_first_multirun_sample(strand_file: Path, all_counts_files: list[Path]):
+def _process_first_multirun_sample(strand_file: Path, all_quant_files: list[Path]):
     sample_count = pd.DataFrame()
-    all_star_information: list[_STARinformation] = await asyncio.gather(*[_STARinformation.build_from_tab(file) for file in all_counts_files])
-
-    for star_information in all_star_information:
-        strand_information = strand_file.read_text().rstrip("\n").lower()
-
-        if strand_information not in ("none", "first_read_transcription_strand", "second_read_transcription_strand"):
-            _log_and_raise_error(
-                (
-                    f"Unrecognized Strand Information: {strand_information}; "
-                    f"expected 'none', 'first_read_transcription_strand', or 'second_read_transcription_strand'"
-                ),
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+    quant_information: list[_QuantInformation] = [_QuantInformation.build_from_sf(f) for f in all_quant_files]
 
-        if strand_information == "none":
-            strand_information = "unstranded_rna_counts"
-
-        run_counts = star_information.count_matrix[["ensembl_gene_id", strand_information]]
+    for info in quant_information:
+        run_counts = info.count_matrix[["ensembl_gene_id", info.sample_name]]
         run_counts.columns = ["ensembl_gene_id", "counts"]
         sample_count = run_counts if sample_count.empty else sample_count.merge(run_counts, on=["ensembl_gene_id", "counts"], how="outer")
@@ -274,63 +268,48 @@ async def _process_standard_replicate(counts_file: Path, strand_file: Path, sample_name: str):
-    star_information = await _STARinformation.build_from_tab(counts_file)
-    strand_information = strand_file.read_text().rstrip("\n").lower()
-
-    if strand_information not in ("none", "first_read_transcription_strand", "second_read_transcription_strand"):
-        _log_and_raise_error(
-            (
-                f"Unrecognized Strand Information: {strand_information}; "
-                f"expected 'none', 'first_read_transcription_strand', or 'second_read_transcription_strand'"
-            ),
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
-    if strand_information == "none":
-        strand_information = "unstranded_rna_counts"
+def _process_standard_replicate(counts_file: Path, strand_file: Path, sample_name: str):
+    quant_information = _QuantInformation.build_from_sf(counts_file)
+    return quant_information.count_matrix
 
-    sample_count = star_information.count_matrix[["ensembl_gene_id", strand_information]]
-    sample_count.columns = ["ensembl_gene_id", sample_name]
-    return sample_count
-
 
-async def _prepare_sample_counts(
+def _prepare_sample_counts(
     sample_name: str,
     counts_file: Path,
     strand_file: Path,
-    all_counts_files: list[Path],
-) -> pd.DataFrame | Literal["SKIP"]:
+    all_quant_files: list[Path],
+) -> pd.DataFrame | None:
     # Test if the counts_file is the first run in a multi-run sample
     if re.search(r"R\d+r1", counts_file.as_posix()):
-        return await _process_first_multirun_sample(strand_file=strand_file, all_counts_files=all_counts_files)
-    elif re.search(r"R\d+r\d+", counts_file.as_posix()):
-        return "SKIP"
+        return _process_first_multirun_sample(strand_file=strand_file, all_quant_files=all_quant_files)
+    elif re.search(r"R\d+r[2-9]+", counts_file.as_posix()):
+        return None
     else:
-        return await _process_standard_replicate(counts_file, strand_file, sample_name)
+        return _process_standard_replicate(counts_file, strand_file, sample_name)
 
 
-async def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame:
+def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame:
     adjusted_index = 0
-    counts: pd.DataFrame | Literal["SKIP"] = await _prepare_sample_counts(
+    counts: pd.DataFrame | None = _prepare_sample_counts(
         sample_name=metrics.sample_names[0],
-        counts_file=metrics.count_files[0],
+        counts_file=metrics.quant_files[0],
         strand_file=metrics.strand_files[0],
-        all_counts_files=metrics.count_files,
+        all_quant_files=metrics.quant_files,
     )
     for i in range(1, metrics.num_samples):
-        new_counts = await _prepare_sample_counts(
+        new_counts = _prepare_sample_counts(
             sample_name=metrics.sample_names[i],
-            counts_file=metrics.count_files[i],
+            counts_file=metrics.quant_files[i],
             strand_file=metrics.strand_files[i],
-            all_counts_files=metrics.count_files,
+            all_quant_files=metrics.quant_files,
         )
-        if isinstance(new_counts, str) and new_counts == "SKIP":
+        if new_counts is None:
            adjusted_index += 1
            continue
 
+        assert isinstance(counts, pd.DataFrame)  # noqa: S101
         counts: pd.DataFrame = counts.merge(new_counts, on="ensembl_gene_id", how="outer")
         counts = counts.fillna(value=0)
@@ -340,14 +319,19 @@ def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame:
         old_col_name = counts.columns[i + 1 - adjusted_index]
         counts.rename(columns={old_col_name: new_sample_name}, inplace=True)
 
+    if counts is None:
+        raise ValueError(f"No valid counts were processed for study '{metrics.study_name}'")
+
     return counts
 
 
 async def _write_counts_matrix(
     *,
     config_df: pd.DataFrame,
+    fragment_lengths: pd.DataFrame,
     como_context_dir: Path,
     output_counts_matrix_filepath: Path,
+    output_fragment_lengths_filepath: Path,
     rna: RNAType,
 ) -> pd.DataFrame:
     """Create a counts matrix file by reading gene counts table(s).
 
     Args:
         config_df: Configuration DataFrame containing sample information.
         como_context_dir: Path to the COMO_input directory containing gene count files.
         output_counts_matrix_filepath: Path where the output counts matrix CSV will be saved.
         rna: RNAType enum indicating whether to process 'trna' or 'mrna' samples.
 
     Returns:
         A pandas DataFrame representing the final counts matrix.
""" study_metrics = _organize_gene_counts_files(data_dir=como_context_dir) - counts: list[pd.DataFrame] = await asyncio.gather(*[_create_sample_counts_matrix(metric) for metric in study_metrics]) - rna_specific_sample_names = set(config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist()) + counts: list[pd.DataFrame] = [_create_sample_counts_matrix(metric) for metric in study_metrics] + rna_specific_sample_names = set( + config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist() + ) final_matrix: pd.DataFrame = functools.reduce(lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts) final_matrix.fillna(value=0, inplace=True) - final_matrix.iloc[:, 1:] = final_matrix.iloc[:, 1:].astype(np.uint64) + final_matrix.iloc[:, 1:] = final_matrix.iloc[:, 1:].astype(int) final_matrix = cast(pd.DataFrame, final_matrix[["ensembl_gene_id", *rna_specific_sample_names]]) output_counts_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) + output_fragment_lengths_filepath.parent.mkdir(parents=True, exist_ok=True) + final_matrix.to_csv(output_counts_matrix_filepath, index=False) + fragment_lengths[rna_specific_sample_names].to_csv(output_fragment_lengths_filepath, index=True) + logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{output_counts_matrix_filepath}'") return final_matrix @@ -383,9 +373,9 @@ async def _create_config_df( # noqa: C901 gene_count_dirname: str = "geneCounts", layout_dirname: str = "layouts", strandedness_dirname: str = "strandedness", - fragment_sizes_dirname: str = "fragmentSizes", + quantification_dir: str = "quantification", prep_method_dirname: str = "prepMethods", -) -> pd.DataFrame: +) -> tuple[pd.DataFrame, pd.DataFrame]: """Create configuration sheet. The configuration file created is based on the gene counts matrix. @@ -398,15 +388,17 @@ async def _create_config_df( # noqa: C901 gene_count_dirname: Name of the subdirectory containing gene count files. layout_dirname: Name of the subdirectory containing layout files. strandedness_dirname: Name of the subdirectory containing strandedness files. - fragment_sizes_dirname: Name of the subdirectory containing fragment size files. + quantification_dir: Name of the subdirectory containing Salmon's quantification files. prep_method_dirname: Name of the subdirectory containing library preparation method files. Returns: - A pandas DataFrame representing the configuration sheet. + [0]: A pandas DataFrame representing the configuration sheet. 
+ [1]: Fragment lengths for downstream calculations """ label_regex: Final = re.compile(r"(?PS\d{1,3})(?PR\d{1,3})(?Pr\d{1,3})?") - gene_counts: list[Path] = list((como_context_dir / gene_count_dirname).rglob("*.tab")) - if not gene_counts: + quant_files: list[Path] = list((como_context_dir / quantification_dir).rglob("*.genes.sf")) + # gene_counts: list[Path] = list((como_context_dir / gene_count_dirname).rglob("*.tab")) + if not quant_files: _log_and_raise_error( f"No gene count files found in '{gene_count_dirname}'", error=FileNotFoundError, @@ -416,7 +408,7 @@ async def _create_config_df( # noqa: C901 auxillary_directories = { "layout": como_context_dir / layout_dirname, "strand": como_context_dir / strandedness_dirname, - "fragment": como_context_dir / fragment_sizes_dirname, + "quantification": como_context_dir / quantification_dir, "prep": como_context_dir / prep_method_dirname, } aux_lookup: dict[str, dict[str, Path]] = {kind: {} for kind in auxillary_directories} @@ -430,15 +422,15 @@ async def _create_config_df( # noqa: C901 aux_lookup[kind][m.group(0)] = p rows: list[SampleConfiguration] = [] - for gene_count_path in sorted(gene_counts): - m = label_regex.search(gene_count_path.as_posix()) - if not m: + for quant_file in sorted(quant_files): + m = label_regex.search(quant_file.as_posix()) + if m is None: _log_and_raise_error( - f"Filename '{gene_count_path.name}' does not match contextName_SXRYrZ.tab pattern", + f"Filename '{quant_file.name}' does not match contextName_SXRYrZ.tab pattern", error=ValueError, level=LogLevel.ERROR, ) - label = m.group(0) + label = m.group() study_number = m["study"] rep_number = m["rep"] sample_id = f"{context_name}_{study_number}{rep_number}" @@ -447,41 +439,50 @@ async def _create_config_df( # noqa: C901 strand_path = _require_one([aux_lookup["strand"].get(label)], "strand", label) prep_path = _require_one([aux_lookup["prep"].get(label)], "preparation", label) - layout, strand, prep = await asyncio.gather( - *[ - _read_text(layout_path, default="UNKNOWN"), - _read_text(strand_path, default="UNKNOWN"), - _read_text(prep_path, default="total", lower=True), - ], - ) + layout = layout_path.read_text().rstrip() + strand = strand_path.read_text().rstrip() + prep = prep_path.read_text().rstrip() if prep not in {"total", "mrna"}: _log_and_raise_error( f"Prep method must be 'total' or 'mrna' (got '{prep}') for {label}", error=ValueError, level=LogLevel.ERROR, ) + if layout == "": + _log_and_raise_error( + message=f"No layout file found for '{label}'.", + error=FileNotFoundError, + level=LogLevel.WARNING, + ) - fragment_label = f"{context_name}_{label}_fragment_size.txt" - frag_paths = [p for p in aux_lookup["fragment"].values() if p.name == fragment_label] - if not frag_paths and prep.lower() != RNAType.TRNA.value.lower(): - logger.warning(f"No fragment file for '{label}'; defaulting to 100 bp (needed for zFPKM).") - mean_frag = 100.0 - elif len(frag_paths) == 1 and layout == "single-end": - mean_frag = 0.0 - else: # 1-N files, paired end - dfs: list[pd.DataFrame] = cast( - typ=list[pd.DataFrame], - val=await asyncio.gather(*[_read_file(f, sep="\t", on_bad_lines="skip") for f in frag_paths]), + quant_paths = [p for p in aux_lookup["quantification"].values() if p.name == f"{sample_id}_quant.genes.sf"] + if ( + not quant_paths + and layout in ["paired-end", "", None] + and prep.lower() in [RNAType.TRNA.value.lower(), RNAType.MRNA.value.lower()] + ): + _log_and_raise_error( + message=f"No quantification file found for '{label}'; defaulting to 100 bp (needed 
for zFPKM).", + error=FileNotFoundError, + level=LogLevel.WARNING, ) - for df in dfs: - df["meanxcount"] = df["frag_mean"] * df["frag_count"] - counts = np.array([df["frag_count"].sum() for df in dfs]) - means = np.array([(df["meanxcount"] / df["frag_count"].sum()).sum() for df in dfs]) - mean_frag = float(np.average(means, weights=counts)) + elif len(quant_paths) == 1 and layout == "single-end": + effective_len = pd.DataFrame({"Name": [], "EffectiveLength": []}) + mean_effective_len = 0.0 # cannot compute FPKM for single-ended data + else: + df = _read_file(quant_file) + df.columns = [c.lower() for c in df.columns] + df = df.rename(columns={"effectivelength": "effective_length"}) + + effective_len = df[["name", "effective_length"]] + effective_len["effective_length"] = effective_len["effective_length"].astype(np.float64) + mean_effective_len: float = effective_len["effective_length"].sum() / len(df) + rows.append( SampleConfiguration( sample_name=sample_id, - fragment_length=mean_frag, + effective_lengths=effective_len, + mean_effective_length=mean_effective_len, layout=layout, strand=strand, study=study_number, @@ -489,8 +490,7 @@ async def _create_config_df( # noqa: C901 ) ) - df = pd.DataFrame.from_records([asdict(r) for r in rows]).sort_values("sample_name", ignore_index=True) - return df + return SampleConfiguration.to_dataframe(rows) # 6-3-25: Intentionally left commented-out code to test its replacement # gene_counts_dir = como_context_dir / gene_count_dirname @@ -735,14 +735,17 @@ async def _process_como_input( output_config_filepath: Path, como_context_dir: PATH_TYPE, output_counts_matrix_filepath: Path, + output_fragment_lengths_filepath: Path, rna: RNAType, ) -> None: - config_df = await _create_config_df(context_name, como_context_dir=como_context_dir) + config_df, fragment_lengths = _create_config_df(context_name, como_context_dir=como_context_dir) await _write_counts_matrix( config_df=config_df, + fragment_lengths=fragment_lengths, como_context_dir=como_context_dir, output_counts_matrix_filepath=output_counts_matrix_filepath, + output_fragment_lengths_filepath=output_fragment_lengths_filepath, rna=rna, ) with pd.ExcelWriter(output_config_filepath) as writer: @@ -756,6 +759,8 @@ async def _process( output_gene_info_filepath: Path, como_context_dir: Path | None, input_matrix_filepath: list[Path] | None, + output_trna_fragment_lengths_filepath: Path | None, + output_mrna_fragment_lengths_filepath: Path | None, output_trna_config_filepath: Path | None, output_mrna_config_filepath: Path | None, output_trna_matrix_filepath: Path | None, @@ -764,29 +769,50 @@ async def _process( cache: bool, create_gene_info_only: bool, ): - rna_types: list[tuple[RNAType, Path, Path]] = [] - if output_trna_config_filepath: - rna_types.append((RNAType.TRNA, output_trna_config_filepath, output_trna_matrix_filepath)) - if output_mrna_config_filepath: - rna_types.append((RNAType.MRNA, output_mrna_config_filepath, output_mrna_matrix_filepath)) + rna_types: list[tuple[RNAType, Path, Path, Path]] = [] + if output_trna_config_filepath is not None and output_trna_fragment_lengths_filepath is not None: + rna_types.append( + ( + RNAType.TRNA, + output_trna_config_filepath, + output_trna_matrix_filepath, + output_trna_fragment_lengths_filepath, + ) + ) + if output_mrna_config_filepath is not None and output_mrna_fragment_lengths_filepath is not None: + rna_types.append( + ( + RNAType.MRNA, + output_mrna_config_filepath, + output_mrna_matrix_filepath, + output_mrna_fragment_lengths_filepath, + ) + ) # if 
provided, iterate through como-input specific directories if not create_gene_info_only: - tasks = [] - for rna, output_config_filepath, output_matrix_filepath in rna_types: - tasks.append( - asyncio.create_task( - _process_como_input( - context_name=context_name, - output_config_filepath=output_config_filepath, - como_context_dir=como_context_dir, - output_counts_matrix_filepath=output_matrix_filepath, - rna=rna, - ) - ) + if como_context_dir is None: + _log_and_raise_error( + message="como_context_dir must be provided if create_gene_info_only is False", + error=ValueError, + level=LogLevel.ERROR, + ) + if output_trna_fragment_lengths_filepath is None: + _log_and_raise_error( + message="output_fragment_lengths_filepath must be provided if create_gene_info_only is False", + error=ValueError, + level=LogLevel.ERROR, ) - await asyncio.gather(*tasks) + for rna, out_config, out_matrix, out_frag_len in rna_types: + _process_como_input( + context_name=context_name, + output_config_filepath=out_config, + como_context_dir=como_context_dir, + output_counts_matrix_filepath=out_matrix, + output_fragment_lengths_filepath=out_frag_len, + rna=rna, + ) # create the gene info filepath based on provided data input_files = [] @@ -811,6 +837,8 @@ async def rnaseq_preprocess( output_gene_info_filepath: Path, como_context_dir: Path | None = None, input_matrix_filepath: Path | list[Path] | None = None, + output_trna_fragment_lengths_filepath: Path | None = None, + output_mrna_fragment_lengths_filepath: Path | None = None, output_trna_metadata_filepath: Path | None = None, output_mrna_metadata_filepath: Path | None = None, output_trna_count_matrix_filepath: Path | None = None, @@ -829,6 +857,8 @@ async def rnaseq_preprocess( :param context_name: The context/cell type being processed :param taxon: The NCBI taxonomy ID :param output_gene_info_filepath: Path to the output gene information CSV file + :param output_trna_fragment_lengths_filepath: Path to the output tRNA fragment lengths CSV file (if in "create" mode) + :param output_mrna_fragment_lengths_filepath: Path to the output mRNA fragment lengths CSV file (if in "create" mode) :param output_trna_metadata_filepath: Path to the output tRNA config file (if in "create" mode) :param output_mrna_metadata_filepath: Path to the output mRNA config file (if in "create" mode) :param output_trna_count_matrix_filepath: The path to write total RNA count matrices @@ -860,9 +890,11 @@ async def rnaseq_preprocess( input_matrix_filepath=input_matrix_filepath, output_gene_info_filepath=output_gene_info_filepath, output_trna_config_filepath=output_trna_metadata_filepath, - output_mrna_config_filepath=output_mrna_metadata_filepath, output_trna_matrix_filepath=output_trna_count_matrix_filepath, + output_trna_fragment_lengths_filepath=output_trna_fragment_lengths_filepath, + output_mrna_config_filepath=output_mrna_metadata_filepath, output_mrna_matrix_filepath=output_mrna_count_matrix_filepath, + output_mrna_fragment_lengths_filepath=output_mrna_fragment_lengths_filepath, cache=cache, create_gene_info_only=create_gene_info_only, ) From d35006351f966852431827fd58e7bbd00f3221e4 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:02:29 -0600 Subject: [PATCH 03/26] chore: ruff formatting Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 38 ++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 6835423e..7fd9ac78 100644 --- 
a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -256,7 +256,9 @@ def _process_first_multirun_sample(strand_file: Path, all_quant_files: list[Path
     for info in quant_information:
         run_counts = info.count_matrix[["ensembl_gene_id", info.sample_name]]
         run_counts.columns = ["ensembl_gene_id", "counts"]
-        sample_count = run_counts if sample_count.empty else sample_count.merge(run_counts, on=["ensembl_gene_id", "counts"], how="outer")
+        sample_count = (
+            run_counts if sample_count.empty else sample_count.merge(run_counts, on=["ensembl_gene_id", "counts"], how="outer")
+        )
 
     # Set na values to 0
     sample_count = sample_count.fillna(value="0")
@@ -336,14 +338,13 @@ async def _write_counts_matrix(
 ) -> pd.DataFrame:
     """Create a counts matrix file by reading gene counts table(s).
 
-    Args:
-        config_df: Configuration DataFrame containing sample information.
-        como_context_dir: Path to the COMO_input directory containing gene count files.
-        output_counts_matrix_filepath: Path where the output counts matrix CSV will be saved.
-        rna: RNAType enum indicating whether to process 'trna' or 'mrna' samples.
-
-    Returns:
-        A pandas DataFrame representing the final counts matrix.
+    :param config_df: Configuration DataFrame containing sample information.
+    :param fragment_lengths: DataFrame containing effective lengths for each gene and sample, used for zFPKM normalization.
+    :param como_context_dir: Path to the COMO_input directory containing gene count files.
+    :param output_counts_matrix_filepath: Path where the output counts matrix CSV will be saved.
+    :param output_fragment_lengths_filepath: Path where the output fragment lengths CSV will be saved.
+    :param rna: RNAType enum indicating whether to process 'trna' or 'mrna' samples.
+    :return: A pandas DataFrame representing the final counts matrix.
""" study_metrics = _organize_gene_counts_files(data_dir=como_context_dir) counts: list[pd.DataFrame] = [_create_sample_counts_matrix(metric) for metric in study_metrics] @@ -363,6 +364,7 @@ async def _write_counts_matrix( fragment_lengths[rna_specific_sample_names].to_csv(output_fragment_lengths_filepath, index=True) logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{output_counts_matrix_filepath}'") + return final_matrix @@ -675,9 +677,9 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]: return data["ensembl_gene_id"].tolist() try: conversion = await gene_symbol_to_ensembl_and_gene_id(symbols=data.var_names.tolist(), taxon=taxon) - except json.JSONDecodeError: + except json.JSONDecodeError as e: _log_and_raise_error( - f"Got a JSON decode error for file '{counts_matrix_filepaths}'", + f"Got a JSON decode error for file '{counts_matrix_filepaths}' ({e})", error=ValueError, level=LogLevel.CRITICAL, ) @@ -724,7 +726,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]: # we would set `entrez_gene_id` to int here as well, but not all ensembl ids are mapped to entrez ids, # and as a result, there are still "-" values in the entrez id column that cannot be converted to an integer - gene_info: pd.DataFrame = cast(pd.DataFrame, gene_info.sort_values(by="ensembl_gene_id")) + gene_info = gene_info.sort_values(by="ensembl_gene_id") output_filepath.parent.mkdir(parents=True, exist_ok=True) gene_info.to_csv(output_filepath, index=False) logger.success(f"Gene Info file written at '{output_filepath}'") @@ -733,7 +735,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]: async def _process_como_input( context_name: str, output_config_filepath: Path, - como_context_dir: PATH_TYPE, + como_context_dir: Path, output_counts_matrix_filepath: Path, output_fragment_lengths_filepath: Path, rna: RNAType, @@ -845,7 +847,7 @@ async def rnaseq_preprocess( output_mrna_count_matrix_filepath: Path | None = None, cache: bool = True, log_level: LogLevel | str = LogLevel.INFO, - log_location: str | TextIOWrapper = sys.stderr, + log_location: str | io.TextIOWrapper = sys.stderr, *, create_gene_info_only: bool = False, ) -> None: @@ -880,8 +882,12 @@ async def rnaseq_preprocess( input_matrix_filepath = [i.resolve() for i in _listify(input_matrix_filepath)] if input_matrix_filepath else None output_trna_metadata_filepath = output_trna_metadata_filepath.resolve() if output_trna_metadata_filepath else None output_mrna_metadata_filepath = output_mrna_metadata_filepath.resolve() if output_mrna_metadata_filepath else None - output_trna_count_matrix_filepath = output_trna_count_matrix_filepath.resolve() if output_trna_count_matrix_filepath else None - output_mrna_count_matrix_filepath = output_mrna_count_matrix_filepath.resolve() if output_mrna_count_matrix_filepath else None + output_trna_count_matrix_filepath = ( + output_trna_count_matrix_filepath.resolve() if output_trna_count_matrix_filepath else None + ) + output_mrna_count_matrix_filepath = ( + output_mrna_count_matrix_filepath.resolve() if output_mrna_count_matrix_filepath else None + ) await _process( context_name=context_name, From 748225015dbeb73ad29fb3c79d4d9ac23f0a7cfd Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:02:45 -0600 Subject: [PATCH 04/26] chore: fill with integers for faster processing Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py 
index 7fd9ac78..eebe1a78 100644
--- a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -261,14 +261,13 @@ def _process_first_multirun_sample(strand_file: Path, all_quant_files: list[Path
     )
 
     # Set na values to 0
-    sample_count = sample_count.fillna(value="0")
+    sample_count = sample_count.fillna(value=0)
     sample_count["counts"] = sample_count["counts"].astype(float)
 
-    count_sums = sample_count.groupby("ensembl_gene_id", as_index=False)["counts"].mean()
-    count_sums["counts"] = np.ceil(count_sums["counts"].astype(np.uint32))
-    count_sums.columns = ["ensembl_gene_id", _sample_name_from_filepath(strand_file)]
-    return count_sums
-
+    count_avg = sample_count.groupby("ensembl_gene_id", as_index=False)["counts"].mean()
+    count_avg["counts"] = np.ceil(count_avg["counts"]).astype(int)
+    count_avg.columns = ["ensembl_gene_id", _sample_name_from_filepath(strand_file)]
+    return count_avg
 
 def _process_standard_replicate(counts_file: Path, strand_file: Path, sample_name: str):

From 155c8221ccea3b4b5961151ab79a8ccaf5100d54 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:03:46 -0600
Subject: [PATCH 05/26] chore: remove unnecessary async function usage

Signed-off-by: Josh Loecker
---
 main/como/rnaseq_preprocess.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py
index eebe1a78..be0e714a 100644
--- a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -326,7 +326,7 @@ def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame:
     return counts
 
 
-async def _write_counts_matrix(
+def _write_counts_matrix(
     *,
     config_df: pd.DataFrame,
     fragment_lengths: pd.DataFrame,
@@ -367,7 +367,7 @@ async def _write_counts_matrix(
     return final_matrix
 
 
-async def _create_config_df(  # noqa: C901
+def _create_config_df(  # noqa: C901
     context_name: str,
     /,
     como_context_dir: Path,
@@ -670,7 +670,7 @@ async def _create_gene_info_file(
     """
 
     async def read_ensembl_gene_ids(file: Path) -> list[str]:
-        data = await _read_file(file, h5ad_as_df=False)
+        data = _read_file(file, h5ad_as_df=False)
         if isinstance(data, pd.DataFrame):
             data: pd.DataFrame
             return data["ensembl_gene_id"].tolist()
@@ -731,7 +731,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]:
     logger.success(f"Gene Info file written at '{output_filepath}'")
 
 
-async def _process_como_input(
+def _process_como_input(
     context_name: str,
     output_config_filepath: Path,
     como_context_dir: Path,
@@ -741,7 +741,7 @@ def _process_como_input(
 ) -> None:
     config_df, fragment_lengths = _create_config_df(context_name, como_context_dir=como_context_dir)
 
-    await _write_counts_matrix(
+    _write_counts_matrix(
         config_df=config_df,
         fragment_lengths=fragment_lengths,
         como_context_dir=como_context_dir,

From f7b3a0672325b9893282c71a6bfd3184012f8ba0 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:04:03 -0600
Subject: [PATCH 06/26] fix: remove non-existent genes from conversion

Signed-off-by: Josh Loecker
---
 main/como/rnaseq_preprocess.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py
index be0e714a..6defb860 100644
--- a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -684,6 +684,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]:
         )
 
         # Remove NA values from the ensembl_gene_id dataframe column
+        conversion = conversion[~conversion["ensembl_gene_id"].isna()]
         return conversion["ensembl_gene_id"].tolist()
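# Illustrative effect of the added filter (toy values, assuming standard pandas semantics):
#   conversion = pd.DataFrame({"ensembl_gene_id": ["ENSG01", None, "ENSG02"]})
#   conversion[~conversion["ensembl_gene_id"].isna()]  # keeps rows 0 and 2 only
# so the returned list contains only resolvable Ensembl IDs, never NaN placeholders.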
    logger.info("Fetching gene info - this can take up to 5 minutes depending on the number of genes and your internet connection")

From 0e4a2c3d9de8572c3d8bcb99a22aed4f60f58590 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:04:57 -0600
Subject: [PATCH 07/26] refactor: use more explicit (albeit longer) code to
 create gene_info dataframe object

Signed-off-by: Josh Loecker
---
 main/como/rnaseq_preprocess.py | 65 ++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py
index 6defb860..f9f1b6ce 100644
--- a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -351,7 +351,9 @@ def _write_counts_matrix(
         config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist()
     )
 
-    final_matrix: pd.DataFrame = functools.reduce(lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts)
+    final_matrix: pd.DataFrame = functools.reduce(
+        lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts
+    )
     final_matrix.fillna(value=0, inplace=True)
     final_matrix.iloc[:, 1:] = final_matrix.iloc[:, 1:].astype(int)
     final_matrix = cast(pd.DataFrame, final_matrix[["ensembl_gene_id", *rna_specific_sample_names]])
@@ -687,42 +689,61 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]:
         conversion = conversion[~conversion["ensembl_gene_id"].isna()]
         return conversion["ensembl_gene_id"].tolist()
 
-    logger.info("Fetching gene info - this can take up to 5 minutes depending on the number of genes and your internet connection")
+    logger.info(
+        "Fetching gene info - this can take up to 5 minutes depending on the number of genes and your internet connection"
+    )
 
-    ensembl_ids: set[str] = set(chain.from_iterable(await asyncio.gather(*[read_ensembl_gene_ids(f) for f in counts_matrix_filepaths])))
+    ensembl_ids: set[str] = set(
+        chain.from_iterable(await asyncio.gather(*[read_ensembl_gene_ids(f) for f in counts_matrix_filepaths]))
+    )
     gene_data: list[dict[str, str | int | list[str] | list[int] | None]] = await MyGene(cache=cache).query(
         items=list(ensembl_ids),
         taxon=taxon,
         scopes="ensemblgene",
     )
-    gene_info: pd.DataFrame = pd.DataFrame(
-        data=None,
-        columns=pd.Index(data=["ensembl_gene_id", "gene_symbol", "entrez_gene_id", "size"]),
-        index=pd.Index(data=list(range(len(ensembl_ids)))),
-    )
+
+    n = len(gene_data)
+    all_gene_symbols: list[str] = ["-"] * n
+    all_entrez_ids: list[str | int] = ["-"] * n
+    all_ensembl_ids: list[str] = ["-"] * n
+    all_sizes: list[int] = [-1] * n
+
+    def _avg_pos(value: int | list[int] | None) -> int:
+        if value is None:
+            return 0
+        if isinstance(value, list):
+            return int(sum(value) / len(value)) if value else 0
+        return int(value)
 
     for i, data in enumerate(gene_data):
         data: dict[str, str | int | list[str] | list[int] | None]
-        ensembl_genes: str | list[str] = cast(str | list[str], data.get("ensembl.gene", "-"))
-        start_pos: int | list[int] = cast(int | list[int], data.get("genomic_pos.start", 0))
-        end_pos: int | list[int] = cast(int | list[int], data.get("genomic_pos.end", 0))
-        avg_start: int | float = sum(start_pos) / len(start_pos) if isinstance(start_pos, list) else start_pos
-        avg_end: int | float = sum(end_pos) / len(end_pos) if isinstance(end_pos, list) else end_pos
-        size: int = int(avg_end - avg_start)
+        start = _avg_pos(data.get("genomic_pos.start", 0))
+        end = _avg_pos(data.get("genomic_pos.end", 0))
+        size = end - start
+
+        ensembl_id: str | list[str] = 
data.get("ensembl.gene", "-") + all_ensembl_ids[i] = ( + ",".join(map(str, ensembl_id)) if isinstance(ensembl_id, list) and ensembl_id else ensembl_id + ) + all_gene_symbols[i] = str(data.get("symbol", "-")) + all_entrez_ids[i] = str(data.get("entrezgene", "-")) + all_sizes[i] = size if size > 0 else -1 - gene_info.at[i, "gene_symbol"] = data.get("symbol", "-") - gene_info.at[i, "entrez_gene_id"] = data.get("entrezgene", "-") - gene_info.at[i, "ensembl_gene_id"] = ",".join(ensembl_genes) if isinstance(ensembl_genes, list) else ensembl_genes - gene_info.at[i, "size"] = size if size > 0 else -1 + gene_info: pd.DataFrame = pd.DataFrame( + { + "ensembl_gene_id": all_ensembl_ids, + "gene_symbol": all_gene_symbols, + "entrez_gene_id": all_entrez_ids, + "size": all_sizes, + } + ) - gene_info["size"] = gene_info["size"].astype(str) # replace no-length values with "-" to match rows where every value is "-" - gene_info["size"] = gene_info["size"].replace("-1", "-") - gene_info = cast(pd.DataFrame, gene_info[~(gene_info == "-").all(axis=1)]) # remove rows where every value is "-" + # remove rows where every gene size value is -1 (not available) + gene_info = gene_info[~(gene_info == -1).all(axis=1)] gene_info["ensembl_gene_id"] = gene_info["ensembl_gene_id"].str.split(",") # extend lists into multiple rows gene_info = gene_info.explode(column=["ensembl_gene_id"]) - gene_info["size"] = gene_info["size"].astype(int) # we would set `entrez_gene_id` to int here as well, but not all ensembl ids are mapped to entrez ids, # and as a result, there are still "-" values in the entrez id column that cannot be converted to an integer From ab66599d1c280acaec9f56143721f7ab07fad738 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:05:20 -0600 Subject: [PATCH 08/26] chore: import required modules Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index a3e496fe..042e8145 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -1,34 +1,33 @@ from __future__ import annotations import itertools -import multiprocessing import sys -import time from collections import namedtuple from collections.abc import Callable -from concurrent.futures import Future, ProcessPoolExecutor, as_completed from dataclasses import dataclass, field from enum import Enum from pathlib import Path from typing import NamedTuple, TextIO, cast -import matplotlib.pyplot as plt +import anndata as ad +import boolean import numpy as np import numpy.typing as npt import pandas as pd -import seaborn as sns +import scanpy as sc import sklearn import sklearn.neighbors +from anndata.compat import XDataArray +from anndata.experimental.backed import Dataset2D from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol, gene_symbol_to_ensembl_and_gene_id from loguru import logger -from pandas import DataFrame +from scipy import sparse +from zfpkm import zFPKM, zfpkm_plot -from como.data_types import FilteringTechnique, LogLevel, PeakIdentificationParameters, RNAType -from como.density import density +from como.data_types import FilteringTechnique, LogLevel, RNAType from como.migrations import gene_info_migrations -from como.peak_finder import find_peaks from como.project import Config -from como.utils import _log_and_raise_error, _num_columns, _read_file, _set_up_logging +from como.utils import _log_and_raise_error, _read_file, _set_up_logging class _FilteringOptions(NamedTuple): 
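Patch 08 swaps COMO's in-house density and peak-finding helpers for the zfpkm package. A minimal sketch of the intended call pattern, assuming the Python port keeps the R zFPKM interface (a genes-by-samples FPKM DataFrame in, a same-shape z-score DataFrame out); the input file name here is hypothetical, and the -3 activity cutoff follows Hart et al. 2013:

import pandas as pd
from zfpkm import zFPKM, zfpkm_plot

# Hypothetical FPKM matrix: one row per gene, one column per sample.
fpkm = pd.read_csv("fpkm_matrix.csv", index_col="entrez_gene_id")
z_scores = zFPKM(fpkm)  # fit a Gaussian to each sample's log2(FPKM) density
zfpkm_plot(fpkm)  # optional: visually inspect the per-sample fits
expressed = (z_scores > -3).all(axis=1)  # genes considered active in every sample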
From 95654b342ac2683fb05a8e93ce7a9d8eabeaab2c Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:07:20 -0600 Subject: [PATCH 09/26] refactor: optional argument for fragment data Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 042e8145..32a84fed 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -149,12 +149,13 @@ def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDAr async def _build_matrix_results( + matrix: pd.DataFrame | sc.AnnData, *, - matrix: pd.DataFrame, gene_info: pd.DataFrame, metadata_df: pd.DataFrame, + fragment_df: pd.DataFrame | None, taxon: int, -) -> _ReadMatrixResults: +) -> tuple[NamedMetrics, list[int]]: """Read the counts matrix and returns the results. Arg: From dec37b0c4f1973f17f4ddc08c07208e99882768a Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:09:45 -0600 Subject: [PATCH 10/26] refactor: improve handling for single cell data Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 173 ++++++++++++++++++++++++++++------------ 1 file changed, 120 insertions(+), 53 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 32a84fed..e9cfc6e4 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -49,14 +49,14 @@ class LayoutMethod(Enum): class _StudyMetrics: study: str num_samples: int - count_matrix: pd.DataFrame - fragment_lengths: npt.NDArray[float] + count_matrix: pd.DataFrame | sc.AnnData + fragment_lengths: npt.NDArray[np.floating] | None sample_names: list[str] layout: list[LayoutMethod] entrez_gene_ids: npt.NDArray[int] gene_sizes: npt.NDArray[int] __normalization_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) - __z_score_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) + __z_score_matrix: pd.DataFrame | sc.AnnData | None = field(default=None) __high_confidence_entrez_gene_ids: list[str] = field(default_factory=list) def __post_init__(self): @@ -77,11 +77,11 @@ def normalization_matrix(self, value: pd.DataFrame) -> None: self.__normalization_matrix = value @property - def z_score_matrix(self) -> pd.DataFrame: + def z_score_matrix(self) -> pd.DataFrame | sc.AnnData | None: return self.__z_score_matrix @z_score_matrix.setter - def z_score_matrix(self, value: pd.DataFrame) -> None: + def z_score_matrix(self, value: pd.DataFrame | sc.AnnData) -> None: self.__z_score_matrix = value @property @@ -111,9 +111,10 @@ class _ReadMatrixResults(NamedTuple): def k_over_a(k: int, a: float) -> Callable[[npt.NDArray], bool]: - """Return a function that filters rows of an array based on the sum of elements being greater than or equal to A at least k times. + """Filter rows of an array based on the sum of elements being greater than or equal to A at least k times. - This code is based on the `kOverA` function found in R's `genefilter` package: https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/kOverA + This code is based on the `kOverA` function found in R's `genefilter` package + https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/kOverA :param k: The minimum number of times the sum of elements must be greater than or equal to A. :param a: The value to compare the sum of elements to. 
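An illustrative doctest for the predicate k_over_a returns (numpy imported as np, as in this module): at least k of the row's values must be >= a.

    >>> f = k_over_a(k=2, a=5.0)
    >>> f(np.array([1.0, 6.0, 7.0]))
    True
    >>> f(np.array([1.0, 6.0, 2.0]))
    False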
@@ -121,7 +122,7 @@ def k_over_a(k: int, a: float) -> Callable[[npt.NDArray], bool]:
     """
 
     def filter_func(row: npt.NDArray) -> bool:
-        return np.sum(row >= a) >= k
+        return bool(np.sum(row >= a) >= k)
 
     return filter_func
 
@@ -168,20 +169,35 @@ async def _build_matrix_results(
     """
-    conversion = await ensembl_to_gene_id_and_symbol(ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon)
-
-    # If all columns are empty, it is indicative that the incorrect taxon id was provided
-    if all(conversion[col].eq("-").all() for col in conversion.columns):
-        logger.critical(f"Conversion of Ensembl Gene IDs to Entrez IDs and Gene Symbols was empty - is '{taxon}' the correct taxon ID for this data?")
-
-    # 2025-NOV-3: commented out `conversion` types to evaluate if it can be skipped
-    # conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",")
-    # conversion = conversion.explode("ensembl_gene_id")
-    # conversion.reset_index(inplace=True, drop=True)
-    # conversion = conversion[conversion["entrez_gene_id"] != "-"]  # drop missing entrez IDs
-    # conversion["entrez_gene_id"] = conversion["entrez_gene_id"].astype(int)  # float32 is needed because np.nan is a float
-
-    # merge_on should contain at least one of "ensembl_gene_id", "entrez_gene_id", or "gene_symbol"
-    merge_on: list[str] = list(set(matrix.columns).intersection(conversion.columns))
-    if not merge_on:
+    if isinstance(matrix, sc.AnnData):
+        matrix.var = matrix.var.reset_index(drop=False, names=["gene_symbol"])
+        conversion = await gene_symbol_to_ensembl_and_gene_id(symbols=matrix.var["gene_symbol"].tolist(), taxon=taxon)
+    else:
+        if "ensembl_gene_id" not in matrix.columns:
+            _log_and_raise_error(
+                message="'ensembl_gene_id' column not found in the provided DataFrame.",
+                error=ValueError,
+                level=LogLevel.CRITICAL,
+            )
+        conversion: pd.DataFrame = await ensembl_to_gene_id_and_symbol(
+            ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon
+        )
+        # If the entrez gene id column is empty, it is indicative that the incorrect taxon id was provided
+        if conversion["entrez_gene_id"].eq("-").all():
+            logger.critical(
+                f"Conversion of Ensembl Gene IDs to Entrez IDs and Gene Symbols was empty - "
+                f"is '{taxon}' the correct taxon ID for this data?"
+            )
+    conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",")
+    conversion = conversion.explode("ensembl_gene_id")
+    conversion = conversion[conversion["entrez_gene_id"] != "-"]
+    conversion["entrez_gene_id"] = conversion["entrez_gene_id"].astype(int)
+    conversion = conversion.reset_index(drop=False)
+
+    # conversion_merge_on should contain at least one of "ensembl_gene_id", "entrez_gene_id", or "gene_symbol"
+    conversion_merge_on: list[str] = list(
+        set(matrix.columns if isinstance(matrix, pd.DataFrame) else matrix.var.columns) & set(conversion.columns)
+    )
+    if not conversion_merge_on:
         _log_and_raise_error(
             (
                 "No columns to merge on, unable to find at least one of `ensembl_gene_id`, `entrez_gene_id`, or `gene_symbol`. "
             ),
             error=ValueError,
             level=LogLevel.ERROR,
         )
 
-    if "entrez_gene_id" in matrix.columns:
-        matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int)
-    matrix = matrix.merge(conversion, on=merge_on, how="left")
 
-    # drop rows that have `0` in `entrez_gene_id` column
-    # matrix = matrix[matrix["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False)
-    # gene_info = gene_info[gene_info["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False)
+    if isinstance(matrix, pd.DataFrame):
+        if "entrez_gene_id" in matrix.columns:
+            matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int)
+        matrix = matrix.merge(conversion, on=conversion_merge_on, how="left")
+    elif isinstance(matrix, sc.AnnData):
+        if "entrez_gene_id" in matrix.var.columns:
+            matrix.var["entrez_gene_id"] = matrix.var["entrez_gene_id"].astype(int)
+        matrix.var = matrix.var.merge(conversion, on=conversion_merge_on, how="left")
 
     gene_info = gene_info_migrations(gene_info)
-    # gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int)
+    gene_info = gene_info[gene_info["entrez_gene_id"] != "-"]
+    gene_info.loc[:, "entrez_gene_id"] = gene_info.loc[:, "entrez_gene_id"].astype(int)
 
-    counts_matrix = matrix.merge(
-        gene_info[["entrez_gene_id", "ensembl_gene_id"]],
-        on=["entrez_gene_id", "ensembl_gene_id"],
-        how="inner",
+    gene_info_merge_on: list[str] = list(
+        set(matrix.columns if isinstance(matrix, pd.DataFrame) else matrix.var.columns) & set(gene_info.columns)
     )
-    gene_info = gene_info.merge(
-        counts_matrix[["entrez_gene_id", "ensembl_gene_id"]],
-        on=["entrez_gene_id", "ensembl_gene_id"],
-        how="inner",
-    )
+    if "entrez_gene_id" in gene_info_merge_on:
+        gene_info = gene_info[~gene_info["entrez_gene_id"].isna()]
+        gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int)
+
+        if isinstance(matrix, pd.DataFrame):
+            matrix = matrix[~matrix["entrez_gene_id"].isna()]
+            matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int)
+        elif isinstance(matrix, sc.AnnData):
+            if isinstance(matrix.var, XDataArray):
+                raise TypeError("Expected matrix.var object to be 'pd.DataFrame', got 'anndata.compat.XDataArray'")
+            matrix = matrix[:, ~matrix.var["entrez_gene_id"].isna()]
+            matrix.var["entrez_gene_id"] = matrix.var["entrez_gene_id"].astype(int)
+
+    if isinstance(matrix, pd.DataFrame):
+        matrix = matrix.merge(gene_info, on=gene_info_merge_on, how="inner")
+    elif isinstance(matrix, sc.AnnData):
+        if not isinstance(matrix.var, pd.DataFrame):
+            raise TypeError(f"Expected matrix.var object to be 'pd.DataFrame', got '{type(matrix.var)}'")
+        matrix.var["original_index"] = matrix.var.index
+        new_var = matrix.var.merge(gene_info, on=gene_info_merge_on, how="inner")
+        new_matrix = matrix[:, new_var["original_index"]].copy()
+        new_matrix.var = new_var
+        new_matrix.var = new_matrix.var.drop(columns=["original_index"])
+        new_matrix.var = new_matrix.var.reset_index(drop=True)
+        matrix = new_matrix
+
+    non_duplicates = ~matrix.var.duplicated(subset=matrix.var.columns, keep="first")
+    matrix = matrix[:, non_duplicates].copy()
 
-    entrez_gene_ids: npt.NDArray[int] = gene_info["entrez_gene_id"].to_numpy()
     metrics: NamedMetrics = {}
     for study in metadata_df["study"].unique():
-        study_sample_names = metadata_df[metadata_df["study"] == study]["sample_name"].tolist()
-        layouts = metadata_df[metadata_df["study"] == study]["layout"].tolist()
+        study_sample_names: list[str] = metadata_df[metadata_df["study"] == study]["sample_name"].tolist()
+        layouts: list[str] = 
metadata_df[metadata_df["study"] == study]["layout"].tolist() + + if isinstance(matrix, pd.DataFrame): + subset = matrix.set_index(keys=["entrez_gene_id"], drop=True) + subset = subset[subset.columns.intersection(study_sample_names)] + subset.index = subset.index.astype(int) + entrez_gene_ids = subset.index.to_numpy(copy=False) + gene_sizes = matrix["size"].to_numpy(dtype=int, copy=False) + elif isinstance(matrix, sc.AnnData): + # matrix.var = matrix.var.set_index(keys=["entrez_gene_id"], drop=True) + subset = matrix[matrix.obs_names.intersection(study_sample_names)] + entrez_gene_ids = subset.var["entrez_gene_id"].to_numpy(dtype=int) + gene_sizes = subset.var["size"].to_numpy(dtype=int) + else: + _log_and_raise_error( + message=f"Matrix must be a pandas DataFrame or scanpy AnnData object, got: '{type(matrix)}'.", + error=TypeError, + level=LogLevel.CRITICAL, + ) + + frag_lengths = None + if fragment_df is not None: + frag_lengths = fragment_df["effective_length"].to_numpy(dtype=np.float64) metrics[study] = _StudyMetrics( - count_matrix=cast(pd.DataFrame, counts_matrix[counts_matrix.columns.intersection(study_sample_names)]), - fragment_lengths=metadata_df[metadata_df["study"] == study]["fragment_length"].values.astype(float), + count_matrix=subset, + fragment_lengths=frag_lengths, sample_names=study_sample_names, layout=[LayoutMethod(layout) for layout in layouts], num_samples=len(study_sample_names), entrez_gene_ids=entrez_gene_ids, - gene_sizes=gene_info["size"].values.astype(int), + gene_sizes=gene_sizes, study=study, ) - metrics[study].fragment_lengths[np.isnan(metrics[study].fragment_lengths)] = 0 - metrics[study].count_matrix.index = pd.Index(entrez_gene_ids, name="entrez_gene_id") - return _ReadMatrixResults(metrics=metrics, entrez_gene_ids=gene_info["entrez_gene_id"].tolist()) + return metrics, gene_info["entrez_gene_id"].astype(int).tolist() def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: @@ -243,13 +302,21 @@ def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: Returns: A dictionary of study metrics with TPM calculated. 
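    Worked example (illustrative): with counts of 10 and 90 for two genes of
    length 1 and 9 (arbitrary units), both length-normalized values are ~10,
    so after each column is rescaled to sum to 1e6, each gene receives
    ~500,000 TPM.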
""" - for sample in metrics: - count_matrix = metrics[sample].count_matrix - - gene_sizes = pd.Series(metrics[sample].gene_sizes, index=count_matrix.index) - adjusted_counts = count_matrix.add(1e-6) + for sample, metric in metrics.items(): + if isinstance(metric.count_matrix, sc.AnnData): + adata = metric.count_matrix + gene_sizes = pd.Series(metric.gene_sizes, index=adata.var_names) + counts_df = pd.DataFrame( + data=np.asarray(adata.X.toarray() if sparse.issparse(adata.X) else adata.X), + index=adata.var_names, + columns=adata.obs_names, + ) + else: + counts_df = metric.count_matrix + gene_sizes = pd.Series(metric.gene_sizes) - tpm_matrix = adjusted_counts.divide(gene_sizes, axis=0) # (count + 1) / gene_length + adjusted_counts = counts_df.add(1e-6) + tpm_matrix = adjusted_counts.div(gene_sizes, axis=0) # (count + 1) / gene_length tpm_matrix = tpm_matrix.div(tpm_matrix.sum(axis=0), axis=1) # normalize by total tpm_matrix = tpm_matrix.mul(1e6) # scale to per-million metrics[sample].normalization_matrix = tpm_matrix From fc1d45f0cb30359979e5a621c8ab88c57b637c1c Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:10:00 -0600 Subject: [PATCH 11/26] chore: generalize data type input Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index e9cfc6e4..3ba71f6d 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -53,8 +53,8 @@ class _StudyMetrics: fragment_lengths: npt.NDArray[np.floating] | None sample_names: list[str] layout: list[LayoutMethod] - entrez_gene_ids: npt.NDArray[int] - gene_sizes: npt.NDArray[int] + entrez_gene_ids: npt.NDArray[np.integer] + gene_sizes: npt.NDArray[np.integer] __normalization_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) __z_score_matrix: pd.DataFrame | sc.AnnData | None = field(default=None) __high_confidence_entrez_gene_ids: list[str] = field(default_factory=list) From e1505d153bff3254ef3d7e1a559a41da0ce72080 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:10:11 -0600 Subject: [PATCH 12/26] chore: ruff formatting Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 3ba71f6d..c8c2e207 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -146,7 +146,11 @@ def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDAr level=LogLevel.CRITICAL, ) - return data.apply(filter_func, axis=1).values if isinstance(data, pd.DataFrame) else np.apply_along_axis(filter_func, axis=1, arr=data) + return ( + data.apply(filter_func, axis=1).to_numpy() + if isinstance(data, pd.DataFrame) + else np.apply_along_axis(filter_func, axis=1, arr=data) + ) async def _build_matrix_results( @@ -159,15 +163,14 @@ async def _build_matrix_results( ) -> tuple[NamedMetrics, list[int]]: """Read the counts matrix and returns the results. 
From 849ba2e64da91e05cbf7ffd5eb7aa13fb80a38fc Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:11:52 -0600
Subject: [PATCH 13/26] chore: simplify FPKM/RPKM calculations; properly compute per-gene FPKM scores

Signed-off-by: Josh Loecker

---
 main/como/rnaseq_gen.py | 62 +++++++++++------------------------------
 1 file changed, 17 insertions(+), 45 deletions(-)

diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index c8c2e207..ce8de24b 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -338,53 +338,25 @@ def _calculate_fpkm(metrics: NamedMetrics, scale: float = 1e6) -> NamedMetrics:
         A dictionary of study metrics with FPKM calculated.
     """
     for study in metrics:
-        matrix_values = []
-
-        for sample in range(metrics[study].num_samples):
-            layout = metrics[study].layout[sample]
-            count_matrix: npt.NDArray[float] = metrics[study].count_matrix.iloc[:, sample].values
-            gene_lengths = (
-                metrics[study].fragment_lengths[sample].astype(float)
-                if layout == LayoutMethod.paired_end
-                else metrics[study].gene_sizes.astype(float)
+        matrix_values: dict[str, npt.NDArray[np.floating]] = {}
+        count_matrix = metrics[study].count_matrix
+        if not isinstance(count_matrix, pd.DataFrame):
+            _log_and_raise_error(
+                message="FPKM cannot be performed on scanpy.AnnData objects!",
+                error=TypeError,
+                level=LogLevel.CRITICAL,
             )
-            gene_lengths_kb = gene_lengths / 1000.0
-
-            match layout:
-                case LayoutMethod.paired_end:  # FPKM
-                    total_fragments = count_matrix.sum(axis=0)
-                    if total_fragments == 0:
-                        fragments_per_kilobase_million = np.nan
-                    else:
-                        counts_per_million = total_fragments / scale
-                        fragments_per_kilobase = count_matrix / gene_lengths_kb
-                        fragments_per_kilobase_million = fragments_per_kilobase / counts_per_million
-                    matrix_values.append(fragments_per_kilobase_million)
-                case LayoutMethod.single_end:  # RPKM
-                    reads_per_kilobase = count_matrix / gene_lengths_kb
-                    total_reads = count_matrix.sum(axis=0)
-                    counts_per_million = total_reads / scale
-                    reads_per_kilobase_million = reads_per_kilobase / counts_per_million
-                    matrix_values.append(reads_per_kilobase_million)
-                case _:
-                    _log_and_raise_error(
-                        (
-                            f"Invalid normalization method specified ''. "
" - f"Must be one of '{LayoutMethod.paired_end.value}' or '{LayoutMethod.single_end.value}'." - ), - error=ValueError, - level=LogLevel.ERROR, - ) - - # Transpose is needed because values were appended as rows - fpkm_matrix = pd.DataFrame(matrix_values).T - fpkm_matrix.index = metrics[study].count_matrix.index - fpkm_matrix.columns = metrics[study].sample_names - - fpkm_matrix = fpkm_matrix[~pd.isna(fpkm_matrix)] - metrics[study].normalization_matrix = fpkm_matrix - metrics[study].normalization_matrix.columns = metrics[study].count_matrix.columns + study_counts = count_matrix.to_numpy(dtype=int, copy=False) + for i in range(metrics[study].num_samples): + layout = metrics[study].layout[i] + sample_name = metrics[study].sample_names[i] + length = metrics[study].fragment_lengths if layout == LayoutMethod.paired_end else metrics[study].gene_sizes + counts = study_counts[:, i] + mapped_reads = counts.sum() + matrix_values[sample_name] = ((counts * 1e9) / (length * mapped_reads)).astype(int) + + metrics[study].normalization_matrix = pd.DataFrame(matrix_values, index=metrics[study].entrez_gene_ids) return metrics From 3234413e2bdd7361e4b10e560d712f6874a18793 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:12:28 -0600 Subject: [PATCH 14/26] refactor: move zfpkm calculation to external package Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 169 +--------------------------------------- 1 file changed, 3 insertions(+), 166 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index ce8de24b..7f12b057 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -360,171 +360,6 @@ def _calculate_fpkm(metrics: NamedMetrics, scale: float = 1e6) -> NamedMetrics: return metrics -def _zfpkm_calculation(col_fpkm: pd.Series, min_peak_height: float, min_peak_distance: int): - """ZFPKM Transformations. - - This function reproduces R's `zFPKM::zFPKM` function. - - References: - 1) zFPKM implementation in R: https://github.com/ronammar/zFPKM - 2) zFPKM publication: https://doi.org/10.1186/1471-2164-14-778 - - Args: - col_fpkm: The raw FPKM values to perform zFPKM on - min_peak_distance: Minimum distance between peaks; passed on to `find_peaks` function - min_peak_height: Minimum height of peaks; passed on to `find_peaks` function - - Returns: - A named tuple containing the zFPKM values, density estimate, mean (mu), standard deviation, and maximum FPKM value. - """ - # Ignore np.log2(0) errors; we know this will happen, and are removing non-finite values in the density calculation - # This is required in order to match R's zFPKM calculations, as R's `density` function removes NA values. 
- with np.errstate(divide="ignore", invalid="ignore"): - log2fpkm: npt.NDArray[float] = np.log2(col_fpkm.values).astype(float) - d = density(log2fpkm) - - peaks: pd.DataFrame = find_peaks(d.y_grid, min_peak_height=min_peak_height, min_peak_distance=min_peak_distance) - peak_positions = d.x_grid[peaks["peak_idx"].astype(int).tolist()] - - sd = 1.0 - mu = 0.0 - fpkm_at_mu = 0.0 - if peak_positions.size > 0: - mu = float(peak_positions.max()) - u = float(log2fpkm[log2fpkm > mu].mean()) - fpkm_at_mu = float(d.y_grid[int(peaks.loc[np.argmax(peak_positions), "peak_idx"])]) - sd = float((u - mu) * np.sqrt(np.pi / 2)) - zfpkm = pd.Series((log2fpkm - mu) / sd, dtype=float, name=col_fpkm.name, index=col_fpkm.index) - return _ZFPKMResult(zfpkm=zfpkm, density=Density(d.x_grid, d.y_grid), mu=mu, std_dev=sd, fpkm_at_mu=fpkm_at_mu) - - -def zfpkm_transform( - fpkm_df: pd.DataFrame, - min_peak_height: float = 0.02, - min_peak_distance: int = 1, - update_every_percent: float = 0.1, - remove_na: bool = True, -) -> tuple[dict[str, _ZFPKMResult], DataFrame]: - """Perform zFPKM calculation/transformation. - - Args: - fpkm_df: A DataFrame containing FPKM values with genes as rows and samples as columns. - min_peak_height: Minimum height of peaks; passed on to `find_peaks` function. - min_peak_distance: Minimum distance between peaks; passed on to `find_peaks` function. - update_every_percent: Frequency of progress updates as a decimal between 0 and 1 (e.g., 0.1 for every 10%). - remove_na: Whether to remove NaN & blank values from the input DataFrame before processing. - - Returns: - A tuple containing: - - A dictionary of intermediate results for each sample. - - A DataFrame of zFPKM values with the same shape as the input fpkm_df. - """ - if update_every_percent > 1: - logger.warning(f"update_every_percent should be a decimal value between 0 and 1; got: {update_every_percent} - will convert to percentage") - update_every_percent /= 100 - - total_samples = _num_columns(fpkm_df) - update_per_step: int = int(np.ceil(total_samples * update_every_percent)) - - # Get at least 1 core and at most cpu_count() - 2 - cores = max(min(multiprocessing.cpu_count() - 2, total_samples), 1) - logger.debug(f"zFPKM transforming {len(fpkm_df.columns)} sample(s) containing {len(fpkm_df):,} genes(s) using {cores} core(s)") - logger.debug(f"Will update every {update_per_step:,} steps (~{update_every_percent:.1%} of {total_samples:,})") - - chunk_time = time.time() - start_time = time.time() - log_padding = len(str(f"{total_samples:,}")) - zfpkm_series: list[pd.Series] = [] - results: dict[str, _ZFPKMResult] = {} - - slim_fpkm_df: pd.DataFrame = cast(pd.DataFrame, fpkm_df[fpkm_df.index != "-"] if remove_na else fpkm_df) - with ProcessPoolExecutor(max_workers=cores) as pool: - futures: list[Future[_ZFPKMResult]] = [ - pool.submit( - _zfpkm_calculation, - col_fpkm=fpkm_df[column], - min_peak_height=min_peak_height, - min_peak_distance=min_peak_distance, - ) - for column in slim_fpkm_df - ] - - for i, future in enumerate(as_completed(futures)): - result = future.result() - key = str(result.zfpkm.name) - results[key] = result - zfpkm_series.append(result.zfpkm) - - if i != 0 and ((i + 1) % update_per_step == 0 or (i + 1) == total_samples): - current_time = time.time() - chunk = current_time - chunk_time - total_time = current_time - start_time - chunk_num = f"{i + 1:,}" - logger.debug( - f"Processed {chunk_num:>{log_padding}} of {total_samples:,} - " - f"chunk took {chunk:.1f} seconds - " - f"running for {total_time:.1f} seconds" - ) - 
-                chunk_time = current_time
-
-    zfpkm_df = pd.DataFrame({series.name: series for series in zfpkm_series}, index=fpkm_df.index)
-    return results, zfpkm_df
-
-
-def zfpkm_plot(results: dict[str, _ZFPKMResult], *, output_png_dirpath: Path, plot_xfloor: int = -4, subplot_titles: bool = True) -> None:
-    """Plot the log2(FPKM) density and fitted Gaussian for each sample.
-
-    Args:
-        results: A dictionary of intermediate results from zfpkm_transform.
-        output_png_dirpath: Output directory location
-        subplot_titles: Whether to display facet titles (sample names).
-        plot_xfloor: Lower limit for the x-axis.
-        subplot_titles: Whether to display facet titles (sample names).
-
-    """
-    to_concat: list[pd.DataFrame] = []
-    for name, result in results.items():
-        stddev: float = float(result.std_dev)
-        x: npt.NDArray[float] = result.density.x.flatten()
-        y: npt.NDArray[float] = result.density.y.flatten()
-
-        fitted: npt.NDArray[float] = np.exp(-0.5 * ((x - result.mu) / stddev) ** 2) / (stddev * np.sqrt(2 * np.pi))
-        fpkm_at_mu: float = result.fpkm_at_mu
-        max_fitted: float = float(fitted.max())
-        scale_fitted: float = fitted * fpkm_at_mu / max_fitted
-        to_concat.append(pd.DataFrame({"sample_name": name, "log2fpkm": x, "fpkm_density": y, "zfpkm_density": scale_fitted}))
-
-    mega_df = pd.concat(to_concat, ignore_index=True)
-    mega_df.columns = pd.Series(data=["sample_name", "log2fpkm", "fpkm_density", "zfpkm_density"])
-    mega_df = mega_df.melt(id_vars=["log2fpkm", "sample_name"], var_name="source", value_name="density")
-
-    fig: plt.Figure
-    axes: list[plt.Axes]
-    fig, axes = plt.subplots(nrows=len(results), ncols=1, figsize=(8, 4 * len(results)))
-    if len(results) == 1:
-        axes = [axes]
-
-    for i, sample_name in enumerate(results):
-        sample_data = mega_df[mega_df["sample_name"] == sample_name]
-        axis = axes[i]
-
-        for source_type in sample_data["source"].unique():
-            group = sample_data[sample_data["source"] == source_type]
-            sns.lineplot(data=group, x="log2fpkm", y="density", label=source_type, ax=axis)
-
-        if subplot_titles:
-            axis.set_title(f"Sample: {sample_name}")
-        axis.set_xlim(plot_xfloor, sample_data["log2fpkm"].max())
-        axis.set_xlabel("log2(FPKM)")
-        axis.set_ylabel("density [scaled]")
-        axis.legend(title="Source")
-
-    output_png_dirpath.mkdir(parents=True, exist_ok=True)
-    sample_name: str = next(iter(results.keys()))[:-2]  # Go from 'control1hr_S1R1' to 'control1hr_S1'
-    plt.tight_layout()
-    plt.savefig(Path(output_png_dirpath, f"{sample_name}_zfpkm_density.png"))
-
-
 def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics:
     """Calculate the z-score for each sample in the metrics dictionary.
 
@@ -536,7 +371,9 @@
     """
     for sample in metrics:
         log_matrix = np.log(metrics[sample].normalization_matrix)
-        z_matrix = pd.DataFrame(data=sklearn.preprocessing.scale(log_matrix, axis=1), columns=metrics[sample].sample_names)
+        z_matrix = pd.DataFrame(
+            data=sklearn.preprocessing.scale(log_matrix, axis=1), columns=metrics[sample].sample_names
+        )
         metrics[sample].z_score_matrix = z_matrix
 
     return metrics

From f90c38837478f288f6d8e4522345b9cd48ac9420 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:13:04 -0600
Subject: [PATCH 15/26] chore: use np.bool for boolean array

Signed-off-by: Josh Loecker

---
 main/como/rnaseq_gen.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index 7f12b057..32be1a87 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -472,17 +472,19 @@ def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringO
         min_func = k_over_a(min_samples, 0.9)
         top_func = k_over_a(top_samples, 0.9)
 
-        min_genes: npt.NDArray[bool] = genefilter(boolean_expression, min_func)
-        top_genes: npt.NDArray[bool] = genefilter(boolean_expression, top_func)
+        min_genes: npt.NDArray[np.bool] = genefilter(boolean_expression, min_func)
+        top_genes: npt.NDArray[np.bool] = genefilter(boolean_expression, top_func)
 
         # Only keep `entrez_gene_ids` that pass `min_genes`
         metric.entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, min_genes, strict=True) if keep]
-        metric.gene_sizes = np.array(gene for gene, keep in zip(gene_size, min_genes, strict=True) if keep)
+        metric.gene_sizes = np.asarray([gene for gene, keep in zip(gene_size, min_genes, strict=True) if keep])
         metric.count_matrix = cast(pd.DataFrame, metric.count_matrix.iloc[min_genes, :])
         metric.normalization_matrix = cast(pd.DataFrame, metrics[sample].normalization_matrix.iloc[min_genes, :])
 
         keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes, strict=True) if keep]
-        metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, keep_top_genes, strict=True) if keep]
+        metric.high_confidence_entrez_gene_ids = keep_top_genes
 
         metrics = calculate_z_score(metrics)

From 8253a7d34d96645f89869d05c10d1120783b537f Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:13:19 -0600
Subject: [PATCH 16/26] chore: ruff formatting

Signed-off-by: Josh Loecker

---
 main/como/rnaseq_gen.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index 32be1a87..8b10d480 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -466,8 +466,10 @@ def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringO
         top_samples = round(n_top * len(tpm_matrix.columns))
 
         tpm_quantile = tpm_matrix[tpm_matrix > 0]
-        quantile_cutoff = np.quantile(a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0)  # Compute quantile across columns
-        boolean_expression = pd.DataFrame(data=tpm_matrix > quantile_cutoff, index=tpm_matrix.index, columns=tpm_matrix.columns).astype(int)
+        quantile_cutoff = np.quantile(a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0)  # quantile across columns
+        boolean_expression = pd.DataFrame(
+            data=tpm_matrix > quantile_cutoff, index=tpm_matrix.index, columns=tpm_matrix.columns
+        ).astype(int)
 
         min_func = k_over_a(min_samples, 0.9)
         top_func = k_over_a(top_samples, 0.9)
 
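The two tpm_quantile_filter hunks above lean on k_over_a and genefilter, which mirror Bioconductor's genefilter::kOverA predicate: a gene survives when at least k of its per-sample values exceed the threshold a. A small self-contained sketch under that assumption (the data is synthetic and the names are illustrative, not the project's API):

    import numpy as np
    import pandas as pd


    def k_over_a(k: int, a: float):
        """Return a row predicate: True when at least `k` values exceed `a`."""

        def _filter(row: np.ndarray) -> bool:
            return int(np.sum(row > a)) >= k

        return _filter


    # genes x samples zFPKM-like values; keep genes above -3 in at least 2 samples
    zfpkm = pd.DataFrame(np.random.default_rng(0).normal(-2.0, 1.0, size=(100, 4)))
    keep = zfpkm.apply(k_over_a(2, -3.0), axis=1).to_numpy()
    filtered = zfpkm.loc[keep]
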
From c52d2e845add7166cad1367b468b41badfd18830 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:13:47 -0600
Subject: [PATCH 17/26] feat: allow setting negative zFPKM results to 0

Signed-off-by: Josh Loecker

---
 main/como/rnaseq_gen.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index 8b10d480..4b482a82 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -502,6 +502,7 @@ def zfpkm_filter(
     min_peak_height: float,
     min_peak_distance: int,
     output_png_dirpath: Path | None,
+    force_negative_to_zero: bool = False,
 ) -> NamedMetrics:
     """Apply zFPKM filtering to the FPKM matrix for a given sample.
 
@@ -513,6 +514,8 @@
         min_peak_height: Minimum peak height for zFPKM peak identification.
         min_peak_distance: Minimum peak distance for zFPKM peak identification.
         output_png_dirpath: Optional directory path to save zFPKM plots.
+        force_negative_to_zero: Should negative values be forcibly set to 0?
+            This could happen as a result of normalization producing negative near-zero values (e.g., -0.001).
 
     Returns:
         A dictionary of filtered study metrics.

From e2e6350c30f9a432e56988a6b162d6576a80e108 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:14:29 -0600
Subject: [PATCH 18/26] feat: simplify by using the external zfpkm package

Signed-off-by: Josh Loecker

---
 main/como/rnaseq_gen.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index 4b482a82..80e11df6 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -529,34 +529,38 @@ def zfpkm_filter(
     metric: _StudyMetrics
     # if fpkm was not calculated, the normalization matrix will be empty; collect the count matrix instead
     matrix = metric.count_matrix if metric.normalization_matrix.empty else metric.normalization_matrix
+    if not isinstance(matrix, pd.DataFrame):
+        raise TypeError(f"Expected a pandas.DataFrame for zFPKM filtering, got: '{type(matrix)}'")
 
     # TODO: 2025-OCT-31: Re-evaluate whether to remove rows with all 0 counts
     # matrix = matrix[matrix.sum(axis=1) > 0]  # remove rows (genes) that have no counts across all samples
-    results, zfpkm_df = zfpkm_transform(
-        fpkm_df=matrix,
-        min_peak_height=min_peak_height,
-        min_peak_distance=min_peak_distance,
-    )
-    zfpkm_df[(matrix == 0) | (zfpkm_df.isna())] = -4
+    matrix.replace(to_replace=np.nan, value=0.0, inplace=True)
+    if force_negative_to_zero:
+        matrix[matrix < 0] = 0.0
+
+    zfpkm_df, zfpkm_results = zFPKM(matrix)
 
-    if len(results) > 10 and not force_zfpkm_plot:
+    if len(zfpkm_results) > 25 and not force_zfpkm_plot:
         logger.warning(
-            "Not plotting zFPKM results because more than 10 plots would be created. "
+            "Not plotting zFPKM results because more than 25 plots would be created. 
" "If you would like to plot them anyway, set 'force_zfpkm_plot' to True" ) elif output_png_dirpath is None: logger.critical("Output zFPKM PNG filepath is None, set a path to plot zFPKM graphs") else: - zfpkm_plot(results, output_png_dirpath=output_png_dirpath) + sample_name = zfpkm_results[0].name.split("_")[0] # go from 'control1hr_S1R1' to 'control1hr' + zfpkm_plot(zfpkm_results, save_filepath=output_png_dirpath / f"{sample_name}_zfpkm_density.png") metric.z_score_matrix = zfpkm_df # determine which genes are expressed min_samples = round(min_sample_expression * len(zfpkm_df.columns)) min_func = k_over_a(min_samples, cut_off) - min_genes: npt.NDArray[bool] = genefilter(zfpkm_df, min_func) - metric.entrez_gene_ids = [gene for gene, keep in zip(zfpkm_df.index, min_genes, strict=True) if keep] + min_genes: npt.NDArray[np.bool] = genefilter(zfpkm_df, min_func) + metric.entrez_gene_ids = np.asarray( + [g_id for g_id, keep in zip(zfpkm_df.index, min_genes, strict=True) if keep], dtype=int + ) # determine which genes are confidently expressed top_samples = round(high_confidence_sample_expression * len(zfpkm_df.columns)) @@ -578,6 +582,7 @@ def filter_counts( zfpkm_min_peak_height: float, zfpkm_min_peak_distance: int, output_zfpkm_plot_dirpath: Path | None = None, + force_negative_to_zero: bool = False, ) -> NamedMetrics: """Filter the count matrix based on the specified technique. @@ -591,6 +596,8 @@ def filter_counts( zfpkm_min_peak_height: Minimum peak height for zFPKM peak identification. zfpkm_min_peak_distance: Minimum peak distance for zFPKM peak identification. output_zfpkm_plot_dirpath: Optional filepath to save the zFPKM plot. + :param force_negative_to_zero: Should negative values be forcibly set to 0? + This could happen as a result of normalization producing negative near-zero values (e.g., -0.001) Returns: A dictionary of filtered study metrics. @@ -609,6 +616,7 @@ def filter_counts( min_peak_height=zfpkm_min_peak_height, min_peak_distance=zfpkm_min_peak_distance, output_png_dirpath=output_zfpkm_plot_dirpath, + force_negative_to_zero=force_negative_to_zero, ) case FilteringTechnique.UMI: # UMI filtering is the same as zFPKM filtering without calculating FPKM From 2ad9887222b578e3ca5b0fd9720f380804c23a27 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:17:29 -0600 Subject: [PATCH 19/26] feat: allow providing the fragment size filepath (from rnaseq preprocessing) Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 109 ++++++++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 42 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 80e11df6..104e68cb 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -586,16 +586,17 @@ def filter_counts( ) -> NamedMetrics: """Filter the count matrix based on the specified technique. - Args: - context_name: The name of the context being processed. - metrics: A dictionary of study metrics to filter. - technique: The filtering technique to use. - filtering_options: Options for filtering the count matrix. - prep: The RNA preparation type. - force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples. - zfpkm_min_peak_height: Minimum peak height for zFPKM peak identification. - zfpkm_min_peak_distance: Minimum peak distance for zFPKM peak identification. - output_zfpkm_plot_dirpath: Optional filepath to save the zFPKM plot. + :param context_name: The name of the context being processed. 
+ :param metrics: A dictionary of study metrics to filter. + :param technique: The filtering technique to use. + :param filtering_options: Options for filtering the count matrix. + :param prep: The RNA preparation type. + :param force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples. + :param zfpkm_min_peak_height: Minimum peak height for zFPKM peak identification. + :param zfpkm_min_peak_distance: Minimum peak distance for zFPKM peak identification. + :param umi_target_sum: The target sum for UMI normalization. + :param umi_perform_normalization: Whether to perform normalization before UMI filtering. + :param output_zfpkm_plot_dirpath: Optional filepath to save the zFPKM plot. :param force_negative_to_zero: Should negative values be forcibly set to 0? This could happen as a result of normalization producing negative near-zero values (e.g., -0.001) @@ -642,6 +643,7 @@ async def _process( rnaseq_matrix_filepath: Path, metadata_df: pd.DataFrame, gene_info_df: pd.DataFrame, + fragment_df: pd.DataFrame | None, prep: RNAType, taxon: int, replicate_ratio: float, @@ -656,18 +658,12 @@ async def _process( output_boolean_activity_filepath: Path, output_zscore_normalization_filepath: Path, output_zfpkm_plot_dirpath: Path | None, + force_negative_to_zero: bool, ): """Save the results of the RNA-Seq tests to a CSV file.""" output_boolean_activity_filepath.parent.mkdir(parents=True, exist_ok=True) - rnaseq_matrix: pd.DataFrame = await _read_file(rnaseq_matrix_filepath, h5ad_as_df=True) - - if rnaseq_matrix_filepath.suffix == ".h5ad": - conversion = await gene_symbol_to_ensembl_and_gene_id(symbols=rnaseq_matrix["gene_symbol"].tolist(), taxon=taxon) - conversion.reset_index(inplace=True) - rnaseq_matrix = rnaseq_matrix.merge(conversion, how="left", on="gene_symbol") - rnaseq_matrix.replace(to_replace=pd.NA, value="-") - + rnaseq_matrix: pd.DataFrame | sc.AnnData = _read_file(rnaseq_matrix_filepath, h5ad_as_df=False) filtering_options = _FilteringOptions( replicate_ratio=replicate_ratio, batch_ratio=batch_ratio, @@ -676,16 +672,14 @@ async def _process( high_batch_ratio=high_batch_ratio, ) - read_counts_results: _ReadMatrixResults = await _build_matrix_results( - matrix=rnaseq_matrix, + metrics, entrez_gene_ids = await _build_matrix_results( + rnaseq_matrix, gene_info=gene_info_df, metadata_df=metadata_df, + fragment_df=fragment_df, taxon=taxon, ) - - metrics = read_counts_results.metrics - - metrics: NamedMetrics = filter_counts( + metrics = filter_counts( context_name=context_name, metrics=metrics, technique=technique, @@ -695,25 +689,41 @@ async def _process( zfpkm_min_peak_height=zfpkm_min_peak_height, zfpkm_min_peak_distance=zfpkm_min_peak_distance, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, + force_negative_to_zero=force_negative_to_zero, ) - merged_zscore_df = pd.concat([m.z_score_matrix[m.z_score_matrix.index != "-"] for m in metrics.values()], axis="columns") - merged_zscore_df.fillna(-4, inplace=True) - expressed_genes: list[str] = list(itertools.chain.from_iterable(m.entrez_gene_ids for m in metrics.values())) - top_genes: list[str] = list(itertools.chain.from_iterable(m.high_confidence_entrez_gene_ids for m in metrics.values())) + if isinstance(rnaseq_matrix, pd.DataFrame): + merged_zscores = pd.concat( + [m.z_score_matrix[m.z_score_matrix.index != "-"] for m in metrics.values()], axis="columns" + ) - # If any of the normalization metrics are not empty, write the normalized metrics to disk - if not all(metric.normalization_matrix.empty for metric 
in metrics.values()): - merged_zscore_df: pd.DataFrame = merged_zscore_df.reindex(columns=sorted(merged_zscore_df)) - merged_zscore_df.to_csv(output_zscore_normalization_filepath, index=True) - logger.success(f"Wrote z-score normalization matrix to {output_zscore_normalization_filepath}") - else: - logger.warning( - "Not writing z-score normalization matrix because no normalization matrices exist. This is expected if you are using UMI filtering." + merged_zscores.index.name = ( + "entrez_gene_id" + if merged_zscores.index.astype(str).str.isdigit().all() + else "ensembl_gene_id" + if merged_zscores.index.astype(str).str.startswith("ENS").all() + else "gene_symbol" ) + merged_zscores = merged_zscores.reindex(columns=sorted(merged_zscores.columns)) + merged_zscores = merged_zscores.groupby("entrez_gene_id").mean() + merged_zscores.to_csv(output_zscore_normalization_filepath, index=True) + elif isinstance(rnaseq_matrix, sc.AnnData): + merged_zscores = ad.concat([m.z_score_matrix for m in metrics.values()], axis="obs") + merged_zscores.var.index.name = "entrez_gene_id" + merged_zscores.obs = merged_zscores.obs.reindex(columns=sorted(merged_zscores.obs.columns)) + merged_zscores.write_h5ad(output_zscore_normalization_filepath.with_suffix(".h5ad")) + expressed_genes: list[str] = list(itertools.chain.from_iterable(m.entrez_gene_ids for m in metrics.values())) + top_genes: list[str] = list( + itertools.chain.from_iterable(m.high_confidence_entrez_gene_ids for m in metrics.values()) + ) + + logger.success(f"Wrote z-score normalization matrix to {output_zscore_normalization_filepath}") + expression_frequency = pd.Series(expressed_genes).value_counts() - expression_df = pd.DataFrame({"entrez_gene_id": expression_frequency.index, "frequency": expression_frequency.values}) + expression_df = pd.DataFrame( + {"entrez_gene_id": expression_frequency.index, "frequency": expression_frequency.values} + ) expression_df["prop"] = expression_df["frequency"] / len(metrics) expression_df = expression_df[expression_df["prop"] >= filtering_options.batch_ratio] @@ -722,10 +732,10 @@ async def _process( top_df["prop"] = top_df["frequency"] / len(metrics) top_df = top_df[top_df["prop"] >= filtering_options.high_batch_ratio] - entrez_id_series = pd.Series(read_counts_results.entrez_gene_ids) + entrez_id_series = pd.Series(entrez_gene_ids) boolean_matrix = pd.DataFrame( data={ - "entrez_gene_id": read_counts_results.entrez_gene_ids, + "entrez_gene_id": entrez_gene_ids, "expressed": entrez_id_series.isin(expression_df["entrez_gene_id"]).astype(int), "high": entrez_id_series.isin(top_df["entrez_gene_id"]).astype(int), } @@ -736,8 +746,13 @@ async def _process( # TODO: 2025-OCT-31: commented out dropping entrez ids, should this be kept? 
# boolean_matrix.dropna(subset="entrez_gene_id", inplace=True) + boolean_matrix = boolean_matrix.groupby("entrez_gene_id", as_index=False).mean() + boolean_matrix["expressed"] = boolean_matrix["expressed"].copy().astype(int) + boolean_matrix["high"] = boolean_matrix["high"].copy().astype(int) boolean_matrix.to_csv(output_boolean_activity_filepath, index=False) - logger.info(f"{context_name} - Found {expressed_count} expressed genes, {high_confidence_count} of which are confidently expressed") + logger.info( + f"{context_name} - Found {expressed_count} expressed genes, {high_confidence_count} of which are confidently expressed" + ) logger.success(f"Wrote boolean matrix to {output_boolean_activity_filepath}") @@ -757,11 +772,13 @@ async def rnaseq_gen( # noqa: C901 technique: FilteringTechnique | str = FilteringTechnique.ZFPKM, zfpkm_min_peak_height: float = 0.02, zfpkm_min_peak_distance: int = 1, + input_fragment_lengths: Path | None = None, cutoff: int | float | None = None, force_zfpkm_plot: bool = False, log_level: LogLevel = LogLevel.INFO, log_location: str | TextIO = sys.stderr, output_zfpkm_plot_dirpath: Path | None = None, + force_negative_counts_to_zero: bool = False, ) -> None: """Generate a list of active and high-confidence genes from a gene count matrix. @@ -777,6 +794,7 @@ async def rnaseq_gen( # noqa: C901 :param prep: The preparation method :param taxon_id: The NCBI Taxon ID :param input_metadata_filepath_or_df: The filepath or dataframe containing metadata information + :param input_fragment_lengths: The filepath to the fragment lengths file, if applicable. :param replicate_ratio: The percentage of replicates that a gene must appear in for a gene to be marked as "active" in a batch/study :param batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked as 'active" @@ -792,6 +810,9 @@ async def rnaseq_gen( # noqa: C901 :param log_level: The level of logging to output :param log_location: The location to write logs to :param output_zfpkm_plot_dirpath: Optional filepath to save zFPKM plots + :param force_negative_counts_to_zero: Should negative values be forcibly set to 0? + This could happen as a result of normalization producing negative near-zero values (e.g., -0.001) + :return: None """ _set_up_logging(level=log_level, location=log_location) @@ -817,8 +838,10 @@ async def rnaseq_gen( # noqa: C901 elif cutoff: cutoff = "default" - case FilteringTechnique.ZFPKM | FilteringTechnique.UMI: + case FilteringTechnique.ZFPKM: cutoff: int | float = cutoff or -3 + case FilteringTechnique.UMI: + cutoff: int = cutoff or 1 case _: _log_and_raise_error( f"Technique must be one of {','.join(FilteringTechnique)}. 
Got: {technique.value}", @@ -870,7 +893,8 @@ async def rnaseq_gen( # noqa: C901 context_name=context_name, rnaseq_matrix_filepath=input_rnaseq_filepath, metadata_df=metadata_df, - gene_info_df=await _read_file(input_gene_info_filepath), + gene_info_df=_read_file(input_gene_info_filepath), + fragment_df=_read_file(input_fragment_lengths), prep=prep, taxon=taxon_id, replicate_ratio=replicate_ratio, @@ -885,4 +909,5 @@ async def rnaseq_gen( # noqa: C901 output_boolean_activity_filepath=output_boolean_activity_filepath, output_zscore_normalization_filepath=output_zscore_normalization_filepath, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, + force_negative_to_zero=force_negative_counts_to_zero, ) From 6af3990cd7236f3ebbe52f81ca1e96546cf4c2da Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:21:22 -0600 Subject: [PATCH 20/26] chore(ruff): reduce max line length Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 2 ++ main/como/rnaseq_preprocess.py | 2 -- ruff.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 104e68cb..4c522e3c 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -772,7 +772,9 @@ async def rnaseq_gen( # noqa: C901 technique: FilteringTechnique | str = FilteringTechnique.ZFPKM, zfpkm_min_peak_height: float = 0.02, zfpkm_min_peak_distance: int = 1, + umi_target_sum: int = 10_000, input_fragment_lengths: Path | None = None, + umi_perform_normalization: bool = False, cutoff: int | float | None = None, force_zfpkm_plot: bool = False, log_level: LogLevel = LogLevel.INFO, diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index f9f1b6ce..06f33922 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -16,8 +16,6 @@ import numpy as np import numpy.typing as npt import pandas as pd -import pandera.pandas as pa -import pandera.typing.pandas as pat from fast_bioservices.biothings.mygene import MyGene from fast_bioservices.pipeline import gene_symbol_to_ensembl_and_gene_id from loguru import logger diff --git a/ruff.toml b/ruff.toml index a556c25f..b7ddbd88 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,4 +1,4 @@ -line-length = 150 +line-length = 120 extend-include = ["docs/**/*.py", "tests/**/*.py", "**/*.ipynb"] [format] From 479fce2d064c044dc3ce4eb9eb74e51bf2d543e2 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:21:52 -0600 Subject: [PATCH 21/26] chore(ruff): mark unsorted imports as fixable Signed-off-by: Josh Loecker --- ruff.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/ruff.toml b/ruff.toml index b7ddbd88..691022d2 100644 --- a/ruff.toml +++ b/ruff.toml @@ -6,6 +6,7 @@ quote-style = "double" docstring-code-format = true [lint] +extend-fixable = ["I001"] # Linting rules: https://docs.astral.sh/ruff/rules/ unfixable = [ "F401", # warn about, but do not remove, unused imports From d83e974d597f57fd9ee8ffa004750a35fd0e3940 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:22:41 -0600 Subject: [PATCH 22/26] chore(uv): lock pyproject file Signed-off-by: Josh Loecker --- pyproject.toml | 17 +- uv.lock | 413 ++++++++++--------------------------------------- 2 files changed, 86 insertions(+), 344 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 51f81319..488e14ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,11 +12,14 @@ dependencies = [ "cobamp@git+https://github.com/JoshLoecker/cobamp@master", "cobra>=0.28.0", "fast-bioservices>=0.3.9", + 
"joypy>=0.2.6", "kaleido>=1.0.0", "loguru>=0.7.2", + "notebook>=7.4.7", "numpy>=2", "openpyxl>=3.1.5", "pandas>=1.3.5", + "python-louvain", "scanpy>=1.10.4", "scikit-learn>=1.5.2", "scipy>=1.13.0", @@ -25,6 +28,7 @@ dependencies = [ "statsmodels>=0.13.0; python_version < '3.12'", "statsmodels>=0.14.0; python_version >= '3.12'", "troppo@git+https://github.com/JoshLoecker/troppo@master", + "zfpkm>=1.0.3", ] [project.optional-dependencies] @@ -36,19 +40,11 @@ interactive = [ "jupyterlab>=4.3.2" ] dev = [ - "commitizen>=4.8.3", - "commitlint>=1.3.1", "como", "hatchling>=1.27.0", - "pandas-stubs>=2.3.2.250827", - "pre-commit>=4.2.0", - "pyright>=1.1.405", - "pytest>=8.4.1", "pytest-asyncio>=1.1.0", "pytest-cov>=6.2.1", - "ruff>=0.12.11", - "scipy-stubs>=1.16.1.1", - "types-aiofiles>=24.1.0.20250822", + "pytest>=8.4.1", ] [tool.hatch.version] @@ -62,3 +58,6 @@ allow-direct-references = true [tool.pytest.ini_options] pythonpath = ["main/src"] + +[tool.uv.sources] +python-louvain = { git = "https://github.com/taynaud/python-louvain" } diff --git a/uv.lock b/uv.lock index 4fcb4c61..a1a2c903 100644 --- a/uv.lock +++ b/uv.lock @@ -85,15 +85,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" }, ] -[[package]] -name = "argcomplete" -version = "3.6.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/16/0f/861e168fc813c56a78b35f3c30d91c6757d1fd185af1110f1aec784b35d0/argcomplete-3.6.2.tar.gz", hash = "sha256:d0519b1bc867f5f4f4713c41ad0aba73a4a5f007449716b16f385f2166dc6adf", size = 73403, upload-time = "2025-04-03T04:57:03.52Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/31/da/e42d7a9d8dd33fa775f467e4028a47936da2f01e4b0e561f9ba0d74cb0ca/argcomplete-3.6.2-py3-none-any.whl", hash = "sha256:65b3133a29ad53fb42c48cf5114752c7ab66c1c38544fdf6460f450c09b42591", size = 43708, upload-time = "2025-04-03T04:57:01.591Z" }, -] - [[package]] name = "argon2-cffi" version = "25.1.0" @@ -281,15 +272,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" }, ] -[[package]] -name = "cfgv" -version = "3.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114, upload-time = "2023-08-12T20:38:17.776Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249, upload-time = "2023-08-12T20:38:16.269Z" }, -] - [[package]] name = "charset-normalizer" version = "3.4.3" @@ -403,38 +385,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", 
size = 7294, upload-time = "2025-07-25T14:02:02.896Z" }, ] -[[package]] -name = "commitizen" -version = "4.9.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "argcomplete" }, - { name = "charset-normalizer" }, - { name = "colorama" }, - { name = "decli" }, - { name = "deprecated" }, - { name = "jinja2" }, - { name = "packaging" }, - { name = "prompt-toolkit" }, - { name = "pyyaml" }, - { name = "questionary" }, - { name = "termcolor" }, - { name = "tomlkit" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/77/19/927ac5b0eabb9451e2d5bb45b30813915c9a1260713b5b68eeb31358ea23/commitizen-4.9.1.tar.gz", hash = "sha256:b076b24657718f7a35b1068f2083bd39b4065d250164a1398d1dac235c51753b", size = 56610, upload-time = "2025-09-10T14:19:33.746Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cf/49/577035b841442fe031b017027c3d99278b46104d227f0353c69dbbe55148/commitizen-4.9.1-py3-none-any.whl", hash = "sha256:4241b2ecae97b8109af8e587c36bc3b805a09b9a311084d159098e12d6ead497", size = 80624, upload-time = "2025-09-10T14:19:32.102Z" }, -] - -[[package]] -name = "commitlint" -version = "1.3.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/99/c1/42ee438955d0df9bf311dc4c573a49fb7215d915b224ee49566c6d11a318/commitlint-1.3.1.tar.gz", hash = "sha256:2a0123636bd12cb47f96034af0711d302403e80e47bac815f26c495420929d53", size = 23896, upload-time = "2025-08-25T13:19:35.965Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/bb/7413a18bff34f38a0b3283558dc25119c21426964619080e0061aadd3bb0/commitlint-1.3.1-py3-none-any.whl", hash = "sha256:02024a64a785c7b5f2c6accb33415efb61d428b90e0231480ac49b8b07624520", size = 26643, upload-time = "2025-08-25T13:19:35.173Z" }, -] - [[package]] name = "como" source = { editable = "." 
} @@ -444,11 +394,14 @@ dependencies = [ { name = "cobamp" }, { name = "cobra" }, { name = "fast-bioservices" }, + { name = "joypy" }, { name = "kaleido" }, { name = "loguru" }, + { name = "notebook" }, { name = "numpy" }, { name = "openpyxl" }, { name = "pandas" }, + { name = "python-louvain" }, { name = "scanpy" }, { name = "scikit-learn" }, { name = "scipy" }, @@ -456,22 +409,15 @@ dependencies = [ { name = "setuptools" }, { name = "statsmodels" }, { name = "troppo" }, + { name = "zfpkm" }, ] [package.optional-dependencies] dev = [ - { name = "commitizen" }, - { name = "commitlint" }, { name = "hatchling" }, - { name = "pandas-stubs" }, - { name = "pre-commit" }, - { name = "pyright" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, - { name = "ruff" }, - { name = "scipy-stubs" }, - { name = "types-aiofiles" }, ] gurobi = [ { name = "gurobipy" }, @@ -487,35 +433,31 @@ requires-dist = [ { name = "anndata", specifier = ">=0.12.0" }, { name = "cobamp", git = "https://github.com/JoshLoecker/cobamp?rev=master" }, { name = "cobra", specifier = ">=0.28.0" }, - { name = "commitizen", marker = "extra == 'dev'", specifier = ">=4.8.3" }, - { name = "commitlint", marker = "extra == 'dev'", specifier = ">=1.3.1" }, - { name = "fast-bioservices", specifier = ">=0.3.9" }, + { name = "fast-bioservices", editable = "../fast_bioservices" }, { name = "gurobipy", marker = "extra == 'gurobi'", specifier = "<14" }, { name = "hatchling", marker = "extra == 'dev'", specifier = ">=1.27.0" }, { name = "ipython", marker = "extra == 'interactive'", specifier = ">=8.0.0" }, + { name = "joypy", specifier = ">=0.2.6" }, { name = "jupyterlab", marker = "extra == 'interactive'", specifier = ">=4.3.2" }, { name = "kaleido", specifier = ">=1.0.0" }, { name = "loguru", specifier = ">=0.7.2" }, + { name = "notebook", specifier = ">=7.4.7" }, { name = "numpy", specifier = ">=2" }, { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pandas", specifier = ">=1.3.5" }, - { name = "pandas-stubs", marker = "extra == 'dev'", specifier = ">=2.3.2.250827" }, - { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.2.0" }, - { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.405" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.4.1" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=1.1.0" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=6.2.1" }, - { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.12.11" }, + { name = "python-louvain", git = "https://github.com/taynaud/python-louvain" }, { name = "scanpy", specifier = ">=1.10.4" }, { name = "scikit-learn", specifier = ">=1.5.2" }, { name = "scipy", specifier = ">=1.13.0" }, - { name = "scipy-stubs", marker = "extra == 'dev'", specifier = ">=1.16.1.1" }, { name = "seaborn", specifier = ">=0.13.2" }, { name = "setuptools", specifier = ">=78.1.1" }, { name = "statsmodels", marker = "python_full_version < '3.12'", specifier = ">=0.13.0" }, { name = "statsmodels", marker = "python_full_version >= '3.12'", specifier = ">=0.14.0" }, { name = "troppo", git = "https://github.com/JoshLoecker/troppo?rev=master" }, - { name = "types-aiofiles", marker = "extra == 'dev'", specifier = ">=24.1.0.20250822" }, + { name = "zfpkm", specifier = ">=1.0.3" }, ] provides-extras = ["dev", "gurobi", "interactive"] @@ -727,15 +669,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b0/d0/89247ec250369fc76db477720a26b2fce7ba079ff1380e4ab4529d2fe233/debugpy-1.8.17-py2.py3-none-any.whl", hash = "sha256:60c7dca6571efe660ccb7a9508d73ca14b8796c4ed484c2002abba714226cfef", size = 5283210, upload-time = "2025-09-17T16:34:25.835Z" }, ] -[[package]] -name = "decli" -version = "0.6.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0c/59/d4ffff1dee2c8f6f2dd8f87010962e60f7b7847504d765c91ede5a466730/decli-0.6.3.tar.gz", hash = "sha256:87f9d39361adf7f16b9ca6e3b614badf7519da13092f2db3c80ca223c53c7656", size = 7564, upload-time = "2025-06-01T15:23:41.25Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d8/fa/ec878c28bc7f65b77e7e17af3522c9948a9711b9fa7fc4c5e3140a7e3578/decli-0.6.3-py3-none-any.whl", hash = "sha256:5152347c7bb8e3114ad65db719e5709b28d7f7f45bdb709f70167925e55640f3", size = 7989, upload-time = "2025-06-01T15:23:40.228Z" }, -] - [[package]] name = "decorator" version = "5.2.1" @@ -763,18 +696,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/10/5fe7a7778cc8a701373662f99393f443541353018d3cf2bf6c8f91b032d6/depinfo-2.2.0-py3-none-any.whl", hash = "sha256:3d9ba933e7a9d718b9915f75c844a38c5603cd3cdba1816ab95e0b148b100d8f", size = 24025, upload-time = "2022-09-07T16:27:49.813Z" }, ] -[[package]] -name = "deprecated" -version = "1.2.18" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "wrapt" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/98/97/06afe62762c9a8a86af0cfb7bfdab22a43ad17138b07af5b1a58442690a2/deprecated-1.2.18.tar.gz", hash = "sha256:422b6f6d859da6f2ef57857761bfb392480502a64c3028ca9bbe86085d72115d", size = 2928744, upload-time = "2025-01-27T10:46:25.7Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998, upload-time = "2025-01-27T10:46:09.186Z" }, -] - [[package]] name = "dill" version = "0.4.0" @@ -793,15 +714,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550, upload-time = "2023-08-31T06:11:58.822Z" }, ] -[[package]] -name = "distlib" -version = "0.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/96/8e/709914eb2b5749865801041647dc7f4e6d00b549cfe88b65ca192995f07c/distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d", size = 614605, upload-time = "2025-07-17T16:52:00.465Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, -] - [[package]] name = "donfig" version = "0.8.1.post1" @@ -834,8 +746,7 @@ wheels = [ [[package]] name = "fast-bioservices" -version = "0.3.9" -source = { registry = "https://pypi.org/simple" } +source = { editable = "../fast_bioservices" } dependencies = [ { name = "aiofiles" }, { name = "appdirs" }, @@ -844,9 +755,24 @@ dependencies = [ { name = "loguru" }, { name = "pandas" }, 
] -sdist = { url = "https://files.pythonhosted.org/packages/76/f2/1575a1233ee470cdc52efa1ad5e00050bb39b3f7ebdb3813fba42035e7c3/fast_bioservices-0.3.9.tar.gz", hash = "sha256:4094d5963b5baab2f7d3a02a74d1d841e83670341065ea0ed0d1f09ba658bf05", size = 47042, upload-time = "2024-12-04T19:32:29.458Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/48/6c/6033e67a7d734ba90ff186e5404f78b0e3b59ae46e78bec11764ae50c508/fast_bioservices-0.3.9-py3-none-any.whl", hash = "sha256:f041a30300d4de5c7d2d5e0405b8505e7a7f79248e986ecf45ddb3473d7c4d8f", size = 22687, upload-time = "2024-12-04T19:32:28.023Z" }, + +[package.metadata] +requires-dist = [ + { name = "aiofiles", specifier = ">=24.1.0" }, + { name = "appdirs", specifier = ">=1.4.4" }, + { name = "hishel", specifier = ">=0.1.1" }, + { name = "httpx", specifier = ">=0.27.2" }, + { name = "loguru", specifier = ">=0.7.2" }, + { name = "pandas", specifier = ">=1.5.3" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "commitlint", specifier = ">=1.3.0" }, + { name = "pre-commit", specifier = ">=4.0.1" }, + { name = "pytest", specifier = ">=8.3.2" }, + { name = "pytest-asyncio", specifier = ">=0.24.0" }, + { name = "pytest-cov", specifier = ">=6.0.0" }, ] [[package]] @@ -858,15 +784,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/a8/20d0723294217e47de6d9e2e40fd4a9d2f7c4b6ef974babd482a59743694/fastjsonschema-2.21.2-py3-none-any.whl", hash = "sha256:1c797122d0a86c5cace2e54bf4e819c36223b552017172f32c5c024a6b77e463", size = 24024, upload-time = "2025-08-14T18:49:34.776Z" }, ] -[[package]] -name = "filelock" -version = "3.19.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/40/bb/0ab3e58d22305b6f5440629d20683af28959bf793d98d11950e305c1c326/filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58", size = 17687, upload-time = "2025-08-14T16:56:03.016Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, -] - [[package]] name = "fonttools" version = "4.60.1" @@ -1027,15 +944,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] -[[package]] -name = "identify" -version = "2.6.15" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ff/e7/685de97986c916a6d93b3876139e00eef26ad5bbbd61925d670ae8013449/identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf", size = 99311, upload-time = "2025-10-02T17:43:40.631Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0f/1c/e5fd8f973d4f375adb21565739498e2e9a1e54c858a97b9a8ccfdc81da9b/identify-2.6.15-py2.py3-none-any.whl", hash = "sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757", size = 99183, upload-time = "2025-10-02T17:43:39.137Z" }, -] - [[package]] name = "idna" version = "3.10" @@ -1175,6 +1083,21 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/1e/e8/685f47e0d754320684db4425a0967f7d3fa70126bffd76110b7009a0090f/joblib-1.5.2-py3-none-any.whl", hash = "sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241", size = 308396, upload-time = "2025-08-27T12:15:45.188Z" }, ] +[[package]] +name = "joypy" +version = "0.2.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "matplotlib" }, + { name = "numpy" }, + { name = "pandas" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/89/f4/49636d4c5fa30822028a1e2af234cecf488ba3c7e9ff5aba88e36fb0c95c/joypy-0.2.6.tar.gz", hash = "sha256:099da2d6c7d81b5eccc957bd9446831f565ba42d5abbab0fa92b81892449522e", size = 10270, upload-time = "2021-12-19T09:42:52.541Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/8c/4d32c8935431eb10fd140faa4b13b6b8de222223a88fa9ad2a7711b7f1a9/joypy-0.2.6-py2.py3-none-any.whl", hash = "sha256:fffe882e8281e56e08b374a3148436cb448562ba39e4d566204c7e8ee2caddab", size = 8584, upload-time = "2021-12-19T09:42:50.786Z" }, +] + [[package]] name = "json5" version = "0.12.1" @@ -1787,12 +1710,19 @@ wheels = [ ] [[package]] -name = "nodeenv" -version = "1.9.1" +name = "notebook" +version = "7.4.7" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" } +dependencies = [ + { name = "jupyter-server" }, + { name = "jupyterlab" }, + { name = "jupyterlab-server" }, + { name = "notebook-shim" }, + { name = "tornado" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/04/09/f6f64ba156842ef68d3ea763fa171a2f7e7224f200a15dd4af5b83c34756/notebook-7.4.7.tar.gz", hash = "sha256:3f0a04027dfcee8a876de48fba13ab77ec8c12f72f848a222ed7f5081b9e342a", size = 13937702, upload-time = "2025-09-27T08:00:22.536Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, + { url = "https://files.pythonhosted.org/packages/6c/d7/06d13087e20388926e7423d2489e728d2e59f2453039cdb0574a7c070e76/notebook-7.4.7-py3-none-any.whl", hash = "sha256:362b7c95527f7dd3c4c84d410b782872fd9c734fb2524c11dd92758527b6eda6", size = 14342894, upload-time = "2025-09-27T08:00:18.496Z" }, ] [[package]] @@ -1925,18 +1855,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/af/11/0cc63f9f321ccf63886ac203336777140011fb669e739da36d8db3c53b98/numpy-2.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:2e267c7da5bf7309670523896df97f93f6e469fb931161f483cd6882b3b1a5dc", size = 12971844, upload-time = "2025-09-09T15:58:57.359Z" }, ] -[[package]] -name = "numpy-typing-compat" -version = "20250818.2.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c9/e3/1a29f174c1e09a2bf111d37a41afceea1b501371abb39e73170ca31a7599/numpy_typing_compat-20250818.2.3.tar.gz", hash = "sha256:72e83d535b635d668ba7315e43ae80be1469a6faea6fc96d312516f39b3d8fa5", size = 4974, upload-time = "2025-08-18T23:46:42.968Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/c9/4a/fa4c90a03d6a8ee1a7f0e0fb101887d9a8cdb9b07a5901af9ae831e9feea/numpy_typing_compat-20250818.2.3-py3-none-any.whl", hash = "sha256:930413d34dd9083c0bf418815576222f1c66ea2d68950f447fd27ea1a78b26b0", size = 6286, upload-time = "2025-08-18T23:46:35.681Z" }, -] - [[package]] name = "openpyxl" version = "3.1.5" @@ -1962,24 +1880,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/18/8215ef740dd5b5d982de9c4cd352c99ce92c40c208245a2e8909bea7c0d6/optlang-1.8.3-py2.py3-none-any.whl", hash = "sha256:b81f4e873f0c1d0d907410add63aea427762d911245eb04a4a1126da5fedb595", size = 141752, upload-time = "2025-01-08T12:45:28.063Z" }, ] -[[package]] -name = "optype" -version = "0.13.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/20/7f/daa32a35b2a6a564a79723da49c0ddc464c462e67a906fc2b66a0d64f28e/optype-0.13.4.tar.gz", hash = "sha256:131d8e0f1c12d8095d553e26b54598597133830983233a6a2208886e7a388432", size = 99547, upload-time = "2025-08-19T19:52:44.242Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/37/bb/b51940f2d91071325d5ae2044562aa698470a105474d9317b9dbdaad63df/optype-0.13.4-py3-none-any.whl", hash = "sha256:500c89cfac82e2f9448a54ce0a5d5c415b6976b039c2494403cd6395bd531979", size = 87919, upload-time = "2025-08-19T19:52:41.314Z" }, -] - -[package.optional-dependencies] -numpy = [ - { name = "numpy" }, - { name = "numpy-typing-compat" }, -] - [[package]] name = "orjson" version = "3.11.3" @@ -2092,19 +1992,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" }, ] -[[package]] -name = "pandas-stubs" -version = "2.3.2.250926" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "types-pytz" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1b/3b/32be58a125db39d0b5f62cc93795f32b5bb2915bd5c4a46f0e35171985e2/pandas_stubs-2.3.2.250926.tar.gz", hash = "sha256:c64b9932760ceefb96a3222b953e6a251321a9832a28548be6506df473a66406", size = 102147, upload-time = "2025-09-26T19:50:39.522Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/96/1e4a035eaf4dce9610aac6e43026d0c6baa05773daf6d21e635a4fe19e21/pandas_stubs-2.3.2.250926-py3-none-any.whl", hash = "sha256:81121818453dcfe00f45c852f4dceee043640b813830f6e7bd084a4ef7ff7270", size = 159995, upload-time = "2025-09-26T19:50:38.241Z" }, -] - [[package]] name = "pandocfilters" version = "1.5.1" @@ -2269,22 +2156,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/23/6aef7c24f4ee6f765aeaaaa3bf24cfdb0730a20336a02b1a061d227d84be/ppft-1.7.7-py3-none-any.whl", hash = "sha256:fb7524db110682de886b4bb5b08f7bf6a38940566074ef2f62521cbbd3864676", size = 56764, upload-time = "2025-04-16T01:47:39.453Z" }, ] -[[package]] -name = "pre-commit" -version = "4.3.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cfgv" }, - { name = "identify" }, - { name = "nodeenv" }, - { name = "pyyaml" }, - { name = "virtualenv" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ff/29/7cf5bbc236333876e4b41f56e06857a87937ce4bf91e117a6991a2dbb02a/pre_commit-4.3.0.tar.gz", 
hash = "sha256:499fe450cc9d42e9d58e606262795ecb64dd05438943c62b66f6a8673da30b16", size = 193792, upload-time = "2025-08-09T18:56:14.651Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5b/a5/987a405322d78a73b66e39e4a90e4ef156fd7141bf71df987e50717c321b/pre_commit-4.3.0-py2.py3-none-any.whl", hash = "sha256:2b0747ad7e6e967169136edffee14c16e148a778a54e4f967921aa1ebf2308d8", size = 220965, upload-time = "2025-08-09T18:56:13.192Z" }, -] - [[package]] name = "prometheus-client" version = "0.23.1" @@ -2463,19 +2334,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/5e/1aa9a93198c6b64513c9d7752de7422c06402de6600a8767da1524f9570b/pyparsing-3.2.5-py3-none-any.whl", hash = "sha256:e38a4f02064cf41fe6593d328d0512495ad1f3d8a91c4f73fc401b3079a59a5e", size = 113890, upload-time = "2025-09-21T04:11:04.117Z" }, ] -[[package]] -name = "pyright" -version = "1.1.406" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nodeenv" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f7/16/6b4fbdd1fef59a0292cbb99f790b44983e390321eccbc5921b4d161da5d1/pyright-1.1.406.tar.gz", hash = "sha256:c4872bc58c9643dac09e8a2e74d472c62036910b3bd37a32813989ef7576ea2c", size = 4113151, upload-time = "2025-10-02T01:04:45.488Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f6/a2/e309afbb459f50507103793aaef85ca4348b66814c86bc73908bdeb66d12/pyright-1.1.406-py3-none-any.whl", hash = "sha256:1d81fb43c2407bf566e97e57abb01c811973fdb21b2df8df59f870f688bdca71", size = 5980982, upload-time = "2025-10-02T01:04:43.137Z" }, -] - [[package]] name = "pytest" version = "8.4.2" @@ -2575,6 +2433,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b8/49/70f6288db3ce3ef006627318d518235836c47b34e4aa7716760e33b639b9/python_libsbml-5.20.5-cp313-cp313-win_amd64.whl", hash = "sha256:763222865e39d51e408c2c9af3dafa0d58f613e75d9ff117de8f8a2b9f7eb59e", size = 6027822, upload-time = "2025-05-05T06:43:59.128Z" }, ] +[[package]] +name = "python-louvain" +version = "0.16" +source = { git = "https://github.com/taynaud/python-louvain#def91793772c3e77ab4167d175903a5365c24b4b" } +dependencies = [ + { name = "networkx" }, + { name = "numpy" }, +] + [[package]] name = "pytz" version = "2025.2" @@ -2697,18 +2564,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/01/1b/5dbe84eefc86f48473947e2f41711aded97eecef1231f4558f1f02713c12/pyzmq-27.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c9f7f6e13dff2e44a6afeaf2cf54cee5929ad64afaf4d40b50f93c58fc687355", size = 544862, upload-time = "2025-09-08T23:09:56.509Z" }, ] -[[package]] -name = "questionary" -version = "2.1.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "prompt-toolkit" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f6/45/eafb0bba0f9988f6a2520f9ca2df2c82ddfa8d67c95d6625452e97b204a5/questionary-2.1.1.tar.gz", hash = "sha256:3d7e980292bb0107abaa79c68dd3eee3c561b83a0f89ae482860b181c8bd412d", size = 25845, upload-time = "2025-08-28T19:00:20.851Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/26/1062c7ec1b053db9e499b4d2d5bc231743201b74051c973dadeac80a8f43/questionary-2.1.1-py3-none-any.whl", hash = "sha256:a51af13f345f1cdea62347589fbb6df3b290306ab8930713bfae4d475a7d4a59", size = 36753, upload-time = "2025-08-28T19:00:19.56Z" }, -] - [[package]] name = "referencing" version = "0.36.2" @@ -2913,32 +2768,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/51/79/76aba16a1689b50528224b182f71097ece338e7a4ab55e84c2e73443b78a/ruamel.yaml.clib-0.2.14-cp313-cp313-win_amd64.whl", hash = "sha256:090782b5fb9d98df96509eecdbcaffd037d47389a89492320280d52f91330d78", size = 115238, upload-time = "2025-09-22T19:51:07.081Z" }, ] -[[package]] -name = "ruff" -version = "0.13.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c7/8e/f9f9ca747fea8e3ac954e3690d4698c9737c23b51731d02df999c150b1c9/ruff-0.13.3.tar.gz", hash = "sha256:5b0ba0db740eefdfbcce4299f49e9eaefc643d4d007749d77d047c2bab19908e", size = 5438533, upload-time = "2025-10-02T19:29:31.582Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/33/8f7163553481466a92656d35dea9331095122bb84cf98210bef597dd2ecd/ruff-0.13.3-py3-none-linux_armv6l.whl", hash = "sha256:311860a4c5e19189c89d035638f500c1e191d283d0cc2f1600c8c80d6dcd430c", size = 12484040, upload-time = "2025-10-02T19:28:49.199Z" }, - { url = "https://files.pythonhosted.org/packages/b0/b5/4a21a4922e5dd6845e91896b0d9ef493574cbe061ef7d00a73c61db531af/ruff-0.13.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:2bdad6512fb666b40fcadb65e33add2b040fc18a24997d2e47fee7d66f7fcae2", size = 13122975, upload-time = "2025-10-02T19:28:52.446Z" }, - { url = "https://files.pythonhosted.org/packages/40/90/15649af836d88c9f154e5be87e64ae7d2b1baa5a3ef317cb0c8fafcd882d/ruff-0.13.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:fc6fa4637284708d6ed4e5e970d52fc3b76a557d7b4e85a53013d9d201d93286", size = 12346621, upload-time = "2025-10-02T19:28:54.712Z" }, - { url = "https://files.pythonhosted.org/packages/a5/42/bcbccb8141305f9a6d3f72549dd82d1134299177cc7eaf832599700f95a7/ruff-0.13.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c9e6469864f94a98f412f20ea143d547e4c652f45e44f369d7b74ee78185838", size = 12574408, upload-time = "2025-10-02T19:28:56.679Z" }, - { url = "https://files.pythonhosted.org/packages/ce/19/0f3681c941cdcfa2d110ce4515624c07a964dc315d3100d889fcad3bfc9e/ruff-0.13.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5bf62b705f319476c78891e0e97e965b21db468b3c999086de8ffb0d40fd2822", size = 12285330, upload-time = "2025-10-02T19:28:58.79Z" }, - { url = "https://files.pythonhosted.org/packages/10/f8/387976bf00d126b907bbd7725219257feea58650e6b055b29b224d8cb731/ruff-0.13.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78cc1abed87ce40cb07ee0667ce99dbc766c9f519eabfd948ed87295d8737c60", size = 13980815, upload-time = "2025-10-02T19:29:01.577Z" }, - { url = "https://files.pythonhosted.org/packages/0c/a6/7c8ec09d62d5a406e2b17d159e4817b63c945a8b9188a771193b7e1cc0b5/ruff-0.13.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4fb75e7c402d504f7a9a259e0442b96403fa4a7310ffe3588d11d7e170d2b1e3", size = 14987733, upload-time = "2025-10-02T19:29:04.036Z" }, - { url = "https://files.pythonhosted.org/packages/97/e5/f403a60a12258e0fd0c2195341cfa170726f254c788673495d86ab5a9a9d/ruff-0.13.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:17b951f9d9afb39330b2bdd2dd144ce1c1335881c277837ac1b50bfd99985ed3", size = 14439848, upload-time = "2025-10-02T19:29:06.684Z" }, - { url = "https://files.pythonhosted.org/packages/39/49/3de381343e89364c2334c9f3268b0349dc734fc18b2d99a302d0935c8345/ruff-0.13.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6052f8088728898e0a449f0dde8fafc7ed47e4d878168b211977e3e7e854f662", size = 13421890, 
upload-time = "2025-10-02T19:29:08.767Z" }, - { url = "https://files.pythonhosted.org/packages/ab/b5/c0feca27d45ae74185a6bacc399f5d8920ab82df2d732a17213fb86a2c4c/ruff-0.13.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc742c50f4ba72ce2a3be362bd359aef7d0d302bf7637a6f942eaa763bd292af", size = 13444870, upload-time = "2025-10-02T19:29:11.234Z" }, - { url = "https://files.pythonhosted.org/packages/50/a1/b655298a1f3fda4fdc7340c3f671a4b260b009068fbeb3e4e151e9e3e1bf/ruff-0.13.3-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:8e5640349493b378431637019366bbd73c927e515c9c1babfea3e932f5e68e1d", size = 13691599, upload-time = "2025-10-02T19:29:13.353Z" }, - { url = "https://files.pythonhosted.org/packages/32/b0/a8705065b2dafae007bcae21354e6e2e832e03eb077bb6c8e523c2becb92/ruff-0.13.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:6b139f638a80eae7073c691a5dd8d581e0ba319540be97c343d60fb12949c8d0", size = 12421893, upload-time = "2025-10-02T19:29:15.668Z" }, - { url = "https://files.pythonhosted.org/packages/0d/1e/cbe7082588d025cddbb2f23e6dfef08b1a2ef6d6f8328584ad3015b5cebd/ruff-0.13.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:6b547def0a40054825de7cfa341039ebdfa51f3d4bfa6a0772940ed351d2746c", size = 12267220, upload-time = "2025-10-02T19:29:17.583Z" }, - { url = "https://files.pythonhosted.org/packages/a5/99/4086f9c43f85e0755996d09bdcb334b6fee9b1eabdf34e7d8b877fadf964/ruff-0.13.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9cc48a3564423915c93573f1981d57d101e617839bef38504f85f3677b3a0a3e", size = 13177818, upload-time = "2025-10-02T19:29:19.943Z" }, - { url = "https://files.pythonhosted.org/packages/9b/de/7b5db7e39947d9dc1c5f9f17b838ad6e680527d45288eeb568e860467010/ruff-0.13.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1a993b17ec03719c502881cb2d5f91771e8742f2ca6de740034433a97c561989", size = 13618715, upload-time = "2025-10-02T19:29:22.527Z" }, - { url = "https://files.pythonhosted.org/packages/28/d3/bb25ee567ce2f61ac52430cf99f446b0e6d49bdfa4188699ad005fdd16aa/ruff-0.13.3-py3-none-win32.whl", hash = "sha256:f14e0d1fe6460f07814d03c6e32e815bff411505178a1f539a38f6097d3e8ee3", size = 12334488, upload-time = "2025-10-02T19:29:24.782Z" }, - { url = "https://files.pythonhosted.org/packages/cf/49/12f5955818a1139eed288753479ba9d996f6ea0b101784bb1fe6977ec128/ruff-0.13.3-py3-none-win_amd64.whl", hash = "sha256:621e2e5812b691d4f244638d693e640f188bacbb9bc793ddd46837cea0503dd2", size = 13455262, upload-time = "2025-10-02T19:29:26.882Z" }, - { url = "https://files.pythonhosted.org/packages/fe/72/7b83242b26627a00e3af70d0394d68f8f02750d642567af12983031777fc/ruff-0.13.3-py3-none-win_arm64.whl", hash = "sha256:9e9e9d699841eaf4c2c798fa783df2fabc680b72059a02ca0ed81c460bc58330", size = 12538484, upload-time = "2025-10-02T19:29:28.951Z" }, -] - [[package]] name = "scanpy" version = "1.11.4" @@ -3056,18 +2885,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d6/5e/2cc7555fd81d01814271412a1d59a289d25f8b63208a0a16c21069d55d3e/scipy-1.16.2-cp313-cp313t-win_arm64.whl", hash = "sha256:98e22834650be81d42982360382b43b17f7ba95e0e6993e2a4f5b9ad9283a94d", size = 25787992, upload-time = "2025-09-11T17:43:19.745Z" }, ] -[[package]] -name = "scipy-stubs" -version = "1.16.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "optype", extra = ["numpy"] }, -] -sdist = { url = "https://files.pythonhosted.org/packages/4b/84/b4c2caf7748f331870992e7ede5b5df0b080671bcef8c8c7e27a3cf8694a/scipy_stubs-1.16.2.0.tar.gz", hash = 
"sha256:8fdd45155fca401bb755b1b63ac2f192f84f25c3be8da2c99d1cafb2708f3052", size = 352676, upload-time = "2025-09-11T23:28:59.236Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/83/c8/67d984c264f759e7653c130a4b12ae3b4f4304867579560e9a869adb7883/scipy_stubs-1.16.2.0-py3-none-any.whl", hash = "sha256:18c50d49e3c932033fdd4f7fa4fea9e45c8787f92bceaec9e86ccbd140e835d5", size = 553247, upload-time = "2025-09-11T23:28:57.688Z" }, -] - [[package]] name = "seaborn" version = "0.13.2" @@ -3272,15 +3089,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ] -[[package]] -name = "termcolor" -version = "3.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/6c/3d75c196ac07ac8749600b60b03f4f6094d54e132c4d94ebac6ee0e0add0/termcolor-3.1.0.tar.gz", hash = "sha256:6a6dd7fbee581909eeec6a756cff1d7f7c376063b14e4a298dc4980309e55970", size = 14324, upload-time = "2025-04-30T11:37:53.791Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4f/bd/de8d508070629b6d84a30d01d57e4a65c69aa7f5abe7560b8fad3b50ea59/termcolor-3.1.0-py3-none-any.whl", hash = "sha256:591dd26b5c2ce03b9e43f391264626557873ce1d379019786f99b0c2bee140aa", size = 7684, upload-time = "2025-04-30T11:37:52.382Z" }, -] - [[package]] name = "terminado" version = "0.18.1" @@ -3355,15 +3163,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, ] -[[package]] -name = "tomlkit" -version = "0.13.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cc/18/0bbf3884e9eaa38819ebe46a7bd25dcd56b67434402b66a58c4b8e552575/tomlkit-0.13.3.tar.gz", hash = "sha256:430cf247ee57df2b94ee3fbe588e71d362a941ebb545dec29b53961d61add2a1", size = 185207, upload-time = "2025-06-05T07:13:44.947Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bd/75/8539d011f6be8e29f339c42e633aae3cb73bffa95dd0f9adec09b9c58e85/tomlkit-0.13.3-py3-none-any.whl", hash = "sha256:c89c649d79ee40629a9fda55f8ace8c6a1b42deb912b2a8fd8d942ddadb606b0", size = 38901, upload-time = "2025-06-05T07:13:43.546Z" }, -] - [[package]] name = "tornado" version = "6.5.2" @@ -3423,15 +3222,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/85/a4ff8758c66f1fc32aa5e9a145908394bf9cf1c79ffd1113cfdeb77e74e4/trove_classifiers-2025.9.11.17-py3-none-any.whl", hash = "sha256:5d392f2d244deb1866556457d6f3516792124a23d1c3a463a2e8668a5d1c15dd", size = 14158, upload-time = "2025-09-11T17:07:49.886Z" }, ] -[[package]] -name = "types-aiofiles" -version = "24.1.0.20250822" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/19/48/c64471adac9206cc844afb33ed311ac5a65d2f59df3d861e0f2d0cad7414/types_aiofiles-24.1.0.20250822.tar.gz", hash = "sha256:9ab90d8e0c307fe97a7cf09338301e3f01a163e39f3b529ace82466355c84a7b", size = 14484, upload-time = "2025-08-22T03:02:23.039Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/bc/8e/5e6d2215e1d8f7c2a94c6e9d0059ae8109ce0f5681956d11bb0a228cef04/types_aiofiles-24.1.0.20250822-py3-none-any.whl", hash = "sha256:0ec8f8909e1a85a5a79aed0573af7901f53120dd2a29771dd0b3ef48e12328b0", size = 14322, upload-time = "2025-08-22T03:02:21.918Z" }, -] - [[package]] name = "types-python-dateutil" version = "2.9.0.20250822" @@ -3441,15 +3231,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/d9/a29dfa84363e88b053bf85a8b7f212a04f0d7343a4d24933baa45c06e08b/types_python_dateutil-2.9.0.20250822-py3-none-any.whl", hash = "sha256:849d52b737e10a6dc6621d2bd7940ec7c65fcb69e6aa2882acf4e56b2b508ddc", size = 17892, upload-time = "2025-08-22T03:01:59.436Z" }, ] -[[package]] -name = "types-pytz" -version = "2025.2.0.20250809" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/07/e2/c774f754de26848f53f05defff5bb21dd9375a059d1ba5b5ea943cf8206e/types_pytz-2025.2.0.20250809.tar.gz", hash = "sha256:222e32e6a29bb28871f8834e8785e3801f2dc4441c715cd2082b271eecbe21e5", size = 10876, upload-time = "2025-08-09T03:14:17.453Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/db/d0/91c24fe54e565f2344d7a6821e6c6bb099841ef09007ea6321a0bac0f808/types_pytz-2025.2.0.20250809-py3-none-any.whl", hash = "sha256:4f55ed1b43e925cf851a756fe1707e0f5deeb1976e15bf844bcaa025e8fbd0db", size = 10095, upload-time = "2025-08-09T03:14:16.674Z" }, -] - [[package]] name = "typing-extensions" version = "4.15.0" @@ -3515,20 +3296,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, ] -[[package]] -name = "virtualenv" -version = "20.34.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "distlib" }, - { name = "filelock" }, - { name = "platformdirs" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1c/14/37fcdba2808a6c615681cd216fecae00413c9dab44fb2e57805ecf3eaee3/virtualenv-20.34.0.tar.gz", hash = "sha256:44815b2c9dee7ed86e387b842a84f20b93f7f417f95886ca1996a72a4138eb1a", size = 6003808, upload-time = "2025-08-13T14:24:07.464Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/06/04c8e804f813cf972e3262f3f8584c232de64f0cde9f703b46cf53a45090/virtualenv-20.34.0-py3-none-any.whl", hash = "sha256:341f5afa7eee943e4984a9207c025feedd768baff6753cd660c857ceb3e36026", size = 5983279, upload-time = "2025-08-13T14:24:05.111Z" }, -] - [[package]] name = "wcwidth" version = "0.2.14" @@ -3574,45 +3341,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, ] -[[package]] -name = "wrapt" -version = "1.17.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload-time = "2025-08-12T05:53:21.714Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/52/db/00e2a219213856074a213503fdac0511203dceefff26e1daa15250cc01a0/wrapt-1.17.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:273a736c4645e63ac582c60a56b0acb529ef07f78e08dc6bfadf6a46b19c0da7", size = 53482, upload-time = "2025-08-12T05:51:45.79Z" }, - { url = "https://files.pythonhosted.org/packages/5e/30/ca3c4a5eba478408572096fe9ce36e6e915994dd26a4e9e98b4f729c06d9/wrapt-1.17.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5531d911795e3f935a9c23eb1c8c03c211661a5060aab167065896bbf62a5f85", size = 38674, upload-time = "2025-08-12T05:51:34.629Z" }, - { url = "https://files.pythonhosted.org/packages/31/25/3e8cc2c46b5329c5957cec959cb76a10718e1a513309c31399a4dad07eb3/wrapt-1.17.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0610b46293c59a3adbae3dee552b648b984176f8562ee0dba099a56cfbe4df1f", size = 38959, upload-time = "2025-08-12T05:51:56.074Z" }, - { url = "https://files.pythonhosted.org/packages/5d/8f/a32a99fc03e4b37e31b57cb9cefc65050ea08147a8ce12f288616b05ef54/wrapt-1.17.3-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b32888aad8b6e68f83a8fdccbf3165f5469702a7544472bdf41f582970ed3311", size = 82376, upload-time = "2025-08-12T05:52:32.134Z" }, - { url = "https://files.pythonhosted.org/packages/31/57/4930cb8d9d70d59c27ee1332a318c20291749b4fba31f113c2f8ac49a72e/wrapt-1.17.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cccf4f81371f257440c88faed6b74f1053eef90807b77e31ca057b2db74edb1", size = 83604, upload-time = "2025-08-12T05:52:11.663Z" }, - { url = "https://files.pythonhosted.org/packages/a8/f3/1afd48de81d63dd66e01b263a6fbb86e1b5053b419b9b33d13e1f6d0f7d0/wrapt-1.17.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8a210b158a34164de8bb68b0e7780041a903d7b00c87e906fb69928bf7890d5", size = 82782, upload-time = "2025-08-12T05:52:12.626Z" }, - { url = "https://files.pythonhosted.org/packages/1e/d7/4ad5327612173b144998232f98a85bb24b60c352afb73bc48e3e0d2bdc4e/wrapt-1.17.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:79573c24a46ce11aab457b472efd8d125e5a51da2d1d24387666cd85f54c05b2", size = 82076, upload-time = "2025-08-12T05:52:33.168Z" }, - { url = "https://files.pythonhosted.org/packages/bb/59/e0adfc831674a65694f18ea6dc821f9fcb9ec82c2ce7e3d73a88ba2e8718/wrapt-1.17.3-cp311-cp311-win32.whl", hash = "sha256:c31eebe420a9a5d2887b13000b043ff6ca27c452a9a22fa71f35f118e8d4bf89", size = 36457, upload-time = "2025-08-12T05:53:03.936Z" }, - { url = "https://files.pythonhosted.org/packages/83/88/16b7231ba49861b6f75fc309b11012ede4d6b0a9c90969d9e0db8d991aeb/wrapt-1.17.3-cp311-cp311-win_amd64.whl", hash = "sha256:0b1831115c97f0663cb77aa27d381237e73ad4f721391a9bfb2fe8bc25fa6e77", size = 38745, upload-time = "2025-08-12T05:53:02.885Z" }, - { url = "https://files.pythonhosted.org/packages/9a/1e/c4d4f3398ec073012c51d1c8d87f715f56765444e1a4b11e5180577b7e6e/wrapt-1.17.3-cp311-cp311-win_arm64.whl", hash = "sha256:5a7b3c1ee8265eb4c8f1b7d29943f195c00673f5ab60c192eba2d4a7eae5f46a", size = 36806, upload-time = "2025-08-12T05:52:53.368Z" }, - { url = "https://files.pythonhosted.org/packages/9f/41/cad1aba93e752f1f9268c77270da3c469883d56e2798e7df6240dcb2287b/wrapt-1.17.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ab232e7fdb44cdfbf55fc3afa31bcdb0d8980b9b95c38b6405df2acb672af0e0", size = 53998, upload-time = "2025-08-12T05:51:47.138Z" }, - { url = 
"https://files.pythonhosted.org/packages/60/f8/096a7cc13097a1869fe44efe68dace40d2a16ecb853141394047f0780b96/wrapt-1.17.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9baa544e6acc91130e926e8c802a17f3b16fbea0fd441b5a60f5cf2cc5c3deba", size = 39020, upload-time = "2025-08-12T05:51:35.906Z" }, - { url = "https://files.pythonhosted.org/packages/33/df/bdf864b8997aab4febb96a9ae5c124f700a5abd9b5e13d2a3214ec4be705/wrapt-1.17.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b538e31eca1a7ea4605e44f81a48aa24c4632a277431a6ed3f328835901f4fd", size = 39098, upload-time = "2025-08-12T05:51:57.474Z" }, - { url = "https://files.pythonhosted.org/packages/9f/81/5d931d78d0eb732b95dc3ddaeeb71c8bb572fb01356e9133916cd729ecdd/wrapt-1.17.3-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:042ec3bb8f319c147b1301f2393bc19dba6e176b7da446853406d041c36c7828", size = 88036, upload-time = "2025-08-12T05:52:34.784Z" }, - { url = "https://files.pythonhosted.org/packages/ca/38/2e1785df03b3d72d34fc6252d91d9d12dc27a5c89caef3335a1bbb8908ca/wrapt-1.17.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3af60380ba0b7b5aeb329bc4e402acd25bd877e98b3727b0135cb5c2efdaefe9", size = 88156, upload-time = "2025-08-12T05:52:13.599Z" }, - { url = "https://files.pythonhosted.org/packages/b3/8b/48cdb60fe0603e34e05cffda0b2a4adab81fd43718e11111a4b0100fd7c1/wrapt-1.17.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b02e424deef65c9f7326d8c19220a2c9040c51dc165cddb732f16198c168396", size = 87102, upload-time = "2025-08-12T05:52:14.56Z" }, - { url = "https://files.pythonhosted.org/packages/3c/51/d81abca783b58f40a154f1b2c56db1d2d9e0d04fa2d4224e357529f57a57/wrapt-1.17.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:74afa28374a3c3a11b3b5e5fca0ae03bef8450d6aa3ab3a1e2c30e3a75d023dc", size = 87732, upload-time = "2025-08-12T05:52:36.165Z" }, - { url = "https://files.pythonhosted.org/packages/9e/b1/43b286ca1392a006d5336412d41663eeef1ad57485f3e52c767376ba7e5a/wrapt-1.17.3-cp312-cp312-win32.whl", hash = "sha256:4da9f45279fff3543c371d5ababc57a0384f70be244de7759c85a7f989cb4ebe", size = 36705, upload-time = "2025-08-12T05:53:07.123Z" }, - { url = "https://files.pythonhosted.org/packages/28/de/49493f962bd3c586ab4b88066e967aa2e0703d6ef2c43aa28cb83bf7b507/wrapt-1.17.3-cp312-cp312-win_amd64.whl", hash = "sha256:e71d5c6ebac14875668a1e90baf2ea0ef5b7ac7918355850c0908ae82bcb297c", size = 38877, upload-time = "2025-08-12T05:53:05.436Z" }, - { url = "https://files.pythonhosted.org/packages/f1/48/0f7102fe9cb1e8a5a77f80d4f0956d62d97034bbe88d33e94699f99d181d/wrapt-1.17.3-cp312-cp312-win_arm64.whl", hash = "sha256:604d076c55e2fdd4c1c03d06dc1a31b95130010517b5019db15365ec4a405fc6", size = 36885, upload-time = "2025-08-12T05:52:54.367Z" }, - { url = "https://files.pythonhosted.org/packages/fc/f6/759ece88472157acb55fc195e5b116e06730f1b651b5b314c66291729193/wrapt-1.17.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a47681378a0439215912ef542c45a783484d4dd82bac412b71e59cf9c0e1cea0", size = 54003, upload-time = "2025-08-12T05:51:48.627Z" }, - { url = "https://files.pythonhosted.org/packages/4f/a9/49940b9dc6d47027dc850c116d79b4155f15c08547d04db0f07121499347/wrapt-1.17.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:54a30837587c6ee3cd1a4d1c2ec5d24e77984d44e2f34547e2323ddb4e22eb77", size = 39025, upload-time = "2025-08-12T05:51:37.156Z" }, - { url = 
"https://files.pythonhosted.org/packages/45/35/6a08de0f2c96dcdd7fe464d7420ddb9a7655a6561150e5fc4da9356aeaab/wrapt-1.17.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:16ecf15d6af39246fe33e507105d67e4b81d8f8d2c6598ff7e3ca1b8a37213f7", size = 39108, upload-time = "2025-08-12T05:51:58.425Z" }, - { url = "https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6fd1ad24dc235e4ab88cda009e19bf347aabb975e44fd5c2fb22a3f6e4141277", size = 88072, upload-time = "2025-08-12T05:52:37.53Z" }, - { url = "https://files.pythonhosted.org/packages/78/f2/efe19ada4a38e4e15b6dff39c3e3f3f73f5decf901f66e6f72fe79623a06/wrapt-1.17.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ed61b7c2d49cee3c027372df5809a59d60cf1b6c2f81ee980a091f3afed6a2d", size = 88214, upload-time = "2025-08-12T05:52:15.886Z" }, - { url = "https://files.pythonhosted.org/packages/40/90/ca86701e9de1622b16e09689fc24b76f69b06bb0150990f6f4e8b0eeb576/wrapt-1.17.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:423ed5420ad5f5529db9ce89eac09c8a2f97da18eb1c870237e84c5a5c2d60aa", size = 87105, upload-time = "2025-08-12T05:52:17.914Z" }, - { url = "https://files.pythonhosted.org/packages/fd/e0/d10bd257c9a3e15cbf5523025252cc14d77468e8ed644aafb2d6f54cb95d/wrapt-1.17.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e01375f275f010fcbf7f643b4279896d04e571889b8a5b3f848423d91bf07050", size = 87766, upload-time = "2025-08-12T05:52:39.243Z" }, - { url = "https://files.pythonhosted.org/packages/e8/cf/7d848740203c7b4b27eb55dbfede11aca974a51c3d894f6cc4b865f42f58/wrapt-1.17.3-cp313-cp313-win32.whl", hash = "sha256:53e5e39ff71b3fc484df8a522c933ea2b7cdd0d5d15ae82e5b23fde87d44cbd8", size = 36711, upload-time = "2025-08-12T05:53:10.074Z" }, - { url = "https://files.pythonhosted.org/packages/57/54/35a84d0a4d23ea675994104e667ceff49227ce473ba6a59ba2c84f250b74/wrapt-1.17.3-cp313-cp313-win_amd64.whl", hash = "sha256:1f0b2f40cf341ee8cc1a97d51ff50dddb9fcc73241b9143ec74b30fc4f44f6cb", size = 38885, upload-time = "2025-08-12T05:53:08.695Z" }, - { url = "https://files.pythonhosted.org/packages/01/77/66e54407c59d7b02a3c4e0af3783168fff8e5d61def52cda8728439d86bc/wrapt-1.17.3-cp313-cp313-win_arm64.whl", hash = "sha256:7425ac3c54430f5fc5e7b6f41d41e704db073309acfc09305816bc6a0b26bb16", size = 36896, upload-time = "2025-08-12T05:52:55.34Z" }, - { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" }, -] - [[package]] name = "xlrd" version = "2.0.2" @@ -3637,3 +3365,18 @@ sdist = { url = "https://files.pythonhosted.org/packages/d6/67/14be68a7bad15eecd wheels = [ { url = "https://files.pythonhosted.org/packages/1a/71/9de7229515a53d1cc5705ca9c411530f711a2242f962214d9dbfe2741aa4/zarr-3.1.3-py3-none-any.whl", hash = "sha256:45f67f87f65f14fa453f99dd8110a5936b7ac69f3a21981d33e90407c80c302a", size = 276427, upload-time = "2025-09-18T19:32:40.042Z" }, ] + +[[package]] +name = "zfpkm" +version = "1.0.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "loguru" }, + { name = "matplotlib" }, + { name = "numpy" }, + { name = "pandas" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/e3/7f/ff714f85601cd66439f2beed0d740772509b32b8be5a8b01a53652248714/zfpkm-1.0.3.tar.gz", hash = "sha256:58830ea61e6adc0c75f28d5304885bd03a33a6e9e56aa693856cbb37e30a5046", size = 15410, upload-time = "2025-11-10T16:47:45.614Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/f8/ef2baeaf2d15682d5d663c3f5165b63abad114fc0c4cd90b67b1ed0a6456/zfpkm-1.0.3-py3-none-any.whl", hash = "sha256:085007f97e75e50d686677ee28e3fceba5fc19958b35e9fbad3756ca2302a219", size = 17841, upload-time = "2025-11-10T16:47:44.805Z" }, +] From 5afa6f3f627cc00ab7f23e5cac938ed6c0fefb29 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:32:02 -0600 Subject: [PATCH 23/26] fix: rename count to quant in testing files Signed-off-by: Josh Loecker --- tests/unit/test_rnaseq_preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_rnaseq_preprocess.py b/tests/unit/test_rnaseq_preprocess.py index 6b3419c2..19ff9cab 100644 --- a/tests/unit/test_rnaseq_preprocess.py +++ b/tests/unit/test_rnaseq_preprocess.py @@ -52,9 +52,9 @@ def test_sample_name_from_filepath(any_como_input_filepath: Path): def test_organize_gene_counts_files(como_input_data_directory: Path): metric: _StudyMetrics for metric in _organize_gene_counts_files(como_input_data_directory): - assert len(metric.sample_names) == metric.num_samples == len(metric.count_files) == len(metric.strand_files) + assert len(metric.sample_names) == metric.num_samples == len(metric.quant_files) == len(metric.strand_files) - for file in metric.count_files: + for file in metric.quant_files: assert f"/{metric.study_name}/" in file.as_posix() assert "geneCounts" in file.as_posix() assert file.suffix == ".tab" From 351e93c597924f92bd4107bdd0cfad40f3bf9d84 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:32:49 -0600 Subject: [PATCH 24/26] feat: add single cell normalization using scanpy defaults Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 116 +++++++++++++++++++++++++++++++++++----- 1 file changed, 104 insertions(+), 12 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 4c522e3c..327c6ba5 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -565,8 +565,93 @@ def zfpkm_filter( # determine which genes are confidently expressed top_samples = round(high_confidence_sample_expression * len(zfpkm_df.columns)) top_func = k_over_a(top_samples, cut_off) - top_genes: npt.NDArray[bool] = genefilter(zfpkm_df, top_func) - metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(zfpkm_df.index, top_genes, strict=True) if keep] + top_genes: npt.NDArray[np.bool] = genefilter(zfpkm_df, top_func) + metric.high_confidence_entrez_gene_ids = [ + gene for gene, keep in zip(zfpkm_df.index, top_genes, strict=True) if keep + ] + + return metrics + + +def umi_filter( + metrics: NamedMetrics, + filtering_options: _FilteringOptions, + target_sum: int = 10_000, + perform_normalization: bool = False, +) -> NamedMetrics: + """Perform UMI-based filtering. + + UMI filtering uses ScanPy's built-in `sc.pp.scale` (if `perform_normalization=True`) + Otherwise, this function assumes that data has been pre-normalized+scaled beforehand and will evaluate expressed & highly expressed genes directly + + For each metric's matrix: + - The rows are genomic identifiers (gene symbol, entrez gene id, ensembl gene id, etc.) 
+    - The columns are gene identifiers (gene symbol, entrez gene id, ensembl gene id, etc.)
+
+    Calculating counts per cell is therefore a row-wise sum (axis=1).
+
+    :param metrics: The metrics to perform UMI filtering on.
+    :param filtering_options: Options for filtering the count matrix.
+    :param target_sum: The target sum for UMI normalization.
+    :param perform_normalization: Whether to perform normalization before filtering.
+
+    :returns: The filtered metrics.
+    """
+    min_sample_expression = filtering_options.replicate_ratio
+    high_confidence_sample_expression = filtering_options.high_replicate_ratio
+    cut_off = filtering_options.cut_off
+
+    if min_sample_expression > 0.20:
+        logger.warning(
+            "Setting a minimum sample expression greater than ~20% for UMI-based filtering will likely result in very few to no genes being marked as active. "  # noqa: E501
+            "Activity values ranging from 10-20% are recommended based on recent literature. "
+            f"Got: {min_sample_expression} for option 'replicate_ratio'."
+        )
+    if high_confidence_sample_expression > 0.40:
+        logger.warning(
+            "Setting high-confidence expression greater than ~40% for UMI-based filtering will likely result in very few to no genes being marked as highly active. "  # noqa: E501
+            "Activity values ranging from 20-30% are recommended based on recent literature. "
+            f"Got: {high_confidence_sample_expression} for option 'high_replicate_ratio'."
+        )
+
+    for metric in metrics.values():
+        metric: _StudyMetrics
+        if not isinstance(metric.count_matrix, sc.AnnData):
+            raise TypeError(f"Expected a scanpy.AnnData for UMI filtering, got: '{type(metric.count_matrix)}'")
+        adata: sc.AnnData = metric.count_matrix
+
+        if perform_normalization:
+            if adata.raw is not None:
+                # restore raw counts so normalization is not stacked on an earlier transform
+                adata.X = adata.raw.X.copy()
+            sc.pp.filter_cells(adata, min_genes=20)
+            sc.pp.filter_genes(adata, min_cells=1)
+            sc.pp.normalize_total(adata, target_sum=target_sum)
+            sc.pp.log1p(adata)
+            # sc.pp.scale(adata, max_value=15)  # would clip abs(values) > 15 standard deviations to +/-15
+
+        # log1p-normalized matrix; z-scaling via `sc.pp.scale` is currently disabled above
+        metric.z_score_matrix = adata
+
+        adata_x = adata.X
+        n_cells, n_genes = adata.shape
+
+        # a gene is marked active when at least `min_samples` cells express it above `cut_off`
+        min_samples: int = round(min_sample_expression * n_cells)
+        min_func = k_over_a(min_samples, cut_off)
+        min_genes_mask = np.zeros(n_genes, dtype=bool)
+        for j in range(n_genes):
+            col = adata_x.getcol(j).toarray().ravel() if sparse.issparse(adata_x) else adata_x[:, j]
+            min_genes_mask[j] = min_func(col)
+        metric.entrez_gene_ids = (
+            adata.var.loc[min_genes_mask, "entrez_gene_id"].dropna().tolist()
+        )  # at this point we do not need/want NA entrez IDs
+
+        top_samples = round(high_confidence_sample_expression * n_cells)
+        top_func = k_over_a(top_samples, cut_off)
+        top_genes_mask = np.zeros(n_genes, dtype=bool)
+        for j in range(n_genes):
+            col = adata_x.getcol(j).toarray().ravel() if sparse.issparse(adata_x) else adata_x[:, j]
+            top_genes_mask[j] = top_func(col)
+        metric.high_confidence_entrez_gene_ids = adata.var.loc[top_genes_mask, "entrez_gene_id"].dropna().tolist()
 
     return metrics
 
 
@@ -581,6 +666,8 @@ def filter_counts(
     force_zfpkm_plot: bool,
     zfpkm_min_peak_height: float,
     zfpkm_min_peak_distance: int,
+    umi_target_sum: int = 10_000,
+    umi_perform_normalization: bool = False,
     output_zfpkm_plot_dirpath: Path | None = None,
     force_negative_to_zero: bool = False,
 ) -> NamedMetrics:
@@ -600,12 +687,13 @@ def filter_counts(
     :param force_negative_to_zero: Should negative values be forcibly set to 0?
This could happen as a result of normalization producing negative near-zero values (e.g., -0.001) - Returns: - A dictionary of filtered study metrics. + :returns: A dictionary of filtered study metrics. """ match technique: case FilteringTechnique.CPM: - return cpm_filter(context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep) + return cpm_filter( + context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep + ) case FilteringTechnique.TPM: return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options) case FilteringTechnique.ZFPKM: @@ -620,15 +708,11 @@ def filter_counts( force_negative_to_zero=force_negative_to_zero, ) case FilteringTechnique.UMI: - # UMI filtering is the same as zFPKM filtering without calculating FPKM - return zfpkm_filter( + return umi_filter( metrics=metrics, filtering_options=filtering_options, - calculate_fpkm=False, - force_zfpkm_plot=force_zfpkm_plot, - min_peak_height=zfpkm_min_peak_height, - min_peak_distance=zfpkm_min_peak_distance, - output_png_dirpath=output_zfpkm_plot_dirpath, + target_sum=umi_target_sum, + perform_normalization=umi_perform_normalization, ) case _: _log_and_raise_error( @@ -655,6 +739,8 @@ async def _process( force_zfpkm_plot: bool, zfpkm_min_peak_height: float, zfpkm_min_peak_distance: int, + umi_target_sum: int, + umi_perform_normalization: bool, output_boolean_activity_filepath: Path, output_zscore_normalization_filepath: Path, output_zfpkm_plot_dirpath: Path | None, @@ -688,6 +774,8 @@ async def _process( force_zfpkm_plot=force_zfpkm_plot, zfpkm_min_peak_height=zfpkm_min_peak_height, zfpkm_min_peak_distance=zfpkm_min_peak_distance, + umi_target_sum=umi_target_sum, + umi_perform_normalization=umi_perform_normalization, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, force_negative_to_zero=force_negative_to_zero, ) @@ -807,6 +895,8 @@ async def rnaseq_gen( # noqa: C901 :param technique: The filtering technique to use :param zfpkm_min_peak_height: The height of the zFPKM peak :param zfpkm_min_peak_distance: The distance of the zFPKM peak + :param umi_target_sum: The target sum for UMI normalization + :param umi_perform_normalization: Should UMI normalization be performed? :param cutoff: The cutoff value to use for the provided filtering technique :param force_zfpkm_plot: If too many samples exist, should plotting be done anyway? 
:param log_level: The level of logging to output @@ -908,6 +998,8 @@ async def rnaseq_gen( # noqa: C901 force_zfpkm_plot=force_zfpkm_plot, zfpkm_min_peak_height=zfpkm_min_peak_height, zfpkm_min_peak_distance=zfpkm_min_peak_distance, + umi_target_sum=umi_target_sum, + umi_perform_normalization=umi_perform_normalization, output_boolean_activity_filepath=output_boolean_activity_filepath, output_zscore_normalization_filepath=output_zscore_normalization_filepath, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, From 2fd9249fe5d7055a3637b2f6b39f8b1ebd81bb90 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:44:25 -0600 Subject: [PATCH 25/26] fix: test new quant information Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 2 +- tests/unit/test_rnaseq_preprocess.py | 35 ++++++++++++++-------------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 06f33922..967236b7 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -46,7 +46,7 @@ def build_from_sf(cls, filepath: Path) -> _QuantInformation: level=LogLevel.ERROR, ) - sample_name = filepath.stem.removesuffix("_quant.genes.sf") + sample_name = filepath.stem.removesuffix("_quant.genes") df = pd.read_csv( io.StringIO(filepath.read_text()), sep="\t", diff --git a/tests/unit/test_rnaseq_preprocess.py b/tests/unit/test_rnaseq_preprocess.py index 19ff9cab..7057f3c1 100644 --- a/tests/unit/test_rnaseq_preprocess.py +++ b/tests/unit/test_rnaseq_preprocess.py @@ -7,8 +7,8 @@ from como.rnaseq_preprocess import ( _organize_gene_counts_files, _process_first_multirun_sample, + _QuantInformation, _sample_name_from_filepath, - _STARinformation, _StudyMetrics, ) @@ -22,26 +22,25 @@ ) -class TestSTARInformation: - valid_data = Path("main/data/COMO_input/naiveB/geneCounts/S1/naiveB_S1R1.tab").resolve() - invalid_data = Path("main/data/COMO_input/naiveB/fragmentSizes/S1/naiveB_S1R1_fragment_size.txt").resolve() +class TestQuantInformation: + valid_data = Path("main/data/COMO_input/naiveB/quantification/S1/naiveB_S1R1_quant.genes.sf").resolve() + invalid_data = Path("main/data/COMO_input/naiveB/strandedness/S1/naiveB_S1R1_strandedness.txt").resolve() - @pytest.mark.asyncio - async def test_build_from_tab_valid_file(self) -> None: - """Validate building STAR information object.""" - star: _STARinformation = await _STARinformation.build_from_tab(TestSTARInformation.valid_data) + def test_build_from_sf_valid_file(self) -> None: + quant: _QuantInformation = _QuantInformation.build_from_sf(TestQuantInformation.valid_data) + assert len(quant.gene_names) == len(quant.count_matrix) == 78900 + assert quant.sample_name == "naiveB_S1R1" + assert quant.filepath.as_posix().endswith( + "/COMO/main/data/COMO_input/naiveB/quantification/S1/naiveB_S1R1_quant.genes.sf" + ) - assert len(star.gene_names) == len(star.count_matrix) == 61541 - assert len(star.num_unmapped) == 3 - assert len(star.num_multimapping) == 3 - assert len(star.num_no_feature) == 3 - assert len(star.num_ambiguous) == 3 + def test_build_from_sf_invalid_file(self): + with pytest.raises(ValueError, match=r"Building quantification information requires a '.sf' file; received: "): + _QuantInformation.build_from_sf(TestQuantInformation.invalid_data) - @pytest.mark.asyncio - async def test_build_from_tab_invalid_file(self): - """Validate error on invalid file.""" - with pytest.raises(ValueError, match=r"Building STAR information requires a '\.tab' file"): - await 
_STARinformation.build_from_tab(TestSTARinformation.invalid_data)
+    def test_build_from_missing_file(self):
+        with pytest.raises(FileNotFoundError, match=r"Unable to find the .sf file: "):
+            _QuantInformation.build_from_sf(Path("missing_file.txt"))
 
 
 def test_sample_name_from_filepath(any_como_input_filepath: Path):

From 12b04255e7b86c697ed5f95ba0e0578327d4d672 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:48:39 -0600
Subject: [PATCH 26/26] chore: use quant files instead of strand files

Signed-off-by: Josh Loecker
---
 tests/fixtures/collect_files.py      | 11 ++++++-----
 tests/unit/test_rnaseq_preprocess.py |  5 ++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/fixtures/collect_files.py b/tests/fixtures/collect_files.py
index 93d59e70..71ca1a9d 100644
--- a/tests/fixtures/collect_files.py
+++ b/tests/fixtures/collect_files.py
@@ -5,7 +5,8 @@ from _pytest.fixtures import SubRequest
 
 _fragment_size_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*fragment_size*.txt"))
-_gene_count_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*.tab"))
+_quant_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*.sf"))
+# _gene_count_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*.tab"))
 _insert_size_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*_insert_size.txt"))
 _layout_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*_layout.txt"))
 _preparation_method_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*_prep_method.txt"))
@@ -27,14 +28,14 @@ def fragment_size_filepath(request: SubRequest) -> Path:
     return request.param
 
 
-@pytest.fixture(params=_gene_count_filepaths)
-def gene_count_filepath(request: SubRequest) -> Path:
+@pytest.fixture(params=_quant_filepaths)
+def quant_filepath(request: SubRequest) -> Path:
     return request.param
 
 
 @pytest.fixture
 def all_gene_count_filepaths() -> list[Path]:
-    return _gene_count_filepaths
+    return _quant_filepaths
 
 
 @pytest.fixture(params=_insert_size_filepaths)
@@ -62,7 +63,7 @@ def strand_filepath(request: SubRequest) -> Path:
     file
     for filepaths in [
         _fragment_size_filepaths,
-        _gene_count_filepaths,
+        _quant_filepaths,
         _insert_size_filepaths,
         _layout_filepaths,
         _preparation_method_filepaths,

diff --git a/tests/unit/test_rnaseq_preprocess.py b/tests/unit/test_rnaseq_preprocess.py
index 7057f3c1..20b2fcb3 100644
--- a/tests/unit/test_rnaseq_preprocess.py
+++ b/tests/unit/test_rnaseq_preprocess.py
@@ -64,9 +64,8 @@ def test_organize_gene_counts_files(como_input_data_directory: Path):
         assert file.suffix == ".txt"
 
 
-@pytest.mark.asyncio
-async def test_process_first_multirun_sample(strand_filepath: Path, all_gene_count_filepaths: list[Path]):
-    result: pd.DataFrame = await _process_first_multirun_sample(strand_filepath, all_gene_count_filepaths)
+def test_process_first_multirun_sample(strand_filepath: Path, all_gene_count_filepaths: list[Path]):
+    result: pd.DataFrame = _process_first_multirun_sample(strand_filepath, all_gene_count_filepaths)
     assert result.columns[0] == "ensembl_gene_id"
     assert len(result.columns) == 2
     assert result.columns.tolist()[1] in strand_filepath.as_posix()
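
For reference, a minimal, self-contained sketch of the k-over-A activity test that `umi_filter` applies per gene in PATCH 24. The `k_over_a` helper below is a stand-in with the genefilter-style semantics the patch assumes (true when at least `k` values exceed the cutoff `a`); the real helper in `main/como/rnaseq_gen.py` may differ in detail.

import numpy as np


def k_over_a(k: int, a: float):
    """Return a filter that is True when at least `k` values exceed `a` (assumed semantics)."""

    def _filter(values: np.ndarray) -> bool:
        return int(np.sum(values > a)) >= k

    return _filter


# Toy cells-x-genes matrix (5 cells, 3 genes) of log1p-normalized values.
matrix = np.array(
    [
        [0.0, 1.2, 0.0],
        [0.0, 0.9, 0.1],
        [0.3, 1.1, 0.0],
        [0.0, 0.8, 0.0],
        [0.0, 1.5, 0.2],
    ]
)
n_cells, n_genes = matrix.shape
min_func = k_over_a(round(0.15 * n_cells), 0.5)  # "active" in at least ~15% of cells
active = [min_func(matrix[:, j]) for j in range(n_genes)]
print(active)  # [False, True, False] -- only the middle gene passes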
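
The "scanpy defaults" named in PATCH 24's subject amount to library-size normalizing each cell to `target_sum` and then applying log1p. A runnable sketch on a toy matrix, assuming `scanpy` and `anndata` are installed; the simulated counts, seed, and thresholds are illustrative only and mirror the filters used in the patch.

import anndata as ad
import numpy as np
import scanpy as sc

rng = np.random.default_rng(0)
# 50 cells x 30 genes of simulated UMI counts (rows are cells, matching AnnData's convention)
adata = ad.AnnData(rng.poisson(2.0, size=(50, 30)).astype(np.float32))
sc.pp.filter_cells(adata, min_genes=20)  # drop cells expressing fewer than 20 genes
sc.pp.filter_genes(adata, min_cells=1)   # drop genes detected in no cell
sc.pp.normalize_total(adata, target_sum=10_000)
print(np.asarray(adata.X.sum(axis=1))[:3])  # each cell now sums to ~10,000
sc.pp.log1p(adata)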
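
A standalone check of the sample-name fix in PATCH 25: `Path.stem` already strips the final `.sf` suffix, so `removesuffix` must trim `_quant.genes` rather than `_quant.genes.sf`. This is pure stdlib behavior; the filename follows the test data used above.

from pathlib import Path

p = Path("naiveB_S1R1_quant.genes.sf")
assert p.suffix == ".sf"                    # .stem drops only this last suffix
assert p.stem == "naiveB_S1R1_quant.genes"  # ".genes" survives .stem
assert p.stem.removesuffix("_quant.genes") == "naiveB_S1R1"
print(p.stem.removesuffix("_quant.genes"))  # -> naiveB_S1R1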