From 346f794df34133a41458298ebbf9cecbe577eef9 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 15:56:05 -0600 Subject: [PATCH 01/26] fix(fpkm): update imports for zFPKM calculation improvements Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 60457fb9..a7e44e20 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -1,19 +1,23 @@ from __future__ import annotations import asyncio +import csv import functools +import io import json import re import sys +from collections.abc import Sequence from dataclasses import asdict, dataclass, field -from io import TextIOWrapper from itertools import chain from pathlib import Path from typing import Final, Literal, cast -import aiofiles import numpy as np +import numpy.typing as npt import pandas as pd +import pandera.pandas as pa +import pandera.typing.pandas as pat from fast_bioservices.biothings.mygene import MyGene from fast_bioservices.pipeline import gene_symbol_to_ensembl_and_gene_id from loguru import logger From 985c6f23ec033a84cf01b5caa8f2af677ab9a7d3 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:01:36 -0600 Subject: [PATCH 02/26] fix(fpkm): use Salmon quantification instead of STAR quantification Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 428 ++++++++++++++++++--------------- 1 file changed, 230 insertions(+), 198 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index a7e44e20..6835423e 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -27,71 +27,45 @@ @dataclass -class _STARinformation: - num_unmapped: list[int] - num_multimapping: list[int] - num_no_feature: list[int] - num_ambiguous: list[int] +class _QuantInformation: gene_names: list[str] count_matrix: pd.DataFrame - - @property - def num_genes(self) -> int: - return len(self.count_matrix) + sample_name: str + filepath: Path @classmethod - async def build_from_tab(cls, filepath: Path) -> _STARinformation: - if filepath.suffix != ".tab": + def build_from_sf(cls, filepath: Path) -> _QuantInformation: + if filepath.suffix != ".sf": _log_and_raise_error( - f"Building STAR information requires a '.tab' file; received: '{filepath}'", + f"Building quantification information requires a '.sf' file; received: '{filepath}'", error=ValueError, level=LogLevel.ERROR, ) if not filepath.exists(): _log_and_raise_error( - f"Unable to find the .tab file '{filepath}'", + f"Unable to find the .sf file: {filepath}", error=FileNotFoundError, level=LogLevel.ERROR, ) - async with aiofiles.open(filepath) as i_stream: - # Cannot use `asyncio.gather()` here because the order of execution is not guaranteed - unmapped = await i_stream.readline() - multimapping = await i_stream.readline() - no_feature = await i_stream.readline() - ambiguous = await i_stream.readline() - - num_unmapped = [int(i) for i in unmapped.removesuffix("\n").split("\t")[1:]] - num_multimapping = [int(i) for i in multimapping.removesuffix("\n").split("\t")[1:]] - num_no_feature = [int(i) for i in no_feature.removesuffix("\n").split("\t")[1:]] - num_ambiguous = [int(i) for i in ambiguous.removesuffix("\n").split("\t")[1:]] - - df = await _read_file( - path=filepath, + sample_name = filepath.stem.removesuffix("_quant.genes.sf") + df = pd.read_csv( + io.StringIO(filepath.read_text()), sep="\t", - header=None, - skiprows=4, - 
names=[ - "ensembl_gene_id", - "unstranded_rna_counts", - "first_read_transcription_strand", - "second_read_transcription_strand", - ], + names=["ensembl_gene_id", "length", "effective_length", "tpm", sample_name], ) - return _STARinformation( - num_unmapped=num_unmapped, - num_multimapping=num_multimapping, - num_no_feature=num_no_feature, - num_ambiguous=num_ambiguous, - gene_names=df["ensembl_gene_id"].values.tolist(), + return cls( + gene_names=df["ensembl_gene_id"].to_list(), count_matrix=df, + sample_name=sample_name, + filepath=filepath, ) @dataclass class _StudyMetrics: study_name: str - count_files: list[Path] + quant_files: list[Path] strand_files: list[Path] __sample_names: list[str] = field(default_factory=list) __num_samples: int = 0 @@ -105,24 +79,24 @@ def num_samples(self): return self.__num_samples def __post_init__(self): - self.__num_samples = len(self.count_files) - self.__sample_names = [f.stem for f in self.count_files] + self.__num_samples = len(self.quant_files) + self.__sample_names = [f.stem for f in self.quant_files] - if len(self.count_files) != len(self.strand_files): + if len(self.quant_files) != len(self.strand_files): _log_and_raise_error( ( f"Unequal number of count files and strand files for study '{self.study_name}'. " - f"Found {len(self.count_files)} count files and {len(self.strand_files)} strand files." + f"Found {len(self.quant_files)} count files and {len(self.strand_files)} strand files." ), error=ValueError, level=LogLevel.ERROR, ) - if self.num_samples != len(self.count_files): + if self.num_samples != len(self.quant_files): _log_and_raise_error( ( f"Unequal number of samples and count files for study '{self.study_name}'. " - f"Found {self.num_samples} samples and {len(self.count_files)} count files." + f"Found {self.num_samples} samples and {len(self.quant_files)} count files." ), error=ValueError, level=LogLevel.ERROR, @@ -145,7 +119,7 @@ def __post_init__(self): level=LogLevel.ERROR, ) - self.count_files.sort() + self.quant_files.sort() self.strand_files.sort() self.__sample_names.sort() @@ -153,39 +127,68 @@ def __post_init__(self): @dataclass(slots=True) class SampleConfiguration: sample_name: str - fragment_length: float + effective_lengths: pd.DataFrame + mean_effective_length: float layout: str strand: str study: str library_prep: str + def __post_init__(self): + if len(self.effective_lengths.columns) > 2: + _log_and_raise_error( + message=f"Effective lengths dataframe for sample '{self.sample_name}' has more than 2 columns, expected 'name' and 'effective_length'", + error=ValueError, + level=LogLevel.ERROR, + ) + if "name" not in self.effective_lengths.columns: + _log_and_raise_error( + message=f"Effective lengths dataframe for sample '{self.sample_name}' is missing 'name' column", + error=ValueError, + level=LogLevel.ERROR, + ) + if "effective_length" not in self.effective_lengths.columns: + _log_and_raise_error( + message=f"Effective lengths dataframe for sample '{self.sample_name}' is missing 'effective_length' column", + error=ValueError, + level=LogLevel.ERROR, + ) -async def _read_text(path: Path | None, *, default: str, lower: bool = False) -> str: - if path is None: - return default - async with aiofiles.open(path) as f: - txt = (await f.read()).strip() - return txt.lower() if lower else txt + @classmethod + def to_dataframe(cls, samples: list[SampleConfiguration]) -> tuple[pd.DataFrame, pd.DataFrame]: + """Convert a list of SampleConfiguration to a dataframe. 
+ + :param samples: The list of SampleConfiguration objects to convert. + :return: A tuple of dataframes: + [0]: The sample configuration as a dataframe + [1]: The effective lengths as a separate data frame with `same_name` as columns + """ + config = pd.DataFrame( + columns=["sample_name", "mean_effective_length", "layout", "strand", "study", "library_prep"] + ) + genes = set() + for s in samples: + genes.update(s.effective_lengths["name"].to_list()) -def _sample_name_from_filepath(file: Path) -> str: - return re.search(r".+_S\d+R\d+(r\d+)?", file.stem).group() + lengths = pd.DataFrame(data=np.float64(0.0), columns=[s.sample_name for s in samples], index=list(genes)) + for sample in samples: + ids: list[str] = sample.effective_lengths["name"].to_list() + data: npt.NDArray[np.floating] = sample.effective_lengths["effective_length"].to_numpy(dtype=np.float64) + lengths.loc[ids, sample.sample_name] = data + return config, lengths -def _require_one(paths: list[Path], kind: Literal["layout", "strand", "preparation", "fragment"], label: str) -> Path | None: - if len(paths) == 1: - return paths[0] - if len(paths) == 0: - return None - _log_and_raise_error( - f"Multiple matching {kind} files for {label}, make sure there is only one copy for each replicate in COMO_input", - error=ValueError, - level=LogLevel.ERROR, - ) - return None # explicit return None to satisfy type-check + +def _sample_name_from_filepath(file: Path) -> str: + return re.search(r".+_S\d+R\d+(r\d+)?", file.stem).group() -def _require_one(paths: list[Path], kind: Literal["layout", "strand", "preparation", "fragment"], label: str) -> Path | None: +def _require_one( + paths: list[Path], + kind: Literal["layout", "strand", "preparation", "fragment"], + label: str, +) -> Path | None: if len(paths) == 1: return paths[0] if len(paths) == 0: @@ -195,22 +198,28 @@ def _require_one(paths: list[Path], kind: Literal["layout", "strand", "preparati error=ValueError, level=LogLevel.ERROR, ) - return None # explicit return None to satisfy type-check def _organize_gene_counts_files(data_dir: Path) -> list[_StudyMetrics]: - gene_count_dir = Path(data_dir, "geneCounts") + quant_dir = Path(data_dir, "quantification") strand_dir = Path(data_dir, "strandedness") - gene_counts_directories: list[Path] = sorted([p for p in gene_count_dir.glob("*") if not p.name.startswith(".")]) + if not quant_dir.exists(): + raise FileNotFoundError(f"Quantification directory not found: {quant_dir}") + + if not strand_dir.exists(): + raise FileNotFoundError(f"Strandedness directory not found: {strand_dir}") + + quantification_directories: list[Path] = sorted([p for p in quant_dir.glob("*") if not p.name.startswith(".")]) strandedness_directories: list[Path] = sorted([p for p in strand_dir.glob("*") if not p.name.startswith(".")]) - if len(gene_counts_directories) != len(strandedness_directories): + if len(quantification_directories) != len(strandedness_directories): _log_and_raise_error( ( - f"Unequal number of gene count directories and strandedness directories. " - f"Found {len(gene_counts_directories)} gene count directories and {len(strandedness_directories)} strandedness directories." - f"\nGene count directory: {gene_count_dir}\nStrandedness directory: {strand_dir}" + f"Unequal number of quantification directories and strandedness directories. " + f"Found {len(quantification_directories)} quantification directories and " + f"{len(strandedness_directories)} strandedness directories." 
+                f"\nQuantification directory: {quant_dir}\nStrandedness directory: {strand_dir}"
             ),
             error=ValueError,
             level=LogLevel.ERROR,
@@ -218,49 +227,34 @@ def _organize_gene_counts_files(data_dir: Path) -> list[_StudyMetrics]:
 
     # For each study, collect gene count files, fragment files, insert size files, layouts, and strandedness information
     study_metrics: list[_StudyMetrics] = []
-    for gene_dir, strand_dir in zip(gene_counts_directories, strandedness_directories, strict=True):
-        count_files = list(gene_dir.glob("*.tab"))
+    for quant, strand_dir in zip(quantification_directories, strandedness_directories, strict=True):
+        quant_files = list(quant.glob("*_quant.genes.sf"))
         strand_files = list(strand_dir.glob("*.txt"))
-        if len(count_files) == 0:
-            _log_and_raise_error(f"No count files found for study '{gene_dir.stem}'.", error=ValueError, level=LogLevel.ERROR)
+        if len(quant_files) == 0:
+            _log_and_raise_error(f"No quant files found for study '{quant.stem}'.", error=ValueError, level=LogLevel.ERROR)
         if len(strand_files) == 0:
             _log_and_raise_error(
-                f"No strandedness files found for study '{gene_dir.stem}'.",
+                f"No strandedness files found for study '{quant.stem}'.",
                 error=ValueError,
                 level=LogLevel.ERROR,
             )
 
         study_metrics.append(
             _StudyMetrics(
-                study_name=gene_dir.stem,
-                count_files=count_files,
+                study_name=quant.stem,
+                quant_files=quant_files,
                 strand_files=strand_files,
             )
         )
     return study_metrics
 
 
-async def _process_first_multirun_sample(strand_file: Path, all_counts_files: list[Path]):
+def _process_first_multirun_sample(strand_file: Path, all_quant_files: list[Path]):
     sample_count = pd.DataFrame()
-    all_star_information: list[_STARinformation] = await asyncio.gather(*[_STARinformation.build_from_tab(file) for file in all_counts_files])
-
-    for star_information in all_star_information:
-        strand_information = strand_file.read_text().rstrip("\n").lower()
-
-        if strand_information not in ("none", "first_read_transcription_strand", "second_read_transcription_strand"):
-            _log_and_raise_error(
-                (
-                    f"Unrecognized Strand Information: {strand_information}; "
-                    f"expected 'none', 'first_read_transcription_strand', or 'second_read_transcription_strand'"
-                ),
-                error=ValueError,
-                level=LogLevel.ERROR,
-            )
+    quant_information: list[_QuantInformation] = [_QuantInformation.build_from_sf(f) for f in all_quant_files]
 
-        if strand_information == "none":
-            strand_information = "unstranded_rna_counts"
-
-        run_counts = star_information.count_matrix[["ensembl_gene_id", strand_information]]
+    for info in quant_information:
+        run_counts = info.count_matrix[["ensembl_gene_id", info.sample_name]]
         run_counts.columns = ["ensembl_gene_id", "counts"]
         sample_count = run_counts if sample_count.empty else sample_count.merge(run_counts, on=["ensembl_gene_id", "counts"], how="outer")
@@ -274,63 +268,48 @@ async def _process_standard_replicate(counts_file: Path, strand_file: Path, sample_name: str):
-    star_information = await _STARinformation.build_from_tab(counts_file)
-    strand_information = strand_file.read_text().rstrip("\n").lower()
-
-    if strand_information not in ("none", "first_read_transcription_strand", "second_read_transcription_strand"):
-        _log_and_raise_error(
-            (
-                f"Unrecognized Strand Information: {strand_information}; "
-                f"expected 'none', 'first_read_transcription_strand', or 'second_read_transcription_strand'"
-            ),
-            error=ValueError,
-            level=LogLevel.ERROR,
-        )
-    if strand_information == "none":
-        strand_information = "unstranded_rna_counts"
+def _process_standard_replicate(counts_file: Path, strand_file: Path, sample_name: str):
+    quant_information = _QuantInformation.build_from_sf(counts_file)
+    return quant_information.count_matrix
 
-    sample_count = star_information.count_matrix[["ensembl_gene_id", strand_information]]
-    sample_count.columns = ["ensembl_gene_id", sample_name]
-    return sample_count
-
 
-async def _prepare_sample_counts(
+def _prepare_sample_counts(
     sample_name: str,
     counts_file: Path,
     strand_file: Path,
-    all_counts_files: list[Path],
-) -> pd.DataFrame | Literal["SKIP"]:
+    all_quant_files: list[Path],
+) -> pd.DataFrame | None:
     # Test if the counts_file is the first run in a multi-run sample
     if re.search(r"R\d+r1", counts_file.as_posix()):
-        return await _process_first_multirun_sample(strand_file=strand_file, all_counts_files=all_counts_files)
-    elif re.search(r"R\d+r\d+", counts_file.as_posix()):
-        return "SKIP"
+        return _process_first_multirun_sample(strand_file=strand_file, all_quant_files=all_quant_files)
+    elif re.search(r"R\d+r[2-9]+", counts_file.as_posix()):
+        return None
     else:
-        return await _process_standard_replicate(counts_file, strand_file, sample_name)
+        return _process_standard_replicate(counts_file, strand_file, sample_name)
 
 
-async def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame:
+def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame:
     adjusted_index = 0
-    counts: pd.DataFrame | Literal["SKIP"] = await _prepare_sample_counts(
+    counts: pd.DataFrame | None = _prepare_sample_counts(
         sample_name=metrics.sample_names[0],
-        counts_file=metrics.count_files[0],
+        counts_file=metrics.quant_files[0],
         strand_file=metrics.strand_files[0],
-        all_counts_files=metrics.count_files,
+        all_quant_files=metrics.quant_files,
     )
     for i in range(1, metrics.num_samples):
-        new_counts = await _prepare_sample_counts(
+        new_counts = _prepare_sample_counts(
             sample_name=metrics.sample_names[i],
-            counts_file=metrics.count_files[i],
+            counts_file=metrics.quant_files[i],
             strand_file=metrics.strand_files[i],
-            all_counts_files=metrics.count_files,
+            all_quant_files=metrics.quant_files,
         )
-        if isinstance(new_counts, str) and new_counts == "SKIP":
+        if new_counts is None:
            adjusted_index += 1
            continue
 
+        assert isinstance(counts, pd.DataFrame)  # noqa: S101
         counts: pd.DataFrame = counts.merge(new_counts, on="ensembl_gene_id", how="outer")
         counts = counts.fillna(value=0)
@@ -340,14 +319,19 @@ def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame:
         old_col_name = counts.columns[i + 1 - adjusted_index]
         counts.rename(columns={old_col_name: new_sample_name}, inplace=True)
 
+    if counts is None:
+        raise ValueError(f"No valid counts were processed for study '{metrics.study_name}'")
+
     return counts
 
 
 async def _write_counts_matrix(
     *,
     config_df: pd.DataFrame,
+    fragment_lengths: pd.DataFrame,
     como_context_dir: Path,
     output_counts_matrix_filepath: Path,
+    output_fragment_lengths_filepath: Path,
     rna: RNAType,
 ) -> pd.DataFrame:
     """Create a counts matrix file by reading gene counts table(s).
 
     Args:
         config_df: Configuration DataFrame containing sample information.
         como_context_dir: Path to the COMO_input directory containing gene count files.
         output_counts_matrix_filepath: Path where the output counts matrix CSV will be saved.
         rna: RNAType enum indicating whether to process 'trna' or 'mrna' samples.
 
     Returns:
         A pandas DataFrame representing the final counts matrix.
""" study_metrics = _organize_gene_counts_files(data_dir=como_context_dir) - counts: list[pd.DataFrame] = await asyncio.gather(*[_create_sample_counts_matrix(metric) for metric in study_metrics]) - rna_specific_sample_names = set(config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist()) + counts: list[pd.DataFrame] = [_create_sample_counts_matrix(metric) for metric in study_metrics] + rna_specific_sample_names = set( + config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist() + ) final_matrix: pd.DataFrame = functools.reduce(lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts) final_matrix.fillna(value=0, inplace=True) - final_matrix.iloc[:, 1:] = final_matrix.iloc[:, 1:].astype(np.uint64) + final_matrix.iloc[:, 1:] = final_matrix.iloc[:, 1:].astype(int) final_matrix = cast(pd.DataFrame, final_matrix[["ensembl_gene_id", *rna_specific_sample_names]]) output_counts_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) + output_fragment_lengths_filepath.parent.mkdir(parents=True, exist_ok=True) + final_matrix.to_csv(output_counts_matrix_filepath, index=False) + fragment_lengths[rna_specific_sample_names].to_csv(output_fragment_lengths_filepath, index=True) + logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{output_counts_matrix_filepath}'") return final_matrix @@ -383,9 +373,9 @@ async def _create_config_df( # noqa: C901 gene_count_dirname: str = "geneCounts", layout_dirname: str = "layouts", strandedness_dirname: str = "strandedness", - fragment_sizes_dirname: str = "fragmentSizes", + quantification_dir: str = "quantification", prep_method_dirname: str = "prepMethods", -) -> pd.DataFrame: +) -> tuple[pd.DataFrame, pd.DataFrame]: """Create configuration sheet. The configuration file created is based on the gene counts matrix. @@ -398,15 +388,17 @@ async def _create_config_df( # noqa: C901 gene_count_dirname: Name of the subdirectory containing gene count files. layout_dirname: Name of the subdirectory containing layout files. strandedness_dirname: Name of the subdirectory containing strandedness files. - fragment_sizes_dirname: Name of the subdirectory containing fragment size files. + quantification_dir: Name of the subdirectory containing Salmon's quantification files. prep_method_dirname: Name of the subdirectory containing library preparation method files. Returns: - A pandas DataFrame representing the configuration sheet. + [0]: A pandas DataFrame representing the configuration sheet. 
+ [1]: Fragment lengths for downstream calculations """ label_regex: Final = re.compile(r"(?PS\d{1,3})(?PR\d{1,3})(?Pr\d{1,3})?") - gene_counts: list[Path] = list((como_context_dir / gene_count_dirname).rglob("*.tab")) - if not gene_counts: + quant_files: list[Path] = list((como_context_dir / quantification_dir).rglob("*.genes.sf")) + # gene_counts: list[Path] = list((como_context_dir / gene_count_dirname).rglob("*.tab")) + if not quant_files: _log_and_raise_error( f"No gene count files found in '{gene_count_dirname}'", error=FileNotFoundError, @@ -416,7 +408,7 @@ async def _create_config_df( # noqa: C901 auxillary_directories = { "layout": como_context_dir / layout_dirname, "strand": como_context_dir / strandedness_dirname, - "fragment": como_context_dir / fragment_sizes_dirname, + "quantification": como_context_dir / quantification_dir, "prep": como_context_dir / prep_method_dirname, } aux_lookup: dict[str, dict[str, Path]] = {kind: {} for kind in auxillary_directories} @@ -430,15 +422,15 @@ async def _create_config_df( # noqa: C901 aux_lookup[kind][m.group(0)] = p rows: list[SampleConfiguration] = [] - for gene_count_path in sorted(gene_counts): - m = label_regex.search(gene_count_path.as_posix()) - if not m: + for quant_file in sorted(quant_files): + m = label_regex.search(quant_file.as_posix()) + if m is None: _log_and_raise_error( - f"Filename '{gene_count_path.name}' does not match contextName_SXRYrZ.tab pattern", + f"Filename '{quant_file.name}' does not match contextName_SXRYrZ.tab pattern", error=ValueError, level=LogLevel.ERROR, ) - label = m.group(0) + label = m.group() study_number = m["study"] rep_number = m["rep"] sample_id = f"{context_name}_{study_number}{rep_number}" @@ -447,41 +439,50 @@ async def _create_config_df( # noqa: C901 strand_path = _require_one([aux_lookup["strand"].get(label)], "strand", label) prep_path = _require_one([aux_lookup["prep"].get(label)], "preparation", label) - layout, strand, prep = await asyncio.gather( - *[ - _read_text(layout_path, default="UNKNOWN"), - _read_text(strand_path, default="UNKNOWN"), - _read_text(prep_path, default="total", lower=True), - ], - ) + layout = layout_path.read_text().rstrip() + strand = strand_path.read_text().rstrip() + prep = prep_path.read_text().rstrip() if prep not in {"total", "mrna"}: _log_and_raise_error( f"Prep method must be 'total' or 'mrna' (got '{prep}') for {label}", error=ValueError, level=LogLevel.ERROR, ) + if layout == "": + _log_and_raise_error( + message=f"No layout file found for '{label}'.", + error=FileNotFoundError, + level=LogLevel.WARNING, + ) - fragment_label = f"{context_name}_{label}_fragment_size.txt" - frag_paths = [p for p in aux_lookup["fragment"].values() if p.name == fragment_label] - if not frag_paths and prep.lower() != RNAType.TRNA.value.lower(): - logger.warning(f"No fragment file for '{label}'; defaulting to 100 bp (needed for zFPKM).") - mean_frag = 100.0 - elif len(frag_paths) == 1 and layout == "single-end": - mean_frag = 0.0 - else: # 1-N files, paired end - dfs: list[pd.DataFrame] = cast( - typ=list[pd.DataFrame], - val=await asyncio.gather(*[_read_file(f, sep="\t", on_bad_lines="skip") for f in frag_paths]), + quant_paths = [p for p in aux_lookup["quantification"].values() if p.name == f"{sample_id}_quant.genes.sf"] + if ( + not quant_paths + and layout in ["paired-end", "", None] + and prep.lower() in [RNAType.TRNA.value.lower(), RNAType.MRNA.value.lower()] + ): + _log_and_raise_error( + message=f"No quantification file found for '{label}'; defaulting to 100 bp (needed 
for zFPKM).", + error=FileNotFoundError, + level=LogLevel.WARNING, ) - for df in dfs: - df["meanxcount"] = df["frag_mean"] * df["frag_count"] - counts = np.array([df["frag_count"].sum() for df in dfs]) - means = np.array([(df["meanxcount"] / df["frag_count"].sum()).sum() for df in dfs]) - mean_frag = float(np.average(means, weights=counts)) + elif len(quant_paths) == 1 and layout == "single-end": + effective_len = pd.DataFrame({"Name": [], "EffectiveLength": []}) + mean_effective_len = 0.0 # cannot compute FPKM for single-ended data + else: + df = _read_file(quant_file) + df.columns = [c.lower() for c in df.columns] + df = df.rename(columns={"effectivelength": "effective_length"}) + + effective_len = df[["name", "effective_length"]] + effective_len["effective_length"] = effective_len["effective_length"].astype(np.float64) + mean_effective_len: float = effective_len["effective_length"].sum() / len(df) + rows.append( SampleConfiguration( sample_name=sample_id, - fragment_length=mean_frag, + effective_lengths=effective_len, + mean_effective_length=mean_effective_len, layout=layout, strand=strand, study=study_number, @@ -489,8 +490,7 @@ async def _create_config_df( # noqa: C901 ) ) - df = pd.DataFrame.from_records([asdict(r) for r in rows]).sort_values("sample_name", ignore_index=True) - return df + return SampleConfiguration.to_dataframe(rows) # 6-3-25: Intentionally left commented-out code to test its replacement # gene_counts_dir = como_context_dir / gene_count_dirname @@ -735,14 +735,17 @@ async def _process_como_input( output_config_filepath: Path, como_context_dir: PATH_TYPE, output_counts_matrix_filepath: Path, + output_fragment_lengths_filepath: Path, rna: RNAType, ) -> None: - config_df = await _create_config_df(context_name, como_context_dir=como_context_dir) + config_df, fragment_lengths = _create_config_df(context_name, como_context_dir=como_context_dir) await _write_counts_matrix( config_df=config_df, + fragment_lengths=fragment_lengths, como_context_dir=como_context_dir, output_counts_matrix_filepath=output_counts_matrix_filepath, + output_fragment_lengths_filepath=output_fragment_lengths_filepath, rna=rna, ) with pd.ExcelWriter(output_config_filepath) as writer: @@ -756,6 +759,8 @@ async def _process( output_gene_info_filepath: Path, como_context_dir: Path | None, input_matrix_filepath: list[Path] | None, + output_trna_fragment_lengths_filepath: Path | None, + output_mrna_fragment_lengths_filepath: Path | None, output_trna_config_filepath: Path | None, output_mrna_config_filepath: Path | None, output_trna_matrix_filepath: Path | None, @@ -764,29 +769,50 @@ async def _process( cache: bool, create_gene_info_only: bool, ): - rna_types: list[tuple[RNAType, Path, Path]] = [] - if output_trna_config_filepath: - rna_types.append((RNAType.TRNA, output_trna_config_filepath, output_trna_matrix_filepath)) - if output_mrna_config_filepath: - rna_types.append((RNAType.MRNA, output_mrna_config_filepath, output_mrna_matrix_filepath)) + rna_types: list[tuple[RNAType, Path, Path, Path]] = [] + if output_trna_config_filepath is not None and output_trna_fragment_lengths_filepath is not None: + rna_types.append( + ( + RNAType.TRNA, + output_trna_config_filepath, + output_trna_matrix_filepath, + output_trna_fragment_lengths_filepath, + ) + ) + if output_mrna_config_filepath is not None and output_mrna_fragment_lengths_filepath is not None: + rna_types.append( + ( + RNAType.MRNA, + output_mrna_config_filepath, + output_mrna_matrix_filepath, + output_mrna_fragment_lengths_filepath, + ) + ) # if 
provided, iterate through como-input specific directories if not create_gene_info_only: - tasks = [] - for rna, output_config_filepath, output_matrix_filepath in rna_types: - tasks.append( - asyncio.create_task( - _process_como_input( - context_name=context_name, - output_config_filepath=output_config_filepath, - como_context_dir=como_context_dir, - output_counts_matrix_filepath=output_matrix_filepath, - rna=rna, - ) - ) + if como_context_dir is None: + _log_and_raise_error( + message="como_context_dir must be provided if create_gene_info_only is False", + error=ValueError, + level=LogLevel.ERROR, + ) + if output_trna_fragment_lengths_filepath is None: + _log_and_raise_error( + message="output_fragment_lengths_filepath must be provided if create_gene_info_only is False", + error=ValueError, + level=LogLevel.ERROR, ) - await asyncio.gather(*tasks) + for rna, out_config, out_matrix, out_frag_len in rna_types: + _process_como_input( + context_name=context_name, + output_config_filepath=out_config, + como_context_dir=como_context_dir, + output_counts_matrix_filepath=out_matrix, + output_fragment_lengths_filepath=out_frag_len, + rna=rna, + ) # create the gene info filepath based on provided data input_files = [] @@ -811,6 +837,8 @@ async def rnaseq_preprocess( output_gene_info_filepath: Path, como_context_dir: Path | None = None, input_matrix_filepath: Path | list[Path] | None = None, + output_trna_fragment_lengths_filepath: Path | None = None, + output_mrna_fragment_lengths_filepath: Path | None = None, output_trna_metadata_filepath: Path | None = None, output_mrna_metadata_filepath: Path | None = None, output_trna_count_matrix_filepath: Path | None = None, @@ -829,6 +857,8 @@ async def rnaseq_preprocess( :param context_name: The context/cell type being processed :param taxon: The NCBI taxonomy ID :param output_gene_info_filepath: Path to the output gene information CSV file + :param output_trna_fragment_lengths_filepath: Path to the output tRNA fragment lengths CSV file (if in "create" mode) + :param output_mrna_fragment_lengths_filepath: Path to the output mRNA fragment lengths CSV file (if in "create" mode) :param output_trna_metadata_filepath: Path to the output tRNA config file (if in "create" mode) :param output_mrna_metadata_filepath: Path to the output mRNA config file (if in "create" mode) :param output_trna_count_matrix_filepath: The path to write total RNA count matrices @@ -860,9 +890,11 @@ async def rnaseq_preprocess( input_matrix_filepath=input_matrix_filepath, output_gene_info_filepath=output_gene_info_filepath, output_trna_config_filepath=output_trna_metadata_filepath, - output_mrna_config_filepath=output_mrna_metadata_filepath, output_trna_matrix_filepath=output_trna_count_matrix_filepath, + output_trna_fragment_lengths_filepath=output_trna_fragment_lengths_filepath, + output_mrna_config_filepath=output_mrna_metadata_filepath, output_mrna_matrix_filepath=output_mrna_count_matrix_filepath, + output_mrna_fragment_lengths_filepath=output_mrna_fragment_lengths_filepath, cache=cache, create_gene_info_only=create_gene_info_only, ) From d35006351f966852431827fd58e7bbd00f3221e4 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:02:29 -0600 Subject: [PATCH 03/26] chore: ruff formatting Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 38 ++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 6835423e..7fd9ac78 100644 --- 
a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -256,7 +256,9 @@ def _process_first_multirun_sample(strand_file: Path, all_quant_files: list[Path
     for info in quant_information:
         run_counts = info.count_matrix[["ensembl_gene_id", info.sample_name]]
         run_counts.columns = ["ensembl_gene_id", "counts"]
-        sample_count = run_counts if sample_count.empty else sample_count.merge(run_counts, on=["ensembl_gene_id", "counts"], how="outer")
+        sample_count = (
+            run_counts if sample_count.empty else sample_count.merge(run_counts, on=["ensembl_gene_id", "counts"], how="outer")
+        )
 
     # Set na values to 0
     sample_count = sample_count.fillna(value="0")
@@ -336,14 +338,13 @@ async def _write_counts_matrix(
 ) -> pd.DataFrame:
     """Create a counts matrix file by reading gene counts table(s).
 
-    Args:
-        config_df: Configuration DataFrame containing sample information.
-        como_context_dir: Path to the COMO_input directory containing gene count files.
-        output_counts_matrix_filepath: Path where the output counts matrix CSV will be saved.
-        rna: RNAType enum indicating whether to process 'trna' or 'mrna' samples.
-
-    Returns:
-        A pandas DataFrame representing the final counts matrix.
+    :param config_df: Configuration DataFrame containing sample information.
+    :param fragment_lengths: DataFrame containing effective lengths for each gene and sample, used for zFPKM normalization.
+    :param como_context_dir: Path to the COMO_input directory containing gene count files.
+    :param output_counts_matrix_filepath: Path where the output counts matrix CSV will be saved.
+    :param output_fragment_lengths_filepath: Path where the output fragment lengths CSV will be saved.
+    :param rna: RNAType enum indicating whether to process 'trna' or 'mrna' samples.
+    :return: A pandas DataFrame representing the final counts matrix.
""" study_metrics = _organize_gene_counts_files(data_dir=como_context_dir) counts: list[pd.DataFrame] = [_create_sample_counts_matrix(metric) for metric in study_metrics] @@ -363,6 +364,7 @@ async def _write_counts_matrix( fragment_lengths[rna_specific_sample_names].to_csv(output_fragment_lengths_filepath, index=True) logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{output_counts_matrix_filepath}'") + return final_matrix @@ -675,9 +677,9 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]: return data["ensembl_gene_id"].tolist() try: conversion = await gene_symbol_to_ensembl_and_gene_id(symbols=data.var_names.tolist(), taxon=taxon) - except json.JSONDecodeError: + except json.JSONDecodeError as e: _log_and_raise_error( - f"Got a JSON decode error for file '{counts_matrix_filepaths}'", + f"Got a JSON decode error for file '{counts_matrix_filepaths}' ({e})", error=ValueError, level=LogLevel.CRITICAL, ) @@ -724,7 +726,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]: # we would set `entrez_gene_id` to int here as well, but not all ensembl ids are mapped to entrez ids, # and as a result, there are still "-" values in the entrez id column that cannot be converted to an integer - gene_info: pd.DataFrame = cast(pd.DataFrame, gene_info.sort_values(by="ensembl_gene_id")) + gene_info = gene_info.sort_values(by="ensembl_gene_id") output_filepath.parent.mkdir(parents=True, exist_ok=True) gene_info.to_csv(output_filepath, index=False) logger.success(f"Gene Info file written at '{output_filepath}'") @@ -733,7 +735,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]: async def _process_como_input( context_name: str, output_config_filepath: Path, - como_context_dir: PATH_TYPE, + como_context_dir: Path, output_counts_matrix_filepath: Path, output_fragment_lengths_filepath: Path, rna: RNAType, @@ -845,7 +847,7 @@ async def rnaseq_preprocess( output_mrna_count_matrix_filepath: Path | None = None, cache: bool = True, log_level: LogLevel | str = LogLevel.INFO, - log_location: str | TextIOWrapper = sys.stderr, + log_location: str | io.TextIOWrapper = sys.stderr, *, create_gene_info_only: bool = False, ) -> None: @@ -880,8 +882,12 @@ async def rnaseq_preprocess( input_matrix_filepath = [i.resolve() for i in _listify(input_matrix_filepath)] if input_matrix_filepath else None output_trna_metadata_filepath = output_trna_metadata_filepath.resolve() if output_trna_metadata_filepath else None output_mrna_metadata_filepath = output_mrna_metadata_filepath.resolve() if output_mrna_metadata_filepath else None - output_trna_count_matrix_filepath = output_trna_count_matrix_filepath.resolve() if output_trna_count_matrix_filepath else None - output_mrna_count_matrix_filepath = output_mrna_count_matrix_filepath.resolve() if output_mrna_count_matrix_filepath else None + output_trna_count_matrix_filepath = ( + output_trna_count_matrix_filepath.resolve() if output_trna_count_matrix_filepath else None + ) + output_mrna_count_matrix_filepath = ( + output_mrna_count_matrix_filepath.resolve() if output_mrna_count_matrix_filepath else None + ) await _process( context_name=context_name, From 748225015dbeb73ad29fb3c79d4d9ac23f0a7cfd Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:02:45 -0600 Subject: [PATCH 04/26] chore: fill with integers for faster processing Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py 
index 7fd9ac78..eebe1a78 100644
--- a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -261,14 +261,13 @@ def _process_first_multirun_sample(strand_file: Path, all_quant_files: list[Path
     )
 
     # Set na values to 0
-    sample_count = sample_count.fillna(value="0")
+    sample_count = sample_count.fillna(value=0)
     sample_count["counts"] = sample_count["counts"].astype(float)
 
-    count_sums = sample_count.groupby("ensembl_gene_id", as_index=False)["counts"].mean()
-    count_sums["counts"] = np.ceil(count_sums["counts"].astype(np.uint32))
-    count_sums.columns = ["ensembl_gene_id", _sample_name_from_filepath(strand_file)]
-    return count_sums
-
+    count_avg = sample_count.groupby("ensembl_gene_id", as_index=False)["counts"].mean()
+    count_avg["counts"] = np.ceil(count_avg["counts"]).astype(int)
+    count_avg.columns = ["ensembl_gene_id", _sample_name_from_filepath(strand_file)]
+    return count_avg
 
 def _process_standard_replicate(counts_file: Path, strand_file: Path, sample_name: str):

From 155c8221ccea3b4b5961151ab79a8ccaf5100d54 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:03:46 -0600
Subject: [PATCH 05/26] chore: remove unnecessary async function usage

Signed-off-by: Josh Loecker
---
 main/como/rnaseq_preprocess.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py
index eebe1a78..be0e714a 100644
--- a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -326,7 +326,7 @@ def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame:
     return counts
 
 
-async def _write_counts_matrix(
+def _write_counts_matrix(
     *,
     config_df: pd.DataFrame,
     fragment_lengths: pd.DataFrame,
@@ -367,7 +367,7 @@ async def _write_counts_matrix(
     return final_matrix
 
 
-async def _create_config_df(  # noqa: C901
+def _create_config_df(  # noqa: C901
     context_name: str,
     /,
     como_context_dir: Path,
@@ -670,7 +670,7 @@ async def _create_gene_info_file(
     """
 
     async def read_ensembl_gene_ids(file: Path) -> list[str]:
-        data = await _read_file(file, h5ad_as_df=False)
+        data = _read_file(file, h5ad_as_df=False)
         if isinstance(data, pd.DataFrame):
             data: pd.DataFrame
             return data["ensembl_gene_id"].tolist()
@@ -731,7 +731,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]:
     logger.success(f"Gene Info file written at '{output_filepath}'")
 
 
-async def _process_como_input(
+def _process_como_input(
     context_name: str,
     output_config_filepath: Path,
     como_context_dir: Path,
@@ -741,7 +741,7 @@ def _process_como_input(
 ) -> None:
     config_df, fragment_lengths = _create_config_df(context_name, como_context_dir=como_context_dir)
 
-    await _write_counts_matrix(
+    _write_counts_matrix(
         config_df=config_df,
         fragment_lengths=fragment_lengths,
         como_context_dir=como_context_dir,

From f7b3a0672325b9893282c71a6bfd3184012f8ba0 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:04:03 -0600
Subject: [PATCH 06/26] fix: remove non-existent genes from conversion

Signed-off-by: Josh Loecker
---
 main/como/rnaseq_preprocess.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py
index be0e714a..6defb860 100644
--- a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -684,6 +684,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]:
         )
 
         # Remove NA values from the ensembl_gene_id dataframe column
+        conversion = conversion[~conversion["ensembl_gene_id"].isna()]
         return conversion["ensembl_gene_id"].tolist()
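# Illustrative effect of the added filter (toy values, assuming standard pandas semantics):
#   conversion = pd.DataFrame({"ensembl_gene_id": ["ENSG01", None, "ENSG02"]})
#   conversion[~conversion["ensembl_gene_id"].isna()]  # keeps rows 0 and 2 only
# so the returned list contains only resolvable Ensembl IDs, never NaN placeholders.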
    logger.info("Fetching gene info - this can take up to 5 minutes depending on the number of genes and your internet connection")

From 0e4a2c3d9de8572c3d8bcb99a22aed4f60f58590 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:04:57 -0600
Subject: [PATCH 07/26] refactor: use more explicit (albeit longer) code to
 create gene_info dataframe object

Signed-off-by: Josh Loecker
---
 main/como/rnaseq_preprocess.py | 65 ++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py
index 6defb860..f9f1b6ce 100644
--- a/main/como/rnaseq_preprocess.py
+++ b/main/como/rnaseq_preprocess.py
@@ -351,7 +351,9 @@ def _write_counts_matrix(
         config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist()
     )
 
-    final_matrix: pd.DataFrame = functools.reduce(lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts)
+    final_matrix: pd.DataFrame = functools.reduce(
+        lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts
+    )
     final_matrix.fillna(value=0, inplace=True)
     final_matrix.iloc[:, 1:] = final_matrix.iloc[:, 1:].astype(int)
     final_matrix = cast(pd.DataFrame, final_matrix[["ensembl_gene_id", *rna_specific_sample_names]])
@@ -687,42 +689,61 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]:
         conversion = conversion[~conversion["ensembl_gene_id"].isna()]
         return conversion["ensembl_gene_id"].tolist()
 
-    logger.info("Fetching gene info - this can take up to 5 minutes depending on the number of genes and your internet connection")
+    logger.info(
+        "Fetching gene info - this can take up to 5 minutes depending on the number of genes and your internet connection"
+    )
 
-    ensembl_ids: set[str] = set(chain.from_iterable(await asyncio.gather(*[read_ensembl_gene_ids(f) for f in counts_matrix_filepaths])))
+    ensembl_ids: set[str] = set(
+        chain.from_iterable(await asyncio.gather(*[read_ensembl_gene_ids(f) for f in counts_matrix_filepaths]))
+    )
     gene_data: list[dict[str, str | int | list[str] | list[int] | None]] = await MyGene(cache=cache).query(
         items=list(ensembl_ids),
         taxon=taxon,
         scopes="ensemblgene",
     )
-    gene_info: pd.DataFrame = pd.DataFrame(
-        data=None,
-        columns=pd.Index(data=["ensembl_gene_id", "gene_symbol", "entrez_gene_id", "size"]),
-        index=pd.Index(data=list(range(len(ensembl_ids)))),
-    )
+
+    n = len(gene_data)
+    all_gene_symbols: list[str] = ["-"] * n
+    all_entrez_ids: list[str | int] = ["-"] * n
+    all_ensembl_ids: list[str] = ["-"] * n
+    all_sizes: list[int] = [-1] * n
+
+    def _avg_pos(value: int | list[int] | None) -> int:
+        if value is None:
+            return 0
+        if isinstance(value, list):
+            return int(sum(value) / len(value)) if value else 0
+        return int(value)
 
     for i, data in enumerate(gene_data):
         data: dict[str, str | int | list[str] | list[int] | None]
-        ensembl_genes: str | list[str] = cast(str | list[str], data.get("ensembl.gene", "-"))
-        start_pos: int | list[int] = cast(int | list[int], data.get("genomic_pos.start", 0))
-        end_pos: int | list[int] = cast(int | list[int], data.get("genomic_pos.end", 0))
-        avg_start: int | float = sum(start_pos) / len(start_pos) if isinstance(start_pos, list) else start_pos
-        avg_end: int | float = sum(end_pos) / len(end_pos) if isinstance(end_pos, list) else end_pos
-        size: int = int(avg_end - avg_start)
+        start = _avg_pos(data.get("genomic_pos.start", 0))
+        end = _avg_pos(data.get("genomic_pos.end", 0))
+        size = end - start
+
+        ensembl_id: str | list[str] = 
data.get("ensembl.gene", "-") + all_ensembl_ids[i] = ( + ",".join(map(str, ensembl_id)) if isinstance(ensembl_id, list) and ensembl_id else ensembl_id + ) + all_gene_symbols[i] = str(data.get("symbol", "-")) + all_entrez_ids[i] = str(data.get("entrezgene", "-")) + all_sizes[i] = size if size > 0 else -1 - gene_info.at[i, "gene_symbol"] = data.get("symbol", "-") - gene_info.at[i, "entrez_gene_id"] = data.get("entrezgene", "-") - gene_info.at[i, "ensembl_gene_id"] = ",".join(ensembl_genes) if isinstance(ensembl_genes, list) else ensembl_genes - gene_info.at[i, "size"] = size if size > 0 else -1 + gene_info: pd.DataFrame = pd.DataFrame( + { + "ensembl_gene_id": all_ensembl_ids, + "gene_symbol": all_gene_symbols, + "entrez_gene_id": all_entrez_ids, + "size": all_sizes, + } + ) - gene_info["size"] = gene_info["size"].astype(str) # replace no-length values with "-" to match rows where every value is "-" - gene_info["size"] = gene_info["size"].replace("-1", "-") - gene_info = cast(pd.DataFrame, gene_info[~(gene_info == "-").all(axis=1)]) # remove rows where every value is "-" + # remove rows where every gene size value is -1 (not available) + gene_info = gene_info[~(gene_info == -1).all(axis=1)] gene_info["ensembl_gene_id"] = gene_info["ensembl_gene_id"].str.split(",") # extend lists into multiple rows gene_info = gene_info.explode(column=["ensembl_gene_id"]) - gene_info["size"] = gene_info["size"].astype(int) # we would set `entrez_gene_id` to int here as well, but not all ensembl ids are mapped to entrez ids, # and as a result, there are still "-" values in the entrez id column that cannot be converted to an integer From ab66599d1c280acaec9f56143721f7ab07fad738 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:05:20 -0600 Subject: [PATCH 08/26] chore: import required modules Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index a3e496fe..042e8145 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -1,34 +1,33 @@ from __future__ import annotations import itertools -import multiprocessing import sys -import time from collections import namedtuple from collections.abc import Callable -from concurrent.futures import Future, ProcessPoolExecutor, as_completed from dataclasses import dataclass, field from enum import Enum from pathlib import Path from typing import NamedTuple, TextIO, cast -import matplotlib.pyplot as plt +import anndata as ad +import boolean import numpy as np import numpy.typing as npt import pandas as pd -import seaborn as sns +import scanpy as sc import sklearn import sklearn.neighbors +from anndata.compat import XDataArray +from anndata.experimental.backed import Dataset2D from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol, gene_symbol_to_ensembl_and_gene_id from loguru import logger -from pandas import DataFrame +from scipy import sparse +from zfpkm import zFPKM, zfpkm_plot -from como.data_types import FilteringTechnique, LogLevel, PeakIdentificationParameters, RNAType -from como.density import density +from como.data_types import FilteringTechnique, LogLevel, RNAType from como.migrations import gene_info_migrations -from como.peak_finder import find_peaks from como.project import Config -from como.utils import _log_and_raise_error, _num_columns, _read_file, _set_up_logging +from como.utils import _log_and_raise_error, _read_file, _set_up_logging class _FilteringOptions(NamedTuple): 
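Patch 08 swaps COMO's in-house density and peak-finding helpers for the zfpkm package. A minimal sketch of the intended call pattern, assuming the Python port keeps the R zFPKM interface (a genes-by-samples FPKM DataFrame in, a same-shape z-score DataFrame out); the input file name here is hypothetical, and the -3 activity cutoff follows Hart et al. 2013:

import pandas as pd
from zfpkm import zFPKM, zfpkm_plot

# Hypothetical FPKM matrix: one row per gene, one column per sample.
fpkm = pd.read_csv("fpkm_matrix.csv", index_col="entrez_gene_id")
z_scores = zFPKM(fpkm)  # fit a Gaussian to each sample's log2(FPKM) density
zfpkm_plot(fpkm)  # optional: visually inspect the per-sample fits
expressed = (z_scores > -3).all(axis=1)  # genes considered active in every sample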
From 95654b342ac2683fb05a8e93ce7a9d8eabeaab2c Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:07:20 -0600 Subject: [PATCH 09/26] refactor: optional argument for fragment data Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 042e8145..32a84fed 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -149,12 +149,13 @@ def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDAr async def _build_matrix_results( + matrix: pd.DataFrame | sc.AnnData, *, - matrix: pd.DataFrame, gene_info: pd.DataFrame, metadata_df: pd.DataFrame, + fragment_df: pd.DataFrame | None, taxon: int, -) -> _ReadMatrixResults: +) -> tuple[NamedMetrics, list[int]]: """Read the counts matrix and returns the results. Arg: From dec37b0c4f1973f17f4ddc08c07208e99882768a Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:09:45 -0600 Subject: [PATCH 10/26] refactor: improve handling for single cell data Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 173 ++++++++++++++++++++++++++++------------ 1 file changed, 120 insertions(+), 53 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 32a84fed..e9cfc6e4 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -49,14 +49,14 @@ class LayoutMethod(Enum): class _StudyMetrics: study: str num_samples: int - count_matrix: pd.DataFrame - fragment_lengths: npt.NDArray[float] + count_matrix: pd.DataFrame | sc.AnnData + fragment_lengths: npt.NDArray[np.floating] | None sample_names: list[str] layout: list[LayoutMethod] entrez_gene_ids: npt.NDArray[int] gene_sizes: npt.NDArray[int] __normalization_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) - __z_score_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) + __z_score_matrix: pd.DataFrame | sc.AnnData | None = field(default=None) __high_confidence_entrez_gene_ids: list[str] = field(default_factory=list) def __post_init__(self): @@ -77,11 +77,11 @@ def normalization_matrix(self, value: pd.DataFrame) -> None: self.__normalization_matrix = value @property - def z_score_matrix(self) -> pd.DataFrame: + def z_score_matrix(self) -> pd.DataFrame | sc.AnnData | None: return self.__z_score_matrix @z_score_matrix.setter - def z_score_matrix(self, value: pd.DataFrame) -> None: + def z_score_matrix(self, value: pd.DataFrame | sc.AnnData) -> None: self.__z_score_matrix = value @property @@ -111,9 +111,10 @@ class _ReadMatrixResults(NamedTuple): def k_over_a(k: int, a: float) -> Callable[[npt.NDArray], bool]: - """Return a function that filters rows of an array based on the sum of elements being greater than or equal to A at least k times. + """Filter rows of an array based on the sum of elements being greater than or equal to A at least k times. - This code is based on the `kOverA` function found in R's `genefilter` package: https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/kOverA + This code is based on the `kOverA` function found in R's `genefilter` package + https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/kOverA :param k: The minimum number of times the sum of elements must be greater than or equal to A. :param a: The value to compare the sum of elements to. 
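An illustrative doctest for the predicate k_over_a returns (numpy imported as np, as in this module): at least k of the row's values must be >= a.

    >>> f = k_over_a(k=2, a=5.0)
    >>> f(np.array([1.0, 6.0, 7.0]))
    True
    >>> f(np.array([1.0, 6.0, 2.0]))
    False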
@@ -121,7 +122,7 @@ def k_over_a(k: int, a: float) -> Callable[[npt.NDArray], bool]:
     """
 
     def filter_func(row: npt.NDArray) -> bool:
-        return np.sum(row >= a) >= k
+        return bool(np.sum(row >= a) >= k)
 
     return filter_func
 
@@ -168,20 +169,35 @@ async def _build_matrix_results(
     """
-    conversion = await ensembl_to_gene_id_and_symbol(ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon)
-
-    # If all columns are empty, it is indicative that the incorrect taxon id was provided
-    if all(conversion[col].eq("-").all() for col in conversion.columns):
-        logger.critical(f"Conversion of Ensembl Gene IDs to Entrez IDs and Gene Symbols was empty - is '{taxon}' the correct taxon ID for this data?")
-
-    # 2025-NOV-3: commented out `conversion` types to evaluate if it can be skipped
-    # conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",")
-    # conversion = conversion.explode("ensembl_gene_id")
-    # conversion.reset_index(inplace=True, drop=True)
-    # conversion = conversion[conversion["entrez_gene_id"] != "-"]  # drop missing entrez IDs
-    # conversion["entrez_gene_id"] = conversion["entrez_gene_id"].astype(int)  # float32 is needed because np.nan is a float
-
-    # merge_on should contain at least one of "ensembl_gene_id", "entrez_gene_id", or "gene_symbol"
-    merge_on: list[str] = list(set(matrix.columns).intersection(conversion.columns))
-    if not merge_on:
+    if isinstance(matrix, sc.AnnData):
+        matrix.var = matrix.var.reset_index(drop=False, names=["gene_symbol"])
+        conversion = await gene_symbol_to_ensembl_and_gene_id(symbols=matrix.var["gene_symbol"].tolist(), taxon=taxon)
+    else:
+        if "ensembl_gene_id" not in matrix.columns:
+            _log_and_raise_error(
+                message="'ensembl_gene_id' column not found in the provided DataFrame.",
+                error=ValueError,
+                level=LogLevel.CRITICAL,
+            )
+        conversion: pd.DataFrame = await ensembl_to_gene_id_and_symbol(
+            ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon
+        )
+        # If the entrez gene id column is empty, it is indicative that the incorrect taxon id was provided
+        if conversion["entrez_gene_id"].eq("-").all():
+            logger.critical(
+                f"Conversion of Ensembl Gene IDs to Entrez IDs and Gene Symbols was empty - "
+                f"is '{taxon}' the correct taxon ID for this data?"
+            )
+    conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",")
+    conversion = conversion.explode("ensembl_gene_id")
+    conversion = conversion[conversion["entrez_gene_id"] != "-"]
+    conversion["entrez_gene_id"] = conversion["entrez_gene_id"].astype(int)
+    conversion = conversion.reset_index(drop=False)
+
+    # conversion_merge_on should contain at least one of "ensembl_gene_id", "entrez_gene_id", or "gene_symbol"
+    conversion_merge_on: list[str] = list(
+        set(matrix.columns if isinstance(matrix, pd.DataFrame) else matrix.var.columns) & set(conversion.columns)
+    )
+    if not conversion_merge_on:
         _log_and_raise_error(
             (
                 "No columns to merge on, unable to find at least one of `ensembl_gene_id`, `entrez_gene_id`, or `gene_symbol`. "
             ),
             error=ValueError,
             level=LogLevel.ERROR,
         )
 
-    if "entrez_gene_id" in matrix.columns:
-        matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int)
-    matrix = matrix.merge(conversion, on=merge_on, how="left")
 
-    # drop rows that have `0` in `entrez_gene_id` column
-    # matrix = matrix[matrix["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False)
-    # gene_info = gene_info[gene_info["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False)
+    if isinstance(matrix, pd.DataFrame):
+        if "entrez_gene_id" in matrix.columns:
+            matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int)
+        matrix = matrix.merge(conversion, on=conversion_merge_on, how="left")
+    elif isinstance(matrix, sc.AnnData):
+        if "entrez_gene_id" in matrix.var.columns:
+            matrix.var["entrez_gene_id"] = matrix.var["entrez_gene_id"].astype(int)
+        matrix.var = matrix.var.merge(conversion, on=conversion_merge_on, how="left")
 
     gene_info = gene_info_migrations(gene_info)
-    # gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int)
+    gene_info = gene_info[gene_info["entrez_gene_id"] != "-"]
+    gene_info.loc[:, "entrez_gene_id"] = gene_info.loc[:, "entrez_gene_id"].astype(int)
 
-    counts_matrix = matrix.merge(
-        gene_info[["entrez_gene_id", "ensembl_gene_id"]],
-        on=["entrez_gene_id", "ensembl_gene_id"],
-        how="inner",
+    gene_info_merge_on: list[str] = list(
+        set(matrix.columns if isinstance(matrix, pd.DataFrame) else matrix.var.columns) & set(gene_info.columns)
     )
-    gene_info = gene_info.merge(
-        counts_matrix[["entrez_gene_id", "ensembl_gene_id"]],
-        on=["entrez_gene_id", "ensembl_gene_id"],
-        how="inner",
-    )
+    if "entrez_gene_id" in gene_info_merge_on:
+        gene_info = gene_info[~gene_info["entrez_gene_id"].isna()]
+        gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int)
+
+        if isinstance(matrix, pd.DataFrame):
+            matrix = matrix[~matrix["entrez_gene_id"].isna()]
+            matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int)
+        elif isinstance(matrix, sc.AnnData):
+            if isinstance(matrix.var, XDataArray):
+                raise TypeError("Expected matrix.var object to be 'pd.DataFrame', got 'anndata.compat.XDataArray'")
+            matrix = matrix[:, ~matrix.var["entrez_gene_id"].isna()]
+            matrix.var["entrez_gene_id"] = matrix.var["entrez_gene_id"].astype(int)
+
+    if isinstance(matrix, pd.DataFrame):
+        matrix = matrix.merge(gene_info, on=gene_info_merge_on, how="inner")
+    elif isinstance(matrix, sc.AnnData):
+        if not isinstance(matrix.var, pd.DataFrame):
+            raise TypeError(f"Expected matrix.var object to be 'pd.DataFrame', got '{type(matrix.var)}'")
+        matrix.var["original_index"] = matrix.var.index
+        new_var = matrix.var.merge(gene_info, on=gene_info_merge_on, how="inner")
+        new_matrix = matrix[:, new_var["original_index"]].copy()
+        new_matrix.var = new_var
+        new_matrix.var = new_matrix.var.drop(columns=["original_index"])
+        new_matrix.var = new_matrix.var.reset_index(drop=True)
+        matrix = new_matrix
+
+    non_duplicates = ~matrix.var.duplicated(subset=matrix.var.columns, keep="first")
+    matrix = matrix[:, non_duplicates].copy()
 
-    entrez_gene_ids: npt.NDArray[int] = gene_info["entrez_gene_id"].to_numpy()
     metrics: NamedMetrics = {}
     for study in metadata_df["study"].unique():
-        study_sample_names = metadata_df[metadata_df["study"] == study]["sample_name"].tolist()
-        layouts = metadata_df[metadata_df["study"] == study]["layout"].tolist()
+        study_sample_names: list[str] = metadata_df[metadata_df["study"] == study]["sample_name"].tolist()
+        layouts: list[str] = 
metadata_df[metadata_df["study"] == study]["layout"].tolist() + + if isinstance(matrix, pd.DataFrame): + subset = matrix.set_index(keys=["entrez_gene_id"], drop=True) + subset = subset[subset.columns.intersection(study_sample_names)] + subset.index = subset.index.astype(int) + entrez_gene_ids = subset.index.to_numpy(copy=False) + gene_sizes = matrix["size"].to_numpy(dtype=int, copy=False) + elif isinstance(matrix, sc.AnnData): + # matrix.var = matrix.var.set_index(keys=["entrez_gene_id"], drop=True) + subset = matrix[matrix.obs_names.intersection(study_sample_names)] + entrez_gene_ids = subset.var["entrez_gene_id"].to_numpy(dtype=int) + gene_sizes = subset.var["size"].to_numpy(dtype=int) + else: + _log_and_raise_error( + message=f"Matrix must be a pandas DataFrame or scanpy AnnData object, got: '{type(matrix)}'.", + error=TypeError, + level=LogLevel.CRITICAL, + ) + + frag_lengths = None + if fragment_df is not None: + frag_lengths = fragment_df["effective_length"].to_numpy(dtype=np.float64) metrics[study] = _StudyMetrics( - count_matrix=cast(pd.DataFrame, counts_matrix[counts_matrix.columns.intersection(study_sample_names)]), - fragment_lengths=metadata_df[metadata_df["study"] == study]["fragment_length"].values.astype(float), + count_matrix=subset, + fragment_lengths=frag_lengths, sample_names=study_sample_names, layout=[LayoutMethod(layout) for layout in layouts], num_samples=len(study_sample_names), entrez_gene_ids=entrez_gene_ids, - gene_sizes=gene_info["size"].values.astype(int), + gene_sizes=gene_sizes, study=study, ) - metrics[study].fragment_lengths[np.isnan(metrics[study].fragment_lengths)] = 0 - metrics[study].count_matrix.index = pd.Index(entrez_gene_ids, name="entrez_gene_id") - return _ReadMatrixResults(metrics=metrics, entrez_gene_ids=gene_info["entrez_gene_id"].tolist()) + return metrics, gene_info["entrez_gene_id"].astype(int).tolist() def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: @@ -243,13 +302,21 @@ def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: Returns: A dictionary of study metrics with TPM calculated. 
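    Worked example (illustrative): with counts of 10 and 90 for two genes of
    length 1 and 9 (arbitrary units), both length-normalized values are ~10,
    so after each column is rescaled to sum to 1e6, each gene receives
    ~500,000 TPM.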
""" - for sample in metrics: - count_matrix = metrics[sample].count_matrix - - gene_sizes = pd.Series(metrics[sample].gene_sizes, index=count_matrix.index) - adjusted_counts = count_matrix.add(1e-6) + for sample, metric in metrics.items(): + if isinstance(metric.count_matrix, sc.AnnData): + adata = metric.count_matrix + gene_sizes = pd.Series(metric.gene_sizes, index=adata.var_names) + counts_df = pd.DataFrame( + data=np.asarray(adata.X.toarray() if sparse.issparse(adata.X) else adata.X), + index=adata.var_names, + columns=adata.obs_names, + ) + else: + counts_df = metric.count_matrix + gene_sizes = pd.Series(metric.gene_sizes) - tpm_matrix = adjusted_counts.divide(gene_sizes, axis=0) # (count + 1) / gene_length + adjusted_counts = counts_df.add(1e-6) + tpm_matrix = adjusted_counts.div(gene_sizes, axis=0) # (count + 1) / gene_length tpm_matrix = tpm_matrix.div(tpm_matrix.sum(axis=0), axis=1) # normalize by total tpm_matrix = tpm_matrix.mul(1e6) # scale to per-million metrics[sample].normalization_matrix = tpm_matrix From fc1d45f0cb30359979e5a621c8ab88c57b637c1c Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:10:00 -0600 Subject: [PATCH 11/26] chore: generalize data type input Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index e9cfc6e4..3ba71f6d 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -53,8 +53,8 @@ class _StudyMetrics: fragment_lengths: npt.NDArray[np.floating] | None sample_names: list[str] layout: list[LayoutMethod] - entrez_gene_ids: npt.NDArray[int] - gene_sizes: npt.NDArray[int] + entrez_gene_ids: npt.NDArray[np.integer] + gene_sizes: npt.NDArray[np.integer] __normalization_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) __z_score_matrix: pd.DataFrame | sc.AnnData | None = field(default=None) __high_confidence_entrez_gene_ids: list[str] = field(default_factory=list) From e1505d153bff3254ef3d7e1a559a41da0ce72080 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:10:11 -0600 Subject: [PATCH 12/26] chore: ruff formatting Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 3ba71f6d..c8c2e207 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -146,7 +146,11 @@ def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDAr level=LogLevel.CRITICAL, ) - return data.apply(filter_func, axis=1).values if isinstance(data, pd.DataFrame) else np.apply_along_axis(filter_func, axis=1, arr=data) + return ( + data.apply(filter_func, axis=1).to_numpy() + if isinstance(data, pd.DataFrame) + else np.apply_along_axis(filter_func, axis=1, arr=data) + ) async def _build_matrix_results( @@ -159,15 +163,14 @@ async def _build_matrix_results( ) -> tuple[NamedMetrics, list[int]]: """Read the counts matrix and returns the results. 
From 849ba2e64da91e05cbf7ffd5eb7aa13fb80a38fc Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:11:52 -0600
Subject: [PATCH 13/26] chore: simplify FPKM/RPKM calculations; properly compute per-gene FPKM scores

Signed-off-by: Josh Loecker

---
 main/como/rnaseq_gen.py | 62 +++++++++++------------------------------
 1 file changed, 17 insertions(+), 45 deletions(-)

diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index c8c2e207..ce8de24b 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -338,53 +338,25 @@ def _calculate_fpkm(metrics: NamedMetrics, scale: float = 1e6) -> NamedMetrics:
         A dictionary of study metrics with FPKM calculated.
     """
     for study in metrics:
-        matrix_values = []
-
-        for sample in range(metrics[study].num_samples):
-            layout = metrics[study].layout[sample]
-            count_matrix: npt.NDArray[float] = metrics[study].count_matrix.iloc[:, sample].values
-            gene_lengths = (
-                metrics[study].fragment_lengths[sample].astype(float)
-                if layout == LayoutMethod.paired_end
-                else metrics[study].gene_sizes.astype(float)
+        matrix_values: dict[str, npt.NDArray[np.floating]] = {}
+        count_matrix = metrics[study].count_matrix
+        if not isinstance(count_matrix, pd.DataFrame):
+            _log_and_raise_error(
+                message="FPKM cannot be performed on scanpy.AnnData objects!",
+                error=TypeError,
+                level=LogLevel.CRITICAL,
             )
-            gene_lengths_kb = gene_lengths / 1000.0
-
-            match layout:
-                case LayoutMethod.paired_end:  # FPKM
-                    total_fragments = count_matrix.sum(axis=0)
-                    if total_fragments == 0:
-                        fragments_per_kilobase_million = np.nan
-                    else:
-                        counts_per_million = total_fragments / scale
-                        fragments_per_kilobase = count_matrix / gene_lengths_kb
-                        fragments_per_kilobase_million = fragments_per_kilobase / counts_per_million
-                    matrix_values.append(fragments_per_kilobase_million)
-                case LayoutMethod.single_end:  # RPKM
-                    reads_per_kilobase = count_matrix / gene_lengths_kb
-                    total_reads = count_matrix.sum(axis=0)
-                    counts_per_million = total_reads / scale
-                    reads_per_kilobase_million = reads_per_kilobase / counts_per_million
-                    matrix_values.append(reads_per_kilobase_million)
-                case _:
-                    _log_and_raise_error(
-                        (
-                            f"Invalid normalization method specified ''. "
" - f"Must be one of '{LayoutMethod.paired_end.value}' or '{LayoutMethod.single_end.value}'." - ), - error=ValueError, - level=LogLevel.ERROR, - ) - - # Transpose is needed because values were appended as rows - fpkm_matrix = pd.DataFrame(matrix_values).T - fpkm_matrix.index = metrics[study].count_matrix.index - fpkm_matrix.columns = metrics[study].sample_names - - fpkm_matrix = fpkm_matrix[~pd.isna(fpkm_matrix)] - metrics[study].normalization_matrix = fpkm_matrix - metrics[study].normalization_matrix.columns = metrics[study].count_matrix.columns + study_counts = count_matrix.to_numpy(dtype=int, copy=False) + for i in range(metrics[study].num_samples): + layout = metrics[study].layout[i] + sample_name = metrics[study].sample_names[i] + length = metrics[study].fragment_lengths if layout == LayoutMethod.paired_end else metrics[study].gene_sizes + counts = study_counts[:, i] + mapped_reads = counts.sum() + matrix_values[sample_name] = ((counts * 1e9) / (length * mapped_reads)).astype(int) + + metrics[study].normalization_matrix = pd.DataFrame(matrix_values, index=metrics[study].entrez_gene_ids) return metrics From 3234413e2bdd7361e4b10e560d712f6874a18793 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:12:28 -0600 Subject: [PATCH 14/26] refactor: move zfpkm calculation to external package Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 169 +--------------------------------------- 1 file changed, 3 insertions(+), 166 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index ce8de24b..7f12b057 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -360,171 +360,6 @@ def _calculate_fpkm(metrics: NamedMetrics, scale: float = 1e6) -> NamedMetrics: return metrics -def _zfpkm_calculation(col_fpkm: pd.Series, min_peak_height: float, min_peak_distance: int): - """ZFPKM Transformations. - - This function reproduces R's `zFPKM::zFPKM` function. - - References: - 1) zFPKM implementation in R: https://github.com/ronammar/zFPKM - 2) zFPKM publication: https://doi.org/10.1186/1471-2164-14-778 - - Args: - col_fpkm: The raw FPKM values to perform zFPKM on - min_peak_distance: Minimum distance between peaks; passed on to `find_peaks` function - min_peak_height: Minimum height of peaks; passed on to `find_peaks` function - - Returns: - A named tuple containing the zFPKM values, density estimate, mean (mu), standard deviation, and maximum FPKM value. - """ - # Ignore np.log2(0) errors; we know this will happen, and are removing non-finite values in the density calculation - # This is required in order to match R's zFPKM calculations, as R's `density` function removes NA values. 
- with np.errstate(divide="ignore", invalid="ignore"): - log2fpkm: npt.NDArray[float] = np.log2(col_fpkm.values).astype(float) - d = density(log2fpkm) - - peaks: pd.DataFrame = find_peaks(d.y_grid, min_peak_height=min_peak_height, min_peak_distance=min_peak_distance) - peak_positions = d.x_grid[peaks["peak_idx"].astype(int).tolist()] - - sd = 1.0 - mu = 0.0 - fpkm_at_mu = 0.0 - if peak_positions.size > 0: - mu = float(peak_positions.max()) - u = float(log2fpkm[log2fpkm > mu].mean()) - fpkm_at_mu = float(d.y_grid[int(peaks.loc[np.argmax(peak_positions), "peak_idx"])]) - sd = float((u - mu) * np.sqrt(np.pi / 2)) - zfpkm = pd.Series((log2fpkm - mu) / sd, dtype=float, name=col_fpkm.name, index=col_fpkm.index) - return _ZFPKMResult(zfpkm=zfpkm, density=Density(d.x_grid, d.y_grid), mu=mu, std_dev=sd, fpkm_at_mu=fpkm_at_mu) - - -def zfpkm_transform( - fpkm_df: pd.DataFrame, - min_peak_height: float = 0.02, - min_peak_distance: int = 1, - update_every_percent: float = 0.1, - remove_na: bool = True, -) -> tuple[dict[str, _ZFPKMResult], DataFrame]: - """Perform zFPKM calculation/transformation. - - Args: - fpkm_df: A DataFrame containing FPKM values with genes as rows and samples as columns. - min_peak_height: Minimum height of peaks; passed on to `find_peaks` function. - min_peak_distance: Minimum distance between peaks; passed on to `find_peaks` function. - update_every_percent: Frequency of progress updates as a decimal between 0 and 1 (e.g., 0.1 for every 10%). - remove_na: Whether to remove NaN & blank values from the input DataFrame before processing. - - Returns: - A tuple containing: - - A dictionary of intermediate results for each sample. - - A DataFrame of zFPKM values with the same shape as the input fpkm_df. - """ - if update_every_percent > 1: - logger.warning(f"update_every_percent should be a decimal value between 0 and 1; got: {update_every_percent} - will convert to percentage") - update_every_percent /= 100 - - total_samples = _num_columns(fpkm_df) - update_per_step: int = int(np.ceil(total_samples * update_every_percent)) - - # Get at least 1 core and at most cpu_count() - 2 - cores = max(min(multiprocessing.cpu_count() - 2, total_samples), 1) - logger.debug(f"zFPKM transforming {len(fpkm_df.columns)} sample(s) containing {len(fpkm_df):,} genes(s) using {cores} core(s)") - logger.debug(f"Will update every {update_per_step:,} steps (~{update_every_percent:.1%} of {total_samples:,})") - - chunk_time = time.time() - start_time = time.time() - log_padding = len(str(f"{total_samples:,}")) - zfpkm_series: list[pd.Series] = [] - results: dict[str, _ZFPKMResult] = {} - - slim_fpkm_df: pd.DataFrame = cast(pd.DataFrame, fpkm_df[fpkm_df.index != "-"] if remove_na else fpkm_df) - with ProcessPoolExecutor(max_workers=cores) as pool: - futures: list[Future[_ZFPKMResult]] = [ - pool.submit( - _zfpkm_calculation, - col_fpkm=fpkm_df[column], - min_peak_height=min_peak_height, - min_peak_distance=min_peak_distance, - ) - for column in slim_fpkm_df - ] - - for i, future in enumerate(as_completed(futures)): - result = future.result() - key = str(result.zfpkm.name) - results[key] = result - zfpkm_series.append(result.zfpkm) - - if i != 0 and ((i + 1) % update_per_step == 0 or (i + 1) == total_samples): - current_time = time.time() - chunk = current_time - chunk_time - total_time = current_time - start_time - chunk_num = f"{i + 1:,}" - logger.debug( - f"Processed {chunk_num:>{log_padding}} of {total_samples:,} - " - f"chunk took {chunk:.1f} seconds - " - f"running for {total_time:.1f} seconds" - ) - 
-                chunk_time = current_time
-
-    zfpkm_df = pd.DataFrame({series.name: series for series in zfpkm_series}, index=fpkm_df.index)
-    return results, zfpkm_df
-
-
-def zfpkm_plot(results: dict[str, _ZFPKMResult], *, output_png_dirpath: Path, plot_xfloor: int = -4, subplot_titles: bool = True) -> None:
-    """Plot the log2(FPKM) density and fitted Gaussian for each sample.
-
-    Args:
-        results: A dictionary of intermediate results from zfpkm_transform.
-        output_png_dirpath: Output directory location
-        subplot_titles: Whether to display facet titles (sample names).
-        plot_xfloor: Lower limit for the x-axis.
-        subplot_titles: Whether to display facet titles (sample names).
-
-    """
-    to_concat: list[pd.DataFrame] = []
-    for name, result in results.items():
-        stddev: float = float(result.std_dev)
-        x: npt.NDArray[float] = result.density.x.flatten()
-        y: npt.NDArray[float] = result.density.y.flatten()
-
-        fitted: npt.NDArray[float] = np.exp(-0.5 * ((x - result.mu) / stddev) ** 2) / (stddev * np.sqrt(2 * np.pi))
-        fpkm_at_mu: float = result.fpkm_at_mu
-        max_fitted: float = float(fitted.max())
-        scale_fitted: float = fitted * fpkm_at_mu / max_fitted
-        to_concat.append(pd.DataFrame({"sample_name": name, "log2fpkm": x, "fpkm_density": y, "zfpkm_density": scale_fitted}))
-
-    mega_df = pd.concat(to_concat, ignore_index=True)
-    mega_df.columns = pd.Series(data=["sample_name", "log2fpkm", "fpkm_density", "zfpkm_density"])
-    mega_df = mega_df.melt(id_vars=["log2fpkm", "sample_name"], var_name="source", value_name="density")
-
-    fig: plt.Figure
-    axes: list[plt.Axes]
-    fig, axes = plt.subplots(nrows=len(results), ncols=1, figsize=(8, 4 * len(results)))
-    if len(results) == 1:
-        axes = [axes]
-
-    for i, sample_name in enumerate(results):
-        sample_data = mega_df[mega_df["sample_name"] == sample_name]
-        axis = axes[i]
-
-        for source_type in sample_data["source"].unique():
-            group = sample_data[sample_data["source"] == source_type]
-            sns.lineplot(data=group, x="log2fpkm", y="density", label=source_type, ax=axis)
-
-        if subplot_titles:
-            axis.set_title(f"Sample: {sample_name}")
-        axis.set_xlim(plot_xfloor, sample_data["log2fpkm"].max())
-        axis.set_xlabel("log2(FPKM)")
-        axis.set_ylabel("density [scaled]")
-        axis.legend(title="Source")
-
-    output_png_dirpath.mkdir(parents=True, exist_ok=True)
-    sample_name: str = next(iter(results.keys()))[:-2]  # Go from 'control1hr_S1R1' to 'control1hr_S1'
-    plt.tight_layout()
-    plt.savefig(Path(output_png_dirpath, f"{sample_name}_zfpkm_density.png"))
-
-
 def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics:
     """Calculate the z-score for each sample in the metrics dictionary.
 
@@ -536,7 +371,9 @@
     """
     for sample in metrics:
         log_matrix = np.log(metrics[sample].normalization_matrix)
-        z_matrix = pd.DataFrame(data=sklearn.preprocessing.scale(log_matrix, axis=1), columns=metrics[sample].sample_names)
+        z_matrix = pd.DataFrame(
+            data=sklearn.preprocessing.scale(log_matrix, axis=1), columns=metrics[sample].sample_names
+        )
         metrics[sample].z_score_matrix = z_matrix
 
     return metrics

From f90c38837478f288f6d8e4522345b9cd48ac9420 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:13:04 -0600
Subject: [PATCH 15/26] chore: use np.bool for boolean array

Signed-off-by: Josh Loecker

---
 main/como/rnaseq_gen.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index 7f12b057..32be1a87 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -472,17 +472,19 @@ def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringO
         min_func = k_over_a(min_samples, 0.9)
         top_func = k_over_a(top_samples, 0.9)
 
-        min_genes: npt.NDArray[bool] = genefilter(boolean_expression, min_func)
-        top_genes: npt.NDArray[bool] = genefilter(boolean_expression, top_func)
+        min_genes: npt.NDArray[np.bool] = genefilter(boolean_expression, min_func)
+        top_genes: npt.NDArray[np.bool] = genefilter(boolean_expression, top_func)
 
         # Only keep `entrez_gene_ids` that pass `min_genes`
         metric.entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, min_genes, strict=True) if keep]
-        metric.gene_sizes = np.array(gene for gene, keep in zip(gene_size, min_genes, strict=True) if keep)
+        metric.gene_sizes = np.asarray([gene for gene, keep in zip(gene_size, min_genes, strict=True) if keep])
         metric.count_matrix = cast(pd.DataFrame, metric.count_matrix.iloc[min_genes, :])
         metric.normalization_matrix = cast(pd.DataFrame, metrics[sample].normalization_matrix.iloc[min_genes, :])
 
         keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes, strict=True) if keep]
-        metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, keep_top_genes, strict=True) if keep]
+        metric.high_confidence_entrez_gene_ids = keep_top_genes
 
         metrics = calculate_z_score(metrics)

From 8253a7d34d96645f89869d05c10d1120783b537f Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:13:19 -0600
Subject: [PATCH 16/26] chore: ruff formatting

Signed-off-by: Josh Loecker

---
 main/como/rnaseq_gen.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index 32be1a87..8b10d480 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -466,8 +466,10 @@ def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringO
         top_samples = round(n_top * len(tpm_matrix.columns))
 
         tpm_quantile = tpm_matrix[tpm_matrix > 0]
-        quantile_cutoff = np.quantile(a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0)  # Compute quantile across columns
-        boolean_expression = pd.DataFrame(data=tpm_matrix > quantile_cutoff, index=tpm_matrix.index, columns=tpm_matrix.columns).astype(int)
+        quantile_cutoff = np.quantile(a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0)  # quantile across columns
+        boolean_expression = pd.DataFrame(
+            data=tpm_matrix > quantile_cutoff, index=tpm_matrix.index, columns=tpm_matrix.columns
+        ).astype(int)
 
         min_func = k_over_a(min_samples, 0.9)
         top_func = k_over_a(top_samples, 0.9)
 
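The two tpm_quantile_filter hunks above lean on k_over_a and genefilter, which mirror Bioconductor's genefilter::kOverA predicate: a gene survives when at least k of its per-sample values exceed the threshold a. A small self-contained sketch under that assumption (the data is synthetic and the names are illustrative, not the project's API):

    import numpy as np
    import pandas as pd


    def k_over_a(k: int, a: float):
        """Return a row predicate: True when at least `k` values exceed `a`."""

        def _filter(row: np.ndarray) -> bool:
            return int(np.sum(row > a)) >= k

        return _filter


    # genes x samples zFPKM-like values; keep genes above -3 in at least 2 samples
    zfpkm = pd.DataFrame(np.random.default_rng(0).normal(-2.0, 1.0, size=(100, 4)))
    keep = zfpkm.apply(k_over_a(2, -3.0), axis=1).to_numpy()
    filtered = zfpkm.loc[keep]
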
From c52d2e845add7166cad1367b468b41badfd18830 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:13:47 -0600
Subject: [PATCH 17/26] feat: allow setting negative zFPKM results to 0

Signed-off-by: Josh Loecker

---
 main/como/rnaseq_gen.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index 8b10d480..4b482a82 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -502,6 +502,7 @@ def zfpkm_filter(
     min_peak_height: float,
     min_peak_distance: int,
     output_png_dirpath: Path | None,
+    force_negative_to_zero: bool = False,
 ) -> NamedMetrics:
     """Apply zFPKM filtering to the FPKM matrix for a given sample.
 
@@ -513,6 +514,8 @@
         min_peak_height: Minimum peak height for zFPKM peak identification.
         min_peak_distance: Minimum peak distance for zFPKM peak identification.
         output_png_dirpath: Optional directory path to save zFPKM plots.
+        force_negative_to_zero: Should negative values be forcibly set to 0?
+            This could happen as a result of normalization producing negative near-zero values (e.g., -0.001).
 
     Returns:
         A dictionary of filtered study metrics.

From e2e6350c30f9a432e56988a6b162d6576a80e108 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:14:29 -0600
Subject: [PATCH 18/26] feat: simplify by using the external zfpkm package

Signed-off-by: Josh Loecker

---
 main/como/rnaseq_gen.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py
index 4b482a82..80e11df6 100644
--- a/main/como/rnaseq_gen.py
+++ b/main/como/rnaseq_gen.py
@@ -529,34 +529,38 @@ def zfpkm_filter(
     metric: _StudyMetrics
     # if fpkm was not calculated, the normalization matrix will be empty; collect the count matrix instead
     matrix = metric.count_matrix if metric.normalization_matrix.empty else metric.normalization_matrix
+    if not isinstance(matrix, pd.DataFrame):
+        raise TypeError(f"Expected a pandas.DataFrame for zFPKM filtering, got: '{type(matrix)}'")
 
     # TODO: 2025-OCT-31: Re-evaluate whether to remove rows with all 0 counts
     # matrix = matrix[matrix.sum(axis=1) > 0]  # remove rows (genes) that have no counts across all samples
-    results, zfpkm_df = zfpkm_transform(
-        fpkm_df=matrix,
-        min_peak_height=min_peak_height,
-        min_peak_distance=min_peak_distance,
-    )
-    zfpkm_df[(matrix == 0) | (zfpkm_df.isna())] = -4
+    matrix.replace(to_replace=np.nan, value=0.0, inplace=True)
+    if force_negative_to_zero:
+        matrix[matrix < 0] = 0.0
+
+    zfpkm_df, zfpkm_results = zFPKM(matrix)
 
-    if len(results) > 10 and not force_zfpkm_plot:
+    if len(zfpkm_results) > 25 and not force_zfpkm_plot:
         logger.warning(
-            "Not plotting zFPKM results because more than 10 plots would be created. "
+            "Not plotting zFPKM results because more than 25 plots would be created. 
" "If you would like to plot them anyway, set 'force_zfpkm_plot' to True" ) elif output_png_dirpath is None: logger.critical("Output zFPKM PNG filepath is None, set a path to plot zFPKM graphs") else: - zfpkm_plot(results, output_png_dirpath=output_png_dirpath) + sample_name = zfpkm_results[0].name.split("_")[0] # go from 'control1hr_S1R1' to 'control1hr' + zfpkm_plot(zfpkm_results, save_filepath=output_png_dirpath / f"{sample_name}_zfpkm_density.png") metric.z_score_matrix = zfpkm_df # determine which genes are expressed min_samples = round(min_sample_expression * len(zfpkm_df.columns)) min_func = k_over_a(min_samples, cut_off) - min_genes: npt.NDArray[bool] = genefilter(zfpkm_df, min_func) - metric.entrez_gene_ids = [gene for gene, keep in zip(zfpkm_df.index, min_genes, strict=True) if keep] + min_genes: npt.NDArray[np.bool] = genefilter(zfpkm_df, min_func) + metric.entrez_gene_ids = np.asarray( + [g_id for g_id, keep in zip(zfpkm_df.index, min_genes, strict=True) if keep], dtype=int + ) # determine which genes are confidently expressed top_samples = round(high_confidence_sample_expression * len(zfpkm_df.columns)) @@ -578,6 +582,7 @@ def filter_counts( zfpkm_min_peak_height: float, zfpkm_min_peak_distance: int, output_zfpkm_plot_dirpath: Path | None = None, + force_negative_to_zero: bool = False, ) -> NamedMetrics: """Filter the count matrix based on the specified technique. @@ -591,6 +596,8 @@ def filter_counts( zfpkm_min_peak_height: Minimum peak height for zFPKM peak identification. zfpkm_min_peak_distance: Minimum peak distance for zFPKM peak identification. output_zfpkm_plot_dirpath: Optional filepath to save the zFPKM plot. + :param force_negative_to_zero: Should negative values be forcibly set to 0? + This could happen as a result of normalization producing negative near-zero values (e.g., -0.001) Returns: A dictionary of filtered study metrics. @@ -609,6 +616,7 @@ def filter_counts( min_peak_height=zfpkm_min_peak_height, min_peak_distance=zfpkm_min_peak_distance, output_png_dirpath=output_zfpkm_plot_dirpath, + force_negative_to_zero=force_negative_to_zero, ) case FilteringTechnique.UMI: # UMI filtering is the same as zFPKM filtering without calculating FPKM From 2ad9887222b578e3ca5b0fd9720f380804c23a27 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:17:29 -0600 Subject: [PATCH 19/26] feat: allow providing the fragment size filepath (from rnaseq preprocessing) Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 109 ++++++++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 42 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 80e11df6..104e68cb 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -586,16 +586,17 @@ def filter_counts( ) -> NamedMetrics: """Filter the count matrix based on the specified technique. - Args: - context_name: The name of the context being processed. - metrics: A dictionary of study metrics to filter. - technique: The filtering technique to use. - filtering_options: Options for filtering the count matrix. - prep: The RNA preparation type. - force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples. - zfpkm_min_peak_height: Minimum peak height for zFPKM peak identification. - zfpkm_min_peak_distance: Minimum peak distance for zFPKM peak identification. - output_zfpkm_plot_dirpath: Optional filepath to save the zFPKM plot. + :param context_name: The name of the context being processed. 
+ :param metrics: A dictionary of study metrics to filter. + :param technique: The filtering technique to use. + :param filtering_options: Options for filtering the count matrix. + :param prep: The RNA preparation type. + :param force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples. + :param zfpkm_min_peak_height: Minimum peak height for zFPKM peak identification. + :param zfpkm_min_peak_distance: Minimum peak distance for zFPKM peak identification. + :param umi_target_sum: The target sum for UMI normalization. + :param umi_perform_normalization: Whether to perform normalization before UMI filtering. + :param output_zfpkm_plot_dirpath: Optional filepath to save the zFPKM plot. :param force_negative_to_zero: Should negative values be forcibly set to 0? This could happen as a result of normalization producing negative near-zero values (e.g., -0.001) @@ -642,6 +643,7 @@ async def _process( rnaseq_matrix_filepath: Path, metadata_df: pd.DataFrame, gene_info_df: pd.DataFrame, + fragment_df: pd.DataFrame | None, prep: RNAType, taxon: int, replicate_ratio: float, @@ -656,18 +658,12 @@ async def _process( output_boolean_activity_filepath: Path, output_zscore_normalization_filepath: Path, output_zfpkm_plot_dirpath: Path | None, + force_negative_to_zero: bool, ): """Save the results of the RNA-Seq tests to a CSV file.""" output_boolean_activity_filepath.parent.mkdir(parents=True, exist_ok=True) - rnaseq_matrix: pd.DataFrame = await _read_file(rnaseq_matrix_filepath, h5ad_as_df=True) - - if rnaseq_matrix_filepath.suffix == ".h5ad": - conversion = await gene_symbol_to_ensembl_and_gene_id(symbols=rnaseq_matrix["gene_symbol"].tolist(), taxon=taxon) - conversion.reset_index(inplace=True) - rnaseq_matrix = rnaseq_matrix.merge(conversion, how="left", on="gene_symbol") - rnaseq_matrix.replace(to_replace=pd.NA, value="-") - + rnaseq_matrix: pd.DataFrame | sc.AnnData = _read_file(rnaseq_matrix_filepath, h5ad_as_df=False) filtering_options = _FilteringOptions( replicate_ratio=replicate_ratio, batch_ratio=batch_ratio, @@ -676,16 +672,14 @@ async def _process( high_batch_ratio=high_batch_ratio, ) - read_counts_results: _ReadMatrixResults = await _build_matrix_results( - matrix=rnaseq_matrix, + metrics, entrez_gene_ids = await _build_matrix_results( + rnaseq_matrix, gene_info=gene_info_df, metadata_df=metadata_df, + fragment_df=fragment_df, taxon=taxon, ) - - metrics = read_counts_results.metrics - - metrics: NamedMetrics = filter_counts( + metrics = filter_counts( context_name=context_name, metrics=metrics, technique=technique, @@ -695,25 +689,41 @@ async def _process( zfpkm_min_peak_height=zfpkm_min_peak_height, zfpkm_min_peak_distance=zfpkm_min_peak_distance, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, + force_negative_to_zero=force_negative_to_zero, ) - merged_zscore_df = pd.concat([m.z_score_matrix[m.z_score_matrix.index != "-"] for m in metrics.values()], axis="columns") - merged_zscore_df.fillna(-4, inplace=True) - expressed_genes: list[str] = list(itertools.chain.from_iterable(m.entrez_gene_ids for m in metrics.values())) - top_genes: list[str] = list(itertools.chain.from_iterable(m.high_confidence_entrez_gene_ids for m in metrics.values())) + if isinstance(rnaseq_matrix, pd.DataFrame): + merged_zscores = pd.concat( + [m.z_score_matrix[m.z_score_matrix.index != "-"] for m in metrics.values()], axis="columns" + ) - # If any of the normalization metrics are not empty, write the normalized metrics to disk - if not all(metric.normalization_matrix.empty for metric 
in metrics.values()): - merged_zscore_df: pd.DataFrame = merged_zscore_df.reindex(columns=sorted(merged_zscore_df)) - merged_zscore_df.to_csv(output_zscore_normalization_filepath, index=True) - logger.success(f"Wrote z-score normalization matrix to {output_zscore_normalization_filepath}") - else: - logger.warning( - "Not writing z-score normalization matrix because no normalization matrices exist. This is expected if you are using UMI filtering." + merged_zscores.index.name = ( + "entrez_gene_id" + if merged_zscores.index.astype(str).str.isdigit().all() + else "ensembl_gene_id" + if merged_zscores.index.astype(str).str.startswith("ENS").all() + else "gene_symbol" ) + merged_zscores = merged_zscores.reindex(columns=sorted(merged_zscores.columns)) + merged_zscores = merged_zscores.groupby("entrez_gene_id").mean() + merged_zscores.to_csv(output_zscore_normalization_filepath, index=True) + elif isinstance(rnaseq_matrix, sc.AnnData): + merged_zscores = ad.concat([m.z_score_matrix for m in metrics.values()], axis="obs") + merged_zscores.var.index.name = "entrez_gene_id" + merged_zscores.obs = merged_zscores.obs.reindex(columns=sorted(merged_zscores.obs.columns)) + merged_zscores.write_h5ad(output_zscore_normalization_filepath.with_suffix(".h5ad")) + expressed_genes: list[str] = list(itertools.chain.from_iterable(m.entrez_gene_ids for m in metrics.values())) + top_genes: list[str] = list( + itertools.chain.from_iterable(m.high_confidence_entrez_gene_ids for m in metrics.values()) + ) + + logger.success(f"Wrote z-score normalization matrix to {output_zscore_normalization_filepath}") + expression_frequency = pd.Series(expressed_genes).value_counts() - expression_df = pd.DataFrame({"entrez_gene_id": expression_frequency.index, "frequency": expression_frequency.values}) + expression_df = pd.DataFrame( + {"entrez_gene_id": expression_frequency.index, "frequency": expression_frequency.values} + ) expression_df["prop"] = expression_df["frequency"] / len(metrics) expression_df = expression_df[expression_df["prop"] >= filtering_options.batch_ratio] @@ -722,10 +732,10 @@ async def _process( top_df["prop"] = top_df["frequency"] / len(metrics) top_df = top_df[top_df["prop"] >= filtering_options.high_batch_ratio] - entrez_id_series = pd.Series(read_counts_results.entrez_gene_ids) + entrez_id_series = pd.Series(entrez_gene_ids) boolean_matrix = pd.DataFrame( data={ - "entrez_gene_id": read_counts_results.entrez_gene_ids, + "entrez_gene_id": entrez_gene_ids, "expressed": entrez_id_series.isin(expression_df["entrez_gene_id"]).astype(int), "high": entrez_id_series.isin(top_df["entrez_gene_id"]).astype(int), } @@ -736,8 +746,13 @@ async def _process( # TODO: 2025-OCT-31: commented out dropping entrez ids, should this be kept? 
# boolean_matrix.dropna(subset="entrez_gene_id", inplace=True) + boolean_matrix = boolean_matrix.groupby("entrez_gene_id", as_index=False).mean() + boolean_matrix["expressed"] = boolean_matrix["expressed"].copy().astype(int) + boolean_matrix["high"] = boolean_matrix["high"].copy().astype(int) boolean_matrix.to_csv(output_boolean_activity_filepath, index=False) - logger.info(f"{context_name} - Found {expressed_count} expressed genes, {high_confidence_count} of which are confidently expressed") + logger.info( + f"{context_name} - Found {expressed_count} expressed genes, {high_confidence_count} of which are confidently expressed" + ) logger.success(f"Wrote boolean matrix to {output_boolean_activity_filepath}") @@ -757,11 +772,13 @@ async def rnaseq_gen( # noqa: C901 technique: FilteringTechnique | str = FilteringTechnique.ZFPKM, zfpkm_min_peak_height: float = 0.02, zfpkm_min_peak_distance: int = 1, + input_fragment_lengths: Path | None = None, cutoff: int | float | None = None, force_zfpkm_plot: bool = False, log_level: LogLevel = LogLevel.INFO, log_location: str | TextIO = sys.stderr, output_zfpkm_plot_dirpath: Path | None = None, + force_negative_counts_to_zero: bool = False, ) -> None: """Generate a list of active and high-confidence genes from a gene count matrix. @@ -777,6 +794,7 @@ async def rnaseq_gen( # noqa: C901 :param prep: The preparation method :param taxon_id: The NCBI Taxon ID :param input_metadata_filepath_or_df: The filepath or dataframe containing metadata information + :param input_fragment_lengths: The filepath to the fragment lengths file, if applicable. :param replicate_ratio: The percentage of replicates that a gene must appear in for a gene to be marked as "active" in a batch/study :param batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked as 'active" @@ -792,6 +810,9 @@ async def rnaseq_gen( # noqa: C901 :param log_level: The level of logging to output :param log_location: The location to write logs to :param output_zfpkm_plot_dirpath: Optional filepath to save zFPKM plots + :param force_negative_counts_to_zero: Should negative values be forcibly set to 0? + This could happen as a result of normalization producing negative near-zero values (e.g., -0.001) + :return: None """ _set_up_logging(level=log_level, location=log_location) @@ -817,8 +838,10 @@ async def rnaseq_gen( # noqa: C901 elif cutoff: cutoff = "default" - case FilteringTechnique.ZFPKM | FilteringTechnique.UMI: + case FilteringTechnique.ZFPKM: cutoff: int | float = cutoff or -3 + case FilteringTechnique.UMI: + cutoff: int = cutoff or 1 case _: _log_and_raise_error( f"Technique must be one of {','.join(FilteringTechnique)}. 
Got: {technique.value}", @@ -870,7 +893,8 @@ async def rnaseq_gen( # noqa: C901 context_name=context_name, rnaseq_matrix_filepath=input_rnaseq_filepath, metadata_df=metadata_df, - gene_info_df=await _read_file(input_gene_info_filepath), + gene_info_df=_read_file(input_gene_info_filepath), + fragment_df=_read_file(input_fragment_lengths), prep=prep, taxon=taxon_id, replicate_ratio=replicate_ratio, @@ -885,4 +909,5 @@ async def rnaseq_gen( # noqa: C901 output_boolean_activity_filepath=output_boolean_activity_filepath, output_zscore_normalization_filepath=output_zscore_normalization_filepath, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, + force_negative_to_zero=force_negative_counts_to_zero, ) From 6af3990cd7236f3ebbe52f81ca1e96546cf4c2da Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:21:22 -0600 Subject: [PATCH 20/26] chore(ruff): reduce max line length Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 2 ++ main/como/rnaseq_preprocess.py | 2 -- ruff.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 104e68cb..4c522e3c 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -772,7 +772,9 @@ async def rnaseq_gen( # noqa: C901 technique: FilteringTechnique | str = FilteringTechnique.ZFPKM, zfpkm_min_peak_height: float = 0.02, zfpkm_min_peak_distance: int = 1, + umi_target_sum: int = 10_000, input_fragment_lengths: Path | None = None, + umi_perform_normalization: bool = False, cutoff: int | float | None = None, force_zfpkm_plot: bool = False, log_level: LogLevel = LogLevel.INFO, diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index f9f1b6ce..06f33922 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -16,8 +16,6 @@ import numpy as np import numpy.typing as npt import pandas as pd -import pandera.pandas as pa -import pandera.typing.pandas as pat from fast_bioservices.biothings.mygene import MyGene from fast_bioservices.pipeline import gene_symbol_to_ensembl_and_gene_id from loguru import logger diff --git a/ruff.toml b/ruff.toml index a556c25f..b7ddbd88 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,4 +1,4 @@ -line-length = 150 +line-length = 120 extend-include = ["docs/**/*.py", "tests/**/*.py", "**/*.ipynb"] [format] From 479fce2d064c044dc3ce4eb9eb74e51bf2d543e2 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:21:52 -0600 Subject: [PATCH 21/26] chore(ruff): mark unsorted imports as fixable Signed-off-by: Josh Loecker --- ruff.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/ruff.toml b/ruff.toml index b7ddbd88..691022d2 100644 --- a/ruff.toml +++ b/ruff.toml @@ -6,6 +6,7 @@ quote-style = "double" docstring-code-format = true [lint] +extend-fixable = ["I001"] # Linting rules: https://docs.astral.sh/ruff/rules/ unfixable = [ "F401", # warn about, but do not remove, unused imports From d83e974d597f57fd9ee8ffa004750a35fd0e3940 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:22:41 -0600 Subject: [PATCH 22/26] chore(uv): lock pyproject file Signed-off-by: Josh Loecker --- pyproject.toml | 17 +- uv.lock | 413 ++++++++++--------------------------------------- 2 files changed, 86 insertions(+), 344 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 51f81319..488e14ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,11 +12,14 @@ dependencies = [ "cobamp@git+https://github.com/JoshLoecker/cobamp@master", "cobra>=0.28.0", "fast-bioservices>=0.3.9", + 
"joypy>=0.2.6", "kaleido>=1.0.0", "loguru>=0.7.2", + "notebook>=7.4.7", "numpy>=2", "openpyxl>=3.1.5", "pandas>=1.3.5", + "python-louvain", "scanpy>=1.10.4", "scikit-learn>=1.5.2", "scipy>=1.13.0", @@ -25,6 +28,7 @@ dependencies = [ "statsmodels>=0.13.0; python_version < '3.12'", "statsmodels>=0.14.0; python_version >= '3.12'", "troppo@git+https://github.com/JoshLoecker/troppo@master", + "zfpkm>=1.0.3", ] [project.optional-dependencies] @@ -36,19 +40,11 @@ interactive = [ "jupyterlab>=4.3.2" ] dev = [ - "commitizen>=4.8.3", - "commitlint>=1.3.1", "como", "hatchling>=1.27.0", - "pandas-stubs>=2.3.2.250827", - "pre-commit>=4.2.0", - "pyright>=1.1.405", - "pytest>=8.4.1", "pytest-asyncio>=1.1.0", "pytest-cov>=6.2.1", - "ruff>=0.12.11", - "scipy-stubs>=1.16.1.1", - "types-aiofiles>=24.1.0.20250822", + "pytest>=8.4.1", ] [tool.hatch.version] @@ -62,3 +58,6 @@ allow-direct-references = true [tool.pytest.ini_options] pythonpath = ["main/src"] + +[tool.uv.sources] +python-louvain = { git = "https://github.com/taynaud/python-louvain" } diff --git a/uv.lock b/uv.lock index 4fcb4c61..a1a2c903 100644 --- a/uv.lock +++ b/uv.lock @@ -85,15 +85,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" }, ] -[[package]] -name = "argcomplete" -version = "3.6.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/16/0f/861e168fc813c56a78b35f3c30d91c6757d1fd185af1110f1aec784b35d0/argcomplete-3.6.2.tar.gz", hash = "sha256:d0519b1bc867f5f4f4713c41ad0aba73a4a5f007449716b16f385f2166dc6adf", size = 73403, upload-time = "2025-04-03T04:57:03.52Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/31/da/e42d7a9d8dd33fa775f467e4028a47936da2f01e4b0e561f9ba0d74cb0ca/argcomplete-3.6.2-py3-none-any.whl", hash = "sha256:65b3133a29ad53fb42c48cf5114752c7ab66c1c38544fdf6460f450c09b42591", size = 43708, upload-time = "2025-04-03T04:57:01.591Z" }, -] - [[package]] name = "argon2-cffi" version = "25.1.0" @@ -281,15 +272,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" }, ] -[[package]] -name = "cfgv" -version = "3.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114, upload-time = "2023-08-12T20:38:17.776Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249, upload-time = "2023-08-12T20:38:16.269Z" }, -] - [[package]] name = "charset-normalizer" version = "3.4.3" @@ -403,38 +385,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", 
size = 7294, upload-time = "2025-07-25T14:02:02.896Z" }, ] -[[package]] -name = "commitizen" -version = "4.9.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "argcomplete" }, - { name = "charset-normalizer" }, - { name = "colorama" }, - { name = "decli" }, - { name = "deprecated" }, - { name = "jinja2" }, - { name = "packaging" }, - { name = "prompt-toolkit" }, - { name = "pyyaml" }, - { name = "questionary" }, - { name = "termcolor" }, - { name = "tomlkit" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/77/19/927ac5b0eabb9451e2d5bb45b30813915c9a1260713b5b68eeb31358ea23/commitizen-4.9.1.tar.gz", hash = "sha256:b076b24657718f7a35b1068f2083bd39b4065d250164a1398d1dac235c51753b", size = 56610, upload-time = "2025-09-10T14:19:33.746Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cf/49/577035b841442fe031b017027c3d99278b46104d227f0353c69dbbe55148/commitizen-4.9.1-py3-none-any.whl", hash = "sha256:4241b2ecae97b8109af8e587c36bc3b805a09b9a311084d159098e12d6ead497", size = 80624, upload-time = "2025-09-10T14:19:32.102Z" }, -] - -[[package]] -name = "commitlint" -version = "1.3.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/99/c1/42ee438955d0df9bf311dc4c573a49fb7215d915b224ee49566c6d11a318/commitlint-1.3.1.tar.gz", hash = "sha256:2a0123636bd12cb47f96034af0711d302403e80e47bac815f26c495420929d53", size = 23896, upload-time = "2025-08-25T13:19:35.965Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/bb/7413a18bff34f38a0b3283558dc25119c21426964619080e0061aadd3bb0/commitlint-1.3.1-py3-none-any.whl", hash = "sha256:02024a64a785c7b5f2c6accb33415efb61d428b90e0231480ac49b8b07624520", size = 26643, upload-time = "2025-08-25T13:19:35.173Z" }, -] - [[package]] name = "como" source = { editable = "." 
} @@ -444,11 +394,14 @@ dependencies = [ { name = "cobamp" }, { name = "cobra" }, { name = "fast-bioservices" }, + { name = "joypy" }, { name = "kaleido" }, { name = "loguru" }, + { name = "notebook" }, { name = "numpy" }, { name = "openpyxl" }, { name = "pandas" }, + { name = "python-louvain" }, { name = "scanpy" }, { name = "scikit-learn" }, { name = "scipy" }, @@ -456,22 +409,15 @@ dependencies = [ { name = "setuptools" }, { name = "statsmodels" }, { name = "troppo" }, + { name = "zfpkm" }, ] [package.optional-dependencies] dev = [ - { name = "commitizen" }, - { name = "commitlint" }, { name = "hatchling" }, - { name = "pandas-stubs" }, - { name = "pre-commit" }, - { name = "pyright" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, - { name = "ruff" }, - { name = "scipy-stubs" }, - { name = "types-aiofiles" }, ] gurobi = [ { name = "gurobipy" }, @@ -487,35 +433,31 @@ requires-dist = [ { name = "anndata", specifier = ">=0.12.0" }, { name = "cobamp", git = "https://github.com/JoshLoecker/cobamp?rev=master" }, { name = "cobra", specifier = ">=0.28.0" }, - { name = "commitizen", marker = "extra == 'dev'", specifier = ">=4.8.3" }, - { name = "commitlint", marker = "extra == 'dev'", specifier = ">=1.3.1" }, - { name = "fast-bioservices", specifier = ">=0.3.9" }, + { name = "fast-bioservices", editable = "../fast_bioservices" }, { name = "gurobipy", marker = "extra == 'gurobi'", specifier = "<14" }, { name = "hatchling", marker = "extra == 'dev'", specifier = ">=1.27.0" }, { name = "ipython", marker = "extra == 'interactive'", specifier = ">=8.0.0" }, + { name = "joypy", specifier = ">=0.2.6" }, { name = "jupyterlab", marker = "extra == 'interactive'", specifier = ">=4.3.2" }, { name = "kaleido", specifier = ">=1.0.0" }, { name = "loguru", specifier = ">=0.7.2" }, + { name = "notebook", specifier = ">=7.4.7" }, { name = "numpy", specifier = ">=2" }, { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pandas", specifier = ">=1.3.5" }, - { name = "pandas-stubs", marker = "extra == 'dev'", specifier = ">=2.3.2.250827" }, - { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.2.0" }, - { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.405" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.4.1" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=1.1.0" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=6.2.1" }, - { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.12.11" }, + { name = "python-louvain", git = "https://github.com/taynaud/python-louvain" }, { name = "scanpy", specifier = ">=1.10.4" }, { name = "scikit-learn", specifier = ">=1.5.2" }, { name = "scipy", specifier = ">=1.13.0" }, - { name = "scipy-stubs", marker = "extra == 'dev'", specifier = ">=1.16.1.1" }, { name = "seaborn", specifier = ">=0.13.2" }, { name = "setuptools", specifier = ">=78.1.1" }, { name = "statsmodels", marker = "python_full_version < '3.12'", specifier = ">=0.13.0" }, { name = "statsmodels", marker = "python_full_version >= '3.12'", specifier = ">=0.14.0" }, { name = "troppo", git = "https://github.com/JoshLoecker/troppo?rev=master" }, - { name = "types-aiofiles", marker = "extra == 'dev'", specifier = ">=24.1.0.20250822" }, + { name = "zfpkm", specifier = ">=1.0.3" }, ] provides-extras = ["dev", "gurobi", "interactive"] @@ -727,15 +669,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b0/d0/89247ec250369fc76db477720a26b2fce7ba079ff1380e4ab4529d2fe233/debugpy-1.8.17-py2.py3-none-any.whl", hash = "sha256:60c7dca6571efe660ccb7a9508d73ca14b8796c4ed484c2002abba714226cfef", size = 5283210, upload-time = "2025-09-17T16:34:25.835Z" }, ] -[[package]] -name = "decli" -version = "0.6.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0c/59/d4ffff1dee2c8f6f2dd8f87010962e60f7b7847504d765c91ede5a466730/decli-0.6.3.tar.gz", hash = "sha256:87f9d39361adf7f16b9ca6e3b614badf7519da13092f2db3c80ca223c53c7656", size = 7564, upload-time = "2025-06-01T15:23:41.25Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d8/fa/ec878c28bc7f65b77e7e17af3522c9948a9711b9fa7fc4c5e3140a7e3578/decli-0.6.3-py3-none-any.whl", hash = "sha256:5152347c7bb8e3114ad65db719e5709b28d7f7f45bdb709f70167925e55640f3", size = 7989, upload-time = "2025-06-01T15:23:40.228Z" }, -] - [[package]] name = "decorator" version = "5.2.1" @@ -763,18 +696,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/10/5fe7a7778cc8a701373662f99393f443541353018d3cf2bf6c8f91b032d6/depinfo-2.2.0-py3-none-any.whl", hash = "sha256:3d9ba933e7a9d718b9915f75c844a38c5603cd3cdba1816ab95e0b148b100d8f", size = 24025, upload-time = "2022-09-07T16:27:49.813Z" }, ] -[[package]] -name = "deprecated" -version = "1.2.18" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "wrapt" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/98/97/06afe62762c9a8a86af0cfb7bfdab22a43ad17138b07af5b1a58442690a2/deprecated-1.2.18.tar.gz", hash = "sha256:422b6f6d859da6f2ef57857761bfb392480502a64c3028ca9bbe86085d72115d", size = 2928744, upload-time = "2025-01-27T10:46:25.7Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998, upload-time = "2025-01-27T10:46:09.186Z" }, -] - [[package]] name = "dill" version = "0.4.0" @@ -793,15 +714,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550, upload-time = "2023-08-31T06:11:58.822Z" }, ] -[[package]] -name = "distlib" -version = "0.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/96/8e/709914eb2b5749865801041647dc7f4e6d00b549cfe88b65ca192995f07c/distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d", size = 614605, upload-time = "2025-07-17T16:52:00.465Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, -] - [[package]] name = "donfig" version = "0.8.1.post1" @@ -834,8 +746,7 @@ wheels = [ [[package]] name = "fast-bioservices" -version = "0.3.9" -source = { registry = "https://pypi.org/simple" } +source = { editable = "../fast_bioservices" } dependencies = [ { name = "aiofiles" }, { name = "appdirs" }, @@ -844,9 +755,24 @@ dependencies = [ { name = "loguru" }, { name = "pandas" }, 
] -sdist = { url = "https://files.pythonhosted.org/packages/76/f2/1575a1233ee470cdc52efa1ad5e00050bb39b3f7ebdb3813fba42035e7c3/fast_bioservices-0.3.9.tar.gz", hash = "sha256:4094d5963b5baab2f7d3a02a74d1d841e83670341065ea0ed0d1f09ba658bf05", size = 47042, upload-time = "2024-12-04T19:32:29.458Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/48/6c/6033e67a7d734ba90ff186e5404f78b0e3b59ae46e78bec11764ae50c508/fast_bioservices-0.3.9-py3-none-any.whl", hash = "sha256:f041a30300d4de5c7d2d5e0405b8505e7a7f79248e986ecf45ddb3473d7c4d8f", size = 22687, upload-time = "2024-12-04T19:32:28.023Z" }, + +[package.metadata] +requires-dist = [ + { name = "aiofiles", specifier = ">=24.1.0" }, + { name = "appdirs", specifier = ">=1.4.4" }, + { name = "hishel", specifier = ">=0.1.1" }, + { name = "httpx", specifier = ">=0.27.2" }, + { name = "loguru", specifier = ">=0.7.2" }, + { name = "pandas", specifier = ">=1.5.3" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "commitlint", specifier = ">=1.3.0" }, + { name = "pre-commit", specifier = ">=4.0.1" }, + { name = "pytest", specifier = ">=8.3.2" }, + { name = "pytest-asyncio", specifier = ">=0.24.0" }, + { name = "pytest-cov", specifier = ">=6.0.0" }, ] [[package]] @@ -858,15 +784,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/a8/20d0723294217e47de6d9e2e40fd4a9d2f7c4b6ef974babd482a59743694/fastjsonschema-2.21.2-py3-none-any.whl", hash = "sha256:1c797122d0a86c5cace2e54bf4e819c36223b552017172f32c5c024a6b77e463", size = 24024, upload-time = "2025-08-14T18:49:34.776Z" }, ] -[[package]] -name = "filelock" -version = "3.19.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/40/bb/0ab3e58d22305b6f5440629d20683af28959bf793d98d11950e305c1c326/filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58", size = 17687, upload-time = "2025-08-14T16:56:03.016Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, -] - [[package]] name = "fonttools" version = "4.60.1" @@ -1027,15 +944,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] -[[package]] -name = "identify" -version = "2.6.15" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ff/e7/685de97986c916a6d93b3876139e00eef26ad5bbbd61925d670ae8013449/identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf", size = 99311, upload-time = "2025-10-02T17:43:40.631Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0f/1c/e5fd8f973d4f375adb21565739498e2e9a1e54c858a97b9a8ccfdc81da9b/identify-2.6.15-py2.py3-none-any.whl", hash = "sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757", size = 99183, upload-time = "2025-10-02T17:43:39.137Z" }, -] - [[package]] name = "idna" version = "3.10" @@ -1175,6 +1083,21 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/1e/e8/685f47e0d754320684db4425a0967f7d3fa70126bffd76110b7009a0090f/joblib-1.5.2-py3-none-any.whl", hash = "sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241", size = 308396, upload-time = "2025-08-27T12:15:45.188Z" }, ] +[[package]] +name = "joypy" +version = "0.2.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "matplotlib" }, + { name = "numpy" }, + { name = "pandas" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/89/f4/49636d4c5fa30822028a1e2af234cecf488ba3c7e9ff5aba88e36fb0c95c/joypy-0.2.6.tar.gz", hash = "sha256:099da2d6c7d81b5eccc957bd9446831f565ba42d5abbab0fa92b81892449522e", size = 10270, upload-time = "2021-12-19T09:42:52.541Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/8c/4d32c8935431eb10fd140faa4b13b6b8de222223a88fa9ad2a7711b7f1a9/joypy-0.2.6-py2.py3-none-any.whl", hash = "sha256:fffe882e8281e56e08b374a3148436cb448562ba39e4d566204c7e8ee2caddab", size = 8584, upload-time = "2021-12-19T09:42:50.786Z" }, +] + [[package]] name = "json5" version = "0.12.1" @@ -1787,12 +1710,19 @@ wheels = [ ] [[package]] -name = "nodeenv" -version = "1.9.1" +name = "notebook" +version = "7.4.7" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" } +dependencies = [ + { name = "jupyter-server" }, + { name = "jupyterlab" }, + { name = "jupyterlab-server" }, + { name = "notebook-shim" }, + { name = "tornado" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/04/09/f6f64ba156842ef68d3ea763fa171a2f7e7224f200a15dd4af5b83c34756/notebook-7.4.7.tar.gz", hash = "sha256:3f0a04027dfcee8a876de48fba13ab77ec8c12f72f848a222ed7f5081b9e342a", size = 13937702, upload-time = "2025-09-27T08:00:22.536Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, + { url = "https://files.pythonhosted.org/packages/6c/d7/06d13087e20388926e7423d2489e728d2e59f2453039cdb0574a7c070e76/notebook-7.4.7-py3-none-any.whl", hash = "sha256:362b7c95527f7dd3c4c84d410b782872fd9c734fb2524c11dd92758527b6eda6", size = 14342894, upload-time = "2025-09-27T08:00:18.496Z" }, ] [[package]] @@ -1925,18 +1855,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/af/11/0cc63f9f321ccf63886ac203336777140011fb669e739da36d8db3c53b98/numpy-2.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:2e267c7da5bf7309670523896df97f93f6e469fb931161f483cd6882b3b1a5dc", size = 12971844, upload-time = "2025-09-09T15:58:57.359Z" }, ] -[[package]] -name = "numpy-typing-compat" -version = "20250818.2.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c9/e3/1a29f174c1e09a2bf111d37a41afceea1b501371abb39e73170ca31a7599/numpy_typing_compat-20250818.2.3.tar.gz", hash = "sha256:72e83d535b635d668ba7315e43ae80be1469a6faea6fc96d312516f39b3d8fa5", size = 4974, upload-time = "2025-08-18T23:46:42.968Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/c9/4a/fa4c90a03d6a8ee1a7f0e0fb101887d9a8cdb9b07a5901af9ae831e9feea/numpy_typing_compat-20250818.2.3-py3-none-any.whl", hash = "sha256:930413d34dd9083c0bf418815576222f1c66ea2d68950f447fd27ea1a78b26b0", size = 6286, upload-time = "2025-08-18T23:46:35.681Z" }, -] - [[package]] name = "openpyxl" version = "3.1.5" @@ -1962,24 +1880,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/18/8215ef740dd5b5d982de9c4cd352c99ce92c40c208245a2e8909bea7c0d6/optlang-1.8.3-py2.py3-none-any.whl", hash = "sha256:b81f4e873f0c1d0d907410add63aea427762d911245eb04a4a1126da5fedb595", size = 141752, upload-time = "2025-01-08T12:45:28.063Z" }, ] -[[package]] -name = "optype" -version = "0.13.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/20/7f/daa32a35b2a6a564a79723da49c0ddc464c462e67a906fc2b66a0d64f28e/optype-0.13.4.tar.gz", hash = "sha256:131d8e0f1c12d8095d553e26b54598597133830983233a6a2208886e7a388432", size = 99547, upload-time = "2025-08-19T19:52:44.242Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/37/bb/b51940f2d91071325d5ae2044562aa698470a105474d9317b9dbdaad63df/optype-0.13.4-py3-none-any.whl", hash = "sha256:500c89cfac82e2f9448a54ce0a5d5c415b6976b039c2494403cd6395bd531979", size = 87919, upload-time = "2025-08-19T19:52:41.314Z" }, -] - -[package.optional-dependencies] -numpy = [ - { name = "numpy" }, - { name = "numpy-typing-compat" }, -] - [[package]] name = "orjson" version = "3.11.3" @@ -2092,19 +1992,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" }, ] -[[package]] -name = "pandas-stubs" -version = "2.3.2.250926" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "types-pytz" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1b/3b/32be58a125db39d0b5f62cc93795f32b5bb2915bd5c4a46f0e35171985e2/pandas_stubs-2.3.2.250926.tar.gz", hash = "sha256:c64b9932760ceefb96a3222b953e6a251321a9832a28548be6506df473a66406", size = 102147, upload-time = "2025-09-26T19:50:39.522Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/96/1e4a035eaf4dce9610aac6e43026d0c6baa05773daf6d21e635a4fe19e21/pandas_stubs-2.3.2.250926-py3-none-any.whl", hash = "sha256:81121818453dcfe00f45c852f4dceee043640b813830f6e7bd084a4ef7ff7270", size = 159995, upload-time = "2025-09-26T19:50:38.241Z" }, -] - [[package]] name = "pandocfilters" version = "1.5.1" @@ -2269,22 +2156,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/23/6aef7c24f4ee6f765aeaaaa3bf24cfdb0730a20336a02b1a061d227d84be/ppft-1.7.7-py3-none-any.whl", hash = "sha256:fb7524db110682de886b4bb5b08f7bf6a38940566074ef2f62521cbbd3864676", size = 56764, upload-time = "2025-04-16T01:47:39.453Z" }, ] -[[package]] -name = "pre-commit" -version = "4.3.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cfgv" }, - { name = "identify" }, - { name = "nodeenv" }, - { name = "pyyaml" }, - { name = "virtualenv" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ff/29/7cf5bbc236333876e4b41f56e06857a87937ce4bf91e117a6991a2dbb02a/pre_commit-4.3.0.tar.gz", 
hash = "sha256:499fe450cc9d42e9d58e606262795ecb64dd05438943c62b66f6a8673da30b16", size = 193792, upload-time = "2025-08-09T18:56:14.651Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5b/a5/987a405322d78a73b66e39e4a90e4ef156fd7141bf71df987e50717c321b/pre_commit-4.3.0-py2.py3-none-any.whl", hash = "sha256:2b0747ad7e6e967169136edffee14c16e148a778a54e4f967921aa1ebf2308d8", size = 220965, upload-time = "2025-08-09T18:56:13.192Z" }, -] - [[package]] name = "prometheus-client" version = "0.23.1" @@ -2463,19 +2334,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/5e/1aa9a93198c6b64513c9d7752de7422c06402de6600a8767da1524f9570b/pyparsing-3.2.5-py3-none-any.whl", hash = "sha256:e38a4f02064cf41fe6593d328d0512495ad1f3d8a91c4f73fc401b3079a59a5e", size = 113890, upload-time = "2025-09-21T04:11:04.117Z" }, ] -[[package]] -name = "pyright" -version = "1.1.406" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nodeenv" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f7/16/6b4fbdd1fef59a0292cbb99f790b44983e390321eccbc5921b4d161da5d1/pyright-1.1.406.tar.gz", hash = "sha256:c4872bc58c9643dac09e8a2e74d472c62036910b3bd37a32813989ef7576ea2c", size = 4113151, upload-time = "2025-10-02T01:04:45.488Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f6/a2/e309afbb459f50507103793aaef85ca4348b66814c86bc73908bdeb66d12/pyright-1.1.406-py3-none-any.whl", hash = "sha256:1d81fb43c2407bf566e97e57abb01c811973fdb21b2df8df59f870f688bdca71", size = 5980982, upload-time = "2025-10-02T01:04:43.137Z" }, -] - [[package]] name = "pytest" version = "8.4.2" @@ -2575,6 +2433,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b8/49/70f6288db3ce3ef006627318d518235836c47b34e4aa7716760e33b639b9/python_libsbml-5.20.5-cp313-cp313-win_amd64.whl", hash = "sha256:763222865e39d51e408c2c9af3dafa0d58f613e75d9ff117de8f8a2b9f7eb59e", size = 6027822, upload-time = "2025-05-05T06:43:59.128Z" }, ] +[[package]] +name = "python-louvain" +version = "0.16" +source = { git = "https://github.com/taynaud/python-louvain#def91793772c3e77ab4167d175903a5365c24b4b" } +dependencies = [ + { name = "networkx" }, + { name = "numpy" }, +] + [[package]] name = "pytz" version = "2025.2" @@ -2697,18 +2564,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/01/1b/5dbe84eefc86f48473947e2f41711aded97eecef1231f4558f1f02713c12/pyzmq-27.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c9f7f6e13dff2e44a6afeaf2cf54cee5929ad64afaf4d40b50f93c58fc687355", size = 544862, upload-time = "2025-09-08T23:09:56.509Z" }, ] -[[package]] -name = "questionary" -version = "2.1.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "prompt-toolkit" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f6/45/eafb0bba0f9988f6a2520f9ca2df2c82ddfa8d67c95d6625452e97b204a5/questionary-2.1.1.tar.gz", hash = "sha256:3d7e980292bb0107abaa79c68dd3eee3c561b83a0f89ae482860b181c8bd412d", size = 25845, upload-time = "2025-08-28T19:00:20.851Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/26/1062c7ec1b053db9e499b4d2d5bc231743201b74051c973dadeac80a8f43/questionary-2.1.1-py3-none-any.whl", hash = "sha256:a51af13f345f1cdea62347589fbb6df3b290306ab8930713bfae4d475a7d4a59", size = 36753, upload-time = "2025-08-28T19:00:19.56Z" }, -] - [[package]] name = "referencing" version = "0.36.2" @@ -2913,32 +2768,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/51/79/76aba16a1689b50528224b182f71097ece338e7a4ab55e84c2e73443b78a/ruamel.yaml.clib-0.2.14-cp313-cp313-win_amd64.whl", hash = "sha256:090782b5fb9d98df96509eecdbcaffd037d47389a89492320280d52f91330d78", size = 115238, upload-time = "2025-09-22T19:51:07.081Z" }, ] -[[package]] -name = "ruff" -version = "0.13.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c7/8e/f9f9ca747fea8e3ac954e3690d4698c9737c23b51731d02df999c150b1c9/ruff-0.13.3.tar.gz", hash = "sha256:5b0ba0db740eefdfbcce4299f49e9eaefc643d4d007749d77d047c2bab19908e", size = 5438533, upload-time = "2025-10-02T19:29:31.582Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/33/8f7163553481466a92656d35dea9331095122bb84cf98210bef597dd2ecd/ruff-0.13.3-py3-none-linux_armv6l.whl", hash = "sha256:311860a4c5e19189c89d035638f500c1e191d283d0cc2f1600c8c80d6dcd430c", size = 12484040, upload-time = "2025-10-02T19:28:49.199Z" }, - { url = "https://files.pythonhosted.org/packages/b0/b5/4a21a4922e5dd6845e91896b0d9ef493574cbe061ef7d00a73c61db531af/ruff-0.13.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:2bdad6512fb666b40fcadb65e33add2b040fc18a24997d2e47fee7d66f7fcae2", size = 13122975, upload-time = "2025-10-02T19:28:52.446Z" }, - { url = "https://files.pythonhosted.org/packages/40/90/15649af836d88c9f154e5be87e64ae7d2b1baa5a3ef317cb0c8fafcd882d/ruff-0.13.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:fc6fa4637284708d6ed4e5e970d52fc3b76a557d7b4e85a53013d9d201d93286", size = 12346621, upload-time = "2025-10-02T19:28:54.712Z" }, - { url = "https://files.pythonhosted.org/packages/a5/42/bcbccb8141305f9a6d3f72549dd82d1134299177cc7eaf832599700f95a7/ruff-0.13.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c9e6469864f94a98f412f20ea143d547e4c652f45e44f369d7b74ee78185838", size = 12574408, upload-time = "2025-10-02T19:28:56.679Z" }, - { url = "https://files.pythonhosted.org/packages/ce/19/0f3681c941cdcfa2d110ce4515624c07a964dc315d3100d889fcad3bfc9e/ruff-0.13.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5bf62b705f319476c78891e0e97e965b21db468b3c999086de8ffb0d40fd2822", size = 12285330, upload-time = "2025-10-02T19:28:58.79Z" }, - { url = "https://files.pythonhosted.org/packages/10/f8/387976bf00d126b907bbd7725219257feea58650e6b055b29b224d8cb731/ruff-0.13.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78cc1abed87ce40cb07ee0667ce99dbc766c9f519eabfd948ed87295d8737c60", size = 13980815, upload-time = "2025-10-02T19:29:01.577Z" }, - { url = "https://files.pythonhosted.org/packages/0c/a6/7c8ec09d62d5a406e2b17d159e4817b63c945a8b9188a771193b7e1cc0b5/ruff-0.13.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4fb75e7c402d504f7a9a259e0442b96403fa4a7310ffe3588d11d7e170d2b1e3", size = 14987733, upload-time = "2025-10-02T19:29:04.036Z" }, - { url = "https://files.pythonhosted.org/packages/97/e5/f403a60a12258e0fd0c2195341cfa170726f254c788673495d86ab5a9a9d/ruff-0.13.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:17b951f9d9afb39330b2bdd2dd144ce1c1335881c277837ac1b50bfd99985ed3", size = 14439848, upload-time = "2025-10-02T19:29:06.684Z" }, - { url = "https://files.pythonhosted.org/packages/39/49/3de381343e89364c2334c9f3268b0349dc734fc18b2d99a302d0935c8345/ruff-0.13.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6052f8088728898e0a449f0dde8fafc7ed47e4d878168b211977e3e7e854f662", size = 13421890, 
upload-time = "2025-10-02T19:29:08.767Z" }, - { url = "https://files.pythonhosted.org/packages/ab/b5/c0feca27d45ae74185a6bacc399f5d8920ab82df2d732a17213fb86a2c4c/ruff-0.13.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc742c50f4ba72ce2a3be362bd359aef7d0d302bf7637a6f942eaa763bd292af", size = 13444870, upload-time = "2025-10-02T19:29:11.234Z" }, - { url = "https://files.pythonhosted.org/packages/50/a1/b655298a1f3fda4fdc7340c3f671a4b260b009068fbeb3e4e151e9e3e1bf/ruff-0.13.3-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:8e5640349493b378431637019366bbd73c927e515c9c1babfea3e932f5e68e1d", size = 13691599, upload-time = "2025-10-02T19:29:13.353Z" }, - { url = "https://files.pythonhosted.org/packages/32/b0/a8705065b2dafae007bcae21354e6e2e832e03eb077bb6c8e523c2becb92/ruff-0.13.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:6b139f638a80eae7073c691a5dd8d581e0ba319540be97c343d60fb12949c8d0", size = 12421893, upload-time = "2025-10-02T19:29:15.668Z" }, - { url = "https://files.pythonhosted.org/packages/0d/1e/cbe7082588d025cddbb2f23e6dfef08b1a2ef6d6f8328584ad3015b5cebd/ruff-0.13.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:6b547def0a40054825de7cfa341039ebdfa51f3d4bfa6a0772940ed351d2746c", size = 12267220, upload-time = "2025-10-02T19:29:17.583Z" }, - { url = "https://files.pythonhosted.org/packages/a5/99/4086f9c43f85e0755996d09bdcb334b6fee9b1eabdf34e7d8b877fadf964/ruff-0.13.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9cc48a3564423915c93573f1981d57d101e617839bef38504f85f3677b3a0a3e", size = 13177818, upload-time = "2025-10-02T19:29:19.943Z" }, - { url = "https://files.pythonhosted.org/packages/9b/de/7b5db7e39947d9dc1c5f9f17b838ad6e680527d45288eeb568e860467010/ruff-0.13.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1a993b17ec03719c502881cb2d5f91771e8742f2ca6de740034433a97c561989", size = 13618715, upload-time = "2025-10-02T19:29:22.527Z" }, - { url = "https://files.pythonhosted.org/packages/28/d3/bb25ee567ce2f61ac52430cf99f446b0e6d49bdfa4188699ad005fdd16aa/ruff-0.13.3-py3-none-win32.whl", hash = "sha256:f14e0d1fe6460f07814d03c6e32e815bff411505178a1f539a38f6097d3e8ee3", size = 12334488, upload-time = "2025-10-02T19:29:24.782Z" }, - { url = "https://files.pythonhosted.org/packages/cf/49/12f5955818a1139eed288753479ba9d996f6ea0b101784bb1fe6977ec128/ruff-0.13.3-py3-none-win_amd64.whl", hash = "sha256:621e2e5812b691d4f244638d693e640f188bacbb9bc793ddd46837cea0503dd2", size = 13455262, upload-time = "2025-10-02T19:29:26.882Z" }, - { url = "https://files.pythonhosted.org/packages/fe/72/7b83242b26627a00e3af70d0394d68f8f02750d642567af12983031777fc/ruff-0.13.3-py3-none-win_arm64.whl", hash = "sha256:9e9e9d699841eaf4c2c798fa783df2fabc680b72059a02ca0ed81c460bc58330", size = 12538484, upload-time = "2025-10-02T19:29:28.951Z" }, -] - [[package]] name = "scanpy" version = "1.11.4" @@ -3056,18 +2885,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d6/5e/2cc7555fd81d01814271412a1d59a289d25f8b63208a0a16c21069d55d3e/scipy-1.16.2-cp313-cp313t-win_arm64.whl", hash = "sha256:98e22834650be81d42982360382b43b17f7ba95e0e6993e2a4f5b9ad9283a94d", size = 25787992, upload-time = "2025-09-11T17:43:19.745Z" }, ] -[[package]] -name = "scipy-stubs" -version = "1.16.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "optype", extra = ["numpy"] }, -] -sdist = { url = "https://files.pythonhosted.org/packages/4b/84/b4c2caf7748f331870992e7ede5b5df0b080671bcef8c8c7e27a3cf8694a/scipy_stubs-1.16.2.0.tar.gz", hash = 
"sha256:8fdd45155fca401bb755b1b63ac2f192f84f25c3be8da2c99d1cafb2708f3052", size = 352676, upload-time = "2025-09-11T23:28:59.236Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/83/c8/67d984c264f759e7653c130a4b12ae3b4f4304867579560e9a869adb7883/scipy_stubs-1.16.2.0-py3-none-any.whl", hash = "sha256:18c50d49e3c932033fdd4f7fa4fea9e45c8787f92bceaec9e86ccbd140e835d5", size = 553247, upload-time = "2025-09-11T23:28:57.688Z" }, -] - [[package]] name = "seaborn" version = "0.13.2" @@ -3272,15 +3089,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ] -[[package]] -name = "termcolor" -version = "3.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/6c/3d75c196ac07ac8749600b60b03f4f6094d54e132c4d94ebac6ee0e0add0/termcolor-3.1.0.tar.gz", hash = "sha256:6a6dd7fbee581909eeec6a756cff1d7f7c376063b14e4a298dc4980309e55970", size = 14324, upload-time = "2025-04-30T11:37:53.791Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4f/bd/de8d508070629b6d84a30d01d57e4a65c69aa7f5abe7560b8fad3b50ea59/termcolor-3.1.0-py3-none-any.whl", hash = "sha256:591dd26b5c2ce03b9e43f391264626557873ce1d379019786f99b0c2bee140aa", size = 7684, upload-time = "2025-04-30T11:37:52.382Z" }, -] - [[package]] name = "terminado" version = "0.18.1" @@ -3355,15 +3163,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, ] -[[package]] -name = "tomlkit" -version = "0.13.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cc/18/0bbf3884e9eaa38819ebe46a7bd25dcd56b67434402b66a58c4b8e552575/tomlkit-0.13.3.tar.gz", hash = "sha256:430cf247ee57df2b94ee3fbe588e71d362a941ebb545dec29b53961d61add2a1", size = 185207, upload-time = "2025-06-05T07:13:44.947Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bd/75/8539d011f6be8e29f339c42e633aae3cb73bffa95dd0f9adec09b9c58e85/tomlkit-0.13.3-py3-none-any.whl", hash = "sha256:c89c649d79ee40629a9fda55f8ace8c6a1b42deb912b2a8fd8d942ddadb606b0", size = 38901, upload-time = "2025-06-05T07:13:43.546Z" }, -] - [[package]] name = "tornado" version = "6.5.2" @@ -3423,15 +3222,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/85/a4ff8758c66f1fc32aa5e9a145908394bf9cf1c79ffd1113cfdeb77e74e4/trove_classifiers-2025.9.11.17-py3-none-any.whl", hash = "sha256:5d392f2d244deb1866556457d6f3516792124a23d1c3a463a2e8668a5d1c15dd", size = 14158, upload-time = "2025-09-11T17:07:49.886Z" }, ] -[[package]] -name = "types-aiofiles" -version = "24.1.0.20250822" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/19/48/c64471adac9206cc844afb33ed311ac5a65d2f59df3d861e0f2d0cad7414/types_aiofiles-24.1.0.20250822.tar.gz", hash = "sha256:9ab90d8e0c307fe97a7cf09338301e3f01a163e39f3b529ace82466355c84a7b", size = 14484, upload-time = "2025-08-22T03:02:23.039Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/bc/8e/5e6d2215e1d8f7c2a94c6e9d0059ae8109ce0f5681956d11bb0a228cef04/types_aiofiles-24.1.0.20250822-py3-none-any.whl", hash = "sha256:0ec8f8909e1a85a5a79aed0573af7901f53120dd2a29771dd0b3ef48e12328b0", size = 14322, upload-time = "2025-08-22T03:02:21.918Z" }, -] - [[package]] name = "types-python-dateutil" version = "2.9.0.20250822" @@ -3441,15 +3231,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/d9/a29dfa84363e88b053bf85a8b7f212a04f0d7343a4d24933baa45c06e08b/types_python_dateutil-2.9.0.20250822-py3-none-any.whl", hash = "sha256:849d52b737e10a6dc6621d2bd7940ec7c65fcb69e6aa2882acf4e56b2b508ddc", size = 17892, upload-time = "2025-08-22T03:01:59.436Z" }, ] -[[package]] -name = "types-pytz" -version = "2025.2.0.20250809" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/07/e2/c774f754de26848f53f05defff5bb21dd9375a059d1ba5b5ea943cf8206e/types_pytz-2025.2.0.20250809.tar.gz", hash = "sha256:222e32e6a29bb28871f8834e8785e3801f2dc4441c715cd2082b271eecbe21e5", size = 10876, upload-time = "2025-08-09T03:14:17.453Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/db/d0/91c24fe54e565f2344d7a6821e6c6bb099841ef09007ea6321a0bac0f808/types_pytz-2025.2.0.20250809-py3-none-any.whl", hash = "sha256:4f55ed1b43e925cf851a756fe1707e0f5deeb1976e15bf844bcaa025e8fbd0db", size = 10095, upload-time = "2025-08-09T03:14:16.674Z" }, -] - [[package]] name = "typing-extensions" version = "4.15.0" @@ -3515,20 +3296,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, ] -[[package]] -name = "virtualenv" -version = "20.34.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "distlib" }, - { name = "filelock" }, - { name = "platformdirs" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1c/14/37fcdba2808a6c615681cd216fecae00413c9dab44fb2e57805ecf3eaee3/virtualenv-20.34.0.tar.gz", hash = "sha256:44815b2c9dee7ed86e387b842a84f20b93f7f417f95886ca1996a72a4138eb1a", size = 6003808, upload-time = "2025-08-13T14:24:07.464Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/06/04c8e804f813cf972e3262f3f8584c232de64f0cde9f703b46cf53a45090/virtualenv-20.34.0-py3-none-any.whl", hash = "sha256:341f5afa7eee943e4984a9207c025feedd768baff6753cd660c857ceb3e36026", size = 5983279, upload-time = "2025-08-13T14:24:05.111Z" }, -] - [[package]] name = "wcwidth" version = "0.2.14" @@ -3574,45 +3341,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, ] -[[package]] -name = "wrapt" -version = "1.17.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload-time = "2025-08-12T05:53:21.714Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/52/db/00e2a219213856074a213503fdac0511203dceefff26e1daa15250cc01a0/wrapt-1.17.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:273a736c4645e63ac582c60a56b0acb529ef07f78e08dc6bfadf6a46b19c0da7", size = 53482, upload-time = "2025-08-12T05:51:45.79Z" }, - { url = "https://files.pythonhosted.org/packages/5e/30/ca3c4a5eba478408572096fe9ce36e6e915994dd26a4e9e98b4f729c06d9/wrapt-1.17.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5531d911795e3f935a9c23eb1c8c03c211661a5060aab167065896bbf62a5f85", size = 38674, upload-time = "2025-08-12T05:51:34.629Z" }, - { url = "https://files.pythonhosted.org/packages/31/25/3e8cc2c46b5329c5957cec959cb76a10718e1a513309c31399a4dad07eb3/wrapt-1.17.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0610b46293c59a3adbae3dee552b648b984176f8562ee0dba099a56cfbe4df1f", size = 38959, upload-time = "2025-08-12T05:51:56.074Z" }, - { url = "https://files.pythonhosted.org/packages/5d/8f/a32a99fc03e4b37e31b57cb9cefc65050ea08147a8ce12f288616b05ef54/wrapt-1.17.3-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b32888aad8b6e68f83a8fdccbf3165f5469702a7544472bdf41f582970ed3311", size = 82376, upload-time = "2025-08-12T05:52:32.134Z" }, - { url = "https://files.pythonhosted.org/packages/31/57/4930cb8d9d70d59c27ee1332a318c20291749b4fba31f113c2f8ac49a72e/wrapt-1.17.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cccf4f81371f257440c88faed6b74f1053eef90807b77e31ca057b2db74edb1", size = 83604, upload-time = "2025-08-12T05:52:11.663Z" }, - { url = "https://files.pythonhosted.org/packages/a8/f3/1afd48de81d63dd66e01b263a6fbb86e1b5053b419b9b33d13e1f6d0f7d0/wrapt-1.17.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8a210b158a34164de8bb68b0e7780041a903d7b00c87e906fb69928bf7890d5", size = 82782, upload-time = "2025-08-12T05:52:12.626Z" }, - { url = "https://files.pythonhosted.org/packages/1e/d7/4ad5327612173b144998232f98a85bb24b60c352afb73bc48e3e0d2bdc4e/wrapt-1.17.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:79573c24a46ce11aab457b472efd8d125e5a51da2d1d24387666cd85f54c05b2", size = 82076, upload-time = "2025-08-12T05:52:33.168Z" }, - { url = "https://files.pythonhosted.org/packages/bb/59/e0adfc831674a65694f18ea6dc821f9fcb9ec82c2ce7e3d73a88ba2e8718/wrapt-1.17.3-cp311-cp311-win32.whl", hash = "sha256:c31eebe420a9a5d2887b13000b043ff6ca27c452a9a22fa71f35f118e8d4bf89", size = 36457, upload-time = "2025-08-12T05:53:03.936Z" }, - { url = "https://files.pythonhosted.org/packages/83/88/16b7231ba49861b6f75fc309b11012ede4d6b0a9c90969d9e0db8d991aeb/wrapt-1.17.3-cp311-cp311-win_amd64.whl", hash = "sha256:0b1831115c97f0663cb77aa27d381237e73ad4f721391a9bfb2fe8bc25fa6e77", size = 38745, upload-time = "2025-08-12T05:53:02.885Z" }, - { url = "https://files.pythonhosted.org/packages/9a/1e/c4d4f3398ec073012c51d1c8d87f715f56765444e1a4b11e5180577b7e6e/wrapt-1.17.3-cp311-cp311-win_arm64.whl", hash = "sha256:5a7b3c1ee8265eb4c8f1b7d29943f195c00673f5ab60c192eba2d4a7eae5f46a", size = 36806, upload-time = "2025-08-12T05:52:53.368Z" }, - { url = "https://files.pythonhosted.org/packages/9f/41/cad1aba93e752f1f9268c77270da3c469883d56e2798e7df6240dcb2287b/wrapt-1.17.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ab232e7fdb44cdfbf55fc3afa31bcdb0d8980b9b95c38b6405df2acb672af0e0", size = 53998, upload-time = "2025-08-12T05:51:47.138Z" }, - { url = 
"https://files.pythonhosted.org/packages/60/f8/096a7cc13097a1869fe44efe68dace40d2a16ecb853141394047f0780b96/wrapt-1.17.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9baa544e6acc91130e926e8c802a17f3b16fbea0fd441b5a60f5cf2cc5c3deba", size = 39020, upload-time = "2025-08-12T05:51:35.906Z" }, - { url = "https://files.pythonhosted.org/packages/33/df/bdf864b8997aab4febb96a9ae5c124f700a5abd9b5e13d2a3214ec4be705/wrapt-1.17.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b538e31eca1a7ea4605e44f81a48aa24c4632a277431a6ed3f328835901f4fd", size = 39098, upload-time = "2025-08-12T05:51:57.474Z" }, - { url = "https://files.pythonhosted.org/packages/9f/81/5d931d78d0eb732b95dc3ddaeeb71c8bb572fb01356e9133916cd729ecdd/wrapt-1.17.3-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:042ec3bb8f319c147b1301f2393bc19dba6e176b7da446853406d041c36c7828", size = 88036, upload-time = "2025-08-12T05:52:34.784Z" }, - { url = "https://files.pythonhosted.org/packages/ca/38/2e1785df03b3d72d34fc6252d91d9d12dc27a5c89caef3335a1bbb8908ca/wrapt-1.17.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3af60380ba0b7b5aeb329bc4e402acd25bd877e98b3727b0135cb5c2efdaefe9", size = 88156, upload-time = "2025-08-12T05:52:13.599Z" }, - { url = "https://files.pythonhosted.org/packages/b3/8b/48cdb60fe0603e34e05cffda0b2a4adab81fd43718e11111a4b0100fd7c1/wrapt-1.17.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b02e424deef65c9f7326d8c19220a2c9040c51dc165cddb732f16198c168396", size = 87102, upload-time = "2025-08-12T05:52:14.56Z" }, - { url = "https://files.pythonhosted.org/packages/3c/51/d81abca783b58f40a154f1b2c56db1d2d9e0d04fa2d4224e357529f57a57/wrapt-1.17.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:74afa28374a3c3a11b3b5e5fca0ae03bef8450d6aa3ab3a1e2c30e3a75d023dc", size = 87732, upload-time = "2025-08-12T05:52:36.165Z" }, - { url = "https://files.pythonhosted.org/packages/9e/b1/43b286ca1392a006d5336412d41663eeef1ad57485f3e52c767376ba7e5a/wrapt-1.17.3-cp312-cp312-win32.whl", hash = "sha256:4da9f45279fff3543c371d5ababc57a0384f70be244de7759c85a7f989cb4ebe", size = 36705, upload-time = "2025-08-12T05:53:07.123Z" }, - { url = "https://files.pythonhosted.org/packages/28/de/49493f962bd3c586ab4b88066e967aa2e0703d6ef2c43aa28cb83bf7b507/wrapt-1.17.3-cp312-cp312-win_amd64.whl", hash = "sha256:e71d5c6ebac14875668a1e90baf2ea0ef5b7ac7918355850c0908ae82bcb297c", size = 38877, upload-time = "2025-08-12T05:53:05.436Z" }, - { url = "https://files.pythonhosted.org/packages/f1/48/0f7102fe9cb1e8a5a77f80d4f0956d62d97034bbe88d33e94699f99d181d/wrapt-1.17.3-cp312-cp312-win_arm64.whl", hash = "sha256:604d076c55e2fdd4c1c03d06dc1a31b95130010517b5019db15365ec4a405fc6", size = 36885, upload-time = "2025-08-12T05:52:54.367Z" }, - { url = "https://files.pythonhosted.org/packages/fc/f6/759ece88472157acb55fc195e5b116e06730f1b651b5b314c66291729193/wrapt-1.17.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a47681378a0439215912ef542c45a783484d4dd82bac412b71e59cf9c0e1cea0", size = 54003, upload-time = "2025-08-12T05:51:48.627Z" }, - { url = "https://files.pythonhosted.org/packages/4f/a9/49940b9dc6d47027dc850c116d79b4155f15c08547d04db0f07121499347/wrapt-1.17.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:54a30837587c6ee3cd1a4d1c2ec5d24e77984d44e2f34547e2323ddb4e22eb77", size = 39025, upload-time = "2025-08-12T05:51:37.156Z" }, - { url = 
"https://files.pythonhosted.org/packages/45/35/6a08de0f2c96dcdd7fe464d7420ddb9a7655a6561150e5fc4da9356aeaab/wrapt-1.17.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:16ecf15d6af39246fe33e507105d67e4b81d8f8d2c6598ff7e3ca1b8a37213f7", size = 39108, upload-time = "2025-08-12T05:51:58.425Z" }, - { url = "https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6fd1ad24dc235e4ab88cda009e19bf347aabb975e44fd5c2fb22a3f6e4141277", size = 88072, upload-time = "2025-08-12T05:52:37.53Z" }, - { url = "https://files.pythonhosted.org/packages/78/f2/efe19ada4a38e4e15b6dff39c3e3f3f73f5decf901f66e6f72fe79623a06/wrapt-1.17.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ed61b7c2d49cee3c027372df5809a59d60cf1b6c2f81ee980a091f3afed6a2d", size = 88214, upload-time = "2025-08-12T05:52:15.886Z" }, - { url = "https://files.pythonhosted.org/packages/40/90/ca86701e9de1622b16e09689fc24b76f69b06bb0150990f6f4e8b0eeb576/wrapt-1.17.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:423ed5420ad5f5529db9ce89eac09c8a2f97da18eb1c870237e84c5a5c2d60aa", size = 87105, upload-time = "2025-08-12T05:52:17.914Z" }, - { url = "https://files.pythonhosted.org/packages/fd/e0/d10bd257c9a3e15cbf5523025252cc14d77468e8ed644aafb2d6f54cb95d/wrapt-1.17.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e01375f275f010fcbf7f643b4279896d04e571889b8a5b3f848423d91bf07050", size = 87766, upload-time = "2025-08-12T05:52:39.243Z" }, - { url = "https://files.pythonhosted.org/packages/e8/cf/7d848740203c7b4b27eb55dbfede11aca974a51c3d894f6cc4b865f42f58/wrapt-1.17.3-cp313-cp313-win32.whl", hash = "sha256:53e5e39ff71b3fc484df8a522c933ea2b7cdd0d5d15ae82e5b23fde87d44cbd8", size = 36711, upload-time = "2025-08-12T05:53:10.074Z" }, - { url = "https://files.pythonhosted.org/packages/57/54/35a84d0a4d23ea675994104e667ceff49227ce473ba6a59ba2c84f250b74/wrapt-1.17.3-cp313-cp313-win_amd64.whl", hash = "sha256:1f0b2f40cf341ee8cc1a97d51ff50dddb9fcc73241b9143ec74b30fc4f44f6cb", size = 38885, upload-time = "2025-08-12T05:53:08.695Z" }, - { url = "https://files.pythonhosted.org/packages/01/77/66e54407c59d7b02a3c4e0af3783168fff8e5d61def52cda8728439d86bc/wrapt-1.17.3-cp313-cp313-win_arm64.whl", hash = "sha256:7425ac3c54430f5fc5e7b6f41d41e704db073309acfc09305816bc6a0b26bb16", size = 36896, upload-time = "2025-08-12T05:52:55.34Z" }, - { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" }, -] - [[package]] name = "xlrd" version = "2.0.2" @@ -3637,3 +3365,18 @@ sdist = { url = "https://files.pythonhosted.org/packages/d6/67/14be68a7bad15eecd wheels = [ { url = "https://files.pythonhosted.org/packages/1a/71/9de7229515a53d1cc5705ca9c411530f711a2242f962214d9dbfe2741aa4/zarr-3.1.3-py3-none-any.whl", hash = "sha256:45f67f87f65f14fa453f99dd8110a5936b7ac69f3a21981d33e90407c80c302a", size = 276427, upload-time = "2025-09-18T19:32:40.042Z" }, ] + +[[package]] +name = "zfpkm" +version = "1.0.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "loguru" }, + { name = "matplotlib" }, + { name = "numpy" }, + { name = "pandas" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/e3/7f/ff714f85601cd66439f2beed0d740772509b32b8be5a8b01a53652248714/zfpkm-1.0.3.tar.gz", hash = "sha256:58830ea61e6adc0c75f28d5304885bd03a33a6e9e56aa693856cbb37e30a5046", size = 15410, upload-time = "2025-11-10T16:47:45.614Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/f8/ef2baeaf2d15682d5d663c3f5165b63abad114fc0c4cd90b67b1ed0a6456/zfpkm-1.0.3-py3-none-any.whl", hash = "sha256:085007f97e75e50d686677ee28e3fceba5fc19958b35e9fbad3756ca2302a219", size = 17841, upload-time = "2025-11-10T16:47:44.805Z" }, +] From 5afa6f3f627cc00ab7f23e5cac938ed6c0fefb29 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:32:02 -0600 Subject: [PATCH 23/26] fix: rename count to quant in testing files Signed-off-by: Josh Loecker --- tests/unit/test_rnaseq_preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_rnaseq_preprocess.py b/tests/unit/test_rnaseq_preprocess.py index 6b3419c2..19ff9cab 100644 --- a/tests/unit/test_rnaseq_preprocess.py +++ b/tests/unit/test_rnaseq_preprocess.py @@ -52,9 +52,9 @@ def test_sample_name_from_filepath(any_como_input_filepath: Path): def test_organize_gene_counts_files(como_input_data_directory: Path): metric: _StudyMetrics for metric in _organize_gene_counts_files(como_input_data_directory): - assert len(metric.sample_names) == metric.num_samples == len(metric.count_files) == len(metric.strand_files) + assert len(metric.sample_names) == metric.num_samples == len(metric.quant_files) == len(metric.strand_files) - for file in metric.count_files: + for file in metric.quant_files: assert f"/{metric.study_name}/" in file.as_posix() assert "geneCounts" in file.as_posix() assert file.suffix == ".tab" From 351e93c597924f92bd4107bdd0cfad40f3bf9d84 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:32:49 -0600 Subject: [PATCH 24/26] feat: add single cell normalization using scanpy defaults Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 116 +++++++++++++++++++++++++++++++++++----- 1 file changed, 104 insertions(+), 12 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 4c522e3c..327c6ba5 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -565,8 +565,93 @@ def zfpkm_filter( # determine which genes are confidently expressed top_samples = round(high_confidence_sample_expression * len(zfpkm_df.columns)) top_func = k_over_a(top_samples, cut_off) - top_genes: npt.NDArray[bool] = genefilter(zfpkm_df, top_func) - metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(zfpkm_df.index, top_genes, strict=True) if keep] + top_genes: npt.NDArray[np.bool] = genefilter(zfpkm_df, top_func) + metric.high_confidence_entrez_gene_ids = [ + gene for gene, keep in zip(zfpkm_df.index, top_genes, strict=True) if keep + ] + + return metrics + + +def umi_filter( + metrics: NamedMetrics, + filtering_options: _FilteringOptions, + target_sum: int = 10_000, + perform_normalization: bool = False, +) -> NamedMetrics: + """Perform UMI-based filtering. + + UMI filtering uses ScanPy's built-in `sc.pp.scale` (if `perform_normalization=True`) + Otherwise, this function assumes that data has been pre-normalized+scaled beforehand and will evaluate expressed & highly expressed genes directly + + For each metric's matrix: + - The rows are genomic identifiers (gene symbol, entrez gene id, ensembl gene id, etc.) 
+    - The columns are gene identifiers (gene symbol, entrez gene id, ensembl gene id, etc.)
+
+    Calculating counts per cell is therefore a row-wise sum (axis=1).
+
+    :param metrics: The metrics to perform UMI filtering on.
+    :param filtering_options: Options for filtering the count matrix.
+    :param target_sum: The target sum for UMI normalization.
+    :param perform_normalization: Whether to perform normalization before filtering.
+
+    :returns: The filtered metrics.
+    """
+    min_sample_expression = filtering_options.replicate_ratio
+    high_confidence_sample_expression = filtering_options.high_replicate_ratio
+    cut_off = filtering_options.cut_off
+
+    if min_sample_expression > 0.20:
+        logger.warning(
+            "Setting a minimum sample expression greater than ~20% for UMI-based filtering will likely result in very few to no genes being marked as active. "  # noqa: E501
+            "Activity values ranging from 10-20% are recommended based on recent literature. "
+            f"Got: {min_sample_expression} for option 'replicate_ratio'."
+        )
+    if high_confidence_sample_expression > 0.40:
+        logger.warning(
+            "Setting high-confidence expression greater than ~40% for UMI-based filtering will likely result in very few to no genes being marked as highly active. "  # noqa: E501
+            "Activity values ranging from 20-30% are recommended based on recent literature. "
+            f"Got: {high_confidence_sample_expression} for option 'high_replicate_ratio'."
+        )
+
+    for metric in metrics.values():
+        metric: _StudyMetrics
+        if not isinstance(metric.count_matrix, sc.AnnData):
+            raise TypeError(f"Expected a scanpy.AnnData for UMI filtering, got: '{type(metric.count_matrix)}'")
+        adata: sc.AnnData = metric.count_matrix
+
+        if perform_normalization:
+            if adata.raw is not None:
+                # restore raw counts so normalization is not stacked on an earlier transform
+                adata.X = adata.raw.X.copy()
+            sc.pp.filter_cells(adata, min_genes=20)
+            sc.pp.filter_genes(adata, min_cells=1)
+            sc.pp.normalize_total(adata, target_sum=target_sum)
+            sc.pp.log1p(adata)
+            # sc.pp.scale(adata, max_value=15)  # would clip abs(values) > 15 standard deviations to +/-15
+
+        # log1p-normalized matrix; z-scaling via `sc.pp.scale` is currently disabled above
+        metric.z_score_matrix = adata
+
+        adata_x = adata.X
+        n_cells, n_genes = adata.shape
+
+        # a gene is marked active when at least `min_samples` cells express it above `cut_off`
+        min_samples: int = round(min_sample_expression * n_cells)
+        min_func = k_over_a(min_samples, cut_off)
+        min_genes_mask = np.zeros(n_genes, dtype=bool)
+        for j in range(n_genes):
+            col = adata_x.getcol(j).toarray().ravel() if sparse.issparse(adata_x) else adata_x[:, j]
+            min_genes_mask[j] = min_func(col)
+        metric.entrez_gene_ids = (
+            adata.var.loc[min_genes_mask, "entrez_gene_id"].dropna().tolist()
+        )  # at this point we do not need/want NA entrez IDs
+
+        top_samples = round(high_confidence_sample_expression * n_cells)
+        top_func = k_over_a(top_samples, cut_off)
+        top_genes_mask = np.zeros(n_genes, dtype=bool)
+        for j in range(n_genes):
+            col = adata_x.getcol(j).toarray().ravel() if sparse.issparse(adata_x) else adata_x[:, j]
+            top_genes_mask[j] = top_func(col)
+        metric.high_confidence_entrez_gene_ids = adata.var.loc[top_genes_mask, "entrez_gene_id"].dropna().tolist()
 
     return metrics
 
 
@@ -581,6 +666,8 @@ def filter_counts(
     force_zfpkm_plot: bool,
     zfpkm_min_peak_height: float,
     zfpkm_min_peak_distance: int,
+    umi_target_sum: int = 10_000,
+    umi_perform_normalization: bool = False,
     output_zfpkm_plot_dirpath: Path | None = None,
     force_negative_to_zero: bool = False,
 ) -> NamedMetrics:
@@ -600,12 +687,13 @@ def filter_counts(
     :param force_negative_to_zero: Should negative values be forcibly set to 0?
This could happen as a result of normalization producing negative near-zero values (e.g., -0.001) - Returns: - A dictionary of filtered study metrics. + :returns: A dictionary of filtered study metrics. """ match technique: case FilteringTechnique.CPM: - return cpm_filter(context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep) + return cpm_filter( + context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep + ) case FilteringTechnique.TPM: return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options) case FilteringTechnique.ZFPKM: @@ -620,15 +708,11 @@ def filter_counts( force_negative_to_zero=force_negative_to_zero, ) case FilteringTechnique.UMI: - # UMI filtering is the same as zFPKM filtering without calculating FPKM - return zfpkm_filter( + return umi_filter( metrics=metrics, filtering_options=filtering_options, - calculate_fpkm=False, - force_zfpkm_plot=force_zfpkm_plot, - min_peak_height=zfpkm_min_peak_height, - min_peak_distance=zfpkm_min_peak_distance, - output_png_dirpath=output_zfpkm_plot_dirpath, + target_sum=umi_target_sum, + perform_normalization=umi_perform_normalization, ) case _: _log_and_raise_error( @@ -655,6 +739,8 @@ async def _process( force_zfpkm_plot: bool, zfpkm_min_peak_height: float, zfpkm_min_peak_distance: int, + umi_target_sum: int, + umi_perform_normalization: bool, output_boolean_activity_filepath: Path, output_zscore_normalization_filepath: Path, output_zfpkm_plot_dirpath: Path | None, @@ -688,6 +774,8 @@ async def _process( force_zfpkm_plot=force_zfpkm_plot, zfpkm_min_peak_height=zfpkm_min_peak_height, zfpkm_min_peak_distance=zfpkm_min_peak_distance, + umi_target_sum=umi_target_sum, + umi_perform_normalization=umi_perform_normalization, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, force_negative_to_zero=force_negative_to_zero, ) @@ -807,6 +895,8 @@ async def rnaseq_gen( # noqa: C901 :param technique: The filtering technique to use :param zfpkm_min_peak_height: The height of the zFPKM peak :param zfpkm_min_peak_distance: The distance of the zFPKM peak + :param umi_target_sum: The target sum for UMI normalization + :param umi_perform_normalization: Should UMI normalization be performed? :param cutoff: The cutoff value to use for the provided filtering technique :param force_zfpkm_plot: If too many samples exist, should plotting be done anyway? 
:param log_level: The level of logging to output @@ -908,6 +998,8 @@ async def rnaseq_gen( # noqa: C901 force_zfpkm_plot=force_zfpkm_plot, zfpkm_min_peak_height=zfpkm_min_peak_height, zfpkm_min_peak_distance=zfpkm_min_peak_distance, + umi_target_sum=umi_target_sum, + umi_perform_normalization=umi_perform_normalization, output_boolean_activity_filepath=output_boolean_activity_filepath, output_zscore_normalization_filepath=output_zscore_normalization_filepath, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, From 2fd9249fe5d7055a3637b2f6b39f8b1ebd81bb90 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 9 Feb 2026 16:44:25 -0600 Subject: [PATCH 25/26] fix: test new quant information Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 2 +- tests/unit/test_rnaseq_preprocess.py | 35 ++++++++++++++-------------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 06f33922..967236b7 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -46,7 +46,7 @@ def build_from_sf(cls, filepath: Path) -> _QuantInformation: level=LogLevel.ERROR, ) - sample_name = filepath.stem.removesuffix("_quant.genes.sf") + sample_name = filepath.stem.removesuffix("_quant.genes") df = pd.read_csv( io.StringIO(filepath.read_text()), sep="\t", diff --git a/tests/unit/test_rnaseq_preprocess.py b/tests/unit/test_rnaseq_preprocess.py index 19ff9cab..7057f3c1 100644 --- a/tests/unit/test_rnaseq_preprocess.py +++ b/tests/unit/test_rnaseq_preprocess.py @@ -7,8 +7,8 @@ from como.rnaseq_preprocess import ( _organize_gene_counts_files, _process_first_multirun_sample, + _QuantInformation, _sample_name_from_filepath, - _STARinformation, _StudyMetrics, ) @@ -22,26 +22,25 @@ ) -class TestSTARInformation: - valid_data = Path("main/data/COMO_input/naiveB/geneCounts/S1/naiveB_S1R1.tab").resolve() - invalid_data = Path("main/data/COMO_input/naiveB/fragmentSizes/S1/naiveB_S1R1_fragment_size.txt").resolve() +class TestQuantInformation: + valid_data = Path("main/data/COMO_input/naiveB/quantification/S1/naiveB_S1R1_quant.genes.sf").resolve() + invalid_data = Path("main/data/COMO_input/naiveB/strandedness/S1/naiveB_S1R1_strandedness.txt").resolve() - @pytest.mark.asyncio - async def test_build_from_tab_valid_file(self) -> None: - """Validate building STAR information object.""" - star: _STARinformation = await _STARinformation.build_from_tab(TestSTARInformation.valid_data) + def test_build_from_sf_valid_file(self) -> None: + quant: _QuantInformation = _QuantInformation.build_from_sf(TestQuantInformation.valid_data) + assert len(quant.gene_names) == len(quant.count_matrix) == 78900 + assert quant.sample_name == "naiveB_S1R1" + assert quant.filepath.as_posix().endswith( + "/COMO/main/data/COMO_input/naiveB/quantification/S1/naiveB_S1R1_quant.genes.sf" + ) - assert len(star.gene_names) == len(star.count_matrix) == 61541 - assert len(star.num_unmapped) == 3 - assert len(star.num_multimapping) == 3 - assert len(star.num_no_feature) == 3 - assert len(star.num_ambiguous) == 3 + def test_build_from_sf_invalid_file(self): + with pytest.raises(ValueError, match=r"Building quantification information requires a '.sf' file; received: "): + _QuantInformation.build_from_sf(TestQuantInformation.invalid_data) - @pytest.mark.asyncio - async def test_build_from_tab_invalid_file(self): - """Validate error on invalid file.""" - with pytest.raises(ValueError, match=r"Building STAR information requires a '\.tab' file"): - await 
_STARinformation.build_from_tab(TestSTARinformation.invalid_data)
+    def test_build_from_missing_file(self):
+        with pytest.raises(FileNotFoundError, match=r"Unable to find the .sf file: "):
+            _QuantInformation.build_from_sf(Path("missing_file.txt"))
 
 
 def test_sample_name_from_filepath(any_como_input_filepath: Path):

From 12b04255e7b86c697ed5f95ba0e0578327d4d672 Mon Sep 17 00:00:00 2001
From: Josh Loecker
Date: Mon, 9 Feb 2026 16:48:39 -0600
Subject: [PATCH 26/26] chore: use quant files instead of strand files

Signed-off-by: Josh Loecker
---
 tests/fixtures/collect_files.py      | 11 ++++++-----
 tests/unit/test_rnaseq_preprocess.py |  5 ++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/fixtures/collect_files.py b/tests/fixtures/collect_files.py
index 93d59e70..71ca1a9d 100644
--- a/tests/fixtures/collect_files.py
+++ b/tests/fixtures/collect_files.py
@@ -5,7 +5,8 @@ from _pytest.fixtures import SubRequest
 
 _fragment_size_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*fragment_size*.txt"))
-_gene_count_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*.tab"))
+_quant_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*.sf"))
+# _gene_count_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*.tab"))
 _insert_size_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*_insert_size.txt"))
 _layout_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*_layout.txt"))
 _preparation_method_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*_prep_method.txt"))
@@ -27,14 +28,14 @@ def fragment_size_filepath(request: SubRequest) -> Path:
     return request.param
 
 
-@pytest.fixture(params=_gene_count_filepaths)
-def gene_count_filepath(request: SubRequest) -> Path:
+@pytest.fixture(params=_quant_filepaths)
+def quant_filepath(request: SubRequest) -> Path:
     return request.param
 
 
 @pytest.fixture
 def all_gene_count_filepaths() -> list[Path]:
-    return _gene_count_filepaths
+    return _quant_filepaths
 
 
 @pytest.fixture(params=_insert_size_filepaths)
@@ -62,7 +63,7 @@ def strand_filepath(request: SubRequest) -> Path:
     file
     for filepaths in [
         _fragment_size_filepaths,
-        _gene_count_filepaths,
+        _quant_filepaths,
         _insert_size_filepaths,
         _layout_filepaths,
         _preparation_method_filepaths,

diff --git a/tests/unit/test_rnaseq_preprocess.py b/tests/unit/test_rnaseq_preprocess.py
index 7057f3c1..20b2fcb3 100644
--- a/tests/unit/test_rnaseq_preprocess.py
+++ b/tests/unit/test_rnaseq_preprocess.py
@@ -64,9 +64,8 @@ def test_organize_gene_counts_files(como_input_data_directory: Path):
         assert file.suffix == ".txt"
 
 
-@pytest.mark.asyncio
-async def test_process_first_multirun_sample(strand_filepath: Path, all_gene_count_filepaths: list[Path]):
-    result: pd.DataFrame = await _process_first_multirun_sample(strand_filepath, all_gene_count_filepaths)
+def test_process_first_multirun_sample(strand_filepath: Path, all_gene_count_filepaths: list[Path]):
+    result: pd.DataFrame = _process_first_multirun_sample(strand_filepath, all_gene_count_filepaths)
     assert result.columns[0] == "ensembl_gene_id"
     assert len(result.columns) == 2
     assert result.columns.tolist()[1] in strand_filepath.as_posix()
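
For reference, a minimal, self-contained sketch of the k-over-A activity test that `umi_filter` applies per gene in PATCH 24. The `k_over_a` helper below is a stand-in with the genefilter-style semantics the patch assumes (true when at least `k` values exceed the cutoff `a`); the real helper in `main/como/rnaseq_gen.py` may differ in detail.

import numpy as np


def k_over_a(k: int, a: float):
    """Return a filter that is True when at least `k` values exceed `a` (assumed semantics)."""

    def _filter(values: np.ndarray) -> bool:
        return int(np.sum(values > a)) >= k

    return _filter


# Toy cells-x-genes matrix (5 cells, 3 genes) of log1p-normalized values.
matrix = np.array(
    [
        [0.0, 1.2, 0.0],
        [0.0, 0.9, 0.1],
        [0.3, 1.1, 0.0],
        [0.0, 0.8, 0.0],
        [0.0, 1.5, 0.2],
    ]
)
n_cells, n_genes = matrix.shape
min_func = k_over_a(round(0.15 * n_cells), 0.5)  # "active" in at least ~15% of cells
active = [min_func(matrix[:, j]) for j in range(n_genes)]
print(active)  # [False, True, False] -- only the middle gene passes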
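
The "scanpy defaults" named in PATCH 24's subject amount to library-size normalizing each cell to `target_sum` and then applying log1p. A runnable sketch on a toy matrix, assuming `scanpy` and `anndata` are installed; the simulated counts, seed, and thresholds are illustrative only and mirror the filters used in the patch.

import anndata as ad
import numpy as np
import scanpy as sc

rng = np.random.default_rng(0)
# 50 cells x 30 genes of simulated UMI counts (rows are cells, matching AnnData's convention)
adata = ad.AnnData(rng.poisson(2.0, size=(50, 30)).astype(np.float32))
sc.pp.filter_cells(adata, min_genes=20)  # drop cells expressing fewer than 20 genes
sc.pp.filter_genes(adata, min_cells=1)   # drop genes detected in no cell
sc.pp.normalize_total(adata, target_sum=10_000)
print(np.asarray(adata.X.sum(axis=1))[:3])  # each cell now sums to ~10,000
sc.pp.log1p(adata)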
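
A standalone check of the sample-name fix in PATCH 25: `Path.stem` already strips the final `.sf` suffix, so `removesuffix` must trim `_quant.genes` rather than `_quant.genes.sf`. This is pure stdlib behavior; the filename follows the test data used above.

from pathlib import Path

p = Path("naiveB_S1R1_quant.genes.sf")
assert p.suffix == ".sf"                    # .stem drops only this last suffix
assert p.stem == "naiveB_S1R1_quant.genes"  # ".genes" survives .stem
assert p.stem.removesuffix("_quant.genes") == "naiveB_S1R1"
print(p.stem.removesuffix("_quant.genes"))  # -> naiveB_S1R1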