Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
346f794
fix(fpkm): update imports for zFPKM calculation improvements
JoshLoecker Feb 9, 2026
985c6f2
fix(fpkm): use Salmon quantification instead of STAR quantification
JoshLoecker Feb 9, 2026
d350063
chore: ruff formatting
JoshLoecker Feb 9, 2026
7482250
chore: fill with integers for faster processing
JoshLoecker Feb 9, 2026
155c822
chore: remove unnecessary async function usage
JoshLoecker Feb 9, 2026
f7b3a06
fix: remove non existant genes from conversion
JoshLoecker Feb 9, 2026
0e4a2c3
refactor: use more explicit (albeit longer) code to create gene_info …
JoshLoecker Feb 9, 2026
ab66599
chore: import required modules
JoshLoecker Feb 9, 2026
95654b3
refactor: optional argument for fragment data
JoshLoecker Feb 9, 2026
dec37b0
refactor: improve handling for single cell data
JoshLoecker Feb 9, 2026
fc1d45f
chore: generalize data type input
JoshLoecker Feb 9, 2026
e1505d1
chore: ruff formatting
JoshLoecker Feb 9, 2026
849ba2e
chore: simplify FPKM/RPKM calculations; properly compute per-gene FPK…
JoshLoecker Feb 9, 2026
3234413
refactor: move zfpkm calculation to external package
JoshLoecker Feb 9, 2026
f90c388
chore: use np.bool for boolean array
JoshLoecker Feb 9, 2026
8253a7d
chore: ruff formatting
JoshLoecker Feb 9, 2026
c52d2e8
feat: allow setting negative zFPKM results to 0
JoshLoecker Feb 9, 2026
e2e6350
feat: simplification to use external zfpkm package
JoshLoecker Feb 9, 2026
2ad9887
feat: allow providing the fragment size filepath (from rnaseq preproc…
JoshLoecker Feb 9, 2026
6af3990
chore(ruff): reduce max line length
JoshLoecker Feb 9, 2026
479fce2
chore(ruff): mark unsorted imports as fixable
JoshLoecker Feb 9, 2026
d83e974
chore(uv): lock pyproject file
JoshLoecker Feb 9, 2026
5afa6f3
fix: rename count to quant in testing files
JoshLoecker Feb 9, 2026
351e93c
feat: add single cell normalization using scanpy defaults
JoshLoecker Feb 9, 2026
2fd9249
fix: test new quant information
JoshLoecker Feb 9, 2026
12b0425
chore: use quant files instead of strand files
JoshLoecker Feb 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
735 changes: 374 additions & 361 deletions main/como/rnaseq_gen.py

Large diffs are not rendered by default.

559 changes: 310 additions & 249 deletions main/como/rnaseq_preprocess.py

Large diffs are not rendered by default.

17 changes: 8 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@ dependencies = [
"cobamp@git+https://github.com/JoshLoecker/cobamp@master",
"cobra>=0.28.0",
"fast-bioservices>=0.3.9",
"joypy>=0.2.6",
"kaleido>=1.0.0",
"loguru>=0.7.2",
"notebook>=7.4.7",
"numpy>=2",
"openpyxl>=3.1.5",
"pandas>=1.3.5",
"python-louvain",
"scanpy>=1.10.4",
"scikit-learn>=1.5.2",
"scipy>=1.13.0",
Expand All @@ -25,6 +28,7 @@ dependencies = [
"statsmodels>=0.13.0; python_version < '3.12'",
"statsmodels>=0.14.0; python_version >= '3.12'",
"troppo@git+https://github.com/JoshLoecker/troppo@master",
"zfpkm>=1.0.3",
]

[project.optional-dependencies]
Expand All @@ -36,19 +40,11 @@ interactive = [
"jupyterlab>=4.3.2"
]
dev = [
"commitizen>=4.8.3",
"commitlint>=1.3.1",
"como",
"hatchling>=1.27.0",
"pandas-stubs>=2.3.2.250827",
"pre-commit>=4.2.0",
"pyright>=1.1.405",
"pytest>=8.4.1",
"pytest-asyncio>=1.1.0",
"pytest-cov>=6.2.1",
"ruff>=0.12.11",
"scipy-stubs>=1.16.1.1",
"types-aiofiles>=24.1.0.20250822",
"pytest>=8.4.1",
]

[tool.hatch.version]
Expand All @@ -62,3 +58,6 @@ allow-direct-references = true

[tool.pytest.ini_options]
pythonpath = ["main/src"]

[tool.uv.sources]
python-louvain = { git = "https://github.com/taynaud/python-louvain" }
3 changes: 2 additions & 1 deletion ruff.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
line-length = 150
line-length = 120
extend-include = ["docs/**/*.py", "tests/**/*.py", "**/*.ipynb"]

[format]
quote-style = "double"
docstring-code-format = true

[lint]
extend-fixable = ["I001"]
# Linting rules: https://docs.astral.sh/ruff/rules/
unfixable = [
"F401", # warn about, but do not remove, unused imports
Expand Down
11 changes: 6 additions & 5 deletions tests/fixtures/collect_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from _pytest.fixtures import SubRequest

_fragment_size_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*fragment_size*.txt"))
_gene_count_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*.tab"))
_quant_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*.sf"))
# _gene_count_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*.tab"))
_insert_size_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*_insert_size.txt"))
_layout_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*_layout.txt"))
_preparation_method_filepaths = list(Path("main/data/COMO_input").absolute().rglob("*_prep_method.txt"))
Expand All @@ -27,14 +28,14 @@ def fragment_size_filepath(request: SubRequest) -> Path:
return request.param


@pytest.fixture(params=_gene_count_filepaths)
def gene_count_filepath(request: SubRequest) -> Path:
@pytest.fixture(params=_quant_filepaths)
def quant_filepaths(request: SubRequest) -> Path:
return request.param


@pytest.fixture
def all_gene_count_filepaths() -> list[Path]:
return _gene_count_filepaths
return _quant_filepaths


@pytest.fixture(params=_insert_size_filepaths)
Expand Down Expand Up @@ -62,7 +63,7 @@ def strand_filepath(request: SubRequest) -> Path:
file
for filepaths in [
_fragment_size_filepaths,
_gene_count_filepaths,
_quant_filepaths,
_insert_size_filepaths,
_layout_filepaths,
_preparation_method_filepaths,
Expand Down
44 changes: 21 additions & 23 deletions tests/unit/test_rnaseq_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from como.rnaseq_preprocess import (
_organize_gene_counts_files,
_process_first_multirun_sample,
_QuantInformation,
_sample_name_from_filepath,
_STARinformation,
_StudyMetrics,
)

Expand All @@ -22,26 +22,25 @@
)


class TestSTARInformation:
valid_data = Path("main/data/COMO_input/naiveB/geneCounts/S1/naiveB_S1R1.tab").resolve()
invalid_data = Path("main/data/COMO_input/naiveB/fragmentSizes/S1/naiveB_S1R1_fragment_size.txt").resolve()
class TestQuantInformation:
valid_data = Path("main/data/COMO_input/naiveB/quantification/S1/naiveB_S1R1_quant.genes.sf").resolve()
invalid_data = Path("main/data/COMO_input/naiveB/strandedness/S1/naiveB_S1R1_strandedness.txt").resolve()

@pytest.mark.asyncio
async def test_build_from_tab_valid_file(self) -> None:
"""Validate building STAR information object."""
star: _STARinformation = await _STARinformation.build_from_tab(TestSTARInformation.valid_data)
def test_build_from_sf_valid_file(self) -> None:
quant: _QuantInformation = _QuantInformation.build_from_sf(TestQuantInformation.valid_data)
assert len(quant.gene_names) == len(quant.count_matrix) == 78900
assert quant.sample_name == "naiveB_S1R1"
assert quant.filepath.as_posix().endswith(
"/COMO/main/data/COMO_input/naiveB/quantification/S1/naiveB_S1R1_quant.genes.sf"
)

assert len(star.gene_names) == len(star.count_matrix) == 61541
assert len(star.num_unmapped) == 3
assert len(star.num_multimapping) == 3
assert len(star.num_no_feature) == 3
assert len(star.num_ambiguous) == 3
def test_build_from_sf_invalid_file(self):
with pytest.raises(ValueError, match=r"Building quantification information requires a '.sf' file; received: "):
_QuantInformation.build_from_sf(TestQuantInformation.invalid_data)

@pytest.mark.asyncio
async def test_build_from_tab_invalid_file(self):
"""Validate error on invalid file."""
with pytest.raises(ValueError, match=r"Building STAR information requires a '\.tab' file"):
await _STARinformation.build_from_tab(TestSTARInformation.invalid_data)
def test_build_from_missing_file(self):
with pytest.raises(FileNotFoundError, match=r"Unable to find the .sf file: "):
_QuantInformation.build_from_sf(Path("missing_file.txt"))


def test_sample_name_from_filepath(any_como_input_filepath: Path):
Expand All @@ -52,9 +51,9 @@ def test_sample_name_from_filepath(any_como_input_filepath: Path):
def test_organize_gene_counts_files(como_input_data_directory: Path):
metric: _StudyMetrics
for metric in _organize_gene_counts_files(como_input_data_directory):
assert len(metric.sample_names) == metric.num_samples == len(metric.count_files) == len(metric.strand_files)
assert len(metric.sample_names) == metric.num_samples == len(metric.quant_files) == len(metric.strand_files)

for file in metric.count_files:
for file in metric.quant_files:
assert f"/{metric.study_name}/" in file.as_posix()
assert "geneCounts" in file.as_posix()
assert file.suffix == ".tab"
Expand All @@ -65,9 +64,8 @@ def test_organize_gene_counts_files(como_input_data_directory: Path):
assert file.suffix == ".txt"


@pytest.mark.asyncio
async def test_process_first_multirun_sample(strand_filepath: Path, all_gene_count_filepaths: list[Path]):
result: pd.DataFrame = await _process_first_multirun_sample(strand_filepath, all_gene_count_filepaths)
def test_process_first_multirun_sample(strand_filepath: Path, all_gene_count_filepaths: list[Path]):
result: pd.DataFrame = _process_first_multirun_sample(strand_filepath, all_gene_count_filepaths)
assert result.columns[0] == "ensembl_gene_id"
assert len(result.columns) == 2
assert result.columns.tolist()[1] in strand_filepath.as_posix()
Expand Down
Loading
Loading