Skip to content

Commit 71c0ddd

Browse files
authored
Merge pull request #2 from SingleRust/mwu-testing
Merge pull request #1 from SingleRust/main
2 parents a43185c + d41dad2 commit 71c0ddd

File tree

11 files changed

+348
-115
lines changed

11 files changed

+348
-115
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "single-statistics"
3-
version = "0.7.0"
3+
version = "0.8.1"
44
edition = "2024"
55
license-file = "LICENSE.md"
66
readme = "README.md"

src/enrichment/aucell.rs

Lines changed: 0 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -1,107 +1,2 @@
11
// Following the general implementation presented here, But adapted to nalgebra_sparse and multithreading: https://github.com/scverse/decoupler/blob/main/src/decoupler/mt/_aucell.py
22

3-
use std::mem::offset_of;
4-
5-
use nalgebra_sparse::CsrMatrix;
6-
use ndarray::Array2;
7-
use single_utilities::traits::{FloatOps, FloatOpsTS};
8-
9-
fn validate_n_up<T>(n_genes: usize, n_up: Option<T>) -> anyhow::Result<usize>
10-
where
11-
T: FloatOps {
12-
let n_up_value = match n_up {
13-
Some(val) => {
14-
let n_up_int = num_traits::Float::ceil(val).to_usize().unwrap();
15-
n_up_int
16-
},
17-
None => {
18-
let n_up_float = T::from(0.05).unwrap() * T::from(n_genes).unwrap();
19-
let n_up_ceil = num_traits::Float::ceil(n_up_float);
20-
let n_up_int = n_up_ceil.to_usize().unwrap();
21-
n_up_int.clamp(2, n_genes)
22-
},
23-
};
24-
25-
if n_up_value <= 1 || n_up_value > n_genes {
26-
return Err(anyhow::anyhow!(
27-
"For n_genes={}, n_up={} must be between 1 and {}",
28-
n_genes, n_up_value, n_genes
29-
));
30-
}
31-
Ok(n_up_value)
32-
}
33-
34-
fn get_gene_set(
35-
connectivity: &[usize],
36-
starts: &[usize],
37-
offsets: &[usize],
38-
source_idx: usize
39-
) -> Vec<usize> {
40-
let start = starts[source_idx];
41-
let offset = offsets[source_idx];
42-
connectivity[start..start + offset].to_vec()
43-
}
44-
45-
fn rank_data<T>(values: &[T]) -> Vec<usize>
46-
where
47-
T: FloatOps
48-
{
49-
let mut indexed_values: Vec<(usize, T)> = values
50-
.iter()
51-
.enumerate()
52-
.map(|(i, &v)| (i,v))
53-
.collect();
54-
55-
indexed_values.sort_by(|a, b| {
56-
b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
57-
});
58-
59-
let mut ranks = vec![0; values.len()];
60-
for (rank, (original_idx, _)) in indexed_values.iter().enumerate() {
61-
ranks[*original_idx] = rank + 1;
62-
}
63-
64-
ranks
65-
}
66-
67-
68-
69-
fn compute_chunk_size(n_cells: usize, n_genes: usize) -> usize {
70-
let n_cores = rayon::current_num_threads();
71-
72-
let base_chunk_size = if n_genes > 20000 {
73-
200
74-
} else if n_genes > 10000 {
75-
500
76-
} else {
77-
1000
78-
};
79-
80-
let min_chunks = n_cores;
81-
let max_chunk_size = (n_cells + min_chunks - 1) / min_chunks;
82-
base_chunk_size.min(max_chunk_size).max(1)
83-
}
84-
85-
pub(crate) fn aucell_compute<T>(
86-
matrix: &CsrMatrix<T>,
87-
connectivity: &[usize],
88-
starts: &[usize],
89-
offsets: &[usize],
90-
n_up: Option<T>
91-
) -> anyhow::Result<Array2<T>>
92-
where
93-
T: FloatOpsTS {
94-
let n_cells = matrix.nrows();
95-
let n_genes = matrix.ncols();
96-
let n_sources = starts.len();
97-
98-
if connectivity.is_empty() || starts.is_empty() || offsets.is_empty() {
99-
return Err(anyhow::anyhow!("Connectivity arrays cannot be empty!!"))
100-
}
101-
102-
if starts.len() != offsets.len() {
103-
return Err(anyhow::anyhow!("Starts and offsets must have the same length!"))
104-
}
105-
106-
todo!()
107-
}

src/enrichment/mod.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,22 @@
1+
//! Gene set enrichment analysis methods for single-cell data.
2+
//!
3+
//! This module provides various approaches to gene set enrichment analysis, allowing you to
4+
//! determine whether predefined sets of genes show statistically significant enrichment in
5+
//! your single-cell data.
6+
//!
7+
//! ## Available Methods
8+
//!
9+
//! - **GSEA** (`gsea`): Gene Set Enrichment Analysis using ranking-based approaches
10+
//! - **AUCell** (`aucell`): Area Under the Curve method for gene set activity scoring
11+
//! - **ORA** (`ora`): Over-Representation Analysis using hypergeometric testing
12+
//!
13+
//! ## Quick Example
14+
//!
15+
//! ```rust,no_run
16+
//! // Gene set enrichment analysis workflow would go here
17+
//! // (Implementation depends on the specific modules)
18+
//! ```
19+
120
mod gsea;
221
mod aucell;
322
mod ora;

src/lib.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,28 @@
1+
//! # single-statistics
2+
//!
3+
//! A specialized Rust library for statistical analysis of single-cell data, part of the single-rust ecosystem.
4+
//!
5+
//! This crate provides robust statistical methods for biological analysis of single-cell data, focusing on
6+
//! differential expression analysis, marker gene identification, and related statistical tests. It is optimized
7+
//! for sparse single-cell matrices and provides both parametric and non-parametric statistical tests.
8+
//!
9+
//! ## Core Features
10+
//!
11+
//! - **Differential Expression Analysis**: T-tests, Mann-Whitney U tests, and other statistical methods
12+
//! - **Multiple Testing Correction**: FDR, Bonferroni, and other correction methods
13+
//! - **Effect Size Calculations**: Cohen's d and other effect size measures
14+
//! - **Sparse Matrix Support**: Optimized for `CsrMatrix` from nalgebra-sparse
15+
//!
16+
//! ## Quick Start
17+
//!
18+
//! Use the `MatrixStatTests` trait to perform differential expression analysis on sparse matrices.
19+
//! The library supports various statistical tests including t-tests and Mann-Whitney U tests,
20+
//! with automatic multiple testing correction.
21+
//!
22+
//! ## Module Organization
23+
//!
24+
//! - **[`testing`]**: Statistical tests, hypothesis testing, and multiple testing correction
25+
//! - **[`enrichment`]**: Gene set enrichment analysis methods (GSEA, ORA, AUCell)
26+
127
pub mod testing;
228
pub mod enrichment;

src/testing/correction/mod.rs

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,28 @@
1+
//! Multiple testing correction methods for controlling false positives in differential expression analysis.
2+
//!
3+
//! When testing thousands of genes simultaneously (as is common in single-cell RNA-seq analysis),
4+
//! the probability of false positives increases dramatically. These correction methods help control
5+
//! either the Family-Wise Error Rate (FWER) or False Discovery Rate (FDR).
6+
//!
7+
//! ## Available Methods
8+
//!
9+
//! - **Bonferroni**: Conservative FWER control, multiplies p-values by number of tests
10+
//! - **Benjamini-Hochberg**: FDR control, less conservative than Bonferroni
11+
//! - **Benjamini-Yekutieli**: FDR control for dependent tests
12+
//! - **Holm-Bonferroni**: Step-down FWER control, less conservative than Bonferroni
13+
//! - **Storey's q-value**: Improved FDR estimation
14+
//!
15+
//! ## Choosing a Method
16+
//!
17+
//! - **For single-cell DE analysis**: Use Benjamini-Hochberg (most common)
18+
//! - **For very strict control**: Use Bonferroni or Holm-Bonferroni
19+
//! - **For dependent tests**: Use Benjamini-Yekutieli
20+
//! - **For large datasets**: Consider Storey's q-value
21+
122
use anyhow::{Result, anyhow};
223
use single_utilities::traits::FloatOps;
324
use std::cmp::Ordering;
425

5-
/// Multiple testing correction methods to control for false positives
6-
/// when performing many statistical tests simultaneously.
7-
826
/// Apply Bonferroni correction to p-values
927
///
1028
/// Bonferroni correction is a simple but conservative method that multiplies

src/testing/inference/mod.rs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,24 +11,85 @@ pub mod parametric;
1111

1212
pub mod nonparametric;
1313

14+
/// Statistical testing methods for sparse matrices, particularly suited for single-cell data.
15+
///
16+
/// This trait extends sparse matrix types (like `CsrMatrix`) with statistical testing capabilities.
17+
/// It provides methods for differential expression analysis and other statistical comparisons
18+
/// optimized for single-cell RNA-seq data.
19+
///
20+
/// # Matrix Format
21+
///
22+
/// The expected matrix format is **genes × cells** (features × observations), where:
23+
/// - Rows represent genes/features
24+
/// - Columns represent cells/observations
25+
/// - Values represent expression levels (normalized counts, log-transformed, etc.)
26+
///
27+
1428
pub trait MatrixStatTests<T>
1529
where
1630
T: FloatOpsTS,
1731
{
32+
/// Perform t-tests comparing two groups of cells for all genes.
33+
///
34+
/// Runs Student's or Welch's t-test for each gene (row) in the matrix, comparing
35+
/// expression levels between the specified groups of cells (columns).
36+
///
37+
/// # Arguments
38+
///
39+
/// * `group1_indices` - Column indices for the first group of cells
40+
/// * `group2_indices` - Column indices for the second group of cells
41+
/// * `test_type` - Type of t-test (`Student` or `Welch`)
42+
///
43+
/// # Returns
44+
///
45+
/// Vector of `TestResult` objects, one per gene, containing test statistics and p-values.
46+
///
47+
1848
fn t_test(
1949
&self,
2050
group1_indices: &[usize],
2151
group2_indices: &[usize],
2252
test_type: TTestType,
2353
) -> anyhow::Result<Vec<TestResult<f64>>>;
2454

55+
/// Perform Mann-Whitney U tests comparing two groups of cells for all genes.
56+
///
57+
/// Runs non-parametric Mann-Whitney U (Wilcoxon rank-sum) tests for each gene,
58+
/// comparing the distributions between the specified groups.
59+
///
60+
/// # Arguments
61+
///
62+
/// * `group1_indices` - Column indices for the first group of cells
63+
/// * `group2_indices` - Column indices for the second group of cells
64+
/// * `alternative` - Type of alternative hypothesis (two-sided, less, greater)
65+
///
66+
/// # Returns
67+
///
68+
/// Vector of `TestResult` objects containing U statistics and p-values.
2569
fn mann_whitney_test(
2670
&self,
2771
group1_indices: &[usize],
2872
group2_indices: &[usize],
2973
alternative: Alternative,
3074
) -> anyhow::Result<Vec<TestResult<f64>>>;
3175

76+
/// Comprehensive differential expression analysis with multiple testing correction.
77+
///
78+
/// This is the main method for differential expression analysis. It performs the
79+
/// specified statistical test on all genes and applies multiple testing correction
80+
/// to control the false discovery rate.
81+
///
82+
/// # Arguments
83+
///
84+
/// * `group_ids` - Vector assigning each cell to a group (currently supports 2 groups)
85+
/// * `test_method` - Statistical test to perform
86+
///
87+
/// # Returns
88+
///
89+
/// `MultipleTestResults` containing statistics, p-values, adjusted p-values, and
90+
/// metadata for all genes.
91+
///
92+
3293
fn differential_expression(
3394
&self,
3495
group_ids: &[usize],

src/testing/inference/nonparametric.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
//! Non-parametric statistical tests for single-cell data analysis.
2+
//!
3+
//! This module implements non-parametric statistical tests that make fewer assumptions about
4+
//! data distribution. These tests are particularly useful for single-cell data which often
5+
//! exhibits non-normal distributions, high sparsity, and outliers.
6+
//!
7+
//! The primary test implemented is the Mann-Whitney U test (also known as the Wilcoxon
8+
//! rank-sum test), which compares the distributions of two groups without assuming normality.
9+
110
use std::{cmp::Ordering, f64};
211

312
use nalgebra_sparse::CsrMatrix;
@@ -13,6 +22,34 @@ struct TieInfo {
1322
tie_correction: f64,
1423
}
1524

25+
/// Perform Mann-Whitney U tests on all genes comparing two groups of cells.
26+
///
27+
/// This function efficiently computes Mann-Whitney U statistics for all genes in a sparse
28+
/// matrix, comparing expression distributions between two groups of cells. The implementation
29+
/// uses parallel processing for improved performance on large datasets.
30+
///
31+
/// # Arguments
32+
///
33+
/// * `matrix` - Sparse expression matrix (genes × cells)
34+
/// * `group1_indices` - Column indices for the first group of cells
35+
/// * `group2_indices` - Column indices for the second group of cells
36+
/// * `alternative` - Type of alternative hypothesis (two-sided, less, greater)
37+
///
38+
/// # Returns
39+
///
40+
/// Vector of `TestResult` objects containing U statistics and p-values for each gene.
41+
/// let group1 = vec![0, 1, 2]; // First group of cells
42+
/// let group2 = vec![3, 4, 5]; // Second group of cells
43+
///
44+
/// let results = mann_whitney_matrix_groups(
45+
/// &matrix,
46+
/// &group1,
47+
/// &group2,
48+
/// Alternative::TwoSided
49+
/// )?;
50+
/// # Ok(())
51+
/// # }
52+
/// ```
1653
pub fn mann_whitney_matrix_groups<T>(
1754
matrix: &CsrMatrix<T>,
1855
group1_indices: &[usize],
@@ -48,6 +85,34 @@ where
4885
Ok(results)
4986
}
5087

88+
/// Perform an optimized Mann-Whitney U test on two samples.
89+
///
90+
/// This function computes the Mann-Whitney U statistic and p-value for comparing two
91+
/// independent samples. It handles ties correctly and supports different alternative
92+
/// hypotheses.
93+
///
94+
/// # Arguments
95+
///
96+
/// * `x` - First sample
97+
/// * `y` - Second sample
98+
/// * `alternative` - Type of alternative hypothesis
99+
///
100+
/// # Returns
101+
///
102+
/// `TestResult` containing the U statistic and p-value.
103+
///
104+
/// # Example
105+
///
106+
/// ```rust
107+
/// use single_statistics::testing::inference::nonparametric::mann_whitney_optimized;
108+
/// use single_statistics::testing::Alternative;
109+
///
110+
/// let group1 = vec![1.0, 2.0, 3.0];
111+
/// let group2 = vec![4.0, 5.0, 6.0];
112+
/// let result = mann_whitney_optimized(&group1, &group2, Alternative::TwoSided);
113+
///
114+
/// println!("U statistic: {}, p-value: {}", result.statistic, result.p_value);
115+
/// ```
51116
pub fn mann_whitney_optimized(x: &[f64], y: &[f64], alternative: Alternative) -> TestResult<f64> {
52117
let nx = x.len();
53118
let ny = y.len();

0 commit comments

Comments
 (0)