From a43185c98cfcf523af3b092c87c253fab5897a6b Mon Sep 17 00:00:00 2001
From: Ian
Date: Thu, 4 Sep 2025 12:52:20 +0000
Subject: [PATCH] small intermediate push, not production ready!

---
 src/testing/correction/mod.rs          |   2 +-
 src/testing/inference/mod.rs           |  31 +--
 src/testing/inference/nonparametric.rs | 355 ++++++++++++++++++++-----
 3 files changed, 294 insertions(+), 94 deletions(-)

diff --git a/src/testing/correction/mod.rs b/src/testing/correction/mod.rs
index 509494f..18d71fd 100644
--- a/src/testing/correction/mod.rs
+++ b/src/testing/correction/mod.rs
@@ -79,7 +79,7 @@ where
             return Err(anyhow::anyhow!("Invalid p-value at index {:?}: {:?}", i, p));
         }
     }
-    
+
     let n_f = T::from(n).unwrap();
 
     let mut indices: Vec<usize> = (0..n).collect();
diff --git a/src/testing/inference/mod.rs b/src/testing/inference/mod.rs
index 7086033..3b7ea8d 100644
--- a/src/testing/inference/mod.rs
+++ b/src/testing/inference/mod.rs
@@ -1,7 +1,9 @@
+use crate::testing::utils::{extract_unique_groups, get_group_indices};
+use crate::testing::{
+    Alternative, MultipleTestResults, TTestType, TestMethod, TestResult, correction,
+};
 use nalgebra_sparse::CsrMatrix;
 use single_utilities::traits::FloatOpsTS;
-use crate::testing::{correction, Alternative, MultipleTestResults, TTestType, TestMethod, TestResult};
-use crate::testing::utils::{extract_unique_groups, get_group_indices};
 
 pub mod discrete;
@@ -20,7 +22,6 @@ where
         test_type: TTestType,
     ) -> anyhow::Result<Vec<TestResult<T>>>;
 
-
     fn mann_whitney_test(
         &self,
         group1_indices: &[usize],
@@ -37,7 +38,8 @@ impl<T> MatrixStatTests<T> for CsrMatrix<T>
 where
-    T: FloatOpsTS
+    T: FloatOpsTS,
+    f64: std::convert::From<T>,
 {
     fn t_test(
         &self,
         group1_indices: &[usize],
         group2_indices: &[usize],
@@ -45,12 +47,7 @@ where
     ) -> anyhow::Result<Vec<TestResult<T>>> {
-        parametric::t_test_matrix_groups(
-            self,
-            group1_indices,
-            group2_indices,
-            test_type
-        )
+        parametric::t_test_matrix_groups(self, group1_indices, group2_indices, test_type)
     }
 
     fn mann_whitney_test(
         &self,
         group1_indices: &[usize],
@@ -79,19 +76,14 @@ where
         match test_method {
             TestMethod::TTest(test_type) => {
                 // Run t-tests
-                let results = self.t_test(
-                    &group1_indices,
-                    &group2_indices,
-                    test_type,
-                )?;
+                let results = self.t_test(&group1_indices, &group2_indices, test_type)?;
 
                 // Extract statistics and p-values
                 let statistics: Vec<_> = results.iter().map(|r| r.statistic).collect();
                 let p_values: Vec<_> = results.iter().map(|r| r.p_value).collect();
 
                 // Apply multiple testing correction
-                let adjusted_p_values =
-                    correction::benjamini_hochberg_correction(&p_values)?;
+                let adjusted_p_values = correction::benjamini_hochberg_correction(&p_values)?;
 
                 // Extract effect sizes if available
                 let effect_sizes = results
@@ -127,8 +119,7 @@ where
                 let p_values: Vec<_> = results.iter().map(|r| r.p_value).collect();
 
                 // Apply multiple testing correction
-                let adjusted_p_values =
-                    correction::benjamini_hochberg_correction(&p_values)?;
+                let adjusted_p_values = correction::benjamini_hochberg_correction(&p_values)?;
 
                 Ok(MultipleTestResults::new(statistics, p_values)
                     .with_adjusted_p_values(adjusted_p_values)
@@ -139,4 +130,4 @@ where
             _ => Err(anyhow::anyhow!("Test method not implemented yet")),
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/testing/inference/nonparametric.rs b/src/testing/inference/nonparametric.rs
index fcbf09e..19dd3ef 100644
--- a/src/testing/inference/nonparametric.rs
+++ b/src/testing/inference/nonparametric.rs
@@ -1,10 +1,17 @@
-use crate::testing::{Alternative, TestResult};
+use std::{cmp::Ordering, f64};
+
 use nalgebra_sparse::CsrMatrix;
-use rayon::iter::IntoParallelIterator;
-use rayon::iter::ParallelIterator;
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use single_utilities::traits::FloatOpsTS;
 use statrs::distribution::{ContinuousCDF, Normal};
-use std::cmp::Ordering;
+
+use crate::testing::{Alternative, TestResult};
+
+#[derive(Debug, Clone)]
+struct TieInfo {
+    tie_counts: Vec<usize>,
+    tie_correction: f64,
+}
 
 pub fn mann_whitney_matrix_groups<T>(
     matrix: &CsrMatrix<T>,
@@ -14,11 +21,15 @@
 ) -> anyhow::Result<Vec<TestResult<f64>>>
 where
     T: FloatOpsTS,
+    f64: std::convert::From<T>,
 {
     if group1_indices.is_empty() || group2_indices.is_empty() {
-        return Err(anyhow::anyhow!("Group indices cannot be empty"));
+        return Err(anyhow::anyhow!(
+            "Single-Statistics | Group indices cannot be empty. Error code: SS-NP-001"
+        ));
     }
 
+    // Simplified: use the same implementation regardless of size
    let nrows = matrix.nrows();
 
     let results: Vec<_> = (0..nrows)
@@ -27,111 +38,309 @@
         .into_par_iter()
         .map(|row| {
             let mut group1_values: Vec<f64> = Vec::with_capacity(group1_indices.len());
             let mut group2_values: Vec<f64> = Vec::with_capacity(group2_indices.len());
 
-            for &col in group1_indices {
-                if let Some(entry) = matrix.get_entry(row, col) {
-                    let value = entry.into_value();
-                    group1_values.push(value.to_f64().unwrap());
-                }
-            }
+            extract_row_values(matrix, row, group1_indices, &mut group1_values);
+            extract_row_values(matrix, row, group2_indices, &mut group2_values);
 
-            for &col in group2_indices {
-                if let Some(entry) = matrix.get_entry(row, col) {
-                    let value = entry.into_value();
-                    group2_values.push(value.to_f64().unwrap());
-                }
-            }
-            mann_whitney(&group1_values, &group2_values, alternative)
+            mann_whitney_optimized(&group1_values, &group2_values, alternative)
         })
         .collect();
 
     Ok(results)
 }
 
-pub fn mann_whitney(x: &[f64], y: &[f64], alternative: Alternative) -> TestResult {
+pub fn mann_whitney_optimized(x: &[f64], y: &[f64], alternative: Alternative) -> TestResult {
     let nx = x.len();
     let ny = y.len();
 
     if nx == 0 || ny == 0 {
-        return TestResult::new(f64::NAN, 1.0); // Insufficient data
+        return TestResult::new(f64::NAN, 1.0);
     }
 
-    // Combine samples and assign group labels (0 for x, 1 for y)
-    let mut combined: Vec<(f64, usize)> = Vec::with_capacity(nx + ny);
-    combined.extend(x.iter().map(|&v| (v, 0)));
-    combined.extend(y.iter().map(|&v| (v, 1)));
+    if nx == 1 && ny == 1 {
+        let (u, p_value) = if x[0] > y[0] {
+            (
+                1.0,
+                match alternative {
+                    Alternative::Greater => 0.5,
+                    Alternative::Less => 1.0,
+                    Alternative::TwoSided => 1.0,
+                },
+            )
+        } else if x[0] < y[0] {
+            (
+                0.0,
+                match alternative {
+                    Alternative::Greater => 1.0,
+                    Alternative::Less => 0.5,
+                    Alternative::TwoSided => 1.0,
+                },
+            )
+        } else {
+            (0.5, 1.0)
+        };
+        return TestResult::new(u, p_value);
+    }
 
-    // Sort by value
-    combined.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
+    let total_size = nx + ny;
+    let mut combined: Vec<(f64, u8)> = Vec::with_capacity(total_size);
 
-    // Assign ranks (with ties averaged)
-    let mut ranks = vec![0.0; nx + ny];
-    let mut i = 0;
-    while i < combined.len() {
-        let val = combined[i].0;
-        let mut j = i + 1;
+    let mut valid_nx = 0;
+    let mut valid_ny = 0;
 
-        // Find tied values
-        while j < combined.len() && combined[j].0 == val {
-            j += 1;
+    for &v in x {
+        if v.is_finite() {
+            combined.push((v, 0));
+            valid_nx += 1;
         }
+    }
 
-        // Assign average rank to ties
-        let rank = (i + j - 1) as f64 / 2.0 + 1.0;
-        for k in i..j {
-            ranks[k] = rank;
+    for &v in y {
+        if v.is_finite() {
+            combined.push((v, 1));
+            valid_ny += 1;
         }
-
-        i = j;
     }
 
-    // Calculate rank sum for group X
-    let mut rank_sum_x = 0.0;
-    for i in 0..combined.len() {
-        if combined[i].1 == 0 {
-            rank_sum_x += ranks[i];
-        }
+    if valid_nx == 0 || valid_ny == 0 {
+        return TestResult::new(f64::NAN, 1.0);
     }
 
-    let u_x = rank_sum_x - (nx * (nx + 1)) as f64 / 2.0;
-    let u_y = (nx * ny) as f64 - u_x;
+    combined.sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
 
-    let u = match alternative {
-        Alternative::TwoSided => f64::min(u_x, u_y),
-        Alternative::Less => u_x,
-        Alternative::Greater => u_y,
-    };
+    let (rank_sum_x, tie_info) = calculate_rank_sum_with_ties(&combined, valid_nx, valid_ny);
 
-    let mean_u = (nx * ny) as f64 / 2.0;
-    let var_u = (nx * ny * (ny + nx + 1)) as f64 / 12.0;
+    let nx_f64 = valid_nx as f64;
+    let ny_f64 = valid_ny as f64;
 
-    let correction = 0.5;
+    let u_x = rank_sum_x - (nx_f64 * (nx_f64 + 1.0)) / 2.0;
+    let u_y = (nx_f64 * ny_f64) - u_x;
 
-    let z = match alternative {
+    let mean_u = nx_f64 * ny_f64 / 2.0;
+
+    let n_total = nx_f64 + ny_f64;
+    let var_u = if tie_info.tie_correction > 0.0 {
+        (nx_f64 * ny_f64 / 12.0)
+            * (n_total + 1.0 - tie_info.tie_correction / (n_total * (n_total - 1.0)))
+    } else {
+        nx_f64 * ny_f64 * (n_total + 1.0) / 12.0
+    };
+
+    let (u_statistic, z_score) = match alternative {
         Alternative::TwoSided => {
-            let z_score = (f64::max(u, mean_u) - mean_u - correction) / var_u.sqrt();
-            z_score.abs()
+            let u = u_x.min(u_y);
+            let z = if var_u > 0.0 {
+                let corrected_u = if u < mean_u { u + 0.5 } else { u - 0.5 };
+                (mean_u - corrected_u) / var_u.sqrt()
+            } else {
+                0.0
+            };
+            (u, z.abs())
+        }
+        Alternative::Less => {
+            let z = if var_u > 0.0 {
+                (mean_u - u_x - 0.5) / var_u.sqrt()
+            } else {
+                0.0
+            };
+            (u_x, z)
+        }
+        Alternative::Greater => {
+            let z = if var_u > 0.0 {
+                (u_x - mean_u - 0.5) / var_u.sqrt()
+            } else {
+                0.0
+            };
+            (u_x, z)
         }
-        Alternative::Less => (u_x - mean_u + correction) / var_u.sqrt(),
-        Alternative::Greater => (u_y - mean_u + correction) / var_u.sqrt(),
     };
 
-    let normal = Normal::new(0.0, 1.0).unwrap();
+    let p_value = calculate_p_value(z_score, alternative, nx_f64, ny_f64);
 
-    let p_value = match alternative {
-        Alternative::TwoSided => 2.0 * (1.0 - normal.cdf(z)),
-        _ => 1.0 - normal.cdf(z),
+    let effect_size = if nx_f64 + ny_f64 > 0.0 {
+        z_score / (nx_f64 + ny_f64).sqrt()
+    } else {
+        0.0
     };
 
-    let effect_size = z / ((nx + ny) as f64).sqrt();
-
-    // Standard error of U
     let standard_error = var_u.sqrt();
 
-    TestResult::with_effect_size(u, p_value, effect_size)
+    TestResult::with_effect_size(u_statistic, p_value, effect_size)
         .with_standard_error(standard_error)
-        .with_metadata("z_score", z)
+        .with_metadata("z_score", z_score.abs())
         .with_metadata("mean_u", mean_u)
         .with_metadata("var_u", var_u)
-        .with_metadata("nx", nx as f64)
-        .with_metadata("ny", ny as f64)
+        .with_metadata("nx", nx_f64)
+        .with_metadata("ny", ny_f64)
+        .with_metadata("tie_correction", tie_info.tie_correction)
+}
+
+#[inline]
+fn calculate_p_value(z: f64, alternative: Alternative, nx: f64, ny: f64) -> f64 {
+    if nx < 3.0 || ny < 3.0 {
+        return 1.0;
+    }
+
+    if !z.is_finite() {
+        return 1.0;
+    }
+
+    if z.abs() > 37.0 {
+        let log_p = log_normal_tail_probability(z.abs());
+        return match alternative {
+            Alternative::TwoSided => (2.0 * log_p.exp()).min(1.0),
+            Alternative::Less | Alternative::Greater => log_p.exp().min(1.0),
+        };
+    }
+
+    match Normal::new(0.0, 1.0) {
+        Ok(normal) => match alternative {
+            Alternative::TwoSided => 2.0 * (1.0 - normal.cdf(z.abs())).min(0.5),
+            Alternative::Less => normal.cdf(z),
+            Alternative::Greater => 1.0 - normal.cdf(z),
+        },
+        Err(_) => 1.0,
+    }
+}
+
+#[inline]
+fn calculate_rank_sum_with_ties(combined: &[(f64, u8)], nx: usize, ny: usize) -> (f64, TieInfo) {
+    let mut rank_sum_x = 0.0;
+    let mut tie_counts = Vec::new();
+    let mut tie_correction = 0.0;
+    let mut i = 0;
+    let len = combined.len();
+
+    while i < len {
+        let val = combined[i].0;
+        let tie_start = i;
+
+        // Count ties
+        while i < len && combined[i].0 == val {
+            i += 1;
+        }
+
+        let tie_end = i;
+        let tie_size = tie_end - tie_start;
+
+        // Average rank for tied values
+        let avg_rank = (tie_start + tie_end - 1) as f64 / 2.0 + 1.0;
+
+        // Accumulate rank sum for group x
+        for j in tie_start..tie_end {
+            if combined[j].1 == 0 {
+                rank_sum_x += avg_rank;
+            }
+        }
+
+        // Calculate tie correction if there are ties
+        if tie_size > 1 {
+            tie_counts.push(tie_size);
+            let t = tie_size as f64;
+            tie_correction += t * (t * t - 1.0);
+        }
+    }
+
+    (
+        rank_sum_x,
+        TieInfo {
+            tie_counts,
+            tie_correction,
+        },
+    )
+}
+
+#[inline]
+fn log_normal_tail_probability(x: f64) -> f64 {
+    if x < 0.0 {
+        return 0.0;
+    }
+
+    if x > 8.0 {
+        let x_sq = x * x;
+        return -0.5 * x_sq - (x * (2.0 * std::f64::consts::PI).sqrt()).ln();
+    }
+
+    let z = x / (2.0_f64).sqrt();
+    log_erfc(z) - (2.0_f64).ln()
+}
+
+#[inline]
+fn log_erfc(x: f64) -> f64 {
+    if x < 0.0 {
+        return 0.0;
+    }
+
+    if x > 26.0 {
+        let x_sq = x * x;
+        return -x_sq - 0.5 * (std::f64::consts::PI).ln() - x.ln();
+    }
+
+    continued_fraction_log_erfc(x)
+}
+
+#[inline]
+fn continued_fraction_log_erfc(x: f64) -> f64 {
+    if x < 2.0 {
+        let erf_val = erf_series(x);
+        return (1.0 - erf_val).ln();
+    }
+
+    // Asymptotic continued fraction for erfc, evaluated bottom-up:
+    // sqrt(pi) * exp(x^2) * erfc(x) = 1 / (x + (1/2)/(x + 1/(x + (3/2)/(x + 2/(x + ...)))))
+    let mut denom = x;
+    for k in (1..=60).rev() {
+        denom = x + (k as f64) * 0.5 / denom;
+    }
+
+    // log erfc(x) = -x^2 - 0.5 * ln(pi) - ln(denominator of the continued fraction)
+    -x * x - 0.5 * (std::f64::consts::PI).ln() - denom.ln()
+}
+
+#[inline]
+fn erf_series(x: f64) -> f64 {
+    let x_sq = x * x;
+    let mut term = x;
+    let mut result = term;
+
+    for n in 1..100 {
+        term *= -x_sq / (n as f64);
+        let new_term = term / (2.0 * n as f64 + 1.0);
+        result += new_term;
+
+        if new_term.abs() < 1e-16 {
+            break;
+        }
+    }
+
+    result * 2.0 / (std::f64::consts::PI).sqrt()
+}
+
+#[inline]
+fn extract_row_values<T>(
+    matrix: &CsrMatrix<T>,
+    row: usize,
+    indices: &[usize],
+    values: &mut Vec<f64>,
+) where
+    T: FloatOpsTS,
+    f64: std::convert::From<T>,
+{
+    // Get row slice for efficient access
+    let row_view = matrix.row(row);
+
+    for &col in indices {
+        if let Some(value) = row_view.get_entry(col) {
+            values.push(value.into_value().into());
+        } else {
+            values.push(0.0);
+        }
+    }
+}
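
Illustrative usage (a minimal sketch, not part of the patch): a direct call to the new
mann_whitney_optimized helper with two plain f64 slices. It assumes the module path mirrors
the file layout (crate::testing::inference::nonparametric), that the Alternative enum is the
one used above, and that TestResult exposes statistic and p_value as readable fields, as
their use in mod.rs suggests; the sample values are made up.

    use crate::testing::inference::nonparametric::mann_whitney_optimized;
    use crate::testing::Alternative;

    fn mann_whitney_smoke_check() {
        // Two small groups with a clear location shift; both have n >= 3,
        // so calculate_p_value takes the normal-approximation branch.
        let group_a = [1.2, 2.9, 2.2, 3.4, 4.0, 5.1];
        let group_b = [5.9, 6.1, 6.7, 7.5, 8.3, 9.0];

        let result = mann_whitney_optimized(&group_a, &group_b, Alternative::TwoSided);

        // U statistic and two-sided p-value carried on the returned TestResult
        // (field access assumed public here for illustration).
        println!("U = {}, p = {}", result.statistic, result.p_value);
    }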