From a43185c98cfcf523af3b092c87c253fab5897a6b Mon Sep 17 00:00:00 2001
From: Ian
Date: Thu, 4 Sep 2025 12:52:20 +0000
Subject: [PATCH] small intermediate push, not production ready!

---
 src/testing/correction/mod.rs          |   2 +-
 src/testing/inference/mod.rs           |  31 +--
 src/testing/inference/nonparametric.rs | 355 ++++++++++++++++++++-----
 3 files changed, 294 insertions(+), 94 deletions(-)

diff --git a/src/testing/correction/mod.rs b/src/testing/correction/mod.rs
index 509494f..18d71fd 100644
--- a/src/testing/correction/mod.rs
+++ b/src/testing/correction/mod.rs
@@ -79,7 +79,7 @@ where
             return Err(anyhow::anyhow!("Invalid p-value at index {:?}: {:?}", i, p));
         }
     }
-    
+
     let n_f = T::from(n).unwrap();
 
     let mut indices: Vec<usize> = (0..n).collect();
diff --git a/src/testing/inference/mod.rs b/src/testing/inference/mod.rs
index 7086033..3b7ea8d 100644
--- a/src/testing/inference/mod.rs
+++ b/src/testing/inference/mod.rs
@@ -1,7 +1,9 @@
+use crate::testing::utils::{extract_unique_groups, get_group_indices};
+use crate::testing::{
+    Alternative, MultipleTestResults, TTestType, TestMethod, TestResult, correction,
+};
 use nalgebra_sparse::CsrMatrix;
 use single_utilities::traits::FloatOpsTS;
-use crate::testing::{correction, Alternative, MultipleTestResults, TTestType, TestMethod, TestResult};
-use crate::testing::utils::{extract_unique_groups, get_group_indices};
 
 pub mod discrete;
@@ -20,7 +22,6 @@ where
         test_type: TTestType,
     ) -> anyhow::Result<Vec<TestResult<T>>>;
 
-
     fn mann_whitney_test(
         &self,
         group1_indices: &[usize],
@@ -37,7 +38,8 @@ impl<T> MatrixStatTests<T> for CsrMatrix<T>
 where
-    T: FloatOpsTS
+    T: FloatOpsTS,
+    f64: std::convert::From<T>,
 {
     fn t_test(
         &self,
         group1_indices: &[usize],
         group2_indices: &[usize],
@@ -45,12 +47,7 @@ where
     ) -> anyhow::Result<Vec<TestResult<T>>> {
-        parametric::t_test_matrix_groups(
-            self,
-            group1_indices,
-            group2_indices,
-            test_type
-        )
+        parametric::t_test_matrix_groups(self, group1_indices, group2_indices, test_type)
     }
 
     fn mann_whitney_test(
         &self,
         group1_indices: &[usize],
@@ -79,19 +76,14 @@ where
         match test_method {
             TestMethod::TTest(test_type) => {
                 // Run t-tests
-                let results = self.t_test(
-                    &group1_indices,
-                    &group2_indices,
-                    test_type,
-                )?;
+                let results = self.t_test(&group1_indices, &group2_indices, test_type)?;
 
                 // Extract statistics and p-values
                 let statistics: Vec<_> = results.iter().map(|r| r.statistic).collect();
                 let p_values: Vec<_> = results.iter().map(|r| r.p_value).collect();
 
                 // Apply multiple testing correction
-                let adjusted_p_values =
-                    correction::benjamini_hochberg_correction(&p_values)?;
+                let adjusted_p_values = correction::benjamini_hochberg_correction(&p_values)?;
 
                 // Extract effect sizes if available
                 let effect_sizes = results
@@ -127,8 +119,7 @@ where
                 let p_values: Vec<_> = results.iter().map(|r| r.p_value).collect();
 
                 // Apply multiple testing correction
-                let adjusted_p_values =
-                    correction::benjamini_hochberg_correction(&p_values)?;
+                let adjusted_p_values = correction::benjamini_hochberg_correction(&p_values)?;
 
                 Ok(MultipleTestResults::new(statistics, p_values)
                     .with_adjusted_p_values(adjusted_p_values)
@@ -139,4 +130,4 @@ where
             _ => Err(anyhow::anyhow!("Test method not implemented yet")),
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/testing/inference/nonparametric.rs b/src/testing/inference/nonparametric.rs
index fcbf09e..19dd3ef 100644
--- a/src/testing/inference/nonparametric.rs
+++ b/src/testing/inference/nonparametric.rs
@@ -1,10 +1,17 @@
-use crate::testing::{Alternative, TestResult};
+use std::{cmp::Ordering, f64};
+
 use nalgebra_sparse::CsrMatrix;
-use rayon::iter::IntoParallelIterator;
-use rayon::iter::ParallelIterator;
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use single_utilities::traits::FloatOpsTS;
 use statrs::distribution::{ContinuousCDF, Normal};
-use std::cmp::Ordering;
+
+use crate::testing::{Alternative, TestResult};
+
+#[derive(Debug, Clone)]
+struct TieInfo {
+    tie_counts: Vec<usize>,
+    tie_correction: f64,
+}
 
 pub fn mann_whitney_matrix_groups<T>(
     matrix: &CsrMatrix<T>,
@@ -14,11 +21,15 @@
 ) -> anyhow::Result<Vec<TestResult<f64>>>
 where
     T: FloatOpsTS,
+    f64: std::convert::From<T>,
 {
     if group1_indices.is_empty() || group2_indices.is_empty() {
-        return Err(anyhow::anyhow!("Group indices cannot be empty"));
+        return Err(anyhow::anyhow!(
+            "Single-Statistics | Group indices cannot be empty. Error code: SS-NP-001"
+        ));
     }
 
+    // Simplified: use the same implementation regardless of size
    let nrows = matrix.nrows();
 
     let results: Vec<_> = (0..nrows)
@@ -27,111 +38,309 @@
         .into_par_iter()
         .map(|row| {
             let mut group1_values: Vec<f64> = Vec::with_capacity(group1_indices.len());
             let mut group2_values: Vec<f64> = Vec::with_capacity(group2_indices.len());
 
-            for &col in group1_indices {
-                if let Some(entry) = matrix.get_entry(row, col) {
-                    let value = entry.into_value();
-                    group1_values.push(value.to_f64().unwrap());
-                }
-            }
+            extract_row_values(matrix, row, group1_indices, &mut group1_values);
+            extract_row_values(matrix, row, group2_indices, &mut group2_values);
 
-            for &col in group2_indices {
-                if let Some(entry) = matrix.get_entry(row, col) {
-                    let value = entry.into_value();
-                    group2_values.push(value.to_f64().unwrap());
-                }
-            }
-            mann_whitney(&group1_values, &group2_values, alternative)
+            mann_whitney_optimized(&group1_values, &group2_values, alternative)
         })
         .collect();
 
     Ok(results)
 }
 
-pub fn mann_whitney(x: &[f64], y: &[f64], alternative: Alternative) -> TestResult {
+pub fn mann_whitney_optimized(x: &[f64], y: &[f64], alternative: Alternative) -> TestResult {
     let nx = x.len();
     let ny = y.len();
 
     if nx == 0 || ny == 0 {
-        return TestResult::new(f64::NAN, 1.0); // Insufficient data
+        return TestResult::new(f64::NAN, 1.0);
     }
 
-    // Combine samples and assign group labels (0 for x, 1 for y)
-    let mut combined: Vec<(f64, usize)> = Vec::with_capacity(nx + ny);
-    combined.extend(x.iter().map(|&v| (v, 0)));
-    combined.extend(y.iter().map(|&v| (v, 1)));
+    if nx == 1 && ny == 1 {
+        let (u, p_value) = if x[0] > y[0] {
+            (
+                1.0,
+                match alternative {
+                    Alternative::Greater => 0.5,
+                    Alternative::Less => 1.0,
+                    Alternative::TwoSided => 1.0,
+                },
+            )
+        } else if x[0] < y[0] {
+            (
+                0.0,
+                match alternative {
+                    Alternative::Greater => 1.0,
+                    Alternative::Less => 0.5,
+                    Alternative::TwoSided => 1.0,
+                },
+            )
+        } else {
+            (0.5, 1.0)
+        };
+        return TestResult::new(u, p_value);
+    }
 
-    // Sort by value
-    combined.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
+    let total_size = nx + ny;
+    let mut combined: Vec<(f64, u8)> = Vec::with_capacity(total_size);
 
-    // Assign ranks (with ties averaged)
-    let mut ranks = vec![0.0; nx + ny];
-    let mut i = 0;
-    while i < combined.len() {
-        let val = combined[i].0;
-        let mut j = i + 1;
+    let mut valid_nx = 0;
+    let mut valid_ny = 0;
 
-        // Find tied values
-        while j < combined.len() && combined[j].0 == val {
-            j += 1;
+    for &v in x {
+        if v.is_finite() {
+            combined.push((v, 0));
+            valid_nx += 1;
         }
+    }
 
-        // Assign average rank to ties
-        let rank = (i + j - 1) as f64 / 2.0 + 1.0;
-        for k in i..j {
-            ranks[k] = rank;
+    for &v in y {
+        if v.is_finite() {
+            combined.push((v, 1));
+            valid_ny += 1;
         }
-
-        i = j;
     }
 
-    // Calculate rank sum for group X
-    let mut rank_sum_x = 0.0;
-    for i in 0..combined.len() {
-        if combined[i].1 == 0 {
-            rank_sum_x += ranks[i];
-        }
+    if valid_nx == 0 || valid_ny == 0 {
+        return TestResult::new(f64::NAN, 1.0);
     }
 
-    let u_x = rank_sum_x - (nx * (nx + 1)) as f64 / 2.0;
-    let u_y = (nx * ny) as f64 - u_x;
+    combined.sort_unstable_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
 
-    let u = match alternative {
-        Alternative::TwoSided => f64::min(u_x, u_y),
-        Alternative::Less => u_x,
-        Alternative::Greater => u_y,
-    };
+    let (rank_sum_x, tie_info) = calculate_rank_sum_with_ties(&combined, valid_nx, valid_ny);
 
-    let mean_u = (nx * ny) as f64 / 2.0;
-    let var_u = (nx * ny * (ny + nx + 1)) as f64 / 12.0;
+    let nx_f64 = valid_nx as f64;
+    let ny_f64 = valid_ny as f64;
 
-    let correction = 0.5;
+    let u_x = rank_sum_x - (nx_f64 * (nx_f64 + 1.0)) / 2.0;
+    let u_y = (nx_f64 * ny_f64) - u_x;
 
-    let z = match alternative {
+    let mean_u = nx_f64 * ny_f64 / 2.0;
+
+    let n_total = nx_f64 + ny_f64;
+    let var_u = if tie_info.tie_correction > 0.0 {
+        (nx_f64 * ny_f64 / 12.0)
+            * (n_total + 1.0 - tie_info.tie_correction / (n_total * (n_total - 1.0)))
+    } else {
+        nx_f64 * ny_f64 * (n_total + 1.0) / 12.0
+    };
+
+    let (u_statistic, z_score) = match alternative {
         Alternative::TwoSided => {
-            let z_score = (f64::max(u, mean_u) - mean_u - correction) / var_u.sqrt();
-            z_score.abs()
+            let u = u_x.min(u_y);
+            let z = if var_u > 0.0 {
+                let corrected_u = if u < mean_u { u + 0.5 } else { u - 0.5 };
+                (mean_u - corrected_u) / var_u.sqrt()
+            } else {
+                0.0
+            };
+            (u, z.abs())
+        }
+        Alternative::Less => {
+            let z = if var_u > 0.0 {
+                (mean_u - u_x - 0.5) / var_u.sqrt()
+            } else {
+                0.0
+            };
+            (u_x, z)
+        }
+        Alternative::Greater => {
+            let z = if var_u > 0.0 {
+                (u_x - mean_u - 0.5) / var_u.sqrt()
+            } else {
+                0.0
+            };
+            (u_x, z)
         }
-        Alternative::Less => (u_x - mean_u + correction) / var_u.sqrt(),
-        Alternative::Greater => (u_y - mean_u + correction) / var_u.sqrt(),
     };
 
-    let normal = Normal::new(0.0, 1.0).unwrap();
+    let p_value = calculate_p_value(z_score, alternative, nx_f64, ny_f64);
 
-    let p_value = match alternative {
-        Alternative::TwoSided => 2.0 * (1.0 - normal.cdf(z)),
-        _ => 1.0 - normal.cdf(z),
+    let effect_size = if nx_f64 + ny_f64 > 0.0 {
+        z_score / (nx_f64 + ny_f64).sqrt()
+    } else {
+        0.0
     };
 
-    let effect_size = z / ((nx + ny) as f64).sqrt();
-
-    // Standard error of U
     let standard_error = var_u.sqrt();
 
-    TestResult::with_effect_size(u, p_value, effect_size)
+    TestResult::with_effect_size(u_statistic, p_value, effect_size)
         .with_standard_error(standard_error)
-        .with_metadata("z_score", z)
+        .with_metadata("z_score", z_score.abs())
         .with_metadata("mean_u", mean_u)
         .with_metadata("var_u", var_u)
-        .with_metadata("nx", nx as f64)
-        .with_metadata("ny", ny as f64)
+        .with_metadata("nx", nx_f64)
+        .with_metadata("ny", ny_f64)
+        .with_metadata("tie_correction", tie_info.tie_correction)
+}
+
+#[inline]
+fn calculate_p_value(z: f64, alternative: Alternative, nx: f64, ny: f64) -> f64 {
+    if nx < 3.0 || ny < 3.0 {
+        return 1.0;
+    }
+
+    if !z.is_finite() {
+        return 1.0;
+    }
+
+    if z.abs() > 37.0 {
+        let log_p = log_normal_tail_probability(z.abs());
+        return match alternative {
+            Alternative::TwoSided => (2.0 * log_p.exp()).min(1.0),
+            Alternative::Less | Alternative::Greater => log_p.exp().min(1.0),
+        };
+    }
+
+    match Normal::new(0.0, 1.0) {
+        Ok(normal) => match alternative {
+            Alternative::TwoSided => 2.0 * (1.0 - normal.cdf(z.abs())).min(0.5),
+            Alternative::Less => normal.cdf(z),
+            Alternative::Greater => 1.0 - normal.cdf(z),
+        },
+        Err(_) => 1.0,
+    }
+}
+
+#[inline]
+fn calculate_rank_sum_with_ties(combined: &[(f64, u8)], nx: usize, ny: usize) -> (f64, TieInfo) {
+    let mut rank_sum_x = 0.0;
+    let mut tie_counts = Vec::new();
+    let mut tie_correction = 0.0;
+    let mut i = 0;
+    let len = combined.len();
+
+    while i < len {
+        let val = combined[i].0;
+        let tie_start = i;
+
+        // Count ties
+        while i < len && combined[i].0 == val {
+            i += 1;
+        }
+
+        let tie_end = i;
+        let tie_size = tie_end - tie_start;
+
+        // Average rank for tied values
+        let avg_rank = (tie_start + tie_end - 1) as f64 / 2.0 + 1.0;
+
+        // Accumulate rank sum for group x
+        for j in tie_start..tie_end {
+            if combined[j].1 == 0 {
+                rank_sum_x += avg_rank;
+            }
+        }
+
+        // Calculate tie correction if there are ties
+        if tie_size > 1 {
+            tie_counts.push(tie_size);
+            let t = tie_size as f64;
+            tie_correction += t * (t * t - 1.0);
+        }
+    }
+
+    (
+        rank_sum_x,
+        TieInfo {
+            tie_counts,
+            tie_correction,
+        },
+    )
+}
+
+#[inline]
+fn log_normal_tail_probability(x: f64) -> f64 {
+    if x < 0.0 {
+        return 0.0;
+    }
+
+    if x > 8.0 {
+        let x_sq = x * x;
+        return -0.5 * x_sq - (x * (2.0 * std::f64::consts::PI).sqrt()).ln();
+    }
+
+    let z = x / (2.0_f64).sqrt();
+    log_erfc(z) - (2.0_f64).ln()
+}
+
+#[inline]
+fn log_erfc(x: f64) -> f64 {
+    if x < 0.0 {
+        return 0.0;
+    }
+
+    if x > 26.0 {
+        let x_sq = x * x;
+        return -x_sq - 0.5 * (std::f64::consts::PI).ln() - x.ln();
+    }
+
+    continued_fraction_log_erfc(x)
+}
+
+#[inline]
+fn continued_fraction_log_erfc(x: f64) -> f64 {
+    if x < 2.0 {
+        let erf_val = erf_series(x);
+        return (1.0 - erf_val).ln();
+    }
+
+    // Asymptotic continued fraction for erfc, evaluated bottom-up:
+    // sqrt(pi) * exp(x^2) * erfc(x) = 1 / (x + (1/2)/(x + 1/(x + (3/2)/(x + 2/(x + ...)))))
+    let mut denom = x;
+    for k in (1..=60).rev() {
+        denom = x + (k as f64) * 0.5 / denom;
+    }
+
+    // log erfc(x) = -x^2 - 0.5 * ln(pi) - ln(denominator of the continued fraction)
+    -x * x - 0.5 * (std::f64::consts::PI).ln() - denom.ln()
+}
+
+#[inline]
+fn erf_series(x: f64) -> f64 {
+    let x_sq = x * x;
+    let mut term = x;
+    let mut result = term;
+
+    for n in 1..100 {
+        term *= -x_sq / (n as f64);
+        let new_term = term / (2.0 * n as f64 + 1.0);
+        result += new_term;
+
+        if new_term.abs() < 1e-16 {
+            break;
+        }
+    }
+
+    result * 2.0 / (std::f64::consts::PI).sqrt()
+}
+
+#[inline]
+fn extract_row_values<T>(
+    matrix: &CsrMatrix<T>,
+    row: usize,
+    indices: &[usize],
+    values: &mut Vec<f64>,
+) where
+    T: FloatOpsTS,
+    f64: std::convert::From<T>,
+{
+    // Get row slice for efficient access
+    let row_view = matrix.row(row);
+
+    for &col in indices {
+        if let Some(value) = row_view.get_entry(col) {
+            values.push(value.into_value().into());
+        } else {
+            values.push(0.0);
+        }
+    }
+}
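
Illustrative usage (a minimal sketch, not part of the patch): a direct call to the new
mann_whitney_optimized helper with two plain f64 slices. It assumes the module path mirrors
the file layout (crate::testing::inference::nonparametric), that the Alternative enum is the
one used above, and that TestResult exposes statistic and p_value as readable fields, as
their use in mod.rs suggests; the sample values are made up.

    use crate::testing::inference::nonparametric::mann_whitney_optimized;
    use crate::testing::Alternative;

    fn mann_whitney_smoke_check() {
        // Two small groups with a clear location shift; both have n >= 3,
        // so calculate_p_value takes the normal-approximation branch.
        let group_a = [1.2, 2.9, 2.2, 3.4, 4.0, 5.1];
        let group_b = [5.9, 6.1, 6.7, 7.5, 8.3, 9.0];

        let result = mann_whitney_optimized(&group_a, &group_b, Alternative::TwoSided);

        // U statistic and two-sided p-value carried on the returned TestResult
        // (field access assumed public here for illustration).
        println!("U = {}, p = {}", result.statistic, result.p_value);
    }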