diff --git a/examples/compare_algorithms.rs b/examples/compare_algorithms.rs new file mode 100644 index 000000000..41e5dcb6e --- /dev/null +++ b/examples/compare_algorithms.rs @@ -0,0 +1,187 @@ +//! Compare all non-parametric algorithms on the bimodal_ke dataset +//! +//! Run with: cargo run --release --example compare_algorithms + +use anyhow::Result; +use pmcore::prelude::*; +use std::time::Instant; + +fn build_problem( + equation: E, + data: Data, + method: NonparametricMethod, + output_path: &str, + initialize_logs: bool, +) -> Result> { + let parameters = ParameterSpace::new() + .add(ParameterSpec::bounded("ke", 0.001, 3.0)) + .add(ParameterSpec::bounded("v", 25.0, 250.0)); + + let assay_error_models = AssayErrorModels::new().add( + 1, + AssayErrorModel::additive(ErrorPoly::new(0.0, 0.5, 0.0, 0.0), 0.0), + )?; + + let observations = ObservationSpec::new() + .with_assay_error_models(assay_error_models) + .add_channel(ObservationChannel::continuous(1, "obs_1")); + + let model = ModelDefinition::builder(equation) + .parameters(parameters) + .observations(observations) + .build()?; + + EstimationProblem::builder(model, data) + .method(EstimationMethod::Nonparametric(method)) + .output(OutputPlan { + write: true, + path: Some(output_path.to_string()), + }) + .runtime(RuntimeOptions { + cycles: 10_000, + prior: Some(Prior::sobol(2028, 22)), + logging: LoggingOptions { + initialize: initialize_logs, + write: false, + stdout: true, + ..LoggingOptions::default() + }, + ..RuntimeOptions::default() + }) + .build() +} + +fn create_equation() -> equation::ODE { + equation::ODE::new( + |x, p, _t, dx, b, rateiv, _cov| { + fetch_params!(p, ke, _v); + dx[1] = -ke * x[1] + rateiv[1] + b[1]; + }, + |_p, _t, _cov| lag! {}, + |_p, _t, _cov| fa! 
{}, + |_p, _t, _cov, _x| {}, + |x, p, _t, _cov, y| { + fetch_params!(p, _ke, v); + y[1] = x[1] / v; + }, + ) +} + +fn run_algorithm( + name: &str, + method: NonparametricMethod, + data: &Data, + initialize_logs: bool, +) -> Result<(f64, usize, usize, std::time::Duration)> { + let eq = create_equation(); + let output_path = format!("examples/bimodal_ke/output_{}/", name.to_lowercase()); + + println!("\n============================================================"); + println!("Running {}", name); + println!("============================================================"); + + let start = Instant::now(); + let fit_result = fit(build_problem( + eq, + data.clone(), + method, + &output_path, + initialize_logs, + )?)?; + let duration = start.elapsed(); + let result = fit_result + .as_nonparametric() + .expect("nonparametric comparison should yield a nonparametric result"); + + let objf = result.objf(); + let n_spp = result.get_theta().nspp(); + let cycles = result.cycles(); + + println!("\n{} Results:", name); + println!(" -2LL (objective): {:.4}", objf); + println!(" Support points: {}", n_spp); + println!(" Cycles: {}", cycles); + println!(" Time: {:.2?}", duration); + + // Print support points summary + let theta = result.get_theta(); + let weights = result.weights(); + println!("\n Support points (ke, v, weight):"); + for (i, spp) in theta.matrix().row_iter().enumerate() { + let w = if i < weights.len() { weights[i] } else { 0.0 }; + if w > 0.01 { + // Only show points with > 1% weight + println!(" [{:.4}, {:.2}] weight: {:.4}", spp[0], spp[1], w); + } + } + + Ok((objf, n_spp, cycles, duration)) +} + +fn main() -> Result<()> { + println!("\n"); + println!("╔══════════════════════════════════════════════════════════╗"); + println!("║ ALGORITHM COMPARISON: Bimodal Ke Dataset ║"); + println!("╚══════════════════════════════════════════════════════════╝"); + + let data = data::read_pmetrics("examples/bimodal_ke/bimodal_ke.csv")?; + println!("\nDataset: {} subjects", 
data.len()); + + let algorithms = [ + ("NPAG", NonparametricMethod::Npag(NpagOptions::default())), + ("NPOD", NonparametricMethod::Npod(NpodOptions::default())), + ("NPSAH", NonparametricMethod::Npsah(NpsahOptions::default())), + ( + "NPSAH2", + NonparametricMethod::Npsah2(Npsah2Options::default()), + ), + ("NPCAT", NonparametricMethod::Npcat(NpcatOptions::default())), + ("NPOPT", NonparametricMethod::Npopt(NpoptOptions::default())), + ("NPPSO", NonparametricMethod::Nppso(NppsoOptions::default())), + ("NPXO", NonparametricMethod::Npxo(NpxoOptions::default())), + ("NPBO", NonparametricMethod::Npbo(NpboOptions::default())), + ("NPCMA", NonparametricMethod::Npcma(NpcmaOptions::default())), + ("NEXUS", NonparametricMethod::Nexus(NexusOptions::default())), + ]; + + let mut results = Vec::new(); + + for (index, (name, method)) in algorithms.iter().enumerate() { + match run_algorithm(name, *method, &data, index == 0) { + Ok(result) => results.push((name.to_string(), result)), + Err(e) => println!(" ERROR running {}: {}", name, e), + } + } + + // Summary table + println!("\n"); + println!("╔══════════════════════════════════════════════════════════════════════════╗"); + println!("║ SUMMARY COMPARISON ║"); + println!("╠══════════╦══════════════╦══════════════╦════════╦════════════════════════╣"); + println!("║ Algorithm║ -2LL ║ Support Pts ║ Cycles ║ Time ║"); + println!("╠══════════╬══════════════╬══════════════╬════════╬════════════════════════╣"); + + for (name, (objf, n_spp, cycles, duration)) in &results { + println!( + "║ {:8} ║ {:12.4} ║ {:12} ║ {:6} ║ {:22.2?} ║", + name, objf, n_spp, cycles, duration + ); + } + println!("╚══════════╩══════════════╩══════════════╩════════╩════════════════════════╝"); + + // Find best result + if let Some((best_name, (best_objf, _, _, _))) = results + .iter() + .min_by(|a, b| a.1 .0.partial_cmp(&b.1 .0).unwrap()) + { + println!("\nBest -2LL: {} with {:.4}", best_name, best_objf); + } + + if let Some((fastest_name, (_, _, _, 
fastest_time))) = + results.iter().min_by(|a, b| a.1 .3.cmp(&b.1 .3)) + { + println!("Fastest: {} with {:?}", fastest_name, fastest_time); + } + + Ok(()) +} diff --git a/examples/compare_npsah.rs b/examples/compare_npsah.rs new file mode 100644 index 000000000..6122111fd --- /dev/null +++ b/examples/compare_npsah.rs @@ -0,0 +1,699 @@ +//! Compare NPSAH vs NPSAH2 on multiple scenarios +//! +//! This example runs both algorithms on several test cases to evaluate improvements: +//! 1. Bimodal distribution (bimodal_ke) - Tests ability to find multiple modes (2 params) +//! 2. Two-compartment with lag (two_eq_lag) - Tests convergence with lag time (4 params) +//! 3. Theophylline (theophylline) - Standard PK model (3 params) +//! 4. Neely model (neely) - Complex multi-output model (10 params) +//! +//! Run with: cargo run --release --example compare_npsah + +use anyhow::Result; +use pmcore::prelude::*; +use std::time::Instant; + +#[derive(Clone)] +struct ExampleRunConfig { + algorithm: Algorithm, + parameter_space: ParameterSpace, + assay_error_models: AssayErrorModels, + output: OutputPlan, + runtime: RuntimeOptions, +} + +fn nonparametric_method(algorithm: Algorithm) -> Result { + Ok(match algorithm { + Algorithm::NPAG => NonparametricMethod::Npag(NpagOptions), + Algorithm::NPBO => NonparametricMethod::Npbo(NpboOptions), + Algorithm::NPCAT => NonparametricMethod::Npcat(NpcatOptions), + Algorithm::NPCMA => NonparametricMethod::Npcma(NpcmaOptions), + Algorithm::NPOD => NonparametricMethod::Npod(NpodOptions), + Algorithm::NPOPT => NonparametricMethod::Npopt(NpoptOptions), + Algorithm::NPPSO => NonparametricMethod::Nppso(NppsoOptions), + Algorithm::NPSAH => NonparametricMethod::Npsah(NpsahOptions), + Algorithm::NPSAH2 => NonparametricMethod::Npsah2(Npsah2Options), + Algorithm::NPXO => NonparametricMethod::Npxo(NpxoOptions), + Algorithm::NEXUS => NonparametricMethod::Nexus(NexusOptions), + Algorithm::POSTPROB => NonparametricMethod::Postprob(PostProbOptions), + other => 
anyhow::bail!("unsupported nonparametric algorithm: {:?}", other), + }) +} + +fn build_problem( + equation: E, + data: Data, + config: &ExampleRunConfig, +) -> Result> { + let observations = config + .assay_error_models + .iter() + .filter(|(_, model)| !matches!(model, AssayErrorModel::None)) + .fold( + ObservationSpec::new().with_assay_error_models(config.assay_error_models.clone()), + |spec, (outeq, _)| { + spec.add_channel(ObservationChannel::continuous( + outeq, + format!("obs_{}", outeq), + )) + }, + ); + + let model = ModelDefinition::builder(equation) + .parameters(config.parameter_space.clone()) + .observations(observations) + .build()?; + + EstimationProblem::builder(model, data) + .method(EstimationMethod::Nonparametric(nonparametric_method( + config.algorithm, + )?)) + .output(config.output.clone()) + .runtime(config.runtime.clone()) + .build() +} + +fn bounded_parameter_space(bounds: &[(&str, f64, f64)]) -> ParameterSpace { + bounds + .iter() + .fold(ParameterSpace::new(), |space, (name, lower, upper)| { + space.add(ParameterSpec::bounded(*name, *lower, *upper)) + }) +} + +fn example_run_config( + algorithm: Algorithm, + parameter_space: ParameterSpace, + assay_error_models: AssayErrorModels, + prior: Prior, +) -> ExampleRunConfig { + ExampleRunConfig { + algorithm, + parameter_space, + assay_error_models, + output: OutputPlan::disabled(), + runtime: RuntimeOptions { + cycles: 500, + cache: true, + progress: true, + idelta: 0.12, + tad: 0.0, + prior: Some(prior), + ..RuntimeOptions::default() + }, + } +} + +// ============================================================================ +// TEST CASE 1: Bimodal Distribution (2 parameters) +// ============================================================================ + +fn create_bimodal_equation() -> equation::ODE { + equation::ODE::new( + |x, p, _t, dx, b, rateiv, _cov| { + fetch_params!(p, ke, _v); + dx[0] = -ke * x[0] + rateiv[0] + b[0]; + }, + |_p, _t, _cov| lag! {}, + |_p, _t, _cov| fa! 
{}, + |_p, _t, _cov, _x| {}, + |x, p, _t, _cov, y| { + fetch_params!(p, _ke, v); + y[0] = x[0] / v; + }, + ) +} + +fn create_bimodal_config(algorithm: Algorithm) -> ExampleRunConfig { + let parameter_space = bounded_parameter_space(&[("ke", 0.001, 3.0), ("v", 25.0, 250.0)]); + + let ems = AssayErrorModels::new() + .add( + 0, + AssayErrorModel::additive(ErrorPoly::new(0.0, 0.5, 0.0, 0.0), 0.0), + ) + .unwrap() + .add(1, AssayErrorModel::None) + .unwrap(); + + example_run_config(algorithm, parameter_space, ems, Prior::sobol(2028, 22)) +} + +// ============================================================================ +// TEST CASE 2: Two-compartment with lag (4 parameters) +// ============================================================================ + +fn create_two_eq_lag_equation() -> equation::ODE { + equation::ODE::new( + |x, p, _t, dx, b, _rateiv, _cov| { + fetch_params!(p, ka, ke, _tlag, _v); + dx[0] = -ka * x[0] + b[0]; + dx[1] = ka * x[0] - ke * x[1]; + }, + |p, _t, _cov| { + fetch_params!(p, _ka, _ke, tlag, _v); + lag! {0 => tlag} + }, + |_p, _t, _cov| fa! 
{}, + |_p, _t, _cov, _x| {}, + |x, p, _t, _cov, y| { + fetch_params!(p, _ka, _ke, _tlag, v); + y[0] = x[1] / v; + }, + ) +} + +fn create_two_eq_lag_config(algorithm: Algorithm) -> ExampleRunConfig { + let parameter_space = bounded_parameter_space(&[ + ("ka", 0.1, 0.9), + ("ke", 0.001, 0.1), + ("tlag", 0.0, 4.0), + ("v", 30.0, 120.0), + ]); + + let ems = AssayErrorModels::new() + .add( + 0, + AssayErrorModel::additive(ErrorPoly::new(-0.00119, 0.44379, -0.45864, 0.16537), 0.0), + ) + .unwrap(); + + example_run_config(algorithm, parameter_space, ems, Prior::sobol(1234, 30)) +} + +// ============================================================================ +// TEST CASE 3: Theophylline (3 parameters) +// ============================================================================ + +fn create_theo_equation() -> equation::Analytical { + equation::Analytical::new( + one_compartment_with_absorption, + |_p, _t, _cov| {}, + |_p, _t, _cov| lag! {}, + |_p, _t, _cov| fa! {}, + |_p, _t, _cov, _x| {}, + |x, p, _t, _cov, y| { + fetch_params!(p, _ka, _ke, v); + y[0] = x[1] * 1000.0 / v; + }, + ) +} + +fn create_theo_config(algorithm: Algorithm) -> ExampleRunConfig { + let parameter_space = + bounded_parameter_space(&[("ka", 0.001, 3.0), ("ke", 0.001, 3.0), ("v", 0.001, 50.0)]); + + let ems = AssayErrorModels::new() + .add( + 0, + AssayErrorModel::proportional(ErrorPoly::new(0.1, 0.1, 0.0, 0.0), 2.0), + ) + .unwrap(); + + example_run_config(algorithm, parameter_space, ems, Prior::sobol(1234, 30)) +} + +// ============================================================================ +// TEST CASE 4: Neely model (10 parameters, complex) +// ============================================================================ + +fn create_neely_equation() -> equation::ODE { + equation::ODE::new( + |x, p, t, dx, b, rateiv, cov| { + fetch_params!(p, cls, k30, k40, qs, vps, vs, fm1, fm2, theta1, theta2); + fetch_cov!(cov, t, wt, pkvisit); + + let cl = cls * ((pkvisit - 1.0) * theta1).exp() * 
(wt / 70.0).powf(0.75); + let q = qs * (wt / 70.0).powf(0.75); + let v = vs * ((pkvisit - 1.0) * theta2).exp() * (wt / 70.0); + let vp = vps * (wt / 70.0); + let ke = cl / v; + let k12 = q / v; + let k21 = q / vp; + + dx[0] = rateiv[0] - ke * x[0] * (1.0 - fm1 - fm2) - (fm1 + fm2) * x[0] - k12 * x[0] + + k21 * x[1] + + b[0]; + dx[1] = k12 * x[0] - k21 * x[1]; + dx[2] = fm1 * x[0] - k30 * x[2]; + dx[3] = fm2 * x[0] - k40 * x[3]; + }, + |_p, _t, _cov| lag! {}, + |_p, _t, _cov| fa! {}, + |_p, _t, _cov, _x| {}, + |x, p, t, cov, y| { + fetch_params!(p, _cls, _k30, _k40, _qs, _vps, vs, _fm1, _fm2, _theta1, theta2); + fetch_cov!(cov, t, wt, pkvisit); + + let vfrac1 = 0.068202; + let vfrac2 = 0.022569; + let v = vs * ((pkvisit - 1.0) * theta2).exp() * (wt / 70.0); + let vm1 = vfrac1 * v; + let vm2 = vfrac2 * v; + + y[0] = x[0] / v; + y[1] = x[2] / vm1; + y[2] = x[3] / vm2; + }, + ) +} + +fn create_neely_config(algorithm: Algorithm) -> ExampleRunConfig { + let parameter_space = bounded_parameter_space(&[ + ("cls", 0.0, 0.4), + ("k30", 0.0, 0.5), + ("k40", 0.3, 1.5), + ("qs", 0.0, 0.5), + ("vps", 0.0, 5.0), + ("vs", 0.0, 2.0), + ("fm1", 0.0, 0.2), + ("fm2", 0.0, 0.1), + ("theta1", -4.0, 2.0), + ("theta2", -2.0, 0.5), + ]); + + let ems = AssayErrorModels::new() + .add( + 0, + AssayErrorModel::proportional(ErrorPoly::new(1.0, 0.1, 0.0, 0.0), 5.0), + ) + .unwrap() + .add( + 1, + AssayErrorModel::proportional(ErrorPoly::new(1.0, 0.1, 0.0, 0.0), 5.0), + ) + .unwrap() + .add( + 2, + AssayErrorModel::proportional(ErrorPoly::new(1.0, 0.1, 0.0, 0.0), 5.0), + ) + .unwrap(); + + example_run_config(algorithm, parameter_space, ems, Prior::sobol(2028, 22)) +} + +// ============================================================================ +// RUN RESULT STRUCT +// ============================================================================ + +struct RunResult { + name: String, + scenario: String, + n_params: usize, + objf: f64, + n_spp: usize, + cycles: usize, + duration: 
std::time::Duration, +} + +// ============================================================================ +// GENERIC RUNNER +// ============================================================================ + +fn run_scenario( + name: &str, + scenario: &str, + n_params: usize, + equation: E, + config: ExampleRunConfig, + data: Data, +) -> Result { + println!(" Running {} on {}...", name, scenario); + let start = Instant::now(); + let fit_result = match fit(build_problem(equation, data, &config)?) { + Ok(r) => r, + Err(e) => { + println!(" [ERROR] {} on {} failed: {:?}", name, scenario, e); + return Err(e); + } + }; + let duration = start.elapsed(); + let result = fit_result + .as_nonparametric() + .expect("benchmark scenario should yield a nonparametric result"); + + Ok(RunResult { + name: name.to_string(), + scenario: scenario.to_string(), + n_params, + objf: result.objf(), + n_spp: result.get_theta().nspp(), + cycles: result.cycles(), + duration, + }) +} + +fn print_divider() { + println!( + "═══════════════════════════════════════════════════════════════════════════════════════" + ); +} + +fn print_section(title: &str) { + println!(); + print_divider(); + println!(" {}", title); + print_divider(); +} + +fn main() -> Result<()> { + // Initialize logging + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::new("info,diffsol=off")) + .with_target(false) + .init(); + + println!(); + println!( + "╔═══════════════════════════════════════════════════════════════════════════════════════╗" + ); + println!( + "║ NPSAH vs NPSAH2 COMPREHENSIVE BENCHMARK ║" + ); + println!( + "║ ║" + ); + println!( + "║ Testing algorithm improvements across models of varying complexity ║" + ); + println!( + "╚═══════════════════════════════════════════════════════════════════════════════════════╝" + ); + + let mut all_results: Vec = Vec::new(); + + // ======================================================================== + // SCENARIO 1: Bimodal Distribution (2 
params) + // ======================================================================== + print_section("SCENARIO 1: Bimodal Distribution (2 params)"); + println!(" Challenge: Find two distinct modes in ke parameter"); + println!(); + + let data_path = "examples/bimodal_ke/bimodal_ke.csv"; + if std::path::Path::new(data_path).exists() { + let data = data::read_pmetrics(data_path)?; + + if let Ok(r) = run_scenario( + "NPSAH", + "bimodal_ke", + 2, + create_bimodal_equation(), + create_bimodal_config(Algorithm::NPSAH), + data.clone(), + ) { + all_results.push(r); + } + if let Ok(r) = run_scenario( + "NPSAH2", + "bimodal_ke", + 2, + create_bimodal_equation(), + create_bimodal_config(Algorithm::NPSAH2), + data, + ) { + all_results.push(r); + } + } else { + println!(" [SKIPPED] Data file not found: {}", data_path); + } + + // ======================================================================== + // SCENARIO 2: Two-compartment with lag (4 params) + // ======================================================================== + print_section("SCENARIO 2: Two-compartment with Lag (4 params)"); + println!(" Challenge: Handle absorption lag time parameter"); + println!(); + + let data_path = "examples/two_eq_lag/two_eq_lag.csv"; + if std::path::Path::new(data_path).exists() { + let data = data::read_pmetrics(data_path)?; + + if let Ok(r) = run_scenario( + "NPSAH", + "two_eq_lag", + 4, + create_two_eq_lag_equation(), + create_two_eq_lag_config(Algorithm::NPSAH), + data.clone(), + ) { + all_results.push(r); + } + if let Ok(r) = run_scenario( + "NPSAH2", + "two_eq_lag", + 4, + create_two_eq_lag_equation(), + create_two_eq_lag_config(Algorithm::NPSAH2), + data, + ) { + all_results.push(r); + } + } else { + println!(" [SKIPPED] Data file not found: {}", data_path); + } + + // ======================================================================== + // SCENARIO 3: Theophylline (3 params) + // ======================================================================== + 
print_section("SCENARIO 3: Theophylline (3 params)"); + println!(" Challenge: Standard PK with analytical solution"); + println!(); + + let data_path = "examples/theophylline/theophylline.csv"; + if std::path::Path::new(data_path).exists() { + let data = data::read_pmetrics(data_path)?; + + if let Ok(r) = run_scenario( + "NPSAH", + "theophylline", + 3, + create_theo_equation(), + create_theo_config(Algorithm::NPSAH), + data.clone(), + ) { + all_results.push(r); + } + if let Ok(r) = run_scenario( + "NPSAH2", + "theophylline", + 3, + create_theo_equation(), + create_theo_config(Algorithm::NPSAH2), + data, + ) { + all_results.push(r); + } + } else { + println!(" [SKIPPED] Data file not found: {}", data_path); + } + + // ======================================================================== + // SCENARIO 4: Neely model (10 params) + // ======================================================================== + print_section("SCENARIO 4: Neely Model (10 params)"); + println!(" Challenge: High-dimensional parameter space, multiple outputs"); + println!(); + + let data_path = "examples/neely/data.csv"; + if std::path::Path::new(data_path).exists() { + let data = data::read_pmetrics(data_path)?; + + if let Ok(r) = run_scenario( + "NPSAH", + "neely", + 10, + create_neely_equation(), + create_neely_config(Algorithm::NPSAH), + data.clone(), + ) { + all_results.push(r); + } + if let Ok(r) = run_scenario( + "NPSAH2", + "neely", + 10, + create_neely_equation(), + create_neely_config(Algorithm::NPSAH2), + data, + ) { + all_results.push(r); + } + } else { + println!(" [SKIPPED] Data file not found: {}", data_path); + } + + // ======================================================================== + // SUMMARY TABLE + // ======================================================================== + print_section("SUMMARY RESULTS"); + + println!(); + println!( + "┌───────────┬──────────────┬────────┬──────────────┬────────────┬────────┬──────────────┐" + ); + println!( + "│ Algorithm 
│ Scenario │ Params │ -2LL │ Support Pts│ Cycles │ Time │" + ); + println!( + "├───────────┼──────────────┼────────┼──────────────┼────────────┼────────┼──────────────┤" + ); + + for result in &all_results { + println!( + "│ {:9} │ {:12} │ {:6} │ {:12.4} │ {:10} │ {:6} │ {:10.2?} │", + result.name, + result.scenario, + result.n_params, + result.objf, + result.n_spp, + result.cycles, + result.duration + ); + } + println!( + "└───────────┴──────────────┴────────┴──────────────┴────────────┴────────┴──────────────┘" + ); + + // ======================================================================== + // COMPARATIVE ANALYSIS BY SCENARIO + // ======================================================================== + print_section("COMPARATIVE ANALYSIS"); + + let scenarios: Vec<&str> = all_results + .iter() + .map(|r| r.scenario.as_str()) + .collect::>() + .into_iter() + .collect(); + + let mut npsah_wins = 0; + let mut npsah2_wins = 0; + let mut total_speedup = 0.0; + let mut total_objf_improvement = 0.0; + let mut comparison_count = 0; + + for scenario in &scenarios { + let npsah = all_results + .iter() + .find(|r| r.name == "NPSAH" && r.scenario == *scenario); + let npsah2 = all_results + .iter() + .find(|r| r.name == "NPSAH2" && r.scenario == *scenario); + + if let (Some(r1), Some(r2)) = (npsah, npsah2) { + println!(); + println!(" {} ({} params):", scenario, r1.n_params); + println!(" ─────────────────────────────────────────────────────"); + + let objf_diff = r1.objf - r2.objf; + let time_ratio = r1.duration.as_secs_f64() / r2.duration.as_secs_f64(); + + let better_objf = if objf_diff > 0.001 { + "NPSAH2" + } else if objf_diff < -0.001 { + "NPSAH" + } else { + "TIE" + }; + + println!( + " -2LL: NPSAH={:.4}, NPSAH2={:.4} → {} wins", + r1.objf, r2.objf, better_objf + ); + + if time_ratio > 1.0 { + println!(" Speed: NPSAH2 is {:.2}x FASTER", time_ratio); + } else { + println!(" Speed: NPSAH2 is {:.2}x slower", 1.0 / time_ratio); + } + + println!(" Cycles: NPSAH={}, 
NPSAH2={}", r1.cycles, r2.cycles); + println!(" SPPs: NPSAH={}, NPSAH2={}", r1.n_spp, r2.n_spp); + + // Track wins + if objf_diff > 0.001 { + npsah2_wins += 1; + } else if objf_diff < -0.001 { + npsah_wins += 1; + } + + total_speedup += time_ratio; + total_objf_improvement += objf_diff; + comparison_count += 1; + } + } + + // ======================================================================== + // OVERALL SUMMARY + // ======================================================================== + print_section("OVERALL SUMMARY"); + + if comparison_count > 0 { + let avg_speedup = total_speedup / comparison_count as f64; + let avg_objf_improvement = total_objf_improvement / comparison_count as f64; + + println!(); + println!(" Scenarios compared: {}", comparison_count); + println!(); + println!( + " -2LL Wins: NPSAH={}, NPSAH2={}, Ties={}", + npsah_wins, + npsah2_wins, + comparison_count - npsah_wins - npsah2_wins + ); + println!(); + if avg_speedup > 1.0 { + println!( + " Avg Speed: NPSAH2 is {:.2}x FASTER on average", + avg_speedup + ); + } else { + println!( + " Avg Speed: NPSAH2 is {:.2}x slower on average", + 1.0 / avg_speedup + ); + } + println!(); + if avg_objf_improvement > 0.0 { + println!( + " Avg -2LL: NPSAH2 finds {:.4} BETTER solutions on average", + avg_objf_improvement + ); + } else { + println!( + " Avg -2LL: NPSAH finds {:.4} better solutions on average", + -avg_objf_improvement + ); + } + println!(); + + // Final verdict + let speed_verdict = if avg_speedup > 1.1 { + "faster" + } else if avg_speedup < 0.9 { + "slower" + } else { + "similar speed" + }; + let quality_verdict = if avg_objf_improvement > 0.01 { + "better solutions" + } else if avg_objf_improvement < -0.01 { + "worse solutions" + } else { + "similar quality" + }; + + println!( + " ╔═════════════════════════════════════════════════════════════════════════════╗" + ); + println!( + " ║ VERDICT: NPSAH2 is {} and finds {} ║", + speed_verdict, quality_verdict + ); + println!( + " 
╚═════════════════════════════════════════════════════════════════════════════╝" + ); + } + + println!(); + Ok(()) +} diff --git a/examples/neely/main.rs b/examples/neely/main.rs new file mode 100644 index 000000000..c2d8a977b --- /dev/null +++ b/examples/neely/main.rs @@ -0,0 +1,107 @@ +use pmcore::prelude::*; +fn main() { + let ode = ode! { + diffeq: |x, p, t, dx, b, rateiv, cov| { + fetch_params!(p, cls, k30, k40, qs, vps, vs, fm1, fm2, theta1, theta2); + fetch_cov!(cov, t, wt, pkvisit); + + let vfrac1 = 0.068202; + let vfrac2 = 0.022569; + let cl = cls * ((pkvisit - 1.0) * theta1).exp() * (wt / 70.0).powf(0.75); + let q = qs * (wt / 70.0).powf(0.75); + let v = vs * ((pkvisit - 1.0) * theta2).exp() * (wt / 70.0); + let vp = vps * (wt / 70.0); + let ke = cl / v; + let _vm1 = vfrac1 * v; + let _vm2 = vfrac2 * v; + let k12 = q / v; + let k21 = q / vp; + + // + dx[0] = rateiv[1] - ke * x[0] * (1.0 - fm1 - fm2) - (fm1 + fm2) * x[0] - k12 * x[0] + + k21 * x[1] + + b[1]; + dx[1] = k12 * x[0] - k21 * x[1]; + dx[2] = fm1 * x[0] - k30 * x[2]; + dx[3] = fm2 * x[0] - k40 * x[3]; + }, + out: |x, p, t, cov, y| { + fetch_params!(p, cls, _k30, _k40, qs, vps, vs, _fm1, _fm2, theta1, theta2); + fetch_cov!(cov, t, wt, pkvisit); + + let vfrac1 = 0.068202; + let vfrac2 = 0.022569; + let cl = cls * ((pkvisit - 1.0) * theta1).exp() * (wt / 70.0).powf(0.75); + let q = qs * (wt / 70.0).powf(0.75); + let v = vs * ((pkvisit - 1.0) * theta2).exp() * (wt / 70.0); + let vp = vps * (wt / 70.0); + let _ke = cl / v; + let vm1 = vfrac1 * v; + let vm2 = vfrac2 * v; + let _k12 = q / v; + let _k21 = q / vp; + + y[1] = x[0] / v; + y[2] = x[2] / vm1; + y[3] = x[3] / vm2; + }, + }; + let observations = ObservationSpec::new() + .add_channel(ObservationChannel::continuous(1, "cp")) + .add_channel(ObservationChannel::continuous(2, "m1")) + .add_channel(ObservationChannel::continuous(3, "m2")) + .with_assay_error_models( + AssayErrorModels::new() + .add( + 1, + 
AssayErrorModel::proportional(ErrorPoly::new(1.0, 0.1, 0.0, 0.0), 5.0), + ) + .unwrap() + .add( + 2, + AssayErrorModel::proportional(ErrorPoly::new(1.0, 0.1, 0.0, 0.0), 5.0), + ) + .unwrap() + .add( + 3, + AssayErrorModel::proportional(ErrorPoly::new(1.0, 0.1, 0.0, 0.0), 5.0), + ) + .unwrap(), + ); + + let model = ModelDefinition::builder(ode) + .parameters( + ParameterSpace::new() + .add(ParameterSpec::bounded("cls", 0.0, 0.4)) + .add(ParameterSpec::bounded("k30", 0.0, 0.5)) + .add(ParameterSpec::bounded("k40", 0.3, 1.5)) + .add(ParameterSpec::bounded("qs", 0.0, 0.5)) + .add(ParameterSpec::bounded("vps", 0.0, 5.0)) + .add(ParameterSpec::bounded("vs", 0.0, 2.0)) + .add(ParameterSpec::bounded("fm1", 0.0, 0.2)) + .add(ParameterSpec::bounded("fm2", 0.0, 0.1)) + .add(ParameterSpec::bounded("theta1", -4.0, 2.0)) + .add(ParameterSpec::bounded("theta2", -2.0, 0.5)), + ) + .observations(observations) + .build() + .unwrap(); + + let data = data::read_pmetrics("examples/neely/data.csv").unwrap(); + let mut result = EstimationProblem::builder(model, data) + .method(EstimationMethod::Nonparametric(NonparametricMethod::Npsah( + NpsahOptions::default(), + ))) + .output(OutputPlan { + write: true, + path: Some("examples/neely/output/".to_string()), + }) + .runtime(RuntimeOptions { + cycles: 1000, + prior: Some(Prior::sobol(2028, 22)), + ..RuntimeOptions::default() + }) + .run() + .unwrap(); + result.write_outputs().unwrap(); +} diff --git a/examples/paper_benchmarks.rs b/examples/paper_benchmarks.rs new file mode 100644 index 000000000..3823aad1c --- /dev/null +++ b/examples/paper_benchmarks.rs @@ -0,0 +1,686 @@ +//! Comprehensive algorithm benchmarking for the paper +//! +//! Run with: cargo run --release --example paper_benchmarks -- [category] +//! +//! Categories: +//! all - Run all benchmarks (WARNING: takes many hours) +//! a - Reproducibility (bimodal_ke, 5 seeds, all algorithms) +//! d - Convergence speed (theophylline, 3D unimodal) +//! 
e - Lag time (two_eq_lag, 4D with tlag) +//! f - Multi-output with covariates (meta, 7D) +//! g - High-dimensional (neely, 10D) +//! quick - Quick sanity check (bimodal_ke, 1 seed, 3 algorithms) + +use anyhow::Result; +use pmcore::prelude::*; +use std::fs::{self, File}; +use std::io::Write; +use std::time::Instant; + +#[derive(Clone)] +struct ExampleRunConfig { + algorithm: Algorithm, + parameter_space: ParameterSpace, + assay_error_models: AssayErrorModels, + output: OutputPlan, + runtime: RuntimeOptions, +} + +fn nonparametric_method(algorithm: Algorithm) -> Result { + Ok(match algorithm { + Algorithm::NPAG => NonparametricMethod::Npag(NpagOptions), + Algorithm::NPBO => NonparametricMethod::Npbo(NpboOptions), + Algorithm::NPCAT => NonparametricMethod::Npcat(NpcatOptions), + Algorithm::NPCMA => NonparametricMethod::Npcma(NpcmaOptions), + Algorithm::NPOD => NonparametricMethod::Npod(NpodOptions), + Algorithm::NPOPT => NonparametricMethod::Npopt(NpoptOptions), + Algorithm::NPPSO => NonparametricMethod::Nppso(NppsoOptions), + Algorithm::NPSAH => NonparametricMethod::Npsah(NpsahOptions), + Algorithm::NPSAH2 => NonparametricMethod::Npsah2(Npsah2Options), + Algorithm::NPXO => NonparametricMethod::Npxo(NpxoOptions), + Algorithm::NEXUS => NonparametricMethod::Nexus(NexusOptions), + Algorithm::POSTPROB => NonparametricMethod::Postprob(PostProbOptions), + other => anyhow::bail!("unsupported nonparametric algorithm: {:?}", other), + }) +} + +fn build_problem( + equation: E, + data: Data, + config: &ExampleRunConfig, +) -> Result> { + let observations = config + .assay_error_models + .iter() + .filter(|(_, model)| !matches!(model, AssayErrorModel::None)) + .fold( + ObservationSpec::new().with_assay_error_models(config.assay_error_models.clone()), + |spec, (outeq, _)| { + spec.add_channel(ObservationChannel::continuous( + outeq, + format!("obs_{}", outeq), + )) + }, + ); + + let model = ModelDefinition::builder(equation) + .parameters(config.parameter_space.clone()) + 
.observations(observations) + .build()?; + + EstimationProblem::builder(model, data) + .method(EstimationMethod::Nonparametric(nonparametric_method( + config.algorithm, + )?)) + .output(config.output.clone()) + .runtime(config.runtime.clone()) + .build() +} + +fn bounded_parameter_space(bounds: &[(&str, f64, f64)]) -> ParameterSpace { + bounds + .iter() + .fold(ParameterSpace::new(), |space, (name, lower, upper)| { + space.add(ParameterSpec::bounded(*name, *lower, *upper)) + }) +} + +// ============================================================================ +// MODELS +// ============================================================================ + +fn bimodal_ke_equation() -> equation::ODE { + equation::ODE::new( + |x, p, _t, dx, _b, rateiv, _cov| { + fetch_params!(p, ke, _v); + dx[0] = -ke * x[0] + rateiv[1]; + }, + |_p, _t, _cov| lag! {}, + |_p, _t, _cov| fa! {}, + |_p, _t, _cov, _x| {}, + |x, p, _t, _cov, y| { + fetch_params!(p, _ke, v); + y[1] = x[0] / v; + }, + ) +} + +fn theophylline_equation() -> equation::Analytical { + equation::Analytical::new( + one_compartment_with_absorption, + |_p, _t, _cov| {}, + |_p, _t, _cov| lag! {}, + |_p, _t, _cov| fa! {}, + |_p, _t, _cov, _x| {}, + |x, p, _t, _cov, y| { + fetch_params!(p, _ka, _ke, v); + y[0] = x[1] * 1000.0 / v; + }, + ) +} + +fn two_eq_lag_equation() -> equation::ODE { + equation::ODE::new( + |x, p, _t, dx, b, _rateiv, _cov| { + fetch_params!(p, ka, ke); + dx[0] = -ka * x[0] + b[0]; + dx[1] = ka * x[0] - ke * x[1]; + }, + |p, _t, _cov| { + fetch_params!(p, _ka, _ke, tlag, _v); + lag! {0=>tlag} + }, + |_p, _t, _cov| fa! 
{}, + |_p, _t, _cov, _x| {}, + |x, p, _t, _cov, y| { + fetch_params!(p, _ka, _ke, _tlag, v); + y[0] = x[1] / v; + }, + ) +} + +fn meta_equation() -> equation::ODE { + equation::ODE::new( + |x, p, t, dx, _b, rateiv, cov| { + fetch_cov!(cov, t, wt, pkvisit); + fetch_params!(p, cls, fm, k20, relv, theta1, theta2, vs); + let cl = cls * ((pkvisit - 1.0) * theta1).exp() * (wt / 70.0).powf(0.75); + let v = vs * ((pkvisit - 1.0) * theta2).exp() * (wt / 70.0); + let ke = cl / v; + let _v2 = relv * v; + dx[0] = rateiv[1] - ke * x[0] * (1.0 - fm) - fm * x[0]; + dx[1] = fm * x[0] - k20 * x[1]; + }, + |_p, _t, _cov| lag! {}, + |_p, _t, _cov| fa! {}, + |_p, _t, _cov, _x| {}, + |x, p, t, cov, y| { + fetch_cov!(cov, t, wt, pkvisit); + fetch_params!(p, cls, _fm, _k20, relv, theta1, theta2, vs); + let _cl = cls * ((pkvisit - 1.0) * theta1).exp() * (wt / 70.0).powf(0.75); + let v = vs * ((pkvisit - 1.0) * theta2).exp() * (wt / 70.0); + let v2 = relv * v; + y[1] = x[0] / v; + y[2] = x[1] / v2; + }, + ) +} + +fn neely_equation() -> equation::ODE { + equation::ODE::new( + |x, p, t, dx, _b, rateiv, cov| { + fetch_params!(p, cls, k30, k40, qs, vps, vs, fm1, fm2, theta1, theta2); + fetch_cov!(cov, t, wt, pkvisit); + let cl = cls * ((pkvisit - 1.0) * theta1).exp() * (wt / 70.0).powf(0.75); + let q = qs * (wt / 70.0).powf(0.75); + let v = vs * ((pkvisit - 1.0) * theta2).exp() * (wt / 70.0); + let vp = vps * (wt / 70.0); + let ke = cl / v; + let k12 = q / v; + let k21 = q / vp; + dx[0] = rateiv[1] - ke * x[0] * (1.0 - fm1 - fm2) - (fm1 + fm2) * x[0] - k12 * x[0] + + k21 * x[1]; + dx[1] = k12 * x[0] - k21 * x[1]; + dx[2] = fm1 * x[0] - k30 * x[2]; + dx[3] = fm2 * x[0] - k40 * x[3]; + }, + |_p, _t, _cov| lag! {}, + |_p, _t, _cov| fa! 
{}, + |_p, _t, _cov, _x| {}, + |x, p, t, cov, y| { + fetch_params!(p, cls, _k30, _k40, qs, vps, vs, _fm1, _fm2, theta1, theta2); + fetch_cov!(cov, t, wt, pkvisit); + let vfrac1 = 0.068202; + let vfrac2 = 0.022569; + let _cl = cls * ((pkvisit - 1.0) * theta1).exp() * (wt / 70.0).powf(0.75); + let _q = qs * (wt / 70.0).powf(0.75); + let v = vs * ((pkvisit - 1.0) * theta2).exp() * (wt / 70.0); + let _vp = vps * (wt / 70.0); + let vm1 = vfrac1 * v; + let vm2 = vfrac2 * v; + y[1] = x[0] / v; + y[2] = x[2] / vm1; + y[3] = x[3] / vm2; + }, + ) +} + +// ============================================================================ +// DATASET CONFIGURATIONS +// ============================================================================ + +#[derive(Clone)] +struct DatasetConfig { + name: &'static str, + data_path: &'static str, + parameters: Vec<(&'static str, f64, f64)>, + error_models: Vec<(usize, f64, f64, f64, bool)>, // (output, c0, c1, scale, is_proportional) +} + +fn bimodal_ke_config() -> DatasetConfig { + DatasetConfig { + name: "bimodal_ke", + data_path: "examples/bimodal_ke/bimodal_ke.csv", + parameters: vec![("ke", 0.001, 3.0), ("v", 25.0, 250.0)], + error_models: vec![(1, 0.0, 0.5, 0.0, false)], + } +} + +fn theophylline_config() -> DatasetConfig { + DatasetConfig { + name: "theophylline", + data_path: "examples/theophylline/theophylline.csv", + parameters: vec![("ka", 0.001, 3.0), ("ke", 0.001, 3.0), ("v", 0.001, 50.0)], + error_models: vec![(0, 0.1, 0.1, 2.0, true)], + } +} + +fn two_eq_lag_config() -> DatasetConfig { + DatasetConfig { + name: "two_eq_lag", + data_path: "examples/two_eq_lag/two_eq_lag.csv", + parameters: vec![ + ("ka", 0.1, 0.9), + ("ke", 0.001, 0.1), + ("tlag", 0.0, 4.0), + ("v", 30.0, 120.0), + ], + error_models: vec![(0, -0.00119, 0.44379, 0.0, false)], + } +} + +fn meta_config() -> DatasetConfig { + DatasetConfig { + name: "meta", + data_path: "examples/meta/meta.csv", + parameters: vec![ + ("cls", 0.1, 10.0), + ("fm", 0.0, 1.0), + ("k20", 
0.01, 1.0), + ("relv", 0.1, 1.0), + ("theta1", 0.1, 10.0), + ("theta2", 0.1, 10.0), + ("vs", 1.0, 10.0), + ], + error_models: vec![(1, 1.0, 0.1, 5.0, true), (2, 1.0, 0.1, 5.0, true)], + } +} + +fn neely_config() -> DatasetConfig { + DatasetConfig { + name: "neely", + data_path: "examples/neely/data.csv", + parameters: vec![ + ("cls", 0.0, 0.4), + ("k30", 0.0, 0.5), + ("k40", 0.3, 1.5), + ("qs", 0.0, 0.5), + ("vps", 0.0, 5.0), + ("vs", 0.0, 2.0), + ("fm1", 0.0, 0.2), + ("fm2", 0.0, 0.1), + ("theta1", -4.0, 2.0), + ("theta2", -2.0, 0.5), + ], + error_models: vec![ + (1, 1.0, 0.1, 5.0, true), + (2, 1.0, 0.1, 5.0, true), + (3, 1.0, 0.1, 5.0, true), + ], + } +} + +// ============================================================================ +// BENCHMARK RESULT +// ============================================================================ + +#[derive(Debug)] +struct BenchmarkResult { + experiment: String, + dataset: String, + algorithm: String, + seed: u64, + cycles: usize, + time_secs: f64, + objf: f64, + n_spp: usize, + converged: bool, +} + +impl BenchmarkResult { + fn to_csv_row(&self) -> String { + format!( + "{},{},{},{},{},{:.2},{:.4},{},{}\n", + self.experiment, + self.dataset, + self.algorithm, + self.seed, + self.cycles, + self.time_secs, + self.objf, + self.n_spp, + self.converged + ) + } +} + +// ============================================================================ +// BENCHMARK RUNNER +// ============================================================================ + +fn create_config( + algorithm: Algorithm, + config: &DatasetConfig, + seed: u64, + max_cycles: usize, + output_path: &str, +) -> ExampleRunConfig { + let parameter_space = bounded_parameter_space(&config.parameters); + + let mut ems = AssayErrorModels::new(); + for (output, c0, c1, scale, is_proportional) in &config.error_models { + if *is_proportional { + ems = ems + .add( + *output, + AssayErrorModel::proportional(ErrorPoly::new(*c0, *c1, 0.0, 0.0), *scale), + ) + .unwrap(); + } 
else { + ems = ems + .add( + *output, + AssayErrorModel::additive(ErrorPoly::new(*c0, *c1, 0.0, 0.0), *scale), + ) + .unwrap(); + } + } + + ExampleRunConfig { + algorithm, + parameter_space, + assay_error_models: ems, + output: OutputPlan { + write: true, + path: Some(output_path.to_string()), + }, + runtime: RuntimeOptions { + cycles: max_cycles, + cache: true, + progress: true, + idelta: 0.12, + tad: 0.0, + prior: Some(Prior::sobol(2028, seed as usize)), + ..RuntimeOptions::default() + }, + } +} + +/// Run a single benchmark, writing theta files for post-hoc analysis +macro_rules! run_fit { + ($settings:expr, $eq:expr, $data:expr) => {{ + let config = $settings; + let equation = $eq; + let fit_result = fit(build_problem(equation, $data.clone(), &config)?)?; + let result = fit_result + .as_nonparametric() + .expect("benchmark fit should yield a nonparametric result"); + let _ = result.write_theta(); + (result.objf(), result.get_theta().nspp(), result.cycles()) + }}; +} + +fn run_single_benchmark( + experiment: &str, + dataset_config: &DatasetConfig, + algorithm: Algorithm, + algorithm_name: &str, + seed: u64, + max_cycles: usize, + data: &Data, +) -> Result { + let output_path = format!( + "examples/paper_benchmarks/output/{}/{}_seed{}/", + dataset_config.name, algorithm_name, seed + ); + fs::create_dir_all(&output_path)?; + + let config = create_config(algorithm, dataset_config, seed, max_cycles, &output_path); + + println!( + " Running {} on {} (seed {})...", + algorithm_name, dataset_config.name, seed + ); + + let start = Instant::now(); + + let (objf, n_spp, cycles) = match dataset_config.name { + "bimodal_ke" => run_fit!(config, bimodal_ke_equation(), data), + "theophylline" => run_fit!(config, theophylline_equation(), data), + "two_eq_lag" => run_fit!(config, two_eq_lag_equation(), data), + "meta" => run_fit!(config, meta_equation(), data), + "neely" => run_fit!(config, neely_equation(), data), + _ => anyhow::bail!("Unknown dataset: {}", 
dataset_config.name), + }; + + let duration = start.elapsed(); + + let result = BenchmarkResult { + experiment: experiment.to_string(), + dataset: dataset_config.name.to_string(), + algorithm: algorithm_name.to_string(), + seed, + cycles, + time_secs: duration.as_secs_f64(), + objf, + n_spp, + converged: true, + }; + + println!( + " -> -2LL: {:.4}, cycles: {}, time: {:.2}s, spp: {}", + result.objf, result.cycles, result.time_secs, result.n_spp + ); + + Ok(result) +} + +// ============================================================================ +// ALGORITHM SETS +// ============================================================================ + +fn all_algorithms() -> Vec<(&'static str, Algorithm)> { + vec![ + ("NPAG", Algorithm::NPAG), + ("NPOD", Algorithm::NPOD), + ("NPSAH", Algorithm::NPSAH), + ("NPSAH2", Algorithm::NPSAH2), + ("NPCAT", Algorithm::NPCAT), + ("NPOPT", Algorithm::NPOPT), + ("NPPSO", Algorithm::NPPSO), + ("NPXO", Algorithm::NPXO), + ("NPBO", Algorithm::NPBO), + ("NPCMA", Algorithm::NPCMA), + ("NEXUS", Algorithm::NEXUS), + ] +} + +/// Algorithms competitive enough to test on harder problems +fn competitive_algorithms() -> Vec<(&'static str, Algorithm)> { + vec![ + ("NPAG", Algorithm::NPAG), + ("NPOD", Algorithm::NPOD), + ("NPSAH", Algorithm::NPSAH), + ("NPSAH2", Algorithm::NPSAH2), + ("NPCAT", Algorithm::NPCAT), + ("NPOPT", Algorithm::NPOPT), + ("NPPSO", Algorithm::NPPSO), + ("NEXUS", Algorithm::NEXUS), + ] +} + +// ============================================================================ +// EXPERIMENT CATEGORIES +// ============================================================================ + +fn run_category_a(results_file: &mut File) -> Result<()> { + println!("\n========================================"); + println!("CATEGORY A: Reproducibility & Multimodality"); + println!("bimodal_ke | 51 subj | 2D | bimodal ke"); + println!("========================================\n"); + + let config = bimodal_ke_config(); + let data = 
data::read_pmetrics(config.data_path)?; + let seeds = [42u64, 123, 456, 789, 1001]; + + for (name, alg) in &all_algorithms() { + for seed in &seeds { + match run_single_benchmark("A", &config, *alg, name, *seed, 10000, &data) { + Ok(r) => { + results_file.write_all(r.to_csv_row().as_bytes())?; + results_file.flush()?; + } + Err(e) => eprintln!(" ERROR: {}", e), + } + } + } + Ok(()) +} + +fn run_category_d(results_file: &mut File) -> Result<()> { + println!("\n========================================"); + println!("CATEGORY D: Unimodal Convergence"); + println!("theophylline | 12 subj | 3D | analytical"); + println!("========================================\n"); + + let config = theophylline_config(); + let data = data::read_pmetrics(config.data_path)?; + let seeds = [42u64, 123, 456]; + + for (name, alg) in &competitive_algorithms() { + for seed in &seeds { + match run_single_benchmark("D", &config, *alg, name, *seed, 500, &data) { + Ok(r) => { + results_file.write_all(r.to_csv_row().as_bytes())?; + results_file.flush()?; + } + Err(e) => eprintln!(" ERROR: {}", e), + } + } + } + Ok(()) +} + +fn run_category_e(results_file: &mut File) -> Result<()> { + println!("\n========================================"); + println!("CATEGORY E: Lag Time Estimation"); + println!("two_eq_lag | 20 subj | 4D | ODE + lag"); + println!("========================================\n"); + + let config = two_eq_lag_config(); + let data = data::read_pmetrics(config.data_path)?; + let seeds = [42u64, 123, 456]; + + for (name, alg) in &competitive_algorithms() { + for seed in &seeds { + match run_single_benchmark("E", &config, *alg, name, *seed, 5000, &data) { + Ok(r) => { + results_file.write_all(r.to_csv_row().as_bytes())?; + results_file.flush()?; + } + Err(e) => eprintln!(" ERROR: {}", e), + } + } + } + Ok(()) +} + +fn run_category_f(results_file: &mut File) -> Result<()> { + println!("\n========================================"); + println!("CATEGORY F: Medium-dim + Covariates"); + 
println!("meta | 19 subj | 7D | 2 outputs | covariates"); + println!("========================================\n"); + + let config = meta_config(); + let data = data::read_pmetrics(config.data_path)?; + let seeds = [42u64, 123, 456]; + + for (name, alg) in &competitive_algorithms() { + for seed in &seeds { + match run_single_benchmark("F", &config, *alg, name, *seed, 5000, &data) { + Ok(r) => { + results_file.write_all(r.to_csv_row().as_bytes())?; + results_file.flush()?; + } + Err(e) => eprintln!(" ERROR: {}", e), + } + } + } + Ok(()) +} + +fn run_category_g(results_file: &mut File) -> Result<()> { + println!("\n========================================"); + println!("CATEGORY G: High Dimensionality"); + println!("neely | 22 subj | 10D | 3 outputs | covariates"); + println!("========================================\n"); + + let config = neely_config(); + let data = data::read_pmetrics(config.data_path)?; + let seeds = [42u64, 123, 456]; + + for (name, alg) in &competitive_algorithms() { + for seed in &seeds { + match run_single_benchmark("G", &config, *alg, name, *seed, 1000, &data) { + Ok(r) => { + results_file.write_all(r.to_csv_row().as_bytes())?; + results_file.flush()?; + } + Err(e) => eprintln!(" ERROR: {}", e), + } + } + } + Ok(()) +} + +fn run_quick(results_file: &mut File) -> Result<()> { + println!("\n========================================"); + println!("QUICK: Sanity Check"); + println!("========================================\n"); + + let config = bimodal_ke_config(); + let data = data::read_pmetrics(config.data_path)?; + let algorithms = vec![ + ("NPAG", Algorithm::NPAG), + ("NPOD", Algorithm::NPOD), + ("NPSAH2", Algorithm::NPSAH2), + ]; + + for (name, alg) in &algorithms { + match run_single_benchmark("quick", &config, *alg, name, 42, 1000, &data) { + Ok(r) => { + results_file.write_all(r.to_csv_row().as_bytes())?; + results_file.flush()?; + } + Err(e) => eprintln!(" ERROR: {}", e), + } + } + Ok(()) +} + +// 
============================================================================
+// MAIN
+// ============================================================================
+
+fn main() -> Result<()> {
+    tracing_subscriber::fmt()
+        .with_env_filter(tracing_subscriber::EnvFilter::new("warn,diffsol=off"))
+        .with_target(false)
+        .init();
+
+    let args: Vec<String> = std::env::args().collect();
+    let category = args.get(1).map(|s| s.as_str()).unwrap_or("quick");
+
+    fs::create_dir_all("examples/paper_benchmarks/output")?;
+
+    // Timestamped results file so repeated runs never clobber each other.
+    let results_path = format!(
+        "examples/paper_benchmarks/results_{}.csv",
+        std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap()
+            .as_secs()
+    );
+    let mut results_file = File::create(&results_path)?;
+    writeln!(
+        results_file,
+        "experiment,dataset,algorithm,seed,cycles,time_secs,objf,n_spp,converged"
+    )?;
+
+    println!("╔══════════════════════════════════════════════════════════╗");
+    println!("║         PAPER BENCHMARKS: Algorithm Comparison           ║");
+    println!("╚══════════════════════════════════════════════════════════╝");
+    println!("\nResults will be saved to: {}", results_path);
+
+    match category {
+        "all" => {
+            run_category_a(&mut results_file)?;
+            run_category_d(&mut results_file)?;
+            run_category_e(&mut results_file)?;
+            run_category_f(&mut results_file)?;
+            run_category_g(&mut results_file)?;
+        }
+        "a" => run_category_a(&mut results_file)?,
+        "d" => run_category_d(&mut results_file)?,
+        "e" => run_category_e(&mut results_file)?,
+        "f" => run_category_f(&mut results_file)?,
+        "g" => run_category_g(&mut results_file)?,
+        // A plain wildcard suffices: `"quick" | _` makes the literal
+        // unreachable (clippy::wildcard_in_or_patterns).
+        _ => run_quick(&mut results_file)?,
+    }
+
+    println!("\n========================================");
+    println!("BENCHMARKS COMPLETE");
+    println!("Results saved to: {}", results_path);
+    println!("========================================");
+
+    Ok(())
+}
diff --git a/examples/two_eq_lag/main.rs b/examples/two_eq_lag/main.rs
new file mode 100644
index 000000000..d1f109df0
--- /dev/null
+++ 
b/examples/two_eq_lag/main.rs
@@ -0,0 +1,101 @@
+#![allow(dead_code)]
+#![allow(unused_variables)]
+#![allow(unused_imports)]
+
+use pmcore::prelude::*;
+
+fn main() {
+    // Two-compartment absorption model: x[0] = depot, x[1] = central.
+    let eq = ode! {
+        diffeq: |x, p, _t, dx, b, rateiv, _cov| {
+            // NOTE(review): removed a stray `fetch_cov!(cov, t,);` here — it
+            // referenced `cov`/`t` while the closure binds `_cov`/`_t`, so it
+            // could not compile, and it fetched no covariates anyway.
+            fetch_params!(p, ka, ke);
+            dx[0] = -ka * x[0] + b[1];
+            dx[1] = ka * x[0] - ke * x[1];
+        },
+        lag: |p, _t, _cov| {
+            fetch_params!(p, _ka, _ke, tlag, _v);
+            lag! {1=>tlag}
+        },
+        out: |x, p, _t, _cov, y| {
+            fetch_params!(p, _ka, _ke, _tlag, v);
+            y[1] = x[1] / v;
+        },
+    };
+    // let eq = Equation::new_analytical(
+    //     one_compartment_with_absorption,
+    //     |_p, _cov| {},
+    //     |p| {
+    //         fetch_params!(p, _ka, _ke, tlag, _v);
+    //         lag! {0=>tlag}
+    //     },
+    //     |_p, _t, _cov| fa! {},
+    //     |_p, _t, _cov, _x| {},
+    //     |x, p, _t, _cov, y| {
+    //         fetch_params!(p, _ka, _ke, _tlag, v);
+    //         y[0] = x[1] / v;
+    //     },
+    //     (2, 1),
+    // );
+    // let eq = equation::ODENet::new(
+    //     vec![
+    //         dmatrix![
+    //             -1.0,0.;
+    //             1.,0.
+    //         ],
+    //         dmatrix![
+    //             0.,0.;
+    //             0.,-1.
+ // ], + // dmatrix![ + // 0.0,0.0; + // 0.0,0.0 + // ], + // dmatrix![ + // 0.0,0.0; + // 0.0,0.0 + // ], + // ], + // vec![], + // vec![], + // vec![Lag::new(0, Op::Equal(P(2)))], + // vec![], + // vec![], + // vec![OutEq::new(0, Op::Div(X(1), P(3)))], + // (2, 1), + // ); + + let observations = ObservationSpec::new() + .add_channel(ObservationChannel::continuous(1, "cp")) + .with_assay_error_models( + AssayErrorModels::new() + .add( + 1, + AssayErrorModel::additive( + ErrorPoly::new(-0.00119, 0.44379, -0.45864, 0.16537), + 0.0, + ), + ) + .unwrap(), + ); + + let model = ModelDefinition::builder(eq) + .parameters( + ParameterSpace::new() + .add(ParameterSpec::bounded("ka", 0.1, 0.9)) + .add(ParameterSpec::bounded("ke", 0.001, 0.1)) + .add(ParameterSpec::bounded("tlag", 0.0, 4.0)) + .add(ParameterSpec::bounded("v", 30.0, 120.0)), + ) + .observations(observations) + .build() + .unwrap(); + + let data = data::read_pmetrics("examples/two_eq_lag/two_eq_lag.csv").unwrap(); + let mut result = EstimationProblem::builder(model, data) + .method(EstimationMethod::Nonparametric(NonparametricMethod::Npsah( + NpsahOptions::default(), + ))) + .run() + .unwrap(); + result.write_outputs().unwrap(); +} diff --git a/src/algorithms/mod.rs b/src/algorithms/mod.rs index 502774dfb..dc29ba6ea 100644 --- a/src/algorithms/mod.rs +++ b/src/algorithms/mod.rs @@ -10,8 +10,17 @@ use anyhow::Context; use anyhow::Result; use ndarray::parallel::prelude::{IntoParallelIterator, ParallelIterator}; use ndarray::{Array, ArrayBase, Dim, OwnedRepr}; +use nonparametric::nexus::NEXUS; use nonparametric::npag::*; +use nonparametric::npbo::NPBO; +use nonparametric::npcat::NPCAT; +use nonparametric::npcma::NPCMA; use nonparametric::npod::NPOD; +use nonparametric::npopt::NPOPT; +use nonparametric::nppso::NPPSO; +use nonparametric::npsah::NPSAH; +use nonparametric::npsah2::NPSAH2; +use nonparametric::npxo::NPXO; use nonparametric::postprob::POSTPROB; use pharmsol::prelude::{data::Data, simulator::Equation}; 
use pharmsol::{Predictions, Subject}; @@ -111,8 +120,26 @@ impl NonparametricAlgorithmInput { pub enum Algorithm { /// Non-Parametric Adaptive Grid NPAG, + /// Non-Parametric Bayesian Optimization + NPBO, + /// Non-Parametric Categorical + NPCAT, + /// Non-Parametric CMA-ES + NPCMA, /// Non-Parametric Optimal Design NPOD, + /// Non-Parametric Optimization + NPOPT, + /// Non-Parametric Particle Swarm Optimization + NPPSO, + /// Non-Parametric Simulated Annealing Hybrid + NPSAH, + /// Non-Parametric Simulated Annealing Hybrid v2 + NPSAH2, + /// Non-Parametric Cross-Over + NPXO, + /// NEXUS algorithm + NEXUS, /// Posterior Probability calculation POSTPROB, } @@ -120,7 +147,21 @@ pub enum Algorithm { impl Algorithm { /// Check if this is a non-parametric algorithm pub fn is_nonparametric(&self) -> bool { - matches!(self, Algorithm::NPAG | Algorithm::NPOD | Algorithm::POSTPROB) + matches!( + self, + Algorithm::NPAG + | Algorithm::NPBO + | Algorithm::NPCAT + | Algorithm::NPCMA + | Algorithm::NPOD + | Algorithm::NPOPT + | Algorithm::NPPSO + | Algorithm::NPSAH + | Algorithm::NPSAH2 + | Algorithm::NPXO + | Algorithm::NEXUS + | Algorithm::POSTPROB + ) } /// Check if this is a parametric algorithm @@ -417,10 +458,46 @@ pub(crate) fn dispatch_nonparametric_algorithm( let algorithm: Box> = NPAG::from_input(input)?; Ok(algorithm) } + NonparametricMethod::Npbo(_) => { + let algorithm: Box> = NPBO::from_input(input)?; + Ok(algorithm) + } + NonparametricMethod::Npcat(_) => { + let algorithm: Box> = NPCAT::from_input(input)?; + Ok(algorithm) + } + NonparametricMethod::Npcma(_) => { + let algorithm: Box> = NPCMA::from_input(input)?; + Ok(algorithm) + } NonparametricMethod::Npod(_) => { let algorithm: Box> = NPOD::from_input(input)?; Ok(algorithm) } + NonparametricMethod::Npopt(_) => { + let algorithm: Box> = NPOPT::from_input(input)?; + Ok(algorithm) + } + NonparametricMethod::Nppso(_) => { + let algorithm: Box> = NPPSO::from_input(input)?; + Ok(algorithm) + } + 
NonparametricMethod::Npxo(_) => { + let algorithm: Box> = NPXO::from_input(input)?; + Ok(algorithm) + } + NonparametricMethod::Npsah(_) => { + let algorithm: Box> = NPSAH::from_input(input)?; + Ok(algorithm) + } + NonparametricMethod::Npsah2(_) => { + let algorithm: Box> = NPSAH2::from_input(input)?; + Ok(algorithm) + } + NonparametricMethod::Nexus(_) => { + let algorithm: Box> = NEXUS::from_input(input)?; + Ok(algorithm) + } NonparametricMethod::Postprob(_) => { let algorithm: Box> = POSTPROB::from_input(input)?; Ok(algorithm) diff --git a/src/algorithms/nonparametric/mod.rs b/src/algorithms/nonparametric/mod.rs index db8459a86..b60076763 100644 --- a/src/algorithms/nonparametric/mod.rs +++ b/src/algorithms/nonparametric/mod.rs @@ -8,7 +8,8 @@ //! //! - [`NPAG`](npag): Non-Parametric Adaptive Grid //! - [`NPOD`](npod): Non-Parametric Optimal Design -//! - [`POSTPROB`](postprob): Posterior probability reweighting +//! - [`NPSAH`](npsah): Non-Parametric Simulated Annealing Hybrid +//! - And others... //! //! # Algorithm Trait //! @@ -17,13 +18,31 @@ //! and convergence evaluation. // Algorithm implementations +pub mod nexus; pub mod npag; +pub mod npbo; +pub mod npcat; +pub mod npcma; pub mod npod; +pub mod npopt; +pub mod nppso; +pub mod npsah; +pub mod npsah2; +pub mod npxo; pub mod postprob; // Re-export algorithm structs +pub use nexus::NEXUS; pub use npag::NPAG; +pub use npbo::NPBO; +pub use npcat::NPCAT; +pub use npcma::NPCMA; pub use npod::NPOD; +pub use npopt::NPOPT; +pub use nppso::NPPSO; +pub use npsah::NPSAH; +pub use npsah2::NPSAH2; +pub use npxo::NPXO; pub use postprob::POSTPROB; // Re-export the NP algorithm trait from parent diff --git a/src/algorithms/nonparametric/nexus.rs b/src/algorithms/nonparametric/nexus.rs new file mode 100644 index 000000000..b7cb9fe20 --- /dev/null +++ b/src/algorithms/nonparametric/nexus.rs @@ -0,0 +1,1983 @@ +//! # NEXUS: Non-parametric EXploration via Unified Subject-driven Search +//! +//! 
A state-of-the-art hybrid algorithm combining the best of multiple approaches +//! with novel innovations in cross-entropy optimization and adaptive exploration. +//! +//! ## Key Innovations +//! +//! NEXUS combines: +//! 1. **Cross-Entropy Method (CE)** with GMM for learning the distribution of good solutions +//! 2. **Subject-guided exploration** for targeted mode discovery +//! 3. **Adaptive simulated annealing** with temperature feedback +//! 4. **D-optimal refinement** with hierarchical iteration allocation +//! 5. **Multi-scale Sobol global verification** for convergence certificates +//! +//! ## The Cross-Entropy Insight +//! +//! Unlike SA which samples blindly, CE maintains a Gaussian Mixture Model (GMM) +//! that learns where good solutions tend to be. Each cycle: +//! 1. Sample candidates from GMM +//! 2. Evaluate D-criterion for all candidates +//! 3. Select elite points (top 10%) +//! 4. Update GMM to fit elite distribution +//! +//! This converges faster than SA because it learns problem structure. +//! +//! ## The Subject-Guided Insight +//! +//! The D-criterion D(θ*) = Σᵢ P(yᵢ|θ*) / P(yᵢ|G) - N is large when: +//! - P(yᵢ|θ*) is high: θ* explains subject i well +//! - P(yᵢ|G) is low: current mixture explains subject i poorly +//! +//! **Insight**: Find parameters for poorly-fit subjects, targeting modes the +//! mixture is missing. +//! +//! ## Algorithm Phases +//! +//! ### Phase 1: Warmup +//! - Stratified Sobol initialization for space-filling coverage +//! - Adaptive grid expansion to build parameter space scaffold +//! - GMM initialization from initial support points +//! +//! ### Phase 2: Hybrid Expansion +//! - **Cross-entropy sampling** from adaptive GMM +//! - **Subject-guided search** from poorly-fit subjects +//! - **Adaptive SA** with feedback-controlled temperature +//! - **D-optimal refinement** with hierarchical iteration counts +//! - **Elite preservation** to prevent loss of good solutions +//! +//! 
### Phase 3: Convergence Verification +//! - Multi-scale Sobol global optimality check (64 → 256 → 1024 samples) +//! - Final polishing of all support points +//! - Convergence certificate when all scales pass +//! +//! ## Convergence Guarantees +//! +//! NEXUS provides multiple convergence criteria: +//! 1. Objective function stability (THETA_G) +//! 2. Weight stability (THETA_W) +//! 3. P(Y|L) criterion (THETA_F) +//! 4. Multi-scale global D-criterion < threshold + +use crate::algorithms::{ + NativeNonparametricConfig, NonparametricAlgorithmInput, Status, StopReason, +}; +use crate::estimation::nonparametric::adaptative_grid; +use crate::estimation::nonparametric::ipm::burke; +use crate::estimation::nonparametric::qr; +use crate::estimation::nonparametric::sample_space_for_parameters; +use crate::estimation::nonparametric::{ + calculate_psi, CycleLog, NPCycle, NonparametricWorkspace, Psi, Theta, Weights, +}; +use crate::prelude::algorithms::Algorithms; + +use anyhow::{bail, Result}; +use ndarray::parallel::prelude::{IntoParallelRefMutIterator, ParallelIterator}; +use ndarray::{Array1, Axis}; +use pharmsol::prelude::AssayErrorModel; +use pharmsol::prelude::{ + data::{AssayErrorModels, Data}, + simulator::Equation, +}; +use rand::prelude::*; +use sobol_burley::sample; + +use argmin::{ + core::{CostFunction, Error, Executor}, + solver::neldermead::NelderMead, +}; + +// ============================================================================ +// ALGORITHM CONSTANTS +// ============================================================================ + +/// Objective function convergence threshold +const THETA_G: f64 = 1e-4; +/// P(Y|L) convergence criterion +const THETA_F: f64 = 1e-2; +/// Minimum distance between support points (normalized) +const THETA_D: f64 = 1e-4; +/// Weight stability threshold +const THETA_W: f64 = 1e-3; +/// Grid spacing for adaptive expansion +const INITIAL_EPS: f64 = 0.2; +/// Minimum grid spacing before reset +const MIN_EPS: f64 = 1e-4; + 
+/// Number of warm-up cycles using grid expansion +const WARMUP_CYCLES: usize = 5; + +// === Cross-Entropy Method Parameters === +/// Number of samples to generate from GMM per cycle +const CE_SAMPLE_SIZE: usize = 50; +/// Fraction of samples considered "elite" (top performers) +const CE_ELITE_FRACTION: f64 = 0.10; +/// Number of GMM components for multimodal handling +const CE_GMM_COMPONENTS: usize = 3; +/// Minimum variance for GMM (prevents collapse) +const CE_MIN_VARIANCE: f64 = 1e-6; +/// Smoothing factor for GMM updates (0 = no smoothing, 1 = no update) +const CE_SMOOTHING: f64 = 0.3; +/// Decay rate for sample size (reduces over cycles) +const CE_SAMPLE_DECAY: f64 = 0.95; + +// === Subject-guided parameters === +/// Fraction of subjects considered "poorly fit" (bottom percentile) +const RESIDUAL_SUBJECT_FRACTION: f64 = 0.3; +/// Minimum number of residual subjects to process +const MIN_RESIDUAL_SUBJECTS: usize = 3; +/// Maximum Nelder-Mead iterations for subject MAP estimation +const SUBJECT_MAP_MAX_ITERS: u64 = 30; + +// === D-optimal refinement parameters === +/// Maximum Nelder-Mead iterations for high-weight D-optimal refinement +const DOPT_HIGH_WEIGHT_ITERS: u64 = 100; +/// Maximum Nelder-Mead iterations for medium-weight D-optimal refinement +const DOPT_MED_WEIGHT_ITERS: u64 = 40; +/// Maximum Nelder-Mead iterations for low-weight D-optimal refinement +const DOPT_LOW_WEIGHT_ITERS: u64 = 15; +/// Weight threshold for "high importance" (fraction of max weight) +const HIGH_WEIGHT_THRESHOLD: f64 = 0.10; +/// Weight threshold for "medium importance" +const MED_WEIGHT_THRESHOLD: f64 = 0.01; + +// === Adaptive Simulated Annealing Parameters === +/// Initial temperature for SA +const INITIAL_TEMPERATURE: f64 = 5.0; +/// Base cooling rate (adapted based on acceptance) +const BASE_COOLING_RATE: f64 = 0.85; +/// Number of SA samples per injection cycle +const SA_INJECT_COUNT: usize = 50; +/// Minimum temperature before SA stops +const MIN_TEMPERATURE: f64 = 
0.01; +/// Target acceptance ratio for adaptive temperature +const TARGET_ACCEPTANCE_RATIO: f64 = 0.25; +/// Reheat factor when acceptance is too low +const REHEAT_FACTOR: f64 = 1.2; + +// === Elite Preservation === +/// Number of elite points to preserve across cycles +const ELITE_COUNT: usize = 5; +/// Maximum age of elite point before removal +const ELITE_MAX_AGE: usize = 20; + +// === Multi-Scale Global Optimality === +/// Sobol samples at each scale level +const GLOBAL_CHECK_SCALES: [usize; 3] = [64, 256, 1024]; +/// D-criterion threshold for global optimality +const GLOBAL_D_THRESHOLD: f64 = 0.005; +/// Seed for reproducible Sobol sequence +const SOBOL_SEED: u32 = 0; + +/// Consecutive stable cycles needed for convergence +const CONVERGENCE_WINDOW: usize = 3; + +/// Boundary margin to prevent numerical issues (fraction of range) +const BOUNDARY_MARGIN: f64 = 0.005; + +// ============================================================================ +// GAUSSIAN MIXTURE MODEL FOR CROSS-ENTROPY +// ============================================================================ + +/// A single Gaussian component in the mixture +#[derive(Debug, Clone)] +struct GaussianComponent { + /// Mean vector (center of component) + mean: Vec, + /// Diagonal covariance (variance per dimension) + variance: Vec, + /// Mixture weight (probability of selecting this component) + weight: f64, +} + +impl GaussianComponent { + #[allow(dead_code)] + fn new(_n_dims: usize, ranges: &[(f64, f64)]) -> Self { + // Initialize at center with variance = (range/4)^2 + let mean: Vec = ranges.iter().map(|(lo, hi)| (lo + hi) / 2.0).collect(); + let variance: Vec = ranges + .iter() + .map(|(lo, hi)| ((hi - lo) / 4.0).powi(2)) + .collect(); + Self { + mean, + variance, + weight: 1.0, + } + } + + /// Sample a point from this Gaussian + fn sample(&self, rng: &mut StdRng, ranges: &[(f64, f64)]) -> Vec { + self.mean + .iter() + .zip(self.variance.iter()) + .zip(ranges.iter()) + .map(|((&m, &v), (lo, hi))| 
{ + let std = v.sqrt(); + let margin = (hi - lo) * BOUNDARY_MARGIN; + // Sample from N(m, std^2) and clamp to bounds + let sample = m + std * sample_standard_normal(rng); + sample.clamp(lo + margin, hi - margin) + }) + .collect() + } + + /// Compute log probability density of a point under this Gaussian + fn log_pdf(&self, point: &[f64]) -> f64 { + let n = point.len() as f64; + let mut log_p = -0.5 * n * (2.0 * std::f64::consts::PI).ln(); + + for ((&x, &m), &v) in point.iter().zip(self.mean.iter()).zip(self.variance.iter()) { + let safe_v = v.max(CE_MIN_VARIANCE); + log_p -= 0.5 * safe_v.ln(); + log_p -= 0.5 * (x - m).powi(2) / safe_v; + } + + log_p + } +} + +/// Gaussian Mixture Model for Cross-Entropy sampling +#[derive(Debug, Clone)] +struct GMM { + components: Vec, + n_dims: usize, +} + +impl GMM { + /// Create a new GMM with k components initialized across the parameter space + fn new(n_components: usize, n_dims: usize, ranges: &[(f64, f64)], rng: &mut StdRng) -> Self { + let components: Vec = (0..n_components) + .map(|i| { + // Spread initial means across the space + let mean: Vec = ranges + .iter() + .map(|(lo, hi)| { + let frac = (i as f64 + 0.5) / n_components as f64; + lo + frac * (hi - lo) + rng.random_range(-0.1..0.1) * (hi - lo) + }) + .collect(); + let variance: Vec = ranges + .iter() + .map(|(lo, hi)| ((hi - lo) / (n_components as f64 + 1.0)).powi(2)) + .collect(); + GaussianComponent { + mean, + variance, + weight: 1.0 / n_components as f64, + } + }) + .collect(); + + Self { components, n_dims } + } + + /// Initialize GMM from existing support points and weights + fn from_theta( + theta: &Theta, + weights: &Weights, + ranges: &[(f64, f64)], + _rng: &mut StdRng, + ) -> Self { + let n_dims = ranges.len(); + let n_spp = theta.nspp().min(weights.len()); + + if n_spp == 0 { + // Empty theta - use default initialization + let mut rng = StdRng::seed_from_u64(42); + return Self::new(CE_GMM_COMPONENTS, n_dims, ranges, &mut rng); + } + + // Use K-means-like 
initialization: pick top-weighted points as centers + let mut indexed: Vec<(usize, f64)> = weights.iter().enumerate().take(n_spp).collect(); + indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + let n_components = CE_GMM_COMPONENTS.min(n_spp); + let components: Vec = indexed + .iter() + .take(n_components) + .map(|(idx, w)| { + let mean: Vec = theta.matrix().row(*idx).iter().cloned().collect(); + let variance: Vec = ranges + .iter() + .map(|(lo, hi)| ((hi - lo) / 6.0).powi(2)) // Start with moderate variance + .collect(); + GaussianComponent { + mean, + variance, + weight: *w, + } + }) + .collect(); + + // Normalize weights + let total_weight: f64 = components.iter().map(|c| c.weight).sum(); + let mut gmm = Self { components, n_dims }; + if total_weight > 0.0 { + for c in &mut gmm.components { + c.weight /= total_weight; + } + } + + gmm + } + + /// Sample n points from the GMM + fn sample(&self, n: usize, rng: &mut StdRng, ranges: &[(f64, f64)]) -> Vec> { + let mut samples = Vec::with_capacity(n); + + for _ in 0..n { + // Select component based on weights + let u: f64 = rng.random(); + let mut cumsum = 0.0; + let mut selected = 0; + for (i, c) in self.components.iter().enumerate() { + cumsum += c.weight; + if u <= cumsum { + selected = i; + break; + } + } + + samples.push(self.components[selected].sample(rng, ranges)); + } + + samples + } + + /// Update GMM from elite points using weighted MLE + fn update_from_elite(&mut self, elite_points: &[(Vec, f64)], ranges: &[(f64, f64)]) { + if elite_points.is_empty() { + return; + } + + // Soft assignment of elite points to components + let n_elite = elite_points.len(); + let n_components = self.components.len(); + + // E-step: compute responsibilities + let mut responsibilities: Vec> = vec![vec![0.0; n_components]; n_elite]; + + for (i, (point, _)) in elite_points.iter().enumerate() { + let mut log_probs: Vec = self + .components + .iter() + .map(|c| c.weight.ln() + c.log_pdf(point)) + .collect(); + + // 
Log-sum-exp for numerical stability + let max_log = log_probs.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + let log_sum: f64 = log_probs + .iter() + .map(|&lp| (lp - max_log).exp()) + .sum::() + .ln() + + max_log; + + for (j, lp) in log_probs.iter_mut().enumerate() { + responsibilities[i][j] = (*lp - log_sum).exp(); + } + } + + // M-step: update parameters with smoothing + for (k, component) in self.components.iter_mut().enumerate() { + let mut total_resp = 0.0; + let mut new_mean = vec![0.0; self.n_dims]; + let mut new_var = vec![0.0; self.n_dims]; + + // Weight responsibilities by D-criterion values + for (i, (point, d_val)) in elite_points.iter().enumerate() { + let resp = responsibilities[i][k] * d_val.max(0.0); + total_resp += resp; + for (j, &x) in point.iter().enumerate() { + new_mean[j] += resp * x; + } + } + + if total_resp > 1e-10 { + for j in 0..self.n_dims { + new_mean[j] /= total_resp; + } + + // Compute variance + for (i, (point, d_val)) in elite_points.iter().enumerate() { + let resp = responsibilities[i][k] * d_val.max(0.0); + for (j, &x) in point.iter().enumerate() { + new_var[j] += resp * (x - new_mean[j]).powi(2); + } + } + + for j in 0..self.n_dims { + new_var[j] = (new_var[j] / total_resp).max(CE_MIN_VARIANCE); + // Bound variance to reasonable range + let (lo, hi) = ranges[j]; + let max_var = ((hi - lo) / 2.0).powi(2); + new_var[j] = new_var[j].min(max_var); + } + + // Apply smoothing to prevent sudden changes + for j in 0..self.n_dims { + component.mean[j] = + CE_SMOOTHING * component.mean[j] + (1.0 - CE_SMOOTHING) * new_mean[j]; + component.variance[j] = + CE_SMOOTHING * component.variance[j] + (1.0 - CE_SMOOTHING) * new_var[j]; + } + + // Update weight + let new_weight = + total_resp / elite_points.iter().map(|(_, d)| d.max(0.0)).sum::(); + component.weight = + CE_SMOOTHING * component.weight + (1.0 - CE_SMOOTHING) * new_weight; + } + } + + // Normalize weights + let total_weight: f64 = self.components.iter().map(|c| c.weight).sum(); 
+ if total_weight > 0.0 { + for c in &mut self.components { + c.weight /= total_weight; + } + } + } +} + +/// Sample from standard normal distribution using Box-Muller +fn sample_standard_normal(rng: &mut StdRng) -> f64 { + let u1: f64 = rng.random(); + let u2: f64 = rng.random(); + (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos() +} + +// ============================================================================ +// ELITE POINT TRACKING +// ============================================================================ + +/// An elite point with metadata for preservation across cycles +#[derive(Debug, Clone)] +struct ElitePoint { + params: Vec, + d_value: f64, + age: usize, +} + +// ============================================================================ +// CONVERGENCE STATE +// ============================================================================ + +/// Algorithm phase +#[derive(Debug, Clone, PartialEq)] +pub enum Phase { + /// Initial grid-based coverage + Warmup, + /// Subject-guided expansion + D-optimal refinement + Expansion, + /// Final convergence verification + Convergence, +} + +impl std::fmt::Display for Phase { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Phase::Warmup => write!(f, "Warmup"), + Phase::Expansion => write!(f, "Expansion"), + Phase::Convergence => write!(f, "Convergence"), + } + } +} + +// ============================================================================ +// NEXUS STRUCT +// ============================================================================ + +/// NEXUS: Non-parametric EXploration via Unified Subject-driven Search +#[derive(Debug)] +pub struct NEXUS { + /// The pharmacometric equation/model + equation: E, + /// Parameter ranges for each dimension + ranges: Vec<(f64, f64)>, + /// Probability matrix: P(y_i | θ_j) + psi: Psi, + /// Support points (parameter values) + theta: Theta, + /// Weights from IPM before condensation + lambda: Weights, + /// 
Final weights after condensation + w: Weights, + /// Previous weights for stability check + w_prev: Weights, + /// Current grid spacing (for warm-up phase) + eps: f64, + /// Previous objective function value + last_objf: f64, + /// Current objective function value + objf: f64, + /// Best objective function seen + best_objf: f64, + /// P(Y|L) values for convergence checking + f0: f64, + f1: f64, + /// Current cycle number + cycle: usize, + /// Step sizes for error model optimization + gamma_delta: Vec, + /// Error models for observations + error_models: AssayErrorModels, + /// Algorithm status + status: Status, + /// Cycle log for tracking progress + cycle_log: CycleLog, + /// Subject data + data: Data, + /// Unified runtime/model-derived configuration + config: NativeNonparametricConfig, + + // NEXUS-specific fields + /// Current algorithm phase + phase: Phase, + /// History of objective function values + objf_history: Vec, + /// Sobol sequence index for reproducible sampling + sobol_index: u32, + /// Maximum D found in last global check + last_global_d_max: f64, + /// Count of stable cycles for convergence + stability_counter: usize, + /// Current global check scale level + current_global_scale: usize, + + // Cross-Entropy fields + /// Gaussian Mixture Model for CE sampling + gmm: Option, + /// Current CE sample size (decays over cycles) + ce_sample_size: f64, + + // Adaptive SA fields + /// SA temperature for global exploration + temperature: f64, + /// Effective cooling rate (adaptive) + cooling_rate: f64, + /// SA accepted count this cycle + sa_accepted: usize, + /// SA proposed count this cycle + sa_proposed: usize, + + // Elite preservation + /// Elite points preserved across cycles + elite_points: Vec, + + /// Random number generator + rng: StdRng, +} + +// ============================================================================ +// ALGORITHMS TRAIT IMPLEMENTATION +// ============================================================================ + +impl 
Algorithms for NEXUS { + fn equation(&self) -> &E { + &self.equation + } + + fn into_workspace(&self) -> Result> { + NonparametricWorkspace::new( + self.equation.clone(), + self.data.clone(), + self.theta.clone(), + self.psi.clone(), + self.w.clone(), + -2. * self.objf, + self.cycle, + self.status.clone(), + self.config.run_configuration.clone(), + self.cycle_log.clone(), + ) + } + + fn error_models(&self) -> &AssayErrorModels { + &self.error_models + } + + fn data(&self) -> &Data { + &self.data + } + + fn get_prior(&self) -> Theta { + sample_space_for_parameters(&self.config.parameter_space, &self.config.prior).unwrap() + } + + fn likelihood(&self) -> f64 { + self.objf + } + + fn increment_cycle(&mut self) -> usize { + self.cycle += 1; + + // Phase transition: Warmup → Expansion + if self.cycle > WARMUP_CYCLES && self.phase == Phase::Warmup { + self.phase = Phase::Expansion; + + // Initialize GMM from current theta + self.gmm = Some(GMM::from_theta( + &self.theta, + &self.w, + &self.ranges, + &mut self.rng, + )); + + tracing::info!( + "NEXUS: Warmup → Expansion (cycle {}, {} support points, GMM initialized)", + self.cycle, + self.theta.nspp() + ); + } + + // Adaptive temperature adjustment based on acceptance ratio + self.adapt_temperature(); + + // Decay CE sample size + self.ce_sample_size = (self.ce_sample_size * CE_SAMPLE_DECAY).max(10.0); + + // Track best objective + if self.objf > self.best_objf + THETA_G { + self.best_objf = self.objf; + } + + self.cycle + } + + fn cycle(&self) -> usize { + self.cycle + } + + fn set_theta(&mut self, theta: Theta) { + self.theta = theta; + } + + fn theta(&self) -> &Theta { + &self.theta + } + + fn psi(&self) -> &Psi { + &self.psi + } + + fn set_status(&mut self, status: Status) { + self.status = status; + } + + fn status(&self) -> &Status { + &self.status + } + + fn evaluation(&mut self) -> Result { + tracing::info!("Objective function = {:.4}", -2.0 * self.objf); + tracing::debug!( + "Support points: {} | Phase: {} | T: 
{:.3} | CE samples: {:.0}", + self.theta.nspp(), + self.phase, + self.temperature, + self.ce_sample_size + ); + + // Log error models + self.error_models.iter().for_each(|(outeq, em)| { + if AssayErrorModel::None != *em { + tracing::debug!( + "Error model outeq {}: {:.4}", + outeq, + em.factor().unwrap_or_default() + ); + } + }); + + // Track history + self.objf_history.push(self.objf); + + // Warn on decrease + if self.last_objf > self.objf + 1e-4 { + tracing::warn!( + "Objective function decreased: {:.4} → {:.4}", + -2.0 * self.last_objf, + -2.0 * self.objf + ); + } + + // Check convergence + let converged = self.check_convergence()?; + if converged { + tracing::info!("NEXUS converged after {} cycles", self.cycle); + self.set_status(Status::Stop(StopReason::Converged)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // NPAG-style eps convergence (during warmup and expansion) + if self.phase != Phase::Convergence { + if (self.last_objf - self.objf).abs() <= THETA_G && self.eps > MIN_EPS { + self.eps /= 2.0; + tracing::debug!("Halving eps to {:.6}", self.eps); + + if self.eps <= MIN_EPS { + let pyl = self.psi.matrix() * self.w.weights(); + self.f1 = pyl.iter().map(|x| x.ln()).sum(); + if (self.f1 - self.f0).abs() <= THETA_F { + // Transition to convergence verification + self.phase = Phase::Convergence; + tracing::info!( + "NEXUS: Expansion → Convergence (cycle {}, verifying global optimality)", + self.cycle + ); + } else { + self.f0 = self.f1; + self.eps = INITIAL_EPS; + } + } + } + } + + // Check maximum cycles + if self.cycle >= self.config.max_cycles { + tracing::warn!("Maximum cycles reached"); + self.set_status(Status::Stop(StopReason::MaxCycles)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // Check for stop file + if std::path::Path::new("stop").exists() { + tracing::warn!("Stop file detected"); + self.set_status(Status::Stop(StopReason::Stopped)); + self.log_cycle_state(); + return Ok(self.status().clone()); 
+ } + + self.set_status(Status::Continue); + self.log_cycle_state(); + Ok(self.status().clone()) + } + + fn estimation(&mut self) -> Result<()> { + self.psi = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &self.error_models, + self.cycle == 1 && self.config.progress, + )?; + + if let Err(err) = self.validate_psi() { + bail!(err); + } + + (self.lambda, _) = match burke(&self.psi) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM during estimation: {:?}", err); + } + }; + Ok(()) + } + + fn condensation(&mut self) -> Result<()> { + // Store previous weights for stability checking + self.w_prev = self.w.clone(); + + // Lambda filter: remove low-weight points + let max_lambda = self + .lambda + .iter() + .fold(f64::NEG_INFINITY, |acc, x| x.max(acc)); + + let threshold = max_lambda / 10000.0; + let keep: Vec = self + .lambda + .iter() + .enumerate() + .filter(|(_, lam)| *lam > threshold) + .map(|(i, _)| i) + .collect(); + + let dropped = self.psi.matrix().ncols() - keep.len(); + if dropped > 0 { + tracing::debug!("Lambda filter dropped {} point(s)", dropped); + } + + self.theta.filter_indices(keep.as_slice()); + self.psi.filter_column_indices(keep.as_slice()); + + // QR rank-revealing factorization + let (r, perm) = qr::qrd(&self.psi)?; + let keep_n = self.psi.matrix().ncols().min(self.psi.matrix().nrows()); + let keep: Vec = (0..keep_n) + .filter(|&i| { + let test = r.col(i).norm_l2(); + let r_diag = r.get(i, i); + (r_diag / test).abs() >= 1e-8 + }) + .map(|i| *perm.get(i).unwrap()) + .collect(); + + let dropped = self.psi.matrix().ncols() - keep.len(); + if dropped > 0 { + tracing::debug!("QR dropped {} point(s)", dropped); + } + + self.theta.filter_indices(keep.as_slice()); + self.psi.filter_column_indices(keep.as_slice()); + + self.validate_psi()?; + + (self.lambda, self.objf) = match burke(&self.psi) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + return Err(anyhow::anyhow!( + "Error in IPM during 
condensation: {:?}", + err + )); + } + }; + self.w = self.lambda.clone(); + + // Update elite points after condensation + self.update_elite_points()?; + + Ok(()) + } + + fn optimizations(&mut self) -> Result<()> { + // Standard error model optimization + self.error_models + .clone() + .iter_mut() + .filter_map(|(outeq, em)| { + if em.optimize() { + Some((outeq, em)) + } else { + None + } + }) + .try_for_each(|(outeq, em)| -> Result<()> { + let gamma_up = em.factor()? * (1.0 + self.gamma_delta[outeq]); + let gamma_down = em.factor()? / (1.0 + self.gamma_delta[outeq]); + + let mut error_model_up = self.error_models.clone(); + error_model_up.set_factor(outeq, gamma_up)?; + + let mut error_model_down = self.error_models.clone(); + error_model_down.set_factor(outeq, gamma_down)?; + + let psi_up = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &error_model_up, + false, + )?; + let psi_down = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &error_model_down, + false, + )?; + + let (lambda_up, objf_up) = match burke(&psi_up) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => bail!("Error in IPM during optim: {:?}", err), + }; + let (lambda_down, objf_down) = match burke(&psi_down) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => bail!("Error in IPM during optim: {:?}", err), + }; + + if objf_up > self.objf { + self.error_models.set_factor(outeq, gamma_up)?; + self.objf = objf_up; + self.gamma_delta[outeq] *= 4.0; + self.lambda = lambda_up; + self.psi = psi_up; + } + if objf_down > self.objf { + self.error_models.set_factor(outeq, gamma_down)?; + self.objf = objf_down; + self.gamma_delta[outeq] *= 4.0; + self.lambda = lambda_down; + self.psi = psi_down; + } + self.gamma_delta[outeq] *= 0.5; + if self.gamma_delta[outeq] <= 0.01 { + self.gamma_delta[outeq] = 0.1; + } + Ok(()) + })?; + + Ok(()) + } + + fn expansion(&mut self) -> Result<()> { + match self.phase { + Phase::Warmup => { + // Use adaptive grid expansion for initial 
coverage + self.grid_expansion()?; + } + Phase::Expansion => { + // Hybrid expansion: combining all strategies + let initial_spp = self.theta.nspp(); + + // Step 1: D-optimal refinement FIRST (like NPSAH) + self.d_optimal_refinement()?; + let after_dopt = self.theta.nspp(); + + // Step 2: Cross-Entropy sampling from GMM (new!) + self.cross_entropy_expansion()?; + let after_ce = self.theta.nspp(); + + // Step 3: Sparse grid expansion (reduced rate in hybrid phase) + self.sparse_grid_expansion()?; + let after_grid = self.theta.nspp(); + + // Step 4: Adaptive SA injection for global exploration + if self.temperature > MIN_TEMPERATURE { + self.sa_injection()?; + } + let after_sa = self.theta.nspp(); + + // Step 5: Subject-guided exploration + self.subject_guided_expansion()?; + let after_subject = self.theta.nspp(); + + // Step 6: Re-inject elite points + self.inject_elite_points()?; + let after_elite = self.theta.nspp(); + + tracing::debug!( + "Expansion: {} → {} (D-opt) → {} (CE) → {} (grid) → {} (SA) → {} (subject) → {} (elite)", + initial_spp, after_dopt, after_ce, after_grid, after_sa, after_subject, after_elite + ); + } + Phase::Convergence => { + // Multi-scale global verification with injection + self.multi_scale_global_check()?; + } + } + + Ok(()) + } + + fn log_cycle_state(&mut self) { + let state = NPCycle::new( + self.cycle, + -2. 
* self.objf, + self.error_models.clone(), + self.theta.clone(), + self.theta.nspp(), + (self.last_objf - self.objf).abs(), + self.status.clone(), + ); + self.cycle_log.push(state); + self.last_objf = self.objf; + } +} + +// ============================================================================ +// NEXUS-SPECIFIC METHODS +// ============================================================================ + +impl NEXUS { + pub(crate) fn from_input(input: NonparametricAlgorithmInput) -> Result> { + let config = input.native_config()?; + let seed = config.prior.seed().unwrap_or(42); + let ranges = config.ranges.clone(); + let error_models = input.error_models().clone(); + + Ok(Box::new(Self { + equation: input.equation, + ranges: ranges.clone(), + psi: Psi::new(), + theta: Theta::new(), + lambda: Weights::default(), + w: Weights::default(), + w_prev: Weights::default(), + eps: INITIAL_EPS, + last_objf: -1e30, + objf: f64::NEG_INFINITY, + best_objf: f64::NEG_INFINITY, + f0: -1e30, + f1: f64::default(), + cycle: 0, + gamma_delta: vec![0.1; error_models.len()], + error_models, + status: Status::Continue, + cycle_log: CycleLog::new(), + data: input.data, + config, + phase: Phase::Warmup, + objf_history: Vec::with_capacity(100), + sobol_index: seed as u32, + last_global_d_max: f64::INFINITY, + stability_counter: 0, + current_global_scale: 0, + gmm: None, + ce_sample_size: CE_SAMPLE_SIZE as f64, + temperature: INITIAL_TEMPERATURE, + cooling_rate: BASE_COOLING_RATE, + sa_accepted: 0, + sa_proposed: 0, + elite_points: Vec::with_capacity(ELITE_COUNT * 2), + rng: StdRng::seed_from_u64(seed as u64), + })) + } + + /// Compute P(Y|G) = Psi * w for all subjects + fn compute_pyl(&self) -> Array1 { + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + psi.dot(&w) + } + + /// Compute D-criterion for a candidate point + fn compute_d(&self, point: &[f64], pyl: &Array1) -> Result { + let theta_single = 
ndarray::Array1::from(point.to_vec()).insert_axis(Axis(0)); + + let psi_single = pharmsol::prelude::simulator::log_likelihood_matrix( + &self.equation, + &self.data, + &theta_single, + &self.error_models, + false, + )? + .mapv(f64::exp); + + let nsub = psi_single.nrows() as f64; + let mut d_sum = -nsub; + + for (p_i, pyl_i) in psi_single.iter().zip(pyl.iter()) { + if *pyl_i > 0.0 { + d_sum += p_i / pyl_i; + } + } + + Ok(d_sum) + } + + /// Check multi-criterion convergence + fn check_convergence(&mut self) -> Result { + if self.objf_history.len() < CONVERGENCE_WINDOW { + return Ok(false); + } + + // Criterion 1: Objective function stability + let recent: Vec = self + .objf_history + .iter() + .rev() + .take(CONVERGENCE_WINDOW) + .cloned() + .collect(); + + let objf_stable = recent.windows(2).all(|w| (w[0] - w[1]).abs() < THETA_G); + + if !objf_stable { + self.stability_counter = 0; + return Ok(false); + } + + // Criterion 2: Weight stability + if !self.weights_stable() { + self.stability_counter = 0; + return Ok(false); + } + + self.stability_counter += 1; + + // Criterion 3: Multi-scale global optimality (only in Convergence phase) + if self.phase == Phase::Convergence && self.stability_counter >= CONVERGENCE_WINDOW { + // Progressive multi-scale check + if self.current_global_scale < GLOBAL_CHECK_SCALES.len() { + let n_samples = GLOBAL_CHECK_SCALES[self.current_global_scale]; + let pyl = self.compute_pyl(); + let max_d = self.sobol_global_check_n(&pyl, n_samples)?; + + if max_d > GLOBAL_D_THRESHOLD { + tracing::debug!( + "Global check scale {} failed: max_D = {:.4} > {:.4}", + self.current_global_scale, + max_d, + GLOBAL_D_THRESHOLD + ); + // Reset to expansion phase if we fail + self.phase = Phase::Expansion; + self.stability_counter = 0; + self.current_global_scale = 0; + return Ok(false); + } + + tracing::info!( + "Global check scale {} ({} samples) passed: max_D = {:.4}", + self.current_global_scale, + n_samples, + max_d + ); + self.current_global_scale += 1; 
+ + // Not converged until all scales pass + if self.current_global_scale < GLOBAL_CHECK_SCALES.len() { + return Ok(false); + } + } + + // All scales passed! + tracing::info!("All global optimality scales passed - convergence verified"); + return Ok(true); + } + + Ok(false) + } + + /// Check if weight distribution has been stable + fn weights_stable(&self) -> bool { + if self.w.len() != self.w_prev.len() || self.w.len() == 0 { + return false; + } + + let max_change = self + .w + .iter() + .zip(self.w_prev.iter()) + .map(|(w_new, w_old)| { + if w_new > 1e-10 { + ((w_new - w_old) / w_new).abs() + } else { + 0.0 + } + }) + .fold(0.0_f64, |a, b| a.max(b)); + + max_change < THETA_W + } + + // ════════════════════════════════════════════════════════════════════ + // PHASE 1: GRID EXPANSION (Warmup) + // ════════════════════════════════════════════════════════════════════ + + /// Adaptive grid expansion for initial coverage + fn grid_expansion(&mut self) -> Result<()> { + adaptative_grid(&mut self.theta, self.eps, &self.ranges, THETA_D)?; + Ok(()) + } + + /// Sparse grid expansion (reduced epsilon) + fn sparse_grid_expansion(&mut self) -> Result<()> { + let sparse_eps = self.eps * 0.5; + adaptative_grid(&mut self.theta, sparse_eps, &self.ranges, THETA_D * 2.0)?; + Ok(()) + } + + // ════════════════════════════════════════════════════════════════════ + // CROSS-ENTROPY EXPANSION (Novel) + // ════════════════════════════════════════════════════════════════════ + + /// Cross-Entropy Method expansion using GMM + /// + /// Unlike SA which samples blindly, CE learns the distribution of good solutions. 
+ /// Each cycle: sample from GMM → evaluate → select elite → update GMM + fn cross_entropy_expansion(&mut self) -> Result<()> { + // Initialize GMM if not present + if self.gmm.is_none() { + self.gmm = Some(GMM::from_theta( + &self.theta, + &self.w, + &self.ranges, + &mut self.rng, + )); + } + + let gmm = self.gmm.as_ref().unwrap(); + let pyl = self.compute_pyl(); + let n_samples = self.ce_sample_size.ceil() as usize; + + // Sample candidates from GMM + let candidates = gmm.sample(n_samples, &mut self.rng, &self.ranges); + + // Evaluate D-criterion for all candidates + let mut evaluated: Vec<(Vec, f64)> = candidates + .into_iter() + .filter_map(|point| self.compute_d(&point, &pyl).ok().map(|d| (point, d))) + .collect(); + + // Sort by D descending (best first) + evaluated.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + // Select elite (top fraction) + let n_elite = (evaluated.len() as f64 * CE_ELITE_FRACTION).ceil() as usize; + let elite: Vec<(Vec, f64)> = evaluated.into_iter().take(n_elite.max(1)).collect(); + + // Add elite points that improve the mixture + let mut added = 0; + for (point, d) in &elite { + if *d > 0.0 && self.theta.check_point(point, THETA_D) { + self.theta.add_point(point)?; + added += 1; + } + } + + // Update GMM from elite points + if !elite.is_empty() { + if let Some(ref mut gmm) = self.gmm { + gmm.update_from_elite(&elite, &self.ranges); + } + } + + tracing::debug!( + "CE expansion: sampled {}, elite {}, added {}", + n_samples, + elite.len(), + added + ); + + Ok(()) + } + + // ════════════════════════════════════════════════════════════════════ + // ADAPTIVE TEMPERATURE CONTROL + // ════════════════════════════════════════════════════════════════════ + + /// Adapt SA temperature based on acceptance ratio + fn adapt_temperature(&mut self) { + if self.sa_proposed > 0 { + let acceptance_ratio = self.sa_accepted as f64 / self.sa_proposed as f64; + + // Adjust cooling rate based on acceptance ratio + if acceptance_ratio < 
TARGET_ACCEPTANCE_RATIO * 0.5 { + // Too cold - slow down cooling and possibly reheat + self.cooling_rate = (self.cooling_rate + 0.02).min(0.98); + if acceptance_ratio < 0.1 && self.temperature < 0.5 { + self.temperature *= REHEAT_FACTOR; + tracing::debug!("Reheating to T = {:.4}", self.temperature); + } + } else if acceptance_ratio > TARGET_ACCEPTANCE_RATIO * 1.5 { + // Too hot - speed up cooling + self.cooling_rate = (self.cooling_rate - 0.02).max(0.80); + } + + tracing::debug!( + "SA acceptance: {:.1}% | Cooling rate: {:.3}", + acceptance_ratio * 100.0, + self.cooling_rate + ); + } + + // Apply cooling + self.temperature *= self.cooling_rate; + if self.temperature < MIN_TEMPERATURE { + self.temperature = MIN_TEMPERATURE; + } + + // Reset counters + self.sa_accepted = 0; + self.sa_proposed = 0; + } + + // ════════════════════════════════════════════════════════════════════ + // ELITE POINT MANAGEMENT + // ════════════════════════════════════════════════════════════════════ + + /// Update elite points based on current weights and D-values + fn update_elite_points(&mut self) -> Result<()> { + if self.w.len() == 0 { + return Ok(()); + } + + let pyl = self.compute_pyl(); + + // Age existing elite points + for elite in &mut self.elite_points { + elite.age += 1; + } + + // Remove old elite points + self.elite_points.retain(|e| e.age < ELITE_MAX_AGE); + + // Find top points by weight + let n_spp = self.theta.nspp().min(self.w.len()); + let mut indexed_weights: Vec<(usize, f64)> = + self.w.iter().enumerate().take(n_spp).collect(); + indexed_weights.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + for (idx, _weight) in indexed_weights.iter().take(ELITE_COUNT) { + if *idx >= self.theta.nspp() { + continue; + } + + let params: Vec = self.theta.matrix().row(*idx).iter().cloned().collect(); + let d_value = self.compute_d(¶ms, &pyl).unwrap_or(0.0); + + // Check if already elite + let already_elite = self.elite_points.iter().any(|e| { + e.params + .iter() + .zip(¶ms) + 
.all(|(a, b)| (a - b).abs() < THETA_D * 10.0) + }); + + if !already_elite && self.elite_points.len() < ELITE_COUNT * 2 { + self.elite_points.push(ElitePoint { + params, + d_value, + age: 0, + }); + } + } + + // Keep only top elite points + self.elite_points + .sort_by(|a, b| b.d_value.partial_cmp(&a.d_value).unwrap()); + self.elite_points.truncate(ELITE_COUNT); + + Ok(()) + } + + /// Inject elite points back into theta + fn inject_elite_points(&mut self) -> Result<()> { + let mut injected = 0; + for elite in &self.elite_points { + if self.theta.check_point(&elite.params, THETA_D) { + self.theta.add_point(&elite.params)?; + injected += 1; + } + } + + if injected > 0 { + tracing::debug!("Injected {} elite points", injected); + } + + Ok(()) + } + + // ════════════════════════════════════════════════════════════════════ + // SA INJECTION: Global Exploration via Simulated Annealing + // ════════════════════════════════════════════════════════════════════ + + /// Simulated annealing point injection for global mode discovery + /// + /// Uses Metropolis acceptance criterion with adaptive temperature control. 
+ fn sa_injection(&mut self) -> Result<()> { + let pyl = self.compute_pyl(); + + // Number of points to try scales with temperature + let n_inject = (SA_INJECT_COUNT as f64 * (self.temperature / INITIAL_TEMPERATURE).sqrt()) + .ceil() as usize; + let n_inject = n_inject.max(5); + + let mut accepted_points = 0; + let mut max_d_found = f64::NEG_INFINITY; + + for _ in 0..n_inject * 10 { + self.sa_proposed += 1; + + // Generate random point with boundary margin + let point: Vec = self + .ranges + .iter() + .map(|(lo, hi)| { + let margin = (hi - lo) * BOUNDARY_MARGIN; + self.rng.random_range((lo + margin)..(hi - margin)) + }) + .collect(); + + // Compute D-criterion + let d_value = match self.compute_d(&point, &pyl) { + Ok(d) => d, + Err(_) => continue, + }; + max_d_found = max_d_found.max(d_value); + + // Metropolis acceptance criterion + let accept = if d_value > 0.0 { + true + } else { + let p_accept = (d_value / self.temperature).exp(); + self.rng.random::() < p_accept + }; + + if accept { + if self.theta.check_point(&point, THETA_D) { + self.theta.add_point(&point)?; + accepted_points += 1; + self.sa_accepted += 1; + } + } + + if accepted_points >= n_inject { + break; + } + } + + tracing::debug!( + "SA injection: {}/{} accepted, max_D = {:.4}, T = {:.4}", + accepted_points, + self.sa_proposed, + max_d_found, + self.temperature + ); + + Ok(()) + } + + // ════════════════════════════════════════════════════════════════════ + // PHASE 2: SUBJECT-GUIDED EXPANSION (The Core Innovation) + // ════════════════════════════════════════════════════════════════════ + + /// Subject-Residual Driven Exploration + /// + /// Find subjects that are poorly explained by current mixture, + /// then find parameters that would explain each one well. 
+ fn subject_guided_expansion(&mut self) -> Result<()> { + if self.theta.nspp() == 0 || self.w.len() == 0 { + return Ok(()); + } + + let pyl = self.compute_pyl(); + let n_subjects = pyl.len(); + + // Identify "residual subjects" - those with low P(y|G) + let mut indexed_pyl: Vec<(usize, f64)> = pyl.iter().cloned().enumerate().collect(); + indexed_pyl.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); // Sort ascending + + // Take bottom fraction (poorly fit subjects) + let n_residual = ((n_subjects as f64) * RESIDUAL_SUBJECT_FRACTION) + .ceil() + .max(MIN_RESIDUAL_SUBJECTS as f64) as usize; + let n_residual = n_residual.min(n_subjects); + + let residual_subjects: Vec = indexed_pyl + .iter() + .take(n_residual) + .map(|(idx, _)| *idx) + .collect(); + + tracing::debug!( + "Subject-guided: {} residual subjects (of {}), P(y|G) range: {:.2e} to {:.2e}", + residual_subjects.len(), + n_subjects, + indexed_pyl.first().map(|(_, p)| *p).unwrap_or(0.0), + indexed_pyl + .get(n_residual.saturating_sub(1)) + .map(|(_, p)| *p) + .unwrap_or(0.0) + ); + + // For each residual subject, find MAP estimate: argmax P(y_i|θ) + let subjects = self.data.subjects(); + let error_models = self.error_models.clone(); + + let mut subject_map_points: Vec<(Vec, f64)> = Vec::new(); + + for &subj_idx in &residual_subjects { + // Create single-subject data for optimization + let subject = &subjects[subj_idx]; + + // Use centroid of current support points as starting guess + let start = self.compute_weighted_centroid(); + + // Find θ that maximizes P(y_i|θ) for this subject + if let Ok(map_point) = self.find_subject_map(subject, &start, &error_models) { + // Compute D-criterion for this point + if let Ok(d) = self.compute_d(&map_point, &pyl) { + subject_map_points.push((map_point, d)); + } + } + } + + // Sort by D descending and add points that improve mixture + subject_map_points.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + let mut added = 0; + for (point, d) in subject_map_points { + if d <= 0.0 
{ + break; // No more improvements + } + + if self.theta.check_point(&point, THETA_D) { + self.theta.add_point(&point)?; + added += 1; + } + } + + tracing::debug!("Subject-guided: added {} candidate points", added); + + Ok(()) + } + + /// Compute weighted centroid of current support points + fn compute_weighted_centroid(&self) -> Vec { + let n_params = self.ranges.len(); + let mut centroid = vec![0.0; n_params]; + let mut total_weight = 0.0; + + for (i, spp) in self.theta.matrix().row_iter().enumerate() { + let weight = if i < self.w.len() { self.w[i] } else { 0.0 }; + total_weight += weight; + for (j, val) in spp.iter().enumerate() { + centroid[j] += weight * val; + } + } + + if total_weight > 0.0 { + for c in &mut centroid { + *c /= total_weight; + } + } else { + // Fallback to center of ranges + for (j, (lo, hi)) in self.ranges.iter().enumerate() { + centroid[j] = (lo + hi) / 2.0; + } + } + + centroid + } + + /// Find MAP estimate for a single subject: argmax P(y_i|θ) + fn find_subject_map( + &self, + subject: &pharmsol::Subject, + start: &[f64], + error_models: &AssayErrorModels, + ) -> Result, argmin::core::Error> { + let optimizer = SubjectMapOptimizer { + equation: &self.equation, + subject, + error_models, + ranges: &self.ranges, + }; + + let simplex = create_initial_simplex(start, &self.ranges); + let solver: NelderMead, f64> = NelderMead::new(simplex).with_sd_tolerance(1e-3)?; + + let res = Executor::new(optimizer, solver) + .configure(|state| state.max_iters(SUBJECT_MAP_MAX_ITERS)) + .run()?; + + Ok(res.state.best_param.unwrap()) + } + + // ════════════════════════════════════════════════════════════════════ + // PHASE 3: D-OPTIMAL REFINEMENT + // ════════════════════════════════════════════════════════════════════ + + /// D-optimal refinement with hierarchical iteration allocation + fn d_optimal_refinement(&mut self) -> Result<()> { + if self.theta.nspp() == 0 || self.w.len() == 0 { + return Ok(()); + } + + let pyl = self.compute_pyl(); + let 
error_models = self.error_models.clone();
+        let max_weight = self.w.iter().fold(f64::NEG_INFINITY, |a, b| a.max(b));
+
+        // Collect points with weights - only refine points with meaningful weight
+        let n_points = self.theta.nspp().min(self.w.len());
+        let min_weight_threshold = max_weight * 0.001; // Skip points with < 0.1% of max weight
+
+        let mut candidate_points: Vec<(Array1<f64>, f64)> = self
+            .theta
+            .matrix()
+            .row_iter()
+            .take(n_points)
+            .enumerate()
+            .filter(|(i, _)| self.w[*i] >= min_weight_threshold)
+            .map(|(i, spp)| {
+                let point: Vec<f64> = spp.iter().cloned().collect();
+                let weight = self.w[i];
+                (Array1::from(point), weight)
+            })
+            .collect();
+
+        let ranges = self.ranges.clone();
+
+        // Optimize with hierarchical iterations based on importance
+        candidate_points.par_iter_mut().for_each(|(spp, weight)| {
+            let importance = *weight / max_weight;
+            let max_iters = if importance > HIGH_WEIGHT_THRESHOLD {
+                DOPT_HIGH_WEIGHT_ITERS
+            } else if importance > MED_WEIGHT_THRESHOLD {
+                DOPT_MED_WEIGHT_ITERS
+            } else {
+                DOPT_LOW_WEIGHT_ITERS
+            };
+
+            let optimizer = DOptimalOptimizer {
+                equation: &self.equation,
+                data: &self.data,
+                error_models: &error_models,
+                pyl: &pyl,
+            };
+
+            if let Ok(refined) = optimizer.optimize(spp.to_vec(), max_iters) {
+                // Clamp to safe boundaries
+                let clamped: Array1<f64> = refined
+                    .iter()
+                    .zip(ranges.iter())
+                    .map(|(&val, &(lo, hi))| {
+                        let margin = (hi - lo) * BOUNDARY_MARGIN;
+                        val.clamp(lo + margin, hi - margin)
+                    })
+                    .collect();
+                *spp = clamped;
+            }
+        });
+
+        // Add refined points
+        for (cp, _) in candidate_points {
+            self.theta.suggest_point(cp.to_vec().as_slice(), THETA_D)?;
+        }
+
+        Ok(())
+    }
+
+    // ════════════════════════════════════════════════════════════════════
+    // MULTI-SCALE GLOBAL OPTIMALITY
+    // ════════════════════════════════════════════════════════════════════
+
+    /// Sobol-based global optimality check with n samples
+    fn sobol_global_check_n(&mut self, pyl: &Array1<f64>, n_samples: usize) -> Result<f64> {
+        let
n_dims = self.ranges.len();
+        let mut max_d = f64::NEG_INFINITY;
+
+        for i in 0..n_samples {
+            let idx = self.sobol_index + i as u32;
+            let mut point = Vec::with_capacity(n_dims);
+
+            for dim in 0..n_dims {
+                let sobol_val = sample(idx, dim as u32, SOBOL_SEED);
+                let (lo, hi) = self.ranges[dim];
+                let margin = (hi - lo) * BOUNDARY_MARGIN;
+                point.push(lo + margin + sobol_val as f64 * (hi - lo - 2.0 * margin));
+            }
+
+            if let Ok(d) = self.compute_d(&point, pyl) {
+                max_d = max_d.max(d);
+            }
+        }
+
+        self.sobol_index += n_samples as u32;
+        self.last_global_d_max = max_d;
+
+        Ok(max_d)
+    }
+
+    /// Multi-scale global optimality check with injection of violating points
+    fn multi_scale_global_check(&mut self) -> Result<()> {
+        if self.theta.nspp() == 0 || self.w.len() == 0 {
+            return Ok(());
+        }
+
+        let pyl = self.compute_pyl();
+        let n_dims = self.ranges.len();
+        let n_samples = GLOBAL_CHECK_SCALES[0]; // Use smallest scale for injection check
+
+        let mut max_d = f64::NEG_INFINITY;
+        let mut max_d_point = vec![0.0; n_dims];
+
+        for i in 0..n_samples {
+            let idx = self.sobol_index + i as u32;
+            let mut point = Vec::with_capacity(n_dims);
+
+            for dim in 0..n_dims {
+                let sobol_val = sample(idx, dim as u32, SOBOL_SEED);
+                let (lo, hi) = self.ranges[dim];
+                let margin = (hi - lo) * BOUNDARY_MARGIN;
+                point.push(lo + margin + sobol_val as f64 * (hi - lo - 2.0 * margin));
+            }
+
+            if let Ok(d) = self.compute_d(&point, &pyl) {
+                if d > max_d {
+                    max_d = d;
+                    max_d_point = point;
+                }
+            }
+        }
+
+        self.sobol_index += n_samples as u32;
+
+        // If we found a point with D > threshold, refine and inject it
+        if max_d > GLOBAL_D_THRESHOLD {
+            // Refine the point
+            let optimizer = DOptimalOptimizer {
+                equation: &self.equation,
+                data: &self.data,
+                error_models: &self.error_models,
+                pyl: &pyl,
+            };
+
+            if let Ok(refined) = optimizer.optimize(max_d_point.clone(), 30) {
+                // Clamp to safe bounds
+                let clamped: Vec<f64> = refined
+                    .iter()
+                    .zip(self.ranges.iter())
+                    .map(|(&val, &(lo, hi))|
{
+                        let margin = (hi - lo) * BOUNDARY_MARGIN;
+                        val.clamp(lo + margin, hi - margin)
+                    })
+                    .collect();
+
+                if let Ok(d_refined) = self.compute_d(&clamped, &pyl) {
+                    if d_refined > GLOBAL_D_THRESHOLD * 0.5
+                        && self.theta.check_point(&clamped, THETA_D)
+                    {
+                        self.theta.add_point(&clamped)?;
+                        tracing::info!(
+                            "Global check injected point with D = {:.4} (refined from {:.4})",
+                            d_refined,
+                            max_d
+                        );
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
+// ============================================================================
+// SUBJECT MAP OPTIMIZER
+// ============================================================================
+
+/// Optimizer for finding MAP estimate of single subject
+struct SubjectMapOptimizer<'a, E: Equation> {
+    equation: &'a E,
+    subject: &'a pharmsol::Subject,
+    error_models: &'a AssayErrorModels,
+    ranges: &'a [(f64, f64)],
+}
+
+impl<E: Equation> CostFunction for SubjectMapOptimizer<'_, E> {
+    type Param = Vec<f64>;
+    type Output = f64;
+
+    fn cost(&self, params: &Self::Param) -> Result<Self::Output, Error> {
+        // Clamp to bounds
+        let clamped: Vec<f64> = params
+            .iter()
+            .zip(self.ranges.iter())
+            .map(|(v, (lo, hi))| v.clamp(*lo, *hi))
+            .collect();
+
+        // Create single-subject data
+        let single_data = Data::new(vec![self.subject.clone()]);
+        let theta = ndarray::Array1::from(clamped).insert_axis(Axis(0));
+
+        let psi = pharmsol::prelude::simulator::log_likelihood_matrix(
+            self.equation,
+            &single_data,
+            &theta,
+            self.error_models,
+            false,
+        )?
+        .mapv(f64::exp);
+
+        // We want to MAXIMIZE P(y|θ), so minimize -P(y|θ)
+        // Take log for numerical stability: minimize -log P(y|θ)
+        let p = psi.iter().next().unwrap_or(&1e-300);
+        let log_p = if *p > 0.0 { p.ln() } else { -700.0 }; // ln(1e-300) ≈ -690
+
+        Ok(-log_p) // Minimize negative log-likelihood
+    }
+}
+
+// ============================================================================
+// D-OPTIMAL OPTIMIZER
+// ============================================================================
+
+/// Optimizer for D-criterion maximization
+struct DOptimalOptimizer<'a, E: Equation> {
+    equation: &'a E,
+    data: &'a Data,
+    error_models: &'a AssayErrorModels,
+    pyl: &'a Array1<f64>,
+}
+
+impl<E: Equation> CostFunction for DOptimalOptimizer<'_, E> {
+    type Param = Vec<f64>;
+    type Output = f64;
+
+    fn cost(&self, spp: &Self::Param) -> Result<Self::Output, Error> {
+        let theta = Array1::from(spp.clone()).insert_axis(Axis(0));
+
+        let psi = pharmsol::prelude::simulator::log_likelihood_matrix(
+            self.equation,
+            self.data,
+            &theta,
+            self.error_models,
+            false,
+        )?
+        .mapv(f64::exp);
+
+        let nsub = psi.nrows() as f64;
+        let mut d_sum = -nsub;
+        for (p_i, pyl_i) in psi.iter().zip(self.pyl.iter()) {
+            if *pyl_i > 0.0 {
+                d_sum += p_i / pyl_i;
+            }
+        }
+
+        Ok(-d_sum) // Minimize -D = Maximize D
+    }
+}
+
+impl<'a, E: Equation> DOptimalOptimizer<'a, E> {
+    fn optimize(self, start: Vec<f64>, max_iters: u64) -> Result<Vec<f64>, Error> {
+        let simplex = create_initial_simplex_simple(&start);
+        let solver: NelderMead<Vec<f64>, f64> = NelderMead::new(simplex).with_sd_tolerance(1e-3)?;
+
+        let res = Executor::new(self, solver)
+            .configure(|state| state.max_iters(max_iters))
+            .run()?;
+
+        Ok(res.state.best_param.unwrap())
+    }
+}
+
+// ============================================================================
+// UTILITY FUNCTIONS
+// ============================================================================
+
+/// Create initial simplex with range-aware perturbation
+fn create_initial_simplex(initial_point: &[f64], ranges: &[(f64, f64)]) -> Vec<Vec<f64>> {
+    let num_dims = initial_point.len();
+    let perturbation_frac = 0.05; // 5% of range
+
+    let mut vertices = Vec::new();
+    vertices.push(initial_point.to_vec());
+
+    for i in 0..num_dims {
+        let (lo, hi) = ranges[i];
+        let range = hi - lo;
+        let perturbation = perturbation_frac * range;
+
+        let mut perturbed = initial_point.to_vec();
+        perturbed[i] = (perturbed[i] + perturbation).min(hi);
+        vertices.push(perturbed);
+    }
+
+    vertices
+}
+
+/// Create initial simplex (simple version for D-optimal)
+fn create_initial_simplex_simple(initial_point: &[f64]) -> Vec<Vec<f64>> {
+    let num_dims = initial_point.len();
+    let perturbation_pct = 0.008;
+
+    let mut vertices = Vec::new();
+    vertices.push(initial_point.to_vec());
+
+    for i in 0..num_dims {
+        let perturbation = if initial_point[i] == 0.0 {
+            0.00025
+        } else {
+            perturbation_pct * initial_point[i].abs()
+        };
+
+        let mut perturbed = initial_point.to_vec();
+        perturbed[i] += perturbation;
+        vertices.push(perturbed);
+    }
+
+    vertices
+}
+
+//
============================================================================
+// TESTS
+// ============================================================================
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_phase_display() {
+        assert_eq!(format!("{}", Phase::Warmup), "Warmup");
+        assert_eq!(format!("{}", Phase::Expansion), "Expansion");
+        assert_eq!(format!("{}", Phase::Convergence), "Convergence");
+    }
+
+    #[test]
+    fn test_simplex_creation() {
+        let point = vec![1.0, 2.0, 3.0];
+        let ranges = vec![(0.0, 2.0), (0.0, 4.0), (0.0, 6.0)];
+        let simplex = create_initial_simplex(&point, &ranges);
+
+        assert_eq!(simplex.len(), 4); // n+1 vertices
+        assert_eq!(simplex[0], point);
+    }
+
+    #[test]
+    fn test_constants() {
+        assert!(WARMUP_CYCLES > 0);
+        assert!(RESIDUAL_SUBJECT_FRACTION > 0.0 && RESIDUAL_SUBJECT_FRACTION < 1.0);
+        assert!(GLOBAL_D_THRESHOLD > 0.0);
+        assert!(CONVERGENCE_WINDOW > 0);
+        assert!(CE_ELITE_FRACTION > 0.0 && CE_ELITE_FRACTION < 1.0);
+        assert!(CE_GMM_COMPONENTS >= 1);
+    }
+
+    #[test]
+    fn test_gmm_component() {
+        let ranges = vec![(0.0, 10.0), (0.0, 20.0)];
+        let component = GaussianComponent::new(2, &ranges);
+
+        assert_eq!(component.mean.len(), 2);
+        assert_eq!(component.variance.len(), 2);
+        assert!((component.mean[0] - 5.0).abs() < 0.01);
+        assert!((component.mean[1] - 10.0).abs() < 0.01);
+    }
+
+    #[test]
+    fn test_gmm_sampling() {
+        let ranges = vec![(0.0, 10.0), (0.0, 20.0)];
+        let mut rng = StdRng::seed_from_u64(42);
+        let gmm = GMM::new(2, 2, &ranges, &mut rng);
+
+        let samples = gmm.sample(100, &mut rng, &ranges);
+        assert_eq!(samples.len(), 100);
+
+        // Check samples are within bounds
+        for sample in &samples {
+            assert!(sample[0] >= 0.0 && sample[0] <= 10.0);
+            assert!(sample[1] >= 0.0 && sample[1] <= 20.0);
+        }
+    }
+
+    #[test]
+    fn test_standard_normal() {
+        let mut rng = StdRng::seed_from_u64(42);
+        let samples: Vec<f64> = (0..1000)
+            .map(|_| sample_standard_normal(&mut rng))
+            .collect();
+
+        let mean: f64 =
samples.iter().sum::<f64>() / samples.len() as f64;
+        let variance: f64 =
+            samples.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / samples.len() as f64;
+
+        // Should be approximately N(0, 1)
+        assert!(mean.abs() < 0.1);
+        assert!((variance - 1.0).abs() < 0.2);
+    }
+
+    #[test]
+    fn test_global_scales() {
+        // Verify scales are increasing
+        for i in 1..GLOBAL_CHECK_SCALES.len() {
+            assert!(GLOBAL_CHECK_SCALES[i] > GLOBAL_CHECK_SCALES[i - 1]);
+        }
+    }
+
+    #[test]
+    fn test_temperature_bounds() {
+        assert!(INITIAL_TEMPERATURE > MIN_TEMPERATURE);
+        assert!(BASE_COOLING_RATE > 0.0 && BASE_COOLING_RATE < 1.0);
+        assert!(REHEAT_FACTOR > 1.0);
+    }
+}
diff --git a/src/algorithms/nonparametric/npbo/constants.rs b/src/algorithms/nonparametric/npbo/constants.rs
new file mode 100644
index 000000000..c7ea6204c
--- /dev/null
+++ b/src/algorithms/nonparametric/npbo/constants.rs
@@ -0,0 +1,54 @@
+//! NPBO Constants
+//!
+//! Configuration constants for the Bayesian Optimization algorithm.
+
+#![allow(dead_code)] // Many constants reserved for future tuning/experimentation
+
+/// Number of warmup cycles using grid expansion before BO
+pub const WARMUP_CYCLES: usize = 3;
+
+/// Number of initial points for GP training (Sobol sampling)
+pub const INITIAL_SAMPLES: usize = 50;
+
+/// Maximum GP training points to prevent O(n³) scaling issues
+pub const MAX_GP_POINTS: usize = 500;
+
+/// Minimum GP training points before optimization
+pub const MIN_GP_POINTS: usize = 20;
+
+/// Number of acquisition optimization restarts
+pub const ACQUISITION_RESTARTS: usize = 10;
+
+/// Batch size for parallel acquisition
+pub const BATCH_SIZE: usize = 5;
+
+/// GP kernel length scale initial value
+pub const INITIAL_LENGTH_SCALE: f64 = 0.3;
+
+/// GP kernel signal variance initial value
+pub const INITIAL_SIGNAL_VAR: f64 = 1.0;
+
+/// GP noise variance (jitter for numerical stability)
+pub const NOISE_VAR: f64 = 1e-6;
+
+/// Exploration-exploitation tradeoff (higher = more exploration)
+pub const
EXPLORATION_WEIGHT: f64 = 2.0;
+
+/// Convergence threshold for EI improvement
+pub const EI_CONVERGENCE_THRESHOLD: f64 = 1e-6;
+
+/// Maximum cycles without improvement before termination
+pub const MAX_STAGNATION_CYCLES: usize = 10;
+
+/// Weight threshold for condensation
+pub const WEIGHT_THRESHOLD: f64 = 1e-8;
+
+/// D-optimal refinement iterations per cycle
+pub const DOPT_ITERATIONS: usize = 3;
+
+/// Adaptive length scale bounds
+pub const LENGTH_SCALE_MIN: f64 = 0.01;
+pub const LENGTH_SCALE_MAX: f64 = 1.0;
+
+/// Whether to use ARD (Automatic Relevance Determination)
+pub const USE_ARD: bool = true;
diff --git a/src/algorithms/nonparametric/npbo/gp.rs b/src/algorithms/nonparametric/npbo/gp.rs
new file mode 100644
index 000000000..f828cfec3
--- /dev/null
+++ b/src/algorithms/nonparametric/npbo/gp.rs
@@ -0,0 +1,419 @@
+//! Gaussian Process implementation for NPBO
+//!
+//! A simple but efficient GP with RBF kernel for surrogate modeling.
+
+use super::constants::*;
+use ndarray::{Array1, Array2};
+
+/// Gaussian Process with RBF kernel
+#[derive(Debug, Clone)]
+pub struct GaussianProcess {
+    /// Training points (N x D)
+    x_train: Vec<Vec<f64>>,
+    /// Training targets (N)
+    y_train: Vec<f64>,
+    /// Per-dimension length scales (ARD)
+    length_scales: Vec<f64>,
+    /// Signal variance
+    signal_var: f64,
+    /// Noise variance
+    noise_var: f64,
+    /// Precomputed inverse covariance (for prediction)
+    k_inv_y: Option<Array1<f64>>,
+    /// Precomputed Cholesky factor
+    l_chol: Option<Array2<f64>>,
+    /// Number of dimensions
+    n_dims: usize,
+    /// Parameter ranges for normalization
+    ranges: Vec<(f64, f64)>,
+}
+
+impl GaussianProcess {
+    pub fn new(n_dims: usize, ranges: &[(f64, f64)]) -> Self {
+        let length_scales = if USE_ARD {
+            vec![INITIAL_LENGTH_SCALE; n_dims]
+        } else {
+            vec![INITIAL_LENGTH_SCALE]
+        };
+
+        Self {
+            x_train: Vec::new(),
+            y_train: Vec::new(),
+            length_scales,
+            signal_var: INITIAL_SIGNAL_VAR,
+            noise_var: NOISE_VAR,
+            k_inv_y: None,
+            l_chol: None,
+            n_dims,
+            ranges: ranges.to_vec(),
+        }
+    }
+
+    /// Normalize a point to [0, 1] range
+    fn normalize(&self, x: &[f64]) -> Vec<f64> {
+        x.iter()
+            .zip(&self.ranges)
+            .map(|(&xi, &(lo, hi))| (xi - lo) / (hi - lo))
+            .collect()
+    }
+
+    /// RBF kernel between two points
+    fn kernel(&self, x1: &[f64], x2: &[f64]) -> f64 {
+        let mut sq_dist = 0.0;
+        for i in 0..x1.len() {
+            let ls = if USE_ARD {
+                self.length_scales[i]
+            } else {
+                self.length_scales[0]
+            };
+            let d = (x1[i] - x2[i]) / ls;
+            sq_dist += d * d;
+        }
+        self.signal_var * (-0.5 * sq_dist).exp()
+    }
+
+    /// Add a training point
+    pub fn add_point(&mut self, x: &[f64], y: f64) {
+        let x_norm = self.normalize(x);
+        self.x_train.push(x_norm);
+        self.y_train.push(y);
+        // Invalidate precomputed matrices
+        self.k_inv_y = None;
+        self.l_chol = None;
+    }
+
+    /// Get number of training points
+    pub fn n_points(&self) -> usize {
+        self.x_train.len()
+    }
+
+    /// Fit the GP (compute Cholesky decomposition)
+    pub fn fit(&mut self) -> Result<(), String> {
+        let n = self.x_train.len();
+        if n < MIN_GP_POINTS {
+            return Err("Not enough training points".to_string());
+        }
+
+        // Build covariance matrix
+        let mut k = Array2::<f64>::zeros((n, n));
+        for i in 0..n {
+            for j in 0..=i {
+                let kij = self.kernel(&self.x_train[i], &self.x_train[j]);
+                k[[i, j]] = kij;
+                k[[j, i]] = kij;
+            }
+            // Add noise on diagonal
+            k[[i, i]] += self.noise_var;
+        }
+
+        // Cholesky decomposition
+        let l = match cholesky_decomp(&k) {
+            Ok(l) => l,
+            Err(e) => return Err(format!("Cholesky failed: {}", e)),
+        };
+
+        // Solve L * L^T * alpha = y for alpha
+        let y = Array1::from_vec(self.y_train.clone());
+        let alpha = cholesky_solve(&l, &y);
+
+        self.l_chol = Some(l);
+        self.k_inv_y = Some(alpha);
+
+        Ok(())
+    }
+
+    /// Predict mean and variance at a point
+    pub fn predict(&self, x: &[f64]) -> (f64, f64) {
+        let (k_inv_y, l_chol) = match (&self.k_inv_y, &self.l_chol) {
+            (Some(a), Some(l)) => (a, l),
+            _ => return (0.0, self.signal_var), // Prior
+        };
+
+        let x_norm = self.normalize(x);
+
+        // Compute
k_star (covariance with training points)
+        let k_star: Array1<f64> = self
+            .x_train
+            .iter()
+            .map(|xi| self.kernel(&x_norm, xi))
+            .collect();
+
+        // Mean: k_star^T * alpha
+        let mean = k_star.dot(k_inv_y);
+
+        // Variance: k(x,x) - k_star^T * K^-1 * k_star
+        let k_xx = self.signal_var + self.noise_var;
+
+        // Solve L * v = k_star
+        let v = forward_solve(l_chol, &k_star);
+        let var = (k_xx - v.dot(&v)).max(1e-10);
+
+        (mean, var)
+    }
+
+    /// Expected Improvement acquisition function
+    pub fn expected_improvement(&self, x: &[f64], y_best: f64) -> f64 {
+        let (mean, var) = self.predict(x);
+        let std = var.sqrt();
+
+        if std < 1e-12 {
+            return 0.0;
+        }
+
+        let z = (mean - y_best) / std;
+        let pdf = (-0.5 * z * z).exp() / (2.0 * std::f64::consts::PI).sqrt();
+        let cdf = 0.5 * (1.0 + erf(z / std::f64::consts::SQRT_2));
+
+        (mean - y_best) * cdf + std * pdf
+    }
+
+    /// Upper Confidence Bound acquisition function (alternative to EI)
+    #[allow(dead_code)]
+    pub fn ucb(&self, x: &[f64], beta: f64) -> f64 {
+        let (mean, var) = self.predict(x);
+        mean + beta * var.sqrt()
+    }
+
+    /// Optimize length scales using marginal likelihood gradient descent
+    pub fn optimize_hyperparameters(&mut self, _iterations: usize) {
+        // Simple grid search for now (more robust than gradient descent)
+        if self.x_train.len() < MIN_GP_POINTS {
+            return;
+        }
+
+        let mut best_ll = f64::NEG_INFINITY;
+        let mut best_ls = self.length_scales.clone();
+
+        let ls_values = [0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 1.0];
+
+        if USE_ARD {
+            // For ARD, just optimize the mean length scale
+            for &ls in &ls_values {
+                self.length_scales = vec![ls; self.n_dims];
+                if let Ok(()) = self.fit() {
+                    let ll = self.log_marginal_likelihood();
+                    if ll > best_ll {
+                        best_ll = ll;
+                        best_ls = self.length_scales.clone();
+                    }
+                }
+            }
+        } else {
+            for &ls in &ls_values {
+                self.length_scales = vec![ls];
+                if let Ok(()) = self.fit() {
+                    let ll = self.log_marginal_likelihood();
+                    if ll > best_ll {
+                        best_ll = ll;
+                        best_ls =
self.length_scales.clone();
+                    }
+                }
+            }
+        }
+
+        self.length_scales = best_ls;
+        let _ = self.fit();
+    }
+
+    /// Compute log marginal likelihood
+    fn log_marginal_likelihood(&self) -> f64 {
+        let (k_inv_y, l_chol) = match (&self.k_inv_y, &self.l_chol) {
+            (Some(a), Some(l)) => (a, l),
+            _ => return f64::NEG_INFINITY,
+        };
+
+        let n = self.y_train.len() as f64;
+        let y = Array1::from_vec(self.y_train.clone());
+
+        // -0.5 * y^T * K^-1 * y
+        let data_fit = -0.5 * y.dot(k_inv_y);
+
+        // -0.5 * log|K| = -sum(log(diag(L)))
+        let log_det: f64 = -l_chol.diag().iter().map(|&x| x.ln()).sum::<f64>();
+
+        // -0.5 * n * log(2*pi)
+        let const_term = -0.5 * n * (2.0 * std::f64::consts::PI).ln();
+
+        data_fit + log_det + const_term
+    }
+
+    /// Get best observed point (useful for tracking optimization progress)
+    #[allow(dead_code)]
+    pub fn get_best(&self) -> Option<(Vec<f64>, f64)> {
+        if self.y_train.is_empty() {
+            return None;
+        }
+
+        let (idx, &y_best) = self
+            .y_train
+            .iter()
+            .enumerate()
+            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())?;
+
+        // Denormalize
+        let x_norm = &self.x_train[idx];
+        let x: Vec<f64> = x_norm
+            .iter()
+            .zip(&self.ranges)
+            .map(|(&xi, &(lo, hi))| xi * (hi - lo) + lo)
+            .collect();
+
+        Some((x, y_best))
+    }
+
+    /// Prune old points if we exceed MAX_GP_POINTS
+    pub fn prune_if_needed(&mut self) {
+        if self.x_train.len() <= MAX_GP_POINTS {
+            return;
+        }
+
+        // Keep the best points and most recent
+        let n = self.x_train.len();
+        let keep = MAX_GP_POINTS;
+
+        // Sort by y value (descending) and keep top half
+        let mut indices: Vec<usize> = (0..n).collect();
+        indices.sort_by(|&i, &j| {
+            self.y_train[j]
+                .partial_cmp(&self.y_train[i])
+                .unwrap_or(std::cmp::Ordering::Equal)
+        });
+
+        // Keep top scoring and most recent
+        let mut keep_set: std::collections::HashSet<usize> = std::collections::HashSet::new();
+
+        // Top half by score
+        for &idx in indices.iter().take(keep / 2) {
+            keep_set.insert(idx);
+        }
+
+        // Most recent
+        for idx in (n - keep / 2)..n {
keep_set.insert(idx);
+        }
+
+        let mut keep_indices: Vec<usize> = keep_set.into_iter().collect();
+        keep_indices.sort();
+
+        let new_x: Vec<Vec<f64>> = keep_indices
+            .iter()
+            .map(|&i| self.x_train[i].clone())
+            .collect();
+        let new_y: Vec<f64> = keep_indices.iter().map(|&i| self.y_train[i]).collect();
+
+        self.x_train = new_x;
+        self.y_train = new_y;
+        self.k_inv_y = None;
+        self.l_chol = None;
+    }
+}
+
+// ============================================================================
+// HELPER FUNCTIONS
+// ============================================================================
+
+/// Error function approximation (Abramowitz and Stegun)
+fn erf(x: f64) -> f64 {
+    let a1 = 0.254829592;
+    let a2 = -0.284496736;
+    let a3 = 1.421413741;
+    let a4 = -1.453152027;
+    let a5 = 1.061405429;
+    let p = 0.3275911;
+
+    let sign = if x < 0.0 { -1.0 } else { 1.0 };
+    let x = x.abs();
+
+    let t = 1.0 / (1.0 + p * x);
+    let y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * (-x * x).exp();
+
+    sign * y
+}
+
+/// Cholesky decomposition (lower triangular)
+fn cholesky_decomp(a: &Array2<f64>) -> Result<Array2<f64>, String> {
+    let n = a.nrows();
+    let mut l = Array2::<f64>::zeros((n, n));
+
+    for i in 0..n {
+        for j in 0..=i {
+            let mut sum = 0.0;
+            for k in 0..j {
+                sum += l[[i, k]] * l[[j, k]];
+            }
+
+            if i == j {
+                let diag = a[[i, i]] - sum;
+                if diag <= 0.0 {
+                    return Err("Matrix not positive definite".to_string());
+                }
+                l[[i, j]] = diag.sqrt();
+            } else {
+                l[[i, j]] = (a[[i, j]] - sum) / l[[j, j]];
+            }
+        }
+    }
+
+    Ok(l)
+}
+
+/// Solve L * x = b (forward substitution)
+fn forward_solve(l: &Array2<f64>, b: &Array1<f64>) -> Array1<f64> {
+    let n = b.len();
+    let mut x = Array1::zeros(n);
+
+    for i in 0..n {
+        let mut sum = 0.0;
+        for j in 0..i {
+            sum += l[[i, j]] * x[j];
+        }
+        x[i] = (b[i] - sum) / l[[i, i]];
+    }
+
+    x
+}
+
+/// Solve L * L^T * x = b
+fn cholesky_solve(l: &Array2<f64>, b: &Array1<f64>) -> Array1<f64> {
+    let n = b.len();
+
+    // Forward: L * y = b
+    let y = forward_solve(l, &b);
+
+    // Backward: L^T * x = y
+    let
mut x = Array1::zeros(n); + for i in (0..n).rev() { + let mut sum = 0.0; + for j in (i + 1)..n { + sum += l[[j, i]] * x[j]; + } + x[i] = (y[i] - sum) / l[[i, i]]; + } + + x +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_gp_basic() { + let ranges = vec![(0.0, 1.0), (0.0, 1.0)]; + let mut gp = GaussianProcess::new(2, &ranges); + + // Add some points + for i in 0..30 { + let x = vec![(i as f64) / 30.0, (i as f64) / 30.0]; + let y = -(x[0] - 0.5).powi(2) - (x[1] - 0.5).powi(2); + gp.add_point(&x, y); + } + + assert!(gp.fit().is_ok()); + + // Predict at optimum + let (mean, var) = gp.predict(&[0.5, 0.5]); + assert!(mean > -0.1); + assert!(var > 0.0); + } +} diff --git a/src/algorithms/nonparametric/npbo/mod.rs b/src/algorithms/nonparametric/npbo/mod.rs new file mode 100644 index 000000000..801a9683a --- /dev/null +++ b/src/algorithms/nonparametric/npbo/mod.rs @@ -0,0 +1,638 @@ +//! Non-Parametric Bayesian Optimization (NPBO) +//! +//! Uses Gaussian Process surrogate modeling with Expected Improvement acquisition +//! to efficiently explore the parameter space. The GP learns D-optimality scores +//! across the space and uses Bayesian optimization to suggest high-value regions. +//! +//! Key Features: +//! - GP surrogate model learns D-criterion landscape +//! - Expected Improvement for exploration-exploitation balance +//! - Warm-up phase with Sobol sampling for initial coverage +//! - Standard NP estimation/condensation cycle +//! 
- NPAG-style convergence criteria
+
+mod constants;
+mod gp;
+
+use constants::*;
+use gp::GaussianProcess;
+
+use crate::algorithms::{
+    NativeNonparametricConfig, NonparametricAlgorithmInput, Status, StopReason,
+};
+use crate::estimation::nonparametric::adaptative_grid;
+use crate::estimation::nonparametric::ipm::burke;
+use crate::estimation::nonparametric::qr;
+use crate::estimation::nonparametric::sample_space_for_parameters;
+use crate::estimation::nonparametric::{
+    calculate_psi, CycleLog, NPCycle, NonparametricWorkspace, Psi, Theta, Weights,
+};
+use crate::prelude::algorithms::Algorithms;
+
+use anyhow::{bail, Result};
+use pharmsol::prelude::{
+    data::{AssayErrorModels, Data},
+    simulator::Equation,
+    AssayErrorModel,
+};
+
+use sobol_burley::sample;
+
+// NPAG-style convergence constants
+const THETA_E: f64 = 1e-4;
+const THETA_G: f64 = 1e-4;
+const THETA_F: f64 = 1e-2;
+const THETA_D: f64 = 1e-4;
+
+/// NPBO Algorithm State
+#[derive(Debug)]
+pub struct NPBO<E: Equation> {
+    equation: E,
+    ranges: Vec<(f64, f64)>,
+    psi: Psi,
+    theta: Theta,
+    lambda: Weights,
+    w: Weights,
+    eps: f64,
+    last_objf: f64,
+    objf: f64,
+    f0: f64,
+    f1: f64,
+    cycle: usize,
+    gamma_delta: Vec<f64>,
+    error_models: AssayErrorModels,
+    status: Status,
+    cycle_log: CycleLog,
+    data: Data,
+    config: NativeNonparametricConfig,
+    // Bayesian Optimization state
+    gp: GaussianProcess,
+    y_best: f64,
+    warmup_complete: bool,
+    stagnation_count: usize,
+}
+
+impl<E: Equation> Algorithms<E> for NPBO<E> {
+    fn equation(&self) -> &E {
+        &self.equation
+    }
+
+    fn into_workspace(&self) -> Result<NonparametricWorkspace<E>> {
+        NonparametricWorkspace::new(
+            self.equation.clone(),
+            self.data.clone(),
+            self.theta.clone(),
+            self.psi.clone(),
+            self.w.clone(),
+            -2.
* self.objf,
+            self.cycle,
+            self.status.clone(),
+            self.config.run_configuration.clone(),
+            self.cycle_log.clone(),
+        )
+    }
+
+    fn error_models(&self) -> &AssayErrorModels {
+        &self.error_models
+    }
+
+    fn data(&self) -> &Data {
+        &self.data
+    }
+
+    fn get_prior(&self) -> Theta {
+        // Start with Sobol sampling for good initial coverage
+        sample_space_for_parameters(&self.config.parameter_space, &self.config.prior).unwrap()
+    }
+
+    fn likelihood(&self) -> f64 {
+        self.objf
+    }
+
+    fn increment_cycle(&mut self) -> usize {
+        self.cycle += 1;
+        self.cycle
+    }
+
+    fn cycle(&self) -> usize {
+        self.cycle
+    }
+
+    fn set_theta(&mut self, theta: Theta) {
+        self.theta = theta;
+    }
+
+    fn theta(&self) -> &Theta {
+        &self.theta
+    }
+
+    fn psi(&self) -> &Psi {
+        &self.psi
+    }
+
+    fn set_status(&mut self, status: Status) {
+        self.status = status;
+    }
+
+    fn status(&self) -> &Status {
+        &self.status
+    }
+
+    fn log_cycle_state(&mut self) {
+        let state = NPCycle::new(
+            self.cycle,
+            -2. * self.objf,
+            self.error_models.clone(),
+            self.theta.clone(),
+            self.theta.nspp(),
+            (self.last_objf - self.objf).abs(),
+            self.status.clone(),
+        );
+        self.cycle_log.push(state);
+        self.last_objf = self.objf;
+    }
+
+    fn evaluation(&mut self) -> Result<Status> {
+        tracing::info!("Objective function = {:.4}", -2.0 * self.objf);
+        tracing::debug!("Support points: {}", self.theta.nspp());
+        tracing::debug!("GP training points: {}", self.gp.n_points());
+
+        self.error_models.iter().for_each(|(outeq, em)| {
+            if AssayErrorModel::None == *em {
+                return;
+            }
+            tracing::debug!(
+                "Error model for outeq {}: {:.2}",
+                outeq,
+                em.factor().unwrap_or_default()
+            );
+        });
+
+        tracing::debug!("EPS = {:.4}", self.eps);
+
+        // Update stagnation tracking
+        if self.objf > self.y_best + EI_CONVERGENCE_THRESHOLD {
+            self.y_best = self.objf;
+            self.stagnation_count = 0;
+        } else {
+            self.stagnation_count += 1;
+        }
+
+        // NPAG-style convergence check
+        let psi = self.psi.matrix();
+        let w = &self.w;
+        if (self.last_objf -
self.objf).abs() <= THETA_G && self.eps > THETA_E { + self.eps /= 2.; + if self.eps <= THETA_E { + let pyl = psi * w.weights(); + self.f1 = pyl.iter().map(|x| x.ln()).sum(); + if (self.f1 - self.f0).abs() <= THETA_F { + // Additional global optimality check via GP + if self.check_global_optimality() { + tracing::info!("NPBO converged after {} cycles", self.cycle); + self.set_status(Status::Stop(StopReason::Converged)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } else { + // Reset and continue exploring + self.f0 = self.f1; + self.eps = 0.2; + tracing::debug!("GP suggests unexplored regions, continuing..."); + } + } else { + self.f0 = self.f1; + self.eps = 0.2; + } + } + } + + // Stop if stagnated too long + if self.stagnation_count >= MAX_STAGNATION_CYCLES && self.warmup_complete { + tracing::info!("NPBO converged (stagnation) after {} cycles", self.cycle); + self.set_status(Status::Stop(StopReason::Converged)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // Stop if maximum cycles reached + if self.cycle >= self.config.max_cycles { + tracing::warn!("Maximum number of cycles reached"); + self.set_status(Status::Stop(StopReason::MaxCycles)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // Stop if stopfile exists + if std::path::Path::new("stop").exists() { + tracing::warn!("Stopfile detected - breaking"); + self.set_status(Status::Stop(StopReason::Stopped)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + self.set_status(Status::Continue); + self.log_cycle_state(); + Ok(self.status().clone()) + } + + fn estimation(&mut self) -> Result<()> { + self.psi = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &self.error_models, + self.cycle == 1 && self.config.progress, + )?; + + if let Err(err) = self.validate_psi() { + bail!(err); + } + + (self.lambda, _) = match burke(&self.psi) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM 
during estimation: {:?}", err);
+            }
+        };
+
+        Ok(())
+    }
+
+    fn condensation(&mut self) -> Result<()> {
+        // Filter by lambda threshold (max/1000)
+        let max_lambda = self
+            .lambda
+            .iter()
+            .fold(f64::NEG_INFINITY, |acc, x| x.max(acc));
+
+        let mut keep = Vec::<usize>::new();
+        for (index, lam) in self.lambda.iter().enumerate() {
+            if lam > max_lambda / 1000_f64 {
+                keep.push(index);
+            }
+        }
+
+        if self.psi.matrix().ncols() != keep.len() {
+            tracing::debug!(
+                "Lambda (max/1000) dropped {} support point(s)",
+                self.psi.matrix().ncols() - keep.len(),
+            );
+        }
+
+        self.theta.filter_indices(keep.as_slice());
+        self.psi.filter_column_indices(keep.as_slice());
+
+        // QR-based rank revealing factorization
+        let (r, perm) = qr::qrd(&self.psi)?;
+
+        let mut keep = Vec::<usize>::new();
+        let keep_n = self.psi.matrix().ncols().min(self.psi.matrix().nrows());
+
+        for i in 0..keep_n {
+            let test = r.col(i).norm_l2();
+            let r_diag_val = r.get(i, i);
+            let ratio = r_diag_val / test;
+            if ratio.abs() >= 1e-8 {
+                keep.push(*perm.get(i).unwrap());
+            }
+        }
+
+        if self.psi.matrix().ncols() != keep.len() {
+            tracing::debug!(
+                "QR decomposition dropped {} support point(s)",
+                self.psi.matrix().ncols() - keep.len(),
+            );
+        }
+
+        self.theta.filter_indices(keep.as_slice());
+        self.psi.filter_column_indices(keep.as_slice());
+
+        self.validate_psi()?;
+        (self.lambda, self.objf) = match burke(&self.psi) {
+            Ok((lambda, objf)) => (lambda, objf),
+            Err(err) => {
+                return Err(anyhow::anyhow!(
+                    "Error in IPM during condensation: {:?}",
+                    err
+                ));
+            }
+        };
+        self.w = self.lambda.clone();
+        self.last_objf = self.objf;
+
+        Ok(())
+    }
+
+    fn optimizations(&mut self) -> Result<()> {
+        // Update GP with current support points and their D-optimal contributions
+        self.update_gp_model()?;
+
+        // Standard error model optimization
+        self.error_models
+            .clone()
+            .iter_mut()
+            .filter_map(|(outeq, em)| {
+                if em.optimize() {
+                    Some((outeq, em))
+                } else {
+                    None
+                }
+            })
+            .try_for_each(|(outeq, em)| -> Result<()>
{
+                let gamma_up = em.factor()? * (1.0 + self.gamma_delta[outeq]);
+                let gamma_down = em.factor()? / (1.0 + self.gamma_delta[outeq]);
+
+                let mut error_model_up = self.error_models.clone();
+                error_model_up.set_factor(outeq, gamma_up)?;
+
+                let mut error_model_down = self.error_models.clone();
+                error_model_down.set_factor(outeq, gamma_down)?;
+
+                let psi_up = calculate_psi(
+                    &self.equation,
+                    &self.data,
+                    &self.theta,
+                    &error_model_up,
+                    false,
+                )?;
+                let psi_down = calculate_psi(
+                    &self.equation,
+                    &self.data,
+                    &self.theta,
+                    &error_model_down,
+                    false,
+                )?;
+
+                let (lambda_up, objf_up) = burke(&psi_up)?;
+                let (lambda_down, objf_down) = burke(&psi_down)?;
+
+                if objf_up > self.objf {
+                    self.error_models.set_factor(outeq, gamma_up)?;
+                    self.objf = objf_up;
+                    self.gamma_delta[outeq] *= 4.;
+                    self.lambda = lambda_up;
+                    self.psi = psi_up;
+                }
+                if objf_down > self.objf {
+                    self.error_models.set_factor(outeq, gamma_down)?;
+                    self.objf = objf_down;
+                    self.gamma_delta[outeq] *= 4.;
+                    self.lambda = lambda_down;
+                    self.psi = psi_down;
+                }
+                self.gamma_delta[outeq] *= 0.5;
+                if self.gamma_delta[outeq] <= 0.01 {
+                    self.gamma_delta[outeq] = 0.1;
+                }
+                Ok(())
+            })?;
+
+        Ok(())
+    }
+
+    fn expansion(&mut self) -> Result<()> {
+        // During warmup: use grid expansion for coverage
+        if self.cycle <= WARMUP_CYCLES {
+            self.warmup_complete = false;
+            return self.grid_expansion();
+        }
+
+        self.warmup_complete = true;
+
+        // After warmup: use GP-guided Bayesian optimization
+        self.bo_expansion()
+    }
+}
+
+impl<E: Equation> NPBO<E> {
+    pub(crate) fn from_input(input: NonparametricAlgorithmInput<E>) -> Result<Box<Self>> {
+        let config = input.native_config()?;
+        let ranges = config.ranges.clone();
+        let n_dims = ranges.len();
+        let error_models = input.error_models().clone();
+        let equation = input.equation;
+        let data = input.data;
+
+        Ok(Box::new(Self {
+            equation,
+            ranges: ranges.clone(),
+            psi: Psi::new(),
+            theta: Theta::new(),
+            lambda: Weights::default(),
+            w: Weights::default(),
+            eps: 0.2,
last_objf: -1e30, + objf: f64::NEG_INFINITY, + f0: -1e30, + f1: f64::default(), + cycle: 0, + gamma_delta: vec![0.1; error_models.len()], + error_models, + status: Status::Continue, + cycle_log: CycleLog::new(), + data, + config, + gp: GaussianProcess::new(n_dims, &ranges), + y_best: f64::NEG_INFINITY, + warmup_complete: false, + stagnation_count: 0, + })) + } +} + +// ============================================================================ +// NPBO-Specific Methods +// ============================================================================ + +impl NPBO { + /// Standard grid-based expansion during warmup + fn grid_expansion(&mut self) -> Result<()> { + adaptative_grid(&mut self.theta, self.eps, &self.ranges, THETA_D)?; + tracing::debug!("Grid expansion: {} total support points", self.theta.nspp()); + Ok(()) + } + + /// Bayesian optimization guided expansion + fn bo_expansion(&mut self) -> Result<()> { + // First, fit the GP if we have enough points + if self.gp.n_points() >= MIN_GP_POINTS { + // Optimize hyperparameters periodically + if self.cycle % 5 == 0 { + self.gp.optimize_hyperparameters(50); + } else { + let _ = self.gp.fit(); + } + } + + // Generate candidate points using acquisition function + let mut candidates = Vec::new(); + + if self.gp.n_points() >= MIN_GP_POINTS { + // Use Expected Improvement to find promising regions + candidates.extend(self.optimize_acquisition(BATCH_SIZE)); + } + + // Also add some grid-refined points for local improvement + if self.cycle % 3 == 0 { + let sparse_eps = self.eps * 0.5; + adaptative_grid(&mut self.theta, sparse_eps, &self.ranges, THETA_D * 2.0)?; + } + + // Add Sobol exploration points for diversity + candidates.extend(self.sobol_points(BATCH_SIZE / 2)); + + // Add candidates that pass distance check + let mut added = 0; + for spp in candidates { + if self.theta.check_point(&spp, THETA_D) { + let _ = self.theta.add_point(&spp); + added += 1; + } + } + + tracing::debug!( + "BO expansion: added {} points, {} 
total support points", + added, + self.theta.nspp() + ); + + Ok(()) + } + + /// Update GP model with current support point evaluations + fn update_gp_model(&mut self) -> Result<()> { + // Compute D-criterion contribution for each current support point + // Use the weight (lambda) as a proxy for importance in the D-optimal design + let n_spp = self.theta.nspp(); + + for (i, w) in self.w.iter().enumerate().take(n_spp) { + // Get support point coordinates + let spp: Vec = (0..self.theta.matrix().ncols()) + .map(|c| *self.theta.matrix().get(i, c)) + .collect(); + + // Use log-weight as target (approximates D-criterion contribution) + let target = w.max(1e-10).ln(); + + self.gp.add_point(&spp, target); + } + + // Prune GP if too many points + self.gp.prune_if_needed(); + + Ok(()) + } + + /// Optimize acquisition function to find promising points + fn optimize_acquisition(&mut self, n_points: usize) -> Vec> { + let mut best_points = Vec::new(); + let n_dims = self.ranges.len(); + + // Multi-start optimization of Expected Improvement + for restart in 0..ACQUISITION_RESTARTS { + // Start from a random point + let seed = (self.cycle * ACQUISITION_RESTARTS + restart) as u32; + let mut x: Vec = (0..n_dims) + .map(|d| { + let u = sample(restart as u32, d as u32, seed) as f64; + let (lo, hi) = self.ranges[d]; + lo + u * (hi - lo) + }) + .collect(); + + // Simple gradient-free local optimization + let mut best_ei = self.gp.expected_improvement(&x, self.y_best); + + for _ in 0..50 { + let mut improved = false; + + for d in 0..n_dims { + let (lo, hi) = self.ranges[d]; + let step = (hi - lo) * 0.05; + + // Try increasing + let old_val = x[d]; + x[d] = (x[d] + step).min(hi); + let ei = self.gp.expected_improvement(&x, self.y_best); + if ei > best_ei { + best_ei = ei; + improved = true; + } else { + x[d] = old_val; + } + + // Try decreasing + x[d] = (x[d] - step).max(lo); + let ei = self.gp.expected_improvement(&x, self.y_best); + if ei > best_ei { + best_ei = ei; + improved = true; 
+ } else { + x[d] = old_val; + } + } + + if !improved { + break; + } + } + + if best_ei > EI_CONVERGENCE_THRESHOLD { + best_points.push((x, best_ei)); + } + } + + // Sort by EI and take top n_points + best_points.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + best_points.truncate(n_points); + best_points.into_iter().map(|(x, _)| x).collect() + } + + /// Generate Sobol sequence points for exploration + fn sobol_points(&self, n: usize) -> Vec> { + let n_dims = self.ranges.len(); + let seed = (self.cycle * 1000) as u32; + + (0..n) + .map(|i| { + (0..n_dims) + .map(|d| { + let u = sample((i + self.cycle * n) as u32, d as u32, seed) as f64; + let (lo, hi) = self.ranges[d]; + lo + u * (hi - lo) + }) + .collect() + }) + .collect() + } + + /// Check if GP suggests we've found global optimum + fn check_global_optimality(&self) -> bool { + if self.gp.n_points() < MIN_GP_POINTS { + return false; + } + + // Sample random points and check if any have high EI + let test_points = 100; + let seed = (self.cycle * 999) as u32; + + for i in 0..test_points { + let x: Vec = (0..self.ranges.len()) + .map(|d| { + let u = sample(i as u32, d as u32, seed) as f64; + let (lo, hi) = self.ranges[d]; + lo + u * (hi - lo) + }) + .collect(); + + let ei = self.gp.expected_improvement(&x, self.y_best); + if ei > EI_CONVERGENCE_THRESHOLD * 10.0 { + // Found a point with significant expected improvement + return false; + } + } + + // No high-EI points found, likely at global optimum + true + } +} diff --git a/src/algorithms/nonparametric/npcat.rs b/src/algorithms/nonparametric/npcat.rs new file mode 100644 index 000000000..5c7788b19 --- /dev/null +++ b/src/algorithms/nonparametric/npcat.rs @@ -0,0 +1,1325 @@ +//! # NPCAT: Non-Parametric Covariance-Adaptive Trajectory Algorithm +//! +//! This module implements the NPCAT algorithm, a novel non-parametric approach that combines: +//! - **Fisher Information-guided sampling** for intelligent exploration +//! 
- **Sobol quasi-random sequences** for guaranteed coverage in global optimality checks +//! - **Adaptive phase transitions** through a convergence state machine +//! - **Gradient-aware local refinement** using L-BFGS-B optimization +//! +//! ## Key Innovations +//! +//! ### 1. Information-Guided Candidate Generation +//! Instead of uniform grid expansion (NPAG) or random injection (NPSAH), NPCAT generates +//! candidate points along directions of high parameter uncertainty using Fisher Information. +//! +//! ### 2. Quasi-Random Global Checks +//! Uses Sobol low-discrepancy sequences instead of Monte Carlo for provably better +//! coverage of the parameter space during global optimality verification. +//! +//! ### 3. Hierarchical Convergence State Machine +//! Three-phase approach: Exploring → Refining → Polishing, with adaptive behavior in each phase. +//! +//! ### 4. Selective Local Refinement +//! Only refines high-weight support points, with iteration count adapting to cycle number. +//! +//! ## Algorithm Phases +//! +//! ### Phase 1: Exploring +//! High expansion rate with information-guided candidate generation. +//! Transitions to Refining when objective function stabilizes AND coverage is sufficient. +//! +//! ### Phase 2: Refining +//! Balanced expansion and refinement. Runs periodic global optimality checks. +//! Transitions to Polishing when global check passes AND objective is stable. +//! +//! ### Phase 3: Polishing +//! No expansion, full refinement of all surviving points. +//! Converges when P(Y|L) criterion is met. 
+ +use crate::algorithms::{ + NativeNonparametricConfig, NonparametricAlgorithmInput, Status, StopReason, +}; +use crate::estimation::nonparametric::ipm::burke; +use crate::estimation::nonparametric::qr; +use crate::estimation::nonparametric::sample_space_for_parameters; +use crate::estimation::nonparametric::{ + calculate_psi, CycleLog, NPCycle, NonparametricWorkspace, Psi, Theta, Weights, +}; +use crate::prelude::algorithms::Algorithms; + +use anyhow::{bail, Result}; +use ndarray::parallel::prelude::{IntoParallelRefMutIterator, ParallelIterator}; +use ndarray::Array1; +use pharmsol::prelude::AssayErrorModel; +use pharmsol::prelude::{ + data::{AssayErrorModels, Data}, + simulator::Equation, +}; +use rand::prelude::*; +use sobol_burley::sample; + +// ============================================================================ +// ALGORITHM CONSTANTS +// ============================================================================ + +// Convergence thresholds +/// Weight stability convergence threshold +const THETA_W: f64 = 1e-3; +/// Objective function convergence threshold +const THETA_G: f64 = 1e-4; +/// Global optimality D-criterion threshold +const THETA_D_GLOBAL: f64 = 0.01; +/// P(Y|L) convergence criterion +const THETA_F: f64 = 1e-2; +/// Minimum distance between support points +const MIN_DISTANCE: f64 = 1e-4; + +// Expansion parameters +/// Initial number of candidates to add per cycle +const INITIAL_K: usize = 40; +/// Decay rate for candidate count per cycle +const K_DECAY_RATE: f64 = 0.95; +/// Minimum candidates to add +const MIN_K: usize = 4; + +// Refinement parameters +/// Base iterations for L-BFGS-B (will increase with cycle) +const BASE_OPTIM_ITERS: u64 = 20; +/// Additional iterations per log(cycle) +const OPTIM_ITER_GROWTH: u64 = 10; +/// Tolerance for local optimization +const OPTIM_TOLERANCE: f64 = 1e-4; + +// Global check parameters +/// Number of Sobol samples for global optimality check +const SOBOL_SAMPLES: usize = 256; +/// Interval (in 
cycles) between global checks +const GLOBAL_CHECK_INTERVAL: usize = 5; + +// Phase transition parameters +/// Cycles of stability required to transition from Exploring to Refining +const EXPLORING_STABILITY_WINDOW: usize = 3; +/// Cycles of stability required to transition from Refining to Polishing +const REFINING_STABILITY_WINDOW: usize = 5; +/// Cycles of stability required in Polishing for final convergence +const POLISHING_STABILITY_WINDOW: usize = 3; + +// Candidate generation ratios +/// Fraction of candidates from Fisher Information directions +const FISHER_RATIO: f64 = 0.60; +/// Fraction of candidates from D-optimal perturbations +const DOPT_RATIO: f64 = 0.30; +/// Fraction of candidates from boundary exploration +const BOUNDARY_RATIO: f64 = 0.10; + +// ============================================================================ +// CONVERGENCE STATE MACHINE +// ============================================================================ + +/// Convergence state for the hierarchical state machine +#[derive(Debug, Clone, PartialEq)] +pub enum ConvergenceState { + /// High expansion rate, building initial coverage + Exploring, + /// Balanced expansion/refinement, periodic global checks + Refining, + /// No expansion, full refinement of all points + Polishing, + /// Algorithm has converged + Converged, +} + +impl std::fmt::Display for ConvergenceState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ConvergenceState::Exploring => write!(f, "Exploring"), + ConvergenceState::Refining => write!(f, "Refining"), + ConvergenceState::Polishing => write!(f, "Polishing"), + ConvergenceState::Converged => write!(f, "Converged"), + } + } +} + +// ============================================================================ +// NPCAT STRUCT +// ============================================================================ + +/// NPCAT: Non-Parametric Covariance-Adaptive Trajectory Algorithm +/// +/// A novel non-parametric 
population PK/PD algorithm that combines: +/// - Fisher Information-guided exploration +/// - Sobol quasi-random global optimality checks +/// - Adaptive convergence state machine +#[derive(Debug)] +pub struct NPCAT { + /// The pharmacometric equation/model + equation: E, + /// Parameter ranges for each dimension + ranges: Vec<(f64, f64)>, + /// Probability matrix: P(y_i | θ_j) + psi: Psi, + /// Support points (parameter values) + theta: Theta, + /// Weights from IPM before condensation + lambda: Weights, + /// Final weights after condensation + w: Weights, + /// Previous weights for stability check + w_prev: Weights, + /// Current objective function value + objf: f64, + /// Previous objective function value + last_objf: f64, + /// P(Y|L) values for convergence checking + f0: f64, + f1: f64, + /// Current cycle number + cycle: usize, + /// Step sizes for error model optimization + gamma_delta: Vec, + /// Error models for observations + error_models: AssayErrorModels, + /// Algorithm status + status: Status, + /// Cycle log for tracking progress + cycle_log: CycleLog, + /// Subject data + data: Data, + /// Unified runtime/model-derived configuration + config: NativeNonparametricConfig, + + // NPCAT specific fields + /// Current convergence state + convergence_state: ConvergenceState, + /// History of objective function values + objf_history: Vec, + /// Random number generator + rng: StdRng, + /// Current number of candidates to add (decays over cycles) + current_k: f64, + /// Estimated Fisher Information Matrix (diagonal approximation) + fisher_diagonal: Vec, + /// Last global optimality check result + last_global_d_max: f64, + /// Cycle when last global check was performed + last_global_check_cycle: usize, + /// Flag for whether global check passed + global_check_passed: bool, +} + +// ============================================================================ +// ALGORITHMS TRAIT IMPLEMENTATION +// 
============================================================================ + +impl Algorithms for NPCAT { + fn equation(&self) -> &E { + &self.equation + } + + fn into_workspace(&self) -> Result> { + NonparametricWorkspace::new( + self.equation.clone(), + self.data.clone(), + self.theta.clone(), + self.psi.clone(), + self.w.clone(), + -2. * self.objf, + self.cycle, + self.status.clone(), + self.config.run_configuration.clone(), + self.cycle_log.clone(), + ) + } + + fn error_models(&self) -> &AssayErrorModels { + &self.error_models + } + + fn data(&self) -> &Data { + &self.data + } + + fn get_prior(&self) -> Theta { + sample_space_for_parameters(&self.config.parameter_space, &self.config.prior).unwrap() + } + + fn likelihood(&self) -> f64 { + self.objf + } + + fn increment_cycle(&mut self) -> usize { + self.cycle += 1; + + // Decay the candidate count + self.current_k = (self.current_k * K_DECAY_RATE).max(MIN_K as f64); + + self.cycle + } + + fn cycle(&self) -> usize { + self.cycle + } + + fn set_theta(&mut self, theta: Theta) { + self.theta = theta; + } + + fn theta(&self) -> &Theta { + &self.theta + } + + fn psi(&self) -> &Psi { + &self.psi + } + + fn set_status(&mut self, status: Status) { + self.status = status; + } + + fn status(&self) -> &Status { + &self.status + } + + fn evaluation(&mut self) -> Result { + tracing::info!("Objective function = {:.4}", -2.0 * self.objf); + tracing::debug!("Support points: {}", self.theta.nspp()); + tracing::debug!( + "Phase: {} | Candidates/cycle: {:.1}", + self.convergence_state, + self.current_k + ); + + self.error_models.iter().for_each(|(outeq, em)| { + if AssayErrorModel::None == *em { + return; + } + tracing::debug!( + "Error model for outeq {}: {:.4}", + outeq, + em.factor().unwrap_or_default() + ); + }); + + // Track objective function history + self.objf_history.push(self.objf); + + // Warn if objective function decreased (instability) + if self.last_objf > self.objf + 1e-4 { + tracing::warn!( + "Objective function 
decreased from {:.4} to {:.4} (delta = {:.6})", + -2.0 * self.last_objf, + -2.0 * self.objf, + -2.0 * (self.last_objf - self.objf) + ); + } + + // Update convergence state machine + self.update_convergence_state()?; + + if self.convergence_state == ConvergenceState::Converged { + tracing::info!( + "NPCAT converged after {} cycles (state machine)", + self.cycle + ); + self.set_status(Status::Stop(StopReason::Converged)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // Check maximum cycles + if self.cycle >= self.config.max_cycles { + tracing::warn!("Maximum number of cycles reached"); + self.set_status(Status::Stop(StopReason::MaxCycles)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // Check for stop file + if std::path::Path::new("stop").exists() { + tracing::warn!("Stopfile detected - breaking"); + self.set_status(Status::Stop(StopReason::Stopped)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // Continue with normal operation + self.set_status(Status::Continue); + self.log_cycle_state(); + Ok(self.status().clone()) + } + + fn estimation(&mut self) -> Result<()> { + self.psi = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &self.error_models, + self.cycle == 1 && self.config.progress, + )?; + + if let Err(err) = self.validate_psi() { + bail!(err); + } + + (self.lambda, _) = match burke(&self.psi) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM during estimation: {:?}", err); + } + }; + Ok(()) + } + + fn condensation(&mut self) -> Result<()> { + // Store previous weights for stability check + self.w_prev = self.w.clone(); + + // Lambda-filter: Remove points with very low weight + let max_lambda = self + .lambda + .iter() + .fold(f64::NEG_INFINITY, |acc, x| x.max(acc)); + + let mut keep = Vec::::new(); + let filter_threshold = max_lambda / 1000_f64; + + for (index, lam) in self.lambda.iter().enumerate() { + if lam > filter_threshold { + 
keep.push(index); + } + } + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!( + "Lambda filter dropped {} support point(s)", + self.psi.matrix().ncols() - keep.len(), + ); + } + + self.theta.filter_indices(keep.as_slice()); + self.psi.filter_column_indices(keep.as_slice()); + + // Rank-Revealing QR Factorization + let (r, perm) = qr::qrd(&self.psi)?; + + let mut keep = Vec::::new(); + let keep_n = self.psi.matrix().ncols().min(self.psi.matrix().nrows()); + + for i in 0..keep_n { + let test = r.col(i).norm_l2(); + let r_diag_val = r.get(i, i); + let ratio = r_diag_val / test; + if ratio.abs() >= 1e-8 { + keep.push(*perm.get(i).unwrap()); + } + } + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!( + "QR decomposition dropped {} support point(s)", + self.psi.matrix().ncols() - keep.len(), + ); + } + + self.theta.filter_indices(keep.as_slice()); + self.psi.filter_column_indices(keep.as_slice()); + + self.validate_psi()?; + + (self.lambda, self.objf) = match burke(&self.psi) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + return Err(anyhow::anyhow!( + "Error in IPM during condensation: {:?}", + err + )); + } + }; + self.w = self.lambda.clone(); + + // Update Fisher Information estimate after condensation + self.update_fisher_information(); + + Ok(()) + } + + fn optimizations(&mut self) -> Result<()> { + // Error model optimization (same as NPAG/NPOD) + self.error_models + .clone() + .iter_mut() + .filter_map(|(outeq, em)| { + if em.optimize() { + Some((outeq, em)) + } else { + None + } + }) + .try_for_each(|(outeq, em)| -> Result<()> { + let gamma_up = em.factor()? * (1.0 + self.gamma_delta[outeq]); + let gamma_down = em.factor()? 
/ (1.0 + self.gamma_delta[outeq]); + + let mut error_model_up = self.error_models.clone(); + error_model_up.set_factor(outeq, gamma_up)?; + + let mut error_model_down = self.error_models.clone(); + error_model_down.set_factor(outeq, gamma_down)?; + + let psi_up = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &error_model_up, + false, + )?; + let psi_down = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &error_model_down, + false, + )?; + + let (lambda_up, objf_up) = match burke(&psi_up) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM during optim: {:?}", err); + } + }; + let (lambda_down, objf_down) = match burke(&psi_down) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM during optim: {:?}", err); + } + }; + + if objf_up > self.objf { + self.error_models.set_factor(outeq, gamma_up)?; + self.objf = objf_up; + self.gamma_delta[outeq] *= 4.; + self.lambda = lambda_up; + self.psi = psi_up; + } + if objf_down > self.objf { + self.error_models.set_factor(outeq, gamma_down)?; + self.objf = objf_down; + self.gamma_delta[outeq] *= 4.; + self.lambda = lambda_down; + self.psi = psi_down; + } + self.gamma_delta[outeq] *= 0.5; + if self.gamma_delta[outeq] <= 0.01 { + self.gamma_delta[outeq] = 0.1; + } + Ok(()) + })?; + + Ok(()) + } + + fn expansion(&mut self) -> Result<()> { + match self.convergence_state { + ConvergenceState::Exploring => { + // High expansion rate + self.information_guided_expansion()?; + } + ConvergenceState::Refining => { + // Balanced: local refinement + moderate expansion + self.selective_local_refinement()?; + self.information_guided_expansion()?; + + // Periodic global check + if self.cycle - self.last_global_check_cycle >= GLOBAL_CHECK_INTERVAL { + self.perform_global_optimality_check()?; + } + } + ConvergenceState::Polishing => { + // No expansion, just full refinement + self.full_local_refinement()?; + } + ConvergenceState::Converged => { + // No 
expansion when converged + } + } + Ok(()) + } + + fn log_cycle_state(&mut self) { + let state = NPCycle::new( + self.cycle, + -2. * self.objf, + self.error_models.clone(), + self.theta.clone(), + self.theta.nspp(), + (self.last_objf - self.objf).abs(), + self.status.clone(), + ); + self.cycle_log.push(state); + self.last_objf = self.objf; + } +} + +// ============================================================================ +// NPCAT SPECIFIC METHODS +// ============================================================================ + +impl NPCAT { + pub(crate) fn from_input(input: NonparametricAlgorithmInput) -> Result> { + let config = input.native_config()?; + let seed = config.prior.seed().unwrap_or(42) as u64; + let n_params = config.ranges.len(); + let error_models = input.error_models().clone(); + + Ok(Box::new(Self { + equation: input.equation, + ranges: config.ranges.clone(), + psi: Psi::new(), + theta: Theta::new(), + lambda: Weights::default(), + w: Weights::default(), + w_prev: Weights::default(), + objf: f64::NEG_INFINITY, + last_objf: -1e30, + f0: -1e30, + f1: f64::default(), + cycle: 0, + gamma_delta: vec![0.1; error_models.len()], + error_models, + status: Status::Continue, + cycle_log: CycleLog::new(), + data: input.data, + config, + convergence_state: ConvergenceState::Exploring, + objf_history: Vec::new(), + rng: StdRng::seed_from_u64(seed), + current_k: INITIAL_K as f64, + fisher_diagonal: vec![1.0; n_params], + last_global_d_max: f64::INFINITY, + last_global_check_cycle: 0, + global_check_passed: false, + })) + } + + /// Update the convergence state machine based on current algorithm state + fn update_convergence_state(&mut self) -> Result<()> { + match self.convergence_state { + ConvergenceState::Exploring => { + // Transition to Refining when objf is stable AND we have sufficient coverage + if self.objf_stable(EXPLORING_STABILITY_WINDOW) && self.coverage_sufficient() { + tracing::info!( + "NPCAT: Transitioning from Exploring to Refining at 
cycle {}", + self.cycle + ); + self.convergence_state = ConvergenceState::Refining; + } + } + ConvergenceState::Refining => { + // Transition to Polishing when global check passes AND objf is stable AND weights stable + let weights_ok = self.weights_stable(); + if self.objf_stable(REFINING_STABILITY_WINDOW) + && self.global_check_passed + && weights_ok + { + tracing::info!( + "NPCAT: Transitioning from Refining to Polishing at cycle {}", + self.cycle + ); + self.convergence_state = ConvergenceState::Polishing; + } + } + ConvergenceState::Polishing => { + // Final convergence check: objf stable, weights stable, and P(Y|L) criterion + if self.objf_stable(POLISHING_STABILITY_WINDOW) + && self.weights_stable() + && self.pyl_criterion_met()? + { + tracing::info!("NPCAT: Convergence achieved at cycle {}", self.cycle); + self.convergence_state = ConvergenceState::Converged; + } + } + ConvergenceState::Converged => { + // Already converged, do nothing + } + } + Ok(()) + } + + /// Check if objective function has been stable over recent cycles + fn objf_stable(&self, window: usize) -> bool { + if self.objf_history.len() < window { + return false; + } + + let recent: Vec = self + .objf_history + .iter() + .rev() + .take(window) + .cloned() + .collect(); + + recent.windows(2).all(|w| (w[0] - w[1]).abs() < THETA_G) + } + + /// Check if weight distribution has been stable + fn weights_stable(&self) -> bool { + if self.w.len() != self.w_prev.len() || self.w.len() == 0 { + return false; + } + + let max_change = self + .w + .iter() + .zip(self.w_prev.iter()) + .map(|(w_new, w_old)| { + if w_new > 1e-10 { + ((w_new - w_old) / w_new).abs() + } else { + 0.0 + } + }) + .fold(0.0_f64, |a, b| a.max(b)); + + max_change < THETA_W + } + + /// Check if we have sufficient coverage of the parameter space + fn coverage_sufficient(&self) -> bool { + // Heuristic: we have at least 2*d support points where d = dimensions + let min_points = 2 * self.ranges.len(); + self.theta.nspp() >= min_points + 
} + + /// Check P(Y|L) convergence criterion + fn pyl_criterion_met(&mut self) -> Result { + let psi = self.psi.matrix(); + let pyl = psi * self.w.weights(); + self.f1 = pyl.iter().map(|x| x.ln()).sum(); + + let met = (self.f1 - self.f0).abs() <= THETA_F; + + if !met { + self.f0 = self.f1; + } + + Ok(met) + } + + /// Update Fisher Information diagonal approximation + fn update_fisher_information(&mut self) { + let n_params = self.ranges.len(); + let n_spp = self.theta.nspp(); + + if n_spp < 2 { + // Not enough points to estimate variance + self.fisher_diagonal = vec![1.0; n_params]; + return; + } + + // Estimate parameter variance from current support points weighted by their probabilities + // This is a simple empirical approximation to Fisher Information + let mut means = vec![0.0; n_params]; + let mut variances = vec![0.0; n_params]; + + // Compute weighted means + for (i, spp) in self.theta.matrix().row_iter().enumerate() { + let weight = if i < self.w.len() { self.w[i] } else { 0.0 }; + for (j, val) in spp.iter().enumerate() { + means[j] += weight * val; + } + } + + // Compute weighted variances + for (i, spp) in self.theta.matrix().row_iter().enumerate() { + let weight = if i < self.w.len() { self.w[i] } else { 0.0 }; + for (j, val) in spp.iter().enumerate() { + variances[j] += weight * (val - means[j]).powi(2); + } + } + + // Fisher Information is inversely related to variance + // High variance = low information = need more exploration in that direction + for (j, var) in variances.iter().enumerate() { + // Add small regularization to avoid division by zero + // Larger value = more exploration needed in that dimension + let range_scale = (self.ranges[j].1 - self.ranges[j].0).powi(2); + self.fisher_diagonal[j] = var.max(1e-10) / range_scale; + } + + tracing::debug!( + "Fisher Information diagonal (variance-based): {:?}", + self.fisher_diagonal + ); + } + + /// Information-guided candidate generation and expansion + fn information_guided_expansion(&mut self) 
-> Result<()> { + let n_candidates = self.current_k.ceil() as usize; + + let mut candidates = Vec::new(); + + // Calculate how many candidates from each source + let n_fisher = ((n_candidates as f64) * FISHER_RATIO).ceil() as usize; + let n_dopt = ((n_candidates as f64) * DOPT_RATIO).ceil() as usize; + let n_boundary = ((n_candidates as f64) * BOUNDARY_RATIO).ceil() as usize; + + // 1. Fisher Information-guided candidates (high variance directions) + candidates.extend(self.generate_fisher_candidates(n_fisher)); + + // 2. D-optimal perturbation candidates + candidates.extend(self.generate_dopt_candidates(n_dopt)?); + + // 3. Boundary exploration candidates + candidates.extend(self.generate_boundary_candidates(n_boundary)); + + // Filter candidates by minimum distance and add to theta + let mut added = 0; + for candidate in candidates { + if self.is_within_bounds(&candidate) && self.theta.check_point(&candidate, MIN_DISTANCE) + { + self.theta.add_point(&candidate)?; + added += 1; + } + } + + tracing::debug!( + "Information-guided expansion: added {} candidates (target: {})", + added, + n_candidates + ); + + Ok(()) + } + + /// Generate candidates along high-variance (low Fisher Information) directions + fn generate_fisher_candidates(&mut self, n: usize) -> Vec> { + let mut candidates = Vec::new(); + + if self.theta.nspp() == 0 { + return candidates; + } + + // Find dimensions with highest variance (lowest information, need more exploration) + let mut dim_indices: Vec<(usize, f64)> = self + .fisher_diagonal + .iter() + .enumerate() + .map(|(i, &fi)| (i, fi)) + .collect(); + + // Sort by variance descending (explore high-variance directions) + dim_indices.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + // Generate candidates along top variance directions + let top_dims: Vec = dim_indices + .iter() + .take((self.ranges.len() + 1) / 2) // Top half of dimensions + .map(|(i, _)| *i) + .collect(); + + for spp in self.theta.matrix().row_iter() { + let base: Vec = 
spp.iter().cloned().collect(); + + for &dim in &top_dims { + if candidates.len() >= n { + break; + } + + // Adaptive step size based on variance + let variance = self.fisher_diagonal[dim]; + let range = self.ranges[dim].1 - self.ranges[dim].0; + let step = (variance.sqrt() * range).max(range * 0.05).min(range * 0.3); + + // Positive direction + let mut plus = base.clone(); + plus[dim] += step; + if plus[dim] <= self.ranges[dim].1 { + candidates.push(plus); + } + + // Negative direction + let mut minus = base.clone(); + minus[dim] -= step; + if minus[dim] >= self.ranges[dim].0 { + candidates.push(minus); + } + } + } + + // Shuffle and take n + candidates.shuffle(&mut self.rng); + candidates.truncate(n); + candidates + } + + /// Generate candidates using D-optimal perturbations + fn generate_dopt_candidates(&self, n: usize) -> Result>> { + let mut candidates = Vec::new(); + + if self.theta.nspp() == 0 || self.w.len() == 0 { + return Ok(candidates); + } + + // Compute P(Y|L) for D-criterion + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + // Get high-weight points (above median weight) + let mut weights: Vec = self.w.iter().collect(); + weights.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let median_weight = weights.get(weights.len() / 2).cloned().unwrap_or(0.0); + + for (i, spp) in self.theta.matrix().row_iter().enumerate() { + if candidates.len() >= n { + break; + } + + // Only perturb high-weight points + if i >= self.w.len() || self.w[i] < median_weight { + continue; + } + + let base: Vec = spp.iter().cloned().collect(); + + // Generate perturbation in direction of steepest D-criterion increase + // Approximate gradient by finite differences + let d_base = self.compute_d_criterion(&base, &pyl)?; + + for dim in 0..self.ranges.len() { + let range = self.ranges[dim].1 - self.ranges[dim].0; + let delta = range * 0.02; + + // Plus direction + let mut plus = base.clone(); + plus[dim] = (plus[dim] + 
delta).min(self.ranges[dim].1); + let d_plus = self.compute_d_criterion(&plus, &pyl)?; + + // Minus direction + let mut minus = base.clone(); + minus[dim] = (minus[dim] - delta).max(self.ranges[dim].0); + let d_minus = self.compute_d_criterion(&minus, &pyl)?; + + // Move in direction of increasing D + if d_plus > d_base && d_plus > d_minus { + candidates.push(plus); + } else if d_minus > d_base { + candidates.push(minus); + } + } + } + + candidates.truncate(n); + Ok(candidates) + } + + /// Generate candidates near parameter boundaries + fn generate_boundary_candidates(&mut self, n: usize) -> Vec> { + let mut candidates = Vec::new(); + + // Generate points near boundaries in each dimension + for _ in 0..n { + let mut point = Vec::new(); + + for (lo, hi) in &self.ranges { + // Randomly choose near-boundary or interior + let val = if self.rng.random::() < 0.5 { + // Near lower boundary + lo + (hi - lo) * self.rng.random::() * 0.1 + } else { + // Near upper boundary + hi - (hi - lo) * self.rng.random::() * 0.1 + }; + point.push(val); + } + + candidates.push(point); + } + + candidates + } + + /// Check if a point is within parameter bounds + fn is_within_bounds(&self, point: &[f64]) -> bool { + point + .iter() + .zip(self.ranges.iter()) + .all(|(val, (lo, hi))| *val >= *lo && *val <= *hi) + } + + /// Selective local refinement for high-weight points only + fn selective_local_refinement(&mut self) -> Result<()> { + if self.theta.nspp() == 0 || self.w.len() == 0 { + return Ok(()); + } + + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + // Get median weight + let mut weights: Vec = self.w.iter().collect(); + weights.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let median_weight = weights.get(weights.len() / 2).cloned().unwrap_or(0.0); + + let n_points_with_weights = self.w.len().min(self.theta.nspp()); + let max_iters = BASE_OPTIM_ITERS + OPTIM_ITER_GROWTH * (self.cycle as f64).ln() as u64; + + let mut 
candidate_points: Vec<(Array1, bool)> = Vec::default(); + + // Collect points with refinement flag + for (idx, spp) in self + .theta + .matrix() + .row_iter() + .enumerate() + .take(n_points_with_weights) + { + let candidate: Vec = spp.iter().cloned().collect(); + let should_refine = self.w[idx] >= median_weight; + candidate_points.push((Array1::from(candidate), should_refine)); + } + + // Optimize points in parallel + candidate_points + .par_iter_mut() + .for_each(|(spp, should_refine)| { + if !*should_refine { + return; + } + + let optimizer = NpcatOptimizer::new( + &self.equation, + &self.data, + &self.error_models, + &pyl, + max_iters, + &self.ranges, + ); + + if let Ok(optimized) = optimizer.optimize_point(spp.clone()) { + *spp = optimized; + } + }); + + // Add optimized points to theta + for (cp, _) in candidate_points { + self.theta + .suggest_point(cp.to_vec().as_slice(), MIN_DISTANCE)?; + } + + Ok(()) + } + + /// Full local refinement for all points (Polishing phase) + fn full_local_refinement(&mut self) -> Result<()> { + if self.theta.nspp() == 0 || self.w.len() == 0 { + return Ok(()); + } + + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + let n_points = self.theta.nspp().min(self.w.len()); + let max_iters = BASE_OPTIM_ITERS * 2 + OPTIM_ITER_GROWTH * (self.cycle as f64).ln() as u64; + + let mut candidate_points: Vec> = Vec::default(); + + for spp in self.theta.matrix().row_iter().take(n_points) { + let candidate: Vec = spp.iter().cloned().collect(); + candidate_points.push(Array1::from(candidate)); + } + + // Optimize all points in parallel + candidate_points.par_iter_mut().for_each(|spp| { + let optimizer = NpcatOptimizer::new( + &self.equation, + &self.data, + &self.error_models, + &pyl, + max_iters, + &self.ranges, + ); + + if let Ok(optimized) = optimizer.optimize_point(spp.clone()) { + *spp = optimized; + } + }); + + // Add optimized points to theta + for cp in candidate_points { + 
self.theta + .suggest_point(cp.to_vec().as_slice(), MIN_DISTANCE)?; + } + + Ok(()) + } + + /// Perform global optimality check using Sobol quasi-random sequence + fn perform_global_optimality_check(&mut self) -> Result<()> { + self.last_global_check_cycle = self.cycle; + + if self.theta.nspp() == 0 || self.w.len() == 0 { + self.global_check_passed = false; + return Ok(()); + } + + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + let mut max_d = f64::NEG_INFINITY; + let n_dims = self.ranges.len(); + + // Generate Sobol sequence samples + for i in 0..SOBOL_SAMPLES { + // sobol_burley::sample returns values in [0, 1] + // We need to scale to our parameter ranges + let mut point = Vec::with_capacity(n_dims); + + for dim in 0..n_dims { + let sobol_val = sample(i as u32, dim as u32, 0); + let (lo, hi) = self.ranges[dim]; + let scaled = lo + sobol_val as f64 * (hi - lo); + point.push(scaled); + } + + let d_value = self.compute_d_criterion(&point, &pyl)?; + max_d = max_d.max(d_value); + } + + self.last_global_d_max = max_d; + self.global_check_passed = max_d < THETA_D_GLOBAL; + + tracing::debug!( + "Global optimality check: max_D = {:.6} (threshold: {:.6}) -> {}", + max_d, + THETA_D_GLOBAL, + if self.global_check_passed { + "PASSED" + } else { + "FAILED" + } + ); + + // If check failed, inject best point found + if !self.global_check_passed { + // Do another pass to find the best point + let mut best_point = vec![0.0; n_dims]; + let mut best_d = f64::NEG_INFINITY; + + for i in 0..SOBOL_SAMPLES { + let mut point = Vec::with_capacity(n_dims); + for dim in 0..n_dims { + let sobol_val = sample(i as u32, dim as u32, 0); + let (lo, hi) = self.ranges[dim]; + point.push(lo + sobol_val as f64 * (hi - lo)); + } + + let d = self.compute_d_criterion(&point, &pyl)?; + if d > best_d { + best_d = d; + best_point = point; + } + } + + // Inject best point if it passes minimum distance check + if 
self.theta.check_point(&best_point, MIN_DISTANCE) { + self.theta.add_point(&best_point)?; + tracing::debug!("Injected high-D point from global check: D = {:.6}", best_d); + } + } + + Ok(()) + } + + /// Compute D-criterion for a candidate point + fn compute_d_criterion(&self, point: &[f64], pyl: &Array1) -> Result { + let theta_single = ndarray::Array1::from(point.to_vec()).insert_axis(ndarray::Axis(0)); + + let psi_single = pharmsol::prelude::simulator::log_likelihood_matrix( + &self.equation, + &self.data, + &theta_single, + &self.error_models, + false, + )? + .mapv(f64::exp); + + let nsub = psi_single.nrows() as f64; + let mut d_sum = -nsub; + + for (p_i, pyl_i) in psi_single.iter().zip(pyl.iter()) { + d_sum += p_i / pyl_i; + } + + Ok(d_sum) + } +} + +// ============================================================================ +// NPCAT OPTIMIZER (Nelder-Mead based) +// ============================================================================ + +use argmin::{ + core::{CostFunction, Error, Executor}, + solver::neldermead::NelderMead, +}; +use ndarray::Axis; + +/// Support Point Optimizer for NPCAT with bounds checking +struct NpcatOptimizer<'a, E: Equation> { + equation: &'a E, + data: &'a Data, + sig: &'a AssayErrorModels, + pyl: &'a Array1, + max_iters: u64, + ranges: &'a [(f64, f64)], +} + +impl CostFunction for NpcatOptimizer<'_, E> { + type Param = Vec; + type Output = f64; + + fn cost(&self, spp: &Self::Param) -> Result { + // Apply bounds + let bounded: Vec = spp + .iter() + .zip(self.ranges.iter()) + .map(|(val, (lo, hi))| val.clamp(*lo, *hi)) + .collect(); + + let theta = Array1::from(bounded).insert_axis(Axis(0)); + + let psi = pharmsol::prelude::simulator::log_likelihood_matrix( + self.equation, + self.data, + &theta, + self.sig, + false, + )? 
+ .mapv(f64::exp); + + let nsub = psi.nrows() as f64; + let mut sum = -nsub; + for (p_i, pyl_i) in psi.iter().zip(self.pyl.iter()) { + sum += p_i / pyl_i; + } + Ok(-sum) // Minimize negative D → Maximize D + } +} + +impl<'a, E: Equation> NpcatOptimizer<'a, E> { + fn new( + equation: &'a E, + data: &'a Data, + sig: &'a AssayErrorModels, + pyl: &'a Array1, + max_iters: u64, + ranges: &'a [(f64, f64)], + ) -> Self { + Self { + equation, + data, + sig, + pyl, + max_iters, + ranges, + } + } + + fn optimize_point(self, spp: Array1) -> Result, Error> { + let simplex = create_initial_simplex(&spp.to_vec(), self.ranges); + let tolerance = OPTIM_TOLERANCE; + let max_iters = self.max_iters; + + let solver: NelderMead, f64> = + NelderMead::new(simplex).with_sd_tolerance(tolerance)?; + + let res = Executor::new(self, solver) + .configure(|state| state.max_iters(max_iters)) + .run()?; + + // Apply bounds to result + let result = res.state.best_param.unwrap(); + Ok(Array1::from(result)) + } +} + +/// Create initial simplex for Nelder-Mead optimization with bounds awareness +fn create_initial_simplex(initial_point: &[f64], ranges: &[(f64, f64)]) -> Vec> { + let num_dimensions = initial_point.len(); + let perturbation_percentage = 0.05; + + let mut vertices = Vec::new(); + vertices.push(initial_point.to_vec()); + + for i in 0..num_dimensions { + let range = ranges[i].1 - ranges[i].0; + let perturbation = if initial_point[i] == 0.0 { + range * 0.001 + } else { + perturbation_percentage * initial_point[i].abs() + }; + + let mut perturbed_point = initial_point.to_vec(); + + // Ensure perturbation stays within bounds + let new_val = initial_point[i] + perturbation; + if new_val <= ranges[i].1 { + perturbed_point[i] = new_val; + } else { + perturbed_point[i] = initial_point[i] - perturbation; + } + + vertices.push(perturbed_point); + } + + vertices +} + +// ============================================================================ +// TESTS +// 
============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_convergence_state_display() { + assert_eq!(format!("{}", ConvergenceState::Exploring), "Exploring"); + assert_eq!(format!("{}", ConvergenceState::Refining), "Refining"); + assert_eq!(format!("{}", ConvergenceState::Polishing), "Polishing"); + assert_eq!(format!("{}", ConvergenceState::Converged), "Converged"); + } + + #[test] + fn test_initial_simplex_bounds() { + let point = vec![0.5, 0.95]; // Second value near upper bound + let ranges = vec![(0.0, 1.0), (0.0, 1.0)]; + let simplex = create_initial_simplex(&point, &ranges); + + assert_eq!(simplex.len(), 3); // n+1 vertices + for vertex in &simplex { + for (i, val) in vertex.iter().enumerate() { + assert!(*val >= ranges[i].0 && *val <= ranges[i].1); + } + } + } + + #[test] + fn test_constants_validity() { + assert!(THETA_W > 0.0 && THETA_W < 1.0); + assert!(THETA_G > 0.0); + assert!(THETA_D_GLOBAL > 0.0); + assert!(THETA_F > 0.0); + assert!(MIN_DISTANCE > 0.0); + assert!(INITIAL_K > 0); + assert!(K_DECAY_RATE > 0.0 && K_DECAY_RATE < 1.0); + assert!(MIN_K > 0); + assert!(SOBOL_SAMPLES > 0); + assert!(FISHER_RATIO + DOPT_RATIO + BOUNDARY_RATIO <= 1.01); // Allow small float error + } +} diff --git a/src/algorithms/nonparametric/npcma/cma.rs b/src/algorithms/nonparametric/npcma/cma.rs new file mode 100644 index 000000000..c92c750b6 --- /dev/null +++ b/src/algorithms/nonparametric/npcma/cma.rs @@ -0,0 +1,340 @@ +//! CMA-ES Core Implementation +//! +//! Covariance Matrix Adaptation Evolution Strategy components. 
+ +use super::constants::*; +use ndarray::{Array1, Array2}; +use rand::prelude::*; + +/// CMA-ES State +#[derive(Debug, Clone)] +pub struct CmaState { + /// Mean of the distribution (center) + pub mean: Array1, + /// Step size (overall scale) + pub sigma: f64, + /// Covariance matrix + pub c: Array2, + /// Evolution path for sigma + pub p_sigma: Array1, + /// Evolution path for C + pub p_c: Array1, + /// Number of dimensions + pub n_dims: usize, + /// Parameter ranges + pub ranges: Vec<(f64, f64)>, + /// Generation counter + pub generation: usize, + /// Best fitness seen + pub best_fitness: f64, + /// Generations without improvement + pub stagnation: usize, +} + +impl CmaState { + pub fn new(n_dims: usize, ranges: &[(f64, f64)], seed: u64) -> Self { + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + + // Initialize mean at center of domain + let mean: Array1 = ranges + .iter() + .map(|&(lo, hi)| lo + rng.random::() * (hi - lo)) + .collect(); + + // Identity covariance (scaled by range) + let mut c = Array2::::eye(n_dims); + for i in 0..n_dims { + let (lo, hi) = ranges[i]; + let scale = (hi - lo) / 4.0; // Initial scale + c[[i, i]] = scale * scale; + } + + // Evolution paths start at zero + let p_sigma = Array1::zeros(n_dims); + let p_c = Array1::zeros(n_dims); + + Self { + mean, + sigma: INITIAL_SIGMA, + c, + p_sigma, + p_c, + n_dims, + ranges: ranges.to_vec(), + generation: 0, + best_fitness: f64::NEG_INFINITY, + stagnation: 0, + } + } + + /// Sample a population from the current distribution + pub fn sample_population(&self, n: usize, rng: &mut R) -> Vec> { + let mut population = Vec::with_capacity(n); + + // Simple sampling: mean + sigma * C^(1/2) * z + // For simplicity, we use diagonal approximation initially + for _ in 0..n { + let mut sample = self.mean.clone(); + + for i in 0..self.n_dims { + let std = (self.c[[i, i]]).sqrt() * self.sigma; + let z: f64 = sample_standard_normal(rng); + sample[i] += std * z; + + // Clamp to bounds + let (lo, hi) = 
self.ranges[i]; + sample[i] = sample[i].clamp(lo, hi); + } + + population.push(sample); + } + + population + } + + /// Update the CMA-ES state given sorted population (best first) + pub fn update(&mut self, sorted_population: &[Array1], sorted_fitness: &[f64]) { + self.generation += 1; + + // Check for improvement + if sorted_fitness[0] > self.best_fitness { + self.best_fitness = sorted_fitness[0]; + self.stagnation = 0; + } else { + self.stagnation += 1; + } + + // Compute weighted mean of best individuals + let mu = N_PARENTS.min(sorted_population.len()); + let weights = compute_weights(mu); + + let mut new_mean = Array1::zeros(self.n_dims); + for i in 0..mu { + new_mean = new_mean + weights[i] * &sorted_population[i]; + } + + // Mean displacement + let y = (&new_mean - &self.mean) / self.sigma; + + // Update evolution path for sigma (cumulative step-size adaptation) + let c_s = C_SIGMA; + let chi_n = (self.n_dims as f64).sqrt(); // Expected length of N(0,I) vector + self.p_sigma = (1.0 - c_s) * &self.p_sigma + (c_s * (2.0 - c_s)).sqrt() * &y; + + // Update sigma + let ps_norm = norm(&self.p_sigma); + self.sigma *= ((c_s / D_SIGMA) * (ps_norm / chi_n - 1.0)).exp(); + self.sigma = self.sigma.clamp(MIN_SIGMA, MAX_SIGMA); + + // Update evolution path for covariance + let hsig = if ps_norm / ((1.0 - (1.0 - c_s).powi(2 * self.generation as i32)).sqrt()) + < (1.4 + 2.0 / (self.n_dims as f64 + 1.0)) * chi_n + { + 1.0 + } else { + 0.0 + }; + + self.p_c = (1.0 - C_C) * &self.p_c + hsig * (C_C * (2.0 - C_C)).sqrt() * &y; + + // Update covariance matrix + // Rank-1 update + let rank1 = outer(&self.p_c, &self.p_c); + + // Rank-mu update + let mut rank_mu = Array2::::zeros((self.n_dims, self.n_dims)); + for i in 0..mu { + let yi = (&sorted_population[i] - &self.mean) / self.sigma; + rank_mu = rank_mu + weights[i] * outer(&yi, &yi); + } + + // Combined update + self.c = (1.0 - C_1 - C_MU) * &self.c + C_1 * &rank1 + C_MU * &rank_mu; + + // Ensure symmetry and positive 
definiteness + self.repair_covariance(); + + // Update mean + self.mean = new_mean; + } + + /// Repair covariance matrix for numerical stability + fn repair_covariance(&mut self) { + // Ensure symmetry + for i in 0..self.n_dims { + for j in 0..i { + let avg = (self.c[[i, j]] + self.c[[j, i]]) / 2.0; + self.c[[i, j]] = avg; + self.c[[j, i]] = avg; + } + } + + // Ensure positive diagonal + for i in 0..self.n_dims { + self.c[[i, i]] = self.c[[i, i]].max(EIGENVALUE_FLOOR); + } + + // Check condition number (diagonal approximation) + let diag: Vec = (0..self.n_dims).map(|i| self.c[[i, i]]).collect(); + let max_d = diag.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + let min_d = diag.iter().cloned().fold(f64::INFINITY, f64::min); + + if max_d / min_d > CONDITION_THRESHOLD { + // Scale up small eigenvalues + let floor = max_d / CONDITION_THRESHOLD; + for i in 0..self.n_dims { + self.c[[i, i]] = self.c[[i, i]].max(floor); + } + } + } + + /// Should we restart? + pub fn should_restart(&self) -> bool { + self.stagnation >= MAX_STAGNATION || self.sigma < MIN_SIGMA + } + + /// Restart with new random mean + pub fn restart(&mut self, rng: &mut R) { + // New random mean + self.mean = self + .ranges + .iter() + .map(|&(lo, hi)| lo + rng.random::() * (hi - lo)) + .collect(); + + // Reset covariance + self.c = Array2::::eye(self.n_dims); + for i in 0..self.n_dims { + let (lo, hi) = self.ranges[i]; + let scale = (hi - lo) / 4.0; + self.c[[i, i]] = scale * scale; + } + + // Reset paths + self.p_sigma = Array1::zeros(self.n_dims); + self.p_c = Array1::zeros(self.n_dims); + + // Reset sigma + self.sigma = INITIAL_SIGMA; + + // Reset stagnation + self.stagnation = 0; + } + + /// Update mean based on high-weight support points + pub fn update_mean_from_points(&mut self, weighted_points: &[(Vec, f64)]) { + if weighted_points.is_empty() { + return; + } + + // Compute weighted centroid + let total_weight: f64 = weighted_points.iter().map(|(_, w)| w).sum(); + if total_weight <= 0.0 { + 
return; + } + + let mut centroid = vec![0.0; self.n_dims]; + for (point, weight) in weighted_points { + for (i, &val) in point.iter().enumerate() { + centroid[i] += val * weight / total_weight; + } + } + + // Move mean toward centroid (partial update to avoid instability) + let learning_rate = 0.3; + for i in 0..self.n_dims { + self.mean[i] = (1.0 - learning_rate) * self.mean[i] + learning_rate * centroid[i]; + + // Ensure within bounds + let (lo, hi) = self.ranges[i]; + self.mean[i] = self.mean[i].clamp(lo, hi); + } + + // Also update covariance to reflect the spread of high-weight points + if weighted_points.len() > 1 { + for i in 0..self.n_dims { + let mut variance = 0.0; + for (point, weight) in weighted_points { + let diff = point[i] - self.mean[i]; + variance += weight * diff * diff / total_weight; + } + // Blend with current covariance + self.c[[i, i]] = 0.7 * self.c[[i, i]] + 0.3 * variance.max(EIGENVALUE_FLOOR); + } + } + } +} + +// ============================================================================ +// HELPER FUNCTIONS +// ============================================================================ + +/// Compute recombination weights (log-linear) +fn compute_weights(mu: usize) -> Vec { + let mut weights: Vec = (0..mu) + .map(|i| ((mu as f64 + 0.5).ln() - ((i + 1) as f64).ln()).max(0.0)) + .collect(); + + let sum: f64 = weights.iter().sum(); + for w in weights.iter_mut() { + *w /= sum; + } + + weights +} + +/// Sample from standard normal using Box-Muller +fn sample_standard_normal(rng: &mut R) -> f64 { + let u1: f64 = rng.random(); + let u2: f64 = rng.random(); + (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos() +} + +/// Compute vector norm +fn norm(v: &Array1) -> f64 { + v.iter().map(|x| x * x).sum::().sqrt() +} + +/// Outer product +fn outer(a: &Array1, b: &Array1) -> Array2 { + let n = a.len(); + let mut result = Array2::::zeros((n, n)); + for i in 0..n { + for j in 0..n { + result[[i, j]] = a[i] * b[j]; + } + } + result +} + 
+#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cma_basic() { + let ranges = vec![(0.0, 1.0), (0.0, 1.0)]; + let state = CmaState::new(2, &ranges, 42); + + assert_eq!(state.n_dims, 2); + assert!(state.sigma > 0.0); + } + + #[test] + fn test_sampling() { + let ranges = vec![(0.0, 1.0), (0.0, 1.0)]; + let state = CmaState::new(2, &ranges, 42); + let mut rng = rand::rngs::StdRng::seed_from_u64(42); + + let pop = state.sample_population(10, &mut rng); + assert_eq!(pop.len(), 10); + + // Check bounds + for ind in &pop { + for (i, &x) in ind.iter().enumerate() { + let (lo, hi) = ranges[i]; + assert!(x >= lo && x <= hi); + } + } + } +} diff --git a/src/algorithms/nonparametric/npcma/constants.rs b/src/algorithms/nonparametric/npcma/constants.rs new file mode 100644 index 000000000..7ed483449 --- /dev/null +++ b/src/algorithms/nonparametric/npcma/constants.rs @@ -0,0 +1,82 @@ +//! NPCMA Constants +//! +//! Configuration constants for the CMA-ES algorithm. + +// ============================================================================ +// CONVERGENCE CONSTANTS (matching NPAG/NPSAH) +// ============================================================================ + +/// Grid spacing convergence threshold +pub const THETA_E: f64 = 1e-4; + +/// Objective function convergence threshold +pub const THETA_G: f64 = 1e-4; + +/// P(Y|L) convergence criterion +pub const THETA_F: f64 = 1e-2; + +/// Distance threshold for new support points +pub const THETA_D: f64 = 1e-4; + +// ============================================================================ +// CMA-ES PARAMETERS +// ============================================================================ + +/// Population size (lambda) - number of samples per generation +pub const POPULATION_SIZE: usize = 30; + +/// Number of parents (mu) - typically lambda/2 +pub const N_PARENTS: usize = 15; + +/// Initial step size (sigma) +pub const INITIAL_SIGMA: f64 = 0.3; + +/// Minimum step size before restart +pub const 
MIN_SIGMA: f64 = 1e-8; + +/// Maximum step size +pub const MAX_SIGMA: f64 = 2.0; + +/// Cumulation factor for step size control (c_sigma) +pub const C_SIGMA: f64 = 0.3; + +/// Damping factor for step size (d_sigma) +pub const D_SIGMA: f64 = 1.0; + +/// Cumulation factor for covariance matrix (c_c) +pub const C_C: f64 = 0.4; + +/// Learning rate for rank-1 update (c_1) +pub const C_1: f64 = 0.2; + +/// Learning rate for rank-mu update (c_mu) +pub const C_MU: f64 = 0.3; + +/// Maximum stagnation cycles before restart +pub const MAX_STAGNATION: usize = 15; + +/// Eigenvalue floor for numerical stability +pub const EIGENVALUE_FLOOR: f64 = 1e-10; + +/// Condition number threshold for restart +pub const CONDITION_THRESHOLD: f64 = 1e14; + +// ============================================================================ +// ALGORITHM PHASES +// ============================================================================ + +/// Number of warm-up cycles using NPAG-style grid expansion +pub const WARMUP_CYCLES: usize = 3; + +/// Fraction of max D-criterion to use as threshold for adding points +pub const D_THRESHOLD_FRACTION: f64 = 0.5; + +// ============================================================================ +// GLOBAL OPTIMALITY CHECK +// ============================================================================ + +/// Number of samples for global optimality check +pub const GLOBAL_CHECK_SAMPLES: usize = 200; + +/// D-criterion threshold for global optimality +pub const GLOBAL_D_THRESHOLD: f64 = 0.01; diff --git a/src/algorithms/nonparametric/npcma/mod.rs b/src/algorithms/nonparametric/npcma/mod.rs new file mode 100644 index 000000000..f1dd98833 --- /dev/null +++ b/src/algorithms/nonparametric/npcma/mod.rs @@ -0,0 +1,639 @@ +//! # NPCMA: Non-Parametric Covariance Matrix Adaptation Algorithm +//! +//! A CMA-ES (Covariance Matrix Adaptation Evolution Strategy) approach to +//! nonparametric population pharmacokinetics. +//! +//! ## Algorithm Overview +//! +//! 
CMA-ES is a state-of-the-art derivative-free optimization algorithm that +//! adapts a multivariate normal distribution to sample promising solutions. +//! It learns the covariance structure of the fitness landscape, making it +//! particularly effective for correlated parameters. +//! +//! ## Key Innovations for Pharmacometrics +//! +//! 1. **D-Criterion Fitness**: Each sample is evaluated using the D-optimality +//! criterion, directing the search toward information-maximizing regions +//! 2. **Covariance Adaptation**: Automatically learns parameter correlations +//! 3. **Step Size Control**: Adaptive sigma prevents premature convergence +//! 4. **Restart Strategy**: Escapes local optima through intelligent restarts +//! +//! ## Algorithm Structure +//! +//! - **Warm-up (cycles 1-3)**: NPAG-style grid expansion for broad coverage +//! - **CMA Phase**: Sample from adapted distribution, evaluate D-criterion, +//! update distribution parameters toward high-D regions +//! - **Estimation**: Standard IPM to compute weights +//! 
- **Condensation**: QR-based pruning of redundant points + +mod cma; +mod constants; + +use cma::CmaState; +pub use constants::*; + +use crate::algorithms::{ + NativeNonparametricConfig, NonparametricAlgorithmInput, Status, StopReason, +}; +use crate::estimation::nonparametric::adaptative_grid; +use crate::estimation::nonparametric::ipm::burke; +use crate::estimation::nonparametric::qr; +use crate::estimation::nonparametric::sample_space_for_parameters; +use crate::estimation::nonparametric::{ + calculate_psi, CycleLog, NPCycle, NonparametricWorkspace, Psi, Theta, Weights, +}; +use crate::prelude::algorithms::Algorithms; + +use anyhow::{bail, Result}; +use ndarray::parallel::prelude::{IntoParallelIterator, ParallelIterator}; +use ndarray::{Array, Array1, ArrayBase, Axis, Dim, OwnedRepr}; +use pharmsol::prelude::data::Data; +use pharmsol::prelude::simulator::Equation; +use pharmsol::{prelude::AssayErrorModel, AssayErrorModels, Subject}; +use rand::prelude::*; +use rand::SeedableRng; + +// ============================================================================ +// NPCMA STRUCT +// ============================================================================ + +pub struct NPCMA { + equation: E, + ranges: Vec<(f64, f64)>, + psi: Psi, + theta: Theta, + lambda: Weights, + w: Weights, + eps: f64, + last_objf: f64, + objf: f64, + f0: f64, + f1: f64, + cycle: usize, + gamma_delta: Vec, + error_models: AssayErrorModels, + status: Status, + cycle_log: CycleLog, + data: Data, + config: NativeNonparametricConfig, + + // CMA-ES specific + cma: CmaState, + rng: StdRng, + /// Cached pyl vector for D-criterion evaluation + pyl: Array1, + /// Phase: true = warm-up (grid expansion), false = CMA-driven + in_warmup: bool, +} + +// ============================================================================ +// ALGORITHMS TRAIT +// ============================================================================ + +impl Algorithms for NPCMA { + fn equation(&self) -> &E { + &self.equation 
+ } + + fn error_models(&self) -> &AssayErrorModels { + &self.error_models + } + + fn data(&self) -> &Data { + &self.data + } + + fn get_prior(&self) -> Theta { + sample_space_for_parameters(&self.config.parameter_space, &self.config.prior).unwrap() + } + + fn likelihood(&self) -> f64 { + self.objf + } + + fn increment_cycle(&mut self) -> usize { + self.cycle += 1; + + // Exit warm-up after WARMUP_CYCLES + if self.cycle > WARMUP_CYCLES && self.in_warmup { + self.in_warmup = false; + tracing::info!("NPCMA: Warm-up complete, entering CMA-ES driven expansion"); + } + + self.cycle + } + + fn cycle(&self) -> usize { + self.cycle + } + + fn set_theta(&mut self, theta: Theta) { + self.theta = theta; + } + + fn theta(&self) -> &Theta { + &self.theta + } + + fn psi(&self) -> &Psi { + &self.psi + } + + fn set_status(&mut self, status: Status) { + self.status = status; + } + + fn status(&self) -> &Status { + &self.status + } + + fn log_cycle_state(&mut self) { + let state = NPCycle::new( + self.cycle, + -2.0 * self.objf, + self.error_models.clone(), + self.theta.clone(), + self.theta.nspp(), + (self.last_objf - self.objf).abs(), + self.status.clone(), + ); + self.cycle_log.push(state); + self.last_objf = self.objf; + } + + fn evaluation(&mut self) -> Result { + tracing::info!("Objective function = {:.4}", -2.0 * self.objf); + tracing::debug!( + "Support points: {} | Phase: {} | Sigma: {:.4} | EPS: {:.4}", + self.theta.nspp(), + if self.in_warmup { "Warm-up" } else { "CMA-ES" }, + self.cma.sigma, + self.eps + ); + + self.error_models.iter().for_each(|(outeq, em)| { + if AssayErrorModel::None != *em { + tracing::debug!( + "Error model outeq {}: {:.4}", + outeq, + em.factor().unwrap_or_default() + ); + } + }); + + if self.last_objf > self.objf + 1e-4 { + tracing::warn!( + "Objective decreased: {:.4} -> {:.4}", + -2.0 * self.last_objf, + -2.0 * self.objf + ); + } + + // NPAG-style convergence with eps halving + let psi = self.psi.matrix(); + let w = &self.w; + + if 
(self.last_objf - self.objf).abs() <= THETA_G && self.eps > THETA_E { + self.eps /= 2.0; + tracing::debug!("Halving eps to {:.6}", self.eps); + + if self.eps <= THETA_E { + let pyl = psi * w.weights(); + self.f1 = pyl.iter().map(|x| x.ln()).sum(); + + if (self.f1 - self.f0).abs() <= THETA_F { + // Also check global optimality via CMA sampling + let global_check = self.global_optimality_check()?; + if global_check { + tracing::info!("NPCMA converged after {} cycles", self.cycle); + self.status = Status::Stop(StopReason::Converged); + self.log_cycle_state(); + return Ok(self.status.clone()); + } else { + tracing::debug!("P(Y|L) criterion met but global check failed, continuing"); + self.f0 = self.f1; + self.eps = 0.2; + } + } else { + self.f0 = self.f1; + self.eps = 0.2; + } + } + } + + // Max cycles check + if self.cycle >= self.config.max_cycles { + tracing::warn!("Maximum cycles reached"); + self.status = Status::Stop(StopReason::MaxCycles); + self.log_cycle_state(); + return Ok(self.status.clone()); + } + + // Stop file check + if std::path::Path::new("stop").exists() { + tracing::warn!("Stop file detected"); + self.status = Status::Stop(StopReason::Stopped); + self.log_cycle_state(); + return Ok(self.status.clone()); + } + + self.log_cycle_state(); + Ok(self.status.clone()) + } + + fn estimation(&mut self) -> Result<()> { + self.psi = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &self.error_models, + self.cycle == 1 && self.config.progress, + )?; + + if let Err(err) = self.validate_psi() { + bail!(err); + } + + let (lambda, _) = burke(&self.psi)?; + self.lambda = lambda; + + Ok(()) + } + + fn condensation(&mut self) -> Result<()> { + // Lambda threshold pruning (more aggressive: 1/10000) + let max_lambda = self + .lambda + .iter() + .fold(f64::NEG_INFINITY, |acc, x| x.max(acc)); + + let threshold = max_lambda / 10000.0; + let mut keep: Vec = self + .lambda + .iter() + .enumerate() + .filter(|(_, lam)| *lam > threshold) + .map(|(i, _)| i) + 
.collect(); + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!( + "Lambda pruning dropped {} SPP", + self.psi.matrix().ncols() - keep.len() + ); + } + + self.theta.filter_indices(&keep); + self.psi.filter_column_indices(&keep); + + // QR rank-revealing factorization + let (r, perm) = qr::qrd(&self.psi)?; + let keep_n = self.psi.matrix().ncols().min(self.psi.matrix().nrows()); + + keep.clear(); + for i in 0..keep_n { + let test = r.col(i).norm_l2(); + let r_diag = r.get(i, i); + if (r_diag / test).abs() >= 1e-8 { + keep.push(*perm.get(i).unwrap()); + } + } + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!("QR dropped {} SPP", self.psi.matrix().ncols() - keep.len()); + } + + self.theta.filter_indices(&keep); + self.psi.filter_column_indices(&keep); + + self.validate_psi()?; + + let (lambda, objf) = burke(&self.psi)?; + self.lambda = lambda; + self.objf = objf; + self.w = self.lambda.clone(); + + // Update pyl for D-criterion calculations + let psi = self.psi.to_ndarray(); + let w_arr: Array1 = self.w.iter().collect(); + self.pyl = psi.dot(&w_arr); + + // Update CMA distribution based on high-weight support points + self.update_cma_from_weights()?; + + Ok(()) + } + + fn optimizations(&mut self) -> Result<()> { + // Standard error model optimization + self.optimize_error_models()?; + + Ok(()) + } + + fn expansion(&mut self) -> Result<()> { + if self.in_warmup { + // Warm-up: NPAG-style grid expansion for broad coverage + self.warmup_expansion()?; + } else { + // CMA-ES driven expansion + self.cma_expansion()?; + } + + Ok(()) + } + + fn into_workspace(&self) -> Result> { + NonparametricWorkspace::new( + self.equation.clone(), + self.data.clone(), + self.theta.clone(), + self.psi.clone(), + self.w.clone(), + -2.0 * self.objf, + self.cycle, + self.status.clone(), + self.config.run_configuration.clone(), + self.cycle_log.clone(), + ) + } +} + +// ============================================================================ +// NPCMA SPECIFIC 
METHODS +// ============================================================================ + +impl NPCMA { + pub(crate) fn from_input(input: NonparametricAlgorithmInput) -> Result> { + let config = input.native_config()?; + let seed = config.prior.seed().unwrap_or(42) as u64; + let ranges = config.ranges.clone(); + let n_dims = ranges.len(); + let n_subjects = input.data.len(); + let error_models = input.error_models().clone(); + + Ok(Box::new(Self { + equation: input.equation, + ranges: ranges.clone(), + psi: Psi::new(), + theta: Theta::new(), + lambda: Weights::default(), + w: Weights::default(), + eps: 0.2, + last_objf: -1e30, + objf: f64::NEG_INFINITY, + f0: -1e30, + f1: f64::default(), + cycle: 0, + gamma_delta: vec![0.1; error_models.len()], + error_models, + status: Status::Continue, + cycle_log: CycleLog::new(), + data: input.data, + config, + cma: CmaState::new(n_dims, &ranges, seed), + rng: StdRng::seed_from_u64(seed), + pyl: Array1::ones(n_subjects), + in_warmup: true, + })) + } + + /// Warm-up expansion using adaptive grid (like NPAG) + fn warmup_expansion(&mut self) -> Result<()> { + tracing::debug!("NPCMA warm-up: adaptive grid expansion"); + adaptative_grid(&mut self.theta, self.eps, &self.ranges, THETA_D)?; + Ok(()) + } + + /// CMA-ES driven expansion: sample from adapted distribution + fn cma_expansion(&mut self) -> Result<()> { + let initial_points = self.theta.nspp(); + + // Check for restart conditions + if self.cma.should_restart() { + tracing::info!("NPCMA: Restarting CMA-ES distribution"); + self.cma.restart(&mut self.rng); + } + + // 1. Sample population from CMA distribution + let population = self.cma.sample_population(POPULATION_SIZE, &mut self.rng); + + // 2. Evaluate D-criterion for all samples (in parallel) + let samples: Vec> = population.iter().map(|a| a.to_vec()).collect(); + let fitness: Vec = samples + .clone() + .into_par_iter() + .map(|pos| self.compute_d_criterion(&pos).unwrap_or(f64::NEG_INFINITY)) + .collect(); + + // 3. 
Sort by fitness (descending - higher D is better) + let mut indexed: Vec<(usize, f64)> = fitness.iter().copied().enumerate().collect(); + indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + + let sorted_population: Vec> = indexed + .iter() + .map(|(i, _)| population[*i].clone()) + .collect(); + let sorted_fitness: Vec = indexed.iter().map(|(_, f)| *f).collect(); + + // 4. Update CMA-ES distribution + self.cma.update(&sorted_population, &sorted_fitness); + + // 5. Add high-fitness samples as support point candidates + let max_fitness = sorted_fitness.first().copied().unwrap_or(f64::NEG_INFINITY); + let threshold = max_fitness * D_THRESHOLD_FRACTION; + + let mut added = 0; + for (i, sample) in samples.iter().enumerate() { + if fitness[i] > threshold.max(0.0) { + if self.theta.check_point(sample, THETA_D) { + self.theta.add_point(sample)?; + added += 1; + } + } + } + + // 6. Sparse grid expansion to fill gaps every few cycles + if self.cycle % 3 == 0 { + let sparse_eps = self.eps * 0.5; + adaptative_grid(&mut self.theta, sparse_eps, &self.ranges, THETA_D * 2.0)?; + } + + tracing::debug!( + "CMA expansion: {} -> {} (added {}, sigma={:.4})", + initial_points, + self.theta.nspp(), + added, + self.cma.sigma + ); + + Ok(()) + } + + /// Update CMA distribution to center on high-weight support points + fn update_cma_from_weights(&mut self) -> Result<()> { + if self.w.len() == 0 || self.theta.nspp() == 0 { + return Ok(()); + } + + // Find high-weight points + let max_weight = self.w.iter().fold(f64::NEG_INFINITY, |a, b| a.max(b)); + let threshold = max_weight * 0.1; + + let n_points = self.theta.nspp().min(self.w.len()); + let mut high_weight_points: Vec<(Vec, f64)> = Vec::new(); + + for (i, row) in self.theta.matrix().row_iter().enumerate().take(n_points) { + if self.w[i] >= threshold { + let point: Vec = row.iter().copied().collect(); + high_weight_points.push((point, self.w[i])); + } + } + + if !high_weight_points.is_empty() { + // Update 
CMA mean toward weighted centroid of high-weight points + self.cma.update_mean_from_points(&high_weight_points); + } + + Ok(()) + } + + /// Compute D-criterion for a single point + fn compute_d_criterion(&self, point: &[f64]) -> Result { + let theta_single = Array1::from(point.to_vec()).insert_axis(Axis(0)); + + let psi_single = pharmsol::prelude::simulator::log_likelihood_matrix( + &self.equation, + &self.data, + &theta_single, + &self.error_models, + false, + )? + .mapv(f64::exp); + + let nsub = psi_single.nrows() as f64; + let mut d_sum = -nsub; + + for (p_i, pyl_i) in psi_single.iter().zip(self.pyl.iter()) { + if *pyl_i > 1e-300 { + d_sum += p_i / pyl_i; + } + } + + Ok(d_sum) + } + + /// Global optimality check using CMA sampling + fn global_optimality_check(&mut self) -> Result { + // Sample from CMA distribution and check if any have high D + let samples = self + .cma + .sample_population(GLOBAL_CHECK_SAMPLES, &mut self.rng); + let mut max_d = f64::NEG_INFINITY; + + for sample in &samples { + let d = self.compute_d_criterion(&sample.to_vec())?; + max_d = max_d.max(d); + } + + // Also sample uniformly random + for _ in 0..GLOBAL_CHECK_SAMPLES { + let point: Vec = self + .ranges + .iter() + .map(|(lo, hi)| self.rng.random_range(*lo..*hi)) + .collect(); + let d = self.compute_d_criterion(&point)?; + max_d = max_d.max(d); + } + + let passed = max_d < GLOBAL_D_THRESHOLD; + tracing::debug!( + "Global optimality check: max_D = {:.4}, threshold = {:.4}, passed = {}", + max_d, + GLOBAL_D_THRESHOLD, + passed + ); + + Ok(passed) + } + + /// Optimize error models (standard approach) + fn optimize_error_models(&mut self) -> Result<()> { + for (outeq, em) in self.error_models.clone().iter_mut() { + if *em == AssayErrorModel::None || em.is_factor_fixed().unwrap_or(true) { + continue; + } + + let gamma_up = em.factor()? * (1.0 + self.gamma_delta[outeq]); + let gamma_down = em.factor()? 
/ (1.0 + self.gamma_delta[outeq]); + + let mut em_up = self.error_models.clone(); + em_up.set_factor(outeq, gamma_up)?; + + let mut em_down = self.error_models.clone(); + em_down.set_factor(outeq, gamma_down)?; + + let psi_up = calculate_psi(&self.equation, &self.data, &self.theta, &em_up, false)?; + let psi_down = calculate_psi(&self.equation, &self.data, &self.theta, &em_down, false)?; + + let (lambda_up, objf_up) = burke(&psi_up)?; + let (lambda_down, objf_down) = burke(&psi_down)?; + + if objf_up > self.objf { + self.error_models.set_factor(outeq, gamma_up)?; + self.objf = objf_up; + self.gamma_delta[outeq] *= 4.0; + self.lambda = lambda_up; + self.psi = psi_up; + } + if objf_down > self.objf { + self.error_models.set_factor(outeq, gamma_down)?; + self.objf = objf_down; + self.gamma_delta[outeq] *= 4.0; + self.lambda = lambda_down; + self.psi = psi_down; + } + + self.gamma_delta[outeq] *= 0.5; + if self.gamma_delta[outeq] <= 0.01 { + self.gamma_delta[outeq] = 0.1; + } + } + + // Update pyl after error model changes + if self.w.len() > 0 { + let psi = self.psi.to_ndarray(); + let w_arr: Array1 = self.w.iter().collect(); + self.pyl = psi.dot(&w_arr); + } + + Ok(()) + } + + /// Validate PSI matrix + #[allow(dead_code)] + fn validate_psi(&self) -> Result<()> { + let psi = self.psi.to_ndarray(); + let (_, col) = psi.dim(); + let ecol: ArrayBase, Dim<[usize; 1]>> = Array::ones(col); + let plam = psi.dot(&ecol); + let w = 1.0 / &plam; + + let bad_indices: Vec = w + .iter() + .enumerate() + .filter(|(_, x)| x.is_nan() || x.is_infinite()) + .map(|(i, _)| i) + .collect(); + + if !bad_indices.is_empty() { + let subjects: Vec<&Subject> = self.data.subjects(); + let bad_subjects: Vec<&String> = + bad_indices.iter().map(|&i| subjects[i].id()).collect(); + bail!("Zero probability for subjects: {:?}", bad_subjects); + } + + Ok(()) + } +} diff --git a/src/algorithms/nonparametric/npopt/constants.rs b/src/algorithms/nonparametric/npopt/constants.rs new file mode 100644 index 
000000000..f9fcec863 --- /dev/null +++ b/src/algorithms/nonparametric/npopt/constants.rs @@ -0,0 +1,123 @@ +//! Constants for NPOPT algorithm + +// ============================================================================ +// CONVERGENCE THRESHOLDS +// ============================================================================ + +/// Objective function convergence threshold +pub const THETA_G: f64 = 1e-4; +/// P(Y|L) convergence criterion +pub const THETA_F: f64 = 1e-2; +/// Minimum distance between support points +pub const THETA_D: f64 = 1e-4; +/// Weight stability threshold +pub const THETA_W: f64 = 1e-3; +/// Global optimality D-criterion threshold +pub const GLOBAL_D_THRESHOLD: f64 = 0.008; + +// ============================================================================ +// GRID EXPANSION +// ============================================================================ + +/// Initial grid spacing +pub const INITIAL_EPS: f64 = 0.2; +/// Minimum grid spacing +pub const MIN_EPS: f64 = 1e-4; + +// ============================================================================ +// PHASE CONTROL +// ============================================================================ + +/// Number of exploration cycles +pub const EXPLORATION_CYCLES: usize = 3; +/// Number of Sobol samples for initial coverage +pub const SOBOL_INIT_SAMPLES: usize = 50; +/// Cycles between global checks +pub const GLOBAL_CHECK_INTERVAL: usize = 3; +/// Number of Sobol samples for global check +pub const SOBOL_GLOBAL_SAMPLES: usize = 256; +/// Consecutive passes needed for convergence +pub const CONVERGENCE_PASSES: usize = 2; +/// Convergence window for objf stability +pub const CONVERGENCE_WINDOW: usize = 3; + +// ============================================================================ +// ADAPTIVE SIMULATED ANNEALING +// ============================================================================ + +/// Initial SA temperature +pub const INITIAL_TEMPERATURE: f64 = 2.0; +/// Base cooling 
rate +pub const BASE_COOLING_RATE: f64 = 0.90; +/// Minimum temperature +pub const MIN_TEMPERATURE: f64 = 0.01; +/// Target acceptance ratio +pub const TARGET_ACCEPTANCE: f64 = 0.23; +/// Trigger reheat when acceptance below this +pub const REHEAT_TRIGGER: f64 = 0.08; +/// Reheat factor +pub const REHEAT_FACTOR: f64 = 1.5; +/// Number of SA points to inject per cycle +pub const SA_INJECT_COUNT: usize = 30; +/// History window for acceptance ratio +pub const SA_HISTORY_WINDOW: usize = 5; + +// ============================================================================ +// FISHER-GUIDED EXPANSION +// ============================================================================ + +/// Fraction of candidates from Fisher directions +pub const FISHER_RATIO: f64 = 0.70; +/// Fraction of candidates from D-optimal gradient +pub const DOPT_RATIO: f64 = 0.30; +/// Number of Fisher-guided candidates +pub const FISHER_CANDIDATES: usize = 20; + +// ============================================================================ +// D-OPTIMAL REFINEMENT +// ============================================================================ + +/// High weight threshold (fraction of max) +pub const HIGH_WEIGHT_THRESHOLD: f64 = 0.10; +/// Medium weight threshold +pub const MED_WEIGHT_THRESHOLD: f64 = 0.01; +/// Low weight threshold (skip below this) +pub const LOW_WEIGHT_THRESHOLD: f64 = 0.001; +/// Max iterations for high-weight points +pub const DOPT_HIGH_ITERS: u64 = 80; +/// Max iterations for medium-weight points +pub const DOPT_MED_ITERS: u64 = 30; +/// Max iterations for low-weight points +pub const DOPT_LOW_ITERS: u64 = 10; + +// ============================================================================ +// SUBJECT RESIDUAL INJECTION +// ============================================================================ + +/// Number of worst-fit subjects to process +pub const RESIDUAL_SUBJECTS: usize = 3; +/// Max iterations for subject MAP +pub const SUBJECT_MAP_ITERS: u64 = 30; + +// 
============================================================================ +// ELITE PRESERVATION +// ============================================================================ + +/// Number of elite points to preserve +pub const ELITE_COUNT: usize = 5; +/// Max age of elite point (cycles) +pub const ELITE_MAX_AGE: usize = 15; + +// ============================================================================ +// CONDENSATION +// ============================================================================ + +/// Lambda filter divisor (keep if > max_lambda / divisor) +pub const LAMBDA_FILTER_DIVISOR: f64 = 10000.0; + +// ============================================================================ +// BOUNDARY MARGIN +// ============================================================================ + +/// Margin from boundaries (fraction of range) +pub const BOUNDARY_MARGIN: f64 = 0.005; diff --git a/src/algorithms/nonparametric/npopt/convergence.rs b/src/algorithms/nonparametric/npopt/convergence.rs new file mode 100644 index 000000000..4d9f421c4 --- /dev/null +++ b/src/algorithms/nonparametric/npopt/convergence.rs @@ -0,0 +1,98 @@ +//! 
Convergence checking for NPOPT + +use super::constants::*; +use super::{Phase, NPOPT}; + +use anyhow::Result; +use pharmsol::prelude::simulator::Equation; + +impl NPOPT { + /// Multi-criterion convergence check + pub(crate) fn check_convergence(&mut self) -> Result { + // Need minimum history + if self.objf_history.len() < CONVERGENCE_WINDOW { + return Ok(false); + } + + // Criterion 1: Objective function stability + let recent: Vec = self + .objf_history + .iter() + .rev() + .take(CONVERGENCE_WINDOW) + .cloned() + .collect(); + + let objf_stable = recent.windows(2).all(|w| (w[0] - w[1]).abs() < THETA_G); + + if !objf_stable { + return Ok(false); + } + + // Criterion 2: Weight stability + if !self.weights_stable() { + return Ok(false); + } + + // Criterion 3: Global optimality (in Polishing phase) + if self.phase == Phase::Polishing { + // Run global check if we haven't recently + if self.global_check_passes < CONVERGENCE_PASSES { + self.sobol_global_check()?; + } + + if self.global_check_passes >= CONVERGENCE_PASSES { + // All criteria met + tracing::info!( + "Convergence: objf stable, weights stable, {} global checks passed", + self.global_check_passes + ); + return Ok(true); + } + } + + // In Refinement phase, require global checks + phase transition + if self.phase == Phase::Refinement && self.global_check_passes >= CONVERGENCE_PASSES { + // Transition to polishing + self.phase = Phase::Polishing; + tracing::info!("NPOPT: Refinement → Polishing (global check passed)"); + } + + Ok(false) + } + + /// Check if weight distribution is stable + pub(crate) fn weights_stable(&self) -> bool { + if self.w.len() != self.w_prev.len() || self.w.len() == 0 { + return false; + } + + let max_change = self + .w + .iter() + .zip(self.w_prev.iter()) + .map(|(w_new, w_old)| { + if w_new > 1e-10 { + ((w_new - w_old) / w_new).abs() + } else { + 0.0 + } + }) + .fold(0.0_f64, |a, b| a.max(b)); + + max_change < THETA_W + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + 
fn test_convergence_constants() { + assert!(CONVERGENCE_WINDOW > 1); + assert!(CONVERGENCE_PASSES > 0); + assert!(THETA_G > 0.0); + assert!(THETA_W > 0.0 && THETA_W < 1.0); + } +} diff --git a/src/algorithms/nonparametric/npopt/expansion.rs b/src/algorithms/nonparametric/npopt/expansion.rs new file mode 100644 index 000000000..2ff13364e --- /dev/null +++ b/src/algorithms/nonparametric/npopt/expansion.rs @@ -0,0 +1,650 @@ +//! Expansion strategies for NPOPT + +use super::constants::*; +use super::{ElitePoint, NPOPT}; +use crate::estimation::nonparametric::adaptative_grid; + +use anyhow::Result; +use ndarray::parallel::prelude::{IntoParallelRefMutIterator, ParallelIterator}; +use ndarray::Array1; +use pharmsol::prelude::simulator::Equation; +use rand::prelude::*; +use sobol_burley::sample; + +impl NPOPT { + // ======================================================================== + // PHASE-SPECIFIC EXPANSION + // ======================================================================== + + /// Exploration phase: Sobol initialization + grid expansion + pub(crate) fn exploration_expansion(&mut self) -> Result<()> { + tracing::debug!("Exploration expansion: Sobol + adaptive grid"); + + // Stratified Sobol for initial coverage + self.sobol_initialization(SOBOL_INIT_SAMPLES)?; + + // Adaptive grid expansion + adaptative_grid(&mut self.theta, self.eps, &self.ranges, THETA_D)?; + + Ok(()) + } + + /// Refinement phase: D-optimal + SA + Fisher + Subject residual + Elite + pub(crate) fn refinement_expansion(&mut self) -> Result<()> { + let initial = self.theta.nspp(); + + // 1. D-optimal refinement (parallel, hierarchical) + self.d_optimal_refinement()?; + let after_dopt = self.theta.nspp(); + + // 2. Adaptive SA injection + if self.temperature > MIN_TEMPERATURE { + self.adaptive_sa_injection()?; + } + let after_sa = self.theta.nspp(); + + // 3. Fisher-guided expansion + self.fisher_expansion()?; + let after_fisher = self.theta.nspp(); + + // 4. 
Subject residual injection + self.inject_residual_subjects()?; + let after_subj = self.theta.nspp(); + + // 5. Re-inject elite points + self.inject_elite_points()?; + let after_elite = self.theta.nspp(); + + // 6. Periodic global check + if self.cycle % GLOBAL_CHECK_INTERVAL == 0 { + self.sobol_global_check()?; + } + + tracing::debug!( + "Refinement: {} → {} (D-opt) → {} (SA) → {} (Fisher) → {} (subj) → {} (elite)", + initial, + after_dopt, + after_sa, + after_fisher, + after_subj, + after_elite + ); + + Ok(()) + } + + /// Polishing phase: full D-optimal refinement only + pub(crate) fn polishing_expansion(&mut self) -> Result<()> { + tracing::debug!("Polishing expansion: full D-optimal refinement"); + self.full_d_optimal_refinement()?; + Ok(()) + } + + // ======================================================================== + // SOBOL INITIALIZATION + // ======================================================================== + + /// Initialize with Sobol low-discrepancy sequence + pub(crate) fn sobol_initialization(&mut self, n_samples: usize) -> Result<()> { + let n_dims = self.ranges.len(); + let mut added = 0; + + for i in 0..n_samples { + let idx = self.sobol_index + i as u32; + let mut point = Vec::with_capacity(n_dims); + + for dim in 0..n_dims { + let sobol_val = sample(idx, dim as u32, 0); + let (lo, hi) = self.ranges[dim]; + let margin = (hi - lo) * BOUNDARY_MARGIN; + point.push(lo + margin + sobol_val as f64 * (hi - lo - 2.0 * margin)); + } + + if self.theta.check_point(&point, THETA_D) { + self.theta.add_point(&point)?; + added += 1; + } + } + + self.sobol_index += n_samples as u32; + tracing::debug!( + "Sobol initialization: added {} of {} points", + added, + n_samples + ); + + Ok(()) + } + + // ======================================================================== + // D-OPTIMAL REFINEMENT + // ======================================================================== + + /// D-optimal refinement with hierarchical iteration allocation + pub(crate) 
fn d_optimal_refinement(&mut self) -> Result<()> { + if self.theta.nspp() == 0 || self.w.len() == 0 { + return Ok(()); + } + + let pyl = self.compute_pyl(); + let error_models = self.error_models.clone(); + let max_weight = self.w.iter().fold(f64::NEG_INFINITY, |a, b| a.max(b)); + + let n_points = self.theta.nspp().min(self.w.len()); + let min_threshold = max_weight * LOW_WEIGHT_THRESHOLD; + + // Collect points with meaningful weight + let mut candidate_points: Vec<(Array1, f64)> = self + .theta + .matrix() + .row_iter() + .take(n_points) + .enumerate() + .filter(|(i, _)| self.w[*i] >= min_threshold) + .map(|(i, spp)| { + let point: Vec = spp.iter().cloned().collect(); + (Array1::from(point), self.w[i] / max_weight) + }) + .collect(); + + let ranges = self.ranges.clone(); + + // Parallel optimization + candidate_points + .par_iter_mut() + .for_each(|(spp, importance)| { + let max_iters = if *importance > HIGH_WEIGHT_THRESHOLD { + DOPT_HIGH_ITERS + } else if *importance > MED_WEIGHT_THRESHOLD { + DOPT_MED_ITERS + } else { + DOPT_LOW_ITERS + }; + + let optimizer = super::optimizers::DOptimalOptimizer { + equation: &self.equation, + data: &self.data, + error_models: &error_models, + pyl: &pyl, + }; + + if let Ok(refined) = optimizer.optimize(spp.to_vec(), max_iters) { + let clamped: Array1 = refined + .iter() + .zip(ranges.iter()) + .map(|(&val, &(lo, hi))| { + let margin = (hi - lo) * BOUNDARY_MARGIN; + val.clamp(lo + margin, hi - margin) + }) + .collect(); + *spp = clamped; + } + }); + + // Add refined points + for (cp, _) in candidate_points { + self.theta.suggest_point(cp.to_vec().as_slice(), THETA_D)?; + } + + Ok(()) + } + + /// Full D-optimal refinement for polishing phase + pub(crate) fn full_d_optimal_refinement(&mut self) -> Result<()> { + if self.theta.nspp() == 0 || self.w.len() == 0 { + return Ok(()); + } + + let pyl = self.compute_pyl(); + let error_models = self.error_models.clone(); + let n_points = self.theta.nspp().min(self.w.len()); + + let mut 
candidate_points: Vec> = self + .theta + .matrix() + .row_iter() + .take(n_points) + .map(|spp| Array1::from(spp.iter().cloned().collect::>())) + .collect(); + + let ranges = self.ranges.clone(); + + candidate_points.par_iter_mut().for_each(|spp| { + let optimizer = super::optimizers::DOptimalOptimizer { + equation: &self.equation, + data: &self.data, + error_models: &error_models, + pyl: &pyl, + }; + + if let Ok(refined) = optimizer.optimize(spp.to_vec(), DOPT_HIGH_ITERS) { + let clamped: Array1 = refined + .iter() + .zip(ranges.iter()) + .map(|(&val, &(lo, hi))| { + let margin = (hi - lo) * BOUNDARY_MARGIN; + val.clamp(lo + margin, hi - margin) + }) + .collect(); + *spp = clamped; + } + }); + + for cp in candidate_points { + self.theta.suggest_point(cp.to_vec().as_slice(), THETA_D)?; + } + + Ok(()) + } + + // ======================================================================== + // ADAPTIVE SA INJECTION + // ======================================================================== + + /// Adaptive simulated annealing injection with reheat mechanism + pub(crate) fn adaptive_sa_injection(&mut self) -> Result<()> { + let pyl = self.compute_pyl(); + + // Temperature-scaled injection count + let n_inject = ((SA_INJECT_COUNT as f64) * (self.temperature / INITIAL_TEMPERATURE).sqrt()) + .ceil() as usize; + let n_inject = n_inject.max(5); + + let mut accepted = 0; + let mut proposed = 0; + + for _ in 0..n_inject * 15 { + proposed += 1; + + // Generate random point with boundary margin + let point: Vec = self + .ranges + .iter() + .map(|(lo, hi)| { + let margin = (hi - lo) * BOUNDARY_MARGIN; + self.rng.random_range((lo + margin)..(hi - margin)) + }) + .collect(); + + // Compute D-criterion + let d_value = match self.compute_d(&point, &pyl) { + Ok(d) => d, + Err(_) => continue, + }; + + // Metropolis acceptance + let accept = if d_value > 0.0 { + true + } else { + let p_accept = (d_value / self.temperature).exp(); + self.rng.random::() < p_accept + }; + + if accept { + 
if self.theta.check_point(&point, THETA_D) { + self.theta.add_point(&point)?; + accepted += 1; + } + } + + if accepted >= n_inject { + break; + } + } + + // Update SA tracking + self.sa_accepted += accepted; + self.sa_proposed += proposed; + + tracing::debug!( + "SA injection: {}/{} accepted (T={:.4})", + accepted, + proposed, + self.temperature + ); + + Ok(()) + } + + /// Adapt temperature based on acceptance history + pub(crate) fn adapt_temperature(&mut self) { + // Record this cycle's acceptance ratio + if self.sa_proposed > 0 { + let ratio = self.sa_accepted as f64 / self.sa_proposed as f64; + self.sa_acceptance_history.push_back(ratio); + if self.sa_acceptance_history.len() > SA_HISTORY_WINDOW { + self.sa_acceptance_history.pop_front(); + } + } + + // Compute average acceptance + if !self.sa_acceptance_history.is_empty() { + let avg_acceptance: f64 = self.sa_acceptance_history.iter().sum::() + / self.sa_acceptance_history.len() as f64; + + // Adaptive cooling rate and reheat + if avg_acceptance < REHEAT_TRIGGER { + self.temperature *= REHEAT_FACTOR; + self.cooling_rate = 0.95; // Slow down + tracing::debug!("Reheating to T = {:.4}", self.temperature); + } else if avg_acceptance > TARGET_ACCEPTANCE * 1.5 { + self.cooling_rate = 0.85; // Speed up + } else if avg_acceptance < TARGET_ACCEPTANCE * 0.5 { + self.cooling_rate = 0.95; // Slow down + } else { + self.cooling_rate = BASE_COOLING_RATE; + } + + tracing::debug!( + "SA acceptance: {:.1}% | Cooling rate: {:.3}", + avg_acceptance * 100.0, + self.cooling_rate + ); + } + + // Apply cooling + self.temperature *= self.cooling_rate; + if self.temperature < MIN_TEMPERATURE { + self.temperature = MIN_TEMPERATURE; + } + + // Reset counters + self.sa_accepted = 0; + self.sa_proposed = 0; + } + + // ======================================================================== + // FISHER-GUIDED EXPANSION + // ======================================================================== + + /// Update Fisher Information diagonal 
estimate + pub(crate) fn update_fisher_information(&mut self) { + let n_params = self.ranges.len(); + let n_spp = self.theta.nspp(); + + if n_spp < 2 { + self.fisher_diagonal = vec![1.0; n_params]; + return; + } + + let mut means = vec![0.0; n_params]; + let mut variances = vec![0.0; n_params]; + + // Weighted means + for (i, spp) in self.theta.matrix().row_iter().enumerate() { + let weight = if i < self.w.len() { self.w[i] } else { 0.0 }; + for (j, val) in spp.iter().enumerate() { + means[j] += weight * val; + } + } + + // Weighted variances + for (i, spp) in self.theta.matrix().row_iter().enumerate() { + let weight = if i < self.w.len() { self.w[i] } else { 0.0 }; + for (j, val) in spp.iter().enumerate() { + variances[j] += weight * (val - means[j]).powi(2); + } + } + + // Fisher ∝ 1/variance, but we want to explore high-variance directions + for (j, var) in variances.iter().enumerate() { + let range_scale = (self.ranges[j].1 - self.ranges[j].0).powi(2); + self.fisher_diagonal[j] = var.max(1e-10) / range_scale; + } + } + + /// Fisher-guided expansion in high-variance directions + pub(crate) fn fisher_expansion(&mut self) -> Result<()> { + if self.theta.nspp() == 0 { + return Ok(()); + } + + let pyl = self.compute_pyl(); + + // Sort dimensions by variance (descending) + let mut dim_indices: Vec<(usize, f64)> = self + .fisher_diagonal + .iter() + .enumerate() + .map(|(i, &fi)| (i, fi)) + .collect(); + dim_indices.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + // Top half of dimensions + let top_dims: Vec = dim_indices + .iter() + .take((self.ranges.len() + 1) / 2) + .map(|(i, _)| *i) + .collect(); + + let mut candidates = Vec::new(); + + for spp in self.theta.matrix().row_iter() { + let base: Vec = spp.iter().cloned().collect(); + + for &dim in &top_dims { + if candidates.len() >= FISHER_CANDIDATES { + break; + } + + let variance = self.fisher_diagonal[dim]; + let range = self.ranges[dim].1 - self.ranges[dim].0; + let step = (variance.sqrt() * range).max(range 
* 0.05).min(range * 0.3); + + // Positive direction + let mut plus = base.clone(); + plus[dim] = (plus[dim] + step).min(self.ranges[dim].1 - range * BOUNDARY_MARGIN); + candidates.push(plus); + + // Negative direction + let mut minus = base.clone(); + minus[dim] = (minus[dim] - step).max(self.ranges[dim].0 + range * BOUNDARY_MARGIN); + candidates.push(minus); + } + } + + // Evaluate and add good candidates + let mut added = 0; + for candidate in candidates { + if let Ok(d) = self.compute_d(&candidate, &pyl) { + if d > 0.0 && self.theta.check_point(&candidate, THETA_D) { + self.theta.add_point(&candidate)?; + added += 1; + } + } + } + + tracing::debug!("Fisher expansion: added {} points", added); + Ok(()) + } + + // ======================================================================== + // SUBJECT RESIDUAL INJECTION + // ======================================================================== + + /// Inject points for worst-fit subjects + pub(crate) fn inject_residual_subjects(&mut self) -> Result<()> { + if self.theta.nspp() == 0 || self.w.len() == 0 { + return Ok(()); + } + + let pyl = self.compute_pyl(); + let n_subjects = pyl.len(); + + // Find worst-fit subjects (lowest P(y|G)) + let mut indexed_pyl: Vec<(usize, f64)> = pyl.iter().cloned().enumerate().collect(); + indexed_pyl.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + + let n_residual = RESIDUAL_SUBJECTS.min(n_subjects); + let subjects = self.data.subjects(); + let error_models = self.error_models.clone(); + + let mut added = 0; + + for (subj_idx, _) in indexed_pyl.iter().take(n_residual) { + let subject = &subjects[*subj_idx]; + + // Start from weighted centroid + let start = self.compute_weighted_centroid(); + + // Quick subject MAP optimization + let optimizer = super::optimizers::SubjectMapOptimizer { + equation: &self.equation, + subject, + error_models: &error_models, + ranges: &self.ranges, + }; + + if let Ok(map_point) = optimizer.optimize(start, SUBJECT_MAP_ITERS) { + if let Ok(d) = 
self.compute_d(&map_point, &pyl) { + if d > 0.0 && self.theta.check_point(&map_point, THETA_D) { + self.theta.add_point(&map_point)?; + added += 1; + } + } + } + } + + tracing::debug!("Subject residual: added {} points", added); + Ok(()) + } + + // ======================================================================== + // ELITE PRESERVATION + // ======================================================================== + + /// Update elite points after condensation + pub(crate) fn update_elite_points(&mut self) -> Result<()> { + if self.w.len() == 0 { + return Ok(()); + } + + let pyl = self.compute_pyl(); + + // Age existing elite points + for elite in &mut self.elite_points { + elite.cycle_added += 1; + } + + // Remove old elite points + self.elite_points + .retain(|e| self.cycle - e.cycle_added < ELITE_MAX_AGE); + + // Find top points by weight + let n_spp = self.theta.nspp().min(self.w.len()); + let mut indexed_weights: Vec<(usize, f64)> = + self.w.iter().enumerate().take(n_spp).collect(); + indexed_weights.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + for (idx, _) in indexed_weights.iter().take(ELITE_COUNT) { + if *idx >= self.theta.nspp() { + continue; + } + + let params: Vec = self.theta.matrix().row(*idx).iter().cloned().collect(); + let d_value = self.compute_d(¶ms, &pyl).unwrap_or(0.0); + + // Check if already elite + let already_elite = self.elite_points.iter().any(|e| { + e.params + .iter() + .zip(¶ms) + .all(|(a, b)| (a - b).abs() < THETA_D * 10.0) + }); + + if !already_elite && self.elite_points.len() < ELITE_COUNT * 2 { + self.elite_points.push(ElitePoint { + params, + d_value, + cycle_added: self.cycle, + }); + } + } + + // Keep only top elite points + self.elite_points + .sort_by(|a, b| b.d_value.partial_cmp(&a.d_value).unwrap()); + self.elite_points.truncate(ELITE_COUNT); + + Ok(()) + } + + /// Re-inject elite points into theta + pub(crate) fn inject_elite_points(&mut self) -> Result<()> { + let mut injected = 0; + for elite in 
&self.elite_points { + if self.theta.check_point(&elite.params, THETA_D) { + self.theta.add_point(&elite.params)?; + injected += 1; + } + } + + if injected > 0 { + tracing::debug!("Injected {} elite points", injected); + } + + Ok(()) + } + + // ======================================================================== + // GLOBAL OPTIMALITY CHECK + // ======================================================================== + + /// Sobol-based global optimality check + pub(crate) fn sobol_global_check(&mut self) -> Result<()> { + if self.theta.nspp() == 0 || self.w.len() == 0 { + return Ok(()); + } + + let pyl = self.compute_pyl(); + let n_dims = self.ranges.len(); + + let mut max_d = f64::NEG_INFINITY; + let mut max_d_point = vec![0.0; n_dims]; + + for i in 0..SOBOL_GLOBAL_SAMPLES { + let idx = self.sobol_index + i as u32; + let mut point = Vec::with_capacity(n_dims); + + for dim in 0..n_dims { + let sobol_val = sample(idx, dim as u32, 0); + let (lo, hi) = self.ranges[dim]; + let margin = (hi - lo) * BOUNDARY_MARGIN; + point.push(lo + margin + sobol_val as f64 * (hi - lo - 2.0 * margin)); + } + + if let Ok(d) = self.compute_d(&point, &pyl) { + if d > max_d { + max_d = d; + max_d_point = point; + } + } + } + + self.sobol_index += SOBOL_GLOBAL_SAMPLES as u32; + self.last_global_d_max = max_d; + + let passed = max_d < GLOBAL_D_THRESHOLD; + + tracing::debug!( + "Global check: max_D = {:.4} (threshold {:.4}) → {}", + max_d, + GLOBAL_D_THRESHOLD, + if passed { "PASSED" } else { "FAILED" } + ); + + if passed { + self.global_check_passes += 1; + } else { + self.global_check_passes = 0; + + // Inject the violating point if it improves things + if self.theta.check_point(&max_d_point, THETA_D) { + self.theta.add_point(&max_d_point)?; + tracing::debug!("Injected global check point with D = {:.4}", max_d); + } + } + + Ok(()) + } +} diff --git a/src/algorithms/nonparametric/npopt/mod.rs b/src/algorithms/nonparametric/npopt/mod.rs new file mode 100644 index 000000000..b19b48406 --- 
/dev/null +++ b/src/algorithms/nonparametric/npopt/mod.rs @@ -0,0 +1,662 @@ +//! # NPOPT: Non-Parametric OPTimal Trajectory Algorithm +//! +//! A state-of-the-art hybrid algorithm combining the best elements from NPSAH, NPCAT, and NEXUS. +//! +//! ## Design Principles +//! 1. **Keep what works**: D-optimal refinement + Global optimality checks +//! 2. **Adaptive SA with reheat**: Prevents premature cooling, enables escape from local optima +//! 3. **Fisher-guided exploration**: Principled exploration in high-uncertainty directions +//! 4. **Simplified subject residual injection**: Targets missing modes directly +//! 5. **Elite preservation**: Prevents loss of good solutions during exploration +//! +//! ## Three-Phase Architecture +//! +//! ### Phase 1: Exploration (cycles 1-3) +//! - Stratified Sobol initialization for space-filling coverage +//! - Sparse adaptive grid expansion +//! - Track Fisher Information estimates +//! +//! ### Phase 2: Refinement (cycles 4+) +//! - Parallel D-optimal refinement (hierarchical iterations) +//! - Adaptive SA injection (with reheat mechanism) +//! - Fisher-guided expansion (high-variance directions only) +//! - Subject residual injection (top 3 poorly-fit subjects) +//! - Elite preservation (top 5 points) +//! - Periodic Sobol global check (every 3 cycles) +//! +//! ### Phase 3: Polishing (when global check passes) +//! - Full D-optimal refinement of all points +//! - No expansion +//! 
- Convergence when weights stable + P(Y|L) criterion met + +mod constants; +mod convergence; +mod expansion; +mod optimizers; + +pub use constants::*; + +use crate::algorithms::{ + NativeNonparametricConfig, NonparametricAlgorithmInput, Status, StopReason, +}; +use crate::estimation::nonparametric::ipm::burke; +use crate::estimation::nonparametric::qr; +use crate::estimation::nonparametric::sample_space_for_parameters; +use crate::estimation::nonparametric::{ + calculate_psi, CycleLog, NPCycle, NonparametricWorkspace, Psi, Theta, Weights, +}; +use crate::prelude::algorithms::Algorithms; + +use anyhow::{bail, Result}; +use ndarray::Array1; +use pharmsol::prelude::AssayErrorModel; +use pharmsol::prelude::{ + data::{AssayErrorModels, Data}, + simulator::Equation, +}; +use rand::prelude::*; +use std::collections::VecDeque; + +// ============================================================================ +// PHASE ENUM +// ============================================================================ + +/// Algorithm phase for NPOPT +#[derive(Debug, Clone, PartialEq)] +pub enum Phase { + /// Initial exploration with Sobol + grid + Exploration, + /// Balanced refinement with D-optimal, SA, Fisher + Refinement, + /// Final polishing, no expansion + Polishing, +} + +impl std::fmt::Display for Phase { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Phase::Exploration => write!(f, "Exploration"), + Phase::Refinement => write!(f, "Refinement"), + Phase::Polishing => write!(f, "Polishing"), + } + } +} + +// ============================================================================ +// ELITE POINT +// ============================================================================ + +/// An elite point preserved across cycles +#[derive(Debug, Clone)] +pub struct ElitePoint { + pub params: Vec, + pub d_value: f64, + pub cycle_added: usize, +} + +// ============================================================================ +// NPOPT STRUCT 
+// ============================================================================ + +/// NPOPT: Non-Parametric OPTimal Trajectory Algorithm +#[derive(Debug)] +pub struct NPOPT { + /// The pharmacometric equation/model + pub(crate) equation: E, + /// Parameter ranges for each dimension + pub(crate) ranges: Vec<(f64, f64)>, + /// Probability matrix: P(y_i | θ_j) + pub(crate) psi: Psi, + /// Support points (parameter values) + pub(crate) theta: Theta, + /// Weights from IPM before condensation + pub(crate) lambda: Weights, + /// Final weights after condensation + pub(crate) w: Weights, + /// Previous weights for stability check + pub(crate) w_prev: Weights, + /// Current grid spacing + pub(crate) eps: f64, + /// Previous objective function value + pub(crate) last_objf: f64, + /// Current objective function value + pub(crate) objf: f64, + /// Best objective function seen + pub(crate) best_objf: f64, + /// P(Y|L) values for convergence checking + pub(crate) f0: f64, + pub(crate) f1: f64, + /// Current cycle number + pub(crate) cycle: usize, + /// Step sizes for error model optimization + pub(crate) gamma_delta: Vec, + /// Error models for observations + pub(crate) error_models: AssayErrorModels, + /// Algorithm status + pub(crate) status: Status, + /// Cycle log for tracking progress + pub(crate) cycle_log: CycleLog, + /// Subject data + pub(crate) data: Data, + /// Unified runtime/model-derived configuration + pub(crate) config: NativeNonparametricConfig, + + // NPOPT specific fields + /// Current algorithm phase + pub(crate) phase: Phase, + /// History of objective function values + pub(crate) objf_history: Vec, + /// Sobol sequence index + pub(crate) sobol_index: u32, + + // Adaptive SA fields + /// SA temperature + pub(crate) temperature: f64, + /// Effective cooling rate (adaptive) + pub(crate) cooling_rate: f64, + /// Rolling window of acceptance ratios + pub(crate) sa_acceptance_history: VecDeque, + /// SA accepted count this cycle + pub(crate) sa_accepted: usize, 
+ /// SA proposed count this cycle + pub(crate) sa_proposed: usize, + + // Fisher Information + /// Diagonal approximation of Fisher Information + pub(crate) fisher_diagonal: Vec, + + // Elite preservation + /// Elite points preserved across cycles + pub(crate) elite_points: Vec, + + // Convergence tracking + /// Count of consecutive global check passes + pub(crate) global_check_passes: usize, + /// Last global check max D value + pub(crate) last_global_d_max: f64, + + /// Random number generator + pub(crate) rng: StdRng, +} + +// ============================================================================ +// ALGORITHMS TRAIT IMPLEMENTATION +// ============================================================================ + +impl Algorithms for NPOPT { + fn equation(&self) -> &E { + &self.equation + } + + fn into_workspace(&self) -> Result> { + NonparametricWorkspace::new( + self.equation.clone(), + self.data.clone(), + self.theta.clone(), + self.psi.clone(), + self.w.clone(), + -2. * self.objf, + self.cycle, + self.status.clone(), + self.config.run_configuration.clone(), + self.cycle_log.clone(), + ) + } + + fn error_models(&self) -> &AssayErrorModels { + &self.error_models + } + + fn data(&self) -> &Data { + &self.data + } + + fn get_prior(&self) -> Theta { + sample_space_for_parameters(&self.config.parameter_space, &self.config.prior).unwrap() + } + + fn likelihood(&self) -> f64 { + self.objf + } + + fn increment_cycle(&mut self) -> usize { + self.cycle += 1; + + // Phase transitions + if self.cycle > EXPLORATION_CYCLES && self.phase == Phase::Exploration { + self.phase = Phase::Refinement; + tracing::info!( + "NPOPT: Exploration → Refinement (cycle {}, {} SPPs)", + self.cycle, + self.theta.nspp() + ); + } + + // Adapt temperature + self.adapt_temperature(); + + // Track best objective + if self.objf > self.best_objf + THETA_G { + self.best_objf = self.objf; + } + + self.cycle + } + + fn cycle(&self) -> usize { + self.cycle + } + + fn set_theta(&mut self, theta: 
Theta) { + self.theta = theta; + } + + fn theta(&self) -> &Theta { + &self.theta + } + + fn psi(&self) -> &Psi { + &self.psi + } + + fn set_status(&mut self, status: Status) { + self.status = status; + } + + fn status(&self) -> &Status { + &self.status + } + + fn evaluation(&mut self) -> Result { + tracing::info!("Objective function = {:.4}", -2.0 * self.objf); + tracing::debug!( + "Support points: {} | Phase: {} | T: {:.4}", + self.theta.nspp(), + self.phase, + self.temperature + ); + + // Log error models + self.error_models.iter().for_each(|(outeq, em)| { + if AssayErrorModel::None != *em { + tracing::debug!( + "Error model outeq {}: {:.4}", + outeq, + em.factor().unwrap_or_default() + ); + } + }); + + // Track history + self.objf_history.push(self.objf); + + // Warn on decrease + if self.last_objf > self.objf + 1e-4 { + tracing::warn!( + "Objective function decreased: {:.4} → {:.4}", + -2.0 * self.last_objf, + -2.0 * self.objf + ); + } + + // Check convergence + let converged = self.check_convergence()?; + if converged { + tracing::info!("NPOPT converged after {} cycles", self.cycle); + self.set_status(Status::Stop(StopReason::Converged)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // NPAG-style eps convergence + if self.phase != Phase::Polishing { + if (self.last_objf - self.objf).abs() <= THETA_G && self.eps > MIN_EPS { + self.eps /= 2.0; + tracing::debug!("Halving eps to {:.6}", self.eps); + + if self.eps <= MIN_EPS { + let pyl = self.psi.matrix() * self.w.weights(); + self.f1 = pyl.iter().map(|x| x.ln()).sum(); + if (self.f1 - self.f0).abs() <= THETA_F { + // Transition to polishing + self.phase = Phase::Polishing; + tracing::info!("NPOPT: Refinement → Polishing (cycle {})", self.cycle); + } else { + self.f0 = self.f1; + self.eps = INITIAL_EPS; + } + } + } + } + + // Check maximum cycles + if self.cycle >= self.config.max_cycles { + tracing::warn!("Maximum cycles reached"); + self.set_status(Status::Stop(StopReason::MaxCycles)); + 
self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // Check for stop file + if std::path::Path::new("stop").exists() { + tracing::warn!("Stop file detected"); + self.set_status(Status::Stop(StopReason::Stopped)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + self.set_status(Status::Continue); + self.log_cycle_state(); + Ok(self.status().clone()) + } + + fn estimation(&mut self) -> Result<()> { + self.psi = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &self.error_models, + self.cycle == 1 && self.config.progress, + )?; + + if let Err(err) = self.validate_psi() { + bail!(err); + } + + (self.lambda, _) = match burke(&self.psi) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM during estimation: {:?}", err); + } + }; + Ok(()) + } + + fn condensation(&mut self) -> Result<()> { + // Store previous weights + self.w_prev = self.w.clone(); + + // Lambda filter + let max_lambda = self + .lambda + .iter() + .fold(f64::NEG_INFINITY, |acc, x| x.max(acc)); + + let threshold = max_lambda / LAMBDA_FILTER_DIVISOR; + let keep: Vec = self + .lambda + .iter() + .enumerate() + .filter(|(_, lam)| *lam > threshold) + .map(|(i, _)| i) + .collect(); + + let dropped = self.psi.matrix().ncols() - keep.len(); + if dropped > 0 { + tracing::debug!("Lambda filter dropped {} point(s)", dropped); + } + + self.theta.filter_indices(keep.as_slice()); + self.psi.filter_column_indices(keep.as_slice()); + + // QR rank-revealing factorization + let (r, perm) = qr::qrd(&self.psi)?; + let keep_n = self.psi.matrix().ncols().min(self.psi.matrix().nrows()); + let keep: Vec = (0..keep_n) + .filter(|&i| { + let test = r.col(i).norm_l2(); + let r_diag = r.get(i, i); + (r_diag / test).abs() >= 1e-8 + }) + .map(|i| *perm.get(i).unwrap()) + .collect(); + + let dropped = self.psi.matrix().ncols() - keep.len(); + if dropped > 0 { + tracing::debug!("QR dropped {} point(s)", dropped); + } + + 
self.theta.filter_indices(keep.as_slice()); + self.psi.filter_column_indices(keep.as_slice()); + + self.validate_psi()?; + + (self.lambda, self.objf) = match burke(&self.psi) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + return Err(anyhow::anyhow!( + "Error in IPM during condensation: {:?}", + err + )); + } + }; + self.w = self.lambda.clone(); + + // Update Fisher Information and elite points + self.update_fisher_information(); + self.update_elite_points()?; + + Ok(()) + } + + fn optimizations(&mut self) -> Result<()> { + // Standard error model optimization + self.error_models + .clone() + .iter_mut() + .filter_map(|(outeq, em)| { + if em.optimize() { + Some((outeq, em)) + } else { + None + } + }) + .try_for_each(|(outeq, em)| -> Result<()> { + let gamma_up = em.factor()? * (1.0 + self.gamma_delta[outeq]); + let gamma_down = em.factor()? / (1.0 + self.gamma_delta[outeq]); + + let mut error_model_up = self.error_models.clone(); + error_model_up.set_factor(outeq, gamma_up)?; + + let mut error_model_down = self.error_models.clone(); + error_model_down.set_factor(outeq, gamma_down)?; + + let psi_up = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &error_model_up, + false, + )?; + let psi_down = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &error_model_down, + false, + )?; + + let (lambda_up, objf_up) = match burke(&psi_up) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => bail!("Error in IPM during optim: {:?}", err), + }; + let (lambda_down, objf_down) = match burke(&psi_down) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => bail!("Error in IPM during optim: {:?}", err), + }; + + if objf_up > self.objf { + self.error_models.set_factor(outeq, gamma_up)?; + self.objf = objf_up; + self.gamma_delta[outeq] *= 4.0; + self.lambda = lambda_up; + self.psi = psi_up; + } + if objf_down > self.objf { + self.error_models.set_factor(outeq, gamma_down)?; + self.objf = objf_down; + self.gamma_delta[outeq] *= 4.0; + 
self.lambda = lambda_down; + self.psi = psi_down; + } + self.gamma_delta[outeq] *= 0.5; + if self.gamma_delta[outeq] <= 0.01 { + self.gamma_delta[outeq] = 0.1; + } + Ok(()) + })?; + + Ok(()) + } + + fn expansion(&mut self) -> Result<()> { + match self.phase { + Phase::Exploration => self.exploration_expansion()?, + Phase::Refinement => self.refinement_expansion()?, + Phase::Polishing => self.polishing_expansion()?, + } + Ok(()) + } + + fn log_cycle_state(&mut self) { + let state = NPCycle::new( + self.cycle, + -2. * self.objf, + self.error_models.clone(), + self.theta.clone(), + self.theta.nspp(), + (self.last_objf - self.objf).abs(), + self.status.clone(), + ); + self.cycle_log.push(state); + self.last_objf = self.objf; + } +} + +// ============================================================================ +// HELPER METHODS +// ============================================================================ + +impl NPOPT { + pub(crate) fn from_input(input: NonparametricAlgorithmInput) -> Result> { + let config = input.native_config()?; + let seed = config.prior.seed().unwrap_or(42); + let n_params = config.ranges.len(); + let error_models = input.error_models().clone(); + + Ok(Box::new(Self { + equation: input.equation, + ranges: config.ranges.clone(), + psi: Psi::new(), + theta: Theta::new(), + lambda: Weights::default(), + w: Weights::default(), + w_prev: Weights::default(), + eps: INITIAL_EPS, + last_objf: -1e30, + objf: f64::NEG_INFINITY, + best_objf: f64::NEG_INFINITY, + f0: -1e30, + f1: f64::default(), + cycle: 0, + gamma_delta: vec![0.1; error_models.len()], + error_models, + status: Status::Continue, + cycle_log: CycleLog::new(), + data: input.data, + config, + phase: Phase::Exploration, + objf_history: Vec::with_capacity(500), + sobol_index: seed as u32, + temperature: INITIAL_TEMPERATURE, + cooling_rate: BASE_COOLING_RATE, + sa_acceptance_history: VecDeque::with_capacity(SA_HISTORY_WINDOW), + sa_accepted: 0, + sa_proposed: 0, + fisher_diagonal: vec![1.0; 
n_params], + elite_points: Vec::with_capacity(ELITE_COUNT * 2), + global_check_passes: 0, + last_global_d_max: f64::INFINITY, + rng: StdRng::seed_from_u64(seed as u64), + })) + } + + /// Compute P(Y|G) = Psi * w + pub(crate) fn compute_pyl(&self) -> Array1 { + let psi = self.psi.to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + psi.dot(&w) + } + + /// Compute D-criterion for a candidate point + pub(crate) fn compute_d(&self, point: &[f64], pyl: &Array1) -> Result { + let theta_single = ndarray::Array1::from(point.to_vec()).insert_axis(ndarray::Axis(0)); + + let psi_single = pharmsol::prelude::simulator::log_likelihood_matrix( + &self.equation, + &self.data, + &theta_single, + &self.error_models, + false, + )? + .mapv(f64::exp); + + let nsub = psi_single.nrows() as f64; + let mut d_sum = -nsub; + + for (p_i, pyl_i) in psi_single.iter().zip(pyl.iter()) { + if *pyl_i > 0.0 { + d_sum += p_i / pyl_i; + } + } + + Ok(d_sum) + } + + /// Compute weighted centroid of support points + pub(crate) fn compute_weighted_centroid(&self) -> Vec { + let n_params = self.ranges.len(); + let mut centroid = vec![0.0; n_params]; + let mut total_weight = 0.0; + + for (i, spp) in self.theta.matrix().row_iter().enumerate() { + let weight = if i < self.w.len() { self.w[i] } else { 0.0 }; + total_weight += weight; + for (j, val) in spp.iter().enumerate() { + centroid[j] += weight * val; + } + } + + if total_weight > 0.0 { + for c in &mut centroid { + *c /= total_weight; + } + } else { + for (j, (lo, hi)) in self.ranges.iter().enumerate() { + centroid[j] = (lo + hi) / 2.0; + } + } + + centroid + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_phase_display() { + assert_eq!(format!("{}", Phase::Exploration), "Exploration"); + assert_eq!(format!("{}", Phase::Refinement), "Refinement"); + assert_eq!(format!("{}", Phase::Polishing), "Polishing"); + } + + #[test] + fn test_constants() { + assert!(EXPLORATION_CYCLES > 0); + assert!(INITIAL_TEMPERATURE > 
MIN_TEMPERATURE); + assert!(BASE_COOLING_RATE > 0.0 && BASE_COOLING_RATE < 1.0); + assert!(ELITE_COUNT > 0); + } +} diff --git a/src/algorithms/nonparametric/npopt/optimizers.rs b/src/algorithms/nonparametric/npopt/optimizers.rs new file mode 100644 index 000000000..4eb6b193c --- /dev/null +++ b/src/algorithms/nonparametric/npopt/optimizers.rs @@ -0,0 +1,221 @@ +//! Optimizers for NPOPT algorithm + +use super::constants::*; +use argmin::core::{CostFunction, Error, Executor}; +use argmin::solver::neldermead::NelderMead; +use ndarray::{Array1, Axis}; +use pharmsol::prelude::{ + data::{AssayErrorModels, Data}, + simulator::Equation, +}; +use pharmsol::Subject; + +// ============================================================================ +// D-OPTIMAL OPTIMIZER +// ============================================================================ + +/// Optimizer for D-criterion maximization +pub struct DOptimalOptimizer<'a, E: Equation> { + pub equation: &'a E, + pub data: &'a Data, + pub error_models: &'a AssayErrorModels, + pub pyl: &'a Array1, +} + +impl CostFunction for DOptimalOptimizer<'_, E> { + type Param = Vec; + type Output = f64; + + fn cost(&self, spp: &Self::Param) -> Result { + let theta = Array1::from(spp.clone()).insert_axis(Axis(0)); + + let psi = pharmsol::prelude::simulator::log_likelihood_matrix( + self.equation, + self.data, + &theta, + self.error_models, + false, + )? 
+ .mapv(f64::exp); + + let nsub = psi.nrows() as f64; + let mut d_sum = -nsub; + for (p_i, pyl_i) in psi.iter().zip(self.pyl.iter()) { + if *pyl_i > 0.0 { + d_sum += p_i / pyl_i; + } + } + + Ok(-d_sum) // Minimize -D = Maximize D + } +} + +impl<'a, E: Equation> DOptimalOptimizer<'a, E> { + /// Optimize a point using Nelder-Mead + pub fn optimize(self, start: Vec, max_iters: u64) -> Result, Error> { + let simplex = create_initial_simplex(&start, 0.05); + let solver: NelderMead, f64> = NelderMead::new(simplex).with_sd_tolerance(1e-3)?; + + let res = Executor::new(self, solver) + .configure(|state| state.max_iters(max_iters)) + .run()?; + + Ok(res.state.best_param.unwrap()) + } +} + +// ============================================================================ +// SUBJECT MAP OPTIMIZER +// ============================================================================ + +/// Optimizer for finding MAP estimate for a single subject +pub struct SubjectMapOptimizer<'a, E: Equation> { + pub equation: &'a E, + pub subject: &'a Subject, + pub error_models: &'a AssayErrorModels, + pub ranges: &'a [(f64, f64)], +} + +impl CostFunction for SubjectMapOptimizer<'_, E> { + type Param = Vec; + type Output = f64; + + fn cost(&self, params: &Self::Param) -> Result { + // Clamp to bounds + let clamped: Vec = params + .iter() + .zip(self.ranges.iter()) + .map(|(v, (lo, hi))| v.clamp(*lo, *hi)) + .collect(); + + // Create single-subject data + let single_data = Data::new(vec![self.subject.clone()]); + let theta = ndarray::Array1::from(clamped).insert_axis(Axis(0)); + + let psi = pharmsol::prelude::simulator::log_likelihood_matrix( + self.equation, + &single_data, + &theta, + self.error_models, + false, + )? 
+ .mapv(f64::exp); + + // Minimize -log P(y|θ) = Maximize P(y|θ) + let p = psi.iter().next().unwrap_or(&1e-300); + let log_p = if *p > 0.0 { p.ln() } else { -700.0 }; + + Ok(-log_p) + } +} + +impl<'a, E: Equation> SubjectMapOptimizer<'a, E> { + /// Optimize to find MAP estimate + pub fn optimize(self, start: Vec, max_iters: u64) -> Result, Error> { + let ranges = self.ranges; + let simplex = create_initial_simplex_bounded(&start, ranges, 0.05); + let solver: NelderMead, f64> = NelderMead::new(simplex).with_sd_tolerance(1e-3)?; + + let res = Executor::new(self, solver) + .configure(|state| state.max_iters(max_iters)) + .run()?; + + // Clamp result to bounds + let result = res.state.best_param.unwrap(); + let clamped: Vec = result + .iter() + .zip(ranges.iter()) + .map(|(v, (lo, hi))| { + let margin = (hi - lo) * BOUNDARY_MARGIN; + v.clamp(lo + margin, hi - margin) + }) + .collect(); + + Ok(clamped) + } +} + +// ============================================================================ +// UTILITY FUNCTIONS +// ============================================================================ + +/// Create initial simplex for Nelder-Mead +fn create_initial_simplex(initial_point: &[f64], perturbation_frac: f64) -> Vec> { + let num_dims = initial_point.len(); + + let mut vertices = Vec::with_capacity(num_dims + 1); + vertices.push(initial_point.to_vec()); + + for i in 0..num_dims { + let perturbation = if initial_point[i] == 0.0 { + 0.001 + } else { + perturbation_frac * initial_point[i].abs() + }; + + let mut perturbed = initial_point.to_vec(); + perturbed[i] += perturbation; + vertices.push(perturbed); + } + + vertices +} + +/// Create initial simplex with bounds awareness +fn create_initial_simplex_bounded( + initial_point: &[f64], + ranges: &[(f64, f64)], + perturbation_frac: f64, +) -> Vec> { + let num_dims = initial_point.len(); + + let mut vertices = Vec::with_capacity(num_dims + 1); + vertices.push(initial_point.to_vec()); + + for i in 0..num_dims { + let (lo, hi) 
= ranges[i]; + let range = hi - lo; + let perturbation = perturbation_frac * range; + + let mut perturbed = initial_point.to_vec(); + let new_val = initial_point[i] + perturbation; + + if new_val <= hi { + perturbed[i] = new_val; + } else { + perturbed[i] = initial_point[i] - perturbation; + } + + vertices.push(perturbed); + } + + vertices +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simplex_creation() { + let point = vec![1.0, 2.0, 3.0]; + let simplex = create_initial_simplex(&point, 0.05); + + assert_eq!(simplex.len(), 4); // n+1 vertices + assert_eq!(simplex[0], point); + } + + #[test] + fn test_simplex_bounded() { + let point = vec![0.5, 0.95]; + let ranges = vec![(0.0, 1.0), (0.0, 1.0)]; + let simplex = create_initial_simplex_bounded(&point, &ranges, 0.05); + + assert_eq!(simplex.len(), 3); + for vertex in &simplex { + for (i, val) in vertex.iter().enumerate() { + assert!(*val >= ranges[i].0 && *val <= ranges[i].1); + } + } + } +} diff --git a/src/algorithms/nonparametric/nppso/constants.rs b/src/algorithms/nonparametric/nppso/constants.rs new file mode 100644 index 000000000..b74628247 --- /dev/null +++ b/src/algorithms/nonparametric/nppso/constants.rs @@ -0,0 +1,114 @@ +//! 
Constants for NPPSO algorithm + +// ============================================================================ +// CONVERGENCE CONSTANTS (matching NPAG/NPSAH) +// ============================================================================ + +/// Grid spacing convergence threshold +pub const THETA_E: f64 = 1e-4; + +/// Objective function convergence threshold +pub const THETA_G: f64 = 1e-4; + +/// P(Y|L) convergence criterion +pub const THETA_F: f64 = 1e-2; + +/// Distance threshold for new points +pub const THETA_D: f64 = 1e-4; + +// ============================================================================ +// PSO PARAMETERS +// ============================================================================ + +/// Number of particles in swarm +pub const SWARM_SIZE: usize = 40; + +/// Inertia weight bounds (adaptive) +pub const INERTIA_MAX: f64 = 0.9; +pub const INERTIA_MIN: f64 = 0.4; + +/// Cognitive weight (attraction to personal best) +pub const COGNITIVE_WEIGHT: f64 = 2.0; + +/// Social weight (attraction to global best) +pub const SOCIAL_WEIGHT: f64 = 2.0; + +/// Max velocity as fraction of range +pub const MAX_VELOCITY_FRACTION: f64 = 0.15; + +/// Boundary margin (fraction of range) +pub const BOUNDARY_MARGIN: f64 = 0.001; + +// ============================================================================ +// ALGORITHM PHASES +// ============================================================================ + +/// Number of warm-up cycles using NPAG-style grid expansion +pub const WARMUP_CYCLES: usize = 3; + +/// Fraction of max D-criterion to use as threshold for adding points +pub const D_THRESHOLD_FRACTION: f64 = 0.5; + +/// Convergence threshold for swarm clustering +pub const CONVERGENCE_THRESHOLD: f64 = 0.8; + +/// Fraction of particles to reinject when converging +pub const REINJECT_FRACTION: f64 = 0.25; + +// ============================================================================ +// GLOBAL OPTIMALITY CHECK +// 
============================================================================ + +/// Number of random samples for global optimality check +pub const GLOBAL_CHECK_SAMPLES: usize = 500; + +/// D-criterion threshold for global optimality (should be near 0 when optimal) +pub const GLOBAL_D_THRESHOLD: f64 = 0.01; + +// ============================================================================ +// SUBJECT MAP & D-OPTIMAL REFINEMENT +// ============================================================================ + +/// Number of worst-fit subjects to target with MAP injection +pub const RESIDUAL_SUBJECTS: usize = 2; + +/// Max evaluations for subject MAP optimization (COBYLA) +pub const SUBJECT_MAP_EVALS: usize = 100; + +/// Max evaluations for D-optimal refinement (COBYLA) +pub const DOPT_REFINE_EVALS: usize = 50; + +/// Perform D-optimal refinement every N cycles +pub const DOPT_REFINE_INTERVAL: usize = 10; + +/// Weight threshold for D-optimal refinement (only refine important points) +pub const DOPT_WEIGHT_THRESHOLD: f64 = 0.05; + +// ============================================================================ +// SIMULATED ANNEALING (for escaping local optima) +// ============================================================================ + +/// Initial SA temperature (high for exploration) +pub const SA_INITIAL_TEMP: f64 = 3.0; + +/// Cooling rate per cycle +pub const SA_COOLING_RATE: f64 = 0.95; + +/// Minimum temperature +pub const SA_MIN_TEMP: f64 = 0.05; + +/// Number of SA injection attempts per cycle +pub const SA_INJECT_COUNT: usize = 40; + +/// Reheat when temperature drops and objf stagnates +pub const SA_REHEAT_FACTOR: f64 = 2.0; + +// ============================================================================ +// ELITE PRESERVATION +// ============================================================================ + +/// Number of elite points to preserve +pub const ELITE_COUNT: usize = 5; + +/// Maximum age (cycles) before elite point is removed +pub 
const ELITE_MAX_AGE: usize = 20; diff --git a/src/algorithms/nonparametric/nppso/mod.rs b/src/algorithms/nonparametric/nppso/mod.rs new file mode 100644 index 000000000..b4e61c743 --- /dev/null +++ b/src/algorithms/nonparametric/nppso/mod.rs @@ -0,0 +1,1040 @@ +//! # NPPSO: Non-Parametric Particle Swarm Optimization +//! +//! A true PSO-based algorithm for non-parametric population modeling. +//! +//! ## Key Innovation: D-Criterion Guided Swarm + Subject Targeting +//! +//! Unlike standard PSO which optimizes a single objective, NPPSO particles +//! search for regions of parameter space that maximize the D-optimality criterion. +//! Additionally, we target poorly-fit subjects with MAP estimation to ensure +//! all subjects are well-represented. +//! +//! ## Why This Works +//! +//! 1. **Momentum escapes local optima**: Velocity-based movement allows particles +//! to overshoot and explore beyond current best positions +//! 2. **Collective learning**: The swarm shares information about high-D regions +//! 3. **Subject targeting**: MAP injection for poorly-fit subjects ensures coverage +//! 4. **D-optimal refinement**: COBYLA polishes support point positions +//! 5. **Elite preservation**: Best points are preserved across cycles +//! +//! ## Algorithm Structure +//! +//! - **Warm-up (cycles 1-3)**: NPAG-style grid expansion for broad coverage +//! - **PSO Phase**: Particles search for high D-criterion regions +//! - **Subject MAP**: Inject points for poorly-fit subjects +//! - **D-refinement**: Polish existing support points with COBYLA +//! 
- **Elite preservation**: Maintain best points across cycles + +mod constants; +mod optimizers; +mod swarm; + +pub use constants::*; +use optimizers::{optimize_subject_map, refine_d_optimal, ElitePoint}; + +use crate::algorithms::{NativeNonparametricConfig, NonparametricAlgorithmInput, StopReason}; +use crate::estimation::nonparametric::adaptative_grid; +use crate::estimation::nonparametric::ipm::burke; +use crate::estimation::nonparametric::qr; +use crate::estimation::nonparametric::sample_space_for_parameters; +use crate::estimation::nonparametric::{ + calculate_psi, CycleLog, NPCycle, NonparametricWorkspace, Psi, Theta, Weights, +}; +use crate::{algorithms::Status, prelude::algorithms::Algorithms}; + +use anyhow::{bail, Result}; +use ndarray::parallel::prelude::{IntoParallelIterator, ParallelIterator}; +use ndarray::{Array, Array1, ArrayBase, Axis, Dim, OwnedRepr}; +use pharmsol::prelude::data::Data; +use pharmsol::prelude::simulator::Equation; +use pharmsol::{prelude::AssayErrorModel, AssayErrorModels, Subject}; +use rand::prelude::*; +use rand::SeedableRng; +use swarm::Swarm; + +// ============================================================================ +// NPPSO STRUCT +// ============================================================================ + +pub struct NPPSO { + equation: E, + ranges: Vec<(f64, f64)>, + psi: Psi, + theta: Theta, + lambda: Weights, + w: Weights, + eps: f64, + last_objf: f64, + objf: f64, + f0: f64, + f1: f64, + cycle: usize, + gamma_delta: Vec, + error_models: AssayErrorModels, + status: Status, + cycle_log: CycleLog, + data: Data, + config: NativeNonparametricConfig, + + // PSO specific + swarm: Swarm, + objf_history: Vec, + rng: StdRng, + /// Cached pyl vector for D-criterion evaluation + pyl: Array1, + /// Phase: true = warm-up (grid expansion), false = PSO-driven + in_warmup: bool, + /// Elite points preserved across cycles + elite_points: Vec, + /// SA temperature for escaping local optima + sa_temperature: f64, + /// Track 
stagnation for SA reheat + stagnation_count: usize, +} + +// ============================================================================ +// ALGORITHMS TRAIT +// ============================================================================ + +impl Algorithms for NPPSO { + fn equation(&self) -> &E { + &self.equation + } + + fn error_models(&self) -> &AssayErrorModels { + &self.error_models + } + + fn data(&self) -> &Data { + &self.data + } + + fn get_prior(&self) -> Theta { + sample_space_for_parameters(&self.config.parameter_space, &self.config.prior).unwrap() + } + + fn likelihood(&self) -> f64 { + self.objf + } + + fn increment_cycle(&mut self) -> usize { + self.cycle += 1; + + // Exit warm-up after WARMUP_CYCLES + if self.cycle > WARMUP_CYCLES && self.in_warmup { + self.in_warmup = false; + tracing::info!("NPPSO: Warm-up complete, entering PSO-driven expansion"); + } + + self.cycle + } + + fn cycle(&self) -> usize { + self.cycle + } + + fn set_theta(&mut self, theta: Theta) { + self.theta = theta; + } + + fn theta(&self) -> &Theta { + &self.theta + } + + fn psi(&self) -> &Psi { + &self.psi + } + + fn set_status(&mut self, status: Status) { + self.status = status; + } + + fn status(&self) -> &Status { + &self.status + } + + fn log_cycle_state(&mut self) { + let state = NPCycle::new( + self.cycle, + -2.0 * self.objf, + self.error_models.clone(), + self.theta.clone(), + self.theta.nspp(), + (self.last_objf - self.objf).abs(), + self.status.clone(), + ); + self.cycle_log.push(state); + self.last_objf = self.objf; + } + + fn evaluation(&mut self) -> Result { + tracing::info!("Objective function = {:.4}", -2.0 * self.objf); + tracing::debug!( + "Support points: {} | Phase: {} | EPS: {:.4}", + self.theta.nspp(), + if self.in_warmup { "Warm-up" } else { "PSO" }, + self.eps + ); + + self.error_models.iter().for_each(|(outeq, em)| { + if AssayErrorModel::None != *em { + tracing::debug!( + "Error model outeq {}: {:.4}", + outeq, + em.factor().unwrap_or_default() + ); + } + 
}); + + self.objf_history.push(self.objf); + + if self.last_objf > self.objf + 1e-4 { + tracing::warn!( + "Objective decreased: {:.4} → {:.4}", + -2.0 * self.last_objf, + -2.0 * self.objf + ); + } + + // Track stagnation for SA reheat + if (self.last_objf - self.objf).abs() < 1e-3 { + self.stagnation_count += 1; + // Reheat SA if stagnating and temperature is low + if self.stagnation_count > 5 && self.sa_temperature < SA_INITIAL_TEMP * 0.3 { + self.sa_temperature = (self.sa_temperature * SA_REHEAT_FACTOR).min(SA_INITIAL_TEMP); + self.stagnation_count = 0; + tracing::debug!("SA reheat to T={:.3}", self.sa_temperature); + } + } else { + self.stagnation_count = 0; + } + + // NPAG-style convergence with eps halving + let psi = self.psi.matrix(); + let w = &self.w; + + if (self.last_objf - self.objf).abs() <= THETA_G && self.eps > THETA_E { + self.eps /= 2.0; + tracing::debug!("Halving eps to {:.6}", self.eps); + + if self.eps <= THETA_E { + let pyl = psi * w.weights(); + self.f1 = pyl.iter().map(|x| x.ln()).sum(); + + if (self.f1 - self.f0).abs() <= THETA_F { + // Also check global optimality via swarm + let global_check = self.global_optimality_check()?; + if global_check { + tracing::info!("NPPSO converged after {} cycles", self.cycle); + self.status = Status::Stop(StopReason::Converged); + self.log_cycle_state(); + return Ok(self.status.clone()); + } else { + tracing::debug!("P(Y|L) criterion met but global check failed, continuing"); + self.f0 = self.f1; + self.eps = 0.2; + } + } else { + self.f0 = self.f1; + self.eps = 0.2; + } + } + } + + // Max cycles check + if self.cycle >= self.config.max_cycles { + tracing::warn!("Maximum cycles reached"); + self.status = Status::Stop(StopReason::MaxCycles); + self.log_cycle_state(); + return Ok(self.status.clone()); + } + + // Stop file check + if std::path::Path::new("stop").exists() { + tracing::warn!("Stop file detected"); + self.status = Status::Stop(StopReason::Stopped); + self.log_cycle_state(); + return 
Ok(self.status.clone()); + } + + self.log_cycle_state(); + Ok(self.status.clone()) + } + + fn estimation(&mut self) -> Result<()> { + self.psi = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &self.error_models, + self.cycle == 1 && self.config.progress, + )?; + + if let Err(err) = self.validate_psi() { + bail!(err); + } + + let (lambda, _) = burke(&self.psi)?; + self.lambda = lambda; + + Ok(()) + } + + fn condensation(&mut self) -> Result<()> { + // Lambda threshold pruning (more aggressive: 1/10000) + let max_lambda = self + .lambda + .iter() + .fold(f64::NEG_INFINITY, |acc, x| x.max(acc)); + + let threshold = max_lambda / 10000.0; + let mut keep: Vec = self + .lambda + .iter() + .enumerate() + .filter(|(_, lam)| *lam > threshold) + .map(|(i, _)| i) + .collect(); + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!( + "Lambda pruning dropped {} SPP", + self.psi.matrix().ncols() - keep.len() + ); + } + + self.theta.filter_indices(&keep); + self.psi.filter_column_indices(&keep); + + // QR rank-revealing factorization + let (r, perm) = qr::qrd(&self.psi)?; + let keep_n = self.psi.matrix().ncols().min(self.psi.matrix().nrows()); + + keep.clear(); + for i in 0..keep_n { + let test = r.col(i).norm_l2(); + let r_diag = r.get(i, i); + if (r_diag / test).abs() >= 1e-8 { + keep.push(*perm.get(i).unwrap()); + } + } + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!("QR dropped {} SPP", self.psi.matrix().ncols() - keep.len()); + } + + self.theta.filter_indices(&keep); + self.psi.filter_column_indices(&keep); + + self.validate_psi()?; + + let (lambda, objf) = burke(&self.psi)?; + self.lambda = lambda; + self.objf = objf; + self.w = self.lambda.clone(); + + // Update pyl for D-criterion calculations + let psi = self.psi.to_ndarray(); + let w_arr: Array1 = self.w.iter().collect(); + self.pyl = psi.dot(&w_arr); + + Ok(()) + } + + fn optimizations(&mut self) -> Result<()> { + // Update swarm with D-criterion fitness + 
self.update_swarm_fitness()?; + + // Standard error model optimization + self.optimize_error_models()?; + + Ok(()) + } + + fn expansion(&mut self) -> Result<()> { + if self.in_warmup { + // Warm-up: NPAG-style grid expansion for broad coverage + self.warmup_expansion()?; + } else { + // PSO-driven expansion + self.pso_expansion()?; + } + + Ok(()) + } + + fn into_workspace(&self) -> Result> { + NonparametricWorkspace::new( + self.equation.clone(), + self.data.clone(), + self.theta.clone(), + self.psi.clone(), + self.w.clone(), + -2.0 * self.objf, + self.cycle, + self.status.clone(), + self.config.run_configuration.clone(), + self.cycle_log.clone(), + ) + } +} + +// ============================================================================ +// NPPSO SPECIFIC METHODS +// ============================================================================ + +impl NPPSO { + pub(crate) fn from_input(input: NonparametricAlgorithmInput) -> Result> { + let config = input.native_config()?; + let seed = config.prior.seed().unwrap_or(42) as u64; + let ranges = config.ranges.clone(); + let n_dims = ranges.len(); + let n_subjects = input.data.len(); + let error_models = input.error_models().clone(); + + Ok(Box::new(Self { + equation: input.equation, + ranges: ranges.clone(), + psi: Psi::new(), + theta: Theta::new(), + lambda: Weights::default(), + w: Weights::default(), + eps: 0.2, + last_objf: -1e30, + objf: f64::NEG_INFINITY, + f0: -1e30, + f1: f64::default(), + cycle: 0, + gamma_delta: vec![0.1; error_models.len()], + error_models, + status: Status::Continue, + cycle_log: CycleLog::new(), + data: input.data, + config, + swarm: Swarm::new(n_dims, &ranges, seed), + objf_history: Vec::with_capacity(500), + rng: StdRng::seed_from_u64(seed), + pyl: Array1::ones(n_subjects), + in_warmup: true, + elite_points: Vec::with_capacity(ELITE_COUNT), + sa_temperature: SA_INITIAL_TEMP, + stagnation_count: 0, + })) + } + + /// Warm-up expansion using adaptive grid (like NPAG) + fn 
warmup_expansion(&mut self) -> Result<()> { + tracing::debug!("NPPSO warm-up: adaptive grid expansion"); + adaptative_grid(&mut self.theta, self.eps, &self.ranges, THETA_D)?; + Ok(()) + } + + /// PSO-driven expansion: particles search for high-D regions + fn pso_expansion(&mut self) -> Result<()> { + let initial_points = self.theta.nspp(); + + // 1. Evaluate D-criterion for all particles (in parallel) + let particle_fitness = self.evaluate_particle_fitness()?; + + // 2. Update particle personal bests + self.swarm.update_personal_bests(&particle_fitness); + + // 3. PSO velocity/position update + let inertia = self.adaptive_inertia(); + self.swarm.update_all( + inertia, + COGNITIVE_WEIGHT, + SOCIAL_WEIGHT, + &self.ranges, + &mut self.rng, + ); + + // 4. Add high-fitness particles as support point candidates + let max_fitness = particle_fitness + .iter() + .cloned() + .fold(f64::NEG_INFINITY, f64::max); + let threshold = max_fitness * D_THRESHOLD_FRACTION; + + let mut added = 0; + for (i, particle) in self.swarm.particles().iter().enumerate() { + if particle_fitness[i] > threshold.max(0.0) { + if self.theta.check_point(&particle.position, THETA_D) { + self.theta.add_point(&particle.position)?; + added += 1; + } + } + } + + // 5. Also add from personal bests (memory of good regions) + for particle in self.swarm.particles() { + if particle.pbest_fitness > threshold.max(0.0) { + if self.theta.check_point(&particle.pbest_position, THETA_D) { + self.theta.add_point(&particle.pbest_position)?; + added += 1; + } + } + } + + // 6. Simulated Annealing injection (key for escaping local optima!) + self.sa_injection()?; + + // 7. Subject MAP injection for poorly-fit subjects + self.inject_subject_maps()?; + + // 8. D-optimal refinement (periodically) + if self.cycle % DOPT_REFINE_INTERVAL == 0 && self.cycle > WARMUP_CYCLES { + self.d_optimal_refinement()?; + } + + // 9. Inject elite points + self.inject_elite_points()?; + + // 10. 
Sparse grid expansion to fill gaps every few cycles
        if self.cycle % 3 == 0 {
            let sparse_eps = self.eps * 0.5;
            adaptative_grid(&mut self.theta, sparse_eps, &self.ranges, THETA_D * 2.0)?;
        }

        // 11. Reinject diversity if swarm is converging
        if self.swarm_convergence_ratio() > CONVERGENCE_THRESHOLD {
            let n_reinject = (SWARM_SIZE as f64 * REINJECT_FRACTION) as usize;
            self.swarm
                .reinject_random(&self.ranges, &mut self.rng, n_reinject);
            tracing::debug!("Swarm converging, reinjected {} particles", n_reinject);
        }

        // 12. Cool SA temperature
        self.sa_temperature = (self.sa_temperature * SA_COOLING_RATE).max(SA_MIN_TEMP);

        tracing::debug!(
            "PSO expansion: {} → {} (added {}), SA temp={:.3}",
            initial_points,
            self.theta.nspp(),
            added,
            self.sa_temperature
        );

        Ok(())
    }

    // ========================================================================
    // SIMULATED ANNEALING INJECTION
    // ========================================================================

    /// SA-style random injection with Metropolis acceptance.
    ///
    /// Candidates with a positive D-criterion are always accepted; candidates
    /// with a negative D-criterion are accepted with probability exp(D / T),
    /// which lets the algorithm escape local optima while the temperature is
    /// still high. This is KEY for escaping local optima that PSO can't escape.
    fn sa_injection(&mut self) -> Result<()> {
        if self.sa_temperature < SA_MIN_TEMP {
            return Ok(());
        }

        let mut accepted = 0;
        let mut proposed = 0;

        // Propose up to 3x the target count so rejections do not starve the
        // per-cycle injection budget
        for _ in 0..SA_INJECT_COUNT * 3 {
            proposed += 1;

            // Generate random point with margin from boundaries
            let point: Vec<f64> = self
                .ranges
                .iter()
                .map(|(lo, hi)| {
                    let margin = (hi - lo) * BOUNDARY_MARGIN;
                    self.rng.random_range((lo + margin)..(hi - margin))
                })
                .collect();

            // Compute D-criterion for this point; skip candidates that fail
            let d = match self.compute_d_criterion(&point) {
                Ok(d) => d,
                Err(_) => continue,
            };

            // Metropolis acceptance: always accept if D > 0,
            // probabilistically accept if D < 0 (allows exploration)
            let accept = if d > 0.0 {
                true
            } else {
                let p_accept = (d / self.sa_temperature).exp();
                self.rng.random::<f64>() < p_accept
            };

            if accept && self.theta.check_point(&point, THETA_D) {
                self.theta.add_point(&point)?;
                accepted += 1;
            }

            if accepted >= SA_INJECT_COUNT {
                break;
            }
        }

        if accepted > 0 {
            tracing::debug!(
                "SA injection: {}/{} accepted (T={:.3})",
                accepted,
                proposed,
                self.sa_temperature
            );
        }

        Ok(())
    }

    // ========================================================================
    // SUBJECT MAP INJECTION
    // ========================================================================

    /// Inject support points tailored to poorly-fit subjects.
    ///
    /// Subjects with the lowest mixture likelihood P(y|G) get a per-subject
    /// MAP optimization; the MAP point is added only when it improves the
    /// D-criterion and is not a near-duplicate of an existing support point.
    fn inject_subject_maps(&mut self) -> Result<()> {
        if self.theta.nspp() == 0 || self.w.len() == 0 {
            return Ok(());
        }

        let n_subjects = self.pyl.len();

        // Find worst-fit subjects (lowest P(y|G)). FIX: total_cmp is a
        // NaN-safe total order — the previous partial_cmp().unwrap() panicked
        // if any likelihood came out NaN.
        let mut indexed_pyl: Vec<(usize, f64)> = self.pyl.iter().cloned().enumerate().collect();
        indexed_pyl.sort_unstable_by(|a, b| a.1.total_cmp(&b.1));

        let n_residual = RESIDUAL_SUBJECTS.min(n_subjects);
        let subjects = self.data.subjects();

        let mut added = 0;

        for (subj_idx, pyl_val) in indexed_pyl.iter().take(n_residual) {
            // Skip if already well-fit
            if *pyl_val > 0.1 {
                continue;
            }

            let subject = &subjects[*subj_idx];

            // Start the MAP search from the weighted centroid of the grid
            let start = self.compute_weighted_centroid();

            // Find MAP estimate for this subject
            if let Ok(map_point) = optimize_subject_map(
                &self.equation,
                subject,
                &self.error_models,
                &self.ranges,
                &start,
                SUBJECT_MAP_EVALS,
            ) {
                // Check if point improves D-criterion
                let d = self.compute_d_criterion(&map_point)?;
                if d > 0.0 && self.theta.check_point(&map_point, THETA_D) {
                    self.theta.add_point(&map_point)?;
                    added += 1;
                    tracing::debug!(
                        "Subject {} MAP: pyl={:.4}, D={:.4}",
                        subject.id(),
                        pyl_val,
                        d
                    );
                }
            }
        }

        if added > 0 {
            tracing::debug!("Subject MAP injection: added {} points", added);
        }
        Ok(())
    }

    // ========================================================================
    // D-OPTIMAL REFINEMENT
// ======================================================================== + + /// Refine existing support points using COBYLA + fn d_optimal_refinement(&mut self) -> Result<()> { + if self.theta.nspp() == 0 || self.w.len() == 0 { + return Ok(()); + } + + let max_weight = self.w.iter().fold(f64::NEG_INFINITY, |a, b| a.max(b)); + let threshold = max_weight * DOPT_WEIGHT_THRESHOLD; + + let n_points = self.theta.nspp().min(self.w.len()); + let mut refined_count = 0; + + for i in 0..n_points { + // Only refine important points + if self.w[i] < threshold { + continue; + } + + let start: Vec = self.theta.matrix().row(i).iter().cloned().collect(); + + if let Ok(refined) = refine_d_optimal( + &self.equation, + &self.data, + &self.error_models, + &self.pyl, + &self.ranges, + &start, + DOPT_REFINE_EVALS, + ) { + // Check if refinement improves D + let d_old = self.compute_d_criterion(&start)?; + let d_new = self.compute_d_criterion(&refined)?; + + if d_new > d_old && self.theta.check_point(&refined, THETA_D) { + self.theta.add_point(&refined)?; + refined_count += 1; + } + } + } + + if refined_count > 0 { + tracing::debug!( + "D-optimal refinement: added {} refined points", + refined_count + ); + } + Ok(()) + } + + // ======================================================================== + // ELITE PRESERVATION + // ======================================================================== + + /// Update and inject elite points + fn inject_elite_points(&mut self) -> Result<()> { + // Age existing elite points + for elite in &mut self.elite_points { + elite.cycle_added += 1; + } + + // Remove old elite points + self.elite_points + .retain(|e| self.cycle.saturating_sub(e.cycle_added) < ELITE_MAX_AGE); + + // Find current top points by weight + if self.w.len() > 0 { + let n_spp = self.theta.nspp().min(self.w.len()); + let mut indexed_weights: Vec<(usize, f64)> = + self.w.iter().enumerate().take(n_spp).collect(); + indexed_weights.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); 
+ + for (idx, _weight) in indexed_weights.iter().take(ELITE_COUNT) { + if *idx >= self.theta.nspp() { + continue; + } + + let params: Vec = self.theta.matrix().row(*idx).iter().cloned().collect(); + let d_value = self.compute_d_criterion(¶ms).unwrap_or(0.0); + + // Check if already elite (within distance threshold) + let already_elite = self.elite_points.iter().any(|e| { + e.params + .iter() + .zip(params.iter()) + .map(|(a, b)| (a - b).abs()) + .sum::() + < THETA_D * 10.0 + }); + + if !already_elite && self.elite_points.len() < ELITE_COUNT * 2 { + self.elite_points.push(ElitePoint { + params: params.clone(), + d_value, + cycle_added: self.cycle, + }); + } + } + } + + // Re-inject elite points + let mut added = 0; + for elite in &self.elite_points { + if self.theta.check_point(&elite.params, THETA_D) { + self.theta.add_point(&elite.params)?; + added += 1; + } + } + + if added > 0 { + tracing::debug!("Elite injection: {} points", added); + } + Ok(()) + } + + /// Compute weighted centroid of current support points + fn compute_weighted_centroid(&self) -> Vec { + let n_dims = self.ranges.len(); + let mut centroid = vec![0.0; n_dims]; + let mut total_weight = 0.0; + + let n_points = self.theta.nspp().min(self.w.len()); + for i in 0..n_points { + let w = self.w[i]; + total_weight += w; + for d in 0..n_dims { + centroid[d] += w * self.theta.matrix().get(i, d); + } + } + + if total_weight > 0.0 { + for d in 0..n_dims { + centroid[d] /= total_weight; + } + } else { + // Fallback: center of ranges + for d in 0..n_dims { + centroid[d] = (self.ranges[d].0 + self.ranges[d].1) / 2.0; + } + } + + centroid + } + + /// Evaluate D-criterion for all particles in parallel + fn evaluate_particle_fitness(&self) -> Result> { + let positions: Vec> = self + .swarm + .particles() + .iter() + .map(|p| p.position.clone()) + .collect(); + + let fitness: Vec = positions + .into_par_iter() + .map(|pos| self.compute_d_criterion(&pos).unwrap_or(f64::NEG_INFINITY)) + .collect(); + + Ok(fitness) + 
} + + /// Compute D-criterion for a single point + fn compute_d_criterion(&self, point: &[f64]) -> Result { + let theta_single = Array1::from(point.to_vec()).insert_axis(Axis(0)); + + let psi_single = pharmsol::prelude::simulator::log_likelihood_matrix( + &self.equation, + &self.data, + &theta_single, + &self.error_models, + false, + )? + .mapv(f64::exp); + + let nsub = psi_single.nrows() as f64; + let mut d_sum = -nsub; + + for (p_i, pyl_i) in psi_single.iter().zip(self.pyl.iter()) { + if *pyl_i > 1e-300 { + d_sum += p_i / pyl_i; + } + } + + Ok(d_sum) + } + + /// Update swarm fitness based on D-criterion + fn update_swarm_fitness(&mut self) -> Result<()> { + let particle_fitness = self.evaluate_particle_fitness()?; + + // Update personal bests + self.swarm.update_personal_bests(&particle_fitness); + + // Find and update global best + if let Some((best_idx, best_fitness)) = particle_fitness + .iter() + .enumerate() + .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)) + { + let best_pos = self.swarm.particles()[best_idx].position.clone(); + self.swarm.update_global_best(&best_pos, *best_fitness); + + tracing::debug!( + "Swarm update: best D = {:.4}, gbest D = {:.4}", + best_fitness, + self.swarm.gbest_fitness() + ); + } + + Ok(()) + } + + /// Compute swarm convergence ratio (how clustered the particles are) + fn swarm_convergence_ratio(&self) -> f64 { + let positions = self.swarm.get_positions(); + if positions.is_empty() { + return 0.0; + } + + let n_dims = self.ranges.len(); + let n_particles = positions.len(); + + // Compute centroid + let mut centroid = vec![0.0; n_dims]; + for pos in &positions { + for (j, val) in pos.iter().enumerate() { + centroid[j] += val; + } + } + for c in &mut centroid { + *c /= n_particles as f64; + } + + // Compute average normalized distance from centroid + let mut total_dist = 0.0; + for pos in &positions { + let mut dist = 0.0; + for (j, val) in pos.iter().enumerate() { + let range = self.ranges[j].1 - 
self.ranges[j].0; + let normalized = (val - centroid[j]) / range; + dist += normalized * normalized; + } + total_dist += dist.sqrt(); + } + + let avg_dist = total_dist / n_particles as f64; + + // Return inverse: high value means converged (clustered) + 1.0 / (1.0 + avg_dist * 10.0) + } + + /// Adaptive inertia based on improvement rate + fn adaptive_inertia(&self) -> f64 { + if self.objf_history.len() < 3 { + return INERTIA_MAX; + } + + let recent: Vec = self.objf_history.iter().rev().take(5).copied().collect(); + let improvement = if recent.len() >= 2 { + (recent[0] - recent[recent.len() - 1]).abs() + } else { + 1.0 + }; + + // High improvement → high inertia (explore) + // Low improvement → low inertia (exploit) + if improvement > 1.0 { + INERTIA_MAX + } else if improvement > 0.1 { + (INERTIA_MAX + INERTIA_MIN) / 2.0 + } else { + INERTIA_MIN + } + } + + /// Global optimality check using swarm exploration + fn global_optimality_check(&mut self) -> Result { + // Sample random points and check if any have high D-criterion + let n_samples = GLOBAL_CHECK_SAMPLES; + let mut max_d = f64::NEG_INFINITY; + + for _ in 0..n_samples { + let point: Vec = self + .ranges + .iter() + .map(|(lo, hi)| self.rng.random_range(*lo..*hi)) + .collect(); + + let d = self.compute_d_criterion(&point)?; + max_d = max_d.max(d); + } + + // Also check current particle positions + let particle_fitness = self.evaluate_particle_fitness()?; + let max_particle_d = particle_fitness + .iter() + .cloned() + .fold(f64::NEG_INFINITY, f64::max); + max_d = max_d.max(max_particle_d); + + let passed = max_d < GLOBAL_D_THRESHOLD; + tracing::debug!( + "Global optimality check: max_D = {:.4}, threshold = {:.4}, passed = {}", + max_d, + GLOBAL_D_THRESHOLD, + passed + ); + + Ok(passed) + } + + /// Optimize error models (standard approach) + fn optimize_error_models(&mut self) -> Result<()> { + for (outeq, em) in self.error_models.clone().iter_mut() { + if *em == AssayErrorModel::None || 
em.is_factor_fixed().unwrap_or(true) { + continue; + } + + let gamma_up = em.factor()? * (1.0 + self.gamma_delta[outeq]); + let gamma_down = em.factor()? / (1.0 + self.gamma_delta[outeq]); + + let mut em_up = self.error_models.clone(); + em_up.set_factor(outeq, gamma_up)?; + + let mut em_down = self.error_models.clone(); + em_down.set_factor(outeq, gamma_down)?; + + let psi_up = calculate_psi(&self.equation, &self.data, &self.theta, &em_up, false)?; + let psi_down = calculate_psi(&self.equation, &self.data, &self.theta, &em_down, false)?; + + let (lambda_up, objf_up) = burke(&psi_up)?; + let (lambda_down, objf_down) = burke(&psi_down)?; + + if objf_up > self.objf { + self.error_models.set_factor(outeq, gamma_up)?; + self.objf = objf_up; + self.gamma_delta[outeq] *= 4.0; + self.lambda = lambda_up; + self.psi = psi_up; + } + if objf_down > self.objf { + self.error_models.set_factor(outeq, gamma_down)?; + self.objf = objf_down; + self.gamma_delta[outeq] *= 4.0; + self.lambda = lambda_down; + self.psi = psi_down; + } + + self.gamma_delta[outeq] *= 0.5; + if self.gamma_delta[outeq] <= 0.01 { + self.gamma_delta[outeq] = 0.1; + } + } + + // Update pyl after error model changes + if self.w.len() > 0 { + let psi = self.psi.to_ndarray(); + let w_arr: Array1 = self.w.iter().collect(); + self.pyl = psi.dot(&w_arr); + } + + Ok(()) + } + + /// Validate PSI matrix + #[allow(dead_code)] + fn validate_psi(&self) -> Result<()> { + let psi = self.psi.to_ndarray(); + let (_, col) = psi.dim(); + let ecol: ArrayBase, Dim<[usize; 1]>> = Array::ones(col); + let plam = psi.dot(&ecol); + let w = 1.0 / &plam; + + let bad_indices: Vec = w + .iter() + .enumerate() + .filter(|(_, x)| x.is_nan() || x.is_infinite()) + .map(|(i, _)| i) + .collect(); + + if !bad_indices.is_empty() { + let subjects: Vec<&Subject> = self.data.subjects(); + let bad_subjects: Vec<&String> = + bad_indices.iter().map(|&i| subjects[i].id()).collect(); + bail!("Zero probability for subjects: {:?}", bad_subjects); + } + + 
Ok(()) + } +} diff --git a/src/algorithms/nonparametric/nppso/optimizers.rs b/src/algorithms/nonparametric/nppso/optimizers.rs new file mode 100644 index 000000000..51626e518 --- /dev/null +++ b/src/algorithms/nonparametric/nppso/optimizers.rs @@ -0,0 +1,200 @@ +//! Optimizers for NPPSO algorithm +//! +//! Contains subject MAP optimization using COBYLA and D-optimal refinement. + +use anyhow::Result; +use cobyla::{minimize, RhoBeg}; +use ndarray::Axis; +use pharmsol::prelude::{ + data::{AssayErrorModels, Data}, + simulator::Equation, +}; +use pharmsol::Subject; + +// ============================================================================ +// SUBJECT MAP OPTIMIZER (using COBYLA) +// ============================================================================ + +/// Find the MAP (Maximum A Posteriori) estimate for a single subject +/// This identifies the parameter values that best explain that subject's data +pub fn optimize_subject_map( + equation: &E, + subject: &Subject, + error_models: &AssayErrorModels, + ranges: &[(f64, f64)], + start: &[f64], + max_evals: usize, +) -> Result> { + // Create single-subject data + let single_data = Data::new(vec![subject.clone()]); + + // Closure that computes -log P(y|θ) for this subject + // We minimize this (= maximize P(y|θ)) + let objective = |params: &[f64], _: &mut (&E, &Data, &AssayErrorModels)| -> f64 { + // Clamp to bounds + let clamped: Vec = params + .iter() + .zip(ranges.iter()) + .map(|(v, (lo, hi))| v.clamp(*lo, *hi)) + .collect(); + + let theta = ndarray::Array1::from(clamped).insert_axis(Axis(0)); + + match pharmsol::prelude::simulator::log_likelihood_matrix( + equation, + &single_data, + &theta, + error_models, + false, + ) + .map(|m| m.mapv(f64::exp)) + { + Ok(psi) => { + let p = psi.iter().next().unwrap_or(&1e-300); + if *p > 0.0 { + -p.ln() // Minimize -log P = maximize P + } else { + 700.0 // Very bad + } + } + Err(_) => 700.0, + } + }; + + // Convert ranges to cobyla format + let bounds: Vec<(f64, 
f64)> = ranges.to_vec(); + let cons: Vec f64> = vec![]; + + // User data for the closure (not actually used in our objective) + let user_data = (equation, &single_data, error_models); + + let result = minimize( + objective, + start, + &bounds, + &cons, + user_data, + max_evals, + RhoBeg::All(0.1), + None, + ); + + match result { + Ok((_, x, _)) => { + // Clamp result to bounds with margin + let clamped: Vec = x + .iter() + .zip(ranges.iter()) + .map(|(v, (lo, hi))| { + let margin = (hi - lo) * 0.01; + v.clamp(lo + margin, hi - margin) + }) + .collect(); + Ok(clamped) + } + Err((_, x, _)) => { + // Even on "failure", use the best point found + let clamped: Vec = x + .iter() + .zip(ranges.iter()) + .map(|(v, (lo, hi))| { + let margin = (hi - lo) * 0.01; + v.clamp(lo + margin, hi - margin) + }) + .collect(); + Ok(clamped) + } + } +} + +// ============================================================================ +// D-OPTIMAL REFINEMENT (using COBYLA) +// ============================================================================ + +/// Refine a support point position to maximize D-criterion +pub fn refine_d_optimal( + equation: &E, + data: &Data, + error_models: &AssayErrorModels, + pyl: &ndarray::Array1, + ranges: &[(f64, f64)], + start: &[f64], + max_evals: usize, +) -> Result> { + // Closure that computes -D(θ) (we minimize, so negate) + let objective = |params: &[f64], _: &mut ()| -> f64 { + // Clamp to bounds + let clamped: Vec = params + .iter() + .zip(ranges.iter()) + .map(|(v, (lo, hi))| v.clamp(*lo, *hi)) + .collect(); + + let theta = ndarray::Array1::from(clamped).insert_axis(Axis(0)); + + match pharmsol::prelude::simulator::log_likelihood_matrix( + equation, + data, + &theta, + error_models, + false, + ) + .map(|m| m.mapv(f64::exp)) + { + Ok(psi) => { + let nsub = psi.nrows() as f64; + let mut d_sum = -nsub; + + for (p_i, pyl_i) in psi.iter().zip(pyl.iter()) { + if *pyl_i > 1e-300 { + d_sum += p_i / pyl_i; + } + } + + -d_sum // Minimize -D = Maximize D 
+ } + Err(_) => 1e10, // Very bad + } + }; + + let bounds: Vec<(f64, f64)> = ranges.to_vec(); + let cons: Vec f64> = vec![]; + + let result = minimize( + objective, + start, + &bounds, + &cons, + (), + max_evals, + RhoBeg::All(0.05), + None, + ); + + match result { + Ok((_, x, _)) | Err((_, x, _)) => { + let clamped: Vec = x + .iter() + .zip(ranges.iter()) + .map(|(v, (lo, hi))| { + let margin = (hi - lo) * 0.01; + v.clamp(lo + margin, hi - margin) + }) + .collect(); + Ok(clamped) + } + } +} + +// ============================================================================ +// ELITE POINT +// ============================================================================ + +/// An elite point preserved across cycles +#[derive(Debug, Clone)] +pub struct ElitePoint { + pub params: Vec, + #[allow(dead_code)] + pub d_value: f64, + pub cycle_added: usize, +} diff --git a/src/algorithms/nonparametric/nppso/swarm.rs b/src/algorithms/nonparametric/nppso/swarm.rs new file mode 100644 index 000000000..6cae09fd3 --- /dev/null +++ b/src/algorithms/nonparametric/nppso/swarm.rs @@ -0,0 +1,217 @@ +//! 
Particle Swarm implementation for D-criterion guided optimization + +use super::constants::*; +use rand::prelude::*; + +/// A particle in the swarm +#[derive(Debug, Clone)] +pub struct Particle { + pub position: Vec, + pub velocity: Vec, + pub pbest_position: Vec, + pub pbest_fitness: f64, +} + +impl Particle { + pub fn new(ranges: &[(f64, f64)], rng: &mut R) -> Self { + let position: Vec = ranges + .iter() + .map(|(lo, hi)| { + let margin = (hi - lo) * BOUNDARY_MARGIN; + rng.random_range((lo + margin)..(hi - margin)) + }) + .collect(); + + let velocity: Vec = ranges + .iter() + .map(|(lo, hi)| { + let max_v = (hi - lo) * MAX_VELOCITY_FRACTION; + rng.random_range(-max_v..max_v) * 0.1 // Start with small velocities + }) + .collect(); + + Self { + position: position.clone(), + velocity, + pbest_position: position, + pbest_fitness: f64::NEG_INFINITY, + } + } + + pub fn update_velocity( + &mut self, + gbest: &[f64], + inertia: f64, + cognitive: f64, + social: f64, + ranges: &[(f64, f64)], + rng: &mut R, + ) { + for i in 0..self.position.len() { + let r1: f64 = rng.random(); + let r2: f64 = rng.random(); + + self.velocity[i] = inertia * self.velocity[i] + + cognitive * r1 * (self.pbest_position[i] - self.position[i]) + + social * r2 * (gbest[i] - self.position[i]); + + // Clamp velocity + let (lo, hi) = ranges[i]; + let max_v = (hi - lo) * MAX_VELOCITY_FRACTION; + self.velocity[i] = self.velocity[i].clamp(-max_v, max_v); + } + } + + pub fn update_position(&mut self, ranges: &[(f64, f64)]) { + for i in 0..self.position.len() { + self.position[i] += self.velocity[i]; + + // Reflect off boundaries + let (lo, hi) = ranges[i]; + let margin = (hi - lo) * BOUNDARY_MARGIN; + let lo_bound = lo + margin; + let hi_bound = hi - margin; + + if self.position[i] < lo_bound { + self.position[i] = + lo_bound + (lo_bound - self.position[i]).min(hi_bound - lo_bound); + self.velocity[i] *= -0.5; // Bounce with damping + } else if self.position[i] > hi_bound { + self.position[i] = + 
hi_bound - (self.position[i] - hi_bound).min(hi_bound - lo_bound); + self.velocity[i] *= -0.5; + } + } + } + + /// Update personal best if current fitness is better + pub fn update_pbest(&mut self, fitness: f64) { + if fitness > self.pbest_fitness { + self.pbest_fitness = fitness; + self.pbest_position = self.position.clone(); + } + } +} + +/// The particle swarm +#[derive(Debug)] +pub struct Swarm { + particles: Vec, + gbest_position: Vec, + gbest_fitness: f64, +} + +impl Swarm { + pub fn new(n_dims: usize, ranges: &[(f64, f64)], seed: u64) -> Self { + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + + // Initialize particles + let particles: Vec = (0..SWARM_SIZE) + .map(|_| Particle::new(ranges, &mut rng)) + .collect(); + + // Initialize gbest to center + let gbest_position: Vec = ranges.iter().map(|(lo, hi)| (lo + hi) / 2.0).collect(); + + // Use n_dims to validate + assert_eq!(n_dims, ranges.len(), "n_dims must match ranges length"); + + Self { + particles, + gbest_position, + gbest_fitness: f64::NEG_INFINITY, + } + } + + /// Get reference to particles + pub fn particles(&self) -> &[Particle] { + &self.particles + } + + /// Get global best fitness + pub fn gbest_fitness(&self) -> f64 { + self.gbest_fitness + } + + /// Get all particle positions + pub fn get_positions(&self) -> Vec> { + self.particles.iter().map(|p| p.position.clone()).collect() + } + + /// Update global best + pub fn update_global_best(&mut self, position: &[f64], fitness: f64) { + if fitness > self.gbest_fitness { + self.gbest_fitness = fitness; + self.gbest_position = position.to_vec(); + } + } + + /// Update personal bests for all particles given their fitness values + pub fn update_personal_bests(&mut self, fitness_values: &[f64]) { + for (particle, &fitness) in self.particles.iter_mut().zip(fitness_values.iter()) { + particle.update_pbest(fitness); + } + + // Also update global best + if let Some((best_idx, &best_fitness)) = fitness_values + .iter() + .enumerate() + .max_by(|(_, 
a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)) + { + if best_fitness > self.gbest_fitness { + self.gbest_fitness = best_fitness; + self.gbest_position = self.particles[best_idx].position.clone(); + } + } + } + + /// Update all particles + pub fn update_all( + &mut self, + inertia: f64, + cognitive: f64, + social: f64, + ranges: &[(f64, f64)], + rng: &mut R, + ) { + let gbest = self.gbest_position.clone(); + + for particle in &mut self.particles { + particle.update_velocity(&gbest, inertia, cognitive, social, ranges, rng); + particle.update_position(ranges); + } + } + + /// Reinject random particles to maintain diversity + pub fn reinject_random( + &mut self, + ranges: &[(f64, f64)], + rng: &mut R, + n_reinject: usize, + ) { + // Sort by fitness, reset worst performers + let mut indices: Vec = (0..self.particles.len()).collect(); + indices.sort_by(|&a, &b| { + self.particles[a] + .pbest_fitness + .partial_cmp(&self.particles[b].pbest_fitness) + .unwrap_or(std::cmp::Ordering::Equal) + }); + + for &i in indices.iter().take(n_reinject) { + self.particles[i] = Particle::new(ranges, rng); + } + } + + /// Convenience wrapper that reinjects a fraction of the swarm. + #[allow(dead_code)] + pub fn reinject_diversity( + &mut self, + ranges: &[(f64, f64)], + rng: &mut R, + fraction: f64, + ) { + let n_reset = (self.particles.len() as f64 * fraction) as usize; + self.reinject_random(ranges, rng, n_reset); + } +} diff --git a/src/algorithms/nonparametric/npsah.rs b/src/algorithms/nonparametric/npsah.rs new file mode 100644 index 000000000..def17ebf9 --- /dev/null +++ b/src/algorithms/nonparametric/npsah.rs @@ -0,0 +1,932 @@ +//! # NPSA-H: Non-Parametric Simulated Annealing Hybrid Algorithm +//! +//! This module implements the NPSA-H algorithm, which combines: +//! - **NPAG's systematic grid exploration** for broad parameter space coverage +//! - **NPOD's D-optimal refinement** for information-driven point placement +//! 
- **Simulated Annealing** for global mode discovery and escaping local optima +//! +//! ## Algorithm Phases +//! +//! ### Phase 1: Warm-up (Cycles 1-N) +//! Uses NPAG-style grid expansion to ensure broad parameter space sampling. +//! +//! ### Phase 2: Hybrid Expansion (Subsequent cycles) +//! - **Sparse Grid Expansion**: Adaptive NPAG-style expansion in low-density regions +//! - **D-Optimal Refinement**: Full optimization for high-weight support points +//! - **SA Injection**: Random point injection with Metropolis acceptance +//! +//! ## Convergence +//! Multi-criterion convergence checking: +//! 1. Objective function stability over consecutive cycles +//! 2. Global optimality check via Monte Carlo sampling +//! 3. Support point location stability + +use crate::algorithms::{ + NativeNonparametricConfig, NonparametricAlgorithmInput, Status, StopReason, +}; +use crate::estimation::nonparametric::adaptative_grid; +use crate::estimation::nonparametric::ipm::burke; +use crate::estimation::nonparametric::qr; +use crate::estimation::nonparametric::sample_space_for_parameters; +use crate::estimation::nonparametric::{ + calculate_psi, CycleLog, NPCycle, NonparametricWorkspace, Psi, Theta, Weights, +}; +use crate::prelude::algorithms::Algorithms; + +use anyhow::{bail, Result}; +use ndarray::parallel::prelude::{IntoParallelRefMutIterator, ParallelIterator}; +use ndarray::Array1; +use pharmsol::prelude::AssayErrorModel; +use pharmsol::prelude::{ + data::{AssayErrorModels, Data}, + simulator::Equation, +}; +use rand::prelude::*; + +// ============================================================================ +// ALGORITHM CONSTANTS +// ============================================================================ + +/// Grid spacing convergence threshold +const THETA_E: f64 = 1e-4; +/// Objective function convergence threshold +const THETA_G: f64 = 1e-4; +/// P(Y|L) convergence criterion +const THETA_F: f64 = 1e-2; +/// Minimum distance between support points +const THETA_D: 
f64 = 1e-4; + +/// Number of warm-up cycles using NPAG-style expansion +const WARMUP_CYCLES: usize = 5; +/// Initial temperature for simulated annealing +const INITIAL_TEMPERATURE: f64 = 1.0; +/// Temperature cooling rate per cycle +const COOLING_RATE: f64 = 0.95; +/// Number of SA points to inject per cycle +const SA_INJECT_COUNT: usize = 10; +/// Threshold for considering a support point "high importance" +const HIGH_IMPORTANCE_THRESHOLD: f64 = 0.1; +/// Maximum Nelder-Mead iterations for high-importance points +const HIGH_IMPORTANCE_MAX_ITERS: u64 = 100; +/// Maximum Nelder-Mead iterations for low-importance points +const LOW_IMPORTANCE_MAX_ITERS: u64 = 10; +/// Number of consecutive stable cycles required for convergence +const CONVERGENCE_WINDOW: usize = 3; +/// Number of Monte Carlo samples for global optimality check +const GLOBAL_OPTIMALITY_SAMPLES: usize = 500; +/// Threshold for D-criterion in global optimality check +const GLOBAL_OPTIMALITY_THRESHOLD: f64 = 0.01; +/// Minimum temperature before SA injection stops +const MIN_TEMPERATURE: f64 = 0.01; + +// ============================================================================ +// NPSA-H STRUCT +// ============================================================================ + +/// NPSA-H: Non-Parametric Simulated Annealing Hybrid Algorithm +/// +/// Combines NPAG grid exploration, NPOD D-optimal refinement, and simulated +/// annealing for robust non-parametric population PK/PD modeling. 
+#[derive(Debug)] +pub struct NPSAH { + /// The pharmacometric equation/model + equation: E, + /// Parameter ranges for each dimension + ranges: Vec<(f64, f64)>, + /// Probability matrix: P(y_i | θ_j) + psi: Psi, + /// Support points (parameter values) + theta: Theta, + /// Weights from IPM before condensation + lambda: Weights, + /// Final weights after condensation + w: Weights, + /// Current grid spacing (NPAG-style) + eps: f64, + /// Previous objective function value + last_objf: f64, + /// Current objective function value + objf: f64, + /// P(Y|L) values for convergence checking + f0: f64, + f1: f64, + /// Current cycle number + cycle: usize, + /// Step sizes for error model optimization + gamma_delta: Vec, + /// Error models for observations + error_models: AssayErrorModels, + /// Algorithm status + status: Status, + /// Cycle log for tracking progress + cycle_log: CycleLog, + /// Subject data + data: Data, + /// Unified runtime/model-derived configuration + config: NativeNonparametricConfig, + + // NPSA-H specific fields + /// Current simulated annealing temperature + temperature: f64, + /// History of objective function values for convergence checking + objf_history: Vec, + /// Random number generator for SA + rng: StdRng, + /// Flag indicating if we're in warm-up phase + in_warmup: bool, + /// Maximum D-criterion value found in global search + max_global_d: f64, +} + +// ============================================================================ +// ALGORITHMS TRAIT IMPLEMENTATION +// ============================================================================ + +impl Algorithms for NPSAH { + fn equation(&self) -> &E { + &self.equation + } + + fn into_workspace(&self) -> Result> { + NonparametricWorkspace::new( + self.equation.clone(), + self.data.clone(), + self.theta.clone(), + self.psi.clone(), + self.w.clone(), + -2. 
* self.objf, + self.cycle, + self.status.clone(), + self.config.run_configuration.clone(), + self.cycle_log.clone(), + ) + } + + fn error_models(&self) -> &AssayErrorModels { + &self.error_models + } + + fn data(&self) -> &Data { + &self.data + } + + fn get_prior(&self) -> Theta { + sample_space_for_parameters(&self.config.parameter_space, &self.config.prior).unwrap() + } + + fn likelihood(&self) -> f64 { + self.objf + } + + fn increment_cycle(&mut self) -> usize { + self.cycle += 1; + + // Check if we're exiting warm-up phase + if self.cycle > WARMUP_CYCLES && self.in_warmup { + self.in_warmup = false; + tracing::info!("NPSA-H: Exiting warm-up phase, entering hybrid expansion mode"); + } + + // Cool the temperature + self.temperature *= COOLING_RATE; + if self.temperature < MIN_TEMPERATURE { + self.temperature = MIN_TEMPERATURE; + } + + self.cycle + } + + fn cycle(&self) -> usize { + self.cycle + } + + fn set_theta(&mut self, theta: Theta) { + self.theta = theta; + } + + fn theta(&self) -> &Theta { + &self.theta + } + + fn psi(&self) -> &Psi { + &self.psi + } + + fn set_status(&mut self, status: Status) { + self.status = status; + } + + fn status(&self) -> &Status { + &self.status + } + + fn evaluation(&mut self) -> Result { + tracing::info!("Objective function = {:.4}", -2.0 * self.objf); + tracing::debug!("Support points: {}", self.theta.nspp()); + tracing::debug!( + "Phase: {} | Temperature: {:.4}", + if self.in_warmup { "Warm-up" } else { "Hybrid" }, + self.temperature + ); + + self.error_models.iter().for_each(|(outeq, em)| { + if AssayErrorModel::None == *em { + return; + } + tracing::debug!( + "Error model for outeq {}: {:.4}", + outeq, + em.factor().unwrap_or_default() + ); + }); + + tracing::debug!("EPS = {:.4}", self.eps); + + // Track objective function history + self.objf_history.push(self.objf); + + // Warn if objective function decreased (instability) + if self.last_objf > self.objf + 1e-4 { + tracing::warn!( + "Objective function decreased from 
{:.4} to {:.4} (delta = {:.6})", + -2.0 * self.last_objf, + -2.0 * self.objf, + -2.0 * self.last_objf - -2.0 * self.objf + ); + } + + // Multi-criterion convergence check + let converged = self.check_convergence()?; + + if converged { + tracing::info!( + "NPSA-H converged after {} cycles (multi-criterion)", + self.cycle + ); + self.set_status(Status::Stop(StopReason::Converged)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // Standard convergence check (NPAG-style) + let psi = self.psi.matrix(); + let w = &self.w; + if (self.last_objf - self.objf).abs() <= THETA_G && self.eps > THETA_E { + self.eps /= 2.; + tracing::debug!("Halving eps to {:.6}", self.eps); + + if self.eps <= THETA_E { + let pyl = psi * w.weights(); + self.f1 = pyl.iter().map(|x| x.ln()).sum(); + if (self.f1 - self.f0).abs() <= THETA_F { + tracing::info!( + "NPSA-H converged after {} cycles (P(Y|L) criterion)", + self.cycle + ); + self.set_status(Status::Stop(StopReason::Converged)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } else { + self.f0 = self.f1; + self.eps = 0.2; + } + } + } + + // Check maximum cycles + if self.cycle >= self.config.max_cycles { + tracing::warn!("Maximum number of cycles reached"); + self.set_status(Status::Stop(StopReason::MaxCycles)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // Check for stop file + if std::path::Path::new("stop").exists() { + tracing::warn!("Stopfile detected - breaking"); + self.set_status(Status::Stop(StopReason::Stopped)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // Continue with normal operation + self.set_status(Status::Continue); + self.log_cycle_state(); + Ok(self.status().clone()) + } + + fn estimation(&mut self) -> Result<()> { + self.psi = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &self.error_models, + self.cycle == 1 && self.config.progress, + )?; + + if let Err(err) = self.validate_psi() { + bail!(err); + } + + 
(self.lambda, _) = match burke(&self.psi) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM during estimation: {:?}", err); + } + }; + Ok(()) + } + + fn condensation(&mut self) -> Result<()> { + // Lambda-filter: Remove points with very low weight + let max_lambda = self + .lambda + .iter() + .fold(f64::NEG_INFINITY, |acc, x| x.max(acc)); + + let mut keep = Vec::::new(); + // Use more aggressive filtering (1/10000 instead of 1/1000) + let filter_threshold = max_lambda / 10000_f64; + for (index, lam) in self.lambda.iter().enumerate() { + if lam > filter_threshold { + keep.push(index); + } + } + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!( + "Lambda filter dropped {} support point(s)", + self.psi.matrix().ncols() - keep.len(), + ); + } + + self.theta.filter_indices(keep.as_slice()); + self.psi.filter_column_indices(keep.as_slice()); + + // Rank-Revealing QR Factorization + let (r, perm) = qr::qrd(&self.psi)?; + + let mut keep = Vec::::new(); + let keep_n = self.psi.matrix().ncols().min(self.psi.matrix().nrows()); + + for i in 0..keep_n { + let test = r.col(i).norm_l2(); + let r_diag_val = r.get(i, i); + let ratio = r_diag_val / test; + if ratio.abs() >= 1e-8 { + keep.push(*perm.get(i).unwrap()); + } + } + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!( + "QR decomposition dropped {} support point(s)", + self.psi.matrix().ncols() - keep.len(), + ); + } + + self.theta.filter_indices(keep.as_slice()); + self.psi.filter_column_indices(keep.as_slice()); + + self.validate_psi()?; + + (self.lambda, self.objf) = match burke(&self.psi) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + return Err(anyhow::anyhow!( + "Error in IPM during condensation: {:?}", + err + )); + } + }; + self.w = self.lambda.clone(); + Ok(()) + } + + fn optimizations(&mut self) -> Result<()> { + // Same error model optimization as NPAG/NPOD + self.error_models + .clone() + .iter_mut() + .filter_map(|(outeq, em)| { + if 
em.optimize() { + Some((outeq, em)) + } else { + None + } + }) + .try_for_each(|(outeq, em)| -> Result<()> { + let gamma_up = em.factor()? * (1.0 + self.gamma_delta[outeq]); + let gamma_down = em.factor()? / (1.0 + self.gamma_delta[outeq]); + + let mut error_model_up = self.error_models.clone(); + error_model_up.set_factor(outeq, gamma_up)?; + + let mut error_model_down = self.error_models.clone(); + error_model_down.set_factor(outeq, gamma_down)?; + + let psi_up = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &error_model_up, + false, + )?; + let psi_down = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &error_model_down, + false, + )?; + + let (lambda_up, objf_up) = match burke(&psi_up) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM during optim: {:?}", err); + } + }; + let (lambda_down, objf_down) = match burke(&psi_down) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM during optim: {:?}", err); + } + }; + + if objf_up > self.objf { + self.error_models.set_factor(outeq, gamma_up)?; + self.objf = objf_up; + self.gamma_delta[outeq] *= 4.; + self.lambda = lambda_up; + self.psi = psi_up; + } + if objf_down > self.objf { + self.error_models.set_factor(outeq, gamma_down)?; + self.objf = objf_down; + self.gamma_delta[outeq] *= 4.; + self.lambda = lambda_down; + self.psi = psi_down; + } + self.gamma_delta[outeq] *= 0.5; + if self.gamma_delta[outeq] <= 0.01 { + self.gamma_delta[outeq] = 0.1; + } + Ok(()) + })?; + + Ok(()) + } + + fn expansion(&mut self) -> Result<()> { + if self.in_warmup { + // Phase 1: NPAG-style grid expansion for broad coverage + self.npag_expansion()?; + } else { + // Phase 2: Hybrid expansion + self.hybrid_expansion()?; + } + Ok(()) + } + + fn log_cycle_state(&mut self) { + let state = NPCycle::new( + self.cycle, + -2. 
* self.objf, + self.error_models.clone(), + self.theta.clone(), + self.theta.nspp(), + (self.last_objf - self.objf).abs(), + self.status.clone(), + ); + self.cycle_log.push(state); + self.last_objf = self.objf; + } +} + +impl NPSAH { + pub(crate) fn from_input(input: NonparametricAlgorithmInput) -> Result> { + let config = input.native_config()?; + let seed = config.prior.seed().unwrap_or(42); + let ranges = config.ranges.clone(); + let error_models = input.error_models().clone(); + let equation = input.equation; + let data = input.data; + + Ok(Box::new(Self { + equation, + ranges, + psi: Psi::new(), + theta: Theta::new(), + lambda: Weights::default(), + w: Weights::default(), + eps: 0.2, + last_objf: -1e30, + objf: f64::NEG_INFINITY, + f0: -1e30, + f1: f64::default(), + cycle: 0, + gamma_delta: vec![0.1; error_models.len()], + error_models, + status: Status::Continue, + cycle_log: CycleLog::new(), + data, + config, + temperature: INITIAL_TEMPERATURE, + objf_history: Vec::new(), + rng: StdRng::seed_from_u64(seed as u64), + in_warmup: true, + max_global_d: f64::INFINITY, + })) + } +} + +// ============================================================================ +// NPSA-H SPECIFIC METHODS +// ============================================================================ + +impl NPSAH { + /// NPAG-style adaptive grid expansion + fn npag_expansion(&mut self) -> Result<()> { + tracing::debug!("Performing NPAG-style grid expansion (warm-up phase)"); + adaptative_grid(&mut self.theta, self.eps, &self.ranges, THETA_D)?; + Ok(()) + } + + /// Hybrid expansion combining grid, D-optimal, and SA + fn hybrid_expansion(&mut self) -> Result<()> { + let initial_points = self.theta.nspp(); + + // 2a. D-optimal refinement for EXISTING points (must happen before grid expansion) + // because we need the weights which correspond to current theta + self.d_optimal_refinement()?; + let after_dopt = self.theta.nspp(); + + // 2b. 
Sparse grid expansion in low-density regions + self.sparse_grid_expansion()?; + let after_grid = self.theta.nspp(); + + // 2c. Simulated annealing injection + if self.temperature > MIN_TEMPERATURE { + self.sa_injection()?; + } + let after_sa = self.theta.nspp(); + + tracing::debug!( + "Hybrid expansion: {} → {} (D-opt) → {} (grid) → {} (SA)", + initial_points, + after_dopt, + after_grid, + after_sa + ); + + Ok(()) + } + + /// Sparse grid expansion: only expand in low-density regions with high D-criterion + fn sparse_grid_expansion(&mut self) -> Result<()> { + // Use a reduced epsilon for sparse expansion + let sparse_eps = self.eps * 0.5; + adaptative_grid(&mut self.theta, sparse_eps, &self.ranges, THETA_D * 2.0)?; + Ok(()) + } + + /// D-optimal refinement with adaptive iteration count based on importance + fn d_optimal_refinement(&mut self) -> Result<()> { + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + let error_model: AssayErrorModels = self.error_models.clone(); + let max_weight = self.w.iter().fold(f64::NEG_INFINITY, |acc, x| x.max(acc)); + + // Ensure we only iterate over points that have corresponding weights + let n_points_with_weights = self.w.len().min(self.theta.nspp()); + + let mut candidate_points: Vec<(Array1, f64)> = Vec::default(); + + // Collect points with their importance (weight ratio) + // Only process points that have corresponding weights + for (idx, spp) in self + .theta + .matrix() + .row_iter() + .enumerate() + .take(n_points_with_weights) + { + let candidate: Vec = spp.iter().cloned().collect(); + let importance = self.w[idx] / max_weight; + candidate_points.push((Array1::from(candidate), importance)); + } + + // Optimize points in parallel with adaptive iterations + candidate_points + .par_iter_mut() + .for_each(|(spp, importance)| { + let max_iters = if *importance > HIGH_IMPORTANCE_THRESHOLD { + HIGH_IMPORTANCE_MAX_ITERS + } else { + LOW_IMPORTANCE_MAX_ITERS + }; + + 
let optimizer = SppOptimizerAdaptive::new( + &self.equation, + &self.data, + &error_model, + &pyl, + max_iters, + ); + + if let Ok(candidate_point) = optimizer.optimize_point(spp.clone()) { + *spp = candidate_point; + } + }); + + // Add optimized points to theta + for (cp, _) in candidate_points { + self.theta.suggest_point(cp.to_vec().as_slice(), THETA_D)?; + } + + Ok(()) + } + + /// Simulated annealing point injection for global exploration + fn sa_injection(&mut self) -> Result<()> { + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + let n_inject = (SA_INJECT_COUNT as f64 * self.temperature).ceil() as usize; + let mut accepted_points = 0; + let mut max_d_found = f64::NEG_INFINITY; + + for _ in 0..n_inject * 10 { + // Generate random point in parameter space + let point: Vec = self + .ranges + .iter() + .map(|(lo, hi)| self.rng.random_range(*lo..*hi)) + .collect(); + + // Compute D-criterion for this point + let d_value = self.compute_d_criterion(&point, &pyl)?; + max_d_found = max_d_found.max(d_value); + + // Metropolis acceptance criterion + let accept = if d_value > 0.0 { + true + } else { + let p_accept = (d_value / self.temperature).exp(); + self.rng.random::() < p_accept + }; + + if accept { + if self.theta.check_point(&point, THETA_D) { + self.theta.add_point(&point)?; + accepted_points += 1; + } + } + + if accepted_points >= n_inject { + break; + } + } + + self.max_global_d = max_d_found; + + tracing::debug!( + "SA injection: {} points accepted, max D = {:.6}, T = {:.4}", + accepted_points, + max_d_found, + self.temperature + ); + + Ok(()) + } + + /// Compute D-criterion for a candidate point + fn compute_d_criterion(&self, point: &[f64], pyl: &Array1) -> Result { + let theta_single = ndarray::Array1::from(point.to_vec()).insert_axis(ndarray::Axis(0)); + + let psi_single = pharmsol::prelude::simulator::log_likelihood_matrix( + &self.equation, + &self.data, + &theta_single, + 
&self.error_models, + false, + )? + .mapv(f64::exp); + + let nsub = psi_single.nrows() as f64; + let mut d_sum = -nsub; + + for (p_i, pyl_i) in psi_single.iter().zip(pyl.iter()) { + d_sum += p_i / pyl_i; + } + + Ok(d_sum) + } + + /// Multi-criterion convergence check + fn check_convergence(&mut self) -> Result { + // Need at least CONVERGENCE_WINDOW cycles to check + if self.objf_history.len() < CONVERGENCE_WINDOW { + return Ok(false); + } + + // Criterion 1: Objective function stability + let recent_objfs: Vec = self + .objf_history + .iter() + .rev() + .take(CONVERGENCE_WINDOW) + .cloned() + .collect(); + + let objf_stable = recent_objfs + .windows(2) + .all(|w| (w[0] - w[1]).abs() < THETA_G); + + if !objf_stable { + return Ok(false); + } + + // Criterion 2: Global optimality check (only if not in warmup) + if !self.in_warmup && self.temperature > MIN_TEMPERATURE { + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + let max_d = self.monte_carlo_global_check(&pyl)?; + + if max_d > GLOBAL_OPTIMALITY_THRESHOLD { + tracing::debug!( + "Global optimality check failed: max_D = {:.6} > {:.6}", + max_d, + GLOBAL_OPTIMALITY_THRESHOLD + ); + return Ok(false); + } + + tracing::debug!("Global optimality check passed: max_D = {:.6}", max_d); + } + + Ok(true) + } + + /// Monte Carlo estimate of maximum D-criterion over parameter space + fn monte_carlo_global_check(&mut self, pyl: &Array1) -> Result { + let mut max_d = f64::NEG_INFINITY; + + for _ in 0..GLOBAL_OPTIMALITY_SAMPLES { + let point: Vec = self + .ranges + .iter() + .map(|(lo, hi)| self.rng.random_range(*lo..*hi)) + .collect(); + + let d_value = self.compute_d_criterion(&point, pyl)?; + max_d = max_d.max(d_value); + } + + Ok(max_d) + } +} + +// ============================================================================ +// ADAPTIVE SPP OPTIMIZER +// ============================================================================ + +use argmin::{ + 
core::{CostFunction, Error, Executor}, + solver::neldermead::NelderMead, +}; +use ndarray::Axis; + +/// Support Point Optimizer with configurable iteration count +struct SppOptimizerAdaptive<'a, E: Equation> { + equation: &'a E, + data: &'a Data, + sig: &'a AssayErrorModels, + pyl: &'a Array1, + max_iters: u64, +} + +impl CostFunction for SppOptimizerAdaptive<'_, E> { + type Param = Vec; + type Output = f64; + + fn cost(&self, spp: &Self::Param) -> Result { + let theta = Array1::from(spp.clone()).insert_axis(Axis(0)); + + let psi = pharmsol::prelude::simulator::log_likelihood_matrix( + self.equation, + self.data, + &theta, + self.sig, + false, + )? + .mapv(f64::exp); + + let nsub = psi.nrows() as f64; + let mut sum = -nsub; + for (p_i, pyl_i) in psi.iter().zip(self.pyl.iter()) { + sum += p_i / pyl_i; + } + Ok(-sum) // Minimize negative D → Maximize D + } +} + +impl<'a, E: Equation> SppOptimizerAdaptive<'a, E> { + fn new( + equation: &'a E, + data: &'a Data, + sig: &'a AssayErrorModels, + pyl: &'a Array1, + max_iters: u64, + ) -> Self { + Self { + equation, + data, + sig, + pyl, + max_iters, + } + } + + fn optimize_point(self, spp: Array1) -> Result, Error> { + let simplex = create_initial_simplex(&spp.to_vec()); + let tolerance = if self.max_iters > 50 { 1e-4 } else { 1e-2 }; + let max_iters = self.max_iters; + + let solver: NelderMead, f64> = + NelderMead::new(simplex).with_sd_tolerance(tolerance)?; + + let res = Executor::new(self, solver) + .configure(|state| state.max_iters(max_iters)) + .run()?; + + Ok(Array1::from(res.state.best_param.unwrap())) + } +} + +/// Create initial simplex for Nelder-Mead optimization +fn create_initial_simplex(initial_point: &[f64]) -> Vec> { + let num_dimensions = initial_point.len(); + let perturbation_percentage = 0.05; // 5% perturbation + + let mut vertices = Vec::new(); + vertices.push(initial_point.to_vec()); + + for i in 0..num_dimensions { + let perturbation = if initial_point[i] == 0.0 { + 0.001 + } else { + 
perturbation_percentage * initial_point[i].abs() + }; + + let mut perturbed_point = initial_point.to_vec(); + perturbed_point[i] += perturbation; + vertices.push(perturbed_point); + } + + vertices +} + +// ============================================================================ +// TESTS +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_initial_simplex_creation() { + let point = vec![1.0, 2.0, 3.0]; + let simplex = create_initial_simplex(&point); + + assert_eq!(simplex.len(), 4); // n+1 vertices for n dimensions + assert_eq!(simplex[0], point); // First vertex is the initial point + } + + #[test] + fn test_initial_simplex_with_zero() { + let point = vec![0.0, 1.0]; + let simplex = create_initial_simplex(&point); + + assert_eq!(simplex.len(), 3); + // Zero should get special handling + assert!(simplex[1][0] > 0.0); + } + + #[test] + fn test_convergence_window() { + assert!(CONVERGENCE_WINDOW >= 2); + } + + #[test] + fn test_temperature_bounds() { + assert!(INITIAL_TEMPERATURE > MIN_TEMPERATURE); + assert!(COOLING_RATE > 0.0 && COOLING_RATE < 1.0); + } +} diff --git a/src/algorithms/nonparametric/npsah2.rs b/src/algorithms/nonparametric/npsah2.rs new file mode 100644 index 000000000..0737671be --- /dev/null +++ b/src/algorithms/nonparametric/npsah2.rs @@ -0,0 +1,1347 @@ +//! # NPSA-H2: Non-Parametric Simulated Annealing Hybrid Algorithm v2 +//! +//! An improved version of NPSAH with the following enhancements: +//! +//! ## Perspective: The Scientist +//! - Better exploration of multimodal distributions +//! - Adaptive strategies based on problem characteristics +//! - More robust handling of edge cases +//! +//! ## Perspective: The Statistician +//! - Improved convergence criteria using multiple metrics +//! - Better handling of the bias-variance tradeoff +//! - Statistically sound weight estimation +//! +//! ## Perspective: The Engineer +//! 
- Parallelized operations where possible +//! - Memory-efficient data structures +//! - Early termination for provably suboptimal paths +//! +//! ## Key Improvements over NPSAH v1 +//! 1. **Adaptive Temperature Schedule**: Temperature adapts based on acceptance ratio +//! 2. **Elite Preservation**: Best points are preserved across cycles +//! 3. **Cluster-Aware Expansion**: Identifies and expands around clusters +//! 4. **Gradient-Informed SA**: Uses local gradient information to guide moves +//! 5. **Restart Mechanism**: Can restart from cold when stuck +//! 6. **Parallel D-criterion Evaluation**: Batch evaluation of candidate points + +use crate::algorithms::{ + NativeNonparametricConfig, NonparametricAlgorithmInput, Status, StopReason, +}; +use crate::estimation::nonparametric::adaptative_grid; +use crate::estimation::nonparametric::ipm::burke; +use crate::estimation::nonparametric::qr; +use crate::estimation::nonparametric::sample_space_for_parameters; +use crate::estimation::nonparametric::{ + calculate_psi, CycleLog, NPCycle, NonparametricWorkspace, Psi, Theta, Weights, +}; +use crate::prelude::algorithms::Algorithms; + +use anyhow::{bail, Result}; +use ndarray::parallel::prelude::{ + IntoParallelIterator, IntoParallelRefMutIterator, ParallelIterator, +}; +use ndarray::{Array1, Axis}; +use pharmsol::prelude::AssayErrorModel; +use pharmsol::prelude::{ + data::{AssayErrorModels, Data}, + simulator::Equation, +}; +use rand::prelude::*; + +// ============================================================================ +// ALGORITHM CONSTANTS - TUNED FOR BETTER PERFORMANCE +// ============================================================================ + +/// Grid spacing convergence threshold +const THETA_E: f64 = 1e-4; +/// Objective function convergence threshold +const THETA_G: f64 = 1e-4; +/// P(Y|L) convergence criterion +const THETA_F: f64 = 1e-2; +/// Minimum distance between support points +const THETA_D: f64 = 1e-4; + +// --- Phase Control --- +/// 
Number of warm-up cycles using pure exploration +const WARMUP_CYCLES: usize = 3; +/// Number of cycles for intensive exploitation phase +const EXPLOITATION_CYCLES: usize = 3; + +// --- Temperature Schedule (Adaptive) --- +/// Initial temperature for simulated annealing +const INITIAL_TEMPERATURE: f64 = 1.5; +/// Base cooling rate (will be adapted) +const BASE_COOLING_RATE: f64 = 0.88; +/// Minimum temperature before SA stops +const MIN_TEMPERATURE: f64 = 0.01; +/// Target acceptance ratio for adaptive temperature +const TARGET_ACCEPTANCE_RATIO: f64 = 0.25; +/// Temperature increase factor when too cold +const REHEAT_FACTOR: f64 = 1.3; + +// --- Exploration Parameters --- +/// Number of SA points to inject per cycle (base) +const SA_INJECT_BASE: usize = 10; +/// Number of elite points to preserve +const ELITE_COUNT: usize = 3; +/// Number of points for Latin Hypercube Sampling +const LHS_SAMPLES: usize = 30; + +// --- D-Optimal Refinement --- +/// Threshold for considering a support point "high importance" +const HIGH_IMPORTANCE_THRESHOLD: f64 = 0.05; +/// Maximum Nelder-Mead iterations for high-importance points +const HIGH_IMPORTANCE_MAX_ITERS: u64 = 80; + +// --- Safety Margins --- +/// Relative margin from boundaries to prevent numerical issues (1% of range) +const BOUNDARY_MARGIN_RATIO: f64 = 0.01; +/// Maximum Nelder-Mead iterations for medium-importance points +const MEDIUM_IMPORTANCE_MAX_ITERS: u64 = 30; +/// Maximum Nelder-Mead iterations for low-importance points +const LOW_IMPORTANCE_MAX_ITERS: u64 = 10; + +// --- Convergence Criteria --- +/// Number of consecutive stable cycles required for convergence +const CONVERGENCE_WINDOW: usize = 3; +/// Number of Monte Carlo samples for global optimality check +const GLOBAL_OPTIMALITY_SAMPLES: usize = 500; +/// Threshold for D-criterion in global optimality check +const GLOBAL_OPTIMALITY_THRESHOLD: f64 = 0.01; + +// --- Restart Mechanism --- +/// Number of cycles without improvement before restart +const 
STAGNATION_CYCLES: usize = 15; +/// Maximum number of restarts +const MAX_RESTARTS: usize = 2; + +// ============================================================================ +// ALGORITHM STATE +// ============================================================================ + +/// Phase of the algorithm +#[derive(Debug, Clone, PartialEq)] +enum Phase { + /// Initial exploration with NPAG-style grid + Warmup, + /// Balanced exploration and exploitation + Hybrid, + /// Focus on refining existing points + Exploitation, + /// Final convergence checking + Convergence, +} + +impl std::fmt::Display for Phase { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Phase::Warmup => write!(f, "Warmup"), + Phase::Hybrid => write!(f, "Hybrid"), + Phase::Exploitation => write!(f, "Exploitation"), + Phase::Convergence => write!(f, "Convergence"), + } + } +} + +/// Elite point with its D-criterion value +#[derive(Debug, Clone)] +struct ElitePoint { + params: Vec, + d_value: f64, + age: usize, +} + +// ============================================================================ +// NPSAH2 STRUCT +// ============================================================================ + +/// NPSA-H2: Improved Non-Parametric Simulated Annealing Hybrid Algorithm +#[derive(Debug)] +pub struct NPSAH2 { + /// The pharmacometric equation/model + equation: E, + /// Parameter ranges for each dimension + ranges: Vec<(f64, f64)>, + /// Probability matrix: P(y_i | θ_j) + psi: Psi, + /// Support points (parameter values) + theta: Theta, + /// Weights from IPM before condensation + lambda: Weights, + /// Final weights after condensation + w: Weights, + /// Current grid spacing (NPAG-style) + eps: f64, + /// Previous objective function value + last_objf: f64, + /// Current objective function value + objf: f64, + /// Best objective function value seen + best_objf: f64, + /// P(Y|L) values for convergence checking + f0: f64, + f1: f64, + /// Current cycle number + 
cycle: usize, + /// Step sizes for error model optimization + gamma_delta: Vec, + /// Error models for observations + error_models: AssayErrorModels, + /// Algorithm status + status: Status, + /// Cycle log for tracking progress + cycle_log: CycleLog, + /// Subject data + data: Data, + /// Unified runtime/model-derived configuration + config: NativeNonparametricConfig, + + // NPSAH2 specific fields + /// Current simulated annealing temperature + temperature: f64, + /// History of objective function values + objf_history: Vec, + /// Random number generator + rng: StdRng, + /// Current algorithm phase + phase: Phase, + /// Elite points preserved across cycles + elite_points: Vec, + /// Number of accepted SA moves this cycle + sa_accepted: usize, + /// Number of proposed SA moves this cycle + sa_proposed: usize, + /// Cycles since last improvement + cycles_since_improvement: usize, + /// Number of restarts performed + restart_count: usize, + /// Effective cooling rate (adaptive) + cooling_rate: f64, +} + +// ============================================================================ +// ALGORITHMS TRAIT IMPLEMENTATION +// ============================================================================ + +impl Algorithms for NPSAH2 { + fn equation(&self) -> &E { + &self.equation + } + + fn into_workspace(&self) -> Result> { + NonparametricWorkspace::new( + self.equation.clone(), + self.data.clone(), + self.theta.clone(), + self.psi.clone(), + self.w.clone(), + -2. 
* self.objf, + self.cycle, + self.status.clone(), + self.config.run_configuration.clone(), + self.cycle_log.clone(), + ) + } + + fn error_models(&self) -> &AssayErrorModels { + &self.error_models + } + + fn data(&self) -> &Data { + &self.data + } + + fn get_prior(&self) -> Theta { + sample_space_for_parameters(&self.config.parameter_space, &self.config.prior).unwrap() + } + + fn likelihood(&self) -> f64 { + self.objf + } + + fn increment_cycle(&mut self) -> usize { + self.cycle += 1; + self.update_phase(); + self.adapt_temperature(); + self.cycle + } + + fn cycle(&self) -> usize { + self.cycle + } + + fn set_theta(&mut self, theta: Theta) { + self.theta = theta; + } + + fn theta(&self) -> &Theta { + &self.theta + } + + fn psi(&self) -> &Psi { + &self.psi + } + + fn set_status(&mut self, status: Status) { + self.status = status; + } + + fn status(&self) -> &Status { + &self.status + } + + fn evaluation(&mut self) -> Result { + tracing::info!( + "Cycle {} | Phase: {} | -2LL = {:.4} | SPPs = {} | T = {:.4}", + self.cycle, + self.phase, + -2.0 * self.objf, + self.theta.nspp(), + self.temperature + ); + + self.error_models.iter().for_each(|(outeq, em)| { + if AssayErrorModel::None == *em { + return; + } + tracing::debug!( + "Error model for outeq {}: {:.4}", + outeq, + em.factor().unwrap_or_default() + ); + }); + + // Track objective function + self.objf_history.push(self.objf); + + // Check for improvement + if self.objf > self.best_objf + THETA_G { + self.best_objf = self.objf; + self.cycles_since_improvement = 0; + } else { + self.cycles_since_improvement += 1; + } + + // Warn if objective function decreased + if self.last_objf > self.objf + 1e-4 { + tracing::warn!( + "Objective decreased: {:.4} -> {:.4}", + -2.0 * self.last_objf, + -2.0 * self.objf + ); + } + + // Check for stagnation and possibly restart + if self.cycles_since_improvement >= STAGNATION_CYCLES { + if self.restart_count < MAX_RESTARTS { + tracing::info!( + "Stagnation detected, performing restart 
#{}", + self.restart_count + 1 + ); + self.perform_restart()?; + self.set_status(Status::Continue); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + } + + // Early convergence check in exploitation phase when stable + // This avoids waiting for temperature to cool all the way down + if self.phase == Phase::Exploitation && self.cycles_since_improvement >= CONVERGENCE_WINDOW + { + if self.check_convergence()? { + tracing::info!( + "NPSAH2 converged after {} cycles (early convergence)", + self.cycle + ); + self.set_status(Status::Stop(StopReason::Converged)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + } + + // Multi-criterion convergence check in convergence phase + if self.phase == Phase::Convergence { + if self.check_convergence()? { + tracing::info!("NPSAH2 converged after {} cycles", self.cycle); + self.set_status(Status::Stop(StopReason::Converged)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + } + + // Standard NPAG-style convergence + let psi = self.psi.matrix(); + let w = &self.w; + if (self.last_objf - self.objf).abs() <= THETA_G && self.eps > THETA_E { + self.eps /= 2.; + tracing::debug!("Halving eps to {:.6}", self.eps); + + if self.eps <= THETA_E { + let pyl = psi * w.weights(); + self.f1 = pyl.iter().map(|x| x.ln()).sum(); + if (self.f1 - self.f0).abs() <= THETA_F { + tracing::info!("NPSAH2 converged (P(Y|L) criterion)"); + self.set_status(Status::Stop(StopReason::Converged)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } else { + self.f0 = self.f1; + self.eps = 0.2; + } + } + } + + // Check maximum cycles + if self.cycle >= self.config.max_cycles { + tracing::warn!("Maximum cycles reached"); + self.set_status(Status::Stop(StopReason::MaxCycles)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + // Check for stop file + if std::path::Path::new("stop").exists() { + tracing::warn!("Stop file detected"); + 
self.set_status(Status::Stop(StopReason::Stopped)); + self.log_cycle_state(); + return Ok(self.status().clone()); + } + + self.set_status(Status::Continue); + self.log_cycle_state(); + Ok(self.status().clone()) + } + + fn estimation(&mut self) -> Result<()> { + self.psi = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &self.error_models, + self.cycle == 1 && self.config.progress, + )?; + + if let Err(err) = self.validate_psi() { + bail!(err); + } + + (self.lambda, _) = match burke(&self.psi) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM during estimation: {:?}", err); + } + }; + Ok(()) + } + + fn condensation(&mut self) -> Result<()> { + // Lambda-filter with adaptive threshold + let max_lambda = self + .lambda + .iter() + .fold(f64::NEG_INFINITY, |acc, x| x.max(acc)); + + // More aggressive filtering in later phases + let filter_divisor = match self.phase { + Phase::Warmup => 1000.0, + Phase::Hybrid => 5000.0, + Phase::Exploitation => 10000.0, + Phase::Convergence => 10000.0, + }; + + let mut keep = Vec::::new(); + let filter_threshold = max_lambda / filter_divisor; + for (index, lam) in self.lambda.iter().enumerate() { + if lam > filter_threshold { + keep.push(index); + } + } + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!( + "Lambda filter dropped {} support point(s)", + self.psi.matrix().ncols() - keep.len(), + ); + } + + self.theta.filter_indices(keep.as_slice()); + self.psi.filter_column_indices(keep.as_slice()); + + // Rank-Revealing QR Factorization + let (r, perm) = qr::qrd(&self.psi)?; + + let mut keep = Vec::::new(); + let keep_n = self.psi.matrix().ncols().min(self.psi.matrix().nrows()); + + for i in 0..keep_n { + let test = r.col(i).norm_l2(); + let r_diag_val = r.get(i, i); + let ratio = r_diag_val / test; + if ratio.abs() >= 1e-8 { + keep.push(*perm.get(i).unwrap()); + } + } + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!( + "QR decomposition dropped {} support 
point(s)", + self.psi.matrix().ncols() - keep.len(), + ); + } + + self.theta.filter_indices(keep.as_slice()); + self.psi.filter_column_indices(keep.as_slice()); + + self.validate_psi()?; + + (self.lambda, self.objf) = match burke(&self.psi) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + return Err(anyhow::anyhow!( + "Error in IPM during condensation: {:?}", + err + )); + } + }; + self.w = self.lambda.clone(); + + // Update elite points after condensation + self.update_elite_points()?; + + Ok(()) + } + + fn optimizations(&mut self) -> Result<()> { + // Error model optimization (same as NPAG/NPOD) + self.error_models + .clone() + .iter_mut() + .filter_map(|(outeq, em)| { + if em.optimize() { + Some((outeq, em)) + } else { + None + } + }) + .try_for_each(|(outeq, em)| -> Result<()> { + let gamma_up = em.factor()? * (1.0 + self.gamma_delta[outeq]); + let gamma_down = em.factor()? / (1.0 + self.gamma_delta[outeq]); + + let mut error_model_up = self.error_models.clone(); + error_model_up.set_factor(outeq, gamma_up)?; + + let mut error_model_down = self.error_models.clone(); + error_model_down.set_factor(outeq, gamma_down)?; + + let psi_up = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &error_model_up, + false, + )?; + let psi_down = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &error_model_down, + false, + )?; + + let (lambda_up, objf_up) = match burke(&psi_up) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM during optim: {:?}", err); + } + }; + let (lambda_down, objf_down) = match burke(&psi_down) { + Ok((lambda, objf)) => (lambda, objf), + Err(err) => { + bail!("Error in IPM during optim: {:?}", err); + } + }; + + if objf_up > self.objf { + self.error_models.set_factor(outeq, gamma_up)?; + self.objf = objf_up; + self.gamma_delta[outeq] *= 4.; + self.lambda = lambda_up; + self.psi = psi_up; + } + if objf_down > self.objf { + self.error_models.set_factor(outeq, gamma_down)?; + self.objf = 
objf_down; + self.gamma_delta[outeq] *= 4.; + self.lambda = lambda_down; + self.psi = psi_down; + } + self.gamma_delta[outeq] *= 0.5; + if self.gamma_delta[outeq] <= 0.01 { + self.gamma_delta[outeq] = 0.1; + } + Ok(()) + })?; + + Ok(()) + } + + fn expansion(&mut self) -> Result<()> { + match self.phase { + Phase::Warmup => self.warmup_expansion()?, + Phase::Hybrid => self.hybrid_expansion()?, + Phase::Exploitation => self.exploitation_expansion()?, + Phase::Convergence => self.convergence_expansion()?, + } + Ok(()) + } + + fn log_cycle_state(&mut self) { + let state = NPCycle::new( + self.cycle, + -2. * self.objf, + self.error_models.clone(), + self.theta.clone(), + self.theta.nspp(), + (self.last_objf - self.objf).abs(), + self.status.clone(), + ); + self.cycle_log.push(state); + self.last_objf = self.objf; + } +} + +// ============================================================================ +// NPSAH2 SPECIFIC METHODS +// ============================================================================ + +impl NPSAH2 { + pub(crate) fn from_input(input: NonparametricAlgorithmInput) -> Result> { + let config = input.native_config()?; + let seed = config.prior.seed().unwrap_or(42) as u64; + let error_models = input.error_models().clone(); + + Ok(Box::new(Self { + equation: input.equation, + ranges: config.ranges.clone(), + psi: Psi::new(), + theta: Theta::new(), + lambda: Weights::default(), + w: Weights::default(), + eps: 0.2, + last_objf: -1e30, + objf: f64::NEG_INFINITY, + best_objf: f64::NEG_INFINITY, + f0: -1e30, + f1: f64::default(), + cycle: 0, + gamma_delta: vec![0.1; error_models.len()], + error_models, + status: Status::Continue, + cycle_log: CycleLog::new(), + data: input.data, + config, + temperature: INITIAL_TEMPERATURE, + objf_history: Vec::with_capacity(1000), + rng: StdRng::seed_from_u64(seed), + phase: Phase::Warmup, + elite_points: Vec::with_capacity(ELITE_COUNT), + sa_accepted: 0, + sa_proposed: 0, + cycles_since_improvement: 0, + restart_count: 0, 
+ cooling_rate: BASE_COOLING_RATE, + })) + } + + /// Update the algorithm phase based on cycle number and progress + fn update_phase(&mut self) { + let old_phase = self.phase.clone(); + + self.phase = if self.cycle <= WARMUP_CYCLES { + Phase::Warmup + } else if self.cycle <= WARMUP_CYCLES + EXPLOITATION_CYCLES { + Phase::Hybrid + } else if self.temperature > MIN_TEMPERATURE * 2.0 { + Phase::Exploitation + } else { + Phase::Convergence + }; + + if self.phase != old_phase { + tracing::info!("Phase transition: {} -> {}", old_phase, self.phase); + } + } + + /// Adapt temperature based on acceptance ratio + fn adapt_temperature(&mut self) { + if self.sa_proposed > 0 { + let acceptance_ratio = self.sa_accepted as f64 / self.sa_proposed as f64; + + // Adjust cooling rate based on acceptance ratio + if acceptance_ratio < TARGET_ACCEPTANCE_RATIO * 0.5 { + // Too cold, slow down cooling + self.cooling_rate = (self.cooling_rate + 0.02).min(0.98); + // Maybe reheat slightly + if acceptance_ratio < 0.1 && self.temperature < 0.5 { + self.temperature *= REHEAT_FACTOR; + tracing::debug!("Reheating to T = {:.4}", self.temperature); + } + } else if acceptance_ratio > TARGET_ACCEPTANCE_RATIO * 1.5 { + // Too hot, speed up cooling + self.cooling_rate = (self.cooling_rate - 0.02).max(0.85); + } + + tracing::debug!( + "SA acceptance: {:.1}% | Cooling rate: {:.3}", + acceptance_ratio * 100.0, + self.cooling_rate + ); + } + + // Apply cooling + self.temperature *= self.cooling_rate; + if self.temperature < MIN_TEMPERATURE { + self.temperature = MIN_TEMPERATURE; + } + + // Reset counters + self.sa_accepted = 0; + self.sa_proposed = 0; + } + + /// Warm-up phase: broad exploration with LHS and grid + fn warmup_expansion(&mut self) -> Result<()> { + tracing::debug!("Warmup expansion: LHS + adaptive grid"); + + // Latin Hypercube Sampling for better initial coverage + self.lhs_injection(LHS_SAMPLES)?; + + // Also do NPAG-style grid expansion + adaptative_grid(&mut self.theta, self.eps, 
&self.ranges, THETA_D)?; + + Ok(()) + } + + /// Hybrid phase: balanced exploration and exploitation + fn hybrid_expansion(&mut self) -> Result<()> { + let initial = self.theta.nspp(); + + // 1. D-optimal refinement for existing high-weight points + self.d_optimal_refinement()?; + let after_dopt = self.theta.nspp(); + + // 2. Local SA moves around high-weight points + self.local_sa_injection()?; + let after_local = self.theta.nspp(); + + // 3. Sparse grid expansion + self.sparse_grid_expansion()?; + let after_grid = self.theta.nspp(); + + // 4. Global SA injection with temperature-aware count + self.sa_injection()?; + let after_sa = self.theta.nspp(); + + // 5. Re-inject elite points + self.inject_elite_points()?; + let after_elite = self.theta.nspp(); + + tracing::debug!( + "Hybrid: {} -> {} (D-opt) -> {} (local) -> {} (grid) -> {} (SA) -> {} (elite)", + initial, + after_dopt, + after_local, + after_grid, + after_sa, + after_elite + ); + + Ok(()) + } + + /// Exploitation phase: focus on refining existing points (lightweight) + fn exploitation_expansion(&mut self) -> Result<()> { + tracing::debug!("Exploitation expansion: D-optimal + light grid"); + + // D-optimal refinement (only high-weight points) + self.d_optimal_refinement()?; + + // Light grid expansion + adaptative_grid(&mut self.theta, self.eps * 0.5, &self.ranges, THETA_D * 2.0)?; + + Ok(()) + } + + /// Convergence phase: minimal expansion, focus on verification + fn convergence_expansion(&mut self) -> Result<()> { + tracing::debug!("Convergence expansion: minimal changes"); + + // Only light D-optimal refinement + let eps = self.eps * 0.25; + adaptative_grid(&mut self.theta, eps, &self.ranges, THETA_D * 2.0)?; + + Ok(()) + } + + /// Latin Hypercube Sampling for initial exploration + fn lhs_injection(&mut self, n_samples: usize) -> Result<()> { + let n_dims = self.ranges.len(); + + // Generate LHS samples with safety margins + let mut samples: Vec> = (0..n_samples) + .map(|_| { + self.ranges + .iter() + 
.map(|(lo, hi)| { + let margin = (hi - lo) * BOUNDARY_MARGIN_RATIO; + self.rng.random_range((lo + margin)..(hi - margin)) + }) + .collect() + }) + .collect(); + + // Improve LHS quality with random permutation in each dimension + for dim in 0..n_dims { + let (lo, hi) = self.ranges[dim]; + let margin = (hi - lo) * BOUNDARY_MARGIN_RATIO; + let safe_lo = lo + margin; + let safe_hi = hi - margin; + let step = (safe_hi - safe_lo) / n_samples as f64; + + let mut perm: Vec = (0..n_samples).collect(); + perm.shuffle(&mut self.rng); + + for (i, &p) in perm.iter().enumerate() { + let jitter = self.rng.random_range(0.0..step); + samples[i][dim] = safe_lo + step * p as f64 + jitter; + } + } + + // Add samples to theta + let mut added = 0; + for sample in samples { + if self.theta.check_point(&sample, THETA_D) { + self.theta.add_point(&sample)?; + added += 1; + } + } + + tracing::debug!("LHS injection: added {} of {} samples", added, n_samples); + Ok(()) + } + + /// Sparse grid expansion in low-density regions + fn sparse_grid_expansion(&mut self) -> Result<()> { + let sparse_eps = self.eps * 0.5; + adaptative_grid(&mut self.theta, sparse_eps, &self.ranges, THETA_D * 2.0)?; + Ok(()) + } + + /// D-optimal refinement with adaptive iteration count + /// Only refines points with significant weight to save computation + fn d_optimal_refinement(&mut self) -> Result<()> { + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + let error_model: AssayErrorModels = self.error_models.clone(); + let max_weight = self.w.iter().fold(f64::NEG_INFINITY, |acc, x| x.max(acc)); + + let n_points_with_weights = self.w.len().min(self.theta.nspp()); + + // Only refine points with meaningful weight (>1% of max) + let min_weight_threshold = max_weight * 0.01; + + let mut candidate_points: Vec<(Array1, f64)> = Vec::default(); + + for (idx, spp) in self + .theta + .matrix() + .row_iter() + .enumerate() + .take(n_points_with_weights) + { + let 
weight = self.w[idx]; + // Skip points with negligible weight + if weight < min_weight_threshold { + continue; + } + + let candidate: Vec = spp.iter().cloned().collect(); + let importance = weight / max_weight; + candidate_points.push((Array1::from(candidate), importance)); + } + + tracing::debug!( + "D-optimal: refining {} of {} points", + candidate_points.len(), + n_points_with_weights + ); + + // Optimize points in parallel + let ranges = self.ranges.clone(); + candidate_points + .par_iter_mut() + .for_each(|(spp, importance)| { + let max_iters = if *importance > HIGH_IMPORTANCE_THRESHOLD { + HIGH_IMPORTANCE_MAX_ITERS + } else if *importance > HIGH_IMPORTANCE_THRESHOLD * 0.1 { + MEDIUM_IMPORTANCE_MAX_ITERS + } else { + LOW_IMPORTANCE_MAX_ITERS + }; + + let optimizer = SppOptimizerAdaptive::new( + &self.equation, + &self.data, + &error_model, + &pyl, + max_iters, + ); + + if let Ok(candidate_point) = optimizer.optimize_point(spp.clone()) { + // Clamp to safe boundaries to avoid ODE solver issues + let clamped: Array1 = candidate_point + .iter() + .zip(ranges.iter()) + .map(|(&val, &(lo, hi))| { + let margin = (hi - lo) * BOUNDARY_MARGIN_RATIO; + val.clamp(lo + margin, hi - margin) + }) + .collect(); + *spp = clamped; + } + }); + + // Add optimized points + for (cp, _) in candidate_points { + self.theta.suggest_point(cp.to_vec().as_slice(), THETA_D)?; + } + + Ok(()) + } + + /// Simulated annealing point injection + fn sa_injection(&mut self) -> Result<()> { + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + // Temperature-dependent injection count + let n_inject = (SA_INJECT_BASE as f64 * (self.temperature / INITIAL_TEMPERATURE).sqrt()) + .ceil() as usize; + let n_inject = n_inject.max(3); + + let mut accepted = 0; + let mut proposed = 0; + + for _ in 0..n_inject * 20 { + proposed += 1; + + // Generate random point with safety margins + let point: Vec = self + .ranges + .iter() + .map(|(lo, hi)| { + 
let margin = (hi - lo) * BOUNDARY_MARGIN_RATIO; + self.rng.random_range((lo + margin)..(hi - margin)) + }) + .collect(); + + // Compute D-criterion + let d_value = self.compute_d_criterion(&point, &pyl)?; + + // Metropolis acceptance + let accept = if d_value > 0.0 { + true + } else { + let p_accept = (d_value / self.temperature).exp(); + self.rng.random::() < p_accept + }; + + if accept { + if self.theta.check_point(&point, THETA_D) { + self.theta.add_point(&point)?; + accepted += 1; + } + } + + if accepted >= n_inject { + break; + } + } + + self.sa_accepted += accepted; + self.sa_proposed += proposed; + + tracing::debug!( + "SA injection: {}/{} accepted (T={:.4})", + accepted, + proposed, + self.temperature + ); + Ok(()) + } + + /// Local SA moves around existing high-weight points + fn local_sa_injection(&mut self) -> Result<()> { + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + let max_weight = self.w.iter().fold(f64::NEG_INFINITY, |acc, x| x.max(acc)); + let n_points = self.w.len().min(self.theta.nspp()); + + let mut new_points = Vec::new(); + + for (idx, spp) in self.theta.matrix().row_iter().enumerate().take(n_points) { + let importance = self.w[idx] / max_weight; + if importance < HIGH_IMPORTANCE_THRESHOLD * 0.5 { + continue; + } + + // Local perturbation + let current: Vec = spp.iter().cloned().collect(); + let scale = self.temperature * 0.1; + + for _ in 0..5 { + self.sa_proposed += 1; + + let perturbed: Vec = current + .iter() + .zip(self.ranges.iter()) + .map(|(&val, &(lo, hi))| { + let range = hi - lo; + let margin = range * BOUNDARY_MARGIN_RATIO; + let delta = self.rng.random_range(-scale..scale) * range; + (val + delta).clamp(lo + margin, hi - margin) + }) + .collect(); + + let d_value = self.compute_d_criterion(&perturbed, &pyl)?; + + if d_value > 0.0 || self.rng.random::() < (d_value / self.temperature).exp() { + if self.theta.check_point(&perturbed, THETA_D) { + 
new_points.push(perturbed); + self.sa_accepted += 1; + } + } + } + } + + for point in new_points { + self.theta.add_point(&point)?; + } + + Ok(()) + } + + /// Update elite points based on current weights and D-values + fn update_elite_points(&mut self) -> Result<()> { + if self.w.len() == 0 { + return Ok(()); + } + + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + // Age existing elite points + for elite in &mut self.elite_points { + elite.age += 1; + } + + // Remove old elite points + self.elite_points.retain(|e| e.age < 20); + + // Find top points by weight + let mut indexed_weights: Vec<(usize, f64)> = self.w.iter().enumerate().collect(); + indexed_weights.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + for (idx, _weight) in indexed_weights.iter().take(ELITE_COUNT) { + if *idx >= self.theta.nspp() { + continue; + } + + let params: Vec = self.theta.matrix().row(*idx).iter().cloned().collect(); + let d_value = self.compute_d_criterion(¶ms, &pyl).unwrap_or(0.0); + + // Check if this point is already elite + let already_elite = self.elite_points.iter().any(|e| { + e.params + .iter() + .zip(¶ms) + .all(|(a, b)| (a - b).abs() < THETA_D) + }); + + if !already_elite && self.elite_points.len() < ELITE_COUNT * 2 { + self.elite_points.push(ElitePoint { + params, + d_value, + age: 0, + }); + } + } + + // Keep only top elite points + self.elite_points + .sort_by(|a, b| b.d_value.partial_cmp(&a.d_value).unwrap()); + self.elite_points.truncate(ELITE_COUNT); + + Ok(()) + } + + /// Inject elite points back into theta + fn inject_elite_points(&mut self) -> Result<()> { + for elite in &self.elite_points { + if self.theta.check_point(&elite.params, THETA_D) { + self.theta.add_point(&elite.params)?; + } + } + Ok(()) + } + + /// Perform restart when stuck + fn perform_restart(&mut self) -> Result<()> { + self.restart_count += 1; + + // Reset temperature + self.temperature = INITIAL_TEMPERATURE * 
0.5_f64.powi(self.restart_count as i32); + + // Reset phase + self.phase = Phase::Hybrid; + + // Reset cooling rate + self.cooling_rate = BASE_COOLING_RATE; + + // Reset stagnation counter + self.cycles_since_improvement = 0; + + // Inject diverse points via LHS + self.lhs_injection(LHS_SAMPLES / 2)?; + + tracing::info!( + "Restart complete: T={:.4}, {} elite points preserved", + self.temperature, + self.elite_points.len() + ); + + Ok(()) + } + + /// Compute D-criterion for a candidate point + fn compute_d_criterion(&self, point: &[f64], pyl: &Array1) -> Result { + let theta_single = Array1::from(point.to_vec()).insert_axis(Axis(0)); + + let psi_single = pharmsol::prelude::simulator::log_likelihood_matrix( + &self.equation, + &self.data, + &theta_single, + &self.error_models, + false, + )? + .mapv(f64::exp); + + let nsub = psi_single.nrows() as f64; + let mut d_sum = -nsub; + + for (p_i, pyl_i) in psi_single.iter().zip(pyl.iter()) { + d_sum += p_i / pyl_i; + } + + Ok(d_sum) + } + + /// Multi-criterion convergence check + fn check_convergence(&mut self) -> Result { + if self.objf_history.len() < CONVERGENCE_WINDOW { + return Ok(false); + } + + // Criterion 1: Objective function stability + let recent: Vec = self + .objf_history + .iter() + .rev() + .take(CONVERGENCE_WINDOW) + .cloned() + .collect(); + + let objf_stable = recent.windows(2).all(|w| (w[0] - w[1]).abs() < THETA_G); + + if !objf_stable { + return Ok(false); + } + + // Criterion 2: Global optimality via Monte Carlo + let psi = self.psi().to_ndarray(); + let w: Array1 = self.w.clone().iter().collect(); + let pyl = psi.dot(&w); + + let max_d = self.monte_carlo_global_check(&pyl)?; + + if max_d > GLOBAL_OPTIMALITY_THRESHOLD { + tracing::debug!("Global check failed: max_D = {:.6}", max_d); + return Ok(false); + } + + tracing::debug!("Global check passed: max_D = {:.6}", max_d); + Ok(true) + } + + /// Monte Carlo estimate of maximum D-criterion + fn monte_carlo_global_check(&mut self, pyl: &Array1) -> Result { 
+ let points: Vec> = (0..GLOBAL_OPTIMALITY_SAMPLES) + .map(|_| { + self.ranges + .iter() + .map(|(lo, hi)| self.rng.random_range(*lo..*hi)) + .collect() + }) + .collect(); + + let max_d = points + .into_par_iter() + .filter_map(|point| self.compute_d_criterion(&point, pyl).ok()) + .reduce(|| f64::NEG_INFINITY, f64::max); + + Ok(max_d) + } +} + +// ============================================================================ +// ADAPTIVE SPP OPTIMIZER +// ============================================================================ + +use argmin::{ + core::{CostFunction, Error, Executor}, + solver::neldermead::NelderMead, +}; + +/// Support Point Optimizer with configurable iteration count +struct SppOptimizerAdaptive<'a, E: Equation> { + equation: &'a E, + data: &'a Data, + sig: &'a AssayErrorModels, + pyl: &'a Array1, + max_iters: u64, +} + +impl CostFunction for SppOptimizerAdaptive<'_, E> { + type Param = Vec; + type Output = f64; + + fn cost(&self, spp: &Self::Param) -> Result { + let theta = Array1::from(spp.clone()).insert_axis(Axis(0)); + + let psi = pharmsol::prelude::simulator::log_likelihood_matrix( + self.equation, + self.data, + &theta, + self.sig, + false, + )? 
+ .mapv(f64::exp); + + let nsub = psi.nrows() as f64; + let mut sum = -nsub; + for (p_i, pyl_i) in psi.iter().zip(self.pyl.iter()) { + sum += p_i / pyl_i; + } + Ok(-sum) // Minimize negative D → Maximize D + } +} + +impl<'a, E: Equation> SppOptimizerAdaptive<'a, E> { + fn new( + equation: &'a E, + data: &'a Data, + sig: &'a AssayErrorModels, + pyl: &'a Array1, + max_iters: u64, + ) -> Self { + Self { + equation, + data, + sig, + pyl, + max_iters, + } + } + + fn optimize_point(self, spp: Array1) -> Result, Error> { + let simplex = create_initial_simplex(&spp.to_vec()); + let tolerance = if self.max_iters > 50 { 1e-4 } else { 1e-2 }; + let max_iters = self.max_iters; + + let solver: NelderMead, f64> = + NelderMead::new(simplex).with_sd_tolerance(tolerance)?; + + let res = Executor::new(self, solver) + .configure(|state| state.max_iters(max_iters)) + .run()?; + + Ok(Array1::from(res.state.best_param.unwrap())) + } +} + +/// Create initial simplex for Nelder-Mead optimization +fn create_initial_simplex(initial_point: &[f64]) -> Vec> { + let num_dimensions = initial_point.len(); + let perturbation_percentage = 0.05; + + let mut vertices = Vec::new(); + vertices.push(initial_point.to_vec()); + + for i in 0..num_dimensions { + let perturbation = if initial_point[i] == 0.0 { + 0.001 + } else { + perturbation_percentage * initial_point[i].abs() + }; + + let mut perturbed_point = initial_point.to_vec(); + perturbed_point[i] += perturbation; + vertices.push(perturbed_point); + } + + vertices +} + +// ============================================================================ +// TESTS +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_phase_display() { + assert_eq!(format!("{}", Phase::Warmup), "Warmup"); + assert_eq!(format!("{}", Phase::Hybrid), "Hybrid"); + assert_eq!(format!("{}", Phase::Exploitation), "Exploitation"); + assert_eq!(format!("{}", Phase::Convergence), 
"Convergence"); + } + + #[test] + fn test_initial_simplex() { + let point = vec![1.0, 2.0, 3.0]; + let simplex = create_initial_simplex(&point); + assert_eq!(simplex.len(), 4); + assert_eq!(simplex[0], point); + } + + #[test] + fn test_temperature_bounds() { + assert!(INITIAL_TEMPERATURE > MIN_TEMPERATURE); + assert!(BASE_COOLING_RATE > 0.0 && BASE_COOLING_RATE < 1.0); + } + + #[test] + fn test_convergence_window() { + assert!(CONVERGENCE_WINDOW >= 2); + } +} diff --git a/src/algorithms/nonparametric/npxo/constants.rs b/src/algorithms/nonparametric/npxo/constants.rs new file mode 100644 index 000000000..7a0196e2b --- /dev/null +++ b/src/algorithms/nonparametric/npxo/constants.rs @@ -0,0 +1,34 @@ +//! Constants for NPXO algorithm + +/// Weight threshold for condensation +pub const THETA_G: f64 = 1e-4; + +/// Distance threshold for new points +pub const THETA_D: f64 = 1e-4; + +/// Number of offspring to generate each cycle +pub const CROSSOVER_COUNT: usize = 30; + +/// BLX-α extension parameter (0 = no extension, 0.5 = 50% extension) +pub const BLX_ALPHA: f64 = 0.25; + +/// SBX distribution index (higher = closer to parents) +pub const SBX_ETA: f64 = 20.0; + +/// Minimum cycles before convergence check +pub const MIN_CYCLES: usize = 5; + +/// Objective function tolerance for convergence +pub const OBJF_TOLERANCE: f64 = 1e-3; + +/// Number of stable cycles for convergence +pub const STABLE_CYCLES: usize = 3; + +/// Boundary margin +pub const BOUNDARY_MARGIN: f64 = 0.001; + +/// Mutation probability (small random perturbation) +pub const MUTATION_PROB: f64 = 0.1; + +/// Mutation scale as fraction of range +pub const MUTATION_SCALE: f64 = 0.05; diff --git a/src/algorithms/nonparametric/npxo/crossover.rs b/src/algorithms/nonparametric/npxo/crossover.rs new file mode 100644 index 000000000..ac2dfb7cf --- /dev/null +++ b/src/algorithms/nonparametric/npxo/crossover.rs @@ -0,0 +1,149 @@ +//! 
Crossover operators for NPXO + +use super::constants::*; +use crate::estimation::nonparametric::{Theta, Weights}; + +use anyhow::Result; +use rand::prelude::*; + +/// Generate offspring via crossover of high-weight parents +pub fn generate_offspring( + theta: &Theta, + weights: &Weights, + ranges: &[(f64, f64)], + count: usize, + rng: &mut R, +) -> Result>> { + let n_spp = theta.nspp(); + if n_spp < 2 { + return Ok(Vec::new()); + } + + // Get parents sorted by weight + let mut indexed: Vec<(usize, f64)> = weights.iter().enumerate().collect(); + indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + + let n_parents = (n_spp / 2).max(2).min(10); + let parents: Vec = indexed.iter().take(n_parents).map(|(i, _)| *i).collect(); + + let mut offspring = Vec::with_capacity(count); + let matrix = theta.matrix(); + + for _ in 0..count { + // Select two parents (weighted selection favors high-weight) + let p1_idx = parents[rng.random_range(0..parents.len())]; + let p2_idx = loop { + let idx = parents[rng.random_range(0..parents.len())]; + if idx != p1_idx { + break idx; + } + }; + + let parent1: Vec = matrix.row(p1_idx).iter().copied().collect(); + let parent2: Vec = matrix.row(p2_idx).iter().copied().collect(); + + // Choose crossover operator randomly + let child = match rng.random_range(0..3) { + 0 => arithmetic_crossover(&parent1, &parent2, rng), + 1 => blx_alpha_crossover(&parent1, &parent2, ranges, rng), + _ => sbx_crossover(&parent1, &parent2, ranges, rng), + }; + + // Optional mutation + let child = if rng.random::() < MUTATION_PROB { + mutate(&child, ranges, rng) + } else { + child + }; + + // Clamp to bounds + let clamped: Vec = child + .iter() + .zip(ranges.iter()) + .map(|(&v, (lo, hi))| { + let margin = (hi - lo) * BOUNDARY_MARGIN; + v.clamp(lo + margin, hi - margin) + }) + .collect(); + + offspring.push(clamped); + } + + Ok(offspring) +} + +/// Arithmetic crossover: child = α·p1 + (1-α)·p2 +fn arithmetic_crossover(p1: &[f64], p2: 
&[f64], rng: &mut R) -> Vec { + let alpha: f64 = rng.random(); + p1.iter() + .zip(p2.iter()) + .map(|(&a, &b)| alpha * a + (1.0 - alpha) * b) + .collect() +} + +/// BLX-α crossover: sample from extended box between parents +fn blx_alpha_crossover( + p1: &[f64], + p2: &[f64], + ranges: &[(f64, f64)], + rng: &mut R, +) -> Vec { + p1.iter() + .zip(p2.iter()) + .zip(ranges.iter()) + .map(|((&a, &b), (lo, hi))| { + let (min_val, max_val) = if a < b { (a, b) } else { (b, a) }; + let range = max_val - min_val; + let extension = range * BLX_ALPHA; + + let lower = (min_val - extension).max(*lo); + let upper = (max_val + extension).min(*hi); + + rng.random_range(lower..=upper) + }) + .collect() +} + +/// Simulated Binary Crossover (SBX) +fn sbx_crossover(p1: &[f64], p2: &[f64], ranges: &[(f64, f64)], rng: &mut R) -> Vec { + let eta = SBX_ETA; + + p1.iter() + .zip(p2.iter()) + .zip(ranges.iter()) + .map(|((&y1, &y2), (lo, hi))| { + if (y2 - y1).abs() < 1e-14 { + return y1; + } + + let (y1, y2) = if y1 < y2 { (y1, y2) } else { (y2, y1) }; + + let u: f64 = rng.random(); + let beta = if u <= 0.5 { + (2.0 * u).powf(1.0 / (eta + 1.0)) + } else { + (1.0 / (2.0 * (1.0 - u))).powf(1.0 / (eta + 1.0)) + }; + + let c1 = 0.5 * ((y1 + y2) - beta * (y2 - y1)); + let c2 = 0.5 * ((y1 + y2) + beta * (y2 - y1)); + + // Return one child randomly + let child = if rng.random() { c1 } else { c2 }; + child.clamp(*lo, *hi) + }) + .collect() +} + +/// Small random mutation +fn mutate(point: &[f64], ranges: &[(f64, f64)], rng: &mut R) -> Vec { + point + .iter() + .zip(ranges.iter()) + .map(|(&v, (lo, hi))| { + let scale = (hi - lo) * MUTATION_SCALE; + let delta: f64 = rng.random_range(-scale..scale); + (v + delta).clamp(*lo, *hi) + }) + .collect() +} diff --git a/src/algorithms/nonparametric/npxo/mod.rs b/src/algorithms/nonparametric/npxo/mod.rs new file mode 100644 index 000000000..d5771fa98 --- /dev/null +++ b/src/algorithms/nonparametric/npxo/mod.rs @@ -0,0 +1,428 @@ +//! 
# NPXO: Non-Parametric Crossover Optimization +//! +//! Uses genetic crossover operators to explore the space between good support points. +//! +//! ## Key Innovation +//! +//! Instead of perturbing single points (SA) or following velocity (PSO), NPXO "breeds" +//! pairs of high-weight support points to create offspring in between them. +//! +//! ## Crossover Operators +//! +//! 1. **Arithmetic crossover**: child = α·parent1 + (1-α)·parent2 +//! 2. **BLX-α crossover**: child sampled from extended box between parents +//! 3. **Simulated Binary Crossover (SBX)**: Mimics single-point crossover for continuous +//! +//! ## Why Crossover? +//! +//! 1. **Exploits correlations**: If two points are good, region between may be too +//! 2. **Preserves structure**: New points inherit properties from good parents +//! 3. **Fast convergence**: Directly targets promising regions +//! 4. **Low computational cost**: No gradient or surrogate needed + +mod constants; +mod crossover; + +pub use constants::*; + +use crate::algorithms::{NativeNonparametricConfig, NonparametricAlgorithmInput, StopReason}; +use crate::estimation::nonparametric::ipm::burke; +use crate::estimation::nonparametric::qr; +use crate::estimation::nonparametric::sample_space_for_parameters; +use crate::estimation::nonparametric::{ + calculate_psi, CycleLog, NPCycle, NonparametricWorkspace, Psi, Theta, Weights, +}; +use crate::{algorithms::Status, prelude::algorithms::Algorithms}; + +use anyhow::{bail, Result}; +use ndarray::{Array, ArrayBase, Dim, OwnedRepr}; +use pharmsol::prelude::data::Data; +use pharmsol::prelude::simulator::Equation; +use pharmsol::{prelude::AssayErrorModel, AssayErrorModels, Subject}; +use rand::prelude::*; +use rand::SeedableRng; + +// ============================================================================ +// NPXO STRUCT +// ============================================================================ + +pub struct NPXO { + equation: E, + ranges: Vec<(f64, f64)>, + psi: Psi, + theta: 
Theta, + lambda: Weights, + w: Weights, + last_objf: f64, + objf: f64, + best_objf: f64, + cycle: usize, + gamma_delta: Vec<f64>, + error_models: AssayErrorModels, + status: Status, + cycle_log: CycleLog, + data: Data, + config: NativeNonparametricConfig, + + // Crossover specific + objf_history: Vec<f64>, + rng: StdRng, +} + +// ============================================================================ +// ALGORITHMS TRAIT +// ============================================================================ + +impl<E: Equation> Algorithms<E> for NPXO<E> { + fn equation(&self) -> &E { + &self.equation + } + + fn error_models(&self) -> &AssayErrorModels { + &self.error_models + } + + fn data(&self) -> &Data { + &self.data + } + + fn get_prior(&self) -> Theta { + sample_space_for_parameters(&self.config.parameter_space, &self.config.prior).unwrap() + } + + fn likelihood(&self) -> f64 { + self.objf + } + + fn increment_cycle(&mut self) -> usize { + self.cycle += 1; + if self.objf > self.best_objf + THETA_G { + self.best_objf = self.objf; + } + self.cycle + } + + fn cycle(&self) -> usize { + self.cycle + } + + fn set_theta(&mut self, theta: Theta) { + self.theta = theta; + } + + fn theta(&self) -> &Theta { + &self.theta + } + + fn psi(&self) -> &Psi { + &self.psi + } + + fn set_status(&mut self, status: Status) { + self.status = status; + } + + fn status(&self) -> &Status { + &self.status + } + + fn log_cycle_state(&mut self) { + let state = NPCycle::new( + self.cycle, + -2.0 * self.objf, + self.error_models.clone(), + self.theta.clone(), + self.theta.nspp(), + (self.last_objf - self.objf).abs(), + self.status.clone(), + ); + self.cycle_log.push(state); + self.last_objf = self.objf; + } + + fn evaluation(&mut self) -> Result<Status> { + tracing::info!("Objective function = {:.4}", -2.0 * self.objf); + tracing::debug!("Support points: {}", self.theta.nspp()); + + self.error_models.iter().for_each(|(outeq, em)| { + if AssayErrorModel::None != *em { + tracing::debug!( + "Error model outeq {}: {:.4}", + outeq, + 
em.factor().unwrap_or_default() + ); + } + }); + + self.objf_history.push(self.objf); + + if self.last_objf > self.objf + 1e-4 { + tracing::warn!( + "Objective decreased: {:.4} → {:.4}", + -2.0 * self.last_objf, + -2.0 * self.objf + ); + } + + // Check convergence + let converged = self.check_convergence(); + let max_cycles = self.config.max_cycles; + + if converged { + tracing::info!("NPXO converged at cycle {}", self.cycle); + self.status = Status::Stop(StopReason::Converged); + } else if self.cycle >= max_cycles { + tracing::info!("NPXO max cycles: {}", max_cycles); + self.status = Status::Stop(StopReason::MaxCycles); + } else if std::path::Path::new("stop").exists() { + tracing::warn!("Stop file detected"); + self.status = Status::Stop(StopReason::Stopped); + } + + self.log_cycle_state(); + Ok(self.status.clone()) + } + + fn estimation(&mut self) -> Result<()> { + self.psi = calculate_psi( + &self.equation, + &self.data, + &self.theta, + &self.error_models, + self.cycle == 1 && self.config.progress, + )?; + + if let Err(err) = self.validate_psi() { + bail!(err); + } + + let (lambda, objf) = burke(&self.psi)?; + self.lambda = lambda; + self.objf = objf; + + tracing::debug!( + "NPXO cycle {}: -2LL = {:.4}, {} SPP", + self.cycle, + -2.0 * objf, + self.theta.nspp() + ); + + Ok(()) + } + + fn condensation(&mut self) -> Result<()> { + // Lambda threshold pruning + let max_lambda = self + .lambda + .iter() + .fold(f64::NEG_INFINITY, |acc, x| x.max(acc)); + let mut keep: Vec<usize> = self + .lambda + .iter() + .enumerate() + .filter(|(_, lam)| *lam > max_lambda / 1000.0) + .map(|(i, _)| i) + .collect(); + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!( + "Lambda pruning dropped {} SPP", + self.psi.matrix().ncols() - keep.len() + ); + } + + self.theta.filter_indices(&keep); + self.psi.filter_column_indices(&keep); + + // QR decomposition + let (r, perm) = qr::qrd(&self.psi)?; + let keep_n = self.psi.matrix().ncols().min(self.psi.matrix().nrows()); + + 
keep.clear(); + for i in 0..keep_n { + let test = r.col(i).norm_l2(); + let r_diag = r.get(i, i); + if (r_diag / test).abs() >= 1e-8 { + keep.push(*perm.get(i).unwrap()); + } + } + + if self.psi.matrix().ncols() != keep.len() { + tracing::debug!("QR dropped {} SPP", self.psi.matrix().ncols() - keep.len()); + } + + self.theta.filter_indices(&keep); + self.psi.filter_column_indices(&keep); + + let (lambda, objf) = burke(&self.psi)?; + self.lambda = lambda; + self.objf = objf; + self.w = self.lambda.clone(); + + Ok(()) + } + + fn optimizations(&mut self) -> Result<()> { + self.optimize_error_models()?; + Ok(()) + } + + fn expansion(&mut self) -> Result<()> { + // Generate offspring via crossover + let offspring = crossover::generate_offspring( + &self.theta, + &self.w, + &self.ranges, + CROSSOVER_COUNT, + &mut self.rng, + )?; + + // Add offspring to theta + for point in offspring { + self.theta.suggest_point(&point, THETA_D)?; + } + + tracing::debug!("NPXO: Expanded to {} SPP", self.theta.nspp()); + Ok(()) + } + + fn into_workspace(&self) -> Result<NonparametricWorkspace<E>> { + NonparametricWorkspace::new( + self.equation.clone(), + self.data.clone(), + self.theta.clone(), + self.psi.clone(), + self.w.clone(), + -2.0 * self.objf, + self.cycle, + self.status.clone(), + self.config.run_configuration.clone(), + self.cycle_log.clone(), + ) + } +} + +// ============================================================================ +// NPXO SPECIFIC METHODS +// ============================================================================ + +impl<E: Equation> NPXO<E> { + pub(crate) fn from_input(input: NonparametricAlgorithmInput<E>) -> Result<Box<Self>> { + let config = input.native_config()?; + let seed = config.prior.seed().unwrap_or(42) as u64; + let error_models = input.error_models().clone(); + + Ok(Box::new(Self { + equation: input.equation, + ranges: config.ranges.clone(), + psi: Psi::new(), + theta: Theta::new(), + lambda: Weights::default(), + w: Weights::default(), + last_objf: -1e30, + objf: f64::NEG_INFINITY, + 
best_objf: f64::NEG_INFINITY, + cycle: 0, + gamma_delta: vec![0.1; error_models.len()], + error_models, + status: Status::Continue, + cycle_log: CycleLog::new(), + data: input.data, + config, + objf_history: Vec::with_capacity(500), + rng: StdRng::seed_from_u64(seed), + })) + } + + fn check_convergence(&self) -> bool { + if self.cycle < MIN_CYCLES { + return false; + } + + if self.objf_history.len() < STABLE_CYCLES { + return false; + } + + let recent: Vec<f64> = self + .objf_history + .iter() + .rev() + .take(STABLE_CYCLES) + .copied() + .collect(); + let max_val = recent.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + let min_val = recent.iter().cloned().fold(f64::INFINITY, f64::min); + + (max_val - min_val).abs() < OBJF_TOLERANCE + } + + fn optimize_error_models(&mut self) -> Result<()> { + for (outeq, em) in self.error_models.clone().iter_mut() { + if *em == AssayErrorModel::None || em.is_factor_fixed().unwrap_or(true) { + continue; + } + + let gamma_up = em.factor()? * (1.0 + self.gamma_delta[outeq]); + let gamma_down = em.factor()? 
/ (1.0 + self.gamma_delta[outeq]); + + let mut em_up = self.error_models.clone(); + em_up.set_factor(outeq, gamma_up)?; + + let mut em_down = self.error_models.clone(); + em_down.set_factor(outeq, gamma_down)?; + + let psi_up = calculate_psi(&self.equation, &self.data, &self.theta, &em_up, false)?; + let psi_down = calculate_psi(&self.equation, &self.data, &self.theta, &em_down, false)?; + + let (lambda_up, objf_up) = burke(&psi_up)?; + let (lambda_down, objf_down) = burke(&psi_down)?; + + if objf_up > self.objf { + self.error_models.set_factor(outeq, gamma_up)?; + self.objf = objf_up; + self.gamma_delta[outeq] *= 4.0; + self.lambda = lambda_up; + self.psi = psi_up; + } + if objf_down > self.objf { + self.error_models.set_factor(outeq, gamma_down)?; + self.objf = objf_down; + self.gamma_delta[outeq] *= 4.0; + self.lambda = lambda_down; + self.psi = psi_down; + } + + self.gamma_delta[outeq] *= 0.5; + if self.gamma_delta[outeq] <= 0.01 { + self.gamma_delta[outeq] = 0.1; + } + } + + Ok(()) + } + + #[allow(dead_code)] + fn validate_psi(&self) -> Result<()> { + let psi = self.psi.to_ndarray(); + let (_, col) = psi.dim(); + let ecol: ArrayBase<OwnedRepr<f64>, Dim<[usize; 1]>> = Array::ones(col); + let plam = psi.dot(&ecol); + let w = 1.0 / &plam; + + let bad_indices: Vec<usize> = w + .iter() + .enumerate() + .filter(|(_, x)| x.is_nan() || x.is_infinite()) + .map(|(i, _)| i) + .collect(); + + if !bad_indices.is_empty() { + let subjects: Vec<&Subject> = self.data.subjects(); + let bad_subjects: Vec<&String> = + bad_indices.iter().map(|&i| subjects[i].id()).collect(); + bail!("Zero probability for subjects: {:?}", bad_subjects); + } + + Ok(()) + } +} diff --git a/src/api/estimation_problem.rs b/src/api/estimation_problem.rs index 8c4900028..167805fb0 100644 --- a/src/api/estimation_problem.rs +++ b/src/api/estimation_problem.rs @@ -78,7 +78,16 @@ impl<'de> Deserialize<'de> for EstimationMethod { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum NonparametricMethod { Npag(NpagOptions), + 
Npbo(NpboOptions), + Npcat(NpcatOptions), + Npcma(NpcmaOptions), Npod(NpodOptions), + Npopt(NpoptOptions), + Nppso(NppsoOptions), + Npsah(NpsahOptions), + Npsah2(Npsah2Options), + Nexus(NexusOptions), + Npxo(NpxoOptions), Postprob(PostProbOptions), } @@ -86,7 +95,16 @@ impl NonparametricMethod { pub fn algorithm(self) -> Algorithm { match self { NonparametricMethod::Npag(_) => Algorithm::NPAG, + NonparametricMethod::Npbo(_) => Algorithm::NPBO, + NonparametricMethod::Npcat(_) => Algorithm::NPCAT, + NonparametricMethod::Npcma(_) => Algorithm::NPCMA, NonparametricMethod::Npod(_) => Algorithm::NPOD, + NonparametricMethod::Npopt(_) => Algorithm::NPOPT, + NonparametricMethod::Nppso(_) => Algorithm::NPPSO, + NonparametricMethod::Npsah(_) => Algorithm::NPSAH, + NonparametricMethod::Npsah2(_) => Algorithm::NPSAH2, + NonparametricMethod::Nexus(_) => Algorithm::NEXUS, + NonparametricMethod::Npxo(_) => Algorithm::NPXO, NonparametricMethod::Postprob(_) => Algorithm::POSTPROB, } } @@ -95,7+113,16 @@ impl NonparametricMethod { pub fn name(&self) -> &'static str { match self { NonparametricMethod::Npag(_) => "npag", + NonparametricMethod::Npbo(_) => "npbo", + NonparametricMethod::Npcat(_) => "npcat", + NonparametricMethod::Npcma(_) => "npcma", NonparametricMethod::Npod(_) => "npod", + NonparametricMethod::Npopt(_) => "npopt", + NonparametricMethod::Nppso(_) => "nppso", + NonparametricMethod::Npsah(_) => "npsah", + NonparametricMethod::Npsah2(_) => "npsah2", + NonparametricMethod::Nexus(_) => "nexus", + NonparametricMethod::Npxo(_) => "npxo", NonparametricMethod::Postprob(_) => "postprob", } } @@ -104,7 +131,16 @@ impl NonparametricMethod { pub fn from_name(name: &str) -> Option<Self> { match name.to_lowercase().as_str() { "npag" => Some(NonparametricMethod::Npag(NpagOptions)), + "npbo" => Some(NonparametricMethod::Npbo(NpboOptions)), + "npcat" => Some(NonparametricMethod::Npcat(NpcatOptions)), + "npcma" => Some(NonparametricMethod::Npcma(NpcmaOptions)), "npod" => 
Some(NonparametricMethod::Npod(NpodOptions)), + "npopt" => Some(NonparametricMethod::Npopt(NpoptOptions)), + "nppso" => Some(NonparametricMethod::Nppso(NppsoOptions)), + "npsah" => Some(NonparametricMethod::Npsah(NpsahOptions)), + "npsah2" => Some(NonparametricMethod::Npsah2(Npsah2Options)), + "nexus" => Some(NonparametricMethod::Nexus(NexusOptions)), + "npxo" => Some(NonparametricMethod::Npxo(NpxoOptions)), "postprob" => Some(NonparametricMethod::Postprob(PostProbOptions)), _ => None, } @@ -118,9 +154,36 @@ impl NonparametricMethod { #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct NpagOptions; +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct NpboOptions; + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct NpcatOptions; + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct NpcmaOptions; + #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct NpodOptions; +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct NpoptOptions; + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct NppsoOptions; + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct NpsahOptions; + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct Npsah2Options; + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct NexusOptions; + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct NpxoOptions; + #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct PostProbOptions; diff --git a/src/api/mod.rs b/src/api/mod.rs index 7eda04367..8dd599b32 100644 --- a/src/api/mod.rs +++ b/src/api/mod.rs @@ -6,8 +6,9 @@ pub mod saem_config; pub use 
estimation_problem::{ AlgorithmTuning, ConvergenceOptions, EstimationMethod, EstimationProblem, - EstimationProblemBuilder, LoggingLevel, LoggingOptions, NonparametricMethod, NpagOptions, - NpodOptions, OutputPlan, PostProbOptions, RuntimeOptions, + EstimationProblemBuilder, LoggingLevel, LoggingOptions, NexusOptions, NonparametricMethod, + NpagOptions, NpboOptions, NpcatOptions, NpcmaOptions, NpodOptions, NpoptOptions, NppsoOptions, + Npsah2Options, NpsahOptions, NpxoOptions, OutputPlan, PostProbOptions, RuntimeOptions, }; pub use fit::{fit, fit_with_progress}; pub use model_definition::{ModelDefinition, ModelDefinitionBuilder}; diff --git a/src/lib.rs b/src/lib.rs index 64a2c30b5..4bcebea6e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,6 @@ //! PMcore is a framework for developing and running population pharmacokinetic algorithms. //! -//! The structure branch keeps the refactored platform surface together with the baseline -//! non-parametric workflows that existed on `main`. +//! This branch layers the expanded non-parametric family onto the structure-branch baseline. //! //! # Algorithm Types //! @@ -10,6 +9,7 @@ //! - NPAG (Non-Parametric Adaptive Grid) //! - NPOD (Non-Parametric Optimal Design) //! - POSTPROB (Posterior probability reweighting) +//! - NPBO, NPCAT, NPCMA, NPOPT, NPPSO, NPSAH, NPSAH2, NPXO, NEXUS //! //! # Public API //! 
@@ -63,9 +63,10 @@ pub mod prelude { pub use crate::api::fit_with_progress; pub use crate::api::{ AlgorithmTuning, ConvergenceOptions, EstimationMethod, EstimationProblem, FitProgress, - LoggingLevel, LoggingOptions, ModelDefinition, NonparametricCycleProgress, - NonparametricMethod, NpagOptions, NpodOptions, OutputPlan, PostProbOptions, - RuntimeOptions, + LoggingLevel, LoggingOptions, ModelDefinition, NexusOptions, NonparametricCycleProgress, + NonparametricMethod, NpagOptions, NpboOptions, NpcatOptions, NpcmaOptions, NpodOptions, + NpoptOptions, NppsoOptions, Npsah2Options, NpsahOptions, NpxoOptions, OutputPlan, + PostProbOptions, RuntimeOptions, }; pub use crate::compile::{CompiledProblem, DesignContext, ObservationIndex}; pub use crate::estimation::nonparametric::{ @@ -86,8 +87,8 @@ pub mod prelude { pub use pharmsol; - pub use crate::estimation::nonparametric::{read_prior, Prior}; pub use crate::api::SaemConfig; + pub use crate::estimation::nonparametric::{read_prior, Prior}; pub mod simulator { pub use pharmsol::prelude::simulator::*; diff --git a/src/results/fit_result.rs b/src/results/fit_result.rs index e7d76fb1c..daa0db4ce 100644 --- a/src/results/fit_result.rs +++ b/src/results/fit_result.rs @@ -1,8 +1,8 @@ use anyhow::Result; use pharmsol::Equation; -use crate::estimation::nonparametric::NonparametricWorkspace; use crate::estimation::nonparametric; +use crate::estimation::nonparametric::NonparametricWorkspace; use crate::results::{ nonparametric_artifacts, nonparametric_diagnostics, nonparametric_predictions, ArtifactIndex, DiagnosticsBundle, FitSummary, IndividualSummary, PopulationSummary, PredictionsBundle, diff --git a/tests/api_smoke_tests.rs b/tests/api_smoke_tests.rs index b7cb68ba3..5f8943774 100644 --- a/tests/api_smoke_tests.rs +++ b/tests/api_smoke_tests.rs @@ -150,7 +150,10 @@ fn test_problem_compile_preserves_runtime_configuration() -> Result<()> { assert!(!compiled.runtime_options().progress); assert_eq!(compiled.runtime_options().idelta, 
0.5); assert_eq!(compiled.runtime_options().tad, 24.0); - assert_eq!(compiled.runtime_options().logging.level, LoggingLevel::Debug); + assert_eq!( + compiled.runtime_options().logging.level, + LoggingLevel::Debug + ); assert!(compiled.runtime_options().logging.write); assert!(!compiled.runtime_options().logging.stdout); assert_eq!(compiled.runtime_options().convergence.likelihood, 1e-5); @@ -198,4 +201,4 @@ fn test_problem_can_initialize_logs_without_old_settings_api() -> Result<()> { problem.initialize_logs()?; Ok(()) -} \ No newline at end of file +}