|
1 | | -# dfextension/quantile_fit_nd/bench_quantile_fit_nd.py |
2 | | -# Simple timing benchmark across N points, distributions, and grid sizes. |
| 1 | +#!/usr/bin/env python3 |
| 2 | +# dfextensions/quantile_fit_nd/bench_quantile_fit_nd.py |
| 3 | +""" |
| 4 | +Benchmark speed + precision for fit_quantile_linear_nd with scaling checks. |
| 5 | +
|
| 6 | +- Distributions: uniform, poisson (via randomized PIT), gaussian |
| 7 | +- q_centers step = 0.025; dq = 0.05 (finer q grid -> more fit points per z-bin)
| 8 | +- Precision metrics per N: |
| 9 | + * rms_b := sqrt(mean( (b_meas(z) - b_exp(z))^2 )) over informative z-bins |
| 10 | + * rel_err_sigmaQ := mean relative error of the per-z-bin median sigma_Q vs truth
| 11 | + * rms_rt := round-trip inversion RMS for a random subset of events |
| 12 | +- Scaling check: |
| 13 | + * expect alpha_b ≈ -0.5 (rms_b ∝ N^{-1/2}) |
| 14 | + * expect alpha_rt ≈ 0.0 (rms_rt roughly flat; per-event noise) |
| 15 | +- Prints rms_b*sqrt(N) (and rt_rms*sqrt(N)) as constancy sanity checks.
| 16 | +- Optional: PNG plots (log-log), CSV export, memory profiling, strict assertions. |
| 17 | +""" |
| 18 | + |
| 19 | +import argparse |
| 20 | +import json |
| 21 | +import warnings |
| 22 | +from math import erf, sqrt |
3 | 23 | import time |
4 | 24 | import numpy as np |
5 | 25 | import pandas as pd |
6 | 26 |
|
7 | | -from dfextensions.quantile_fit_nd.quantile_fit_nd import fit_quantile_linear_nd |
| 27 | +from dfextensions.quantile_fit_nd.quantile_fit_nd import ( |
| 28 | + fit_quantile_linear_nd, |
| 29 | + QuantileEvaluator, |
| 30 | +) |
| 31 | +from dfextensions.quantile_fit_nd.utils import discrete_to_uniform_rank_poisson |
| 32 | + |
| 33 | +RNG = np.random.default_rng(123456) |
8 | 34 |
|
9 | | -RNG = np.random.default_rng(1234) |
10 | 35 |
|
| 36 | +# ----------------------- Synthetic data generation ----------------------- |
11 | 37 |
|
12 | | -def gen_data(n: int, dist: str = "uniform", sigma_X: float = 0.5): |
| 38 | +def gen_Q_from_distribution(dist: str, n: int, *, lam: float) -> np.ndarray: |
13 | 39 | if dist == "uniform": |
14 | | - Q = RNG.uniform(0, 1, size=n) |
| 40 | + return RNG.uniform(0.0, 1.0, size=n) |
15 | 41 | elif dist == "poisson": |
16 | | - lam = 20.0 |
17 | | - m = RNG.poisson(lam, size=n) |
18 | | - from math import erf |
19 | | - z = (m + 0.5 - lam) / np.sqrt(max(lam, 1e-6)) |
20 | | - Q = 0.5 * (1.0 + np.array([erf(zi / np.sqrt(2)) for zi in z])) |
21 | | - Q = np.clip(Q, 0, 1) |
| 42 | + k = RNG.poisson(lam, size=n) |
| 43 | + return discrete_to_uniform_rank_poisson(k, lam, mode="randomized", rng=RNG) |
22 | 44 | elif dist == "gaussian": |
23 | | - g = RNG.normal(0.0, 1.0, size=n) |
24 | | - from math import erf |
25 | | - Q = 0.5 * (1.0 + np.array([erf(gi / np.sqrt(2)) for gi in g])) |
26 | | - Q = np.clip(Q, 0, 1) |
| 45 | + z = RNG.normal(0.0, 1.0, size=n) # standard normal |
| 46 | +        cdf = 0.5 * (1.0 + np.array([erf(zi / sqrt(2.0)) for zi in z]))
| 47 | + return np.clip(cdf, 0.0, 1.0) |
27 | 48 | else: |
28 | | - raise ValueError |
| 49 | + raise ValueError(f"unknown dist {dist}") |
29 | 50 |
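| | +
| | +# Note: discrete_to_uniform_rank_poisson is assumed to implement the
| | +# randomized PIT  U = F(k-1) + V*(F(k) - F(k-1)),  V ~ Uniform(0,1),
| | +# which maps discrete Poisson counts to exactly uniform ranks. A minimal
| | +# self-contained equivalent (using scipy.stats.poisson.cdf for F) would be:
| | +#
| | +#   from scipy.stats import poisson
| | +#   def randomized_pit_poisson(k, lam, rng):
| | +#       lo = poisson.cdf(k - 1, lam)   # F(k-1); equals 0.0 where k == 0
| | +#       hi = poisson.cdf(k, lam)       # F(k)
| | +#       return lo + rng.uniform(size=k.shape) * (hi - lo)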
|
30 | | - z = np.clip(RNG.normal(0.0, 5.0, size=n), -10, 10) |
31 | | - a = 10.0 + 0.5 * z |
32 | | - b = (50.0 + 2.0 * z / 10.0).clip(min=5.0) |
33 | | - X = a + b * Q + RNG.normal(0.0, sigma_X, size=n) |
34 | | - df = pd.DataFrame({"channel_id": "bench", "Q": Q, "X": X, "z_vtx": z, "is_outlier": False}) |
35 | | - return df |
36 | 51 |
|
| 52 | +def gen_synthetic_df( |
| 53 | + n: int, |
| 54 | + dist: str, |
| 55 | + *, |
| 56 | + lam: float, |
| 57 | + z_sigma_cm: float = 5.0, |
| 58 | + z_range_cm: float = 10.0, |
| 59 | + sigma_X_given_Q: float = 0.5, |
| 60 | + a0: float = 10.0, |
| 61 | + a1: float = 0.5, |
| 62 | + b0: float = 50.0, |
| 63 | + b1: float = 2.0, |
| 64 | +) -> tuple[pd.DataFrame, dict]: |
| 65 | + Q = gen_Q_from_distribution(dist, n, lam=lam) |
| 66 | + z = np.clip(RNG.normal(0.0, z_sigma_cm, size=n), -z_range_cm, z_range_cm) |
| 67 | + a_true = a0 + a1 * z |
| 68 | + b_true = (b0 + b1 * z / max(z_range_cm, 1e-6)).clip(min=5.0) |
| 69 | + X = a_true + b_true * Q + RNG.normal(0.0, sigma_X_given_Q, size=n) |
| 70 | + df = pd.DataFrame({ |
| 71 | + "channel_id": np.repeat("ch0", n), |
| 72 | + "Q": Q, |
| 73 | + "X": X, |
| 74 | + "z_vtx": z, |
| 75 | + "is_outlier": np.zeros(n, dtype=bool), |
| 76 | + }) |
| 77 | + truth = { |
| 78 | + "a0": a0, "a1": a1, |
| 79 | + "b0": b0, "b1": b1, |
| 80 | + "sigma_X_given_Q": sigma_X_given_Q, |
| 81 | + "z_range": z_range_cm, |
| 82 | + "lam": lam |
| 83 | + } |
| 84 | + return df, truth |
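| | +
| | +# Truth model generated above: X = a(z) + b(z)*Q + eps, with
| | +#   a(z) = a0 + a1*z,  b(z) = max(b0 + b1*z/z_range, 5.0),
| | +#   eps ~ N(0, sigma_X_given_Q^2),
| | +# so per-z-bin fitted slopes b can be compared against b_true directly.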
37 | 85 |
|
38 | | -def run_one(n, dist, q_bins=11, z_bins=10): |
39 | | - df = gen_data(n, dist=dist) |
40 | | - t0 = time.perf_counter() |
41 | | - table = fit_quantile_linear_nd( |
42 | | - df, |
43 | | - channel_key="channel_id", |
44 | | - q_centers=np.linspace(0, 1, q_bins), |
| 86 | + |
| 87 | +# ----------------------- Helpers for expectations ----------------------- |
| 88 | + |
| 89 | +def _edges_from_centers(centers: np.ndarray) -> np.ndarray: |
| 90 | + mid = 0.5 * (centers[1:] + centers[:-1]) |
| 91 | + first = centers[0] - (mid[0] - centers[0]) |
| 92 | + last = centers[-1] + (centers[-1] - mid[-1]) |
| 93 | + return np.concatenate([[first], mid, [last]]) |
| 94 | + |
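| | +# Worked example: centers [0.0, 1.0, 3.0] -> midpoints [0.5, 2.0] ->
| | +# edges [-0.5, 0.5, 2.0, 4.0]; each end edge mirrors the gap between the
| | +# outermost center and its neighbouring midpoint.
| | +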
| 95 | + |
| 96 | +def expected_b_per_zbin_from_sample(df: pd.DataFrame, z_edges: np.ndarray, truth: dict) -> np.ndarray: |
| 97 | + z_vals = df["z_vtx"].to_numpy(np.float64) |
| 98 | + b_true_all = (truth["b0"] + truth["b1"] * z_vals / max(truth["z_range"], 1e-6)).clip(min=5.0) |
| 99 | + b_expected = [] |
| 100 | +    for i in range(len(z_edges) - 1):
| | +        # half-open bins [lo, hi) so interior edges are not double-counted;
| | +        # the last bin is closed to keep events exactly at the top edge
| 101 | +        hi_ok = (z_vals < z_edges[i + 1]) if i < len(z_edges) - 2 else (z_vals <= z_edges[i + 1])
| | +        m = (z_vals >= z_edges[i]) & hi_ok
| 102 | +        b_expected.append(np.mean(b_true_all[m]) if m.sum() > 0 else np.nan)
| 103 | + return np.array(b_expected, dtype=np.float64) |
| 104 | + |
| 105 | + |
| 106 | +def weights_from_fit_stats(col: pd.Series) -> np.ndarray: |
| 107 | + w = [] |
| 108 | + for s in col: |
| 109 | + try: |
| 110 | +            d = s if isinstance(s, dict) else json.loads(s)
| 111 | + except Exception: |
| 112 | + d = {} |
| 113 | + w.append(d.get("n_used", np.nan)) |
| 114 | + return np.array(w, dtype=float) |
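| | +
| | +# fit_stats entries are assumed to be JSON strings such as
| | +# '{"n_used": 412, ...}' emitted by fit_quantile_linear_nd; rows without a
| | +# parseable n_used get weight NaN and are dropped later by the (w > 0) mask.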
| 115 | + |
| 116 | + |
| 117 | +# ----------------------------- Benchmark core ----------------------------- |
| 118 | + |
| 119 | +def run_one( |
| 120 | + dist: str, |
| 121 | + n: int, |
| 122 | + *, |
| 123 | + q_step=0.025, |
45 | 124 | dq=0.05, |
46 | | - nuisance_axes={"z": "z_vtx"}, |
47 | | - n_bins_axes={"z": z_bins}, |
48 | | - mask_col="is_outlier", |
49 | | - b_min_option="auto", |
50 | | - fit_mode="ols", |
51 | | - kappa_w=1.3, |
| 125 | + z_bins=20, |
| 126 | + lam=50.0, |
| 127 | + sample_fraction=0.006, |
| 128 | + mem_profile: bool = False, |
| 129 | +) -> dict: |
| 130 | + df, truth = gen_synthetic_df(n, dist, lam=lam) |
| 131 | + q_centers = np.arange(0.0, 1.0 + 1e-12, q_step) # 0..1 inclusive |
| 132 | + |
| 133 | + def _do_fit(): |
| 134 | + return fit_quantile_linear_nd( |
| 135 | + df, |
| 136 | + channel_key="channel_id", |
| 137 | + q_centers=q_centers, |
| 138 | + dq=dq, |
| 139 | + nuisance_axes={"z": "z_vtx"}, |
| 140 | + n_bins_axes={"z": z_bins}, |
| 141 | + ) |
| 142 | + |
| 143 | + t0 = time.perf_counter() |
| 144 | + if mem_profile: |
| 145 | + try: |
| 146 | + from memory_profiler import memory_usage |
| 147 | + mem_trace, table = memory_usage((_do_fit, ), retval=True, max_iterations=1) |
| 148 | + peak_mem_mb = float(np.max(mem_trace)) if len(mem_trace) else np.nan |
| 149 | + except Exception as e: |
| 150 | + warnings.warn(f"memory_profiler unavailable or failed: {e}") |
| 151 | + table = _do_fit() |
| 152 | + peak_mem_mb = np.nan |
| 153 | + else: |
| 154 | + table = _do_fit() |
| 155 | + peak_mem_mb = np.nan |
| 156 | + t_fit = time.perf_counter() - t0 |
| 157 | + |
| 158 | + # Expected b per z-bin (from sample) |
| 159 | + z_centers = np.sort(table["z_center"].unique()) |
| 160 | + z_edges = _edges_from_centers(z_centers) |
| 161 | + b_exp = expected_b_per_zbin_from_sample(df, z_edges, truth) |
| 162 | + |
| 163 | + # Measured b per z-bin (weighted by window n_used) |
| 164 | + b_meas_w = np.full_like(b_exp, np.nan, dtype=float) |
| 165 | + for i, zc in enumerate(z_centers): |
| 166 | + g = table[table["z_center"] == zc] |
| 167 | + if g.empty: |
| 168 | + continue |
| 169 | + w = weights_from_fit_stats(g["fit_stats"]) |
| 170 | + ok = np.isfinite(g["b"].to_numpy()) & (w > 0) |
| 171 | + if ok.sum() == 0: |
| 172 | + continue |
| 173 | + bvals = g["b"].to_numpy()[ok] |
| 174 | + ww = w[ok] |
| 175 | + b_meas_w[i] = np.average(bvals, weights=ww) |
| 176 | + |
| 177 | + # Informative mask |
| 178 | + m = np.isfinite(b_meas_w) & np.isfinite(b_exp) |
| 179 | + |
| 180 | + # Slope error metrics |
| 181 | + rms_b = float(np.sqrt(np.nanmean((b_meas_w[m] - b_exp[m]) ** 2))) if m.any() else np.nan |
| 182 | + |
| 183 | + # sigma_Q check (median rel err by z-bin) |
| 184 | + sigma_q_true = truth["sigma_X_given_Q"] / np.maximum(1e-9, b_exp) |
| 185 | + sigma_q_meas = table.groupby("z_center")["sigma_Q"].median().reindex(z_centers).to_numpy() |
| 186 | + mk = np.isfinite(sigma_q_true) & np.isfinite(sigma_q_meas) |
| 187 | + rel_err_sigmaQ = float(np.nanmean(np.abs(sigma_q_meas[mk] - sigma_q_true[mk]) / |
| 188 | + np.maximum(1e-9, sigma_q_true[mk]))) if mk.any() else np.nan |
| 189 | + |
| 190 | + # Round-trip inversion RMS (sample to limit CPU) |
| 191 | + k = max(1, int(round(sample_fraction * len(df)))) |
| 192 | + idx = RNG.choice(len(df), size=min(k, len(df)), replace=False) |
| 193 | + evalr = QuantileEvaluator(table) |
| 194 | + resid = [] |
| 195 | + for ii in idx: |
| 196 | + z = float(df.loc[ii, "z_vtx"]) |
| 197 | + q_true = float(df.loc[ii, "Q"]) |
| 198 | + x = float(df.loc[ii, "X"]) |
| 199 | + q_hat = evalr.invert_rank(x, channel_id="ch0", z=z) |
| 200 | + resid.append(q_hat - q_true) |
| 201 | + resid = np.array(resid, dtype=float) |
| 202 | + rms_rt = float(np.sqrt(np.mean(np.square(resid)))) if resid.size else np.nan |
| 203 | + |
| 204 | + return dict( |
| 205 | + dist=dist, N=int(n), |
| 206 | + lam=float(lam), |
| 207 | + q_step=float(q_step), dq=float(dq), z_bins=int(z_bins), |
| 208 | + t_fit=float(t_fit), |
| 209 | + rms_b=rms_b, |
| 210 | + rel_err_sigmaQ=rel_err_sigmaQ, |
| 211 | + rms_rt=rms_rt, |
| 212 | + n_z_inf=int(np.sum(m)), |
| 213 | + peak_mem_mb=peak_mem_mb, |
52 | 214 | ) |
53 | | - dt = time.perf_counter() - t0 |
54 | | - return dt, len(table) |
| 215 | + |
| 216 | + |
| 217 | +def fit_log_slope(xs: np.ndarray, ys: np.ndarray) -> float: |
| 218 | + # Fit log(ys) ~ alpha * log(xs) + c ; return alpha |
| 219 | + m = np.isfinite(xs) & np.isfinite(ys) & (ys > 0) |
| 220 | + if m.sum() < 2: |
| 221 | + warnings.warn("Insufficient points for scaling slope fit.", RuntimeWarning) |
| 222 | + return np.nan |
| 223 | + lx = np.log(xs[m].astype(float)) |
| 224 | + ly = np.log(ys[m].astype(float)) |
| 225 | + A = np.column_stack([lx, np.ones_like(lx)]) |
| 226 | + sol, _, _, _ = np.linalg.lstsq(A, ly, rcond=None) |
| 227 | + return float(sol[0]) |
| 228 | + |
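| | +# Quick self-check (illustrative, not executed by the benchmark): a pure
| | +# power law recovers its exponent, e.g.
| | +#   xs = np.array([1e3, 1e4, 1e5])
| | +#   assert abs(fit_log_slope(xs, 3.0 * xs ** -0.5) + 0.5) < 1e-9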
| 229 | + |
| 230 | +def _plot_scaling(res: pd.DataFrame, dists: list[str]): |
| 231 | + try: |
| 232 | + import matplotlib.pyplot as plt |
| 233 | + except Exception as e: |
| 234 | + warnings.warn(f"--plot requested but matplotlib unavailable: {e}") |
| 235 | + return |
| 236 | + for dist in dists: |
| 237 | + sub = res[res["dist"] == dist].sort_values("N") |
| 238 | + if sub.empty: |
| 239 | + continue |
| 240 | + fig, ax = plt.subplots(figsize=(6.2, 4.2), dpi=140) |
| 241 | + ax.loglog(sub["N"], sub["rms_b"], marker="o", label="rms_b") |
| 242 | + ax.loglog(sub["N"], sub["rms_rt"], marker="s", label="rms_rt") |
| 243 | + ax.set_title(f"Scaling — {dist}") |
| 244 | + ax.set_xlabel("N events") |
| 245 | + ax.set_ylabel("Error") |
| 246 | + ax.grid(True, which="both", ls=":") |
| 247 | + ax.legend() |
| 248 | + fig.tight_layout() |
| 249 | + fig.savefig(f"bench_scaling_{dist}.png") |
| 250 | + plt.close(fig) |
55 | 251 |
|
56 | 252 |
|
57 | 253 | def main(): |
58 | | - Ns = [5_000, 50_000, 200_000] |
59 | | - dists = ["uniform", "poisson", "gaussian"] |
60 | | - print("N, dist, q_bins, z_bins, secs, rows") |
61 | | - for n in Ns: |
62 | | - for dist in dists: |
63 | | - dt, rows = run_one(n, dist, q_bins=11, z_bins=10) |
64 | | - print(f"{n:>8}, {dist:>8}, {11:>2}, {10:>2}, {dt:7.3f}, {rows:>6}") |
| 254 | + ap = argparse.ArgumentParser() |
| 255 | + ap.add_argument("--Ns", type=str, default="2000,5000,10000,20000,50000", |
| 256 | + help="comma-separated N list") |
| 257 | + ap.add_argument("--dists", type=str, default="uniform,poisson,gaussian", |
| 258 | + help="comma-separated distributions") |
| 259 | + ap.add_argument("--lam", type=float, default=50.0, help="Poisson mean λ") |
| 260 | + ap.add_argument("--q_step", type=float, default=0.025, help="q_center step") |
| 261 | + ap.add_argument("--dq", type=float, default=0.05, help="Δq window") |
| 262 | + ap.add_argument("--z_bins", type=int, default=20, help="# z bins") |
| 263 | + ap.add_argument("--sample_fraction", type=float, default=0.006, help="fraction for round-trip sampling") |
| 264 | + ap.add_argument("--plot", action="store_true", help="save log-log plots as PNGs") |
| 265 | + ap.add_argument("--save_csv", type=str, default="", help="path to save CSV results") |
| 266 | + ap.add_argument("--mem_profile", action="store_true", help="profile peak memory (if memory_profiler available)") |
| 267 | + ap.add_argument("--strict", action="store_true", help="raise AssertionError on scaling deviations") |
| 268 | + ap.add_argument("--scaling_tol", type=float, default=0.20, help="tolerance for |alpha_b + 0.5|") |
| 269 | + ap.add_argument("--rt_tol", type=float, default=0.10, help="tolerance for |alpha_rt - 0.0|") |
| 270 | + args = ap.parse_args() |
| 271 | + |
| 272 | + Ns = [int(s) for s in args.Ns.split(",") if s.strip()] |
| 273 | + dists = [s.strip() for s in args.dists.split(",") if s.strip()] |
| 274 | + |
| 275 | + print(f"Distributions: {', '.join(dists)} (Poisson uses PIT, λ={args.lam})") |
| 276 | + print(f"q_step={args.q_step}, dq={args.dq}, z_bins={args.z_bins}, sample_fraction={args.sample_fraction}\n") |
| 277 | + |
| 278 | + rows = [] |
| 279 | + for dist in dists: |
| 280 | + print(f"=== Benchmark: {dist} ===") |
| 281 | + for N in Ns: |
| 282 | + r = run_one( |
| 283 | + dist, N, |
| 284 | + q_step=args.q_step, dq=args.dq, z_bins=args.z_bins, |
| 285 | + lam=args.lam, sample_fraction=args.sample_fraction, |
| 286 | + mem_profile=args.mem_profile, |
| 287 | + ) |
| 288 | + rows.append(r) |
| 289 | + print(f"N={N:7d} | t_fit={r['t_fit']:.3f}s | rms_b={r['rms_b']:.5f} " |
| 290 | + f"(rms_b*√N={r['rms_b']*sqrt(N):.5f}) | σQ_rel={r['rel_err_sigmaQ']:.3f} | " |
| 291 | + f"rt_rms={r['rms_rt']:.5f} (rt_rms*√N={r['rms_rt']*sqrt(N):.5f}) | " |
| 292 | + f"z_inf={r['n_z_inf']} | mem={r['peak_mem_mb']:.1f}MB") |
| 293 | + |
| 294 | + res = pd.DataFrame(rows) |
| 295 | + |
| 296 | + # Scaling summary per distribution |
| 297 | + print("\n=== Scaling summary (expect: α_b ≈ -0.5, α_rt ≈ 0.0) ===") |
| 298 | + summary_rows = [] |
| 299 | + for dist in dists: |
| 300 | + sub = res[res["dist"] == dist].sort_values("N") |
| 301 | + a_b = fit_log_slope(sub["N"].to_numpy(), sub["rms_b"].to_numpy()) |
| 302 | + a_rt = fit_log_slope(sub["N"].to_numpy(), sub["rms_rt"].to_numpy()) |
| 303 | + const_b = (sub["rms_b"] * np.sqrt(sub["N"])).to_numpy() |
| 304 | + print(f"{dist:8s} | α_b={a_b: .3f} (→ -0.5) | α_rt={a_rt: .3f} (→ 0.0) " |
| 305 | + f"| mean(rms_b√N)={np.nanmean(const_b):.5f}") |
| 306 | + summary_rows.append({"dist": dist, "alpha_rms_b": a_b, "alpha_rms_rt": a_rt}) |
| 307 | + summary = pd.DataFrame(summary_rows) |
| 308 | + |
| 309 | + # CSV export |
| 310 | + if args.save_csv: |
| 311 | + res.to_csv(args.save_csv, index=False) |
| 312 | + print(f"\nSaved CSV to: {args.save_csv}") |
| 313 | + |
| 314 | + # Plots |
| 315 | + if args.plot: |
| 316 | + _plot_scaling(res, dists) |
| 317 | + print("Saved PNG plots: bench_scaling_{dist}.png") |
| 318 | + |
| 319 | + # Checks (warn by default; --strict to raise) |
| 320 | + for dist in dists: |
| 321 | + a_b = float(summary[summary["dist"] == dist]["alpha_rms_b"].iloc[0]) |
| 322 | + a_rt = float(summary[summary["dist"] == dist]["alpha_rms_rt"].iloc[0]) |
| 323 | + ok_b = np.isfinite(a_b) and (abs(a_b + 0.5) <= args.scaling_tol) |
| 324 | + ok_rt = np.isfinite(a_rt) and (abs(a_rt - 0.0) <= args.rt_tol) |
| 325 | + msg = f"SCALING [{dist}] α_b={a_b:.3f} vs -0.5 (tol {args.scaling_tol}) | α_rt={a_rt:.3f} vs 0.0 (tol {args.rt_tol})" |
| 326 | + if ok_b and ok_rt: |
| 327 | + print("PASS - " + msg) |
| 328 | + else: |
| 329 | + if args.strict: |
| 330 | + raise AssertionError("FAIL - " + msg) |
| 331 | + warnings.warn("WARN - " + msg) |
65 | 332 |
|
66 | 333 |
|
67 | 334 | if __name__ == "__main__":
68 | 335 |     main()