From 38122c493136f1875cb8d49162cf29826f73bf48 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 16:29:16 +0000 Subject: [PATCH 1/6] Show absolute z-score on benchmarks Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 29 +++++++++++++++++++---------- vortex-bench/src/measurements.rs | 26 ++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 1802325c4cd..5e01ffaddd1 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -63,6 +63,10 @@ def extract_dataset_key(df): improvement_threshold = 1.0 - (threshold_pct / 100.0) # e.g., 0.7 for 30%, 0.9 for 10% regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% +# After merge with suffixes, z-score columns become abs_z_score_base and abs_z_score_pr +has_z_base = "abs_z_score_base" in df3.columns +has_z_pr = "abs_z_score_pr" in df3.columns + # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] df3["remark"] = pd.Series([""] * len(df3)) @@ -183,16 +187,21 @@ def format_performance(ratio, target_name): ) # Build table -table_df = pd.DataFrame( - { - "name": df3["name"], - f"PR {pr_commit_id[:8]}": df3["value_pr"], - f"base {base_commit_id[:8]}": df3["value_base"], - "ratio (PR/base)": df3["ratio"], - "unit": df3["unit_base"], - "remark": df3["remark"], - } -) +table_dict = { + "name": df3["name"], + f"PR {pr_commit_id[:8]}": df3["value_pr"], + f"base {base_commit_id[:8]}": df3["value_base"], + "ratio (PR/base)": df3["ratio"], + "unit": df3["unit_base"], +} + +if has_z_pr: + table_dict["|z| PR"] = df3["abs_z_score_pr"] +if has_z_base: + table_dict["|z| base"] = df3["abs_z_score_base"] + +table_dict["remark"] = df3["remark"] +table_df = pd.DataFrame(table_dict) # Output complete formatted markdown print("\n".join(summary_lines)) diff --git a/vortex-bench/src/measurements.rs 
b/vortex-bench/src/measurements.rs index f49349cd95e..af8a05cca5f 100644 --- a/vortex-bench/src/measurements.rs +++ b/vortex-bench/src/measurements.rs @@ -272,6 +272,27 @@ impl QueryMeasurement { ) } } + + /// Compute |z-score| = |median - mean| / stddev for the runs. + /// Returns `None` if fewer than 2 runs (stddev is undefined). + pub fn abs_z_score(&self) -> Option<f64> { + let n = self.runs.len(); + if n < 2 { + return None; + } + + let nanos: Vec<f64> = self.runs.iter().map(|d| d.as_nanos() as f64).collect(); + let mean = nanos.iter().sum::<f64>() / n as f64; + let variance = nanos.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (n - 1) as f64; + let stddev = variance.sqrt(); + + if stddev == 0.0 { + return Some(0.0); + } + + let median = self.median_run().as_nanos() as f64; + Some(((median - mean) / stddev).abs()) + } } #[derive(Serialize, Deserialize)] @@ -282,6 +303,10 @@ pub struct QueryMeasurementJson { pub unit: String, pub value: u128, pub all_runtimes: Vec<u128>, + /// Absolute z-score of the median relative to the mean: |median - mean| / stddev. + /// Indicates how representative the reported median is. `None` when fewer than 2 runs. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub abs_z_score: Option<f64>, pub target: Target, pub commit_id: String, pub env_triple: TripleJson, @@ -313,6 +338,7 @@ impl ToJson for QueryMeasurement { unit: "ns".to_string(), value: self.median_run().as_nanos(), all_runtimes: self.runs.iter().map(|r| r.as_nanos()).collect_vec(), + abs_z_score: self.abs_z_score(), commit_id: GIT_COMMIT_ID.to_string(), target: self.target, env_triple: TripleJson { From 2aed98ad8f4443ef63326397c464f7c642f3493b Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 16:49:42 +0000 Subject: [PATCH 2/6] python all the things Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 28 +++++++++++++++++++++++++--- vortex-bench/src/measurements.rs | 26 -------------------------- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 5e01ffaddd1..81385371faa 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -63,9 +63,31 @@ def extract_dataset_key(df): improvement_threshold = 1.0 - (threshold_pct / 100.0) # e.g., 0.7 for 30%, 0.9 for 10% regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% -# After merge with suffixes, z-score columns become abs_z_score_base and abs_z_score_pr -has_z_base = "abs_z_score_base" in df3.columns -has_z_pr = "abs_z_score_pr" in df3.columns +def compute_abs_z_score(runtimes): + """Compute |median - mean| / stddev from a list of runtimes.""" + if not isinstance(runtimes, list) or len(runtimes) < 2: + return float("nan") + n = len(runtimes) + mean = sum(runtimes) / n + variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1) + stddev = math.sqrt(variance) + if stddev == 0: + return 0.0 + sorted_rt = sorted(runtimes) + if n % 2 == 1: + median = sorted_rt[n // 2] + else: + median = (sorted_rt[n // 2 - 1] + sorted_rt[n // 2]) / 2 + return abs((median - mean) / stddev) + + +# Compute 
|z-score| from all_runtimes when available +has_z_pr = "all_runtimes_pr" in df3.columns +has_z_base = "all_runtimes_base" in df3.columns +if has_z_pr: + df3["abs_z_score_pr"] = df3["all_runtimes_pr"].apply(compute_abs_z_score) +if has_z_base: + df3["abs_z_score_base"] = df3["all_runtimes_base"].apply(compute_abs_z_score) # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] diff --git a/vortex-bench/src/measurements.rs b/vortex-bench/src/measurements.rs index af8a05cca5f..f49349cd95e 100644 --- a/vortex-bench/src/measurements.rs +++ b/vortex-bench/src/measurements.rs @@ -272,27 +272,6 @@ impl QueryMeasurement { ) } } - - /// Compute |z-score| = |median - mean| / stddev for the runs. - /// Returns `None` if fewer than 2 runs (stddev is undefined). - pub fn abs_z_score(&self) -> Option<f64> { - let n = self.runs.len(); - if n < 2 { - return None; - } - - let nanos: Vec<f64> = self.runs.iter().map(|d| d.as_nanos() as f64).collect(); - let mean = nanos.iter().sum::<f64>() / n as f64; - let variance = nanos.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (n - 1) as f64; - let stddev = variance.sqrt(); - - if stddev == 0.0 { - return Some(0.0); - } - - let median = self.median_run().as_nanos() as f64; - Some(((median - mean) / stddev).abs()) - } } #[derive(Serialize, Deserialize)] @@ -303,10 +282,6 @@ pub struct QueryMeasurementJson { pub unit: String, pub value: u128, pub all_runtimes: Vec<u128>, - /// Absolute z-score of the median relative to the mean: |median - mean| / stddev. - /// Indicates how representative the reported median is. `None` when fewer than 2 runs. 
- #[serde(skip_serializing_if = "Option::is_none")] - pub abs_z_score: Option<f64>, pub target: Target, pub commit_id: String, pub env_triple: TripleJson, @@ -338,7 +313,6 @@ impl ToJson for QueryMeasurement { unit: "ns".to_string(), value: self.median_run().as_nanos(), all_runtimes: self.runs.iter().map(|r| r.as_nanos()).collect_vec(), - abs_z_score: self.abs_z_score(), commit_id: GIT_COMMIT_ID.to_string(), target: self.target, env_triple: TripleJson { From 4d587461d7838b61044015e39f90912d545896ff Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 16:53:42 +0000 Subject: [PATCH 3/6] ruff format Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 81385371faa..9b0d556cbe9 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -63,6 +63,7 @@ def extract_dataset_key(df): improvement_threshold = 1.0 - (threshold_pct / 100.0) # e.g., 0.7 for 30%, 0.9 for 10% regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% + def compute_abs_z_score(runtimes): """Compute |median - mean| / stddev from a list of runtimes.""" if not isinstance(runtimes, list) or len(runtimes) < 2: return float("nan") From 3d9f6926f9dbe16d5d2d9d8552c2f45252641e12 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 17:06:08 +0000 Subject: [PATCH 4/6] Fix z-score table formatting Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 9b0d556cbe9..cc06a544e24 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -219,9 +219,9 @@ def format_performance(ratio, target_name): } if has_z_pr: - table_dict["|z| PR"] = df3["abs_z_score_pr"] + table_dict["abs(z-score) PR"] = df3["abs_z_score_pr"] if 
has_z_base: - table_dict["|z| base"] = df3["abs_z_score_base"] + table_dict["abs(z-score) base"] = df3["abs_z_score_base"] table_dict["remark"] = df3["remark"] table_df = pd.DataFrame(table_dict) From 27e362feb3a1ce9cd3dd2a3141e366e7d1e4baae Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 5 Mar 2026 14:52:54 +0000 Subject: [PATCH 5/6] variance instead of z score Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index cc06a544e24..d6545b9fb60 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -64,31 +64,22 @@ def extract_dataset_key(df): regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% -def compute_abs_z_score(runtimes): - """Compute |median - mean| / stddev from a list of runtimes.""" +def compute_variance(runtimes): + """Compute sample variance from a list of runtimes.""" if not isinstance(runtimes, list) or len(runtimes) < 2: return float("nan") n = len(runtimes) mean = sum(runtimes) / n - variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1) - stddev = math.sqrt(variance) - if stddev == 0: - return 0.0 - sorted_rt = sorted(runtimes) - if n % 2 == 1: - median = sorted_rt[n // 2] - else: - median = (sorted_rt[n // 2 - 1] + sorted_rt[n // 2]) / 2 - return abs((median - mean) / stddev) + return sum((x - mean) ** 2 for x in runtimes) / (n - 1) -# Compute |z-score| from all_runtimes when available +# Compute variance from all_runtimes when available has_z_pr = "all_runtimes_pr" in df3.columns has_z_base = "all_runtimes_base" in df3.columns if has_z_pr: - df3["abs_z_score_pr"] = df3["all_runtimes_pr"].apply(compute_abs_z_score) + df3["variance_pr"] = df3["all_runtimes_pr"].apply(compute_variance) if has_z_base: - df3["abs_z_score_base"] = df3["all_runtimes_base"].apply(compute_abs_z_score) + 
df3["variance_base"] = df3["all_runtimes_base"].apply(compute_variance) # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] @@ -219,9 +210,9 @@ def format_performance(ratio, target_name): } if has_z_pr: - table_dict["abs(z-score) PR"] = df3["abs_z_score_pr"] + table_dict["variance PR"] = df3["variance_pr"] if has_z_base: - table_dict["abs(z-score) base"] = df3["abs_z_score_base"] + table_dict["variance base"] = df3["variance_base"] table_dict["remark"] = df3["remark"] table_df = pd.DataFrame(table_dict) From 08d7e5fa53cc4147624f3fbb0573443477f2b92d Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Fri, 6 Mar 2026 13:30:42 +0000 Subject: [PATCH 6/6] cv Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index d6545b9fb60..499294bfd42 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -64,22 +64,25 @@ def extract_dataset_key(df): regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% -def compute_variance(runtimes): - """Compute sample variance from a list of runtimes.""" +def compute_cv_pct(runtimes): + """Compute coefficient of variation (std_dev / mean * 100) as a percentage.""" if not isinstance(runtimes, list) or len(runtimes) < 2: return float("nan") n = len(runtimes) mean = sum(runtimes) / n - return sum((x - mean) ** 2 for x in runtimes) / (n - 1) + if mean == 0: + return float("nan") + variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1) + return (variance**0.5 / mean) * 100 -# Compute variance from all_runtimes when available +# Compute CV% from all_runtimes when available has_z_pr = "all_runtimes_pr" in df3.columns has_z_base = "all_runtimes_base" in df3.columns if has_z_pr: - df3["variance_pr"] = df3["all_runtimes_pr"].apply(compute_variance) + df3["cv_pct_pr"] = 
df3["all_runtimes_pr"].apply(compute_cv_pct) if has_z_base: - df3["variance_base"] = df3["all_runtimes_base"].apply(compute_variance) + df3["cv_pct_base"] = df3["all_runtimes_base"].apply(compute_cv_pct) # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] @@ -210,9 +213,9 @@ def format_performance(ratio, target_name): } if has_z_pr: - table_dict["variance PR"] = df3["variance_pr"] + table_dict["CV% PR"] = df3["cv_pct_pr"] if has_z_base: - table_dict["variance base"] = df3["variance_base"] + table_dict["CV% base"] = df3["cv_pct_base"] table_dict["remark"] = df3["remark"] table_df = pd.DataFrame(table_dict)