From 38122c493136f1875cb8d49162cf29826f73bf48 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 16:29:16 +0000 Subject: [PATCH 1/6] Show absolute z-score on benchmarks Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 29 +++++++++++++++++++---------- vortex-bench/src/measurements.rs | 26 ++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 1802325c4cd..5e01ffaddd1 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -63,6 +63,10 @@ def extract_dataset_key(df): improvement_threshold = 1.0 - (threshold_pct / 100.0) # e.g., 0.7 for 30%, 0.9 for 10% regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% +# After merge with suffixes, z-score columns become abs_z_score_base and abs_z_score_pr +has_z_base = "abs_z_score_base" in df3.columns +has_z_pr = "abs_z_score_pr" in df3.columns + # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] df3["remark"] = pd.Series([""] * len(df3)) @@ -183,16 +187,21 @@ def format_performance(ratio, target_name): ) # Build table -table_df = pd.DataFrame( - { - "name": df3["name"], - f"PR {pr_commit_id[:8]}": df3["value_pr"], - f"base {base_commit_id[:8]}": df3["value_base"], - "ratio (PR/base)": df3["ratio"], - "unit": df3["unit_base"], - "remark": df3["remark"], - } -) +table_dict = { + "name": df3["name"], + f"PR {pr_commit_id[:8]}": df3["value_pr"], + f"base {base_commit_id[:8]}": df3["value_base"], + "ratio (PR/base)": df3["ratio"], + "unit": df3["unit_base"], +} + +if has_z_pr: + table_dict["|z| PR"] = df3["abs_z_score_pr"] +if has_z_base: + table_dict["|z| base"] = df3["abs_z_score_base"] + +table_dict["remark"] = df3["remark"] +table_df = pd.DataFrame(table_dict) # Output complete formatted markdown print("\n".join(summary_lines)) diff --git a/vortex-bench/src/measurements.rs 
b/vortex-bench/src/measurements.rs index f49349cd95e..af8a05cca5f 100644 --- a/vortex-bench/src/measurements.rs +++ b/vortex-bench/src/measurements.rs @@ -272,6 +272,27 @@ impl QueryMeasurement { ) } } + + /// Compute |z-score| = |median - mean| / stddev for the runs. + /// Returns `None` if fewer than 2 runs (stddev is undefined). + pub fn abs_z_score(&self) -> Option<f64> { + let n = self.runs.len(); + if n < 2 { + return None; + } + + let nanos: Vec<f64> = self.runs.iter().map(|d| d.as_nanos() as f64).collect(); + let mean = nanos.iter().sum::<f64>() / n as f64; + let variance = nanos.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (n - 1) as f64; + let stddev = variance.sqrt(); + + if stddev == 0.0 { + return Some(0.0); + } + + let median = self.median_run().as_nanos() as f64; + Some(((median - mean) / stddev).abs()) + } } #[derive(Serialize, Deserialize)] @@ -282,6 +303,10 @@ pub struct QueryMeasurementJson { pub unit: String, pub value: u128, pub all_runtimes: Vec<u128>, + /// Absolute z-score of the median relative to the mean: |median - mean| / stddev. + /// Indicates how representative the reported median is. `None` when fewer than 2 runs. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub abs_z_score: Option<f64>, pub target: Target, pub commit_id: String, pub env_triple: TripleJson, @@ -313,6 +338,7 @@ impl ToJson for QueryMeasurement { unit: "ns".to_string(), value: self.median_run().as_nanos(), all_runtimes: self.runs.iter().map(|r| r.as_nanos()).collect_vec(), + abs_z_score: self.abs_z_score(), commit_id: GIT_COMMIT_ID.to_string(), target: self.target, env_triple: TripleJson { From 2aed98ad8f4443ef63326397c464f7c642f3493b Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 16:49:42 +0000 Subject: [PATCH 2/6] python all the things Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 28 +++++++++++++++++++++++++--- vortex-bench/src/measurements.rs | 26 -------------------------- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 5e01ffaddd1..81385371faa 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -63,9 +63,31 @@ def extract_dataset_key(df): improvement_threshold = 1.0 - (threshold_pct / 100.0) # e.g., 0.7 for 30%, 0.9 for 10% regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% -# After merge with suffixes, z-score columns become abs_z_score_base and abs_z_score_pr -has_z_base = "abs_z_score_base" in df3.columns -has_z_pr = "abs_z_score_pr" in df3.columns +def compute_abs_z_score(runtimes): + """Compute |median - mean| / stddev from a list of runtimes.""" + if not isinstance(runtimes, list) or len(runtimes) < 2: + return float("nan") + n = len(runtimes) + mean = sum(runtimes) / n + variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1) + stddev = math.sqrt(variance) + if stddev == 0: + return 0.0 + sorted_rt = sorted(runtimes) + if n % 2 == 1: + median = sorted_rt[n // 2] + else: + median = (sorted_rt[n // 2 - 1] + sorted_rt[n // 2]) / 2 + return abs((median - mean) / stddev) + + +# Compute 
|z-score| from all_runtimes when available +has_z_pr = "all_runtimes_pr" in df3.columns +has_z_base = "all_runtimes_base" in df3.columns +if has_z_pr: + df3["abs_z_score_pr"] = df3["all_runtimes_pr"].apply(compute_abs_z_score) +if has_z_base: + df3["abs_z_score_base"] = df3["all_runtimes_base"].apply(compute_abs_z_score) # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] diff --git a/vortex-bench/src/measurements.rs b/vortex-bench/src/measurements.rs index af8a05cca5f..f49349cd95e 100644 --- a/vortex-bench/src/measurements.rs +++ b/vortex-bench/src/measurements.rs @@ -272,27 +272,6 @@ impl QueryMeasurement { ) } } - - /// Compute |z-score| = |median - mean| / stddev for the runs. - /// Returns `None` if fewer than 2 runs (stddev is undefined). - pub fn abs_z_score(&self) -> Option<f64> { - let n = self.runs.len(); - if n < 2 { - return None; - } - - let nanos: Vec<f64> = self.runs.iter().map(|d| d.as_nanos() as f64).collect(); - let mean = nanos.iter().sum::<f64>() / n as f64; - let variance = nanos.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (n - 1) as f64; - let stddev = variance.sqrt(); - - if stddev == 0.0 { - return Some(0.0); - } - - let median = self.median_run().as_nanos() as f64; - Some(((median - mean) / stddev).abs()) - } } #[derive(Serialize, Deserialize)] @@ -303,10 +282,6 @@ pub struct QueryMeasurementJson { pub unit: String, pub value: u128, pub all_runtimes: Vec<u128>, - /// Absolute z-score of the median relative to the mean: |median - mean| / stddev. - /// Indicates how representative the reported median is. `None` when fewer than 2 runs. 
- #[serde(skip_serializing_if = "Option::is_none")] - pub abs_z_score: Option<f64>, pub target: Target, pub commit_id: String, pub env_triple: TripleJson, @@ -338,7 +313,6 @@ impl ToJson for QueryMeasurement { unit: "ns".to_string(), value: self.median_run().as_nanos(), all_runtimes: self.runs.iter().map(|r| r.as_nanos()).collect_vec(), - abs_z_score: self.abs_z_score(), commit_id: GIT_COMMIT_ID.to_string(), target: self.target, env_triple: TripleJson { From 4d587461d7838b61044015e39f90912d545896ff Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 16:53:42 +0000 Subject: [PATCH 3/6] ruff format Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 81385371faa..9b0d556cbe9 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -63,6 +63,7 @@ def extract_dataset_key(df): improvement_threshold = 1.0 - (threshold_pct / 100.0) # e.g., 0.7 for 30%, 0.9 for 10% regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% + def compute_abs_z_score(runtimes): """Compute |median - mean| / stddev from a list of runtimes.""" if not isinstance(runtimes, list) or len(runtimes) < 2: return float("nan") From 3d9f6926f9dbe16d5d2d9d8552c2f45252641e12 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 2 Mar 2026 17:06:08 +0000 Subject: [PATCH 4/6] Fix z-score table formatting Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 9b0d556cbe9..cc06a544e24 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -219,9 +219,9 @@ def format_performance(ratio, target_name): } if has_z_pr: - table_dict["|z| PR"] = df3["abs_z_score_pr"] + table_dict["abs(z-score) PR"] = df3["abs_z_score_pr"] if 
has_z_base: - table_dict["|z| base"] = df3["abs_z_score_base"] + table_dict["abs(z-score) base"] = df3["abs_z_score_base"] table_dict["remark"] = df3["remark"] table_df = pd.DataFrame(table_dict) From 27e362feb3a1ce9cd3dd2a3141e366e7d1e4baae Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 5 Mar 2026 14:52:54 +0000 Subject: [PATCH 5/6] variance instead of z score Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index cc06a544e24..d6545b9fb60 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -64,31 +64,22 @@ def extract_dataset_key(df): regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% -def compute_abs_z_score(runtimes): - """Compute |median - mean| / stddev from a list of runtimes.""" +def compute_variance(runtimes): + """Compute sample variance from a list of runtimes.""" if not isinstance(runtimes, list) or len(runtimes) < 2: return float("nan") n = len(runtimes) mean = sum(runtimes) / n - variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1) - stddev = math.sqrt(variance) - if stddev == 0: - return 0.0 - sorted_rt = sorted(runtimes) - if n % 2 == 1: - median = sorted_rt[n // 2] - else: - median = (sorted_rt[n // 2 - 1] + sorted_rt[n // 2]) / 2 - return abs((median - mean) / stddev) + return sum((x - mean) ** 2 for x in runtimes) / (n - 1) -# Compute |z-score| from all_runtimes when available +# Compute variance from all_runtimes when available has_z_pr = "all_runtimes_pr" in df3.columns has_z_base = "all_runtimes_base" in df3.columns if has_z_pr: - df3["abs_z_score_pr"] = df3["all_runtimes_pr"].apply(compute_abs_z_score) + df3["variance_pr"] = df3["all_runtimes_pr"].apply(compute_variance) if has_z_base: - df3["abs_z_score_base"] = df3["all_runtimes_base"].apply(compute_abs_z_score) + 
df3["variance_base"] = df3["all_runtimes_base"].apply(compute_variance) # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] @@ -219,9 +210,9 @@ def format_performance(ratio, target_name): } if has_z_pr: - table_dict["abs(z-score) PR"] = df3["abs_z_score_pr"] + table_dict["variance PR"] = df3["variance_pr"] if has_z_base: - table_dict["abs(z-score) base"] = df3["abs_z_score_base"] + table_dict["variance base"] = df3["variance_base"] table_dict["remark"] = df3["remark"] table_df = pd.DataFrame(table_dict) From 08d7e5fa53cc4147624f3fbb0573443477f2b92d Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Fri, 6 Mar 2026 13:30:42 +0000 Subject: [PATCH 6/6] cv Signed-off-by: Adam Gutglick --- scripts/compare-benchmark-jsons.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index d6545b9fb60..499294bfd42 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -64,22 +64,25 @@ def extract_dataset_key(df): regression_threshold = 1.0 + (threshold_pct / 100.0) # e.g., 1.3 for 30%, 1.1 for 10% -def compute_variance(runtimes): - """Compute sample variance from a list of runtimes.""" +def compute_cv_pct(runtimes): + """Compute coefficient of variation (std_dev / mean * 100) as a percentage.""" if not isinstance(runtimes, list) or len(runtimes) < 2: return float("nan") n = len(runtimes) mean = sum(runtimes) / n - return sum((x - mean) ** 2 for x in runtimes) / (n - 1) + if mean == 0: + return float("nan") + variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1) + return (variance**0.5 / mean) * 100 -# Compute variance from all_runtimes when available +# Compute CV% from all_runtimes when available has_z_pr = "all_runtimes_pr" in df3.columns has_z_base = "all_runtimes_base" in df3.columns if has_z_pr: - df3["variance_pr"] = df3["all_runtimes_pr"].apply(compute_variance) + df3["cv_pct_pr"] = 
df3["all_runtimes_pr"].apply(compute_cv_pct) if has_z_base: - df3["variance_base"] = df3["all_runtimes_base"].apply(compute_variance) + df3["cv_pct_base"] = df3["all_runtimes_base"].apply(compute_cv_pct) # Generate summary statistics df3["ratio"] = df3["value_pr"] / df3["value_base"] @@ -210,9 +213,9 @@ def format_performance(ratio, target_name): } if has_z_pr: - table_dict["variance PR"] = df3["variance_pr"] + table_dict["CV% PR"] = df3["cv_pct_pr"] if has_z_base: - table_dict["variance base"] = df3["variance_base"] + table_dict["CV% base"] = df3["cv_pct_base"] table_dict["remark"] = df3["remark"] table_df = pd.DataFrame(table_dict)