diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
index 1802325c4cd..499294bfd42 100644
--- a/scripts/compare-benchmark-jsons.py
+++ b/scripts/compare-benchmark-jsons.py
@@ -63,6 +63,31 @@ def extract_dataset_key(df):
 improvement_threshold = 1.0 - (threshold_pct / 100.0)  # e.g., 0.7 for 30%, 0.9 for 10%
 regression_threshold = 1.0 + (threshold_pct / 100.0)  # e.g., 1.3 for 30%, 1.1 for 10%
 
+
+def compute_cv_pct(runtimes):
+    """Compute coefficient of variation (std_dev / mean * 100) as a percentage.
+
+    Uses the sample standard deviation (n - 1 denominator). Returns NaN when
+    *runtimes* is not a list, has fewer than 2 samples, or has a zero mean.
+    """
+    if not isinstance(runtimes, list) or len(runtimes) < 2:
+        return float("nan")
+    n = len(runtimes)
+    mean = sum(runtimes) / n
+    if mean == 0:
+        return float("nan")
+    variance = sum((x - mean) ** 2 for x in runtimes) / (n - 1)
+    return (variance**0.5 / mean) * 100
+
+
+# Compute CV% from all_runtimes when available
+has_runtimes_pr = "all_runtimes_pr" in df3.columns
+has_runtimes_base = "all_runtimes_base" in df3.columns
+if has_runtimes_pr:
+    df3["cv_pct_pr"] = df3["all_runtimes_pr"].apply(compute_cv_pct)
+if has_runtimes_base:
+    df3["cv_pct_base"] = df3["all_runtimes_base"].apply(compute_cv_pct)
+
 # Generate summary statistics
 df3["ratio"] = df3["value_pr"] / df3["value_base"]
 df3["remark"] = pd.Series([""] * len(df3))
@@ -183,16 +208,22 @@ def format_performance(ratio, target_name):
 )
 
 # Build table
-table_df = pd.DataFrame(
-    {
-        "name": df3["name"],
-        f"PR {pr_commit_id[:8]}": df3["value_pr"],
-        f"base {base_commit_id[:8]}": df3["value_base"],
-        "ratio (PR/base)": df3["ratio"],
-        "unit": df3["unit_base"],
-        "remark": df3["remark"],
-    }
-)
+table_dict = {
+    "name": df3["name"],
+    f"PR {pr_commit_id[:8]}": df3["value_pr"],
+    f"base {base_commit_id[:8]}": df3["value_base"],
+    "ratio (PR/base)": df3["ratio"],
+    "unit": df3["unit_base"],
+}
+
+# Insert CV columns (when available) ahead of the trailing "remark" column
+if has_runtimes_pr:
+    table_dict["CV% PR"] = df3["cv_pct_pr"]
+if has_runtimes_base:
+    table_dict["CV% base"] = df3["cv_pct_base"]
+
+table_dict["remark"] = df3["remark"]
+table_df = pd.DataFrame(table_dict)
 
 # Output complete formatted markdown
 print("\n".join(summary_lines))