4 changes: 4 additions & 0 deletions .gitignore
@@ -45,3 +45,7 @@ __pycache__

# autogenerated by setuptools-scm
_version.py

# Benchmark/plot artifacts
benchmarks/results/
benchmarks/plots/
7 changes: 7 additions & 0 deletions README.md
@@ -73,6 +73,13 @@ python -m pip install -e .[test]
pytest
```

### Benchmarks
Pytest-based benchmark suites live under `benchmarks/` and can be run with
```
python -m pytest benchmarks -m benchmark
```
Results are written to `benchmarks/results` as JSON and CSV artifacts.


## Citation

3 changes: 3 additions & 0 deletions benchmarks/.gitignore
@@ -0,0 +1,3 @@
__pycache__/
results/
plots/
76 changes: 76 additions & 0 deletions benchmarks/README.md
@@ -0,0 +1,76 @@
# CPPE Benchmarks

This directory contains pytest-based benchmark suites that compare CPPE backends
and track scaling with the number of polarizable sites.

## Run

From a configured development environment:

```bash
python -m pytest benchmarks -m benchmark
```

Single-thread parity (recommended for fair comparisons):

```bash
OMP_NUM_THREADS=1 NUMBA_NUM_THREADS=1 python -m pytest benchmarks -m benchmark
```

Optional runtime controls:

- `CPPE_BENCH_REPEAT` (default: `3`)
- `CPPE_BENCH_WARMUP` (default: `1`)
- `--benchmark-output-dir` (default: `benchmarks/results`)

Example:

```bash
CPPE_BENCH_REPEAT=5 CPPE_BENCH_WARMUP=2 \
python -m pytest benchmarks -m benchmark --benchmark-output-dir benchmarks/results
```

## Outputs

Each run writes both JSON and CSV artifacts:

- `benchmarks/results/latest.json`
- `benchmarks/results/latest.csv`
- timestamped copies for history
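The JSON artifact can be consumed directly from Python. A minimal sketch, assuming the payload schema written by `benchmarks/conftest.py` (the per-record keys shown here, such as `backend` and `time_s`, are hypothetical placeholders — real record fields depend on what each benchmark passes to the recorder):

```python
import json

# Example payload mirroring the schema of benchmarks/results/latest.json
# (top-level field names taken from benchmarks/conftest.py; the record
# contents below are illustrative only).
payload = {
    "created_utc": "20240101T000000Z",
    "repeat": 3,
    "warmup": 1,
    "records": [
        {"name": "induced_moments", "backend": "numba", "time_s": 0.42},
        {"name": "induced_moments", "backend": "cpp", "time_s": 0.58},
    ],
}

# Round-trip through JSON, as the on-disk artifact would be read back.
records = json.loads(json.dumps(payload))["records"]
fastest = min(records, key=lambda r: r["time_s"])
print(fastest["backend"])  # -> numba
```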

## Plotting

Generate runtime and speedup plots from benchmark CSV data:

```bash
python benchmarks/plot_benchmarks.py \
--input benchmarks/results/latest.csv \
--output-dir benchmarks/plots \
--format png
```

By default, this writes **compact overview plots** only.

Optional:

- add additional output formats: `--format svg`
- disable log y-axis: `--no-log-y`
- generate per-benchmark detailed plots: `--detailed`

The script writes:

- `benchmarks/plots/summary_aggregated.csv`
- `benchmarks/plots/summary_speedup.csv`
- `benchmarks/plots/overview_runtime.*`
- `benchmarks/plots/overview_speedup.*`
- `benchmarks/plots/runtime_*.png` and `speedup_*.png` (only with `--detailed`)
- `benchmarks/plots/REPORT.md`

## Large synthetic systems

Large synthetic waterbox-like benchmarks are provided separately and are excluded
from the default `benchmark` marker selection; run them with:

```bash
OMP_NUM_THREADS=1 NUMBA_NUM_THREADS=1 python -m pytest benchmarks -m benchmark_large
```
Empty file added benchmarks/__init__.py
Empty file.
76 changes: 76 additions & 0 deletions benchmarks/conftest.py
@@ -0,0 +1,76 @@
import csv
import json
import os
from datetime import datetime, timezone
from pathlib import Path

import pytest

os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("NUMBA_NUM_THREADS", "1")


def pytest_addoption(parser):
parser.addoption(
"--benchmark-output-dir",
action="store",
default="benchmarks/results",
help="Directory for benchmark result artifacts.",
)


def pytest_configure(config):
config.addinivalue_line(
"markers",
"benchmark: marks benchmark-style tests that collect timing metrics",
)
config.addinivalue_line(
"markers",
"benchmark_large: marks large synthetic benchmark tests",
)


@pytest.fixture(scope="session", autouse=True)
def benchmark_thread_control():
from numba import set_num_threads

threads = int(os.environ.get("NUMBA_NUM_THREADS", "1"))
set_num_threads(threads)


@pytest.fixture(scope="session")
def benchmark_recorder(request):
records = []

def _record(**kwargs):
records.append(kwargs)

yield _record

output_dir = Path(request.config.getoption("benchmark_output_dir"))
output_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
payload = {
"created_utc": timestamp,
"repeat": int(os.environ.get("CPPE_BENCH_REPEAT", "3")),
"warmup": int(os.environ.get("CPPE_BENCH_WARMUP", "1")),
"omp_num_threads": int(os.environ.get("OMP_NUM_THREADS", "1")),
"numba_num_threads": int(os.environ.get("NUMBA_NUM_THREADS", "1")),
"records": records,
}

latest_json = output_dir / "latest.json"
latest_json.write_text(json.dumps(payload, indent=2), encoding="utf-8")
stamped_json = output_dir / f"benchmarks-{timestamp}.json"
stamped_json.write_text(json.dumps(payload, indent=2), encoding="utf-8")

if records:
fieldnames = sorted({k for rec in records for k in rec.keys()})
latest_csv = output_dir / "latest.csv"
stamped_csv = output_dir / f"benchmarks-{timestamp}.csv"
for csv_path in (latest_csv, stamped_csv):
with csv_path.open("w", newline="", encoding="utf-8") as handle:
writer = csv.DictWriter(handle, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(records)