diff --git a/.github/dependabot.yml b/.github/dependabot.yml index f8f779b5..e7750e98 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -12,3 +12,30 @@ updates: github-actions: patterns: - '*' + +# Pinned ``[benchmarks]`` extra in pyproject.toml. One PR per dep bump → +# once merged, the master CodSpeed run attributes any perf delta to that +# bump. Keeps the cross-version ``sweep`` baseline (pinned via this extra) +# stable while still surfacing upstream perf changes per bump with +# eyes-open review. Loose ``[project.dependencies]`` (numpy, scipy, ...) +# have no version specifier so Dependabot leaves them alone — only the +# ``==`` pins in ``[benchmarks]`` produce PRs. +- package-ecosystem: pip + directory: / + schedule: + interval: monthly + open-pull-requests-limit: 5 + groups: + # Measurement scaffolding + CLI/notebook tooling. Perf-irrelevant — + # they don't move CodSpeed signal, so batching into one PR cuts + # review noise. Perf-relevant deps (numpy, xarray, highspy, …) stay + # un-grouped so each gets its own attributed CodSpeed delta. + benchmark-tooling: + patterns: + - pytest + - pytest-benchmark + - pytest-memray + - pytest-codspeed + - nbconvert + - typer + - plotly diff --git a/.github/workflows/benchmark-smoke.yml b/.github/workflows/benchmark-smoke.yml new file mode 100644 index 00000000..84cbadc9 --- /dev/null +++ b/.github/workflows/benchmark-smoke.yml @@ -0,0 +1,43 @@ +name: Benchmark smoke + +# Builds every spec and fires every phase once under --quick +# --benchmark-disable: a "did a refactor break a spec?" check, not timing.
+ +on: + push: + branches: [ master ] + pull_request: + branches: [ '*' ] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + smoke: + name: Benchmark smoke (quick) + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 # setuptools_scm + + - name: Set up Python 3.12 + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Install package and benchmark dependencies + run: | + python -m pip install uv + uv pip install --system -e ".[dev,benchmarks]" + + - name: Run benchmark smoke + run: | + python -m benchmarks smoke + + - name: Execute walkthrough notebook + # Catches doc rot: the walkthrough must stay runnable end-to-end. + run: | + python -m benchmarks notebook diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml new file mode 100644 index 00000000..4aead0e2 --- /dev/null +++ b/.github/workflows/codspeed.yml @@ -0,0 +1,45 @@ +name: CodSpeed + +# Cachegrind micro-benchmarks for instruction-count regression detection. +# Runs only when master advances (establishing/updating the baseline) plus a +# manual trigger for ad-hoc branch checks — deliberately off every PR: the +# cachegrind run is ~10–20× slower and regressions surface as master-to-master +# deltas. Wall-clock comparison stays in ``sweep``. + +on: + push: + branches: [ master ] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + codspeed: + name: CodSpeed (micro regression detection) + runs-on: ubuntu-latest + # Red until CODSPEED_TOKEN is set on the org — don't fail master on it. 
+ continue-on-error: true + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 # setuptools_scm + + - name: Set up Python 3.12 + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Install pinned benchmark environment + # Pinned ``[benchmarks]`` extra so Dependabot bumps → one CodSpeed delta each. + run: | + python -m pip install uv + uv pip install --system -e ".[dev,benchmarks]" + + - name: Run benchmarks under CodSpeed + uses: CodSpeedHQ/action@v3 + with: + token: ${{ secrets.CODSPEED_TOKEN }} + run: | + pytest benchmarks/ --quick --codspeed diff --git a/.gitignore b/.gitignore index 8b369aea..7e6d63e2 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,10 @@ benchmark/scripts/__pycache__ benchmark/scripts/benchmarks-pypsa-eur/__pycache__ benchmark/scripts/leftovers/ +# Benchmarks (internal suite): regenerable .ipynb viewing artifacts +benchmarks/walkthrough.ipynb +benchmarks/.ipynb_checkpoints/ + # IDE .idea/ diff --git a/benchmarks/README.md b/benchmarks/README.md index 22ac73ce..d264f682 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,94 +1,69 @@ # Internal Performance Benchmarks -Measures linopy's own performance (build time, LP write speed, memory usage) across problem sizes using [pytest-benchmark](https://pytest-benchmark.readthedocs.io/) and [pytest-memray](https://pytest-memray.readthedocs.io/). Use these to check whether a code change introduces a regression or improvement. +End-to-end performance tracking for `linopy` — build → solver handoff +→ netCDF (de)serialization → fixed PyPSA model. Solver algorithm +runtime is out of scope. -> **Note:** The `benchmark/` directory (singular) contains *external* benchmarks comparing linopy against other modeling frameworks. This directory (`benchmarks/`) is for *internal* performance tracking only. 
+**The walkthrough is load-bearing.** Phase coverage, CLI introspection, +the two-snapshot regression workflow with inline Plotly views, and +how to extend the suite live in [`walkthrough.md`](walkthrough.md). +This README only covers the model/pattern split, install, and how to open it. -## Setup +> `benchmark/` (singular) is the legacy external-framework suite. +> `benchmarks/` (plural) is this internal suite. -```bash -pip install -e ".[benchmarks]" -``` +## Models vs patterns -## Running benchmarks +Two kinds of benchmark spec, same harness (time + peak memory, same phases), +distinguished by their sweep axis: -```bash -# Quick smoke test (small sizes only) -pytest benchmarks/ --quick +- **Models** (`models/`, `REGISTRY`) — whole `linopy.Model`s swept over + `size` (axis `n`): "how does cost scale with the problem?" +- **Patterns** (`patterns/`, `PATTERNS`) — fragments of realistic modelling + code (a balance constraint, a KVL contraction) swept over `severity` + (0–100, axis `severity`): "how does cost respond as one data shape goes + from benign to pathological?" Each `PatternSpec.description` documents what + its dial means (`"0: …, 100: …"`). -# Full timing benchmarks -pytest benchmarks/test_build.py benchmarks/test_lp_write.py benchmarks/test_matrices.py +Both kinds build a complete `linopy.Model`, so both run the **same phases** and +share the phase drivers (`test_build.py`, `test_matrices.py`, …) and `memory` +grid — they're just more `(spec, value)` rows, tagged by `axis`. There is no +separate pattern driver. Running a pattern through `build` *and* `lp_write` +shows whether a dense-`_term` blow-up propagates to export or collapses. -# Run a specific model -pytest benchmarks/test_build.py -k basic -``` +Patterns target the operations where the dense-`_term` representation forces +materialisation — `groupby().sum()` padding, sparse `@` densification — so a +`severity` sweep draws the cost cliff, and a cross-version `compare` shows a +kernel change bending it.
Adding either is one file: drop it in `models/` or +`patterns/`, call `register(...)` / `register_pattern(...)`. -## Comparing timing between branches +## Install ```bash -# Save baseline results on master -git checkout master -pytest benchmarks/test_build.py --benchmark-save=master - -# Switch to feature branch and compare -git checkout my-feature -pytest benchmarks/test_build.py --benchmark-save=my-feature --benchmark-compare=0001_master - -# Compare saved results without re-running -pytest-benchmark compare 0001_master 0002_my-feature --columns=median,iqr +uv sync --extra dev --extra benchmarks +source .venv/bin/activate ``` -Results are stored in `.benchmarks/` (gitignored). - -## Memory benchmarks +`pypsa` is optional — `pypsa_scigrid` and +`test_pypsa_carbon_management.py` skip gracefully without it. Install +when you need them: `uv pip install pypsa`. -`memory.py` runs each test in a separate process with pytest-memray to get accurate per-test peak memory (including C/numpy allocations). Results are saved as JSON and can be compared across branches. +The `[benchmarks]` extra in `pyproject.toml` pins every direct dep that +affects measurement (`numpy`, `scipy`, `xarray`, `pandas`, `polars`, +`dask`, etc.). `sweep` installs these into each per-version venv, so +"same deps, only linopy varies" comes for free without a separate +lockfile — bump the pins in pyproject and the next sweep picks them up. -By default, only the build phase (`test_build.py`) is measured. Unlike timing benchmarks where `benchmark()` isolates the measured function, memray tracks all allocations within a test — including model construction in setup. This means LP write and matrix tests would report build + phase memory combined, making the phase-specific contribution impossible to isolate. Since model construction dominates memory usage, measuring build alone gives the most actionable numbers. 
+## Open the walkthrough ```bash -# Save baseline on master -git checkout master -python benchmarks/memory.py save master - -# Save feature branch -git checkout my-feature -python benchmarks/memory.py save my-feature - -# Compare -python benchmarks/memory.py compare master my-feature - -# Quick mode (smaller sizes, faster) -python benchmarks/memory.py save master --quick - -# Measure a specific phase (includes build overhead) -python benchmarks/memory.py save master --test-path benchmarks/test_lp_write.py +python -m benchmarks notebook --build # (re)generate walkthrough.ipynb +jupyter lab benchmarks/walkthrough.ipynb # ...or PyCharm / VSCode ``` -Results are stored in `.benchmarks/memory/` (gitignored). Requires Linux or macOS (memray is not available on Windows). - -> **Note:** Small tests (~5 MiB) are near the import-overhead floor and may show noise of ~1 MiB between runs. Focus on larger tests for meaningful memory comparisons. Do not combine `--memray` with timing benchmarks — memray adds ~2x overhead that invalidates timing results. - -## Models - -| Model | Description | Sizes | -|-------|-------------|-------| -| `basic` | Dense N*N model, 2*N^2 vars/cons | 10 — 1600 | -| `knapsack` | N binary variables, 1 constraint | 100 — 1M | -| `expression_arithmetic` | Broadcasting, scaling, summation across dims | 10 — 1000 | -| `sparse_network` | Ring network with mismatched bus/line coords | 10 — 1000 | -| `pypsa_scigrid` | Real power system (requires `pypsa`) | 10 — 200 snapshots | - -## Phases - -| Phase | File | What it measures | -|-------|------|------------------| -| Build | `test_build.py` | Model construction (add_variables, add_constraints, add_objective) | -| LP write | `test_lp_write.py` | Writing the model to an LP file | -| Matrices | `test_matrices.py` | Generating sparse matrices (A, b, c, bounds) from the model | - -## Adding a new model +The `.md` is the source of truth; the `.ipynb` is a disposable, +gitignored build artifact. 
Edit the `.md`, re-run `--build`, re-open. +Same workflow in any editor. -1. Create `benchmarks/models/my_model.py` with a `build_my_model(n)` function and a `SIZES` list -2. Add parametrized tests in the relevant `test_*.py` files -3. Add a quick threshold in `conftest.py` +CI executes the walkthrough end-to-end on every PR +(`python -m benchmarks notebook`) so the examples can't silently rot. diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py index 6bf202cc..f0a8fa70 100644 --- a/benchmarks/__init__.py +++ b/benchmarks/__init__.py @@ -1 +1,129 @@ -"""Linopy benchmark suite — run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes).""" +""" +Linopy benchmark suite. + +Run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes). + +This package also exposes a **reusable model registry** for any test, profiling +session, or example that wants ready-made linopy models of varying sizes and +features. Each entry exposes a ``build(size) -> linopy.Model`` callable plus +metadata:: + + from benchmarks import REGISTRY, QUADRATIC + + # Look up by name + model = REGISTRY["basic"].build(100) + + # Iterate / filter + for spec in REGISTRY.values(): + m = spec.build(spec.sizes[0]) + ... + + from benchmarks import filter_by + qp_specs = filter_by(has_feature=QUADRATIC) +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path + + import pandas as pd + + from benchmarks.snapshot import Metric + +# Importing the models / patterns packages triggers each module's +# ``register(...)`` / ``register_pattern(...)`` call at import time. +from benchmarks import bench, models, patterns # noqa: F401, E402 + + +def load_long_df( + snapshots: list[Path], metric: Metric = "min" +) -> tuple[pd.DataFrame, str]: + """ + Load one or more benchmark JSON snapshots into a tidy DataFrame. 
+ + Thin re-export of :func:`benchmarks.snapshot.load_long_df` so callers + can do their own analysis without importing the plotting module + (which pulls in plotly). Returns ``(df, unit)`` where ``df`` has one + row per ``(snapshot, test_id)`` with columns ``snapshot, test_id, + phase, spec, size, axis, value``, and ``unit`` is ``"s"`` (timing) or + ``"MiB"`` (memory). + """ + from benchmarks.snapshot import load_long_df as _impl + + return _impl(snapshots, metric) + + +from benchmarks.registry import ( # noqa: F401, E402 — re-export + ALL_FEATURES, + ALL_PHASES, + BINARY, + BUILD, + CONTINUOUS, + DEFAULT_PHASES, + DEFAULT_SEVERITIES, + INTEGER, + LP_WRITE, + MASKED, + MATRICES, + NETCDF, + PATTERNS, + PIECEWISE, + QUADRATIC, + REGISTRY, + SOS, + TO_GUROBIPY, + TO_HIGHSPY, + TO_MOSEK, + TO_XPRESS, + BenchSpec, + ModelSpec, + PatternSpec, + all_specs, + filter_by, + get, + get_pattern, + iter_params, + param_ids, + register, + register_pattern, +) + +__all__ = [ + "ALL_FEATURES", + "ALL_PHASES", + "BINARY", + "BUILD", + "CONTINUOUS", + "DEFAULT_PHASES", + "DEFAULT_SEVERITIES", + "INTEGER", + "LP_WRITE", + "MASKED", + "MATRICES", + "BenchSpec", + "ModelSpec", + "NETCDF", + "PATTERNS", + "PIECEWISE", + "PatternSpec", + "QUADRATIC", + "REGISTRY", + "SOS", + "TO_GUROBIPY", + "TO_HIGHSPY", + "TO_MOSEK", + "TO_XPRESS", + "all_specs", + "bench", + "filter_by", + "get", + "get_pattern", + "iter_params", + "load_long_df", + "param_ids", + "register", + "register_pattern", +] diff --git a/benchmarks/__main__.py b/benchmarks/__main__.py new file mode 100644 index 00000000..34a28439 --- /dev/null +++ b/benchmarks/__main__.py @@ -0,0 +1,5 @@ +"""Allow ``python -m benchmarks COMMAND``.""" + +from benchmarks.cli import app + +app() diff --git a/benchmarks/_tests/test_bench.py b/benchmarks/_tests/test_bench.py new file mode 100644 index 00000000..1b9ef5c7 --- /dev/null +++ b/benchmarks/_tests/test_bench.py @@ -0,0 +1,105 @@ +""" +Tests for the ad-hoc ``bench`` helper.
+ +The contract under test is the *seam*: a ``bench`` result must round-trip +into ``snapshot.load_long_df`` exactly like a real snapshot, and its +in-process ``to_df`` must line up column-for-column with the loaded frame. +These are the only non-obvious behaviours — the timing math itself is not +asserted beyond "finite and positive", since wall-clock values aren't +reproducible. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +import linopy +from benchmarks import REGISTRY, bench +from benchmarks.phases import touch_matrices +from benchmarks.snapshot import load_long_df + + +def _tiny() -> int: + return sum(range(1000)) + + +def _alloc() -> int: + # Allocate ~16 MB so the memray peak is unambiguously above zero; + # ``_tiny`` allocates nothing measurable. + data = [0] * 2_000_000 + return len(data) + + +def test_timing_snapshot_round_trips_into_loader(tmp_path: Path) -> None: + """A synthesized id parses back into the (phase, spec, size) columns.""" + snap = tmp_path / "t.json" + bench.time(_tiny, rounds=3).to_snapshot(snap, spec="basic", size=100, phase="build") + + df, unit = load_long_df([snap]) + assert unit == "s" + assert len(df) == 1 + row = df.iloc[0] + assert (row["phase"], row["spec"], row["size"]) == ("build", "basic", 100) + assert row["value"] > 0 + + +def test_compare_writes_n_entries(tmp_path: Path) -> None: + """``compare`` collects N cases into one snapshot → N loadable rows.""" + snap = tmp_path / "cmp.json" + rs = bench.compare({"a": _tiny, "b": _tiny, "c": _tiny}, kind="time", rounds=2) + rs.to_snapshot(snap) + + df, unit = load_long_df([snap]) + assert unit == "s" + assert len(df) == 3 + assert set(df["test_id"]) == {"a", "b", "c"} + + +def test_to_df_columns_match_loader(tmp_path: Path) -> None: + """In-process ``to_df`` shares the loader's exact column set/order.""" + snap = tmp_path / "t.json" + result = bench.time(_tiny, rounds=2) + result.to_snapshot(snap, spec="basic", size=10, phase="build") + 
+ loaded, _ = load_long_df([snap]) + assert list(result.to_df().columns) == list(loaded.columns) + + +def test_memory_path_round_trips(tmp_path: Path) -> None: + """Memory results carry MiB and round-trip through the loader.""" + pytest.importorskip("memray") + snap = tmp_path / "m.json" + result = bench.memory(_alloc) + assert result.peak_mib > 0 + result.to_snapshot(snap, spec="basic", size=10, phase="build") + + df, unit = load_long_df([snap]) + assert unit == "MiB" + assert df.iloc[0]["value"] > 0 + + +def test_phase_verb_on_custom_model() -> None: + """The headline use case: a phase verb timed on a hand-built model.""" + m = linopy.Model() + x = m.add_variables(lower=0, name="x") + m.add_constraints(x >= 1) + m.add_objective(x) + + result = bench.time(touch_matrices, m, rounds=2) + assert result.stats["min"] > 0 + assert result.stats["rounds"] == 2 + + +def test_registry_builder_times() -> None: + """A registry builder is a plain callable — no special-casing needed.""" + result = bench.time(REGISTRY["basic"].build, 50, rounds=2) + assert result.stats["min"] > 0 + + +def test_partial_id_spec_rejected(tmp_path: Path) -> None: + """A half-given (spec/size/phase) id is ambiguous and must error.""" + result = bench.time(_tiny, rounds=1) + with pytest.raises(ValueError, match="given together"): + result.to_snapshot(tmp_path / "x.json", spec="basic") diff --git a/benchmarks/_tests/test_memory_id_alignment.py b/benchmarks/_tests/test_memory_id_alignment.py new file mode 100644 index 00000000..5d2377c8 --- /dev/null +++ b/benchmarks/_tests/test_memory_id_alignment.py @@ -0,0 +1,72 @@ +""" +Guard test for the timing ↔ memory test-id seam. + +``memory.py`` hand-rolls f-strings to label each measurement with the +same node id pytest-benchmark produces (e.g. +``benchmarks/test_matrices.py::test_matrices[basic-n=10]``). 
If a +benchmark test function gets renamed and the matching f-string in +``memory.py`` isn't updated, ``plot`` would silently end up with +non-overlapping timing and memory sets — no error, just missing data. + +This test exercises both sides once and asserts every memory-emitted +id is present in pytest's collection. +""" + +from __future__ import annotations + +import re +import subprocess +import sys +from pathlib import Path + +from benchmarks.memory import MEMORY_PHASES, _measurements +from benchmarks.registry import REGISTRY + + +def _collect_benchmark_ids() -> set[str]: + """Return the set of node ids pytest collects under ``benchmarks/``.""" + repo_root = Path(__file__).resolve().parents[2] + result = subprocess.run( + [ + sys.executable, + "-m", + "pytest", + "benchmarks/", + "--collect-only", + "-q", + "--no-header", + "--co", + ], + capture_output=True, + text=True, + check=True, + cwd=repo_root, + ) + # pytest -q --co emits one node id per line; trailing summary lines + # like "N tests collected" can be ignored. + return { + line.strip() + for line in result.stdout.splitlines() + if re.match(r"^benchmarks/.*::.*\[.*\]$", line.strip()) + } + + +def test_memory_node_ids_match_pytest_collection() -> None: + collected = _collect_benchmark_ids() + assert collected, "pytest collected zero benchmark node ids — sanity broken" + + # ``basic`` at its smallest size is cheap and declares every default + # phase, so it exercises every node-id format ``_measurements`` emits. 
+ spec = REGISTRY["basic"] + size = spec.sizes[0] + + mem_ids: set[str] = set() + for phase in MEMORY_PHASES: + for test_id, _ in _measurements(phase, spec, size): + mem_ids.add(test_id) + + missing = mem_ids - collected + assert not missing, ( + "memory.py emits node ids that pytest doesn't collect " + "(test rename drift?):\n" + "\n".join(f" {m}" for m in sorted(missing)) + ) diff --git a/benchmarks/_tests/test_sweep.py b/benchmarks/_tests/test_sweep.py new file mode 100644 index 00000000..3531aebb --- /dev/null +++ b/benchmarks/_tests/test_sweep.py @@ -0,0 +1,33 @@ +"""Unit tests for sweep helpers (no venvs spun up).""" + +from __future__ import annotations + +import pytest + +from benchmarks.sweep import _snapshot_label + + +@pytest.mark.parametrize( + "spec,expected", + [ + # plain releases pass through unchanged + ("0.6.1", "0.6.1"), + ("0.5.0a1", "0.5.0a1"), + # git spec pinned to a sha -> the sha (clean, reproducible filename) + ("git+https://github.com/PyPSA/linopy.git@2993b95", "2993b95"), + # git spec on a branch -> the branch name + ("git+https://github.com/PyPSA/linopy.git@main", "main"), + # PEP 508 local file url -> sanitised (no slashes survive) + ("linopy @ file:///home/me/linopy", "file-home-me-linopy"), + ], +) +def test_snapshot_label(spec: str, expected: str) -> None: + label = _snapshot_label(spec) + assert label == expected + # whatever the input, the label must be a safe single path segment. + assert "/" not in label and " " not in label and label + + +def test_snapshot_label_never_empty() -> None: + # a spec that sanitises to nothing still yields a usable stub. + assert _snapshot_label("@@@") == "spec" diff --git a/benchmarks/bench.py b/benchmarks/bench.py new file mode 100644 index 00000000..17d74bcb --- /dev/null +++ b/benchmarks/bench.py @@ -0,0 +1,358 @@ +""" +Ad-hoc benchmarking of arbitrary callables on the *current* linopy tree. 
+ +Where the pytest suite measures the fixed registry grid and ``sweep`` +measures across installed linopy versions, ``bench`` is for the +interactive middle: time or memory-profile any callable — a registry +builder, a phase verb applied to a model you built by hand, or a one-off +lambda — get a result object back, and either inspect it as a DataFrame +or drop it into a snapshot the existing ``plot`` / ``compare`` machinery +already understands:: + + from benchmarks import bench, REGISTRY + + r = bench.time(REGISTRY["basic"].build, 100) + r # rich repr in a notebook + r.to_snapshot("a.json", spec="basic", size=100, phase="build") + + bench.compare({"v1": f1, "v2": f2}).to_snapshot("cmp.json") + +This plugs into the *output* side of the pipeline (snapshot JSON read by +``snapshot.load_long_df``), not into ``sweep``: a sweep runs pytest inside +per-version venvs as subprocesses, so it can only measure importable +registry models — an in-process callable can't cross that boundary. To +sweep a custom model across versions, promote it to ``benchmarks/models/``. + +**Methodology.** Timing is built on :class:`timeit.Timer`: an +``autorange`` calibration picks the inner iteration count (so timer +resolution doesn't dominate fast callables), then the per-iteration time +is sampled across rounds with the suite's min-of-N convention (the +fastest sample approximates the no-noise floor). It is *not* +pytest-benchmark's calibrated timer, so absolute numbers are not +interchangeable with suite snapshots — compare ``bench`` to ``bench`` and +suite to suite. 
+""" + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from dataclasses import dataclass, field +from pathlib import Path +from statistics import mean, median, stdev +from timeit import Timer +from typing import TYPE_CHECKING, Any, Literal + +from benchmarks.snapshot import ( + parse_test_id, + synth_test_id, + write_memory_snapshot, + write_timing_snapshot, +) + +if TYPE_CHECKING: + import pandas as pd + +__all__ = [ + "MemoryResult", + "ResultSet", + "TimingResult", + "compare", + "memory", + "time", +] + +# Floor / cap on the auto-tuned round count when ``rounds`` is unset. +# The floor guarantees a meaningful min-of-N even for slow callables that +# blow past ``min_time`` in one shot; the cap stops a microsecond callable +# from spinning forever. +_ROUND_FLOOR = 5 +_ROUND_CAP = 10_000 + + +def _fn_name(fn: Callable[..., object]) -> str: + """Best-effort label for a callable (``functools.partial`` has no name).""" + return getattr(fn, "__name__", None) or repr(fn) + + +def _row(test_id: str, value: float) -> dict[str, object]: + """One ``load_long_df``-shaped row for an in-process result.""" + phase, spec, size, axis = parse_test_id(test_id) + return { + "snapshot": test_id, + "test_id": test_id, + "phase": phase, + "spec": spec, + "size": size, + "axis": axis, + "value": value, + } + + +def _frame(rows: list[dict[str, object]]) -> pd.DataFrame: + """Build a DataFrame with the exact column set/dtype of ``load_long_df``.""" + import pandas as pd + + df = pd.DataFrame( + rows, + columns=["snapshot", "test_id", "phase", "spec", "size", "axis", "value"], + ) + df["size"] = df["size"].astype("Int64") + return df + + +# --- Result types ---------------------------------------------------------- + + +@dataclass(frozen=True) +class TimingResult: + """One timed callable: per-round stats with ``min`` as the headline.""" + + label: str + stats: dict[str, float] + unit: Literal["s"] = "s" + + def to_snapshot( + self, + path: str | 
Path, + *, + spec: str | None = None, + size: int | None = None, + phase: str | None = None, + ) -> Path: + """Write a pytest-benchmark-shaped timing snapshot (seconds).""" + test_id = synth_test_id(self.label, spec=spec, size=size, phase=phase) + return write_timing_snapshot(path, [(test_id, dict(self.stats))]) + + def to_df(self) -> pd.DataFrame: + """``load_long_df``-shaped frame (one row, ``value`` = min seconds).""" + return _frame([_row(self.label, self.stats["min"])]) + + def __repr__(self) -> str: + return ( + f"TimingResult({self.label!r}, min={self.stats['min']:.4g}s, " + f"rounds={int(self.stats['rounds'])}x{int(self.stats.get('iterations', 1))})" + ) + + def _repr_html_(self) -> str: + rows = [ + ("min", f"{self.stats['min']:.4g} s"), + ("median", f"{self.stats['median']:.4g} s"), + ("mean", f"{self.stats['mean']:.4g} s"), + ("max", f"{self.stats['max']:.4g} s"), + ("stddev", f"{self.stats['stddev']:.4g} s"), + ("rounds", int(self.stats["rounds"])), + ("iterations", int(self.stats.get("iterations", 1))), + ] + return _html_table("TimingResult", self.label, rows) + + +@dataclass(frozen=True) +class MemoryResult: + """One memory-profiled callable: peak RSS in MiB.""" + + label: str + peak_mib: float + unit: Literal["MiB"] = "MiB" + + def to_snapshot( + self, + path: str | Path, + *, + spec: str | None = None, + size: int | None = None, + phase: str | None = None, + ) -> Path: + """Write a memory.py-shaped snapshot (peak MiB).""" + test_id = synth_test_id(self.label, spec=spec, size=size, phase=phase) + return write_memory_snapshot(path, self.label, {test_id: self.peak_mib}) + + def to_df(self) -> pd.DataFrame: + """``load_long_df``-shaped frame (one row, ``value`` = peak MiB).""" + return _frame([_row(self.label, self.peak_mib)]) + + def __repr__(self) -> str: + return f"MemoryResult({self.label!r}, peak={self.peak_mib:.1f} MiB)" + + def _repr_html_(self) -> str: + return _html_table( + "MemoryResult", self.label, [("peak", f"{self.peak_mib:.1f} MiB")] + 
) + + +@dataclass(frozen=True) +class ResultSet: + """ + Several results of one kind (all timing, or all memory). + + ``to_snapshot`` writes every result into a single file keyed by its + label — the natural "compare these N variants" case. For + size-parametrized ``scaling`` plots, write each result individually + with ``spec``/``size``/``phase`` instead. + """ + + results: list[TimingResult | MemoryResult] = field(default_factory=list) + unit: Literal["s", "MiB"] = "s" + + def to_snapshot(self, path: str | Path) -> Path: + """Write all results into one snapshot, each keyed by its label.""" + if self.unit == "s": + return write_timing_snapshot( + path, + [ + (r.label, dict(r.stats)) + for r in self.results + if isinstance(r, TimingResult) + ], + ) + peaks = { + r.label: r.peak_mib for r in self.results if isinstance(r, MemoryResult) + } + return write_memory_snapshot(path, "compare", peaks) + + def to_df(self) -> pd.DataFrame: + """Concatenate the per-result frames (shares ``load_long_df`` columns).""" + import pandas as pd + + return pd.concat([r.to_df() for r in self.results], ignore_index=True) + + def __repr__(self) -> str: + labels = ", ".join(r.label for r in self.results) + return f"ResultSet(unit={self.unit!r}, [{labels}])" + + def _repr_html_(self) -> str: + rows = [ + ( + r.label, + f"{r.stats['min']:.4g} s" + if isinstance(r, TimingResult) + else f"{r.peak_mib:.1f} MiB", + ) + for r in self.results + ] + return _html_table("ResultSet", self.unit, rows) + + +def _html_table(kind: str, header: str, rows: Sequence[tuple[str, object]]) -> str: + """Compact two-column Jupyter table, mirroring ``ModelSpec._repr_html_``.""" + body = "".join( + f"{k}{v}" + for k, v in rows + ) + return ( + f"{kind} {header}" + f"{body}
" + ) + + +# --- Entry points ---------------------------------------------------------- + + +def time( + fn: Callable[..., object], + /, + *args: object, + rounds: int | None = None, + warmup: int = 1, + min_time: float = 0.5, + label: str | None = None, + **kwargs: object, +) -> TimingResult: + """ + Time ``fn(*args, **kwargs)`` and return a :class:`TimingResult`. + + Built on :class:`timeit.Timer`: an ``autorange`` calibration first + picks the inner iteration count so timer resolution doesn't dominate + for fast callables (the bespoke "one call per round" loop this + replaced was unstable in exactly that regime). Each round then runs + that many calibrated iterations; the per-iteration time is the + sample. ``warmup`` rounds are discarded to prime caches. + + With ``rounds`` set, run exactly that many rounds; otherwise + auto-tune — keep going until cumulative timed wall-clock reaches + ``min_time`` (floor of 5 rounds, hard cap). The headline number is + ``stats["min"]``; ``stats["iterations"]`` records the calibrated + inner count. + + This is *not* pytest-benchmark's calibrated timer — ``bench`` numbers + are only comparable to other ``bench`` numbers, not to suite + snapshots. + """ + timer = Timer(lambda: fn(*args, **kwargs)) + + # Calibrate inner iterations so a single round is long enough that + # ``perf_counter`` granularity is negligible (timeit targets ~0.2 s). 
+ number, _ = timer.autorange() + + for _ in range(max(0, warmup)): + timer.timeit(number) + + samples: list[float] = [] # per-iteration seconds + if rounds is not None: + samples = [ + t / number for t in timer.repeat(repeat=max(1, rounds), number=number) + ] + else: + total = 0.0 + while True: + t = timer.timeit(number) + samples.append(t / number) + total += t + if len(samples) >= _ROUND_FLOOR and total >= min_time: + break + if len(samples) >= _ROUND_CAP: + break + + stats = { + "min": min(samples), + "max": max(samples), + "mean": mean(samples), + "median": median(samples), + "stddev": stdev(samples) if len(samples) > 1 else 0.0, + "rounds": float(len(samples)), + "iterations": float(number), + } + return TimingResult(label=label or _fn_name(fn), stats=stats) + + +def memory( + fn: Callable[..., object], + /, + *args: object, + repeats: int = 1, + label: str | None = None, + **kwargs: object, +) -> MemoryResult: + """ + Peak-RSS profile ``fn(*args, **kwargs)`` and return a :class:`MemoryResult`. + + Thin wrapper over :func:`benchmarks.memory.measure_peak`; ``repeats > 1`` + keeps the minimum peak. Raises on Windows (no ``memray``). + """ + from benchmarks.memory import measure_peak + + peak = measure_peak(lambda: fn(*args, **kwargs), repeats=repeats) + return MemoryResult(label=label or _fn_name(fn), peak_mib=peak) + + +def compare( + cases: dict[str, Callable[[], object]], + *, + kind: Literal["time", "memory"] = "time", + **opts: Any, +) -> ResultSet: + """ + Run each zero-arg callable in ``cases`` and collect a :class:`ResultSet`. + + ``kind`` selects timing (default) or memory; ``opts`` are forwarded to + :func:`time` / :func:`memory` (e.g. ``rounds=``, ``repeats=``). The + dict key becomes each case's label. 
+ """ + if kind == "time": + results: list[TimingResult | MemoryResult] = [ + time(fn, label=name, **opts) for name, fn in cases.items() + ] + return ResultSet(results=results, unit="s") + if kind == "memory": + results = [memory(fn, label=name, **opts) for name, fn in cases.items()] + return ResultSet(results=results, unit="MiB") + raise ValueError(f"kind must be 'time' or 'memory', got {kind!r}") diff --git a/benchmarks/cli/__init__.py b/benchmarks/cli/__init__.py new file mode 100644 index 00000000..0d71afe0 --- /dev/null +++ b/benchmarks/cli/__init__.py @@ -0,0 +1,32 @@ +""" +linopy benchmark CLI — one entry point for the suite. + +Run with:: + + python -m benchmarks [options] + +The CLI is a thin layer over pytest for the timing / smoke commands, plus +direct dispatch for registry introspection and memory snapshots. Each command +group lives in its own module and registers onto the shared ``app`` from +``_base``; importing them here (in display order) wires up the flat command +surface. +""" + +from __future__ import annotations + +from benchmarks.cli._base import app + +# Imported for side effect: each module registers its commands onto ``app``. +# Kept in this order — and shielded from isort — because it is the order the +# commands appear in ``--help``. +# isort: off +from benchmarks.cli import introspect # noqa: F401 +from benchmarks.cli import run # noqa: F401 +from benchmarks.cli import sweep # noqa: F401 +from benchmarks.cli import compare # noqa: F401 +from benchmarks.cli import plot # noqa: F401 +from benchmarks.cli import memory # noqa: F401 + +# isort: on + +__all__ = ["app"] diff --git a/benchmarks/cli/_base.py b/benchmarks/cli/_base.py new file mode 100644 index 00000000..6909fa6d --- /dev/null +++ b/benchmarks/cli/_base.py @@ -0,0 +1,67 @@ +""" +Shared app object, types, and helpers for the benchmark CLI. 
+ +The command groups (``introspect``, ``run``, ``sweep``, ``compare``, +``plot``, ``memory``) all register onto the single ``app`` defined here, so +the user-facing command surface stays flat (``python -m benchmarks run`` etc.). + +Note on colour: ``typer.secho`` strips colour automatically when stdout isn't +a TTY, so piping any command into ``grep`` still yields plain text. +""" + +from __future__ import annotations + +from typing import Literal + +import typer + +from benchmarks.snapshot import discover_snapshots + +app = typer.Typer( + help=( + "Linopy internal benchmark suite — a thin layer over pytest plus " + "registry introspection and memory snapshots." + ), + no_args_is_help=True, + rich_markup_mode="rich", +) + +memory_app = typer.Typer( + help="Peak-RSS memory snapshots (pytest-memray under the hood).", + no_args_is_help=True, +) +app.add_typer(memory_app, name="memory") + + +PhaseName = Literal["build", "matrices", "lp_write", "netcdf", "solver_handoff"] +SpecKind = Literal["all", "models", "patterns"] + + +_PHASE_TEST_FILE: dict[PhaseName, str] = { + "build": "benchmarks/test_build.py", + "matrices": "benchmarks/test_matrices.py", + "lp_write": "benchmarks/test_lp_write.py", + "netcdf": "benchmarks/test_netcdf.py", + "solver_handoff": "benchmarks/test_solver_handoff.py", +} + +# pytest args that constitute a "smoke" run — quick sizes, no timings. +# Shared between the top-level ``smoke`` command and ``sweep --smoke`` so +# bumping the definition stays single-source. +_SMOKE_PYTEST_ARGS = ["benchmarks/", "--quick", "--benchmark-disable", "-q"] + + +def _suggest_snapshots(reason: str) -> None: + """Print an error + a hint listing whatever snapshots we can find.""" + typer.secho(reason, fg=typer.colors.RED, err=True) + found = discover_snapshots() + if found: + typer.echo("\nAvailable snapshots under .benchmarks/:", err=True) + for p in found: + typer.echo(f" {p}", err=True) + else: + typer.echo( + "\nNo snapshots found under .benchmarks/. 
Generate one with:\n" + " python -m benchmarks run --json .benchmarks/