diff --git a/.gitignore b/.gitignore
index 7b962a6b..263d41fe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ ENV/
 env.bak/
 venv.bak/
+benchmarks/results/
 benchmark/*.pdf
 benchmark/benchmarks
 benchmark/.snakemake
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 00000000..dff9effa
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,68 @@
+# Benchmarks
+
+Modular benchmark framework for linopy. All commands use [`just`](https://github.com/casey/just).
+
+```
+$ just --list
+Available recipes:
+    [benchmark]
+    all name iterations=default_iterations
+    compare ref="master" model=default_model phase=default_phase iterations=default_iterations quick="False"
+    compare-all ref="master" iterations=default_iterations
+    compare-quick ref="master"
+    list
+    model name model phase=default_phase iterations=default_iterations quick="False"
+    plot +files
+    quick name="quick"
+```
+
+Start with `just list` to see the available models and phases, then run `just quick` for a smoke test.
+
+## Examples
+
+```bash
+# Discover available models and phases
+just list
+
+# Quick smoke test (basic model, all phases, 5 iterations)
+just quick
+
+# Full suite (all models, all phases)
+just all my-branch
+
+# Single model + phase
+just model my-branch knapsack memory
+
+# Compare current branch against master (basic model, all phases)
+just compare
+
+# Compare all models against master
+just compare-all
+
+# Quick compare (basic model, small sizes, 5 iterations)
+just compare-quick perf/lp-write-speed
+
+# Compare against a remote fork
+just compare FBumann:perf/lp-write-speed
+
+# Plot existing result files
+just plot benchmarks/results/master_basic_build.json benchmarks/results/feat_basic_build.json
+```
+
+## Overriding defaults
+
+Parameters shown as `=default_*` reference top-level justfile variables. Override them with `--set`:
+
+```bash
+just --set default_phase lp_write compare perf/lp-write-speed
+just --set default_model knapsack --set default_iterations 20 compare master
+```
+
+## Output
+
+Results are saved as JSON files in `benchmarks/results/` (gitignored), named `{name}_{model}_{phase}.json`. Comparison plots are saved as PNG files alongside them.
+
+## Ideas for future models
+
+- **sparse**: A model with variables on mismatched coordinate subsets to exercise sparse/outer-join alignment (e.g. lines indexed by `(bus_from, bus_to)` vs bus-level variables).
+- **large_expr**: A model that stress-tests expression building — chaining many arithmetic operations, combining expressions from different variable groups, or building expressions incrementally (see the sketch below).
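+
+A new model only needs to expose `build` and `SIZES` to be picked up by the registry in `benchmarks/models/__init__.py`. A minimal, untested sketch of the proposed `large_expr` model could look like the following; the sizes, term counts, and bounds are placeholders, not part of this PR:
+
+```python
+"""Sketch of a possible large_expr benchmark model (illustrative only)."""
+
+from __future__ import annotations
+
+import linopy
+
+LABEL = "large_expr N={n} terms={terms}"
+SIZES = [{"n": n, "terms": 20} for n in [10, 100, 500]]
+QUICK_SIZES = [{"n": 10, "terms": 5}]
+DESCRIPTION = "chained arithmetic over many variable groups (expression stress test)"
+
+
+def build(n: int, terms: int) -> linopy.Model:
+    """Chain additions of `terms` variable groups into one long expression."""
+    m = linopy.Model()
+    # One variable group per term, all sharing the same coordinate.
+    xs = [
+        m.add_variables(coords=[range(n)], dims=["i"], name=f"x{k}")
+        for k in range(terms)
+    ]
+    # Build the expression incrementally so every step re-combines groups.
+    expr = 1 * xs[0]
+    for k, x in enumerate(xs[1:], start=2):
+        expr = expr + k * x
+    m.add_constraints(expr <= 100, name="budget")
+    m.add_objective(expr.sum())
+    return m
+```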
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 00000000..b2d71789
--- /dev/null
+++ b/benchmarks/__init__.py
@@ -0,0 +1 @@
+"""Linopy benchmark framework."""
diff --git a/benchmarks/compare.py b/benchmarks/compare.py
new file mode 100644
index 00000000..6c6acba7
--- /dev/null
+++ b/benchmarks/compare.py
@@ -0,0 +1,229 @@
+"""Compare benchmark results across branches and produce plots."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+# Metric keys per phase: (median, q25, q75)
+METRIC_KEYS: dict[str, tuple[str, str, str]] = {
+    "build": ("build_time_median_s", "build_time_q25_s", "build_time_q75_s"),
+    "memory": ("peak_memory_median_mb", "peak_memory_median_mb", "peak_memory_max_mb"),
+    "lp_write": ("write_time_median_s", "write_time_q25_s", "write_time_q75_s"),
+}
+
+METRIC_UNITS: dict[str, str] = {
+    "build": "Build time (ms)",
+    "memory": "Peak memory (MB)",
+    "lp_write": "Write time (ms)",
+}
+
+# Phases where raw values are seconds → display in ms
+MS_PHASES = {"build", "lp_write"}
+
+COLORS = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b"]
+MARKERS = ["o", "s", "D", "^", "v", "P"]
+
+
+def _load(path: str) -> dict:
+    with open(path) as f:
+        data = json.load(f)
+    data.setdefault("name", Path(path).stem)
+    return data
+
+
+def _extract(
+    runs: list[dict], phase: str
+) -> tuple[list[int], list[float], list[float], list[float]]:
+    """Extract nvars, median, lo, hi from runs. Convert to ms where needed."""
+    keys = METRIC_KEYS.get(phase)
+    if not keys or not runs:
+        return [], [], [], []
+
+    med_key, lo_key, hi_key = keys
+    scale = 1000.0 if phase in MS_PHASES else 1.0
+
+    nvars = [r["nvars"] for r in runs]
+    med = [r[med_key] * scale for r in runs]
+    lo = [r.get(lo_key, r[med_key]) * scale for r in runs]
+    hi = [r.get(hi_key, r[med_key]) * scale for r in runs]
+    return nvars, med, lo, hi
+
+
+def _plot_errorbar(ax, nvars, med, lo, hi, **kwargs):
+    yerr_lo = [m - l for m, l in zip(med, lo)]
+    yerr_hi = [h - m for m, h in zip(med, hi)]
+    ax.errorbar(nvars, med, yerr=[yerr_lo, yerr_hi], capsize=3, **kwargs)
+
+
+def compare(*paths: str) -> None:
+    """
+    Compare any number of result JSONs for the same model x phase.
+
+    Produces a 4-panel plot:
+      Top-left: Log-log overview with error bars
+      Top-right: Speedup ratio vs baseline with uncertainty bounds
+      Bottom-left: Small models (linear scale)
+      Bottom-right: Large models (log scale)
+    """
+    if len(paths) < 2:
+        print("Need at least 2 files to compare.")
+        return
+
+    import matplotlib.pyplot as plt
+
+    datasets = [_load(p) for p in paths]
+    phase = datasets[0].get("phase", "unknown")
+    model_name = datasets[0].get("model", "unknown")
+    ylabel = METRIC_UNITS.get(phase, phase)
+
+    for d in datasets[1:]:
+        if d.get("model") != model_name or d.get("phase") != phase:
+            print(
+                f"Warning: mixing model/phase — "
+                f"expected {model_name}/{phase}, "
+                f"got {d.get('model')}/{d.get('phase')}"
+            )
+
+    # Extract stats for each dataset
+    all_stats = []
+    for d in datasets:
+        nvars, med, lo, hi = _extract(d.get("runs", []), phase)
+        all_stats.append((d.get("name", d.get("label", "unknown")), nvars, med, lo, hi))
+
+    if not all_stats[0][1]:
+        print("No data to plot.")
+        return
+
+    labels = [s[0] for s in all_stats]
+    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
+    fig.suptitle(
+        f"Benchmark: {model_name} / {phase}\n{' vs '.join(labels)}",
+        fontsize=14,
+    )
+
+    # --- Panel 1: All data, log-log ---
+    ax = axes[0, 0]
+    for i, (label, nvars, med, lo, hi) in enumerate(all_stats):
+        color = COLORS[i % len(COLORS)]
+        marker = MARKERS[i % len(MARKERS)]
+        ls = "--" if i == 0 else "-"
+        _plot_errorbar(
+            ax,
+            nvars,
+            med,
+            lo,
+            hi,
+            marker=marker,
+            color=color,
+            linestyle=ls,
+            label=label,
+            alpha=0.8,
+        )
+    ax.set_xscale("log")
+    ax.set_yscale("log")
+    ax.set_xlabel("Number of variables")
+    ax.set_ylabel(ylabel)
+    ax.set_title("Overview (log-log)")
+    ax.legend(fontsize=9)
+    ax.grid(True, alpha=0.3)
+
+    # --- Panel 2: Speedup ratio with uncertainty bounds ---
+    ax = axes[0, 1]
+    base_label, base_nv, base_med, base_lo, base_hi = all_stats[0]
+    for i, (label, nvars, med, lo, hi) in enumerate(all_stats[1:], 1):
+        if len(nvars) != len(base_nv):
+            continue
+        color = COLORS[i % len(COLORS)]
+        # Ratio: baseline / current (>1 means current is faster)
+        ratio = [b / c if c > 0 else float("nan") for b, c in zip(base_med, med)]
+        # Uncertainty: best = base_hi/lo_cur, worst = base_lo/hi_cur
+        ratio_lo = [bl / ch if ch > 0 else float("nan") for bl, ch in zip(base_lo, hi)]
+        ratio_hi = [bh / cl if cl > 0 else float("nan") for bh, cl in zip(base_hi, lo)]
+        yerr_lo = [r - rl for r, rl in zip(ratio, ratio_lo)]
+        yerr_hi = [rh - r for r, rh in zip(ratio, ratio_hi)]
+        ax.errorbar(
+            nvars,
+            ratio,
+            yerr=[yerr_lo, yerr_hi],
+            marker=MARKERS[i % len(MARKERS)],
+            color=color,
+            capsize=3,
+            label=label,
+        )
+        ax.fill_between(nvars, ratio_lo, ratio_hi, alpha=0.15, color=color)
+        for x, r in zip(nvars, ratio):
+            ax.annotate(
+                f"{r:.2f}",
+                (x, r),
+                textcoords="offset points",
+                xytext=(0, 10),
+                ha="center",
+                fontsize=8,
+                color=color,
+            )
+    ax.axhline(1.0, color="gray", linestyle="--", alpha=0.5)
+    ax.set_xscale("log")
+    ax.set_xlabel("Number of variables")
+    ax.set_ylabel(f"Speedup ({base_label} / other)")
+    ax.set_title("Relative performance")
+    ax.legend(fontsize=9)
+    ax.grid(True, alpha=0.3)
+
+    # --- Panels 3 & 4: Small vs large models ---
+    cutoff = 25000
+
+    for panel_idx, (title, filt, use_log) in enumerate(
+        [
+            (f"Small models (≤ {cutoff:,} vars)", lambda n: n <= cutoff, False),
+            (f"Large models (> {cutoff:,} vars)", lambda n: n > cutoff, True),
+        ]
+    ):
+        ax = axes[1, panel_idx]
+        has_data = False
+        for i, (label, nvars, med, lo, hi) in enumerate(all_stats):
+            idx = [j for j, n in enumerate(nvars) if filt(n)]
+            if not idx:
+                continue
+            has_data = True
+            color = COLORS[i % len(COLORS)]
+            marker = MARKERS[i % len(MARKERS)]
+            ls = "--" if i == 0 else "-"
+            _plot_errorbar(
+                ax,
+                [nvars[j] for j in idx],
+                [med[j] for j in idx],
+                [lo[j] for j in idx],
+                [hi[j] for j in idx],
+                marker=marker,
+                color=color,
+                linestyle=ls,
+                label=label,
+                alpha=0.8,
+            )
+        if use_log and has_data:
+            ax.set_xscale("log")
+        if not use_log:
+            ax.set_ylim(bottom=0)
+        ax.set_xlabel("Number of variables")
+        ax.set_ylabel(ylabel)
+        ax.set_title(title)
+        ax.legend(fontsize=9)
+        ax.grid(True, alpha=0.3)
+        if not has_data:
+            ax.text(
+                0.5,
+                0.5,
+                "No data",
+                ha="center",
+                va="center",
+                transform=ax.transAxes,
+                fontsize=12,
+                color="gray",
+            )
+
+    plt.tight_layout()
+    out_png = Path(paths[0]).parent / f"compare_{model_name}_{phase}.png"
+    plt.savefig(out_png, dpi=150, bbox_inches="tight")
+    print(f"Saved: {out_png}")
+    plt.close()
diff --git a/benchmarks/models/__init__.py b/benchmarks/models/__init__.py
new file mode 100644
index 00000000..c9ce1393
--- /dev/null
+++ b/benchmarks/models/__init__.py
@@ -0,0 +1,34 @@
+"""Model registry for benchmarks."""
+
+from __future__ import annotations
+
+import importlib
+import pkgutil
+from types import ModuleType
+
+_MODELS: dict[str, ModuleType] = {}
+
+
+def _discover() -> None:
+    """Auto-discover model modules in this package."""
+    if _MODELS:
+        return
+    package = importlib.import_module("benchmarks.models")
+    for info in pkgutil.iter_modules(package.__path__):
+        if info.name.startswith("_"):
+            continue
+        mod = importlib.import_module(f"benchmarks.models.{info.name}")
+        if hasattr(mod, "build") and hasattr(mod, "SIZES"):
+            _MODELS[info.name] = mod
+
+
+def get_model(name: str) -> ModuleType:
+    """Return a model module by name."""
+    _discover()
+    return _MODELS[name]
+
+
+def list_models() -> list[str]:
+    """Return sorted list of available model names."""
+    _discover()
+    return sorted(_MODELS)
diff --git a/benchmarks/models/basic.py b/benchmarks/models/basic.py
new file mode 100644
index 00000000..21ae643c
--- /dev/null
+++ b/benchmarks/models/basic.py
@@ -0,0 +1,21 @@
+"""Basic benchmark model: 2*N^2 variables and constraints."""
+
+from __future__ import annotations
+
+import linopy
+
+LABEL = "basic N={n}"
+SIZES = [{"n": n} for n in [10, 50, 100, 250, 500, 1000, 1600]]
+QUICK_SIZES = [{"n": n} for n in [10, 50, 100]]
+DESCRIPTION = "2*N^2 vars/cons — simple dense model"
+
+
+def build(n: int) -> linopy.Model:
+    """Build a basic N×N model."""
+    m = linopy.Model()
+    x = m.add_variables(coords=[range(n), range(n)], dims=["i", "j"], name="x")
+    y = m.add_variables(coords=[range(n), range(n)], dims=["i", "j"], name="y")
+    m.add_constraints(x + y <= 10, name="upper")
+    m.add_constraints(x - y >= -5, name="lower")
+    m.add_objective(x.sum() + 2 * y.sum())
+    return m
diff --git a/benchmarks/models/knapsack.py b/benchmarks/models/knapsack.py
new file mode 100644
index 00000000..d835d003
--- /dev/null
+++ b/benchmarks/models/knapsack.py
@@ -0,0 +1,26 @@
+"""Knapsack benchmark model: N binary variables."""
+
+from __future__ import annotations
+
+import numpy as np
+
+import linopy
+
+LABEL = "knapsack N={n}"
+SIZES = [{"n": n} for n in [100, 1_000, 10_000, 100_000, 1_000_000, 5_000_000]]
+QUICK_SIZES = [{"n": n} for n in [100, 1_000, 10_000]]
+DESCRIPTION = "N binary variables — integer programming stress test"
+
+
+def build(n: int) -> linopy.Model:
+    """Build a knapsack model with N items."""
+    rng = np.random.default_rng(42)
+    weights = rng.integers(1, 100, size=n)
+    values = rng.integers(1, 100, size=n)
+    capacity = int(weights.sum() * 0.5)
+
+    m = linopy.Model()
+    x = m.add_variables(coords=[range(n)], dims=["item"], binary=True, name="x")
+    m.add_constraints((x * weights).sum() <= capacity, name="capacity")
+    m.add_objective(-(x * values).sum())
+    return m
diff --git a/benchmarks/models/pypsa_scigrid.py b/benchmarks/models/pypsa_scigrid.py
new file mode 100644
index 00000000..64ab9754
--- /dev/null
+++ b/benchmarks/models/pypsa_scigrid.py
@@ -0,0 +1,26 @@
+"""PyPSA SciGrid-DE benchmark model."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import linopy
+
+LABEL = "pypsa snapshots={snapshots}"
+SIZES = [{"snapshots": s} for s in [10, 50, 100, 200]]
+QUICK_SIZES = [{"snapshots": s} for s in [10, 50]]
+DESCRIPTION = "Real power system model from PyPSA SciGrid-DE"
+
+
+def build(snapshots: int = 100) -> linopy.Model | None:
+    """Build PyPSA SciGrid model. Returns None if pypsa not installed."""
+    try:
+        import pypsa
+    except ImportError:
+        return None
+
+    n = pypsa.examples.scigrid_de()
+    n.set_snapshots(n.snapshots[:snapshots])
+    n.optimize.create_model()
+    return n.model
diff --git a/benchmarks/run.py b/benchmarks/run.py
new file mode 100644
index 00000000..6a831e2b
--- /dev/null
+++ b/benchmarks/run.py
@@ -0,0 +1,128 @@
+"""Benchmark orchestrator — main entry point for running benchmarks."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from benchmarks.models import get_model, list_models
+from benchmarks.runners import get_runner, list_phases
+
+
+def run_single(
+    model_name: str,
+    phase: str,
+    name: str = "dev",
+    iterations: int = 30,
+    quick: bool = False,
+    output_dir: str = "benchmarks/results",
+) -> dict:
+    """Run one model x one phase, save JSON, return results."""
+    model_mod = get_model(model_name)
+    runner = get_runner(phase)
+    sizes = (
+        model_mod.QUICK_SIZES
+        if quick and hasattr(model_mod, "QUICK_SIZES")
+        else model_mod.SIZES
+    )
+
+    results = {
+        "name": name,
+        "model": model_name,
+        "phase": phase,
+        "runs": [],
+    }
+
+    for kwargs in sizes:
+        desc = model_mod.LABEL.format(**kwargs)
+        print(f" {desc} ... ", end="", flush=True)
", end="", flush=True) + res = runner.run( + name=name, + builder=model_mod.build, + builder_args=kwargs, + iterations=iterations, + ) + if res is None: + print("skipped") + continue + results["runs"].append(res) + # Print a compact summary + summary_parts = [] + for key, val in res.items(): + if key in ("phase", "name", "params", "iterations"): + continue + if isinstance(val, float): + summary_parts.append(f"{key}={val:.3f}") + elif isinstance(val, int): + summary_parts.append(f"{key}={val}") + print(", ".join(summary_parts)) + + # Save + out_path = Path(output_dir) + out_path.mkdir(parents=True, exist_ok=True) + filename = out_path / f"{name}_{model_name}_{phase}.json" + with open(filename, "w") as f: + json.dump(results, f, indent=2) + print(f" Saved: {filename}") + return results + + +def run_phase( + phase: str, + name: str = "dev", + iterations: int = 30, + quick: bool = False, + output_dir: str = "benchmarks/results", +) -> list[dict]: + """Run all models for one phase.""" + all_results = [] + for model_name in list_models(): + print(f"\n[{phase}] Model: {model_name}") + res = run_single( + model_name, + phase, + name=name, + iterations=iterations, + quick=quick, + output_dir=output_dir, + ) + all_results.append(res) + return all_results + + +def run_all( + name: str = "dev", + iterations: int = 30, + quick: bool = False, + output_dir: str = "benchmarks/results", +) -> list[dict]: + """Run all phases x all models.""" + all_results = [] + for phase in list_phases(): + print(f"\n{'=' * 60}") + print(f"Phase: {phase}") + print(f"{'=' * 60}") + results = run_phase( + phase, + name=name, + iterations=iterations, + quick=quick, + output_dir=output_dir, + ) + all_results.extend(results) + return all_results + + +def list_available() -> None: + """Print available models and phases.""" + print("Models:") + for name in list_models(): + mod = get_model(name) + desc = getattr(mod, "DESCRIPTION", "") + print(f" {name:20s} {desc}") + + print("\nPhases:") + for phase in list_phases(): + runner = get_runner(phase) + doc = (runner.run.__doc__ or "").strip().split("\n")[0] + print(f" {phase:20s} {doc}") diff --git a/benchmarks/runners/__init__.py b/benchmarks/runners/__init__.py new file mode 100644 index 00000000..ae3530e3 --- /dev/null +++ b/benchmarks/runners/__init__.py @@ -0,0 +1,21 @@ +"""Runner registry for benchmarks.""" + +from __future__ import annotations + +from benchmarks.runners import build, lp_write, memory + +_RUNNERS = { + "build": build, + "memory": memory, + "lp_write": lp_write, +} + + +def get_runner(phase: str): + """Return a runner module by phase name.""" + return _RUNNERS[phase] + + +def list_phases() -> list[str]: + """Return sorted list of available phase names.""" + return sorted(_RUNNERS) diff --git a/benchmarks/runners/build.py b/benchmarks/runners/build.py new file mode 100644 index 00000000..ddbac59e --- /dev/null +++ b/benchmarks/runners/build.py @@ -0,0 +1,57 @@ +"""Build runner: measures model construction speed.""" + +from __future__ import annotations + +import gc +import time + +import numpy as np + +PHASE = "build" + + +def run( + name: str, + builder, + builder_args: dict, + iterations: int = 30, + **kwargs, +) -> dict | None: + """ + Time model construction over multiple iterations. + + Returns dict with median, q25, q75 build times and model stats. 
+ """ + # Warmup + model = builder(**builder_args) + if model is None: + return None + del model + gc.collect() + + times = [] + nvars = 0 + ncons = 0 + + for _ in range(iterations): + gc.collect() + t0 = time.perf_counter() + model = builder(**builder_args) + elapsed = time.perf_counter() - t0 + times.append(elapsed) + nvars = int(getattr(model, "nvars", 0)) + ncons = int(getattr(model, "ncons", 0)) + del model + + times_arr = np.array(times) + return { + "phase": PHASE, + "name": name, + "params": builder_args, + "iterations": iterations, + "build_time_median_s": float(np.median(times_arr)), + "build_time_q25_s": float(np.percentile(times_arr, 25)), + "build_time_q75_s": float(np.percentile(times_arr, 75)), + "nvars": nvars, + "ncons": ncons, + } diff --git a/benchmarks/runners/lp_write.py b/benchmarks/runners/lp_write.py new file mode 100644 index 00000000..f681699e --- /dev/null +++ b/benchmarks/runners/lp_write.py @@ -0,0 +1,60 @@ +"""LP write runner: measures LP file writing speed.""" + +from __future__ import annotations + +import gc +import tempfile +import time +from pathlib import Path + +import numpy as np + +PHASE = "lp_write" + + +def run( + name: str, + builder, + builder_args: dict, + iterations: int = 10, + **kwargs, +) -> dict | None: + """ + Time LP file writing over multiple iterations. + + Builds the model once, then times repeated LP file writes. + Returns dict with median, q25, q75 write times. + """ + model = builder(**builder_args) + if model is None: + return None + + nvars = int(getattr(model, "nvars", 0)) + ncons = int(getattr(model, "ncons", 0)) + + times = [] + with tempfile.TemporaryDirectory() as tmpdir: + lp_path = Path(tmpdir) / "model.lp" + + # Warmup + model.to_file(lp_path, progress=False) + + for _ in range(iterations): + gc.collect() + t0 = time.perf_counter() + model.to_file(lp_path, progress=False) + elapsed = time.perf_counter() - t0 + times.append(elapsed) + + times_arr = np.array(times) + return { + "phase": PHASE, + "name": name, + "params": builder_args, + "iterations": iterations, + "write_time_median_s": float(np.median(times_arr)), + "write_time_q25_s": float(np.percentile(times_arr, 25)), + "write_time_q75_s": float(np.percentile(times_arr, 75)), + "nvars": nvars, + "ncons": ncons, + } diff --git a/benchmarks/runners/memory.py b/benchmarks/runners/memory.py new file mode 100644 index 00000000..951230b8 --- /dev/null +++ b/benchmarks/runners/memory.py @@ -0,0 +1,70 @@ +"""Memory runner: measures peak memory during model construction.""" + +from __future__ import annotations + +import gc +import tracemalloc + +import numpy as np + +PHASE = "memory" + + +def run( + name: str, + builder, + builder_args: dict, + iterations: int = 5, + **kwargs, +) -> dict | None: + """ + Measure peak memory via tracemalloc over multiple iterations. + + Uses fewer iterations by default since memory measurement is slower. + Returns dict with median/max peak memory and model stats. 
+ """ + # Warmup + model = builder(**builder_args) + if model is None: + return None + del model + gc.collect() + + peaks = [] + nvars = 0 + ncons = 0 + + for _ in range(iterations): + gc.collect() + if tracemalloc.is_tracing(): + tracemalloc.stop() + tracemalloc.start() + tracemalloc.reset_peak() + + model = builder(**builder_args) + + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + if model is None: + continue + + nvars = int(getattr(model, "nvars", 0)) + ncons = int(getattr(model, "ncons", 0)) + peaks.append(peak / 1e6) # bytes to MB + del model + + if not peaks: + return None + + peaks_arr = np.array(peaks) + return { + "phase": PHASE, + "name": name, + "params": builder_args, + "iterations": iterations, + "peak_memory_median_mb": float(np.median(peaks_arr)), + "peak_memory_max_mb": float(np.max(peaks_arr)), + "nvars": nvars, + "ncons": ncons, + } diff --git a/justfile b/justfile new file mode 100644 index 00000000..501fd7be --- /dev/null +++ b/justfile @@ -0,0 +1,104 @@ +default_iterations := "10" +default_model := "basic" +default_phase := "all" +results_dir := "benchmarks/results" + +[group('benchmark')] +all name iterations=default_iterations: + python -c "from benchmarks.run import run_all; run_all(name='{{name}}', iterations={{iterations}}, output_dir='{{results_dir}}')" + +[group('benchmark')] +model name model phase=default_phase iterations=default_iterations quick="False": + python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', name='{{name}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" + +[group('benchmark')] +quick name="quick": + just _run basic build {{name}} 5 True + just _run basic memory {{name}} 5 True + just _run basic lp_write {{name}} 5 True + +[group('benchmark')] +compare ref="master" model=default_model phase=default_phase iterations=default_iterations quick="False": + #!/usr/bin/env bash + set -euo pipefail + home_branch=$(git rev-parse --abbrev-ref HEAD) + home_name=$(echo "$home_branch" | tr '/:' '--') + ref_name=$(echo "{{ref}}" | tr '/:' '--') + + if [[ "{{phase}}" == "all" ]]; then + phases="build memory lp_write" + else + phases="{{phase}}" + fi + + if [[ "{{model}}" == "all" ]]; then + models=$(python -c "from benchmarks.models import list_models; print(' '.join(list_models()))") + else + models="{{model}}" + fi + + ref="{{ref}}" + if [[ "$ref" == *:* ]]; then + remote="${ref%%:*}" + branch="${ref#*:}" + git remote get-url "$remote" 2>/dev/null || git remote add "$remote" "https://github.com/$remote/linopy.git" + git fetch "$remote" "$branch" --no-tags --no-recurse-submodules + checkout_ref="FETCH_HEAD" + else + git fetch origin --no-tags --no-recurse-submodules 2>&1 || true + checkout_ref="$ref" + fi + + echo ">>> Checking out $checkout_ref ..." + git checkout --detach "$checkout_ref" + pip install -e . --quiet 2>/dev/null || true + + echo ">>> Benchmarking $ref_name (models=$models, phases=$phases, quick={{quick}}) ..." + for model in $models; do + for phase in $phases; do + just _run "$model" "$phase" "$ref_name" "{{iterations}}" "{{quick}}" + done + done + + echo ">>> Returning to $home_branch ..." + git checkout "$home_branch" + pip install -e . --quiet 2>/dev/null || true + + echo ">>> Benchmarking $home_name (models=$models, phases=$phases, quick={{quick}}) ..." + for model in $models; do + for phase in $phases; do + just _run "$model" "$phase" "$home_name" "{{iterations}}" "{{quick}}" + done + done + + echo ">>> Comparing results ..." 
+    for model in $models; do
+        for phase in $phases; do
+            old="benchmarks/results/${ref_name}_${model}_${phase}.json"
+            new="benchmarks/results/${home_name}_${model}_${phase}.json"
+            if [[ -f "$old" && -f "$new" ]]; then
+                python -c "from benchmarks.compare import compare; compare('$old', '$new')"
+            fi
+        done
+    done
+    echo ">>> Done."
+
+[group('benchmark')]
+compare-all ref="master" iterations=default_iterations:
+    just compare {{ref}} all all {{iterations}} False
+
+[group('benchmark')]
+compare-quick ref="master":
+    just compare {{ref}} basic all 5 True
+
+[group('benchmark')]
+plot +files:
+    python -c "import sys; from benchmarks.compare import compare; compare(*sys.argv[1:])" {{files}}
+
+[group('benchmark')]
+list:
+    python -c "from benchmarks.run import list_available; list_available()"
+
+[private]
+_run model phase name iterations quick:
+    python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', name='{{name}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')"
diff --git a/pyproject.toml b/pyproject.toml
index 52d5e3d5..e6e0ac7f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -99,7 +99,7 @@ version_scheme = "no-guess-dev"
 
 [tool.pytest.ini_options]
 testpaths = ["test"]
-norecursedirs = ["dev-scripts", "doc", "examples", "benchmark"]
+norecursedirs = ["dev-scripts", "doc", "examples", "benchmark", "benchmarks"]
 markers = [
     "gpu: marks tests as requiring GPU hardware (deselect with '-m \"not gpu\"')",
 ]
@@ -112,7 +112,7 @@ omit = ["test/*"]
 exclude_also = ["if TYPE_CHECKING:"]
 
 [tool.mypy]
-exclude = ['dev/*', 'examples/*', 'benchmark/*', 'doc/*']
+exclude = ['dev/*', 'examples/*', 'benchmark/*', 'benchmarks/*', 'doc/*']
 ignore_missing_imports = true
 no_implicit_optional = true
 warn_unused_ignores = true
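For reference, the `just` recipes are thin wrappers around the Python entry points added above; the same comparison can be driven directly from Python. A sketch (it assumes both result files exist and, unlike `just compare`, does not check out a second branch):

```python
from benchmarks.compare import compare
from benchmarks.run import run_single

# Benchmark the basic model's build phase twice under different labels ...
run_single("basic", "build", name="master", iterations=10)
run_single("basic", "build", name="my-branch", iterations=10)

# ... then plot the comparison; the PNG lands next to the JSON result files.
compare(
    "benchmarks/results/master_basic_build.json",
    "benchmarks/results/my-branch_basic_build.json",
)
```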