diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f525644..b5bc9b8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -105,7 +105,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install -r requirements-lock.txt - python -m pip install 'pytest>=8,<9' 'hypothesis>=6.100,<7' + python -m pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0' 'hypothesis>=6.100,<7' - name: Run unittest suite run: python -m unittest discover tests -v @@ -114,7 +114,8 @@ jobs: # Pytest fixtures (tests/conftest.py) build a temp workspaceStorage and # exercise Flask routes via app.test_client(). Only listed files — not # `pytest tests/` — to avoid re-collecting unittest.TestCase classes above. - run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py -v --tb=short + # -o addopts= avoids inheriting benchmark-only options from pyproject.toml. + run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py -v --tb=short -o addopts= # ── PyInstaller desktop build (Windows only, once per workflow) ──────── # Closes #44. 
Builds the onedir bundle and smoke-tests --help so the @@ -213,3 +214,41 @@ jobs: --verbose \ --redact \ --exit-code 1 + + # ── Performance benchmarks: summary cache (issue #7) ─────────────────────── + benchmarks: + name: Performance benchmarks (gated) + needs: [unittest] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + + - name: Install runtime + benchmark dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -r requirements-lock.txt + python -m pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0' + + - name: Run summary-cache benchmarks + run: > + python -m pytest tests/benchmarks/ + --benchmark-only + --benchmark-json=benchmark-results.json + --benchmark-columns=min,max,mean,stddev,rounds + -o addopts= + + - name: Regression gate + run: python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json + + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + if: always() + with: + name: benchmark-results + path: benchmark-results.json diff --git a/.gitignore b/.gitignore index 5fd078f..f204306 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,5 @@ Thumbs.db htmlcov/ coverage.xml .hypothesis/ +benchmark-results.json +benchmarks/_raw.json diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json new file mode 100644 index 0000000..d664af3 --- /dev/null +++ b/benchmarks/baselines.json @@ -0,0 +1,15 @@ +{ + "_note": "Gated means from ubuntu-latest CI benchmark-results.json (PR #120, run 28123677675). 
Refresh: pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=", + "updated": "2026-06-24T19:20:27Z", + "machine": "Linux", + "groups": { + "summary-cache": { + "test_summary_cache_hit": 6.3e-05, + "test_summary_cache_miss": 6.3e-05, + "test_fingerprint_workspace_entries[10]": 0.001844, + "test_fingerprint_workspace_entries[50]": 0.007759, + "test_fingerprint_workspace_entries[200]": 0.022231, + "test_summary_cache_round_trip": 0.000351 + } + } +} diff --git a/pyproject.toml b/pyproject.toml index a49ac40..6c37998 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,10 +31,19 @@ desktop = ["pywebview>=5.0,<6"] # Development tooling: testing + type checking. dev = [ "pytest>=8,<9", + "pytest-benchmark>=4,<5", "mypy>=1.10,<2", "hypothesis>=6.100,<7", ] +[tool.pytest.ini_options] +pythonpath = ["."] +addopts = "--benchmark-disable" +testpaths = ["tests"] +markers = [ + "benchmark: performance benchmarks (pytest-benchmark)", +] + [project.scripts] # Primary CLI: export Cursor chat histories to Markdown / zip. # Usage: cursor-chat-export [--since all|last] [--out DIR] [--no-zip] [--help] diff --git a/requirements-lock.txt b/requirements-lock.txt index 4a65662..beaa107 100644 --- a/requirements-lock.txt +++ b/requirements-lock.txt @@ -6,7 +6,7 @@ # Lock is generated on Linux (CI / update-lock.yml). Windows-only transitives (e.g. # colorama via click) are omitted — pip still installs them on Windows when needed. 
"""Compare pytest-benchmark JSON output against stored baselines."""

import argparse
import json
import math
import sys
from pathlib import Path
from typing import Any

# Maximum allowed current/baseline ratio before a gated benchmark fails.
THRESHOLD = 1.20

# Relative slack applied when a ratio sits on the threshold itself, so that
# floating-point rounding in ``cur / base`` cannot turn "exactly at the limit"
# into a spurious failure.
_BOUNDARY_REL_TOL = 1e-9


class BenchmarkDataError(ValueError):
    """Raised when benchmark JSON input is malformed or missing required fields."""


def normalize_benchmark_name(name: str) -> str:
    """Strip pytest file node prefix so baselines match short or full benchmark names.

    ``tests/test_x.py::test_a[p]`` becomes ``test_a[p]``. Names whose text
    before the first ``::`` does not end in ``.py`` are returned unchanged.
    """
    text = str(name)
    prefix, separator, suffix = text.partition("::")
    # Only strip module paths (.../test_foo.py::test_name); leave "::" inside
    # [param::value] intact.
    if separator and prefix.endswith(".py"):
        return suffix
    return text


def _read_json(path: Path) -> Any:
    """Parse *path* as UTF-8 JSON.

    Raises:
        BenchmarkDataError: the file cannot be read, is not valid UTF-8, or
            does not contain valid JSON.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except OSError as exc:
        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
    except UnicodeDecodeError as exc:
        # Not an OSError and not a JSONDecodeError; without this branch a
        # mis-encoded file would surface as a raw traceback.
        raise BenchmarkDataError(f"{path} is not valid UTF-8: {exc}") from exc
    try:
        return json.loads(text)
    except json.JSONDecodeError as exc:
        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc


def _parse_mean(value: Any) -> float:
    """Convert *value* to a timing mean that is safe to use in a ratio.

    ``json.loads`` accepts ``NaN`` / ``Infinity``, and every comparison against
    NaN is False, so such a value would slip through the regression gate.

    Raises:
        TypeError: *value* is not convertible to ``float``.
        ValueError: *value* is non-numeric text, NaN, infinite, or negative.
    """
    mean = float(value)
    if not math.isfinite(mean) or mean < 0:
        raise ValueError(f"mean must be finite and non-negative, got {value!r}")
    return mean


def load_results(results_path: str | Path) -> dict[str, float]:
    """Load current means from a pytest-benchmark ``--benchmark-json`` file.

    Args:
        results_path: Path to a JSON document with a top-level ``benchmarks``
            array whose entries carry ``name`` and ``stats.mean``.

    Returns:
        Mapping of normalized benchmark name to its mean.

    Raises:
        BenchmarkDataError: unreadable or malformed file, a mean that is not a
            finite non-negative number, or two entries that normalize to the
            same name.
    """
    path = Path(results_path)
    data = _read_json(path)
    try:
        benchmarks = data["benchmarks"]
    except (KeyError, TypeError) as exc:
        raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc
    if not isinstance(benchmarks, list):
        raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")

    results: dict[str, float] = {}
    for index, entry in enumerate(benchmarks):
        if not isinstance(entry, dict):
            raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
        try:
            raw_name = entry["name"]
            raw_mean = entry["stats"]["mean"]
        except (KeyError, TypeError) as exc:
            raise BenchmarkDataError(
                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
            ) from exc
        try:
            mean = _parse_mean(raw_mean)
        except (TypeError, ValueError) as exc:
            raise BenchmarkDataError(
                f"{path} benchmarks[{index}] 'stats.mean' must be a finite, "
                "non-negative number"
            ) from exc
        name = normalize_benchmark_name(str(raw_name))
        if name in results:
            raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r}")
        results[name] = mean
    return results


def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
    """Load gated baseline means from a baselines JSON file.

    The file must be an object with a ``groups`` object; each group maps
    benchmark names to means. Groups are flattened into one mapping.

    Returns:
        Mapping of normalized benchmark name to its baseline mean.

    Raises:
        BenchmarkDataError: unreadable or malformed file, a mean that is not a
            finite non-negative number, or a name that appears more than once
            after normalization.
    """
    path = Path(baselines_path)
    data = _read_json(path)
    if not isinstance(data, dict):
        raise BenchmarkDataError(f"{path} root value must be an object")

    if "groups" not in data:
        raise BenchmarkDataError(f"{path} missing required 'groups' key")
    groups = data["groups"]
    if not isinstance(groups, dict):
        raise BenchmarkDataError(f"{path} 'groups' must be an object")

    means: dict[str, float] = {}
    for group_name, value in groups.items():
        if not isinstance(value, dict):
            raise BenchmarkDataError(
                f"{path} groups[{group_name!r}] must be an object of benchmark means"
            )
        for name, mean in value.items():
            bench_name = normalize_benchmark_name(str(name))
            if bench_name in means:
                raise BenchmarkDataError(
                    f"{path} duplicate benchmark name {bench_name!r} across groups"
                )
            try:
                means[bench_name] = _parse_mean(mean)
            except (TypeError, ValueError) as exc:
                raise BenchmarkDataError(
                    f"{path} groups[{group_name!r}][{name!r}] is not a numeric mean"
                ) from exc
    return means


def check_regression(
    results_path: str | Path,
    baselines_path: str | Path,
    *,
    threshold: float = THRESHOLD,
) -> int:
    """Compare current results with baselines and report per-benchmark status.

    Every baseline entry is gated: it fails when it has no current result or
    when ``current / baseline`` exceeds *threshold*. A zero baseline is skipped
    with a warning. Results without a baseline are reported but not gated.

    Args:
        results_path: pytest-benchmark JSON output.
        baselines_path: Baselines JSON file.
        threshold: Maximum allowed current/baseline ratio; must be finite
            and positive.

    Returns:
        0 when every gated benchmark is present and within threshold,
        1 when any gated benchmark regresses or is missing.

    Raises:
        BenchmarkDataError: invalid *threshold* or malformed input files.
    """
    # A NaN threshold would make every ``ratio > threshold`` check False and
    # quietly disable the gate.
    if not math.isfinite(threshold) or threshold <= 0:
        raise BenchmarkDataError(
            f"threshold must be a finite, positive ratio, got {threshold!r}"
        )

    flat = load_results(results_path)
    baseline_means = load_baseline_means(baselines_path)

    failures: list[str] = []
    missing: list[str] = []
    for name, base in baseline_means.items():
        cur = flat.get(name)
        if cur is None:
            print(f"FAIL: no current result for gated baseline {name!r}")
            missing.append(name)
            continue
        if base == 0:
            print(f"WARN: baseline for {name!r} is zero; skipping ratio check")
            continue
        ratio = cur / base
        # Treat a ratio within rounding error of the threshold as "at the
        # threshold", which passes.
        regressed = ratio > threshold and not math.isclose(
            ratio, threshold, rel_tol=_BOUNDARY_REL_TOL
        )
        tag = "FAIL" if regressed else "ok"
        print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
        if regressed:
            failures.append(name)

    for name in flat:
        if name not in baseline_means:
            print(f"WARN: {name!r} has no baseline yet; not gated")

    if failures:
        print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}")
    if missing:
        print(f"\nMISSING: {len(missing)} gated benchmark(s) absent from current results")
    return 1 if failures or missing else 0


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Returns the ``check_regression`` status (0 or 1), or 2 after printing an
    ``ERROR:`` line to stderr when the inputs or threshold are invalid.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("results_path", help="pytest-benchmark --benchmark-json output")
    parser.add_argument("baselines_path", help="path to benchmarks/baselines.json")
    parser.add_argument(
        "--threshold",
        type=float,
        default=THRESHOLD,
        help="fail when current mean exceeds baseline by more than this ratio (default: 1.20)",
    )
    args = parser.parse_args(argv)
    try:
        return check_regression(
            args.results_path,
            args.baselines_path,
            threshold=args.threshold,
        )
    except BenchmarkDataError as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2


if __name__ == "__main__":
    sys.exit(main())
"""Synthetic workspace trees for summary-cache performance benchmarks."""

from pathlib import Path
from typing import Any

import pytest

from services import summary_cache
from services.summary_cache import fingerprint_workspace_storage


def _write_entry(workspace_root: Path, index: int) -> dict[str, Any]:
    """Create one ``ws_NNNN`` directory with its two files and describe it."""
    entry_name = f"ws_{index:04d}"
    entry_dir = workspace_root / entry_name
    entry_dir.mkdir(parents=True, exist_ok=True)
    (entry_dir / "state.vscdb").write_bytes(b"bench")
    json_path = entry_dir / "workspace.json"
    json_path.write_text('{"folder": "/bench"}', encoding="utf-8")
    return {"name": entry_name, "workspaceJsonPath": str(json_path)}


def make_workspace_entries(workspace_root: Path, count: int) -> list[dict[str, Any]]:
    """Create *count* workspace directories under *workspace_root*.

    Each directory ``ws_0000``, ``ws_0001``, ... gets a ``state.vscdb`` and a
    ``workspace.json`` file. Returns one dict per directory with its ``name``
    and the ``workspaceJsonPath`` string, in creation order.
    """
    return [_write_entry(workspace_root, index) for index in range(count)]


@pytest.fixture
def summary_cache_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
    """Point the summary-cache module at a fresh per-test ``cache`` directory.

    Patches ``CACHE_DIR``, ``PROJECTS_CACHE_FILE`` and
    ``COMPOSER_MAP_CACHE_FILE`` on ``services.summary_cache`` and returns the
    directory. NOTE(review): the original note says tab-summary paths also
    derive from ``CACHE_DIR`` via ``_tab_summaries_path``; that helper is not
    visible here.
    """
    cache_dir = tmp_path / "cache"
    cache_dir.mkdir()
    overrides = {
        "CACHE_DIR": cache_dir,
        "PROJECTS_CACHE_FILE": cache_dir / "projects.json",
        "COMPOSER_MAP_CACHE_FILE": cache_dir / "composer-id-to-ws.json",
    }
    for attribute, replacement in overrides.items():
        monkeypatch.setattr(summary_cache, attribute, replacement)
    return cache_dir


@pytest.fixture
def sample_projects() -> list[dict[str, Any]]:
    """A one-element project list used as the cached payload."""
    project = {
        "id": "ws_0000",
        "name": "Bench Project",
        "conversationCount": 3,
        "lastModified": "2026-06-24T00:00:00Z",
    }
    return [project]


@pytest.fixture
def synthetic_workspace(tmp_path: Path, request: pytest.FixtureRequest) -> tuple[str, list[dict[str, Any]]]:
    """Return ``(workspace_path, entries)`` for a synthetic workspaceStorage tree.

    Builds 10 entries by default. To change the count, parametrize this
    fixture indirectly, e.g.
    ``@pytest.mark.parametrize("synthetic_workspace", [50], indirect=True)``.
    """
    entry_count = getattr(request, "param", 10)
    storage_root = tmp_path / "workspaceStorage"
    storage_root.mkdir()
    return str(storage_root), make_workspace_entries(storage_root, entry_count)


@pytest.fixture
def workspace_fingerprint(synthetic_workspace: tuple[str, list[dict[str, Any]]]) -> dict[str, Any]:
    """Fingerprint of the synthetic workspace with no global DB and no rules."""
    storage_path, entries = synthetic_workspace
    return fingerprint_workspace_storage(
        storage_path,
        entries,
        global_db_path=None,
        rules=[],
    )


@pytest.fixture
def stale_fingerprint(workspace_fingerprint: dict[str, Any]) -> dict[str, Any]:
    """Copy of the workspace fingerprint with ``rules_digest`` overwritten.

    Intended to differ from the stored fingerprint so a cache lookup misses.
    """
    altered = dict(workspace_fingerprint)
    altered["rules_digest"] = "deadbeefdeadbeef"
    return altered
"""pytest-benchmark coverage for services/summary_cache.py hot paths."""

from pathlib import Path
from typing import Any

import pytest

from services.summary_cache import (
    fingerprint_workspace_storage,
    get_cached_projects,
    set_cached_projects,
)

# All benchmarks in this module report under the same pytest-benchmark group.
summary_cache_group = pytest.mark.benchmark(group="summary-cache")

# Entry counts fed to the ``synthetic_workspace`` fixture via indirect params.
WORKSPACE_ENTRY_COUNTS = [10, 50, 200]


@summary_cache_group
def test_summary_cache_hit(
    benchmark,
    summary_cache_dir: Path,
    workspace_fingerprint: dict[str, Any],
    sample_projects: list[dict[str, Any]],
) -> None:
    """Time a lookup using the same fingerprint the cache was stored with."""
    set_cached_projects(workspace_fingerprint, sample_projects, [])
    benchmark(get_cached_projects, workspace_fingerprint)


@summary_cache_group
def test_summary_cache_miss(
    benchmark,
    summary_cache_dir: Path,
    workspace_fingerprint: dict[str, Any],
    stale_fingerprint: dict[str, Any],
    sample_projects: list[dict[str, Any]],
) -> None:
    """Time a lookup using a fingerprint that differs from the stored one."""
    set_cached_projects(workspace_fingerprint, sample_projects, [])
    benchmark(get_cached_projects, stale_fingerprint)


@summary_cache_group
@pytest.mark.parametrize(
    "synthetic_workspace",
    WORKSPACE_ENTRY_COUNTS,
    indirect=True,
)
def test_fingerprint_workspace_entries(
    benchmark,
    synthetic_workspace: tuple[str, list[dict[str, Any]]],
) -> None:
    """Time fingerprinting for each synthetic workspace size."""
    storage_path, entries = synthetic_workspace
    benchmark(
        fingerprint_workspace_storage,
        storage_path,
        entries,
        global_db_path=None,
        rules=[],
    )


@summary_cache_group
def test_summary_cache_round_trip(
    benchmark,
    summary_cache_dir: Path,
    workspace_fingerprint: dict[str, Any],
    sample_projects: list[dict[str, Any]],
) -> None:
    """Time a store followed immediately by a lookup with the same fingerprint."""
    fingerprint = workspace_fingerprint
    payload = sample_projects

    def _store_then_load() -> None:
        set_cached_projects(fingerprint, payload, [])
        get_cached_projects(fingerprint)

    benchmark(_store_then_load)
"""Tests for scripts/check_benchmark_regression.py."""

import json

import pytest

from scripts.check_benchmark_regression import (
    BenchmarkDataError,
    check_regression,
    load_baseline_means,
    load_results,
    main,
    normalize_benchmark_name,
)

GATED_BENCH = "test_summary_cache_hit"


def _write_results(path, benchmarks: list[dict]) -> None:
    """Write *benchmarks* under a top-level ``benchmarks`` key at *path*."""
    payload = {"benchmarks": benchmarks}
    path.write_text(json.dumps(payload, indent=2), encoding="utf-8")


def _write_baselines(path, groups: dict[str, dict[str, float]]) -> None:
    """Write *groups* under a top-level ``groups`` key at *path*."""
    payload = {"groups": groups}
    path.write_text(json.dumps(payload, indent=2), encoding="utf-8")


def _bench(name: str, mean: float) -> dict:
    """Build one results entry with only the fields the script reads."""
    return {"name": name, "stats": {"mean": mean}}


def _gate_inputs(tmp_path, benchmarks: list[dict], gated_mean: float):
    """Write results plus a single ``GATED_BENCH`` baseline; return both paths."""
    results_file = tmp_path / "results.json"
    baselines_file = tmp_path / "baselines.json"
    _write_results(results_file, benchmarks)
    _write_baselines(baselines_file, {"summary-cache": {GATED_BENCH: gated_mean}})
    return results_file, baselines_file


def test_normalize_benchmark_name_strips_module_prefix() -> None:
    node_id = "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_hit"
    assert normalize_benchmark_name(node_id) == "test_summary_cache_hit"
    assert normalize_benchmark_name("test_summary_cache_hit") == "test_summary_cache_hit"


def test_normalize_benchmark_name_preserves_colons_in_param_values() -> None:
    short_name = "test_x[param::v]"
    node_id = f"tests/benchmarks/test_x.py::{short_name}"
    assert normalize_benchmark_name(short_name) == short_name
    assert normalize_benchmark_name(node_id) == short_name


def test_load_results_normalizes_full_node_id(tmp_path) -> None:
    results_file = tmp_path / "results.json"
    node_id = "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_hit"
    _write_results(results_file, [_bench(node_id, 0.0001)])

    loaded = load_results(results_file)
    assert loaded["test_summary_cache_hit"] == pytest.approx(0.0001)


def test_missing_baseline_warns_without_failing(
    tmp_path, capsys: pytest.CaptureFixture[str]
) -> None:
    results_file, baselines_file = _gate_inputs(
        tmp_path,
        [_bench("test_new_bench", 0.01), _bench(GATED_BENCH, 0.0001)],
        0.0001,
    )

    assert check_regression(results_file, baselines_file) == 0
    captured = capsys.readouterr().out
    assert "WARN: 'test_new_bench' has no baseline yet" in captured


def test_regression_over_threshold_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
    results_file, baselines_file = _gate_inputs(
        tmp_path, [_bench(GATED_BENCH, 0.00025)], 0.0002
    )

    assert check_regression(results_file, baselines_file) == 1
    captured = capsys.readouterr().out
    assert "REGRESSION" in captured


def test_within_threshold_passes(tmp_path) -> None:
    results_file, baselines_file = _gate_inputs(
        tmp_path, [_bench(GATED_BENCH, 0.00022)], 0.0002
    )

    assert check_regression(results_file, baselines_file) == 0


def test_load_results_rejects_malformed_json(tmp_path) -> None:
    broken = tmp_path / "bad.json"
    broken.write_text("{not json", encoding="utf-8")
    with pytest.raises(BenchmarkDataError, match="invalid JSON"):
        load_results(broken)


def test_load_results_requires_benchmarks_array(tmp_path) -> None:
    empty_object = tmp_path / "results.json"
    empty_object.write_text("{}", encoding="utf-8")
    with pytest.raises(BenchmarkDataError, match="'benchmarks' array"):
        load_results(empty_object)


def test_load_results_rejects_missing_file(tmp_path) -> None:
    absent = tmp_path / "missing.json"
    with pytest.raises(BenchmarkDataError, match="cannot read"):
        load_results(absent)


def test_zero_baseline_skips_ratio_check(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
    results_file, baselines_file = _gate_inputs(
        tmp_path, [_bench(GATED_BENCH, 0.00025)], 0.0
    )

    assert check_regression(results_file, baselines_file) == 0
    assert f"baseline for '{GATED_BENCH}' is zero" in capsys.readouterr().out


def test_exactly_at_threshold_passes(tmp_path) -> None:
    results_file, baselines_file = _gate_inputs(
        tmp_path, [_bench(GATED_BENCH, 0.00024)], 0.0002
    )

    assert check_regression(results_file, baselines_file) == 0


def test_missing_current_result_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
    results_file, baselines_file = _gate_inputs(tmp_path, [], 0.0002)

    assert check_regression(results_file, baselines_file) == 1
    captured = capsys.readouterr().out
    assert "MISSING" in captured
    assert "no current result for gated baseline" in captured


def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
    malformed = tmp_path / "bad.json"
    malformed.write_text("{}", encoding="utf-8")
    baselines_file = tmp_path / "baselines.json"
    _write_baselines(baselines_file, {"summary-cache": {GATED_BENCH: 0.0002}})

    exit_code = main([str(malformed), str(baselines_file)])
    assert exit_code == 2
    assert "ERROR:" in capsys.readouterr().err


def test_duplicate_baseline_name_raises(tmp_path) -> None:
    baselines_file = tmp_path / "baselines.json"
    colliding_groups = {
        "summary-cache": {GATED_BENCH: 0.0002},
        "export": {GATED_BENCH: 0.0003},
    }
    _write_baselines(baselines_file, colliding_groups)

    with pytest.raises(BenchmarkDataError, match="duplicate benchmark name"):
        load_baseline_means(baselines_file)


def test_load_baseline_means_rejects_non_dict_group(tmp_path) -> None:
    baselines_file = tmp_path / "baselines.json"
    malformed_groups = {"groups": {"summary-cache": "not-a-dict"}}
    baselines_file.write_text(json.dumps(malformed_groups), encoding="utf-8")

    with pytest.raises(BenchmarkDataError, match="must be an object"):
        load_baseline_means(baselines_file)