-
Notifications
You must be signed in to change notification settings - Fork 1
test: add summary-cache pytest-benchmark suite with CI regression gate #120
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
c126227
e4d79a6
83bb248
b1c2f93
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -105,7 +105,7 @@ jobs: | |
| run: | | ||
| python -m pip install --upgrade pip | ||
| python -m pip install -r requirements-lock.txt | ||
| python -m pip install 'pytest>=8,<9' 'hypothesis>=6.100,<7' | ||
| python -m pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0' 'hypothesis>=6.100,<7' | ||
|
|
||
| - name: Run unittest suite | ||
| run: python -m unittest discover tests -v | ||
|
|
@@ -114,7 +114,8 @@ jobs: | |
| # Pytest fixtures (tests/conftest.py) build a temp workspaceStorage and | ||
| # exercise Flask routes via app.test_client(). Only listed files — not | ||
| # `pytest tests/` — to avoid re-collecting unittest.TestCase classes above. | ||
| run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py -v --tb=short | ||
| # -o addopts= avoids inheriting benchmark-only options from pyproject.toml. | ||
| run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py -v --tb=short -o addopts= | ||
|
|
||
| # ── PyInstaller desktop build (Windows only, once per workflow) ──────── | ||
| # Closes #44. Builds the onedir bundle and smoke-tests --help so the | ||
|
|
@@ -213,3 +214,41 @@ jobs: | |
| --verbose \ | ||
| --redact \ | ||
| --exit-code 1 | ||
|
|
||
| # ── Performance benchmarks: summary cache (issue #7) ─────────────────────── | ||
| benchmarks: | ||
| name: Performance benchmarks (gated) | ||
| needs: [unittest] | ||
| runs-on: ubuntu-latest | ||
| steps: | ||
| - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 | ||
| with: | ||
| persist-credentials: false | ||
|
|
||
| - name: Set up Python | ||
| uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 | ||
| with: | ||
| python-version: "3.12" | ||
|
|
||
| - name: Install runtime + benchmark dependencies | ||
| run: | | ||
| python -m pip install --upgrade pip | ||
| python -m pip install -r requirements-lock.txt | ||
| python -m pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0' | ||
|
|
||
| - name: Run summary-cache benchmarks | ||
| run: > | ||
| python -m pytest tests/benchmarks/ | ||
| --benchmark-only | ||
| --benchmark-json=benchmark-results.json | ||
| --benchmark-columns=min,max,mean,stddev,rounds | ||
| -o addopts= | ||
|
Comment on lines
+239
to
+245
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider pinning --benchmark-min-rounds (e.g. 5) to reduce CI timing variance near the 20% threshold, especially for sub-millisecond hit/miss benchmarks. |
||
|
|
||
| - name: Regression gate | ||
| run: python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json | ||
|
|
||
| - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 | ||
| if: always() | ||
| with: | ||
| name: benchmark-results | ||
| path: benchmark-results.json | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -44,3 +44,5 @@ Thumbs.db | |
| htmlcov/ | ||
| coverage.xml | ||
| .hypothesis/ | ||
| benchmark-results.json | ||
| benchmarks/_raw.json | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| { | ||
| "_note": "Gated means from ubuntu-latest CI benchmark-results.json (PR #120, run 28123677675). Refresh: pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=", | ||
|
Comment on lines
+1
to
+2
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. _note refresh command is truncated (-o addopts= with no value). Complete the example: -o addopts= --benchmark-json=benchmark-results.json. |
||
| "updated": "2026-06-24T19:20:27Z", | ||
| "machine": "Linux", | ||
| "groups": { | ||
| "summary-cache": { | ||
| "test_summary_cache_hit": 6.3e-05, | ||
| "test_summary_cache_miss": 6.3e-05, | ||
| "test_fingerprint_workspace_entries[10]": 0.001844, | ||
| "test_fingerprint_workspace_entries[50]": 0.007759, | ||
| "test_fingerprint_workspace_entries[200]": 0.022231, | ||
| "test_summary_cache_round_trip": 0.000351 | ||
| } | ||
| } | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,10 +31,19 @@ desktop = ["pywebview>=5.0,<6"] | |
| # Development tooling: testing + type checking. | ||
| dev = [ | ||
| "pytest>=8,<9", | ||
| "pytest-benchmark>=4,<5", | ||
| "mypy>=1.10,<2", | ||
| "hypothesis>=6.100,<7", | ||
| ] | ||
|
|
||
| [tool.pytest.ini_options] | ||
| pythonpath = ["."] | ||
| addopts = "--benchmark-disable" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PR description says --benchmark-skip but config uses --benchmark-disable. With disable, benchmark tests still run once on a bare pytest (functional smoke, ~0.4s). Either align the docs or switch to --benchmark-skip if the goal is zero benchmark execution during normal pytest. |
||
| testpaths = ["tests"] | ||
|
Comment on lines
+39
to
+42
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. New [tool.pytest.ini_options] with addopts = "--benchmark-disable" requires pytest-benchmark installed for any default pytest invocation. CI handles this; document in PR/issue that devs need pip install -e ".[dev]" (or pytest-benchmark) before running pytest without -o addopts=. |
||
| markers = [ | ||
| "benchmark: performance benchmarks (pytest-benchmark)", | ||
| ] | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
| [project.scripts] | ||
| # Primary CLI: export Cursor chat histories to Markdown / zip. | ||
| # Usage: cursor-chat-export [--since all|last] [--out DIR] [--no-zip] [--help] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,7 +6,7 @@ | |
| # Lock is generated on Linux (CI / update-lock.yml). Windows-only transitives (e.g. | ||
| # colorama via click) are omitted — pip still installs them on Windows when needed. | ||
| blinker==1.9.0 # via flask | ||
| click==8.4.1 # via flask | ||
| click==8.4.2 # via flask | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unrelated click==8.4.1 → 8.4.2 bump is out of scope for a benchmarks-only PR. Revert unless there's a documented dependency reason; otherwise it adds review nois |
||
| defusedxml==0.7.1 # via fpdf2 | ||
| flask==3.1.3 # via -r requirements.txt | ||
| fonttools==4.63.0 # via fpdf2 | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,163 @@ | ||
| """Compare pytest-benchmark JSON output against stored baselines.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import argparse | ||
| import json | ||
| import sys | ||
| from pathlib import Path | ||
|
|
||
| THRESHOLD = 1.20 | ||
|
|
||
|
|
||
| class BenchmarkDataError(ValueError): | ||
| """Raised when benchmark JSON input is malformed or missing required fields.""" | ||
|
|
||
|
|
||
| def normalize_benchmark_name(name: str) -> str: | ||
| """Strip pytest file node prefix so baselines match short or full benchmark names.""" | ||
| text = str(name) | ||
| if "::" not in text: | ||
| return text | ||
| prefix, _, suffix = text.partition("::") | ||
| # Only strip module paths (…/test_foo.py::test_name); leave "::" inside [param::value] intact. | ||
| if prefix.endswith(".py"): | ||
| return suffix | ||
| return text | ||
|
|
||
|
|
||
| def load_results(results_path: str | Path) -> dict[str, float]: | ||
| path = Path(results_path) | ||
| try: | ||
| data = json.loads(path.read_text(encoding="utf-8")) | ||
| except OSError as exc: | ||
| raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc | ||
| except json.JSONDecodeError as exc: | ||
| raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc | ||
| try: | ||
| benchmarks = data["benchmarks"] | ||
| except (KeyError, TypeError) as exc: | ||
| raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc | ||
| if not isinstance(benchmarks, list): | ||
| raise BenchmarkDataError(f"{path} 'benchmarks' must be an array") | ||
|
|
||
| results: dict[str, float] = {} | ||
| for index, entry in enumerate(benchmarks): | ||
| if not isinstance(entry, dict): | ||
| raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object") | ||
| try: | ||
| raw_name = entry["name"] | ||
| mean = float(entry["stats"]["mean"]) | ||
| except (KeyError, TypeError, ValueError) as exc: | ||
| raise BenchmarkDataError( | ||
| f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'" | ||
| ) from exc | ||
| name = normalize_benchmark_name(str(raw_name)) | ||
| if name in results: | ||
| raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r}") | ||
| results[name] = mean | ||
| return results | ||
|
|
||
|
|
||
| def load_baseline_means(baselines_path: str | Path) -> dict[str, float]: | ||
| path = Path(baselines_path) | ||
| try: | ||
| data = json.loads(path.read_text(encoding="utf-8")) | ||
| except OSError as exc: | ||
| raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc | ||
| except json.JSONDecodeError as exc: | ||
| raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc | ||
| if not isinstance(data, dict): | ||
| raise BenchmarkDataError(f"{path} root value must be an object") | ||
|
|
||
| if "groups" not in data: | ||
| raise BenchmarkDataError(f"{path} missing required 'groups' key") | ||
| groups = data["groups"] | ||
| if not isinstance(groups, dict): | ||
| raise BenchmarkDataError(f"{path} 'groups' must be an object") | ||
|
|
||
| means: dict[str, float] = {} | ||
| for group_name, value in groups.items(): | ||
| if not isinstance(value, dict): | ||
| raise BenchmarkDataError( | ||
| f"{path} groups[{group_name!r}] must be an object of benchmark means" | ||
| ) | ||
| for name, mean in value.items(): | ||
| bench_name = normalize_benchmark_name(str(name)) | ||
| if bench_name in means: | ||
| raise BenchmarkDataError( | ||
| f"{path} duplicate benchmark name {bench_name!r} across groups" | ||
| ) | ||
| try: | ||
| means[bench_name] = float(mean) | ||
| except (TypeError, ValueError) as exc: | ||
| raise BenchmarkDataError( | ||
| f"{path} groups[{group_name!r}][{name!r}] is not a numeric mean" | ||
| ) from exc | ||
| return means | ||
|
|
||
|
|
||
| def check_regression( | ||
| results_path: str | Path, | ||
| baselines_path: str | Path, | ||
| *, | ||
| threshold: float = THRESHOLD, | ||
| ) -> int: | ||
| """Return 0 when within threshold; 1 when any gated benchmark regresses.""" | ||
| flat = load_results(results_path) | ||
| baseline_means = load_baseline_means(baselines_path) | ||
|
|
||
| failures: list[str] = [] | ||
| missing: list[str] = [] | ||
| for name, base in baseline_means.items(): | ||
| cur = flat.get(name) | ||
| if cur is None: | ||
| print(f"FAIL: no current result for gated baseline {name!r}") | ||
| missing.append(name) | ||
| continue | ||
| if base == 0: | ||
| print(f"WARN: baseline for {name!r} is zero; skipping ratio check") | ||
| continue | ||
| ratio = cur / base | ||
| tag = "FAIL" if ratio > threshold else "ok" | ||
| print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)") | ||
| if ratio > threshold: | ||
| failures.append(name) | ||
|
|
||
|
Comment on lines
+118
to
+126
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. New benchmarks without baselines only WARN; gated missing benchmarks correctly FAIL (fixed in 83bb248). Consider also failing when all current results are faster than baseline by >50% (stale baseline drift), or document a baseline-refresh workflow in benchmarks/baselines.json. |
||
| for name in flat: | ||
| if name not in baseline_means: | ||
| print(f"WARN: {name!r} has no baseline yet; not gated") | ||
|
|
||
| if failures: | ||
| print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}") | ||
| if missing: | ||
| print(f"\nMISSING: {len(missing)} gated benchmark(s) absent from current results") | ||
| if failures or missing: | ||
| return 1 | ||
| return 0 | ||
|
|
||
|
|
||
| def main(argv: list[str] | None = None) -> int: | ||
| parser = argparse.ArgumentParser(description=__doc__) | ||
| parser.add_argument("results_path", help="pytest-benchmark --benchmark-json output") | ||
| parser.add_argument("baselines_path", help="path to benchmarks/baselines.json") | ||
| parser.add_argument( | ||
| "--threshold", | ||
| type=float, | ||
| default=THRESHOLD, | ||
| help="fail when current mean exceeds baseline by more than this ratio (default: 1.20)", | ||
| ) | ||
| args = parser.parse_args(argv) | ||
| try: | ||
| return check_regression( | ||
| args.results_path, | ||
| args.baselines_path, | ||
| threshold=args.threshold, | ||
| ) | ||
| except BenchmarkDataError as exc: | ||
| print(f"ERROR: {exc}", file=sys.stderr) | ||
| return 2 | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| sys.exit(main()) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| """Synthetic workspace trees for summary-cache performance benchmarks.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from pathlib import Path | ||
| from typing import Any | ||
|
|
||
| import pytest | ||
|
|
||
| from services import summary_cache | ||
| from services.summary_cache import fingerprint_workspace_storage | ||
|
|
||
|
|
||
| def make_workspace_entries(workspace_root: Path, count: int) -> list[dict[str, Any]]: | ||
| """Build *count* synthetic workspace entries with on-disk state files.""" | ||
| entries: list[dict[str, Any]] = [] | ||
| for i in range(count): | ||
| name = f"ws_{i:04d}" | ||
| entry_dir = workspace_root / name | ||
| entry_dir.mkdir(parents=True, exist_ok=True) | ||
| (entry_dir / "state.vscdb").write_bytes(b"bench") | ||
| workspace_json = entry_dir / "workspace.json" | ||
| workspace_json.write_text('{"folder": "/bench"}', encoding="utf-8") | ||
| entries.append( | ||
| { | ||
| "name": name, | ||
| "workspaceJsonPath": str(workspace_json), | ||
| } | ||
| ) | ||
| return entries | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def summary_cache_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: | ||
| """Redirect summary-cache files to an isolated temp directory. | ||
|
|
||
| Patches ``CACHE_DIR`` (also used by tab-summary paths via ``_tab_summaries_path``) | ||
| plus the projects/composer-map file constants used by current benchmarks. | ||
| """ | ||
| cache_dir = tmp_path / "cache" | ||
| cache_dir.mkdir() | ||
| monkeypatch.setattr(summary_cache, "CACHE_DIR", cache_dir) | ||
| monkeypatch.setattr(summary_cache, "PROJECTS_CACHE_FILE", cache_dir / "projects.json") | ||
| monkeypatch.setattr( | ||
| summary_cache, | ||
| "COMPOSER_MAP_CACHE_FILE", | ||
| cache_dir / "composer-id-to-ws.json", | ||
| ) | ||
|
Comment on lines
+33
to
+48
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. summary_cache_dir patches PROJECTS_CACHE_FILE and COMPOSER_MAP_CACHE_FILE but not tab-summary paths (_tab_summaries_path uses CACHE_DIR prefix only). Fine for current scope; add a comment that tab-summary benchmarks belong in issue #8's unified suite. |
||
| return cache_dir | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def sample_projects() -> list[dict[str, Any]]: | ||
| return [ | ||
| { | ||
| "id": "ws_0000", | ||
| "name": "Bench Project", | ||
| "conversationCount": 3, | ||
| "lastModified": "2026-06-24T00:00:00Z", | ||
| } | ||
| ] | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def synthetic_workspace(tmp_path: Path, request: pytest.FixtureRequest) -> tuple[str, list[dict[str, Any]]]: | ||
| """Workspace path + entries. Parametrize via indirect ``workspace_entry_count``.""" | ||
| count = getattr(request, "param", 10) | ||
| workspace_root = tmp_path / "workspaceStorage" | ||
| workspace_root.mkdir() | ||
| entries = make_workspace_entries(workspace_root, count) | ||
| return str(workspace_root), entries | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def workspace_fingerprint(synthetic_workspace: tuple[str, list[dict[str, Any]]]) -> dict[str, Any]: | ||
| workspace_path, entries = synthetic_workspace | ||
| return fingerprint_workspace_storage( | ||
| workspace_path, | ||
| entries, | ||
| global_db_path=None, | ||
| rules=[], | ||
| ) | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def stale_fingerprint(workspace_fingerprint: dict[str, Any]) -> dict[str, Any]: | ||
| """Return a fingerprint guaranteed to differ from the stored one.""" | ||
| return {**workspace_fingerprint, "rules_digest": "deadbeefdeadbeef"} | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
tests/test_check_benchmark_regression.py (15 tests for the regression gate) is not run in CI. unittest discover won't collect pytest functions, and the integration pytest step lists only API/search files. Add this file to the pytest step or run pytest tests/test_check_benchmark_regression.py -o addopts= so gate logic is covered on every matrix run.