diff --git a/README.md b/README.md index 6a583c2..42f20f2 100644 --- a/README.md +++ b/README.md @@ -69,19 +69,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ​``` -If you anticipate doing more than one search, use `semble index` to create an index. - -​```bash -semble index ./my-project -o my_index -​``` - -You can then reuse this index later on: - -​```bash -semble search "save_pretrained" --index my_index -​``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -97,20 +85,17 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ​``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 
+2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. ``` @@ -330,7 +315,55 @@ Add to `~/.config/zed/settings.json` (or `.zed/settings.json` in your project): By default the MCP server indexes only code files. To also index documentation, config, or everything, append `--content docs`, `--content config`, or `--content all` to the server command, or a combination, e.g. `--content code docs`. For example, in Claude Code: `claude mcp add semble -s user -- uvx --from "semble[mcp]" semble --content all`. -## Sub-agent setup + + +## Bash / AGENTS.md + +An alternative to MCP is to invoke Semble via Bash. Sub-agents cannot call MCP tools directly, so this is the only option for sub-agent support; it can also be used alongside MCP for the top-level agent. + +To add Bash support, append the following to your `AGENTS.md`, `CLAUDE.md`, `GEMINI.md`, or equivalent: + +```markdown +## Code Search + +Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +​```bash +semble search "authentication flow" ./my-project +semble search "save_pretrained" ./my-project +semble search "save model to disk" ./my-project --top-k 10 +​``` + +The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. 
+ +Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: + +​```bash +semble search "deployment guide" ./my-project --content docs +semble search "database host port" ./my-project --content config +semble search "authentication" ./my-project --content all +​``` + +Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): + +​```bash +semble find-related src/auth.py 42 ./my-project +​``` + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. + +### Workflow + +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +``` + +### Sub-agent setup Claude Code, Gemini CLI, Cursor, OpenCode, GitHub Copilot CLI, and Kiro all support a dedicated semble search sub-agent. Run `semble init` once in your project root: @@ -350,13 +383,9 @@ If semble is not on `$PATH`, prefix the command with `uvx --from "semble[mcp]"`. Semble also ships as a standalone CLI. This is useful in scripts or anywhere you want search results without an MCP session. 
```bash -# Search a local repo +# Search a local repo (index is built and cached automatically) semble search "authentication flow" ./my-project -# Index first for faster repeated searches (--index works with any command below) -semble index ./my-project -o my-index -semble search "authentication flow" --index my-index - # Search a remote repo (cloned on demand) semble search "save model to disk" https://github.com/MinishLab/model2vec @@ -372,6 +401,31 @@ semble find-related src/auth.py 42 ./my-project `--content` accepts `code` (default), `docs`, `config`, or `all`. `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. +
+Controlling which files are indexed + +Semble reads `.gitignore` and `.sembleignore` files to determine which files to index. Both files use standard gitignore syntax and their patterns are merged. `.sembleignore` lets you add semble-specific rules without touching `.gitignore`. Rules are applied recursively, so a `.sembleignore` in a subdirectory applies to that subtree. + +**Excluding files:** add patterns the same way you would in `.gitignore`: + +``` +# .sembleignore +generated/ # exclude generated dir +*.pb.go # exclude Go protobuf files +``` + +**Including non-default extensions:** prefix the extension pattern with `!` to force-include files that semble wouldn't index by default: + +``` +# .sembleignore +!*.proto # include Protobuf files +!*.cob # include COBOL files +``` + +Semble also always skips a set of well-known non-source directories regardless of ignore files (e.g. `node_modules/`, `.venv/`, `dist/`, `build/`, `__pycache__/`, and similar). + +
+
Savings @@ -394,7 +448,7 @@ semble savings --verbose # also show breakdown by call type Savings are calculated as follows: for each call, semble records the total character count of the unique files containing returned chunks and the character count of the snippets returned. Estimated tokens saved is `(file chars − snippet chars) / 4` (4 chars per token). This is a conservative estimate: the baseline is reading matched files in full, which is how coding agents often explore unfamiliar code. -Stats are stored in `~/.semble/savings.jsonl`. +Stats are stored in the OS cache folder (`~/Library/Caches/semble/` on macOS, `~/.cache/semble/` on Linux, `%LOCALAPPDATA%\semble\Cache\` on Windows).
diff --git a/benchmarks/baselines/ablations.py b/benchmarks/baselines/ablations.py index 7f91b67..63bd6ab 100644 --- a/benchmarks/baselines/ablations.py +++ b/benchmarks/baselines/ablations.py @@ -17,7 +17,7 @@ ) from benchmarks.run_benchmark import RepoResult, evaluate from semble import SembleIndex -from semble.index.dense import _DEFAULT_MODEL_NAME +from semble.utils import DEFAULT_MODEL_NAME # alpha=None → raw mode, input depends on query # alpha=0.0 → hybrid pipeline, BM25-only input @@ -129,7 +129,7 @@ def main() -> None: summary = { "tool": "semble-ablations", - "model": _DEFAULT_MODEL_NAME, + "model": DEFAULT_MODEL_NAME, "by_mode": summarize_modes(results, modes), "repos": [asdict(r) for r in results], } diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 4af095e..003e25c 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -16,8 +16,8 @@ ) from benchmarks.metrics import ndcg_at_k, target_rank from semble import SembleIndex -from semble.index.dense import _DEFAULT_MODEL_NAME from semble.types import SearchResult +from semble.utils import DEFAULT_MODEL_NAME _LATENCY_RUNS = 5 _DIRECT_TOP_K = 10 @@ -259,7 +259,7 @@ def _save_results(results: list[RepoResult]) -> None: n_repos = len(results) output = { "tool": "semble-hybrid", - "model": _DEFAULT_MODEL_NAME, + "model": DEFAULT_MODEL_NAME, "summary": { "ndcg10": round(sum(r.ndcg10 for r in results) / n_repos, 4), "tokens": round(sum(r.tokens for r in results) / n_repos, 0), diff --git a/benchmarks/speed_benchmark.py b/benchmarks/speed_benchmark.py index b96ad75..1eee147 100644 --- a/benchmarks/speed_benchmark.py +++ b/benchmarks/speed_benchmark.py @@ -11,8 +11,8 @@ from benchmarks.data import RepoSpec, Task, available_repo_specs, load_tasks, save_results from benchmarks.tools import run_colgrep_files, run_ripgrep_count from semble import SembleIndex -from semble.index.dense import _DEFAULT_MODEL_NAME from semble.types import EmbeddingMatrix +from semble.utils 
import DEFAULT_MODEL_NAME # One representative repo per language (medium size, healthy NDCG on the main benchmark). _REPOS: list[str] = [ @@ -192,7 +192,7 @@ def main() -> None: print("Loading semble model...", file=sys.stderr) started = time.perf_counter() - semble_model = StaticModel.from_pretrained(_DEFAULT_MODEL_NAME) + semble_model = StaticModel.from_pretrained(DEFAULT_MODEL_NAME) print(f" loaded in {(time.perf_counter() - started) * 1000:.0f}ms", file=sys.stderr) print("Loading CodeRankEmbed...", file=sys.stderr) diff --git a/benchmarks/token_efficiency.py b/benchmarks/token_efficiency.py index 7c44ac5..77f10ed 100644 --- a/benchmarks/token_efficiency.py +++ b/benchmarks/token_efficiency.py @@ -24,10 +24,10 @@ target_matches_location, ) from semble import SembleIndex -from semble.index.dense import _DEFAULT_MODEL_NAME from semble.index.file_walker import DEFAULT_IGNORED_DIRS, FILE_TYPES, FileCategory from semble.ranking.boosting import _STOPWORDS as _SEMBLE_STOPWORDS from semble.types import Chunk +from semble.utils import DEFAULT_MODEL_NAME _RG_INCLUDE_GLOBS: tuple[str, ...] 
= tuple( f"*{ext}" for ext, spec in FILE_TYPES.items() if spec.category == FileCategory.CODE @@ -378,7 +378,7 @@ def run_recall(args: argparse.Namespace) -> None: print("Loading tokenizer + model...", file=sys.stderr) enc = tiktoken.get_encoding(_TOKENIZER_NAME) - model = StaticModel.from_pretrained(_DEFAULT_MODEL_NAME) + model = StaticModel.from_pretrained(DEFAULT_MODEL_NAME) method_curves: dict[str, MethodCurves] = defaultdict(list) print(f"\n{'Repo':<22} {'Language':<12} {'Tasks':>6} {'Time':>8}", file=sys.stderr) diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index 895e282..2cdc0f5 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -12,19 +12,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -40,17 +28,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. 
Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md index 895e282..2cdc0f5 100644 --- a/src/semble/agents/copilot.md +++ b/src/semble/agents/copilot.md @@ -12,19 +12,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. 
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -40,17 +28,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. 
diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md index baf455c..2071c27 100644 --- a/src/semble/agents/cursor.md +++ b/src/semble/agents/cursor.md @@ -11,19 +11,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -39,17 +27,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. 
The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md index e4e9b6a..a20fcd9 100644 --- a/src/semble/agents/gemini.md +++ b/src/semble/agents/gemini.md @@ -14,19 +14,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -42,17 +30,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. 
Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md index d556c13..bf5d5fc 100644 --- a/src/semble/agents/kiro.md +++ b/src/semble/agents/kiro.md @@ -14,19 +14,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. 
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -42,17 +30,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. 
diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md index 2ec43c8..fbfcede 100644 --- a/src/semble/agents/opencode.md +++ b/src/semble/agents/opencode.md @@ -15,19 +15,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -43,17 +31,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. 
The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/cache.py b/src/semble/cache.py new file mode 100644 index 0000000..6f6ea49 --- /dev/null +++ b/src/semble/cache.py @@ -0,0 +1,124 @@ +import hashlib +import json +import os +import shutil +import sys +from collections.abc import Sequence +from pathlib import Path +from typing import TYPE_CHECKING + +from semble.index.file_walker import walk_files +from semble.index.files import FileStatus, get_extensions, get_file_status +from semble.index.types import PersistencePath +from semble.types import ContentType +from semble.utils import is_git_url, resolve_model_name + +if TYPE_CHECKING: + from semble.index import SembleIndex + + +def find_index_from_cache_folder(path: str) -> Path: + """Finds an index from a cache folder and a project path.""" + if is_git_url(path): + data = path.encode("utf-8") + else: + normalized = Path(path).expanduser().resolve() + data = str(normalized).encode("utf-8") + subdir_path = hashlib.new("sha256", data).hexdigest() + cache_dir = resolve_cache_folder() / subdir_path + return cache_dir / "index" + + +def _windows_cache_dir(name: str) -> Path: + """Get the default windows cache dir.""" + env_base = os.getenv("LOCALAPPDATA") or os.getenv("APPDATA") + base = Path(env_base) if env_base is not None else Path.home() / "AppData" / "Local" + return base / name / "Cache" + + +def _macos_cache_dir(name: str) -> Path: + """Get the default macOS cache dir.""" + return Path.home() / "Library" / "Caches" / name + + +def _linux_cache_dir(name: str) -> Path: + """Get the 
default Linux cache dir.""" + env_base = os.getenv("XDG_CACHE_HOME") + base = Path(env_base) if env_base else Path.home() / ".cache" + return base / name + + +def resolve_cache_folder() -> Path: + """Resolves a cache folder, respects XDG_CACHE_HOME.""" + name = "semble" + if sys.platform == "win32": + cache_dir = _windows_cache_dir(name) + elif sys.platform == "darwin": + cache_dir = _macos_cache_dir(name) + else: + cache_dir = _linux_cache_dir(name) + + cache_dir.mkdir(parents=True, exist_ok=True) + return cache_dir + + +def clear_cache(path: str) -> None: + """Clears the cache for the given path.""" + index_path = find_index_from_cache_folder(path) + if index_path.exists(): + shutil.rmtree(index_path) + + +def save_index_to_cache(index: "SembleIndex", path: str) -> None: + """Save an index to the cache folder if it was freshly built.""" + if not index.loaded_from_disk: + index.save(find_index_from_cache_folder(path)) + + +def _metadata_matches(metadata: dict, model_path: str, content: Sequence[ContentType]) -> bool: + """Return True if the stored metadata is compatible with the requested parameters.""" + try: + content_type = tuple(ContentType(s) for s in metadata["content_type"]) + return metadata["model_path"] == model_path and set(content_type) == set(content) + except (KeyError, ValueError): + return False + + +def get_validated_cache(path: str, model_path: str | None, content: Sequence[ContentType]) -> Path | None: + """Validates the cache folder and returns the index path.""" + index_path = find_index_from_cache_folder(path) + if not index_path.exists(): + return None + + persistence_path = PersistencePath.from_path(index_path) + if persistence_path.non_existing(): + return None + + if model_path is None: + model_path = resolve_model_name() + with open(persistence_path.metadata) as f: + metadata = json.load(f) + if not _metadata_matches(metadata, model_path, content): + return None + + if is_git_url(str(path)): + return index_path + + write_time = 
metadata["time"] + extensions = get_extensions(content) + + path_as_path = Path(path).resolve() + stored_files: list[str] = metadata.get("file_paths", []) + current_files = [] + for file_path in walk_files(path_as_path, extensions=extensions): + file_status = get_file_status(file_path, write_time) + if file_status == FileStatus.NEWER: + return None + if file_status != FileStatus.VALID: + continue + current_files.append(str(file_path.relative_to(path_as_path))) + + if set(current_files) != set(stored_files): + return None + + return index_path diff --git a/src/semble/cli.py b/src/semble/cli.py index aac944d..1a0e8b4 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -10,6 +10,7 @@ from model2vec.utils import get_package_extras +from semble.cache import find_index_from_cache_folder from semble.index import SembleIndex from semble.stats import format_savings_report from semble.types import ContentType @@ -26,7 +27,26 @@ class Agent(str, Enum): _DEFAULT_AGENT = Agent.CLAUDE -_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "savings", "-h", "--help", "index"}) +_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "savings", "-h", "--help"}) + + +def _build_index(path: str, content: list[ContentType]) -> SembleIndex: + """Build an index from a local path or git URL.""" + return ( + SembleIndex.from_git(path, content=content) + if is_git_url(path) + else SembleIndex.from_path(path, content=content) + ) + + +def _maybe_save_index(index: SembleIndex, path: str) -> None: + """Save the index to the cache folder if it was not loaded from disk.""" + if not index.loaded_from_disk: + try: + cache_folder = find_index_from_cache_folder(path) + index.save(cache_folder) + except Exception as e: + print(f"Error saving index: {e}", file=sys.stderr) def _agent_path(agent: Agent) -> Path: @@ -83,16 +103,6 @@ def _mcp_main() -> None: asyncio.run(serve(args.path, ref=args.ref, content=content)) -def _run_index(*, path: str, include_text_files: bool = 
False, out: str) -> None: - """Index and store a codebase.""" - if is_git_url(path): - index = SembleIndex.from_git(path, include_text_files=include_text_files) - else: - index = SembleIndex.from_path(path, include_text_files=include_text_files) - Path(out).mkdir(parents=True, exist_ok=True) - index.save(out) - - def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None: """Write the semble sub-agent file for the given coding agent into the current project.""" dest = _agent_path(agent) @@ -118,24 +128,49 @@ def _resolve_content(content: list[str], include_text_files: bool) -> list[Conte return [ContentType(c) for c in content] +def _load_index(path: str, content: list[ContentType]) -> SembleIndex: + """Build an index from a local path or git URL, exiting on FileNotFoundError.""" + try: + return _build_index(path, content) + except FileNotFoundError as e: + print(str(e), file=sys.stderr) + sys.exit(1) + + +def _run_search(path: str, query: str, top_k: int, content: list[ContentType]) -> None: + """Handle the `search` subcommand.""" + index = _load_index(path, content) + results = index.search(query, top_k=top_k) + out = format_results(query, results) if results else {"error": "No results found."} + print(json.dumps(out)) + _maybe_save_index(index, path) + + +def _run_find_related(path: str, file_path: str, line: int, top_k: int, content: list[ContentType]) -> None: + """Handle the `find-related` subcommand.""" + index = _load_index(path, content) + chunk = resolve_chunk(index.chunks, file_path, line) + if chunk is None: + print(f"No chunk found at {file_path}:{line}.", file=sys.stderr) + sys.exit(1) + results = index.find_related(chunk, top_k=top_k) + out = ( + format_results(f"Chunks related to {file_path}:{line}", results) + if results + else {"error": f"No related chunks found for {file_path}:{line}."} + ) + print(json.dumps(out)) + _maybe_save_index(index, path) + + def _cli_main() -> None: parser = argparse.ArgumentParser(prog="semble") sub = 
parser.add_subparsers(dest="command") - index_p = sub.add_parser("index", help="Index and store a codebase.") - index_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") - index_p.add_argument( - "--include-text-files", - action="store_true", - help="Also index non-code text files (.md, .yaml, .json, etc.).", - ) - index_p.add_argument("-o", "--out", type=str, required=True, help="The path to write the pre-built index to.") - search_p = sub.add_parser("search", help="Search a codebase.") search_p.add_argument("query", help="Natural language or code query.") search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") - search_p.add_argument("--index", type=str, default=None, help="A path pointing to a pre-built index.") _add_content_args(search_p) related_p = sub.add_parser("find-related", help="Find code similar to a specific location.") @@ -143,7 +178,6 @@ def _cli_main() -> None: related_p.add_argument("line", type=int, help="Line number (1-indexed).") related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") - related_p.add_argument("--index", type=str, default=None, help="A path pointing to a pre-built index.") _add_content_args(related_p) init_p = sub.add_parser("init", help="Write a semble sub-agent file for your coding agent.") @@ -163,42 +197,11 @@ def _cli_main() -> None: if args.command == "init": _run_init(agent=Agent(args.agent), force=args.force) - return - - if args.command == "index": - _run_index(path=args.path, include_text_files=args.include_text_files, out=args.out) - return - - if args.command == "savings": - print(format_savings_report(verbose=args.verbose), end="") - return - - if 
args.index: - index = SembleIndex.load_from_disk(args.index) - else: - content = _resolve_content(args.content, args.include_text_files) - index = ( - SembleIndex.from_git(args.path, content=content) - if is_git_url(args.path) - else SembleIndex.from_path(args.path, content=content) - ) - - if args.command == "search": - results = index.search(args.query, top_k=args.top_k) - if not results: - out = {"error": "No results found."} - else: - out = format_results(args.query, results) - print(json.dumps(out)) - + elif args.command == "savings": + print(format_savings_report(verbose=args.verbose)) + elif args.command == "search": + _run_search(args.path, args.query, args.top_k, _resolve_content(args.content, args.include_text_files)) elif args.command == "find-related": - chunk = resolve_chunk(index.chunks, args.file_path, args.line) - if chunk is None: - print(f"No chunk found at {args.file_path}:{args.line}.", file=sys.stderr) - sys.exit(1) - results = index.find_related(chunk, top_k=args.top_k) - if not results: - out = {"error": f"No related chunks found for {args.file_path}:{args.line}."} - else: - out = format_results(f"Chunks related to {args.file_path}:{args.line}", results) - print(json.dumps(out)) + _run_find_related( + args.path, args.file_path, args.line, args.top_k, _resolve_content(args.content, args.include_text_files) + ) diff --git a/src/semble/index/create.py b/src/semble/index/create.py index b72a055..b4dd189 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -9,18 +9,15 @@ from semble.chunking import chunk_source from semble.index.dense import SelectableBasicBackend, embed_chunks from semble.index.file_walker import walk_files -from semble.index.files import detect_language, get_extensions +from semble.index.files import FileStatus, detect_language, get_extensions, get_file_status, read_file_text from semble.index.sparse import enrich_for_bm25 from semble.tokens import tokenize from semble.types import Chunk, ContentType 
-_MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index - def create_index_from_path( path: Path, model: StaticModel, - extensions: Sequence[str] | None = None, content: ContentType | Sequence[ContentType] = (ContentType.CODE,), display_root: Path | None = None, ) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]: @@ -28,7 +25,6 @@ def create_index_from_path( :param path: Resolved absolute path to index. :param model: The model to use for indexing. - :param extensions: File extensions to include. :param content: Content types to index. :param display_root: If set, chunk file paths are stored relative to this root. :raises ValueError: if no items were found, no index can be created. @@ -36,13 +32,14 @@ def create_index_from_path( """ chunks: list[Chunk] = [] normalized = (content,) if isinstance(content, ContentType) else content - resolved_extensions = get_extensions(normalized, extensions) + resolved_extensions = get_extensions(normalized) for file_path in walk_files(path, resolved_extensions): language = detect_language(file_path) with contextlib.suppress(OSError): - if file_path.stat().st_size > _MAX_FILE_BYTES: + file_status = get_file_status(file_path, None) + if file_status != FileStatus.VALID: continue - source = file_path.read_text(encoding="utf-8", errors="replace") + source = read_file_text(file_path) chunk_path = file_path.relative_to(display_root) if display_root else file_path chunks.extend(chunk_source(source, str(chunk_path), language)) diff --git a/src/semble/index/dense.py b/src/semble/index/dense.py index 9677c22..e3c0e26 100644 --- a/src/semble/index/dense.py +++ b/src/semble/index/dense.py @@ -12,8 +12,7 @@ from vicinity.utils import normalize from semble.types import Chunk - -_DEFAULT_MODEL_NAME = "minishlab/potion-code-16M" +from semble.utils import resolve_model_name @cache @@ -32,7 +31,7 @@ def _load_cached(model_path: str) -> StaticModel: def load_model(model_path: str | None = None) -> tuple[StaticModel, str]: """Return 
the current model, loading the default if none was provided.""" if model_path is None: - model_path = _DEFAULT_MODEL_NAME + model_path = resolve_model_name() model = _load_cached(model_path) return model, model_path diff --git a/src/semble/index/files.py b/src/semble/index/files.py index a20e804..7aa0702 100644 --- a/src/semble/index/files.py +++ b/src/semble/index/files.py @@ -1,9 +1,12 @@ from collections import defaultdict from collections.abc import Sequence +from enum import Enum from pathlib import Path from semble.types import ContentType +_MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index +_EMPTY_FILE_BYTES = 128 _EXTENSION_TO_LANGUAGE = { ".4th": "forth", ".ada": "ada", @@ -461,7 +464,7 @@ def detect_language(file_name: Path) -> str | None: return _EXTENSION_TO_LANGUAGE.get(file_name.suffix.lower()) -def get_extensions(types: Sequence[ContentType], extensions: Sequence[str] | None) -> list[str]: +def get_extensions(types: Sequence[ContentType]) -> list[str]: """Returns a list of supported file extensions for the given content types.""" languages: set[str] = set() for content_type in types: @@ -469,7 +472,35 @@ def get_extensions(types: Sequence[ContentType], extensions: Sequence[str] | Non all_extensions: set[str] = set() for language in languages: all_extensions.update(_LANGUAGE_TO_EXTENSION.get(language, set())) - if extensions is not None: - all_extensions.update(extensions) return sorted(all_extensions) + + +class FileStatus(str, Enum): + NEWER = "newer" + TOO_LARGE = "too_large" + EMPTY = "empty" + VALID = "valid" + + +def read_file_text(file_path: Path) -> str: + """Read a file's text content, replacing invalid characters and silencing read errors.""" + return file_path.read_text(encoding="utf-8", errors="replace") + + +def get_file_status(file_path: Path, write_time: float | None) -> FileStatus: + """Checks if a file should be indexed based on its size and modification time.""" + stat = file_path.stat() + if write_time is not None 
and stat.st_mtime > write_time: + # Index invalid, file invalid + return FileStatus.NEWER + size = stat.st_size + if size > _MAX_FILE_BYTES: + # index valid, file invalid + return FileStatus.TOO_LARGE + if size < _EMPTY_FILE_BYTES and not read_file_text(file_path).strip(): + # index valid, file invalid + return FileStatus.EMPTY + + # Both valid + return FileStatus.VALID diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 7949471..8bab699 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -15,8 +15,10 @@ from bm25s import BM25 from model2vec.model import StaticModel +from semble.cache import get_validated_cache from semble.index.create import create_index_from_path from semble.index.dense import SelectableBasicBackend, load_model +from semble.index.files import read_file_text from semble.index.types import PersistencePath from semble.search import _search_semantic, search from semble.stats import save_search_stats @@ -57,6 +59,7 @@ def __init__( model_path: str, root: Path | None = None, content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, + loaded_from_disk: bool = False, ) -> None: """Initialize a SembleIndex. Should be created with from_path or from_git. @@ -67,6 +70,7 @@ def __init__( :param model_path: Path to the model file. :param root: Root directory used to read file sizes for token-savings stats. :param content: Content type used when indexing; controls the search pipeline. + :param loaded_from_disk: Whether the index was loaded from disk (cache hit); controls CLI messaging. """ self.model = model self.chunks: list[Chunk] = chunks @@ -77,6 +81,7 @@ def __init__( self._content: tuple[ContentType, ...] 
= (content,) if isinstance(content, ContentType) else tuple(content) self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {} self._file_mapping, self._language_mapping = self._populate_mapping() + self.loaded_from_disk: bool = loaded_from_disk def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]]: """Build (file → chunk indices, language → chunk indices) mappings, in that order.""" @@ -97,7 +102,7 @@ def _compute_file_sizes(self, root: Path) -> dict[str, int]: if chunk.file_path in sizes: continue try: - sizes[chunk.file_path] = len((root / chunk.file_path).read_text(encoding="utf-8", errors="replace")) + sizes[chunk.file_path] = len(read_file_text(root / chunk.file_path)) except OSError: pass return sizes @@ -120,7 +125,6 @@ def stats(self) -> IndexStats: def from_path( cls, path: str | Path, - extensions: Sequence[str] | None = None, content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, include_text_files: bool | None = None, model_path: str | None = None, @@ -128,7 +132,6 @@ def from_path( """Create and index a SembleIndex from a directory. :param path: Root directory to index. - :param extensions: File extensions to include. Defaults to a standard set of code extensions. :param content: Content types to index, e.g. ContentType.CODE or [ContentType.CODE, ContentType.DOCS]. :param include_text_files: Deprecated. Pass a content sequence directly instead. :param model_path: Path to the model to use. If None, the default model will be used. @@ -136,18 +139,22 @@ def from_path( :raises FileNotFoundError: If `path` does not exist. :raises NotADirectoryError: If `path` exists but is not a directory. 
""" - model, model_path = load_model(model_path) - normalized = _apply_include_text_files(content, include_text_files) path = Path(path) if not path.exists(): raise FileNotFoundError(f"Path does not exist: {path}") if not path.is_dir(): raise NotADirectoryError(f"Path is not a directory: {path}") + + normalized = _apply_include_text_files(content, include_text_files) + cache_path = get_validated_cache(str(path), model_path, normalized) + if cache_path: + return cls.load_from_disk(cache_path) + model, model_path = load_model(model_path) + path = path.resolve() bm25, vicinity, chunks = create_index_from_path( path, model=model, - extensions=extensions, content=normalized, display_root=path, ) @@ -159,7 +166,6 @@ def from_git( cls, url: str, ref: str | None = None, - extensions: Sequence[str] | None = None, model_path: str | None = None, content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, include_text_files: bool | None = None, @@ -173,7 +179,6 @@ def from_git( :param url: URL of the git repository to clone (any git provider). :param ref: Branch or tag to check out. Defaults to the remote HEAD. - :param extensions: File extensions to include. Defaults to a standard set of code extensions. :param model_path: Path to the model to use. If None, the default model will be used. :param content: Content types to index, e.g. (ContentType.CODE,) or (ContentType.CODE, ContentType.DOCS). :param include_text_files: Deprecated. Pass content=(ContentType.CODE, ContentType.DOCS, ...) instead. @@ -181,6 +186,11 @@ def from_git( :raises RuntimeError: If git is not on PATH, the clone fails, or times out. """ normalized = _apply_include_text_files(content, include_text_files) + cache_key = f"{url}@{ref}" if ref else url + cache_path = get_validated_cache(cache_key, model_path, normalized) + if cache_path: + return cls.load_from_disk(cache_path) + with tempfile.TemporaryDirectory() as tmp_dir: # `--` prevents `url` from being interpreted as a git option (e.g. 
`--upload-pack=...`). cmd = ["git", "clone", "--depth", "1", *(["--branch", ref] if ref else []), "--", url, tmp_dir] @@ -200,12 +210,19 @@ def from_git( bm25, vicinity, chunks = create_index_from_path( resolved_path, model=model, - extensions=extensions, content=normalized, display_root=resolved_path, ) - return SembleIndex(model, bm25, vicinity, chunks, model_path, root=resolved_path, content=normalized) + return SembleIndex( + model, + bm25, + vicinity, + chunks, + model_path, + root=resolved_path, + content=normalized, + ) def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]: """Return chunks semantically similar to the given chunk or search result. @@ -300,12 +317,22 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: chunks.append(Chunk.from_dict(chunk_item)) root_path = metadata["root_path"] model_path = metadata["model_path"] + content = tuple(ContentType(s) for s in metadata.get("content_type", ["code"])) if root_path: root_path = Path(root_path) model, model_path = load_model(model_path) - return cls(model, bm_25_index, semantic_index, chunks, model_path, root=root_path) + return cls( + model, + bm_25_index, + semantic_index, + chunks, + model_path, + root=root_path, + content=content, + loaded_from_disk=True, + ) def save(self, path: Path | str) -> None: """Save the index to disk.""" @@ -321,7 +348,13 @@ def save(self, path: Path | str) -> None: data = orjson.dumps(chunks_as_dict) f.write(data) root_str = None if self._root is None else str(self._root) - metadata = {"root_path": root_str, "time": datetime.now().timestamp(), "model_path": self._model_path} + metadata = { + "root_path": root_str, + "time": datetime.now().timestamp(), + "model_path": self._model_path, + "content_type": list(x.value for x in self._content), + "file_paths": sorted(self._file_mapping), + } with open(persistence_paths.metadata, "wb") as f: data = orjson.dumps(metadata) f.write(data) diff --git 
a/src/semble/mcp.py b/src/semble/mcp.py index 3aa526a..f31d0d8 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -12,6 +12,7 @@ from mcp.server.fastmcp import FastMCP from pydantic import Field +from semble.cache import save_index_to_cache from semble.index import SembleIndex from semble.index.dense import load_model from semble.types import ContentType @@ -174,6 +175,19 @@ def _compute_cache_key(self, source: str, ref: str | None = None) -> str: is_git = is_git_url(source) return (f"{source}@{ref}" if ref else source) if is_git else str(Path(source).resolve()) + def _build_and_cache_index(self, source: str, ref: str | None, model_path: str, cache_key: str) -> SembleIndex: + """Build an index for the given source and cache it.""" + index = ( + SembleIndex.from_git(source, ref=ref, model_path=model_path, content=self._content) + if is_git_url(source) + else SembleIndex.from_path(cache_key, model_path=model_path, content=self._content) + ) + try: + save_index_to_cache(index, cache_key) + except Exception: + logger.warning("Failed to save index cache for %r", cache_key, exc_info=True) + return index + def evict(self, source: str) -> None: self._tasks.pop(self._compute_cache_key(source), None) @@ -203,25 +217,9 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex: if cache_key not in self._tasks: if len(self._tasks) >= _CACHE_MAX_SIZE: self._tasks.popitem(last=False) - if is_git_url(source): - self._tasks[cache_key] = asyncio.create_task( - asyncio.to_thread( - SembleIndex.from_git, - source, - ref=ref, - model_path=model_path, - content=self._content, - ) - ) - else: - self._tasks[cache_key] = asyncio.create_task( - asyncio.to_thread( - SembleIndex.from_path, - cache_key, - model_path=model_path, - content=self._content, - ) - ) + self._tasks[cache_key] = asyncio.create_task( + asyncio.to_thread(self._build_and_cache_index, source, ref, model_path, cache_key) + ) self._tasks.move_to_end(cache_key) task = self._tasks[cache_key] try: diff 
--git a/src/semble/search.py b/src/semble/search.py index f7c8fbb..238d9eb 100644 --- a/src/semble/search.py +++ b/src/semble/search.py @@ -105,8 +105,7 @@ def search( normalized_semantic = _rrf_scores(semantic_scores) normalized_bm25 = _rrf_scores(bm25_scores) - # Sort by the file path and start line to - # counteract randomness introduces by hashing. + # Sort by start line to counteract randomness introduced by hashing. all_candidates = sorted( {*normalized_semantic, *normalized_bm25}, key=lambda c: c.start_line, diff --git a/src/semble/stats.py b/src/semble/stats.py index 90f75a2..bebc988 100644 --- a/src/semble/stats.py +++ b/src/semble/stats.py @@ -5,11 +5,15 @@ from datetime import datetime, timedelta, timezone from pathlib import Path +from semble.cache import resolve_cache_folder from semble.types import CallType, SearchResult logger = logging.getLogger(__name__) -_STATS_FILE = Path.home() / ".semble" / "savings.jsonl" + +def _get_stats_file() -> Path: + """Safely create a stats file.""" + return resolve_cache_folder() / "savings.jsonl" @dataclass @@ -52,15 +56,18 @@ def save_search_stats( "snippet_chars": snippet_chars, "file_chars": file_chars, } - _STATS_FILE.parent.mkdir(parents=True, exist_ok=True) - with _STATS_FILE.open("a") as f: + stats_file = _get_stats_file() + stats_file.parent.mkdir(parents=True, exist_ok=True) + with stats_file.open("a") as f: f.write(json.dumps(record) + "\n") except OSError: pass -def build_savings_summary(path: Path = _STATS_FILE) -> SavingsSummary: +def build_savings_summary(path: Path | None = None) -> SavingsSummary: """Read savings.jsonl and return a SavingsSummary.""" + if path is None: + path = _get_stats_file() now = datetime.now(timezone.utc) today = now.date() seven_days_ago = (now - timedelta(days=7)).date() @@ -98,7 +105,7 @@ def build_savings_summary(path: Path = _STATS_FILE) -> SavingsSummary: def format_savings_report(path: Path | None = None, *, verbose: bool = False) -> str: """Return a formatted 
token-savings report.""" if path is None: - path = _STATS_FILE + path = _get_stats_file() if not path.exists(): return "No stats yet. Run a search first." diff --git a/src/semble/utils.py b/src/semble/utils.py index 4b71395..b11ee29 100644 --- a/src/semble/utils.py +++ b/src/semble/utils.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os import re from typing import Any @@ -7,6 +8,7 @@ _GIT_URL_SCHEMES = ("https://", "http://", "ssh://", "git://", "git+ssh://", "file://") _SCP_GIT_URL_RE = re.compile(r"^[\w.-]+@[\w.-]+:(?!/)") +DEFAULT_MODEL_NAME = "minishlab/potion-code-16M" def is_git_url(path: str) -> bool: @@ -33,3 +35,8 @@ def resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | Non def format_results(query: str, results: list[SearchResult]) -> dict[str, Any]: """Render SearchResult objects as a JSONable object.""" return {"query": query, "results": [r.to_dict() for r in results]} + + +def resolve_model_name() -> str: + """Resolve a model name to a configurable.""" + return os.environ.get("SEMBLE_MODEL_NAME", DEFAULT_MODEL_NAME) diff --git a/src/semble/version.py b/src/semble/version.py index 6036609..b7e3609 100644 --- a/src/semble/version.py +++ b/src/semble/version.py @@ -1,2 +1,2 @@ -__version_triple__ = (0, 2, 0) +__version_triple__ = (0, 3, 0) __version__ = ".".join(map(str, __version_triple__)) diff --git a/tests/index/test_index.py b/tests/index/test_index.py index a4d05b8..76a9d04 100644 --- a/tests/index/test_index.py +++ b/tests/index/test_index.py @@ -6,7 +6,8 @@ from model2vec import StaticModel from semble import SembleIndex -from semble.index.create import _MAX_FILE_BYTES, create_index_from_path +from semble.index.create import create_index_from_path +from semble.index.files import _MAX_FILE_BYTES, FileStatus, get_file_status from semble.types import ContentType from tests.conftest import make_chunk @@ -74,6 +75,13 @@ def test_oversized_file_is_skipped(mock_model: StaticModel, tmp_path: Path) -> N 
create_index_from_path(tmp_path, mock_model) +def test_tiny_invalid_utf8_file_status_does_not_crash(tmp_path: Path) -> None: + """Tiny files with invalid UTF-8 bytes are treated as non-empty.""" + path = tmp_path / "latin1.py" + path.write_bytes(b"\xff") + assert get_file_status(path, None) is FileStatus.VALID + + def test_index_language_counts(indexed_index: SembleIndex) -> None: """Language breakdown in stats includes python with at least one chunk.""" stats = indexed_index.stats @@ -185,6 +193,22 @@ def test_roundtrip(tmp_path: Path, indexed_index: SembleIndex) -> None: assert index_2._root == indexed_index._root +def test_load_save_roundtrip_preserves_manifest(tmp_path: Path, indexed_index: SembleIndex) -> None: + """load_from_disk followed by save must not clobber file_paths with an empty list.""" + save_a = tmp_path / "a" + save_b = tmp_path / "b" + indexed_index.save(save_a) + with patch.object(StaticModel, "from_pretrained"): + loaded = SembleIndex.load_from_disk(save_a) + loaded.save(save_b) + import json + + manifest_a = json.loads((save_a / "metadata.json").read_text())["file_paths"] + manifest_b = json.loads((save_b / "metadata.json").read_text())["file_paths"] + assert manifest_b == manifest_a + assert len(manifest_b) > 0 + + def test_load_non_existent(tmp_path: Path, indexed_index: SembleIndex) -> None: """Test that saving and loading a folder leads to the same data.""" with pytest.raises(FileNotFoundError): @@ -208,3 +232,22 @@ def test_load_from_disk_missing_files_reports_them(tmp_path: Path) -> None: assert "metadata.json" in error_msg # The file we did create should NOT be listed as missing. 
assert "chunks.json" not in error_msg + + +def test_from_path_uses_cache_when_valid(tmp_project: Path) -> None: + """from_path returns the cached index directly when get_validated_cache hits.""" + fake_cached = MagicMock(spec=SembleIndex) + with patch("semble.index.index.get_validated_cache", return_value=tmp_project / "cache"): + with patch.object(SembleIndex, "load_from_disk", return_value=fake_cached): + result = SembleIndex.from_path(tmp_project) + assert result is fake_cached + + +@pytest.mark.parametrize("ref", [None, "v1.0"]) +def test_from_git_uses_cache_when_valid(ref: str | None) -> None: + """from_git uses the cache for both URL-only and URL@ref cache keys.""" + fake_cached = MagicMock(spec=SembleIndex) + with patch("semble.index.index.get_validated_cache", return_value=Path("/cache")): + with patch.object(SembleIndex, "load_from_disk", return_value=fake_cached): + result = SembleIndex.from_git("https://github.com/org/repo.git", ref=ref) + assert result is fake_cached diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..e46eb20 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +import json +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from semble.cache import ( + _linux_cache_dir, + _windows_cache_dir, + clear_cache, + find_index_from_cache_folder, + get_validated_cache, + resolve_cache_folder, + save_index_to_cache, +) +from semble.types import ContentType + + +def test_find_index_from_cache_folder_local_path(tmp_path: Path) -> None: + """Local paths are normalised before hashing, result ends with /index.""" + result = find_index_from_cache_folder(str(tmp_path)) + assert result.name == "index" + assert result == find_index_from_cache_folder(str(tmp_path)) + + +def test_find_index_from_cache_folder_git_url() -> None: + """Git URLs are hashed as-is (not expanded via Path.resolve).""" + url = 
"https://github.com/org/repo.git" + result = find_index_from_cache_folder(url) + assert result.name == "index" + assert result != find_index_from_cache_folder("https://github.com/org/other.git") + + +@pytest.mark.parametrize( + ("env", "expected_base"), + [ + ({"LOCALAPPDATA": "C:\\Local", "APPDATA": "C:\\Roaming"}, "C:\\Local"), + ({"APPDATA": "C:\\Roaming"}, "C:\\Roaming"), + ], +) +def test_windows_cache_dir_env(env: dict[str, str], expected_base: str) -> None: + """_windows_cache_dir prefers LOCALAPPDATA, falls back to APPDATA.""" + with patch.dict("os.environ", env, clear=True): + assert _windows_cache_dir("semble") == Path(expected_base) / "semble" / "Cache" + + +def test_linux_cache_dir_with_xdg() -> None: + """_linux_cache_dir uses XDG_CACHE_HOME when set.""" + with patch.dict("os.environ", {"XDG_CACHE_HOME": "/xdg"}, clear=True): + assert _linux_cache_dir("semble") == Path("/xdg") / "semble" + + +@pytest.mark.parametrize( + ("fn", "expected_rel"), + [ + (_windows_cache_dir, Path("AppData") / "Local" / "semble" / "Cache"), + (_linux_cache_dir, Path(".cache") / "semble"), + ], +) +def test_cache_dir_no_env(fn: object, expected_rel: Path) -> None: + """Both helpers fall back to a home-relative path when no env vars are set.""" + home = Path("/fake/home") + with patch.dict("os.environ", {}, clear=True): + with patch("pathlib.Path.home", return_value=home): + assert fn("semble") == home / expected_rel # type: ignore[operator] + + +def test_save_index_to_cache(tmp_path: Path) -> None: + """A freshly built index is saved under its cache key.""" + index = MagicMock(loaded_from_disk=False) + with patch("semble.cache.find_index_from_cache_folder", return_value=tmp_path / "index"): + save_index_to_cache(index, "repo") + index.save.assert_called_once_with(tmp_path / "index") + + +@pytest.mark.parametrize( + ("platform", "mock_target", "expected"), + [ + ("win32", "semble.cache._windows_cache_dir", Path("/win")), + ("linux", "semble.cache._linux_cache_dir", 
Path("/linux")), + ], +) +def test_resolve_cache_folder(platform: str, mock_target: str, expected: Path) -> None: + """resolve_cache_folder calls the correct platform helper.""" + with patch.object(sys, "platform", platform): + with patch(mock_target, return_value=expected) as mock_fn: + with patch("pathlib.Path.mkdir"): + result = resolve_cache_folder() + mock_fn.assert_called_once_with("semble") + assert result == expected + + +def test_clear_cache(tmp_path: Path) -> None: + """clear_cache removes the index directory when it exists and is a no-op otherwise.""" + index_path = tmp_path / "index" + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + clear_cache("/some/path") # no-op: path doesn't exist yet + index_path.mkdir() + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + clear_cache("/some/path") + assert not index_path.exists() + + +def _write_metadata( + path: Path, + model_path: str, + content_type: list[str], + write_time: float, + file_paths: list[str] | None = None, +) -> None: + path.mkdir(parents=True, exist_ok=True) + (path / "chunks.json").write_text("[]") + (path / "bm25_index").write_text("") + (path / "semantic_index").write_text("") + (path / "metadata.json").write_text( + json.dumps( + { + "model_path": model_path, + "content_type": content_type, + "time": write_time, + "file_paths": file_paths if file_paths is not None else [], + } + ) + ) + + +def test_get_validated_cache_invalid_index(tmp_path: Path) -> None: + """Returns None when the index directory is missing or incomplete.""" + with patch("semble.cache.find_index_from_cache_folder", return_value=tmp_path / "missing"): + assert get_validated_cache("/path", None, [ContentType.CODE]) is None + + index_path = tmp_path / "index" + index_path.mkdir() + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + assert get_validated_cache("/path", None, [ContentType.CODE]) is None + + 
+@pytest.mark.parametrize( + ("stored_model", "stored_content", "req_model", "req_content"), + [ + ("other/model", ["code"], "my/model", [ContentType.CODE]), # model mismatch + ("my/model", ["docs"], "my/model", [ContentType.CODE]), # content mismatch + ("my/model", ["unknown_type"], "my/model", [ContentType.CODE]), # invalid content value + ], +) +def test_get_validated_cache_metadata_mismatch( + stored_model: str, + stored_content: list[str], + req_model: str, + req_content: list[ContentType], + tmp_path: Path, +) -> None: + """Returns None when stored model or content type doesn't match the request.""" + index_path = tmp_path / "index" + _write_metadata(index_path, stored_model, stored_content, 0.0) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + assert get_validated_cache("/path", req_model, req_content) is None + + +def test_get_validated_cache_legacy_metadata_returns_none(tmp_path: Path) -> None: + """Old cache metadata missing content_type returns None instead of crashing.""" + index_path = tmp_path / "index" + index_path.mkdir(parents=True) + (index_path / "chunks.json").write_text("[]") + (index_path / "bm25_index").write_text("") + (index_path / "semantic_index").write_text("") + (index_path / "metadata.json").write_text(json.dumps({"model_path": "my/model", "time": 0.0})) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + assert get_validated_cache("/path", "my/model", [ContentType.CODE]) is None + + +def test_get_validated_cache_resolves_default_model(tmp_path: Path) -> None: + """When model_path is None, resolve_model_name() is used for comparison.""" + index_path = tmp_path / "index" + _write_metadata(index_path, "default/model", ["code"], 0.0) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + with patch("semble.cache.resolve_model_name", return_value="other/model"): + assert get_validated_cache("/path", None, [ContentType.CODE]) is None + + 
+def test_get_validated_cache_git_url_returns_immediately(tmp_path: Path) -> None: + """Git URL paths skip file-mtime checks and return the index path directly.""" + index_path = tmp_path / "index" + _write_metadata(index_path, "my/model", ["code"], 0.0) + url = "https://github.com/org/repo.git" + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + result = get_validated_cache(url, "my/model", [ContentType.CODE]) + assert result == index_path + + +@pytest.mark.parametrize( + ("write_time", "walk_result", "write", "expected"), + [ + (0.0, "stale", True, None), # file newer than index → stale + (float("inf"), [], True, "index"), # no newer files → valid + (float("inf"), "stale", False, None), # no index, returns None + ], +) +def test_get_validated_cache_mtime( + write_time: float, walk_result: str | list, write: bool, expected: str | None, tmp_path: Path +) -> None: + """Returns None when a tracked file is newer than the index; the path otherwise.""" + index_path = tmp_path / "index" + stale_file = tmp_path / "src.py" + stale_file.write_text("x = 1" if write else "") + files = [stale_file] if walk_result == "stale" else walk_result + # Include the file in stored manifest so manifest check passes and mtime check fires. 
+ stored_files = ["src.py"] if walk_result == "stale" else [] + _write_metadata(index_path, "my/model", ["code"], write_time, file_paths=stored_files) + + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + with patch("semble.cache.get_extensions", return_value={".py"}): + with patch("semble.cache.walk_files", return_value=files): + result = get_validated_cache(str(tmp_path), "my/model", [ContentType.CODE]) + assert result == (index_path if expected == "index" else None) + + +@pytest.mark.parametrize( + ("stored_files", "current_files"), + [ + (["deleted.py"], []), # file deleted since indexing + ([], ["new.py"]), # new file added since indexing + ], +) +def test_get_validated_cache_manifest_mismatch( + stored_files: list[str], current_files: list[str], tmp_path: Path +) -> None: + """Returns None when the current file set differs from the stored manifest.""" + index_path = tmp_path / "index" + walk_return = [] + for f in current_files: + p = tmp_path / f + # Make sure file is not empty + p.write_text("a") + walk_return.append(p) + _write_metadata(index_path, "my/model", ["code"], float("inf"), file_paths=stored_files) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + with patch("semble.cache.walk_files", return_value=walk_return): + result = get_validated_cache(str(tmp_path), "my/model", [ContentType.CODE]) + assert result is None diff --git a/tests/test_cli.py b/tests/test_cli.py index b998e44..e0b5de2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -5,7 +5,7 @@ import pytest -from semble.cli import Agent, _agent_path, _cli_main, _run_index, _run_init, main +from semble.cli import Agent, _agent_path, _cli_main, _maybe_save_index, _run_init, main from semble.types import ContentType, SearchResult from tests.conftest import make_chunk @@ -195,51 +195,23 @@ def test_mcp_main_exits_with_message_when_extras_missing( assert "pip install 'semble[mcp]'" in capsys.readouterr().err -def 
test_run_index(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """_run_index creates the output directory and saves the index.""" - out_dir = tmp_path / "index_output" - fake_index = MagicMock() - with patch("semble.cli.SembleIndex.from_path", return_value=fake_index) as mock_from_path: - _run_index(path="/some/path", include_text_files=True, out=str(out_dir)) - mock_from_path.assert_called_once_with("/some/path", include_text_files=True) - assert out_dir.exists() - fake_index.save.assert_called_once_with(str(out_dir)) - - -def test_index_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """_cli_main index subcommand calls _run_index with the correct arguments.""" - out_dir = tmp_path / "built_index" - fake_index = MagicMock() - monkeypatch.setattr(sys, "argv", ["semble", "index", "/some/path", "-o", str(out_dir)]) - with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): - _cli_main() - assert out_dir.exists() - fake_index.save.assert_called_once_with(str(out_dir)) - - -def test_index_git_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """_cli_main index subcommand calls _run_index with the correct arguments.""" - out_dir = tmp_path / "built_index" - fake_index = MagicMock() - monkeypatch.setattr(sys, "argv", ["semble", "index", "git://xyz.git", "-o", str(out_dir)]) - with patch("semble.cli.SembleIndex.from_git", return_value=fake_index): - _cli_main() - assert out_dir.exists() - fake_index.save.assert_called_once_with(str(out_dir)) - - -def test_cli_search_with_prebuilt_index(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: - """_cli_main search with --index loads the pre-built index from disk.""" - chunk = make_chunk("def foo(): pass", "src/foo.py") - fake_index = MagicMock() - fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.95)] - monkeypatch.setattr(sys, "argv", ["semble", "search", "query text", ".", "--index", "/some/prebuilt"]) - with 
patch("semble.cli.SembleIndex.load_from_disk", return_value=fake_index) as mock_load: - _cli_main() - mock_load.assert_called_once_with("/some/prebuilt") - out = capsys.readouterr().out - assert "query text" in out - assert "0.95" in out +@pytest.mark.parametrize( + ("command", "argv"), + [ + ("search", ["semble", "search", "query", "/no/such/path"]), + ("find-related", ["semble", "find-related", "src/foo.py", "1", "/no/such/path"]), + ], +) +def test_cli_path_not_found( + command: str, argv: list[str], monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + """search and find-related exit 1 with a friendly message when the path does not exist.""" + monkeypatch.setattr(sys, "argv", argv) + with patch("semble.cli._build_index", side_effect=FileNotFoundError("Path does not exist: /no/such/path")): + with pytest.raises(SystemExit) as exc_info: + _cli_main() + assert exc_info.value.code == 1 + assert "Path does not exist" in capsys.readouterr().err def test_include_text_files_cli_deprecated( @@ -287,6 +259,16 @@ def test_cli_content_argument( assert list(mock_from_path.call_args.kwargs["content"]) == expected +def test_maybe_save_index_logs_error_on_save_failure(capsys: pytest.CaptureFixture[str]) -> None: + """_maybe_save_index prints to stderr when index.save raises.""" + fake_index = MagicMock() + fake_index.loaded_from_disk = False + fake_index.save.side_effect = OSError("disk full") + with patch("semble.cli.find_index_from_cache_folder", return_value=Path("/cache")): + _maybe_save_index(fake_index, "/some/path") + assert "Error saving index" in capsys.readouterr().err + + def test_agent_file_tools_are_bash_only() -> None: """The agent file must list only Bash and Read — no MCP tools that require schema loading.""" frontmatter = files("semble").joinpath("agents/claude.md").read_text(encoding="utf-8").split("---")[1] diff --git a/tests/test_files.py b/tests/test_files.py index 3ff1bb1..0998967 100644 --- a/tests/test_files.py +++ 
b/tests/test_files.py @@ -41,7 +41,7 @@ def test_language_sets_are_consistent() -> None: ) def test_get_extensions(types: list[ContentType], includes: list[str], excludes: list[str]) -> None: """get_extensions returns the right extensions for each combination of content types.""" - exts = set(get_extensions(types, None)) + exts = set(get_extensions(types)) for ext in includes: assert ext in exts for ext in excludes: @@ -50,17 +50,6 @@ def test_get_extensions(types: list[ContentType], includes: list[str], excludes: def test_all_excludes_data_extensions() -> None: """--content all does not include data file extensions (csv, json, tsv, psv).""" - all_exts = set(get_extensions(list(ContentType), None)) + all_exts = set(get_extensions(list(ContentType))) for ext in (".csv", ".tsv", ".psv", ".json", ".json5"): assert ext not in all_exts, f"{ext} should not be indexed by 'all'" - - -def test_get_extensions_additional() -> None: - """Extra extensions are appended and existing ones are not duplicated.""" - base = get_extensions(list(ContentType), None) - with_extra = get_extensions(list(ContentType), [".kjs"]) - assert set(with_extra) == set(base) | {".kjs"} - - base_code = get_extensions([ContentType.CODE], None) - with_existing = get_extensions([ContentType.CODE], [".py"]) - assert set(with_existing) == set(base_code) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 900477a..8521b32 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -118,12 +118,16 @@ async def test_index_cache_builds_and_caches( """_IndexCache.get() builds via the correct SembleIndex.* entrypoint and caches subsequent calls.""" resolved_source = str(tmp_path) if source == "local_tmp_path" else source fake_index = MagicMock() - with patch(f"semble.mcp.SembleIndex.{patch_target}", return_value=fake_index) as mock_build: + with ( + patch(f"semble.mcp.SembleIndex.{patch_target}", return_value=fake_index) as mock_build, + patch("semble.mcp.save_index_to_cache") as mock_save, + ): first = await 
cache.get(resolved_source) second = await cache.get(resolved_source) assert first is fake_index assert second is fake_index mock_build.assert_called_once() + mock_save.assert_called_once_with(fake_index, cache._compute_cache_key(resolved_source)) @pytest.mark.anyio @@ -146,6 +150,17 @@ def _failing_then_ok(path: str, **kwargs: object) -> MagicMock: assert call_count == 2 +@pytest.mark.anyio +async def test_index_cache_ignores_cache_save_failure(cache: _IndexCache, tmp_path: Path) -> None: + """A cache save failure must not fail the MCP request.""" + fake_index = MagicMock() + with ( + patch("semble.mcp.SembleIndex.from_path", return_value=fake_index), + patch("semble.mcp.save_index_to_cache", side_effect=RuntimeError("save failed")), + ): + assert await cache.get(str(tmp_path)) is fake_index + + @pytest.mark.anyio @pytest.mark.parametrize( ("tool", "args"), diff --git a/tests/test_stats.py b/tests/test_stats.py index d3c7689..e3c1f32 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -2,7 +2,7 @@ import sys from datetime import datetime, timezone from pathlib import Path -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest @@ -32,14 +32,14 @@ def test_save_search_stats(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> N chunk = make_chunk("hello", "src/foo.py") result = SearchResult(chunk=chunk, score=0.9) stats_file = tmp_path / "stats.jsonl" - monkeypatch.setattr("semble.stats._STATS_FILE", stats_file) + monkeypatch.setattr("semble.stats._get_stats_file", lambda: stats_file) save_search_stats([result, result], CallType.SEARCH, {"src/foo.py": 42}) assert json.loads(stats_file.read_text())["file_chars"] == 42 mock_path = MagicMock() mock_path.parent.mkdir.return_value = None mock_path.open.side_effect = OSError("no write") - monkeypatch.setattr("semble.stats._STATS_FILE", mock_path) + monkeypatch.setattr("semble.stats._get_stats_file", lambda: mock_path) save_search_stats([result], CallType.SEARCH, 
{"src/foo.py": 42}) # must not raise @@ -87,6 +87,11 @@ def test_savings_do_not_subtract_unknown_baselines(tmp_path: Path) -> None: assert summary.buckets["All time"].saved_chars == 400 assert "~100 tokens" in format_savings_report(path=stats_file) + with patch("semble.stats._get_stats_file", lambda: stats_file): + summary = build_savings_summary(path=None) + assert summary.buckets["All time"].saved_chars == 400 + assert "~100 tokens" in format_savings_report(path=stats_file) + def test_savings_tolerates_bad_json(tmp_path: Path) -> None: """Malformed JSON lines are skipped with a warning.""" @@ -108,7 +113,7 @@ def test_savings_cli_dispatch( ) -> None: """Savings subcommand dispatches to format_savings_report, with and without --verbose.""" monkeypatch.setattr(sys, "argv", argv) - monkeypatch.setattr("semble.stats._STATS_FILE", tmp_path / "nonexistent.jsonl") + monkeypatch.setattr("semble.stats._get_stats_file", lambda: tmp_path / "nonexistent.jsonl") _cli_main() assert expected in capsys.readouterr().out diff --git a/uv.lock b/uv.lock index 04d014b..95e35ed 100644 --- a/uv.lock +++ b/uv.lock @@ -10,7 +10,7 @@ resolution-markers = [ [options] exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. -exclude-newer-span = "P3D" +exclude-newer-span = "P1W" [[package]] name = "annotated-doc"