diff --git a/README.md b/README.md
index 6a583c2..42f20f2 100644
--- a/README.md
+++ b/README.md
@@ -69,19 +69,7 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
-If you anticipate doing more than one search, use `semble index` to create an index.
-
-```bash
-semble index ./my-project -o my_index
-```
-
-You can then reuse this index later on:
-
-```bash
-semble search "save_pretrained" --index my_index
-```
-
-An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex.
+The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change.
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config:
@@ -97,20 +85,17 @@ Use `semble find-related` to discover code similar to a known location (pass `fi
semble find-related src/auth.py 42 ./my-project
```
-Like search, `find-related` also accepts an `--index` argument.
-
`path` defaults to the current directory when omitted; git URLs are accepted.
If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
### Workflow
-1. Index the repo using `semble index -o cached_index`.
-2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster.
-3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
-4. Inspect full files only when the returned chunk does not give enough context.
-5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+1. Start with `semble search` to find relevant chunks. The index is built and cached automatically.
+2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
+3. Inspect full files only when the returned chunk does not give enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
```
@@ -330,7 +315,55 @@ Add to `~/.config/zed/settings.json` (or `.zed/settings.json` in your project):
By default the MCP server indexes only code files. To also index documentation, config, or everything, append `--content docs`, `--content config`, or `--content all` to the server command, or a combination, e.g. `--content code docs`. For example, in Claude Code: `claude mcp add semble -s user -- uvx --from "semble[mcp]" semble --content all`.
-## Sub-agent setup
+
+
+## Bash / AGENTS.md
+
+An alternative to MCP is to invoke Semble via Bash. Sub-agents cannot call MCP tools directly, so this is the only option for sub-agent support; it can also be used alongside MCP for the top-level agent.
+
+To add Bash support, append the following to your `AGENTS.md`, `CLAUDE.md`, `GEMINI.md`, or equivalent:
+
+```markdown
+## Code Search
+
+Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep:
+
+```bash
+semble search "authentication flow" ./my-project
+semble search "save_pretrained" ./my-project
+semble search "save model to disk" ./my-project --top-k 10
+```
+
+The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change.
+
+Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "database host port" ./my-project --content config
+semble search "authentication" ./my-project --content all
+```
+
+Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
+
+```bash
+semble find-related src/auth.py 42 ./my-project
+```
+
+`path` defaults to the current directory when omitted; git URLs are accepted.
+
+If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
+
+### Workflow
+
+1. Start with `semble search` to find relevant chunks. The index is built and cached automatically.
+2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
+3. Inspect full files only when the returned chunk does not give enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+```
+
+### Sub-agent setup
Claude Code, Gemini CLI, Cursor, OpenCode, GitHub Copilot CLI, and Kiro all support a dedicated semble search sub-agent. Run `semble init` once in your project root:
@@ -350,13 +383,9 @@ If semble is not on `$PATH`, prefix the command with `uvx --from "semble[mcp]"`.
Semble also ships as a standalone CLI. This is useful in scripts or anywhere you want search results without an MCP session.
```bash
-# Search a local repo
+# Search a local repo (index is built and cached automatically)
semble search "authentication flow" ./my-project
-# Index first for faster repeated searches (--index works with any command below)
-semble index ./my-project -o my-index
-semble search "authentication flow" --index my-index
-
# Search a remote repo (cloned on demand)
semble search "save model to disk" https://github.com/MinishLab/model2vec
@@ -372,6 +401,31 @@ semble find-related src/auth.py 42 ./my-project
`--content` accepts `code` (default), `docs`, `config`, or `all`. `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
+
+Controlling which files are indexed
+
+Semble reads `.gitignore` and `.sembleignore` files to determine which files to index. Both files use standard gitignore syntax and their patterns are merged. `.sembleignore` lets you add semble-specific rules without touching `.gitignore`. Rules are applied recursively, so a `.sembleignore` in a subdirectory applies to that subtree.
+
+**Excluding files:** add patterns the same way you would in `.gitignore`:
+
+```
+# .sembleignore: exclude the generated dir and Go protobuf files
+generated/
+*.pb.go
+```
+
+**Including non-default extensions:** prefix the extension pattern with `!` to force-include files that semble wouldn't index by default:
+
+```
+# .sembleignore: include Protobuf and COBOL files
+!*.proto
+!*.cob
+```
+
+Semble also always skips a set of well-known non-source directories regardless of ignore files (e.g. `node_modules/`, `.venv/`, `dist/`, `build/`, `__pycache__/`, and similar).
+
+
+
Savings
@@ -394,7 +448,7 @@ semble savings --verbose # also show breakdown by call type
Savings are calculated as follows: for each call, semble records the total character count of the unique files containing returned chunks and the character count of the snippets returned. Estimated tokens saved is `(file chars − snippet chars) / 4` (4 chars per token). This is a conservative estimate: the baseline is reading matched files in full, which is how coding agents often explore unfamiliar code.
-Stats are stored in `~/.semble/savings.jsonl`.
+Stats are stored in the OS cache folder (`~/Library/Caches/semble/` on macOS, `~/.cache/semble/` on Linux, `%LOCALAPPDATA%\semble\Cache\` on Windows).
diff --git a/benchmarks/baselines/ablations.py b/benchmarks/baselines/ablations.py
index 7f91b67..63bd6ab 100644
--- a/benchmarks/baselines/ablations.py
+++ b/benchmarks/baselines/ablations.py
@@ -17,7 +17,7 @@
)
from benchmarks.run_benchmark import RepoResult, evaluate
from semble import SembleIndex
-from semble.index.dense import _DEFAULT_MODEL_NAME
+from semble.utils import DEFAULT_MODEL_NAME
# alpha=None → raw mode, input depends on query
# alpha=0.0 → hybrid pipeline, BM25-only input
@@ -129,7 +129,7 @@ def main() -> None:
summary = {
"tool": "semble-ablations",
- "model": _DEFAULT_MODEL_NAME,
+ "model": DEFAULT_MODEL_NAME,
"by_mode": summarize_modes(results, modes),
"repos": [asdict(r) for r in results],
}
diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py
index 4af095e..003e25c 100644
--- a/benchmarks/run_benchmark.py
+++ b/benchmarks/run_benchmark.py
@@ -16,8 +16,8 @@
)
from benchmarks.metrics import ndcg_at_k, target_rank
from semble import SembleIndex
-from semble.index.dense import _DEFAULT_MODEL_NAME
from semble.types import SearchResult
+from semble.utils import DEFAULT_MODEL_NAME
_LATENCY_RUNS = 5
_DIRECT_TOP_K = 10
@@ -259,7 +259,7 @@ def _save_results(results: list[RepoResult]) -> None:
n_repos = len(results)
output = {
"tool": "semble-hybrid",
- "model": _DEFAULT_MODEL_NAME,
+ "model": DEFAULT_MODEL_NAME,
"summary": {
"ndcg10": round(sum(r.ndcg10 for r in results) / n_repos, 4),
"tokens": round(sum(r.tokens for r in results) / n_repos, 0),
diff --git a/benchmarks/speed_benchmark.py b/benchmarks/speed_benchmark.py
index b96ad75..1eee147 100644
--- a/benchmarks/speed_benchmark.py
+++ b/benchmarks/speed_benchmark.py
@@ -11,8 +11,8 @@
from benchmarks.data import RepoSpec, Task, available_repo_specs, load_tasks, save_results
from benchmarks.tools import run_colgrep_files, run_ripgrep_count
from semble import SembleIndex
-from semble.index.dense import _DEFAULT_MODEL_NAME
from semble.types import EmbeddingMatrix
+from semble.utils import DEFAULT_MODEL_NAME
# One representative repo per language (medium size, healthy NDCG on the main benchmark).
_REPOS: list[str] = [
@@ -192,7 +192,7 @@ def main() -> None:
print("Loading semble model...", file=sys.stderr)
started = time.perf_counter()
- semble_model = StaticModel.from_pretrained(_DEFAULT_MODEL_NAME)
+ semble_model = StaticModel.from_pretrained(DEFAULT_MODEL_NAME)
print(f" loaded in {(time.perf_counter() - started) * 1000:.0f}ms", file=sys.stderr)
print("Loading CodeRankEmbed...", file=sys.stderr)
diff --git a/benchmarks/token_efficiency.py b/benchmarks/token_efficiency.py
index 7c44ac5..77f10ed 100644
--- a/benchmarks/token_efficiency.py
+++ b/benchmarks/token_efficiency.py
@@ -24,10 +24,10 @@
target_matches_location,
)
from semble import SembleIndex
-from semble.index.dense import _DEFAULT_MODEL_NAME
from semble.index.file_walker import DEFAULT_IGNORED_DIRS, FILE_TYPES, FileCategory
from semble.ranking.boosting import _STOPWORDS as _SEMBLE_STOPWORDS
from semble.types import Chunk
+from semble.utils import DEFAULT_MODEL_NAME
_RG_INCLUDE_GLOBS: tuple[str, ...] = tuple(
f"*{ext}" for ext, spec in FILE_TYPES.items() if spec.category == FileCategory.CODE
@@ -378,7 +378,7 @@ def run_recall(args: argparse.Namespace) -> None:
print("Loading tokenizer + model...", file=sys.stderr)
enc = tiktoken.get_encoding(_TOKENIZER_NAME)
- model = StaticModel.from_pretrained(_DEFAULT_MODEL_NAME)
+ model = StaticModel.from_pretrained(DEFAULT_MODEL_NAME)
method_curves: dict[str, MethodCurves] = defaultdict(list)
print(f"\n{'Repo':<22} {'Language':<12} {'Tasks':>6} {'Time':>8}", file=sys.stderr)
diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md
index 895e282..2cdc0f5 100644
--- a/src/semble/agents/claude.md
+++ b/src/semble/agents/claude.md
@@ -12,19 +12,7 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
-If you anticipate doing more than one search, use `semble index` to create an index.
-
-```bash
-semble index ./my-project -o my_index
-```
-
-You can then reuse this index later on:
-
-```bash
-semble search "save_pretrained" --index my_index
-```
-
-An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex.
+The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change.
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config:
@@ -40,17 +28,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi
semble find-related src/auth.py 42 ./my-project
```
-Like search, `find-related` also accepts an `--index` argument.
-
`path` defaults to the current directory when omitted; git URLs are accepted.
If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
### Workflow
-1. Index the repo using `semble index -o cached_index`.
-2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster.
-3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
-4. Inspect full files only when the returned chunk does not give enough context.
-5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+1. Start with `semble search` to find relevant chunks. The index is built and cached automatically.
+2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
+3. Inspect full files only when the returned chunk does not give enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md
index 895e282..2cdc0f5 100644
--- a/src/semble/agents/copilot.md
+++ b/src/semble/agents/copilot.md
@@ -12,19 +12,7 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
-If you anticipate doing more than one search, use `semble index` to create an index.
-
-```bash
-semble index ./my-project -o my_index
-```
-
-You can then reuse this index later on:
-
-```bash
-semble search "save_pretrained" --index my_index
-```
-
-An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex.
+The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change.
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config:
@@ -40,17 +28,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi
semble find-related src/auth.py 42 ./my-project
```
-Like search, `find-related` also accepts an `--index` argument.
-
`path` defaults to the current directory when omitted; git URLs are accepted.
If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
### Workflow
-1. Index the repo using `semble index -o cached_index`.
-2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster.
-3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
-4. Inspect full files only when the returned chunk does not give enough context.
-5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+1. Start with `semble search` to find relevant chunks. The index is built and cached automatically.
+2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
+3. Inspect full files only when the returned chunk does not give enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md
index baf455c..2071c27 100644
--- a/src/semble/agents/cursor.md
+++ b/src/semble/agents/cursor.md
@@ -11,19 +11,7 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
-If you anticipate doing more than one search, use `semble index` to create an index.
-
-```bash
-semble index ./my-project -o my_index
-```
-
-You can then reuse this index later on:
-
-```bash
-semble search "save_pretrained" --index my_index
-```
-
-An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex.
+The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change.
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config:
@@ -39,17 +27,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi
semble find-related src/auth.py 42 ./my-project
```
-Like search, `find-related` also accepts an `--index` argument.
-
`path` defaults to the current directory when omitted; git URLs are accepted.
If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
### Workflow
-1. Index the repo using `semble index -o cached_index`.
-2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster.
-3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
-4. Inspect full files only when the returned chunk does not give enough context.
-5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+1. Start with `semble search` to find relevant chunks. The index is built and cached automatically.
+2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
+3. Inspect full files only when the returned chunk does not give enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md
index e4e9b6a..a20fcd9 100644
--- a/src/semble/agents/gemini.md
+++ b/src/semble/agents/gemini.md
@@ -14,19 +14,7 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
-If you anticipate doing more than one search, use `semble index` to create an index.
-
-```bash
-semble index ./my-project -o my_index
-```
-
-You can then reuse this index later on:
-
-```bash
-semble search "save_pretrained" --index my_index
-```
-
-An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex.
+The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change.
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config:
@@ -42,17 +30,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi
semble find-related src/auth.py 42 ./my-project
```
-Like search, `find-related` also accepts an `--index` argument.
-
`path` defaults to the current directory when omitted; git URLs are accepted.
If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
### Workflow
-1. Index the repo using `semble index -o cached_index`.
-2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster.
-3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
-4. Inspect full files only when the returned chunk does not give enough context.
-5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+1. Start with `semble search` to find relevant chunks. The index is built and cached automatically.
+2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
+3. Inspect full files only when the returned chunk does not give enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md
index d556c13..bf5d5fc 100644
--- a/src/semble/agents/kiro.md
+++ b/src/semble/agents/kiro.md
@@ -14,19 +14,7 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
-If you anticipate doing more than one search, use `semble index` to create an index.
-
-```bash
-semble index ./my-project -o my_index
-```
-
-You can then reuse this index later on:
-
-```bash
-semble search "save_pretrained" --index my_index
-```
-
-An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex.
+The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change.
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config:
@@ -42,17 +30,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi
semble find-related src/auth.py 42 ./my-project
```
-Like search, `find-related` also accepts an `--index` argument.
-
`path` defaults to the current directory when omitted; git URLs are accepted.
If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
### Workflow
-1. Index the repo using `semble index -o cached_index`.
-2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster.
-3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
-4. Inspect full files only when the returned chunk does not give enough context.
-5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+1. Start with `semble search` to find relevant chunks. The index is built and cached automatically.
+2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
+3. Inspect full files only when the returned chunk does not give enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md
index 2ec43c8..fbfcede 100644
--- a/src/semble/agents/opencode.md
+++ b/src/semble/agents/opencode.md
@@ -15,19 +15,7 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
-If you anticipate doing more than one search, use `semble index` to create an index.
-
-```bash
-semble index ./my-project -o my_index
-```
-
-You can then reuse this index later on:
-
-```bash
-semble search "save_pretrained" --index my_index
-```
-
-An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex.
+The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change.
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config:
@@ -43,17 +31,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi
semble find-related src/auth.py 42 ./my-project
```
-Like search, `find-related` also accepts an `--index` argument.
-
`path` defaults to the current directory when omitted; git URLs are accepted.
If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
### Workflow
-1. Index the repo using `semble index -o cached_index`.
-2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster.
-3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
-4. Inspect full files only when the returned chunk does not give enough context.
-5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+1. Start with `semble search` to find relevant chunks. The index is built and cached automatically.
+2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything.
+3. Inspect full files only when the returned chunk does not give enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/cache.py b/src/semble/cache.py
new file mode 100644
index 0000000..6f6ea49
--- /dev/null
+++ b/src/semble/cache.py
@@ -0,0 +1,124 @@
+import hashlib
+import json
+import os
+import shutil
+import sys
+from collections.abc import Sequence
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from semble.index.file_walker import walk_files
+from semble.index.files import FileStatus, get_extensions, get_file_status
+from semble.index.types import PersistencePath
+from semble.types import ContentType
+from semble.utils import is_git_url, resolve_model_name
+
+if TYPE_CHECKING:
+ from semble.index import SembleIndex
+
+
+def find_index_from_cache_folder(path: str) -> Path:
+ """Return the cache location of the index for a local path or git URL; it may not exist yet."""
+ if is_git_url(path):
+ data = path.encode("utf-8")  # git URLs are hashed verbatim
+ else:
+ normalized = Path(path).expanduser().resolve()  # expand ~ and make absolute before hashing
+ data = str(normalized).encode("utf-8")
+ subdir_path = hashlib.new("sha256", data).hexdigest()  # hex digest names the per-project subfolder
+ cache_dir = resolve_cache_folder() / subdir_path
+ return cache_dir / "index"
+
+
+def _windows_cache_dir(name: str) -> Path:
+ """Return `<base>/<name>/Cache`, with base from LOCALAPPDATA, then APPDATA, then ~/AppData/Local."""
+ env_base = os.getenv("LOCALAPPDATA") or os.getenv("APPDATA")
+ base = Path(env_base) if env_base is not None else Path.home() / "AppData" / "Local"
+ return base / name / "Cache"
+
+
+def _macos_cache_dir(name: str) -> Path:
+ """Return `~/Library/Caches/<name>`, the default macOS cache dir."""
+ return Path.home() / "Library" / "Caches" / name
+
+
+def _linux_cache_dir(name: str) -> Path:
+ """Return `<base>/<name>`, with base from XDG_CACHE_HOME when set and non-empty, else ~/.cache."""
+ env_base = os.getenv("XDG_CACHE_HOME")
+ base = Path(env_base) if env_base else Path.home() / ".cache"
+ return base / name
+
+
+def resolve_cache_folder() -> Path:
+ """Return the platform-specific semble cache folder, creating it if it does not exist."""
+ name = "semble"
+ if sys.platform == "win32":
+ cache_dir = _windows_cache_dir(name)
+ elif sys.platform == "darwin":
+ cache_dir = _macos_cache_dir(name)
+ else:
+ cache_dir = _linux_cache_dir(name)  # every other platform uses the XDG-style layout
+
+ cache_dir.mkdir(parents=True, exist_ok=True)
+ return cache_dir
+
+
+def clear_cache(path: str) -> None:
+ """Delete the cached index directory for the given path, if one exists."""
+ index_path = find_index_from_cache_folder(path)
+ if index_path.exists():
+ shutil.rmtree(index_path)
+
+
+def save_index_to_cache(index: "SembleIndex", path: str) -> None:
+ """Save an index to its cache location unless it was itself loaded from disk."""
+ if not index.loaded_from_disk:
+ index.save(find_index_from_cache_folder(path))
+
+
+def _metadata_matches(metadata: dict, model_path: str, content: Sequence[ContentType]) -> bool:
+ """Return True if the stored model path and content types (compared as sets) equal the requested ones."""
+ try:
+ content_type = tuple(ContentType(s) for s in metadata["content_type"])
+ return metadata["model_path"] == model_path and set(content_type) == set(content)
+ except (KeyError, ValueError):  # missing metadata key or unrecognized content type value
+ return False
+
+
+def get_validated_cache(path: str, model_path: str | None, content: Sequence[ContentType]) -> Path | None:
+ """Return the cached index path if it is still usable for these parameters, else None."""
+ index_path = find_index_from_cache_folder(path)
+ if not index_path.exists():
+ return None
+
+ persistence_path = PersistencePath.from_path(index_path)
+ if persistence_path.non_existing():
+ return None
+
+ if model_path is None:
+ model_path = resolve_model_name()
+ with open(persistence_path.metadata) as f:
+ metadata = json.load(f)
+ if not _metadata_matches(metadata, model_path, content):
+ return None
+
+ if is_git_url(str(path)):
+ return index_path  # git URLs skip the per-file checks below
+
+ write_time = metadata["time"]
+ extensions = get_extensions(content)
+
+ path_as_path = Path(path).resolve()
+ stored_files: list[str] = metadata.get("file_paths", [])
+ current_files = []
+ for file_path in walk_files(path_as_path, extensions=extensions):
+ file_status = get_file_status(file_path, write_time)
+ if file_status == FileStatus.NEWER:
+ return None  # presumably modified after the index was written (confirm in get_file_status)
+ if file_status != FileStatus.VALID:
+ continue
+ current_files.append(str(file_path.relative_to(path_as_path)))
+
+ if set(current_files) != set(stored_files):  # file set differs from the one recorded in metadata
+ return None
+
+ return index_path
diff --git a/src/semble/cli.py b/src/semble/cli.py
index aac944d..1a0e8b4 100644
--- a/src/semble/cli.py
+++ b/src/semble/cli.py
@@ -10,6 +10,7 @@
from model2vec.utils import get_package_extras
+from semble.cache import find_index_from_cache_folder
from semble.index import SembleIndex
from semble.stats import format_savings_report
from semble.types import ContentType
@@ -26,7 +27,26 @@ class Agent(str, Enum):
_DEFAULT_AGENT = Agent.CLAUDE
-_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "savings", "-h", "--help", "index"})
+_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "savings", "-h", "--help"})
+
+
+def _build_index(path: str, content: list[ContentType]) -> SembleIndex:
+ """Create a SembleIndex via `from_git` for git URLs and `from_path` for everything else."""
+ return (
+ SembleIndex.from_git(path, content=content)
+ if is_git_url(path)
+ else SembleIndex.from_path(path, content=content)
+ )
+
+
+def _maybe_save_index(index: SembleIndex, path: str) -> None:
+ """Best-effort save of a non-disk-loaded index to the cache; failures go to stderr, not raised."""
+ if not index.loaded_from_disk:
+ try:
+ cache_folder = find_index_from_cache_folder(path)
+ index.save(cache_folder)
+ except Exception as e:  # a failed cache write must not fail the command itself
+ print(f"Error saving index: {e}", file=sys.stderr)
def _agent_path(agent: Agent) -> Path:
@@ -83,16 +103,6 @@ def _mcp_main() -> None:
asyncio.run(serve(args.path, ref=args.ref, content=content))
-def _run_index(*, path: str, include_text_files: bool = False, out: str) -> None:
- """Index and store a codebase."""
- if is_git_url(path):
- index = SembleIndex.from_git(path, include_text_files=include_text_files)
- else:
- index = SembleIndex.from_path(path, include_text_files=include_text_files)
- Path(out).mkdir(parents=True, exist_ok=True)
- index.save(out)
-
-
def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None:
"""Write the semble sub-agent file for the given coding agent into the current project."""
dest = _agent_path(agent)
@@ -118,24 +128,49 @@ def _resolve_content(content: list[str], include_text_files: bool) -> list[Conte
return [ContentType(c) for c in content]
+def _load_index(path: str, content: list[ContentType]) -> SembleIndex:
+ """Build an index from a local path or git URL, exiting on FileNotFoundError."""
+ try:
+ return _build_index(path, content)
+ except FileNotFoundError as e:
+ print(str(e), file=sys.stderr)
+ sys.exit(1)
+
+
+def _run_search(path: str, query: str, top_k: int, content: list[ContentType]) -> None:
+ """Handle the `search` subcommand."""
+ index = _load_index(path, content)
+ results = index.search(query, top_k=top_k)
+ out = format_results(query, results) if results else {"error": "No results found."}
+ print(json.dumps(out))
+ _maybe_save_index(index, path)
+
+
+def _run_find_related(path: str, file_path: str, line: int, top_k: int, content: list[ContentType]) -> None:
+ """Handle the `find-related` subcommand."""
+ index = _load_index(path, content)
+ chunk = resolve_chunk(index.chunks, file_path, line)
+ if chunk is None:
+ print(f"No chunk found at {file_path}:{line}.", file=sys.stderr)
+ sys.exit(1)
+ results = index.find_related(chunk, top_k=top_k)
+ out = (
+ format_results(f"Chunks related to {file_path}:{line}", results)
+ if results
+ else {"error": f"No related chunks found for {file_path}:{line}."}
+ )
+ print(json.dumps(out))
+ _maybe_save_index(index, path)
+
+
def _cli_main() -> None:
parser = argparse.ArgumentParser(prog="semble")
sub = parser.add_subparsers(dest="command")
- index_p = sub.add_parser("index", help="Index and store a codebase.")
- index_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).")
- index_p.add_argument(
- "--include-text-files",
- action="store_true",
- help="Also index non-code text files (.md, .yaml, .json, etc.).",
- )
- index_p.add_argument("-o", "--out", type=str, required=True, help="The path to write the pre-built index to.")
-
search_p = sub.add_parser("search", help="Search a codebase.")
search_p.add_argument("query", help="Natural language or code query.")
search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).")
search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).")
- search_p.add_argument("--index", type=str, default=None, help="A path pointing to a pre-built index.")
_add_content_args(search_p)
related_p = sub.add_parser("find-related", help="Find code similar to a specific location.")
@@ -143,7 +178,6 @@ def _cli_main() -> None:
related_p.add_argument("line", type=int, help="Line number (1-indexed).")
related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).")
related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).")
- related_p.add_argument("--index", type=str, default=None, help="A path pointing to a pre-built index.")
_add_content_args(related_p)
init_p = sub.add_parser("init", help="Write a semble sub-agent file for your coding agent.")
@@ -163,42 +197,11 @@ def _cli_main() -> None:
if args.command == "init":
_run_init(agent=Agent(args.agent), force=args.force)
- return
-
- if args.command == "index":
- _run_index(path=args.path, include_text_files=args.include_text_files, out=args.out)
- return
-
- if args.command == "savings":
- print(format_savings_report(verbose=args.verbose), end="")
- return
-
- if args.index:
- index = SembleIndex.load_from_disk(args.index)
- else:
- content = _resolve_content(args.content, args.include_text_files)
- index = (
- SembleIndex.from_git(args.path, content=content)
- if is_git_url(args.path)
- else SembleIndex.from_path(args.path, content=content)
- )
-
- if args.command == "search":
- results = index.search(args.query, top_k=args.top_k)
- if not results:
- out = {"error": "No results found."}
- else:
- out = format_results(args.query, results)
- print(json.dumps(out))
-
+ elif args.command == "savings":
+ print(format_savings_report(verbose=args.verbose))
+ elif args.command == "search":
+ _run_search(args.path, args.query, args.top_k, _resolve_content(args.content, args.include_text_files))
elif args.command == "find-related":
- chunk = resolve_chunk(index.chunks, args.file_path, args.line)
- if chunk is None:
- print(f"No chunk found at {args.file_path}:{args.line}.", file=sys.stderr)
- sys.exit(1)
- results = index.find_related(chunk, top_k=args.top_k)
- if not results:
- out = {"error": f"No related chunks found for {args.file_path}:{args.line}."}
- else:
- out = format_results(f"Chunks related to {args.file_path}:{args.line}", results)
- print(json.dumps(out))
+ _run_find_related(
+ args.path, args.file_path, args.line, args.top_k, _resolve_content(args.content, args.include_text_files)
+ )
diff --git a/src/semble/index/create.py b/src/semble/index/create.py
index b72a055..b4dd189 100644
--- a/src/semble/index/create.py
+++ b/src/semble/index/create.py
@@ -9,18 +9,15 @@
from semble.chunking import chunk_source
from semble.index.dense import SelectableBasicBackend, embed_chunks
from semble.index.file_walker import walk_files
-from semble.index.files import detect_language, get_extensions
+from semble.index.files import FileStatus, detect_language, get_extensions, get_file_status, read_file_text
from semble.index.sparse import enrich_for_bm25
from semble.tokens import tokenize
from semble.types import Chunk, ContentType
-_MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index
-
def create_index_from_path(
path: Path,
model: StaticModel,
- extensions: Sequence[str] | None = None,
content: ContentType | Sequence[ContentType] = (ContentType.CODE,),
display_root: Path | None = None,
) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]:
@@ -28,7 +25,6 @@ def create_index_from_path(
:param path: Resolved absolute path to index.
:param model: The model to use for indexing.
- :param extensions: File extensions to include.
:param content: Content types to index.
:param display_root: If set, chunk file paths are stored relative to this root.
:raises ValueError: if no items were found, no index can be created.
@@ -36,13 +32,14 @@ def create_index_from_path(
"""
chunks: list[Chunk] = []
normalized = (content,) if isinstance(content, ContentType) else content
- resolved_extensions = get_extensions(normalized, extensions)
+ resolved_extensions = get_extensions(normalized)
for file_path in walk_files(path, resolved_extensions):
language = detect_language(file_path)
with contextlib.suppress(OSError):
- if file_path.stat().st_size > _MAX_FILE_BYTES:
+ file_status = get_file_status(file_path, None)
+ if file_status != FileStatus.VALID:
continue
- source = file_path.read_text(encoding="utf-8", errors="replace")
+ source = read_file_text(file_path)
chunk_path = file_path.relative_to(display_root) if display_root else file_path
chunks.extend(chunk_source(source, str(chunk_path), language))
diff --git a/src/semble/index/dense.py b/src/semble/index/dense.py
index 9677c22..e3c0e26 100644
--- a/src/semble/index/dense.py
+++ b/src/semble/index/dense.py
@@ -12,8 +12,7 @@
from vicinity.utils import normalize
from semble.types import Chunk
-
-_DEFAULT_MODEL_NAME = "minishlab/potion-code-16M"
+from semble.utils import resolve_model_name
@cache
@@ -32,7 +31,7 @@ def _load_cached(model_path: str) -> StaticModel:
def load_model(model_path: str | None = None) -> tuple[StaticModel, str]:
"""Return the current model, loading the default if none was provided."""
if model_path is None:
- model_path = _DEFAULT_MODEL_NAME
+ model_path = resolve_model_name()
model = _load_cached(model_path)
return model, model_path
diff --git a/src/semble/index/files.py b/src/semble/index/files.py
index a20e804..7aa0702 100644
--- a/src/semble/index/files.py
+++ b/src/semble/index/files.py
@@ -1,9 +1,12 @@
from collections import defaultdict
from collections.abc import Sequence
+from enum import Enum
from pathlib import Path
from semble.types import ContentType
+_MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index
+_EMPTY_FILE_BYTES = 128
_EXTENSION_TO_LANGUAGE = {
".4th": "forth",
".ada": "ada",
@@ -461,7 +464,7 @@ def detect_language(file_name: Path) -> str | None:
return _EXTENSION_TO_LANGUAGE.get(file_name.suffix.lower())
-def get_extensions(types: Sequence[ContentType], extensions: Sequence[str] | None) -> list[str]:
+def get_extensions(types: Sequence[ContentType]) -> list[str]:
"""Returns a list of supported file extensions for the given content types."""
languages: set[str] = set()
for content_type in types:
@@ -469,7 +472,35 @@ def get_extensions(types: Sequence[ContentType], extensions: Sequence[str] | Non
all_extensions: set[str] = set()
for language in languages:
all_extensions.update(_LANGUAGE_TO_EXTENSION.get(language, set()))
- if extensions is not None:
- all_extensions.update(extensions)
return sorted(all_extensions)
+
+
+class FileStatus(str, Enum):
+ NEWER = "newer"
+ TOO_LARGE = "too_large"
+ EMPTY = "empty"
+ VALID = "valid"
+
+
+def read_file_text(file_path: Path) -> str:
+ """Read a file's text content as UTF-8, replacing undecodable bytes."""
+ return file_path.read_text(encoding="utf-8", errors="replace")
+
+
+def get_file_status(file_path: Path, write_time: float | None) -> FileStatus:
+ """Checks if a file should be indexed based on its size and modification time."""
+ stat = file_path.stat()
+ if write_time is not None and stat.st_mtime > write_time:
+ # File was modified after the index was written: the cached index is stale.
+ return FileStatus.NEWER
+ size = stat.st_size
+ if size > _MAX_FILE_BYTES:
+ # File is too large to index; it is skipped without invalidating the index.
+ return FileStatus.TOO_LARGE
+ if size < _EMPTY_FILE_BYTES and not read_file_text(file_path).strip():
+ # File is blank (whitespace only); it is skipped without invalidating the index.
+ return FileStatus.EMPTY
+
+ # Both valid
+ return FileStatus.VALID
diff --git a/src/semble/index/index.py b/src/semble/index/index.py
index 7949471..8bab699 100644
--- a/src/semble/index/index.py
+++ b/src/semble/index/index.py
@@ -15,8 +15,10 @@
from bm25s import BM25
from model2vec.model import StaticModel
+from semble.cache import get_validated_cache
from semble.index.create import create_index_from_path
from semble.index.dense import SelectableBasicBackend, load_model
+from semble.index.files import read_file_text
from semble.index.types import PersistencePath
from semble.search import _search_semantic, search
from semble.stats import save_search_stats
@@ -57,6 +59,7 @@ def __init__(
model_path: str,
root: Path | None = None,
content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT,
+ loaded_from_disk: bool = False,
) -> None:
"""Initialize a SembleIndex. Should be created with from_path or from_git.
@@ -67,6 +70,7 @@ def __init__(
:param model_path: Path to the model file.
:param root: Root directory used to read file sizes for token-savings stats.
:param content: Content type used when indexing; controls the search pipeline.
+ :param loaded_from_disk: True if loaded from disk (cache hit); the CLI skips re-saving such an index.
"""
self.model = model
self.chunks: list[Chunk] = chunks
@@ -77,6 +81,7 @@ def __init__(
self._content: tuple[ContentType, ...] = (content,) if isinstance(content, ContentType) else tuple(content)
self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {}
self._file_mapping, self._language_mapping = self._populate_mapping()
+ self.loaded_from_disk: bool = loaded_from_disk
def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]]:
"""Build (file → chunk indices, language → chunk indices) mappings, in that order."""
@@ -97,7 +102,7 @@ def _compute_file_sizes(self, root: Path) -> dict[str, int]:
if chunk.file_path in sizes:
continue
try:
- sizes[chunk.file_path] = len((root / chunk.file_path).read_text(encoding="utf-8", errors="replace"))
+ sizes[chunk.file_path] = len(read_file_text(root / chunk.file_path))
except OSError:
pass
return sizes
@@ -120,7 +125,6 @@ def stats(self) -> IndexStats:
def from_path(
cls,
path: str | Path,
- extensions: Sequence[str] | None = None,
content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT,
include_text_files: bool | None = None,
model_path: str | None = None,
@@ -128,7 +132,6 @@ def from_path(
"""Create and index a SembleIndex from a directory.
:param path: Root directory to index.
- :param extensions: File extensions to include. Defaults to a standard set of code extensions.
:param content: Content types to index, e.g. ContentType.CODE or [ContentType.CODE, ContentType.DOCS].
:param include_text_files: Deprecated. Pass a content sequence directly instead.
:param model_path: Path to the model to use. If None, the default model will be used.
@@ -136,18 +139,22 @@ def from_path(
:raises FileNotFoundError: If `path` does not exist.
:raises NotADirectoryError: If `path` exists but is not a directory.
"""
- model, model_path = load_model(model_path)
- normalized = _apply_include_text_files(content, include_text_files)
path = Path(path)
if not path.exists():
raise FileNotFoundError(f"Path does not exist: {path}")
if not path.is_dir():
raise NotADirectoryError(f"Path is not a directory: {path}")
+
+ normalized = _apply_include_text_files(content, include_text_files)
+ cache_path = get_validated_cache(str(path), model_path, normalized)
+ if cache_path:
+ return cls.load_from_disk(cache_path)
+ model, model_path = load_model(model_path)
+
path = path.resolve()
bm25, vicinity, chunks = create_index_from_path(
path,
model=model,
- extensions=extensions,
content=normalized,
display_root=path,
)
@@ -159,7 +166,6 @@ def from_git(
cls,
url: str,
ref: str | None = None,
- extensions: Sequence[str] | None = None,
model_path: str | None = None,
content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT,
include_text_files: bool | None = None,
@@ -173,7 +179,6 @@ def from_git(
:param url: URL of the git repository to clone (any git provider).
:param ref: Branch or tag to check out. Defaults to the remote HEAD.
- :param extensions: File extensions to include. Defaults to a standard set of code extensions.
:param model_path: Path to the model to use. If None, the default model will be used.
:param content: Content types to index, e.g. (ContentType.CODE,) or (ContentType.CODE, ContentType.DOCS).
:param include_text_files: Deprecated. Pass content=(ContentType.CODE, ContentType.DOCS, ...) instead.
@@ -181,6 +186,11 @@ def from_git(
:raises RuntimeError: If git is not on PATH, the clone fails, or times out.
"""
normalized = _apply_include_text_files(content, include_text_files)
+ cache_key = f"{url}@{ref}" if ref else url
+ cache_path = get_validated_cache(cache_key, model_path, normalized)
+ if cache_path:
+ return cls.load_from_disk(cache_path)
+
with tempfile.TemporaryDirectory() as tmp_dir:
# `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`).
cmd = ["git", "clone", "--depth", "1", *(["--branch", ref] if ref else []), "--", url, tmp_dir]
@@ -200,12 +210,19 @@ def from_git(
bm25, vicinity, chunks = create_index_from_path(
resolved_path,
model=model,
- extensions=extensions,
content=normalized,
display_root=resolved_path,
)
- return SembleIndex(model, bm25, vicinity, chunks, model_path, root=resolved_path, content=normalized)
+ return SembleIndex(
+ model,
+ bm25,
+ vicinity,
+ chunks,
+ model_path,
+ root=resolved_path,
+ content=normalized,
+ )
def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]:
"""Return chunks semantically similar to the given chunk or search result.
@@ -300,12 +317,22 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex:
chunks.append(Chunk.from_dict(chunk_item))
root_path = metadata["root_path"]
model_path = metadata["model_path"]
+ content = tuple(ContentType(s) for s in metadata.get("content_type", ["code"]))
if root_path:
root_path = Path(root_path)
model, model_path = load_model(model_path)
- return cls(model, bm_25_index, semantic_index, chunks, model_path, root=root_path)
+ return cls(
+ model,
+ bm_25_index,
+ semantic_index,
+ chunks,
+ model_path,
+ root=root_path,
+ content=content,
+ loaded_from_disk=True,
+ )
def save(self, path: Path | str) -> None:
"""Save the index to disk."""
@@ -321,7 +348,13 @@ def save(self, path: Path | str) -> None:
data = orjson.dumps(chunks_as_dict)
f.write(data)
root_str = None if self._root is None else str(self._root)
- metadata = {"root_path": root_str, "time": datetime.now().timestamp(), "model_path": self._model_path}
+ metadata = {
+ "root_path": root_str,
+ "time": datetime.now().timestamp(),
+ "model_path": self._model_path,
+ "content_type": list(x.value for x in self._content),
+ "file_paths": sorted(self._file_mapping),
+ }
with open(persistence_paths.metadata, "wb") as f:
data = orjson.dumps(metadata)
f.write(data)
diff --git a/src/semble/mcp.py b/src/semble/mcp.py
index 3aa526a..f31d0d8 100644
--- a/src/semble/mcp.py
+++ b/src/semble/mcp.py
@@ -12,6 +12,7 @@
from mcp.server.fastmcp import FastMCP
from pydantic import Field
+from semble.cache import save_index_to_cache
from semble.index import SembleIndex
from semble.index.dense import load_model
from semble.types import ContentType
@@ -174,6 +175,19 @@ def _compute_cache_key(self, source: str, ref: str | None = None) -> str:
is_git = is_git_url(source)
return (f"{source}@{ref}" if ref else source) if is_git else str(Path(source).resolve())
+ def _build_and_cache_index(self, source: str, ref: str | None, model_path: str, cache_key: str) -> SembleIndex:
+ """Build an index for the given source and cache it."""
+ index = (
+ SembleIndex.from_git(source, ref=ref, model_path=model_path, content=self._content)
+ if is_git_url(source)
+ else SembleIndex.from_path(cache_key, model_path=model_path, content=self._content)
+ )
+ try:
+ save_index_to_cache(index, cache_key)
+ except Exception:
+ logger.warning("Failed to save index cache for %r", cache_key, exc_info=True)
+ return index
+
def evict(self, source: str) -> None:
self._tasks.pop(self._compute_cache_key(source), None)
@@ -203,25 +217,9 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex:
if cache_key not in self._tasks:
if len(self._tasks) >= _CACHE_MAX_SIZE:
self._tasks.popitem(last=False)
- if is_git_url(source):
- self._tasks[cache_key] = asyncio.create_task(
- asyncio.to_thread(
- SembleIndex.from_git,
- source,
- ref=ref,
- model_path=model_path,
- content=self._content,
- )
- )
- else:
- self._tasks[cache_key] = asyncio.create_task(
- asyncio.to_thread(
- SembleIndex.from_path,
- cache_key,
- model_path=model_path,
- content=self._content,
- )
- )
+ self._tasks[cache_key] = asyncio.create_task(
+ asyncio.to_thread(self._build_and_cache_index, source, ref, model_path, cache_key)
+ )
self._tasks.move_to_end(cache_key)
task = self._tasks[cache_key]
try:
diff --git a/src/semble/search.py b/src/semble/search.py
index f7c8fbb..238d9eb 100644
--- a/src/semble/search.py
+++ b/src/semble/search.py
@@ -105,8 +105,7 @@ def search(
normalized_semantic = _rrf_scores(semantic_scores)
normalized_bm25 = _rrf_scores(bm25_scores)
- # Sort by the file path and start line to
- # counteract randomness introduces by hashing.
+ # Sort by start line to counteract randomness introduced by hashing.
all_candidates = sorted(
{*normalized_semantic, *normalized_bm25},
key=lambda c: c.start_line,
diff --git a/src/semble/stats.py b/src/semble/stats.py
index 90f75a2..bebc988 100644
--- a/src/semble/stats.py
+++ b/src/semble/stats.py
@@ -5,11 +5,15 @@
from datetime import datetime, timedelta, timezone
from pathlib import Path
+from semble.cache import resolve_cache_folder
from semble.types import CallType, SearchResult
logger = logging.getLogger(__name__)
-_STATS_FILE = Path.home() / ".semble" / "savings.jsonl"
+
+def _get_stats_file() -> Path:
+ """Return the path of the savings stats file inside the cache folder."""
+ return resolve_cache_folder() / "savings.jsonl"
@dataclass
@@ -52,15 +56,18 @@ def save_search_stats(
"snippet_chars": snippet_chars,
"file_chars": file_chars,
}
- _STATS_FILE.parent.mkdir(parents=True, exist_ok=True)
- with _STATS_FILE.open("a") as f:
+ stats_file = _get_stats_file()
+ stats_file.parent.mkdir(parents=True, exist_ok=True)
+ with stats_file.open("a") as f:
f.write(json.dumps(record) + "\n")
except OSError:
pass
-def build_savings_summary(path: Path = _STATS_FILE) -> SavingsSummary:
+def build_savings_summary(path: Path | None = None) -> SavingsSummary:
"""Read savings.jsonl and return a SavingsSummary."""
+ if path is None:
+ path = _get_stats_file()
now = datetime.now(timezone.utc)
today = now.date()
seven_days_ago = (now - timedelta(days=7)).date()
@@ -98,7 +105,7 @@ def build_savings_summary(path: Path = _STATS_FILE) -> SavingsSummary:
def format_savings_report(path: Path | None = None, *, verbose: bool = False) -> str:
"""Return a formatted token-savings report."""
if path is None:
- path = _STATS_FILE
+ path = _get_stats_file()
if not path.exists():
return "No stats yet. Run a search first."
diff --git a/src/semble/utils.py b/src/semble/utils.py
index 4b71395..b11ee29 100644
--- a/src/semble/utils.py
+++ b/src/semble/utils.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+import os
import re
from typing import Any
@@ -7,6 +8,7 @@
_GIT_URL_SCHEMES = ("https://", "http://", "ssh://", "git://", "git+ssh://", "file://")
_SCP_GIT_URL_RE = re.compile(r"^[\w.-]+@[\w.-]+:(?!/)")
+DEFAULT_MODEL_NAME = "minishlab/potion-code-16M"
def is_git_url(path: str) -> bool:
@@ -33,3 +35,8 @@ def resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | Non
def format_results(query: str, results: list[SearchResult]) -> dict[str, Any]:
"""Render SearchResult objects as a JSONable object."""
return {"query": query, "results": [r.to_dict() for r in results]}
+
+
+def resolve_model_name() -> str:
+ """Return the model name from SEMBLE_MODEL_NAME, falling back to the default model."""
+ return os.environ.get("SEMBLE_MODEL_NAME", DEFAULT_MODEL_NAME)
diff --git a/src/semble/version.py b/src/semble/version.py
index 6036609..b7e3609 100644
--- a/src/semble/version.py
+++ b/src/semble/version.py
@@ -1,2 +1,2 @@
-__version_triple__ = (0, 2, 0)
+__version_triple__ = (0, 3, 0)
__version__ = ".".join(map(str, __version_triple__))
diff --git a/tests/index/test_index.py b/tests/index/test_index.py
index a4d05b8..76a9d04 100644
--- a/tests/index/test_index.py
+++ b/tests/index/test_index.py
@@ -6,7 +6,8 @@
from model2vec import StaticModel
from semble import SembleIndex
-from semble.index.create import _MAX_FILE_BYTES, create_index_from_path
+from semble.index.create import create_index_from_path
+from semble.index.files import _MAX_FILE_BYTES, FileStatus, get_file_status
from semble.types import ContentType
from tests.conftest import make_chunk
@@ -74,6 +75,13 @@ def test_oversized_file_is_skipped(mock_model: StaticModel, tmp_path: Path) -> N
create_index_from_path(tmp_path, mock_model)
+def test_tiny_invalid_utf8_file_status_does_not_crash(tmp_path: Path) -> None:
+ """Tiny files with invalid UTF-8 bytes are treated as non-empty."""
+ path = tmp_path / "latin1.py"
+ path.write_bytes(b"\xff")
+ assert get_file_status(path, None) is FileStatus.VALID
+
+
def test_index_language_counts(indexed_index: SembleIndex) -> None:
"""Language breakdown in stats includes python with at least one chunk."""
stats = indexed_index.stats
@@ -185,6 +193,22 @@ def test_roundtrip(tmp_path: Path, indexed_index: SembleIndex) -> None:
assert index_2._root == indexed_index._root
+def test_load_save_roundtrip_preserves_manifest(tmp_path: Path, indexed_index: SembleIndex) -> None:
+ """load_from_disk followed by save must not clobber file_paths with an empty list."""
+ save_a = tmp_path / "a"
+ save_b = tmp_path / "b"
+ indexed_index.save(save_a)
+ with patch.object(StaticModel, "from_pretrained"):
+ loaded = SembleIndex.load_from_disk(save_a)
+ loaded.save(save_b)
+ import json
+
+ manifest_a = json.loads((save_a / "metadata.json").read_text())["file_paths"]
+ manifest_b = json.loads((save_b / "metadata.json").read_text())["file_paths"]
+ assert manifest_b == manifest_a
+ assert len(manifest_b) > 0
+
+
def test_load_non_existent(tmp_path: Path, indexed_index: SembleIndex) -> None:
"""Test that saving and loading a folder leads to the same data."""
with pytest.raises(FileNotFoundError):
@@ -208,3 +232,22 @@ def test_load_from_disk_missing_files_reports_them(tmp_path: Path) -> None:
assert "metadata.json" in error_msg
# The file we did create should NOT be listed as missing.
assert "chunks.json" not in error_msg
+
+
+def test_from_path_uses_cache_when_valid(tmp_project: Path) -> None:
+ """from_path returns the cached index directly when get_validated_cache hits."""
+ fake_cached = MagicMock(spec=SembleIndex)
+ with patch("semble.index.index.get_validated_cache", return_value=tmp_project / "cache"):
+ with patch.object(SembleIndex, "load_from_disk", return_value=fake_cached):
+ result = SembleIndex.from_path(tmp_project)
+ assert result is fake_cached
+
+
+@pytest.mark.parametrize("ref", [None, "v1.0"])
+def test_from_git_uses_cache_when_valid(ref: str | None) -> None:
+ """from_git uses the cache for both URL-only and URL@ref cache keys."""
+ fake_cached = MagicMock(spec=SembleIndex)
+ with patch("semble.index.index.get_validated_cache", return_value=Path("/cache")):
+ with patch.object(SembleIndex, "load_from_disk", return_value=fake_cached):
+ result = SembleIndex.from_git("https://github.com/org/repo.git", ref=ref)
+ assert result is fake_cached
diff --git a/tests/test_cache.py b/tests/test_cache.py
new file mode 100644
index 0000000..e46eb20
--- /dev/null
+++ b/tests/test_cache.py
@@ -0,0 +1,243 @@
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from semble.cache import (
+ _linux_cache_dir,
+ _windows_cache_dir,
+ clear_cache,
+ find_index_from_cache_folder,
+ get_validated_cache,
+ resolve_cache_folder,
+ save_index_to_cache,
+)
+from semble.types import ContentType
+
+
+def test_find_index_from_cache_folder_local_path(tmp_path: Path) -> None:
+ """Local paths are normalised before hashing, result ends with /index."""
+ result = find_index_from_cache_folder(str(tmp_path))
+ assert result.name == "index"
+ assert result == find_index_from_cache_folder(str(tmp_path))
+
+
+def test_find_index_from_cache_folder_git_url() -> None:
+ """Git URLs are hashed as-is (not expanded via Path.resolve)."""
+ url = "https://github.com/org/repo.git"
+ result = find_index_from_cache_folder(url)
+ assert result.name == "index"
+ assert result != find_index_from_cache_folder("https://github.com/org/other.git")
+
+
+@pytest.mark.parametrize(
+ ("env", "expected_base"),
+ [
+ ({"LOCALAPPDATA": "C:\\Local", "APPDATA": "C:\\Roaming"}, "C:\\Local"),
+ ({"APPDATA": "C:\\Roaming"}, "C:\\Roaming"),
+ ],
+)
+def test_windows_cache_dir_env(env: dict[str, str], expected_base: str) -> None:
+ """_windows_cache_dir prefers LOCALAPPDATA, falls back to APPDATA."""
+ with patch.dict("os.environ", env, clear=True):
+ assert _windows_cache_dir("semble") == Path(expected_base) / "semble" / "Cache"
+
+
+def test_linux_cache_dir_with_xdg() -> None:
+ """_linux_cache_dir uses XDG_CACHE_HOME when set."""
+ with patch.dict("os.environ", {"XDG_CACHE_HOME": "/xdg"}, clear=True):
+ assert _linux_cache_dir("semble") == Path("/xdg") / "semble"
+
+
+@pytest.mark.parametrize(
+ ("fn", "expected_rel"),
+ [
+ (_windows_cache_dir, Path("AppData") / "Local" / "semble" / "Cache"),
+ (_linux_cache_dir, Path(".cache") / "semble"),
+ ],
+)
+def test_cache_dir_no_env(fn: object, expected_rel: Path) -> None:
+ """Both helpers fall back to a home-relative path when no env vars are set."""
+ home = Path("/fake/home")
+ with patch.dict("os.environ", {}, clear=True):
+ with patch("pathlib.Path.home", return_value=home):
+ assert fn("semble") == home / expected_rel # type: ignore[operator]
+
+
+def test_save_index_to_cache(tmp_path: Path) -> None:
+ """A freshly built index is saved under its cache key."""
+ index = MagicMock(loaded_from_disk=False)
+ with patch("semble.cache.find_index_from_cache_folder", return_value=tmp_path / "index"):
+ save_index_to_cache(index, "repo")
+ index.save.assert_called_once_with(tmp_path / "index")
+
+
+@pytest.mark.parametrize(
+ ("platform", "mock_target", "expected"),
+ [
+ ("win32", "semble.cache._windows_cache_dir", Path("/win")),
+ ("linux", "semble.cache._linux_cache_dir", Path("/linux")),
+ ],
+)
+def test_resolve_cache_folder(platform: str, mock_target: str, expected: Path) -> None:
+ """resolve_cache_folder calls the correct platform helper."""
+ with patch.object(sys, "platform", platform):
+ with patch(mock_target, return_value=expected) as mock_fn:
+ with patch("pathlib.Path.mkdir"):
+ result = resolve_cache_folder()
+ mock_fn.assert_called_once_with("semble")
+ assert result == expected
+
+
+def test_clear_cache(tmp_path: Path) -> None:
+ """clear_cache removes the index directory when it exists and is a no-op otherwise."""
+ index_path = tmp_path / "index"
+ with patch("semble.cache.find_index_from_cache_folder", return_value=index_path):
+ clear_cache("/some/path") # no-op: path doesn't exist yet
+ index_path.mkdir()
+ with patch("semble.cache.find_index_from_cache_folder", return_value=index_path):
+ clear_cache("/some/path")
+ assert not index_path.exists()
+
+
+def _write_metadata(
+ path: Path,
+ model_path: str,
+ content_type: list[str],
+ write_time: float,
+ file_paths: list[str] | None = None,
+) -> None:
+ path.mkdir(parents=True, exist_ok=True)
+ (path / "chunks.json").write_text("[]")
+ (path / "bm25_index").write_text("")
+ (path / "semantic_index").write_text("")
+ (path / "metadata.json").write_text(
+ json.dumps(
+ {
+ "model_path": model_path,
+ "content_type": content_type,
+ "time": write_time,
+ "file_paths": file_paths if file_paths is not None else [],
+ }
+ )
+ )
+
+
+def test_get_validated_cache_invalid_index(tmp_path: Path) -> None:
+ """Returns None when the index directory is missing or incomplete."""
+ with patch("semble.cache.find_index_from_cache_folder", return_value=tmp_path / "missing"):
+ assert get_validated_cache("/path", None, [ContentType.CODE]) is None
+
+ index_path = tmp_path / "index"
+ index_path.mkdir()
+ with patch("semble.cache.find_index_from_cache_folder", return_value=index_path):
+ assert get_validated_cache("/path", None, [ContentType.CODE]) is None
+
+
+@pytest.mark.parametrize(
+ ("stored_model", "stored_content", "req_model", "req_content"),
+ [
+ ("other/model", ["code"], "my/model", [ContentType.CODE]), # model mismatch
+ ("my/model", ["docs"], "my/model", [ContentType.CODE]), # content mismatch
+ ("my/model", ["unknown_type"], "my/model", [ContentType.CODE]), # invalid content value
+ ],
+)
+def test_get_validated_cache_metadata_mismatch(
+ stored_model: str,
+ stored_content: list[str],
+ req_model: str,
+ req_content: list[ContentType],
+ tmp_path: Path,
+) -> None:
+ """Returns None when stored model or content type doesn't match the request."""
+ index_path = tmp_path / "index"
+ _write_metadata(index_path, stored_model, stored_content, 0.0)
+ with patch("semble.cache.find_index_from_cache_folder", return_value=index_path):
+ assert get_validated_cache("/path", req_model, req_content) is None
+
+
+def test_get_validated_cache_legacy_metadata_returns_none(tmp_path: Path) -> None:
+ """Old cache metadata missing content_type returns None instead of crashing."""
+ index_path = tmp_path / "index"
+ index_path.mkdir(parents=True)
+ (index_path / "chunks.json").write_text("[]")
+ (index_path / "bm25_index").write_text("")
+ (index_path / "semantic_index").write_text("")
+ (index_path / "metadata.json").write_text(json.dumps({"model_path": "my/model", "time": 0.0}))
+ with patch("semble.cache.find_index_from_cache_folder", return_value=index_path):
+ assert get_validated_cache("/path", "my/model", [ContentType.CODE]) is None
+
+
+def test_get_validated_cache_resolves_default_model(tmp_path: Path) -> None:
+ """When model_path is None, resolve_model_name() is used for comparison."""
+ index_path = tmp_path / "index"
+ _write_metadata(index_path, "default/model", ["code"], 0.0)
+ with patch("semble.cache.find_index_from_cache_folder", return_value=index_path):
+ with patch("semble.cache.resolve_model_name", return_value="other/model"):
+ assert get_validated_cache("/path", None, [ContentType.CODE]) is None
+
+
+def test_get_validated_cache_git_url_returns_immediately(tmp_path: Path) -> None:
+ """Git URL paths skip file-mtime checks and return the index path directly."""
+ index_path = tmp_path / "index"
+ _write_metadata(index_path, "my/model", ["code"], 0.0)
+ url = "https://github.com/org/repo.git"
+ with patch("semble.cache.find_index_from_cache_folder", return_value=index_path):
+ result = get_validated_cache(url, "my/model", [ContentType.CODE])
+ assert result == index_path
+
+
+@pytest.mark.parametrize(
+ ("write_time", "walk_result", "write", "expected"),
+ [
+ (0.0, "stale", True, None), # file newer than index → stale
+ (float("inf"), [], True, "index"), # no newer files → valid
+ (float("inf"), "stale", False, None), # empty file is skipped → manifest mismatch
+ ],
+)
+def test_get_validated_cache_mtime(
+ write_time: float, walk_result: str | list, write: bool, expected: str | None, tmp_path: Path
+) -> None:
+ """Returns None when a tracked file is newer than the index; the path otherwise."""
+ index_path = tmp_path / "index"
+ stale_file = tmp_path / "src.py"
+ stale_file.write_text("x = 1" if write else "")
+ files = [stale_file] if walk_result == "stale" else walk_result
+ # Include the file in stored manifest so manifest check passes and mtime check fires.
+ stored_files = ["src.py"] if walk_result == "stale" else []
+ _write_metadata(index_path, "my/model", ["code"], write_time, file_paths=stored_files)
+
+ with patch("semble.cache.find_index_from_cache_folder", return_value=index_path):
+ with patch("semble.cache.get_extensions", return_value={".py"}):
+ with patch("semble.cache.walk_files", return_value=files):
+ result = get_validated_cache(str(tmp_path), "my/model", [ContentType.CODE])
+ assert result == (index_path if expected == "index" else None)
+
+
+@pytest.mark.parametrize(
+ ("stored_files", "current_files"),
+ [
+ (["deleted.py"], []), # file deleted since indexing
+ ([], ["new.py"]), # new file added since indexing
+ ],
+)
+def test_get_validated_cache_manifest_mismatch(
+ stored_files: list[str], current_files: list[str], tmp_path: Path
+) -> None:
+ """Returns None when the current file set differs from the stored manifest."""
+ index_path = tmp_path / "index"
+ walk_return = []
+ for f in current_files:
+ p = tmp_path / f
+ # Make sure file is not empty
+ p.write_text("a")
+ walk_return.append(p)
+ _write_metadata(index_path, "my/model", ["code"], float("inf"), file_paths=stored_files)
+ with patch("semble.cache.find_index_from_cache_folder", return_value=index_path):
+ with patch("semble.cache.walk_files", return_value=walk_return):
+ result = get_validated_cache(str(tmp_path), "my/model", [ContentType.CODE])
+ assert result is None
diff --git a/tests/test_cli.py b/tests/test_cli.py
index b998e44..e0b5de2 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -5,7 +5,7 @@
import pytest
-from semble.cli import Agent, _agent_path, _cli_main, _run_index, _run_init, main
+from semble.cli import Agent, _agent_path, _cli_main, _maybe_save_index, _run_init, main
from semble.types import ContentType, SearchResult
from tests.conftest import make_chunk
@@ -195,51 +195,23 @@ def test_mcp_main_exits_with_message_when_extras_missing(
assert "pip install 'semble[mcp]'" in capsys.readouterr().err
-def test_run_index(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
- """_run_index creates the output directory and saves the index."""
- out_dir = tmp_path / "index_output"
- fake_index = MagicMock()
- with patch("semble.cli.SembleIndex.from_path", return_value=fake_index) as mock_from_path:
- _run_index(path="/some/path", include_text_files=True, out=str(out_dir))
- mock_from_path.assert_called_once_with("/some/path", include_text_files=True)
- assert out_dir.exists()
- fake_index.save.assert_called_once_with(str(out_dir))
-
-
-def test_index_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
- """_cli_main index subcommand calls _run_index with the correct arguments."""
- out_dir = tmp_path / "built_index"
- fake_index = MagicMock()
- monkeypatch.setattr(sys, "argv", ["semble", "index", "/some/path", "-o", str(out_dir)])
- with patch("semble.cli.SembleIndex.from_path", return_value=fake_index):
- _cli_main()
- assert out_dir.exists()
- fake_index.save.assert_called_once_with(str(out_dir))
-
-
-def test_index_git_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
- """_cli_main index subcommand calls _run_index with the correct arguments."""
- out_dir = tmp_path / "built_index"
- fake_index = MagicMock()
- monkeypatch.setattr(sys, "argv", ["semble", "index", "git://xyz.git", "-o", str(out_dir)])
- with patch("semble.cli.SembleIndex.from_git", return_value=fake_index):
- _cli_main()
- assert out_dir.exists()
- fake_index.save.assert_called_once_with(str(out_dir))
-
-
-def test_cli_search_with_prebuilt_index(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None:
- """_cli_main search with --index loads the pre-built index from disk."""
- chunk = make_chunk("def foo(): pass", "src/foo.py")
- fake_index = MagicMock()
- fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.95)]
- monkeypatch.setattr(sys, "argv", ["semble", "search", "query text", ".", "--index", "/some/prebuilt"])
- with patch("semble.cli.SembleIndex.load_from_disk", return_value=fake_index) as mock_load:
- _cli_main()
- mock_load.assert_called_once_with("/some/prebuilt")
- out = capsys.readouterr().out
- assert "query text" in out
- assert "0.95" in out
+@pytest.mark.parametrize(
+ ("command", "argv"),
+ [
+ ("search", ["semble", "search", "query", "/no/such/path"]),
+ ("find-related", ["semble", "find-related", "src/foo.py", "1", "/no/such/path"]),
+ ],
+)
+def test_cli_path_not_found(
+ command: str, argv: list[str], monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
+) -> None:
+ """search and find-related exit 1 with a friendly message when the path does not exist."""
+ monkeypatch.setattr(sys, "argv", argv)
+ with patch("semble.cli._build_index", side_effect=FileNotFoundError("Path does not exist: /no/such/path")):
+ with pytest.raises(SystemExit) as exc_info:
+ _cli_main()
+ assert exc_info.value.code == 1
+ assert "Path does not exist" in capsys.readouterr().err
def test_include_text_files_cli_deprecated(
@@ -287,6 +259,16 @@ def test_cli_content_argument(
assert list(mock_from_path.call_args.kwargs["content"]) == expected
+def test_maybe_save_index_logs_error_on_save_failure(capsys: pytest.CaptureFixture[str]) -> None:
+ """_maybe_save_index reports the error on stderr instead of raising when index.save fails."""
+ fake_index = MagicMock()
+ fake_index.loaded_from_disk = False
+ fake_index.save.side_effect = OSError("disk full")
+ with patch("semble.cli.find_index_from_cache_folder", return_value=Path("/cache")):
+ _maybe_save_index(fake_index, "/some/path")
+ assert "Error saving index" in capsys.readouterr().err
+
+
def test_agent_file_tools_are_bash_only() -> None:
"""The agent file must list only Bash and Read — no MCP tools that require schema loading."""
frontmatter = files("semble").joinpath("agents/claude.md").read_text(encoding="utf-8").split("---")[1]
diff --git a/tests/test_files.py b/tests/test_files.py
index 3ff1bb1..0998967 100644
--- a/tests/test_files.py
+++ b/tests/test_files.py
@@ -41,7 +41,7 @@ def test_language_sets_are_consistent() -> None:
)
def test_get_extensions(types: list[ContentType], includes: list[str], excludes: list[str]) -> None:
"""get_extensions returns the right extensions for each combination of content types."""
- exts = set(get_extensions(types, None))
+ exts = set(get_extensions(types))
for ext in includes:
assert ext in exts
for ext in excludes:
@@ -50,17 +50,6 @@ def test_get_extensions(types: list[ContentType], includes: list[str], excludes:
def test_all_excludes_data_extensions() -> None:
"""--content all does not include data file extensions (csv, json, tsv, psv)."""
- all_exts = set(get_extensions(list(ContentType), None))
+ all_exts = set(get_extensions(list(ContentType)))
for ext in (".csv", ".tsv", ".psv", ".json", ".json5"):
assert ext not in all_exts, f"{ext} should not be indexed by 'all'"
-
-
-def test_get_extensions_additional() -> None:
- """Extra extensions are appended and existing ones are not duplicated."""
- base = get_extensions(list(ContentType), None)
- with_extra = get_extensions(list(ContentType), [".kjs"])
- assert set(with_extra) == set(base) | {".kjs"}
-
- base_code = get_extensions([ContentType.CODE], None)
- with_existing = get_extensions([ContentType.CODE], [".py"])
- assert set(with_existing) == set(base_code)
diff --git a/tests/test_mcp.py b/tests/test_mcp.py
index 900477a..8521b32 100644
--- a/tests/test_mcp.py
+++ b/tests/test_mcp.py
@@ -118,12 +118,16 @@ async def test_index_cache_builds_and_caches(
"""_IndexCache.get() builds via the correct SembleIndex.* entrypoint and caches subsequent calls."""
resolved_source = str(tmp_path) if source == "local_tmp_path" else source
fake_index = MagicMock()
- with patch(f"semble.mcp.SembleIndex.{patch_target}", return_value=fake_index) as mock_build:
+ with (
+ patch(f"semble.mcp.SembleIndex.{patch_target}", return_value=fake_index) as mock_build,
+ patch("semble.mcp.save_index_to_cache") as mock_save,
+ ):
first = await cache.get(resolved_source)
second = await cache.get(resolved_source)
assert first is fake_index
assert second is fake_index
mock_build.assert_called_once()
+ mock_save.assert_called_once_with(fake_index, cache._compute_cache_key(resolved_source))
@pytest.mark.anyio
@@ -146,6 +150,17 @@ def _failing_then_ok(path: str, **kwargs: object) -> MagicMock:
assert call_count == 2
+@pytest.mark.anyio
+async def test_index_cache_ignores_cache_save_failure(cache: _IndexCache, tmp_path: Path) -> None:
+ """_IndexCache.get() still returns the built index when save_index_to_cache raises."""
+ fake_index = MagicMock()
+ with (
+ patch("semble.mcp.SembleIndex.from_path", return_value=fake_index),
+ patch("semble.mcp.save_index_to_cache", side_effect=RuntimeError("save failed")),
+ ):
+ assert await cache.get(str(tmp_path)) is fake_index
+
+
@pytest.mark.anyio
@pytest.mark.parametrize(
("tool", "args"),
diff --git a/tests/test_stats.py b/tests/test_stats.py
index d3c7689..e3c1f32 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -2,7 +2,7 @@
import sys
from datetime import datetime, timezone
from pathlib import Path
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
import pytest
@@ -32,14 +32,14 @@ def test_save_search_stats(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> N
chunk = make_chunk("hello", "src/foo.py")
result = SearchResult(chunk=chunk, score=0.9)
stats_file = tmp_path / "stats.jsonl"
- monkeypatch.setattr("semble.stats._STATS_FILE", stats_file)
+ monkeypatch.setattr("semble.stats._get_stats_file", lambda: stats_file)
save_search_stats([result, result], CallType.SEARCH, {"src/foo.py": 42})
assert json.loads(stats_file.read_text())["file_chars"] == 42
mock_path = MagicMock()
mock_path.parent.mkdir.return_value = None
mock_path.open.side_effect = OSError("no write")
- monkeypatch.setattr("semble.stats._STATS_FILE", mock_path)
+ monkeypatch.setattr("semble.stats._get_stats_file", lambda: mock_path)
save_search_stats([result], CallType.SEARCH, {"src/foo.py": 42}) # must not raise
@@ -87,6 +87,11 @@ def test_savings_do_not_subtract_unknown_baselines(tmp_path: Path) -> None:
assert summary.buckets["All time"].saved_chars == 400
assert "~100 tokens" in format_savings_report(path=stats_file)
+ with patch("semble.stats._get_stats_file", lambda: stats_file):
+ summary = build_savings_summary(path=None)
+ assert summary.buckets["All time"].saved_chars == 400
+ assert "~100 tokens" in format_savings_report(path=stats_file)
+
def test_savings_tolerates_bad_json(tmp_path: Path) -> None:
"""Malformed JSON lines are skipped with a warning."""
@@ -108,7 +113,7 @@ def test_savings_cli_dispatch(
) -> None:
"""Savings subcommand dispatches to format_savings_report, with and without --verbose."""
monkeypatch.setattr(sys, "argv", argv)
- monkeypatch.setattr("semble.stats._STATS_FILE", tmp_path / "nonexistent.jsonl")
+ monkeypatch.setattr("semble.stats._get_stats_file", lambda: tmp_path / "nonexistent.jsonl")
_cli_main()
assert expected in capsys.readouterr().out
diff --git a/uv.lock b/uv.lock
index 04d014b..95e35ed 100644
--- a/uv.lock
+++ b/uv.lock
@@ -10,7 +10,7 @@ resolution-markers = [
[options]
exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values.
-exclude-newer-span = "P3D"
+exclude-newer-span = "P1W"
[[package]]
name = "annotated-doc"