From 3fc0f28b1a3106c176377c8fac5fbc3df3e4b18a Mon Sep 17 00:00:00 2001 From: stephantul Date: Fri, 22 May 2026 17:08:46 +0200 Subject: [PATCH 01/17] feat: store index in caches folder --- src/semble/cli.py | 30 ++++++++++++++---------------- src/semble/stats.py | 3 ++- src/semble/utils.py | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 17 deletions(-) diff --git a/src/semble/cli.py b/src/semble/cli.py index 441dd1c..8e886a4 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -6,13 +6,14 @@ from importlib.resources import files from importlib.util import find_spec from pathlib import Path +from typing import Sequence from model2vec.utils import get_package_extras from semble.index import SembleIndex from semble.stats import format_savings_report from semble.types import ContentType -from semble.utils import format_results, is_git_url, resolve_chunk +from semble.utils import find_index_from_cache_folder, format_results, is_git_url, resolve_chunk class Agent(str, Enum): @@ -82,14 +83,15 @@ def _mcp_main() -> None: asyncio.run(serve(args.path, ref=args.ref, content=content)) -def _run_index(*, path: str, include_text_files: bool = False, out: str) -> None: +def _run_index(*, path: str, content: Sequence[ContentType], include_text_files: bool | None) -> None: """Index and store a codebase.""" if is_git_url(path): - index = SembleIndex.from_git(path, include_text_files=include_text_files) + index = SembleIndex.from_git(path, content=content, include_text_files=include_text_files) else: - index = SembleIndex.from_path(path, include_text_files=include_text_files) - Path(out).mkdir(parents=True, exist_ok=True) - index.save(out) + index = SembleIndex.from_path(path, content=content, include_text_files=include_text_files) + index_path = find_index_from_cache_folder(Path(path)) + print(f"Wrote index to `{index_path}`.") + index.save(index_path) def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None: @@ -123,18 +125,13 @@ 
def _cli_main() -> None: index_p = sub.add_parser("index", help="Index and store a codebase.") index_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") - index_p.add_argument( - "--include-text-files", - action="store_true", - help="Also index non-code text files (.md, .yaml, .json, etc.).", - ) - index_p.add_argument("-o", "--out", type=str, required=True, help="The path to write the pre-built index to.") + _add_content_args(index_p) search_p = sub.add_parser("search", help="Search a codebase.") search_p.add_argument("query", help="Natural language or code query.") search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") - search_p.add_argument("--index", type=str, default=None, help="A path pointing to a pre-built index.") + search_p.add_argument("--index", action="store_true", help="Use an index at the default path.") _add_content_args(search_p) related_p = sub.add_parser("find-related", help="Find code similar to a specific location.") @@ -165,15 +162,16 @@ def _cli_main() -> None: return if args.command == "index": - _run_index(path=args.path, include_text_files=args.include_text_files, out=args.out) + _run_index(path=args.path, content=args.content, include_text_files=args.include_text_files) return if args.command == "savings": - print(format_savings_report(verbose=args.verbose), end="") + print(format_savings_report(verbose=args.verbose)) return if args.index: - index = SembleIndex.load_from_disk(args.index) + path = find_index_from_cache_folder(Path(args.path)) + index = SembleIndex.load_from_disk(path) else: content = _resolve_content(args.content, args.include_text_files) index = ( diff --git a/src/semble/stats.py b/src/semble/stats.py index 90f75a2..d33bfd4 100644 --- a/src/semble/stats.py +++ b/src/semble/stats.py @@ -6,10 +6,11 @@ from 
pathlib import Path from semble.types import CallType, SearchResult +from semble.utils import resolve_cache_folder logger = logging.getLogger(__name__) -_STATS_FILE = Path.home() / ".semble" / "savings.jsonl" +_STATS_FILE = resolve_cache_folder() / "savings.jsonl" @dataclass diff --git a/src/semble/utils.py b/src/semble/utils.py index f8c16f1..955ae3b 100644 --- a/src/semble/utils.py +++ b/src/semble/utils.py @@ -1,6 +1,10 @@ from __future__ import annotations +import hashlib +import os import re +import sys +from pathlib import Path from semble.types import Chunk, SearchResult @@ -13,6 +17,38 @@ def is_git_url(path: str) -> bool: return path.startswith(_GIT_URL_SCHEMES) or _SCP_GIT_URL_RE.match(path) is not None +def find_index_from_cache_folder(path: Path) -> Path: + """Finds an index from a cache folder and a project path.""" + normalized = path.expanduser().resolve() + data = str(normalized).encode("utf-8") + subdir_path = hashlib.new("sha256", data).hexdigest() + cache_dir = resolve_cache_folder() / subdir_path + return cache_dir / "index" + + +def resolve_cache_folder() -> Path: + """Resolves a cache folder, respects XDG_CACHE_HOME.""" + name = "semble" + if sys.platform == "win32": + base = os.getenv("LOCALAPPDATA") or os.getenv("APPDATA") + if base is None: + base = Path.home() / "AppData" / "Local" + else: + base = Path(base) + cache_dir = base / name / "Cache" + elif sys.platform == "darwin": + cache_dir = Path.home() / "Library" / "Caches" / name + else: + base = os.getenv("XDG_CACHE_HOME") + if base: + cache_dir = Path(base) / name + else: + cache_dir = Path.home() / ".cache" / name + + cache_dir.mkdir(parents=True, exist_ok=True) + return cache_dir + + def resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | None: """Return the chunk containing *line* in *file_path*, or None. 
From 3a67a20ba1f028cecc46f3a8f9c1690e06264f8c Mon Sep 17 00:00:00 2001 From: stephantul Date: Tue, 26 May 2026 16:12:17 +0200 Subject: [PATCH 02/17] commit with broken tests --- benchmarks/baselines/ablations.py | 4 +- benchmarks/run_benchmark.py | 4 +- benchmarks/speed_benchmark.py | 4 +- benchmarks/token_efficiency.py | 4 +- src/semble/cache.py | 104 ++++++++++++++++++++++++++++++ src/semble/cli.py | 53 ++++++++------- src/semble/index/dense.py | 5 +- src/semble/index/index.py | 26 ++++++-- src/semble/search.py | 3 +- src/semble/stats.py | 2 +- src/semble/utils.py | 41 ++---------- tests/test_cli.py | 42 +++++------- 12 files changed, 186 insertions(+), 106 deletions(-) create mode 100644 src/semble/cache.py diff --git a/benchmarks/baselines/ablations.py b/benchmarks/baselines/ablations.py index 7f91b67..c948e07 100644 --- a/benchmarks/baselines/ablations.py +++ b/benchmarks/baselines/ablations.py @@ -17,7 +17,7 @@ ) from benchmarks.run_benchmark import RepoResult, evaluate from semble import SembleIndex -from semble.index.dense import _DEFAULT_MODEL_NAME +from semble.index.dense import DEFAULT_MODEL_NAME # alpha=None → raw mode, input depends on query # alpha=0.0 → hybrid pipeline, BM25-only input @@ -129,7 +129,7 @@ def main() -> None: summary = { "tool": "semble-ablations", - "model": _DEFAULT_MODEL_NAME, + "model": DEFAULT_MODEL_NAME, "by_mode": summarize_modes(results, modes), "repos": [asdict(r) for r in results], } diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 4af095e..75137af 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -16,7 +16,7 @@ ) from benchmarks.metrics import ndcg_at_k, target_rank from semble import SembleIndex -from semble.index.dense import _DEFAULT_MODEL_NAME +from semble.index.dense import DEFAULT_MODEL_NAME from semble.types import SearchResult _LATENCY_RUNS = 5 @@ -259,7 +259,7 @@ def _save_results(results: list[RepoResult]) -> None: n_repos = len(results) output = { "tool": 
"semble-hybrid", - "model": _DEFAULT_MODEL_NAME, + "model": DEFAULT_MODEL_NAME, "summary": { "ndcg10": round(sum(r.ndcg10 for r in results) / n_repos, 4), "tokens": round(sum(r.tokens for r in results) / n_repos, 0), diff --git a/benchmarks/speed_benchmark.py b/benchmarks/speed_benchmark.py index b96ad75..aee3061 100644 --- a/benchmarks/speed_benchmark.py +++ b/benchmarks/speed_benchmark.py @@ -11,7 +11,7 @@ from benchmarks.data import RepoSpec, Task, available_repo_specs, load_tasks, save_results from benchmarks.tools import run_colgrep_files, run_ripgrep_count from semble import SembleIndex -from semble.index.dense import _DEFAULT_MODEL_NAME +from semble.index.dense import DEFAULT_MODEL_NAME from semble.types import EmbeddingMatrix # One representative repo per language (medium size, healthy NDCG on the main benchmark). @@ -192,7 +192,7 @@ def main() -> None: print("Loading semble model...", file=sys.stderr) started = time.perf_counter() - semble_model = StaticModel.from_pretrained(_DEFAULT_MODEL_NAME) + semble_model = StaticModel.from_pretrained(DEFAULT_MODEL_NAME) print(f" loaded in {(time.perf_counter() - started) * 1000:.0f}ms", file=sys.stderr) print("Loading CodeRankEmbed...", file=sys.stderr) diff --git a/benchmarks/token_efficiency.py b/benchmarks/token_efficiency.py index 7c44ac5..cc62873 100644 --- a/benchmarks/token_efficiency.py +++ b/benchmarks/token_efficiency.py @@ -24,7 +24,7 @@ target_matches_location, ) from semble import SembleIndex -from semble.index.dense import _DEFAULT_MODEL_NAME +from semble.index.dense import DEFAULT_MODEL_NAME from semble.index.file_walker import DEFAULT_IGNORED_DIRS, FILE_TYPES, FileCategory from semble.ranking.boosting import _STOPWORDS as _SEMBLE_STOPWORDS from semble.types import Chunk @@ -378,7 +378,7 @@ def run_recall(args: argparse.Namespace) -> None: print("Loading tokenizer + model...", file=sys.stderr) enc = tiktoken.get_encoding(_TOKENIZER_NAME) - model = StaticModel.from_pretrained(_DEFAULT_MODEL_NAME) + 
model = StaticModel.from_pretrained(DEFAULT_MODEL_NAME) method_curves: dict[str, MethodCurves] = defaultdict(list) print(f"\n{'Repo':<22} {'Language':<12} {'Tasks':>6} {'Time':>8}", file=sys.stderr) diff --git a/src/semble/cache.py b/src/semble/cache.py new file mode 100644 index 0000000..ac5091e --- /dev/null +++ b/src/semble/cache.py @@ -0,0 +1,104 @@ +import hashlib +import json +import os +import shutil +import sys +from collections.abc import Sequence +from pathlib import Path + +from semble.index.file_walker import walk_files +from semble.index.files import get_extensions +from semble.index.types import PersistencePath +from semble.types import ContentType +from semble.utils import is_git_url, resolve_model_name + + +def find_index_from_cache_folder(path: str) -> Path: + """Finds an index from a cache folder and a project path.""" + if is_git_url(path): + data = path.encode("utf-8") + else: + normalized = Path(path).expanduser().resolve() + data = str(normalized).encode("utf-8") + subdir_path = hashlib.new("sha256", data).hexdigest() + cache_dir = resolve_cache_folder() / subdir_path + return cache_dir / "index" + + +def _windows_cache_dir(name: str) -> Path: + """Get the default windows cache dir.""" + env_base = os.getenv("LOCALAPPDATA") or os.getenv("APPDATA") + base = Path(env_base) if env_base is not None else Path.home() / "AppData" / "Local" + return base / name / "Cache" + + +def _macos_cache_dir(name: str) -> Path: + """Get the default macOS cache dir.""" + return Path.home() / "Library" / "Caches" / name + + +def _linux_cache_dir(name: str) -> Path: + """Get the default Linux cache dir.""" + env_base = os.getenv("XDG_CACHE_HOME") + base = Path(env_base) if env_base else Path.home() / ".cache" + return base / name + + +def resolve_cache_folder() -> Path: + """Resolves a cache folder, respects XDG_CACHE_HOME.""" + name = "semble" + if sys.platform == "win32": + cache_dir = _windows_cache_dir(name) + if sys.platform == "darwin": + cache_dir = 
_macos_cache_dir(name) + else: + cache_dir = _linux_cache_dir(name) + + cache_dir.mkdir(parents=True, exist_ok=True) + return cache_dir + + +def clear_cache(path: str) -> None: + """Clears the cache for the given path.""" + index_path = find_index_from_cache_folder(path) + if index_path and index_path.exists(): + shutil.rmtree(index_path) + + +def get_validated_cache(path: str, model_path: str | None, content: Sequence[ContentType]) -> Path | None: + """Validates the cache folder and returns the index path.""" + index_path = find_index_from_cache_folder(path) + if not index_path.exists(): + return None + + persistence_path = PersistencePath.from_path(index_path) + if persistence_path.non_existing(): + return None + + with open(persistence_path.metadata) as f: + metadata = json.load(f) + model_path_from_index = metadata["model_path"] + if model_path is None: + model_path = resolve_model_name() + if model_path_from_index != model_path: + return None + + content_type_strings: list[str] = metadata["content_type"] + + content_type = tuple(ContentType(string) for string in content_type_strings) + if set(content_type) != set(content): + return None + + if is_git_url(str(path)): + return index_path + + write_time = metadata["time"] + extensions = get_extensions(content_type, None) + + path_as_path = Path(path) + for file_path in walk_files(path_as_path, extensions=extensions): + st = file_path.stat() + if st.st_mtime > write_time: + return None + + return index_path diff --git a/src/semble/cli.py b/src/semble/cli.py index e6ee880..b75f9c7 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -2,19 +2,20 @@ import asyncio import json import sys +import time import warnings from enum import Enum from importlib.resources import files from importlib.util import find_spec from pathlib import Path -from typing import Sequence from model2vec.utils import get_package_extras +from semble.cache import find_index_from_cache_folder from semble.index import SembleIndex from 
semble.stats import format_savings_report from semble.types import ContentType -from semble.utils import find_index_from_cache_folder, format_results, is_git_url, resolve_chunk +from semble.utils import format_results, is_git_url, resolve_chunk class Agent(str, Enum): @@ -30,6 +31,19 @@ class Agent(str, Enum): _CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "savings", "-h", "--help", "index"}) +def _maybe_save_index(index: SembleIndex, path: str, creation_time: float) -> None: + """Maybe save an index. Based on the index itself and the creation time.""" + # If the index was not loaded from disk, + # the index was invalidated or it didn't exist. + # If the creation time of the index was < 1 second, don't save + if creation_time > 1.0 and not index.loaded_from_disk: + try: + cache_folder = find_index_from_cache_folder(path) + index.save(cache_folder) + except Exception as e: + print(f"Error saving index: {e}", file=sys.stderr) + + def _agent_path(agent: Agent) -> Path: """Return the project-relative path where the semble sub-agent file should be written.""" base_dir = ".github" if agent is Agent.COPILOT else f".{agent.value}" @@ -84,17 +98,6 @@ def _mcp_main() -> None: asyncio.run(serve(args.path, ref=args.ref, content=content)) -def _run_index(*, path: str, content: Sequence[ContentType], include_text_files: bool | None) -> None: - """Index and store a codebase.""" - if is_git_url(path): - index = SembleIndex.from_git(path, content=content, include_text_files=include_text_files) - else: - index = SembleIndex.from_path(path, content=content, include_text_files=include_text_files) - index_path = find_index_from_cache_folder(Path(path)) - print(f"Wrote index to `{index_path}`.") - index.save(index_path) - - def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None: """Write the semble sub-agent file for the given coding agent into the current project.""" dest = _agent_path(agent) @@ -162,24 +165,18 @@ def _cli_main() -> None: 
_run_init(agent=Agent(args.agent), force=args.force) return - if args.command == "index": - _run_index(path=args.path, content=args.content, include_text_files=args.include_text_files) - return - if args.command == "savings": print(format_savings_report(verbose=args.verbose)) return - if args.index: - path = find_index_from_cache_folder(Path(args.path)) - index = SembleIndex.load_from_disk(path) - else: - content = _resolve_content(args.content, args.include_text_files) - index = ( - SembleIndex.from_git(args.path, content=content) - if is_git_url(args.path) - else SembleIndex.from_path(args.path, content=content) - ) + content = _resolve_content(args.content, args.include_text_files) + start = time.time() + index = ( + SembleIndex.from_git(args.path, content=content) + if is_git_url(args.path) + else SembleIndex.from_path(args.path, content=content) + ) + creation_time = time.time() - start if args.command == "search": results = index.search(args.query, top_k=args.top_k) @@ -200,3 +197,5 @@ def _cli_main() -> None: else: out = format_results(f"Chunks related to {args.file_path}:{args.line}", results) print(json.dumps(out)) + + _maybe_save_index(index, args.path, creation_time) diff --git a/src/semble/index/dense.py b/src/semble/index/dense.py index 9677c22..e3c0e26 100644 --- a/src/semble/index/dense.py +++ b/src/semble/index/dense.py @@ -12,8 +12,7 @@ from vicinity.utils import normalize from semble.types import Chunk - -_DEFAULT_MODEL_NAME = "minishlab/potion-code-16M" +from semble.utils import resolve_model_name @cache @@ -32,7 +31,7 @@ def _load_cached(model_path: str) -> StaticModel: def load_model(model_path: str | None = None) -> tuple[StaticModel, str]: """Return the current model, loading the default if none was provided.""" if model_path is None: - model_path = _DEFAULT_MODEL_NAME + model_path = resolve_model_name() model = _load_cached(model_path) return model, model_path diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 
7949471..b4437a4 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -15,6 +15,7 @@ from bm25s import BM25 from model2vec.model import StaticModel +from semble.cache import get_validated_cache from semble.index.create import create_index_from_path from semble.index.dense import SelectableBasicBackend, load_model from semble.index.types import PersistencePath @@ -57,6 +58,7 @@ def __init__( model_path: str, root: Path | None = None, content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, + loaded_from_disk: bool = False, ) -> None: """Initialize a SembleIndex. Should be created with from_path or from_git. @@ -77,6 +79,7 @@ def __init__( self._content: tuple[ContentType, ...] = (content,) if isinstance(content, ContentType) else tuple(content) self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {} self._file_mapping, self._language_mapping = self._populate_mapping() + self.loaded_from_disk: bool = loaded_from_disk def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]]: """Build (file → chunk indices, language → chunk indices) mappings, in that order.""" @@ -136,13 +139,18 @@ def from_path( :raises FileNotFoundError: If `path` does not exist. :raises NotADirectoryError: If `path` exists but is not a directory. """ - model, model_path = load_model(model_path) - normalized = _apply_include_text_files(content, include_text_files) path = Path(path) if not path.exists(): raise FileNotFoundError(f"Path does not exist: {path}") if not path.is_dir(): raise NotADirectoryError(f"Path is not a directory: {path}") + + normalized = _apply_include_text_files(content, include_text_files) + cache_path = get_validated_cache(str(path), model_path, normalized) + if cache_path: + return cls.load_from_disk(cache_path) + model, model_path = load_model(model_path) + path = path.resolve() bm25, vicinity, chunks = create_index_from_path( path, @@ -180,7 +188,12 @@ def from_git( :return: An indexed SembleIndex. 
Chunk file paths are repo-relative (e.g. ``src/foo.py``). :raises RuntimeError: If git is not on PATH, the clone fails, or times out. """ + path = f"{url}@{ref}" if ref else url normalized = _apply_include_text_files(content, include_text_files) + cache_path = get_validated_cache(path, model_path, normalized) + if cache_path: + return cls.load_from_disk(cache_path) + with tempfile.TemporaryDirectory() as tmp_dir: # `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`). cmd = ["git", "clone", "--depth", "1", *(["--branch", ref] if ref else []), "--", url, tmp_dir] @@ -305,7 +318,7 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: model, model_path = load_model(model_path) - return cls(model, bm_25_index, semantic_index, chunks, model_path, root=root_path) + return cls(model, bm_25_index, semantic_index, chunks, model_path, root=root_path, loaded_from_disk=True) def save(self, path: Path | str) -> None: """Save the index to disk.""" @@ -321,7 +334,12 @@ def save(self, path: Path | str) -> None: data = orjson.dumps(chunks_as_dict) f.write(data) root_str = None if self._root is None else str(self._root) - metadata = {"root_path": root_str, "time": datetime.now().timestamp(), "model_path": self._model_path} + metadata = { + "root_path": root_str, + "time": datetime.now().timestamp(), + "model_path": self._model_path, + "content_type": list(x.value for x in self._content), + } with open(persistence_paths.metadata, "wb") as f: data = orjson.dumps(metadata) f.write(data) diff --git a/src/semble/search.py b/src/semble/search.py index f7c8fbb..238d9eb 100644 --- a/src/semble/search.py +++ b/src/semble/search.py @@ -105,8 +105,7 @@ def search( normalized_semantic = _rrf_scores(semantic_scores) normalized_bm25 = _rrf_scores(bm25_scores) - # Sort by the file path and start line to - # counteract randomness introduces by hashing. + # Sort by start line to counteract randomness introduced by hashing. 
all_candidates = sorted( {*normalized_semantic, *normalized_bm25}, key=lambda c: c.start_line, diff --git a/src/semble/stats.py b/src/semble/stats.py index d33bfd4..58f6097 100644 --- a/src/semble/stats.py +++ b/src/semble/stats.py @@ -5,8 +5,8 @@ from datetime import datetime, timedelta, timezone from pathlib import Path +from semble.cache import resolve_cache_folder from semble.types import CallType, SearchResult -from semble.utils import resolve_cache_folder logger = logging.getLogger(__name__) diff --git a/src/semble/utils.py b/src/semble/utils.py index 593988d..0015c77 100644 --- a/src/semble/utils.py +++ b/src/semble/utils.py @@ -1,16 +1,14 @@ from __future__ import annotations -import hashlib import os import re -import sys -from pathlib import Path from typing import Any from semble.types import Chunk, SearchResult _GIT_URL_SCHEMES = ("https://", "http://", "ssh://", "git://", "git+ssh://", "file://") _SCP_GIT_URL_RE = re.compile(r"^[\w.-]+@[\w.-]+:(?!/)") +_DEFAULT_MODEL_NAME = "minishlab/potion-code-16M" def is_git_url(path: str) -> bool: @@ -18,38 +16,6 @@ def is_git_url(path: str) -> bool: return path.startswith(_GIT_URL_SCHEMES) or _SCP_GIT_URL_RE.match(path) is not None -def find_index_from_cache_folder(path: Path) -> Path: - """Finds an index from a cache folder and a project path.""" - normalized = path.expanduser().resolve() - data = str(normalized).encode("utf-8") - subdir_path = hashlib.new("sha256", data).hexdigest() - cache_dir = resolve_cache_folder() / subdir_path - return cache_dir / "index" - - -def resolve_cache_folder() -> Path: - """Resolves a cache folder, respects XDG_CACHE_HOME.""" - name = "semble" - if sys.platform == "win32": - base = os.getenv("LOCALAPPDATA") or os.getenv("APPDATA") - if base is None: - base = Path.home() / "AppData" / "Local" - else: - base = Path(base) - cache_dir = base / name / "Cache" - elif sys.platform == "darwin": - cache_dir = Path.home() / "Library" / "Caches" / name - else: - base = 
os.getenv("XDG_CACHE_HOME") - if base: - cache_dir = Path(base) / name - else: - cache_dir = Path.home() / ".cache" / name - - cache_dir.mkdir(parents=True, exist_ok=True) - return cache_dir - - def resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | None: """Return the chunk containing *line* in *file_path*, or None. @@ -69,3 +35,8 @@ def resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | Non def format_results(query: str, results: list[SearchResult]) -> dict[str, Any]: """Render SearchResult objects as a JSONable object.""" return {"query": query, "results": [r.to_dict() for r in results]} + + +def resolve_model_name() -> str: + """Resolve a model name to a configurable.""" + return os.environ.get("SEMBLE_MODEL_NAME", _DEFAULT_MODEL_NAME) diff --git a/tests/test_cli.py b/tests/test_cli.py index b998e44..4e5285a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -5,7 +5,7 @@ import pytest -from semble.cli import Agent, _agent_path, _cli_main, _run_index, _run_init, main +from semble.cli import Agent, _agent_path, _cli_main, _run_init, main from semble.types import ContentType, SearchResult from tests.conftest import make_chunk @@ -195,37 +195,26 @@ def test_mcp_main_exits_with_message_when_extras_missing( assert "pip install 'semble[mcp]'" in capsys.readouterr().err -def test_run_index(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """_run_index creates the output directory and saves the index.""" - out_dir = tmp_path / "index_output" - fake_index = MagicMock() - with patch("semble.cli.SembleIndex.from_path", return_value=fake_index) as mock_from_path: - _run_index(path="/some/path", include_text_files=True, out=str(out_dir)) - mock_from_path.assert_called_once_with("/some/path", include_text_files=True) - assert out_dir.exists() - fake_index.save.assert_called_once_with(str(out_dir)) - - def test_index_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: """_cli_main index subcommand 
calls _run_index with the correct arguments.""" - out_dir = tmp_path / "built_index" + out_dir = tmp_path / "index" fake_index = MagicMock() - monkeypatch.setattr(sys, "argv", ["semble", "index", "/some/path", "-o", str(out_dir)]) - with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): - _cli_main() + monkeypatch.setattr(sys, "argv", ["semble", "index", "/some/path"]) + with patch("semble.cache.get_validated_cache", return_value=tmp_path): + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + _cli_main() assert out_dir.exists() fake_index.save.assert_called_once_with(str(out_dir)) def test_index_git_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: """_cli_main index subcommand calls _run_index with the correct arguments.""" - out_dir = tmp_path / "built_index" fake_index = MagicMock() - monkeypatch.setattr(sys, "argv", ["semble", "index", "git://xyz.git", "-o", str(out_dir)]) - with patch("semble.cli.SembleIndex.from_git", return_value=fake_index): - _cli_main() - assert out_dir.exists() - fake_index.save.assert_called_once_with(str(out_dir)) + with patch("semble.cache.get_validated_cache", lambda: tmp_path): + monkeypatch.setattr(sys, "argv", ["semble", "search", "git://xyz.git"]) + with patch("semble.cli.SembleIndex.from_git", return_value=fake_index): + _cli_main() + fake_index.save.assert_called_once_with(str(tmp_path)) def test_cli_search_with_prebuilt_index(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: @@ -233,10 +222,11 @@ def test_cli_search_with_prebuilt_index(monkeypatch: pytest.MonkeyPatch, capsys: chunk = make_chunk("def foo(): pass", "src/foo.py") fake_index = MagicMock() fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.95)] - monkeypatch.setattr(sys, "argv", ["semble", "search", "query text", ".", "--index", "/some/prebuilt"]) - with patch("semble.cli.SembleIndex.load_from_disk", return_value=fake_index) as mock_load: - _cli_main() - 
mock_load.assert_called_once_with("/some/prebuilt") + monkeypatch.setattr(sys, "argv", ["semble", "search", "query text"]) + with patch("semble.cache.get_validated_cache", return_value="/some/prebuilt"): + with patch("semble.cli.SembleIndex.load_from_disk", return_value=fake_index) as mock_load: + _cli_main() + mock_load.assert_called_once() out = capsys.readouterr().out assert "query text" in out assert "0.95" in out From 970397d15d47304fdd72e4b48b1dfe683187bf19 Mon Sep 17 00:00:00 2001 From: Pringled Date: Tue, 26 May 2026 17:00:49 +0200 Subject: [PATCH 03/17] Update tests --- tests/index/test_index.py | 18 ++++ tests/test_cache.py | 185 ++++++++++++++++++++++++++++++++++++++ tests/test_cli.py | 59 ++++++------ 3 files changed, 236 insertions(+), 26 deletions(-) create mode 100644 tests/test_cache.py diff --git a/tests/index/test_index.py b/tests/index/test_index.py index a4d05b8..9b8be68 100644 --- a/tests/index/test_index.py +++ b/tests/index/test_index.py @@ -208,3 +208,21 @@ def test_load_from_disk_missing_files_reports_them(tmp_path: Path) -> None: assert "metadata.json" in error_msg # The file we did create should NOT be listed as missing. 
assert "chunks.json" not in error_msg + + +def test_from_path_uses_cache_when_valid(tmp_project: Path) -> None: + """from_path returns the cached index directly when get_validated_cache hits.""" + fake_cached = MagicMock(spec=SembleIndex) + with patch("semble.index.index.get_validated_cache", return_value=tmp_project / "cache"): + with patch.object(SembleIndex, "load_from_disk", return_value=fake_cached): + result = SembleIndex.from_path(tmp_project) + assert result is fake_cached + + +def test_from_git_uses_cache_when_valid() -> None: + """from_git returns the cached index directly when get_validated_cache hits.""" + fake_cached = MagicMock(spec=SembleIndex) + with patch("semble.index.index.get_validated_cache", return_value=Path("/cache")): + with patch.object(SembleIndex, "load_from_disk", return_value=fake_cached): + result = SembleIndex.from_git("https://github.com/org/repo.git") + assert result is fake_cached diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..21986d1 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +import json +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest + +from semble.cache import ( + _linux_cache_dir, + _windows_cache_dir, + clear_cache, + find_index_from_cache_folder, + get_validated_cache, + resolve_cache_folder, +) +from semble.types import ContentType + + +def test_find_index_from_cache_folder_local_path(tmp_path: Path) -> None: + """Local paths are normalised before hashing, result ends with /index.""" + result = find_index_from_cache_folder(str(tmp_path)) + assert result.name == "index" + assert result == find_index_from_cache_folder(str(tmp_path)) + + +def test_find_index_from_cache_folder_git_url() -> None: + """Git URLs are hashed as-is (not expanded via Path.resolve).""" + url = "https://github.com/org/repo.git" + result = find_index_from_cache_folder(url) + assert result.name == "index" + assert 
result != find_index_from_cache_folder("https://github.com/org/other.git") + + +@pytest.mark.parametrize( + ("env", "expected_base"), + [ + ({"LOCALAPPDATA": "C:\\Local", "APPDATA": "C:\\Roaming"}, "C:\\Local"), + ({"APPDATA": "C:\\Roaming"}, "C:\\Roaming"), + ], +) +def test_windows_cache_dir_env(env: dict[str, str], expected_base: str) -> None: + """_windows_cache_dir prefers LOCALAPPDATA, falls back to APPDATA.""" + with patch.dict("os.environ", env, clear=True): + assert _windows_cache_dir("semble") == Path(expected_base) / "semble" / "Cache" + + +def test_linux_cache_dir_with_xdg() -> None: + """_linux_cache_dir uses XDG_CACHE_HOME when set.""" + with patch.dict("os.environ", {"XDG_CACHE_HOME": "/xdg"}, clear=True): + assert _linux_cache_dir("semble") == Path("/xdg") / "semble" + + +@pytest.mark.parametrize( + ("fn", "expected_rel"), + [ + (_windows_cache_dir, Path("AppData") / "Local" / "semble" / "Cache"), + (_linux_cache_dir, Path(".cache") / "semble"), + ], +) +def test_cache_dir_no_env(fn: object, expected_rel: Path) -> None: + """Both helpers fall back to a home-relative path when no env vars are set.""" + home = Path("/fake/home") + with patch.dict("os.environ", {}, clear=True): + with patch("pathlib.Path.home", return_value=home): + assert fn("semble") == home / expected_rel # type: ignore[operator] + + +def test_resolve_cache_folder_windows() -> None: + """resolve_cache_folder calls _windows_cache_dir on win32.""" + with patch.object(sys, "platform", "win32"): + with patch("semble.cache._windows_cache_dir", return_value=Path("/win")) as mock_win: + with patch("semble.cache._linux_cache_dir", return_value=Path("/linux")) as mock_linux: + with patch("pathlib.Path.mkdir"): + result = resolve_cache_folder() + mock_win.assert_called_once_with("semble") + mock_linux.assert_called_once_with("semble") + assert result == Path("/linux") + + +def test_resolve_cache_folder_linux() -> None: + """resolve_cache_folder calls _linux_cache_dir on non-darwin 
platforms.""" + with patch.object(sys, "platform", "linux"): + with patch("semble.cache._linux_cache_dir", return_value=Path("/linux")) as mock_linux: + with patch("pathlib.Path.mkdir"): + result = resolve_cache_folder() + mock_linux.assert_called_once_with("semble") + assert result == Path("/linux") + + +def test_clear_cache(tmp_path: Path) -> None: + """clear_cache removes the index directory when it exists and is a no-op otherwise.""" + index_path = tmp_path / "index" + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + clear_cache("/some/path") # no-op: path doesn't exist yet + index_path.mkdir() + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + clear_cache("/some/path") + assert not index_path.exists() + + +def _write_metadata(path: Path, model_path: str, content_type: list[str], write_time: float) -> None: + path.mkdir(parents=True, exist_ok=True) + (path / "chunks.json").write_text("[]") + (path / "bm25_index").write_text("") + (path / "semantic_index").write_text("") + (path / "metadata.json").write_text( + json.dumps({"model_path": model_path, "content_type": content_type, "time": write_time}) + ) + + +def test_get_validated_cache_invalid_index(tmp_path: Path) -> None: + """Returns None when the index directory is missing or incomplete.""" + with patch("semble.cache.find_index_from_cache_folder", return_value=tmp_path / "missing"): + assert get_validated_cache("/path", None, [ContentType.CODE]) is None + + index_path = tmp_path / "index" + index_path.mkdir() + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + assert get_validated_cache("/path", None, [ContentType.CODE]) is None + + +@pytest.mark.parametrize( + ("stored_model", "stored_content", "req_model", "req_content"), + [ + ("other/model", ["code"], "my/model", [ContentType.CODE]), # model mismatch + ("my/model", ["docs"], "my/model", [ContentType.CODE]), # content mismatch + ], +) +def 
test_get_validated_cache_metadata_mismatch( + stored_model: str, + stored_content: list[str], + req_model: str, + req_content: list[ContentType], + tmp_path: Path, +) -> None: + """Returns None when stored model or content type doesn't match the request.""" + index_path = tmp_path / "index" + _write_metadata(index_path, stored_model, stored_content, 0.0) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + assert get_validated_cache("/path", req_model, req_content) is None + + +def test_get_validated_cache_resolves_default_model(tmp_path: Path) -> None: + """When model_path is None, resolve_model_name() is used for comparison.""" + index_path = tmp_path / "index" + _write_metadata(index_path, "default/model", ["code"], 0.0) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + with patch("semble.cache.resolve_model_name", return_value="other/model"): + assert get_validated_cache("/path", None, [ContentType.CODE]) is None + + +def test_get_validated_cache_git_url_returns_immediately(tmp_path: Path) -> None: + """Git URL paths skip file-mtime checks and return the index path directly.""" + index_path = tmp_path / "index" + _write_metadata(index_path, "my/model", ["code"], 0.0) + url = "https://github.com/org/repo.git" + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + result = get_validated_cache(url, "my/model", [ContentType.CODE]) + assert result == index_path + + +@pytest.mark.parametrize( + ("write_time", "walk_result", "expected"), + [ + (0.0, "stale", None), # file newer than index → stale + (float("inf"), [], "index"), # no newer files → valid + ], +) +def test_get_validated_cache_mtime( + write_time: float, walk_result: str | list, expected: str | None, tmp_path: Path +) -> None: + """Returns None when a tracked file is newer than the index; the path otherwise.""" + index_path = tmp_path / "index" + _write_metadata(index_path, "my/model", ["code"], write_time) 
+ stale_file = tmp_path / "src.py" + stale_file.write_text("x = 1") + files = [stale_file] if walk_result == "stale" else walk_result + + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + with patch("semble.cache.get_extensions", return_value={".py"}): + with patch("semble.cache.walk_files", return_value=files): + result = get_validated_cache(str(tmp_path), "my/model", [ContentType.CODE]) + assert result == (index_path if expected == "index" else None) diff --git a/tests/test_cli.py b/tests/test_cli.py index 4e5285a..537f36e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -5,7 +5,7 @@ import pytest -from semble.cli import Agent, _agent_path, _cli_main, _run_init, main +from semble.cli import Agent, _agent_path, _cli_main, _maybe_save_index, _run_init, main from semble.types import ContentType, SearchResult from tests.conftest import make_chunk @@ -195,38 +195,35 @@ def test_mcp_main_exits_with_message_when_extras_missing( assert "pip install 'semble[mcp]'" in capsys.readouterr().err -def test_index_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """_cli_main index subcommand calls _run_index with the correct arguments.""" - out_dir = tmp_path / "index" - fake_index = MagicMock() - monkeypatch.setattr(sys, "argv", ["semble", "index", "/some/path"]) - with patch("semble.cache.get_validated_cache", return_value=tmp_path): - with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): - _cli_main() - assert out_dir.exists() - fake_index.save.assert_called_once_with(str(out_dir)) - - -def test_index_git_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """_cli_main index subcommand calls _run_index with the correct arguments.""" +@pytest.mark.parametrize( + ("path", "mock_target"), + [ + ("/some/path", "semble.cli.SembleIndex.from_path"), + ("git://xyz.git", "semble.cli.SembleIndex.from_git"), + ], +) +def test_index_via_cli(path: str, mock_target: str, tmp_path: Path, 
monkeypatch: pytest.MonkeyPatch) -> None: + """Index command builds an index and saves it to the cache folder (local and git).""" fake_index = MagicMock() - with patch("semble.cache.get_validated_cache", lambda: tmp_path): - monkeypatch.setattr(sys, "argv", ["semble", "search", "git://xyz.git"]) - with patch("semble.cli.SembleIndex.from_git", return_value=fake_index): - _cli_main() - fake_index.save.assert_called_once_with(str(tmp_path)) + fake_index.loaded_from_disk = False + expected_cache_path = tmp_path / "hash" / "index" + monkeypatch.setattr(sys, "argv", ["semble", "index", path]) + with patch(mock_target, return_value=fake_index): + with patch("semble.cli.find_index_from_cache_folder", return_value=expected_cache_path): + with patch("semble.cli.time.time", side_effect=[0.0, 2.0]): + _cli_main() + fake_index.save.assert_called_once_with(expected_cache_path) def test_cli_search_with_prebuilt_index(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: - """_cli_main search with --index loads the pre-built index from disk.""" + """Search returns results correctly when the index is loaded from cache.""" chunk = make_chunk("def foo(): pass", "src/foo.py") fake_index = MagicMock() + fake_index.loaded_from_disk = True fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.95)] - monkeypatch.setattr(sys, "argv", ["semble", "search", "query text"]) - with patch("semble.cache.get_validated_cache", return_value="/some/prebuilt"): - with patch("semble.cli.SembleIndex.load_from_disk", return_value=fake_index) as mock_load: - _cli_main() - mock_load.assert_called_once() + monkeypatch.setattr(sys, "argv", ["semble", "search", "query text", "."]) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + _cli_main() out = capsys.readouterr().out assert "query text" in out assert "0.95" in out @@ -277,6 +274,16 @@ def test_cli_content_argument( assert list(mock_from_path.call_args.kwargs["content"]) == expected +def 
test_maybe_save_index_logs_error_on_save_failure(capsys: pytest.CaptureFixture[str]) -> None: + """_maybe_save_index prints to stderr when index.save raises.""" + fake_index = MagicMock() + fake_index.loaded_from_disk = False + fake_index.save.side_effect = OSError("disk full") + with patch("semble.cli.find_index_from_cache_folder", return_value=Path("/cache")): + _maybe_save_index(fake_index, "/some/path", 2.0) + assert "Error saving index" in capsys.readouterr().err + + def test_agent_file_tools_are_bash_only() -> None: """The agent file must list only Bash and Read — no MCP tools that require schema loading.""" frontmatter = files("semble").joinpath("agents/claude.md").read_text(encoding="utf-8").split("---")[1] From 33c56aaabf8f81d1844f323c338702a59b206907 Mon Sep 17 00:00:00 2001 From: Pringled Date: Tue, 26 May 2026 17:47:05 +0200 Subject: [PATCH 04/17] Lots of small changes and bugfixes --- README.md | 65 +++++------------- benchmarks/baselines/ablations.py | 2 +- benchmarks/run_benchmark.py | 2 +- benchmarks/speed_benchmark.py | 2 +- benchmarks/token_efficiency.py | 2 +- src/semble/agents/claude.md | 27 ++------ src/semble/agents/copilot.md | 27 ++------ src/semble/agents/cursor.md | 27 ++------ src/semble/agents/gemini.md | 27 ++------ src/semble/agents/kiro.md | 27 ++------ src/semble/agents/opencode.md | 27 ++------ src/semble/cache.py | 44 ++++++++----- src/semble/cli.py | 105 ++++++++++++++++++++---------- src/semble/index/index.py | 43 ++++++++++-- src/semble/utils.py | 4 +- tests/index/test_index.py | 7 +- tests/test_cache.py | 94 ++++++++++++++++++++------ tests/test_cli.py | 33 +++++++--- uv.lock | 2 +- 19 files changed, 296 insertions(+), 271 deletions(-) diff --git a/README.md b/README.md index e6e1b5a..f90a4ad 100644 --- a/README.md +++ b/README.md @@ -63,19 +63,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ​``` -If you anticipate doing more than one search, use `semble index` to 
create an index. - -​```bash -semble index ./my-project -o my_index -​``` - -You can then reuse this index later on: - -​```bash -semble search "save_pretrained" --index my_index -​``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +Results are cached automatically on first run and invalidated when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -91,20 +79,17 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ​``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. 
Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. ``` @@ -335,19 +320,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ​``` -If you anticipate doing more than one search, use `semble index` to create an index. - -​```bash -semble index ./my-project -o my_index -​``` - -You can then reuse this index later on: - -​```bash -semble search "save_pretrained" --index my_index -​``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +Results are cached automatically on first run and invalidated when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -363,20 +336,17 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ​``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. +2. 
Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. ``` ### Sub-agent setup @@ -399,14 +369,11 @@ If semble is not on `$PATH`, prefix the command with `uvx --from "semble[mcp]"`. Semble also ships as a standalone CLI. This is useful in scripts or anywhere you want search results without an MCP session. ```bash -# Index a local repository -semble index ./my-project -o my-index +# Pre-build the cache for a local repository (optional — search builds it automatically) +semble index ./my-project -# Search a local repo +# Search a local repo (index is built and cached automatically) semble search "authentication flow" ./my-project -# Or with index (significantly faster) -# the index flag applies to all commands below. -semble search "authentication flow" --index my-index # Search for a symbol or identifier semble search "save_pretrained" ./my-project @@ -454,7 +421,7 @@ semble savings --verbose # also show breakdown by call type Savings are calculated as follows: for each call, semble records the total character count of the unique files containing returned chunks and the character count of the snippets returned. Estimated tokens saved is `(file chars − snippet chars) / 4` (4 chars per token). This is a conservative estimate: the baseline is reading matched files in full, which is how coding agents often explore unfamiliar code. -Stats are stored in `~/.semble/savings.jsonl`. +Stats are stored in the OS cache folder (`~/Library/Caches/semble/` on macOS, `~/.cache/semble/` on Linux, `%LOCALAPPDATA%\semble\Cache\` on Windows). 
diff --git a/benchmarks/baselines/ablations.py b/benchmarks/baselines/ablations.py index c948e07..63bd6ab 100644 --- a/benchmarks/baselines/ablations.py +++ b/benchmarks/baselines/ablations.py @@ -17,7 +17,7 @@ ) from benchmarks.run_benchmark import RepoResult, evaluate from semble import SembleIndex -from semble.index.dense import DEFAULT_MODEL_NAME +from semble.utils import DEFAULT_MODEL_NAME # alpha=None → raw mode, input depends on query # alpha=0.0 → hybrid pipeline, BM25-only input diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 75137af..003e25c 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -16,8 +16,8 @@ ) from benchmarks.metrics import ndcg_at_k, target_rank from semble import SembleIndex -from semble.index.dense import DEFAULT_MODEL_NAME from semble.types import SearchResult +from semble.utils import DEFAULT_MODEL_NAME _LATENCY_RUNS = 5 _DIRECT_TOP_K = 10 diff --git a/benchmarks/speed_benchmark.py b/benchmarks/speed_benchmark.py index aee3061..1eee147 100644 --- a/benchmarks/speed_benchmark.py +++ b/benchmarks/speed_benchmark.py @@ -11,8 +11,8 @@ from benchmarks.data import RepoSpec, Task, available_repo_specs, load_tasks, save_results from benchmarks.tools import run_colgrep_files, run_ripgrep_count from semble import SembleIndex -from semble.index.dense import DEFAULT_MODEL_NAME from semble.types import EmbeddingMatrix +from semble.utils import DEFAULT_MODEL_NAME # One representative repo per language (medium size, healthy NDCG on the main benchmark). 
_REPOS: list[str] = [ diff --git a/benchmarks/token_efficiency.py b/benchmarks/token_efficiency.py index cc62873..77f10ed 100644 --- a/benchmarks/token_efficiency.py +++ b/benchmarks/token_efficiency.py @@ -24,10 +24,10 @@ target_matches_location, ) from semble import SembleIndex -from semble.index.dense import DEFAULT_MODEL_NAME from semble.index.file_walker import DEFAULT_IGNORED_DIRS, FILE_TYPES, FileCategory from semble.ranking.boosting import _STOPWORDS as _SEMBLE_STOPWORDS from semble.types import Chunk +from semble.utils import DEFAULT_MODEL_NAME _RG_INCLUDE_GLOBS: tuple[str, ...] = tuple( f"*{ext}" for ext, spec in FILE_TYPES.items() if spec.category == FileCategory.CODE diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index 895e282..2cdc0f5 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -12,19 +12,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +Results are cached automatically on first run and invalidated when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -40,17 +28,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. 
If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md index 895e282..2cdc0f5 100644 --- a/src/semble/agents/copilot.md +++ b/src/semble/agents/copilot.md @@ -12,19 +12,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +Results are cached automatically on first run and invalidated when files change. 
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -40,17 +28,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. 
diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md index baf455c..2071c27 100644 --- a/src/semble/agents/cursor.md +++ b/src/semble/agents/cursor.md @@ -11,19 +11,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +Results are cached automatically on first run and invalidated when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -39,17 +27,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. 
The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md index e4e9b6a..a20fcd9 100644 --- a/src/semble/agents/gemini.md +++ b/src/semble/agents/gemini.md @@ -14,19 +14,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +Results are cached automatically on first run and invalidated when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -42,17 +30,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. 
Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md index d556c13..bf5d5fc 100644 --- a/src/semble/agents/kiro.md +++ b/src/semble/agents/kiro.md @@ -14,19 +14,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +Results are cached automatically on first run and invalidated when files change. 
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -42,17 +30,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. 
diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md index 2ec43c8..fbfcede 100644 --- a/src/semble/agents/opencode.md +++ b/src/semble/agents/opencode.md @@ -15,19 +15,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` -If you anticipate doing more than one search, use `semble index` to create an index. - -```bash -semble index ./my-project -o my_index -``` - -You can then reuse this index later on: - -```bash -semble search "save_pretrained" --index my_index -``` - -An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. +Results are cached automatically on first run and invalidated when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -43,17 +31,14 @@ Use `semble find-related` to discover code similar to a known location (pass `fi semble find-related src/auth.py 42 ./my-project ``` -Like search, `find-related` also accepts an `--index` argument. - `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ### Workflow -1. Index the repo using `semble index -o cached_index`. -2. Start with `semble search` to find relevant chunks. Pass the index to achieve results faster. -3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -4. Inspect full files only when the returned chunk does not give enough context. -5. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Start with `semble search` to find relevant chunks. 
The index is built and cached automatically. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Inspect full files only when the returned chunk does not give enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/cache.py b/src/semble/cache.py index ac5091e..eb7070c 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -49,7 +49,7 @@ def resolve_cache_folder() -> Path: name = "semble" if sys.platform == "win32": cache_dir = _windows_cache_dir(name) - if sys.platform == "darwin": + elif sys.platform == "darwin": cache_dir = _macos_cache_dir(name) else: cache_dir = _linux_cache_dir(name) @@ -61,11 +61,25 @@ def resolve_cache_folder() -> Path: def clear_cache(path: str) -> None: """Clears the cache for the given path.""" index_path = find_index_from_cache_folder(path) - if index_path and index_path.exists(): + if index_path.exists(): shutil.rmtree(index_path) -def get_validated_cache(path: str, model_path: str | None, content: Sequence[ContentType]) -> Path | None: +def _metadata_matches( + metadata: dict, model_path: str, content: Sequence[ContentType], extensions: list[str] | None +) -> bool: + """Return True if the stored metadata is compatible with the requested parameters.""" + content_type = tuple(ContentType(s) for s in metadata["content_type"]) + return ( + metadata["model_path"] == model_path + and set(content_type) == set(content) + and metadata.get("extensions") == extensions + ) + + +def get_validated_cache( + path: str, model_path: str | None, content: Sequence[ContentType], extensions: list[str] | None = None +) -> Path | None: """Validates the cache folder and returns the index path.""" index_path = find_index_from_cache_folder(path) if not index_path.exists(): @@ 
-75,28 +89,28 @@ def get_validated_cache(path: str, model_path: str | None, content: Sequence[Con if persistence_path.non_existing(): return None - with open(persistence_path.metadata) as f: - metadata = json.load(f) - model_path_from_index = metadata["model_path"] if model_path is None: model_path = resolve_model_name() - if model_path_from_index != model_path: - return None - - content_type_strings: list[str] = metadata["content_type"] - - content_type = tuple(ContentType(string) for string in content_type_strings) - if set(content_type) != set(content): + with open(persistence_path.metadata) as f: + metadata = json.load(f) + if not _metadata_matches(metadata, model_path, content, extensions): return None if is_git_url(str(path)): return index_path write_time = metadata["time"] - extensions = get_extensions(content_type, None) + resolved_extensions = extensions if extensions is not None else get_extensions(list(content), None) path_as_path = Path(path) - for file_path in walk_files(path_as_path, extensions=extensions): + current_files = sorted( + str(f.relative_to(path_as_path)) for f in walk_files(path_as_path, extensions=resolved_extensions) + ) + stored_files: list[str] = metadata.get("file_paths", []) + if current_files != stored_files: + return None + + for file_path in walk_files(path_as_path, extensions=resolved_extensions): st = file_path.stat() if st.st_mtime > write_time: return None diff --git a/src/semble/cli.py b/src/semble/cli.py index b75f9c7..b1cb268 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -31,6 +31,15 @@ class Agent(str, Enum): _CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "savings", "-h", "--help", "index"}) +def _build_index(path: str, content: list[ContentType]) -> SembleIndex: + """Build an index from a local path or git URL.""" + return ( + SembleIndex.from_git(path, content=content) + if is_git_url(path) + else SembleIndex.from_path(path, content=content) + ) + + def _maybe_save_index(index: 
SembleIndex, path: str, creation_time: float) -> None: """Maybe save an index. Based on the index itself and the creation time.""" # If the index was not loaded from disk, @@ -123,6 +132,58 @@ def _resolve_content(content: list[str], include_text_files: bool) -> list[Conte return [ContentType(c) for c in content] +def _run_index(path: str, content: list[ContentType]) -> None: + """Handle the `index` subcommand.""" + try: + index = _build_index(path, content) + except FileNotFoundError as e: + print(str(e), file=sys.stderr) + sys.exit(1) + if not index.loaded_from_disk: + cache_folder = find_index_from_cache_folder(path) + index.save(cache_folder) + print(f"Wrote index to `{cache_folder}`.") + else: + print("Index is already up to date.") + + +def _load_index_timed(path: str, content: list[ContentType]) -> tuple[SembleIndex, float]: + """Build an index and return it with the elapsed build time in seconds.""" + start = time.time() + try: + index = _build_index(path, content) + except FileNotFoundError as e: + print(str(e), file=sys.stderr) + sys.exit(1) + return index, time.time() - start + + +def _run_search(path: str, query: str, top_k: int, content: list[ContentType]) -> None: + """Handle the `search` subcommand.""" + index, creation_time = _load_index_timed(path, content) + results = index.search(query, top_k=top_k) + out = format_results(query, results) if results else {"error": "No results found."} + print(json.dumps(out)) + _maybe_save_index(index, path, creation_time) + + +def _run_find_related(path: str, file_path: str, line: int, top_k: int, content: list[ContentType]) -> None: + """Handle the `find-related` subcommand.""" + index, creation_time = _load_index_timed(path, content) + chunk = resolve_chunk(index.chunks, file_path, line) + if chunk is None: + print(f"No chunk found at {file_path}:{line}.", file=sys.stderr) + sys.exit(1) + results = index.find_related(chunk, top_k=top_k) + out = ( + format_results(f"Chunks related to {file_path}:{line}", 
results) + if results + else {"error": f"No related chunks found for {file_path}:{line}."} + ) + print(json.dumps(out)) + _maybe_save_index(index, path, creation_time) + + def _cli_main() -> None: parser = argparse.ArgumentParser(prog="semble") sub = parser.add_subparsers(dest="command") @@ -135,7 +196,6 @@ def _cli_main() -> None: search_p.add_argument("query", help="Natural language or code query.") search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") - search_p.add_argument("--index", action="store_true", help="Use an index at the default path.") _add_content_args(search_p) related_p = sub.add_parser("find-related", help="Find code similar to a specific location.") @@ -143,7 +203,6 @@ def _cli_main() -> None: related_p.add_argument("line", type=int, help="Line number (1-indexed).") related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") - related_p.add_argument("--index", type=str, default=None, help="A path pointing to a pre-built index.") _add_content_args(related_p) init_p = sub.add_parser("init", help="Write a semble sub-agent file for your coding agent.") @@ -163,39 +222,13 @@ def _cli_main() -> None: if args.command == "init": _run_init(agent=Agent(args.agent), force=args.force) - return - - if args.command == "savings": + elif args.command == "savings": print(format_savings_report(verbose=args.verbose)) - return - - content = _resolve_content(args.content, args.include_text_files) - start = time.time() - index = ( - SembleIndex.from_git(args.path, content=content) - if is_git_url(args.path) - else SembleIndex.from_path(args.path, content=content) - ) - creation_time = time.time() - start - - if args.command == "search": - results = 
index.search(args.query, top_k=args.top_k) - if not results: - out = {"error": "No results found."} - else: - out = format_results(args.query, results) - print(json.dumps(out)) - + elif args.command == "index": + _run_index(args.path, _resolve_content(args.content, args.include_text_files)) + elif args.command == "search": + _run_search(args.path, args.query, args.top_k, _resolve_content(args.content, args.include_text_files)) elif args.command == "find-related": - chunk = resolve_chunk(index.chunks, args.file_path, args.line) - if chunk is None: - print(f"No chunk found at {args.file_path}:{args.line}.", file=sys.stderr) - sys.exit(1) - results = index.find_related(chunk, top_k=args.top_k) - if not results: - out = {"error": f"No related chunks found for {args.file_path}:{args.line}."} - else: - out = format_results(f"Chunks related to {args.file_path}:{args.line}", results) - print(json.dumps(out)) - - _maybe_save_index(index, args.path, creation_time) + _run_find_related( + args.path, args.file_path, args.line, args.top_k, _resolve_content(args.content, args.include_text_files) + ) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index b4437a4..13a8b68 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -18,6 +18,7 @@ from semble.cache import get_validated_cache from semble.index.create import create_index_from_path from semble.index.dense import SelectableBasicBackend, load_model +from semble.index.files import get_extensions from semble.index.types import PersistencePath from semble.search import _search_semantic, search from semble.stats import save_search_stats @@ -59,6 +60,7 @@ def __init__( root: Path | None = None, content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, loaded_from_disk: bool = False, + extensions: list[str] | None = None, ) -> None: """Initialize a SembleIndex. Should be created with from_path or from_git. @@ -69,6 +71,8 @@ def __init__( :param model_path: Path to the model file. 
:param root: Root directory used to read file sizes for token-savings stats. :param content: Content type used when indexing; controls the search pipeline. + :param loaded_from_disk: Whether the index was loaded from disk (cache hit); controls CLI messaging. + :param extensions: File extensions included in the index, or None if not applicable. """ self.model = model self.chunks: list[Chunk] = chunks @@ -77,6 +81,7 @@ def __init__( self._model_path: str = model_path self._root: Path | None = root self._content: tuple[ContentType, ...] = (content,) if isinstance(content, ContentType) else tuple(content) + self._extensions: list[str] | None = extensions self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {} self._file_mapping, self._language_mapping = self._populate_mapping() self.loaded_from_disk: bool = loaded_from_disk @@ -146,7 +151,8 @@ def from_path( raise NotADirectoryError(f"Path is not a directory: {path}") normalized = _apply_include_text_files(content, include_text_files) - cache_path = get_validated_cache(str(path), model_path, normalized) + resolved_extensions = get_extensions(normalized, extensions) + cache_path = get_validated_cache(str(path), model_path, normalized, resolved_extensions) if cache_path: return cls.load_from_disk(cache_path) model, model_path = load_model(model_path) @@ -160,7 +166,9 @@ def from_path( display_root=path, ) - return SembleIndex(model, bm25, vicinity, chunks, model_path, root=path, content=normalized) + return SembleIndex( + model, bm25, vicinity, chunks, model_path, root=path, content=normalized, extensions=resolved_extensions + ) @classmethod def from_git( @@ -188,9 +196,10 @@ def from_git( :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``). :raises RuntimeError: If git is not on PATH, the clone fails, or times out. 
""" - path = f"{url}@{ref}" if ref else url normalized = _apply_include_text_files(content, include_text_files) - cache_path = get_validated_cache(path, model_path, normalized) + resolved_extensions = get_extensions(normalized, extensions) + cache_key = f"{url}@{ref}" if ref else url + cache_path = get_validated_cache(cache_key, model_path, normalized, resolved_extensions) if cache_path: return cls.load_from_disk(cache_path) @@ -218,7 +227,16 @@ def from_git( display_root=resolved_path, ) - return SembleIndex(model, bm25, vicinity, chunks, model_path, root=resolved_path, content=normalized) + return SembleIndex( + model, + bm25, + vicinity, + chunks, + model_path, + root=resolved_path, + content=normalized, + extensions=resolved_extensions, + ) def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]: """Return chunks semantically similar to the given chunk or search result. @@ -313,12 +331,22 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: chunks.append(Chunk.from_dict(chunk_item)) root_path = metadata["root_path"] model_path = metadata["model_path"] + extensions = metadata.get("extensions") if root_path: root_path = Path(root_path) model, model_path = load_model(model_path) - return cls(model, bm_25_index, semantic_index, chunks, model_path, root=root_path, loaded_from_disk=True) + return cls( + model, + bm_25_index, + semantic_index, + chunks, + model_path, + root=root_path, + loaded_from_disk=True, + extensions=extensions, + ) def save(self, path: Path | str) -> None: """Save the index to disk.""" @@ -334,11 +362,14 @@ def save(self, path: Path | str) -> None: data = orjson.dumps(chunks_as_dict) f.write(data) root_str = None if self._root is None else str(self._root) + file_paths = sorted({chunk.file_path for chunk in self.chunks}) metadata = { "root_path": root_str, "time": datetime.now().timestamp(), "model_path": self._model_path, "content_type": list(x.value for x in self._content), + 
"extensions": self._extensions, + "file_paths": file_paths, } with open(persistence_paths.metadata, "wb") as f: data = orjson.dumps(metadata) diff --git a/src/semble/utils.py b/src/semble/utils.py index 0015c77..b11ee29 100644 --- a/src/semble/utils.py +++ b/src/semble/utils.py @@ -8,7 +8,7 @@ _GIT_URL_SCHEMES = ("https://", "http://", "ssh://", "git://", "git+ssh://", "file://") _SCP_GIT_URL_RE = re.compile(r"^[\w.-]+@[\w.-]+:(?!/)") -_DEFAULT_MODEL_NAME = "minishlab/potion-code-16M" +DEFAULT_MODEL_NAME = "minishlab/potion-code-16M" def is_git_url(path: str) -> bool: @@ -39,4 +39,4 @@ def format_results(query: str, results: list[SearchResult]) -> dict[str, Any]: def resolve_model_name() -> str: """Resolve a model name to a configurable.""" - return os.environ.get("SEMBLE_MODEL_NAME", _DEFAULT_MODEL_NAME) + return os.environ.get("SEMBLE_MODEL_NAME", DEFAULT_MODEL_NAME) diff --git a/tests/index/test_index.py b/tests/index/test_index.py index 9b8be68..98715fc 100644 --- a/tests/index/test_index.py +++ b/tests/index/test_index.py @@ -219,10 +219,11 @@ def test_from_path_uses_cache_when_valid(tmp_project: Path) -> None: assert result is fake_cached -def test_from_git_uses_cache_when_valid() -> None: - """from_git returns the cached index directly when get_validated_cache hits.""" +@pytest.mark.parametrize("ref", [None, "v1.0"]) +def test_from_git_uses_cache_when_valid(ref: str | None) -> None: + """from_git uses the cache for both URL-only and URL@ref cache keys.""" fake_cached = MagicMock(spec=SembleIndex) with patch("semble.index.index.get_validated_cache", return_value=Path("/cache")): with patch.object(SembleIndex, "load_from_disk", return_value=fake_cached): - result = SembleIndex.from_git("https://github.com/org/repo.git") + result = SembleIndex.from_git("https://github.com/org/repo.git", ref=ref) assert result is fake_cached diff --git a/tests/test_cache.py b/tests/test_cache.py index 21986d1..cec5ce0 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ 
-67,26 +67,21 @@ def test_cache_dir_no_env(fn: object, expected_rel: Path) -> None: assert fn("semble") == home / expected_rel # type: ignore[operator] -def test_resolve_cache_folder_windows() -> None: - """resolve_cache_folder calls _windows_cache_dir on win32.""" - with patch.object(sys, "platform", "win32"): - with patch("semble.cache._windows_cache_dir", return_value=Path("/win")) as mock_win: - with patch("semble.cache._linux_cache_dir", return_value=Path("/linux")) as mock_linux: - with patch("pathlib.Path.mkdir"): - result = resolve_cache_folder() - mock_win.assert_called_once_with("semble") - mock_linux.assert_called_once_with("semble") - assert result == Path("/linux") - - -def test_resolve_cache_folder_linux() -> None: - """resolve_cache_folder calls _linux_cache_dir on non-darwin platforms.""" - with patch.object(sys, "platform", "linux"): - with patch("semble.cache._linux_cache_dir", return_value=Path("/linux")) as mock_linux: +@pytest.mark.parametrize( + ("platform", "mock_target", "expected"), + [ + ("win32", "semble.cache._windows_cache_dir", Path("/win")), + ("linux", "semble.cache._linux_cache_dir", Path("/linux")), + ], +) +def test_resolve_cache_folder(platform: str, mock_target: str, expected: Path) -> None: + """resolve_cache_folder calls the correct platform helper.""" + with patch.object(sys, "platform", platform): + with patch(mock_target, return_value=expected) as mock_fn: with patch("pathlib.Path.mkdir"): result = resolve_cache_folder() - mock_linux.assert_called_once_with("semble") - assert result == Path("/linux") + mock_fn.assert_called_once_with("semble") + assert result == expected def test_clear_cache(tmp_path: Path) -> None: @@ -100,13 +95,28 @@ def test_clear_cache(tmp_path: Path) -> None: assert not index_path.exists() -def _write_metadata(path: Path, model_path: str, content_type: list[str], write_time: float) -> None: +def _write_metadata( + path: Path, + model_path: str, + content_type: list[str], + write_time: float, + 
file_paths: list[str] | None = None, + extensions: list[str] | None = None, +) -> None: path.mkdir(parents=True, exist_ok=True) (path / "chunks.json").write_text("[]") (path / "bm25_index").write_text("") (path / "semantic_index").write_text("") (path / "metadata.json").write_text( - json.dumps({"model_path": model_path, "content_type": content_type, "time": write_time}) + json.dumps( + { + "model_path": model_path, + "content_type": content_type, + "time": write_time, + "file_paths": file_paths if file_paths is not None else [], + "extensions": extensions, + } + ) ) @@ -173,13 +183,55 @@ def test_get_validated_cache_mtime( ) -> None: """Returns None when a tracked file is newer than the index; the path otherwise.""" index_path = tmp_path / "index" - _write_metadata(index_path, "my/model", ["code"], write_time) stale_file = tmp_path / "src.py" stale_file.write_text("x = 1") files = [stale_file] if walk_result == "stale" else walk_result + # Include the file in stored manifest so manifest check passes and mtime check fires. 
+ stored_files = ["src.py"] if walk_result == "stale" else [] + _write_metadata(index_path, "my/model", ["code"], write_time, file_paths=stored_files) with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): with patch("semble.cache.get_extensions", return_value={".py"}): with patch("semble.cache.walk_files", return_value=files): result = get_validated_cache(str(tmp_path), "my/model", [ContentType.CODE]) assert result == (index_path if expected == "index" else None) + + +@pytest.mark.parametrize( + ("stored_extensions", "req_extensions"), + [ + (None, [".py"]), # stored without custom ext, requested with → mismatch + ([".py"], None), # stored with custom ext, requested without → mismatch + ([".py"], [".ts"]), # different custom extensions → mismatch + ], +) +def test_get_validated_cache_extensions_mismatch( + stored_extensions: list[str] | None, + req_extensions: list[str] | None, + tmp_path: Path, +) -> None: + """Returns None when stored extensions don't match the requested extensions.""" + index_path = tmp_path / "index" + _write_metadata(index_path, "my/model", ["code"], float("inf"), extensions=stored_extensions) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + assert get_validated_cache("/path", "my/model", [ContentType.CODE], req_extensions) is None + + +@pytest.mark.parametrize( + ("stored_files", "current_files"), + [ + (["deleted.py"], []), # file deleted since indexing + ([], ["new.py"]), # new file added since indexing + ], +) +def test_get_validated_cache_manifest_mismatch( + stored_files: list[str], current_files: list[str], tmp_path: Path +) -> None: + """Returns None when the current file set differs from the stored manifest.""" + index_path = tmp_path / "index" + walk_return = [tmp_path / f for f in current_files] + _write_metadata(index_path, "my/model", ["code"], float("inf"), file_paths=stored_files) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + 
with patch("semble.cache.walk_files", return_value=walk_return): + result = get_validated_cache(str(tmp_path), "my/model", [ContentType.CODE]) + assert result is None diff --git a/tests/test_cli.py b/tests/test_cli.py index 537f36e..4b5972d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -215,18 +215,35 @@ def test_index_via_cli(path: str, mock_target: str, tmp_path: Path, monkeypatch: fake_index.save.assert_called_once_with(expected_cache_path) -def test_cli_search_with_prebuilt_index(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: - """Search returns results correctly when the index is loaded from cache.""" - chunk = make_chunk("def foo(): pass", "src/foo.py") +def test_index_already_up_to_date(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: + """Index command prints 'up to date' when the index was loaded from disk (cache hit).""" fake_index = MagicMock() fake_index.loaded_from_disk = True - fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.95)] - monkeypatch.setattr(sys, "argv", ["semble", "search", "query text", "."]) + monkeypatch.setattr(sys, "argv", ["semble", "index", "/some/path"]) with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): _cli_main() - out = capsys.readouterr().out - assert "query text" in out - assert "0.95" in out + assert "up to date" in capsys.readouterr().out + fake_index.save.assert_not_called() + + +@pytest.mark.parametrize( + ("command", "argv"), + [ + ("index", ["semble", "index", "/no/such/path"]), + ("search", ["semble", "search", "query", "/no/such/path"]), + ("find-related", ["semble", "find-related", "src/foo.py", "1", "/no/such/path"]), + ], +) +def test_cli_path_not_found( + command: str, argv: list[str], monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + """index, search, and find-related exit 1 with a friendly message when the path does not exist.""" + monkeypatch.setattr(sys, "argv", argv) 
+ with patch("semble.cli._build_index", side_effect=FileNotFoundError("Path does not exist: /no/such/path")): + with pytest.raises(SystemExit) as exc_info: + _cli_main() + assert exc_info.value.code == 1 + assert "Path does not exist" in capsys.readouterr().err def test_include_text_files_cli_deprecated( diff --git a/uv.lock b/uv.lock index 04d014b..95e35ed 100644 --- a/uv.lock +++ b/uv.lock @@ -10,7 +10,7 @@ resolution-markers = [ [options] exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. -exclude-newer-span = "P3D" +exclude-newer-span = "P1W" [[package]] name = "annotated-doc" From 198854f51ece271cb04a888eb0b01865a2530bcf Mon Sep 17 00:00:00 2001 From: Pringled Date: Tue, 26 May 2026 17:50:09 +0200 Subject: [PATCH 05/17] Few more small changes --- README.md | 4 ++-- src/semble/index/index.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f90a4ad..e6a9b10 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ​``` -Results are cached automatically on first run and invalidated when files change. +The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -320,7 +320,7 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ​``` -Results are cached automatically on first run and invalidated when files change. +The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. 
Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 13a8b68..d47edba 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -332,6 +332,7 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: root_path = metadata["root_path"] model_path = metadata["model_path"] extensions = metadata.get("extensions") + content = tuple(ContentType(s) for s in metadata.get("content_type", ["code"])) if root_path: root_path = Path(root_path) @@ -344,6 +345,7 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: chunks, model_path, root=root_path, + content=content, loaded_from_disk=True, extensions=extensions, ) From 6b6a75e92e00625e47b7bfd75dd087f483c77858 Mon Sep 17 00:00:00 2001 From: Pringled Date: Tue, 26 May 2026 18:26:42 +0200 Subject: [PATCH 06/17] Derive cache manifest from walked files and validate in a single pass --- src/semble/cache.py | 13 +++++-------- src/semble/index/index.py | 24 +++++++++++++++++++++--- tests/test_cache.py | 6 +++++- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/semble/cache.py b/src/semble/cache.py index eb7070c..39bf666 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -103,16 +103,13 @@ def get_validated_cache( resolved_extensions = extensions if extensions is not None else get_extensions(list(content), None) path_as_path = Path(path) - current_files = sorted( - str(f.relative_to(path_as_path)) for f in walk_files(path_as_path, extensions=resolved_extensions) - ) stored_files: list[str] = metadata.get("file_paths", []) - if current_files != stored_files: - return None - + current_files = [] for file_path in walk_files(path_as_path, extensions=resolved_extensions): - st = file_path.stat() - if st.st_mtime > write_time: + 
current_files.append(str(file_path.relative_to(path_as_path))) + if file_path.stat().st_mtime > write_time: return None + if sorted(current_files) != stored_files: + return None return index_path diff --git a/src/semble/index/index.py b/src/semble/index/index.py index d47edba..3822bcb 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -18,6 +18,7 @@ from semble.cache import get_validated_cache from semble.index.create import create_index_from_path from semble.index.dense import SelectableBasicBackend, load_model +from semble.index.file_walker import walk_files from semble.index.files import get_extensions from semble.index.types import PersistencePath from semble.search import _search_semantic, search @@ -61,6 +62,7 @@ def __init__( content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, loaded_from_disk: bool = False, extensions: list[str] | None = None, + file_manifest: list[str] | None = None, ) -> None: """Initialize a SembleIndex. Should be created with from_path or from_git. @@ -73,6 +75,7 @@ def __init__( :param content: Content type used when indexing; controls the search pipeline. :param loaded_from_disk: Whether the index was loaded from disk (cache hit); controls CLI messaging. :param extensions: File extensions included in the index, or None if not applicable. + :param file_manifest: Sorted repo-relative paths of all walked files at index time, used for cache invalidation. """ self.model = model self.chunks: list[Chunk] = chunks @@ -82,6 +85,7 @@ def __init__( self._root: Path | None = root self._content: tuple[ContentType, ...] 
= (content,) if isinstance(content, ContentType) else tuple(content) self._extensions: list[str] | None = extensions + self._file_manifest: list[str] | None = file_manifest self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {} self._file_mapping, self._language_mapping = self._populate_mapping() self.loaded_from_disk: bool = loaded_from_disk @@ -158,6 +162,7 @@ def from_path( model, model_path = load_model(model_path) path = path.resolve() + file_manifest = sorted(str(f.relative_to(path)) for f in walk_files(path, extensions=resolved_extensions)) bm25, vicinity, chunks = create_index_from_path( path, model=model, @@ -167,7 +172,15 @@ def from_path( ) return SembleIndex( - model, bm25, vicinity, chunks, model_path, root=path, content=normalized, extensions=resolved_extensions + model, + bm25, + vicinity, + chunks, + model_path, + root=path, + content=normalized, + extensions=resolved_extensions, + file_manifest=file_manifest, ) @classmethod @@ -219,6 +232,9 @@ def from_git( model, model_path = load_model(model_path) resolved_path = Path(tmp_dir).resolve() + file_manifest = sorted( + str(f.relative_to(resolved_path)) for f in walk_files(resolved_path, extensions=resolved_extensions) + ) bm25, vicinity, chunks = create_index_from_path( resolved_path, model=model, @@ -236,6 +252,7 @@ def from_git( root=resolved_path, content=normalized, extensions=resolved_extensions, + file_manifest=file_manifest, ) def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]: @@ -333,6 +350,7 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: model_path = metadata["model_path"] extensions = metadata.get("extensions") content = tuple(ContentType(s) for s in metadata.get("content_type", ["code"])) + file_manifest = metadata.get("file_paths") if root_path: root_path = Path(root_path) @@ -348,6 +366,7 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: content=content, 
loaded_from_disk=True, extensions=extensions, + file_manifest=file_manifest, ) def save(self, path: Path | str) -> None: @@ -364,14 +383,13 @@ def save(self, path: Path | str) -> None: data = orjson.dumps(chunks_as_dict) f.write(data) root_str = None if self._root is None else str(self._root) - file_paths = sorted({chunk.file_path for chunk in self.chunks}) metadata = { "root_path": root_str, "time": datetime.now().timestamp(), "model_path": self._model_path, "content_type": list(x.value for x in self._content), "extensions": self._extensions, - "file_paths": file_paths, + "file_paths": self._file_manifest if self._file_manifest is not None else [], } with open(persistence_paths.metadata, "wb") as f: data = orjson.dumps(metadata) diff --git a/tests/test_cache.py b/tests/test_cache.py index cec5ce0..9c0868a 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -229,7 +229,11 @@ def test_get_validated_cache_manifest_mismatch( ) -> None: """Returns None when the current file set differs from the stored manifest.""" index_path = tmp_path / "index" - walk_return = [tmp_path / f for f in current_files] + walk_return = [] + for f in current_files: + p = tmp_path / f + p.write_text("") + walk_return.append(p) _write_metadata(index_path, "my/model", ["code"], float("inf"), file_paths=stored_files) with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): with patch("semble.cache.walk_files", return_value=walk_return): From 776ff871477bec51197bef1624b5eba40999543e Mon Sep 17 00:00:00 2001 From: Pringled Date: Wed, 27 May 2026 06:36:05 +0200 Subject: [PATCH 07/17] Deprecate extensions, remove 1 second threshold, skip restoring file_manifest on load --- src/semble/cache.py | 20 ++++++-------------- src/semble/cli.py | 26 ++++++++++---------------- src/semble/index/index.py | 35 +++++++---------------------------- tests/index/test_index.py | 16 ++++++++++++++++ tests/test_cache.py | 22 ---------------------- tests/test_cli.py | 5 ++--- 6 files
changed, 41 insertions(+), 83 deletions(-) diff --git a/src/semble/cache.py b/src/semble/cache.py index 39bf666..e73d733 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -65,21 +65,13 @@ def clear_cache(path: str) -> None: shutil.rmtree(index_path) -def _metadata_matches( - metadata: dict, model_path: str, content: Sequence[ContentType], extensions: list[str] | None -) -> bool: +def _metadata_matches(metadata: dict, model_path: str, content: Sequence[ContentType]) -> bool: """Return True if the stored metadata is compatible with the requested parameters.""" content_type = tuple(ContentType(s) for s in metadata["content_type"]) - return ( - metadata["model_path"] == model_path - and set(content_type) == set(content) - and metadata.get("extensions") == extensions - ) + return metadata["model_path"] == model_path and set(content_type) == set(content) -def get_validated_cache( - path: str, model_path: str | None, content: Sequence[ContentType], extensions: list[str] | None = None -) -> Path | None: +def get_validated_cache(path: str, model_path: str | None, content: Sequence[ContentType]) -> Path | None: """Validates the cache folder and returns the index path.""" index_path = find_index_from_cache_folder(path) if not index_path.exists(): @@ -93,19 +85,19 @@ def get_validated_cache( model_path = resolve_model_name() with open(persistence_path.metadata) as f: metadata = json.load(f) - if not _metadata_matches(metadata, model_path, content, extensions): + if not _metadata_matches(metadata, model_path, content): return None if is_git_url(str(path)): return index_path write_time = metadata["time"] - resolved_extensions = extensions if extensions is not None else get_extensions(list(content), None) + extensions = get_extensions(list(content), None) path_as_path = Path(path) stored_files: list[str] = metadata.get("file_paths", []) current_files = [] - for file_path in walk_files(path_as_path, extensions=resolved_extensions): + for file_path in 
walk_files(path_as_path, extensions=extensions): current_files.append(str(file_path.relative_to(path_as_path))) if file_path.stat().st_mtime > write_time: return None diff --git a/src/semble/cli.py b/src/semble/cli.py index b1cb268..d47e00b 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -2,7 +2,6 @@ import asyncio import json import sys -import time import warnings from enum import Enum from importlib.resources import files @@ -40,12 +39,9 @@ def _build_index(path: str, content: list[ContentType]) -> SembleIndex: ) -def _maybe_save_index(index: SembleIndex, path: str, creation_time: float) -> None: - """Maybe save an index. Based on the index itself and the creation time.""" - # If the index was not loaded from disk, - # the index was invalidated or it didn't exist. - # If the creation time of the index was < 1 second, don't save - if creation_time > 1.0 and not index.loaded_from_disk: +def _maybe_save_index(index: SembleIndex, path: str) -> None: + """Save the index to the cache folder if it was not loaded from disk.""" + if not index.loaded_from_disk: try: cache_folder = find_index_from_cache_folder(path) index.save(cache_folder) @@ -147,29 +143,27 @@ def _run_index(path: str, content: list[ContentType]) -> None: print("Index is already up to date.") -def _load_index_timed(path: str, content: list[ContentType]) -> tuple[SembleIndex, float]: - """Build an index and return it with the elapsed build time in seconds.""" - start = time.time() +def _load_index(path: str, content: list[ContentType]) -> SembleIndex: + """Build an index from a local path or git URL, exiting on FileNotFoundError.""" try: - index = _build_index(path, content) + return _build_index(path, content) except FileNotFoundError as e: print(str(e), file=sys.stderr) sys.exit(1) - return index, time.time() - start def _run_search(path: str, query: str, top_k: int, content: list[ContentType]) -> None: """Handle the `search` subcommand.""" - index, creation_time = _load_index_timed(path, 
content) + index = _load_index(path, content) results = index.search(query, top_k=top_k) out = format_results(query, results) if results else {"error": "No results found."} print(json.dumps(out)) - _maybe_save_index(index, path, creation_time) + _maybe_save_index(index, path) def _run_find_related(path: str, file_path: str, line: int, top_k: int, content: list[ContentType]) -> None: """Handle the `find-related` subcommand.""" - index, creation_time = _load_index_timed(path, content) + index = _load_index(path, content) chunk = resolve_chunk(index.chunks, file_path, line) if chunk is None: print(f"No chunk found at {file_path}:{line}.", file=sys.stderr) @@ -181,7 +175,7 @@ def _run_find_related(path: str, file_path: str, line: int, top_k: int, content: else {"error": f"No related chunks found for {file_path}:{line}."} ) print(json.dumps(out)) - _maybe_save_index(index, path, creation_time) + _maybe_save_index(index, path) def _cli_main() -> None: diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 3822bcb..bb725d1 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -61,7 +61,6 @@ def __init__( root: Path | None = None, content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, loaded_from_disk: bool = False, - extensions: list[str] | None = None, file_manifest: list[str] | None = None, ) -> None: """Initialize a SembleIndex. Should be created with from_path or from_git. @@ -74,7 +73,6 @@ def __init__( :param root: Root directory used to read file sizes for token-savings stats. :param content: Content type used when indexing; controls the search pipeline. :param loaded_from_disk: Whether the index was loaded from disk (cache hit); controls CLI messaging. - :param extensions: File extensions included in the index, or None if not applicable. :param file_manifest: Sorted repo-relative paths of all walked files at index time, used for cache invalidation. 
""" self.model = model @@ -84,7 +82,6 @@ def __init__( self._model_path: str = model_path self._root: Path | None = root self._content: tuple[ContentType, ...] = (content,) if isinstance(content, ContentType) else tuple(content) - self._extensions: list[str] | None = extensions self._file_manifest: list[str] | None = file_manifest self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {} self._file_mapping, self._language_mapping = self._populate_mapping() @@ -132,7 +129,6 @@ def stats(self) -> IndexStats: def from_path( cls, path: str | Path, - extensions: Sequence[str] | None = None, content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, include_text_files: bool | None = None, model_path: str | None = None, @@ -140,7 +136,6 @@ def from_path( """Create and index a SembleIndex from a directory. :param path: Root directory to index. - :param extensions: File extensions to include. Defaults to a standard set of code extensions. :param content: Content types to index, e.g. ContentType.CODE or [ContentType.CODE, ContentType.DOCS]. :param include_text_files: Deprecated. Pass a content sequence directly instead. :param model_path: Path to the model to use. If None, the default model will be used. 
@@ -155,32 +150,23 @@ def from_path( raise NotADirectoryError(f"Path is not a directory: {path}") normalized = _apply_include_text_files(content, include_text_files) - resolved_extensions = get_extensions(normalized, extensions) - cache_path = get_validated_cache(str(path), model_path, normalized, resolved_extensions) + cache_path = get_validated_cache(str(path), model_path, normalized) if cache_path: return cls.load_from_disk(cache_path) model, model_path = load_model(model_path) path = path.resolve() - file_manifest = sorted(str(f.relative_to(path)) for f in walk_files(path, extensions=resolved_extensions)) + extensions = get_extensions(normalized, None) + file_manifest = sorted(str(f.relative_to(path)) for f in walk_files(path, extensions=extensions)) bm25, vicinity, chunks = create_index_from_path( path, model=model, - extensions=extensions, content=normalized, display_root=path, ) return SembleIndex( - model, - bm25, - vicinity, - chunks, - model_path, - root=path, - content=normalized, - extensions=resolved_extensions, - file_manifest=file_manifest, + model, bm25, vicinity, chunks, model_path, root=path, content=normalized, file_manifest=file_manifest ) @classmethod @@ -188,7 +174,6 @@ def from_git( cls, url: str, ref: str | None = None, - extensions: Sequence[str] | None = None, model_path: str | None = None, content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, include_text_files: bool | None = None, @@ -202,7 +187,6 @@ def from_git( :param url: URL of the git repository to clone (any git provider). :param ref: Branch or tag to check out. Defaults to the remote HEAD. - :param extensions: File extensions to include. Defaults to a standard set of code extensions. :param model_path: Path to the model to use. If None, the default model will be used. :param content: Content types to index, e.g. (ContentType.CODE,) or (ContentType.CODE, ContentType.DOCS). :param include_text_files: Deprecated. Pass content=(ContentType.CODE, ContentType.DOCS, ...) 
instead. @@ -210,9 +194,8 @@ def from_git( :raises RuntimeError: If git is not on PATH, the clone fails, or times out. """ normalized = _apply_include_text_files(content, include_text_files) - resolved_extensions = get_extensions(normalized, extensions) cache_key = f"{url}@{ref}" if ref else url - cache_path = get_validated_cache(cache_key, model_path, normalized, resolved_extensions) + cache_path = get_validated_cache(cache_key, model_path, normalized) if cache_path: return cls.load_from_disk(cache_path) @@ -232,13 +215,13 @@ def from_git( model, model_path = load_model(model_path) resolved_path = Path(tmp_dir).resolve() + extensions = get_extensions(normalized, None) file_manifest = sorted( - str(f.relative_to(resolved_path)) for f in walk_files(resolved_path, extensions=resolved_extensions) + str(f.relative_to(resolved_path)) for f in walk_files(resolved_path, extensions=extensions) ) bm25, vicinity, chunks = create_index_from_path( resolved_path, model=model, - extensions=extensions, content=normalized, display_root=resolved_path, ) @@ -251,7 +234,6 @@ def from_git( model_path, root=resolved_path, content=normalized, - extensions=resolved_extensions, file_manifest=file_manifest, ) @@ -348,7 +330,6 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: chunks.append(Chunk.from_dict(chunk_item)) root_path = metadata["root_path"] model_path = metadata["model_path"] - extensions = metadata.get("extensions") content = tuple(ContentType(s) for s in metadata.get("content_type", ["code"])) file_manifest = metadata.get("file_paths") if root_path: @@ -365,7 +346,6 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: root=root_path, content=content, loaded_from_disk=True, - extensions=extensions, file_manifest=file_manifest, ) @@ -388,7 +368,6 @@ def save(self, path: Path | str) -> None: "time": datetime.now().timestamp(), "model_path": self._model_path, "content_type": list(x.value for x in self._content), - 
"extensions": self._extensions, "file_paths": self._file_manifest if self._file_manifest is not None else [], } with open(persistence_paths.metadata, "wb") as f: diff --git a/tests/index/test_index.py b/tests/index/test_index.py index 98715fc..fab314e 100644 --- a/tests/index/test_index.py +++ b/tests/index/test_index.py @@ -185,6 +185,22 @@ def test_roundtrip(tmp_path: Path, indexed_index: SembleIndex) -> None: assert index_2._root == indexed_index._root +def test_load_save_roundtrip_preserves_manifest(tmp_path: Path, indexed_index: SembleIndex) -> None: + """load_from_disk followed by save must not clobber file_paths with an empty list.""" + save_a = tmp_path / "a" + save_b = tmp_path / "b" + indexed_index.save(save_a) + with patch.object(StaticModel, "from_pretrained"): + loaded = SembleIndex.load_from_disk(save_a) + loaded.save(save_b) + import json + + manifest_a = json.loads((save_a / "metadata.json").read_text())["file_paths"] + manifest_b = json.loads((save_b / "metadata.json").read_text())["file_paths"] + assert manifest_b == manifest_a + assert len(manifest_b) > 0 + + def test_load_non_existent(tmp_path: Path, indexed_index: SembleIndex) -> None: """Test that saving and loading a folder leads to the same data.""" with pytest.raises(FileNotFoundError): diff --git a/tests/test_cache.py b/tests/test_cache.py index 9c0868a..658ed8d 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -101,7 +101,6 @@ def _write_metadata( content_type: list[str], write_time: float, file_paths: list[str] | None = None, - extensions: list[str] | None = None, ) -> None: path.mkdir(parents=True, exist_ok=True) (path / "chunks.json").write_text("[]") @@ -114,7 +113,6 @@ def _write_metadata( "content_type": content_type, "time": write_time, "file_paths": file_paths if file_paths is not None else [], - "extensions": extensions, } ) ) @@ -197,26 +195,6 @@ def test_get_validated_cache_mtime( assert result == (index_path if expected == "index" else None) 
-@pytest.mark.parametrize( - ("stored_extensions", "req_extensions"), - [ - (None, [".py"]), # stored without custom ext, requested with → mismatch - ([".py"], None), # stored with custom ext, requested without → mismatch - ([".py"], [".ts"]), # different custom extensions → mismatch - ], -) -def test_get_validated_cache_extensions_mismatch( - stored_extensions: list[str] | None, - req_extensions: list[str] | None, - tmp_path: Path, -) -> None: - """Returns None when stored extensions don't match the requested extensions.""" - index_path = tmp_path / "index" - _write_metadata(index_path, "my/model", ["code"], float("inf"), extensions=stored_extensions) - with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): - assert get_validated_cache("/path", "my/model", [ContentType.CODE], req_extensions) is None - - @pytest.mark.parametrize( ("stored_files", "current_files"), [ diff --git a/tests/test_cli.py b/tests/test_cli.py index 4b5972d..9011a39 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -210,8 +210,7 @@ def test_index_via_cli(path: str, mock_target: str, tmp_path: Path, monkeypatch: monkeypatch.setattr(sys, "argv", ["semble", "index", path]) with patch(mock_target, return_value=fake_index): with patch("semble.cli.find_index_from_cache_folder", return_value=expected_cache_path): - with patch("semble.cli.time.time", side_effect=[0.0, 2.0]): - _cli_main() + _cli_main() fake_index.save.assert_called_once_with(expected_cache_path) @@ -297,7 +296,7 @@ def test_maybe_save_index_logs_error_on_save_failure(capsys: pytest.CaptureFixtu fake_index.loaded_from_disk = False fake_index.save.side_effect = OSError("disk full") with patch("semble.cli.find_index_from_cache_folder", return_value=Path("/cache")): - _maybe_save_index(fake_index, "/some/path", 2.0) + _maybe_save_index(fake_index, "/some/path") assert "Error saving index" in capsys.readouterr().err From e6e316e8030cbffb986c1f654b4769d2d97e4025 Mon Sep 17 00:00:00 2001 From: Pringled 
Date: Wed, 27 May 2026 06:46:09 +0200 Subject: [PATCH 08/17] Treat malformed cache metadata as stale instead of crashing --- src/semble/cache.py | 7 +++++-- tests/test_cache.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/semble/cache.py b/src/semble/cache.py index e73d733..6960b91 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -67,8 +67,11 @@ def clear_cache(path: str) -> None: def _metadata_matches(metadata: dict, model_path: str, content: Sequence[ContentType]) -> bool: """Return True if the stored metadata is compatible with the requested parameters.""" - content_type = tuple(ContentType(s) for s in metadata["content_type"]) - return metadata["model_path"] == model_path and set(content_type) == set(content) + try: + content_type = tuple(ContentType(s) for s in metadata["content_type"]) + return metadata["model_path"] == model_path and set(content_type) == set(content) + except (KeyError, ValueError): + return False def get_validated_cache(path: str, model_path: str | None, content: Sequence[ContentType]) -> Path | None: diff --git a/tests/test_cache.py b/tests/test_cache.py index 658ed8d..fd1104a 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -134,6 +134,7 @@ def test_get_validated_cache_invalid_index(tmp_path: Path) -> None: [ ("other/model", ["code"], "my/model", [ContentType.CODE]), # model mismatch ("my/model", ["docs"], "my/model", [ContentType.CODE]), # content mismatch + ("my/model", ["unknown_type"], "my/model", [ContentType.CODE]), # invalid content value ], ) def test_get_validated_cache_metadata_mismatch( @@ -150,6 +151,18 @@ def test_get_validated_cache_metadata_mismatch( assert get_validated_cache("/path", req_model, req_content) is None +def test_get_validated_cache_legacy_metadata_returns_none(tmp_path: Path) -> None: + """Old cache metadata missing content_type returns None instead of crashing.""" + index_path = tmp_path / "index" + index_path.mkdir(parents=True) + 
(index_path / "chunks.json").write_text("[]") + (index_path / "bm25_index").write_text("") + (index_path / "semantic_index").write_text("") + (index_path / "metadata.json").write_text(json.dumps({"model_path": "my/model", "time": 0.0})) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + assert get_validated_cache("/path", "my/model", [ContentType.CODE]) is None + + def test_get_validated_cache_resolves_default_model(tmp_path: Path) -> None: """When model_path is None, resolve_model_name() is used for comparison.""" index_path = tmp_path / "index" From ae004470d5549f0fac611f6102fb192661fbfba7 Mon Sep 17 00:00:00 2001 From: Pringled Date: Wed, 27 May 2026 06:51:42 +0200 Subject: [PATCH 09/17] Add docs about sembleignore --- README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.md b/README.md index 657cc1a..ed688a2 100644 --- a/README.md +++ b/README.md @@ -404,6 +404,31 @@ semble find-related src/auth.py 42 ./my-project `--content` accepts `code` (default), `docs`, `config`, or `all`. `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. +
+Controlling which files are indexed + +Semble reads `.gitignore` and `.sembleignore` files to determine which files to index. Both files use standard gitignore syntax and their patterns are merged. `.sembleignore` lets you add semble-specific rules without touching `.gitignore`. Rules are applied recursively, so a `.sembleignore` in a subdirectory applies to that subtree. + +**Excluding files:** add patterns the same way you would in `.gitignore`: + +``` +# .sembleignore +generated/ # exclude generated dir +*.pb.go # exclude Go protobuf files +``` + +**Including non-default extensions:** prefix the extension pattern with `!` to force-include files that semble wouldn't index by default: + +``` +# .sembleignore +!*.proto # include Protobuf files +!*.cob # include COBOL files +``` + +Semble also always skips a set of well-known non-source directories regardless of ignore files (e.g. `node_modules/`, `.venv/`, `dist/`, `build/`, `__pycache__/`, and similar). + +
+
Savings From 3baa90c92afd72957aead81ac431beb6a913328c Mon Sep 17 00:00:00 2001 From: stephantul Date: Wed, 27 May 2026 09:12:14 +0200 Subject: [PATCH 10/17] small fixes --- src/semble/cache.py | 2 +- src/semble/cli.py | 21 --------------------- src/semble/index/create.py | 4 +--- src/semble/index/files.py | 4 +--- src/semble/index/index.py | 20 ++------------------ tests/test_cli.py | 31 ------------------------------- tests/test_files.py | 15 ++------------- 7 files changed, 7 insertions(+), 90 deletions(-) diff --git a/src/semble/cache.py b/src/semble/cache.py index 6960b91..c9d5b40 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -95,7 +95,7 @@ def get_validated_cache(path: str, model_path: str | None, content: Sequence[Con return index_path write_time = metadata["time"] - extensions = get_extensions(list(content), None) + extensions = get_extensions(content) path_as_path = Path(path) stored_files: list[str] = metadata.get("file_paths", []) diff --git a/src/semble/cli.py b/src/semble/cli.py index d47e00b..83ecc58 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -128,21 +128,6 @@ def _resolve_content(content: list[str], include_text_files: bool) -> list[Conte return [ContentType(c) for c in content] -def _run_index(path: str, content: list[ContentType]) -> None: - """Handle the `index` subcommand.""" - try: - index = _build_index(path, content) - except FileNotFoundError as e: - print(str(e), file=sys.stderr) - sys.exit(1) - if not index.loaded_from_disk: - cache_folder = find_index_from_cache_folder(path) - index.save(cache_folder) - print(f"Wrote index to `{cache_folder}`.") - else: - print("Index is already up to date.") - - def _load_index(path: str, content: list[ContentType]) -> SembleIndex: """Build an index from a local path or git URL, exiting on FileNotFoundError.""" try: @@ -182,10 +167,6 @@ def _cli_main() -> None: parser = argparse.ArgumentParser(prog="semble") sub = parser.add_subparsers(dest="command") - index_p = 
sub.add_parser("index", help="Index and store a codebase.") - index_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") - _add_content_args(index_p) - search_p = sub.add_parser("search", help="Search a codebase.") search_p.add_argument("query", help="Natural language or code query.") search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") @@ -218,8 +199,6 @@ def _cli_main() -> None: _run_init(agent=Agent(args.agent), force=args.force) elif args.command == "savings": print(format_savings_report(verbose=args.verbose)) - elif args.command == "index": - _run_index(args.path, _resolve_content(args.content, args.include_text_files)) elif args.command == "search": _run_search(args.path, args.query, args.top_k, _resolve_content(args.content, args.include_text_files)) elif args.command == "find-related": diff --git a/src/semble/index/create.py b/src/semble/index/create.py index b72a055..217ecd1 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -20,7 +20,6 @@ def create_index_from_path( path: Path, model: StaticModel, - extensions: Sequence[str] | None = None, content: ContentType | Sequence[ContentType] = (ContentType.CODE,), display_root: Path | None = None, ) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]: @@ -28,7 +27,6 @@ def create_index_from_path( :param path: Resolved absolute path to index. :param model: The model to use for indexing. - :param extensions: File extensions to include. :param content: Content types to index. :param display_root: If set, chunk file paths are stored relative to this root. :raises ValueError: if no items were found, no index can be created. 
@@ -36,7 +34,7 @@ def create_index_from_path( """ chunks: list[Chunk] = [] normalized = (content,) if isinstance(content, ContentType) else content - resolved_extensions = get_extensions(normalized, extensions) + resolved_extensions = get_extensions(normalized) for file_path in walk_files(path, resolved_extensions): language = detect_language(file_path) with contextlib.suppress(OSError): diff --git a/src/semble/index/files.py b/src/semble/index/files.py index a20e804..2be33aa 100644 --- a/src/semble/index/files.py +++ b/src/semble/index/files.py @@ -461,7 +461,7 @@ def detect_language(file_name: Path) -> str | None: return _EXTENSION_TO_LANGUAGE.get(file_name.suffix.lower()) -def get_extensions(types: Sequence[ContentType], extensions: Sequence[str] | None) -> list[str]: +def get_extensions(types: Sequence[ContentType]) -> list[str]: """Returns a list of supported file extensions for the given content types.""" languages: set[str] = set() for content_type in types: @@ -469,7 +469,5 @@ def get_extensions(types: Sequence[ContentType], extensions: Sequence[str] | Non all_extensions: set[str] = set() for language in languages: all_extensions.update(_LANGUAGE_TO_EXTENSION.get(language, set())) - if extensions is not None: - all_extensions.update(extensions) return sorted(all_extensions) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index bb725d1..cbaf533 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -18,8 +18,6 @@ from semble.cache import get_validated_cache from semble.index.create import create_index_from_path from semble.index.dense import SelectableBasicBackend, load_model -from semble.index.file_walker import walk_files -from semble.index.files import get_extensions from semble.index.types import PersistencePath from semble.search import _search_semantic, search from semble.stats import save_search_stats @@ -61,7 +59,6 @@ def __init__( root: Path | None = None, content: ContentType | Sequence[ContentType] = 
_DEFAULT_CONTENT, loaded_from_disk: bool = False, - file_manifest: list[str] | None = None, ) -> None: """Initialize a SembleIndex. Should be created with from_path or from_git. @@ -73,7 +70,6 @@ def __init__( :param root: Root directory used to read file sizes for token-savings stats. :param content: Content type used when indexing; controls the search pipeline. :param loaded_from_disk: Whether the index was loaded from disk (cache hit); controls CLI messaging. - :param file_manifest: Sorted repo-relative paths of all walked files at index time, used for cache invalidation. """ self.model = model self.chunks: list[Chunk] = chunks @@ -82,7 +78,6 @@ def __init__( self._model_path: str = model_path self._root: Path | None = root self._content: tuple[ContentType, ...] = (content,) if isinstance(content, ContentType) else tuple(content) - self._file_manifest: list[str] | None = file_manifest self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {} self._file_mapping, self._language_mapping = self._populate_mapping() self.loaded_from_disk: bool = loaded_from_disk @@ -156,8 +151,6 @@ def from_path( model, model_path = load_model(model_path) path = path.resolve() - extensions = get_extensions(normalized, None) - file_manifest = sorted(str(f.relative_to(path)) for f in walk_files(path, extensions=extensions)) bm25, vicinity, chunks = create_index_from_path( path, model=model, @@ -165,9 +158,7 @@ def from_path( display_root=path, ) - return SembleIndex( - model, bm25, vicinity, chunks, model_path, root=path, content=normalized, file_manifest=file_manifest - ) + return SembleIndex(model, bm25, vicinity, chunks, model_path, root=path, content=normalized) @classmethod def from_git( @@ -215,10 +206,6 @@ def from_git( model, model_path = load_model(model_path) resolved_path = Path(tmp_dir).resolve() - extensions = get_extensions(normalized, None) - file_manifest = sorted( - str(f.relative_to(resolved_path)) for f in walk_files(resolved_path, 
extensions=extensions) - ) bm25, vicinity, chunks = create_index_from_path( resolved_path, model=model, @@ -234,7 +221,6 @@ def from_git( model_path, root=resolved_path, content=normalized, - file_manifest=file_manifest, ) def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]: @@ -331,7 +317,6 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: root_path = metadata["root_path"] model_path = metadata["model_path"] content = tuple(ContentType(s) for s in metadata.get("content_type", ["code"])) - file_manifest = metadata.get("file_paths") if root_path: root_path = Path(root_path) @@ -346,7 +331,6 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: root=root_path, content=content, loaded_from_disk=True, - file_manifest=file_manifest, ) def save(self, path: Path | str) -> None: @@ -368,7 +352,7 @@ def save(self, path: Path | str) -> None: "time": datetime.now().timestamp(), "model_path": self._model_path, "content_type": list(x.value for x in self._content), - "file_paths": self._file_manifest if self._file_manifest is not None else [], + "file_paths": sorted(self._file_mapping), } with open(persistence_paths.metadata, "wb") as f: data = orjson.dumps(metadata) diff --git a/tests/test_cli.py b/tests/test_cli.py index 9011a39..e0b5de2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -195,40 +195,9 @@ def test_mcp_main_exits_with_message_when_extras_missing( assert "pip install 'semble[mcp]'" in capsys.readouterr().err -@pytest.mark.parametrize( - ("path", "mock_target"), - [ - ("/some/path", "semble.cli.SembleIndex.from_path"), - ("git://xyz.git", "semble.cli.SembleIndex.from_git"), - ], -) -def test_index_via_cli(path: str, mock_target: str, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Index command builds an index and saves it to the cache folder (local and git).""" - fake_index = MagicMock() - fake_index.loaded_from_disk = False - 
expected_cache_path = tmp_path / "hash" / "index" - monkeypatch.setattr(sys, "argv", ["semble", "index", path]) - with patch(mock_target, return_value=fake_index): - with patch("semble.cli.find_index_from_cache_folder", return_value=expected_cache_path): - _cli_main() - fake_index.save.assert_called_once_with(expected_cache_path) - - -def test_index_already_up_to_date(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: - """Index command prints 'up to date' when the index was loaded from disk (cache hit).""" - fake_index = MagicMock() - fake_index.loaded_from_disk = True - monkeypatch.setattr(sys, "argv", ["semble", "index", "/some/path"]) - with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): - _cli_main() - assert "up to date" in capsys.readouterr().out - fake_index.save.assert_not_called() - - @pytest.mark.parametrize( ("command", "argv"), [ - ("index", ["semble", "index", "/no/such/path"]), ("search", ["semble", "search", "query", "/no/such/path"]), ("find-related", ["semble", "find-related", "src/foo.py", "1", "/no/such/path"]), ], diff --git a/tests/test_files.py b/tests/test_files.py index 3ff1bb1..0998967 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -41,7 +41,7 @@ def test_language_sets_are_consistent() -> None: ) def test_get_extensions(types: list[ContentType], includes: list[str], excludes: list[str]) -> None: """get_extensions returns the right extensions for each combination of content types.""" - exts = set(get_extensions(types, None)) + exts = set(get_extensions(types)) for ext in includes: assert ext in exts for ext in excludes: @@ -50,17 +50,6 @@ def test_get_extensions(types: list[ContentType], includes: list[str], excludes: def test_all_excludes_data_extensions() -> None: """--content all does not include data file extensions (csv, json, tsv, psv).""" - all_exts = set(get_extensions(list(ContentType), None)) + all_exts = set(get_extensions(list(ContentType))) for ext in (".csv", 
".tsv", ".psv", ".json", ".json5"): assert ext not in all_exts, f"{ext} should not be indexed by 'all'" - - -def test_get_extensions_additional() -> None: - """Extra extensions are appended and existing ones are not duplicated.""" - base = get_extensions(list(ContentType), None) - with_extra = get_extensions(list(ContentType), [".kjs"]) - assert set(with_extra) == set(base) | {".kjs"} - - base_code = get_extensions([ContentType.CODE], None) - with_existing = get_extensions([ContentType.CODE], [".py"]) - assert set(with_existing) == set(base_code) From 815fc8a6387d2d01aee5c17afbf71d1609e06b33 Mon Sep 17 00:00:00 2001 From: stephantul Date: Wed, 27 May 2026 10:53:12 +0200 Subject: [PATCH 11/17] update file checks --- src/semble/cache.py | 12 ++++++++---- src/semble/index/create.py | 7 +++---- src/semble/index/files.py | 28 ++++++++++++++++++++++++++++ tests/index/test_index.py | 3 ++- tests/test_cache.py | 3 ++- 5 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/semble/cache.py b/src/semble/cache.py index c9d5b40..d5ce927 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -7,7 +7,7 @@ from pathlib import Path from semble.index.file_walker import walk_files -from semble.index.files import get_extensions +from semble.index.files import FileStatus, get_extensions, get_file_status from semble.index.types import PersistencePath from semble.types import ContentType from semble.utils import is_git_url, resolve_model_name @@ -101,10 +101,14 @@ def get_validated_cache(path: str, model_path: str | None, content: Sequence[Con stored_files: list[str] = metadata.get("file_paths", []) current_files = [] for file_path in walk_files(path_as_path, extensions=extensions): - current_files.append(str(file_path.relative_to(path_as_path))) - if file_path.stat().st_mtime > write_time: + file_status = get_file_status(file_path, write_time) + if file_status == FileStatus.NEWER: return None - if sorted(current_files) != stored_files: + if file_status != 
FileStatus.VALID: + continue + current_files.append(str(file_path.relative_to(path_as_path))) + + if set(current_files) != set(stored_files): return None return index_path diff --git a/src/semble/index/create.py b/src/semble/index/create.py index 217ecd1..368e063 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -9,13 +9,11 @@ from semble.chunking import chunk_source from semble.index.dense import SelectableBasicBackend, embed_chunks from semble.index.file_walker import walk_files -from semble.index.files import detect_language, get_extensions +from semble.index.files import detect_language, get_extensions, get_file_status from semble.index.sparse import enrich_for_bm25 from semble.tokens import tokenize from semble.types import Chunk, ContentType -_MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index - def create_index_from_path( path: Path, @@ -38,7 +36,8 @@ def create_index_from_path( for file_path in walk_files(path, resolved_extensions): language = detect_language(file_path) with contextlib.suppress(OSError): - if file_path.stat().st_size > _MAX_FILE_BYTES: + file_status = get_file_status(file_path, None) + if file_status != file_status.VALID: continue source = file_path.read_text(encoding="utf-8", errors="replace") chunk_path = file_path.relative_to(display_root) if display_root else file_path diff --git a/src/semble/index/files.py b/src/semble/index/files.py index 2be33aa..329f7ee 100644 --- a/src/semble/index/files.py +++ b/src/semble/index/files.py @@ -1,9 +1,12 @@ from collections import defaultdict from collections.abc import Sequence +from enum import Enum from pathlib import Path from semble.types import ContentType +_MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index +_EMPTY_FILE_BYTES = 10 _EXTENSION_TO_LANGUAGE = { ".4th": "forth", ".ada": "ada", @@ -471,3 +474,28 @@ def get_extensions(types: Sequence[ContentType]) -> list[str]: all_extensions.update(_LANGUAGE_TO_EXTENSION.get(language, set())) 
return sorted(all_extensions) + + +class FileStatus(str, Enum): + NEWER = "newer" + TOO_LARGE = "too_large" + EMPTY = "empty" + VALID = "valid" + + +def get_file_status(file_path: Path, write_time: float | None) -> FileStatus: + """Checks if a file should be indexed based on its size and modification time.""" + stat = file_path.stat() + if write_time is not None and stat.st_mtime > write_time: + # Index invalid, file invalid + return FileStatus.NEWER + size = stat.st_size + if size > _MAX_FILE_BYTES: + # index valid, file invalid + return FileStatus.TOO_LARGE + if size < _EMPTY_FILE_BYTES and not file_path.read_text().strip(): + # index valid, file invalid + return FileStatus.EMPTY + + # Both valid + return FileStatus.VALID diff --git a/tests/index/test_index.py b/tests/index/test_index.py index fab314e..7a19577 100644 --- a/tests/index/test_index.py +++ b/tests/index/test_index.py @@ -6,7 +6,8 @@ from model2vec import StaticModel from semble import SembleIndex -from semble.index.create import _MAX_FILE_BYTES, create_index_from_path +from semble.index.create import create_index_from_path +from semble.index.files import _MAX_FILE_BYTES from semble.types import ContentType from tests.conftest import make_chunk diff --git a/tests/test_cache.py b/tests/test_cache.py index fd1104a..1fbd59d 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -223,7 +223,8 @@ def test_get_validated_cache_manifest_mismatch( walk_return = [] for f in current_files: p = tmp_path / f - p.write_text("") + # Make sure file is not empty + p.write_text("a") walk_return.append(p) _write_metadata(index_path, "my/model", ["code"], float("inf"), file_paths=stored_files) with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): From 978441e0538c135eab34fc2fa9ac976eef6e33ae Mon Sep 17 00:00:00 2001 From: stephantul Date: Wed, 27 May 2026 11:08:35 +0200 Subject: [PATCH 12/17] address comments --- README.md | 3 --- src/semble/cli.py | 2 +- src/semble/index/create.py | 4 
++-- src/semble/stats.py | 16 +++++++++++----- tests/test_stats.py | 6 +++--- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index ed688a2..42f20f2 100644 --- a/README.md +++ b/README.md @@ -383,9 +383,6 @@ If semble is not on `$PATH`, prefix the command with `uvx --from "semble[mcp]"`. Semble also ships as a standalone CLI. This is useful in scripts or anywhere you want search results without an MCP session. ```bash -# Pre-build the cache for a local repository (optional — search builds it automatically) -semble index ./my-project - # Search a local repo (index is built and cached automatically) semble search "authentication flow" ./my-project diff --git a/src/semble/cli.py b/src/semble/cli.py index 83ecc58..1a0e8b4 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -27,7 +27,7 @@ class Agent(str, Enum): _DEFAULT_AGENT = Agent.CLAUDE -_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "savings", "-h", "--help", "index"}) +_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "savings", "-h", "--help"}) def _build_index(path: str, content: list[ContentType]) -> SembleIndex: diff --git a/src/semble/index/create.py b/src/semble/index/create.py index 368e063..5a9be9d 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -9,7 +9,7 @@ from semble.chunking import chunk_source from semble.index.dense import SelectableBasicBackend, embed_chunks from semble.index.file_walker import walk_files -from semble.index.files import detect_language, get_extensions, get_file_status +from semble.index.files import FileStatus, detect_language, get_extensions, get_file_status from semble.index.sparse import enrich_for_bm25 from semble.tokens import tokenize from semble.types import Chunk, ContentType @@ -37,7 +37,7 @@ def create_index_from_path( language = detect_language(file_path) with contextlib.suppress(OSError): file_status = get_file_status(file_path, None) - if file_status != 
file_status.VALID: + if file_status != FileStatus.VALID: continue source = file_path.read_text(encoding="utf-8", errors="replace") chunk_path = file_path.relative_to(display_root) if display_root else file_path diff --git a/src/semble/stats.py b/src/semble/stats.py index 58f6097..bebc988 100644 --- a/src/semble/stats.py +++ b/src/semble/stats.py @@ -10,7 +10,10 @@ logger = logging.getLogger(__name__) -_STATS_FILE = resolve_cache_folder() / "savings.jsonl" + +def _get_stats_file() -> Path: + """Safely create a stats file.""" + return resolve_cache_folder() / "savings.jsonl" @dataclass @@ -53,15 +56,18 @@ def save_search_stats( "snippet_chars": snippet_chars, "file_chars": file_chars, } - _STATS_FILE.parent.mkdir(parents=True, exist_ok=True) - with _STATS_FILE.open("a") as f: + stats_file = _get_stats_file() + stats_file.parent.mkdir(parents=True, exist_ok=True) + with stats_file.open("a") as f: f.write(json.dumps(record) + "\n") except OSError: pass -def build_savings_summary(path: Path = _STATS_FILE) -> SavingsSummary: +def build_savings_summary(path: Path | None = None) -> SavingsSummary: """Read savings.jsonl and return a SavingsSummary.""" + if path is None: + path = _get_stats_file() now = datetime.now(timezone.utc) today = now.date() seven_days_ago = (now - timedelta(days=7)).date() @@ -99,7 +105,7 @@ def build_savings_summary(path: Path = _STATS_FILE) -> SavingsSummary: def format_savings_report(path: Path | None = None, *, verbose: bool = False) -> str: """Return a formatted token-savings report.""" if path is None: - path = _STATS_FILE + path = _get_stats_file() if not path.exists(): return "No stats yet. Run a search first." 
diff --git a/tests/test_stats.py b/tests/test_stats.py index d3c7689..00b2f81 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -32,14 +32,14 @@ def test_save_search_stats(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> N chunk = make_chunk("hello", "src/foo.py") result = SearchResult(chunk=chunk, score=0.9) stats_file = tmp_path / "stats.jsonl" - monkeypatch.setattr("semble.stats._STATS_FILE", stats_file) + monkeypatch.setattr("semble.stats._get_stats_file", lambda: stats_file) save_search_stats([result, result], CallType.SEARCH, {"src/foo.py": 42}) assert json.loads(stats_file.read_text())["file_chars"] == 42 mock_path = MagicMock() mock_path.parent.mkdir.return_value = None mock_path.open.side_effect = OSError("no write") - monkeypatch.setattr("semble.stats._STATS_FILE", mock_path) + monkeypatch.setattr("semble.stats._get_stats_file", lambda: mock_path) save_search_stats([result], CallType.SEARCH, {"src/foo.py": 42}) # must not raise @@ -108,7 +108,7 @@ def test_savings_cli_dispatch( ) -> None: """Savings subcommand dispatches to format_savings_report, with and without --verbose.""" monkeypatch.setattr(sys, "argv", argv) - monkeypatch.setattr("semble.stats._STATS_FILE", tmp_path / "nonexistent.jsonl") + monkeypatch.setattr("semble.stats._get_stats_file", lambda: tmp_path / "nonexistent.jsonl") _cli_main() assert expected in capsys.readouterr().out From bd5710ea72021353e4a48fbc3fc802fceeb0b38c Mon Sep 17 00:00:00 2001 From: stephantul Date: Wed, 27 May 2026 11:28:05 +0200 Subject: [PATCH 13/17] fix tests --- src/semble/cache.py | 2 +- tests/test_cache.py | 11 ++++++----- tests/test_stats.py | 7 ++++++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/semble/cache.py b/src/semble/cache.py index d5ce927..9666eee 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -97,7 +97,7 @@ def get_validated_cache(path: str, model_path: str | None, content: Sequence[Con write_time = metadata["time"] extensions = 
get_extensions(content) - path_as_path = Path(path) + path_as_path = Path(path).resolve() stored_files: list[str] = metadata.get("file_paths", []) current_files = [] for file_path in walk_files(path_as_path, extensions=extensions): diff --git a/tests/test_cache.py b/tests/test_cache.py index 1fbd59d..0a1609a 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -183,19 +183,20 @@ def test_get_validated_cache_git_url_returns_immediately(tmp_path: Path) -> None @pytest.mark.parametrize( - ("write_time", "walk_result", "expected"), + ("write_time", "walk_result", "write", "expected"), [ - (0.0, "stale", None), # file newer than index → stale - (float("inf"), [], "index"), # no newer files → valid + (0.0, "stale", True, None), # file newer than index → stale + (float("inf"), [], True, "index"), # no newer files → valid + (float("inf"), "stale", False, None), # no index, returns None ], ) def test_get_validated_cache_mtime( - write_time: float, walk_result: str | list, expected: str | None, tmp_path: Path + write_time: float, walk_result: str | list, write: bool, expected: str | None, tmp_path: Path ) -> None: """Returns None when a tracked file is newer than the index; the path otherwise.""" index_path = tmp_path / "index" stale_file = tmp_path / "src.py" - stale_file.write_text("x = 1") + stale_file.write_text("x = 1" if write else "") files = [stale_file] if walk_result == "stale" else walk_result # Include the file in stored manifest so manifest check passes and mtime check fires. 
stored_files = ["src.py"] if walk_result == "stale" else [] diff --git a/tests/test_stats.py b/tests/test_stats.py index 00b2f81..e3c1f32 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -2,7 +2,7 @@ import sys from datetime import datetime, timezone from pathlib import Path -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest @@ -87,6 +87,11 @@ def test_savings_do_not_subtract_unknown_baselines(tmp_path: Path) -> None: assert summary.buckets["All time"].saved_chars == 400 assert "~100 tokens" in format_savings_report(path=stats_file) + with patch("semble.stats._get_stats_file", lambda: stats_file): + summary = build_savings_summary(path=None) + assert summary.buckets["All time"].saved_chars == 400 + assert "~100 tokens" in format_savings_report(path=stats_file) + def test_savings_tolerates_bad_json(tmp_path: Path) -> None: """Malformed JSON lines are skipped with a warning.""" From 1e1bdc0a60bfb5dbed6eb52d80fc5b5d4e4390f2 Mon Sep 17 00:00:00 2001 From: Pringled Date: Wed, 27 May 2026 16:43:36 +0200 Subject: [PATCH 14/17] Make MCP save/load from cache as well, update _empty_file_bytes to 128, bump version --- src/semble/cache.py | 7 +++++++ src/semble/index/files.py | 2 +- src/semble/mcp.py | 35 ++++++++++++++++------------------- src/semble/version.py | 2 +- tests/test_cache.py | 11 ++++++++++- tests/test_mcp.py | 17 ++++++++++++++++- 6 files changed, 51 insertions(+), 23 deletions(-) diff --git a/src/semble/cache.py b/src/semble/cache.py index 9666eee..0f045d8 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -5,6 +5,7 @@ import sys from collections.abc import Sequence from pathlib import Path +from typing import Any from semble.index.file_walker import walk_files from semble.index.files import FileStatus, get_extensions, get_file_status @@ -65,6 +66,12 @@ def clear_cache(path: str) -> None: shutil.rmtree(index_path) +def save_index_to_cache(index: Any, path: str) -> None: + """Save an index to 
the cache folder if it was freshly built.""" + if not index.loaded_from_disk: + index.save(find_index_from_cache_folder(path)) + + def _metadata_matches(metadata: dict, model_path: str, content: Sequence[ContentType]) -> bool: """Return True if the stored metadata is compatible with the requested parameters.""" try: diff --git a/src/semble/index/files.py b/src/semble/index/files.py index 329f7ee..45ea0ab 100644 --- a/src/semble/index/files.py +++ b/src/semble/index/files.py @@ -6,7 +6,7 @@ from semble.types import ContentType _MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index -_EMPTY_FILE_BYTES = 10 +_EMPTY_FILE_BYTES = 128 _EXTENSION_TO_LANGUAGE = { ".4th": "forth", ".ada": "ada", diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 3aa526a..6c76df3 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -12,6 +12,7 @@ from mcp.server.fastmcp import FastMCP from pydantic import Field +from semble.cache import save_index_to_cache from semble.index import SembleIndex from semble.index.dense import load_model from semble.types import ContentType @@ -174,6 +175,18 @@ def _compute_cache_key(self, source: str, ref: str | None = None) -> str: is_git = is_git_url(source) return (f"{source}@{ref}" if ref else source) if is_git else str(Path(source).resolve()) + def _build_and_cache_index(self, source: str, ref: str | None, model_path: str, cache_key: str) -> SembleIndex: + index = ( + SembleIndex.from_git(source, ref=ref, model_path=model_path, content=self._content) + if is_git_url(source) + else SembleIndex.from_path(cache_key, model_path=model_path, content=self._content) + ) + try: + save_index_to_cache(index, cache_key) + except Exception: + logger.warning("Failed to save index cache for %r", cache_key, exc_info=True) + return index + def evict(self, source: str) -> None: self._tasks.pop(self._compute_cache_key(source), None) @@ -203,25 +216,9 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex: if cache_key not in 
self._tasks: if len(self._tasks) >= _CACHE_MAX_SIZE: self._tasks.popitem(last=False) - if is_git_url(source): - self._tasks[cache_key] = asyncio.create_task( - asyncio.to_thread( - SembleIndex.from_git, - source, - ref=ref, - model_path=model_path, - content=self._content, - ) - ) - else: - self._tasks[cache_key] = asyncio.create_task( - asyncio.to_thread( - SembleIndex.from_path, - cache_key, - model_path=model_path, - content=self._content, - ) - ) + self._tasks[cache_key] = asyncio.create_task( + asyncio.to_thread(self._build_and_cache_index, source, ref, model_path, cache_key) + ) self._tasks.move_to_end(cache_key) task = self._tasks[cache_key] try: diff --git a/src/semble/version.py b/src/semble/version.py index 6036609..b7e3609 100644 --- a/src/semble/version.py +++ b/src/semble/version.py @@ -1,2 +1,2 @@ -__version_triple__ = (0, 2, 0) +__version_triple__ = (0, 3, 0) __version__ = ".".join(map(str, __version_triple__)) diff --git a/tests/test_cache.py b/tests/test_cache.py index 0a1609a..e46eb20 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -3,7 +3,7 @@ import json import sys from pathlib import Path -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest @@ -14,6 +14,7 @@ find_index_from_cache_folder, get_validated_cache, resolve_cache_folder, + save_index_to_cache, ) from semble.types import ContentType @@ -67,6 +68,14 @@ def test_cache_dir_no_env(fn: object, expected_rel: Path) -> None: assert fn("semble") == home / expected_rel # type: ignore[operator] +def test_save_index_to_cache(tmp_path: Path) -> None: + """A freshly built index is saved under its cache key.""" + index = MagicMock(loaded_from_disk=False) + with patch("semble.cache.find_index_from_cache_folder", return_value=tmp_path / "index"): + save_index_to_cache(index, "repo") + index.save.assert_called_once_with(tmp_path / "index") + + @pytest.mark.parametrize( ("platform", "mock_target", "expected"), [ diff --git a/tests/test_mcp.py 
b/tests/test_mcp.py index 900477a..8521b32 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -118,12 +118,16 @@ async def test_index_cache_builds_and_caches( """_IndexCache.get() builds via the correct SembleIndex.* entrypoint and caches subsequent calls.""" resolved_source = str(tmp_path) if source == "local_tmp_path" else source fake_index = MagicMock() - with patch(f"semble.mcp.SembleIndex.{patch_target}", return_value=fake_index) as mock_build: + with ( + patch(f"semble.mcp.SembleIndex.{patch_target}", return_value=fake_index) as mock_build, + patch("semble.mcp.save_index_to_cache") as mock_save, + ): first = await cache.get(resolved_source) second = await cache.get(resolved_source) assert first is fake_index assert second is fake_index mock_build.assert_called_once() + mock_save.assert_called_once_with(fake_index, cache._compute_cache_key(resolved_source)) @pytest.mark.anyio @@ -146,6 +150,17 @@ def _failing_then_ok(path: str, **kwargs: object) -> MagicMock: assert call_count == 2 +@pytest.mark.anyio +async def test_index_cache_ignores_cache_save_failure(cache: _IndexCache, tmp_path: Path) -> None: + """A cache save failure must not fail the MCP request.""" + fake_index = MagicMock() + with ( + patch("semble.mcp.SembleIndex.from_path", return_value=fake_index), + patch("semble.mcp.save_index_to_cache", side_effect=RuntimeError("save failed")), + ): + assert await cache.get(str(tmp_path)) is fake_index + + @pytest.mark.anyio @pytest.mark.parametrize( ("tool", "args"), From e7766ded8ccf7a11012c364c3957bdb2ad4be855 Mon Sep 17 00:00:00 2001 From: Pringled Date: Wed, 27 May 2026 16:46:00 +0200 Subject: [PATCH 15/17] Update docstring --- src/semble/mcp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 6c76df3..f31d0d8 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -176,6 +176,7 @@ def _compute_cache_key(self, source: str, ref: str | None = None) -> str: return (f"{source}@{ref}" if ref else source) if 
is_git else str(Path(source).resolve()) def _build_and_cache_index(self, source: str, ref: str | None, model_path: str, cache_key: str) -> SembleIndex: + """Build an index for the given source and cache it.""" index = ( SembleIndex.from_git(source, ref=ref, model_path=model_path, content=self._content) if is_git_url(source) From d7e58ea3327d390401f4983050fcb27a45cca22c Mon Sep 17 00:00:00 2001 From: Pringled Date: Wed, 27 May 2026 16:49:09 +0200 Subject: [PATCH 16/17] Keep stephan happy --- src/semble/cache.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/semble/cache.py b/src/semble/cache.py index 0f045d8..6f6ea49 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -5,7 +5,7 @@ import sys from collections.abc import Sequence from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING from semble.index.file_walker import walk_files from semble.index.files import FileStatus, get_extensions, get_file_status @@ -13,6 +13,9 @@ from semble.types import ContentType from semble.utils import is_git_url, resolve_model_name +if TYPE_CHECKING: + from semble.index import SembleIndex + def find_index_from_cache_folder(path: str) -> Path: """Finds an index from a cache folder and a project path.""" @@ -66,7 +69,7 @@ def clear_cache(path: str) -> None: shutil.rmtree(index_path) -def save_index_to_cache(index: Any, path: str) -> None: +def save_index_to_cache(index: "SembleIndex", path: str) -> None: """Save an index to the cache folder if it was freshly built.""" if not index.loaded_from_disk: index.save(find_index_from_cache_folder(path)) From c655abbd6c109c3bbb5cca770fe7a63155c49786 Mon Sep 17 00:00:00 2001 From: Pringled Date: Wed, 27 May 2026 17:04:47 +0200 Subject: [PATCH 17/17] Add safe file read function --- src/semble/index/create.py | 4 ++-- src/semble/index/files.py | 7 ++++++- src/semble/index/index.py | 3 ++- tests/index/test_index.py | 9 ++++++++- 4 files changed, 18 insertions(+), 5 deletions(-) diff 
--git a/src/semble/index/create.py b/src/semble/index/create.py index 5a9be9d..b4dd189 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -9,7 +9,7 @@ from semble.chunking import chunk_source from semble.index.dense import SelectableBasicBackend, embed_chunks from semble.index.file_walker import walk_files -from semble.index.files import FileStatus, detect_language, get_extensions, get_file_status +from semble.index.files import FileStatus, detect_language, get_extensions, get_file_status, read_file_text from semble.index.sparse import enrich_for_bm25 from semble.tokens import tokenize from semble.types import Chunk, ContentType @@ -39,7 +39,7 @@ def create_index_from_path( file_status = get_file_status(file_path, None) if file_status != FileStatus.VALID: continue - source = file_path.read_text(encoding="utf-8", errors="replace") + source = read_file_text(file_path) chunk_path = file_path.relative_to(display_root) if display_root else file_path chunks.extend(chunk_source(source, str(chunk_path), language)) diff --git a/src/semble/index/files.py b/src/semble/index/files.py index 45ea0ab..7aa0702 100644 --- a/src/semble/index/files.py +++ b/src/semble/index/files.py @@ -483,6 +483,11 @@ class FileStatus(str, Enum): VALID = "valid" +def read_file_text(file_path: Path) -> str: + """Read a file's text content, replacing invalid characters and silencing read errors.""" + return file_path.read_text(encoding="utf-8", errors="replace") + + def get_file_status(file_path: Path, write_time: float | None) -> FileStatus: """Checks if a file should be indexed based on its size and modification time.""" stat = file_path.stat() @@ -493,7 +498,7 @@ def get_file_status(file_path: Path, write_time: float | None) -> FileStatus: if size > _MAX_FILE_BYTES: # index valid, file invalid return FileStatus.TOO_LARGE - if size < _EMPTY_FILE_BYTES and not file_path.read_text().strip(): + if size < _EMPTY_FILE_BYTES and not read_file_text(file_path).strip(): # index 
valid, file invalid return FileStatus.EMPTY diff --git a/src/semble/index/index.py b/src/semble/index/index.py index cbaf533..8bab699 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -18,6 +18,7 @@ from semble.cache import get_validated_cache from semble.index.create import create_index_from_path from semble.index.dense import SelectableBasicBackend, load_model +from semble.index.files import read_file_text from semble.index.types import PersistencePath from semble.search import _search_semantic, search from semble.stats import save_search_stats @@ -101,7 +102,7 @@ def _compute_file_sizes(self, root: Path) -> dict[str, int]: if chunk.file_path in sizes: continue try: - sizes[chunk.file_path] = len((root / chunk.file_path).read_text(encoding="utf-8", errors="replace")) + sizes[chunk.file_path] = len(read_file_text(root / chunk.file_path)) except OSError: pass return sizes diff --git a/tests/index/test_index.py b/tests/index/test_index.py index 7a19577..76a9d04 100644 --- a/tests/index/test_index.py +++ b/tests/index/test_index.py @@ -7,7 +7,7 @@ from semble import SembleIndex from semble.index.create import create_index_from_path -from semble.index.files import _MAX_FILE_BYTES +from semble.index.files import _MAX_FILE_BYTES, FileStatus, get_file_status from semble.types import ContentType from tests.conftest import make_chunk @@ -75,6 +75,13 @@ def test_oversized_file_is_skipped(mock_model: StaticModel, tmp_path: Path) -> N create_index_from_path(tmp_path, mock_model) +def test_tiny_invalid_utf8_file_status_does_not_crash(tmp_path: Path) -> None: + """Tiny files with invalid UTF-8 bytes are treated as non-empty.""" + path = tmp_path / "latin1.py" + path.write_bytes(b"\xff") + assert get_file_status(path, None) is FileStatus.VALID + + def test_index_language_counts(indexed_index: SembleIndex) -> None: """Language breakdown in stats includes python with at least one chunk.""" stats = indexed_index.stats