diff --git a/.claude/settings.local.json b/.claude/settings.local.json index c66a86c..c62b52d 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -15,7 +15,8 @@ "Bash(gh issue view:*)", "Bash(gh repo view:*)", "Bash(cargo build:*)", - "Bash(cargo search:*)" + "Bash(cargo search:*)", + "WebFetch(domain:docs.rs)" ] } } diff --git a/Cargo.lock b/Cargo.lock index 1c32895..f543b38 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -278,6 +278,16 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "bumpalo" version = "3.20.2" @@ -452,6 +462,8 @@ dependencies = [ "codebook_config", "env_logger", "fs2", + "globset", + "ignore", "log", "lru", "serde", @@ -574,6 +586,25 @@ dependencies = [ "libc", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -987,6 +1018,19 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "globset" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + 
"regex-syntax", +] + [[package]] name = "h2" version = "0.4.13" @@ -1341,6 +1385,22 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "ignore" +version = "0.4.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3d782a365a015e0f5c04902246139249abf769125006fbe7649e2ee88169b4a" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "indexmap" version = "2.13.0" diff --git a/Cargo.toml b/Cargo.toml index 1e33f56..714cfe4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,8 @@ env_logger = "0.11.6" fs2 = "0.4" git2 = "0.20.0" glob = "0.3" +globset = "0.4" +ignore = "0.4" httpmock = "<0.9.0" lazy_static = "1.5.0" log = "0.4.22" diff --git a/README.md b/README.md index 074e96f..1cc0953 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,28 @@ Any editor that implements the Language Server Protocol should be compatible wit codebook-lsp serve ``` +### CLI (Lint) (Unstable) + +Codebook can also be used as a standalone command-line spell checker, which is useful for CI pipelines, pre-commit hooks, or one-off checks. + +Note: this command is currently experimental, unstable, and subject to breaking changes in future releases. Please submit feedback! + +```sh +# Check specific files +codebook-lsp lint src/main.rs src/lib.rs + +# Check all files in a directory (recursive) +codebook-lsp lint src/ + +# Show spelling suggestions +codebook-lsp lint --suggest src/ + +# Only report each misspelled word once across all files +codebook-lsp lint --unique src/ +``` + +The exit code is **0** if all files are clean, **1** if any spelling errors are found, and **2** if there were unreadable files, invalid UTF-8, etc. + ## About Codebook is a spell checker for code. It binds together the venerable Tree Sitter and the fast spell checker [Spellbook](https://github.com/helix-editor/spellbook). 
Included is a Language Server for use in (theoretically) any editor. Everything is done in Rust to keep response times snappy and memory usage _low_. diff --git a/crates/codebook-lsp/Cargo.toml b/crates/codebook-lsp/Cargo.toml index 7c2a99a..fc2fdb2 100644 --- a/crates/codebook-lsp/Cargo.toml +++ b/crates/codebook-lsp/Cargo.toml @@ -28,6 +28,8 @@ env_logger.workspace = true fs2.workspace = true log.workspace = true lru.workspace = true +globset.workspace = true +ignore.workspace = true serde.workspace = true serde_json.workspace = true string-offsets.workspace = true diff --git a/crates/codebook-lsp/src/lint.rs b/crates/codebook-lsp/src/lint.rs new file mode 100644 index 0000000..6f23709 --- /dev/null +++ b/crates/codebook-lsp/src/lint.rs @@ -0,0 +1,390 @@ +use codebook::Codebook; +use codebook_config::{CodebookConfig, CodebookConfigFile}; +use globset::Glob; +use ignore::WalkBuilder; +use std::collections::HashSet; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use string_offsets::{AllConfig, StringOffsets}; + +macro_rules! err { + ($($arg:tt)*) => { + eprintln!("error: {}", format_args!($($arg)*)) + }; +} + +/// Result of a lint run, mapped to exit codes by the caller. +pub enum LintResult { + /// All files clean — exit 0. + Clean, + /// Spelling errors found — exit 1. + Errors, + /// Infrastructure failure (IO errors, bad patterns, etc.) — exit 2. + Failure, +} + +/// Computes a workspace-relative path string for a given file. Falls back to +/// the absolute path if the file is outside the workspace or canonicalization +/// fails. `root_canonical` should be the already-canonicalized workspace root. 
+fn relative_to_root(root_canonical: Option<&Path>, path: &Path) -> String { + root_canonical + .and_then(|root| { + let canon = path.canonicalize().ok()?; + canon + .strip_prefix(root) + .ok() + .map(|rel| rel.to_string_lossy().into_owned()) + }) + .unwrap_or_else(|| path.to_string_lossy().into_owned()) +} + +pub fn run_lint(files: &[String], root: &Path, unique: bool, suggest: bool) -> LintResult { + let config = match CodebookConfigFile::load(Some(root)) { + Ok(c) => Arc::new(c), + Err(e) => { + err!("failed to load config: {e}"); + return LintResult::Failure; + } + }; + + print_config_source(&config); + eprintln!(); + + let codebook = match Codebook::new(config.clone()) { + Ok(c) => c, + Err(e) => { + err!("failed to initialize: {e}"); + return LintResult::Failure; + } + }; + + // Canonicalize the root once here rather than once per file. + let root_canonical = root.canonicalize().ok(); + + let (resolved, mut had_failure) = resolve_paths(files, root); + + let mut seen_words: HashSet<String> = HashSet::new(); + let mut total_errors = 0usize; + let mut files_with_errors = 0usize; + + for path in &resolved { + let relative = relative_to_root(root_canonical.as_deref(), path); + + if config.should_ignore_path(Path::new(&relative)) { + continue; + } + if !config.should_include_path(Path::new(&relative)) { + continue; + } + + let (errors, file_failure) = + check_file(path, &relative, &codebook, &mut seen_words, unique, suggest); + had_failure |= file_failure; + if errors > 0 { + total_errors += errors; + files_with_errors += 1; + } + } + + let unique_label = if unique { "unique " } else { "" }; + eprintln!( + "Found {total_errors} {unique_label}spelling error(s) in {files_with_errors} file(s)." + ); + + if had_failure { + LintResult::Failure + } else if total_errors > 0 { + LintResult::Errors + } else { + LintResult::Clean + } +} + +/// Spell-checks a single file and prints any diagnostics to stdout. +/// +/// Returns `(error_count, had_io_error)`.
`error_count` is 0 if the file was +/// clean; `had_io_error` is true when the file could not be read. `relative` is +/// the workspace-relative path used for display and ignore matching. +fn check_file( + path: &Path, + relative: &str, + codebook: &Codebook, + seen_words: &mut HashSet<String>, + unique: bool, + suggest: bool, +) -> (usize, bool) { + let text = match std::fs::read_to_string(path) { + Ok(t) => t, + Err(e) if e.kind() == std::io::ErrorKind::InvalidData => { + // Binary / non-UTF-8 file — silently skip. + return (0, false); + } + Err(e) => { + err!("{}: {e}", path.display()); + return (0, true); + } + }; + + let display = relative.strip_prefix("./").unwrap_or(relative); + + // Build the offset table once per file + let offsets = StringOffsets::<AllConfig>::new(&text); + let mut locations = codebook.spell_check(&text, None, Some(relative)); + // Sort inner locations first (HashSet iteration order is nondeterministic), + // then sort the outer list by first occurrence in the file. + for wl in &mut locations { + wl.locations.sort_by_key(|r| r.start_byte); + } + locations.sort_by_key(|l| l.locations.first().map(|r| r.start_byte).unwrap_or(0)); + + // Collect hits first so we can compute pad_len for column alignment. The + // unique check is per-word, so all ranges of a word are included or skipped + // together. + let mut hits: Vec<(String, &str, Option<Vec<String>>)> = Vec::new(); + for wl in &locations { + if unique && !seen_words.insert(wl.word.to_lowercase()) { + continue; + } + + let mut suggestions = if suggest { + codebook.get_suggestions(wl.word.as_str()) + } else { + None + }; + + // If unique mode: Only emit the first occurrence of each word. + let ranges = if unique { + &wl.locations[..1] + } else { + &wl.locations[..] + }; + + for (i, range) in ranges.iter().enumerate() { + // utf8_to_char_pos returns 0-based line and Unicode-char column.
+ let pos = offsets.utf8_to_char_pos(range.start_byte.min(text.len())); + + // Move out of `suggestions` on the last iteration to avoid a clone. + let sugg = if i + 1 < ranges.len() { + suggestions.clone() + } else { + suggestions.take() + }; + + hits.push(( + format!("{}:{}", pos.line + 1, pos.col + 1), + wl.word.as_str(), + sugg, + )); + } + } + + if hits.is_empty() { + return (0, false); + } + + let pad_len = hits.iter().map(|(lc, _, _)| lc.len()).max().unwrap_or(0); + + println!("{display}"); + for (linecol, word, suggestions) in &hits { + let pad = " ".repeat(pad_len - linecol.len()); + if let Some(s) = suggestions { + println!(" {display}:{linecol}{pad} {word} -> {}", s.join(", ")); + } else { + println!(" {display}:{linecol}{pad} {word}"); + } + } + println!(); + + (hits.len(), false) +} + +/// Prints which config file is being used, or notes that the default is active. +fn print_config_source(config: &CodebookConfigFile) { + let cwd = std::env::current_dir().unwrap_or_default(); + let (label, path) = match ( + config.project_config_path().filter(|p| p.is_file()), + config.global_config_path().filter(|p| p.is_file()), + ) { + (Some(p), _) => ("using config", p), + (None, Some(g)) => ("using global config", g), + (None, None) => { + eprintln!("No config found, using default config"); + return; + } + }; + let display = path + .strip_prefix(&cwd) + .unwrap_or(&path) + .display() + .to_string(); + eprintln!("{label} {display}"); +} + +/// Resolves a mix of file paths, directories, and glob patterns into a sorted, +/// deduplicated list of file paths. All paths are resolved through the `ignore` +/// crate's `WalkBuilder`, which respects `.gitignore` rules (including nested +/// ones) and skips hidden files/directories. +/// +/// Returns `(paths, had_failure)`. `had_failure` is true for unmatched +/// patterns, invalid globs, or walk I/O errors. 
+fn resolve_paths(patterns: &[String], root: &Path) -> (Vec<PathBuf>, bool) { + let mut paths = Vec::new(); + let mut had_failure = false; + + for pattern in patterns { + // root.join() is a no-op when pattern is absolute + let p = root.join(pattern); + if p.is_dir() { + had_failure |= collect_walk(&mut WalkBuilder::new(&p), &mut paths); + } else if p.is_file() { + paths.push(p); + } else { + // Treat as a glob pattern. Walk from the workspace root so that + // the full .gitignore hierarchy (parent + nested) is respected, + // then post-filter surviving files against the glob. + let pattern_str = p.to_string_lossy(); + let matcher = match Glob::new(&pattern_str).map(|g| g.compile_matcher()) { + Ok(m) => m, + Err(e) => { + err!("invalid pattern '{pattern_str}': {e}"); + had_failure = true; + continue; + } + }; + let before = paths.len(); + let mut walker = WalkBuilder::new(root); + for entry in walker.follow_links(false).build() { + match entry { + Ok(e) if e.file_type().is_some_and(|ft| ft.is_file()) => { + if matcher.is_match(e.path()) { + paths.push(e.into_path()); + } + } + Ok(_) => {} + Err(e) => { + err!("walk error: {e}"); + had_failure = true; + } + } + } + if paths.len() == before { + err!("no match for '{pattern_str}'"); + had_failure = true; + } + } + } + + paths.sort(); + paths.dedup(); + (paths, had_failure) +} + +/// Walks using the given `WalkBuilder`, collecting all files into `out`. +/// Respects `.gitignore` rules (including nested) and skips hidden +/// files/directories. Returns `true` if any I/O error occurred.
+fn collect_walk(walker: &mut WalkBuilder, out: &mut Vec<PathBuf>) -> bool { + let mut had_failure = false; + for entry in walker.follow_links(false).build() { + match entry { + Ok(e) if e.file_type().is_some_and(|ft| ft.is_file()) => out.push(e.into_path()), + Ok(_) => {} + Err(e) => { + err!("walk error: {e}"); + had_failure = true; + } + } + } + had_failure +} + +#[cfg(test)] +mod tests { + use super::*; + use codebook::Codebook; + use codebook_config::CodebookConfigMemory; + use std::collections::HashSet; + use std::fs; + use std::sync::Arc; + use tempfile::tempdir; + + #[test] + fn test_path_and_dir_resolution() { + let dir = tempdir().unwrap(); + let sub = dir.path().join("sub"); + fs::create_dir_all(&sub).unwrap(); + + let f1 = dir.path().join("a.rs"); + let f2 = sub.join("b.txt"); + fs::write(&f1, "").unwrap(); + fs::write(&f2, "").unwrap(); + + let root_canon = dir.path().canonicalize().unwrap(); + assert_eq!(relative_to_root(Some(&root_canon), &f1), "a.rs"); + + let pattern = format!("{}/**/*.*", dir.path().display()); + let (paths, err) = resolve_paths(&[pattern], dir.path()); + + assert!(!err); + assert_eq!(paths.len(), 2); + let path_strs: HashSet<_> = paths.iter().map(|p| p.to_string_lossy()).collect(); + assert!(path_strs.iter().any(|s| s.ends_with("a.rs"))); + assert!(path_strs.iter().any(|s| s.ends_with("b.txt"))); + + let (_, err_missing) = resolve_paths(&["nonexistent.rs".into()], dir.path()); + assert!(err_missing); + } + + #[test] + fn test_check_file_logic() { + let dir = tempdir().unwrap(); + let f = dir.path().join("test.txt"); + fs::write(&f, "actualbad\n🦀 actualbad").unwrap(); + + let cb = Codebook::new(Arc::new(CodebookConfigMemory::default())).unwrap(); + let mut seen = HashSet::new(); + + // Test basic flagging and multi-occurrence counting + let (count, err) = check_file(&f, "test.txt", &cb, &mut seen, false, false); + assert_eq!(count, 2); + assert!(!err); + + // Test unique mode + let mut seen_unique = HashSet::new(); + let (c1, _) =
check_file(&f, "f1.txt", &cb, &mut seen_unique, true, false); + let (c2, _) = check_file(&f, "f2.txt", &cb, &mut seen_unique, true, false); + assert_eq!(c1, 1, "Should flag word once"); + assert_eq!(c2, 0, "Should skip already-seen word in second file"); + + // Test IO failure + let (_, err_io) = check_file( + &dir.path().join("missing"), + "!", + &cb, + &mut seen, + false, + false, + ); + assert!(err_io); + } + + #[test] + fn test_unicode_line_col() { + let cases = [ + ("actualbad", 0, 1, 1), // Start + ("ok\nactualbad", 3, 2, 1), // Newline + ("résumé actualbad", 9, 1, 8), // Multi-byte chars (é is 2 bytes) + ("🦀 actualbad", 5, 1, 3), // Emoji (4 bytes, 1 char) + ]; + + for (text, offset, line, col) in cases { + let table = StringOffsets::<AllConfig>::new(text); + let pos = table.utf8_to_char_pos(offset); + assert_eq!( + (pos.line + 1, pos.col + 1), + (line, col), + "Failed on: {}", + text + ); + } + } +} diff --git a/crates/codebook-lsp/src/main.rs b/crates/codebook-lsp/src/main.rs index bc055ee..ff682e6 100644 --- a/crates/codebook-lsp/src/main.rs +++ b/crates/codebook-lsp/src/main.rs @@ -1,5 +1,6 @@ mod file_cache; mod init_options; +mod lint; mod lsp; mod lsp_logger; @@ -30,19 +31,35 @@ enum Commands { Serve {}, /// Remove server cache Clean {}, + /// Check files for spelling errors + Lint { + /// Files or glob patterns to spell-check + #[arg(required = true)] + files: Vec<String>, + /// Only report each misspelled word once, ignoring duplicates across files + #[arg(short = 'u', long)] + unique: bool, + /// Show spelling suggestions for each misspelled word + #[arg(short = 's', long)] + suggest: bool, + }, } #[tokio::main(flavor = "current_thread")] async fn main() { - // Initialize logger early with stderr output and buffering - // Default to INFO level, will be adjusted when LSP client connects + let cli = Cli::parse(); + + // Initialize logger early with stderr output and buffering. + // Default to INFO for LSP, WARN for lint (to suppress LSP-oriented noise).
+ let is_lint = matches!(cli.command, Some(Commands::Lint { .. })); let log_level = match env::var("RUST_LOG").as_deref() { Ok("debug") => LevelFilter::Debug, + Ok("info") => LevelFilter::Info, + _ if is_lint => LevelFilter::Warn, _ => LevelFilter::Info, }; LspLogger::init_early(log_level).expect("Failed to initialize early logger"); debug!("Logger initialized with log level: {log_level:?}"); - let cli = Cli::parse(); let root = match cli.root.as_deref() { Some(path) => path, @@ -58,6 +75,18 @@ async fn main() { info!("Cleaning: {:?}", config.cache_dir); config.clean_cache() } + Some(Commands::Lint { + files, + unique, + suggest, + }) => { + let code = match lint::run_lint(files, root, *unique, *suggest) { + lint::LintResult::Clean => 0, + lint::LintResult::Errors => 1, + lint::LintResult::Failure => 2, + }; + std::process::exit(code); + } None => {} } }