From 5108f88811379dc66cdbd55c9de25a2f09c18bae Mon Sep 17 00:00:00 2001 From: prosdev Date: Wed, 1 Apr 2026 16:33:33 -0700 Subject: [PATCH 01/11] feat(core): add Rust tree-sitter type and grammar validation tests Step 0 of Phase 5: validate tree-sitter-rust grammar node names before building the scanner. All 14 tests confirm expected node types: function_item, struct_item, enum_item, trait_item, impl_item (with type field for both inherent and trait impls), use_declaration, call_expression (for both bare and method calls), macro_invocation, visibility_modifier, line_comment for doc comments, and generic impl blocks. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../core/src/scanner/__tests__/rust.test.ts | 228 ++++++++++++++++++ packages/core/src/scanner/tree-sitter.ts | 2 +- 2 files changed, 229 insertions(+), 1 deletion(-) create mode 100644 packages/core/src/scanner/__tests__/rust.test.ts diff --git a/packages/core/src/scanner/__tests__/rust.test.ts b/packages/core/src/scanner/__tests__/rust.test.ts new file mode 100644 index 0000000..8619d79 --- /dev/null +++ b/packages/core/src/scanner/__tests__/rust.test.ts @@ -0,0 +1,228 @@ +/** + * Rust Scanner Tests + * + * Step 0: Grammar validation — confirms tree-sitter-rust node names + * before building the scanner. Keep this test as a permanent reference. 
+ */ + +import { describe, expect, it } from 'vitest'; +import { parseCode } from '../tree-sitter'; + +// Step 0: Validate tree-sitter-rust grammar node names +describe('Rust grammar validation (Step 0)', () => { + it('should parse function_item', async () => { + const tree = await parseCode('pub fn hello() { }', 'rust'); + const root = tree.rootNode; + const fn = root.namedChildren[0]; + expect(fn.type).toBe('function_item'); + }); + + it('should parse struct_item', async () => { + const tree = await parseCode('pub struct Foo { x: i32 }', 'rust'); + const root = tree.rootNode; + const s = root.namedChildren[0]; + expect(s.type).toBe('struct_item'); + }); + + it('should parse enum_item', async () => { + const tree = await parseCode('pub enum Status { Active, Inactive }', 'rust'); + const root = tree.rootNode; + const e = root.namedChildren[0]; + expect(e.type).toBe('enum_item'); + }); + + it('should parse trait_item', async () => { + const tree = await parseCode('pub trait Handler { fn handle(&self); }', 'rust'); + const root = tree.rootNode; + const t = root.namedChildren[0]; + expect(t.type).toBe('trait_item'); + }); + + it('should parse impl_item with type field', async () => { + const code = ` +impl Foo { + fn bar(&self) {} +}`; + const tree = await parseCode(code, 'rust'); + const root = tree.rootNode; + const impl = root.namedChildren[0]; + expect(impl.type).toBe('impl_item'); + + // The 'type' field should give us the concrete type name + const typeNode = impl.childForFieldName('type'); + expect(typeNode).not.toBeNull(); + expect(typeNode!.text).toBe('Foo'); + }); + + it('should parse impl Trait for Type with type field', async () => { + const code = ` +impl Handler for Server { + fn handle(&self) {} +}`; + const tree = await parseCode(code, 'rust'); + const root = tree.rootNode; + const impl = root.namedChildren[0]; + expect(impl.type).toBe('impl_item'); + + // 'type' field should give concrete type (Server), not the trait + const typeNode = 
impl.childForFieldName('type'); + expect(typeNode).not.toBeNull(); + expect(typeNode!.text).toBe('Server'); + + // 'trait' field should give the trait name + const traitNode = impl.childForFieldName('trait'); + expect(traitNode).not.toBeNull(); + expect(traitNode!.text).toBe('Handler'); + }); + + it('should parse use_declaration', async () => { + const tree = await parseCode('use std::collections::HashMap;', 'rust'); + const root = tree.rootNode; + const use = root.namedChildren[0]; + expect(use.type).toBe('use_declaration'); + }); + + it('should parse call_expression for function calls', async () => { + const code = ` +fn main() { + hello(); +}`; + const tree = await parseCode(code, 'rust'); + const fn = tree.rootNode.namedChildren[0]; + // Walk to find call_expression + const body = fn.childForFieldName('body'); + expect(body).not.toBeNull(); + + function findNodeType(node: typeof fn, type: string): typeof fn | null { + if (node.type === type) return node; + for (const child of node.namedChildren) { + const found = findNodeType(child, type); + if (found) return found; + } + return null; + } + + const call = findNodeType(body!, 'call_expression'); + expect(call).not.toBeNull(); + expect(call!.childForFieldName('function')?.text).toBe('hello'); + }); + + it('should parse call_expression for method calls (field_expression)', async () => { + const code = ` +fn main() { + self.process_request(); +}`; + const tree = await parseCode(code, 'rust'); + + function findNodeType(node: any, type: string): any { + if (node.type === type) return node; + for (const child of node.namedChildren) { + const found = findNodeType(child, type); + if (found) return found; + } + return null; + } + + const call = findNodeType(tree.rootNode, 'call_expression'); + expect(call).not.toBeNull(); + + const funcNode = call.childForFieldName('function'); + expect(funcNode).not.toBeNull(); + expect(funcNode.type).toBe('field_expression'); + expect(funcNode.text).toBe('self.process_request'); + }); 
+ + it('should parse macro_invocation separately from call_expression', async () => { + const code = ` +fn main() { + println!("hello"); + hello(); +}`; + const tree = await parseCode(code, 'rust'); + + function findAllNodeTypes(node: any, type: string, results: any[] = []): any[] { + if (node.type === type) results.push(node); + for (const child of node.namedChildren) { + findAllNodeTypes(child, type, results); + } + return results; + } + + const macros = findAllNodeTypes(tree.rootNode, 'macro_invocation'); + const calls = findAllNodeTypes(tree.rootNode, 'call_expression'); + + expect(macros.length).toBe(1); // println! + expect(calls.length).toBe(1); // hello() + }); + + it('should parse visibility_modifier for pub', async () => { + const code = ` +pub fn public_fn() {} +fn private_fn() {} +pub(crate) fn crate_fn() {}`; + const tree = await parseCode(code, 'rust'); + const fns = tree.rootNode.namedChildren.filter((n: any) => n.type === 'function_item'); + expect(fns.length).toBe(3); + + // pub fn — has visibility_modifier + const pubFn = fns[0]; + const vis0 = pubFn.namedChildren.find((n: any) => n.type === 'visibility_modifier'); + expect(vis0).toBeDefined(); + expect(vis0!.text).toBe('pub'); + + // fn — no visibility_modifier + const privateFn = fns[1]; + const vis1 = privateFn.namedChildren.find((n: any) => n.type === 'visibility_modifier'); + expect(vis1).toBeUndefined(); + + // pub(crate) fn — has visibility_modifier + const crateFn = fns[2]; + const vis2 = crateFn.namedChildren.find((n: any) => n.type === 'visibility_modifier'); + expect(vis2).toBeDefined(); + expect(vis2!.text).toBe('pub(crate)'); + }); + + it('should parse doc comments as line_comment', async () => { + const code = ` +/// This is a doc comment +/// Second line +pub fn documented() {}`; + const tree = await parseCode(code, 'rust'); + const root = tree.rootNode; + + // Doc comments are line_comment nodes before the function + const comments = root.namedChildren.filter((n: any) => n.type === 
'line_comment'); + expect(comments.length).toBe(2); + expect(comments[0].text).toBe('/// This is a doc comment'); + expect(comments[1].text).toBe('/// Second line'); + }); + + it('should detect async function via child nodes', async () => { + const code = 'pub async fn fetch() {}'; + const tree = await parseCode(code, 'rust'); + const fn = tree.rootNode.namedChildren.find((n: any) => n.type === 'function_item'); + expect(fn).toBeDefined(); + + // Check if any child is the 'async' keyword + // tree-sitter-rust may expose it as an anonymous child + const hasAsync = fn!.text.startsWith('pub async') || fn!.text.startsWith('async'); + expect(hasAsync).toBe(true); + }); + + it('should parse generic impl block', async () => { + const code = ` +impl Container { + pub fn show(&self) -> String { + self.value.to_string() + } +}`; + const tree = await parseCode(code, 'rust'); + const impl = tree.rootNode.namedChildren[0]; + expect(impl.type).toBe('impl_item'); + + const typeNode = impl.childForFieldName('type'); + expect(typeNode).not.toBeNull(); + // The type text includes the generic parameter + expect(typeNode!.text).toContain('Container'); + }); +}); diff --git a/packages/core/src/scanner/tree-sitter.ts b/packages/core/src/scanner/tree-sitter.ts index 1e5c857..c5290bd 100644 --- a/packages/core/src/scanner/tree-sitter.ts +++ b/packages/core/src/scanner/tree-sitter.ts @@ -37,7 +37,7 @@ let parserInitialized = false; * 2. Update SUPPORTED_LANGUAGES in packages/dev-agent/scripts/copy-wasm.js * 3. 
Ensure tree-sitter-wasms contains the required WASM file
  */
-export type TreeSitterLanguage = 'go' | 'typescript' | 'tsx' | 'javascript' | 'python';
+export type TreeSitterLanguage = 'go' | 'typescript' | 'tsx' | 'javascript' | 'python' | 'rust';
 
 /**
  * Cache of loaded language grammars

From a192ed2c34f3d9d0e69e793f8962f9d4188f75a9 Mon Sep 17 00:00:00 2001
From: prosdev
Date: Wed, 1 Apr 2026 16:41:22 -0700
Subject: [PATCH 02/11] feat(core): add Rust scanner with full extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RustScanner extracts functions, structs, enums, traits, impl methods,
imports, callees, and doc comments from Rust source files.

- impl_item.type field for method naming (works for both impl Type and
  impl Trait for Type)
- Generic type param stripping: Container<T>.show → Container.show
- Callee extraction via recursive AST walk (call_expression only,
  macro_invocation intentionally excluded)
- Doc comments via /// prefix (line_comment nodes)
- Visibility: pub, pub(crate), pub(super) → exported: true
- Async detection via text inspection before fn keyword
- Generated file skipping: target/ directory
- Malformed file resilience: returns empty, no crash

37 tests: 14 grammar validation + 23 scanner tests covering both
fixtures (simple + complex), impl patterns, generics, closures, macros,
malformed files, and generated file detection.
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../src/scanner/__fixtures__/rust-complex.rs  |  62 +++
 .../scanner/__fixtures__/rust-malformed.rs    |   4 +
 .../src/scanner/__fixtures__/rust-simple.rs   |  54 ++
 .../core/src/scanner/__tests__/rust.test.ts   | 181 ++++++
 packages/core/src/scanner/index.ts            |   7 +-
 packages/core/src/scanner/rust-queries.ts     |  54 ++
 packages/core/src/scanner/rust.ts             | 521 ++++++++++++++++++
 7 files changed, 882 insertions(+), 1 deletion(-)
 create mode 100644 packages/core/src/scanner/__fixtures__/rust-complex.rs
 create mode 100644 packages/core/src/scanner/__fixtures__/rust-malformed.rs
 create mode 100644 packages/core/src/scanner/__fixtures__/rust-simple.rs
 create mode 100644 packages/core/src/scanner/rust-queries.ts
 create mode 100644 packages/core/src/scanner/rust.ts

diff --git a/packages/core/src/scanner/__fixtures__/rust-complex.rs b/packages/core/src/scanner/__fixtures__/rust-complex.rs
new file mode 100644
index 0000000..09f1a90
--- /dev/null
+++ b/packages/core/src/scanner/__fixtures__/rust-complex.rs
@@ -0,0 +1,62 @@
+use std::fmt;
+
+/// Server handles HTTP requests
+pub struct Server {
+    host: String,
+    port: u16,
+}
+
+pub trait Handler {
+    fn handle(&self, request: &str) -> Result<String, Box<dyn std::error::Error>>;
+}
+
+impl Handler for Server {
+    fn handle(&self, request: &str) -> Result<String, Box<dyn std::error::Error>> {
+        let processed = self.process_request(request)?;
+        Ok(processed)
+    }
+}
+
+impl Server {
+    pub fn new(host: &str, port: u16) -> Self {
+        Server { host: host.to_string(), port }
+    }
+
+    fn process_request(&self, data: &str) -> Result<String, Box<dyn std::error::Error>> {
+        let trimmed = data.trim();
+        let result = format!("{}:{} - {}", self.host, self.port, trimmed);
+        Ok(result)
+    }
+}
+
+impl fmt::Display for Server {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "Server({}:{})", self.host, self.port)
+    }
+}
+
+/// Generic container — tests type parameter stripping
+pub struct Container<T: fmt::Display> {
+    value: T,
+}
+
+impl<T: fmt::Display> Container<T> {
+    pub fn show(&self) -> String {
+        self.value.to_string()
+    }
+}
+
+fn transform(input: &str) -> String {
+    input.to_uppercase()
+}
+
+/// Tests callee extraction inside closures
+pub fn process_items(items: Vec<String>) -> Vec<String> {
+    items.iter().map(|x| transform(x)).collect()
+}
+
+/// Tests that field access is NOT a callee
+pub fn read_server_host(s: &Server) -> String {
+    let _host = s.host.clone();
+    s.host.to_uppercase()
+}
diff --git a/packages/core/src/scanner/__fixtures__/rust-malformed.rs b/packages/core/src/scanner/__fixtures__/rust-malformed.rs
new file mode 100644
index 0000000..9df65f9
--- /dev/null
+++ b/packages/core/src/scanner/__fixtures__/rust-malformed.rs
@@ -0,0 +1,4 @@
+fn broken( {
+    // This file intentionally has a syntax error
+    let x =
+}
diff --git a/packages/core/src/scanner/__fixtures__/rust-simple.rs b/packages/core/src/scanner/__fixtures__/rust-simple.rs
new file mode 100644
index 0000000..dcb096f
--- /dev/null
+++ b/packages/core/src/scanner/__fixtures__/rust-simple.rs
@@ -0,0 +1,54 @@
+use std::collections::HashMap;
+use std::io::{self, Read};
+
+/// A simple key-value store
+pub struct Store {
+    data: HashMap<String, String>,
+}
+
+impl Store {
+    /// Create a new empty store
+    pub fn new() -> Self {
+        Store { data: HashMap::new() }
+    }
+
+    /// Get a value by key
+    pub fn get(&self, key: &str) -> Option<&String> {
+        self.data.get(key)
+    }
+
+    fn internal_cleanup(&mut self) {
+        self.data.clear();
+    }
+}
+
+/// Process input from stdin
+pub fn process_input() -> io::Result<String> {
+    let mut buffer = String::new();
+    io::stdin().read_to_string(&mut buffer)?;
+    Ok(buffer)
+}
+
+fn helper() -> bool {
+    true
+}
+
+/// Only visible within the crate
+pub(crate) fn crate_visible() -> bool {
+    helper()
+}
+
+pub enum Status {
+    Active,
+    Inactive,
+    Error(String),
+}
+
+pub trait Processor {
+    fn process(&self, input: &str) -> String;
+}
+
+/// Async function for testing async detection
+pub async fn fetch_data(url: &str) -> Result<String, Box<dyn std::error::Error>> {
+    Ok(url.to_string())
+}
diff --git a/packages/core/src/scanner/__tests__/rust.test.ts
b/packages/core/src/scanner/__tests__/rust.test.ts index 8619d79..69a345c 100644 --- a/packages/core/src/scanner/__tests__/rust.test.ts +++ b/packages/core/src/scanner/__tests__/rust.test.ts @@ -5,7 +5,9 @@ * before building the scanner. Keep this test as a permanent reference. */ +import * as path from 'node:path'; import { describe, expect, it } from 'vitest'; +import { RustScanner } from '../rust'; import { parseCode } from '../tree-sitter'; // Step 0: Validate tree-sitter-rust grammar node names @@ -226,3 +228,182 @@ impl Container { expect(typeNode!.text).toContain('Container'); }); }); + +// ============================================================================ +// RustScanner — Full extraction tests +// ============================================================================ + +const fixturesDir = path.join(__dirname, '..', '__fixtures__'); + +describe('RustScanner', () => { + const scanner = new RustScanner(); + + describe('canHandle', () => { + it('should handle .rs files', () => { + expect(scanner.canHandle('src/main.rs')).toBe(true); + expect(scanner.canHandle('lib.rs')).toBe(true); + }); + + it('should not handle other files', () => { + expect(scanner.canHandle('main.go')).toBe(false); + expect(scanner.canHandle('app.py')).toBe(false); + expect(scanner.canHandle('index.ts')).toBe(false); + }); + }); + + describe('rust-simple.rs', () => { + let docs: Awaited>; + + it('should extract from simple fixture', async () => { + const fs = await import('node:fs'); + const content = fs.readFileSync(path.join(fixturesDir, 'rust-simple.rs'), 'utf-8'); + docs = await scanner.extractFromFile(content, 'rust-simple.rs'); + expect(docs.length).toBeGreaterThan(0); + }); + + it('should extract free functions', () => { + const processInput = docs.find((d) => d.metadata.name === 'process_input'); + expect(processInput).toBeDefined(); + expect(processInput!.type).toBe('function'); + expect(processInput!.language).toBe('rust'); + }); + + it('should detect pub vs 
non-pub', () => { + const pub = docs.find((d) => d.metadata.name === 'process_input'); + const priv = docs.find((d) => d.metadata.name === 'helper'); + expect(pub?.metadata.exported).toBe(true); + expect(priv?.metadata.exported).toBe(false); + }); + + it('should detect pub(crate) as exported', () => { + const crateVis = docs.find((d) => d.metadata.name === 'crate_visible'); + expect(crateVis).toBeDefined(); + expect(crateVis!.metadata.exported).toBe(true); + }); + + it('should extract structs', () => { + const store = docs.find((d) => d.metadata.name === 'Store'); + expect(store).toBeDefined(); + expect(store!.type).toBe('class'); + }); + + it('should extract enums', () => { + const status = docs.find((d) => d.metadata.name === 'Status'); + expect(status).toBeDefined(); + expect(status!.type).toBe('class'); + }); + + it('should extract traits', () => { + const processor = docs.find((d) => d.metadata.name === 'Processor'); + expect(processor).toBeDefined(); + expect(processor!.type).toBe('interface'); + }); + + it('should extract methods from impl blocks', () => { + const newFn = docs.find((d) => d.metadata.name === 'Store.new'); + const getFn = docs.find((d) => d.metadata.name === 'Store.get'); + const cleanup = docs.find((d) => d.metadata.name === 'Store.internal_cleanup'); + expect(newFn).toBeDefined(); + expect(getFn).toBeDefined(); + expect(cleanup).toBeDefined(); + expect(newFn!.type).toBe('method'); + }); + + it('should extract doc comments', () => { + const store = docs.find((d) => d.metadata.name === 'Store'); + expect(store?.metadata.docstring).toBe('A simple key-value store'); + + const newFn = docs.find((d) => d.metadata.name === 'Store.new'); + expect(newFn?.metadata.docstring).toBe('Create a new empty store'); + }); + + it('should extract imports', () => { + const fn = docs.find((d) => d.metadata.name === 'process_input'); + expect(fn?.metadata.imports).toBeDefined(); + expect(fn!.metadata.imports!.some((i) => i.includes('HashMap'))).toBe(true); + }); 
+ + it('should detect async functions', () => { + const fetchData = docs.find((d) => d.metadata.name === 'fetch_data'); + expect(fetchData).toBeDefined(); + expect(fetchData!.metadata.isAsync).toBe(true); + }); + + it('should include signatures', () => { + const fn = docs.find((d) => d.metadata.name === 'process_input'); + expect(fn?.metadata.signature).toContain('fn process_input'); + }); + }); + + describe('rust-complex.rs', () => { + let docs: Awaited>; + + it('should extract from complex fixture', async () => { + const fs = await import('node:fs'); + const content = fs.readFileSync(path.join(fixturesDir, 'rust-complex.rs'), 'utf-8'); + docs = await scanner.extractFromFile(content, 'rust-complex.rs'); + expect(docs.length).toBeGreaterThan(0); + }); + + it('should handle impl Trait for Type — uses concrete type', () => { + const handle = docs.find((d) => d.metadata.name === 'Server.handle'); + expect(handle).toBeDefined(); + expect(handle!.type).toBe('method'); + }); + + it('should handle impl fmt::Display — uses concrete type', () => { + const fmt = docs.find((d) => d.metadata.name === 'Server.fmt'); + expect(fmt).toBeDefined(); + expect(fmt!.type).toBe('method'); + }); + + it('should strip generic type params from impl', () => { + const show = docs.find((d) => d.metadata.name === 'Container.show'); + expect(show).toBeDefined(); + expect(show!.metadata.name).toBe('Container.show'); + // Should NOT be Container.show + expect(show!.metadata.name).not.toContain('<'); + }); + + it('should extract callees from methods', () => { + const handle = docs.find((d) => d.metadata.name === 'Server.handle'); + expect(handle?.metadata.callees).toBeDefined(); + const calleeNames = handle!.metadata.callees!.map((c) => c.name); + expect(calleeNames.some((n) => n.includes('process_request'))).toBe(true); + }); + + it('should extract callees inside closures', () => { + const processItems = docs.find((d) => d.metadata.name === 'process_items'); + 
expect(processItems?.metadata.callees).toBeDefined(); + const calleeNames = processItems!.metadata.callees!.map((c) => c.name); + expect(calleeNames.some((n) => n.includes('transform'))).toBe(true); + }); + + it('should NOT include macros in callees', () => { + // process_request calls format!() — should NOT be in callees + const processReq = docs.find((d) => d.metadata.name === 'Server.process_request'); + if (processReq?.metadata.callees) { + const calleeNames = processReq.metadata.callees.map((c) => c.name); + expect(calleeNames.some((n) => n.includes('format!'))).toBe(false); + } + }); + }); + + describe('malformed file', () => { + it('should return empty documents for malformed Rust file', async () => { + const fs = await import('node:fs'); + const content = fs.readFileSync(path.join(fixturesDir, 'rust-malformed.rs'), 'utf-8'); + const docs = await scanner.extractFromFile(content, 'rust-malformed.rs'); + // Should not crash — may return partial or empty results + expect(Array.isArray(docs)).toBe(true); + }); + }); + + describe('generated file detection', () => { + it('should skip files in target/ directory', async () => { + const files = ['target/debug/build/main.rs']; + const results = await scanner.scan(files, '/fake/root'); + expect(results.length).toBe(0); + }); + }); +}); diff --git a/packages/core/src/scanner/index.ts b/packages/core/src/scanner/index.ts index 62015dc..78947dc 100644 --- a/packages/core/src/scanner/index.ts +++ b/packages/core/src/scanner/index.ts @@ -4,6 +4,7 @@ export { GoScanner } from './go'; export { MarkdownScanner } from './markdown'; export { PythonScanner } from './python'; export { ScannerRegistry } from './registry'; +export { RustScanner } from './rust'; export type { CalleeInfo, CallerInfo, @@ -24,8 +25,9 @@ export { TypeScriptScanner } from './typescript'; import { GoScanner } from './go'; import { MarkdownScanner } from './markdown'; import { PythonScanner } from './python'; -// Create default scanner registry with 
TypeScript, Markdown, Go, and Python +// Create default scanner registry with TypeScript, Markdown, Go, Python, and Rust import { ScannerRegistry } from './registry'; +import { RustScanner } from './rust'; import type { ScanOptions } from './types'; import { TypeScriptScanner } from './typescript'; @@ -47,6 +49,9 @@ export function createDefaultRegistry(): ScannerRegistry { // Register Python scanner registry.register(new PythonScanner()); + // Register Rust scanner + registry.register(new RustScanner()); + return registry; } diff --git a/packages/core/src/scanner/rust-queries.ts b/packages/core/src/scanner/rust-queries.ts new file mode 100644 index 0000000..6168c48 --- /dev/null +++ b/packages/core/src/scanner/rust-queries.ts @@ -0,0 +1,54 @@ +/** + * Tree-sitter queries for Rust code extraction. + * + * All queries validated against tree-sitter-rust grammar via Step 0 tests. + * Node names confirmed: function_item, struct_item, enum_item, trait_item, + * impl_item (with type/trait fields), use_declaration, visibility_modifier. 
+ */ + +export const RUST_QUERIES = { + // Free functions (top-level, not inside impl blocks) + functions: ` + (source_file + (function_item + name: (identifier) @name) @definition) + `, + + // Struct definitions + structs: ` + (struct_item + name: (type_identifier) @name) @definition + `, + + // Enum definitions + enums: ` + (enum_item + name: (type_identifier) @name) @definition + `, + + // Trait definitions + traits: ` + (trait_item + name: (type_identifier) @name) @definition + `, + + // Methods inside impl blocks (captures receiver type + method name) + implMethods: ` + (impl_item + type: (_) @receiver + body: (declaration_list + (function_item + name: (identifier) @name) @definition)) + `, + + // Use declarations (imports) + imports: ` + (use_declaration) @definition + `, + + // Type aliases + typeAliases: ` + (type_item + name: (type_identifier) @name) @definition + `, +}; diff --git a/packages/core/src/scanner/rust.ts b/packages/core/src/scanner/rust.ts new file mode 100644 index 0000000..ec0efdc --- /dev/null +++ b/packages/core/src/scanner/rust.ts @@ -0,0 +1,521 @@ +/** + * Rust language scanner using tree-sitter + * + * Extracts functions, structs, enums, traits, impl methods, imports, + * callees, and doc comments from Rust source files. + * Uses tree-sitter queries for declarative pattern matching. 
+ */ + +import * as path from 'node:path'; +import type { Logger } from '@prosdevlab/kero'; +import { + type FileSystemValidator, + NodeFileSystemValidator, + validateFile, +} from '../utils/file-validator'; +import { RUST_QUERIES } from './rust-queries'; +import type { TreeSitterNode } from './tree-sitter'; +import { initTreeSitter, loadLanguage, type ParsedTree, parseCode } from './tree-sitter'; +import type { CalleeInfo, Document, Scanner, ScannerCapabilities } from './types'; + +/** Generated file patterns to skip */ +const GENERATED_COMMENTS = ['// Code generated', '// DO NOT EDIT', '// Auto-generated']; + +/** + * Rust scanner using tree-sitter for parsing + */ +export class RustScanner implements Scanner { + readonly language = 'rust'; + readonly capabilities: ScannerCapabilities = { + syntax: true, + types: true, + documentation: true, + }; + + private static readonly MAX_SNIPPET_LINES = 50; + private fileValidator: FileSystemValidator; + + constructor(fileValidator: FileSystemValidator = new NodeFileSystemValidator()) { + this.fileValidator = fileValidator; + } + + canHandle(filePath: string): boolean { + return path.extname(filePath).toLowerCase() === '.rs'; + } + + private async validateRustSupport(): Promise { + try { + await initTreeSitter(); + await loadLanguage('rust'); + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error); + if (errorMessage.includes('tree-sitter WASM') || errorMessage.includes('Failed to locate')) { + throw new Error( + 'Rust tree-sitter WASM files not found. ' + + 'tree-sitter-rust.wasm is required for Rust code parsing.' + ); + } + throw error; + } + } + + async scan( + files: string[], + repoRoot: string, + logger?: Logger, + onProgress?: (filesProcessed: number, totalFiles: number) => void + ): Promise { + const documents: Document[] = []; + const total = files.length; + + try { + await this.validateRustSupport(); + } catch (error) { + const errorMessage = error instanceof Error ? 
error.message : String(error); + logger?.error({ error: errorMessage }, 'Rust scanner initialization failed'); + throw error; + } + + const startTime = Date.now(); + let lastLogTime = startTime; + + for (let i = 0; i < total; i++) { + const file = files[i]; + + if (onProgress && i > 0 && i % 50 === 0) { + onProgress(i, total); + } + + const now = Date.now(); + if (logger && i > 0 && (i % 50 === 0 || now - lastLogTime > 10000)) { + lastLogTime = now; + const percent = Math.round((i / total) * 100); + logger.info( + { filesProcessed: i, total, percent, documents: documents.length }, + `rust ${i}/${total} (${percent}%) - ${documents.length} docs` + ); + } + + try { + const absolutePath = path.join(repoRoot, file); + const validation = validateFile(file, absolutePath, this.fileValidator); + if (!validation.isValid) continue; + + const sourceText = this.fileValidator.readText(absolutePath); + + if (this.isGeneratedFile(file, sourceText)) continue; + + const fileDocs = await this.extractFromFile(sourceText, file); + documents.push(...fileDocs); + } catch (error) { + const errorMessage = error instanceof Error ? 
error.message : String(error); + logger?.debug({ file, error: errorMessage }, `Skipped Rust file: ${file}`); + } + } + + logger?.info( + { successCount: documents.length, total }, + `Rust scan complete: ${documents.length} docs from ${total} files` + ); + + return documents; + } + + async extractFromFile(sourceText: string, relativeFile: string): Promise { + const documents: Document[] = []; + + let tree: ParsedTree; + try { + tree = await parseCode(sourceText, 'rust'); + } catch { + // Parse failure (malformed file) — return empty, don't crash + return documents; + } + + // Extract file-level imports + const imports = this.extractImports(tree); + + // Extract free functions (top-level) + documents.push(...this.extractFunctions(tree, sourceText, relativeFile, imports)); + + // Extract structs + documents.push(...this.extractStructs(tree, sourceText, relativeFile)); + + // Extract enums + documents.push(...this.extractEnums(tree, sourceText, relativeFile)); + + // Extract traits + documents.push(...this.extractTraits(tree, sourceText, relativeFile)); + + // Extract methods from impl blocks + documents.push(...this.extractMethods(tree, sourceText, relativeFile, imports)); + + return documents; + } + + // ======================================================================== + // Extraction methods + // ======================================================================== + + private extractFunctions( + tree: ParsedTree, + sourceText: string, + file: string, + imports: string[] + ): Document[] { + const documents: Document[] = []; + + for (const match of tree.query(RUST_QUERIES.functions)) { + const nameCapture = match.captures.find((c) => c.name === 'name'); + const defCapture = match.captures.find((c) => c.name === 'definition'); + if (!nameCapture || !defCapture) continue; + + const name = nameCapture.node.text; + const node = defCapture.node; + const startLine = node.startPosition.row + 1; + const endLine = node.endPosition.row + 1; + const exported = 
this.isExported(node); + const docstring = this.extractDocComment(sourceText, startLine); + const signature = this.extractSignature(node); + const snippet = this.truncateSnippet(node.text); + const callees = this.walkCallNodes(node); + const isAsync = this.isAsyncFunction(node); + + documents.push({ + id: `${file}:${name}:${startLine}`, + text: this.buildEmbeddingText('function', name, signature, docstring), + type: 'function', + language: 'rust', + metadata: { + name, + file, + startLine, + endLine, + exported, + signature, + docstring, + snippet, + imports, + callees: callees.length > 0 ? callees : undefined, + isAsync: isAsync || undefined, + }, + }); + } + + return documents; + } + + private extractStructs(tree: ParsedTree, sourceText: string, file: string): Document[] { + const documents: Document[] = []; + + for (const match of tree.query(RUST_QUERIES.structs)) { + const nameCapture = match.captures.find((c) => c.name === 'name'); + const defCapture = match.captures.find((c) => c.name === 'definition'); + if (!nameCapture || !defCapture) continue; + + const name = nameCapture.node.text; + const node = defCapture.node; + const startLine = node.startPosition.row + 1; + const endLine = node.endPosition.row + 1; + const exported = this.isExported(node); + const docstring = this.extractDocComment(sourceText, startLine); + const signature = this.extractSignature(node); + + documents.push({ + id: `${file}:${name}:${startLine}`, + text: this.buildEmbeddingText('struct', name, signature, docstring), + type: 'class', // Use 'class' for consistency with TS/Python scanners + language: 'rust', + metadata: { + name, + file, + startLine, + endLine, + exported, + signature, + docstring, + }, + }); + } + + return documents; + } + + private extractEnums(tree: ParsedTree, sourceText: string, file: string): Document[] { + const documents: Document[] = []; + + for (const match of tree.query(RUST_QUERIES.enums)) { + const nameCapture = match.captures.find((c) => c.name === 
'name'); + const defCapture = match.captures.find((c) => c.name === 'definition'); + if (!nameCapture || !defCapture) continue; + + const name = nameCapture.node.text; + const node = defCapture.node; + const startLine = node.startPosition.row + 1; + const endLine = node.endPosition.row + 1; + const exported = this.isExported(node); + const docstring = this.extractDocComment(sourceText, startLine); + const signature = this.extractSignature(node); + + documents.push({ + id: `${file}:${name}:${startLine}`, + text: this.buildEmbeddingText('enum', name, signature, docstring), + type: 'class', // Use 'class' for consistency + language: 'rust', + metadata: { + name, + file, + startLine, + endLine, + exported, + signature, + docstring, + }, + }); + } + + return documents; + } + + private extractTraits(tree: ParsedTree, sourceText: string, file: string): Document[] { + const documents: Document[] = []; + + for (const match of tree.query(RUST_QUERIES.traits)) { + const nameCapture = match.captures.find((c) => c.name === 'name'); + const defCapture = match.captures.find((c) => c.name === 'definition'); + if (!nameCapture || !defCapture) continue; + + const name = nameCapture.node.text; + const node = defCapture.node; + const startLine = node.startPosition.row + 1; + const endLine = node.endPosition.row + 1; + const exported = this.isExported(node); + const docstring = this.extractDocComment(sourceText, startLine); + const signature = this.extractSignature(node); + + documents.push({ + id: `${file}:${name}:${startLine}`, + text: this.buildEmbeddingText('trait', name, signature, docstring), + type: 'interface', // Traits map to interfaces + language: 'rust', + metadata: { + name, + file, + startLine, + endLine, + exported, + signature, + docstring, + }, + }); + } + + return documents; + } + + private extractMethods( + tree: ParsedTree, + sourceText: string, + file: string, + imports: string[] + ): Document[] { + const documents: Document[] = []; + + for (const match of 
tree.query(RUST_QUERIES.implMethods)) { + const receiverCapture = match.captures.find((c) => c.name === 'receiver'); + const nameCapture = match.captures.find((c) => c.name === 'name'); + const defCapture = match.captures.find((c) => c.name === 'definition'); + if (!receiverCapture || !nameCapture || !defCapture) continue; + + // Strip generic type params: Container → Container + const receiverType = receiverCapture.node.text.replace(/<.*>/, ''); + const methodName = nameCapture.node.text; + const qualifiedName = `${receiverType}.${methodName}`; + const node = defCapture.node; + const startLine = node.startPosition.row + 1; + const endLine = node.endPosition.row + 1; + const exported = this.isExported(node); + const docstring = this.extractDocComment(sourceText, startLine); + const signature = this.extractSignature(node); + const snippet = this.truncateSnippet(node.text); + const callees = this.walkCallNodes(node); + const isAsync = this.isAsyncFunction(node); + + documents.push({ + id: `${file}:${qualifiedName}:${startLine}`, + text: this.buildEmbeddingText('method', qualifiedName, signature, docstring), + type: 'method', + language: 'rust', + metadata: { + name: qualifiedName, + file, + startLine, + endLine, + exported, + signature, + docstring, + snippet, + imports, + callees: callees.length > 0 ? callees : undefined, + isAsync: isAsync || undefined, + }, + }); + } + + return documents; + } + + private extractImports(tree: ParsedTree): string[] { + const imports: string[] = []; + for (const match of tree.query(RUST_QUERIES.imports)) { + const defCapture = match.captures.find((c) => c.name === 'definition'); + if (defCapture) { + imports.push(defCapture.node.text); + } + } + return imports; + } + + // ======================================================================== + // Callee extraction + // ======================================================================== + + /** + * Walk AST nodes recursively to find all call_expression nodes. 
+ * Skips macro_invocation nodes (println!, vec!, format!, etc.). + */ + private walkCallNodes(node: TreeSitterNode): CalleeInfo[] { + const callees: CalleeInfo[] = []; + const seen = new Set(); + + function walk(n: TreeSitterNode) { + if (n.type === 'call_expression') { + const funcNode = n.childForFieldName('function'); + if (funcNode) { + const name = funcNode.text; + const line = n.startPosition.row + 1; + const key = `${name}:${line}`; + if (!seen.has(key)) { + seen.add(key); + callees.push({ name, line }); + } + } + } + // Recurse into children (but NOT into macro_invocation — those are skipped) + for (const child of n.namedChildren) { + walk(child); + } + } + + walk(node); + return callees; + } + + // ======================================================================== + // Helpers + // ======================================================================== + + /** + * Check if a node has a visibility_modifier child (pub, pub(crate), etc.) + */ + private isExported(node: TreeSitterNode): boolean { + return node.namedChildren.some((c) => c.type === 'visibility_modifier'); + } + + /** + * Check if a function is async by looking for 'async' in the function text + * before the 'fn' keyword. tree-sitter-rust includes 'async' as part of + * the function_item text. + */ + private isAsyncFunction(node: TreeSitterNode): boolean { + // Check the text before 'fn' for the async keyword + const text = node.text; + const fnIndex = text.indexOf('fn '); + if (fnIndex <= 0) return false; + return text.slice(0, fnIndex).includes('async'); + } + + /** + * Extract doc comment (/// lines) preceding a node. + * Walks backwards from the line before the node, collecting /// comments. 
+ */ + private extractDocComment(sourceText: string, nodeStartLine: number): string | undefined { + const lines = sourceText.split('\n'); + const docLines: string[] = []; + + // Walk backwards from the line before the node + for (let i = nodeStartLine - 2; i >= 0; i--) { + const line = lines[i].trim(); + + if (line.startsWith('///')) { + // Strip /// prefix and trim + const commentText = line.slice(3).trim(); + docLines.unshift(commentText); + } else if (line.startsWith('#[')) { + } else if (line === '') { + // Empty line — stop if we have comments, otherwise continue + if (docLines.length > 0) break; + } else { + // Non-comment, non-attribute, non-empty — stop + break; + } + } + + return docLines.length > 0 ? docLines.join('\n') : undefined; + } + + /** + * Extract the signature line from a node. + * Skips attribute lines (#[...]) to find the actual fn/struct/enum/trait line. + */ + private extractSignature(node: TreeSitterNode): string { + const lines = node.text.split('\n'); + for (const line of lines) { + const trimmed = line.trim(); + if ( + trimmed.startsWith('pub ') || + trimmed.startsWith('pub(') || + trimmed.startsWith('fn ') || + trimmed.startsWith('async ') || + trimmed.startsWith('struct ') || + trimmed.startsWith('enum ') || + trimmed.startsWith('trait ') || + trimmed.startsWith('type ') + ) { + // Return up to the opening brace or end of line + const braceIndex = trimmed.indexOf('{'); + return braceIndex > 0 ? 
trimmed.slice(0, braceIndex).trim() : trimmed; + } + } + // Fallback: first line + return lines[0].trim(); + } + + /** + * Truncate a code snippet to MAX_SNIPPET_LINES + */ + private truncateSnippet(text: string): string { + const lines = text.split('\n'); + if (lines.length <= RustScanner.MAX_SNIPPET_LINES) return text; + return lines.slice(0, RustScanner.MAX_SNIPPET_LINES).join('\n') + '\n// ...'; + } + + private buildEmbeddingText( + type: string, + name: string, + signature: string, + docstring?: string + ): string { + const parts = [`${type} ${name}`, signature]; + if (docstring) parts.push(docstring); + return parts.join('\n'); + } + + private isGeneratedFile(filePath: string, sourceText: string): boolean { + // Skip files in target/ directory (build output) + if (filePath.includes('/target/') || filePath.startsWith('target/')) return true; + + const firstLines = sourceText.split('\n').slice(0, 3).join('\n'); + return GENERATED_COMMENTS.some((c) => firstLines.includes(c)); + } +} From 471cc480c7c681f0bec8fbfa70662e91cabd7dfd Mon Sep 17 00:00:00 2001 From: prosdev Date: Wed, 1 Apr 2026 16:46:39 -0700 Subject: [PATCH 03/11] feat(core): add Go and Rust pattern rules and pipeline wiring - Go pattern rules: if err != nil, defer, goroutine, channel send - Rust pattern rules: try operator, match expression, unsafe block, impl block, trait definition - Add .go and .rs to EXTENSION_TO_LANGUAGE (fixes: Go patterns never fired) - Add 'rust' to supportedLanguages in wasm-matcher - Add go/rust to QUERIES_BY_LANGUAGE map in pattern-analysis-service - Add 'rust' to SUPPORTED_LANGUAGES in copy-wasm.js - Add Rust test file detection (tests/ dir, _test.rs) to test-utils - Fix tests that used .rs as unsupported extension example Co-Authored-By: Claude Opus 4.6 (1M context) --- .../__tests__/wasm-matcher.test.ts | 15 +++- packages/core/src/pattern-matcher/rules.ts | 79 +++++++++++++++++++ .../core/src/pattern-matcher/wasm-matcher.ts | 11 ++- 
.../src/services/pattern-analysis-service.ts | 9 ++- packages/core/src/utils/test-utils.ts | 7 ++ packages/dev-agent/scripts/copy-wasm.js | 2 +- 6 files changed, 116 insertions(+), 7 deletions(-) diff --git a/packages/core/src/pattern-matcher/__tests__/wasm-matcher.test.ts b/packages/core/src/pattern-matcher/__tests__/wasm-matcher.test.ts index 56cb6cc..cde13c8 100644 --- a/packages/core/src/pattern-matcher/__tests__/wasm-matcher.test.ts +++ b/packages/core/src/pattern-matcher/__tests__/wasm-matcher.test.ts @@ -253,7 +253,7 @@ function App() { }); it('returns empty map for unsupported language', async () => { - const results = await matcher.match('fn main() {}', 'rust', ERROR_HANDLING_QUERIES); + const results = await matcher.match('fn main() {}', 'dart', ERROR_HANDLING_QUERIES); expect(results.size).toBe(0); }); }); @@ -346,10 +346,17 @@ describe('resolveLanguage', () => { expect(resolveLanguage('components/App.jsx')).toBe('javascript'); }); + it('maps .go to go', () => { + expect(resolveLanguage('main.go')).toBe('go'); + }); + + it('maps .rs to rust', () => { + expect(resolveLanguage('main.rs')).toBe('rust'); + }); + it('returns undefined for unsupported extensions', () => { - expect(resolveLanguage('main.py')).toBe('python'); - expect(resolveLanguage('main.go')).toBeUndefined(); // Go has scanner, not pattern matcher expect(resolveLanguage('README.md')).toBeUndefined(); + expect(resolveLanguage('main.dart')).toBeUndefined(); }); }); @@ -443,7 +450,7 @@ describe('extractErrorHandlingWithAst', () => { it('unsupported extension → runAllAstQueries returns empty → regex', async () => { const source = 'throw new Error("bad");'; - const ast = await runAllAstQueries(source, 'test.rs', matcher); + const ast = await runAllAstQueries(source, 'test.dart', matcher); expect(ast.size).toBe(0); // unsupported language expect(extractErrorHandlingWithAst(source, ast)).toEqual( extractErrorHandlingFromContent(source) diff --git a/packages/core/src/pattern-matcher/rules.ts 
b/packages/core/src/pattern-matcher/rules.ts index ccacc0d..f04ee94 100644 --- a/packages/core/src/pattern-matcher/rules.ts +++ b/packages/core/src/pattern-matcher/rules.ts @@ -189,3 +189,82 @@ export const ALL_PYTHON_QUERIES: PatternMatchRule[] = [ ...PYTHON_IMPORT_QUERIES, ...PYTHON_TYPE_QUERIES, ]; + +// ============================================================================ +// Go Error Handling + Concurrency (5 rules) +// ============================================================================ + +export const GO_ERROR_HANDLING_QUERIES: PatternMatchRule[] = [ + { + id: 'go-if-err', + category: 'error-handling', + query: '(if_statement condition: (binary_expression right: (nil))) @match', + }, + { + id: 'go-defer', + category: 'error-handling', + query: '(defer_statement) @match', + }, +]; + +export const GO_CONCURRENCY_QUERIES: PatternMatchRule[] = [ + { + id: 'go-goroutine', + category: 'concurrency', + query: '(go_statement) @match', + }, + { + id: 'go-channel-send', + category: 'concurrency', + query: '(send_statement) @match', + }, +]; + +export const ALL_GO_QUERIES: PatternMatchRule[] = [ + ...GO_ERROR_HANDLING_QUERIES, + ...GO_CONCURRENCY_QUERIES, +]; + +// ============================================================================ +// Rust Error Handling + Unsafe + Types (5 rules) +// ============================================================================ + +export const RUST_ERROR_HANDLING_QUERIES: PatternMatchRule[] = [ + { + id: 'rust-try-operator', + category: 'error-handling', + query: '(try_expression) @match', + }, + { + id: 'rust-match', + category: 'error-handling', + query: '(match_expression) @match', + }, +]; + +export const RUST_UNSAFE_QUERIES: PatternMatchRule[] = [ + { + id: 'rust-unsafe-block', + category: 'unsafe', + query: '(unsafe_block) @match', + }, +]; + +export const RUST_TYPE_QUERIES: PatternMatchRule[] = [ + { + id: 'rust-impl-block', + category: 'types', + query: '(impl_item) @match', + }, + { + id: 
'rust-trait-def', + category: 'types', + query: '(trait_item) @match', + }, +]; + +export const ALL_RUST_QUERIES: PatternMatchRule[] = [ + ...RUST_ERROR_HANDLING_QUERIES, + ...RUST_UNSAFE_QUERIES, + ...RUST_TYPE_QUERIES, +]; diff --git a/packages/core/src/pattern-matcher/wasm-matcher.ts b/packages/core/src/pattern-matcher/wasm-matcher.ts index 27163e4..17db697 100644 --- a/packages/core/src/pattern-matcher/wasm-matcher.ts +++ b/packages/core/src/pattern-matcher/wasm-matcher.ts @@ -41,6 +41,8 @@ const EXTENSION_TO_LANGUAGE: Record = { '.js': 'javascript', '.jsx': 'javascript', '.py': 'python', + '.go': 'go', + '.rs': 'rust', }; /** @@ -62,7 +64,14 @@ class WasmPatternMatcher implements PatternMatcher { queries: PatternMatchRule[] ): Promise> { // Validate language is supported - const supportedLanguages = new Set(['typescript', 'tsx', 'javascript', 'go', 'python']); + const supportedLanguages = new Set([ + 'typescript', + 'tsx', + 'javascript', + 'go', + 'python', + 'rust', + ]); if (!supportedLanguages.has(language)) { return new Map(); } diff --git a/packages/core/src/services/pattern-analysis-service.ts b/packages/core/src/services/pattern-analysis-service.ts index 4c84e82..3602140 100644 --- a/packages/core/src/services/pattern-analysis-service.ts +++ b/packages/core/src/services/pattern-analysis-service.ts @@ -7,7 +7,12 @@ import * as fs from 'node:fs/promises'; import * as path from 'node:path'; -import { ALL_PYTHON_QUERIES, ALL_QUERIES } from '../pattern-matcher/rules'; +import { + ALL_GO_QUERIES, + ALL_PYTHON_QUERIES, + ALL_QUERIES, + ALL_RUST_QUERIES, +} from '../pattern-matcher/rules'; import type { PatternMatcher, PatternMatchRule } from '../pattern-matcher/wasm-matcher'; import { resolveLanguage } from '../pattern-matcher/wasm-matcher'; @@ -20,6 +25,8 @@ const QUERIES_BY_LANGUAGE: Record = { tsx: ALL_QUERIES, javascript: ALL_QUERIES, python: ALL_PYTHON_QUERIES, + go: ALL_GO_QUERIES, + rust: ALL_RUST_QUERIES, }; import { scanRepository } from 
'../scanner'; diff --git a/packages/core/src/utils/test-utils.ts b/packages/core/src/utils/test-utils.ts index 8d5a3a7..5deb5d9 100644 --- a/packages/core/src/utils/test-utils.ts +++ b/packages/core/src/utils/test-utils.ts @@ -23,6 +23,8 @@ const TEST_PATTERNS: Record boolean> = { const name = path.basename(f); return name.startsWith('test_') || name.endsWith('_test.py') || name === 'conftest.py'; }, + // Rust: integration tests in tests/ directory, or _test.rs convention + rs: (f) => f.includes('/tests/') || path.basename(f).endsWith('_test.rs'), }; /** @@ -46,6 +48,11 @@ const TEST_PATH_GENERATORS: Record string const name = path.basename(base); return [path.join(dir, `test_${name}.py`), path.join(dir, `${name}_test.py`)]; }, + rs: (base, _ext) => { + const dir = path.dirname(base); + const name = path.basename(base, '.rs'); + return [path.join(dir, '..', 'tests', `${name}.rs`), path.join(dir, `${name}_test.rs`)]; + }, }; /** diff --git a/packages/dev-agent/scripts/copy-wasm.js b/packages/dev-agent/scripts/copy-wasm.js index 99dddc3..08bed26 100644 --- a/packages/dev-agent/scripts/copy-wasm.js +++ b/packages/dev-agent/scripts/copy-wasm.js @@ -95,7 +95,7 @@ if (!fs.existsSync(wasmSourceDir)) { // 3. Ensure tree-sitter-wasms package contains tree-sitter-{lang}.wasm // 4. Create a language-specific scanner in packages/core/src/scanner/{lang}.ts // 5. 
Update scanner registration in packages/core/src/scanner/index.ts -const SUPPORTED_LANGUAGES = ['go', 'typescript', 'tsx', 'javascript', 'python']; +const SUPPORTED_LANGUAGES = ['go', 'typescript', 'tsx', 'javascript', 'python', 'rust']; const SUPPORTED_FILES = new Set([ ...SUPPORTED_LANGUAGES.map((lang) => `tree-sitter-${lang}.wasm`), 'tree-sitter.wasm', // Runtime if present From f2842b6acea33c1b771bba2fbbae106ffc8c0242 Mon Sep 17 00:00:00 2001 From: prosdev Date: Wed, 1 Apr 2026 16:50:28 -0700 Subject: [PATCH 04/11] feat(core): add Go callee extraction Add walkCallNodes to GoScanner for function and method callee extraction. Uses full selector text ("fmt.Println" not "Println") matching TS scanner. 6 new tests: callee extraction, full selector names, methods, line numbers, deduplication, no callees on structs. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../scanner/__tests__/fixtures/go/callees.go | 33 ++++++++++++ .../core/src/scanner/__tests__/go.test.ts | 54 +++++++++++++++++++ packages/core/src/scanner/go.ts | 39 +++++++++++++- 3 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 packages/core/src/scanner/__tests__/fixtures/go/callees.go diff --git a/packages/core/src/scanner/__tests__/fixtures/go/callees.go b/packages/core/src/scanner/__tests__/fixtures/go/callees.go new file mode 100644 index 0000000..52736bf --- /dev/null +++ b/packages/core/src/scanner/__tests__/fixtures/go/callees.go @@ -0,0 +1,33 @@ +package main + +import ( + "fmt" + "os" + "strings" +) + +func processInput(input string) string { + trimmed := strings.TrimSpace(input) + fmt.Println("Processing:", trimmed) + return trimmed +} + +func main() { + result := processInput(os.Args[1]) + fmt.Println(result) + os.Exit(0) +} + +type Server struct { + host string +} + +func (s *Server) Start() error { + fmt.Println("Starting server on", s.host) + return nil +} + +func (s *Server) handleRequest(data string) { + processed := processInput(data) + fmt.Println("Handled:", 
processed) +} diff --git a/packages/core/src/scanner/__tests__/go.test.ts b/packages/core/src/scanner/__tests__/go.test.ts index b93d68f..c716a85 100644 --- a/packages/core/src/scanner/__tests__/go.test.ts +++ b/packages/core/src/scanner/__tests__/go.test.ts @@ -477,4 +477,58 @@ describe('GoScanner', () => { }); }); }); + + describe('callee extraction', () => { + let calleeDocs: Document[]; + + beforeAll(async () => { + calleeDocs = await scanner.scan(['callees.go'], fixturesDir); + }); + + it('should extract callees from functions', () => { + const processInput = calleeDocs.find((d) => d.metadata.name === 'processInput'); + expect(processInput).toBeDefined(); + expect(processInput!.metadata.callees).toBeDefined(); + expect(processInput!.metadata.callees!.length).toBeGreaterThan(0); + }); + + it('should use full selector text for qualified calls', () => { + const processInput = calleeDocs.find((d) => d.metadata.name === 'processInput'); + const calleeNames = processInput!.metadata.callees!.map((c) => c.name); + // Should be "fmt.Println" not just "Println" + expect(calleeNames.some((n) => n === 'fmt.Println')).toBe(true); + expect(calleeNames.some((n) => n === 'strings.TrimSpace')).toBe(true); + }); + + it('should extract callees from methods', () => { + const start = calleeDocs.find((d) => d.metadata.name === 'Server.Start'); + expect(start).toBeDefined(); + expect(start!.metadata.callees).toBeDefined(); + const calleeNames = start!.metadata.callees!.map((c) => c.name); + expect(calleeNames.some((n) => n === 'fmt.Println')).toBe(true); + }); + + it('should include callee line numbers', () => { + const main = calleeDocs.find((d) => d.metadata.name === 'main'); + expect(main!.metadata.callees).toBeDefined(); + for (const callee of main!.metadata.callees!) 
{ + expect(callee.line).toBeGreaterThan(0); + } + }); + + it('should deduplicate callees at same line', () => { + const main = calleeDocs.find((d) => d.metadata.name === 'main'); + const seen = new Set(); + for (const callee of main!.metadata.callees!) { + const key = `${callee.name}:${callee.line}`; + expect(seen.has(key)).toBe(false); + seen.add(key); + } + }); + + it('should not have callees for structs', () => { + const server = calleeDocs.find((d) => d.metadata.name === 'Server' && d.type === 'class'); + expect(server?.metadata.callees).toBeUndefined(); + }); + }); }); diff --git a/packages/core/src/scanner/go.ts b/packages/core/src/scanner/go.ts index bde6788..bf02d0c 100644 --- a/packages/core/src/scanner/go.ts +++ b/packages/core/src/scanner/go.ts @@ -12,6 +12,7 @@ import { NodeFileSystemValidator, validateFile, } from '../utils/file-validator'; +import type { TreeSitterNode } from './tree-sitter'; import { extractGoDocComment, initTreeSitter, @@ -19,7 +20,7 @@ import { type ParsedTree, parseCode, } from './tree-sitter'; -import type { Document, Scanner, ScannerCapabilities } from './types'; +import type { CalleeInfo, Document, Scanner, ScannerCapabilities } from './types'; /** * Tree-sitter queries for Go code extraction @@ -383,6 +384,8 @@ export class GoScanner implements Scanner { // Check for generics const { isGeneric, typeParameters } = this.extractTypeParameters(signature); + const callees = this.walkCallNodes(defCapture.node); + documents.push({ id: `${file}:${name}:${startLine}`, text: this.buildEmbeddingText('function', name, signature, docstring), @@ -397,6 +400,7 @@ export class GoScanner implements Scanner { exported, docstring, snippet, + callees: callees.length > 0 ? callees : undefined, custom: { ...(isTestFile ? { isTest: true } : {}), ...(isGeneric ? 
{ isGeneric, typeParameters } : {}), @@ -451,6 +455,8 @@ export class GoScanner implements Scanner { this.extractTypeParameters(signature); const isGeneric = receiverHasGenerics || signatureHasGenerics; + const callees = this.walkCallNodes(defCapture.node); + documents.push({ id: `${file}:${name}:${startLine}`, text: this.buildEmbeddingText('method', name, signature, docstring), @@ -465,6 +471,7 @@ export class GoScanner implements Scanner { exported, docstring, snippet, + callees: callees.length > 0 ? callees : undefined, custom: { receiver: baseReceiverType, receiverPointer, @@ -700,6 +707,36 @@ export class GoScanner implements Scanner { /** * Check if a Go identifier is exported (starts with uppercase) */ + /** + * Walk AST nodes recursively to find all call_expression nodes. + * Returns full selector text (e.g., "fmt.Println" not just "Println"). + */ + private walkCallNodes(node: TreeSitterNode): CalleeInfo[] { + const callees: CalleeInfo[] = []; + const seen = new Set(); + + function walk(n: TreeSitterNode) { + if (n.type === 'call_expression') { + const funcNode = n.childForFieldName('function'); + if (funcNode) { + const name = funcNode.text; + const line = n.startPosition.row + 1; + const key = `${name}:${line}`; + if (!seen.has(key)) { + seen.add(key); + callees.push({ name, line }); + } + } + } + for (const child of n.namedChildren) { + walk(child); + } + } + + walk(node); + return callees; + } + private isExported(name: string): boolean { if (!name || name.length === 0) return false; const firstChar = name.charAt(0); From b00c2e44d6f9d7ad50a8e46f7d50db6b6f851abf Mon Sep 17 00:00:00 2001 From: prosdev Date: Wed, 1 Apr 2026 16:52:04 -0700 Subject: [PATCH 05/11] docs: update language lists and add changelog for Phase 5 Add Rust to CLAUDE.md scanner description, website Multi-Language feature list, release notes (v0.12.0), and changeset. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .changeset/go-rust-support.md | 12 ++++++++++++ CLAUDE.md | 2 +- website/content/index.mdx | 2 +- website/content/latest-version.ts | 8 ++++---- website/content/updates/index.mdx | 17 +++++++++++++++++ 5 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 .changeset/go-rust-support.md diff --git a/.changeset/go-rust-support.md b/.changeset/go-rust-support.md new file mode 100644 index 0000000..5aa83da --- /dev/null +++ b/.changeset/go-rust-support.md @@ -0,0 +1,12 @@ +--- +'@prosdevlab/dev-agent': minor +--- + +Go callee extraction and Rust language support + +- Rust: full scanner — functions, structs, enums, traits, impl methods, imports, callees, doc comments +- Rust: pattern rules — try operator, match expression, unsafe block, impl/trait definitions +- Go: callee extraction for functions and methods — dev_refs now traces Go call chains +- Go: pattern rules — error handling (if err != nil), goroutines, defer, channels +- Generic impl type parameter stripping (Container<T>.show → Container.show) +- All MCP tools (dev_search, dev_refs, dev_map, dev_patterns) work with Go callees and Rust diff --git a/CLAUDE.md b/CLAUDE.md index 718c2e0..1e33e1a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -21,7 +21,7 @@ Everything runs on your machine. No data leaves.
``` packages/ - core/ # Scanner (ts-morph, tree-sitter for Python/Go), vector storage (Antfly), services + core/ # Scanner (ts-morph, tree-sitter for Python/Go/Rust), vector storage (Antfly), services cli/ # Commander.js CLI — dev index, dev search, dev refs, dev map, dev mcp install mcp-server/ # MCP server with 5 built-in adapters subagents/ # Coordinator, explorer, planner, PR agents diff --git a/website/content/index.mdx b/website/content/index.mdx index ad3207e..3d66b79 100644 --- a/website/content/index.mdx +++ b/website/content/index.mdx @@ -101,7 +101,7 @@ dev mcp install # For Claude Code - **Hybrid Search** — BM25 keyword + vector semantic, fused with RRF - **Code Snippets** — Search returns actual code, not just file paths - **Call Graph** — Callers/callees extracted from AST at index time -- **Multi-Language** — TypeScript, JavaScript, Python, Go, Markdown +- **Multi-Language** — TypeScript, JavaScript, Python, Go, Rust, Markdown - **100% Local** — Antfly runs on your machine. No data leaves. - **Auto-Index** — File watcher re-indexes on save while MCP server runs - **1,600+ Tests** — Production-grade reliability diff --git a/website/content/latest-version.ts b/website/content/latest-version.ts index a7ab639..93998ca 100644 --- a/website/content/latest-version.ts +++ b/website/content/latest-version.ts @@ -4,10 +4,10 @@ */ export const latestVersion = { - version: '0.11.2', - title: 'dev refs CLI Command', + version: '0.12.0', + title: 'Go Callees + Rust Language Support', date: 'April 1, 2026', summary: - 'Find callers and callees from the terminal — dev refs . Plus callee path normalization so hot paths show source files.', - link: '/updates#v0112--dev-refs-cli-command', + 'Index Rust codebases — functions, structs, traits, impl methods, callees. Go call graph tracing. 
All MCP tools work with both languages.', + link: '/updates#v0120--go-callees--rust-language-support', } as const; diff --git a/website/content/updates/index.mdx b/website/content/updates/index.mdx index f3e3ffe..42eb60f 100644 --- a/website/content/updates/index.mdx +++ b/website/content/updates/index.mdx @@ -9,6 +9,23 @@ What's new in dev-agent. We ship improvements regularly to help AI assistants un --- +## v0.12.0 — Go Callees + Rust Language Support + +*April 1, 2026* + +**dev-agent now indexes Rust codebases and traces Go call graphs.** + +- **Rust scanner:** functions, structs, enums, traits, impl methods, imports, callees, doc comments +- **Rust patterns:** try operator (`?`), match expressions, unsafe blocks, impl/trait definitions +- **Go callees:** `dev_refs` now traces Go call chains (was extraction-only, no call graph) +- **Go patterns:** error handling (`if err != nil`), goroutines, defer, channels +- Generic impl type parameter stripping: `Container<T>.show` → `Container.show` +- Macros intentionally excluded from callees (they're `macro_invocation`, not function calls) +- Malformed file resilience: scanner returns empty, no crash +- 43 new tests across both languages + +--- + ## v0.11.2 — `dev refs` CLI Command *April 1, 2026* From 3c5066fa2c8cb1b4b4287b944154f213fe06915f Mon Sep 17 00:00:00 2001 From: prosdev Date: Wed, 1 Apr 2026 16:55:36 -0700 Subject: [PATCH 06/11] =?UTF-8?q?fix(core):=20address=20code=20review=20?= =?UTF-8?q?=E2=80=94=20macro=20skip,=20orphaned=20JSDoc,=20comments?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Actually skip macro_invocation in Rust walkCallNodes (prevents capturing calls inside macros like vec![foo()]) - Fix orphaned JSDoc comment in go.ts (was between walkCallNodes and isExported after insertion) - Add comment explaining attribute skip in doc comment extraction Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/core/src/scanner/go.ts | 4 +---
packages/core/src/scanner/rust.ts | 6 +++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/packages/core/src/scanner/go.ts b/packages/core/src/scanner/go.ts index bf02d0c..e6cbc12 100644 --- a/packages/core/src/scanner/go.ts +++ b/packages/core/src/scanner/go.ts @@ -704,9 +704,6 @@ export class GoScanner implements Scanner { return documents; } - /** - * Check if a Go identifier is exported (starts with uppercase) - */ /** * Walk AST nodes recursively to find all call_expression nodes. * Returns full selector text (e.g., "fmt.Println" not just "Println"). @@ -737,6 +734,7 @@ export class GoScanner implements Scanner { return callees; } + /** Check if a Go identifier is exported (starts with uppercase) */ private isExported(name: string): boolean { if (!name || name.length === 0) return false; const firstChar = name.charAt(0); diff --git a/packages/core/src/scanner/rust.ts b/packages/core/src/scanner/rust.ts index ec0efdc..9cb69a6 100644 --- a/packages/core/src/scanner/rust.ts +++ b/packages/core/src/scanner/rust.ts @@ -401,7 +401,10 @@ export class RustScanner implements Scanner { } } } - // Recurse into children (but NOT into macro_invocation — those are skipped) + // Skip macro_invocation entirely — macros (println!, vec!, format!) are not function calls. + // Without this, calls INSIDE macros (e.g., vec![foo()]) would be captured. + if (n.type === 'macro_invocation') return; + for (const child of n.namedChildren) { walk(child); } @@ -452,6 +455,7 @@ export class RustScanner implements Scanner { const commentText = line.slice(3).trim(); docLines.unshift(commentText); } else if (line.startsWith('#[')) { + // Skip attributes (#[derive], #[cfg], etc.) 
between doc comments and the item } else if (line === '') { // Empty line — stop if we have comments, otherwise continue if (docLines.length > 0) break; From e589e404db7ab906bd04c38b0c69691c25ccb316 Mon Sep 17 00:00:00 2001 From: prosdev Date: Wed, 1 Apr 2026 17:08:59 -0700 Subject: [PATCH 07/11] =?UTF-8?q?fix(core):=20address=20Rust=20expert=20re?= =?UTF-8?q?view=20=E2=80=94=20mod=20blocks,=20nested=20generics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit W1: Remove source_file anchoring from functions query so functions inside mod blocks are captured. Filter impl methods by checking parent chain (declaration_list > impl_item), not just declaration_list. W3: Fix greedy generic stripping — use split('<')[0] instead of regex replace. Handles nested generics like Wrapper<Option<String>>. 2 new tests: functions inside mod blocks (pub + private), nested generic type param stripping. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/scanner/__fixtures__/rust-complex.rs | 22 +++++++++++++++++++ .../core/src/scanner/__tests__/rust.test.ts | 21 ++++++++++++++++++ packages/core/src/scanner/rust-queries.ts | 9 ++++---- packages/core/src/scanner/rust.ts | 9 ++++++-- 4 files changed, 55 insertions(+), 6 deletions(-) diff --git a/packages/core/src/scanner/__fixtures__/rust-complex.rs b/packages/core/src/scanner/__fixtures__/rust-complex.rs index 09f1a90..9cd2bbf 100644 --- a/packages/core/src/scanner/__fixtures__/rust-complex.rs +++ b/packages/core/src/scanner/__fixtures__/rust-complex.rs @@ -60,3 +60,25 @@ pub fn read_server_host(s: &Server) -> String { let _host = s.host.clone(); s.host.to_uppercase() } + +// Tests mod block support — functions inside mod blocks must be captured +mod handlers { + pub fn handle_request(data: &str) -> String { + data.to_uppercase() + } + + fn internal_helper() -> bool { + true + } +} + +// Tests nested generic stripping +pub struct Wrapper<T> { + inner: Option<T>, +} + +impl Wrapper<Option<String>> { + pub fn
unwrap_display(&self) -> String { + format!("{:?}", self.inner) + } +} diff --git a/packages/core/src/scanner/__tests__/rust.test.ts b/packages/core/src/scanner/__tests__/rust.test.ts index 69a345c..453996a 100644 --- a/packages/core/src/scanner/__tests__/rust.test.ts +++ b/packages/core/src/scanner/__tests__/rust.test.ts @@ -379,6 +379,27 @@ describe('RustScanner', () => { expect(calleeNames.some((n) => n.includes('transform'))).toBe(true); }); + it('should extract functions inside mod blocks', () => { + const handleReq = docs.find( + (d) => d.metadata.name === 'handle_request' && d.type === 'function' + ); + expect(handleReq).toBeDefined(); + expect(handleReq!.metadata.exported).toBe(true); + + const helper = docs.find((d) => d.metadata.name === 'internal_helper'); + expect(helper).toBeDefined(); + expect(helper!.metadata.exported).toBe(false); + }); + + it('should strip nested generic type params from impl', () => { + // impl Wrapper<Option<String>> → Wrapper, not Wrapper<Option<String>> or Wrapper<Option> + const method = docs.find((d) => d.metadata.name === 'Wrapper.unwrap_display'); + expect(method).toBeDefined(); + expect(method!.metadata.name).toBe('Wrapper.unwrap_display'); + expect(method!.metadata.name).not.toContain('<'); + expect(method!.metadata.name).not.toContain('>'); + }); + it('should NOT include macros in callees', () => { // process_request calls format!() — should NOT be in callees const processReq = docs.find((d) => d.metadata.name === 'Server.process_request'); diff --git a/packages/core/src/scanner/rust-queries.ts b/packages/core/src/scanner/rust-queries.ts index 6168c48..14bda51 100644 --- a/packages/core/src/scanner/rust-queries.ts +++ b/packages/core/src/scanner/rust-queries.ts @@ -7,11 +7,12 @@ */ export const RUST_QUERIES = { - // Free functions (top-level, not inside impl blocks) + // All function_item nodes at any depth (including inside mod blocks).
+ // Methods inside impl blocks are filtered out in the scanner code + // by checking if the parent is a declaration_list (impl body). functions: ` - (source_file - (function_item - name: (identifier) @name) @definition) + (function_item + name: (identifier) @name) @definition `, // Struct definitions diff --git a/packages/core/src/scanner/rust.ts b/packages/core/src/scanner/rust.ts index 9cb69a6..db2867d 100644 --- a/packages/core/src/scanner/rust.ts +++ b/packages/core/src/scanner/rust.ts @@ -170,6 +170,11 @@ export class RustScanner implements Scanner { const defCapture = match.captures.find((c) => c.name === 'definition'); if (!nameCapture || !defCapture) continue; + // Skip functions inside impl blocks — those are captured by extractMethods. + // Functions inside mod blocks (mod_item > declaration_list) should be kept. + const parent = defCapture.node.parent; + if (parent?.type === 'declaration_list' && parent.parent?.type === 'impl_item') continue; + const name = nameCapture.node.text; const node = defCapture.node; const startLine = node.startPosition.row + 1; @@ -327,8 +332,8 @@ export class RustScanner implements Scanner { const defCapture = match.captures.find((c) => c.name === 'definition'); if (!receiverCapture || !nameCapture || !defCapture) continue; - // Strip generic type params: Container → Container - const receiverType = receiverCapture.node.text.replace(/<.*>/, ''); + // Strip generic type params: Container → Container, HashMap> → HashMap + const receiverType = receiverCapture.node.text.split('<')[0]; const methodName = nameCapture.node.text; const qualifiedName = `${receiverType}.${methodName}`; const node = defCapture.node; From 68cea1eb9be9c0c205e3c5a7e54239d748d464fa Mon Sep 17 00:00:00 2001 From: prosdev Date: Wed, 1 Apr 2026 17:24:35 -0700 Subject: [PATCH 08/11] docs: add Antfly batch limit and callee file resolution to scratchpad Track two limitations found during manual verification: - Antfly Linear Merge fails at ~6k docs (blocks large 
repo indexing) - Rust/Go callees don't resolve target files (no cross-file graph edges) Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/scratchpad.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.claude/scratchpad.md b/.claude/scratchpad.md index 67504d1..503e6a6 100644 --- a/.claude/scratchpad.md +++ b/.claude/scratchpad.md @@ -4,6 +4,8 @@ - **`getDocsByFilePath` fetches all docs client-side (capped at 5k).** Uses `getAll(limit: 5000)` + exact path filter. Fine for single repos (dev-agent has ~2,200 docs). Won't scale to monorepos with 50k+ files. Future fix: server-side path filter in Antfly SDK. - **Two clones of the same repo share one index.** Storage path is hashed from git remote URL (`prosdevlab/dev-agent` → `a1b2c3d4`). Two local clones on different branches share the same index, graph cache, and watcher snapshot. Stale data possible if branches diverge significantly. Pre-existing design — not introduced by graph cache. Fix would be to include branch or worktree path in the hash. +- **Antfly Linear Merge fails on large batch sizes (~6k+ docs).** Tested with cli/cli (5,933 docs): `decoding request: json: string unexpected end of JSON input`. The scanner completes successfully but Antfly's HTTP endpoint fails to process the full payload. Workaround: none currently — the full index fails. Fix options: (1) batch the linearMerge into chunks of ~3k docs, (2) raise the limit on Antfly side, (3) stream instead of single POST. This blocks indexing medium-large Go/Rust repos (>~5k components). +- **Rust/Go callee extraction does not resolve target files.** tree-sitter callees have `name` and `line` but no `file` field (unlike ts-morph which resolves cross-file references). This means `dev_map` hot paths show 0 refs for Rust/Go repos, and `dev_refs --depends-on` won't trace cross-file paths. The dependency graph only has edges when callees include a `file` field. Future: cross-file resolution for tree-sitter languages. 
## Open Questions From 7565a7fa60618c8e00713699a818174272ac0a80 Mon Sep 17 00:00:00 2001 From: prosdev Date: Wed, 1 Apr 2026 17:31:56 -0700 Subject: [PATCH 09/11] fix(core): chunk Linear Merge to avoid Antfly payload size limit Antfly's merge endpoint fails on large JSON payloads (~6k+ docs). Split documents into chunks of 3,000 before sending. - Extract chunk() utility as a pure function in utils/chunking.ts - AntflyVectorStore.linearMerge splits sorted docs into chunks, runs linearMergeChunk per batch, accumulates results - Progress callbacks report across all chunks - 10 new tests for chunk() (even/uneven splits, edge cases, large arrays) - Update scratchpad with Antfly batch limit and callee file resolution Co-Authored-By: Claude Opus 4.6 (1M context) --- .../core/src/utils/__tests__/chunking.test.ts | 65 +++++++++++ packages/core/src/utils/chunking.ts | 16 +++ packages/core/src/vector/antfly-store.ts | 110 +++++++++++------- 3 files changed, 151 insertions(+), 40 deletions(-) create mode 100644 packages/core/src/utils/__tests__/chunking.test.ts create mode 100644 packages/core/src/utils/chunking.ts diff --git a/packages/core/src/utils/__tests__/chunking.test.ts b/packages/core/src/utils/__tests__/chunking.test.ts new file mode 100644 index 0000000..2ac0b4f --- /dev/null +++ b/packages/core/src/utils/__tests__/chunking.test.ts @@ -0,0 +1,65 @@ +/** + * Tests for array chunking utility. + * Pure function — no I/O, no mocks. 
+ */ + +import { describe, expect, it } from 'vitest'; +import { chunk } from '../chunking'; + +describe('chunk', () => { + it('should return single chunk for small arrays', () => { + expect(chunk([1, 2, 3], 5)).toEqual([[1, 2, 3]]); + }); + + it('should split evenly', () => { + expect(chunk([1, 2, 3, 4, 5, 6], 3)).toEqual([ + [1, 2, 3], + [4, 5, 6], + ]); + }); + + it('should handle uneven splits', () => { + expect(chunk([1, 2, 3, 4, 5], 3)).toEqual([ + [1, 2, 3], + [4, 5], + ]); + }); + + it('should handle single element chunks', () => { + expect(chunk([1, 2, 3], 1)).toEqual([[1], [2], [3]]); + }); + + it('should return empty for empty array', () => { + expect(chunk([], 3)).toEqual([]); + }); + + it('should handle chunk size equal to array length', () => { + expect(chunk([1, 2, 3], 3)).toEqual([[1, 2, 3]]); + }); + + it('should handle chunk size larger than array', () => { + expect(chunk([1, 2], 100)).toEqual([[1, 2]]); + }); + + it('should throw on non-positive size', () => { + expect(() => chunk([1], 0)).toThrow('Chunk size must be positive'); + expect(() => chunk([1], -1)).toThrow('Chunk size must be positive'); + }); + + it('should work with large arrays (6000 items, chunks of 3000)', () => { + const items = Array.from({ length: 6000 }, (_, i) => i); + const result = chunk(items, 3000); + expect(result.length).toBe(2); + expect(result[0].length).toBe(3000); + expect(result[1].length).toBe(3000); + }); + + it('should work with 7500 items in chunks of 3000', () => { + const items = Array.from({ length: 7500 }, (_, i) => i); + const result = chunk(items, 3000); + expect(result.length).toBe(3); + expect(result[0].length).toBe(3000); + expect(result[1].length).toBe(3000); + expect(result[2].length).toBe(1500); + }); +}); diff --git a/packages/core/src/utils/chunking.ts b/packages/core/src/utils/chunking.ts new file mode 100644 index 0000000..58cb90d --- /dev/null +++ b/packages/core/src/utils/chunking.ts @@ -0,0 +1,16 @@ +/** + * Array chunking utility. 
+ * + * Splits an array into chunks of at most `size` elements. + * Pure function — no side effects. + */ +export function chunk(array: T[], size: number): T[][] { + if (size <= 0) throw new Error('Chunk size must be positive'); + if (array.length === 0) return []; + + const chunks: T[][] = []; + for (let i = 0; i < array.length; i += size) { + chunks.push(array.slice(i, i + size)); + } + return chunks; +} diff --git a/packages/core/src/vector/antfly-store.ts b/packages/core/src/vector/antfly-store.ts index 0530bf6..a81d3ab 100644 --- a/packages/core/src/vector/antfly-store.ts +++ b/packages/core/src/vector/antfly-store.ts @@ -7,6 +7,7 @@ */ import { AntflyClient } from '@antfly/sdk'; +import { chunk } from '../utils/chunking'; import type { EmbeddingDocument, SearchOptions, @@ -303,6 +304,13 @@ export class AntflyVectorStore implements VectorStore { * Use ONLY for full-index operations. For incremental updates, use batchUpsertAndDelete(). * Records must be sorted lexicographically by key (handled internally). */ + /** + * Maximum documents per Linear Merge HTTP request. + * Antfly's endpoint fails on large JSON payloads (~6k+ docs). + * Chunking into smaller batches avoids the limit. 
+ */ + private static readonly MERGE_BATCH_SIZE = 3000; + async linearMerge( documents: EmbeddingDocument[], lastMergedId = '', @@ -314,51 +322,21 @@ export class AntflyVectorStore implements VectorStore { this.assertReady(); const sorted = [...documents].sort((a, b) => a.id.localeCompare(b.id)); - const records: Record = {}; - for (const doc of sorted) { - records[doc.id] = { text: doc.text, metadata: JSON.stringify(doc.metadata) }; - } - const total = documents.length; const totals: LinearMergeResult = { upserted: 0, skipped: 0, deleted: 0 }; - let cursor = lastMergedId; - try { - const raw = this.client.getRawClient(); - do { - const result = await raw.POST('/tables/{tableName}/merge', { - params: { path: { tableName: this.cfg.table } }, - body: { records, last_merged_id: cursor }, - }); - - if (result.error) { - throw new Error( - typeof result.error === 'object' && 'error' in result.error - ? String((result.error as Record).error) - : String(result.error) - ); - } - - const data = result.data; - if (!data) { - throw new Error('Linear Merge returned no data'); - } - - totals.upserted += data.upserted ?? 0; - totals.skipped += data.skipped ?? 0; - totals.deleted += data.deleted ?? 0; - if (data.took) totals.took = (totals.took ?? 0) + data.took; + // Chunk documents to avoid Antfly HTTP payload size limit + const chunks = chunk(sorted, AntflyVectorStore.MERGE_BATCH_SIZE); + try { + for (const chunk of chunks) { + const result = await this.linearMergeChunk(chunk, lastMergedId); + totals.upserted += result.upserted; + totals.skipped += result.skipped; + totals.deleted += result.deleted; + if (result.took) totals.took = (totals.took ?? 
0) + result.took; onProgress?.(totals.upserted + totals.skipped, total); - - if (data.status === 'partial' && data.next_cursor) { - cursor = data.next_cursor; - } else { - break; - } - // biome-ignore lint/correctness/noConstantCondition: pagination loop exits via break - } while (true); - + } return totals; } catch (error) { throw new Error( @@ -367,6 +345,58 @@ export class AntflyVectorStore implements VectorStore { } } + /** + * Merge a single chunk of documents via Antfly's merge endpoint. + * Handles server-side pagination (status: "partial" + next_cursor). + */ + private async linearMergeChunk( + chunk: EmbeddingDocument[], + lastMergedId: string + ): Promise { + const records: Record = {}; + for (const doc of chunk) { + records[doc.id] = { text: doc.text, metadata: JSON.stringify(doc.metadata) }; + } + + const totals: LinearMergeResult = { upserted: 0, skipped: 0, deleted: 0 }; + let cursor = lastMergedId; + + const raw = this.client.getRawClient(); + do { + const result = await raw.POST('/tables/{tableName}/merge', { + params: { path: { tableName: this.cfg.table } }, + body: { records, last_merged_id: cursor }, + }); + + if (result.error) { + throw new Error( + typeof result.error === 'object' && 'error' in result.error + ? String((result.error as Record).error) + : String(result.error) + ); + } + + const data = result.data; + if (!data) { + throw new Error('Linear Merge returned no data'); + } + + totals.upserted += data.upserted ?? 0; + totals.skipped += data.skipped ?? 0; + totals.deleted += data.deleted ?? 0; + if (data.took) totals.took = (totals.took ?? 0) + data.took; + + if (data.status === 'partial' && data.next_cursor) { + cursor = data.next_cursor; + } else { + break; + } + // biome-ignore lint/correctness/noConstantCondition: pagination loop exits via break + } while (true); + + return totals; + } + /** * Combined upsert + delete in a single batchOp call. * Safe for incremental updates and concurrent calls. 
From 3da9558474edd6610879da75f206547750bc809a Mon Sep 17 00:00:00 2001 From: prosdev Date: Wed, 1 Apr 2026 17:42:54 -0700 Subject: [PATCH 10/11] =?UTF-8?q?fix(core):=20revert=20Linear=20Merge=20ch?= =?UTF-8?q?unking=20=E2=80=94=20breaks=20merge=20semantics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chunking Linear Merge causes each chunk to delete the previous chunk's records (server thinks each subset is the full dataset). Reverted to single-call approach. The Antfly payload size limit (~6k docs) is an Antfly-side issue that needs a fix in the server (raise JSON body limit or support streaming). Tracked in scratchpad. chunk() utility kept — useful elsewhere. Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/scratchpad.md | 2 +- packages/core/src/vector/antfly-store.ts | 110 +++++++++-------------- 2 files changed, 41 insertions(+), 71 deletions(-) diff --git a/.claude/scratchpad.md b/.claude/scratchpad.md index 503e6a6..7c021be 100644 --- a/.claude/scratchpad.md +++ b/.claude/scratchpad.md @@ -4,7 +4,7 @@ - **`getDocsByFilePath` fetches all docs client-side (capped at 5k).** Uses `getAll(limit: 5000)` + exact path filter. Fine for single repos (dev-agent has ~2,200 docs). Won't scale to monorepos with 50k+ files. Future fix: server-side path filter in Antfly SDK. - **Two clones of the same repo share one index.** Storage path is hashed from git remote URL (`prosdevlab/dev-agent` → `a1b2c3d4`). Two local clones on different branches share the same index, graph cache, and watcher snapshot. Stale data possible if branches diverge significantly. Pre-existing design — not introduced by graph cache. Fix would be to include branch or worktree path in the hash. -- **Antfly Linear Merge fails on large batch sizes (~6k+ docs).** Tested with cli/cli (5,933 docs): `decoding request: json: string unexpected end of JSON input`. The scanner completes successfully but Antfly's HTTP endpoint fails to process the full payload. 
Workaround: none currently — the full index fails. Fix options: (1) batch the linearMerge into chunks of ~3k docs, (2) raise the limit on Antfly side, (3) stream instead of single POST. This blocks indexing medium-large Go/Rust repos (>~5k components). +- **Antfly Linear Merge fails on large JSON payloads (~6k+ docs).** Tested with cli/cli (5,933 docs): `decoding request: json: string unexpected end of JSON input`. The scanner completes successfully but Antfly's HTTP endpoint can't parse the JSON body. Chunking is NOT a viable fix — Linear Merge semantics require ALL records in one call (the server deletes records not in the set, so each chunk deletes the previous chunk's data). Fix must be Antfly-side: raise the JSON body size limit, or support streaming/chunked transfer encoding. File a ticket with Antfly. Blocks indexing repos with >~5k components. - **Rust/Go callee extraction does not resolve target files.** tree-sitter callees have `name` and `line` but no `file` field (unlike ts-morph which resolves cross-file references). This means `dev_map` hot paths show 0 refs for Rust/Go repos, and `dev_refs --depends-on` won't trace cross-file paths. The dependency graph only has edges when callees include a `file` field. Future: cross-file resolution for tree-sitter languages. ## Open Questions diff --git a/packages/core/src/vector/antfly-store.ts b/packages/core/src/vector/antfly-store.ts index a81d3ab..0530bf6 100644 --- a/packages/core/src/vector/antfly-store.ts +++ b/packages/core/src/vector/antfly-store.ts @@ -7,7 +7,6 @@ */ import { AntflyClient } from '@antfly/sdk'; -import { chunk } from '../utils/chunking'; import type { EmbeddingDocument, SearchOptions, @@ -304,13 +303,6 @@ export class AntflyVectorStore implements VectorStore { * Use ONLY for full-index operations. For incremental updates, use batchUpsertAndDelete(). * Records must be sorted lexicographically by key (handled internally). */ - /** - * Maximum documents per Linear Merge HTTP request. 
- * Antfly's endpoint fails on large JSON payloads (~6k+ docs). - * Chunking into smaller batches avoids the limit. - */ - private static readonly MERGE_BATCH_SIZE = 3000; - async linearMerge( documents: EmbeddingDocument[], lastMergedId = '', @@ -322,21 +314,51 @@ export class AntflyVectorStore implements VectorStore { this.assertReady(); const sorted = [...documents].sort((a, b) => a.id.localeCompare(b.id)); + const records: Record = {}; + for (const doc of sorted) { + records[doc.id] = { text: doc.text, metadata: JSON.stringify(doc.metadata) }; + } + const total = documents.length; const totals: LinearMergeResult = { upserted: 0, skipped: 0, deleted: 0 }; - - // Chunk documents to avoid Antfly HTTP payload size limit - const chunks = chunk(sorted, AntflyVectorStore.MERGE_BATCH_SIZE); + let cursor = lastMergedId; try { - for (const chunk of chunks) { - const result = await this.linearMergeChunk(chunk, lastMergedId); - totals.upserted += result.upserted; - totals.skipped += result.skipped; - totals.deleted += result.deleted; - if (result.took) totals.took = (totals.took ?? 0) + result.took; + const raw = this.client.getRawClient(); + do { + const result = await raw.POST('/tables/{tableName}/merge', { + params: { path: { tableName: this.cfg.table } }, + body: { records, last_merged_id: cursor }, + }); + + if (result.error) { + throw new Error( + typeof result.error === 'object' && 'error' in result.error + ? String((result.error as Record).error) + : String(result.error) + ); + } + + const data = result.data; + if (!data) { + throw new Error('Linear Merge returned no data'); + } + + totals.upserted += data.upserted ?? 0; + totals.skipped += data.skipped ?? 0; + totals.deleted += data.deleted ?? 0; + if (data.took) totals.took = (totals.took ?? 
0) + data.took; + onProgress?.(totals.upserted + totals.skipped, total); - } + + if (data.status === 'partial' && data.next_cursor) { + cursor = data.next_cursor; + } else { + break; + } + // biome-ignore lint/correctness/noConstantCondition: pagination loop exits via break + } while (true); + return totals; } catch (error) { throw new Error( @@ -345,58 +367,6 @@ export class AntflyVectorStore implements VectorStore { } } - /** - * Merge a single chunk of documents via Antfly's merge endpoint. - * Handles server-side pagination (status: "partial" + next_cursor). - */ - private async linearMergeChunk( - chunk: EmbeddingDocument[], - lastMergedId: string - ): Promise { - const records: Record = {}; - for (const doc of chunk) { - records[doc.id] = { text: doc.text, metadata: JSON.stringify(doc.metadata) }; - } - - const totals: LinearMergeResult = { upserted: 0, skipped: 0, deleted: 0 }; - let cursor = lastMergedId; - - const raw = this.client.getRawClient(); - do { - const result = await raw.POST('/tables/{tableName}/merge', { - params: { path: { tableName: this.cfg.table } }, - body: { records, last_merged_id: cursor }, - }); - - if (result.error) { - throw new Error( - typeof result.error === 'object' && 'error' in result.error - ? String((result.error as Record).error) - : String(result.error) - ); - } - - const data = result.data; - if (!data) { - throw new Error('Linear Merge returned no data'); - } - - totals.upserted += data.upserted ?? 0; - totals.skipped += data.skipped ?? 0; - totals.deleted += data.deleted ?? 0; - if (data.took) totals.took = (totals.took ?? 0) + data.took; - - if (data.status === 'partial' && data.next_cursor) { - cursor = data.next_cursor; - } else { - break; - } - // biome-ignore lint/correctness/noConstantCondition: pagination loop exits via break - } while (true); - - return totals; - } - /** * Combined upsert + delete in a single batchOp call. * Safe for incremental updates and concurrent calls. 
From 957885fade726c2ef880935396e08b6f009a59d0 Mon Sep 17 00:00:00 2001 From: prosdev Date: Wed, 1 Apr 2026 17:58:03 -0700 Subject: [PATCH 11/11] =?UTF-8?q?docs:=20link=20Antfly=20issue=20#37=20in?= =?UTF-8?q?=20scratchpad=20=E2=80=94=20AJ=20to=20address?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/scratchpad.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/scratchpad.md b/.claude/scratchpad.md index 7c021be..8386f0f 100644 --- a/.claude/scratchpad.md +++ b/.claude/scratchpad.md @@ -4,7 +4,7 @@ - **`getDocsByFilePath` fetches all docs client-side (capped at 5k).** Uses `getAll(limit: 5000)` + exact path filter. Fine for single repos (dev-agent has ~2,200 docs). Won't scale to monorepos with 50k+ files. Future fix: server-side path filter in Antfly SDK. - **Two clones of the same repo share one index.** Storage path is hashed from git remote URL (`prosdevlab/dev-agent` → `a1b2c3d4`). Two local clones on different branches share the same index, graph cache, and watcher snapshot. Stale data possible if branches diverge significantly. Pre-existing design — not introduced by graph cache. Fix would be to include branch or worktree path in the hash. -- **Antfly Linear Merge fails on large JSON payloads (~6k+ docs).** Tested with cli/cli (5,933 docs): `decoding request: json: string unexpected end of JSON input`. The scanner completes successfully but Antfly's HTTP endpoint can't parse the JSON body. Chunking is NOT a viable fix — Linear Merge semantics require ALL records in one call (the server deletes records not in the set, so each chunk deletes the previous chunk's data). Fix must be Antfly-side: raise the JSON body size limit, or support streaming/chunked transfer encoding. File a ticket with Antfly. Blocks indexing repos with >~5k components. 
+- **Antfly Linear Merge fails on large JSON payloads (~6k+ docs).** Tested with cli/cli (5,933 docs): `decoding request: json: string unexpected end of JSON input`. Chunking is NOT viable — merge semantics require ALL records in one call. Filed as [antflydb/antfly#37](https://github.com/antflydb/antfly/issues/37). AJ will take a look. Blocks indexing repos with >~5k components. - **Rust/Go callee extraction does not resolve target files.** tree-sitter callees have `name` and `line` but no `file` field (unlike ts-morph which resolves cross-file references). This means `dev_map` hot paths show 0 refs for Rust/Go repos, and `dev_refs --depends-on` won't trace cross-file paths. The dependency graph only has edges when callees include a `file` field. Future: cross-file resolution for tree-sitter languages. ## Open Questions