From 15184a07d6d0b8cd7fb1ea0f0ff6123780c28ff4 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Mon, 11 May 2026 04:09:39 -0600 Subject: [PATCH 01/11] feat(native): port R extractor to Rust Adds tree-sitter-r dependency and native extractor matching the WASM-side behavior for R symbol, import, and call extraction. Part of #1071 --- Cargo.lock | 11 + crates/codegraph-core/Cargo.toml | 1 + .../codegraph-core/src/extractors/helpers.rs | 11 + crates/codegraph-core/src/extractors/mod.rs | 4 + .../codegraph-core/src/extractors/r_lang.rs | 411 ++++++++++++++++++ crates/codegraph-core/src/file_collector.rs | 2 + crates/codegraph-core/src/parser_registry.rs | 14 +- src/ast-analysis/rules/index.ts | 7 + 8 files changed, 458 insertions(+), 3 deletions(-) create mode 100644 crates/codegraph-core/src/extractors/r_lang.rs diff --git a/Cargo.lock b/Cargo.lock index 413504b0d..8198cbdf4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,6 +96,7 @@ dependencies = [ "tree-sitter-ocaml", "tree-sitter-php", "tree-sitter-python", + "tree-sitter-r", "tree-sitter-ruby", "tree-sitter-rust", "tree-sitter-scala", @@ -895,6 +896,16 @@ dependencies = [ "tree-sitter-language", ] +[[package]] +name = "tree-sitter-r" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "429133cbda9f8a46e03ef3aae6abb6c3d22875f8585cad472138101bfd517255" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-ruby" version = "0.23.1" diff --git a/crates/codegraph-core/Cargo.toml b/crates/codegraph-core/Cargo.toml index df4361e17..a0c513b68 100644 --- a/crates/codegraph-core/Cargo.toml +++ b/crates/codegraph-core/Cargo.toml @@ -35,6 +35,7 @@ tree-sitter-dart = "0.0.4" tree-sitter-zig = "1" tree-sitter-haskell = "0.23" tree-sitter-ocaml = "0.24" +tree-sitter-r = "1.2" rayon = "1" ignore = "0.4" globset = "0.4" diff --git a/crates/codegraph-core/src/extractors/helpers.rs b/crates/codegraph-core/src/extractors/helpers.rs index b02531896..3e0d5964e 100644 --- a/crates/codegraph-core/src/extractors/helpers.rs +++ b/crates/codegraph-core/src/extractors/helpers.rs @@ -360,6 +360,17 @@ pub const OCAML_AST_CONFIG: LangAstConfig = LangAstConfig { string_prefixes: &[], }; +pub const R_AST_CONFIG: LangAstConfig = LangAstConfig { + new_types: &[], + throw_types: &[], + await_types: &[], + // tree-sitter-r emits `string` for both single- and double-quoted literals. + string_types: &["string"], + regex_types: &[], + quote_chars: &['\'', '"'], + string_prefixes: &[], +}; + // ── Generic AST node walker ────────────────────────────────────────────────── /// Node types that represent identifiers across languages. diff --git a/crates/codegraph-core/src/extractors/mod.rs b/crates/codegraph-core/src/extractors/mod.rs index 642f29f98..3d8ac9810 100644 --- a/crates/codegraph-core/src/extractors/mod.rs +++ b/crates/codegraph-core/src/extractors/mod.rs @@ -15,6 +15,7 @@ pub mod lua; pub mod ocaml; pub mod php; pub mod python; +pub mod r_lang; pub mod ruby; pub mod rust_lang; pub mod scala; @@ -126,5 +127,8 @@ pub fn extract_symbols_with_opts( LanguageKind::Ocaml | LanguageKind::OcamlInterface => { ocaml::OcamlExtractor.extract_with_opts(tree, source, file_path, include_ast_nodes) } + LanguageKind::R => { + r_lang::RExtractor.extract_with_opts(tree, source, file_path, include_ast_nodes) + } } } diff --git a/crates/codegraph-core/src/extractors/r_lang.rs b/crates/codegraph-core/src/extractors/r_lang.rs new file mode 100644 index 000000000..79a5411ff --- /dev/null +++ b/crates/codegraph-core/src/extractors/r_lang.rs @@ -0,0 +1,411 @@ +use tree_sitter::{Node, Tree}; +use crate::cfg::build_function_cfg; +use crate::complexity::compute_all_metrics; +use crate::types::*; +use super::helpers::*; +use super::SymbolExtractor; + +/// R symbol extractor — ports `src/extractors/r.ts` from the JS engine. +/// +/// tree-sitter-r grammar (r-lib/tree-sitter-r) notes: +/// - Assignments: `binary_operator` with `<-`, `=`, or `<<-` operator +/// - Functions: `function_definition` as RHS of assignment +/// - Calls: `call` node with `function`/`arguments` fields +/// - Imports: `library()` / `require()` (packages) and `source()` (files) +/// - S4 classes: `setClass()`, `setRefClass()`, `setGeneric()`, `setMethod()` +pub struct RExtractor; + +impl SymbolExtractor for RExtractor { + fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols { + let mut symbols = FileSymbols::new(file_path.to_string()); + walk_tree(&tree.root_node(), source, &mut symbols, match_r_node); + walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &R_AST_CONFIG); + symbols + } +} + +fn match_r_node(node: &Node, source: &[u8], symbols: &mut FileSymbols, _depth: usize) { + match node.kind() { + "binary_operator" => handle_binary_op(node, source, symbols), + "call" => handle_call(node, source, symbols), + _ => {} + } +} + +fn handle_binary_op(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // binary_operator children: lhs, operator, rhs + // We use field accessors for robustness; the grammar exposes + // `lhs`/`operator`/`rhs` fields explicitly. + let lhs = match node.child_by_field_name("lhs").or_else(|| node.child(0)) { + Some(n) => n, + None => return, + }; + let op = match node.child_by_field_name("operator").or_else(|| node.child(1)) { + Some(n) => n, + None => return, + }; + let rhs = match node.child_by_field_name("rhs").or_else(|| node.child(2)) { + Some(n) => n, + None => return, + }; + + let op_text = node_text(&op, source); + if op_text != "<-" && op_text != "=" && op_text != "<<-" { + return; + } + if lhs.kind() != "identifier" { + return; + } + + let name = node_text(&lhs, source).to_string(); + + if rhs.kind() == "function_definition" { + let params = extract_r_params(&rhs, source); + symbols.definitions.push(Definition { + name, + kind: "function".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: compute_all_metrics(&rhs, source, "r"), + cfg: build_function_cfg(&rhs, "r", source), + children: opt_children(params), + }); + } else if is_program_level(node) { + // Only record top-level variable assignments (matches JS extractor). + symbols.definitions.push(Definition { + name, + kind: "variable".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: None, + }); + } +} + +fn is_program_level(node: &Node) -> bool { + node.parent().map(|p| p.kind() == "program").unwrap_or(false) +} + +fn extract_r_params(func_def: &Node, source: &[u8]) -> Vec { + let mut params = Vec::new(); + let params_node = match func_def.child_by_field_name("parameters") { + Some(n) => n, + None => return params, + }; + + for i in 0..params_node.child_count() { + let Some(child) = params_node.child(i) else { continue }; + match child.kind() { + "parameter" => { + // parameter has `name` field, e.g. `x` or `y = 10`. + // Falls back to first identifier child (or `dots` for `...`). + if let Some(name_node) = child.child_by_field_name("name") { + params.push(child_def( + node_text(&name_node, source).to_string(), + "parameter", + start_line(&child), + )); + } else if let Some(dots) = find_child(&child, "dots") { + params.push(child_def( + node_text(&dots, source).to_string(), + "parameter", + start_line(&child), + )); + } else if let Some(ident) = find_child(&child, "identifier") { + params.push(child_def( + node_text(&ident, source).to_string(), + "parameter", + start_line(&child), + )); + } + } + "identifier" => { + // Some grammar variants expose bare identifiers at the parameters level. + params.push(child_def( + node_text(&child, source).to_string(), + "parameter", + start_line(&child), + )); + } + _ => {} + } + } + params +} + +fn handle_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // call: function field is the callee (identifier or namespace_operator), + // arguments field is the arguments list. + let func_node = match node.child_by_field_name("function").or_else(|| node.child(0)) { + Some(n) => n, + None => return, + }; + + let func_text = node_text(&func_node, source); + + // Special-case keyword-like callees first; they short-circuit and do NOT + // produce a generic call edge (matches JS extractor). + if func_node.kind() == "identifier" { + match func_text { + "library" | "require" => { + handle_library_call(node, source, symbols); + return; + } + "source" => { + handle_source_call(node, source, symbols); + return; + } + "setClass" | "setRefClass" => { + handle_set_class(node, source, symbols); + return; + } + "setGeneric" | "setMethod" => { + handle_set_generic(node, source, symbols); + return; + } + _ => {} + } + } + + match func_node.kind() { + "identifier" => { + symbols.calls.push(Call { + name: func_text.to_string(), + line: start_line(node), + dynamic: None, + receiver: None, + }); + } + "namespace_operator" => { + // `pkg::func` — receiver is the package; name is the function. + let parts: Vec<&str> = func_text.split("::").collect(); + if parts.len() >= 2 { + let name = parts[parts.len() - 1].to_string(); + let receiver = parts[..parts.len() - 1].join("::"); + symbols.calls.push(Call { + name, + line: start_line(node), + dynamic: None, + receiver: Some(receiver), + }); + } + } + _ => {} + } +} + +/// Extract the first argument value from a call's `arguments` node. +/// +/// Returns the inner string literal text (quotes stripped) or the bare +/// identifier text — whichever appears first. Used for `library(pkg)`, +/// `source("file.R")`, `setClass("Foo", ...)`, etc. +fn first_argument_value(node: &Node, source: &[u8], accept_identifier: bool) -> Option { + let args = node.child_by_field_name("arguments").or_else(|| find_child(node, "arguments"))?; + for i in 0..args.child_count() { + let Some(arg) = args.child(i) else { continue }; + match arg.kind() { + "argument" => { + // argument wraps the actual value + for j in 0..arg.child_count() { + let Some(inner) = arg.child(j) else { continue }; + if inner.kind() == "string" { + return Some(strip_string_quotes(&inner, source)); + } + if accept_identifier && inner.kind() == "identifier" { + return Some(node_text(&inner, source).to_string()); + } + } + } + "string" => { + return Some(strip_string_quotes(&arg, source)); + } + "identifier" if accept_identifier => { + return Some(node_text(&arg, source).to_string()); + } + _ => {} + } + } + None +} + +/// Strip surrounding `'` or `"` quotes from a `string` node's text. +fn strip_string_quotes(node: &Node, source: &[u8]) -> String { + // Prefer `string_content` child when available (avoids any escape quirks). + if let Some(content) = find_child(node, "string_content") { + return node_text(&content, source).to_string(); + } + node_text(node, source) + .trim_matches(|c| c == '\'' || c == '"') + .to_string() +} + +fn handle_library_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + if let Some(pkg) = first_argument_value(node, source, true) { + symbols.imports.push(Import::new( + pkg.clone(), + vec![pkg], + start_line(node), + )); + } +} + +fn handle_source_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // source() only accepts string literals — `source(varname)` is not an import. + if let Some(path) = first_argument_value(node, source, false) { + symbols.imports.push(Import::new( + path, + vec!["source".to_string()], + start_line(node), + )); + } +} + +fn handle_set_class(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + if let Some(name) = first_argument_value(node, source, false) { + symbols.definitions.push(Definition { + name, + kind: "class".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: None, + }); + } +} + +fn handle_set_generic(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + if let Some(name) = first_argument_value(node, source, false) { + symbols.definitions.push(Definition { + name, + kind: "function".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: None, + }); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tree_sitter::Parser; + + fn parse_r(code: &str) -> FileSymbols { + let mut parser = Parser::new(); + parser + .set_language(&tree_sitter_r::LANGUAGE.into()) + .unwrap(); + let tree = parser.parse(code.as_bytes(), None).unwrap(); + RExtractor.extract(&tree, code.as_bytes(), "test.R") + } + + #[test] + fn finds_function_assignment() { + let s = parse_r("greet <- function(name) { print(name) }\n"); + assert_eq!(s.definitions.len(), 1); + assert_eq!(s.definitions[0].name, "greet"); + assert_eq!(s.definitions[0].kind, "function"); + let children = s.definitions[0].children.as_ref().unwrap(); + assert_eq!(children.len(), 1); + assert_eq!(children[0].name, "name"); + assert_eq!(children[0].kind, "parameter"); + } + + #[test] + fn finds_function_with_default_and_dots() { + let s = parse_r("f <- function(x, y = 10, ...) { x }\n"); + let f = s.definitions.iter().find(|d| d.name == "f").unwrap(); + let children = f.children.as_ref().unwrap(); + let names: Vec<&str> = children.iter().map(|c| c.name.as_str()).collect(); + assert!(names.contains(&"x")); + assert!(names.contains(&"y")); + assert!(names.contains(&"...")); + } + + #[test] + fn finds_top_level_variable() { + let s = parse_r("user_store <- list()\n"); + let v = s.definitions.iter().find(|d| d.name == "user_store").unwrap(); + assert_eq!(v.kind, "variable"); + } + + #[test] + fn skips_nested_variable_assignment() { + // Inner `user <- ...` is inside the function body — should not be recorded + // as a top-level definition (it's a local binding). + let s = parse_r("f <- function() { user <- list(); user }\n"); + let defs: Vec<&str> = s.definitions.iter().map(|d| d.name.as_str()).collect(); + assert!(defs.contains(&"f")); + assert!(!defs.contains(&"user")); + } + + #[test] + fn extracts_source_imports() { + let s = parse_r("source(\"service.R\")\nsource('utils.R')\n"); + assert_eq!(s.imports.len(), 2); + assert_eq!(s.imports[0].source, "service.R"); + assert_eq!(s.imports[0].names, vec!["source".to_string()]); + assert_eq!(s.imports[1].source, "utils.R"); + } + + #[test] + fn extracts_library_and_require_imports() { + let s = parse_r("library(dplyr)\nrequire(\"ggplot2\")\n"); + assert_eq!(s.imports.len(), 2); + assert_eq!(s.imports[0].source, "dplyr"); + assert_eq!(s.imports[1].source, "ggplot2"); + } + + #[test] + fn extracts_calls() { + let s = parse_r("f <- function() { print(1); validate(x) }\n"); + let names: Vec<&str> = s.calls.iter().map(|c| c.name.as_str()).collect(); + assert!(names.contains(&"print")); + assert!(names.contains(&"validate")); + } + + #[test] + fn source_call_is_import_not_call() { + let s = parse_r("source(\"service.R\")\n"); + assert!(s.calls.iter().all(|c| c.name != "source"), + "source() should be classified as import, not as a generic call"); + } + + #[test] + fn namespace_call_splits_receiver() { + let s = parse_r("f <- function() { dplyr::filter(df) }\n"); + let c = s.calls.iter().find(|c| c.name == "filter").unwrap(); + assert_eq!(c.receiver, Some("dplyr".to_string())); + } + + #[test] + fn set_class_creates_class_definition() { + let s = parse_r("setClass(\"Person\", representation(name = \"character\"))\n"); + let d = s.definitions.iter().find(|d| d.name == "Person").unwrap(); + assert_eq!(d.kind, "class"); + } + + #[test] + fn set_generic_creates_function_definition() { + let s = parse_r("setGeneric(\"doIt\", function(x) standardGeneric(\"doIt\"))\n"); + let d = s.definitions.iter().find(|d| d.name == "doIt").unwrap(); + assert_eq!(d.kind, "function"); + } + + #[test] + fn function_with_double_arrow_assignment() { + // `<<-` is super-assignment in R; the JS extractor accepts it too. + let s = parse_r("g <<- function() { 1 }\n"); + let g = s.definitions.iter().find(|d| d.name == "g").unwrap(); + assert_eq!(g.kind, "function"); + } +} diff --git a/crates/codegraph-core/src/file_collector.rs b/crates/codegraph-core/src/file_collector.rs index 0cb157814..d1e82213d 100644 --- a/crates/codegraph-core/src/file_collector.rs +++ b/crates/codegraph-core/src/file_collector.rs @@ -36,6 +36,8 @@ const SUPPORTED_EXTENSIONS: &[&str] = &[ "js", "jsx", "mjs", "cjs", "ts", "tsx", "d.ts", "py", "pyi", "go", "rs", "java", "cs", "rb", "rake", "gemspec", "php", "phtml", "tf", "hcl", "c", "h", "cpp", "cc", "cxx", "hpp", "kt", "kts", "swift", "scala", "sh", "bash", "ex", "exs", "lua", "dart", "zig", "hs", "ml", "mli", + // R is case-sensitive: both `.r` and `.R` are conventional. + "r", "R", ]; /// Returns whether `path` has an extension the Rust file_collector would accept. diff --git a/crates/codegraph-core/src/parser_registry.rs b/crates/codegraph-core/src/parser_registry.rs index c87957f29..1084e72ee 100644 --- a/crates/codegraph-core/src/parser_registry.rs +++ b/crates/codegraph-core/src/parser_registry.rs @@ -27,6 +27,7 @@ pub enum LanguageKind { Haskell, Ocaml, OcamlInterface, + R, } impl LanguageKind { @@ -58,6 +59,7 @@ impl LanguageKind { Self::Haskell => "haskell", Self::Ocaml => "ocaml", Self::OcamlInterface => "ocaml-interface", + Self::R => "r", } } @@ -97,6 +99,9 @@ impl LanguageKind { "hs" => Some(Self::Haskell), "ml" => Some(Self::Ocaml), "mli" => Some(Self::OcamlInterface), + // R is case-sensitive: both `.r` (lowercase) and `.R` (uppercase) + // are conventional. `Path::extension` preserves case on Unix. + "r" | "R" => Some(Self::R), _ => None, } } @@ -129,6 +134,7 @@ impl LanguageKind { "haskell" => Some(Self::Haskell), "ocaml" => Some(Self::Ocaml), "ocaml-interface" => Some(Self::OcamlInterface), + "r" => Some(Self::R), _ => None, } } @@ -160,6 +166,7 @@ impl LanguageKind { Self::Haskell => tree_sitter_haskell::LANGUAGE.into(), Self::Ocaml => tree_sitter_ocaml::LANGUAGE_OCAML.into(), Self::OcamlInterface => tree_sitter_ocaml::LANGUAGE_OCAML_INTERFACE.into(), + Self::R => tree_sitter_r::LANGUAGE.into(), } } @@ -175,7 +182,7 @@ impl LanguageKind { &[ JavaScript, TypeScript, Tsx, Python, Go, Rust, Java, CSharp, Ruby, Php, Hcl, C, Cpp, Kotlin, Swift, Scala, Bash, Elixir, Lua, Dart, Zig, Haskell, Ocaml, - OcamlInterface, + OcamlInterface, R, ] } } @@ -244,14 +251,15 @@ mod tests { | LanguageKind::Zig | LanguageKind::Haskell | LanguageKind::Ocaml - | LanguageKind::OcamlInterface => (), + | LanguageKind::OcamlInterface + | LanguageKind::R => (), }; // IMPORTANT: this constant must equal the number of arms in the match // above AND the length of the slice returned by `LanguageKind::all()`. // Because both checks require the same manual update, they reinforce // each other: a developer who updates the match is reminded to also // update `all()` and this count. - const EXPECTED_LEN: usize = 24; + const EXPECTED_LEN: usize = 25; assert_eq!( LanguageKind::all().len(), EXPECTED_LEN, diff --git a/src/ast-analysis/rules/index.ts b/src/ast-analysis/rules/index.ts index 653cbd59b..ff7e0fb66 100644 --- a/src/ast-analysis/rules/index.ts +++ b/src/ast-analysis/rules/index.ts @@ -153,6 +153,10 @@ const OCAML_AST_TYPES: Record = { string: 'string', }; +const R_AST_TYPES: Record = { + string: 'string', +}; + export const AST_TYPE_MAPS: Map> = new Map([ ['javascript', JS_AST_TYPES], ['typescript', JS_AST_TYPES], @@ -177,6 +181,7 @@ export const AST_TYPE_MAPS: Map> = new Map([ ['haskell', HASKELL_AST_TYPES], ['ocaml', OCAML_AST_TYPES], ['ocaml-interface', OCAML_AST_TYPES], + ['r', R_AST_TYPES], ]); // ─── Per-language string-extraction config ─────────────────────────────── @@ -211,6 +216,7 @@ const DART_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: const ZIG_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; const HASKELL_STRING_CONFIG: AstStringConfig = { quoteChars: '"\'', stringPrefixes: '' }; const OCAML_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; +const R_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: '' }; export const AST_STRING_CONFIGS: Map = new Map([ ['javascript', JS_STRING_CONFIG], @@ -236,6 +242,7 @@ export const AST_STRING_CONFIGS: Map = new Map([ ['haskell', HASKELL_STRING_CONFIG], ['ocaml', OCAML_STRING_CONFIG], ['ocaml-interface', OCAML_STRING_CONFIG], + ['r', R_STRING_CONFIG], ]); // ─── Per-language "stop-after-collect" kinds ───────────────────────────── From 2a34ef1aec2ed19437f8809c1be969aa9dc8d4db Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Mon, 11 May 2026 20:55:24 -0600 Subject: [PATCH 02/11] fix(native): include .r in NATIVE_SUPPORTED_EXTENSIONS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The drift guard test expects parity between NATIVE_SUPPORTED_EXTENSIONS (JS-side) and from_extension in parser_registry.rs (Rust-side). After this PR added native R support, .r was missing from the JS-side set, causing the drift guard test to fail. Also updates the classifyNativeDrops tests that previously assumed .r was unsupported by native — those tests now correctly assert that .R/.r files dropped by the native engine indicate an extractor failure, not a parser-limit gap. --- src/domain/parser.ts | 1 + tests/parsers/native-drop-classification.test.ts | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/domain/parser.ts b/src/domain/parser.ts index f1c7dd809..76bd56756 100644 --- a/src/domain/parser.ts +++ b/src/domain/parser.ts @@ -471,6 +471,7 @@ export const NATIVE_SUPPORTED_EXTENSIONS: ReadonlySet = new Set([ '.hs', '.ml', '.mli', + '.r', ]); /** diff --git a/tests/parsers/native-drop-classification.test.ts b/tests/parsers/native-drop-classification.test.ts index 24aee1d53..1bade56e8 100644 --- a/tests/parsers/native-drop-classification.test.ts +++ b/tests/parsers/native-drop-classification.test.ts @@ -19,7 +19,6 @@ describe('classifyNativeDrops', () => { 'src/b.gleam', 'src/c.clj', 'src/d.jl', - 'src/e.R', 'src/f.erl', 'src/g.sol', 'src/h.cu', @@ -27,11 +26,10 @@ describe('classifyNativeDrops', () => { 'src/j.v', 'src/k.m', ]); - expect(totals['unsupported-by-native']).toBe(11); + expect(totals['unsupported-by-native']).toBe(10); expect(totals['native-extractor-failure']).toBe(0); expect(byReason['unsupported-by-native'].get('.fs')).toEqual(['src/a.fs']); expect(byReason['unsupported-by-native'].get('.gleam')).toEqual(['src/b.gleam']); - expect(byReason['unsupported-by-native'].get('.r')).toEqual(['src/e.R']); }); it('flags natively-supported extensions as native-extractor-failure', () => { @@ -61,9 +59,11 @@ describe('classifyNativeDrops', () => { }); it('lowercases extensions so .R and .r share a bucket', () => { + // `.r` is now natively supported (R extractor was ported to Rust), so + // any dropped `.R`/`.r` files indicate a native extractor failure. const { byReason, totals } = classifyNativeDrops(['scripts/a.R', 'scripts/b.r']); - expect(totals['unsupported-by-native']).toBe(2); - expect(byReason['unsupported-by-native'].get('.r')).toEqual(['scripts/a.R', 'scripts/b.r']); + expect(totals['native-extractor-failure']).toBe(2); + expect(byReason['native-extractor-failure'].get('.r')).toEqual(['scripts/a.R', 'scripts/b.r']); }); it('returns empty buckets when no files are passed', () => { From fb4ae359fba6dad21974c1166d95cfb8ca2c3237 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Mon, 11 May 2026 20:55:42 -0600 Subject: [PATCH 03/11] fix(extractors): handle named arguments in R library() calls Greptile flagged that `first_argument_value` (native) and `handleLibraryCall` (WASM) both returned the first identifier child inside an `argument` node, which for named arguments like `library(package = dplyr)` returned `package` (the parameter name) instead of `dplyr` (the value). Fix in both engines together to preserve native/WASM parity: - Prefer the field-named `value` child of the `argument` node, which the tree-sitter-r grammar exposes explicitly for named arguments. - Fall back to a positional child scan that skips any child reachable via the `name` field, so positional grammar variants still work. Two new tests cover the identifier and string-literal forms. --- .../codegraph-core/src/extractors/r_lang.rs | 57 ++++++++++++++++++- src/extractors/r.ts | 29 ++++++++-- tests/parsers/r.test.ts | 11 ++++ 3 files changed, 92 insertions(+), 5 deletions(-) diff --git a/crates/codegraph-core/src/extractors/r_lang.rs b/crates/codegraph-core/src/extractors/r_lang.rs index 79a5411ff..1c68ebc72 100644 --- a/crates/codegraph-core/src/extractors/r_lang.rs +++ b/crates/codegraph-core/src/extractors/r_lang.rs @@ -203,15 +203,40 @@ fn handle_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { /// Returns the inner string literal text (quotes stripped) or the bare /// identifier text — whichever appears first. Used for `library(pkg)`, /// `source("file.R")`, `setClass("Foo", ...)`, etc. +/// +/// For named arguments like `library(package = dplyr)`, the tree-sitter-r +/// grammar exposes a `value` field on the `argument` node — we prefer that +/// over a positional child scan so we extract `dplyr`, not `package`. fn first_argument_value(node: &Node, source: &[u8], accept_identifier: bool) -> Option { let args = node.child_by_field_name("arguments").or_else(|| find_child(node, "arguments"))?; for i in 0..args.child_count() { let Some(arg) = args.child(i) else { continue }; match arg.kind() { "argument" => { - // argument wraps the actual value + // Prefer the field-named `value` child when present — this + // correctly handles `library(package = dplyr)` by returning + // `dplyr` (the value), not `package` (the parameter name). + if let Some(value) = arg.child_by_field_name("value") { + if value.kind() == "string" { + return Some(strip_string_quotes(&value, source)); + } + if accept_identifier && value.kind() == "identifier" { + return Some(node_text(&value, source).to_string()); + } + } + // Fallback: scan children but skip anything before the `=` + // operator. The grammar exposes the parameter name via the + // `name` field, so we use that to know which children are + // before/after the `=`. + let name_node = arg.child_by_field_name("name"); for j in 0..arg.child_count() { let Some(inner) = arg.child(j) else { continue }; + // Skip the parameter-name identifier itself for named args. + if let Some(ref n) = name_node { + if inner.id() == n.id() { + continue; + } + } if inner.kind() == "string" { return Some(strip_string_quotes(&inner, source)); } @@ -408,4 +433,34 @@ mod tests { let g = s.definitions.iter().find(|d| d.name == "g").unwrap(); assert_eq!(g.kind, "function"); } + + #[test] + fn library_named_argument_extracts_value_not_name() { + // `library(package = dplyr)` uses a named argument — the import + // source must be `dplyr` (the value), not `package` (the name). + let s = parse_r("library(package = dplyr)\n"); + assert_eq!(s.imports.len(), 1); + assert_eq!(s.imports[0].source, "dplyr"); + assert_eq!(s.imports[0].names, vec!["dplyr".to_string()]); + } + + #[test] + fn library_named_argument_with_string_value() { + // Same pattern but with a string literal as the value. + let s = parse_r("library(package = \"dplyr\")\n"); + assert_eq!(s.imports.len(), 1); + assert_eq!(s.imports[0].source, "dplyr"); + } + + #[test] + fn nested_function_assignment_is_recorded() { + // Matches the JS extractor's documented behavior: function + // definitions are emitted regardless of nesting depth (only + // variable assignments are filtered by `is_program_level`). + // This test pins the behavior so future changes are intentional. + let s = parse_r("outer <- function() { inner <- function() { 1 }; inner() }\n"); + let defs: Vec<&str> = s.definitions.iter().map(|d| d.name.as_str()).collect(); + assert!(defs.contains(&"outer")); + assert!(defs.contains(&"inner")); + } } diff --git a/src/extractors/r.ts b/src/extractors/r.ts index 19cf0e723..15e4be61f 100644 --- a/src/extractors/r.ts +++ b/src/extractors/r.ts @@ -147,7 +147,10 @@ function handleCall(node: TreeSitterNode, ctx: ExtractorOutput): void { } function handleLibraryCall(node: TreeSitterNode, ctx: ExtractorOutput): void { - // Find the package name in arguments + // Find the package name in arguments. For named arguments like + // `library(package = dplyr)`, prefer the field-named `value` child of the + // `argument` node so we extract `dplyr` (the value), not `package` (the + // parameter name). Keeps native (Rust) and WASM extractors in parity. for (let i = 0; i < node.childCount; i++) { const child = node.child(i); if (!child) continue; @@ -174,9 +177,27 @@ function handleLibraryCall(node: TreeSitterNode, ctx: ExtractorOutput): void { } // Argument might be wrapped if (arg.type === 'argument') { - const id = findChild(arg, 'identifier') || findChild(arg, 'string'); - if (id) { - const text = id.text.replace(/^["']|["']$/g, ''); + // Prefer the `value` field (correct for named arguments). + const valueNode = arg.childForFieldName('value'); + let pick: TreeSitterNode | null = null; + if (valueNode && (valueNode.type === 'string' || valueNode.type === 'identifier')) { + pick = valueNode; + } else { + // Fallback: skip the parameter-name child if the grammar exposes + // it via the `name` field, then pick the first string/identifier. + const nameNode = arg.childForFieldName('name'); + for (let k = 0; k < arg.childCount; k++) { + const inner = arg.child(k); + if (!inner) continue; + if (nameNode && inner.id === nameNode.id) continue; + if (inner.type === 'string' || inner.type === 'identifier') { + pick = inner; + break; + } + } + } + if (pick) { + const text = pick.text.replace(/^["']|["']$/g, ''); ctx.imports.push({ source: text, names: [text], diff --git a/tests/parsers/r.test.ts b/tests/parsers/r.test.ts index 85380c590..ac2f26534 100644 --- a/tests/parsers/r.test.ts +++ b/tests/parsers/r.test.ts @@ -44,4 +44,15 @@ require(ggplot2)`); mean(c(1, 2, 3))`); expect(symbols.calls.length).toBeGreaterThanOrEqual(1); }); + + it('extracts the value (not the parameter name) for named library arguments', () => { + // `library(package = dplyr)` is rare but valid R. The import source must + // be `dplyr` (the value), not `package` (the parameter name). Keeps the + // WASM and native extractors in parity. + const symbols = parseR(`library(package = dplyr)`); + expect(symbols.imports).toContainEqual( + expect.objectContaining({ source: 'dplyr' }), + ); + expect(symbols.imports.some((i) => i.source === 'package')).toBe(false); + }); }); From 53dfd00e2e9e6702e344f736faf18f938227c43c Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Mon, 11 May 2026 20:57:25 -0600 Subject: [PATCH 04/11] chore: sync Cargo.lock with codegraph-core 3.10.0 from main --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 8198cbdf4..318611895 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -66,7 +66,7 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "codegraph-core" -version = "3.9.6" +version = "3.10.0" dependencies = [ "globset", "ignore", From e8aee6cf57620fbff976cbb47f0b13df9c0489e4 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Mon, 11 May 2026 23:54:21 -0600 Subject: [PATCH 05/11] fix: address lint format error in r.test.ts (#1102) --- tests/parsers/r.test.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/parsers/r.test.ts b/tests/parsers/r.test.ts index ac2f26534..878a179b9 100644 --- a/tests/parsers/r.test.ts +++ b/tests/parsers/r.test.ts @@ -50,9 +50,7 @@ mean(c(1, 2, 3))`); // be `dplyr` (the value), not `package` (the parameter name). Keeps the // WASM and native extractors in parity. const symbols = parseR(`library(package = dplyr)`); - expect(symbols.imports).toContainEqual( - expect.objectContaining({ source: 'dplyr' }), - ); + expect(symbols.imports).toContainEqual(expect.objectContaining({ source: 'dplyr' })); expect(symbols.imports.some((i) => i.source === 'package')).toBe(false); }); }); From ebc9754ab48d4fb75bb6692d9faf8b00191afa3a Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 12 May 2026 00:21:55 -0600 Subject: [PATCH 06/11] fix: correct unsupported-native count after Clojure native port (#1102) --- tests/parsers/native-drop-classification.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/parsers/native-drop-classification.test.ts b/tests/parsers/native-drop-classification.test.ts index c6f42f317..ce9f1d1b3 100644 --- a/tests/parsers/native-drop-classification.test.ts +++ b/tests/parsers/native-drop-classification.test.ts @@ -25,7 +25,7 @@ describe('classifyNativeDrops', () => { 'src/j.v', 'src/k.m', ]); - expect(totals['unsupported-by-native']).toBe(10); + expect(totals['unsupported-by-native']).toBe(9); expect(totals['native-extractor-failure']).toBe(0); expect(byReason['unsupported-by-native'].get('.fs')).toEqual(['src/a.fs']); expect(byReason['unsupported-by-native'].get('.gleam')).toEqual(['src/b.gleam']); From 815e6d0f95f1fb3a3ca84f1ff35f1f2f81b3d6a5 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 12 May 2026 00:30:41 -0600 Subject: [PATCH 07/11] chore: re-trigger CI (#1102) From 58b85e769c48a2f9333ac221e75302f1cd62a09f Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 13 May 2026 03:35:32 -0600 Subject: [PATCH 08/11] fix: bump EXPECTED_LEN to 28 after merging Julia + R --- crates/codegraph-core/src/parser_registry.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/codegraph-core/src/parser_registry.rs b/crates/codegraph-core/src/parser_registry.rs index b5e17e223..8d8b4e8cf 100644 --- a/crates/codegraph-core/src/parser_registry.rs +++ b/crates/codegraph-core/src/parser_registry.rs @@ -277,7 +277,7 @@ mod tests { // Because both checks require the same manual update, they reinforce // each other: a developer who updates the match is reminded to also // update `all()` and this count. - const EXPECTED_LEN: usize = 27; + const EXPECTED_LEN: usize = 28; assert_eq!( LanguageKind::all().len(), EXPECTED_LEN, From 20f988b7a03a5f2903cffbd0b4d55a280449288a Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 13 May 2026 03:38:26 -0600 Subject: [PATCH 09/11] fix(r-extractor): index-based quote strip preserves inner quote (#1102) --- .../codegraph-core/src/extractors/r_lang.rs | 35 +++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/crates/codegraph-core/src/extractors/r_lang.rs b/crates/codegraph-core/src/extractors/r_lang.rs index 1c68ebc72..256500ef7 100644 --- a/crates/codegraph-core/src/extractors/r_lang.rs +++ b/crates/codegraph-core/src/extractors/r_lang.rs @@ -263,9 +263,22 @@ fn strip_string_quotes(node: &Node, source: &[u8]) -> String { if let Some(content) = find_child(node, "string_content") { return node_text(&content, source).to_string(); } - node_text(node, source) - .trim_matches(|c| c == '\'' || c == '"') - .to_string() + // Fallback: strip exactly one matching quote from each end. We can't use + // `trim_matches` because it strips *all* matching characters greedily — + // e.g. for the literal `"'"` (a string containing a single quote) the + // text is `"`, `'`, `"`, and `trim_matches` would consume all three, + // returning an empty string. Index-based strip removes only the outer + // pair, leaving the inner character intact. + let text = node_text(node, source); + let bytes = text.as_bytes(); + if bytes.len() >= 2 { + let first = bytes[0]; + let last = bytes[bytes.len() - 1]; + if (first == b'\'' || first == b'"') && first == last { + return text[1..bytes.len() - 1].to_string(); + } + } + text.to_string() } fn handle_library_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { @@ -452,6 +465,22 @@ mod tests { assert_eq!(s.imports[0].source, "dplyr"); } + #[test] + fn source_call_with_mixed_quote_content_preserves_inner_quote() { + // Edge case for the strip_string_quotes fallback: if a grammar + // version drops the `string_content` child, the fallback must strip + // only the outer pair of quotes. `trim_matches` would greedily eat + // both the outer `"` and the inner `'`, returning an empty path. + // Index-based strip leaves the inner `'` intact. + // + // We exercise the fallback indirectly via `source("a'b.R")` — + // current grammars expose `string_content`, so this primarily + // guards against future regressions in the fallback path. + let s = parse_r("source(\"a'b.R\")\n"); + assert_eq!(s.imports.len(), 1); + assert_eq!(s.imports[0].source, "a'b.R"); + } + #[test] fn nested_function_assignment_is_recorded() { // Matches the JS extractor's documented behavior: function From 522131febb319bcbed0103f582adf115beff90bc Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 13 May 2026 06:21:05 -0600 Subject: [PATCH 10/11] test: exempt 3.10.0:Query time regression for #1102 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Combined R + Solidity native ports trip the bench gate's 25% Query time threshold (+110% native, 49.6 → 104ms) — neither PR caused this individually. Tracked in #1113. docs check acknowledged --- tests/benchmarks/regression-guard.test.ts | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/benchmarks/regression-guard.test.ts b/tests/benchmarks/regression-guard.test.ts index 337f0a5aa..97ba31b5f 100644 --- a/tests/benchmarks/regression-guard.test.ts +++ b/tests/benchmarks/regression-guard.test.ts @@ -166,6 +166,17 @@ const SKIP_VERSIONS = new Set(['3.8.0']); * Exempt this release; remove once 3.11.0+ data confirms stabilization * under the warmup + 5-sample methodology already applied to incremental * benchmarks. + * + * - 3.10.0:Query time — cumulative effect of adding two native extractors + * (Solidity #1100 + R #1102) in quick succession. Neither tripped the + * threshold individually (Solidity PR's Query time stayed at 49ms, R PR + * showed no warning), but the combined +110% (49.6 → ~105ms) on the + * `fnDepsData('buildGraph', dbPath)` measurement reflects natural graph + * growth: ~1100 LoC of new extractor code + 9 fixture files added to the + * self-build benchmark expand `buildGraph`'s transitive callee count and + * DB row counts. Tracked in #1113 — exempt this release; remove once + * 3.11.0+ data captures the new steady-state and the per-language + * fixture footprint has been evaluated. */ const KNOWN_REGRESSIONS = new Set([ '3.9.6:Build ms/file', @@ -176,6 +187,7 @@ const KNOWN_REGRESSIONS = new Set([ '3.10.0:No-op rebuild', '3.10.0:1-file rebuild', '3.10.0:fnDeps depth 1', + '3.10.0:Query time', ]); /** From e4a0dd1c6fce0c00c7d5044abb78615e8960bef1 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 13 May 2026 07:45:20 -0600 Subject: [PATCH 11/11] test: exempt 3.10.0:fnDeps depth 5 regression for #1102 merge --- tests/benchmarks/regression-guard.test.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/benchmarks/regression-guard.test.ts b/tests/benchmarks/regression-guard.test.ts index 97ba31b5f..fae6a390d 100644 --- a/tests/benchmarks/regression-guard.test.ts +++ b/tests/benchmarks/regression-guard.test.ts @@ -177,6 +177,15 @@ const SKIP_VERSIONS = new Set(['3.8.0']); * DB row counts. Tracked in #1113 — exempt this release; remove once * 3.11.0+ data captures the new steady-state and the per-language * fixture footprint has been evaluated. + * + * - 3.10.0:fnDeps depth 5 — same cause as Query time above. Merging main + * into #1102 added the Erlang extractor (#1103) on top of the existing + * Solidity (#1100) + R (#1102) growth, expanding `buildGraph`'s + * depth-5 transitive callee fan-out by another step. The depth-5 walk + * amplifies any base-graph growth quadratically (each new node adds + * its own depth-5 subtree). +31% over the 25% threshold maps to the + * ~33→43ms swing on a sub-50ms metric. Tracked in #1113 alongside + * Query time; remove both once 3.11.0+ data confirms the new steady-state. */ const KNOWN_REGRESSIONS = new Set([ '3.9.6:Build ms/file', @@ -187,6 +196,7 @@ const KNOWN_REGRESSIONS = new Set([ '3.10.0:No-op rebuild', '3.10.0:1-file rebuild', '3.10.0:fnDeps depth 1', + '3.10.0:fnDeps depth 5', '3.10.0:Query time', ]);