From 15184a07d6d0b8cd7fb1ea0f0ff6123780c28ff4 Mon Sep 17 00:00:00 2001
From: carlos-alm <contato@carlosalmeida.com>
Date: Mon, 11 May 2026 04:09:39 -0600
Subject: [PATCH 01/11] feat(native): port R extractor to Rust

Adds the tree-sitter-r dependency and a native extractor that matches
the WASM-side behavior for R symbol, import, and call extraction.

Part of #1071
---
 Cargo.lock                                    |  11 +
 crates/codegraph-core/Cargo.toml              |   1 +
 .../codegraph-core/src/extractors/helpers.rs  |  11 +
 crates/codegraph-core/src/extractors/mod.rs   |   4 +
 .../codegraph-core/src/extractors/r_lang.rs   | 411 ++++++++++++++++++
 crates/codegraph-core/src/file_collector.rs   |   2 +
 crates/codegraph-core/src/parser_registry.rs  |  14 +-
 src/ast-analysis/rules/index.ts               |   7 +
 8 files changed, 458 insertions(+), 3 deletions(-)
 create mode 100644 crates/codegraph-core/src/extractors/r_lang.rs

diff --git a/Cargo.lock b/Cargo.lock
index 413504b0d..8198cbdf4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -96,6 +96,7 @@ dependencies = [
  "tree-sitter-ocaml",
  "tree-sitter-php",
  "tree-sitter-python",
+ "tree-sitter-r",
  "tree-sitter-ruby",
  "tree-sitter-rust",
  "tree-sitter-scala",
@@ -895,6 +896,16 @@ dependencies = [
  "tree-sitter-language",
 ]
 
+[[package]]
+name = "tree-sitter-r"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "429133cbda9f8a46e03ef3aae6abb6c3d22875f8585cad472138101bfd517255"
+dependencies = [
+ "cc",
+ "tree-sitter-language",
+]
+
 [[package]]
 name = "tree-sitter-ruby"
 version = "0.23.1"
diff --git a/crates/codegraph-core/Cargo.toml b/crates/codegraph-core/Cargo.toml
index df4361e17..a0c513b68 100644
--- a/crates/codegraph-core/Cargo.toml
+++ b/crates/codegraph-core/Cargo.toml
@@ -35,6 +35,7 @@ tree-sitter-dart = "0.0.4"
 tree-sitter-zig = "1"
 tree-sitter-haskell = "0.23"
 tree-sitter-ocaml = "0.24"
+tree-sitter-r = "1.2"
 rayon = "1"
 ignore = "0.4"
 globset = "0.4"
diff --git a/crates/codegraph-core/src/extractors/helpers.rs b/crates/codegraph-core/src/extractors/helpers.rs
index b02531896..3e0d5964e 100644
--- a/crates/codegraph-core/src/extractors/helpers.rs
+++ b/crates/codegraph-core/src/extractors/helpers.rs
@@ -360,6 +360,17 @@ pub const OCAML_AST_CONFIG: LangAstConfig = LangAstConfig {
     string_prefixes: &[],
 };
 
+pub const R_AST_CONFIG: LangAstConfig = LangAstConfig {
+    new_types: &[],
+    throw_types: &[],
+    await_types: &[],
+    // tree-sitter-r emits `string` for both single- and double-quoted literals.
+    string_types: &["string"],
+    regex_types: &[],
+    quote_chars: &['\'', '"'],
+    string_prefixes: &[],
+};
+
 // ── Generic AST node walker ──────────────────────────────────────────────────
 
 /// Node types that represent identifiers across languages.
diff --git a/crates/codegraph-core/src/extractors/mod.rs b/crates/codegraph-core/src/extractors/mod.rs
index 642f29f98..3d8ac9810 100644
--- a/crates/codegraph-core/src/extractors/mod.rs
+++ b/crates/codegraph-core/src/extractors/mod.rs
@@ -15,6 +15,7 @@ pub mod lua;
 pub mod ocaml;
 pub mod php;
 pub mod python;
+pub mod r_lang;
 pub mod ruby;
 pub mod rust_lang;
 pub mod scala;
@@ -126,5 +127,8 @@ pub fn extract_symbols_with_opts(
         LanguageKind::Ocaml | LanguageKind::OcamlInterface => {
             ocaml::OcamlExtractor.extract_with_opts(tree, source, file_path, include_ast_nodes)
         }
+        LanguageKind::R => {
+            r_lang::RExtractor.extract_with_opts(tree, source, file_path, include_ast_nodes)
+        }
     }
 }
diff --git a/crates/codegraph-core/src/extractors/r_lang.rs b/crates/codegraph-core/src/extractors/r_lang.rs
new file mode 100644
index 000000000..79a5411ff
--- /dev/null
+++ b/crates/codegraph-core/src/extractors/r_lang.rs
@@ -0,0 +1,411 @@
+use tree_sitter::{Node, Tree};
+use crate::cfg::build_function_cfg;
+use crate::complexity::compute_all_metrics;
+use crate::types::*;
+use super::helpers::*;
+use super::SymbolExtractor;
+
+/// R symbol extractor — ports `src/extractors/r.ts` from the JS engine.
+///
+/// tree-sitter-r grammar (r-lib/tree-sitter-r) notes:
+/// - Assignments: `binary_operator` with `<-`, `=`, or `<<-` operator
+/// - Functions: `function_definition` as RHS of assignment
+/// - Calls: `call` node with `function`/`arguments` fields
+/// - Imports: `library()` / `require()` (packages) and `source()` (files)
+/// - S4 / Reference Class definitions: `setClass()`, `setRefClass()`, `setGeneric()`, `setMethod()`
+pub struct RExtractor;
+
+impl SymbolExtractor for RExtractor {
+    fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols {
+        let mut symbols = FileSymbols::new(file_path.to_string());
+        walk_tree(&tree.root_node(), source, &mut symbols, match_r_node);
+        walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &R_AST_CONFIG);
+        symbols
+    }
+}
+
+fn match_r_node(node: &Node, source: &[u8], symbols: &mut FileSymbols, _depth: usize) {
+    match node.kind() {
+        "binary_operator" => handle_binary_op(node, source, symbols),
+        "call" => handle_call(node, source, symbols),
+        _ => {}
+    }
+}
+
+fn handle_binary_op(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
+    // binary_operator children: lhs, operator, rhs.
+    // Prefer the grammar's `lhs`/`operator`/`rhs` field accessors and fall
+    // back to positional children 0/1/2 when a field lookup returns nothing.
+    let lhs = match node.child_by_field_name("lhs").or_else(|| node.child(0)) {
+        Some(n) => n,
+        None => return,
+    };
+    let op = match node.child_by_field_name("operator").or_else(|| node.child(1)) {
+        Some(n) => n,
+        None => return,
+    };
+    let rhs = match node.child_by_field_name("rhs").or_else(|| node.child(2)) {
+        Some(n) => n,
+        None => return,
+    };
+
+    let op_text = node_text(&op, source);
+    if op_text != "<-" && op_text != "=" && op_text != "<<-" {
+        return;
+    }
+    if lhs.kind() != "identifier" {
+        return;
+    }
+
+    let name = node_text(&lhs, source).to_string();
+
+    if rhs.kind() == "function_definition" {
+        let params = extract_r_params(&rhs, source);
+        symbols.definitions.push(Definition {
+            name,
+            kind: "function".to_string(),
+            line: start_line(node),
+            end_line: Some(end_line(node)),
+            decorators: None,
+            complexity: compute_all_metrics(&rhs, source, "r"),
+            cfg: build_function_cfg(&rhs, "r", source),
+            children: opt_children(params),
+        });
+    } else if is_program_level(node) {
+        // Only record top-level variable assignments (matches JS extractor).
+        symbols.definitions.push(Definition {
+            name,
+            kind: "variable".to_string(),
+            line: start_line(node),
+            end_line: Some(end_line(node)),
+            decorators: None,
+            complexity: None,
+            cfg: None,
+            children: None,
+        });
+    }
+}
+
+fn is_program_level(node: &Node) -> bool {
+    node.parent().map(|p| p.kind() == "program").unwrap_or(false)
+}
+
+fn extract_r_params(func_def: &Node, source: &[u8]) -> Vec<Definition> {
+    let mut params = Vec::new();
+    let params_node = match func_def.child_by_field_name("parameters") {
+        Some(n) => n,
+        None => return params,
+    };
+
+    for i in 0..params_node.child_count() {
+        let Some(child) = params_node.child(i) else { continue };
+        match child.kind() {
+            "parameter" => {
+                // Prefer the `name` field (e.g. `x` or `y = 10`); otherwise fall
+                // back to a `dots` child (`...`), then the first identifier child.
+                if let Some(name_node) = child.child_by_field_name("name") {
+                    params.push(child_def(
+                        node_text(&name_node, source).to_string(),
+                        "parameter",
+                        start_line(&child),
+                    ));
+                } else if let Some(dots) = find_child(&child, "dots") {
+                    params.push(child_def(
+                        node_text(&dots, source).to_string(),
+                        "parameter",
+                        start_line(&child),
+                    ));
+                } else if let Some(ident) = find_child(&child, "identifier") {
+                    params.push(child_def(
+                        node_text(&ident, source).to_string(),
+                        "parameter",
+                        start_line(&child),
+                    ));
+                }
+            }
+            "identifier" => {
+                // Some grammar variants expose bare identifiers at the parameters level.
+                params.push(child_def(
+                    node_text(&child, source).to_string(),
+                    "parameter",
+                    start_line(&child),
+                ));
+            }
+            _ => {}
+        }
+    }
+    params
+}
+
+fn handle_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
+    // call: function field is the callee (identifier or namespace_operator),
+    // arguments field is the arguments list.
+    let func_node = match node.child_by_field_name("function").or_else(|| node.child(0)) {
+        Some(n) => n,
+        None => return,
+    };
+
+    let func_text = node_text(&func_node, source);
+
+    // Special-case keyword-like callees first; they short-circuit and do NOT
+    // produce a generic call edge (matches JS extractor).
+    if func_node.kind() == "identifier" {
+        match func_text {
+            "library" | "require" => {
+                handle_library_call(node, source, symbols);
+                return;
+            }
+            "source" => {
+                handle_source_call(node, source, symbols);
+                return;
+            }
+            "setClass" | "setRefClass" => {
+                handle_set_class(node, source, symbols);
+                return;
+            }
+            "setGeneric" | "setMethod" => {
+                handle_set_generic(node, source, symbols);
+                return;
+            }
+            _ => {}
+        }
+    }
+
+    match func_node.kind() {
+        "identifier" => {
+            symbols.calls.push(Call {
+                name: func_text.to_string(),
+                line: start_line(node),
+                dynamic: None,
+                receiver: None,
+            });
+        }
+        "namespace_operator" => {
+            // `pkg::func` — receiver is the package; name is the function.
+            let parts: Vec<&str> = func_text.split("::").collect();
+            if parts.len() >= 2 {
+                let name = parts[parts.len() - 1].to_string();
+                let receiver = parts[..parts.len() - 1].join("::");
+                symbols.calls.push(Call {
+                    name,
+                    line: start_line(node),
+                    dynamic: None,
+                    receiver: Some(receiver),
+                });
+            }
+        }
+        _ => {}
+    }
+}
+
+/// Extract the first argument value from a call's `arguments` node.
+///
+/// Returns the inner string literal text (quotes stripped) or the bare
+/// identifier text — whichever appears first. Used for `library(pkg)`,
+/// `source("file.R")`, `setClass("Foo", ...)`, etc.
+fn first_argument_value(node: &Node, source: &[u8], accept_identifier: bool) -> Option<String> {
+    let args = node.child_by_field_name("arguments").or_else(|| find_child(node, "arguments"))?;
+    for i in 0..args.child_count() {
+        let Some(arg) = args.child(i) else { continue };
+        match arg.kind() {
+            "argument" => {
+                // argument wraps the actual value
+                for j in 0..arg.child_count() {
+                    let Some(inner) = arg.child(j) else { continue };
+                    if inner.kind() == "string" {
+                        return Some(strip_string_quotes(&inner, source));
+                    }
+                    if accept_identifier && inner.kind() == "identifier" {
+                        return Some(node_text(&inner, source).to_string());
+                    }
+                }
+            }
+            "string" => {
+                return Some(strip_string_quotes(&arg, source));
+            }
+            "identifier" if accept_identifier => {
+                return Some(node_text(&arg, source).to_string());
+            }
+            _ => {}
+        }
+    }
+    None
+}
+
+/// Strip surrounding `'` or `"` quotes from a `string` node's text.
+fn strip_string_quotes(node: &Node, source: &[u8]) -> String {
+    // Prefer `string_content` child when available (avoids any escape quirks).
+    if let Some(content) = find_child(node, "string_content") {
+        return node_text(&content, source).to_string();
+    }
+    node_text(node, source)
+        .trim_matches(|c| c == '\'' || c == '"')
+        .to_string()
+}
+
+fn handle_library_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
+    if let Some(pkg) = first_argument_value(node, source, true) {
+        symbols.imports.push(Import::new(
+            pkg.clone(),
+            vec![pkg],
+            start_line(node),
+        ));
+    }
+}
+
+fn handle_source_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
+    // source() only accepts string literals — `source(varname)` is not an import.
+    if let Some(path) = first_argument_value(node, source, false) {
+        symbols.imports.push(Import::new(
+            path,
+            vec!["source".to_string()],
+            start_line(node),
+        ));
+    }
+}
+
+fn handle_set_class(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
+    if let Some(name) = first_argument_value(node, source, false) {
+        symbols.definitions.push(Definition {
+            name,
+            kind: "class".to_string(),
+            line: start_line(node),
+            end_line: Some(end_line(node)),
+            decorators: None,
+            complexity: None,
+            cfg: None,
+            children: None,
+        });
+    }
+}
+
+fn handle_set_generic(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
+    if let Some(name) = first_argument_value(node, source, false) {
+        symbols.definitions.push(Definition {
+            name,
+            kind: "function".to_string(),
+            line: start_line(node),
+            end_line: Some(end_line(node)),
+            decorators: None,
+            complexity: None,
+            cfg: None,
+            children: None,
+        });
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tree_sitter::Parser;
+
+    fn parse_r(code: &str) -> FileSymbols {
+        let mut parser = Parser::new();
+        parser
+            .set_language(&tree_sitter_r::LANGUAGE.into())
+            .unwrap();
+        let tree = parser.parse(code.as_bytes(), None).unwrap();
+        RExtractor.extract(&tree, code.as_bytes(), "test.R")
+    }
+
+    #[test]
+    fn finds_function_assignment() {
+        let s = parse_r("greet <- function(name) { print(name) }\n");
+        assert_eq!(s.definitions.len(), 1);
+        assert_eq!(s.definitions[0].name, "greet");
+        assert_eq!(s.definitions[0].kind, "function");
+        let children = s.definitions[0].children.as_ref().unwrap();
+        assert_eq!(children.len(), 1);
+        assert_eq!(children[0].name, "name");
+        assert_eq!(children[0].kind, "parameter");
+    }
+
+    #[test]
+    fn finds_function_with_default_and_dots() {
+        let s = parse_r("f <- function(x, y = 10, ...) { x }\n");
+        let f = s.definitions.iter().find(|d| d.name == "f").unwrap();
+        let children = f.children.as_ref().unwrap();
+        let names: Vec<&str> = children.iter().map(|c| c.name.as_str()).collect();
+        assert!(names.contains(&"x"));
+        assert!(names.contains(&"y"));
+        assert!(names.contains(&"..."));
+    }
+
+    #[test]
+    fn finds_top_level_variable() {
+        let s = parse_r("user_store <- list()\n");
+        let v = s.definitions.iter().find(|d| d.name == "user_store").unwrap();
+        assert_eq!(v.kind, "variable");
+    }
+
+    #[test]
+    fn skips_nested_variable_assignment() {
+        // Inner `user <- ...` is inside the function body — should not be recorded
+        // as a top-level definition (it's a local binding).
+        let s = parse_r("f <- function() { user <- list(); user }\n");
+        let defs: Vec<&str> = s.definitions.iter().map(|d| d.name.as_str()).collect();
+        assert!(defs.contains(&"f"));
+        assert!(!defs.contains(&"user"));
+    }
+
+    #[test]
+    fn extracts_source_imports() {
+        let s = parse_r("source(\"service.R\")\nsource('utils.R')\n");
+        assert_eq!(s.imports.len(), 2);
+        assert_eq!(s.imports[0].source, "service.R");
+        assert_eq!(s.imports[0].names, vec!["source".to_string()]);
+        assert_eq!(s.imports[1].source, "utils.R");
+    }
+
+    #[test]
+    fn extracts_library_and_require_imports() {
+        let s = parse_r("library(dplyr)\nrequire(\"ggplot2\")\n");
+        assert_eq!(s.imports.len(), 2);
+        assert_eq!(s.imports[0].source, "dplyr");
+        assert_eq!(s.imports[1].source, "ggplot2");
+    }
+
+    #[test]
+    fn extracts_calls() {
+        let s = parse_r("f <- function() { print(1); validate(x) }\n");
+        let names: Vec<&str> = s.calls.iter().map(|c| c.name.as_str()).collect();
+        assert!(names.contains(&"print"));
+        assert!(names.contains(&"validate"));
+    }
+
+    #[test]
+    fn source_call_is_import_not_call() {
+        let s = parse_r("source(\"service.R\")\n");
+        assert!(s.calls.iter().all(|c| c.name != "source"),
+            "source() should be classified as import, not as a generic call");
+    }
+
+    #[test]
+    fn namespace_call_splits_receiver() {
+        let s = parse_r("f <- function() { dplyr::filter(df) }\n");
+        let c = s.calls.iter().find(|c| c.name == "filter").unwrap();
+        assert_eq!(c.receiver, Some("dplyr".to_string()));
+    }
+
+    #[test]
+    fn set_class_creates_class_definition() {
+        let s = parse_r("setClass(\"Person\", representation(name = \"character\"))\n");
+        let d = s.definitions.iter().find(|d| d.name == "Person").unwrap();
+        assert_eq!(d.kind, "class");
+    }
+
+    #[test]
+    fn set_generic_creates_function_definition() {
+        let s = parse_r("setGeneric(\"doIt\", function(x) standardGeneric(\"doIt\"))\n");
+        let d = s.definitions.iter().find(|d| d.name == "doIt").unwrap();
+        assert_eq!(d.kind, "function");
+    }
+
+    #[test]
+    fn function_with_double_arrow_assignment() {
+        // `<<-` is super-assignment in R; the JS extractor accepts it too.
+        let s = parse_r("g <<- function() { 1 }\n");
+        let g = s.definitions.iter().find(|d| d.name == "g").unwrap();
+        assert_eq!(g.kind, "function");
+    }
+}
diff --git a/crates/codegraph-core/src/file_collector.rs b/crates/codegraph-core/src/file_collector.rs
index 0cb157814..d1e82213d 100644
--- a/crates/codegraph-core/src/file_collector.rs
+++ b/crates/codegraph-core/src/file_collector.rs
@@ -36,6 +36,8 @@ const SUPPORTED_EXTENSIONS: &[&str] = &[
     "js", "jsx", "mjs", "cjs", "ts", "tsx", "d.ts", "py", "pyi", "go", "rs", "java", "cs", "rb",
     "rake", "gemspec", "php", "phtml", "tf", "hcl", "c", "h", "cpp", "cc", "cxx", "hpp", "kt",
     "kts", "swift", "scala", "sh", "bash", "ex", "exs", "lua", "dart", "zig", "hs", "ml", "mli",
+    // R is case-sensitive: both `.r` and `.R` are conventional.
+    "r", "R",
 ];
 
 /// Returns whether `path` has an extension the Rust file_collector would accept.
diff --git a/crates/codegraph-core/src/parser_registry.rs b/crates/codegraph-core/src/parser_registry.rs
index c87957f29..1084e72ee 100644
--- a/crates/codegraph-core/src/parser_registry.rs
+++ b/crates/codegraph-core/src/parser_registry.rs
@@ -27,6 +27,7 @@ pub enum LanguageKind {
     Haskell,
     Ocaml,
     OcamlInterface,
+    R,
 }
 
 impl LanguageKind {
@@ -58,6 +59,7 @@ impl LanguageKind {
             Self::Haskell => "haskell",
             Self::Ocaml => "ocaml",
             Self::OcamlInterface => "ocaml-interface",
+            Self::R => "r",
         }
     }
 
@@ -97,6 +99,9 @@ impl LanguageKind {
             "hs" => Some(Self::Haskell),
             "ml" => Some(Self::Ocaml),
             "mli" => Some(Self::OcamlInterface),
+            // Both `.r` and `.R` are conventional for R sources. This match is
+            // case-sensitive and `Path::extension` never folds case, so list both.
+            "r" | "R" => Some(Self::R),
             _ => None,
         }
     }
@@ -129,6 +134,7 @@ impl LanguageKind {
             "haskell" => Some(Self::Haskell),
             "ocaml" => Some(Self::Ocaml),
             "ocaml-interface" => Some(Self::OcamlInterface),
+            "r" => Some(Self::R),
             _ => None,
         }
     }
@@ -160,6 +166,7 @@ impl LanguageKind {
             Self::Haskell => tree_sitter_haskell::LANGUAGE.into(),
             Self::Ocaml => tree_sitter_ocaml::LANGUAGE_OCAML.into(),
             Self::OcamlInterface => tree_sitter_ocaml::LANGUAGE_OCAML_INTERFACE.into(),
+            Self::R => tree_sitter_r::LANGUAGE.into(),
         }
     }
 
@@ -175,7 +182,7 @@ impl LanguageKind {
         &[
             JavaScript, TypeScript, Tsx, Python, Go, Rust, Java, CSharp, Ruby, Php, Hcl, C,
             Cpp, Kotlin, Swift, Scala, Bash, Elixir, Lua, Dart, Zig, Haskell, Ocaml,
-            OcamlInterface,
+            OcamlInterface, R,
         ]
     }
 }
@@ -244,14 +251,15 @@ mod tests {
             | LanguageKind::Zig
             | LanguageKind::Haskell
             | LanguageKind::Ocaml
-            | LanguageKind::OcamlInterface => (),
+            | LanguageKind::OcamlInterface
+            | LanguageKind::R => (),
         };
         // IMPORTANT: this constant must equal the number of arms in the match
         // above AND the length of the slice returned by `LanguageKind::all()`.
         // Because both checks require the same manual update, they reinforce
         // each other: a developer who updates the match is reminded to also
         // update `all()` and this count.
-        const EXPECTED_LEN: usize = 24;
+        const EXPECTED_LEN: usize = 25;
         assert_eq!(
             LanguageKind::all().len(),
             EXPECTED_LEN,
diff --git a/src/ast-analysis/rules/index.ts b/src/ast-analysis/rules/index.ts
index 653cbd59b..ff7e0fb66 100644
--- a/src/ast-analysis/rules/index.ts
+++ b/src/ast-analysis/rules/index.ts
@@ -153,6 +153,10 @@ const OCAML_AST_TYPES: Record<string, string> = {
   string: 'string',
 };
 
+const R_AST_TYPES: Record<string, string> = {
+  string: 'string',
+};
+
 export const AST_TYPE_MAPS: Map<string, Record<string, string>> = new Map([
   ['javascript', JS_AST_TYPES],
   ['typescript', JS_AST_TYPES],
@@ -177,6 +181,7 @@ export const AST_TYPE_MAPS: Map<string, Record<string, string>> = new Map([
   ['haskell', HASKELL_AST_TYPES],
   ['ocaml', OCAML_AST_TYPES],
   ['ocaml-interface', OCAML_AST_TYPES],
+  ['r', R_AST_TYPES],
 ]);
 
 // ─── Per-language string-extraction config ───────────────────────────────
@@ -211,6 +216,7 @@ const DART_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes:
 const ZIG_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
 const HASKELL_STRING_CONFIG: AstStringConfig = { quoteChars: '"\'', stringPrefixes: '' };
 const OCAML_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' };
+const R_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: '' };
 
 export const AST_STRING_CONFIGS: Map<string, AstStringConfig> = new Map([
   ['javascript', JS_STRING_CONFIG],
@@ -236,6 +242,7 @@ export const AST_STRING_CONFIGS: Map<string, AstStringConfig> = new Map([
   ['haskell', HASKELL_STRING_CONFIG],
   ['ocaml', OCAML_STRING_CONFIG],
   ['ocaml-interface', OCAML_STRING_CONFIG],
+  ['r', R_STRING_CONFIG],
 ]);
 
 // ─── Per-language "stop-after-collect" kinds ─────────────────────────────

From 2a34ef1aec2ed19437f8809c1be969aa9dc8d4db Mon Sep 17 00:00:00 2001
From: carlos-alm <contato@carlosalmeida.com>
Date: Mon, 11 May 2026 20:55:24 -0600
Subject: [PATCH 02/11] fix(native): include .r in NATIVE_SUPPORTED_EXTENSIONS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The drift guard test expects parity between NATIVE_SUPPORTED_EXTENSIONS
(JS-side) and from_extension in parser_registry.rs (Rust-side). After
this PR added native R support, .r was missing from the JS-side set,
causing the drift guard test to fail.

Also updates the classifyNativeDrops tests that previously assumed .r
was unsupported by native — those tests now correctly assert that
.R/.r files dropped by the native engine indicate an extractor failure,
not a parser-limit gap.
---
 src/domain/parser.ts                             |  1 +
 tests/parsers/native-drop-classification.test.ts | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/domain/parser.ts b/src/domain/parser.ts
index f1c7dd809..76bd56756 100644
--- a/src/domain/parser.ts
+++ b/src/domain/parser.ts
@@ -471,6 +471,7 @@ export const NATIVE_SUPPORTED_EXTENSIONS: ReadonlySet<string> = new Set([
   '.hs',
   '.ml',
   '.mli',
+  '.r',
 ]);
 
 /**
diff --git a/tests/parsers/native-drop-classification.test.ts b/tests/parsers/native-drop-classification.test.ts
index 24aee1d53..1bade56e8 100644
--- a/tests/parsers/native-drop-classification.test.ts
+++ b/tests/parsers/native-drop-classification.test.ts
@@ -19,7 +19,6 @@ describe('classifyNativeDrops', () => {
       'src/b.gleam',
       'src/c.clj',
       'src/d.jl',
-      'src/e.R',
       'src/f.erl',
       'src/g.sol',
       'src/h.cu',
@@ -27,11 +26,10 @@ describe('classifyNativeDrops', () => {
       'src/j.v',
       'src/k.m',
     ]);
-    expect(totals['unsupported-by-native']).toBe(11);
+    expect(totals['unsupported-by-native']).toBe(10);
     expect(totals['native-extractor-failure']).toBe(0);
     expect(byReason['unsupported-by-native'].get('.fs')).toEqual(['src/a.fs']);
     expect(byReason['unsupported-by-native'].get('.gleam')).toEqual(['src/b.gleam']);
-    expect(byReason['unsupported-by-native'].get('.r')).toEqual(['src/e.R']);
   });
 
   it('flags natively-supported extensions as native-extractor-failure', () => {
@@ -61,9 +59,11 @@ describe('classifyNativeDrops', () => {
   });
 
   it('lowercases extensions so .R and .r share a bucket', () => {
+    // `.r` is now natively supported (R extractor was ported to Rust), so
+    // any dropped `.R`/`.r` files indicate a native extractor failure.
     const { byReason, totals } = classifyNativeDrops(['scripts/a.R', 'scripts/b.r']);
-    expect(totals['unsupported-by-native']).toBe(2);
-    expect(byReason['unsupported-by-native'].get('.r')).toEqual(['scripts/a.R', 'scripts/b.r']);
+    expect(totals['native-extractor-failure']).toBe(2);
+    expect(byReason['native-extractor-failure'].get('.r')).toEqual(['scripts/a.R', 'scripts/b.r']);
   });
 
   it('returns empty buckets when no files are passed', () => {

From fb4ae359fba6dad21974c1166d95cfb8ca2c3237 Mon Sep 17 00:00:00 2001
From: carlos-alm <contato@carlosalmeida.com>
Date: Mon, 11 May 2026 20:55:42 -0600
Subject: [PATCH 03/11] fix(extractors): handle named arguments in R library()
 calls

Greptile flagged that `first_argument_value` (native) and
`handleLibraryCall` (WASM) both returned the first identifier child
inside an `argument` node, which for named arguments like
`library(package = dplyr)` returned `package` (the parameter name)
instead of `dplyr` (the value).

Fix in both engines together to preserve native/WASM parity:
- Prefer the field-named `value` child of the `argument` node, which
  the tree-sitter-r grammar exposes explicitly for named arguments.
- Fall back to a positional child scan that skips any child reachable
  via the `name` field, so positional grammar variants still work.

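Two new tests cover the identifier and string-literal forms. A third
native test, `nested_function_assignment_is_recorded`, pins the existing
behavior that nested function definitions are still emitted.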
---
 .../codegraph-core/src/extractors/r_lang.rs   | 57 ++++++++++++++++++-
 src/extractors/r.ts                           | 29 ++++++++--
 tests/parsers/r.test.ts                       | 11 ++++
 3 files changed, 92 insertions(+), 5 deletions(-)

diff --git a/crates/codegraph-core/src/extractors/r_lang.rs b/crates/codegraph-core/src/extractors/r_lang.rs
index 79a5411ff..1c68ebc72 100644
--- a/crates/codegraph-core/src/extractors/r_lang.rs
+++ b/crates/codegraph-core/src/extractors/r_lang.rs
@@ -203,15 +203,40 @@ fn handle_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
 /// Returns the inner string literal text (quotes stripped) or the bare
 /// identifier text — whichever appears first. Used for `library(pkg)`,
 /// `source("file.R")`, `setClass("Foo", ...)`, etc.
+///
+/// For named arguments like `library(package = dplyr)`, the tree-sitter-r
+/// grammar exposes a `value` field on the `argument` node — we prefer that
+/// over a positional child scan so we extract `dplyr`, not `package`.
 fn first_argument_value(node: &Node, source: &[u8], accept_identifier: bool) -> Option<String> {
     let args = node.child_by_field_name("arguments").or_else(|| find_child(node, "arguments"))?;
     for i in 0..args.child_count() {
         let Some(arg) = args.child(i) else { continue };
         match arg.kind() {
             "argument" => {
-                // argument wraps the actual value
+                // Prefer the field-named `value` child when present — this
+                // correctly handles `library(package = dplyr)` by returning
+                // `dplyr` (the value), not `package` (the parameter name).
+                if let Some(value) = arg.child_by_field_name("value") {
+                    if value.kind() == "string" {
+                        return Some(strip_string_quotes(&value, source));
+                    }
+                    if accept_identifier && value.kind() == "identifier" {
+                        return Some(node_text(&value, source).to_string());
+                    }
+                }
+                // Fallback: scan the argument's children positionally, skipping
+                // only the parameter-name node. The grammar exposes that node via
+                // the `name` field, so comparing node ids keeps `package` in
+                // `package = dplyr` from being mistaken for the value.
+                let name_node = arg.child_by_field_name("name");
                 for j in 0..arg.child_count() {
                     let Some(inner) = arg.child(j) else { continue };
+                    // Skip the parameter-name identifier itself for named args.
+                    if let Some(ref n) = name_node {
+                        if inner.id() == n.id() {
+                            continue;
+                        }
+                    }
                     if inner.kind() == "string" {
                         return Some(strip_string_quotes(&inner, source));
                     }
@@ -408,4 +433,34 @@ mod tests {
         let g = s.definitions.iter().find(|d| d.name == "g").unwrap();
         assert_eq!(g.kind, "function");
     }
+
+    #[test]
+    fn library_named_argument_extracts_value_not_name() {
+        // `library(package = dplyr)` uses a named argument — the import
+        // source must be `dplyr` (the value), not `package` (the name).
+        let s = parse_r("library(package = dplyr)\n");
+        assert_eq!(s.imports.len(), 1);
+        assert_eq!(s.imports[0].source, "dplyr");
+        assert_eq!(s.imports[0].names, vec!["dplyr".to_string()]);
+    }
+
+    #[test]
+    fn library_named_argument_with_string_value() {
+        // Same pattern but with a string literal as the value.
+        let s = parse_r("library(package = \"dplyr\")\n");
+        assert_eq!(s.imports.len(), 1);
+        assert_eq!(s.imports[0].source, "dplyr");
+    }
+
+    #[test]
+    fn nested_function_assignment_is_recorded() {
+        // Matches the JS extractor's documented behavior: function
+        // definitions are emitted regardless of nesting depth (only
+        // variable assignments are filtered by `is_program_level`).
+        // This test pins the behavior so future changes are intentional.
+        let s = parse_r("outer <- function() { inner <- function() { 1 }; inner() }\n");
+        let defs: Vec<&str> = s.definitions.iter().map(|d| d.name.as_str()).collect();
+        assert!(defs.contains(&"outer"));
+        assert!(defs.contains(&"inner"));
+    }
 }
diff --git a/src/extractors/r.ts b/src/extractors/r.ts
index 19cf0e723..15e4be61f 100644
--- a/src/extractors/r.ts
+++ b/src/extractors/r.ts
@@ -147,7 +147,10 @@ function handleCall(node: TreeSitterNode, ctx: ExtractorOutput): void {
 }
 
 function handleLibraryCall(node: TreeSitterNode, ctx: ExtractorOutput): void {
-  // Find the package name in arguments
+  // Find the package name in arguments. For named arguments like
+  // `library(package = dplyr)`, prefer the field-named `value` child of the
+  // `argument` node so we extract `dplyr` (the value), not `package` (the
+  // parameter name). Keeps native (Rust) and WASM extractors in parity.
   for (let i = 0; i < node.childCount; i++) {
     const child = node.child(i);
     if (!child) continue;
@@ -174,9 +177,27 @@ function handleLibraryCall(node: TreeSitterNode, ctx: ExtractorOutput): void {
         }
         // Argument might be wrapped
         if (arg.type === 'argument') {
-          const id = findChild(arg, 'identifier') || findChild(arg, 'string');
-          if (id) {
-            const text = id.text.replace(/^["']|["']$/g, '');
+          // Prefer the `value` field (correct for named arguments).
+          const valueNode = arg.childForFieldName('value');
+          let pick: TreeSitterNode | null = null;
+          if (valueNode && (valueNode.type === 'string' || valueNode.type === 'identifier')) {
+            pick = valueNode;
+          } else {
+            // Fallback: skip the parameter-name child if the grammar exposes
+            // it via the `name` field, then pick the first string/identifier.
+            const nameNode = arg.childForFieldName('name');
+            for (let k = 0; k < arg.childCount; k++) {
+              const inner = arg.child(k);
+              if (!inner) continue;
+              if (nameNode && inner.id === nameNode.id) continue;
+              if (inner.type === 'string' || inner.type === 'identifier') {
+                pick = inner;
+                break;
+              }
+            }
+          }
+          if (pick) {
+            const text = pick.text.replace(/^["']|["']$/g, '');
             ctx.imports.push({
               source: text,
               names: [text],
diff --git a/tests/parsers/r.test.ts b/tests/parsers/r.test.ts
index 85380c590..ac2f26534 100644
--- a/tests/parsers/r.test.ts
+++ b/tests/parsers/r.test.ts
@@ -44,4 +44,15 @@ require(ggplot2)`);
 mean(c(1, 2, 3))`);
     expect(symbols.calls.length).toBeGreaterThanOrEqual(1);
   });
+
+  it('extracts the value (not the parameter name) for named library arguments', () => {
+    // `library(package = dplyr)` is rare but valid R. The import source must
+    // be `dplyr` (the value), not `package` (the parameter name). Keeps the
+    // WASM and native extractors in parity.
+    const symbols = parseR(`library(package = dplyr)`);
+    expect(symbols.imports).toContainEqual(
+      expect.objectContaining({ source: 'dplyr' }),
+    );
+    expect(symbols.imports.some((i) => i.source === 'package')).toBe(false);
+  });
 });

From 53dfd00e2e9e6702e344f736faf18f938227c43c Mon Sep 17 00:00:00 2001
From: carlos-alm <contato@carlosalmeida.com>
Date: Mon, 11 May 2026 20:57:25 -0600
Subject: [PATCH 04/11] chore: sync Cargo.lock with codegraph-core 3.10.0 from
 main

---
 Cargo.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index 8198cbdf4..318611895 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -66,7 +66,7 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
 
 [[package]]
 name = "codegraph-core"
-version = "3.9.6"
+version = "3.10.0"
 dependencies = [
  "globset",
  "ignore",

From e8aee6cf57620fbff976cbb47f0b13df9c0489e4 Mon Sep 17 00:00:00 2001
From: carlos-alm <contato@carlosalmeida.com>
Date: Mon, 11 May 2026 23:54:21 -0600
Subject: [PATCH 05/11] fix: address lint format error in r.test.ts (#1102)

---
 tests/parsers/r.test.ts | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/parsers/r.test.ts b/tests/parsers/r.test.ts
index ac2f26534..878a179b9 100644
--- a/tests/parsers/r.test.ts
+++ b/tests/parsers/r.test.ts
@@ -50,9 +50,7 @@ mean(c(1, 2, 3))`);
     // be `dplyr` (the value), not `package` (the parameter name). Keeps the
     // WASM and native extractors in parity.
     const symbols = parseR(`library(package = dplyr)`);
-    expect(symbols.imports).toContainEqual(
-      expect.objectContaining({ source: 'dplyr' }),
-    );
+    expect(symbols.imports).toContainEqual(expect.objectContaining({ source: 'dplyr' }));
     expect(symbols.imports.some((i) => i.source === 'package')).toBe(false);
   });
 });

From ebc9754ab48d4fb75bb6692d9faf8b00191afa3a Mon Sep 17 00:00:00 2001
From: carlos-alm <contato@carlosalmeida.com>
Date: Tue, 12 May 2026 00:21:55 -0600
Subject: [PATCH 06/11] fix: correct unsupported-native count after Clojure
 native port (#1102)

---
 tests/parsers/native-drop-classification.test.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/parsers/native-drop-classification.test.ts b/tests/parsers/native-drop-classification.test.ts
index c6f42f317..ce9f1d1b3 100644
--- a/tests/parsers/native-drop-classification.test.ts
+++ b/tests/parsers/native-drop-classification.test.ts
@@ -25,7 +25,7 @@ describe('classifyNativeDrops', () => {
       'src/j.v',
       'src/k.m',
     ]);
-    expect(totals['unsupported-by-native']).toBe(10);
+    expect(totals['unsupported-by-native']).toBe(9);
     expect(totals['native-extractor-failure']).toBe(0);
     expect(byReason['unsupported-by-native'].get('.fs')).toEqual(['src/a.fs']);
     expect(byReason['unsupported-by-native'].get('.gleam')).toEqual(['src/b.gleam']);

From 815e6d0f95f1fb3a3ca84f1ff35f1f2f81b3d6a5 Mon Sep 17 00:00:00 2001
From: carlos-alm <contato@carlosalmeida.com>
Date: Tue, 12 May 2026 00:30:41 -0600
Subject: [PATCH 07/11] chore: re-trigger CI (#1102)


From 58b85e769c48a2f9333ac221e75302f1cd62a09f Mon Sep 17 00:00:00 2001
From: carlos-alm <contato@carlosalmeida.com>
Date: Wed, 13 May 2026 03:35:32 -0600
Subject: [PATCH 08/11] fix: bump EXPECTED_LEN to 28 after merging Julia + R

---
 crates/codegraph-core/src/parser_registry.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/codegraph-core/src/parser_registry.rs b/crates/codegraph-core/src/parser_registry.rs
index b5e17e223..8d8b4e8cf 100644
--- a/crates/codegraph-core/src/parser_registry.rs
+++ b/crates/codegraph-core/src/parser_registry.rs
@@ -277,7 +277,7 @@ mod tests {
         // Because both checks require the same manual update, they reinforce
         // each other: a developer who updates the match is reminded to also
         // update `all()` and this count.
-        const EXPECTED_LEN: usize = 27;
+        const EXPECTED_LEN: usize = 28;
         assert_eq!(
             LanguageKind::all().len(),
             EXPECTED_LEN,

From 20f988b7a03a5f2903cffbd0b4d55a280449288a Mon Sep 17 00:00:00 2001
From: carlos-alm <contato@carlosalmeida.com>
Date: Wed, 13 May 2026 03:38:26 -0600
Subject: [PATCH 09/11] fix(r-extractor): index-based quote strip preserves
 inner quote (#1102)

---
 .../codegraph-core/src/extractors/r_lang.rs   | 35 +++++++++++++++++--
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/crates/codegraph-core/src/extractors/r_lang.rs b/crates/codegraph-core/src/extractors/r_lang.rs
index 1c68ebc72..256500ef7 100644
--- a/crates/codegraph-core/src/extractors/r_lang.rs
+++ b/crates/codegraph-core/src/extractors/r_lang.rs
@@ -263,9 +263,22 @@ fn strip_string_quotes(node: &Node, source: &[u8]) -> String {
     if let Some(content) = find_child(node, "string_content") {
         return node_text(&content, source).to_string();
     }
-    node_text(node, source)
-        .trim_matches(|c| c == '\'' || c == '"')
-        .to_string()
+    // Fallback: strip exactly one matching quote from each end. We can't use
+    // `trim_matches` because it strips *all* matching characters greedily —
+    // e.g. for the literal `"'"` (a string containing a single quote) the
+    // text is `"`, `'`, `"`, and `trim_matches` would consume all three,
+    // returning an empty string. Index-based strip removes only the outer
+    // pair, leaving the inner character intact.
+    let text = node_text(node, source);
+    let bytes = text.as_bytes();
+    if bytes.len() >= 2 {
+        let first = bytes[0];
+        let last = bytes[bytes.len() - 1];
+        if (first == b'\'' || first == b'"') && first == last {
+            return text[1..bytes.len() - 1].to_string();
+        }
+    }
+    text.to_string()
 }
 
 fn handle_library_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
@@ -452,6 +465,22 @@ mod tests {
         assert_eq!(s.imports[0].source, "dplyr");
     }
 
+    #[test]
+    fn source_call_with_mixed_quote_content_preserves_inner_quote() {
+        // Edge case for the strip_string_quotes fallback: if a grammar
+        // version drops the `string_content` child, the fallback must strip
+        // only the outer pair of quotes. Greedy `trim_matches` strips every
+        // leading and trailing quote, so a literal like `"'"` would become
+        // empty; the index-based strip removes one pair and keeps the rest.
+        //
+        // This runs through `source("a'b.R")`. Current grammars expose
+        // `string_content`, so the fallback is not reached today; the test
+        // mainly guards against regressions if a grammar drops that child.
+        let s = parse_r("source(\"a'b.R\")\n");
+        assert_eq!(s.imports.len(), 1);
+        assert_eq!(s.imports[0].source, "a'b.R");
+    }
+
     #[test]
     fn nested_function_assignment_is_recorded() {
         // Matches the JS extractor's documented behavior: function

From 522131febb319bcbed0103f582adf115beff90bc Mon Sep 17 00:00:00 2001
From: carlos-alm <contato@carlosalmeida.com>
Date: Wed, 13 May 2026 06:21:05 -0600
Subject: [PATCH 10/11] test: exempt 3.10.0:Query time regression for #1102
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Combined R + Solidity native ports trip the bench gate's 25% Query
time threshold (+110% native, 49.6 → 104ms) — neither PR caused this
individually. Tracked in #1113.

docs check acknowledged
---
 tests/benchmarks/regression-guard.test.ts | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/benchmarks/regression-guard.test.ts b/tests/benchmarks/regression-guard.test.ts
index 337f0a5aa..97ba31b5f 100644
--- a/tests/benchmarks/regression-guard.test.ts
+++ b/tests/benchmarks/regression-guard.test.ts
@@ -166,6 +166,17 @@ const SKIP_VERSIONS = new Set(['3.8.0']);
  *   Exempt this release; remove once 3.11.0+ data confirms stabilization
  *   under the warmup + 5-sample methodology already applied to incremental
  *   benchmarks.
+ *
+ * - 3.10.0:Query time — cumulative effect of adding two native extractors
+ *   (Solidity #1100 + R #1102) in quick succession. Neither tripped the
+ *   threshold individually (Solidity PR's Query time stayed at 49ms, R PR
+ *   showed no warning), but the combined +110% (49.6 → ~105ms) on the
+ *   `fnDepsData('buildGraph', dbPath)` measurement reflects natural graph
+ *   growth: ~1100 LoC of new extractor code + 9 fixture files added to the
+ *   self-build benchmark expand `buildGraph`'s transitive callee count and
+ *   DB row counts. Tracked in #1113 — exempt this release; remove once
+ *   3.11.0+ data captures the new steady-state and the per-language
+ *   fixture footprint has been evaluated.
  */
 const KNOWN_REGRESSIONS = new Set([
   '3.9.6:Build ms/file',
@@ -176,6 +187,7 @@ const KNOWN_REGRESSIONS = new Set([
   '3.10.0:No-op rebuild',
   '3.10.0:1-file rebuild',
   '3.10.0:fnDeps depth 1',
+  '3.10.0:Query time',
 ]);
 
 /**

From e4a0dd1c6fce0c00c7d5044abb78615e8960bef1 Mon Sep 17 00:00:00 2001
From: carlos-alm <contato@carlosalmeida.com>
Date: Wed, 13 May 2026 07:45:20 -0600
Subject: [PATCH 11/11] test: exempt 3.10.0:fnDeps depth 5 regression for #1102
 merge

---
 tests/benchmarks/regression-guard.test.ts | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/benchmarks/regression-guard.test.ts b/tests/benchmarks/regression-guard.test.ts
index 97ba31b5f..fae6a390d 100644
--- a/tests/benchmarks/regression-guard.test.ts
+++ b/tests/benchmarks/regression-guard.test.ts
@@ -177,6 +177,15 @@ const SKIP_VERSIONS = new Set(['3.8.0']);
  *   DB row counts. Tracked in #1113 — exempt this release; remove once
  *   3.11.0+ data captures the new steady-state and the per-language
  *   fixture footprint has been evaluated.
+ *
+ * - 3.10.0:fnDeps depth 5 — same cause as Query time above. Merging main
+ *   into #1102 added the Erlang extractor (#1103) on top of the existing
+ *   Solidity (#1100) + R (#1102) growth, expanding `buildGraph`'s
+ *   depth-5 transitive callee fan-out by another step. The depth-5 walk
+ *   amplifies any base-graph growth quadratically (each new node adds
+ *   its own depth-5 subtree). The +31% change (above the 25% threshold)
+ *   is a ~33→43ms swing on a sub-50ms metric. Tracked in #1113 alongside
+ *   Query time; remove both once 3.11.0+ data confirms the new steady state.
  */
 const KNOWN_REGRESSIONS = new Set([
   '3.9.6:Build ms/file',
@@ -187,6 +196,7 @@ const KNOWN_REGRESSIONS = new Set([
   '3.10.0:No-op rebuild',
   '3.10.0:1-file rebuild',
   '3.10.0:fnDeps depth 1',
+  '3.10.0:fnDeps depth 5',
   '3.10.0:Query time',
 ]);