From 6184ac7bd8d0c0bc32cb93ae205836dfd9711fb1 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 4 Mar 2026 14:50:29 -0700 Subject: [PATCH 1/7] perf: compute dataflow analysis in Rust native engine Port per-file dataflow extraction from WASM (JS) to the Rust native engine, eliminating the tree-sitter WASM dependency for dataflow. Rust changes: - Add dataflow.rs with DataflowRules struct, 8 per-language static configs (JS/TS, Python, Go, Rust, Java, C#, PHP, Ruby), ParamStrategy enum for per-language param extraction, scope tracking with binding confidence, and extract_dataflow() recursive visitor - Add 6 NAPI structs to types.rs (DataflowParam, DataflowReturn, DataflowAssignment, DataflowArgFlow, DataflowMutation, DataflowResult) and dataflow field on FileSymbols - Call extract_dataflow() after extract_symbols() in parallel.rs - Add lang_id_str() to LanguageKind for rules lookup JS changes: - Extend normalizeNativeSymbols() to map native dataflow result - Add native bypass in buildDataflowEdges(): use symbols.dataflow when present, fall back to WASM extraction otherwise Impact: 30 functions changed, 32 affected --- crates/codegraph-core/src/dataflow.rs | 1389 ++++++++++++++++++ crates/codegraph-core/src/lib.rs | 1 + crates/codegraph-core/src/parallel.rs | 3 + crates/codegraph-core/src/parser_registry.rs | 18 + crates/codegraph-core/src/types.rs | 83 ++ src/dataflow.js | 62 +- src/parser.js | 42 + 7 files changed, 1569 insertions(+), 29 deletions(-) create mode 100644 crates/codegraph-core/src/dataflow.rs diff --git a/crates/codegraph-core/src/dataflow.rs b/crates/codegraph-core/src/dataflow.rs new file mode 100644 index 0000000..06df887 --- /dev/null +++ b/crates/codegraph-core/src/dataflow.rs @@ -0,0 +1,1389 @@ +use std::collections::HashMap; +use tree_sitter::{Node, Tree}; + +use crate::types::{ + DataflowArgFlow, DataflowAssignment, DataflowMutation, DataflowParam, DataflowResult, + DataflowReturn, +}; + +// ─── Param Strategy ────────────────────────────────────────────────────── + +/// Per-language parameter extraction strategy. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ParamStrategy { + Default, + Python, + Go, + Rust, + Java, + CSharp, + Php, + Ruby, +} + +// ─── DataflowRules ────────────────────────────────────────────────────── + +/// Per-language AST node type names and field names for dataflow extraction. +/// Mirrors `DATAFLOW_DEFAULTS` + per-language overrides in `src/dataflow.js`. +pub struct DataflowRules { + // Scope entry + function_nodes: &'static [&'static str], + + // Function name extraction + name_field: &'static str, + var_assigned_fn_parent: Option<&'static str>, + assignment_fn_parent: Option<&'static str>, + pair_fn_parent: Option<&'static str>, + + // Parameters + param_list_field: &'static str, + param_identifier: &'static str, + param_wrapper_types: &'static [&'static str], + default_param_type: Option<&'static str>, + rest_param_type: Option<&'static str>, + object_destruct_type: Option<&'static str>, + array_destruct_type: Option<&'static str>, + shorthand_prop_pattern: Option<&'static str>, + pair_pattern_type: Option<&'static str>, + extract_param_strategy: ParamStrategy, + + // Return + return_node: Option<&'static str>, + + // Variable declarations + var_declarator_node: Option<&'static str>, + var_declarator_nodes: &'static [&'static str], + var_name_field: &'static str, + var_value_field: Option<&'static str>, + assignment_node: Option<&'static str>, + assign_left_field: &'static str, + assign_right_field: &'static str, + + // Calls + call_node: Option<&'static str>, + call_nodes: &'static [&'static str], + call_function_field: &'static str, + call_args_field: &'static str, + spread_type: Option<&'static str>, + + // Member access + member_node: Option<&'static str>, + member_object_field: &'static str, + member_property_field: &'static str, + optional_chain_node: Option<&'static str>, + + // Await + await_node: Option<&'static str>, + + // Mutation + mutating_methods: &'static [&'static str], + expression_stmt_node: &'static str, + call_object_field: Option<&'static str>, + + // Structural wrappers + expression_list_type: Option<&'static str>, + equals_clause_type: Option<&'static str>, + argument_wrapper_type: Option<&'static str>, + extra_identifier_types: &'static [&'static str], +} + +// ─── Per-Language Configs ──────────────────────────────────────────────── + +static JS_TS_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &[ + "function_declaration", + "method_definition", + "arrow_function", + "function_expression", + "function", + ], + name_field: "name", + var_assigned_fn_parent: Some("variable_declarator"), + assignment_fn_parent: Some("assignment_expression"), + pair_fn_parent: Some("pair"), + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &["required_parameter", "optional_parameter"], + default_param_type: Some("assignment_pattern"), + rest_param_type: Some("rest_pattern"), + object_destruct_type: Some("object_pattern"), + array_destruct_type: Some("array_pattern"), + shorthand_prop_pattern: Some("shorthand_property_identifier_pattern"), + pair_pattern_type: Some("pair_pattern"), + extract_param_strategy: ParamStrategy::Default, + return_node: Some("return_statement"), + var_declarator_node: Some("variable_declarator"), + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: Some("value"), + assignment_node: Some("assignment_expression"), + assign_left_field: "left", + assign_right_field: "right", + call_node: Some("call_expression"), + call_nodes: &[], + call_function_field: "function", + call_args_field: "arguments", + spread_type: Some("spread_element"), + member_node: Some("member_expression"), + member_object_field: "object", + member_property_field: "property", + optional_chain_node: Some("optional_chain_expression"), + await_node: Some("await_expression"), + mutating_methods: &[ + "push", "pop", "shift", "unshift", "splice", "sort", "reverse", "fill", "set", "delete", + "add", "clear", + ], + expression_stmt_node: "expression_statement", + call_object_field: None, + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: None, + extra_identifier_types: &[], +}; + +static PYTHON_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &["function_definition", "lambda"], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: Some("default_parameter"), + rest_param_type: Some("list_splat_pattern"), + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Python, + return_node: Some("return_statement"), + var_declarator_node: None, + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: Some("value"), + assignment_node: Some("assignment"), + assign_left_field: "left", + assign_right_field: "right", + call_node: Some("call"), + call_nodes: &[], + call_function_field: "function", + call_args_field: "arguments", + spread_type: Some("list_splat"), + member_node: Some("attribute"), + member_object_field: "object", + member_property_field: "attribute", + optional_chain_node: None, + await_node: Some("await"), + mutating_methods: &[ + "append", "extend", "insert", "pop", "remove", "clear", "sort", "reverse", "add", + "discard", "update", + ], + expression_stmt_node: "expression_statement", + call_object_field: None, + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: None, + extra_identifier_types: &[], +}; + +static GO_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &["function_declaration", "method_declaration", "func_literal"], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Go, + return_node: Some("return_statement"), + var_declarator_node: None, + var_declarator_nodes: &["short_var_declaration", "var_declaration"], + var_name_field: "left", + var_value_field: Some("right"), + assignment_node: Some("assignment_statement"), + assign_left_field: "left", + assign_right_field: "right", + call_node: Some("call_expression"), + call_nodes: &[], + call_function_field: "function", + call_args_field: "arguments", + spread_type: None, + member_node: Some("selector_expression"), + member_object_field: "operand", + member_property_field: "field", + optional_chain_node: None, + await_node: None, + mutating_methods: &[], + expression_stmt_node: "expression_statement", + call_object_field: None, + expression_list_type: Some("expression_list"), + equals_clause_type: None, + argument_wrapper_type: None, + extra_identifier_types: &[], +}; + +static RUST_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &["function_item", "closure_expression"], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Rust, + return_node: Some("return_expression"), + var_declarator_node: Some("let_declaration"), + var_declarator_nodes: &[], + var_name_field: "pattern", + var_value_field: Some("value"), + assignment_node: Some("assignment_expression"), + assign_left_field: "left", + assign_right_field: "right", + call_node: Some("call_expression"), + call_nodes: &[], + call_function_field: "function", + call_args_field: "arguments", + spread_type: None, + member_node: Some("field_expression"), + member_object_field: "value", + member_property_field: "field", + optional_chain_node: None, + await_node: Some("await_expression"), + mutating_methods: &["push", "pop", "insert", "remove", "clear", "sort", "reverse"], + expression_stmt_node: "expression_statement", + call_object_field: None, + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: None, + extra_identifier_types: &[], +}; + +static JAVA_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &[ + "method_declaration", + "constructor_declaration", + "lambda_expression", + ], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Java, + return_node: Some("return_statement"), + var_declarator_node: Some("variable_declarator"), + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: Some("value"), + assignment_node: Some("assignment_expression"), + assign_left_field: "left", + assign_right_field: "right", + call_node: None, + call_nodes: &["method_invocation", "object_creation_expression"], + call_function_field: "name", + call_args_field: "arguments", + spread_type: None, + member_node: Some("field_access"), + member_object_field: "object", + member_property_field: "field", + optional_chain_node: None, + await_node: None, + mutating_methods: &["add", "remove", "clear", "put", "set", "push", "pop", "sort"], + expression_stmt_node: "expression_statement", + call_object_field: Some("object"), + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: Some("argument"), + extra_identifier_types: &[], +}; + +static CSHARP_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &[ + "method_declaration", + "constructor_declaration", + "lambda_expression", + "local_function_statement", + ], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::CSharp, + return_node: Some("return_statement"), + var_declarator_node: Some("variable_declarator"), + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: None, + assignment_node: Some("assignment_expression"), + assign_left_field: "left", + assign_right_field: "right", + call_node: Some("invocation_expression"), + call_nodes: &[], + call_function_field: "function", + call_args_field: "arguments", + spread_type: None, + member_node: Some("member_access_expression"), + member_object_field: "expression", + member_property_field: "name", + optional_chain_node: None, + await_node: Some("await_expression"), + mutating_methods: &["Add", "Remove", "Clear", "Insert", "Sort", "Reverse", "Push", "Pop"], + expression_stmt_node: "expression_statement", + call_object_field: None, + expression_list_type: None, + equals_clause_type: Some("equals_value_clause"), + argument_wrapper_type: Some("argument"), + extra_identifier_types: &[], +}; + +static PHP_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &[ + "function_definition", + "method_declaration", + "anonymous_function_creation_expression", + "arrow_function", + ], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "variable_name", + param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Php, + return_node: Some("return_statement"), + var_declarator_node: None, + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: Some("value"), + assignment_node: Some("assignment_expression"), + assign_left_field: "left", + assign_right_field: "right", + call_node: None, + call_nodes: &[ + "function_call_expression", + "member_call_expression", + "scoped_call_expression", + ], + call_function_field: "function", + call_args_field: "arguments", + spread_type: Some("spread_expression"), + member_node: Some("member_access_expression"), + member_object_field: "object", + member_property_field: "name", + optional_chain_node: None, + await_node: None, + mutating_methods: &["push", "pop", "shift", "unshift", "splice", "sort", "reverse"], + expression_stmt_node: "expression_statement", + call_object_field: None, + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: Some("argument"), + extra_identifier_types: &["variable_name", "name"], +}; + +static RUBY_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &["method", "singleton_method", "lambda"], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Ruby, + return_node: Some("return"), + var_declarator_node: None, + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: Some("value"), + assignment_node: Some("assignment"), + assign_left_field: "left", + assign_right_field: "right", + call_node: Some("call"), + call_nodes: &[], + call_function_field: "method", + call_args_field: "arguments", + spread_type: Some("splat_parameter"), + member_node: Some("call"), + member_object_field: "receiver", + member_property_field: "method", + optional_chain_node: None, + await_node: None, + mutating_methods: &[ + "push", "pop", "shift", "unshift", "delete", "clear", "sort!", "reverse!", "map!", + "select!", "reject!", "compact!", "flatten!", "concat", "replace", "insert", + ], + expression_stmt_node: "expression_statement", + call_object_field: None, + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: None, + extra_identifier_types: &[], +}; + +/// Get dataflow rules for a language ID string. +fn get_dataflow_rules(lang_id: &str) -> Option<&'static DataflowRules> { + match lang_id { + "javascript" | "typescript" | "tsx" => Some(&JS_TS_DATAFLOW), + "python" => Some(&PYTHON_DATAFLOW), + "go" => Some(&GO_DATAFLOW), + "rust" => Some(&RUST_DATAFLOW), + "java" => Some(&JAVA_DATAFLOW), + "csharp" => Some(&CSHARP_DATAFLOW), + "php" => Some(&PHP_DATAFLOW), + "ruby" => Some(&RUBY_DATAFLOW), + _ => None, + } +} + +// ─── Helpers ───────────────────────────────────────────────────────────── + +fn is_call_node(rules: &DataflowRules, kind: &str) -> bool { + if !rules.call_nodes.is_empty() { + rules.call_nodes.contains(&kind) + } else { + rules.call_node.is_some_and(|cn| cn == kind) + } +} + +fn is_function_node(rules: &DataflowRules, kind: &str) -> bool { + rules.function_nodes.contains(&kind) +} + +fn is_ident(rules: &DataflowRules, kind: &str) -> bool { + kind == "identifier" + || kind == rules.param_identifier + || rules.extra_identifier_types.contains(&kind) +} + +fn truncate(s: &str, max: usize) -> String { + if s.len() <= max { + s.to_string() + } else { + let mut result = String::with_capacity(max + 3); + // Take at most `max` bytes, but don't split a char + for ch in s.chars() { + if result.len() + ch.len_utf8() > max { + break; + } + result.push(ch); + } + result.push('…'); + result + } +} + +fn node_text<'a>(node: &Node, source: &'a [u8]) -> &'a str { + node.utf8_text(source).unwrap_or("") +} + +fn node_line(node: &Node) -> u32 { + node.start_position().row as u32 + 1 +} + +/// Extract function name from a function AST node. +fn function_name<'a>(fn_node: &Node<'a>, rules: &DataflowRules, source: &[u8]) -> Option { + // Try the standard name field + if let Some(name_node) = fn_node.child_by_field_name(rules.name_field) { + return Some(node_text(&name_node, source).to_string()); + } + + // JS-specific: arrow_function/function_expression assigned to variable, pair, or assignment + if let Some(parent) = fn_node.parent() { + let pt = parent.kind(); + if rules.var_assigned_fn_parent.is_some_and(|v| v == pt) { + let n = parent.child_by_field_name("name"); + return n.map(|n| node_text(&n, source).to_string()); + } + if rules.pair_fn_parent.is_some_and(|v| v == pt) { + let key = parent.child_by_field_name("key"); + return key.map(|k| node_text(&k, source).to_string()); + } + if rules.assignment_fn_parent.is_some_and(|v| v == pt) { + let left = parent.child_by_field_name(rules.assign_left_field); + return left.map(|l| node_text(&l, source).to_string()); + } + } + None +} + +/// Extract parameter names using per-language strategy. +fn extract_param_names_strategy(node: &Node, strategy: ParamStrategy, source: &[u8]) -> Option> { + match strategy { + ParamStrategy::Default => None, + ParamStrategy::Python => { + let t = node.kind(); + if t == "typed_parameter" || t == "typed_default_parameter" { + let cursor = &mut node.walk(); + for c in node.named_children(cursor) { + if c.kind() == "identifier" { + return Some(vec![node_text(&c, source).to_string()]); + } + } + return Some(vec![]); + } + if t == "default_parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); + } + return Some(vec![]); + } + if t == "list_splat_pattern" || t == "dictionary_splat_pattern" { + let cursor = &mut node.walk(); + for c in node.named_children(cursor) { + if c.kind() == "identifier" { + return Some(vec![node_text(&c, source).to_string()]); + } + } + return Some(vec![]); + } + None + } + ParamStrategy::Go => { + let t = node.kind(); + if t == "parameter_declaration" { + let mut names = Vec::new(); + let cursor = &mut node.walk(); + for c in node.named_children(cursor) { + if c.kind() == "identifier" { + names.push(node_text(&c, source).to_string()); + } + } + if !names.is_empty() { Some(names) } else { None } + } else if t == "variadic_parameter_declaration" { + node.child_by_field_name("name") + .map(|n| vec![node_text(&n, source).to_string()]) + } else { + None + } + } + ParamStrategy::Rust => { + let t = node.kind(); + if t == "parameter" { + if let Some(pat) = node.child_by_field_name("pattern") { + if pat.kind() == "identifier" { + return Some(vec![node_text(&pat, source).to_string()]); + } + } + return Some(vec![]); + } + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + None + } + ParamStrategy::Java => { + let t = node.kind(); + if t == "formal_parameter" || t == "spread_parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); + } + return Some(vec![]); + } + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + None + } + ParamStrategy::CSharp => { + let t = node.kind(); + if t == "parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); + } + return Some(vec![]); + } + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + None + } + ParamStrategy::Php => { + let t = node.kind(); + if t == "simple_parameter" || t == "variadic_parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); + } + return Some(vec![]); + } + if t == "variable_name" { + return Some(vec![node_text(node, source).to_string()]); + } + None + } + ParamStrategy::Ruby => { + let t = node.kind(); + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + if t == "optional_parameter" + || t == "keyword_parameter" + || t == "splat_parameter" + || t == "hash_splat_parameter" + { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); + } + return Some(vec![]); + } + None + } + } +} + +/// Extract parameter names from a node, using rules and strategy. +fn extract_param_names(node: &Node, rules: &DataflowRules, source: &[u8]) -> Vec { + let t = node.kind(); + + // Language-specific override + if let Some(names) = extract_param_names_strategy(node, rules.extract_param_strategy, source) { + return names; + } + + // Leaf identifier + if t == rules.param_identifier { + return vec![node_text(node, source).to_string()]; + } + + // Wrapper types (TS required_parameter, etc.) + if rules.param_wrapper_types.contains(&t) { + let pattern = node + .child_by_field_name("pattern") + .or_else(|| node.child_by_field_name("name")); + return pattern + .map(|p| extract_param_names(&p, rules, source)) + .unwrap_or_default(); + } + + // Default parameter + if rules.default_param_type.is_some_and(|d| d == t) { + let left = node + .child_by_field_name("left") + .or_else(|| node.child_by_field_name("name")); + return left + .map(|l| extract_param_names(&l, rules, source)) + .unwrap_or_default(); + } + + // Rest / splat parameter + if rules.rest_param_type.is_some_and(|r| r == t) { + if let Some(name_node) = node.child_by_field_name("name") { + return vec![node_text(&name_node, source).to_string()]; + } + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + if child.kind() == rules.param_identifier { + return vec![node_text(&child, source).to_string()]; + } + } + return vec![]; + } + + // Object destructuring (JS only) + if rules.object_destruct_type.is_some_and(|o| o == t) { + let mut names = Vec::new(); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + let ck = child.kind(); + if rules.shorthand_prop_pattern.is_some_and(|s| s == ck) { + names.push(node_text(&child, source).to_string()); + } else if rules.pair_pattern_type.is_some_and(|p| p == ck) { + if let Some(value) = child.child_by_field_name("value") { + names.extend(extract_param_names(&value, rules, source)); + } + } else if rules.rest_param_type.is_some_and(|r| r == ck) { + names.extend(extract_param_names(&child, rules, source)); + } + } + return names; + } + + // Array destructuring (JS only) + if rules.array_destruct_type.is_some_and(|a| a == t) { + let mut names = Vec::new(); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + names.extend(extract_param_names(&child, rules, source)); + } + return names; + } + + vec![] +} + +/// Extract parameters: name + index pairs from formal_parameters node. +fn extract_params(params_node: &Node, rules: &DataflowRules, source: &[u8]) -> Vec<(String, u32)> { + let mut result = Vec::new(); + let mut index: u32 = 0; + let cursor = &mut params_node.walk(); + for child in params_node.named_children(cursor) { + let names = extract_param_names(&child, rules, source); + for name in names { + result.push((name, index)); + } + index += 1; + } + result +} + +/// Resolve the callee name from a call expression node. +fn resolve_callee_name(call_node: &Node, rules: &DataflowRules, source: &[u8]) -> Option { + let fn_node = call_node.child_by_field_name(rules.call_function_field); + match fn_node { + Some(f) => { + if is_ident(rules, f.kind()) { + return Some(node_text(&f, source).to_string()); + } + if rules.member_node.is_some_and(|m| m == f.kind()) { + let prop = f.child_by_field_name(rules.member_property_field); + return prop.map(|p| node_text(&p, source).to_string()); + } + if rules.optional_chain_node.is_some_and(|o| o == f.kind()) { + if let Some(target) = f.named_child(0) { + if rules.member_node.is_some_and(|m| m == target.kind()) { + let prop = target.child_by_field_name(rules.member_property_field); + return prop.map(|p| node_text(&p, source).to_string()); + } + if target.kind() == "identifier" { + return Some(node_text(&target, source).to_string()); + } + } + let prop = f.child_by_field_name(rules.member_property_field); + return prop.map(|p| node_text(&p, source).to_string()); + } + None + } + None => { + // Some languages (Java method_invocation, Ruby call) use 'name'/'method' directly + let name_node = call_node + .child_by_field_name("name") + .or_else(|| call_node.child_by_field_name("method")); + name_node.map(|n| node_text(&n, source).to_string()) + } + } +} + +/// Get the receiver (object) of a member expression. +fn member_receiver(member_expr: &Node, rules: &DataflowRules, source: &[u8]) -> Option { + let obj = member_expr.child_by_field_name(rules.member_object_field)?; + if is_ident(rules, obj.kind()) { + return Some(node_text(&obj, source).to_string()); + } + if rules.member_node.is_some_and(|m| m == obj.kind()) { + return member_receiver(&obj, rules, source); + } + None +} + +/// Collect all identifier names referenced within a node. +fn collect_identifiers(node: &Node, out: &mut Vec, rules: &DataflowRules, source: &[u8]) { + if is_ident(rules, node.kind()) { + out.push(node_text(node, source).to_string()); + return; + } + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + collect_identifiers(&child, out, rules, source); + } +} + +// ─── Scope Tracking ────────────────────────────────────────────────────── + +#[derive(Debug, Clone)] +enum LocalSource { + CallReturn { callee: String }, + Destructured { callee: String }, +} + +struct ScopeFrame { + func_name: Option, + params: HashMap, + locals: HashMap, +} + +/// Binding info returned by find_binding. +struct BindingInfo { + binding_type: String, + confidence: f64, +} + +fn find_binding(scope_stack: &[ScopeFrame], name: &str) -> Option { + for scope in scope_stack.iter().rev() { + if scope.params.contains_key(name) { + return Some(BindingInfo { + binding_type: "param".to_string(), + confidence: 1.0, + }); + } + if let Some(local) = scope.locals.get(name) { + let confidence = match local { + LocalSource::CallReturn { .. } => 0.9, + LocalSource::Destructured { .. } => 0.8, + }; + return Some(BindingInfo { + binding_type: "local".to_string(), + confidence, + }); + } + } + None +} + +fn binding_confidence(binding: &Option) -> f64 { + match binding { + Some(b) => b.confidence, + None => 0.5, + } +} + +// ─── Core: extract_dataflow ────────────────────────────────────────────── + +/// Extract dataflow information from a parsed AST tree. +/// Returns None if the language has no dataflow rules (e.g., HCL). +pub fn extract_dataflow(tree: &Tree, source: &[u8], lang_id: &str) -> Option { + let rules = get_dataflow_rules(lang_id)?; + + let mut parameters = Vec::new(); + let mut returns = Vec::new(); + let mut assignments = Vec::new(); + let mut arg_flows = Vec::new(); + let mut mutations = Vec::new(); + + let mut scope_stack: Vec = Vec::new(); + + visit( + &tree.root_node(), + rules, + source, + &mut scope_stack, + &mut parameters, + &mut returns, + &mut assignments, + &mut arg_flows, + &mut mutations, + ); + + Some(DataflowResult { + parameters, + returns, + assignments, + arg_flows, + mutations, + }) +} + +#[allow(clippy::too_many_arguments)] +fn visit( + node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &mut Vec, + parameters: &mut Vec, + returns: &mut Vec, + assignments: &mut Vec, + arg_flows: &mut Vec, + mutations: &mut Vec, +) { + let t = node.kind(); + + // Enter function scope + if is_function_node(rules, t) { + enter_scope(node, rules, source, scope_stack, parameters); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + } + scope_stack.pop(); + return; + } + + // Return statements + if rules.return_node.is_some_and(|r| r == t) { + if let Some(scope) = scope_stack.last() { + if let Some(ref func_name) = scope.func_name { + let expr = node.named_child(0); + let mut referenced_names = Vec::new(); + if let Some(ref e) = expr { + collect_identifiers(e, &mut referenced_names, rules, source); + } + returns.push(DataflowReturn { + func_name: func_name.clone(), + expression: truncate( + expr.map(|e| node_text(&e, source)).unwrap_or(""), + 120, + ), + referenced_names, + line: node_line(node), + }); + } + } + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + } + return; + } + + // Variable declarations (single type) + if rules.var_declarator_node.is_some_and(|v| v == t) { + handle_var_declarator(node, rules, source, scope_stack, assignments); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + } + return; + } + + // Variable declarations (multi-type, e.g., Go) + if !rules.var_declarator_nodes.is_empty() && rules.var_declarator_nodes.contains(&t) { + handle_var_declarator(node, rules, source, scope_stack, assignments); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + } + return; + } + + // Call expressions + if is_call_node(rules, t) { + handle_call_expr(node, rules, source, scope_stack, arg_flows); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + } + return; + } + + // Assignment expressions + if rules.assignment_node.is_some_and(|a| a == t) { + handle_assignment(node, rules, source, scope_stack, assignments, mutations); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + } + return; + } + + // Mutation detection via expression_statement + if t == rules.expression_stmt_node { + handle_expr_stmt_mutation(node, rules, source, scope_stack, mutations); + } + + // Default: visit children + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + } +} + +fn enter_scope( + fn_node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &mut Vec, + parameters: &mut Vec, +) { + let name = function_name(fn_node, rules, source); + let params_node = fn_node.child_by_field_name(rules.param_list_field); + let param_list = params_node + .as_ref() + .map(|pn| extract_params(pn, rules, source)) + .unwrap_or_default(); + + let mut param_map = HashMap::new(); + for (pname, pidx) in ¶m_list { + param_map.insert(pname.clone(), *pidx); + if let Some(ref fn_name) = name { + let line = params_node + .as_ref() + .map(|pn| node_line(pn)) + .unwrap_or_else(|| node_line(fn_node)); + parameters.push(DataflowParam { + func_name: fn_name.clone(), + param_name: pname.clone(), + param_index: *pidx, + line, + }); + } + } + + scope_stack.push(ScopeFrame { + func_name: name, + params: param_map, + locals: HashMap::new(), + }); +} + +/// Unwrap await if present, returning the inner expression. +fn unwrap_await<'a>(node: &Node<'a>, rules: &DataflowRules) -> Node<'a> { + if rules.await_node.is_some_and(|a| a == node.kind()) { + if let Some(inner) = node.named_child(0) { + return inner; + } + } + *node +} + +fn handle_var_declarator( + node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &mut Vec, + assignments: &mut Vec, +) { + let mut name_node = node.child_by_field_name(rules.var_name_field); + let mut value_node = rules.var_value_field.and_then(|f| node.child_by_field_name(f)); + + // C#: initializer is inside equals_value_clause child + if value_node.is_none() { + if let Some(eq_type) = rules.equals_clause_type { + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + if child.kind() == eq_type { + value_node = child + .child_by_field_name("value") + .or_else(|| child.named_child(0)); + break; + } + } + } + } + + // Fallback: initializer is a direct unnamed child (C# variable_declarator) + if value_node.is_none() { + if let Some(ref nn) = name_node { + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + if child.id() != nn.id() { + let uw = unwrap_await(&child, rules); + if is_call_node(rules, uw.kind()) { + value_node = Some(child); + break; + } + } + } + } + } + + // Go: expression_list wraps LHS/RHS — unwrap to first named child + if let Some(el_type) = rules.expression_list_type { + if name_node.as_ref().is_some_and(|n| n.kind() == el_type) { + name_node = name_node.and_then(|n| n.named_child(0)); + } + if value_node.as_ref().is_some_and(|v| v.kind() == el_type) { + value_node = value_node.and_then(|v| v.named_child(0)); + } + } + + let scope = match scope_stack.last_mut() { + Some(s) => s, + None => return, + }; + let name_n = match name_node { + Some(n) => n, + None => return, + }; + let value_n = match value_node { + Some(v) => v, + None => return, + }; + + let unwrapped = unwrap_await(&value_n, rules); + if !is_call_node(rules, unwrapped.kind()) { + return; + } + + let callee = match resolve_callee_name(&unwrapped, rules, source) { + Some(c) => c, + None => return, + }; + let func_name = match &scope.func_name { + Some(f) => f.clone(), + None => return, + }; + + // Destructuring: const { a, b } = foo() + let is_obj_destruct = rules.object_destruct_type.is_some_and(|o| o == name_n.kind()); + let is_arr_destruct = rules.array_destruct_type.is_some_and(|a| a == name_n.kind()); + + if is_obj_destruct || is_arr_destruct { + let names = extract_param_names(&name_n, rules, source); + for n in &names { + assignments.push(DataflowAssignment { + var_name: n.clone(), + caller_func: Some(func_name.clone()), + source_call_name: callee.clone(), + expression: truncate(node_text(node, source), 120), + line: node_line(node), + }); + scope + .locals + .insert(n.clone(), LocalSource::Destructured { callee: callee.clone() }); + } + } else { + let var_name = node_text(&name_n, source).to_string(); + assignments.push(DataflowAssignment { + var_name: var_name.clone(), + caller_func: Some(func_name), + source_call_name: callee.clone(), + expression: truncate(node_text(node, source), 120), + line: node_line(node), + }); + scope.locals.insert(var_name, LocalSource::CallReturn { callee }); + } +} + +fn handle_assignment( + node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &mut Vec, + assignments: &mut Vec, + mutations: &mut Vec, +) { + let left = node.child_by_field_name(rules.assign_left_field); + let right = node.child_by_field_name(rules.assign_right_field); + + let func_name = match scope_stack.last() { + Some(s) => match &s.func_name { + Some(f) => f.clone(), + None => return, + }, + None => return, + }; + + // Mutation: obj.prop = value + if let Some(ref left_n) = left { + if rules.member_node.is_some_and(|m| m == left_n.kind()) { + if let Some(receiver) = member_receiver(left_n, rules, source) { + let binding = find_binding(scope_stack, &receiver); + if binding.is_some() { + mutations.push(DataflowMutation { + func_name: Some(func_name.clone()), + receiver_name: receiver, + binding_type: binding.as_ref().map(|b| b.binding_type.clone()), + mutating_expr: truncate(node_text(node, source), 120), + line: node_line(node), + }); + } + } + } + } + + // Non-declaration assignment: x = foo() + if let (Some(left_n), Some(right_n)) = (left, right) { + if is_ident(rules, left_n.kind()) { + let unwrapped = unwrap_await(&right_n, rules); + if is_call_node(rules, unwrapped.kind()) { + if let Some(callee) = resolve_callee_name(&unwrapped, rules, source) { + let var_name = node_text(&left_n, source).to_string(); + assignments.push(DataflowAssignment { + var_name: var_name.clone(), + caller_func: Some(func_name), + source_call_name: callee.clone(), + expression: truncate(node_text(node, source), 120), + line: node_line(node), + }); + if let Some(scope) = scope_stack.last_mut() { + scope.locals.insert(var_name, LocalSource::CallReturn { callee }); + } + } + } + } + } +} + +fn handle_call_expr( + node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &[ScopeFrame], + arg_flows: &mut Vec, +) { + let callee = match resolve_callee_name(node, rules, source) { + Some(c) => c, + None => return, + }; + let args_node = match node.child_by_field_name(rules.call_args_field) { + Some(a) => a, + None => return, + }; + let func_name = match scope_stack.last() { + Some(s) => match &s.func_name { + Some(f) => f.clone(), + None => return, + }, + None => return, + }; + + let mut arg_index: u32 = 0; + let cursor = &mut args_node.walk(); + for arg_raw in args_node.named_children(cursor) { + // PHP/Java: unwrap argument wrapper + let arg = if rules.argument_wrapper_type.is_some_and(|w| w == arg_raw.kind()) { + arg_raw.named_child(0).unwrap_or(arg_raw) + } else { + arg_raw + }; + + let unwrapped = if rules.spread_type.is_some_and(|s| s == arg.kind()) { + arg.named_child(0).unwrap_or(arg) + } else { + arg + }; + + let arg_name = if is_ident(rules, unwrapped.kind()) { + Some(node_text(&unwrapped, source).to_string()) + } else { + None + }; + let arg_member = if arg_name.is_none() + && rules.member_node.is_some_and(|m| m == unwrapped.kind()) + { + member_receiver(&unwrapped, rules, source) + } else { + None + }; + let tracked_name = arg_name.clone().or(arg_member); + + if let Some(ref tracked) = tracked_name { + let binding = find_binding(scope_stack, tracked); + if binding.is_some() { + let conf = binding_confidence(&binding); + arg_flows.push(DataflowArgFlow { + caller_func: Some(func_name.clone()), + callee_name: callee.clone(), + arg_index, + arg_name: Some(tracked.clone()), + binding_type: binding.as_ref().map(|b| b.binding_type.clone()), + confidence: conf, + expression: truncate(node_text(&arg_raw, source), 120), + line: node_line(node), + }); + } + } + arg_index += 1; + } +} + +fn handle_expr_stmt_mutation( + node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &[ScopeFrame], + mutations: &mut Vec, +) { + if rules.mutating_methods.is_empty() { + return; + } + let expr = match node.named_child(0) { + Some(e) => e, + None => return, + }; + if !is_call_node(rules, expr.kind()) { + return; + } + + let mut method_name: Option = None; + let mut receiver: Option = None; + + // Standard pattern: call(fn: member(obj, prop)) + if let Some(fn_node) = expr.child_by_field_name(rules.call_function_field) { + if rules.member_node.is_some_and(|m| m == fn_node.kind()) { + if let Some(prop) = fn_node.child_by_field_name(rules.member_property_field) { + method_name = Some(node_text(&prop, source).to_string()); + } + receiver = member_receiver(&fn_node, rules, source); + } + } + + // Java/combined pattern: call node itself has object + name fields + if receiver.is_none() { + if let Some(obj_field) = rules.call_object_field { + let obj = expr.child_by_field_name(obj_field); + let name = expr.child_by_field_name(rules.call_function_field); + if let (Some(obj_n), Some(name_n)) = (obj, name) { + method_name = Some(node_text(&name_n, source).to_string()); + if is_ident(rules, obj_n.kind()) { + receiver = Some(node_text(&obj_n, source).to_string()); + } + } + } + } + + let method = match method_name { + Some(m) => m, + None => return, + }; + if !rules.mutating_methods.contains(&method.as_str()) { + return; + } + + let recv = match receiver { + Some(r) => r, + None => return, + }; + let func_name = match scope_stack.last() { + Some(s) => s.func_name.clone(), + None => None, + }; + if func_name.is_none() { + return; + } + + let binding = find_binding(scope_stack, &recv); + if binding.is_some() { + mutations.push(DataflowMutation { + func_name, + receiver_name: recv, + binding_type: binding.as_ref().map(|b| b.binding_type.clone()), + mutating_expr: truncate(node_text(&expr, source), 120), + line: node_line(node), + }); + } +} diff --git a/crates/codegraph-core/src/lib.rs b/crates/codegraph-core/src/lib.rs index 90e673e..ccd3534 100644 --- a/crates/codegraph-core/src/lib.rs +++ b/crates/codegraph-core/src/lib.rs @@ -7,6 +7,7 @@ pub mod cycles; pub mod incremental; pub mod complexity; pub mod cfg; +pub mod dataflow; use napi_derive::napi; use types::*; diff --git a/crates/codegraph-core/src/parallel.rs b/crates/codegraph-core/src/parallel.rs index e2c8aad..4127a27 100644 --- a/crates/codegraph-core/src/parallel.rs +++ b/crates/codegraph-core/src/parallel.rs @@ -2,6 +2,7 @@ use rayon::prelude::*; use std::fs; use tree_sitter::Parser; +use crate::dataflow::extract_dataflow; use crate::extractors::extract_symbols; use crate::parser_registry::LanguageKind; use crate::types::FileSymbols; @@ -24,6 +25,7 @@ pub fn parse_files_parallel(file_paths: &[String], _root_dir: &str) -> Vec Option { let tree = parser.parse(source_bytes, None)?; let line_count = source_bytes.iter().filter(|&&b| b == b'\n').count() as u32 + 1; let mut symbols = extract_symbols(lang, &tree, source_bytes, file_path); + symbols.dataflow = extract_dataflow(&tree, source_bytes, lang.lang_id_str()); symbols.line_count = Some(line_count); Some(symbols) } diff --git a/crates/codegraph-core/src/parser_registry.rs b/crates/codegraph-core/src/parser_registry.rs index 2c2c7e9..f800b27 100644 --- a/crates/codegraph-core/src/parser_registry.rs +++ b/crates/codegraph-core/src/parser_registry.rs @@ -17,6 +17,24 @@ pub enum LanguageKind { } impl LanguageKind { + /// Return the string ID used by dataflow/cfg rules lookup. + /// Matches the JS `DATAFLOW_RULES` map keys in `src/dataflow.js`. + pub fn lang_id_str(&self) -> &'static str { + match self { + Self::JavaScript => "javascript", + Self::TypeScript => "typescript", + Self::Tsx => "tsx", + Self::Python => "python", + Self::Go => "go", + Self::Rust => "rust", + Self::Java => "java", + Self::CSharp => "csharp", + Self::Ruby => "ruby", + Self::Php => "php", + Self::Hcl => "hcl", + } + } + /// Determine language from file extension — mirrors `getParser()` in parser.js pub fn from_extension(file_path: &str) -> Option { let path = Path::new(file_path); diff --git a/crates/codegraph-core/src/types.rs b/crates/codegraph-core/src/types.rs index c381671..f1b68ff 100644 --- a/crates/codegraph-core/src/types.rs +++ b/crates/codegraph-core/src/types.rs @@ -175,6 +175,87 @@ pub struct AstNode { pub receiver: Option, } +// ─── Dataflow Types ────────────────────────────────────────────────────── + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowParam { + #[napi(js_name = "funcName")] + pub func_name: String, + #[napi(js_name = "paramName")] + pub param_name: String, + #[napi(js_name = "paramIndex")] + pub param_index: u32, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowReturn { + #[napi(js_name = "funcName")] + pub func_name: String, + pub expression: String, + #[napi(js_name = "referencedNames")] + pub referenced_names: Vec, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowAssignment { + #[napi(js_name = "varName")] + pub var_name: String, + #[napi(js_name = "callerFunc")] + pub caller_func: Option, + #[napi(js_name = "sourceCallName")] + pub source_call_name: String, + pub expression: String, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowArgFlow { + #[napi(js_name = "callerFunc")] + pub caller_func: Option, + #[napi(js_name = "calleeName")] + pub callee_name: String, + #[napi(js_name = "argIndex")] + pub arg_index: u32, + #[napi(js_name = "argName")] + pub arg_name: Option, + #[napi(js_name = "bindingType")] + pub binding_type: Option, + pub confidence: f64, + pub expression: String, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowMutation { + #[napi(js_name = "funcName")] + pub func_name: Option, + #[napi(js_name = "receiverName")] + pub receiver_name: String, + #[napi(js_name = "bindingType")] + pub binding_type: Option, + #[napi(js_name = "mutatingExpr")] + pub mutating_expr: String, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowResult { + pub parameters: Vec, + pub returns: Vec, + pub assignments: Vec, + #[napi(js_name = "argFlows")] + pub arg_flows: Vec, + pub mutations: Vec, +} + #[napi(object)] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FileSymbols { @@ -186,6 +267,7 @@ pub struct FileSymbols { pub exports: Vec, #[napi(js_name = "astNodes")] pub ast_nodes: Vec, + pub dataflow: Option, pub line_count: Option, } @@ -199,6 +281,7 @@ impl FileSymbols { classes: Vec::new(), exports: Vec::new(), ast_nodes: Vec::new(), + dataflow: None, line_count: None, } } diff --git a/src/dataflow.js b/src/dataflow.js index ad6f156..08b982f 100644 --- a/src/dataflow.js +++ b/src/dataflow.js @@ -1009,7 +1009,7 @@ export async function buildDataflowEdges(db, fileSymbols, rootDir, _engineOpts) let needsFallback = false; for (const [relPath, symbols] of fileSymbols) { - if (!symbols._tree) { + if (!symbols._tree && !symbols.dataflow) { const ext = path.extname(relPath).toLowerCase(); if (DATAFLOW_EXTENSIONS.has(ext)) { needsFallback = true; @@ -1061,41 +1061,45 @@ export async function buildDataflowEdges(db, fileSymbols, rootDir, _engineOpts) const ext = path.extname(relPath).toLowerCase(); if (!DATAFLOW_EXTENSIONS.has(ext)) continue; - let tree = symbols._tree; - let langId = symbols._langId; + // Use native dataflow data if available — skip WASM extraction + let data = symbols.dataflow; + if (!data) { + let tree = symbols._tree; + let langId = symbols._langId; + + // WASM fallback if no cached tree + if (!tree) { + if (!extToLang || !getParserFn) continue; + langId = extToLang.get(ext); + if (!langId || !DATAFLOW_LANG_IDS.has(langId)) continue; + + const absPath = path.join(rootDir, relPath); + let code; + try { + code = fs.readFileSync(absPath, 'utf-8'); + } catch { + continue; + } - // WASM fallback if no cached tree - if (!tree) { - if (!extToLang || !getParserFn) continue; - langId = extToLang.get(ext); - if (!langId || !DATAFLOW_LANG_IDS.has(langId)) continue; + const parser = getParserFn(parsers, absPath); + if (!parser) continue; - const absPath = path.join(rootDir, relPath); - let code; - try { - code = fs.readFileSync(absPath, 'utf-8'); - } catch { - continue; + try { + tree = parser.parse(code); + } catch { + continue; + } } - const parser = getParserFn(parsers, absPath); - if (!parser) continue; - - try { - tree = parser.parse(code); - } catch { - continue; + if (!langId) { + langId = extToLang ? extToLang.get(ext) : null; + if (!langId) continue; } - } - - if (!langId) { - langId = extToLang ? extToLang.get(ext) : null; - if (!langId) continue; - } - if (!DATAFLOW_RULES.has(langId)) continue; + if (!DATAFLOW_RULES.has(langId)) continue; - const data = extractDataflow(tree, relPath, symbols.definitions, langId); + data = extractDataflow(tree, relPath, symbols.definitions, langId); + } // Resolve function names to node IDs in this file first, then globally function resolveNode(funcName) { diff --git a/src/parser.js b/src/parser.js index cb98498..aa84412 100644 --- a/src/parser.js +++ b/src/parser.js @@ -269,6 +269,48 @@ function normalizeNativeSymbols(result) { text: n.text ?? null, receiver: n.receiver ?? null, })), + dataflow: result.dataflow + ? { + parameters: (result.dataflow.parameters || []).map((p) => ({ + funcName: p.funcName ?? p.func_name, + paramName: p.paramName ?? p.param_name, + paramIndex: p.paramIndex ?? p.param_index, + line: p.line, + })), + returns: (result.dataflow.returns || []).map((r) => ({ + funcName: r.funcName ?? r.func_name, + expression: r.expression ?? '', + referencedNames: r.referencedNames ?? r.referenced_names ?? [], + line: r.line, + })), + assignments: (result.dataflow.assignments || []).map((a) => ({ + varName: a.varName ?? a.var_name, + callerFunc: a.callerFunc ?? a.caller_func ?? null, + sourceCallName: a.sourceCallName ?? a.source_call_name, + expression: a.expression ?? '', + line: a.line, + })), + argFlows: (result.dataflow.argFlows ?? result.dataflow.arg_flows ?? []).map((f) => ({ + callerFunc: f.callerFunc ?? f.caller_func ?? null, + calleeName: f.calleeName ?? f.callee_name, + argIndex: f.argIndex ?? f.arg_index, + argName: f.argName ?? f.arg_name ?? null, + binding: + (f.bindingType ?? f.binding_type) ? { type: f.bindingType ?? f.binding_type } : null, + confidence: f.confidence, + expression: f.expression ?? '', + line: f.line, + })), + mutations: (result.dataflow.mutations || []).map((m) => ({ + funcName: m.funcName ?? m.func_name ?? null, + receiverName: m.receiverName ?? m.receiver_name, + binding: + (m.bindingType ?? m.binding_type) ? { type: m.bindingType ?? m.binding_type } : null, + mutatingExpr: m.mutatingExpr ?? m.mutating_expr, + line: m.line, + })), + } + : null, }; } From 0b64e25e634238011a03e1add3407435d89467fd Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 4 Mar 2026 18:18:45 -0700 Subject: [PATCH 2/7] fix: address PR review feedback for dataflow-in-rust - Add method_call_expression to RUST_DATAFLOW call_nodes so Rust obj.method(arg) calls are no longer silently excluded from dataflow - Remove var_declaration from GO_DATAFLOW var_declarator_nodes since its child fields don't match the left/right schema used by short_var_declaration - Remove unreachable snake_case fallbacks in normalizeNativeSymbols since napi-rs #[napi(js_name)] guarantees camelCase property names Impact: 1 functions changed, 3 affected --- crates/codegraph-core/src/dataflow.rs | 8 +++--- src/parser.js | 38 +++++++++++++-------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/crates/codegraph-core/src/dataflow.rs b/crates/codegraph-core/src/dataflow.rs index 06df887..9b8358d 100644 --- a/crates/codegraph-core/src/dataflow.rs +++ b/crates/codegraph-core/src/dataflow.rs @@ -205,7 +205,9 @@ static GO_DATAFLOW: DataflowRules = DataflowRules { extract_param_strategy: ParamStrategy::Go, return_node: Some("return_statement"), var_declarator_node: None, - var_declarator_nodes: &["short_var_declaration", "var_declaration"], + // Only short_var_declaration uses left/right fields. var_declaration has + // var_spec children with name/type/value fields — not yet supported. + var_declarator_nodes: &["short_var_declaration"], var_name_field: "left", var_value_field: Some("right"), assignment_node: Some("assignment_statement"), @@ -254,8 +256,8 @@ static RUST_DATAFLOW: DataflowRules = DataflowRules { assignment_node: Some("assignment_expression"), assign_left_field: "left", assign_right_field: "right", - call_node: Some("call_expression"), - call_nodes: &[], + call_node: None, + call_nodes: &["call_expression", "method_call_expression"], call_function_field: "function", call_args_field: "arguments", spread_type: None, diff --git a/src/parser.js b/src/parser.js index aa84412..b8b2979 100644 --- a/src/parser.js +++ b/src/parser.js @@ -272,41 +272,39 @@ function normalizeNativeSymbols(result) { dataflow: result.dataflow ? { parameters: (result.dataflow.parameters || []).map((p) => ({ - funcName: p.funcName ?? p.func_name, - paramName: p.paramName ?? p.param_name, - paramIndex: p.paramIndex ?? p.param_index, + funcName: p.funcName, + paramName: p.paramName, + paramIndex: p.paramIndex, line: p.line, })), returns: (result.dataflow.returns || []).map((r) => ({ - funcName: r.funcName ?? r.func_name, + funcName: r.funcName, expression: r.expression ?? '', - referencedNames: r.referencedNames ?? r.referenced_names ?? [], + referencedNames: r.referencedNames ?? [], line: r.line, })), assignments: (result.dataflow.assignments || []).map((a) => ({ - varName: a.varName ?? a.var_name, - callerFunc: a.callerFunc ?? a.caller_func ?? null, - sourceCallName: a.sourceCallName ?? a.source_call_name, + varName: a.varName, + callerFunc: a.callerFunc ?? null, + sourceCallName: a.sourceCallName, expression: a.expression ?? '', line: a.line, })), - argFlows: (result.dataflow.argFlows ?? result.dataflow.arg_flows ?? []).map((f) => ({ - callerFunc: f.callerFunc ?? f.caller_func ?? null, - calleeName: f.calleeName ?? f.callee_name, - argIndex: f.argIndex ?? f.arg_index, - argName: f.argName ?? f.arg_name ?? null, - binding: - (f.bindingType ?? f.binding_type) ? { type: f.bindingType ?? f.binding_type } : null, + argFlows: (result.dataflow.argFlows ?? []).map((f) => ({ + callerFunc: f.callerFunc ?? null, + calleeName: f.calleeName, + argIndex: f.argIndex, + argName: f.argName ?? null, + binding: f.bindingType ? { type: f.bindingType } : null, confidence: f.confidence, expression: f.expression ?? '', line: f.line, })), mutations: (result.dataflow.mutations || []).map((m) => ({ - funcName: m.funcName ?? m.func_name ?? null, - receiverName: m.receiverName ?? m.receiver_name, - binding: - (m.bindingType ?? m.binding_type) ? { type: m.bindingType ?? m.binding_type } : null, - mutatingExpr: m.mutatingExpr ?? m.mutating_expr, + funcName: m.funcName ?? null, + receiverName: m.receiverName, + binding: m.bindingType ? { type: m.bindingType } : null, + mutatingExpr: m.mutatingExpr, line: m.line, })), } From 46eaa11011f627eab8a5edf7e0dfe6c6ba99ad7d Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 4 Mar 2026 19:36:13 -0700 Subject: [PATCH 3/7] =?UTF-8?q?fix(native):=20address=20dataflow=20review?= =?UTF-8?q?=20comments=20=E2=80=94=20conditional=20extraction,=20depth=20l?= =?UTF-8?q?imit,=20method=20mutations,=20truncation=20parity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add include_dataflow flag to parse_files_parallel/parse_file to skip dataflow extraction when not needed (threaded from builder.js opts) - Add MAX_VISIT_DEPTH (200) to prevent stack overflow on deeply nested ASTs - Add method_call_name_field to DataflowRules for languages where method calls use a different field than call_function_field (fixes dead mutating_methods for Rust's method_call_expression) - Fix truncate() to use chars().count() instead of byte length for parity with JS str.length on non-ASCII content Impact: 11 functions changed, 31 affected --- crates/codegraph-core/src/dataflow.rs | 72 +++++++++++++++++++++------ crates/codegraph-core/src/lib.rs | 18 +++++-- crates/codegraph-core/src/parallel.rs | 18 +++++-- src/builder.js | 2 +- src/parser.js | 4 +- 5 files changed, 87 insertions(+), 27 deletions(-) diff --git a/crates/codegraph-core/src/dataflow.rs b/crates/codegraph-core/src/dataflow.rs index 9b8358d..7ddf3dd 100644 --- a/crates/codegraph-core/src/dataflow.rs +++ b/crates/codegraph-core/src/dataflow.rs @@ -6,6 +6,10 @@ use crate::types::{ DataflowReturn, }; +/// Maximum recursion depth for AST traversal to prevent stack overflow +/// on deeply nested trees. Matches the approach used in cfg.rs. +const MAX_VISIT_DEPTH: usize = 200; + // ─── Param Strategy ────────────────────────────────────────────────────── /// Per-language parameter extraction strategy. @@ -80,6 +84,10 @@ pub struct DataflowRules { expression_stmt_node: &'static str, call_object_field: Option<&'static str>, + // Method call name extraction (for languages where method_call uses a different + // field than call_function_field, e.g. Rust's method_call_expression has "name") + method_call_name_field: Option<&'static str>, + // Structural wrappers expression_list_type: Option<&'static str>, equals_clause_type: Option<&'static str>, @@ -135,6 +143,7 @@ static JS_TS_DATAFLOW: DataflowRules = DataflowRules { ], expression_stmt_node: "expression_statement", call_object_field: None, + method_call_name_field: None, expression_list_type: None, equals_clause_type: None, argument_wrapper_type: None, @@ -181,6 +190,7 @@ static PYTHON_DATAFLOW: DataflowRules = DataflowRules { ], expression_stmt_node: "expression_statement", call_object_field: None, + method_call_name_field: None, expression_list_type: None, equals_clause_type: None, argument_wrapper_type: None, @@ -226,6 +236,7 @@ static GO_DATAFLOW: DataflowRules = DataflowRules { mutating_methods: &[], expression_stmt_node: "expression_statement", call_object_field: None, + method_call_name_field: None, expression_list_type: Some("expression_list"), equals_clause_type: None, argument_wrapper_type: None, @@ -269,6 +280,7 @@ static RUST_DATAFLOW: DataflowRules = DataflowRules { mutating_methods: &["push", "pop", "insert", "remove", "clear", "sort", "reverse"], expression_stmt_node: "expression_statement", call_object_field: None, + method_call_name_field: Some("name"), expression_list_type: None, equals_clause_type: None, argument_wrapper_type: None, @@ -316,6 +328,7 @@ static JAVA_DATAFLOW: DataflowRules = DataflowRules { mutating_methods: &["add", "remove", "clear", "put", "set", "push", "pop", "sort"], expression_stmt_node: "expression_statement", call_object_field: Some("object"), + method_call_name_field: None, expression_list_type: None, equals_clause_type: None, argument_wrapper_type: Some("argument"), @@ -364,6 +377,7 @@ static CSHARP_DATAFLOW: DataflowRules = DataflowRules { mutating_methods: &["Add", "Remove", "Clear", "Insert", "Sort", "Reverse", "Push", "Pop"], expression_stmt_node: "expression_statement", call_object_field: None, + method_call_name_field: None, expression_list_type: None, equals_clause_type: Some("equals_value_clause"), argument_wrapper_type: Some("argument"), @@ -416,6 +430,7 @@ static PHP_DATAFLOW: DataflowRules = DataflowRules { mutating_methods: &["push", "pop", "shift", "unshift", "splice", "sort", "reverse"], expression_stmt_node: "expression_statement", call_object_field: None, + method_call_name_field: None, expression_list_type: None, equals_clause_type: None, argument_wrapper_type: Some("argument"), @@ -462,6 +477,7 @@ static RUBY_DATAFLOW: DataflowRules = DataflowRules { ], expression_stmt_node: "expression_statement", call_object_field: None, + method_call_name_field: None, expression_list_type: None, equals_clause_type: None, argument_wrapper_type: None, @@ -504,17 +520,16 @@ fn is_ident(rules: &DataflowRules, kind: &str) -> bool { } fn truncate(s: &str, max: usize) -> String { - if s.len() <= max { + if s.chars().count() <= max { s.to_string() } else { - let mut result = String::with_capacity(max + 3); - // Take at most `max` bytes, but don't split a char - for ch in s.chars() { - if result.len() + ch.len_utf8() > max { - break; - } - result.push(ch); - } + // Find the byte offset of the max-th character + let byte_offset = s + .char_indices() + .nth(max) + .map(|(i, _)| i) + .unwrap_or(s.len()); + let mut result = s[..byte_offset].to_string(); result.push('…'); result } @@ -908,6 +923,7 @@ pub fn extract_dataflow(tree: &Tree, source: &[u8], lang_id: &str) -> Option, arg_flows: &mut Vec, mutations: &mut Vec, + depth: usize, ) { + if depth >= MAX_VISIT_DEPTH { + return; + } + let t = node.kind(); // Enter function scope @@ -938,7 +959,7 @@ fn visit( enter_scope(node, rules, source, scope_stack, parameters); let cursor = &mut node.walk(); for child in node.named_children(cursor) { - visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); } scope_stack.pop(); return; @@ -966,7 +987,7 @@ fn visit( } let cursor = &mut node.walk(); for child in node.named_children(cursor) { - visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); } return; } @@ -976,7 +997,7 @@ fn visit( handle_var_declarator(node, rules, source, scope_stack, assignments); let cursor = &mut node.walk(); for child in node.named_children(cursor) { - visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); } return; } @@ -986,7 +1007,7 @@ fn visit( handle_var_declarator(node, rules, source, scope_stack, assignments); let cursor = &mut node.walk(); for child in node.named_children(cursor) { - visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); } return; } @@ -996,7 +1017,7 @@ fn visit( handle_call_expr(node, rules, source, scope_stack, arg_flows); let cursor = &mut node.walk(); for child in node.named_children(cursor) { - visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); } return; } @@ -1006,7 +1027,7 @@ fn visit( handle_assignment(node, rules, source, scope_stack, assignments, mutations); let cursor = &mut node.walk(); for child in node.named_children(cursor) { - visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); } return; } @@ -1019,7 +1040,7 @@ fn visit( // Default: visit children let cursor = &mut node.walk(); for child in node.named_children(cursor) { - visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations); + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); } } @@ -1344,6 +1365,25 @@ fn handle_expr_stmt_mutation( } } + // Method call pattern: call node has a dedicated name field distinct from + // call_function_field (e.g. Rust method_call_expression has "name" + receiver via + // member_object_field "value") + if method_name.is_none() { + if let Some(name_field) = rules.method_call_name_field { + if let Some(name_n) = expr.child_by_field_name(name_field) { + method_name = Some(node_text(&name_n, source).to_string()); + // Extract receiver: try member_object_field on the call expr itself + if let Some(recv_node) = expr.child_by_field_name(rules.member_object_field) { + if is_ident(rules, recv_node.kind()) { + receiver = Some(node_text(&recv_node, source).to_string()); + } else if rules.member_node.is_some_and(|m| m == recv_node.kind()) { + receiver = member_receiver(&recv_node, rules, source); + } + } + } + } + } + // Java/combined pattern: call node itself has object + name fields if receiver.is_none() { if let Some(obj_field) = rules.call_object_field { diff --git a/crates/codegraph-core/src/lib.rs b/crates/codegraph-core/src/lib.rs index ccd3534..607aec1 100644 --- a/crates/codegraph-core/src/lib.rs +++ b/crates/codegraph-core/src/lib.rs @@ -13,15 +13,25 @@ use napi_derive::napi; use types::*; /// Parse a single file and return extracted symbols. +/// When `include_dataflow` is true, dataflow analysis is also extracted. #[napi] -pub fn parse_file(file_path: String, source: String) -> Option { - parallel::parse_file(&file_path, &source) +pub fn parse_file( + file_path: String, + source: String, + include_dataflow: Option, +) -> Option { + parallel::parse_file(&file_path, &source, include_dataflow.unwrap_or(false)) } /// Parse multiple files in parallel and return all extracted symbols. +/// When `include_dataflow` is true, dataflow analysis is also extracted. #[napi] -pub fn parse_files(file_paths: Vec, root_dir: String) -> Vec { - parallel::parse_files_parallel(&file_paths, &root_dir) +pub fn parse_files( + file_paths: Vec, + root_dir: String, + include_dataflow: Option, +) -> Vec { + parallel::parse_files_parallel(&file_paths, &root_dir, include_dataflow.unwrap_or(false)) } /// Resolve a single import path. diff --git a/crates/codegraph-core/src/parallel.rs b/crates/codegraph-core/src/parallel.rs index 4127a27..7fb0d8d 100644 --- a/crates/codegraph-core/src/parallel.rs +++ b/crates/codegraph-core/src/parallel.rs @@ -10,7 +10,12 @@ use crate::types::FileSymbols; /// Parse multiple files in parallel using rayon. /// Each thread creates its own Parser (cheap; Language objects are Send+Sync). /// Failed files are silently skipped (matches WASM behavior). -pub fn parse_files_parallel(file_paths: &[String], _root_dir: &str) -> Vec { +/// When `include_dataflow` is false, dataflow extraction is skipped for performance. +pub fn parse_files_parallel( + file_paths: &[String], + _root_dir: &str, + include_dataflow: bool, +) -> Vec { file_paths .par_iter() .filter_map(|file_path| { @@ -25,7 +30,9 @@ pub fn parse_files_parallel(file_paths: &[String], _root_dir: &str) -> Vec Vec Option { +/// When `include_dataflow` is false, dataflow extraction is skipped for performance. +pub fn parse_file(file_path: &str, source: &str, include_dataflow: bool) -> Option { let lang = LanguageKind::from_extension(file_path)?; let source_bytes = source.as_bytes(); @@ -45,7 +53,9 @@ pub fn parse_file(file_path: &str, source: &str) -> Option { let tree = parser.parse(source_bytes, None)?; let line_count = source_bytes.iter().filter(|&&b| b == b'\n').count() as u32 + 1; let mut symbols = extract_symbols(lang, &tree, source_bytes, file_path); - symbols.dataflow = extract_dataflow(&tree, source_bytes, lang.lang_id_str()); + if include_dataflow { + symbols.dataflow = extract_dataflow(&tree, source_bytes, lang.lang_id_str()); + } symbols.line_count = Some(line_count); Some(symbols) } diff --git a/src/builder.js b/src/builder.js index c5019b4..af312a5 100644 --- a/src/builder.js +++ b/src/builder.js @@ -444,7 +444,7 @@ export async function buildGraph(rootDir, opts = {}) { opts.incremental !== false && config.build && config.build.incremental !== false; // Engine selection: 'native', 'wasm', or 'auto' (default) - const engineOpts = { engine: opts.engine || 'auto' }; + const engineOpts = { engine: opts.engine || 'auto', dataflow: opts.dataflow !== false }; const { name: engineName, version: engineVersion } = getActiveEngine(engineOpts); info(`Using ${engineName} engine${engineVersion ? ` (v${engineVersion})` : ''}`); diff --git a/src/parser.js b/src/parser.js index b8b2979..e9d5a1e 100644 --- a/src/parser.js +++ b/src/parser.js @@ -440,7 +440,7 @@ export async function parseFileAuto(filePath, source, opts = {}) { const { native } = resolveEngine(opts); if (native) { - const result = native.parseFile(filePath, source); + const result = native.parseFile(filePath, source, !!opts.dataflow); return result ? normalizeNativeSymbols(result) : null; } @@ -463,7 +463,7 @@ export async function parseFilesAuto(filePaths, rootDir, opts = {}) { const result = new Map(); if (native) { - const nativeResults = native.parseFiles(filePaths, rootDir); + const nativeResults = native.parseFiles(filePaths, rootDir, !!opts.dataflow); for (const r of nativeResults) { if (!r) continue; const relPath = path.relative(rootDir, r.file).split(path.sep).join('/'); From 6b0a9e7090ee32773df7cea9b9c9143d80ec3177 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 4 Mar 2026 20:11:16 -0700 Subject: [PATCH 4/7] fix(native): use correct receiver field for Rust method call mutations Impact: 1 functions changed, 4 affected --- crates/codegraph-core/src/dataflow.rs | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/crates/codegraph-core/src/dataflow.rs b/crates/codegraph-core/src/dataflow.rs index 7ddf3dd..f479c10 100644 --- a/crates/codegraph-core/src/dataflow.rs +++ b/crates/codegraph-core/src/dataflow.rs @@ -88,6 +88,11 @@ pub struct DataflowRules { // field than call_function_field, e.g. Rust's method_call_expression has "name") method_call_name_field: Option<&'static str>, + // Method call receiver extraction (for languages where the method call receiver + // uses a different field than member_object_field, e.g. Rust's + // method_call_expression exposes "receiver" not "value") + method_call_receiver_field: Option<&'static str>, + // Structural wrappers expression_list_type: Option<&'static str>, equals_clause_type: Option<&'static str>, @@ -144,6 +149,7 @@ static JS_TS_DATAFLOW: DataflowRules = DataflowRules { expression_stmt_node: "expression_statement", call_object_field: None, method_call_name_field: None, + method_call_receiver_field: None, expression_list_type: None, equals_clause_type: None, argument_wrapper_type: None, @@ -191,6 +197,7 @@ static PYTHON_DATAFLOW: DataflowRules = DataflowRules { expression_stmt_node: "expression_statement", call_object_field: None, method_call_name_field: None, + method_call_receiver_field: None, expression_list_type: None, equals_clause_type: None, argument_wrapper_type: None, @@ -237,6 +244,7 @@ static GO_DATAFLOW: DataflowRules = DataflowRules { expression_stmt_node: "expression_statement", call_object_field: None, method_call_name_field: None, + method_call_receiver_field: None, expression_list_type: Some("expression_list"), equals_clause_type: None, argument_wrapper_type: None, @@ -281,6 +289,7 @@ static RUST_DATAFLOW: DataflowRules = DataflowRules { expression_stmt_node: "expression_statement", call_object_field: None, method_call_name_field: Some("name"), + method_call_receiver_field: Some("receiver"), expression_list_type: None, equals_clause_type: None, argument_wrapper_type: None, @@ -329,6 +338,7 @@ static JAVA_DATAFLOW: DataflowRules = DataflowRules { expression_stmt_node: "expression_statement", call_object_field: Some("object"), method_call_name_field: None, + method_call_receiver_field: None, expression_list_type: None, equals_clause_type: None, argument_wrapper_type: Some("argument"), @@ -378,6 +388,7 @@ static CSHARP_DATAFLOW: DataflowRules = DataflowRules { expression_stmt_node: "expression_statement", call_object_field: None, method_call_name_field: None, + method_call_receiver_field: None, expression_list_type: None, equals_clause_type: Some("equals_value_clause"), argument_wrapper_type: Some("argument"), @@ -431,6 +442,7 @@ static PHP_DATAFLOW: DataflowRules = DataflowRules { expression_stmt_node: "expression_statement", call_object_field: None, method_call_name_field: None, + method_call_receiver_field: None, expression_list_type: None, equals_clause_type: None, argument_wrapper_type: Some("argument"), @@ -478,6 +490,7 @@ static RUBY_DATAFLOW: DataflowRules = DataflowRules { expression_stmt_node: "expression_statement", call_object_field: None, method_call_name_field: None, + method_call_receiver_field: None, expression_list_type: None, equals_clause_type: None, argument_wrapper_type: None, @@ -1366,14 +1379,17 @@ fn handle_expr_stmt_mutation( } // Method call pattern: call node has a dedicated name field distinct from - // call_function_field (e.g. Rust method_call_expression has "name" + receiver via - // member_object_field "value") + // call_function_field (e.g. Rust method_call_expression has "name" + "receiver") if method_name.is_none() { if let Some(name_field) = rules.method_call_name_field { if let Some(name_n) = expr.child_by_field_name(name_field) { method_name = Some(node_text(&name_n, source).to_string()); - // Extract receiver: try member_object_field on the call expr itself - if let Some(recv_node) = expr.child_by_field_name(rules.member_object_field) { + // Extract receiver: prefer method_call_receiver_field if set, + // otherwise fall back to member_object_field + let recv_field = rules + .method_call_receiver_field + .unwrap_or(rules.member_object_field); + if let Some(recv_node) = expr.child_by_field_name(recv_field) { if is_ident(rules, recv_node.kind()) { receiver = Some(node_text(&recv_node, source).to_string()); } else if rules.member_node.is_some_and(|m| m == recv_node.kind()) { From 5c67b54c65e5df5ea0746756d9847b96440d7b64 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 4 Mar 2026 20:25:22 -0700 Subject: [PATCH 5/7] fix(native): add depth guard to collect_identifiers in dataflow Impact: 2 functions changed, 2 affected --- crates/codegraph-core/src/dataflow.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/crates/codegraph-core/src/dataflow.rs b/crates/codegraph-core/src/dataflow.rs index f479c10..6b58e38 100644 --- a/crates/codegraph-core/src/dataflow.rs +++ b/crates/codegraph-core/src/dataflow.rs @@ -851,14 +851,17 @@ fn member_receiver(member_expr: &Node, rules: &DataflowRules, source: &[u8]) -> } /// Collect all identifier names referenced within a node. -fn collect_identifiers(node: &Node, out: &mut Vec, rules: &DataflowRules, source: &[u8]) { +fn collect_identifiers(node: &Node, out: &mut Vec, rules: &DataflowRules, source: &[u8], depth: usize) { + if depth > MAX_VISIT_DEPTH { + return; + } if is_ident(rules, node.kind()) { out.push(node_text(node, source).to_string()); return; } let cursor = &mut node.walk(); for child in node.named_children(cursor) { - collect_identifiers(&child, out, rules, source); + collect_identifiers(&child, out, rules, source, depth + 1); } } @@ -985,7 +988,7 @@ fn visit( let expr = node.named_child(0); let mut referenced_names = Vec::new(); if let Some(ref e) = expr { - collect_identifiers(e, &mut referenced_names, rules, source); + collect_identifiers(e, &mut referenced_names, rules, source, depth + 1); } returns.push(DataflowReturn { func_name: func_name.clone(), From 01d4b5f1af605f908ea178f7146fbd906a1bf56c Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 4 Mar 2026 23:00:43 -0700 Subject: [PATCH 6/7] perf: skip dataflow computation in incremental CFG-only rebuilds When only needsCfg is true in the pending analysis path, the native engine was still computing dataflow for every file via engineOpts defaulting dataflow to true. The results were immediately discarded since the needsDataflow guard was false. Override engineOpts.dataflow in the analysis call site so it only runs when actually needed. Impact: 1 functions changed, 0 affected --- src/builder.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/builder.js b/src/builder.js index af312a5..edcf8b1 100644 --- a/src/builder.js +++ b/src/builder.js @@ -548,7 +548,11 @@ export async function buildGraph(rootDir, opts = {}) { if (needsCfg || needsDataflow) { info('No file changes. Running pending analysis pass...'); - const analysisSymbols = await parseFilesAuto(files, rootDir, engineOpts); + const analysisOpts = { + ...engineOpts, + dataflow: needsDataflow && opts.dataflow !== false, + }; + const analysisSymbols = await parseFilesAuto(files, rootDir, analysisOpts); if (needsCfg) { const { buildCFGData } = await import('./cfg.js'); await buildCFGData(db, analysisSymbols, rootDir, engineOpts); From 654a60b68684672143177957614a4ec88e545d33 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 5 Mar 2026 02:25:03 -0700 Subject: [PATCH 7/7] fix: use >= for depth guard in collect_identifiers to match visit() Impact: 1 functions changed, 4 affected --- crates/codegraph-core/src/dataflow.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/codegraph-core/src/dataflow.rs b/crates/codegraph-core/src/dataflow.rs index 6b58e38..82c3022 100644 --- a/crates/codegraph-core/src/dataflow.rs +++ b/crates/codegraph-core/src/dataflow.rs @@ -852,7 +852,7 @@ fn member_receiver(member_expr: &Node, rules: &DataflowRules, source: &[u8]) -> /// Collect all identifier names referenced within a node. fn collect_identifiers(node: &Node, out: &mut Vec, rules: &DataflowRules, source: &[u8], depth: usize) { - if depth > MAX_VISIT_DEPTH { + if depth >= MAX_VISIT_DEPTH { return; } if is_ident(rules, node.kind()) {