From 3d1f41c2219ebb7358813a81afb69ed8295424b8 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 4 Mar 2026 02:37:02 -0700 Subject: [PATCH 1/9] perf: compute CFG in Rust native engine for all languages Port buildFunctionCFG algorithm from JS (cfg.js) to Rust (cfg.rs) with per-language CfgRules for all 8 supported languages. Each extractor now calls build_function_cfg() on function/method AST nodes during extraction, storing the CFG directly on the Definition struct. JS pipeline updated to use native CFG data when available (def.cfg), falling back to WASM tree walk only when native data is absent. This eliminates the need for WASM re-parsing in the CFG phase for native engine builds. Rust changes: - New cfg.rs module with CfgRules struct and 8 language configs - CfgBlock, CfgEdge, CfgData types in types.rs - All extractors call build_function_cfg for function/method defs JS changes: - parser.js normalizeNativeSymbols maps def.cfg through - cfg.js buildCFGData checks def.cfg before WASM fallback - Skips WASM parser init when all defs have native CFG Tests: 1437 pass, new cfg-all-langs.test.js with JS-side mock tests and native parity tests (block/edge count + type matching). 
Impact: 38 functions changed, 56 affected --- crates/codegraph-core/src/cfg.rs | 1045 +++++++++++++++++ .../codegraph-core/src/extractors/csharp.rs | 10 + crates/codegraph-core/src/extractors/go.rs | 8 + crates/codegraph-core/src/extractors/hcl.rs | 1 + .../codegraph-core/src/extractors/helpers.rs | 1 + crates/codegraph-core/src/extractors/java.rs | 7 + .../src/extractors/javascript.rs | 13 + crates/codegraph-core/src/extractors/php.rs | 8 + .../codegraph-core/src/extractors/python.rs | 4 + crates/codegraph-core/src/extractors/ruby.rs | 5 + .../src/extractors/rust_lang.rs | 7 + crates/codegraph-core/src/lib.rs | 1 + crates/codegraph-core/src/types.rs | 31 + src/cfg.js | 37 +- src/parser.js | 16 + tests/parsers/cfg-all-langs.test.js | 461 ++++++++ 16 files changed, 1646 insertions(+), 9 deletions(-) create mode 100644 crates/codegraph-core/src/cfg.rs create mode 100644 tests/parsers/cfg-all-langs.test.js diff --git a/crates/codegraph-core/src/cfg.rs b/crates/codegraph-core/src/cfg.rs new file mode 100644 index 0000000..6c0da09 --- /dev/null +++ b/crates/codegraph-core/src/cfg.rs @@ -0,0 +1,1045 @@ +use tree_sitter::Node; +use crate::types::{CfgBlock, CfgData, CfgEdge}; + +// ─── CFG Rules ────────────────────────────────────────────────────────── + +/// Per-language node type names for CFG construction. 
/// Table of tree-sitter node-kind names that drive CFG construction for one language.
///
/// `Option` fields hold at most one node kind; the `*_nodes` slices hold any additional
/// kinds that are handled the same way. A `None` / empty entry means the builder never
/// matches that construct for the language.
pub struct CfgRules {
    // Conditionals
    pub if_node: Option<&'static str>,
    pub if_nodes: &'static [&'static str],
    pub elif_node: Option<&'static str>,
    pub else_clause: Option<&'static str>,
    /// When `true`, the `alternative` field of an `if` may point directly at a nested
    /// `if` or a block instead of an `else_clause` wrapper.
    pub else_via_alternative: bool,
    /// Field name of the "then" branch; the builder falls back to `"consequence"`.
    pub if_consequent_field: Option<&'static str>,
    // Loops
    pub for_nodes: &'static [&'static str],
    pub while_node: Option<&'static str>,
    pub while_nodes: &'static [&'static str],
    pub do_node: Option<&'static str>,
    pub infinite_loop_node: Option<&'static str>,
    pub unless_node: Option<&'static str>,
    pub until_node: Option<&'static str>,
    // Switch / match
    pub switch_node: Option<&'static str>,
    pub switch_nodes: &'static [&'static str],
    pub case_node: Option<&'static str>,
    pub case_nodes: &'static [&'static str],
    pub default_node: Option<&'static str>,
    // Exception handling
    pub try_node: Option<&'static str>,
    pub catch_node: Option<&'static str>,
    pub finally_node: Option<&'static str>,
    // Jumps
    pub return_node: Option<&'static str>,
    pub throw_node: Option<&'static str>,
    pub break_node: Option<&'static str>,
    pub continue_node: Option<&'static str>,
    // Blocks and labels
    pub block_node: Option<&'static str>,
    pub block_nodes: &'static [&'static str],
    pub labeled_node: Option<&'static str>,
}

/// Returns `true` when `opt` is `Some` and equal to `kind`.
fn matches_opt(kind: &str, opt: Option<&str>) -> bool {
    opt.is_some_and(|s| s == kind)
}

/// Returns `true` when `kind` is one of the entries of `slice`.
fn matches_slice(kind: &str, slice: &[&str]) -> bool {
    slice.contains(&kind)
}

// ─── Per-Language Rules ─────────────────────────────────────────────────

pub static JS_TS_CFG: CfgRules = CfgRules {
    if_node: Some("if_statement"),
    if_nodes: &[],
    elif_node: None,
    else_clause: Some("else_clause"),
    else_via_alternative: false,
    if_consequent_field: None,
    for_nodes: &["for_statement", "for_in_statement"],
    while_node: Some("while_statement"),
    while_nodes: &[],
    do_node: Some("do_statement"),
    infinite_loop_node: None,
    unless_node: None,
    until_node: None,
    switch_node: Some("switch_statement"),
    switch_nodes: &[],
    case_node: Some("switch_case"),
    case_nodes: &[],
    default_node: Some("switch_default"),
    try_node: Some("try_statement"),
    catch_node: Some("catch_clause"),
    finally_node: Some("finally_clause"),
    return_node: Some("return_statement"),
    throw_node: Some("throw_statement"),
    break_node: Some("break_statement"),
    continue_node: Some("continue_statement"),
    block_node: Some("statement_block"),
    block_nodes: &[],
    labeled_node: Some("labeled_statement"),
};

pub static PYTHON_CFG: CfgRules = CfgRules {
    if_node: Some("if_statement"),
    if_nodes: &[],
    elif_node: Some("elif_clause"),
    else_clause: Some("else_clause"),
    else_via_alternative: false,
    if_consequent_field: None,
    for_nodes: &["for_statement"],
    while_node: Some("while_statement"),
    while_nodes: &[],
    do_node: None,
    infinite_loop_node: None,
    unless_node: None,
    until_node: None,
    switch_node: Some("match_statement"),
    switch_nodes: &[],
    case_node: Some("case_clause"),
    case_nodes: &[],
    default_node: None,
    try_node: Some("try_statement"),
    catch_node: Some("except_clause"),
    finally_node: Some("finally_clause"),
    return_node: Some("return_statement"),
    throw_node: Some("raise_statement"),
    break_node: Some("break_statement"),
    continue_node: Some("continue_statement"),
    block_node: Some("block"),
    block_nodes: &[],
    labeled_node: None,
};

pub static GO_CFG: CfgRules = CfgRules {
    if_node: Some("if_statement"),
    if_nodes: &[],
    elif_node: None,
    else_clause: None,
    else_via_alternative: true,
    if_consequent_field: None,
    for_nodes: &["for_statement"],
    while_node: None,
    while_nodes: &[],
    do_node: None,
    infinite_loop_node: None,
    unless_node: None,
    until_node: None,
    switch_node: None,
    switch_nodes: &["expression_switch_statement", "type_switch_statement", "select_statement"],
    case_node: Some("expression_case"),
    case_nodes: &["type_case", "communication_case"],
    default_node: Some("default_case"),
    try_node: None,
    catch_node: None,
    finally_node: None,
    return_node: Some("return_statement"),
    throw_node: None,
    break_node: Some("break_statement"),
    continue_node: Some("continue_statement"),
    block_node: Some("block"),
    block_nodes: &[],
    labeled_node: Some("labeled_statement"),
};

pub static RUST_CFG: CfgRules = CfgRules {
    if_node: Some("if_expression"),
    if_nodes: &["if_let_expression"],
    elif_node: None,
    else_clause: Some("else_clause"),
    else_via_alternative: false,
    if_consequent_field: None,
    for_nodes: &["for_expression"],
    while_node: Some("while_expression"),
    while_nodes: &["while_let_expression"],
    do_node: None,
    infinite_loop_node: Some("loop_expression"),
    unless_node: None,
    until_node: None,
    switch_node: Some("match_expression"),
    switch_nodes: &[],
    case_node: Some("match_arm"),
    case_nodes: &[],
    default_node: None,
    try_node: None,
    catch_node: None,
    finally_node: None,
    return_node: Some("return_expression"),
    throw_node: None,
    break_node: Some("break_expression"),
    continue_node: Some("continue_expression"),
    block_node: Some("block"),
    block_nodes: &[],
    labeled_node: None,
};

pub static JAVA_CFG: CfgRules = CfgRules {
    if_node: Some("if_statement"),
    if_nodes: &[],
    elif_node: None,
    else_clause: None,
    else_via_alternative: true,
    if_consequent_field: None,
    for_nodes: &["for_statement", "enhanced_for_statement"],
    while_node: Some("while_statement"),
    while_nodes: &[],
    do_node: Some("do_statement"),
    infinite_loop_node: None,
    unless_node: None,
    until_node: None,
    switch_node: Some("switch_expression"),
    switch_nodes: &[],
    case_node: Some("switch_block_statement_group"),
    case_nodes: &["switch_rule"],
    default_node: None,
    try_node: Some("try_statement"),
    catch_node: Some("catch_clause"),
    finally_node: Some("finally_clause"),
    return_node: Some("return_statement"),
    throw_node: Some("throw_statement"),
    break_node: Some("break_statement"),
    continue_node: Some("continue_statement"),
    block_node: Some("block"),
    block_nodes: &[],
    labeled_node: Some("labeled_statement"),
};

pub static CSHARP_CFG: CfgRules = CfgRules {
    if_node: Some("if_statement"),
    if_nodes: &[],
    elif_node: None,
    else_clause: None,
    else_via_alternative: true,
    if_consequent_field: None,
    for_nodes: &["for_statement", "foreach_statement"],
    while_node: Some("while_statement"),
    while_nodes: &[],
    do_node: Some("do_statement"),
    infinite_loop_node: None,
    unless_node: None,
    until_node: None,
    switch_node: Some("switch_statement"),
    switch_nodes: &[],
    case_node: Some("switch_section"),
    case_nodes: &[],
    default_node: None,
    try_node: Some("try_statement"),
    catch_node: Some("catch_clause"),
    finally_node: Some("finally_clause"),
    return_node: Some("return_statement"),
    throw_node: Some("throw_statement"),
    break_node: Some("break_statement"),
    continue_node: Some("continue_statement"),
    block_node: Some("block"),
    block_nodes: &[],
    labeled_node: Some("labeled_statement"),
};

pub static RUBY_CFG: CfgRules = CfgRules {
    if_node: Some("if"),
    if_nodes: &[],
    elif_node: Some("elsif"),
    else_clause: Some("else"),
    else_via_alternative: false,
    if_consequent_field: None,
    for_nodes: &["for"],
    while_node: Some("while"),
    while_nodes: &[],
    do_node: None,
    infinite_loop_node: None,
    unless_node: Some("unless"),
    until_node: Some("until"),
    switch_node: Some("case"),
    switch_nodes: &[],
    case_node: Some("when"),
    case_nodes: &[],
    default_node: Some("else"),
    try_node: Some("begin"),
    catch_node: Some("rescue"),
    finally_node: Some("ensure"),
    return_node: Some("return"),
    throw_node: None,
    break_node: Some("break"),
    continue_node: Some("next"),
    block_node: None,
    block_nodes: &["then", "do", "body_statement"],
    labeled_node: None,
};

pub static PHP_CFG: CfgRules = CfgRules {
    if_node: Some("if_statement"),
    if_nodes: &[],
    elif_node: Some("else_if_clause"),
    else_clause: Some("else_clause"),
    else_via_alternative: false,
    if_consequent_field: Some("body"),
    for_nodes: &["for_statement", "foreach_statement"],
    while_node: Some("while_statement"),
    while_nodes: &[],
    do_node: Some("do_statement"),
    infinite_loop_node: None,
    unless_node: None,
    until_node: None,
    switch_node: Some("switch_statement"),
    switch_nodes: &[],
    case_node: Some("case_statement"),
    case_nodes: &[],
    default_node: Some("default_statement"),
    try_node: Some("try_statement"),
    catch_node: Some("catch_clause"),
    finally_node: Some("finally_clause"),
    return_node: Some("return_statement"),
    throw_node: Some("throw_expression"),
    break_node: Some("break_statement"),
    continue_node: Some("continue_statement"),
    block_node: Some("compound_statement"),
    block_nodes: &[],
    labeled_node: None,
};

/// Get CFG rules for a language ID.
///
/// Returns `None` for any ID not listed in the match below.
pub fn get_cfg_rules(lang_id: &str) -> Option<&'static CfgRules> {
    match lang_id {
        "javascript" | "typescript" | "tsx" => Some(&JS_TS_CFG),
        "python" => Some(&PYTHON_CFG),
        "go" => Some(&GO_CFG),
        "rust" => Some(&RUST_CFG),
        "java" => Some(&JAVA_CFG),
        "c_sharp" => Some(&CSHARP_CFG),
        "ruby" => Some(&RUBY_CFG),
        "php" => Some(&PHP_CFG),
        _ => None,
    }
}

// ─── Core Algorithm ─────────────────────────────────────────────────────

/// Loop context for break/continue resolution.
struct LoopCtx {
    /// Block index that an unlabeled `continue` jumps to.
    header_idx: u32,
    /// Block index that an unlabeled `break` jumps to.
    exit_idx: u32,
}

/// Label context for labeled break/continue.
///
/// Both indices start as `None` and are filled in once a loop is created while the
/// label is active.
struct LabelCtx {
    header_idx: Option<u32>,
    exit_idx: Option<u32>,
}

// CFG builder state.
+struct CfgBuilder<'a> { + rules: &'a CfgRules, + blocks: Vec, + edges: Vec, + next_index: u32, + exit_idx: u32, + loop_stack: Vec, + label_map: Vec<(String, LabelCtx)>, +} + +impl<'a> CfgBuilder<'a> { + fn new(rules: &'a CfgRules) -> Self { + Self { + rules, + blocks: Vec::new(), + edges: Vec::new(), + next_index: 0, + exit_idx: 0, + loop_stack: Vec::new(), + label_map: Vec::new(), + } + } + + fn make_block(&mut self, block_type: &str, start_line: Option, end_line: Option, label: Option<&str>) -> u32 { + let idx = self.next_index; + self.next_index += 1; + self.blocks.push(CfgBlock { + index: idx, + block_type: block_type.to_string(), + start_line, + end_line, + label: label.map(|s| s.to_string()), + }); + idx + } + + fn add_edge(&mut self, source: u32, target: u32, kind: &str) { + self.edges.push(CfgEdge { + source_index: source, + target_index: target, + kind: kind.to_string(), + }); + } + + fn set_end_line(&mut self, block_idx: u32, line: u32) { + if let Some(b) = self.blocks.iter_mut().find(|b| b.index == block_idx) { + b.end_line = Some(line); + } + } + + fn set_start_line_if_empty(&mut self, block_idx: u32, line: u32) { + if let Some(b) = self.blocks.iter_mut().find(|b| b.index == block_idx) { + if b.start_line.is_none() { + b.start_line = Some(line); + } + } + } + + fn start_line_of(&self, block_idx: u32) -> Option { + self.blocks.iter().find(|b| b.index == block_idx).and_then(|b| b.start_line) + } + + /// Get statement children from a block or statement list. + fn get_statements<'b>(&self, node: &Node<'b>) -> Vec> { + let kind = node.kind(); + if matches_opt(kind, self.rules.block_node) || matches_slice(kind, self.rules.block_nodes) { + let mut stmts = Vec::new(); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + stmts.push(child); + } + return stmts; + } + // Single statement + vec![*node] + } + + /// Process a list of statements, returns the last current block or None if all paths terminated. 
+ fn process_statements(&mut self, stmts: &[Node], current: u32) -> Option { + let mut cur = Some(current); + for stmt in stmts { + match cur { + None => break, // Dead code after return/break/continue/throw + Some(c) => cur = self.process_statement(stmt, c), + } + } + cur + } + + /// Process a single statement. + fn process_statement(&mut self, stmt: &Node, current: u32) -> Option { + let kind = stmt.kind(); + + // Unwrap expression_statement (Rust uses expressions for control flow) + if kind == "expression_statement" && stmt.named_child_count() == 1 { + if let Some(inner) = stmt.named_child(0) { + let t = inner.kind(); + if matches_opt(t, self.rules.if_node) + || matches_slice(t, self.rules.if_nodes) + || matches_slice(t, self.rules.for_nodes) + || matches_opt(t, self.rules.while_node) + || matches_slice(t, self.rules.while_nodes) + || matches_opt(t, self.rules.do_node) + || matches_opt(t, self.rules.infinite_loop_node) + || matches_opt(t, self.rules.switch_node) + || matches_slice(t, self.rules.switch_nodes) + || matches_opt(t, self.rules.return_node) + || matches_opt(t, self.rules.throw_node) + || matches_opt(t, self.rules.break_node) + || matches_opt(t, self.rules.continue_node) + || matches_opt(t, self.rules.unless_node) + || matches_opt(t, self.rules.until_node) + { + return self.process_statement(&inner, current); + } + } + } + + // Labeled statement + if matches_opt(kind, self.rules.labeled_node) { + let label_node = stmt.child_by_field_name("label"); + let body = stmt.child_by_field_name("body"); + if let (Some(label_node), Some(body)) = (label_node, body) { + let label_name = label_node.utf8_text(&[]).unwrap_or("").to_string(); + // We can't know the loop blocks yet — push a placeholder + self.label_map.push((label_name.clone(), LabelCtx { header_idx: None, exit_idx: None })); + let result = self.process_statement(&body, current); + self.label_map.retain(|(n, _)| n != &label_name); + return result; + } + return Some(current); + } + + // If statement + if 
matches_opt(kind, self.rules.if_node) || matches_slice(kind, self.rules.if_nodes) { + return self.process_if(stmt, current); + } + + // Unless (Ruby) + if matches_opt(kind, self.rules.unless_node) { + return self.process_if(stmt, current); + } + + // For loops + if matches_slice(kind, self.rules.for_nodes) { + return self.process_for_loop(stmt, current); + } + + // While loop + if matches_opt(kind, self.rules.while_node) || matches_slice(kind, self.rules.while_nodes) { + return self.process_while_loop(stmt, current); + } + + // Until (Ruby) + if matches_opt(kind, self.rules.until_node) { + return self.process_while_loop(stmt, current); + } + + // Do-while + if matches_opt(kind, self.rules.do_node) { + return self.process_do_while_loop(stmt, current); + } + + // Infinite loop (Rust loop {}) + if matches_opt(kind, self.rules.infinite_loop_node) { + return self.process_infinite_loop(stmt, current); + } + + // Switch/match + if matches_opt(kind, self.rules.switch_node) || matches_slice(kind, self.rules.switch_nodes) { + return self.process_switch(stmt, current); + } + + // Try/catch/finally + if matches_opt(kind, self.rules.try_node) { + return self.process_try_catch(stmt, current); + } + + // Return + if matches_opt(kind, self.rules.return_node) { + self.set_end_line(current, node_line(stmt)); + self.add_edge(current, self.exit_idx, "return"); + return None; + } + + // Throw + if matches_opt(kind, self.rules.throw_node) { + self.set_end_line(current, node_line(stmt)); + self.add_edge(current, self.exit_idx, "exception"); + return None; + } + + // Break + if matches_opt(kind, self.rules.break_node) { + let label_name = stmt.child_by_field_name("label") + .map(|n| n.utf8_text(&[]).unwrap_or("").to_string()); + + let target = if let Some(ref name) = label_name { + self.label_map.iter().rev() + .find(|(n, _)| n == name) + .and_then(|(_, ctx)| ctx.exit_idx) + } else { + self.loop_stack.last().map(|ctx| ctx.exit_idx) + }; + + if let Some(target) = target { + 
self.set_end_line(current, node_line(stmt)); + self.add_edge(current, target, "break"); + return None; + } + return Some(current); + } + + // Continue + if matches_opt(kind, self.rules.continue_node) { + let label_name = stmt.child_by_field_name("label") + .map(|n| n.utf8_text(&[]).unwrap_or("").to_string()); + + let target = if let Some(ref name) = label_name { + self.label_map.iter().rev() + .find(|(n, _)| n == name) + .and_then(|(_, ctx)| ctx.header_idx) + } else { + self.loop_stack.last().map(|ctx| ctx.header_idx) + }; + + if let Some(target) = target { + self.set_end_line(current, node_line(stmt)); + self.add_edge(current, target, "continue"); + return None; + } + return Some(current); + } + + // Regular statement — extend current block + self.set_start_line_if_empty(current, node_line(stmt)); + self.set_end_line(current, node_end_line(stmt)); + Some(current) + } + + /// Process if/else-if/else chain (handles patterns A, B, C). + fn process_if(&mut self, if_stmt: &Node, current: u32) -> Option { + self.set_end_line(current, node_line(if_stmt)); + + let cond_block = self.make_block("condition", Some(node_line(if_stmt)), Some(node_line(if_stmt)), Some("if")); + self.add_edge(current, cond_block, "fallthrough"); + + let join_block = self.make_block("body", None, None, None); + + // True branch + let consequent_field = self.rules.if_consequent_field.unwrap_or("consequence"); + let consequent = if_stmt.child_by_field_name(consequent_field); + let true_block = self.make_block("branch_true", None, None, Some("then")); + self.add_edge(cond_block, true_block, "branch_true"); + + if let Some(consequent) = consequent { + let true_stmts = self.get_statements(&consequent); + let true_end = self.process_statements(&true_stmts, true_block); + if let Some(te) = true_end { + self.add_edge(te, join_block, "fallthrough"); + } + } else { + self.add_edge(true_block, join_block, "fallthrough"); + } + + // False branch + if self.rules.elif_node.is_some() { + // Pattern B: elif/else 
as siblings + self.process_elif_siblings(if_stmt, cond_block, join_block); + } else { + let alternative = if_stmt.child_by_field_name("alternative"); + if let Some(alternative) = alternative { + let alt_kind = alternative.kind(); + if self.rules.else_via_alternative && !matches_opt(alt_kind, self.rules.else_clause) { + // Pattern C: alternative points directly to if or block + if matches_opt(alt_kind, self.rules.if_node) || matches_slice(alt_kind, self.rules.if_nodes) { + let false_block = self.make_block("branch_false", None, None, Some("else-if")); + self.add_edge(cond_block, false_block, "branch_false"); + let else_if_end = self.process_if(&alternative, false_block); + if let Some(eie) = else_if_end { + self.add_edge(eie, join_block, "fallthrough"); + } + } else { + let false_block = self.make_block("branch_false", None, None, Some("else")); + self.add_edge(cond_block, false_block, "branch_false"); + let false_stmts = self.get_statements(&alternative); + let false_end = self.process_statements(&false_stmts, false_block); + if let Some(fe) = false_end { + self.add_edge(fe, join_block, "fallthrough"); + } + } + } else if matches_opt(alt_kind, self.rules.else_clause) { + // Pattern A: else_clause wrapper + let else_children: Vec = { + let cursor = &mut alternative.walk(); + alternative.named_children(cursor).collect() + }; + if else_children.len() == 1 + && (matches_opt(else_children[0].kind(), self.rules.if_node) + || matches_slice(else_children[0].kind(), self.rules.if_nodes)) + { + // else-if: recurse + let false_block = self.make_block("branch_false", None, None, Some("else-if")); + self.add_edge(cond_block, false_block, "branch_false"); + let else_if_end = self.process_if(&else_children[0], false_block); + if let Some(eie) = else_if_end { + self.add_edge(eie, join_block, "fallthrough"); + } + } else { + // else block + let false_block = self.make_block("branch_false", None, None, Some("else")); + self.add_edge(cond_block, false_block, "branch_false"); + let 
false_end = self.process_statements(&else_children, false_block); + if let Some(fe) = false_end { + self.add_edge(fe, join_block, "fallthrough"); + } + } + } else { + // Unknown alternative type — treat as no else + self.add_edge(cond_block, join_block, "branch_false"); + } + } else { + // No else: condition-false goes to join + self.add_edge(cond_block, join_block, "branch_false"); + } + } + + Some(join_block) + } + + /// Pattern B: elif/elsif/else_if as sibling children of the if node. + fn process_elif_siblings(&mut self, if_stmt: &Node, first_cond: u32, join_block: u32) { + let mut last_cond = first_cond; + let mut found_else = false; + + let cursor = &mut if_stmt.walk(); + let children: Vec = if_stmt.named_children(cursor).collect(); + + for child in &children { + let child_kind = child.kind(); + + if matches_opt(child_kind, self.rules.elif_node) { + let elif_cond = self.make_block("condition", Some(node_line(child)), Some(node_line(child)), Some("else-if")); + self.add_edge(last_cond, elif_cond, "branch_false"); + + let elif_consequent_field = self.rules.if_consequent_field.unwrap_or("consequence"); + let elif_consequent = child.child_by_field_name(elif_consequent_field); + let elif_true = self.make_block("branch_true", None, None, Some("then")); + self.add_edge(elif_cond, elif_true, "branch_true"); + + if let Some(cons) = elif_consequent { + let stmts = self.get_statements(&cons); + let end = self.process_statements(&stmts, elif_true); + if let Some(e) = end { + self.add_edge(e, join_block, "fallthrough"); + } + } else { + self.add_edge(elif_true, join_block, "fallthrough"); + } + + last_cond = elif_cond; + } else if matches_opt(child_kind, self.rules.else_clause) { + let else_block = self.make_block("branch_false", None, None, Some("else")); + self.add_edge(last_cond, else_block, "branch_false"); + + // Try field access first, then collect children + let else_body = child.child_by_field_name("body"); + let else_stmts: Vec = if let Some(body) = else_body { + 
self.get_statements(&body) + } else { + let cursor2 = &mut child.walk(); + child.named_children(cursor2).collect() + }; + let else_end = self.process_statements(&else_stmts, else_block); + if let Some(ee) = else_end { + self.add_edge(ee, join_block, "fallthrough"); + } + + found_else = true; + } + } + + if !found_else { + self.add_edge(last_cond, join_block, "branch_false"); + } + } + + /// Update label map with loop context (for newly created loops inside labeled stmts). + fn update_label_map(&mut self, header_idx: u32, exit_idx: u32) { + for (_, ctx) in self.label_map.iter_mut() { + if ctx.header_idx.is_none() { + ctx.header_idx = Some(header_idx); + ctx.exit_idx = Some(exit_idx); + } + } + } + + fn process_for_loop(&mut self, for_stmt: &Node, current: u32) -> Option { + let header = self.make_block("loop_header", Some(node_line(for_stmt)), Some(node_line(for_stmt)), Some("for")); + self.add_edge(current, header, "fallthrough"); + + let exit = self.make_block("body", None, None, None); + + self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit }); + self.update_label_map(header, exit); + + let body = for_stmt.child_by_field_name("body"); + let body_block = self.make_block("loop_body", None, None, None); + self.add_edge(header, body_block, "branch_true"); + + if let Some(body) = body { + let stmts = self.get_statements(&body); + let body_end = self.process_statements(&stmts, body_block); + if let Some(be) = body_end { + self.add_edge(be, header, "loop_back"); + } + } + + self.add_edge(header, exit, "loop_exit"); + self.loop_stack.pop(); + Some(exit) + } + + fn process_while_loop(&mut self, while_stmt: &Node, current: u32) -> Option { + let header = self.make_block("loop_header", Some(node_line(while_stmt)), Some(node_line(while_stmt)), Some("while")); + self.add_edge(current, header, "fallthrough"); + + let exit = self.make_block("body", None, None, None); + + self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit }); + 
self.update_label_map(header, exit); + + let body = while_stmt.child_by_field_name("body"); + let body_block = self.make_block("loop_body", None, None, None); + self.add_edge(header, body_block, "branch_true"); + + if let Some(body) = body { + let stmts = self.get_statements(&body); + let body_end = self.process_statements(&stmts, body_block); + if let Some(be) = body_end { + self.add_edge(be, header, "loop_back"); + } + } + + self.add_edge(header, exit, "loop_exit"); + self.loop_stack.pop(); + Some(exit) + } + + fn process_do_while_loop(&mut self, do_stmt: &Node, current: u32) -> Option { + let body_block = self.make_block("loop_body", Some(node_line(do_stmt)), None, Some("do")); + self.add_edge(current, body_block, "fallthrough"); + + let cond_block = self.make_block("loop_header", None, None, Some("do-while")); + let exit = self.make_block("body", None, None, None); + + self.loop_stack.push(LoopCtx { header_idx: cond_block, exit_idx: exit }); + self.update_label_map(cond_block, exit); + + let body = do_stmt.child_by_field_name("body"); + if let Some(body) = body { + let stmts = self.get_statements(&body); + let body_end = self.process_statements(&stmts, body_block); + if let Some(be) = body_end { + self.add_edge(be, cond_block, "fallthrough"); + } + } + + self.add_edge(cond_block, body_block, "loop_back"); + self.add_edge(cond_block, exit, "loop_exit"); + + self.loop_stack.pop(); + Some(exit) + } + + fn process_infinite_loop(&mut self, loop_stmt: &Node, current: u32) -> Option { + let header = self.make_block("loop_header", Some(node_line(loop_stmt)), Some(node_line(loop_stmt)), Some("loop")); + self.add_edge(current, header, "fallthrough"); + + let exit = self.make_block("body", None, None, None); + + self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit }); + self.update_label_map(header, exit); + + let body = loop_stmt.child_by_field_name("body"); + let body_block = self.make_block("loop_body", None, None, None); + self.add_edge(header, 
body_block, "branch_true"); + + if let Some(body) = body { + let stmts = self.get_statements(&body); + let body_end = self.process_statements(&stmts, body_block); + if let Some(be) = body_end { + self.add_edge(be, header, "loop_back"); + } + } + + // No loop_exit from header — only exit via break + self.loop_stack.pop(); + Some(exit) + } + + fn process_switch(&mut self, switch_stmt: &Node, current: u32) -> Option { + self.set_end_line(current, node_line(switch_stmt)); + + let switch_header = self.make_block("condition", Some(node_line(switch_stmt)), Some(node_line(switch_stmt)), Some("switch")); + self.add_edge(current, switch_header, "fallthrough"); + + let join_block = self.make_block("body", None, None, None); + + // Switch acts like a break target + self.loop_stack.push(LoopCtx { header_idx: switch_header, exit_idx: join_block }); + + // Get case children from body field or direct children + let container = switch_stmt.child_by_field_name("body").unwrap_or(*switch_stmt); + + let mut has_default = false; + let cursor = &mut container.walk(); + let case_children: Vec = container.named_children(cursor).collect(); + + for case_clause in &case_children { + let cc_kind = case_clause.kind(); + let is_default = matches_opt(cc_kind, self.rules.default_node); + let is_case = is_default + || matches_opt(cc_kind, self.rules.case_node) + || matches_slice(cc_kind, self.rules.case_nodes); + + if !is_case { + continue; + } + + let case_label = if is_default { "default" } else { "case" }; + let case_block = self.make_block("case", Some(node_line(case_clause)), None, Some(case_label)); + let edge_kind = if is_default { "branch_false" } else { "branch_true" }; + self.add_edge(switch_header, case_block, edge_kind); + if is_default { + has_default = true; + } + + // Extract case body + let case_body_node = case_clause.child_by_field_name("body") + .or_else(|| case_clause.child_by_field_name("consequence")); + + let case_stmts: Vec = if let Some(body_node) = case_body_node { + 
self.get_statements(&body_node) + } else { + let value_node = case_clause.child_by_field_name("value"); + let pattern_node = case_clause.child_by_field_name("pattern"); + let cursor2 = &mut case_clause.walk(); + case_clause.named_children(cursor2) + .filter(|child| { + if let Some(ref v) = value_node { if child.id() == v.id() { return false; } } + if let Some(ref p) = pattern_node { if child.id() == p.id() { return false; } } + child.kind() != "switch_label" + }) + .collect() + }; + + let case_end = self.process_statements(&case_stmts, case_block); + if let Some(ce) = case_end { + self.add_edge(ce, join_block, "fallthrough"); + } + } + + if !has_default { + self.add_edge(switch_header, join_block, "branch_false"); + } + + self.loop_stack.pop(); + Some(join_block) + } + + fn process_try_catch(&mut self, try_stmt: &Node, current: u32) -> Option { + self.set_end_line(current, node_line(try_stmt)); + + let join_block = self.make_block("body", None, None, None); + + // Try body + let try_body = try_stmt.child_by_field_name("body"); + let (try_body_start, try_stmts): (u32, Vec) = if let Some(body) = try_body { + (node_line(&body), self.get_statements(&body)) + } else { + let cursor = &mut try_stmt.walk(); + let stmts: Vec = try_stmt.named_children(cursor) + .filter(|child| { + let ck = child.kind(); + !matches_opt(ck, self.rules.catch_node) + && !matches_opt(ck, self.rules.finally_node) + }) + .collect(); + (node_line(try_stmt), stmts) + }; + + let try_block = self.make_block("body", Some(try_body_start), None, Some("try")); + self.add_edge(current, try_block, "fallthrough"); + let try_end = self.process_statements(&try_stmts, try_block); + + // Find catch and finally handlers + let mut catch_handler: Option = None; + let mut finally_handler: Option = None; + let cursor = &mut try_stmt.walk(); + for child in try_stmt.named_children(cursor) { + if matches_opt(child.kind(), self.rules.catch_node) { + catch_handler = Some(child); + } + if matches_opt(child.kind(), 
self.rules.finally_node) { + finally_handler = Some(child); + } + } + + if let Some(catch_node) = catch_handler { + let catch_block = self.make_block("catch", Some(node_line(&catch_node)), None, Some("catch")); + self.add_edge(try_block, catch_block, "exception"); + + let catch_body_node = catch_node.child_by_field_name("body"); + let catch_stmts: Vec = if let Some(body) = catch_body_node { + self.get_statements(&body) + } else { + let cursor2 = &mut catch_node.walk(); + catch_node.named_children(cursor2).collect() + }; + let catch_end = self.process_statements(&catch_stmts, catch_block); + + if let Some(finally_node) = finally_handler { + let finally_block = self.make_block("finally", Some(node_line(&finally_node)), None, Some("finally")); + if let Some(te) = try_end { + self.add_edge(te, finally_block, "fallthrough"); + } + if let Some(ce) = catch_end { + self.add_edge(ce, finally_block, "fallthrough"); + } + let finally_body = finally_node.child_by_field_name("body"); + let finally_stmts: Vec = if let Some(body) = finally_body { + self.get_statements(&body) + } else { + self.get_statements(&finally_node) + }; + let finally_end = self.process_statements(&finally_stmts, finally_block); + if let Some(fe) = finally_end { + self.add_edge(fe, join_block, "fallthrough"); + } + } else { + if let Some(te) = try_end { + self.add_edge(te, join_block, "fallthrough"); + } + if let Some(ce) = catch_end { + self.add_edge(ce, join_block, "fallthrough"); + } + } + } else if let Some(finally_node) = finally_handler { + let finally_block = self.make_block("finally", Some(node_line(&finally_node)), None, Some("finally")); + if let Some(te) = try_end { + self.add_edge(te, finally_block, "fallthrough"); + } + let finally_body = finally_node.child_by_field_name("body"); + let finally_stmts: Vec = if let Some(body) = finally_body { + self.get_statements(&body) + } else { + self.get_statements(&finally_node) + }; + let finally_end = self.process_statements(&finally_stmts, 
finally_block); + if let Some(fe) = finally_end { + self.add_edge(fe, join_block, "fallthrough"); + } + } else { + if let Some(te) = try_end { + self.add_edge(te, join_block, "fallthrough"); + } + } + + Some(join_block) + } +} + +// ─── Helpers ──────────────────────────────────────────────────────────── + +fn node_line(node: &Node) -> u32 { + node.start_position().row as u32 + 1 +} + +fn node_end_line(node: &Node) -> u32 { + node.end_position().row as u32 + 1 +} + +// ─── Public API ───────────────────────────────────────────────────────── + +/// Build a control flow graph for a single function AST node. +pub fn build_function_cfg(function_node: &Node, lang_id: &str) -> Option { + let rules = get_cfg_rules(lang_id)?; + + let mut builder = CfgBuilder::new(rules); + + let entry = builder.make_block("entry", None, None, None); + let exit = builder.make_block("exit", None, None, None); + builder.exit_idx = exit; + + let body = function_node.child_by_field_name("body"); + let body = match body { + Some(b) => b, + None => { + builder.add_edge(entry, exit, "fallthrough"); + return Some(CfgData { blocks: builder.blocks, edges: builder.edges }); + } + }; + + let stmts = builder.get_statements(&body); + if stmts.is_empty() { + builder.add_edge(entry, exit, "fallthrough"); + return Some(CfgData { blocks: builder.blocks, edges: builder.edges }); + } + + let first_block = builder.make_block("body", None, None, None); + builder.add_edge(entry, first_block, "fallthrough"); + + let last_block = builder.process_statements(&stmts, first_block); + if let Some(lb) = last_block { + builder.add_edge(lb, exit, "fallthrough"); + } + + Some(CfgData { blocks: builder.blocks, edges: builder.edges }) +} diff --git a/crates/codegraph-core/src/extractors/csharp.rs b/crates/codegraph-core/src/extractors/csharp.rs index 9d853ec..cc207bb 100644 --- a/crates/codegraph-core/src/extractors/csharp.rs +++ b/crates/codegraph-core/src/extractors/csharp.rs @@ -1,4 +1,5 @@ use tree_sitter::{Node, Tree}; 
+use crate::cfg::build_function_cfg; use crate::complexity::compute_all_metrics; use crate::types::*; use super::helpers::*; @@ -45,6 +46,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); extract_csharp_base_types(node, &class_name, source, symbols); @@ -61,6 +63,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); extract_csharp_base_types(node, &name, source, symbols); @@ -77,6 +80,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); extract_csharp_base_types(node, &name, source, symbols); @@ -93,6 +97,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); if let Some(body) = node.child_by_field_name("body") { @@ -111,6 +116,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&child)), decorators: None, complexity: compute_all_metrics(&child, source, "c_sharp"), + cfg: build_function_cfg(&child, "c_sharp"), children: None, }); } @@ -132,6 +138,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); } @@ -153,6 +160,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "c_sharp"), + cfg: build_function_cfg(node, "c_sharp"), children: opt_children(children), }); } @@ -174,6 +182,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: 
Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "c_sharp"), + cfg: build_function_cfg(node, "c_sharp"), children: opt_children(children), }); } @@ -194,6 +203,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "c_sharp"), + cfg: build_function_cfg(node, "c_sharp"), children: None, }); } diff --git a/crates/codegraph-core/src/extractors/go.rs b/crates/codegraph-core/src/extractors/go.rs index 23d7e1a..26cafa5 100644 --- a/crates/codegraph-core/src/extractors/go.rs +++ b/crates/codegraph-core/src/extractors/go.rs @@ -1,4 +1,5 @@ use tree_sitter::{Node, Tree}; +use crate::cfg::build_function_cfg; use crate::complexity::compute_all_metrics; use crate::types::*; use super::helpers::*; @@ -27,6 +28,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "go"), + cfg: build_function_cfg(node, "go"), children: opt_children(children), }); } @@ -65,6 +67,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "go"), + cfg: build_function_cfg(node, "go"), children: opt_children(children), }); } @@ -90,6 +93,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); } @@ -101,6 +105,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); // Extract interface methods @@ -121,6 +126,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&member)), decorators: None, complexity: None, + cfg: 
build_function_cfg(&member, "go"), children: None, }); } @@ -136,6 +142,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); } @@ -157,6 +164,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&spec)), decorators: None, complexity: None, + cfg: None, children: None, }); } diff --git a/crates/codegraph-core/src/extractors/hcl.rs b/crates/codegraph-core/src/extractors/hcl.rs index ab51641..349bc82 100644 --- a/crates/codegraph-core/src/extractors/hcl.rs +++ b/crates/codegraph-core/src/extractors/hcl.rs @@ -67,6 +67,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); diff --git a/crates/codegraph-core/src/extractors/helpers.rs b/crates/codegraph-core/src/extractors/helpers.rs index 7419f61..9c11b76 100644 --- a/crates/codegraph-core/src/extractors/helpers.rs +++ b/crates/codegraph-core/src/extractors/helpers.rs @@ -20,6 +20,7 @@ pub fn child_def(name: String, kind: &str, line: u32) -> Definition { end_line: None, decorators: None, complexity: None, + cfg: None, children: None, } } diff --git a/crates/codegraph-core/src/extractors/java.rs b/crates/codegraph-core/src/extractors/java.rs index fd07ac2..a2a8e8c 100644 --- a/crates/codegraph-core/src/extractors/java.rs +++ b/crates/codegraph-core/src/extractors/java.rs @@ -1,4 +1,5 @@ use tree_sitter::{Node, Tree}; +use crate::cfg::build_function_cfg; use crate::complexity::compute_all_metrics; use crate::types::*; use super::helpers::*; @@ -44,6 +45,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); @@ -97,6 +99,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { 
end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); if let Some(body) = node.child_by_field_name("body") { @@ -115,6 +118,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&child)), decorators: None, complexity: compute_all_metrics(&child, source, "java"), + cfg: build_function_cfg(&child, "java"), children: None, }); } @@ -136,6 +140,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); } @@ -157,6 +162,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "java"), + cfg: build_function_cfg(node, "java"), children: opt_children(children), }); } @@ -178,6 +184,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "java"), + cfg: build_function_cfg(node, "java"), children: opt_children(children), }); } diff --git a/crates/codegraph-core/src/extractors/javascript.rs b/crates/codegraph-core/src/extractors/javascript.rs index 7144cf9..3fc3ffb 100644 --- a/crates/codegraph-core/src/extractors/javascript.rs +++ b/crates/codegraph-core/src/extractors/javascript.rs @@ -1,4 +1,5 @@ use tree_sitter::{Node, Tree}; +use crate::cfg::build_function_cfg; use crate::complexity::compute_all_metrics; use crate::types::*; use super::helpers::*; @@ -27,6 +28,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "javascript"), + cfg: build_function_cfg(node, "javascript"), children: opt_children(children), }); } @@ -43,6 +45,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: 
Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); @@ -87,6 +90,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "javascript"), + cfg: build_function_cfg(node, "javascript"), children: opt_children(children), }); } @@ -102,6 +106,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); // Extract interface methods @@ -124,6 +129,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); } @@ -141,6 +147,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); } @@ -169,6 +176,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&value_n)), decorators: None, complexity: compute_all_metrics(&value_n, source, "javascript"), + cfg: build_function_cfg(&value_n, "javascript"), children: opt_children(children), }); } else if is_const && is_js_literal(&value_n) @@ -184,6 +192,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); } @@ -703,6 +712,7 @@ fn extract_interface_methods( end_line: Some(end_line(&child)), decorators: None, complexity: None, + cfg: build_function_cfg(&child, "javascript"), children: None, }); } @@ -919,6 +929,7 @@ fn extract_callback_definition(call_node: &Node, source: &[u8]) -> Option Option Option, + #[napi(js_name = "endLine")] + pub end_line: Option, + pub label: Option, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, 
Deserialize)] +pub struct CfgEdge { + #[napi(js_name = "sourceIndex")] + pub source_index: u32, + #[napi(js_name = "targetIndex")] + pub target_index: u32, + pub kind: String, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CfgData { + pub blocks: Vec, + pub edges: Vec, +} + #[napi(object)] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Definition { @@ -65,6 +95,7 @@ pub struct Definition { #[napi(ts_type = "string[] | undefined")] pub decorators: Option>, pub complexity: Option, + pub cfg: Option, #[napi(ts_type = "Definition[] | undefined")] pub children: Option>, } diff --git a/src/cfg.js b/src/cfg.js index 67dd333..3282605 100644 --- a/src/cfg.js +++ b/src/cfg.js @@ -1053,8 +1053,14 @@ export async function buildCFGData(db, fileSymbols, rootDir, _engineOpts) { if (!symbols._tree) { const ext = path.extname(relPath).toLowerCase(); if (CFG_EXTENSIONS.has(ext)) { - needsFallback = true; - break; + // Check if all function/method defs already have native CFG data + const hasNativeCfg = symbols.definitions + .filter((d) => (d.kind === 'function' || d.kind === 'method') && d.line) + .every((d) => d.cfg?.blocks?.length); + if (!hasNativeCfg) { + needsFallback = true; + break; + } } } } @@ -1102,8 +1108,13 @@ export async function buildCFGData(db, fileSymbols, rootDir, _engineOpts) { let tree = symbols._tree; let langId = symbols._langId; - // WASM fallback if no cached tree - if (!tree) { + // Check if all defs already have native CFG — skip WASM parse if so + const allNative = symbols.definitions + .filter((d) => (d.kind === 'function' || d.kind === 'method') && d.line) + .every((d) => d.cfg?.blocks?.length); + + // WASM fallback if no cached tree and not all native + if (!tree && !allNative) { if (!extToLang || !getParserFn) continue; langId = extToLang.get(ext); if (!langId || !CFG_LANG_IDS.has(langId)) continue; @@ -1135,7 +1146,7 @@ export async function buildCFGData(db, fileSymbols, rootDir, _engineOpts) { if 
(!cfgRules) continue; const complexityRules = COMPLEXITY_RULES.get(langId); - if (!complexityRules) continue; + // complexityRules only needed for WASM fallback path for (const def of symbols.definitions) { if (def.kind !== 'function' && def.kind !== 'method') continue; @@ -1144,11 +1155,19 @@ export async function buildCFGData(db, fileSymbols, rootDir, _engineOpts) { const row = getNodeId.get(def.name, relPath, def.line); if (!row) continue; - const funcNode = findFunctionNode(tree.rootNode, def.line, def.endLine, complexityRules); - if (!funcNode) continue; + // Native path: use pre-computed CFG from Rust engine + let cfg = null; + if (def.cfg?.blocks?.length) { + cfg = def.cfg; + } else { + // WASM fallback: compute CFG from tree-sitter AST + if (!tree) continue; + const funcNode = findFunctionNode(tree.rootNode, def.line, def.endLine, complexityRules); + if (!funcNode) continue; + cfg = buildFunctionCFG(funcNode, langId); + } - const cfg = buildFunctionCFG(funcNode, langId); - if (cfg.blocks.length === 0) continue; + if (!cfg || cfg.blocks.length === 0) continue; // Clear old CFG data for this function deleteEdges.run(row.id); diff --git a/src/parser.js b/src/parser.js index e4a4a2e..cb98498 100644 --- a/src/parser.js +++ b/src/parser.js @@ -205,6 +205,22 @@ function normalizeNativeSymbols(result) { maintainabilityIndex: d.complexity.maintainabilityIndex ?? null, } : null, + cfg: d.cfg?.blocks?.length + ? { + blocks: d.cfg.blocks.map((b) => ({ + index: b.index, + type: b.type, + startLine: b.startLine, + endLine: b.endLine, + label: b.label ?? null, + })), + edges: d.cfg.edges.map((e) => ({ + sourceIndex: e.sourceIndex, + targetIndex: e.targetIndex, + kind: e.kind, + })), + } + : null, children: d.children?.length ? 
d.children.map((c) => ({ name: c.name, diff --git a/tests/parsers/cfg-all-langs.test.js b/tests/parsers/cfg-all-langs.test.js new file mode 100644 index 0000000..0567944 --- /dev/null +++ b/tests/parsers/cfg-all-langs.test.js @@ -0,0 +1,461 @@ +/** + * Tests for native CFG extraction across all languages. + * + * 1. Verifies buildCFGData accepts native def.cfg for non-JS languages + * (tests the JS-side native path in buildCFGData). + * 2. When native engine is available, verifies each language extractor + * produces CFG data for function/method definitions. + * 3. Parity: compares native CFG block/edge counts against WASM buildFunctionCFG. + */ + +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import Database from 'better-sqlite3'; +import { afterAll, beforeAll, describe, expect, test } from 'vitest'; +import { buildCFGData, buildFunctionCFG } from '../../src/cfg.js'; +import { COMPLEXITY_RULES, findFunctionNode } from '../../src/complexity.js'; +import { initSchema } from '../../src/db.js'; +import { loadNative } from '../../src/native.js'; +import { createParsers, getParser, parseFilesAuto } from '../../src/parser.js'; + +// ─── Helpers ────────────────────────────────────────────────────────── + +function createTempDb() { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cfg-lang-')); + fs.mkdirSync(path.join(tmpDir, '.codegraph')); + const dbPath = path.join(tmpDir, '.codegraph', 'graph.db'); + const db = new Database(dbPath); + db.pragma('journal_mode = WAL'); + initSchema(db); + return { tmpDir, db }; +} + +// ─── JS-side: buildCFGData accepts native def.cfg ───────────────────── + +describe('buildCFGData — native CFG path', () => { + let tmpDir, db; + + beforeAll(() => { + ({ tmpDir, db } = createTempDb()); + }); + + afterAll(() => { + if (db) db.close(); + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + test('inserts native CFG data for a function with pre-computed cfg', async 
() => { + // Insert function node in DB + db.prepare('INSERT INTO nodes (name, kind, file, line, end_line) VALUES (?, ?, ?, ?, ?)').run( + 'processData', + 'function', + 'src/process.py', + 1, + 10, + ); + + const fileSymbols = new Map(); + fileSymbols.set('src/process.py', { + definitions: [ + { + name: 'processData', + kind: 'function', + line: 1, + endLine: 10, + cfg: { + blocks: [ + { index: 0, type: 'entry', startLine: null, endLine: null, label: null }, + { index: 1, type: 'exit', startLine: null, endLine: null, label: null }, + { index: 2, type: 'body', startLine: 2, endLine: 5, label: null }, + { index: 3, type: 'condition', startLine: 6, endLine: 6, label: 'if' }, + { index: 4, type: 'branch_true', startLine: 7, endLine: 8, label: 'then' }, + { index: 5, type: 'body', startLine: 9, endLine: 10, label: null }, + ], + edges: [ + { sourceIndex: 0, targetIndex: 2, kind: 'fallthrough' }, + { sourceIndex: 2, targetIndex: 3, kind: 'fallthrough' }, + { sourceIndex: 3, targetIndex: 4, kind: 'branch_true' }, + { sourceIndex: 3, targetIndex: 5, kind: 'branch_false' }, + { sourceIndex: 4, targetIndex: 5, kind: 'fallthrough' }, + { sourceIndex: 5, targetIndex: 1, kind: 'fallthrough' }, + ], + }, + }, + ], + calls: [], + _langId: 'python', + }); + + await buildCFGData(db, fileSymbols, tmpDir); + + const blocks = db.prepare('SELECT * FROM cfg_blocks ORDER BY block_index').all(); + expect(blocks.length).toBe(6); + expect(blocks[0].block_type).toBe('entry'); + expect(blocks[1].block_type).toBe('exit'); + expect(blocks[3].block_type).toBe('condition'); + + const edges = db.prepare('SELECT * FROM cfg_edges').all(); + expect(edges.length).toBe(6); + const edgeKinds = edges.map((e) => e.kind); + expect(edgeKinds).toContain('branch_true'); + expect(edgeKinds).toContain('branch_false'); + expect(edgeKinds).toContain('fallthrough'); + }); + + test('native CFG data does not require WASM tree', async () => { + const { tmpDir: tmpDir2, db: db2 } = createTempDb(); + + db2 + 
.prepare('INSERT INTO nodes (name, kind, file, line, end_line) VALUES (?, ?, ?, ?, ?)') + .run('hello', 'function', 'src/hello.rb', 1, 5); + + const fileSymbols = new Map(); + fileSymbols.set('src/hello.rb', { + definitions: [ + { + name: 'hello', + kind: 'function', + line: 1, + endLine: 5, + cfg: { + blocks: [ + { index: 0, type: 'entry', startLine: null, endLine: null, label: null }, + { index: 1, type: 'exit', startLine: null, endLine: null, label: null }, + { index: 2, type: 'body', startLine: 2, endLine: 4, label: null }, + ], + edges: [ + { sourceIndex: 0, targetIndex: 2, kind: 'fallthrough' }, + { sourceIndex: 2, targetIndex: 1, kind: 'fallthrough' }, + ], + }, + }, + ], + calls: [], + // No _tree (only _langId is set) — should still work with native CFG + _langId: 'ruby', + }); + + await buildCFGData(db2, fileSymbols, tmpDir2); + + const blocks = db2.prepare('SELECT * FROM cfg_blocks').all(); + expect(blocks.length).toBe(3); + + db2.close(); + fs.rmSync(tmpDir2, { recursive: true, force: true }); + }); +}); + +// ─── Native engine: multi-language CFG extraction + parity ───────────── + +const LANG_CFG_FIXTURES = { + 'fixture.js': ` +function processItems(items) { + if (items.length === 0) { + return []; + } + for (const item of items) { + console.log(item); + } + return items; +} +`, + 'fixture.py': ` +def process(data): + if not data: + raise ValueError("empty") + for item in data: + print(item) + return data +`, + 'fixture.go': ` +package main + +func process(items []string) []string { + if len(items) == 0 { + return nil + } + for _, item := range items { + println(item) + } + return items +} +`, + 'fixture.rs': ` +fn process(items: Vec) -> Vec { + if items.is_empty() { + return vec![]; + } + for item in &items { + println!("{}", item); + } + items +} +`, + 'fixture.java': ` +public class Processor { + public String[] process(String[] items) { + if (items.length == 0) { + return new String[0]; + } + for (String item : items) { + System.out.println(item); + } + 
return items; + } +} +`, + 'fixture.cs': ` +public class Processor { + public string[] Process(string[] items) { + if (items.Length == 0) { + return new string[0]; + } + foreach (var item in items) { + Console.WriteLine(item); + } + return items; + } +} +`, + 'fixture.rb': ` +class Processor + def process(items) + if items.empty? + return [] + end + items.each do |item| + puts item + end + items + end +end +`, + 'fixture.php': ` d.cfg?.blocks?.length > 0); + fs.rmSync(tmpCheck, { recursive: true, force: true }); + return hasCfg; + } catch { + return false; + } +} + +const canTestNativeCfg = nativeSupportsCfg(); + +describe.skipIf(!canTestNativeCfg)('native CFG — multi-language', () => { + let tmpDir; + const nativeResults = new Map(); + + beforeAll(async () => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cfg-multilang-')); + const srcDir = path.join(tmpDir, 'src'); + fs.mkdirSync(srcDir, { recursive: true }); + + const filePaths = []; + for (const [name, code] of Object.entries(LANG_CFG_FIXTURES)) { + const fp = path.join(srcDir, name); + fs.writeFileSync(fp, code); + filePaths.push(fp); + } + + const allSymbols = await parseFilesAuto(filePaths, tmpDir, { engine: 'native' }); + for (const [relPath, symbols] of allSymbols) { + nativeResults.set(relPath, symbols); + } + }); + + afterAll(() => { + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + const langTests = [ + { file: 'fixture.js', lang: 'JavaScript', funcPattern: /processItems/ }, + { file: 'fixture.py', lang: 'Python', funcPattern: /process/ }, + { file: 'fixture.go', lang: 'Go', funcPattern: /process/ }, + { file: 'fixture.rs', lang: 'Rust', funcPattern: /process/ }, + { file: 'fixture.java', lang: 'Java', funcPattern: /process/ }, + { file: 'fixture.cs', lang: 'C#', funcPattern: /Process/ }, + { file: 'fixture.rb', lang: 'Ruby', funcPattern: /process/ }, + { file: 'fixture.php', lang: 'PHP', funcPattern: /process/ }, + ]; + + for (const { file, lang, funcPattern } 
of langTests) { + test(`${lang}: native produces CFG data for function`, () => { + const relPath = `src/${file}`; + const symbols = nativeResults.get(relPath); + expect(symbols, `no symbols for ${relPath}`).toBeTruthy(); + + const funcDefs = symbols.definitions.filter( + (d) => (d.kind === 'function' || d.kind === 'method') && funcPattern.test(d.name), + ); + expect(funcDefs.length, `no function matching ${funcPattern} in ${relPath}`).toBeGreaterThan( + 0, + ); + + for (const def of funcDefs) { + expect(def.cfg, `no cfg on ${def.name}`).toBeTruthy(); + expect(def.cfg.blocks.length, `no blocks in cfg of ${def.name}`).toBeGreaterThan(0); + expect(def.cfg.edges.length, `no edges in cfg of ${def.name}`).toBeGreaterThan(0); + + // Entry and exit blocks should always be present + const blockTypes = def.cfg.blocks.map((b) => b.type); + expect(blockTypes).toContain('entry'); + expect(blockTypes).toContain('exit'); + + // At least one fallthrough edge + const edgeKinds = def.cfg.edges.map((e) => e.kind); + expect(edgeKinds).toContain('fallthrough'); + } + }); + } + + for (const { file, lang, funcPattern } of langTests) { + test(`${lang}: CFG has if-condition and for-loop blocks`, () => { + const relPath = `src/${file}`; + const symbols = nativeResults.get(relPath); + if (!symbols) return; + + const funcDefs = symbols.definitions.filter( + (d) => (d.kind === 'function' || d.kind === 'method') && funcPattern.test(d.name), + ); + if (funcDefs.length === 0) return; + + const def = funcDefs[0]; + const blockTypes = def.cfg.blocks.map((b) => b.type); + const edgeKinds = def.cfg.edges.map((e) => e.kind); + + // All fixtures have an if statement + expect(blockTypes).toContain('condition'); + expect(edgeKinds).toContain('branch_true'); + + // All fixtures have a for loop + expect(blockTypes).toContain('loop_header'); + expect(blockTypes).toContain('loop_body'); + }); + } +}); + +// ─── Parity: native vs WASM CFG ────────────────────────────────────── + 
+describe.skipIf(!canTestNativeCfg)('native vs WASM CFG parity', () => { + let tmpDir; + const nativeResults = new Map(); + let parsers; + + const LANG_MAP = { + '.js': 'javascript', + '.py': 'python', + '.go': 'go', + '.rs': 'rust', + '.java': 'java', + '.cs': 'csharp', + '.rb': 'ruby', + '.php': 'php', + }; + + beforeAll(async () => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cfg-parity-')); + const srcDir = path.join(tmpDir, 'src'); + fs.mkdirSync(srcDir, { recursive: true }); + + const filePaths = []; + for (const [name, code] of Object.entries(LANG_CFG_FIXTURES)) { + const fp = path.join(srcDir, name); + fs.writeFileSync(fp, code); + filePaths.push(fp); + } + + const allSymbols = await parseFilesAuto(filePaths, tmpDir, { engine: 'native' }); + for (const [relPath, symbols] of allSymbols) { + nativeResults.set(relPath, symbols); + } + + parsers = await createParsers(); + }); + + afterAll(() => { + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + const parityTests = [ + { file: 'fixture.js', ext: '.js', funcPattern: /processItems/ }, + { file: 'fixture.py', ext: '.py', funcPattern: /process/ }, + { file: 'fixture.java', ext: '.java', funcPattern: /process/ }, + { file: 'fixture.cs', ext: '.cs', funcPattern: /Process/ }, + { file: 'fixture.php', ext: '.php', funcPattern: /process/ }, + ]; + + for (const { file, ext, funcPattern } of parityTests) { + test(`parity: ${file} — native vs WASM block/edge counts match`, () => { + const relPath = `src/${file}`; + const symbols = nativeResults.get(relPath); + if (!symbols) return; + + const langId = LANG_MAP[ext]; + const complexityRules = COMPLEXITY_RULES.get(langId); + if (!complexityRules) return; + + // Parse with WASM + const absPath = path.join(tmpDir, relPath); + const parser = getParser(parsers, absPath); + if (!parser) return; + + const code = fs.readFileSync(absPath, 'utf-8'); + const tree = parser.parse(code); + if (!tree) return; + + const funcDefs = 
symbols.definitions.filter( + (d) => (d.kind === 'function' || d.kind === 'method') && funcPattern.test(d.name), + ); + + for (const def of funcDefs) { + if (!def.cfg?.blocks?.length) continue; + + const funcNode = findFunctionNode(tree.rootNode, def.line, def.endLine, complexityRules); + if (!funcNode) continue; + + const wasmCfg = buildFunctionCFG(funcNode, langId); + + // Block counts should match + expect(def.cfg.blocks.length).toBe(wasmCfg.blocks.length); + // Edge counts should match + expect(def.cfg.edges.length).toBe(wasmCfg.edges.length); + + // Block types should match (sorted for order independence) + const nativeTypes = def.cfg.blocks.map((b) => b.type).sort(); + const wasmTypes = wasmCfg.blocks.map((b) => b.type).sort(); + expect(nativeTypes).toEqual(wasmTypes); + + // Edge kinds should match (sorted) + const nativeKinds = def.cfg.edges.map((e) => e.kind).sort(); + const wasmKinds = wasmCfg.edges.map((e) => e.kind).sort(); + expect(nativeKinds).toEqual(wasmKinds); + } + }); + } +}); From c252834621c75ec6c68fe96b7c150d8b502f90df Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 4 Mar 2026 14:50:52 -0700 Subject: [PATCH 2/9] fix(native): use consistent "csharp" language ID and skip CFG for Go interface methods - Change "c_sharp" to "csharp" in cfg.rs, complexity.rs, and csharp.rs to match the canonical ID from parser_registry::lang_id_str() - Set cfg: None for Go interface method_elem signatures since they have no bodies, matching the existing complexity: None pattern Impact: 6 functions changed, 6 affected --- crates/codegraph-core/src/cfg.rs | 2 +- crates/codegraph-core/src/complexity.rs | 6 +++--- crates/codegraph-core/src/extractors/csharp.rs | 16 ++++++++-------- crates/codegraph-core/src/extractors/go.rs | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/crates/codegraph-core/src/cfg.rs b/crates/codegraph-core/src/cfg.rs index 6c0da09..48d9daf 100644 --- 
a/crates/codegraph-core/src/cfg.rs +++ b/crates/codegraph-core/src/cfg.rs @@ -301,7 +301,7 @@ pub fn get_cfg_rules(lang_id: &str) -> Option<&'static CfgRules> { "go" => Some(&GO_CFG), "rust" => Some(&RUST_CFG), "java" => Some(&JAVA_CFG), - "c_sharp" => Some(&CSHARP_CFG), + "csharp" => Some(&CSHARP_CFG), "ruby" => Some(&RUBY_CFG), "php" => Some(&PHP_CFG), _ => None, diff --git a/crates/codegraph-core/src/complexity.rs b/crates/codegraph-core/src/complexity.rs index df2bdaf..93458e0 100644 --- a/crates/codegraph-core/src/complexity.rs +++ b/crates/codegraph-core/src/complexity.rs @@ -344,7 +344,7 @@ pub fn lang_rules(lang_id: &str) -> Option<&'static LangRules> { "go" => Some(&GO_RULES), "rust" => Some(&RUST_LANG_RULES), "java" => Some(&JAVA_RULES), - "c_sharp" => Some(&CSHARP_RULES), + "csharp" => Some(&CSHARP_RULES), "ruby" => Some(&RUBY_RULES), "php" => Some(&PHP_RULES), _ => None, @@ -850,7 +850,7 @@ pub fn halstead_rules(lang_id: &str) -> Option<&'static HalsteadRules> { "go" => Some(&GO_HALSTEAD), "rust" => Some(&RUST_HALSTEAD), "java" => Some(&JAVA_HALSTEAD), - "c_sharp" => Some(&CSHARP_HALSTEAD), + "csharp" => Some(&CSHARP_HALSTEAD), "ruby" => Some(&RUBY_HALSTEAD), "php" => Some(&PHP_HALSTEAD), _ => None, @@ -860,7 +860,7 @@ pub fn halstead_rules(lang_id: &str) -> Option<&'static HalsteadRules> { /// Comment line prefixes per language, used for LOC metrics. 
pub fn comment_prefixes(lang_id: &str) -> &'static [&'static str] { match lang_id { - "javascript" | "typescript" | "tsx" | "go" | "rust" | "java" | "c_sharp" => { + "javascript" | "typescript" | "tsx" | "go" | "rust" | "java" | "csharp" => { &["//", "/*", "*", "*/"] } "python" | "ruby" => &["#"], diff --git a/crates/codegraph-core/src/extractors/csharp.rs b/crates/codegraph-core/src/extractors/csharp.rs index cc207bb..454a0a0 100644 --- a/crates/codegraph-core/src/extractors/csharp.rs +++ b/crates/codegraph-core/src/extractors/csharp.rs @@ -115,8 +115,8 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { line: start_line(&child), end_line: Some(end_line(&child)), decorators: None, - complexity: compute_all_metrics(&child, source, "c_sharp"), - cfg: build_function_cfg(&child, "c_sharp"), + complexity: compute_all_metrics(&child, source, "csharp"), + cfg: build_function_cfg(&child, "csharp"), children: None, }); } @@ -159,8 +159,8 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { line: start_line(node), end_line: Some(end_line(node)), decorators: None, - complexity: compute_all_metrics(node, source, "c_sharp"), - cfg: build_function_cfg(node, "c_sharp"), + complexity: compute_all_metrics(node, source, "csharp"), + cfg: build_function_cfg(node, "csharp"), children: opt_children(children), }); } @@ -181,8 +181,8 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { line: start_line(node), end_line: Some(end_line(node)), decorators: None, - complexity: compute_all_metrics(node, source, "c_sharp"), - cfg: build_function_cfg(node, "c_sharp"), + complexity: compute_all_metrics(node, source, "csharp"), + cfg: build_function_cfg(node, "csharp"), children: opt_children(children), }); } @@ -202,8 +202,8 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { line: start_line(node), end_line: Some(end_line(node)), decorators: None, - complexity: compute_all_metrics(node, source, "c_sharp"), - cfg: 
build_function_cfg(node, "c_sharp"), + complexity: compute_all_metrics(node, source, "csharp"), + cfg: build_function_cfg(node, "csharp"), children: None, }); } diff --git a/crates/codegraph-core/src/extractors/go.rs b/crates/codegraph-core/src/extractors/go.rs index 26cafa5..10d04be 100644 --- a/crates/codegraph-core/src/extractors/go.rs +++ b/crates/codegraph-core/src/extractors/go.rs @@ -126,7 +126,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&member)), decorators: None, complexity: None, - cfg: build_function_cfg(&member, "go"), + cfg: None, children: None, }); } From e87790902b309a9b75d3d84e459b986da0cfa88d Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 4 Mar 2026 18:19:25 -0700 Subject: [PATCH 3/9] fix(native): pass source bytes to CfgBuilder and use fallthrough for infinite loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix two issues in the Rust native CFG construction: 1. CfgBuilder called utf8_text(&[]) to extract label names for labeled loops, break, and continue statements. Passing an empty slice panics when the label node has a non-zero byte offset. Add a `source` field to CfgBuilder, thread it through from build_function_cfg, and use utf8_text(self.source) for all 3 call sites. Updated all 8 extractor files to pass source bytes. 2. process_infinite_loop used "branch_true" for the edge from loop header to body, but Rust `loop {}` is unconditional — there is no condition to branch on. Changed to "fallthrough" to match the semantics. 
Impact: 14 functions changed, 18 affected --- crates/codegraph-core/src/cfg.rs | 16 +++++++++------- crates/codegraph-core/src/extractors/csharp.rs | 8 ++++---- crates/codegraph-core/src/extractors/go.rs | 4 ++-- crates/codegraph-core/src/extractors/java.rs | 6 +++--- .../codegraph-core/src/extractors/javascript.rs | 14 +++++++------- crates/codegraph-core/src/extractors/php.rs | 6 +++--- crates/codegraph-core/src/extractors/python.rs | 2 +- crates/codegraph-core/src/extractors/ruby.rs | 4 ++-- .../codegraph-core/src/extractors/rust_lang.rs | 4 ++-- 9 files changed, 33 insertions(+), 31 deletions(-) diff --git a/crates/codegraph-core/src/cfg.rs b/crates/codegraph-core/src/cfg.rs index 48d9daf..fcd5e20 100644 --- a/crates/codegraph-core/src/cfg.rs +++ b/crates/codegraph-core/src/cfg.rs @@ -325,6 +325,7 @@ struct LabelCtx { /// CFG builder state. struct CfgBuilder<'a> { rules: &'a CfgRules, + source: &'a [u8], blocks: Vec, edges: Vec, next_index: u32, @@ -334,9 +335,10 @@ struct CfgBuilder<'a> { } impl<'a> CfgBuilder<'a> { - fn new(rules: &'a CfgRules) -> Self { + fn new(rules: &'a CfgRules, source: &'a [u8]) -> Self { Self { rules, + source, blocks: Vec::new(), edges: Vec::new(), next_index: 0, @@ -446,7 +448,7 @@ impl<'a> CfgBuilder<'a> { let label_node = stmt.child_by_field_name("label"); let body = stmt.child_by_field_name("body"); if let (Some(label_node), Some(body)) = (label_node, body) { - let label_name = label_node.utf8_text(&[]).unwrap_or("").to_string(); + let label_name = label_node.utf8_text(self.source).unwrap_or("").to_string(); // We can't know the loop blocks yet — push a placeholder self.label_map.push((label_name.clone(), LabelCtx { header_idx: None, exit_idx: None })); let result = self.process_statement(&body, current); @@ -518,7 +520,7 @@ impl<'a> CfgBuilder<'a> { // Break if matches_opt(kind, self.rules.break_node) { let label_name = stmt.child_by_field_name("label") - .map(|n| n.utf8_text(&[]).unwrap_or("").to_string()); + .map(|n| 
n.utf8_text(self.source).unwrap_or("").to_string()); let target = if let Some(ref name) = label_name { self.label_map.iter().rev() @@ -539,7 +541,7 @@ impl<'a> CfgBuilder<'a> { // Continue if matches_opt(kind, self.rules.continue_node) { let label_name = stmt.child_by_field_name("label") - .map(|n| n.utf8_text(&[]).unwrap_or("").to_string()); + .map(|n| n.utf8_text(self.source).unwrap_or("").to_string()); let target = if let Some(ref name) = label_name { self.label_map.iter().rev() @@ -809,7 +811,7 @@ impl<'a> CfgBuilder<'a> { let body = loop_stmt.child_by_field_name("body"); let body_block = self.make_block("loop_body", None, None, None); - self.add_edge(header, body_block, "branch_true"); + self.add_edge(header, body_block, "fallthrough"); if let Some(body) = body { let stmts = self.get_statements(&body); @@ -1009,10 +1011,10 @@ fn node_end_line(node: &Node) -> u32 { // ─── Public API ───────────────────────────────────────────────────────── /// Build a control flow graph for a single function AST node. 
-pub fn build_function_cfg(function_node: &Node, lang_id: &str) -> Option { +pub fn build_function_cfg(function_node: &Node, lang_id: &str, source: &[u8]) -> Option { let rules = get_cfg_rules(lang_id)?; - let mut builder = CfgBuilder::new(rules); + let mut builder = CfgBuilder::new(rules, source); let entry = builder.make_block("entry", None, None, None); let exit = builder.make_block("exit", None, None, None); diff --git a/crates/codegraph-core/src/extractors/csharp.rs b/crates/codegraph-core/src/extractors/csharp.rs index 454a0a0..77c14cb 100644 --- a/crates/codegraph-core/src/extractors/csharp.rs +++ b/crates/codegraph-core/src/extractors/csharp.rs @@ -116,7 +116,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&child)), decorators: None, complexity: compute_all_metrics(&child, source, "csharp"), - cfg: build_function_cfg(&child, "csharp"), + cfg: build_function_cfg(&child, "csharp", source), children: None, }); } @@ -160,7 +160,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "csharp"), - cfg: build_function_cfg(node, "csharp"), + cfg: build_function_cfg(node, "csharp", source), children: opt_children(children), }); } @@ -182,7 +182,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "csharp"), - cfg: build_function_cfg(node, "csharp"), + cfg: build_function_cfg(node, "csharp", source), children: opt_children(children), }); } @@ -203,7 +203,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "csharp"), - cfg: build_function_cfg(node, "csharp"), + cfg: build_function_cfg(node, "csharp", source), children: None, }); } diff --git 
a/crates/codegraph-core/src/extractors/go.rs b/crates/codegraph-core/src/extractors/go.rs index 10d04be..19a0d31 100644 --- a/crates/codegraph-core/src/extractors/go.rs +++ b/crates/codegraph-core/src/extractors/go.rs @@ -28,7 +28,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "go"), - cfg: build_function_cfg(node, "go"), + cfg: build_function_cfg(node, "go", source), children: opt_children(children), }); } @@ -67,7 +67,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "go"), - cfg: build_function_cfg(node, "go"), + cfg: build_function_cfg(node, "go", source), children: opt_children(children), }); } diff --git a/crates/codegraph-core/src/extractors/java.rs b/crates/codegraph-core/src/extractors/java.rs index a2a8e8c..6b6f784 100644 --- a/crates/codegraph-core/src/extractors/java.rs +++ b/crates/codegraph-core/src/extractors/java.rs @@ -118,7 +118,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&child)), decorators: None, complexity: compute_all_metrics(&child, source, "java"), - cfg: build_function_cfg(&child, "java"), + cfg: build_function_cfg(&child, "java", source), children: None, }); } @@ -162,7 +162,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "java"), - cfg: build_function_cfg(node, "java"), + cfg: build_function_cfg(node, "java", source), children: opt_children(children), }); } @@ -184,7 +184,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "java"), - cfg: build_function_cfg(node, "java"), + cfg: build_function_cfg(node, 
"java", source), children: opt_children(children), }); } diff --git a/crates/codegraph-core/src/extractors/javascript.rs b/crates/codegraph-core/src/extractors/javascript.rs index 3fc3ffb..fbceefa 100644 --- a/crates/codegraph-core/src/extractors/javascript.rs +++ b/crates/codegraph-core/src/extractors/javascript.rs @@ -28,7 +28,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "javascript"), - cfg: build_function_cfg(node, "javascript"), + cfg: build_function_cfg(node, "javascript", source), children: opt_children(children), }); } @@ -90,7 +90,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "javascript"), - cfg: build_function_cfg(node, "javascript"), + cfg: build_function_cfg(node, "javascript", source), children: opt_children(children), }); } @@ -176,7 +176,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&value_n)), decorators: None, complexity: compute_all_metrics(&value_n, source, "javascript"), - cfg: build_function_cfg(&value_n, "javascript"), + cfg: build_function_cfg(&value_n, "javascript", source), children: opt_children(children), }); } else if is_const && is_js_literal(&value_n) @@ -712,7 +712,7 @@ fn extract_interface_methods( end_line: Some(end_line(&child)), decorators: None, complexity: None, - cfg: build_function_cfg(&child, "javascript"), + cfg: build_function_cfg(&child, "javascript", source), children: None, }); } @@ -929,7 +929,7 @@ fn extract_callback_definition(call_node: &Node, source: &[u8]) -> Option Option Option Date: Wed, 4 Mar 2026 19:33:44 -0700 Subject: [PATCH 4/9] fix(cfg): address PR #342 review comments - Set cfg: None for TS interface method_signature/property_signature (no body) - Tag loop_stack entries with is_loop flag; continue now 
skips switch entries - Use match_arm value field as case body for Rust instead of filtering it out - Return None from process_infinite_loop when no break targets the exit block - Treat cfg: null as native-handled in allNative check to avoid false WASM fallback - Re-add complexityRules guard before passing to findFunctionNode --- crates/codegraph-core/src/cfg.rs | 32 +++++++++++++------ .../src/extractors/javascript.rs | 2 +- src/cfg.js | 4 +-- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/crates/codegraph-core/src/cfg.rs b/crates/codegraph-core/src/cfg.rs index fcd5e20..316527a 100644 --- a/crates/codegraph-core/src/cfg.rs +++ b/crates/codegraph-core/src/cfg.rs @@ -314,6 +314,7 @@ pub fn get_cfg_rules(lang_id: &str) -> Option<&'static CfgRules> { struct LoopCtx { header_idx: u32, exit_idx: u32, + is_loop: bool, } /// Label context for labeled break/continue. @@ -548,7 +549,10 @@ impl<'a> CfgBuilder<'a> { .find(|(n, _)| n == name) .and_then(|(_, ctx)| ctx.header_idx) } else { - self.loop_stack.last().map(|ctx| ctx.header_idx) + // Walk back to find the nearest actual loop (skip switch entries) + self.loop_stack.iter().rev() + .find(|ctx| ctx.is_loop) + .map(|ctx| ctx.header_idx) }; if let Some(target) = target { @@ -728,7 +732,7 @@ impl<'a> CfgBuilder<'a> { let exit = self.make_block("body", None, None, None); - self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit }); + self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit, is_loop: true }); self.update_label_map(header, exit); let body = for_stmt.child_by_field_name("body"); @@ -754,7 +758,7 @@ impl<'a> CfgBuilder<'a> { let exit = self.make_block("body", None, None, None); - self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit }); + self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit, is_loop: true }); self.update_label_map(header, exit); let body = while_stmt.child_by_field_name("body"); @@ -781,7 +785,7 @@ impl<'a> CfgBuilder<'a> { let 
cond_block = self.make_block("loop_header", None, None, Some("do-while")); let exit = self.make_block("body", None, None, None); - self.loop_stack.push(LoopCtx { header_idx: cond_block, exit_idx: exit }); + self.loop_stack.push(LoopCtx { header_idx: cond_block, exit_idx: exit, is_loop: true }); self.update_label_map(cond_block, exit); let body = do_stmt.child_by_field_name("body"); @@ -806,7 +810,7 @@ impl<'a> CfgBuilder<'a> { let exit = self.make_block("body", None, None, None); - self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit }); + self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit, is_loop: true }); self.update_label_map(header, exit); let body = loop_stmt.child_by_field_name("body"); @@ -823,7 +827,14 @@ impl<'a> CfgBuilder<'a> { // No loop_exit from header — only exit via break self.loop_stack.pop(); - Some(exit) + + // If no break targeted the exit block, subsequent code is unreachable + let has_break_to_exit = self.edges.iter().any(|e| e.target_index == exit); + if has_break_to_exit { + Some(exit) + } else { + None + } } fn process_switch(&mut self, switch_stmt: &Node, current: u32) -> Option { @@ -834,8 +845,8 @@ impl<'a> CfgBuilder<'a> { let join_block = self.make_block("body", None, None, None); - // Switch acts like a break target - self.loop_stack.push(LoopCtx { header_idx: switch_header, exit_idx: join_block }); + // Switch acts like a break target but not a continue target + self.loop_stack.push(LoopCtx { header_idx: switch_header, exit_idx: join_block, is_loop: false }); // Get case children from body field or direct children let container = switch_stmt.child_by_field_name("body").unwrap_or(*switch_stmt); @@ -869,13 +880,14 @@ impl<'a> CfgBuilder<'a> { let case_stmts: Vec = if let Some(body_node) = case_body_node { self.get_statements(&body_node) + } else if let Some(value_node) = case_clause.child_by_field_name("value") { + // Rust match_arm: the `value` field is the arm expression body + vec![value_node] } else { - 
let value_node = case_clause.child_by_field_name("value"); let pattern_node = case_clause.child_by_field_name("pattern"); let cursor2 = &mut case_clause.walk(); case_clause.named_children(cursor2) .filter(|child| { - if let Some(ref v) = value_node { if child.id() == v.id() { return false; } } if let Some(ref p) = pattern_node { if child.id() == p.id() { return false; } } child.kind() != "switch_label" }) diff --git a/crates/codegraph-core/src/extractors/javascript.rs b/crates/codegraph-core/src/extractors/javascript.rs index fbceefa..91b634b 100644 --- a/crates/codegraph-core/src/extractors/javascript.rs +++ b/crates/codegraph-core/src/extractors/javascript.rs @@ -712,7 +712,7 @@ fn extract_interface_methods( end_line: Some(end_line(&child)), decorators: None, complexity: None, - cfg: build_function_cfg(&child, "javascript", source), + cfg: None, children: None, }); } diff --git a/src/cfg.js b/src/cfg.js index 3282605..f6ddff5 100644 --- a/src/cfg.js +++ b/src/cfg.js @@ -1111,7 +1111,7 @@ export async function buildCFGData(db, fileSymbols, rootDir, _engineOpts) { // Check if all defs already have native CFG — skip WASM parse if so const allNative = symbols.definitions .filter((d) => (d.kind === 'function' || d.kind === 'method') && d.line) - .every((d) => d.cfg?.blocks?.length); + .every((d) => d.cfg === null || d.cfg?.blocks?.length); // WASM fallback if no cached tree and not all native if (!tree && !allNative) { @@ -1161,7 +1161,7 @@ export async function buildCFGData(db, fileSymbols, rootDir, _engineOpts) { cfg = def.cfg; } else { // WASM fallback: compute CFG from tree-sitter AST - if (!tree) continue; + if (!tree || !complexityRules) continue; const funcNode = findFunctionNode(tree.rootNode, def.line, def.endLine, complexityRules); if (!funcNode) continue; cfg = buildFunctionCFG(funcNode, langId); From e028f38abbd41a2133d1404e6696e6c1384aacc0 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 4 Mar 2026 
20:12:04 -0700 Subject: [PATCH 5/9] fix(cfg): add Java switch_statement to CFG rules and fix C# test fixture --- crates/codegraph-core/src/cfg.rs | 4 ++-- tests/parsers/ast-all-langs.test.js | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/codegraph-core/src/cfg.rs b/crates/codegraph-core/src/cfg.rs index 316527a..3420f0f 100644 --- a/crates/codegraph-core/src/cfg.rs +++ b/crates/codegraph-core/src/cfg.rs @@ -183,8 +183,8 @@ pub static JAVA_CFG: CfgRules = CfgRules { infinite_loop_node: None, unless_node: None, until_node: None, - switch_node: Some("switch_expression"), - switch_nodes: &[], + switch_node: Some("switch_statement"), + switch_nodes: &["switch_expression"], case_node: Some("switch_block_statement_group"), case_nodes: &["switch_rule"], default_node: None, diff --git a/tests/parsers/ast-all-langs.test.js b/tests/parsers/ast-all-langs.test.js index c07368e..2c4c372 100644 --- a/tests/parsers/ast-all-langs.test.js +++ b/tests/parsers/ast-all-langs.test.js @@ -201,6 +201,7 @@ public class Service { public async Task FetchAsync() { var result = await GetDataAsync(); string msg = "hello from csharp"; + var ex = new ArgumentNullException("x"); if (result == null) { throw new ArgumentNullException("result"); } From e020420b5e631a96a3dba057f3f6a58f96090a54 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 4 Mar 2026 20:25:29 -0700 Subject: [PATCH 6/9] fix(cfg): handle multiple catch clauses and fix label map resolution Impact: 2 functions changed, 0 affected --- crates/codegraph-core/src/cfg.rs | 53 +++++++++++++++++++------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/crates/codegraph-core/src/cfg.rs b/crates/codegraph-core/src/cfg.rs index 3420f0f..1419bd8 100644 --- a/crates/codegraph-core/src/cfg.rs +++ b/crates/codegraph-core/src/cfg.rs @@ -718,11 +718,11 @@ impl<'a> CfgBuilder<'a> { /// Update label map with loop context (for newly created loops inside 
labeled stmts). fn update_label_map(&mut self, header_idx: u32, exit_idx: u32) { - for (_, ctx) in self.label_map.iter_mut() { - if ctx.header_idx.is_none() { - ctx.header_idx = Some(header_idx); - ctx.exit_idx = Some(exit_idx); - } + if let Some((_, ctx)) = self.label_map.iter_mut().rev() + .find(|(_, ctx)| ctx.header_idx.is_none()) + { + ctx.header_idx = Some(header_idx); + ctx.exit_idx = Some(exit_idx); } } @@ -934,38 +934,45 @@ impl<'a> CfgBuilder<'a> { let try_end = self.process_statements(&try_stmts, try_block); // Find catch and finally handlers - let mut catch_handler: Option = None; + let mut catch_handlers: Vec = Vec::new(); let mut finally_handler: Option = None; let cursor = &mut try_stmt.walk(); for child in try_stmt.named_children(cursor) { if matches_opt(child.kind(), self.rules.catch_node) { - catch_handler = Some(child); + catch_handlers.push(child); } if matches_opt(child.kind(), self.rules.finally_node) { finally_handler = Some(child); } } - if let Some(catch_node) = catch_handler { - let catch_block = self.make_block("catch", Some(node_line(&catch_node)), None, Some("catch")); - self.add_edge(try_block, catch_block, "exception"); + if !catch_handlers.is_empty() { + let mut catch_ends: Vec> = Vec::new(); - let catch_body_node = catch_node.child_by_field_name("body"); - let catch_stmts: Vec = if let Some(body) = catch_body_node { - self.get_statements(&body) - } else { - let cursor2 = &mut catch_node.walk(); - catch_node.named_children(cursor2).collect() - }; - let catch_end = self.process_statements(&catch_stmts, catch_block); + for catch_node in &catch_handlers { + let catch_block = self.make_block("catch", Some(node_line(catch_node)), None, Some("catch")); + self.add_edge(try_block, catch_block, "exception"); + + let catch_body_node = catch_node.child_by_field_name("body"); + let catch_stmts: Vec = if let Some(body) = catch_body_node { + self.get_statements(&body) + } else { + let cursor2 = &mut catch_node.walk(); + 
catch_node.named_children(cursor2).collect() + }; + let catch_end = self.process_statements(&catch_stmts, catch_block); + catch_ends.push(catch_end); + } if let Some(finally_node) = finally_handler { let finally_block = self.make_block("finally", Some(node_line(&finally_node)), None, Some("finally")); if let Some(te) = try_end { self.add_edge(te, finally_block, "fallthrough"); } - if let Some(ce) = catch_end { - self.add_edge(ce, finally_block, "fallthrough"); + for catch_end in &catch_ends { + if let Some(ce) = *catch_end { + self.add_edge(ce, finally_block, "fallthrough"); + } } let finally_body = finally_node.child_by_field_name("body"); let finally_stmts: Vec = if let Some(body) = finally_body { @@ -981,8 +988,10 @@ impl<'a> CfgBuilder<'a> { if let Some(te) = try_end { self.add_edge(te, join_block, "fallthrough"); } - if let Some(ce) = catch_end { - self.add_edge(ce, join_block, "fallthrough"); + for catch_end in &catch_ends { + if let Some(ce) = *catch_end { + self.add_edge(ce, join_block, "fallthrough"); + } } } } else if let Some(finally_node) = finally_handler { From a59c9bc908ab159d68d5a8df7c18b5bb49ab1ddb Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 5 Mar 2026 12:36:06 -0700 Subject: [PATCH 7/9] fix: address 6 Greptile review comments on CFG code - Add cfg === null guard to hasNativeCfg check in src/cfg.js, matching the allNative pattern (prevents unnecessary WASM parser init for interface methods) - Add condition_field to CfgRules; process_for_loop now treats conditionless for loops (Go `for {}`) as infinite loops without spurious loop_exit edge - Add else_node to CfgRules; process_try_catch now extracts Python else_clause as a distinct success-only path instead of folding it into the try body - Add switch_expression to CSHARP_CFG.switch_nodes and switch_expression_arm to case_nodes for C# 8+ pattern matching - Add try_nodes to CfgRules; Ruby body_statement containing rescue is now recognized as a 
try construct via RUBY_CFG.try_nodes - Add wildcard_pattern_node to CfgRules; Python match/case with case _: wildcard now correctly sets has_default, preventing spurious branch_false fallthrough edge Impact: 5 functions changed, 13 affected --- crates/codegraph-core/src/cfg.rs | 117 +++++++++++++++++++++++++++---- src/cfg.js | 2 +- 2 files changed, 103 insertions(+), 16 deletions(-) diff --git a/crates/codegraph-core/src/cfg.rs b/crates/codegraph-core/src/cfg.rs index 1419bd8..1e0742c 100644 --- a/crates/codegraph-core/src/cfg.rs +++ b/crates/codegraph-core/src/cfg.rs @@ -12,6 +12,7 @@ pub struct CfgRules { pub else_via_alternative: bool, pub if_consequent_field: Option<&'static str>, pub for_nodes: &'static [&'static str], + pub condition_field: Option<&'static str>, pub while_node: Option<&'static str>, pub while_nodes: &'static [&'static str], pub do_node: Option<&'static str>, @@ -23,9 +24,12 @@ pub struct CfgRules { pub case_node: Option<&'static str>, pub case_nodes: &'static [&'static str], pub default_node: Option<&'static str>, + pub wildcard_pattern_node: Option<&'static str>, pub try_node: Option<&'static str>, + pub try_nodes: &'static [&'static str], pub catch_node: Option<&'static str>, pub finally_node: Option<&'static str>, + pub else_node: Option<&'static str>, pub return_node: Option<&'static str>, pub throw_node: Option<&'static str>, pub break_node: Option<&'static str>, @@ -53,6 +57,7 @@ pub static JS_TS_CFG: CfgRules = CfgRules { else_via_alternative: false, if_consequent_field: None, for_nodes: &["for_statement", "for_in_statement"], + condition_field: Some("condition"), while_node: Some("while_statement"), while_nodes: &[], do_node: Some("do_statement"), @@ -64,9 +69,12 @@ pub static JS_TS_CFG: CfgRules = CfgRules { case_node: Some("switch_case"), case_nodes: &[], default_node: Some("switch_default"), + wildcard_pattern_node: None, try_node: Some("try_statement"), + try_nodes: &[], catch_node: Some("catch_clause"), finally_node: 
Some("finally_clause"), + else_node: None, return_node: Some("return_statement"), throw_node: Some("throw_statement"), break_node: Some("break_statement"), @@ -84,6 +92,7 @@ pub static PYTHON_CFG: CfgRules = CfgRules { else_via_alternative: false, if_consequent_field: None, for_nodes: &["for_statement"], + condition_field: Some("condition"), while_node: Some("while_statement"), while_nodes: &[], do_node: None, @@ -95,9 +104,12 @@ pub static PYTHON_CFG: CfgRules = CfgRules { case_node: Some("case_clause"), case_nodes: &[], default_node: None, + wildcard_pattern_node: Some("wildcard_pattern"), try_node: Some("try_statement"), + try_nodes: &[], catch_node: Some("except_clause"), finally_node: Some("finally_clause"), + else_node: Some("else_clause"), return_node: Some("return_statement"), throw_node: Some("raise_statement"), break_node: Some("break_statement"), @@ -115,6 +127,7 @@ pub static GO_CFG: CfgRules = CfgRules { else_via_alternative: true, if_consequent_field: None, for_nodes: &["for_statement"], + condition_field: Some("condition"), while_node: None, while_nodes: &[], do_node: None, @@ -126,9 +139,12 @@ pub static GO_CFG: CfgRules = CfgRules { case_node: Some("expression_case"), case_nodes: &["type_case", "communication_case"], default_node: Some("default_case"), + wildcard_pattern_node: None, try_node: None, + try_nodes: &[], catch_node: None, finally_node: None, + else_node: None, return_node: Some("return_statement"), throw_node: None, break_node: Some("break_statement"), @@ -146,6 +162,7 @@ pub static RUST_CFG: CfgRules = CfgRules { else_via_alternative: false, if_consequent_field: None, for_nodes: &["for_expression"], + condition_field: None, while_node: Some("while_expression"), while_nodes: &["while_let_expression"], do_node: None, @@ -157,9 +174,12 @@ pub static RUST_CFG: CfgRules = CfgRules { case_node: Some("match_arm"), case_nodes: &[], default_node: None, + wildcard_pattern_node: None, try_node: None, + try_nodes: &[], catch_node: None, 
finally_node: None, + else_node: None, return_node: Some("return_expression"), throw_node: None, break_node: Some("break_expression"), @@ -177,6 +197,7 @@ pub static JAVA_CFG: CfgRules = CfgRules { else_via_alternative: true, if_consequent_field: None, for_nodes: &["for_statement", "enhanced_for_statement"], + condition_field: Some("condition"), while_node: Some("while_statement"), while_nodes: &[], do_node: Some("do_statement"), @@ -188,9 +209,12 @@ pub static JAVA_CFG: CfgRules = CfgRules { case_node: Some("switch_block_statement_group"), case_nodes: &["switch_rule"], default_node: None, + wildcard_pattern_node: None, try_node: Some("try_statement"), + try_nodes: &[], catch_node: Some("catch_clause"), finally_node: Some("finally_clause"), + else_node: None, return_node: Some("return_statement"), throw_node: Some("throw_statement"), break_node: Some("break_statement"), @@ -208,6 +232,7 @@ pub static CSHARP_CFG: CfgRules = CfgRules { else_via_alternative: true, if_consequent_field: None, for_nodes: &["for_statement", "foreach_statement"], + condition_field: Some("condition"), while_node: Some("while_statement"), while_nodes: &[], do_node: Some("do_statement"), @@ -215,13 +240,16 @@ pub static CSHARP_CFG: CfgRules = CfgRules { unless_node: None, until_node: None, switch_node: Some("switch_statement"), - switch_nodes: &[], + switch_nodes: &["switch_expression"], case_node: Some("switch_section"), - case_nodes: &[], + case_nodes: &["switch_expression_arm"], default_node: None, + wildcard_pattern_node: None, try_node: Some("try_statement"), + try_nodes: &[], catch_node: Some("catch_clause"), finally_node: Some("finally_clause"), + else_node: None, return_node: Some("return_statement"), throw_node: Some("throw_statement"), break_node: Some("break_statement"), @@ -239,6 +267,7 @@ pub static RUBY_CFG: CfgRules = CfgRules { else_via_alternative: false, if_consequent_field: None, for_nodes: &["for"], + condition_field: Some("condition"), while_node: Some("while"), 
while_nodes: &[], do_node: None, @@ -250,9 +279,12 @@ pub static RUBY_CFG: CfgRules = CfgRules { case_node: Some("when"), case_nodes: &[], default_node: Some("else"), + wildcard_pattern_node: None, try_node: Some("begin"), + try_nodes: &["body_statement"], catch_node: Some("rescue"), finally_node: Some("ensure"), + else_node: None, return_node: Some("return"), throw_node: None, break_node: Some("break"), @@ -270,6 +302,7 @@ pub static PHP_CFG: CfgRules = CfgRules { else_via_alternative: false, if_consequent_field: Some("body"), for_nodes: &["for_statement", "foreach_statement"], + condition_field: Some("condition"), while_node: Some("while_statement"), while_nodes: &[], do_node: Some("do_statement"), @@ -281,9 +314,12 @@ pub static PHP_CFG: CfgRules = CfgRules { case_node: Some("case_statement"), case_nodes: &[], default_node: Some("default_statement"), + wildcard_pattern_node: None, try_node: Some("try_statement"), + try_nodes: &[], catch_node: Some("catch_clause"), finally_node: Some("finally_clause"), + else_node: None, return_node: Some("return_statement"), throw_node: Some("throw_expression"), break_node: Some("break_statement"), @@ -503,6 +539,16 @@ impl<'a> CfgBuilder<'a> { if matches_opt(kind, self.rules.try_node) { return self.process_try_catch(stmt, current); } + // Additional try nodes (e.g. Ruby body_statement with rescue) + if matches_slice(kind, self.rules.try_nodes) { + // Only treat as try if it actually contains a catch/rescue child + let cursor = &mut stmt.walk(); + let has_rescue = stmt.named_children(cursor) + .any(|c| matches_opt(c.kind(), self.rules.catch_node)); + if has_rescue { + return self.process_try_catch(stmt, current); + } + } // Return if matches_opt(kind, self.rules.return_node) { @@ -735,9 +781,15 @@ impl<'a> CfgBuilder<'a> { self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit, is_loop: true }); self.update_label_map(header, exit); + // Check if this for loop has a condition — if not (e.g. 
Go `for {}`), treat as infinite loop + let has_condition = self.rules.condition_field + .and_then(|f| for_stmt.child_by_field_name(f)) + .is_some(); + let body = for_stmt.child_by_field_name("body"); let body_block = self.make_block("loop_body", None, None, None); - self.add_edge(header, body_block, "branch_true"); + let body_edge = if has_condition { "branch_true" } else { "fallthrough" }; + self.add_edge(header, body_block, body_edge); if let Some(body) = body { let stmts = self.get_statements(&body); @@ -747,9 +799,21 @@ impl<'a> CfgBuilder<'a> { } } - self.add_edge(header, exit, "loop_exit"); self.loop_stack.pop(); - Some(exit) + + if has_condition { + // Normal for loop with condition — always emit loop_exit edge + self.add_edge(header, exit, "loop_exit"); + Some(exit) + } else { + // Infinite loop (no condition) — only exit via break + let has_break_to_exit = self.edges.iter().any(|e| e.target_index == exit); + if has_break_to_exit { + Some(exit) + } else { + None + } + } } fn process_while_loop(&mut self, while_stmt: &Node, current: u32) -> Option { @@ -857,7 +921,11 @@ impl<'a> CfgBuilder<'a> { for case_clause in &case_children { let cc_kind = case_clause.kind(); - let is_default = matches_opt(cc_kind, self.rules.default_node); + let is_default = matches_opt(cc_kind, self.rules.default_node) + || (self.rules.wildcard_pattern_node.is_some() + && (matches_opt(cc_kind, self.rules.case_node) || matches_slice(cc_kind, self.rules.case_nodes)) + && case_clause.named_child(0) + .is_some_and(|c| matches_opt(c.kind(), self.rules.wildcard_pattern_node))); let is_case = is_default || matches_opt(cc_kind, self.rules.case_node) || matches_slice(cc_kind, self.rules.case_nodes); @@ -924,6 +992,7 @@ impl<'a> CfgBuilder<'a> { let ck = child.kind(); !matches_opt(ck, self.rules.catch_node) && !matches_opt(ck, self.rules.finally_node) + && !matches_opt(ck, self.rules.else_node) }) .collect(); (node_line(try_stmt), stmts) @@ -933,9 +1002,10 @@ impl<'a> CfgBuilder<'a> { 
self.add_edge(current, try_block, "fallthrough"); let try_end = self.process_statements(&try_stmts, try_block); - // Find catch and finally handlers + // Find catch, finally, and else handlers let mut catch_handlers: Vec = Vec::new(); let mut finally_handler: Option = None; + let mut else_handler: Option = None; let cursor = &mut try_stmt.walk(); for child in try_stmt.named_children(cursor) { if matches_opt(child.kind(), self.rules.catch_node) { @@ -944,8 +1014,25 @@ impl<'a> CfgBuilder<'a> { if matches_opt(child.kind(), self.rules.finally_node) { finally_handler = Some(child); } + if matches_opt(child.kind(), self.rules.else_node) { + // Only treat as try-else if it's a direct child of the try statement + // (not the else_clause of an if inside the try body) + else_handler = Some(child); + } } + // Process else clause (Python try...except...else): runs when try succeeds + let success_end = if let Some(else_node) = else_handler { + let else_block = self.make_block("body", Some(node_line(&else_node)), None, Some("else")); + if let Some(te) = try_end { + self.add_edge(te, else_block, "fallthrough"); + } + let else_stmts = self.get_statements(&else_node); + self.process_statements(&else_stmts, else_block) + } else { + try_end + }; + if !catch_handlers.is_empty() { let mut catch_ends: Vec> = Vec::new(); @@ -966,8 +1053,8 @@ impl<'a> CfgBuilder<'a> { if let Some(finally_node) = finally_handler { let finally_block = self.make_block("finally", Some(node_line(&finally_node)), None, Some("finally")); - if let Some(te) = try_end { - self.add_edge(te, finally_block, "fallthrough"); + if let Some(se) = success_end { + self.add_edge(se, finally_block, "fallthrough"); } for catch_end in &catch_ends { if let Some(ce) = *catch_end { @@ -985,8 +1072,8 @@ impl<'a> CfgBuilder<'a> { self.add_edge(fe, join_block, "fallthrough"); } } else { - if let Some(te) = try_end { - self.add_edge(te, join_block, "fallthrough"); + if let Some(se) = success_end { + self.add_edge(se, join_block, 
"fallthrough"); } for catch_end in &catch_ends { if let Some(ce) = *catch_end { @@ -996,8 +1083,8 @@ impl<'a> CfgBuilder<'a> { } } else if let Some(finally_node) = finally_handler { let finally_block = self.make_block("finally", Some(node_line(&finally_node)), None, Some("finally")); - if let Some(te) = try_end { - self.add_edge(te, finally_block, "fallthrough"); + if let Some(se) = success_end { + self.add_edge(se, finally_block, "fallthrough"); } let finally_body = finally_node.child_by_field_name("body"); let finally_stmts: Vec = if let Some(body) = finally_body { @@ -1010,8 +1097,8 @@ impl<'a> CfgBuilder<'a> { self.add_edge(fe, join_block, "fallthrough"); } } else { - if let Some(te) = try_end { - self.add_edge(te, join_block, "fallthrough"); + if let Some(se) = success_end { + self.add_edge(se, join_block, "fallthrough"); } } diff --git a/src/cfg.js b/src/cfg.js index f6ddff5..23e9aae 100644 --- a/src/cfg.js +++ b/src/cfg.js @@ -1056,7 +1056,7 @@ export async function buildCFGData(db, fileSymbols, rootDir, _engineOpts) { // Check if all function/method defs already have native CFG data const hasNativeCfg = symbols.definitions .filter((d) => (d.kind === 'function' || d.kind === 'method') && d.line) - .every((d) => d.cfg?.blocks?.length); + .every((d) => d.cfg === null || d.cfg?.blocks?.length); if (!hasNativeCfg) { needsFallback = true; break; From c0d1626560a6875047ff337b6c8d775bd8f6aca3 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 5 Mar 2026 12:45:34 -0700 Subject: [PATCH 8/9] perf: compute dataflow analysis in Rust native engine (#343) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * perf: compute dataflow analysis in Rust native engine Port per-file dataflow extraction from WASM (JS) to the Rust native engine, eliminating the tree-sitter WASM dependency for dataflow. 
Rust changes: - Add dataflow.rs with DataflowRules struct, 8 per-language static configs (JS/TS, Python, Go, Rust, Java, C#, PHP, Ruby), ParamStrategy enum for per-language param extraction, scope tracking with binding confidence, and extract_dataflow() recursive visitor - Add 6 NAPI structs to types.rs (DataflowParam, DataflowReturn, DataflowAssignment, DataflowArgFlow, DataflowMutation, DataflowResult) and dataflow field on FileSymbols - Call extract_dataflow() after extract_symbols() in parallel.rs - Add lang_id_str() to LanguageKind for rules lookup JS changes: - Extend normalizeNativeSymbols() to map native dataflow result - Add native bypass in buildDataflowEdges(): use symbols.dataflow when present, fall back to WASM extraction otherwise Impact: 30 functions changed, 32 affected * fix: address PR review feedback for dataflow-in-rust - Add method_call_expression to RUST_DATAFLOW call_nodes so Rust obj.method(arg) calls are no longer silently excluded from dataflow - Remove var_declaration from GO_DATAFLOW var_declarator_nodes since its child fields don't match the left/right schema used by short_var_declaration - Remove unreachable snake_case fallbacks in normalizeNativeSymbols since napi-rs #[napi(js_name)] guarantees camelCase property names Impact: 1 functions changed, 3 affected * fix(native): address dataflow review comments — conditional extraction, depth limit, method mutations, truncation parity - Add include_dataflow flag to parse_files_parallel/parse_file to skip dataflow extraction when not needed (threaded from builder.js opts) - Add MAX_VISIT_DEPTH (200) to prevent stack overflow on deeply nested ASTs - Add method_call_name_field to DataflowRules for languages where method calls use a different field than call_function_field (fixes dead mutating_methods for Rust's method_call_expression) - Fix truncate() to use chars().count() instead of byte length for parity with JS str.length on non-ASCII content Impact: 11 functions changed, 31 affected * 
fix(native): use correct receiver field for Rust method call mutations Impact: 1 functions changed, 4 affected * fix(native): add depth guard to collect_identifiers in dataflow Impact: 2 functions changed, 2 affected * perf: skip dataflow computation in incremental CFG-only rebuilds When only needsCfg is true in the pending analysis path, the native engine was still computing dataflow for every file via engineOpts defaulting dataflow to true. The results were immediately discarded since the needsDataflow guard was false. Override engineOpts.dataflow in the analysis call site so it only runs when actually needed. Impact: 1 functions changed, 0 affected * fix: use >= for depth guard in collect_identifiers to match visit() Impact: 1 functions changed, 4 affected --- crates/codegraph-core/src/dataflow.rs | 1450 ++++++++++++++++++ crates/codegraph-core/src/lib.rs | 19 +- crates/codegraph-core/src/parallel.rs | 17 +- crates/codegraph-core/src/parser_registry.rs | 18 + crates/codegraph-core/src/types.rs | 83 + src/builder.js | 8 +- src/dataflow.js | 62 +- src/parser.js | 44 +- 8 files changed, 1662 insertions(+), 39 deletions(-) create mode 100644 crates/codegraph-core/src/dataflow.rs diff --git a/crates/codegraph-core/src/dataflow.rs b/crates/codegraph-core/src/dataflow.rs new file mode 100644 index 0000000..82c3022 --- /dev/null +++ b/crates/codegraph-core/src/dataflow.rs @@ -0,0 +1,1450 @@ +use std::collections::HashMap; +use tree_sitter::{Node, Tree}; + +use crate::types::{ + DataflowArgFlow, DataflowAssignment, DataflowMutation, DataflowParam, DataflowResult, + DataflowReturn, +}; + +/// Maximum recursion depth for AST traversal to prevent stack overflow +/// on deeply nested trees. Matches the approach used in cfg.rs. +const MAX_VISIT_DEPTH: usize = 200; + +// ─── Param Strategy ────────────────────────────────────────────────────── + +/// Per-language parameter extraction strategy. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ParamStrategy { + Default, + Python, + Go, + Rust, + Java, + CSharp, + Php, + Ruby, +} + +// ─── DataflowRules ────────────────────────────────────────────────────── + +/// Per-language AST node type names and field names for dataflow extraction. +/// Mirrors `DATAFLOW_DEFAULTS` + per-language overrides in `src/dataflow.js`. +pub struct DataflowRules { + // Scope entry + function_nodes: &'static [&'static str], + + // Function name extraction + name_field: &'static str, + var_assigned_fn_parent: Option<&'static str>, + assignment_fn_parent: Option<&'static str>, + pair_fn_parent: Option<&'static str>, + + // Parameters + param_list_field: &'static str, + param_identifier: &'static str, + param_wrapper_types: &'static [&'static str], + default_param_type: Option<&'static str>, + rest_param_type: Option<&'static str>, + object_destruct_type: Option<&'static str>, + array_destruct_type: Option<&'static str>, + shorthand_prop_pattern: Option<&'static str>, + pair_pattern_type: Option<&'static str>, + extract_param_strategy: ParamStrategy, + + // Return + return_node: Option<&'static str>, + + // Variable declarations + var_declarator_node: Option<&'static str>, + var_declarator_nodes: &'static [&'static str], + var_name_field: &'static str, + var_value_field: Option<&'static str>, + assignment_node: Option<&'static str>, + assign_left_field: &'static str, + assign_right_field: &'static str, + + // Calls + call_node: Option<&'static str>, + call_nodes: &'static [&'static str], + call_function_field: &'static str, + call_args_field: &'static str, + spread_type: Option<&'static str>, + + // Member access + member_node: Option<&'static str>, + member_object_field: &'static str, + member_property_field: &'static str, + optional_chain_node: Option<&'static str>, + + // Await + await_node: Option<&'static str>, + + // Mutation + mutating_methods: &'static [&'static str], + expression_stmt_node: &'static str, + 
call_object_field: Option<&'static str>, + + // Method call name extraction (for languages where method_call uses a different + // field than call_function_field, e.g. Rust's method_call_expression has "name") + method_call_name_field: Option<&'static str>, + + // Method call receiver extraction (for languages where the method call receiver + // uses a different field than member_object_field, e.g. Rust's + // method_call_expression exposes "receiver" not "value") + method_call_receiver_field: Option<&'static str>, + + // Structural wrappers + expression_list_type: Option<&'static str>, + equals_clause_type: Option<&'static str>, + argument_wrapper_type: Option<&'static str>, + extra_identifier_types: &'static [&'static str], +} + +// ─── Per-Language Configs ──────────────────────────────────────────────── + +static JS_TS_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &[ + "function_declaration", + "method_definition", + "arrow_function", + "function_expression", + "function", + ], + name_field: "name", + var_assigned_fn_parent: Some("variable_declarator"), + assignment_fn_parent: Some("assignment_expression"), + pair_fn_parent: Some("pair"), + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &["required_parameter", "optional_parameter"], + default_param_type: Some("assignment_pattern"), + rest_param_type: Some("rest_pattern"), + object_destruct_type: Some("object_pattern"), + array_destruct_type: Some("array_pattern"), + shorthand_prop_pattern: Some("shorthand_property_identifier_pattern"), + pair_pattern_type: Some("pair_pattern"), + extract_param_strategy: ParamStrategy::Default, + return_node: Some("return_statement"), + var_declarator_node: Some("variable_declarator"), + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: Some("value"), + assignment_node: Some("assignment_expression"), + assign_left_field: "left", + assign_right_field: "right", + call_node: Some("call_expression"), + 
call_nodes: &[], + call_function_field: "function", + call_args_field: "arguments", + spread_type: Some("spread_element"), + member_node: Some("member_expression"), + member_object_field: "object", + member_property_field: "property", + optional_chain_node: Some("optional_chain_expression"), + await_node: Some("await_expression"), + mutating_methods: &[ + "push", "pop", "shift", "unshift", "splice", "sort", "reverse", "fill", "set", "delete", + "add", "clear", + ], + expression_stmt_node: "expression_statement", + call_object_field: None, + method_call_name_field: None, + method_call_receiver_field: None, + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: None, + extra_identifier_types: &[], +}; + +static PYTHON_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &["function_definition", "lambda"], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: Some("default_parameter"), + rest_param_type: Some("list_splat_pattern"), + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Python, + return_node: Some("return_statement"), + var_declarator_node: None, + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: Some("value"), + assignment_node: Some("assignment"), + assign_left_field: "left", + assign_right_field: "right", + call_node: Some("call"), + call_nodes: &[], + call_function_field: "function", + call_args_field: "arguments", + spread_type: Some("list_splat"), + member_node: Some("attribute"), + member_object_field: "object", + member_property_field: "attribute", + optional_chain_node: None, + await_node: Some("await"), + mutating_methods: &[ + "append", "extend", "insert", "pop", "remove", "clear", "sort", "reverse", "add", + 
"discard", "update", + ], + expression_stmt_node: "expression_statement", + call_object_field: None, + method_call_name_field: None, + method_call_receiver_field: None, + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: None, + extra_identifier_types: &[], +}; + +static GO_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &["function_declaration", "method_declaration", "func_literal"], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Go, + return_node: Some("return_statement"), + var_declarator_node: None, + // Only short_var_declaration uses left/right fields. var_declaration has + // var_spec children with name/type/value fields — not yet supported. 
+ var_declarator_nodes: &["short_var_declaration"], + var_name_field: "left", + var_value_field: Some("right"), + assignment_node: Some("assignment_statement"), + assign_left_field: "left", + assign_right_field: "right", + call_node: Some("call_expression"), + call_nodes: &[], + call_function_field: "function", + call_args_field: "arguments", + spread_type: None, + member_node: Some("selector_expression"), + member_object_field: "operand", + member_property_field: "field", + optional_chain_node: None, + await_node: None, + mutating_methods: &[], + expression_stmt_node: "expression_statement", + call_object_field: None, + method_call_name_field: None, + method_call_receiver_field: None, + expression_list_type: Some("expression_list"), + equals_clause_type: None, + argument_wrapper_type: None, + extra_identifier_types: &[], +}; + +static RUST_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &["function_item", "closure_expression"], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Rust, + return_node: Some("return_expression"), + var_declarator_node: Some("let_declaration"), + var_declarator_nodes: &[], + var_name_field: "pattern", + var_value_field: Some("value"), + assignment_node: Some("assignment_expression"), + assign_left_field: "left", + assign_right_field: "right", + call_node: None, + call_nodes: &["call_expression", "method_call_expression"], + call_function_field: "function", + call_args_field: "arguments", + spread_type: None, + member_node: Some("field_expression"), + member_object_field: "value", + member_property_field: "field", + optional_chain_node: None, + await_node: 
Some("await_expression"), + mutating_methods: &["push", "pop", "insert", "remove", "clear", "sort", "reverse"], + expression_stmt_node: "expression_statement", + call_object_field: None, + method_call_name_field: Some("name"), + method_call_receiver_field: Some("receiver"), + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: None, + extra_identifier_types: &[], +}; + +static JAVA_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &[ + "method_declaration", + "constructor_declaration", + "lambda_expression", + ], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Java, + return_node: Some("return_statement"), + var_declarator_node: Some("variable_declarator"), + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: Some("value"), + assignment_node: Some("assignment_expression"), + assign_left_field: "left", + assign_right_field: "right", + call_node: None, + call_nodes: &["method_invocation", "object_creation_expression"], + call_function_field: "name", + call_args_field: "arguments", + spread_type: None, + member_node: Some("field_access"), + member_object_field: "object", + member_property_field: "field", + optional_chain_node: None, + await_node: None, + mutating_methods: &["add", "remove", "clear", "put", "set", "push", "pop", "sort"], + expression_stmt_node: "expression_statement", + call_object_field: Some("object"), + method_call_name_field: None, + method_call_receiver_field: None, + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: Some("argument"), + extra_identifier_types: &[], +}; + +static CSHARP_DATAFLOW: 
DataflowRules = DataflowRules { + function_nodes: &[ + "method_declaration", + "constructor_declaration", + "lambda_expression", + "local_function_statement", + ], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::CSharp, + return_node: Some("return_statement"), + var_declarator_node: Some("variable_declarator"), + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: None, + assignment_node: Some("assignment_expression"), + assign_left_field: "left", + assign_right_field: "right", + call_node: Some("invocation_expression"), + call_nodes: &[], + call_function_field: "function", + call_args_field: "arguments", + spread_type: None, + member_node: Some("member_access_expression"), + member_object_field: "expression", + member_property_field: "name", + optional_chain_node: None, + await_node: Some("await_expression"), + mutating_methods: &["Add", "Remove", "Clear", "Insert", "Sort", "Reverse", "Push", "Pop"], + expression_stmt_node: "expression_statement", + call_object_field: None, + method_call_name_field: None, + method_call_receiver_field: None, + expression_list_type: None, + equals_clause_type: Some("equals_value_clause"), + argument_wrapper_type: Some("argument"), + extra_identifier_types: &[], +}; + +static PHP_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &[ + "function_definition", + "method_declaration", + "anonymous_function_creation_expression", + "arrow_function", + ], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "variable_name", + 
param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Php, + return_node: Some("return_statement"), + var_declarator_node: None, + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: Some("value"), + assignment_node: Some("assignment_expression"), + assign_left_field: "left", + assign_right_field: "right", + call_node: None, + call_nodes: &[ + "function_call_expression", + "member_call_expression", + "scoped_call_expression", + ], + call_function_field: "function", + call_args_field: "arguments", + spread_type: Some("spread_expression"), + member_node: Some("member_access_expression"), + member_object_field: "object", + member_property_field: "name", + optional_chain_node: None, + await_node: None, + mutating_methods: &["push", "pop", "shift", "unshift", "splice", "sort", "reverse"], + expression_stmt_node: "expression_statement", + call_object_field: None, + method_call_name_field: None, + method_call_receiver_field: None, + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: Some("argument"), + extra_identifier_types: &["variable_name", "name"], +}; + +static RUBY_DATAFLOW: DataflowRules = DataflowRules { + function_nodes: &["method", "singleton_method", "lambda"], + name_field: "name", + var_assigned_fn_parent: None, + assignment_fn_parent: None, + pair_fn_parent: None, + param_list_field: "parameters", + param_identifier: "identifier", + param_wrapper_types: &[], + default_param_type: None, + rest_param_type: None, + object_destruct_type: None, + array_destruct_type: None, + shorthand_prop_pattern: None, + pair_pattern_type: None, + extract_param_strategy: ParamStrategy::Ruby, + return_node: Some("return"), + var_declarator_node: None, + var_declarator_nodes: &[], + var_name_field: "name", + var_value_field: Some("value"), + 
assignment_node: Some("assignment"), + assign_left_field: "left", + assign_right_field: "right", + call_node: Some("call"), + call_nodes: &[], + call_function_field: "method", + call_args_field: "arguments", + spread_type: Some("splat_parameter"), + member_node: Some("call"), + member_object_field: "receiver", + member_property_field: "method", + optional_chain_node: None, + await_node: None, + mutating_methods: &[ + "push", "pop", "shift", "unshift", "delete", "clear", "sort!", "reverse!", "map!", + "select!", "reject!", "compact!", "flatten!", "concat", "replace", "insert", + ], + expression_stmt_node: "expression_statement", + call_object_field: None, + method_call_name_field: None, + method_call_receiver_field: None, + expression_list_type: None, + equals_clause_type: None, + argument_wrapper_type: None, + extra_identifier_types: &[], +}; + +/// Get dataflow rules for a language ID string. +fn get_dataflow_rules(lang_id: &str) -> Option<&'static DataflowRules> { + match lang_id { + "javascript" | "typescript" | "tsx" => Some(&JS_TS_DATAFLOW), + "python" => Some(&PYTHON_DATAFLOW), + "go" => Some(&GO_DATAFLOW), + "rust" => Some(&RUST_DATAFLOW), + "java" => Some(&JAVA_DATAFLOW), + "csharp" => Some(&CSHARP_DATAFLOW), + "php" => Some(&PHP_DATAFLOW), + "ruby" => Some(&RUBY_DATAFLOW), + _ => None, + } +} + +// ─── Helpers ───────────────────────────────────────────────────────────── + +fn is_call_node(rules: &DataflowRules, kind: &str) -> bool { + if !rules.call_nodes.is_empty() { + rules.call_nodes.contains(&kind) + } else { + rules.call_node.is_some_and(|cn| cn == kind) + } +} + +fn is_function_node(rules: &DataflowRules, kind: &str) -> bool { + rules.function_nodes.contains(&kind) +} + +fn is_ident(rules: &DataflowRules, kind: &str) -> bool { + kind == "identifier" + || kind == rules.param_identifier + || rules.extra_identifier_types.contains(&kind) +} + +fn truncate(s: &str, max: usize) -> String { + if s.chars().count() <= max { + s.to_string() + } else { + // 
Find the byte offset of the max-th character + let byte_offset = s + .char_indices() + .nth(max) + .map(|(i, _)| i) + .unwrap_or(s.len()); + let mut result = s[..byte_offset].to_string(); + result.push('…'); + result + } +} + +fn node_text<'a>(node: &Node, source: &'a [u8]) -> &'a str { + node.utf8_text(source).unwrap_or("") +} + +fn node_line(node: &Node) -> u32 { + node.start_position().row as u32 + 1 +} + +/// Extract function name from a function AST node. +fn function_name<'a>(fn_node: &Node<'a>, rules: &DataflowRules, source: &[u8]) -> Option { + // Try the standard name field + if let Some(name_node) = fn_node.child_by_field_name(rules.name_field) { + return Some(node_text(&name_node, source).to_string()); + } + + // JS-specific: arrow_function/function_expression assigned to variable, pair, or assignment + if let Some(parent) = fn_node.parent() { + let pt = parent.kind(); + if rules.var_assigned_fn_parent.is_some_and(|v| v == pt) { + let n = parent.child_by_field_name("name"); + return n.map(|n| node_text(&n, source).to_string()); + } + if rules.pair_fn_parent.is_some_and(|v| v == pt) { + let key = parent.child_by_field_name("key"); + return key.map(|k| node_text(&k, source).to_string()); + } + if rules.assignment_fn_parent.is_some_and(|v| v == pt) { + let left = parent.child_by_field_name(rules.assign_left_field); + return left.map(|l| node_text(&l, source).to_string()); + } + } + None +} + +/// Extract parameter names using per-language strategy. 
+///
+/// Returns `None` when `strategy` has no special handling for this node kind,
+/// so the caller falls back to the generic rule-driven extraction.
+/// `Some(vec![])` means the node kind was recognised but no name could be
+/// read from it.
+fn extract_param_names_strategy(
+    node: &Node,
+    strategy: ParamStrategy,
+    source: &[u8],
+) -> Option<Vec<String>> {
+    let kind = node.kind();
+    let text_of = |n: &Node| node_text(n, source).to_string();
+
+    // Name read from the node's `name` field: one entry, or empty when absent.
+    let from_name_field = || -> Vec<String> {
+        node.child_by_field_name("name")
+            .map(|n| vec![text_of(&n)])
+            .unwrap_or_default()
+    };
+    // First named child of kind `identifier`: one entry, or empty when none.
+    let first_identifier_child = || -> Vec<String> {
+        let cursor = &mut node.walk();
+        let first = node
+            .named_children(cursor)
+            .find(|c| c.kind() == "identifier");
+        first.map(|c| vec![text_of(&c)]).unwrap_or_default()
+    };
+    // Every named child of kind `identifier`, in source order.
+    let identifier_children = || -> Vec<String> {
+        let cursor = &mut node.walk();
+        let names: Vec<String> = node
+            .named_children(cursor)
+            .filter(|c| c.kind() == "identifier")
+            .map(|c| text_of(&c))
+            .collect();
+        names
+    };
+
+    match strategy {
+        ParamStrategy::Default => None,
+        ParamStrategy::Python => match kind {
+            "typed_parameter"
+            | "typed_default_parameter"
+            | "list_splat_pattern"
+            | "dictionary_splat_pattern" => Some(first_identifier_child()),
+            "default_parameter" => Some(from_name_field()),
+            _ => None,
+        },
+        ParamStrategy::Go => match kind {
+            // One declaration may introduce several names (`a, b int`); a
+            // declaration without identifiers defers to the generic path.
+            "parameter_declaration" => {
+                let names = identifier_children();
+                (!names.is_empty()).then_some(names)
+            }
+            "variadic_parameter_declaration" => node
+                .child_by_field_name("name")
+                .map(|n| vec![text_of(&n)]),
+            _ => None,
+        },
+        ParamStrategy::Rust => match kind {
+            // Only plain identifier patterns yield a name; any other pattern
+            // (or a missing one) yields an empty list.
+            "parameter" => Some(
+                node.child_by_field_name("pattern")
+                    .filter(|p| p.kind() == "identifier")
+                    .map(|p| vec![text_of(&p)])
+                    .unwrap_or_default(),
+            ),
+            "identifier" => Some(vec![text_of(node)]),
+            _ => None,
+        },
+        ParamStrategy::Java => match kind {
+            "formal_parameter" | "spread_parameter" => Some(from_name_field()),
+            "identifier" => Some(vec![text_of(node)]),
+            _ => None,
+        },
+        ParamStrategy::CSharp => match kind {
+            "parameter" => Some(from_name_field()),
+            "identifier" => Some(vec![text_of(node)]),
+            _ => None,
+        },
+        ParamStrategy::Php => match kind {
+            "simple_parameter" | "variadic_parameter" => Some(from_name_field()),
+            "variable_name" => Some(vec![text_of(node)]),
+            _ => None,
+        },
+        ParamStrategy::Ruby => match kind {
+            "identifier" => Some(vec![text_of(node)]),
+            "optional_parameter"
+            | "keyword_parameter"
+            | "splat_parameter"
+            | "hash_splat_parameter" => Some(from_name_field()),
+            _ => None,
+        },
+    }
+}
+
+/// Extract parameter names from a node, using rules and strategy.
+///
+/// The language-specific strategy is consulted first; when it returns `None`
+/// the generic rules apply in this order: leaf identifier, wrapper types,
+/// default parameter, rest/splat parameter, object destructuring, array
+/// destructuring. A node matching none of them yields an empty list.
+fn extract_param_names(node: &Node, rules: &DataflowRules, source: &[u8]) -> Vec<String> {
+    // Language-specific override
+    if let Some(names) = extract_param_names_strategy(node, rules.extract_param_strategy, source) {
+        return names;
+    }
+
+    let kind = node.kind();
+    let kind_is = |expected: Option<&str>| expected.is_some_and(|k| k == kind);
+
+    // Leaf identifier
+    if kind == rules.param_identifier {
+        return vec![node_text(node, source).to_string()];
+    }
+
+    // Wrapper types (TS required_parameter, etc.): recurse into `pattern`, else `name`
+    if rules.param_wrapper_types.contains(&kind) {
+        return node
+            .child_by_field_name("pattern")
+            .or_else(|| node.child_by_field_name("name"))
+            .map(|inner| extract_param_names(&inner, rules, source))
+            .unwrap_or_default();
+    }
+
+    // Default parameter: the binding is in `left`, else `name`
+    if kind_is(rules.default_param_type) {
+        return node
+            .child_by_field_name("left")
+            .or_else(|| node.child_by_field_name("name"))
+            .map(|binding| extract_param_names(&binding, rules, source))
+            .unwrap_or_default();
+    }
+
+    // Rest / splat parameter: `name` field, else the first identifier child
+    if kind_is(rules.rest_param_type) {
+        if let Some(name_node) = node.child_by_field_name("name") {
+            return vec![node_text(&name_node, source).to_string()];
+        }
+        let cursor = &mut node.walk();
+        let first_ident = node
+            .named_children(cursor)
+            .find(|child| child.kind() == rules.param_identifier);
+        return first_ident
+            .map(|child| vec![node_text(&child, source).to_string()])
+            .unwrap_or_default();
+    }
+
+    // Object destructuring (JS only)
+    if kind_is(rules.object_destruct_type) {
+        let mut names = Vec::new();
+        let cursor = &mut node.walk();
+        for child in node.named_children(cursor) {
+            let child_kind = child.kind();
+            if rules.shorthand_prop_pattern.is_some_and(|s| s == child_kind) {
+                // `{ a }` — the shorthand property is itself the binding.
+                names.push(node_text(&child, source).to_string());
+            } else if rules.pair_pattern_type.is_some_and(|p| p == child_kind) {
+                // `{ key: binding }` — only the value side binds names.
+                if let Some(value) = child.child_by_field_name("value") {
+                    names.extend(extract_param_names(&value, rules, source));
+                }
+            } else if rules.rest_param_type.is_some_and(|r| r == child_kind) {
+                names.extend(extract_param_names(&child, rules, source));
+            }
+        }
+        return names;
+    }
+
+    // Array destructuring (JS only)
+    if kind_is(rules.array_destruct_type) {
+        let cursor = &mut node.walk();
+        let names: Vec<String> = node
+            .named_children(cursor)
+            .flat_map(|child| extract_param_names(&child, rules, source))
+            .collect();
+        return names;
+    }
+
+    Vec::new()
+}
+
+/// Extract parameters: name + index pairs from formal_parameters node.
+fn extract_params(params_node: &Node, rules: &DataflowRules, source: &[u8]) -> Vec<(String, u32)> {
+    // Each named child of the parameter list occupies one positional slot, even
+    // when it yields no names or several (destructuring): every name produced by
+    // a single child shares that child's index.
+    let mut result = Vec::new();
+    let mut index: u32 = 0;
+    let cursor = &mut params_node.walk();
+    for child in params_node.named_children(cursor) {
+        result.extend(
+            extract_param_names(&child, rules, source)
+                .into_iter()
+                .map(|name| (name, index)),
+        );
+        index += 1;
+    }
+    result
+}
+
+/// Resolve the callee name from a call expression node.
+///
+/// Resolution order:
+/// 1. the node under `rules.call_function_field`, when it is an identifier;
+/// 2. the property of a member expression (`obj.method` resolves to `method`);
+/// 3. the target wrapped by an optional-chain node, resolved the same way;
+/// 4. when the call has no function field at all, its `name` or `method` field.
+///
+/// Returns `None` when none of these shapes match.
+fn resolve_callee_name(call_node: &Node, rules: &DataflowRules, source: &[u8]) -> Option<String> {
+    let Some(f) = call_node.child_by_field_name(rules.call_function_field) else {
+        // Some languages (Java method_invocation, Ruby call) use 'name'/'method' directly
+        return call_node
+            .child_by_field_name("name")
+            .or_else(|| call_node.child_by_field_name("method"))
+            .map(|n| node_text(&n, source).to_string());
+    };
+
+    let kind = f.kind();
+    if is_ident(rules, kind) {
+        return Some(node_text(&f, source).to_string());
+    }
+    if rules.member_node.is_some_and(|m| m == kind) {
+        return f
+            .child_by_field_name(rules.member_property_field)
+            .map(|p| node_text(&p, source).to_string());
+    }
+    if rules.optional_chain_node.is_some_and(|o| o == kind) {
+        if let Some(target) = f.named_child(0) {
+            if rules.member_node.is_some_and(|m| m == target.kind()) {
+                return target
+                    .child_by_field_name(rules.member_property_field)
+                    .map(|p| node_text(&p, source).to_string());
+            }
+            if target.kind() == "identifier" {
+                return Some(node_text(&target, source).to_string());
+            }
+        }
+        // No usable wrapped target: fall back to a property field on the chain node itself.
+        return f
+            .child_by_field_name(rules.member_property_field)
+            .map(|p| node_text(&p, source).to_string());
+    }
+    None
+}
+
+/// Get the receiver (object) of a member expression.
+fn member_receiver(member_expr: &Node, rules: &DataflowRules, source: &[u8]) -> Option { + let obj = member_expr.child_by_field_name(rules.member_object_field)?; + if is_ident(rules, obj.kind()) { + return Some(node_text(&obj, source).to_string()); + } + if rules.member_node.is_some_and(|m| m == obj.kind()) { + return member_receiver(&obj, rules, source); + } + None +} + +/// Collect all identifier names referenced within a node. +fn collect_identifiers(node: &Node, out: &mut Vec, rules: &DataflowRules, source: &[u8], depth: usize) { + if depth >= MAX_VISIT_DEPTH { + return; + } + if is_ident(rules, node.kind()) { + out.push(node_text(node, source).to_string()); + return; + } + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + collect_identifiers(&child, out, rules, source, depth + 1); + } +} + +// ─── Scope Tracking ────────────────────────────────────────────────────── + +#[derive(Debug, Clone)] +enum LocalSource { + CallReturn { callee: String }, + Destructured { callee: String }, +} + +struct ScopeFrame { + func_name: Option, + params: HashMap, + locals: HashMap, +} + +/// Binding info returned by find_binding. +struct BindingInfo { + binding_type: String, + confidence: f64, +} + +fn find_binding(scope_stack: &[ScopeFrame], name: &str) -> Option { + for scope in scope_stack.iter().rev() { + if scope.params.contains_key(name) { + return Some(BindingInfo { + binding_type: "param".to_string(), + confidence: 1.0, + }); + } + if let Some(local) = scope.locals.get(name) { + let confidence = match local { + LocalSource::CallReturn { .. } => 0.9, + LocalSource::Destructured { .. 
} => 0.8, + }; + return Some(BindingInfo { + binding_type: "local".to_string(), + confidence, + }); + } + } + None +} + +fn binding_confidence(binding: &Option) -> f64 { + match binding { + Some(b) => b.confidence, + None => 0.5, + } +} + +// ─── Core: extract_dataflow ────────────────────────────────────────────── + +/// Extract dataflow information from a parsed AST tree. +/// Returns None if the language has no dataflow rules (e.g., HCL). +pub fn extract_dataflow(tree: &Tree, source: &[u8], lang_id: &str) -> Option { + let rules = get_dataflow_rules(lang_id)?; + + let mut parameters = Vec::new(); + let mut returns = Vec::new(); + let mut assignments = Vec::new(); + let mut arg_flows = Vec::new(); + let mut mutations = Vec::new(); + + let mut scope_stack: Vec = Vec::new(); + + visit( + &tree.root_node(), + rules, + source, + &mut scope_stack, + &mut parameters, + &mut returns, + &mut assignments, + &mut arg_flows, + &mut mutations, + 0, + ); + + Some(DataflowResult { + parameters, + returns, + assignments, + arg_flows, + mutations, + }) +} + +#[allow(clippy::too_many_arguments)] +fn visit( + node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &mut Vec, + parameters: &mut Vec, + returns: &mut Vec, + assignments: &mut Vec, + arg_flows: &mut Vec, + mutations: &mut Vec, + depth: usize, +) { + if depth >= MAX_VISIT_DEPTH { + return; + } + + let t = node.kind(); + + // Enter function scope + if is_function_node(rules, t) { + enter_scope(node, rules, source, scope_stack, parameters); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); + } + scope_stack.pop(); + return; + } + + // Return statements + if rules.return_node.is_some_and(|r| r == t) { + if let Some(scope) = scope_stack.last() { + if let Some(ref func_name) = scope.func_name { + let expr = node.named_child(0); + let mut referenced_names = Vec::new(); + if 
let Some(ref e) = expr { + collect_identifiers(e, &mut referenced_names, rules, source, depth + 1); + } + returns.push(DataflowReturn { + func_name: func_name.clone(), + expression: truncate( + expr.map(|e| node_text(&e, source)).unwrap_or(""), + 120, + ), + referenced_names, + line: node_line(node), + }); + } + } + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); + } + return; + } + + // Variable declarations (single type) + if rules.var_declarator_node.is_some_and(|v| v == t) { + handle_var_declarator(node, rules, source, scope_stack, assignments); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); + } + return; + } + + // Variable declarations (multi-type, e.g., Go) + if !rules.var_declarator_nodes.is_empty() && rules.var_declarator_nodes.contains(&t) { + handle_var_declarator(node, rules, source, scope_stack, assignments); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); + } + return; + } + + // Call expressions + if is_call_node(rules, t) { + handle_call_expr(node, rules, source, scope_stack, arg_flows); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); + } + return; + } + + // Assignment expressions + if rules.assignment_node.is_some_and(|a| a == t) { + handle_assignment(node, rules, source, scope_stack, assignments, mutations); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 
1); + } + return; + } + + // Mutation detection via expression_statement + if t == rules.expression_stmt_node { + handle_expr_stmt_mutation(node, rules, source, scope_stack, mutations); + } + + // Default: visit children + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1); + } +} + +fn enter_scope( + fn_node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &mut Vec, + parameters: &mut Vec, +) { + let name = function_name(fn_node, rules, source); + let params_node = fn_node.child_by_field_name(rules.param_list_field); + let param_list = params_node + .as_ref() + .map(|pn| extract_params(pn, rules, source)) + .unwrap_or_default(); + + let mut param_map = HashMap::new(); + for (pname, pidx) in ¶m_list { + param_map.insert(pname.clone(), *pidx); + if let Some(ref fn_name) = name { + let line = params_node + .as_ref() + .map(|pn| node_line(pn)) + .unwrap_or_else(|| node_line(fn_node)); + parameters.push(DataflowParam { + func_name: fn_name.clone(), + param_name: pname.clone(), + param_index: *pidx, + line, + }); + } + } + + scope_stack.push(ScopeFrame { + func_name: name, + params: param_map, + locals: HashMap::new(), + }); +} + +/// Unwrap await if present, returning the inner expression. 
+fn unwrap_await<'a>(node: &Node<'a>, rules: &DataflowRules) -> Node<'a> { + if rules.await_node.is_some_and(|a| a == node.kind()) { + if let Some(inner) = node.named_child(0) { + return inner; + } + } + *node +} + +fn handle_var_declarator( + node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &mut Vec, + assignments: &mut Vec, +) { + let mut name_node = node.child_by_field_name(rules.var_name_field); + let mut value_node = rules.var_value_field.and_then(|f| node.child_by_field_name(f)); + + // C#: initializer is inside equals_value_clause child + if value_node.is_none() { + if let Some(eq_type) = rules.equals_clause_type { + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + if child.kind() == eq_type { + value_node = child + .child_by_field_name("value") + .or_else(|| child.named_child(0)); + break; + } + } + } + } + + // Fallback: initializer is a direct unnamed child (C# variable_declarator) + if value_node.is_none() { + if let Some(ref nn) = name_node { + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + if child.id() != nn.id() { + let uw = unwrap_await(&child, rules); + if is_call_node(rules, uw.kind()) { + value_node = Some(child); + break; + } + } + } + } + } + + // Go: expression_list wraps LHS/RHS — unwrap to first named child + if let Some(el_type) = rules.expression_list_type { + if name_node.as_ref().is_some_and(|n| n.kind() == el_type) { + name_node = name_node.and_then(|n| n.named_child(0)); + } + if value_node.as_ref().is_some_and(|v| v.kind() == el_type) { + value_node = value_node.and_then(|v| v.named_child(0)); + } + } + + let scope = match scope_stack.last_mut() { + Some(s) => s, + None => return, + }; + let name_n = match name_node { + Some(n) => n, + None => return, + }; + let value_n = match value_node { + Some(v) => v, + None => return, + }; + + let unwrapped = unwrap_await(&value_n, rules); + if !is_call_node(rules, unwrapped.kind()) { + return; + } + + let callee = 
match resolve_callee_name(&unwrapped, rules, source) { + Some(c) => c, + None => return, + }; + let func_name = match &scope.func_name { + Some(f) => f.clone(), + None => return, + }; + + // Destructuring: const { a, b } = foo() + let is_obj_destruct = rules.object_destruct_type.is_some_and(|o| o == name_n.kind()); + let is_arr_destruct = rules.array_destruct_type.is_some_and(|a| a == name_n.kind()); + + if is_obj_destruct || is_arr_destruct { + let names = extract_param_names(&name_n, rules, source); + for n in &names { + assignments.push(DataflowAssignment { + var_name: n.clone(), + caller_func: Some(func_name.clone()), + source_call_name: callee.clone(), + expression: truncate(node_text(node, source), 120), + line: node_line(node), + }); + scope + .locals + .insert(n.clone(), LocalSource::Destructured { callee: callee.clone() }); + } + } else { + let var_name = node_text(&name_n, source).to_string(); + assignments.push(DataflowAssignment { + var_name: var_name.clone(), + caller_func: Some(func_name), + source_call_name: callee.clone(), + expression: truncate(node_text(node, source), 120), + line: node_line(node), + }); + scope.locals.insert(var_name, LocalSource::CallReturn { callee }); + } +} + +fn handle_assignment( + node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &mut Vec, + assignments: &mut Vec, + mutations: &mut Vec, +) { + let left = node.child_by_field_name(rules.assign_left_field); + let right = node.child_by_field_name(rules.assign_right_field); + + let func_name = match scope_stack.last() { + Some(s) => match &s.func_name { + Some(f) => f.clone(), + None => return, + }, + None => return, + }; + + // Mutation: obj.prop = value + if let Some(ref left_n) = left { + if rules.member_node.is_some_and(|m| m == left_n.kind()) { + if let Some(receiver) = member_receiver(left_n, rules, source) { + let binding = find_binding(scope_stack, &receiver); + if binding.is_some() { + mutations.push(DataflowMutation { + func_name: 
Some(func_name.clone()), + receiver_name: receiver, + binding_type: binding.as_ref().map(|b| b.binding_type.clone()), + mutating_expr: truncate(node_text(node, source), 120), + line: node_line(node), + }); + } + } + } + } + + // Non-declaration assignment: x = foo() + if let (Some(left_n), Some(right_n)) = (left, right) { + if is_ident(rules, left_n.kind()) { + let unwrapped = unwrap_await(&right_n, rules); + if is_call_node(rules, unwrapped.kind()) { + if let Some(callee) = resolve_callee_name(&unwrapped, rules, source) { + let var_name = node_text(&left_n, source).to_string(); + assignments.push(DataflowAssignment { + var_name: var_name.clone(), + caller_func: Some(func_name), + source_call_name: callee.clone(), + expression: truncate(node_text(node, source), 120), + line: node_line(node), + }); + if let Some(scope) = scope_stack.last_mut() { + scope.locals.insert(var_name, LocalSource::CallReturn { callee }); + } + } + } + } + } +} + +fn handle_call_expr( + node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &[ScopeFrame], + arg_flows: &mut Vec, +) { + let callee = match resolve_callee_name(node, rules, source) { + Some(c) => c, + None => return, + }; + let args_node = match node.child_by_field_name(rules.call_args_field) { + Some(a) => a, + None => return, + }; + let func_name = match scope_stack.last() { + Some(s) => match &s.func_name { + Some(f) => f.clone(), + None => return, + }, + None => return, + }; + + let mut arg_index: u32 = 0; + let cursor = &mut args_node.walk(); + for arg_raw in args_node.named_children(cursor) { + // PHP/Java: unwrap argument wrapper + let arg = if rules.argument_wrapper_type.is_some_and(|w| w == arg_raw.kind()) { + arg_raw.named_child(0).unwrap_or(arg_raw) + } else { + arg_raw + }; + + let unwrapped = if rules.spread_type.is_some_and(|s| s == arg.kind()) { + arg.named_child(0).unwrap_or(arg) + } else { + arg + }; + + let arg_name = if is_ident(rules, unwrapped.kind()) { + Some(node_text(&unwrapped, 
source).to_string()) + } else { + None + }; + let arg_member = if arg_name.is_none() + && rules.member_node.is_some_and(|m| m == unwrapped.kind()) + { + member_receiver(&unwrapped, rules, source) + } else { + None + }; + let tracked_name = arg_name.clone().or(arg_member); + + if let Some(ref tracked) = tracked_name { + let binding = find_binding(scope_stack, tracked); + if binding.is_some() { + let conf = binding_confidence(&binding); + arg_flows.push(DataflowArgFlow { + caller_func: Some(func_name.clone()), + callee_name: callee.clone(), + arg_index, + arg_name: Some(tracked.clone()), + binding_type: binding.as_ref().map(|b| b.binding_type.clone()), + confidence: conf, + expression: truncate(node_text(&arg_raw, source), 120), + line: node_line(node), + }); + } + } + arg_index += 1; + } +} + +fn handle_expr_stmt_mutation( + node: &Node, + rules: &DataflowRules, + source: &[u8], + scope_stack: &[ScopeFrame], + mutations: &mut Vec, +) { + if rules.mutating_methods.is_empty() { + return; + } + let expr = match node.named_child(0) { + Some(e) => e, + None => return, + }; + if !is_call_node(rules, expr.kind()) { + return; + } + + let mut method_name: Option = None; + let mut receiver: Option = None; + + // Standard pattern: call(fn: member(obj, prop)) + if let Some(fn_node) = expr.child_by_field_name(rules.call_function_field) { + if rules.member_node.is_some_and(|m| m == fn_node.kind()) { + if let Some(prop) = fn_node.child_by_field_name(rules.member_property_field) { + method_name = Some(node_text(&prop, source).to_string()); + } + receiver = member_receiver(&fn_node, rules, source); + } + } + + // Method call pattern: call node has a dedicated name field distinct from + // call_function_field (e.g. 
Rust method_call_expression has "name" + "receiver") + if method_name.is_none() { + if let Some(name_field) = rules.method_call_name_field { + if let Some(name_n) = expr.child_by_field_name(name_field) { + method_name = Some(node_text(&name_n, source).to_string()); + // Extract receiver: prefer method_call_receiver_field if set, + // otherwise fall back to member_object_field + let recv_field = rules + .method_call_receiver_field + .unwrap_or(rules.member_object_field); + if let Some(recv_node) = expr.child_by_field_name(recv_field) { + if is_ident(rules, recv_node.kind()) { + receiver = Some(node_text(&recv_node, source).to_string()); + } else if rules.member_node.is_some_and(|m| m == recv_node.kind()) { + receiver = member_receiver(&recv_node, rules, source); + } + } + } + } + } + + // Java/combined pattern: call node itself has object + name fields + if receiver.is_none() { + if let Some(obj_field) = rules.call_object_field { + let obj = expr.child_by_field_name(obj_field); + let name = expr.child_by_field_name(rules.call_function_field); + if let (Some(obj_n), Some(name_n)) = (obj, name) { + method_name = Some(node_text(&name_n, source).to_string()); + if is_ident(rules, obj_n.kind()) { + receiver = Some(node_text(&obj_n, source).to_string()); + } + } + } + } + + let method = match method_name { + Some(m) => m, + None => return, + }; + if !rules.mutating_methods.contains(&method.as_str()) { + return; + } + + let recv = match receiver { + Some(r) => r, + None => return, + }; + let func_name = match scope_stack.last() { + Some(s) => s.func_name.clone(), + None => None, + }; + if func_name.is_none() { + return; + } + + let binding = find_binding(scope_stack, &recv); + if binding.is_some() { + mutations.push(DataflowMutation { + func_name, + receiver_name: recv, + binding_type: binding.as_ref().map(|b| b.binding_type.clone()), + mutating_expr: truncate(node_text(&expr, source), 120), + line: node_line(node), + }); + } +} diff --git 
a/crates/codegraph-core/src/lib.rs b/crates/codegraph-core/src/lib.rs index 90e673e..607aec1 100644 --- a/crates/codegraph-core/src/lib.rs +++ b/crates/codegraph-core/src/lib.rs @@ -7,20 +7,31 @@ pub mod cycles; pub mod incremental; pub mod complexity; pub mod cfg; +pub mod dataflow; use napi_derive::napi; use types::*; /// Parse a single file and return extracted symbols. +/// When `include_dataflow` is true, dataflow analysis is also extracted. #[napi] -pub fn parse_file(file_path: String, source: String) -> Option { - parallel::parse_file(&file_path, &source) +pub fn parse_file( + file_path: String, + source: String, + include_dataflow: Option, +) -> Option { + parallel::parse_file(&file_path, &source, include_dataflow.unwrap_or(false)) } /// Parse multiple files in parallel and return all extracted symbols. +/// When `include_dataflow` is true, dataflow analysis is also extracted. #[napi] -pub fn parse_files(file_paths: Vec, root_dir: String) -> Vec { - parallel::parse_files_parallel(&file_paths, &root_dir) +pub fn parse_files( + file_paths: Vec, + root_dir: String, + include_dataflow: Option, +) -> Vec { + parallel::parse_files_parallel(&file_paths, &root_dir, include_dataflow.unwrap_or(false)) } /// Resolve a single import path. diff --git a/crates/codegraph-core/src/parallel.rs b/crates/codegraph-core/src/parallel.rs index e2c8aad..7fb0d8d 100644 --- a/crates/codegraph-core/src/parallel.rs +++ b/crates/codegraph-core/src/parallel.rs @@ -2,6 +2,7 @@ use rayon::prelude::*; use std::fs; use tree_sitter::Parser; +use crate::dataflow::extract_dataflow; use crate::extractors::extract_symbols; use crate::parser_registry::LanguageKind; use crate::types::FileSymbols; @@ -9,7 +10,12 @@ use crate::types::FileSymbols; /// Parse multiple files in parallel using rayon. /// Each thread creates its own Parser (cheap; Language objects are Send+Sync). /// Failed files are silently skipped (matches WASM behavior). 
-pub fn parse_files_parallel(file_paths: &[String], _root_dir: &str) -> Vec { +/// When `include_dataflow` is false, dataflow extraction is skipped for performance. +pub fn parse_files_parallel( + file_paths: &[String], + _root_dir: &str, + include_dataflow: bool, +) -> Vec { file_paths .par_iter() .filter_map(|file_path| { @@ -24,6 +30,9 @@ pub fn parse_files_parallel(file_paths: &[String], _root_dir: &str) -> Vec Vec Option { +/// When `include_dataflow` is false, dataflow extraction is skipped for performance. +pub fn parse_file(file_path: &str, source: &str, include_dataflow: bool) -> Option { let lang = LanguageKind::from_extension(file_path)?; let source_bytes = source.as_bytes(); @@ -43,6 +53,9 @@ pub fn parse_file(file_path: &str, source: &str) -> Option { let tree = parser.parse(source_bytes, None)?; let line_count = source_bytes.iter().filter(|&&b| b == b'\n').count() as u32 + 1; let mut symbols = extract_symbols(lang, &tree, source_bytes, file_path); + if include_dataflow { + symbols.dataflow = extract_dataflow(&tree, source_bytes, lang.lang_id_str()); + } symbols.line_count = Some(line_count); Some(symbols) } diff --git a/crates/codegraph-core/src/parser_registry.rs b/crates/codegraph-core/src/parser_registry.rs index 2c2c7e9..f800b27 100644 --- a/crates/codegraph-core/src/parser_registry.rs +++ b/crates/codegraph-core/src/parser_registry.rs @@ -17,6 +17,24 @@ pub enum LanguageKind { } impl LanguageKind { + /// Return the string ID used by dataflow/cfg rules lookup. + /// Matches the JS `DATAFLOW_RULES` map keys in `src/dataflow.js`. 
+ pub fn lang_id_str(&self) -> &'static str { + match self { + Self::JavaScript => "javascript", + Self::TypeScript => "typescript", + Self::Tsx => "tsx", + Self::Python => "python", + Self::Go => "go", + Self::Rust => "rust", + Self::Java => "java", + Self::CSharp => "csharp", + Self::Ruby => "ruby", + Self::Php => "php", + Self::Hcl => "hcl", + } + } + /// Determine language from file extension — mirrors `getParser()` in parser.js pub fn from_extension(file_path: &str) -> Option { let path = Path::new(file_path); diff --git a/crates/codegraph-core/src/types.rs b/crates/codegraph-core/src/types.rs index c381671..f1b68ff 100644 --- a/crates/codegraph-core/src/types.rs +++ b/crates/codegraph-core/src/types.rs @@ -175,6 +175,87 @@ pub struct AstNode { pub receiver: Option, } +// ─── Dataflow Types ────────────────────────────────────────────────────── + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowParam { + #[napi(js_name = "funcName")] + pub func_name: String, + #[napi(js_name = "paramName")] + pub param_name: String, + #[napi(js_name = "paramIndex")] + pub param_index: u32, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowReturn { + #[napi(js_name = "funcName")] + pub func_name: String, + pub expression: String, + #[napi(js_name = "referencedNames")] + pub referenced_names: Vec, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowAssignment { + #[napi(js_name = "varName")] + pub var_name: String, + #[napi(js_name = "callerFunc")] + pub caller_func: Option, + #[napi(js_name = "sourceCallName")] + pub source_call_name: String, + pub expression: String, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowArgFlow { + #[napi(js_name = "callerFunc")] + pub caller_func: Option, + #[napi(js_name = "calleeName")] + pub callee_name: String, + #[napi(js_name = 
"argIndex")] + pub arg_index: u32, + #[napi(js_name = "argName")] + pub arg_name: Option, + #[napi(js_name = "bindingType")] + pub binding_type: Option, + pub confidence: f64, + pub expression: String, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowMutation { + #[napi(js_name = "funcName")] + pub func_name: Option, + #[napi(js_name = "receiverName")] + pub receiver_name: String, + #[napi(js_name = "bindingType")] + pub binding_type: Option, + #[napi(js_name = "mutatingExpr")] + pub mutating_expr: String, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowResult { + pub parameters: Vec, + pub returns: Vec, + pub assignments: Vec, + #[napi(js_name = "argFlows")] + pub arg_flows: Vec, + pub mutations: Vec, +} + #[napi(object)] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FileSymbols { @@ -186,6 +267,7 @@ pub struct FileSymbols { pub exports: Vec, #[napi(js_name = "astNodes")] pub ast_nodes: Vec, + pub dataflow: Option, pub line_count: Option, } @@ -199,6 +281,7 @@ impl FileSymbols { classes: Vec::new(), exports: Vec::new(), ast_nodes: Vec::new(), + dataflow: None, line_count: None, } } diff --git a/src/builder.js b/src/builder.js index c5019b4..edcf8b1 100644 --- a/src/builder.js +++ b/src/builder.js @@ -444,7 +444,7 @@ export async function buildGraph(rootDir, opts = {}) { opts.incremental !== false && config.build && config.build.incremental !== false; // Engine selection: 'native', 'wasm', or 'auto' (default) - const engineOpts = { engine: opts.engine || 'auto' }; + const engineOpts = { engine: opts.engine || 'auto', dataflow: opts.dataflow !== false }; const { name: engineName, version: engineVersion } = getActiveEngine(engineOpts); info(`Using ${engineName} engine${engineVersion ? 
` (v${engineVersion})` : ''}`); @@ -548,7 +548,11 @@ export async function buildGraph(rootDir, opts = {}) { if (needsCfg || needsDataflow) { info('No file changes. Running pending analysis pass...'); - const analysisSymbols = await parseFilesAuto(files, rootDir, engineOpts); + const analysisOpts = { + ...engineOpts, + dataflow: needsDataflow && opts.dataflow !== false, + }; + const analysisSymbols = await parseFilesAuto(files, rootDir, analysisOpts); if (needsCfg) { const { buildCFGData } = await import('./cfg.js'); await buildCFGData(db, analysisSymbols, rootDir, engineOpts); diff --git a/src/dataflow.js b/src/dataflow.js index ad6f156..08b982f 100644 --- a/src/dataflow.js +++ b/src/dataflow.js @@ -1009,7 +1009,7 @@ export async function buildDataflowEdges(db, fileSymbols, rootDir, _engineOpts) let needsFallback = false; for (const [relPath, symbols] of fileSymbols) { - if (!symbols._tree) { + if (!symbols._tree && !symbols.dataflow) { const ext = path.extname(relPath).toLowerCase(); if (DATAFLOW_EXTENSIONS.has(ext)) { needsFallback = true; @@ -1061,41 +1061,45 @@ export async function buildDataflowEdges(db, fileSymbols, rootDir, _engineOpts) const ext = path.extname(relPath).toLowerCase(); if (!DATAFLOW_EXTENSIONS.has(ext)) continue; - let tree = symbols._tree; - let langId = symbols._langId; + // Use native dataflow data if available — skip WASM extraction + let data = symbols.dataflow; + if (!data) { + let tree = symbols._tree; + let langId = symbols._langId; + + // WASM fallback if no cached tree + if (!tree) { + if (!extToLang || !getParserFn) continue; + langId = extToLang.get(ext); + if (!langId || !DATAFLOW_LANG_IDS.has(langId)) continue; + + const absPath = path.join(rootDir, relPath); + let code; + try { + code = fs.readFileSync(absPath, 'utf-8'); + } catch { + continue; + } - // WASM fallback if no cached tree - if (!tree) { - if (!extToLang || !getParserFn) continue; - langId = extToLang.get(ext); - if (!langId || !DATAFLOW_LANG_IDS.has(langId)) 
continue; + const parser = getParserFn(parsers, absPath); + if (!parser) continue; - const absPath = path.join(rootDir, relPath); - let code; - try { - code = fs.readFileSync(absPath, 'utf-8'); - } catch { - continue; + try { + tree = parser.parse(code); + } catch { + continue; + } } - const parser = getParserFn(parsers, absPath); - if (!parser) continue; - - try { - tree = parser.parse(code); - } catch { - continue; + if (!langId) { + langId = extToLang ? extToLang.get(ext) : null; + if (!langId) continue; } - } - - if (!langId) { - langId = extToLang ? extToLang.get(ext) : null; - if (!langId) continue; - } - if (!DATAFLOW_RULES.has(langId)) continue; + if (!DATAFLOW_RULES.has(langId)) continue; - const data = extractDataflow(tree, relPath, symbols.definitions, langId); + data = extractDataflow(tree, relPath, symbols.definitions, langId); + } // Resolve function names to node IDs in this file first, then globally function resolveNode(funcName) { diff --git a/src/parser.js b/src/parser.js index cb98498..e9d5a1e 100644 --- a/src/parser.js +++ b/src/parser.js @@ -269,6 +269,46 @@ function normalizeNativeSymbols(result) { text: n.text ?? null, receiver: n.receiver ?? null, })), + dataflow: result.dataflow + ? { + parameters: (result.dataflow.parameters || []).map((p) => ({ + funcName: p.funcName, + paramName: p.paramName, + paramIndex: p.paramIndex, + line: p.line, + })), + returns: (result.dataflow.returns || []).map((r) => ({ + funcName: r.funcName, + expression: r.expression ?? '', + referencedNames: r.referencedNames ?? [], + line: r.line, + })), + assignments: (result.dataflow.assignments || []).map((a) => ({ + varName: a.varName, + callerFunc: a.callerFunc ?? null, + sourceCallName: a.sourceCallName, + expression: a.expression ?? '', + line: a.line, + })), + argFlows: (result.dataflow.argFlows ?? []).map((f) => ({ + callerFunc: f.callerFunc ?? null, + calleeName: f.calleeName, + argIndex: f.argIndex, + argName: f.argName ?? null, + binding: f.bindingType ? 
{ type: f.bindingType } : null, + confidence: f.confidence, + expression: f.expression ?? '', + line: f.line, + })), + mutations: (result.dataflow.mutations || []).map((m) => ({ + funcName: m.funcName ?? null, + receiverName: m.receiverName, + binding: m.bindingType ? { type: m.bindingType } : null, + mutatingExpr: m.mutatingExpr, + line: m.line, + })), + } + : null, }; } @@ -400,7 +440,7 @@ export async function parseFileAuto(filePath, source, opts = {}) { const { native } = resolveEngine(opts); if (native) { - const result = native.parseFile(filePath, source); + const result = native.parseFile(filePath, source, !!opts.dataflow); return result ? normalizeNativeSymbols(result) : null; } @@ -423,7 +463,7 @@ export async function parseFilesAuto(filePaths, rootDir, opts = {}) { const result = new Map(); if (native) { - const nativeResults = native.parseFiles(filePaths, rootDir); + const nativeResults = native.parseFiles(filePaths, rootDir, !!opts.dataflow); for (const r of nativeResults) { if (!r) continue; const relPath = path.relative(rootDir, r.file).split(path.sep).join('/'); From c4dc1d546a5bbcb2c57ed24f05a44648a35200f5 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 5 Mar 2026 14:31:52 -0700 Subject: [PATCH 9/9] test: add dataflow parity tests for Go, Rust, and Ruby MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-engine parity tests comparing WASM extractDataflow() output against native Rust parseFile(include_dataflow=true) for the three languages that lacked coverage. Tests auto-skip when the native binary doesn't include dataflow support (requires local Rust build). Covers parameters, returns, assignments, argFlows, and mutations per language — 13 test cases total. 
--- tests/engines/dataflow-parity.test.js | 269 ++++++++++++++++++++++++++ 1 file changed, 269 insertions(+) create mode 100644 tests/engines/dataflow-parity.test.js diff --git a/tests/engines/dataflow-parity.test.js b/tests/engines/dataflow-parity.test.js new file mode 100644 index 0000000..ae08b92 --- /dev/null +++ b/tests/engines/dataflow-parity.test.js @@ -0,0 +1,269 @@ +/** + * Cross-engine dataflow parity tests. + * + * Parse the same source snippets with both WASM and native engines, + * then assert the dataflow output is equivalent for Go, Rust, and Ruby. + * + * JS/TS/Python/Java/C# already have good parity coverage via the + * 5 existing language-specific dataflow tests + build-parity. + * + * Skipped when the native engine is not installed or when the native + * binary does not include dataflow support (requires local Rust build). + */ + +import { beforeAll, describe, expect, it } from 'vitest'; +import { extractDataflow } from '../../src/dataflow.js'; +import { isNativeAvailable } from '../../src/native.js'; +import { createParsers, getParser } from '../../src/parser.js'; + +let native; +let parsers; +let nativeHasDataflow = false; + +/** + * Extract dataflow via WASM: parse with tree-sitter WASM, then run + * the JS extractDataflow() visitor. + */ +function wasmDataflow(code, filePath, langId) { + const parser = getParser(parsers, filePath); + if (!parser) return null; + const tree = parser.parse(code); + return extractDataflow(tree, filePath, [], langId); +} + +/** + * Extract dataflow via native: parseFile with include_dataflow=true. + * Returns null if native doesn't support dataflow. 
+ */ +function nativeDataflow(code, filePath) { + const result = native.parseFile(filePath, code, true); + if (!result || !result.dataflow) return null; + const df = result.dataflow; + return { + parameters: (df.parameters || []).map((p) => ({ + funcName: p.funcName, + paramName: p.paramName, + paramIndex: p.paramIndex, + line: p.line, + })), + returns: (df.returns || []).map((r) => ({ + funcName: r.funcName, + expression: r.expression ?? '', + referencedNames: r.referencedNames ?? [], + line: r.line, + })), + assignments: (df.assignments || []).map((a) => ({ + varName: a.varName, + callerFunc: a.callerFunc ?? null, + sourceCallName: a.sourceCallName, + expression: a.expression ?? '', + line: a.line, + })), + argFlows: (df.argFlows ?? []).map((f) => ({ + callerFunc: f.callerFunc ?? null, + calleeName: f.calleeName, + argIndex: f.argIndex, + argName: f.argName ?? null, + confidence: f.confidence, + expression: f.expression ?? '', + line: f.line, + })), + mutations: (df.mutations || []).map((m) => ({ + funcName: m.funcName ?? null, + receiverName: m.receiverName, + mutatingExpr: m.mutatingExpr, + line: m.line, + })), + }; +} + +/** + * Normalize WASM extractDataflow() output to match the native shape. + * WASM returns extra fields (binding, etc.) that native doesn't — strip them. + */ +function normalizeWasm(data) { + if (!data) return null; + return { + parameters: (data.parameters || []).map((p) => ({ + funcName: p.funcName, + paramName: p.paramName, + paramIndex: p.paramIndex, + line: p.line, + })), + returns: (data.returns || []).map((r) => ({ + funcName: r.funcName, + expression: r.expression ?? '', + referencedNames: r.referencedNames ?? [], + line: r.line, + })), + assignments: (data.assignments || []).map((a) => ({ + varName: a.varName, + callerFunc: a.callerFunc ?? null, + sourceCallName: a.sourceCallName, + expression: a.expression ?? '', + line: a.line, + })), + argFlows: (data.argFlows ?? []).map((f) => ({ + callerFunc: f.callerFunc ?? 
null, + calleeName: f.calleeName, + argIndex: f.argIndex, + argName: f.argName ?? null, + confidence: f.confidence, + expression: f.expression ?? '', + line: f.line, + })), + mutations: (data.mutations || []).map((m) => ({ + funcName: m.funcName ?? null, + receiverName: m.receiverName, + mutatingExpr: m.mutatingExpr, + line: m.line, + })), + }; +} + +const hasNative = isNativeAvailable(); + +// Detect whether the installed native binary includes dataflow support. +// The published npm prebuilt (v3.0.0) doesn't — only a local Rust build does. +function detectNativeDataflow() { + if (!native) return false; + const r = native.parseFile('probe.js', 'function f(a) { return a; }', true); + return !!r?.dataflow; +} + +const describeOrSkip = hasNative ? describe : describe.skip; + +describeOrSkip('Cross-engine dataflow parity', () => { + beforeAll(async () => { + if (!hasNative) return; + const { getNative } = await import('../../src/native.js'); + native = getNative(); + nativeHasDataflow = detectNativeDataflow(); + parsers = await createParsers(); + }); + + // ── Go ───────────────────────────────────────────────────────────────── + + describe('Go', () => { + const lang = 'go'; + const file = 'test.go'; + + it('parameters — simple', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'package main\nfunc add(a int, b int) int {\n\treturn a + b\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.parameters).toEqual(w.parameters); + }); + + it('returns — captures referenced names', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'package main\nfunc double(x int) int {\n\treturn x * 2\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.returns).toEqual(w.returns); + }); + + it('assignments — short var declaration from call', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'package main\nfunc run() 
{\n\tresult := compute()\n\t_ = result\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.assignments).toEqual(w.assignments); + }); + + it('argFlows — parameter passed as argument', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'package main\nfunc process(input string) {\n\ttransform(input)\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.argFlows).toEqual(w.argFlows); + }); + }); + + // ── Rust ─────────────────────────────────────────────────────────────── + + describe('Rust', () => { + const lang = 'rust'; + const file = 'test.rs'; + + it('parameters — simple', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.parameters).toEqual(w.parameters); + }); + + it('returns — explicit return', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'fn double(x: i32) -> i32 {\n return x * 2;\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.returns).toEqual(w.returns); + }); + + it('assignments — let binding from call', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'fn main() {\n let result = compute();\n println!("{}", result);\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.assignments).toEqual(w.assignments); + }); + + it('argFlows — parameter passed as argument', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'fn process(input: String) {\n transform(input);\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.argFlows).toEqual(w.argFlows); + }); + + it('mutations — push on mutable parameter', ({ skip }) 
=> { + if (!nativeHasDataflow) skip(); + const code = 'fn add_item(items: &mut Vec, item: i32) {\n items.push(item);\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.mutations).toEqual(w.mutations); + }); + }); + + // ── Ruby ─────────────────────────────────────────────────────────────── + + describe('Ruby', () => { + const lang = 'ruby'; + const file = 'test.rb'; + + it('parameters — simple', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'def add(a, b)\n return a + b\nend\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.parameters).toEqual(w.parameters); + }); + + it('returns — explicit return', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'def double(x)\n return x * 2\nend\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.returns).toEqual(w.returns); + }); + + it('assignments — variable from method call', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'def main\n result = compute()\n return result\nend\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.assignments).toEqual(w.assignments); + }); + + it('argFlows — parameter passed as argument', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'def process(input)\n transform(input)\nend\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.argFlows).toEqual(w.argFlows); + }); + }); +});