diff --git a/crates/codegraph-core/src/cfg.rs b/crates/codegraph-core/src/cfg.rs new file mode 100644 index 0000000..1e0742c --- /dev/null +++ b/crates/codegraph-core/src/cfg.rs @@ -0,0 +1,1155 @@ +use tree_sitter::Node; +use crate::types::{CfgBlock, CfgData, CfgEdge}; + +// ─── CFG Rules ────────────────────────────────────────────────────────── + +/// Per-language node type names for CFG construction. +pub struct CfgRules { + pub if_node: Option<&'static str>, + pub if_nodes: &'static [&'static str], + pub elif_node: Option<&'static str>, + pub else_clause: Option<&'static str>, + pub else_via_alternative: bool, + pub if_consequent_field: Option<&'static str>, + pub for_nodes: &'static [&'static str], + pub condition_field: Option<&'static str>, + pub while_node: Option<&'static str>, + pub while_nodes: &'static [&'static str], + pub do_node: Option<&'static str>, + pub infinite_loop_node: Option<&'static str>, + pub unless_node: Option<&'static str>, + pub until_node: Option<&'static str>, + pub switch_node: Option<&'static str>, + pub switch_nodes: &'static [&'static str], + pub case_node: Option<&'static str>, + pub case_nodes: &'static [&'static str], + pub default_node: Option<&'static str>, + pub wildcard_pattern_node: Option<&'static str>, + pub try_node: Option<&'static str>, + pub try_nodes: &'static [&'static str], + pub catch_node: Option<&'static str>, + pub finally_node: Option<&'static str>, + pub else_node: Option<&'static str>, + pub return_node: Option<&'static str>, + pub throw_node: Option<&'static str>, + pub break_node: Option<&'static str>, + pub continue_node: Option<&'static str>, + pub block_node: Option<&'static str>, + pub block_nodes: &'static [&'static str], + pub labeled_node: Option<&'static str>, +} + +fn matches_opt(kind: &str, opt: Option<&str>) -> bool { + opt.is_some_and(|s| s == kind) +} + +fn matches_slice(kind: &str, slice: &[&str]) -> bool { + slice.contains(&kind) +} + +// ─── Per-Language Rules 
───────────────────────────────────────────────── + +pub static JS_TS_CFG: CfgRules = CfgRules { + if_node: Some("if_statement"), + if_nodes: &[], + elif_node: None, + else_clause: Some("else_clause"), + else_via_alternative: false, + if_consequent_field: None, + for_nodes: &["for_statement", "for_in_statement"], + condition_field: Some("condition"), + while_node: Some("while_statement"), + while_nodes: &[], + do_node: Some("do_statement"), + infinite_loop_node: None, + unless_node: None, + until_node: None, + switch_node: Some("switch_statement"), + switch_nodes: &[], + case_node: Some("switch_case"), + case_nodes: &[], + default_node: Some("switch_default"), + wildcard_pattern_node: None, + try_node: Some("try_statement"), + try_nodes: &[], + catch_node: Some("catch_clause"), + finally_node: Some("finally_clause"), + else_node: None, + return_node: Some("return_statement"), + throw_node: Some("throw_statement"), + break_node: Some("break_statement"), + continue_node: Some("continue_statement"), + block_node: Some("statement_block"), + block_nodes: &[], + labeled_node: Some("labeled_statement"), +}; + +pub static PYTHON_CFG: CfgRules = CfgRules { + if_node: Some("if_statement"), + if_nodes: &[], + elif_node: Some("elif_clause"), + else_clause: Some("else_clause"), + else_via_alternative: false, + if_consequent_field: None, + for_nodes: &["for_statement"], + condition_field: Some("condition"), + while_node: Some("while_statement"), + while_nodes: &[], + do_node: None, + infinite_loop_node: None, + unless_node: None, + until_node: None, + switch_node: Some("match_statement"), + switch_nodes: &[], + case_node: Some("case_clause"), + case_nodes: &[], + default_node: None, + wildcard_pattern_node: Some("wildcard_pattern"), + try_node: Some("try_statement"), + try_nodes: &[], + catch_node: Some("except_clause"), + finally_node: Some("finally_clause"), + else_node: Some("else_clause"), + return_node: Some("return_statement"), + throw_node: Some("raise_statement"), + 
break_node: Some("break_statement"), + continue_node: Some("continue_statement"), + block_node: Some("block"), + block_nodes: &[], + labeled_node: None, +}; + +pub static GO_CFG: CfgRules = CfgRules { + if_node: Some("if_statement"), + if_nodes: &[], + elif_node: None, + else_clause: None, + else_via_alternative: true, + if_consequent_field: None, + for_nodes: &["for_statement"], + condition_field: Some("condition"), + while_node: None, + while_nodes: &[], + do_node: None, + infinite_loop_node: None, + unless_node: None, + until_node: None, + switch_node: None, + switch_nodes: &["expression_switch_statement", "type_switch_statement", "select_statement"], + case_node: Some("expression_case"), + case_nodes: &["type_case", "communication_case"], + default_node: Some("default_case"), + wildcard_pattern_node: None, + try_node: None, + try_nodes: &[], + catch_node: None, + finally_node: None, + else_node: None, + return_node: Some("return_statement"), + throw_node: None, + break_node: Some("break_statement"), + continue_node: Some("continue_statement"), + block_node: Some("block"), + block_nodes: &[], + labeled_node: Some("labeled_statement"), +}; + +pub static RUST_CFG: CfgRules = CfgRules { + if_node: Some("if_expression"), + if_nodes: &["if_let_expression"], + elif_node: None, + else_clause: Some("else_clause"), + else_via_alternative: false, + if_consequent_field: None, + for_nodes: &["for_expression"], + condition_field: None, + while_node: Some("while_expression"), + while_nodes: &["while_let_expression"], + do_node: None, + infinite_loop_node: Some("loop_expression"), + unless_node: None, + until_node: None, + switch_node: Some("match_expression"), + switch_nodes: &[], + case_node: Some("match_arm"), + case_nodes: &[], + default_node: None, + wildcard_pattern_node: None, + try_node: None, + try_nodes: &[], + catch_node: None, + finally_node: None, + else_node: None, + return_node: Some("return_expression"), + throw_node: None, + break_node: 
Some("break_expression"), + continue_node: Some("continue_expression"), + block_node: Some("block"), + block_nodes: &[], + labeled_node: None, +}; + +pub static JAVA_CFG: CfgRules = CfgRules { + if_node: Some("if_statement"), + if_nodes: &[], + elif_node: None, + else_clause: None, + else_via_alternative: true, + if_consequent_field: None, + for_nodes: &["for_statement", "enhanced_for_statement"], + condition_field: Some("condition"), + while_node: Some("while_statement"), + while_nodes: &[], + do_node: Some("do_statement"), + infinite_loop_node: None, + unless_node: None, + until_node: None, + switch_node: Some("switch_statement"), + switch_nodes: &["switch_expression"], + case_node: Some("switch_block_statement_group"), + case_nodes: &["switch_rule"], + default_node: None, + wildcard_pattern_node: None, + try_node: Some("try_statement"), + try_nodes: &[], + catch_node: Some("catch_clause"), + finally_node: Some("finally_clause"), + else_node: None, + return_node: Some("return_statement"), + throw_node: Some("throw_statement"), + break_node: Some("break_statement"), + continue_node: Some("continue_statement"), + block_node: Some("block"), + block_nodes: &[], + labeled_node: Some("labeled_statement"), +}; + +pub static CSHARP_CFG: CfgRules = CfgRules { + if_node: Some("if_statement"), + if_nodes: &[], + elif_node: None, + else_clause: None, + else_via_alternative: true, + if_consequent_field: None, + for_nodes: &["for_statement", "foreach_statement"], + condition_field: Some("condition"), + while_node: Some("while_statement"), + while_nodes: &[], + do_node: Some("do_statement"), + infinite_loop_node: None, + unless_node: None, + until_node: None, + switch_node: Some("switch_statement"), + switch_nodes: &["switch_expression"], + case_node: Some("switch_section"), + case_nodes: &["switch_expression_arm"], + default_node: None, + wildcard_pattern_node: None, + try_node: Some("try_statement"), + try_nodes: &[], + catch_node: Some("catch_clause"), + finally_node: 
Some("finally_clause"), + else_node: None, + return_node: Some("return_statement"), + throw_node: Some("throw_statement"), + break_node: Some("break_statement"), + continue_node: Some("continue_statement"), + block_node: Some("block"), + block_nodes: &[], + labeled_node: Some("labeled_statement"), +}; + +pub static RUBY_CFG: CfgRules = CfgRules { + if_node: Some("if"), + if_nodes: &[], + elif_node: Some("elsif"), + else_clause: Some("else"), + else_via_alternative: false, + if_consequent_field: None, + for_nodes: &["for"], + condition_field: Some("condition"), + while_node: Some("while"), + while_nodes: &[], + do_node: None, + infinite_loop_node: None, + unless_node: Some("unless"), + until_node: Some("until"), + switch_node: Some("case"), + switch_nodes: &[], + case_node: Some("when"), + case_nodes: &[], + default_node: Some("else"), + wildcard_pattern_node: None, + try_node: Some("begin"), + try_nodes: &["body_statement"], + catch_node: Some("rescue"), + finally_node: Some("ensure"), + else_node: None, + return_node: Some("return"), + throw_node: None, + break_node: Some("break"), + continue_node: Some("next"), + block_node: None, + block_nodes: &["then", "do", "body_statement"], + labeled_node: None, +}; + +pub static PHP_CFG: CfgRules = CfgRules { + if_node: Some("if_statement"), + if_nodes: &[], + elif_node: Some("else_if_clause"), + else_clause: Some("else_clause"), + else_via_alternative: false, + if_consequent_field: Some("body"), + for_nodes: &["for_statement", "foreach_statement"], + condition_field: Some("condition"), + while_node: Some("while_statement"), + while_nodes: &[], + do_node: Some("do_statement"), + infinite_loop_node: None, + unless_node: None, + until_node: None, + switch_node: Some("switch_statement"), + switch_nodes: &[], + case_node: Some("case_statement"), + case_nodes: &[], + default_node: Some("default_statement"), + wildcard_pattern_node: None, + try_node: Some("try_statement"), + try_nodes: &[], + catch_node: Some("catch_clause"), + 
finally_node: Some("finally_clause"), + else_node: None, + return_node: Some("return_statement"), + throw_node: Some("throw_expression"), + break_node: Some("break_statement"), + continue_node: Some("continue_statement"), + block_node: Some("compound_statement"), + block_nodes: &[], + labeled_node: None, +}; + +/// Get CFG rules for a language ID. +pub fn get_cfg_rules(lang_id: &str) -> Option<&'static CfgRules> { + match lang_id { + "javascript" | "typescript" | "tsx" => Some(&JS_TS_CFG), + "python" => Some(&PYTHON_CFG), + "go" => Some(&GO_CFG), + "rust" => Some(&RUST_CFG), + "java" => Some(&JAVA_CFG), + "csharp" => Some(&CSHARP_CFG), + "ruby" => Some(&RUBY_CFG), + "php" => Some(&PHP_CFG), + _ => None, + } +} + +// ─── Core Algorithm ───────────────────────────────────────────────────── + +/// Loop context for break/continue resolution. +struct LoopCtx { + header_idx: u32, + exit_idx: u32, + is_loop: bool, +} + +/// Label context for labeled break/continue. +struct LabelCtx { + header_idx: Option, + exit_idx: Option, +} + +/// CFG builder state. 
struct CfgBuilder<'a> {
    /// Node-kind table of the language being analysed.
    rules: &'a CfgRules,
    /// Source bytes, used to read label names.
    source: &'a [u8],
    /// All blocks created so far; `blocks[i].index == i` (see `make_block`).
    blocks: Vec<CfgBlock>,
    /// All edges created so far, in creation order.
    edges: Vec<CfgEdge>,
    /// Index the next created block will receive.
    next_index: u32,
    /// Index of the function's exit block (target of return/throw edges).
    exit_idx: u32,
    /// Enclosing loops and switches, innermost last.
    loop_stack: Vec<LoopCtx>,
    /// Labels currently in scope, innermost last.
    label_map: Vec<(String, LabelCtx)>,
}

impl<'a> CfgBuilder<'a> {
    /// Creates an empty builder. `exit_idx` starts at 0 and is expected to be
    /// set by the caller once the exit block exists.
    fn new(rules: &'a CfgRules, source: &'a [u8]) -> Self {
        Self {
            rules,
            source,
            blocks: Vec::new(),
            edges: Vec::new(),
            next_index: 0,
            exit_idx: 0,
            loop_stack: Vec::new(),
            label_map: Vec::new(),
        }
    }

    /// Appends a new block and returns its index.
    ///
    /// This is the only place blocks are pushed, and indices are handed out
    /// sequentially from 0, so a block's index always equals its position in
    /// `self.blocks`.
    fn make_block(&mut self, block_type: &str, start_line: Option<u32>, end_line: Option<u32>, label: Option<&str>) -> u32 {
        let idx = self.next_index;
        self.next_index += 1;
        self.blocks.push(CfgBlock {
            index: idx,
            block_type: block_type.to_string(),
            start_line,
            end_line,
            label: label.map(str::to_string),
        });
        idx
    }

    /// Records a directed edge of the given kind.
    fn add_edge(&mut self, source: u32, target: u32, kind: &str) {
        self.edges.push(CfgEdge {
            source_index: source,
            target_index: target,
            kind: kind.to_string(),
        });
    }

    /// Mutable access to a block by index.
    ///
    /// Relies on the `blocks[i].index == i` invariant established by
    /// `make_block`, which turns the former linear scan (run once per
    /// processed statement) into a direct lookup.
    fn block_mut(&mut self, block_idx: u32) -> Option<&mut CfgBlock> {
        self.blocks.get_mut(block_idx as usize)
    }

    /// Sets the end line of a block; unknown indices are ignored.
    fn set_end_line(&mut self, block_idx: u32, line: u32) {
        if let Some(block) = self.block_mut(block_idx) {
            block.end_line = Some(line);
        }
    }

    /// Sets the start line of a block unless it already has one; unknown
    /// indices are ignored.
    fn set_start_line_if_empty(&mut self, block_idx: u32, line: u32) {
        if let Some(block) = self.block_mut(block_idx) {
            block.start_line.get_or_insert(line);
        }
    }

    /// Start line of a block, or `None` if the block is unknown or has none yet.
    fn start_line_of(&self, block_idx: u32) -> Option<u32> {
        self.blocks.get(block_idx as usize).and_then(|block| block.start_line)
    }

    /// Get statement children from a block or statement list.
+ fn get_statements<'b>(&self, node: &Node<'b>) -> Vec> { + let kind = node.kind(); + if matches_opt(kind, self.rules.block_node) || matches_slice(kind, self.rules.block_nodes) { + let mut stmts = Vec::new(); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + stmts.push(child); + } + return stmts; + } + // Single statement + vec![*node] + } + + /// Process a list of statements, returns the last current block or None if all paths terminated. + fn process_statements(&mut self, stmts: &[Node], current: u32) -> Option { + let mut cur = Some(current); + for stmt in stmts { + match cur { + None => break, // Dead code after return/break/continue/throw + Some(c) => cur = self.process_statement(stmt, c), + } + } + cur + } + + /// Process a single statement. + fn process_statement(&mut self, stmt: &Node, current: u32) -> Option { + let kind = stmt.kind(); + + // Unwrap expression_statement (Rust uses expressions for control flow) + if kind == "expression_statement" && stmt.named_child_count() == 1 { + if let Some(inner) = stmt.named_child(0) { + let t = inner.kind(); + if matches_opt(t, self.rules.if_node) + || matches_slice(t, self.rules.if_nodes) + || matches_slice(t, self.rules.for_nodes) + || matches_opt(t, self.rules.while_node) + || matches_slice(t, self.rules.while_nodes) + || matches_opt(t, self.rules.do_node) + || matches_opt(t, self.rules.infinite_loop_node) + || matches_opt(t, self.rules.switch_node) + || matches_slice(t, self.rules.switch_nodes) + || matches_opt(t, self.rules.return_node) + || matches_opt(t, self.rules.throw_node) + || matches_opt(t, self.rules.break_node) + || matches_opt(t, self.rules.continue_node) + || matches_opt(t, self.rules.unless_node) + || matches_opt(t, self.rules.until_node) + { + return self.process_statement(&inner, current); + } + } + } + + // Labeled statement + if matches_opt(kind, self.rules.labeled_node) { + let label_node = stmt.child_by_field_name("label"); + let body = 
stmt.child_by_field_name("body"); + if let (Some(label_node), Some(body)) = (label_node, body) { + let label_name = label_node.utf8_text(self.source).unwrap_or("").to_string(); + // We can't know the loop blocks yet — push a placeholder + self.label_map.push((label_name.clone(), LabelCtx { header_idx: None, exit_idx: None })); + let result = self.process_statement(&body, current); + self.label_map.retain(|(n, _)| n != &label_name); + return result; + } + return Some(current); + } + + // If statement + if matches_opt(kind, self.rules.if_node) || matches_slice(kind, self.rules.if_nodes) { + return self.process_if(stmt, current); + } + + // Unless (Ruby) + if matches_opt(kind, self.rules.unless_node) { + return self.process_if(stmt, current); + } + + // For loops + if matches_slice(kind, self.rules.for_nodes) { + return self.process_for_loop(stmt, current); + } + + // While loop + if matches_opt(kind, self.rules.while_node) || matches_slice(kind, self.rules.while_nodes) { + return self.process_while_loop(stmt, current); + } + + // Until (Ruby) + if matches_opt(kind, self.rules.until_node) { + return self.process_while_loop(stmt, current); + } + + // Do-while + if matches_opt(kind, self.rules.do_node) { + return self.process_do_while_loop(stmt, current); + } + + // Infinite loop (Rust loop {}) + if matches_opt(kind, self.rules.infinite_loop_node) { + return self.process_infinite_loop(stmt, current); + } + + // Switch/match + if matches_opt(kind, self.rules.switch_node) || matches_slice(kind, self.rules.switch_nodes) { + return self.process_switch(stmt, current); + } + + // Try/catch/finally + if matches_opt(kind, self.rules.try_node) { + return self.process_try_catch(stmt, current); + } + // Additional try nodes (e.g. 
Ruby body_statement with rescue) + if matches_slice(kind, self.rules.try_nodes) { + // Only treat as try if it actually contains a catch/rescue child + let cursor = &mut stmt.walk(); + let has_rescue = stmt.named_children(cursor) + .any(|c| matches_opt(c.kind(), self.rules.catch_node)); + if has_rescue { + return self.process_try_catch(stmt, current); + } + } + + // Return + if matches_opt(kind, self.rules.return_node) { + self.set_end_line(current, node_line(stmt)); + self.add_edge(current, self.exit_idx, "return"); + return None; + } + + // Throw + if matches_opt(kind, self.rules.throw_node) { + self.set_end_line(current, node_line(stmt)); + self.add_edge(current, self.exit_idx, "exception"); + return None; + } + + // Break + if matches_opt(kind, self.rules.break_node) { + let label_name = stmt.child_by_field_name("label") + .map(|n| n.utf8_text(self.source).unwrap_or("").to_string()); + + let target = if let Some(ref name) = label_name { + self.label_map.iter().rev() + .find(|(n, _)| n == name) + .and_then(|(_, ctx)| ctx.exit_idx) + } else { + self.loop_stack.last().map(|ctx| ctx.exit_idx) + }; + + if let Some(target) = target { + self.set_end_line(current, node_line(stmt)); + self.add_edge(current, target, "break"); + return None; + } + return Some(current); + } + + // Continue + if matches_opt(kind, self.rules.continue_node) { + let label_name = stmt.child_by_field_name("label") + .map(|n| n.utf8_text(self.source).unwrap_or("").to_string()); + + let target = if let Some(ref name) = label_name { + self.label_map.iter().rev() + .find(|(n, _)| n == name) + .and_then(|(_, ctx)| ctx.header_idx) + } else { + // Walk back to find the nearest actual loop (skip switch entries) + self.loop_stack.iter().rev() + .find(|ctx| ctx.is_loop) + .map(|ctx| ctx.header_idx) + }; + + if let Some(target) = target { + self.set_end_line(current, node_line(stmt)); + self.add_edge(current, target, "continue"); + return None; + } + return Some(current); + } + + // Regular statement — 
extend current block + self.set_start_line_if_empty(current, node_line(stmt)); + self.set_end_line(current, node_end_line(stmt)); + Some(current) + } + + /// Process if/else-if/else chain (handles patterns A, B, C). + fn process_if(&mut self, if_stmt: &Node, current: u32) -> Option { + self.set_end_line(current, node_line(if_stmt)); + + let cond_block = self.make_block("condition", Some(node_line(if_stmt)), Some(node_line(if_stmt)), Some("if")); + self.add_edge(current, cond_block, "fallthrough"); + + let join_block = self.make_block("body", None, None, None); + + // True branch + let consequent_field = self.rules.if_consequent_field.unwrap_or("consequence"); + let consequent = if_stmt.child_by_field_name(consequent_field); + let true_block = self.make_block("branch_true", None, None, Some("then")); + self.add_edge(cond_block, true_block, "branch_true"); + + if let Some(consequent) = consequent { + let true_stmts = self.get_statements(&consequent); + let true_end = self.process_statements(&true_stmts, true_block); + if let Some(te) = true_end { + self.add_edge(te, join_block, "fallthrough"); + } + } else { + self.add_edge(true_block, join_block, "fallthrough"); + } + + // False branch + if self.rules.elif_node.is_some() { + // Pattern B: elif/else as siblings + self.process_elif_siblings(if_stmt, cond_block, join_block); + } else { + let alternative = if_stmt.child_by_field_name("alternative"); + if let Some(alternative) = alternative { + let alt_kind = alternative.kind(); + if self.rules.else_via_alternative && !matches_opt(alt_kind, self.rules.else_clause) { + // Pattern C: alternative points directly to if or block + if matches_opt(alt_kind, self.rules.if_node) || matches_slice(alt_kind, self.rules.if_nodes) { + let false_block = self.make_block("branch_false", None, None, Some("else-if")); + self.add_edge(cond_block, false_block, "branch_false"); + let else_if_end = self.process_if(&alternative, false_block); + if let Some(eie) = else_if_end { + 
self.add_edge(eie, join_block, "fallthrough"); + } + } else { + let false_block = self.make_block("branch_false", None, None, Some("else")); + self.add_edge(cond_block, false_block, "branch_false"); + let false_stmts = self.get_statements(&alternative); + let false_end = self.process_statements(&false_stmts, false_block); + if let Some(fe) = false_end { + self.add_edge(fe, join_block, "fallthrough"); + } + } + } else if matches_opt(alt_kind, self.rules.else_clause) { + // Pattern A: else_clause wrapper + let else_children: Vec = { + let cursor = &mut alternative.walk(); + alternative.named_children(cursor).collect() + }; + if else_children.len() == 1 + && (matches_opt(else_children[0].kind(), self.rules.if_node) + || matches_slice(else_children[0].kind(), self.rules.if_nodes)) + { + // else-if: recurse + let false_block = self.make_block("branch_false", None, None, Some("else-if")); + self.add_edge(cond_block, false_block, "branch_false"); + let else_if_end = self.process_if(&else_children[0], false_block); + if let Some(eie) = else_if_end { + self.add_edge(eie, join_block, "fallthrough"); + } + } else { + // else block + let false_block = self.make_block("branch_false", None, None, Some("else")); + self.add_edge(cond_block, false_block, "branch_false"); + let false_end = self.process_statements(&else_children, false_block); + if let Some(fe) = false_end { + self.add_edge(fe, join_block, "fallthrough"); + } + } + } else { + // Unknown alternative type — treat as no else + self.add_edge(cond_block, join_block, "branch_false"); + } + } else { + // No else: condition-false goes to join + self.add_edge(cond_block, join_block, "branch_false"); + } + } + + Some(join_block) + } + + /// Pattern B: elif/elsif/else_if as sibling children of the if node. 
+ fn process_elif_siblings(&mut self, if_stmt: &Node, first_cond: u32, join_block: u32) { + let mut last_cond = first_cond; + let mut found_else = false; + + let cursor = &mut if_stmt.walk(); + let children: Vec = if_stmt.named_children(cursor).collect(); + + for child in &children { + let child_kind = child.kind(); + + if matches_opt(child_kind, self.rules.elif_node) { + let elif_cond = self.make_block("condition", Some(node_line(child)), Some(node_line(child)), Some("else-if")); + self.add_edge(last_cond, elif_cond, "branch_false"); + + let elif_consequent_field = self.rules.if_consequent_field.unwrap_or("consequence"); + let elif_consequent = child.child_by_field_name(elif_consequent_field); + let elif_true = self.make_block("branch_true", None, None, Some("then")); + self.add_edge(elif_cond, elif_true, "branch_true"); + + if let Some(cons) = elif_consequent { + let stmts = self.get_statements(&cons); + let end = self.process_statements(&stmts, elif_true); + if let Some(e) = end { + self.add_edge(e, join_block, "fallthrough"); + } + } else { + self.add_edge(elif_true, join_block, "fallthrough"); + } + + last_cond = elif_cond; + } else if matches_opt(child_kind, self.rules.else_clause) { + let else_block = self.make_block("branch_false", None, None, Some("else")); + self.add_edge(last_cond, else_block, "branch_false"); + + // Try field access first, then collect children + let else_body = child.child_by_field_name("body"); + let else_stmts: Vec = if let Some(body) = else_body { + self.get_statements(&body) + } else { + let cursor2 = &mut child.walk(); + child.named_children(cursor2).collect() + }; + let else_end = self.process_statements(&else_stmts, else_block); + if let Some(ee) = else_end { + self.add_edge(ee, join_block, "fallthrough"); + } + + found_else = true; + } + } + + if !found_else { + self.add_edge(last_cond, join_block, "branch_false"); + } + } + + /// Update label map with loop context (for newly created loops inside labeled stmts). 
+ fn update_label_map(&mut self, header_idx: u32, exit_idx: u32) { + if let Some((_, ctx)) = self.label_map.iter_mut().rev() + .find(|(_, ctx)| ctx.header_idx.is_none()) + { + ctx.header_idx = Some(header_idx); + ctx.exit_idx = Some(exit_idx); + } + } + + fn process_for_loop(&mut self, for_stmt: &Node, current: u32) -> Option { + let header = self.make_block("loop_header", Some(node_line(for_stmt)), Some(node_line(for_stmt)), Some("for")); + self.add_edge(current, header, "fallthrough"); + + let exit = self.make_block("body", None, None, None); + + self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit, is_loop: true }); + self.update_label_map(header, exit); + + // Check if this for loop has a condition — if not (e.g. Go `for {}`), treat as infinite loop + let has_condition = self.rules.condition_field + .and_then(|f| for_stmt.child_by_field_name(f)) + .is_some(); + + let body = for_stmt.child_by_field_name("body"); + let body_block = self.make_block("loop_body", None, None, None); + let body_edge = if has_condition { "branch_true" } else { "fallthrough" }; + self.add_edge(header, body_block, body_edge); + + if let Some(body) = body { + let stmts = self.get_statements(&body); + let body_end = self.process_statements(&stmts, body_block); + if let Some(be) = body_end { + self.add_edge(be, header, "loop_back"); + } + } + + self.loop_stack.pop(); + + if has_condition { + // Normal for loop with condition — always emit loop_exit edge + self.add_edge(header, exit, "loop_exit"); + Some(exit) + } else { + // Infinite loop (no condition) — only exit via break + let has_break_to_exit = self.edges.iter().any(|e| e.target_index == exit); + if has_break_to_exit { + Some(exit) + } else { + None + } + } + } + + fn process_while_loop(&mut self, while_stmt: &Node, current: u32) -> Option { + let header = self.make_block("loop_header", Some(node_line(while_stmt)), Some(node_line(while_stmt)), Some("while")); + self.add_edge(current, header, "fallthrough"); + + let exit = 
self.make_block("body", None, None, None); + + self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit, is_loop: true }); + self.update_label_map(header, exit); + + let body = while_stmt.child_by_field_name("body"); + let body_block = self.make_block("loop_body", None, None, None); + self.add_edge(header, body_block, "branch_true"); + + if let Some(body) = body { + let stmts = self.get_statements(&body); + let body_end = self.process_statements(&stmts, body_block); + if let Some(be) = body_end { + self.add_edge(be, header, "loop_back"); + } + } + + self.add_edge(header, exit, "loop_exit"); + self.loop_stack.pop(); + Some(exit) + } + + fn process_do_while_loop(&mut self, do_stmt: &Node, current: u32) -> Option { + let body_block = self.make_block("loop_body", Some(node_line(do_stmt)), None, Some("do")); + self.add_edge(current, body_block, "fallthrough"); + + let cond_block = self.make_block("loop_header", None, None, Some("do-while")); + let exit = self.make_block("body", None, None, None); + + self.loop_stack.push(LoopCtx { header_idx: cond_block, exit_idx: exit, is_loop: true }); + self.update_label_map(cond_block, exit); + + let body = do_stmt.child_by_field_name("body"); + if let Some(body) = body { + let stmts = self.get_statements(&body); + let body_end = self.process_statements(&stmts, body_block); + if let Some(be) = body_end { + self.add_edge(be, cond_block, "fallthrough"); + } + } + + self.add_edge(cond_block, body_block, "loop_back"); + self.add_edge(cond_block, exit, "loop_exit"); + + self.loop_stack.pop(); + Some(exit) + } + + fn process_infinite_loop(&mut self, loop_stmt: &Node, current: u32) -> Option { + let header = self.make_block("loop_header", Some(node_line(loop_stmt)), Some(node_line(loop_stmt)), Some("loop")); + self.add_edge(current, header, "fallthrough"); + + let exit = self.make_block("body", None, None, None); + + self.loop_stack.push(LoopCtx { header_idx: header, exit_idx: exit, is_loop: true }); + 
self.update_label_map(header, exit); + + let body = loop_stmt.child_by_field_name("body"); + let body_block = self.make_block("loop_body", None, None, None); + self.add_edge(header, body_block, "fallthrough"); + + if let Some(body) = body { + let stmts = self.get_statements(&body); + let body_end = self.process_statements(&stmts, body_block); + if let Some(be) = body_end { + self.add_edge(be, header, "loop_back"); + } + } + + // No loop_exit from header — only exit via break + self.loop_stack.pop(); + + // If no break targeted the exit block, subsequent code is unreachable + let has_break_to_exit = self.edges.iter().any(|e| e.target_index == exit); + if has_break_to_exit { + Some(exit) + } else { + None + } + } + + fn process_switch(&mut self, switch_stmt: &Node, current: u32) -> Option { + self.set_end_line(current, node_line(switch_stmt)); + + let switch_header = self.make_block("condition", Some(node_line(switch_stmt)), Some(node_line(switch_stmt)), Some("switch")); + self.add_edge(current, switch_header, "fallthrough"); + + let join_block = self.make_block("body", None, None, None); + + // Switch acts like a break target but not a continue target + self.loop_stack.push(LoopCtx { header_idx: switch_header, exit_idx: join_block, is_loop: false }); + + // Get case children from body field or direct children + let container = switch_stmt.child_by_field_name("body").unwrap_or(*switch_stmt); + + let mut has_default = false; + let cursor = &mut container.walk(); + let case_children: Vec = container.named_children(cursor).collect(); + + for case_clause in &case_children { + let cc_kind = case_clause.kind(); + let is_default = matches_opt(cc_kind, self.rules.default_node) + || (self.rules.wildcard_pattern_node.is_some() + && (matches_opt(cc_kind, self.rules.case_node) || matches_slice(cc_kind, self.rules.case_nodes)) + && case_clause.named_child(0) + .is_some_and(|c| matches_opt(c.kind(), self.rules.wildcard_pattern_node))); + let is_case = is_default + || 
matches_opt(cc_kind, self.rules.case_node) + || matches_slice(cc_kind, self.rules.case_nodes); + + if !is_case { + continue; + } + + let case_label = if is_default { "default" } else { "case" }; + let case_block = self.make_block("case", Some(node_line(case_clause)), None, Some(case_label)); + let edge_kind = if is_default { "branch_false" } else { "branch_true" }; + self.add_edge(switch_header, case_block, edge_kind); + if is_default { + has_default = true; + } + + // Extract case body + let case_body_node = case_clause.child_by_field_name("body") + .or_else(|| case_clause.child_by_field_name("consequence")); + + let case_stmts: Vec = if let Some(body_node) = case_body_node { + self.get_statements(&body_node) + } else if let Some(value_node) = case_clause.child_by_field_name("value") { + // Rust match_arm: the `value` field is the arm expression body + vec![value_node] + } else { + let pattern_node = case_clause.child_by_field_name("pattern"); + let cursor2 = &mut case_clause.walk(); + case_clause.named_children(cursor2) + .filter(|child| { + if let Some(ref p) = pattern_node { if child.id() == p.id() { return false; } } + child.kind() != "switch_label" + }) + .collect() + }; + + let case_end = self.process_statements(&case_stmts, case_block); + if let Some(ce) = case_end { + self.add_edge(ce, join_block, "fallthrough"); + } + } + + if !has_default { + self.add_edge(switch_header, join_block, "branch_false"); + } + + self.loop_stack.pop(); + Some(join_block) + } + + fn process_try_catch(&mut self, try_stmt: &Node, current: u32) -> Option { + self.set_end_line(current, node_line(try_stmt)); + + let join_block = self.make_block("body", None, None, None); + + // Try body + let try_body = try_stmt.child_by_field_name("body"); + let (try_body_start, try_stmts): (u32, Vec) = if let Some(body) = try_body { + (node_line(&body), self.get_statements(&body)) + } else { + let cursor = &mut try_stmt.walk(); + let stmts: Vec = try_stmt.named_children(cursor) + .filter(|child| { 
+ let ck = child.kind(); + !matches_opt(ck, self.rules.catch_node) + && !matches_opt(ck, self.rules.finally_node) + && !matches_opt(ck, self.rules.else_node) + }) + .collect(); + (node_line(try_stmt), stmts) + }; + + let try_block = self.make_block("body", Some(try_body_start), None, Some("try")); + self.add_edge(current, try_block, "fallthrough"); + let try_end = self.process_statements(&try_stmts, try_block); + + // Find catch, finally, and else handlers + let mut catch_handlers: Vec = Vec::new(); + let mut finally_handler: Option = None; + let mut else_handler: Option = None; + let cursor = &mut try_stmt.walk(); + for child in try_stmt.named_children(cursor) { + if matches_opt(child.kind(), self.rules.catch_node) { + catch_handlers.push(child); + } + if matches_opt(child.kind(), self.rules.finally_node) { + finally_handler = Some(child); + } + if matches_opt(child.kind(), self.rules.else_node) { + // Only treat as try-else if it's a direct child of the try statement + // (not the else_clause of an if inside the try body) + else_handler = Some(child); + } + } + + // Process else clause (Python try...except...else): runs when try succeeds + let success_end = if let Some(else_node) = else_handler { + let else_block = self.make_block("body", Some(node_line(&else_node)), None, Some("else")); + if let Some(te) = try_end { + self.add_edge(te, else_block, "fallthrough"); + } + let else_stmts = self.get_statements(&else_node); + self.process_statements(&else_stmts, else_block) + } else { + try_end + }; + + if !catch_handlers.is_empty() { + let mut catch_ends: Vec> = Vec::new(); + + for catch_node in &catch_handlers { + let catch_block = self.make_block("catch", Some(node_line(catch_node)), None, Some("catch")); + self.add_edge(try_block, catch_block, "exception"); + + let catch_body_node = catch_node.child_by_field_name("body"); + let catch_stmts: Vec = if let Some(body) = catch_body_node { + self.get_statements(&body) + } else { + let cursor2 = &mut catch_node.walk(); + 
catch_node.named_children(cursor2).collect() + }; + let catch_end = self.process_statements(&catch_stmts, catch_block); + catch_ends.push(catch_end); + } + + if let Some(finally_node) = finally_handler { + let finally_block = self.make_block("finally", Some(node_line(&finally_node)), None, Some("finally")); + if let Some(se) = success_end { + self.add_edge(se, finally_block, "fallthrough"); + } + for catch_end in &catch_ends { + if let Some(ce) = *catch_end { + self.add_edge(ce, finally_block, "fallthrough"); + } + } + let finally_body = finally_node.child_by_field_name("body"); + let finally_stmts: Vec = if let Some(body) = finally_body { + self.get_statements(&body) + } else { + self.get_statements(&finally_node) + }; + let finally_end = self.process_statements(&finally_stmts, finally_block); + if let Some(fe) = finally_end { + self.add_edge(fe, join_block, "fallthrough"); + } + } else { + if let Some(se) = success_end { + self.add_edge(se, join_block, "fallthrough"); + } + for catch_end in &catch_ends { + if let Some(ce) = *catch_end { + self.add_edge(ce, join_block, "fallthrough"); + } + } + } + } else if let Some(finally_node) = finally_handler { + let finally_block = self.make_block("finally", Some(node_line(&finally_node)), None, Some("finally")); + if let Some(se) = success_end { + self.add_edge(se, finally_block, "fallthrough"); + } + let finally_body = finally_node.child_by_field_name("body"); + let finally_stmts: Vec = if let Some(body) = finally_body { + self.get_statements(&body) + } else { + self.get_statements(&finally_node) + }; + let finally_end = self.process_statements(&finally_stmts, finally_block); + if let Some(fe) = finally_end { + self.add_edge(fe, join_block, "fallthrough"); + } + } else { + if let Some(se) = success_end { + self.add_edge(se, join_block, "fallthrough"); + } + } + + Some(join_block) + } +} + +// ─── Helpers ──────────────────────────────────────────────────────────── + +fn node_line(node: &Node) -> u32 { + 
node.start_position().row as u32 + 1 +} + +fn node_end_line(node: &Node) -> u32 { + node.end_position().row as u32 + 1 +} + +// ─── Public API ───────────────────────────────────────────────────────── + +/// Build a control flow graph for a single function AST node. +pub fn build_function_cfg(function_node: &Node, lang_id: &str, source: &[u8]) -> Option { + let rules = get_cfg_rules(lang_id)?; + + let mut builder = CfgBuilder::new(rules, source); + + let entry = builder.make_block("entry", None, None, None); + let exit = builder.make_block("exit", None, None, None); + builder.exit_idx = exit; + + let body = function_node.child_by_field_name("body"); + let body = match body { + Some(b) => b, + None => { + builder.add_edge(entry, exit, "fallthrough"); + return Some(CfgData { blocks: builder.blocks, edges: builder.edges }); + } + }; + + let stmts = builder.get_statements(&body); + if stmts.is_empty() { + builder.add_edge(entry, exit, "fallthrough"); + return Some(CfgData { blocks: builder.blocks, edges: builder.edges }); + } + + let first_block = builder.make_block("body", None, None, None); + builder.add_edge(entry, first_block, "fallthrough"); + + let last_block = builder.process_statements(&stmts, first_block); + if let Some(lb) = last_block { + builder.add_edge(lb, exit, "fallthrough"); + } + + Some(CfgData { blocks: builder.blocks, edges: builder.edges }) +} diff --git a/crates/codegraph-core/src/complexity.rs b/crates/codegraph-core/src/complexity.rs index df2bdaf..93458e0 100644 --- a/crates/codegraph-core/src/complexity.rs +++ b/crates/codegraph-core/src/complexity.rs @@ -344,7 +344,7 @@ pub fn lang_rules(lang_id: &str) -> Option<&'static LangRules> { "go" => Some(&GO_RULES), "rust" => Some(&RUST_LANG_RULES), "java" => Some(&JAVA_RULES), - "c_sharp" => Some(&CSHARP_RULES), + "csharp" => Some(&CSHARP_RULES), "ruby" => Some(&RUBY_RULES), "php" => Some(&PHP_RULES), _ => None, @@ -850,7 +850,7 @@ pub fn halstead_rules(lang_id: &str) -> Option<&'static 
HalsteadRules> { "go" => Some(&GO_HALSTEAD), "rust" => Some(&RUST_HALSTEAD), "java" => Some(&JAVA_HALSTEAD), - "c_sharp" => Some(&CSHARP_HALSTEAD), + "csharp" => Some(&CSHARP_HALSTEAD), "ruby" => Some(&RUBY_HALSTEAD), "php" => Some(&PHP_HALSTEAD), _ => None, @@ -860,7 +860,7 @@ pub fn halstead_rules(lang_id: &str) -> Option<&'static HalsteadRules> { /// Comment line prefixes per language, used for LOC metrics. pub fn comment_prefixes(lang_id: &str) -> &'static [&'static str] { match lang_id { - "javascript" | "typescript" | "tsx" | "go" | "rust" | "java" | "c_sharp" => { + "javascript" | "typescript" | "tsx" | "go" | "rust" | "java" | "csharp" => { &["//", "/*", "*", "*/"] } "python" | "ruby" => &["#"], diff --git a/crates/codegraph-core/src/dataflow.rs b/crates/codegraph-core/src/dataflow.rs new file mode 100644 index 0000000..82c3022 --- /dev/null +++ b/crates/codegraph-core/src/dataflow.rs @@ -0,0 +1,1450 @@ +use std::collections::HashMap; +use tree_sitter::{Node, Tree}; + +use crate::types::{ + DataflowArgFlow, DataflowAssignment, DataflowMutation, DataflowParam, DataflowResult, + DataflowReturn, +}; + +/// Maximum recursion depth for AST traversal to prevent stack overflow +/// on deeply nested trees. Matches the approach used in cfg.rs. +const MAX_VISIT_DEPTH: usize = 200; + +// ─── Param Strategy ────────────────────────────────────────────────────── + +/// Per-language parameter extraction strategy. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ParamStrategy { + Default, + Python, + Go, + Rust, + Java, + CSharp, + Php, + Ruby, +} + +// ─── DataflowRules ────────────────────────────────────────────────────── + +/// Per-language AST node type names and field names for dataflow extraction. +/// Mirrors `DATAFLOW_DEFAULTS` + per-language overrides in `src/dataflow.js`. 
pub struct DataflowRules {
    // Scope entry
    /// Node kinds that open a new function scope (tested by `is_function_node`).
    function_nodes: &'static [&'static str],

    // Function name extraction
    /// Field on a function node holding its own name; `function_name` tries this first.
    name_field: &'static str,
    /// Parent kind whose `name` field names an otherwise anonymous function.
    var_assigned_fn_parent: Option<&'static str>,
    /// Parent kind whose `assign_left_field` names an otherwise anonymous function.
    assignment_fn_parent: Option<&'static str>,
    /// Parent kind whose `key` field names an otherwise anonymous function.
    pair_fn_parent: Option<&'static str>,

    // Parameters
    param_list_field: &'static str,
    /// Node kind of a bare parameter identifier; `is_ident` also accepts it as an identifier.
    param_identifier: &'static str,
    /// Wrapper kinds that `extract_param_names` unwraps via their `pattern` or `name` field.
    param_wrapper_types: &'static [&'static str],
    /// Default-valued parameter kind, unwrapped via its `left` or `name` field.
    default_param_type: Option<&'static str>,
    /// Rest / splat parameter kind.
    rest_param_type: Option<&'static str>,
    object_destruct_type: Option<&'static str>,
    array_destruct_type: Option<&'static str>,
    shorthand_prop_pattern: Option<&'static str>,
    pair_pattern_type: Option<&'static str>,
    /// Language-specific override consulted before the generic parameter rules.
    extract_param_strategy: ParamStrategy,

    // Return
    return_node: Option<&'static str>,

    // Variable declarations
    var_declarator_node: Option<&'static str>,
    var_declarator_nodes: &'static [&'static str],
    var_name_field: &'static str,
    var_value_field: Option<&'static str>,
    assignment_node: Option<&'static str>,
    assign_left_field: &'static str,
    assign_right_field: &'static str,

    // Calls
    /// Single call node kind; only consulted by `is_call_node` when `call_nodes` is empty.
    call_node: Option<&'static str>,
    /// Call node kinds; when non-empty this list takes precedence over `call_node`.
    call_nodes: &'static [&'static str],
    /// Field on a call node holding the callee expression (read by `resolve_callee_name`).
    call_function_field: &'static str,
    call_args_field: &'static str,
    spread_type: Option<&'static str>,

    // Member access
    member_node: Option<&'static str>,
    member_object_field: &'static str,
    member_property_field: &'static str,
    optional_chain_node: Option<&'static str>,

    // Await
    await_node: Option<&'static str>,

    // Mutation
    mutating_methods: &'static [&'static str],
    expression_stmt_node: &'static str,
    call_object_field: Option<&'static str>,

    // Method call name extraction (for languages where method_call uses a different
    // field than call_function_field, e.g. Rust's method_call_expression has "name")
    method_call_name_field: Option<&'static str>,

    // Method call receiver extraction (for languages where the method call receiver
    // uses a different field than member_object_field, e.g. Rust's
    // method_call_expression exposes "receiver" not "value")
    method_call_receiver_field: Option<&'static str>,

    // Structural wrappers
    expression_list_type: Option<&'static str>,
    equals_clause_type: Option<&'static str>,
    argument_wrapper_type: Option<&'static str>,
    /// Extra node kinds that `is_ident` treats as identifiers.
    extra_identifier_types: &'static [&'static str],
}

// ─── Per-Language Configs ────────────────────────────────────────────────

// Selected by `get_dataflow_rules` for "javascript", "typescript" and "tsx".
static JS_TS_DATAFLOW: DataflowRules = DataflowRules {
    function_nodes: &[
        "function_declaration",
        "method_definition",
        "arrow_function",
        "function_expression",
        "function",
    ],
    name_field: "name",
    var_assigned_fn_parent: Some("variable_declarator"),
    assignment_fn_parent: Some("assignment_expression"),
    pair_fn_parent: Some("pair"),
    param_list_field: "parameters",
    param_identifier: "identifier",
    param_wrapper_types: &["required_parameter", "optional_parameter"],
    default_param_type: Some("assignment_pattern"),
    rest_param_type: Some("rest_pattern"),
    object_destruct_type: Some("object_pattern"),
    array_destruct_type: Some("array_pattern"),
    shorthand_prop_pattern: Some("shorthand_property_identifier_pattern"),
    pair_pattern_type: Some("pair_pattern"),
    extract_param_strategy: ParamStrategy::Default,
    return_node: Some("return_statement"),
    var_declarator_node: Some("variable_declarator"),
    var_declarator_nodes: &[],
    var_name_field: "name",
    var_value_field: Some("value"),
    assignment_node: Some("assignment_expression"),
    assign_left_field: "left",
    assign_right_field: "right",
    call_node: Some("call_expression"),
    call_nodes: &[],
    call_function_field: "function",
    call_args_field: "arguments",
    spread_type: Some("spread_element"),
    member_node: Some("member_expression"),
    member_object_field: "object",
    member_property_field: "property",
    optional_chain_node: Some("optional_chain_expression"),
    await_node: Some("await_expression"),
    mutating_methods: &[
        "push", "pop", "shift", "unshift", "splice", "sort", "reverse", "fill", "set", "delete",
        "add", "clear",
    ],
    expression_stmt_node: "expression_statement",
    call_object_field: None,
    method_call_name_field: None,
    method_call_receiver_field: None,
    expression_list_type: None,
    equals_clause_type: None,
    argument_wrapper_type: None,
    extra_identifier_types: &[],
};

// Selected by `get_dataflow_rules` for "python".
static PYTHON_DATAFLOW: DataflowRules = DataflowRules {
    function_nodes: &["function_definition", "lambda"],
    name_field: "name",
    var_assigned_fn_parent: None,
    assignment_fn_parent: None,
    pair_fn_parent: None,
    param_list_field: "parameters",
    param_identifier: "identifier",
    param_wrapper_types: &[],
    default_param_type: Some("default_parameter"),
    rest_param_type: Some("list_splat_pattern"),
    object_destruct_type: None,
    array_destruct_type: None,
    shorthand_prop_pattern: None,
    pair_pattern_type: None,
    extract_param_strategy: ParamStrategy::Python,
    return_node: Some("return_statement"),
    var_declarator_node: None,
    var_declarator_nodes: &[],
    var_name_field: "name",
    var_value_field: Some("value"),
    assignment_node: Some("assignment"),
    assign_left_field: "left",
    assign_right_field: "right",
    call_node: Some("call"),
    call_nodes: &[],
    call_function_field: "function",
    call_args_field: "arguments",
    spread_type: Some("list_splat"),
    member_node: Some("attribute"),
    member_object_field: "object",
    member_property_field: "attribute",
    optional_chain_node: None,
    await_node: Some("await"),
    mutating_methods: &[
        "append", "extend", "insert", "pop", "remove", "clear", "sort", "reverse", "add",
        "discard", "update",
    ],
    expression_stmt_node: "expression_statement",
    call_object_field: None,
    method_call_name_field: None,
    method_call_receiver_field: None,
    expression_list_type: None,
    equals_clause_type: None,
    argument_wrapper_type: None,
    extra_identifier_types: &[],
};

// Selected by `get_dataflow_rules` for "go".
static GO_DATAFLOW: DataflowRules = DataflowRules {
    function_nodes: &["function_declaration", "method_declaration", "func_literal"],
    name_field: "name",
    var_assigned_fn_parent: None,
    assignment_fn_parent: None,
    pair_fn_parent: None,
    param_list_field: "parameters",
    param_identifier: "identifier",
    param_wrapper_types: &[],
    default_param_type: None,
    rest_param_type: None,
    object_destruct_type: None,
    array_destruct_type: None,
    shorthand_prop_pattern: None,
    pair_pattern_type: None,
    extract_param_strategy: ParamStrategy::Go,
    return_node: Some("return_statement"),
    var_declarator_node: None,
    // Only short_var_declaration uses left/right fields. var_declaration has
    // var_spec children with name/type/value fields — not yet supported.
    var_declarator_nodes: &["short_var_declaration"],
    var_name_field: "left",
    var_value_field: Some("right"),
    assignment_node: Some("assignment_statement"),
    assign_left_field: "left",
    assign_right_field: "right",
    call_node: Some("call_expression"),
    call_nodes: &[],
    call_function_field: "function",
    call_args_field: "arguments",
    spread_type: None,
    member_node: Some("selector_expression"),
    member_object_field: "operand",
    member_property_field: "field",
    optional_chain_node: None,
    await_node: None,
    mutating_methods: &[],
    expression_stmt_node: "expression_statement",
    call_object_field: None,
    method_call_name_field: None,
    method_call_receiver_field: None,
    expression_list_type: Some("expression_list"),
    equals_clause_type: None,
    argument_wrapper_type: None,
    extra_identifier_types: &[],
};

// Selected by `get_dataflow_rules` for "rust".
static RUST_DATAFLOW: DataflowRules = DataflowRules {
    function_nodes: &["function_item", "closure_expression"],
    name_field: "name",
    var_assigned_fn_parent: None,
    assignment_fn_parent: None,
    pair_fn_parent: None,
    param_list_field: "parameters",
    param_identifier: "identifier",
    param_wrapper_types: &[],
    default_param_type: None,
    rest_param_type: None,
    object_destruct_type: None,
    array_destruct_type: None,
    shorthand_prop_pattern: None,
    pair_pattern_type: None,
    extract_param_strategy: ParamStrategy::Rust,
    return_node: Some("return_expression"),
    var_declarator_node: Some("let_declaration"),
    var_declarator_nodes: &[],
    var_name_field: "pattern",
    var_value_field: Some("value"),
    assignment_node: Some("assignment_expression"),
    assign_left_field: "left",
    assign_right_field: "right",
    call_node: None,
    call_nodes: &["call_expression", "method_call_expression"],
    call_function_field: "function",
    call_args_field: "arguments",
    spread_type: None,
    member_node: Some("field_expression"),
    member_object_field: "value",
    member_property_field: "field",
    optional_chain_node: None,
    await_node: Some("await_expression"),
    mutating_methods: &["push", "pop", "insert", "remove", "clear", "sort", "reverse"],
    expression_stmt_node: "expression_statement",
    call_object_field: None,
    method_call_name_field: Some("name"),
    method_call_receiver_field: Some("receiver"),
    expression_list_type: None,
    equals_clause_type: None,
    argument_wrapper_type: None,
    extra_identifier_types: &[],
};

// Selected by `get_dataflow_rules` for "java".
static JAVA_DATAFLOW: DataflowRules = DataflowRules {
    function_nodes: &[
        "method_declaration",
        "constructor_declaration",
        "lambda_expression",
    ],
    name_field: "name",
    var_assigned_fn_parent: None,
    assignment_fn_parent: None,
    pair_fn_parent: None,
    param_list_field: "parameters",
    param_identifier: "identifier",
    param_wrapper_types: &[],
    default_param_type: None,
    rest_param_type: None,
    object_destruct_type: None,
    array_destruct_type: None,
    shorthand_prop_pattern: None,
    pair_pattern_type: None,
    extract_param_strategy: ParamStrategy::Java,
    return_node: Some("return_statement"),
    var_declarator_node: Some("variable_declarator"),
    var_declarator_nodes: &[],
    var_name_field: "name",
    var_value_field: Some("value"),
    assignment_node: Some("assignment_expression"),
    assign_left_field: "left",
    assign_right_field: "right",
    call_node: None,
    call_nodes: &["method_invocation", "object_creation_expression"],
    call_function_field: "name",
    call_args_field: "arguments",
    spread_type: None,
    member_node: Some("field_access"),
    member_object_field: "object",
    member_property_field: "field",
    optional_chain_node: None,
    await_node: None,
    mutating_methods: &["add", "remove", "clear", "put", "set", "push", "pop", "sort"],
    expression_stmt_node: "expression_statement",
    call_object_field: Some("object"),
    method_call_name_field: None,
    method_call_receiver_field: None,
    expression_list_type: None,
    equals_clause_type: None,
    argument_wrapper_type: Some("argument"),
    extra_identifier_types: &[],
};

// Selected by `get_dataflow_rules` for "csharp".
static CSHARP_DATAFLOW: DataflowRules = DataflowRules {
    function_nodes: &[
        "method_declaration",
        "constructor_declaration",
        "lambda_expression",
        "local_function_statement",
    ],
    name_field: "name",
    var_assigned_fn_parent: None,
    assignment_fn_parent: None,
    pair_fn_parent: None,
    param_list_field: "parameters",
    param_identifier: "identifier",
    param_wrapper_types: &[],
    default_param_type: None,
    rest_param_type: None,
    object_destruct_type: None,
    array_destruct_type: None,
    shorthand_prop_pattern: None,
    pair_pattern_type: None,
    extract_param_strategy: ParamStrategy::CSharp,
    return_node: Some("return_statement"),
    var_declarator_node: Some("variable_declarator"),
    var_declarator_nodes: &[],
    var_name_field: "name",
    var_value_field: None,
    assignment_node: Some("assignment_expression"),
    assign_left_field: "left",
    assign_right_field: "right",
    call_node: Some("invocation_expression"),
    call_nodes: &[],
    call_function_field: "function",
    call_args_field: "arguments",
    spread_type: None,
    member_node: Some("member_access_expression"),
    member_object_field: "expression",
    member_property_field: "name",
    optional_chain_node: None,
    await_node: Some("await_expression"),
    mutating_methods: &["Add", "Remove", "Clear", "Insert", "Sort", "Reverse", "Push", "Pop"],
    expression_stmt_node: "expression_statement",
    call_object_field: None,
    method_call_name_field: None,
    method_call_receiver_field: None,
    expression_list_type: None,
    equals_clause_type: Some("equals_value_clause"),
    argument_wrapper_type: Some("argument"),
    extra_identifier_types: &[],
};

// Selected by `get_dataflow_rules` for "php".
static PHP_DATAFLOW: DataflowRules = DataflowRules {
    function_nodes: &[
        "function_definition",
        "method_declaration",
        "anonymous_function_creation_expression",
        "arrow_function",
    ],
    name_field: "name",
    var_assigned_fn_parent: None,
    assignment_fn_parent: None,
    pair_fn_parent: None,
    param_list_field: "parameters",
    param_identifier: "variable_name",
    param_wrapper_types: &[],
    default_param_type: None,
    rest_param_type: None,
    object_destruct_type: None,
    array_destruct_type: None,
    shorthand_prop_pattern: None,
    pair_pattern_type: None,
    extract_param_strategy: ParamStrategy::Php,
    return_node: Some("return_statement"),
    var_declarator_node: None,
    var_declarator_nodes: &[],
    var_name_field: "name",
    var_value_field: Some("value"),
    assignment_node: Some("assignment_expression"),
    assign_left_field: "left",
    assign_right_field: "right",
    call_node: None,
    call_nodes: &[
        "function_call_expression",
        "member_call_expression",
        "scoped_call_expression",
    ],
    call_function_field: "function",
    call_args_field: "arguments",
    spread_type: Some("spread_expression"),
    member_node: Some("member_access_expression"),
    member_object_field: "object",
    member_property_field: "name",
    optional_chain_node: None,
    await_node: None,
    mutating_methods: &["push", "pop", "shift", "unshift", "splice", "sort", "reverse"],
    expression_stmt_node: "expression_statement",
    call_object_field: None,
    method_call_name_field: None,
    method_call_receiver_field: None,
    expression_list_type: None,
    equals_clause_type: None,
    argument_wrapper_type: Some("argument"),
    extra_identifier_types: &["variable_name", "name"],
};

// Selected by `get_dataflow_rules` for "ruby".
static RUBY_DATAFLOW: DataflowRules = DataflowRules {
    function_nodes: &["method", "singleton_method", "lambda"],
    name_field: "name",
    var_assigned_fn_parent: None,
    assignment_fn_parent: None,
    pair_fn_parent: None,
    param_list_field: "parameters",
    param_identifier: "identifier",
    param_wrapper_types: &[],
    default_param_type: None,
    rest_param_type: None,
    object_destruct_type: None,
    array_destruct_type: None,
    shorthand_prop_pattern: None,
    pair_pattern_type: None,
    extract_param_strategy: ParamStrategy::Ruby,
    return_node: Some("return"),
    var_declarator_node: None,
    var_declarator_nodes: &[],
    var_name_field: "name",
    var_value_field: Some("value"),
    assignment_node: Some("assignment"),
    assign_left_field: "left",
    assign_right_field: "right",
    call_node: Some("call"),
    call_nodes: &[],
    call_function_field: "method",
    call_args_field: "arguments",
    // NOTE(review): this is a parameter-side node kind; confirm it is the kind that
    // appears for splats in call arguments.
    spread_type: Some("splat_parameter"),
    member_node: Some("call"),
    member_object_field: "receiver",
    member_property_field: "method",
    optional_chain_node: None,
    await_node: None,
    mutating_methods: &[
        "push", "pop", "shift", "unshift", "delete", "clear", "sort!", "reverse!", "map!",
        "select!", "reject!", "compact!", "flatten!", "concat", "replace", "insert",
    ],
    expression_stmt_node: "expression_statement",
    call_object_field: None,
    method_call_name_field: None,
    method_call_receiver_field: None,
    expression_list_type: None,
    equals_clause_type: None,
    argument_wrapper_type: None,
    extra_identifier_types: &[],
};

/// Get dataflow rules for a language ID string.
+fn get_dataflow_rules(lang_id: &str) -> Option<&'static DataflowRules> { + match lang_id { + "javascript" | "typescript" | "tsx" => Some(&JS_TS_DATAFLOW), + "python" => Some(&PYTHON_DATAFLOW), + "go" => Some(&GO_DATAFLOW), + "rust" => Some(&RUST_DATAFLOW), + "java" => Some(&JAVA_DATAFLOW), + "csharp" => Some(&CSHARP_DATAFLOW), + "php" => Some(&PHP_DATAFLOW), + "ruby" => Some(&RUBY_DATAFLOW), + _ => None, + } +} + +// ─── Helpers ───────────────────────────────────────────────────────────── + +fn is_call_node(rules: &DataflowRules, kind: &str) -> bool { + if !rules.call_nodes.is_empty() { + rules.call_nodes.contains(&kind) + } else { + rules.call_node.is_some_and(|cn| cn == kind) + } +} + +fn is_function_node(rules: &DataflowRules, kind: &str) -> bool { + rules.function_nodes.contains(&kind) +} + +fn is_ident(rules: &DataflowRules, kind: &str) -> bool { + kind == "identifier" + || kind == rules.param_identifier + || rules.extra_identifier_types.contains(&kind) +} + +fn truncate(s: &str, max: usize) -> String { + if s.chars().count() <= max { + s.to_string() + } else { + // Find the byte offset of the max-th character + let byte_offset = s + .char_indices() + .nth(max) + .map(|(i, _)| i) + .unwrap_or(s.len()); + let mut result = s[..byte_offset].to_string(); + result.push('…'); + result + } +} + +fn node_text<'a>(node: &Node, source: &'a [u8]) -> &'a str { + node.utf8_text(source).unwrap_or("") +} + +fn node_line(node: &Node) -> u32 { + node.start_position().row as u32 + 1 +} + +/// Extract function name from a function AST node. 
+fn function_name<'a>(fn_node: &Node<'a>, rules: &DataflowRules, source: &[u8]) -> Option { + // Try the standard name field + if let Some(name_node) = fn_node.child_by_field_name(rules.name_field) { + return Some(node_text(&name_node, source).to_string()); + } + + // JS-specific: arrow_function/function_expression assigned to variable, pair, or assignment + if let Some(parent) = fn_node.parent() { + let pt = parent.kind(); + if rules.var_assigned_fn_parent.is_some_and(|v| v == pt) { + let n = parent.child_by_field_name("name"); + return n.map(|n| node_text(&n, source).to_string()); + } + if rules.pair_fn_parent.is_some_and(|v| v == pt) { + let key = parent.child_by_field_name("key"); + return key.map(|k| node_text(&k, source).to_string()); + } + if rules.assignment_fn_parent.is_some_and(|v| v == pt) { + let left = parent.child_by_field_name(rules.assign_left_field); + return left.map(|l| node_text(&l, source).to_string()); + } + } + None +} + +/// Extract parameter names using per-language strategy. 
+fn extract_param_names_strategy(node: &Node, strategy: ParamStrategy, source: &[u8]) -> Option> { + match strategy { + ParamStrategy::Default => None, + ParamStrategy::Python => { + let t = node.kind(); + if t == "typed_parameter" || t == "typed_default_parameter" { + let cursor = &mut node.walk(); + for c in node.named_children(cursor) { + if c.kind() == "identifier" { + return Some(vec![node_text(&c, source).to_string()]); + } + } + return Some(vec![]); + } + if t == "default_parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); + } + return Some(vec![]); + } + if t == "list_splat_pattern" || t == "dictionary_splat_pattern" { + let cursor = &mut node.walk(); + for c in node.named_children(cursor) { + if c.kind() == "identifier" { + return Some(vec![node_text(&c, source).to_string()]); + } + } + return Some(vec![]); + } + None + } + ParamStrategy::Go => { + let t = node.kind(); + if t == "parameter_declaration" { + let mut names = Vec::new(); + let cursor = &mut node.walk(); + for c in node.named_children(cursor) { + if c.kind() == "identifier" { + names.push(node_text(&c, source).to_string()); + } + } + if !names.is_empty() { Some(names) } else { None } + } else if t == "variadic_parameter_declaration" { + node.child_by_field_name("name") + .map(|n| vec![node_text(&n, source).to_string()]) + } else { + None + } + } + ParamStrategy::Rust => { + let t = node.kind(); + if t == "parameter" { + if let Some(pat) = node.child_by_field_name("pattern") { + if pat.kind() == "identifier" { + return Some(vec![node_text(&pat, source).to_string()]); + } + } + return Some(vec![]); + } + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + None + } + ParamStrategy::Java => { + let t = node.kind(); + if t == "formal_parameter" || t == "spread_parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, 
source).to_string()]); + } + return Some(vec![]); + } + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + None + } + ParamStrategy::CSharp => { + let t = node.kind(); + if t == "parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); + } + return Some(vec![]); + } + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + None + } + ParamStrategy::Php => { + let t = node.kind(); + if t == "simple_parameter" || t == "variadic_parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); + } + return Some(vec![]); + } + if t == "variable_name" { + return Some(vec![node_text(node, source).to_string()]); + } + None + } + ParamStrategy::Ruby => { + let t = node.kind(); + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + if t == "optional_parameter" + || t == "keyword_parameter" + || t == "splat_parameter" + || t == "hash_splat_parameter" + { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); + } + return Some(vec![]); + } + None + } + } +} + +/// Extract parameter names from a node, using rules and strategy. +fn extract_param_names(node: &Node, rules: &DataflowRules, source: &[u8]) -> Vec { + let t = node.kind(); + + // Language-specific override + if let Some(names) = extract_param_names_strategy(node, rules.extract_param_strategy, source) { + return names; + } + + // Leaf identifier + if t == rules.param_identifier { + return vec![node_text(node, source).to_string()]; + } + + // Wrapper types (TS required_parameter, etc.) 
+ if rules.param_wrapper_types.contains(&t) { + let pattern = node + .child_by_field_name("pattern") + .or_else(|| node.child_by_field_name("name")); + return pattern + .map(|p| extract_param_names(&p, rules, source)) + .unwrap_or_default(); + } + + // Default parameter + if rules.default_param_type.is_some_and(|d| d == t) { + let left = node + .child_by_field_name("left") + .or_else(|| node.child_by_field_name("name")); + return left + .map(|l| extract_param_names(&l, rules, source)) + .unwrap_or_default(); + } + + // Rest / splat parameter + if rules.rest_param_type.is_some_and(|r| r == t) { + if let Some(name_node) = node.child_by_field_name("name") { + return vec![node_text(&name_node, source).to_string()]; + } + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + if child.kind() == rules.param_identifier { + return vec![node_text(&child, source).to_string()]; + } + } + return vec![]; + } + + // Object destructuring (JS only) + if rules.object_destruct_type.is_some_and(|o| o == t) { + let mut names = Vec::new(); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + let ck = child.kind(); + if rules.shorthand_prop_pattern.is_some_and(|s| s == ck) { + names.push(node_text(&child, source).to_string()); + } else if rules.pair_pattern_type.is_some_and(|p| p == ck) { + if let Some(value) = child.child_by_field_name("value") { + names.extend(extract_param_names(&value, rules, source)); + } + } else if rules.rest_param_type.is_some_and(|r| r == ck) { + names.extend(extract_param_names(&child, rules, source)); + } + } + return names; + } + + // Array destructuring (JS only) + if rules.array_destruct_type.is_some_and(|a| a == t) { + let mut names = Vec::new(); + let cursor = &mut node.walk(); + for child in node.named_children(cursor) { + names.extend(extract_param_names(&child, rules, source)); + } + return names; + } + + vec![] +} + +/// Extract parameters: name + index pairs from formal_parameters node. 
+fn extract_params(params_node: &Node, rules: &DataflowRules, source: &[u8]) -> Vec<(String, u32)> { + let mut result = Vec::new(); + let mut index: u32 = 0; + let cursor = &mut params_node.walk(); + for child in params_node.named_children(cursor) { + let names = extract_param_names(&child, rules, source); + for name in names { + result.push((name, index)); + } + index += 1; + } + result +} + +/// Resolve the callee name from a call expression node. +fn resolve_callee_name(call_node: &Node, rules: &DataflowRules, source: &[u8]) -> Option { + let fn_node = call_node.child_by_field_name(rules.call_function_field); + match fn_node { + Some(f) => { + if is_ident(rules, f.kind()) { + return Some(node_text(&f, source).to_string()); + } + if rules.member_node.is_some_and(|m| m == f.kind()) { + let prop = f.child_by_field_name(rules.member_property_field); + return prop.map(|p| node_text(&p, source).to_string()); + } + if rules.optional_chain_node.is_some_and(|o| o == f.kind()) { + if let Some(target) = f.named_child(0) { + if rules.member_node.is_some_and(|m| m == target.kind()) { + let prop = target.child_by_field_name(rules.member_property_field); + return prop.map(|p| node_text(&p, source).to_string()); + } + if target.kind() == "identifier" { + return Some(node_text(&target, source).to_string()); + } + } + let prop = f.child_by_field_name(rules.member_property_field); + return prop.map(|p| node_text(&p, source).to_string()); + } + None + } + None => { + // Some languages (Java method_invocation, Ruby call) use 'name'/'method' directly + let name_node = call_node + .child_by_field_name("name") + .or_else(|| call_node.child_by_field_name("method")); + name_node.map(|n| node_text(&n, source).to_string()) + } + } +} + +/// Get the receiver (object) of a member expression. 
+fn member_receiver(member_expr: &Node, rules: &DataflowRules, source: &[u8]) -> Option<String> {
+    // Walk down the object side of the member chain with a loop rather than
+    // recursion: unlike `visit`/`collect_identifiers` this walk has no depth
+    // guard, so a very long chain (`a.b.c. ... .z`) must not grow the stack.
+    let mut current = *member_expr;
+    loop {
+        let obj = current.child_by_field_name(rules.member_object_field)?;
+        if is_ident(rules, obj.kind()) {
+            return Some(node_text(&obj, source).to_string());
+        }
+        // Anything other than an identifier or a nested member expression
+        // (a call result, a literal, ...) has no trackable receiver.
+        if !rules.member_node.is_some_and(|m| m == obj.kind()) {
+            return None;
+        }
+        current = obj;
+    }
+}
+
+/// Collect all identifier names referenced within a node.
+///
+/// Names are appended to `out` in traversal order (duplicates are kept).
+/// Descent stops at an identifier node and at `MAX_VISIT_DEPTH`.
+fn collect_identifiers(node: &Node, out: &mut Vec<String>, rules: &DataflowRules, source: &[u8], depth: usize) {
+    if depth >= MAX_VISIT_DEPTH {
+        return;
+    }
+    if is_ident(rules, node.kind()) {
+        out.push(node_text(node, source).to_string());
+        return;
+    }
+    let cursor = &mut node.walk();
+    for child in node.named_children(cursor) {
+        collect_identifiers(&child, out, rules, source, depth + 1);
+    }
+}
+
+// ─── Scope Tracking ──────────────────────────────────────────────────────
+
+/// How a local name tracked in a [`ScopeFrame`] obtained its value.
+#[derive(Debug, Clone)]
+enum LocalSource {
+    /// The name was assigned the result of a call (`x = foo()`).
+    CallReturn { callee: String },
+    /// The name came out of a destructuring pattern bound to a call result.
+    Destructured { callee: String },
+}
+
+/// Names visible inside one function while its subtree is being visited.
+struct ScopeFrame {
+    /// Name produced by `function_name` for this function, if any.
+    func_name: Option<String>,
+    /// Parameter name -> parameter index, as reported by `extract_params`.
+    params: HashMap<String, u32>,
+    /// Locals recorded so far that were assigned from a call.
+    locals: HashMap<String, LocalSource>,
+}
+
+/// Binding info returned by find_binding.
+struct BindingInfo {
+    binding_type: String,
+    confidence: f64,
+}
+
+/// Look `name` up in the scope stack, innermost frame first.
+///
+/// Within a frame a parameter wins over a local. Confidence is 1.0 for a
+/// parameter, 0.9 for a call-return local and 0.8 for a destructured local.
+/// Returns `None` when no frame knows the name.
+fn find_binding(scope_stack: &[ScopeFrame], name: &str) -> Option<BindingInfo> {
+    for scope in scope_stack.iter().rev() {
+        if scope.params.contains_key(name) {
+            return Some(BindingInfo {
+                binding_type: "param".to_string(),
+                confidence: 1.0,
+            });
+        }
+        if let Some(local) = scope.locals.get(name) {
+            let confidence = match local {
+                LocalSource::CallReturn { .. } => 0.9,
+                LocalSource::Destructured { .. } => 0.8,
+            };
+            return Some(BindingInfo {
+                binding_type: "local".to_string(),
+                confidence,
+            });
+        }
+    }
+    None
+}
+
+/// Confidence of an optional binding; an unresolved binding scores 0.5.
+fn binding_confidence(binding: &Option<BindingInfo>) -> f64 {
+    match binding {
+        Some(b) => b.confidence,
+        None => 0.5,
+    }
+}
+
+// ─── Core: extract_dataflow ──────────────────────────────────────────────
+
+/// Extract dataflow information from a parsed AST tree.
+/// Returns None if the language has no dataflow rules (e.g., HCL).
+pub fn extract_dataflow(tree: &Tree, source: &[u8], lang_id: &str) -> Option<DataflowResult> {
+    let rules = get_dataflow_rules(lang_id)?;
+
+    let mut parameters = Vec::new();
+    let mut returns = Vec::new();
+    let mut assignments = Vec::new();
+    let mut arg_flows = Vec::new();
+    let mut mutations = Vec::new();
+
+    let mut scope_stack: Vec<ScopeFrame> = Vec::new();
+
+    visit(
+        &tree.root_node(),
+        rules,
+        source,
+        &mut scope_stack,
+        &mut parameters,
+        &mut returns,
+        &mut assignments,
+        &mut arg_flows,
+        &mut mutations,
+        0,
+    );
+
+    Some(DataflowResult {
+        parameters,
+        returns,
+        assignments,
+        arg_flows,
+        mutations,
+    })
+}
+
+/// Recursive walker behind `extract_dataflow`.
+///
+/// Dispatches on the node kind (function, return, variable declarator, call,
+/// assignment, expression statement), lets the matching handler append to the
+/// output vectors, then visits the node's named children. A function node
+/// pushes a scope frame for the duration of its subtree. Subtrees at
+/// `MAX_VISIT_DEPTH` or deeper are not visited.
+// The walker threads one output vector per result category, hence the long
+// parameter list.
+#[allow(clippy::too_many_arguments)]
+fn visit(
+    node: &Node,
+    rules: &DataflowRules,
+    source: &[u8],
+    scope_stack: &mut Vec<ScopeFrame>,
+    parameters: &mut Vec<DataflowParam>,
+    returns: &mut Vec<DataflowReturn>,
+    assignments: &mut Vec<DataflowAssignment>,
+    arg_flows: &mut Vec<DataflowArgFlow>,
+    mutations: &mut Vec<DataflowMutation>,
+    depth: usize,
+) {
+    if depth >= MAX_VISIT_DEPTH {
+        return;
+    }
+
+    let t = node.kind();
+
+    // Enter function scope
+    if is_function_node(rules, t) {
+        enter_scope(node, rules, source, scope_stack, parameters);
+        let cursor = &mut node.walk();
+        for child in node.named_children(cursor) {
+            visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1);
+        }
+        scope_stack.pop();
+        return;
+    }
+
+    // Return statements
+    if rules.return_node.is_some_and(|r| r == t) {
+        // Only returns inside a named function are recorded.
+        if let Some(scope) = scope_stack.last() {
+            if let Some(ref func_name) = scope.func_name {
+                let expr = node.named_child(0);
+                let mut referenced_names = Vec::new();
+                if let Some(ref e) = expr {
+                    collect_identifiers(e, &mut referenced_names, rules, source, depth + 1);
+                }
+                returns.push(DataflowReturn {
+                    func_name: func_name.clone(),
+                    expression: truncate(
+                        expr.map(|e| node_text(&e, source)).unwrap_or(""),
+                        120,
+                    ),
+                    referenced_names,
+                    line: node_line(node),
+                });
+            }
+        }
+        let cursor = &mut node.walk();
+        for child in node.named_children(cursor) {
+            visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1);
+        }
+        return;
+    }
+
+    // Variable declarations (single type)
+    if rules.var_declarator_node.is_some_and(|v| v == t) {
+        handle_var_declarator(node, rules, source, scope_stack, assignments);
+        let cursor = &mut node.walk();
+        for child in node.named_children(cursor) {
+            visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1);
+        }
+        return;
+    }
+
+    // Variable declarations (multi-type, e.g., Go)
+    if !rules.var_declarator_nodes.is_empty() && rules.var_declarator_nodes.contains(&t) {
+        handle_var_declarator(node, rules, source, scope_stack, assignments);
+        let cursor = &mut node.walk();
+        for child in node.named_children(cursor) {
+            visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1);
+        }
+        return;
+    }
+
+    // Call expressions
+    if is_call_node(rules, t) {
+        handle_call_expr(node, rules, source, scope_stack, arg_flows);
+        let cursor = &mut node.walk();
+        for child in node.named_children(cursor) {
+            visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1);
+        }
+        return;
+    }
+
+    // Assignment expressions
+    if rules.assignment_node.is_some_and(|a| a == t) {
+        handle_assignment(node, rules, source, scope_stack, assignments, mutations);
+        let cursor = &mut node.walk();
+        for child in node.named_children(cursor) {
+            visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1);
+        }
+        return;
+    }
+
+    // Mutation detection via expression_statement
+    if t == rules.expression_stmt_node {
+        handle_expr_stmt_mutation(node, rules, source, scope_stack, mutations);
+    }
+
+    // Default: visit children
+    let cursor = &mut node.walk();
+    for child in node.named_children(cursor) {
+        visit(&child, rules, source, scope_stack, parameters, returns, assignments, arg_flows, mutations, depth + 1);
+    }
+}
+
+/// Push a [`ScopeFrame`] for `fn_node` and record its parameters.
+///
+/// Every parameter is registered in the frame; a `DataflowParam` entry is
+/// emitted only when the function has a name. The reported line is that of
+/// the parameter list node, falling back to the function node.
+fn enter_scope(
+    fn_node: &Node,
+    rules: &DataflowRules,
+    source: &[u8],
+    scope_stack: &mut Vec<ScopeFrame>,
+    parameters: &mut Vec<DataflowParam>,
+) {
+    let name = function_name(fn_node, rules, source);
+    let params_node = fn_node.child_by_field_name(rules.param_list_field);
+    let param_list = params_node
+        .as_ref()
+        .map(|pn| extract_params(pn, rules, source))
+        .unwrap_or_default();
+
+    let mut param_map = HashMap::new();
+    for (pname, pidx) in &param_list {
+        param_map.insert(pname.clone(), *pidx);
+        if let Some(ref fn_name) = name {
+            let line = params_node
+                .as_ref()
+                .map(|pn| node_line(pn))
+                .unwrap_or_else(|| node_line(fn_node));
+            parameters.push(DataflowParam {
+                func_name: fn_name.clone(),
+                param_name: pname.clone(),
+                param_index: *pidx,
+                line,
+            });
+        }
+    }
+
+    scope_stack.push(ScopeFrame {
+        func_name: name,
+        params: param_map,
+        locals: HashMap::new(),
+    });
+}
+
+/// Unwrap await if present, returning the inner expression.
+fn unwrap_await<'a>(node: &Node<'a>, rules: &DataflowRules) -> Node<'a> {
+    if rules.await_node.is_some_and(|a| a == node.kind()) {
+        if let Some(inner) = node.named_child(0) {
+            return inner;
+        }
+    }
+    *node
+}
+
+/// Record `name = call(...)` style declarations.
+///
+/// Emits one `DataflowAssignment` per bound name (several for a destructuring
+/// pattern) and registers each name as a local of the innermost scope.
+/// Nothing is recorded when there is no enclosing scope, the name or value
+/// node cannot be located, the value (after unwrapping an await) is not a
+/// call, the callee cannot be resolved, or the enclosing function is unnamed.
+fn handle_var_declarator(
+    node: &Node,
+    rules: &DataflowRules,
+    source: &[u8],
+    scope_stack: &mut Vec<ScopeFrame>,
+    assignments: &mut Vec<DataflowAssignment>,
+) {
+    let mut name_node = node.child_by_field_name(rules.var_name_field);
+    let mut value_node = rules.var_value_field.and_then(|f| node.child_by_field_name(f));
+
+    // C#: initializer is inside equals_value_clause child
+    if value_node.is_none() {
+        if let Some(eq_type) = rules.equals_clause_type {
+            let cursor = &mut node.walk();
+            for child in node.named_children(cursor) {
+                if child.kind() == eq_type {
+                    value_node = child
+                        .child_by_field_name("value")
+                        .or_else(|| child.named_child(0));
+                    break;
+                }
+            }
+        }
+    }
+
+    // Fallback: initializer is a direct child that is not reachable through a
+    // field name (C# variable_declarator). Only a call (optionally awaited)
+    // among the named children other than the name node is accepted.
+    if value_node.is_none() {
+        if let Some(ref nn) = name_node {
+            let cursor = &mut node.walk();
+            for child in node.named_children(cursor) {
+                if child.id() != nn.id() {
+                    let uw = unwrap_await(&child, rules);
+                    if is_call_node(rules, uw.kind()) {
+                        value_node = Some(child);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    // Go: expression_list wraps LHS/RHS — unwrap to first named child
+    if let Some(el_type) = rules.expression_list_type {
+        if name_node.as_ref().is_some_and(|n| n.kind() == el_type) {
+            name_node = name_node.and_then(|n| n.named_child(0));
+        }
+        if value_node.as_ref().is_some_and(|v| v.kind() == el_type) {
+            value_node = value_node.and_then(|v| v.named_child(0));
+        }
+    }
+
+    let scope = match scope_stack.last_mut() {
+        Some(s) => s,
+        None => return,
+    };
+    let name_n = match name_node {
+        Some(n) => n,
+        None => return,
+    };
+    let value_n = match value_node {
+        Some(v) => v,
+        None => return,
+    };
+
+    let unwrapped = unwrap_await(&value_n, rules);
+    if !is_call_node(rules, unwrapped.kind()) {
+        return;
+    }
+
+    let callee = match resolve_callee_name(&unwrapped, rules, source) {
+        Some(c) => c,
+        None => return,
+    };
+    let func_name = match &scope.func_name {
+        Some(f) => f.clone(),
+        None => return,
+    };
+
+    // Destructuring: const { a, b } = foo()
+    let is_obj_destruct = rules.object_destruct_type.is_some_and(|o| o == name_n.kind());
+    let is_arr_destruct = rules.array_destruct_type.is_some_and(|a| a == name_n.kind());
+
+    if is_obj_destruct || is_arr_destruct {
+        let names = extract_param_names(&name_n, rules, source);
+        for n in &names {
+            assignments.push(DataflowAssignment {
+                var_name: n.clone(),
+                caller_func: Some(func_name.clone()),
+                source_call_name: callee.clone(),
+                expression: truncate(node_text(node, source), 120),
+                line: node_line(node),
+            });
+            scope
+                .locals
+                .insert(n.clone(), LocalSource::Destructured { callee: callee.clone() });
+        }
+    } else {
+        let var_name = node_text(&name_n, source).to_string();
+        assignments.push(DataflowAssignment {
+            var_name: var_name.clone(),
+            caller_func: Some(func_name),
+            source_call_name: callee.clone(),
+            expression: truncate(node_text(node, source), 120),
+            line: node_line(node),
+        });
+        scope.locals.insert(var_name, LocalSource::CallReturn { callee });
+    }
+}
+
+/// Handle an assignment expression inside a named function.
+///
+/// Two independent things are detected: a write through a member expression
+/// whose receiver is a known binding (recorded as a mutation), and a plain
+/// `x = foo()` (recorded as an assignment and registered as a local).
+fn handle_assignment(
+    node: &Node,
+    rules: &DataflowRules,
+    source: &[u8],
+    scope_stack: &mut Vec<ScopeFrame>,
+    assignments: &mut Vec<DataflowAssignment>,
+    mutations: &mut Vec<DataflowMutation>,
+) {
+    let left = node.child_by_field_name(rules.assign_left_field);
+    let right = node.child_by_field_name(rules.assign_right_field);
+
+    let func_name = match scope_stack.last() {
+        Some(s) => match &s.func_name {
+            Some(f) => f.clone(),
+            None => return,
+        },
+        None => return,
+    };
+
+    // Mutation: obj.prop = value
+    if let Some(ref left_n) = left {
+        if rules.member_node.is_some_and(|m| m == left_n.kind()) {
+            if let Some(receiver) = member_receiver(left_n, rules, source) {
+                let binding = find_binding(scope_stack, &receiver);
+                if binding.is_some() {
+                    mutations.push(DataflowMutation {
+                        func_name: Some(func_name.clone()),
+                        receiver_name: receiver,
+                        binding_type: binding.as_ref().map(|b| b.binding_type.clone()),
+                        mutating_expr: truncate(node_text(node, source), 120),
+                        line: node_line(node),
+                    });
+                }
+            }
+        }
+    }
+
+    // Non-declaration assignment: x = foo()
+    if let (Some(left_n), Some(right_n)) = (left, right) {
+        if is_ident(rules, left_n.kind()) {
+            let unwrapped = unwrap_await(&right_n, rules);
+            if is_call_node(rules, unwrapped.kind()) {
+                if let Some(callee) = resolve_callee_name(&unwrapped, rules, source) {
+                    let var_name = node_text(&left_n, source).to_string();
+                    assignments.push(DataflowAssignment {
+                        var_name: var_name.clone(),
+                        caller_func: Some(func_name),
+                        source_call_name: callee.clone(),
+                        expression: truncate(node_text(node, source), 120),
+                        line: node_line(node),
+                    });
+                    if let Some(scope) = scope_stack.last_mut() {
+                        scope.locals.insert(var_name, LocalSource::CallReturn { callee });
+                    }
+                }
+            }
+        }
+    }
+}
+
+/// Record which known bindings are passed as arguments of a call.
+///
+/// An argument is tracked when it is an identifier, or a member expression
+/// whose receiver is an identifier, and that name resolves through
+/// `find_binding`. `arg_index` counts every named child of the argument list,
+/// tracked or not. Calls outside a named function are ignored.
+fn handle_call_expr(
+    node: &Node,
+    rules: &DataflowRules,
+    source: &[u8],
+    scope_stack: &[ScopeFrame],
+    arg_flows: &mut Vec<DataflowArgFlow>,
+) {
+    let callee = match resolve_callee_name(node, rules, source) {
+        Some(c) => c,
+        None => return,
+    };
+    let args_node = match node.child_by_field_name(rules.call_args_field) {
+        Some(a) => a,
+        None => return,
+    };
+    let func_name = match scope_stack.last() {
+        Some(s) => match &s.func_name {
+            Some(f) => f.clone(),
+            None => return,
+        },
+        None => return,
+    };
+
+    let mut arg_index: u32 = 0;
+    let cursor = &mut args_node.walk();
+    for arg_raw in args_node.named_children(cursor) {
+        // PHP/Java: unwrap argument wrapper
+        let arg = if rules.argument_wrapper_type.is_some_and(|w| w == arg_raw.kind()) {
+            arg_raw.named_child(0).unwrap_or(arg_raw)
+        } else {
+            arg_raw
+        };
+
+        let unwrapped = if rules.spread_type.is_some_and(|s| s == arg.kind()) {
+            arg.named_child(0).unwrap_or(arg)
+        } else {
+            arg
+        };
+
+        let arg_name = if is_ident(rules, unwrapped.kind()) {
+            Some(node_text(&unwrapped, source).to_string())
+        } else {
+            None
+        };
+        let arg_member = if arg_name.is_none()
+            && rules.member_node.is_some_and(|m| m == unwrapped.kind())
+        {
+            member_receiver(&unwrapped, rules, source)
+        } else {
+            None
+        };
+        // `arg_name` is not needed past this point, so it is moved, not cloned.
+        let tracked_name = arg_name.or(arg_member);
+
+        if let Some(ref tracked) = tracked_name {
+            let binding = find_binding(scope_stack, tracked);
+            if binding.is_some() {
+                let conf = binding_confidence(&binding);
+                arg_flows.push(DataflowArgFlow {
+                    caller_func: Some(func_name.clone()),
+                    callee_name: callee.clone(),
+                    arg_index,
+                    arg_name: Some(tracked.clone()),
+                    binding_type: binding.as_ref().map(|b| b.binding_type.clone()),
+                    confidence: conf,
+                    expression: truncate(node_text(&arg_raw, source), 120),
+                    line: node_line(node),
+                });
+            }
+        }
+        arg_index += 1;
+    }
+}
+
+/// Detect `receiver.method(...)` expression statements that mutate a binding.
+///
+/// The method name must be listed in `rules.mutating_methods`, the receiver
+/// must resolve through `find_binding`, and the statement must sit inside a
+/// named function. Three call shapes are tried in turn; see the comments
+/// below.
+fn handle_expr_stmt_mutation(
+    node: &Node,
+    rules: &DataflowRules,
+    source: &[u8],
+    scope_stack: &[ScopeFrame],
+    mutations: &mut Vec<DataflowMutation>,
+) {
+    if rules.mutating_methods.is_empty() {
+        return;
+    }
+    let expr = match node.named_child(0) {
+        Some(e) => e,
+        None => return,
+    };
+    if !is_call_node(rules, expr.kind()) {
+        return;
+    }
+
+    let mut method_name: Option<String> = None;
+    let mut receiver: Option<String> = None;
+
+    // Standard pattern: call(fn: member(obj, prop))
+    if let Some(fn_node) = expr.child_by_field_name(rules.call_function_field) {
+        if rules.member_node.is_some_and(|m| m == fn_node.kind()) {
+            if let Some(prop) = fn_node.child_by_field_name(rules.member_property_field) {
+                method_name = Some(node_text(&prop, source).to_string());
+            }
+            receiver = member_receiver(&fn_node, rules, source);
+        }
+    }
+
+    // Method call pattern: call node has a dedicated name field distinct from
+    // call_function_field (e.g. Rust method_call_expression has "name" + "receiver")
+    if method_name.is_none() {
+        if let Some(name_field) = rules.method_call_name_field {
+            if let Some(name_n) = expr.child_by_field_name(name_field) {
+                method_name = Some(node_text(&name_n, source).to_string());
+                // Extract receiver: prefer method_call_receiver_field if set,
+                // otherwise fall back to member_object_field
+                let recv_field = rules
+                    .method_call_receiver_field
+                    .unwrap_or(rules.member_object_field);
+                if let Some(recv_node) = expr.child_by_field_name(recv_field) {
+                    if is_ident(rules, recv_node.kind()) {
+                        receiver = Some(node_text(&recv_node, source).to_string());
+                    } else if rules.member_node.is_some_and(|m| m == recv_node.kind()) {
+                        receiver = member_receiver(&recv_node, rules, source);
+                    }
+                }
+            }
+        }
+    }
+
+    // Java/combined pattern: call node itself has object + name fields
+    if receiver.is_none() {
+        if let Some(obj_field) = rules.call_object_field {
+            let obj = expr.child_by_field_name(obj_field);
+            let name = expr.child_by_field_name(rules.call_function_field);
+            if let (Some(obj_n), Some(name_n)) = (obj, name) {
+                method_name = Some(node_text(&name_n, source).to_string());
+                if is_ident(rules, obj_n.kind()) {
+                    receiver = Some(node_text(&obj_n, source).to_string());
+                }
+            }
+        }
+    }
+
+    let method = match method_name {
+        Some(m) => m,
+        None => return,
+    };
+    if !rules.mutating_methods.contains(&method.as_str()) {
+        return;
+    }
+
+    let recv = match receiver {
+        Some(r) => r,
+        None => return,
+    };
+    let func_name = match scope_stack.last() {
+        Some(s) => s.func_name.clone(),
+        None => None,
+    };
+    if func_name.is_none() {
+        return;
+    }
+
+    let binding = find_binding(scope_stack, &recv);
+    if binding.is_some() {
+        mutations.push(DataflowMutation {
+            func_name,
+            receiver_name: recv,
+            binding_type: binding.as_ref().map(|b| b.binding_type.clone()),
+            mutating_expr: truncate(node_text(&expr, source), 120),
+            line: node_line(node),
+        });
+    }
+}
diff --git 
a/crates/codegraph-core/src/extractors/csharp.rs b/crates/codegraph-core/src/extractors/csharp.rs index 9d853ec..77c14cb 100644 --- a/crates/codegraph-core/src/extractors/csharp.rs +++ b/crates/codegraph-core/src/extractors/csharp.rs @@ -1,4 +1,5 @@ use tree_sitter::{Node, Tree}; +use crate::cfg::build_function_cfg; use crate::complexity::compute_all_metrics; use crate::types::*; use super::helpers::*; @@ -45,6 +46,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); extract_csharp_base_types(node, &class_name, source, symbols); @@ -61,6 +63,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); extract_csharp_base_types(node, &name, source, symbols); @@ -77,6 +80,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); extract_csharp_base_types(node, &name, source, symbols); @@ -93,6 +97,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); if let Some(body) = node.child_by_field_name("body") { @@ -110,7 +115,8 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { line: start_line(&child), end_line: Some(end_line(&child)), decorators: None, - complexity: compute_all_metrics(&child, source, "c_sharp"), + complexity: compute_all_metrics(&child, source, "csharp"), + cfg: build_function_cfg(&child, "csharp", source), children: None, }); } @@ -132,6 +138,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); } @@ -152,7 
+159,8 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { line: start_line(node), end_line: Some(end_line(node)), decorators: None, - complexity: compute_all_metrics(node, source, "c_sharp"), + complexity: compute_all_metrics(node, source, "csharp"), + cfg: build_function_cfg(node, "csharp", source), children: opt_children(children), }); } @@ -173,7 +181,8 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { line: start_line(node), end_line: Some(end_line(node)), decorators: None, - complexity: compute_all_metrics(node, source, "c_sharp"), + complexity: compute_all_metrics(node, source, "csharp"), + cfg: build_function_cfg(node, "csharp", source), children: opt_children(children), }); } @@ -193,7 +202,8 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { line: start_line(node), end_line: Some(end_line(node)), decorators: None, - complexity: compute_all_metrics(node, source, "c_sharp"), + complexity: compute_all_metrics(node, source, "csharp"), + cfg: build_function_cfg(node, "csharp", source), children: None, }); } diff --git a/crates/codegraph-core/src/extractors/go.rs b/crates/codegraph-core/src/extractors/go.rs index 23d7e1a..19a0d31 100644 --- a/crates/codegraph-core/src/extractors/go.rs +++ b/crates/codegraph-core/src/extractors/go.rs @@ -1,4 +1,5 @@ use tree_sitter::{Node, Tree}; +use crate::cfg::build_function_cfg; use crate::complexity::compute_all_metrics; use crate::types::*; use super::helpers::*; @@ -27,6 +28,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "go"), + cfg: build_function_cfg(node, "go", source), children: opt_children(children), }); } @@ -65,6 +67,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "go"), + cfg: build_function_cfg(node, 
"go", source), children: opt_children(children), }); } @@ -90,6 +93,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); } @@ -101,6 +105,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); // Extract interface methods @@ -121,6 +126,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&member)), decorators: None, complexity: None, + cfg: None, children: None, }); } @@ -136,6 +142,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); } @@ -157,6 +164,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&spec)), decorators: None, complexity: None, + cfg: None, children: None, }); } diff --git a/crates/codegraph-core/src/extractors/hcl.rs b/crates/codegraph-core/src/extractors/hcl.rs index ab51641..349bc82 100644 --- a/crates/codegraph-core/src/extractors/hcl.rs +++ b/crates/codegraph-core/src/extractors/hcl.rs @@ -67,6 +67,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); diff --git a/crates/codegraph-core/src/extractors/helpers.rs b/crates/codegraph-core/src/extractors/helpers.rs index 7419f61..9c11b76 100644 --- a/crates/codegraph-core/src/extractors/helpers.rs +++ b/crates/codegraph-core/src/extractors/helpers.rs @@ -20,6 +20,7 @@ pub fn child_def(name: String, kind: &str, line: u32) -> Definition { end_line: None, decorators: None, complexity: None, + cfg: None, children: None, } } diff --git a/crates/codegraph-core/src/extractors/java.rs 
b/crates/codegraph-core/src/extractors/java.rs index fd07ac2..6b6f784 100644 --- a/crates/codegraph-core/src/extractors/java.rs +++ b/crates/codegraph-core/src/extractors/java.rs @@ -1,4 +1,5 @@ use tree_sitter::{Node, Tree}; +use crate::cfg::build_function_cfg; use crate::complexity::compute_all_metrics; use crate::types::*; use super::helpers::*; @@ -44,6 +45,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); @@ -97,6 +99,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); if let Some(body) = node.child_by_field_name("body") { @@ -115,6 +118,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&child)), decorators: None, complexity: compute_all_metrics(&child, source, "java"), + cfg: build_function_cfg(&child, "java", source), children: None, }); } @@ -136,6 +140,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); } @@ -157,6 +162,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "java"), + cfg: build_function_cfg(node, "java", source), children: opt_children(children), }); } @@ -178,6 +184,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "java"), + cfg: build_function_cfg(node, "java", source), children: opt_children(children), }); } diff --git a/crates/codegraph-core/src/extractors/javascript.rs b/crates/codegraph-core/src/extractors/javascript.rs index 7144cf9..91b634b 100644 
--- a/crates/codegraph-core/src/extractors/javascript.rs +++ b/crates/codegraph-core/src/extractors/javascript.rs @@ -1,4 +1,5 @@ use tree_sitter::{Node, Tree}; +use crate::cfg::build_function_cfg; use crate::complexity::compute_all_metrics; use crate::types::*; use super::helpers::*; @@ -27,6 +28,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "javascript"), + cfg: build_function_cfg(node, "javascript", source), children: opt_children(children), }); } @@ -43,6 +45,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); @@ -87,6 +90,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: compute_all_metrics(node, source, "javascript"), + cfg: build_function_cfg(node, "javascript", source), children: opt_children(children), }); } @@ -102,6 +106,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); // Extract interface methods @@ -124,6 +129,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); } @@ -141,6 +147,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: opt_children(children), }); } @@ -169,6 +176,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(&value_n)), decorators: None, complexity: compute_all_metrics(&value_n, source, "javascript"), + cfg: build_function_cfg(&value_n, "javascript", source), children: 
opt_children(children), }); } else if is_const && is_js_literal(&value_n) @@ -184,6 +192,7 @@ fn walk_node(node: &Node, source: &[u8], symbols: &mut FileSymbols) { end_line: Some(end_line(node)), decorators: None, complexity: None, + cfg: None, children: None, }); } @@ -703,6 +712,7 @@ fn extract_interface_methods( end_line: Some(end_line(&child)), decorators: None, complexity: None, + cfg: None, children: None, }); } @@ -919,6 +929,7 @@ fn extract_callback_definition(call_node: &Node, source: &[u8]) -> Option Option Option Option { - parallel::parse_file(&file_path, &source) +pub fn parse_file( + file_path: String, + source: String, + include_dataflow: Option, +) -> Option { + parallel::parse_file(&file_path, &source, include_dataflow.unwrap_or(false)) } /// Parse multiple files in parallel and return all extracted symbols. +/// When `include_dataflow` is true, dataflow analysis is also extracted. #[napi] -pub fn parse_files(file_paths: Vec, root_dir: String) -> Vec { - parallel::parse_files_parallel(&file_paths, &root_dir) +pub fn parse_files( + file_paths: Vec, + root_dir: String, + include_dataflow: Option, +) -> Vec { + parallel::parse_files_parallel(&file_paths, &root_dir, include_dataflow.unwrap_or(false)) } /// Resolve a single import path. diff --git a/crates/codegraph-core/src/parallel.rs b/crates/codegraph-core/src/parallel.rs index e2c8aad..7fb0d8d 100644 --- a/crates/codegraph-core/src/parallel.rs +++ b/crates/codegraph-core/src/parallel.rs @@ -2,6 +2,7 @@ use rayon::prelude::*; use std::fs; use tree_sitter::Parser; +use crate::dataflow::extract_dataflow; use crate::extractors::extract_symbols; use crate::parser_registry::LanguageKind; use crate::types::FileSymbols; @@ -9,7 +10,12 @@ use crate::types::FileSymbols; /// Parse multiple files in parallel using rayon. /// Each thread creates its own Parser (cheap; Language objects are Send+Sync). /// Failed files are silently skipped (matches WASM behavior). 
-pub fn parse_files_parallel(file_paths: &[String], _root_dir: &str) -> Vec { +/// When `include_dataflow` is false, dataflow extraction is skipped for performance. +pub fn parse_files_parallel( + file_paths: &[String], + _root_dir: &str, + include_dataflow: bool, +) -> Vec { file_paths .par_iter() .filter_map(|file_path| { @@ -24,6 +30,9 @@ pub fn parse_files_parallel(file_paths: &[String], _root_dir: &str) -> Vec Vec Option { +/// When `include_dataflow` is false, dataflow extraction is skipped for performance. +pub fn parse_file(file_path: &str, source: &str, include_dataflow: bool) -> Option { let lang = LanguageKind::from_extension(file_path)?; let source_bytes = source.as_bytes(); @@ -43,6 +53,9 @@ pub fn parse_file(file_path: &str, source: &str) -> Option { let tree = parser.parse(source_bytes, None)?; let line_count = source_bytes.iter().filter(|&&b| b == b'\n').count() as u32 + 1; let mut symbols = extract_symbols(lang, &tree, source_bytes, file_path); + if include_dataflow { + symbols.dataflow = extract_dataflow(&tree, source_bytes, lang.lang_id_str()); + } symbols.line_count = Some(line_count); Some(symbols) } diff --git a/crates/codegraph-core/src/parser_registry.rs b/crates/codegraph-core/src/parser_registry.rs index 2c2c7e9..f800b27 100644 --- a/crates/codegraph-core/src/parser_registry.rs +++ b/crates/codegraph-core/src/parser_registry.rs @@ -17,6 +17,24 @@ pub enum LanguageKind { } impl LanguageKind { + /// Return the string ID used by dataflow/cfg rules lookup. + /// Matches the JS `DATAFLOW_RULES` map keys in `src/dataflow.js`. 
+ pub fn lang_id_str(&self) -> &'static str { + match self { + Self::JavaScript => "javascript", + Self::TypeScript => "typescript", + Self::Tsx => "tsx", + Self::Python => "python", + Self::Go => "go", + Self::Rust => "rust", + Self::Java => "java", + Self::CSharp => "csharp", + Self::Ruby => "ruby", + Self::Php => "php", + Self::Hcl => "hcl", + } + } + /// Determine language from file extension — mirrors `getParser()` in parser.js pub fn from_extension(file_path: &str) -> Option { let path = Path::new(file_path); diff --git a/crates/codegraph-core/src/types.rs b/crates/codegraph-core/src/types.rs index 1b219c7..f1b68ff 100644 --- a/crates/codegraph-core/src/types.rs +++ b/crates/codegraph-core/src/types.rs @@ -55,6 +55,36 @@ impl ComplexityMetrics { } } +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CfgBlock { + pub index: u32, + #[napi(js_name = "type")] + pub block_type: String, + #[napi(js_name = "startLine")] + pub start_line: Option, + #[napi(js_name = "endLine")] + pub end_line: Option, + pub label: Option, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CfgEdge { + #[napi(js_name = "sourceIndex")] + pub source_index: u32, + #[napi(js_name = "targetIndex")] + pub target_index: u32, + pub kind: String, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CfgData { + pub blocks: Vec, + pub edges: Vec, +} + #[napi(object)] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Definition { @@ -65,6 +95,7 @@ pub struct Definition { #[napi(ts_type = "string[] | undefined")] pub decorators: Option>, pub complexity: Option, + pub cfg: Option, #[napi(ts_type = "Definition[] | undefined")] pub children: Option>, } @@ -144,6 +175,87 @@ pub struct AstNode { pub receiver: Option, } +// ─── Dataflow Types ────────────────────────────────────────────────────── + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowParam { + #[napi(js_name = 
"funcName")] + pub func_name: String, + #[napi(js_name = "paramName")] + pub param_name: String, + #[napi(js_name = "paramIndex")] + pub param_index: u32, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowReturn { + #[napi(js_name = "funcName")] + pub func_name: String, + pub expression: String, + #[napi(js_name = "referencedNames")] + pub referenced_names: Vec, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowAssignment { + #[napi(js_name = "varName")] + pub var_name: String, + #[napi(js_name = "callerFunc")] + pub caller_func: Option, + #[napi(js_name = "sourceCallName")] + pub source_call_name: String, + pub expression: String, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowArgFlow { + #[napi(js_name = "callerFunc")] + pub caller_func: Option, + #[napi(js_name = "calleeName")] + pub callee_name: String, + #[napi(js_name = "argIndex")] + pub arg_index: u32, + #[napi(js_name = "argName")] + pub arg_name: Option, + #[napi(js_name = "bindingType")] + pub binding_type: Option, + pub confidence: f64, + pub expression: String, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowMutation { + #[napi(js_name = "funcName")] + pub func_name: Option, + #[napi(js_name = "receiverName")] + pub receiver_name: String, + #[napi(js_name = "bindingType")] + pub binding_type: Option, + #[napi(js_name = "mutatingExpr")] + pub mutating_expr: String, + pub line: u32, +} + +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataflowResult { + pub parameters: Vec, + pub returns: Vec, + pub assignments: Vec, + #[napi(js_name = "argFlows")] + pub arg_flows: Vec, + pub mutations: Vec, +} + #[napi(object)] #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FileSymbols { @@ -155,6 +267,7 @@ pub struct FileSymbols { pub exports: 
Vec, #[napi(js_name = "astNodes")] pub ast_nodes: Vec, + pub dataflow: Option, pub line_count: Option, } @@ -168,6 +281,7 @@ impl FileSymbols { classes: Vec::new(), exports: Vec::new(), ast_nodes: Vec::new(), + dataflow: None, line_count: None, } } diff --git a/src/builder.js b/src/builder.js index 5eb48d7..95badad 100644 --- a/src/builder.js +++ b/src/builder.js @@ -444,7 +444,7 @@ export async function buildGraph(rootDir, opts = {}) { opts.incremental !== false && config.build && config.build.incremental !== false; // Engine selection: 'native', 'wasm', or 'auto' (default) - const engineOpts = { engine: opts.engine || 'auto' }; + const engineOpts = { engine: opts.engine || 'auto', dataflow: opts.dataflow !== false }; const { name: engineName, version: engineVersion } = getActiveEngine(engineOpts); info(`Using ${engineName} engine${engineVersion ? ` (v${engineVersion})` : ''}`); @@ -548,7 +548,11 @@ export async function buildGraph(rootDir, opts = {}) { if (needsCfg || needsDataflow) { info('No file changes. 
Running pending analysis pass...'); - const analysisSymbols = await parseFilesAuto(files, rootDir, engineOpts); + const analysisOpts = { + ...engineOpts, + dataflow: needsDataflow && opts.dataflow !== false, + }; + const analysisSymbols = await parseFilesAuto(files, rootDir, analysisOpts); if (needsCfg) { const { buildCFGData } = await import('./cfg.js'); await buildCFGData(db, analysisSymbols, rootDir, engineOpts); diff --git a/src/cfg.js b/src/cfg.js index 67dd333..23e9aae 100644 --- a/src/cfg.js +++ b/src/cfg.js @@ -1053,8 +1053,14 @@ export async function buildCFGData(db, fileSymbols, rootDir, _engineOpts) { if (!symbols._tree) { const ext = path.extname(relPath).toLowerCase(); if (CFG_EXTENSIONS.has(ext)) { - needsFallback = true; - break; + // Check if all function/method defs already have native CFG data + const hasNativeCfg = symbols.definitions + .filter((d) => (d.kind === 'function' || d.kind === 'method') && d.line) + .every((d) => d.cfg === null || d.cfg?.blocks?.length); + if (!hasNativeCfg) { + needsFallback = true; + break; + } } } } @@ -1102,8 +1108,13 @@ export async function buildCFGData(db, fileSymbols, rootDir, _engineOpts) { let tree = symbols._tree; let langId = symbols._langId; - // WASM fallback if no cached tree - if (!tree) { + // Check if all defs already have native CFG — skip WASM parse if so + const allNative = symbols.definitions + .filter((d) => (d.kind === 'function' || d.kind === 'method') && d.line) + .every((d) => d.cfg === null || d.cfg?.blocks?.length); + + // WASM fallback if no cached tree and not all native + if (!tree && !allNative) { if (!extToLang || !getParserFn) continue; langId = extToLang.get(ext); if (!langId || !CFG_LANG_IDS.has(langId)) continue; @@ -1135,7 +1146,7 @@ export async function buildCFGData(db, fileSymbols, rootDir, _engineOpts) { if (!cfgRules) continue; const complexityRules = COMPLEXITY_RULES.get(langId); - if (!complexityRules) continue; + // complexityRules only needed for WASM fallback path for 
(const def of symbols.definitions) { if (def.kind !== 'function' && def.kind !== 'method') continue; @@ -1144,11 +1155,19 @@ export async function buildCFGData(db, fileSymbols, rootDir, _engineOpts) { const row = getNodeId.get(def.name, relPath, def.line); if (!row) continue; - const funcNode = findFunctionNode(tree.rootNode, def.line, def.endLine, complexityRules); - if (!funcNode) continue; + // Native path: use pre-computed CFG from Rust engine + let cfg = null; + if (def.cfg?.blocks?.length) { + cfg = def.cfg; + } else { + // WASM fallback: compute CFG from tree-sitter AST + if (!tree || !complexityRules) continue; + const funcNode = findFunctionNode(tree.rootNode, def.line, def.endLine, complexityRules); + if (!funcNode) continue; + cfg = buildFunctionCFG(funcNode, langId); + } - const cfg = buildFunctionCFG(funcNode, langId); - if (cfg.blocks.length === 0) continue; + if (!cfg || cfg.blocks.length === 0) continue; // Clear old CFG data for this function deleteEdges.run(row.id); diff --git a/src/dataflow.js b/src/dataflow.js index ad6f156..08b982f 100644 --- a/src/dataflow.js +++ b/src/dataflow.js @@ -1009,7 +1009,7 @@ export async function buildDataflowEdges(db, fileSymbols, rootDir, _engineOpts) let needsFallback = false; for (const [relPath, symbols] of fileSymbols) { - if (!symbols._tree) { + if (!symbols._tree && !symbols.dataflow) { const ext = path.extname(relPath).toLowerCase(); if (DATAFLOW_EXTENSIONS.has(ext)) { needsFallback = true; @@ -1061,41 +1061,45 @@ export async function buildDataflowEdges(db, fileSymbols, rootDir, _engineOpts) const ext = path.extname(relPath).toLowerCase(); if (!DATAFLOW_EXTENSIONS.has(ext)) continue; - let tree = symbols._tree; - let langId = symbols._langId; + // Use native dataflow data if available — skip WASM extraction + let data = symbols.dataflow; + if (!data) { + let tree = symbols._tree; + let langId = symbols._langId; + + // WASM fallback if no cached tree + if (!tree) { + if (!extToLang || !getParserFn) 
continue; + langId = extToLang.get(ext); + if (!langId || !DATAFLOW_LANG_IDS.has(langId)) continue; + + const absPath = path.join(rootDir, relPath); + let code; + try { + code = fs.readFileSync(absPath, 'utf-8'); + } catch { + continue; + } - // WASM fallback if no cached tree - if (!tree) { - if (!extToLang || !getParserFn) continue; - langId = extToLang.get(ext); - if (!langId || !DATAFLOW_LANG_IDS.has(langId)) continue; + const parser = getParserFn(parsers, absPath); + if (!parser) continue; - const absPath = path.join(rootDir, relPath); - let code; - try { - code = fs.readFileSync(absPath, 'utf-8'); - } catch { - continue; + try { + tree = parser.parse(code); + } catch { + continue; + } } - const parser = getParserFn(parsers, absPath); - if (!parser) continue; - - try { - tree = parser.parse(code); - } catch { - continue; + if (!langId) { + langId = extToLang ? extToLang.get(ext) : null; + if (!langId) continue; } - } - - if (!langId) { - langId = extToLang ? extToLang.get(ext) : null; - if (!langId) continue; - } - if (!DATAFLOW_RULES.has(langId)) continue; + if (!DATAFLOW_RULES.has(langId)) continue; - const data = extractDataflow(tree, relPath, symbols.definitions, langId); + data = extractDataflow(tree, relPath, symbols.definitions, langId); + } // Resolve function names to node IDs in this file first, then globally function resolveNode(funcName) { diff --git a/src/parser.js b/src/parser.js index e4a4a2e..e9d5a1e 100644 --- a/src/parser.js +++ b/src/parser.js @@ -205,6 +205,22 @@ function normalizeNativeSymbols(result) { maintainabilityIndex: d.complexity.maintainabilityIndex ?? null, } : null, + cfg: d.cfg?.blocks?.length + ? { + blocks: d.cfg.blocks.map((b) => ({ + index: b.index, + type: b.type, + startLine: b.startLine, + endLine: b.endLine, + label: b.label ?? null, + })), + edges: d.cfg.edges.map((e) => ({ + sourceIndex: e.sourceIndex, + targetIndex: e.targetIndex, + kind: e.kind, + })), + } + : null, children: d.children?.length ? 
d.children.map((c) => ({ name: c.name, @@ -253,6 +269,46 @@ function normalizeNativeSymbols(result) { text: n.text ?? null, receiver: n.receiver ?? null, })), + dataflow: result.dataflow + ? { + parameters: (result.dataflow.parameters || []).map((p) => ({ + funcName: p.funcName, + paramName: p.paramName, + paramIndex: p.paramIndex, + line: p.line, + })), + returns: (result.dataflow.returns || []).map((r) => ({ + funcName: r.funcName, + expression: r.expression ?? '', + referencedNames: r.referencedNames ?? [], + line: r.line, + })), + assignments: (result.dataflow.assignments || []).map((a) => ({ + varName: a.varName, + callerFunc: a.callerFunc ?? null, + sourceCallName: a.sourceCallName, + expression: a.expression ?? '', + line: a.line, + })), + argFlows: (result.dataflow.argFlows ?? []).map((f) => ({ + callerFunc: f.callerFunc ?? null, + calleeName: f.calleeName, + argIndex: f.argIndex, + argName: f.argName ?? null, + binding: f.bindingType ? { type: f.bindingType } : null, + confidence: f.confidence, + expression: f.expression ?? '', + line: f.line, + })), + mutations: (result.dataflow.mutations || []).map((m) => ({ + funcName: m.funcName ?? null, + receiverName: m.receiverName, + binding: m.bindingType ? { type: m.bindingType } : null, + mutatingExpr: m.mutatingExpr, + line: m.line, + })), + } + : null, }; } @@ -384,7 +440,7 @@ export async function parseFileAuto(filePath, source, opts = {}) { const { native } = resolveEngine(opts); if (native) { - const result = native.parseFile(filePath, source); + const result = native.parseFile(filePath, source, !!opts.dataflow); return result ? 
normalizeNativeSymbols(result) : null; } @@ -407,7 +463,7 @@ export async function parseFilesAuto(filePaths, rootDir, opts = {}) { const result = new Map(); if (native) { - const nativeResults = native.parseFiles(filePaths, rootDir); + const nativeResults = native.parseFiles(filePaths, rootDir, !!opts.dataflow); for (const r of nativeResults) { if (!r) continue; const relPath = path.relative(rootDir, r.file).split(path.sep).join('/'); diff --git a/tests/engines/dataflow-parity.test.js b/tests/engines/dataflow-parity.test.js new file mode 100644 index 0000000..ae08b92 --- /dev/null +++ b/tests/engines/dataflow-parity.test.js @@ -0,0 +1,269 @@ +/** + * Cross-engine dataflow parity tests. + * + * Parse the same source snippets with both WASM and native engines, + * then assert the dataflow output is equivalent for Go, Rust, and Ruby. + * + * JS/TS/Python/Java/C# already have good parity coverage via the + * 5 existing language-specific dataflow tests + build-parity. + * + * Skipped when the native engine is not installed or when the native + * binary does not include dataflow support (requires local Rust build). + */ + +import { beforeAll, describe, expect, it } from 'vitest'; +import { extractDataflow } from '../../src/dataflow.js'; +import { isNativeAvailable } from '../../src/native.js'; +import { createParsers, getParser } from '../../src/parser.js'; + +let native; +let parsers; +let nativeHasDataflow = false; + +/** + * Extract dataflow via WASM: parse with tree-sitter WASM, then run + * the JS extractDataflow() visitor. + */ +function wasmDataflow(code, filePath, langId) { + const parser = getParser(parsers, filePath); + if (!parser) return null; + const tree = parser.parse(code); + return extractDataflow(tree, filePath, [], langId); +} + +/** + * Extract dataflow via native: parseFile with include_dataflow=true. + * Returns null if native doesn't support dataflow. 
+ */ +function nativeDataflow(code, filePath) { + const result = native.parseFile(filePath, code, true); + if (!result || !result.dataflow) return null; + const df = result.dataflow; + return { + parameters: (df.parameters || []).map((p) => ({ + funcName: p.funcName, + paramName: p.paramName, + paramIndex: p.paramIndex, + line: p.line, + })), + returns: (df.returns || []).map((r) => ({ + funcName: r.funcName, + expression: r.expression ?? '', + referencedNames: r.referencedNames ?? [], + line: r.line, + })), + assignments: (df.assignments || []).map((a) => ({ + varName: a.varName, + callerFunc: a.callerFunc ?? null, + sourceCallName: a.sourceCallName, + expression: a.expression ?? '', + line: a.line, + })), + argFlows: (df.argFlows ?? []).map((f) => ({ + callerFunc: f.callerFunc ?? null, + calleeName: f.calleeName, + argIndex: f.argIndex, + argName: f.argName ?? null, + confidence: f.confidence, + expression: f.expression ?? '', + line: f.line, + })), + mutations: (df.mutations || []).map((m) => ({ + funcName: m.funcName ?? null, + receiverName: m.receiverName, + mutatingExpr: m.mutatingExpr, + line: m.line, + })), + }; +} + +/** + * Normalize WASM extractDataflow() output to match the native shape. + * WASM returns extra fields (binding, etc.) that native doesn't — strip them. + */ +function normalizeWasm(data) { + if (!data) return null; + return { + parameters: (data.parameters || []).map((p) => ({ + funcName: p.funcName, + paramName: p.paramName, + paramIndex: p.paramIndex, + line: p.line, + })), + returns: (data.returns || []).map((r) => ({ + funcName: r.funcName, + expression: r.expression ?? '', + referencedNames: r.referencedNames ?? [], + line: r.line, + })), + assignments: (data.assignments || []).map((a) => ({ + varName: a.varName, + callerFunc: a.callerFunc ?? null, + sourceCallName: a.sourceCallName, + expression: a.expression ?? '', + line: a.line, + })), + argFlows: (data.argFlows ?? []).map((f) => ({ + callerFunc: f.callerFunc ?? 
null, + calleeName: f.calleeName, + argIndex: f.argIndex, + argName: f.argName ?? null, + confidence: f.confidence, + expression: f.expression ?? '', + line: f.line, + })), + mutations: (data.mutations || []).map((m) => ({ + funcName: m.funcName ?? null, + receiverName: m.receiverName, + mutatingExpr: m.mutatingExpr, + line: m.line, + })), + }; +} + +const hasNative = isNativeAvailable(); + +// Detect whether the installed native binary includes dataflow support. +// The published npm prebuilt (v3.0.0) doesn't — only a local Rust build does. +function detectNativeDataflow() { + if (!native) return false; + const r = native.parseFile('probe.js', 'function f(a) { return a; }', true); + return !!r?.dataflow; +} + +const describeOrSkip = hasNative ? describe : describe.skip; + +describeOrSkip('Cross-engine dataflow parity', () => { + beforeAll(async () => { + if (!hasNative) return; + const { getNative } = await import('../../src/native.js'); + native = getNative(); + nativeHasDataflow = detectNativeDataflow(); + parsers = await createParsers(); + }); + + // ── Go ───────────────────────────────────────────────────────────────── + + describe('Go', () => { + const lang = 'go'; + const file = 'test.go'; + + it('parameters — simple', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'package main\nfunc add(a int, b int) int {\n\treturn a + b\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.parameters).toEqual(w.parameters); + }); + + it('returns — captures referenced names', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'package main\nfunc double(x int) int {\n\treturn x * 2\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.returns).toEqual(w.returns); + }); + + it('assignments — short var declaration from call', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'package main\nfunc run() 
{\n\tresult := compute()\n\t_ = result\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.assignments).toEqual(w.assignments); + }); + + it('argFlows — parameter passed as argument', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'package main\nfunc process(input string) {\n\ttransform(input)\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.argFlows).toEqual(w.argFlows); + }); + }); + + // ── Rust ─────────────────────────────────────────────────────────────── + + describe('Rust', () => { + const lang = 'rust'; + const file = 'test.rs'; + + it('parameters — simple', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'fn add(a: i32, b: i32) -> i32 {\n a + b\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.parameters).toEqual(w.parameters); + }); + + it('returns — explicit return', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'fn double(x: i32) -> i32 {\n return x * 2;\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.returns).toEqual(w.returns); + }); + + it('assignments — let binding from call', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'fn main() {\n let result = compute();\n println!("{}", result);\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.assignments).toEqual(w.assignments); + }); + + it('argFlows — parameter passed as argument', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'fn process(input: String) {\n transform(input);\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.argFlows).toEqual(w.argFlows); + }); + + it('mutations — push on mutable parameter', ({ skip }) 
=> { + if (!nativeHasDataflow) skip(); + const code = 'fn add_item(items: &mut Vec, item: i32) {\n items.push(item);\n}\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.mutations).toEqual(w.mutations); + }); + }); + + // ── Ruby ─────────────────────────────────────────────────────────────── + + describe('Ruby', () => { + const lang = 'ruby'; + const file = 'test.rb'; + + it('parameters — simple', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'def add(a, b)\n return a + b\nend\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.parameters).toEqual(w.parameters); + }); + + it('returns — explicit return', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'def double(x)\n return x * 2\nend\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.returns).toEqual(w.returns); + }); + + it('assignments — variable from method call', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'def main\n result = compute()\n return result\nend\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.assignments).toEqual(w.assignments); + }); + + it('argFlows — parameter passed as argument', ({ skip }) => { + if (!nativeHasDataflow) skip(); + const code = 'def process(input)\n transform(input)\nend\n'; + const w = normalizeWasm(wasmDataflow(code, file, lang)); + const n = nativeDataflow(code, file); + expect(n.argFlows).toEqual(w.argFlows); + }); + }); +}); diff --git a/tests/parsers/ast-all-langs.test.js b/tests/parsers/ast-all-langs.test.js index c07368e..2c4c372 100644 --- a/tests/parsers/ast-all-langs.test.js +++ b/tests/parsers/ast-all-langs.test.js @@ -201,6 +201,7 @@ public class Service { public async Task FetchAsync() { var result = await GetDataAsync(); string msg = "hello from csharp"; + 
var ex = new ArgumentNullException("x"); if (result == null) { throw new ArgumentNullException("result"); } diff --git a/tests/parsers/cfg-all-langs.test.js b/tests/parsers/cfg-all-langs.test.js new file mode 100644 index 0000000..0567944 --- /dev/null +++ b/tests/parsers/cfg-all-langs.test.js @@ -0,0 +1,461 @@ +/** + * Tests for native CFG extraction across all languages. + * + * 1. Verifies buildCFGData accepts native def.cfg for non-JS languages + * (tests the JS-side native path in buildCFGData). + * 2. When native engine is available, verifies each language extractor + * produces CFG data for function/method definitions. + * 3. Parity: compares native CFG block/edge counts against WASM buildFunctionCFG. + */ + +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import Database from 'better-sqlite3'; +import { afterAll, beforeAll, describe, expect, test } from 'vitest'; +import { buildCFGData, buildFunctionCFG } from '../../src/cfg.js'; +import { COMPLEXITY_RULES, findFunctionNode } from '../../src/complexity.js'; +import { initSchema } from '../../src/db.js'; +import { loadNative } from '../../src/native.js'; +import { createParsers, getParser, parseFilesAuto } from '../../src/parser.js'; + +// ─── Helpers ────────────────────────────────────────────────────────── + +function createTempDb() { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cfg-lang-')); + fs.mkdirSync(path.join(tmpDir, '.codegraph')); + const dbPath = path.join(tmpDir, '.codegraph', 'graph.db'); + const db = new Database(dbPath); + db.pragma('journal_mode = WAL'); + initSchema(db); + return { tmpDir, db }; +} + +// ─── JS-side: buildCFGData accepts native def.cfg ───────────────────── + +describe('buildCFGData — native CFG path', () => { + let tmpDir, db; + + beforeAll(() => { + ({ tmpDir, db } = createTempDb()); + }); + + afterAll(() => { + if (db) db.close(); + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + 
test('inserts native CFG data for a function with pre-computed cfg', async () => { + // Insert function node in DB + db.prepare('INSERT INTO nodes (name, kind, file, line, end_line) VALUES (?, ?, ?, ?, ?)').run( + 'processData', + 'function', + 'src/process.py', + 1, + 10, + ); + + const fileSymbols = new Map(); + fileSymbols.set('src/process.py', { + definitions: [ + { + name: 'processData', + kind: 'function', + line: 1, + endLine: 10, + cfg: { + blocks: [ + { index: 0, type: 'entry', startLine: null, endLine: null, label: null }, + { index: 1, type: 'exit', startLine: null, endLine: null, label: null }, + { index: 2, type: 'body', startLine: 2, endLine: 5, label: null }, + { index: 3, type: 'condition', startLine: 6, endLine: 6, label: 'if' }, + { index: 4, type: 'branch_true', startLine: 7, endLine: 8, label: 'then' }, + { index: 5, type: 'body', startLine: 9, endLine: 10, label: null }, + ], + edges: [ + { sourceIndex: 0, targetIndex: 2, kind: 'fallthrough' }, + { sourceIndex: 2, targetIndex: 3, kind: 'fallthrough' }, + { sourceIndex: 3, targetIndex: 4, kind: 'branch_true' }, + { sourceIndex: 3, targetIndex: 5, kind: 'branch_false' }, + { sourceIndex: 4, targetIndex: 5, kind: 'fallthrough' }, + { sourceIndex: 5, targetIndex: 1, kind: 'fallthrough' }, + ], + }, + }, + ], + calls: [], + _langId: 'python', + }); + + await buildCFGData(db, fileSymbols, tmpDir); + + const blocks = db.prepare('SELECT * FROM cfg_blocks ORDER BY block_index').all(); + expect(blocks.length).toBe(6); + expect(blocks[0].block_type).toBe('entry'); + expect(blocks[1].block_type).toBe('exit'); + expect(blocks[3].block_type).toBe('condition'); + + const edges = db.prepare('SELECT * FROM cfg_edges').all(); + expect(edges.length).toBe(6); + const edgeKinds = edges.map((e) => e.kind); + expect(edgeKinds).toContain('branch_true'); + expect(edgeKinds).toContain('branch_false'); + expect(edgeKinds).toContain('fallthrough'); + }); + + test('native CFG data does not require WASM tree', async () => { 
+ const { tmpDir: tmpDir2, db: db2 } = createTempDb(); + + db2 + .prepare('INSERT INTO nodes (name, kind, file, line, end_line) VALUES (?, ?, ?, ?, ?)') + .run('hello', 'function', 'src/hello.rb', 1, 5); + + const fileSymbols = new Map(); + fileSymbols.set('src/hello.rb', { + definitions: [ + { + name: 'hello', + kind: 'function', + line: 1, + endLine: 5, + cfg: { + blocks: [ + { index: 0, type: 'entry', startLine: null, endLine: null, label: null }, + { index: 1, type: 'exit', startLine: null, endLine: null, label: null }, + { index: 2, type: 'body', startLine: 2, endLine: 4, label: null }, + ], + edges: [ + { sourceIndex: 0, targetIndex: 2, kind: 'fallthrough' }, + { sourceIndex: 2, targetIndex: 1, kind: 'fallthrough' }, + ], + }, + }, + ], + calls: [], + // No _tree, no _langId — should still work with native CFG + _langId: 'ruby', + }); + + await buildCFGData(db2, fileSymbols, tmpDir2); + + const blocks = db2.prepare('SELECT * FROM cfg_blocks').all(); + expect(blocks.length).toBe(3); + + db2.close(); + fs.rmSync(tmpDir2, { recursive: true, force: true }); + }); +}); + +// ─── Native engine: multi-language CFG extraction + parity ───────────── + +const LANG_CFG_FIXTURES = { + 'fixture.js': ` +function processItems(items) { + if (items.length === 0) { + return []; + } + for (const item of items) { + console.log(item); + } + return items; +} +`, + 'fixture.py': ` +def process(data): + if not data: + raise ValueError("empty") + for item in data: + print(item) + return data +`, + 'fixture.go': ` +package main + +func process(items []string) []string { + if len(items) == 0 { + return nil + } + for _, item := range items { + println(item) + } + return items +} +`, + 'fixture.rs': ` +fn process(items: Vec) -> Vec { + if items.is_empty() { + return vec![]; + } + for item in &items { + println!("{}", item); + } + items +} +`, + 'fixture.java': ` +public class Processor { + public String[] process(String[] items) { + if (items.length == 0) { + return new String[0]; + } + 
for (String item : items) { + System.out.println(item); + } + return items; + } +} +`, + 'fixture.cs': ` +public class Processor { + public string[] Process(string[] items) { + if (items.Length == 0) { + return new string[0]; + } + foreach (var item in items) { + Console.WriteLine(item); + } + return items; + } +} +`, + 'fixture.rb': ` +class Processor + def process(items) + if items.empty? + return [] + end + items.each do |item| + puts item + end + items + end +end +`, + 'fixture.php': ` d.cfg?.blocks?.length > 0); + fs.rmSync(tmpCheck, { recursive: true, force: true }); + return hasCfg; + } catch { + return false; + } +} + +const canTestNativeCfg = nativeSupportsCfg(); + +describe.skipIf(!canTestNativeCfg)('native CFG — multi-language', () => { + let tmpDir; + const nativeResults = new Map(); + + beforeAll(async () => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cfg-multilang-')); + const srcDir = path.join(tmpDir, 'src'); + fs.mkdirSync(srcDir, { recursive: true }); + + const filePaths = []; + for (const [name, code] of Object.entries(LANG_CFG_FIXTURES)) { + const fp = path.join(srcDir, name); + fs.writeFileSync(fp, code); + filePaths.push(fp); + } + + const allSymbols = await parseFilesAuto(filePaths, tmpDir, { engine: 'native' }); + for (const [relPath, symbols] of allSymbols) { + nativeResults.set(relPath, symbols); + } + }); + + afterAll(() => { + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + const langTests = [ + { file: 'fixture.js', lang: 'JavaScript', funcPattern: /processItems/ }, + { file: 'fixture.py', lang: 'Python', funcPattern: /process/ }, + { file: 'fixture.go', lang: 'Go', funcPattern: /process/ }, + { file: 'fixture.rs', lang: 'Rust', funcPattern: /process/ }, + { file: 'fixture.java', lang: 'Java', funcPattern: /process/ }, + { file: 'fixture.cs', lang: 'C#', funcPattern: /Process/ }, + { file: 'fixture.rb', lang: 'Ruby', funcPattern: /process/ }, + { file: 'fixture.php', lang: 'PHP', funcPattern: 
/process/ }, + ]; + + for (const { file, lang, funcPattern } of langTests) { + test(`${lang}: native produces CFG data for function`, () => { + const relPath = `src/${file}`; + const symbols = nativeResults.get(relPath); + expect(symbols, `no symbols for ${relPath}`).toBeTruthy(); + + const funcDefs = symbols.definitions.filter( + (d) => (d.kind === 'function' || d.kind === 'method') && funcPattern.test(d.name), + ); + expect(funcDefs.length, `no function matching ${funcPattern} in ${relPath}`).toBeGreaterThan( + 0, + ); + + for (const def of funcDefs) { + expect(def.cfg, `no cfg on ${def.name}`).toBeTruthy(); + expect(def.cfg.blocks.length, `no blocks in cfg of ${def.name}`).toBeGreaterThan(0); + expect(def.cfg.edges.length, `no edges in cfg of ${def.name}`).toBeGreaterThan(0); + + // Entry and exit blocks should always be present + const blockTypes = def.cfg.blocks.map((b) => b.type); + expect(blockTypes).toContain('entry'); + expect(blockTypes).toContain('exit'); + + // At least one fallthrough edge + const edgeKinds = def.cfg.edges.map((e) => e.kind); + expect(edgeKinds).toContain('fallthrough'); + } + }); + } + + for (const { file, lang, funcPattern } of langTests) { + test(`${lang}: CFG has if-condition and for-loop blocks`, () => { + const relPath = `src/${file}`; + const symbols = nativeResults.get(relPath); + if (!symbols) return; + + const funcDefs = symbols.definitions.filter( + (d) => (d.kind === 'function' || d.kind === 'method') && funcPattern.test(d.name), + ); + if (funcDefs.length === 0) return; + + const def = funcDefs[0]; + const blockTypes = def.cfg.blocks.map((b) => b.type); + const edgeKinds = def.cfg.edges.map((e) => e.kind); + + // All fixtures have an if statement + expect(blockTypes).toContain('condition'); + expect(edgeKinds).toContain('branch_true'); + + // All fixtures have a for loop + expect(blockTypes).toContain('loop_header'); + expect(blockTypes).toContain('loop_body'); + }); + } +}); + +// ─── Parity: native vs WASM CFG 
────────────────────────────────────── + +describe.skipIf(!canTestNativeCfg)('native vs WASM CFG parity', () => { + let tmpDir; + const nativeResults = new Map(); + let parsers; + + const LANG_MAP = { + '.js': 'javascript', + '.py': 'python', + '.go': 'go', + '.rs': 'rust', + '.java': 'java', + '.cs': 'csharp', + '.rb': 'ruby', + '.php': 'php', + }; + + beforeAll(async () => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-cfg-parity-')); + const srcDir = path.join(tmpDir, 'src'); + fs.mkdirSync(srcDir, { recursive: true }); + + const filePaths = []; + for (const [name, code] of Object.entries(LANG_CFG_FIXTURES)) { + const fp = path.join(srcDir, name); + fs.writeFileSync(fp, code); + filePaths.push(fp); + } + + const allSymbols = await parseFilesAuto(filePaths, tmpDir, { engine: 'native' }); + for (const [relPath, symbols] of allSymbols) { + nativeResults.set(relPath, symbols); + } + + parsers = await createParsers(); + }); + + afterAll(() => { + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + const parityTests = [ + { file: 'fixture.js', ext: '.js', funcPattern: /processItems/ }, + { file: 'fixture.py', ext: '.py', funcPattern: /process/ }, + { file: 'fixture.java', ext: '.java', funcPattern: /process/ }, + { file: 'fixture.cs', ext: '.cs', funcPattern: /Process/ }, + { file: 'fixture.php', ext: '.php', funcPattern: /process/ }, + ]; + + for (const { file, ext, funcPattern } of parityTests) { + test(`parity: ${file} — native vs WASM block/edge counts match`, () => { + const relPath = `src/${file}`; + const symbols = nativeResults.get(relPath); + if (!symbols) return; + + const langId = LANG_MAP[ext]; + const complexityRules = COMPLEXITY_RULES.get(langId); + if (!complexityRules) return; + + // Parse with WASM + const absPath = path.join(tmpDir, relPath); + const parser = getParser(parsers, absPath); + if (!parser) return; + + const code = fs.readFileSync(absPath, 'utf-8'); + const tree = parser.parse(code); + if (!tree) 
return; + + const funcDefs = symbols.definitions.filter( + (d) => (d.kind === 'function' || d.kind === 'method') && funcPattern.test(d.name), + ); + + for (const def of funcDefs) { + if (!def.cfg?.blocks?.length) continue; + + const funcNode = findFunctionNode(tree.rootNode, def.line, def.endLine, complexityRules); + if (!funcNode) continue; + + const wasmCfg = buildFunctionCFG(funcNode, langId); + + // Block counts should match + expect(def.cfg.blocks.length).toBe(wasmCfg.blocks.length); + // Edge counts should match + expect(def.cfg.edges.length).toBe(wasmCfg.edges.length); + + // Block types should match (sorted for order independence) + const nativeTypes = def.cfg.blocks.map((b) => b.type).sort(); + const wasmTypes = wasmCfg.blocks.map((b) => b.type).sort(); + expect(nativeTypes).toEqual(wasmTypes); + + // Edge kinds should match (sorted) + const nativeKinds = def.cfg.edges.map((e) => e.kind).sort(); + const wasmKinds = wasmCfg.edges.map((e) => e.kind).sort(); + expect(nativeKinds).toEqual(wasmKinds); + } + }); + } +});