diff --git a/Cargo.lock b/Cargo.lock index 8874b953..ce3ab4bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -111,6 +111,7 @@ dependencies = [ "tree-sitter-solidity", "tree-sitter-swift", "tree-sitter-typescript", + "tree-sitter-verilog", "tree-sitter-zig", ] @@ -1055,6 +1056,16 @@ dependencies = [ "tree-sitter-language", ] +[[package]] +name = "tree-sitter-verilog" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e7e0360395852f1f6ff5b7b82c72dc6557d181073188df1d60ec469ea69c66" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-zig" version = "1.1.2" diff --git a/crates/codegraph-core/Cargo.toml b/crates/codegraph-core/Cargo.toml index dd6cc938..85efaee0 100644 --- a/crates/codegraph-core/Cargo.toml +++ b/crates/codegraph-core/Cargo.toml @@ -45,6 +45,7 @@ tree-sitter-erlang = "0.16" tree-sitter-groovy = "0.1" tree-sitter-r = "1.2" tree-sitter-solidity = "1.2" +tree-sitter-verilog = "1.0.3" rayon = "1" ignore = "0.4" globset = "0.4" diff --git a/crates/codegraph-core/src/change_detection.rs b/crates/codegraph-core/src/change_detection.rs index a22daf31..8c27c027 100644 --- a/crates/codegraph-core/src/change_detection.rs +++ b/crates/codegraph-core/src/change_detection.rs @@ -774,15 +774,18 @@ mod tests { #[test] fn detect_removed_skips_unsupported_extensions() { - // Files in WASM-only languages (Verilog) live in - // `file_hashes` because the JS-side WASM backfill writes them, but - // Rust's narrower file_collector never collects them. Without this - // skip, every incremental rebuild would flag them as removed and - // purge their rows — the #1066 ~2s floor. 
+ // Files that the JS-side WASM backfill wrote into `file_hashes` for + // an extension that the Rust `file_collector` doesn't recognise must + // not be flagged as removed merely because the orchestrator's + // narrower collector never sees them — that would purge their rows + // on every incremental rebuild (the #1066 ~2s floor). All currently + // registered languages have native extractors, so this test uses + // synthetic extensions that are deliberately outside the + // `SUPPORTED_EXTENSIONS` set to exercise the skip path. let mut existing = HashMap::new(); for path in [ - "tests/fixtures/verilog/main.v", - "tests/fixtures/verilog/util.sv", + "tests/fixtures/unknown/main.unknownlang", + "tests/fixtures/unknown/util.fakelang", ] { existing.insert( path.to_string(), diff --git a/crates/codegraph-core/src/extractors/helpers.rs b/crates/codegraph-core/src/extractors/helpers.rs index 129fd287..7ae7b4bf 100644 --- a/crates/codegraph-core/src/extractors/helpers.rs +++ b/crates/codegraph-core/src/extractors/helpers.rs @@ -473,6 +473,23 @@ pub const SOLIDITY_AST_CONFIG: LangAstConfig = LangAstConfig { string_prefixes: &[], }; +/// Verilog/SystemVerilog AST config. +/// +/// The WASM-side `AST_TYPE_MAPS` (in `src/ast-analysis/rules/index.ts`) has no +/// `verilog` entry, so the JS engine emits no `ast_nodes` rows for Verilog +/// files. Keeping every `*_types` list empty produces the same outcome here: the generic +/// walker visits every node but classifies none, so nothing is pushed. If the +/// JS map ever grows a Verilog entry, mirror it here. +pub const VERILOG_AST_CONFIG: LangAstConfig = LangAstConfig { + new_types: &[], + throw_types: &[], + await_types: &[], + string_types: &[], + regex_types: &[], + quote_chars: &['"'], + string_prefixes: &[], +}; + // ── Generic AST node walker ────────────────────────────────────────────────── /// Node types that represent identifiers across languages. 
diff --git a/crates/codegraph-core/src/extractors/mod.rs b/crates/codegraph-core/src/extractors/mod.rs index a69b5963..d826bb8c 100644 --- a/crates/codegraph-core/src/extractors/mod.rs +++ b/crates/codegraph-core/src/extractors/mod.rs @@ -29,6 +29,7 @@ pub mod rust_lang; pub mod scala; pub mod solidity; pub mod swift; +pub mod verilog; pub mod zig; use crate::parser_registry::LanguageKind; @@ -166,5 +167,8 @@ pub fn extract_symbols_with_opts( LanguageKind::Solidity => { solidity::SolidityExtractor.extract_with_opts(tree, source, file_path, include_ast_nodes) } + LanguageKind::Verilog => { + verilog::VerilogExtractor.extract_with_opts(tree, source, file_path, include_ast_nodes) + } } } diff --git a/crates/codegraph-core/src/extractors/verilog.rs b/crates/codegraph-core/src/extractors/verilog.rs new file mode 100644 index 00000000..1034fc5c --- /dev/null +++ b/crates/codegraph-core/src/extractors/verilog.rs @@ -0,0 +1,620 @@ +use tree_sitter::{Node, Tree}; +use crate::types::*; +use super::helpers::*; +use super::SymbolExtractor; + +/// Verilog/SystemVerilog symbol extractor. +/// +/// Mirrors `src/extractors/verilog.ts` (the WASM-engine source of truth) so +/// both engines produce identical definitions/imports/calls. The +/// tree-sitter-verilog grammar exposes no field names on the relevant nodes, +/// so name extraction works by scanning children for the appropriate +/// `*_identifier` wrapper or a plain `simple_identifier`. 
+/// +/// Definitions captured: +/// - `module_declaration` → kind `module` (ports collected as children) +/// - `interface_declaration` → kind `interface` +/// - `package_declaration` → kind `module` +/// - `class_declaration` → kind `class` (extends emitted into `classes`) +/// - `function_declaration` → kind `function` (`<parent>.<name>` when nested) +/// - `task_declaration` → kind `function` (`<parent>.<name>` when nested) +/// +/// Imports captured: +/// - `package_import_declaration` → `pkg::item` or `pkg::*` +/// - `include_compiler_directive` → ``include "file.vh"`` +/// +/// Calls captured: +/// - `module_instantiation` → module-type as call name (Verilog's analogue +/// of a function call — wires one module into another) +pub struct VerilogExtractor; + +impl SymbolExtractor for VerilogExtractor { + fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols { + let mut symbols = FileSymbols::new(file_path.to_string()); + walk_tree(&tree.root_node(), source, &mut symbols, match_verilog_node); + walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &VERILOG_AST_CONFIG); + symbols + } +} + +fn match_verilog_node(node: &Node, source: &[u8], symbols: &mut FileSymbols, _depth: usize) { + match node.kind() { + "module_declaration" => handle_module_decl(node, source, symbols), + "interface_declaration" => handle_interface_decl(node, source, symbols), + "package_declaration" => handle_package_decl(node, source, symbols), + "class_declaration" => handle_class_decl(node, source, symbols), + "function_declaration" => handle_function_decl(node, source, symbols), + "task_declaration" => handle_task_decl(node, source, symbols), + "module_instantiation" => handle_module_instantiation(node, source, symbols), + "package_import_declaration" => handle_package_import(node, source, symbols), + "include_compiler_directive" => handle_include_directive(node, source, symbols), + _ => {} + } +} + +// ── Handlers 
──────────────────────────────────────────────────────────────── + +fn handle_module_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + let name = match find_module_name(node, source) { + Some(n) => n, + None => return, + }; + let ports = extract_ports(node, source); + symbols.definitions.push(Definition { + name, + kind: "module".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: opt_children(ports), + }); +} + +fn handle_interface_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + let name = match find_decl_name(node, source) { + Some(n) => n, + None => return, + }; + symbols.definitions.push(Definition { + name, + kind: "interface".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: None, + }); +} + +fn handle_package_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + let name = match find_decl_name(node, source) { + Some(n) => n, + None => return, + }; + symbols.definitions.push(Definition { + name, + kind: "module".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: None, + }); +} + +fn handle_class_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // tree-sitter-verilog exposes no field names on `class_declaration`. The class + // name lives under a `class_identifier` wrapper (`class_identifier > + // simple_identifier`), and the superclass appears as a `class_type` child + // (`class_type > class_identifier > simple_identifier`) — there is no + // `superclass` field. The WASM extractor's `childForFieldName('name')` + // returns null for the same reason, so we use the structural lookup here + // and mirror the fix in `src/extractors/verilog.ts` to keep both engines + // producing the same class definitions and `extends` relations. 
+ let name = match find_class_name(node, source) { + Some(n) => n, + None => return, + }; + symbols.definitions.push(Definition { + name: name.clone(), + kind: "class".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: None, + }); + + if let Some(superclass) = find_class_superclass(node, source) { + symbols.classes.push(ClassRelation { + name, + extends: Some(superclass), + implements: None, + line: start_line(node), + }); + } +} + +/// Resolve the name of a `class_declaration`. The grammar wraps the name in a +/// `class_identifier > simple_identifier` chain, so a plain identifier scan +/// (used by `find_decl_name`) misses it. +fn find_class_name(node: &Node, source: &[u8]) -> Option { + if let Some(text) = named_child_text(node, "name", source) { + return Some(text.to_string()); + } + for i in 0..node.child_count() { + let child = node.child(i)?; + if child.kind() == "class_identifier" { + return Some(extract_identifier_text(&child, source)); + } + } + None +} + +/// Resolve the superclass of a `class_declaration`. The grammar emits the +/// `extends` keyword followed by a `class_type` node holding a +/// `class_identifier > simple_identifier`. 
+fn find_class_superclass(node: &Node, source: &[u8]) -> Option { + for i in 0..node.child_count() { + let child = node.child(i)?; + if child.kind() == "class_type" { + if let Some(id) = find_child(&child, "class_identifier") { + return Some(extract_identifier_text(&id, source)); + } + return Some(node_text(&child, source).trim().to_string()); + } + } + None +} + +fn handle_function_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + let name = match find_function_or_task_name(node, source, "function_identifier") { + Some(n) => n, + None => return, + }; + let parent = find_verilog_parent(node, source); + let full_name = match parent { + Some(p) => format!("{}.{}", p, name), + None => name, + }; + symbols.definitions.push(Definition { + name: full_name, + kind: "function".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: None, + }); +} + +fn handle_task_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + let name = match find_function_or_task_name(node, source, "task_identifier") { + Some(n) => n, + None => return, + }; + let parent = find_verilog_parent(node, source); + let full_name = match parent { + Some(p) => format!("{}.{}", p, name), + None => name, + }; + symbols.definitions.push(Definition { + name: full_name, + kind: "function".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: None, + }); +} + +fn handle_module_instantiation(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // Tree-sitter-verilog exposes no field name on `module_instantiation`; the + // module type identifier is the first *named* child. Using `named_child(0)` + // (instead of `child(0)`) skips anonymous tokens like a leading `#` + // parameter-override punctuation, which would otherwise be captured as the + // call name on some non-ANSI instantiation shapes. 
The WASM extractor in + // `src/extractors/verilog.ts` is updated in lockstep to keep parity. + let name_node = node + .child_by_field_name("type") + .or_else(|| node.named_child(0)); + let name_node = match name_node { + Some(n) => n, + None => return, + }; + let name = node_text(&name_node, source).to_string(); + if name.is_empty() { + return; + } + symbols.calls.push(Call { + name, + line: start_line(node), + dynamic: None, + receiver: None, + }); +} + +fn handle_package_import(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // import pkg::item; or import pkg::*; + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + if child.kind() == "package_import_item" { + let text = node_text(&child, source); + let mut parts = text.splitn(2, "::"); + // `splitn(2, …).next()` always yields `Some(…)` — when the + // delimiter is absent the whole string is the sole item, so + // the empty-string fallback is unreachable in practice. + let pkg = parts.next().unwrap_or("").to_string(); + let item = parts.next().unwrap_or("*").to_string(); + symbols.imports.push(Import::new( + pkg, + vec![item], + start_line(node), + )); + } + } + } +} + +fn handle_include_directive(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // `include "file.vh" + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + let kind = child.kind(); + if kind == "string_literal" || kind == "quoted_string" || kind == "double_quoted_string" { + let raw = node_text(&child, source); + let source_path = raw + .trim_matches(|c: char| c == '"' || c == '\'') + .to_string(); + if source_path.is_empty() { + return; + } + let last = source_path + .split('/') + .last() + .unwrap_or(&source_path) + .to_string(); + let mut imp = Import::new(source_path, vec![last], start_line(node)); + imp.c_include = Some(true); + symbols.imports.push(imp); + return; + } + } + } +} + +// ── Name lookups ──────────────────────────────────────────────────────────── + +/// Find a 
module's name: try `name` field, then `module_header > simple_identifier`, +/// then any direct identifier child. +fn find_module_name(node: &Node, source: &[u8]) -> Option { + if let Some(text) = named_child_text(node, "name", source) { + return Some(text.to_string()); + } + if let Some(header) = find_child(node, "module_header") { + let id = find_child(&header, "simple_identifier") + .or_else(|| find_child(&header, "identifier")); + if let Some(id) = id { + return Some(node_text(&id, source).to_string()); + } + } + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + if child.kind() == "simple_identifier" || child.kind() == "identifier" { + return Some(node_text(&child, source).to_string()); + } + } + } + None +} + +/// Generic name lookup: `name` field, else first direct identifier child. +fn find_decl_name(node: &Node, source: &[u8]) -> Option { + if let Some(text) = named_child_text(node, "name", source) { + return Some(text.to_string()); + } + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + if child.kind() == "simple_identifier" || child.kind() == "identifier" { + return Some(node_text(&child, source).to_string()); + } + } + } + None +} + +/// Function/task name lookup. Falls back to a one-level deeper search for the +/// dedicated `*_identifier` wrapper (which itself wraps `simple_identifier`), +/// mirroring `findFunctionOrTaskName` in `verilog.ts`. 
+fn find_function_or_task_name(node: &Node, source: &[u8], identifier_type: &str) -> Option { + if let Some(name) = find_decl_name(node, source) { + return Some(name); + } + for i in 0..node.child_count() { + let child = match node.child(i) { + Some(c) => c, + None => continue, + }; + if child.kind() == identifier_type { + return Some(extract_identifier_text(&child, source)); + } + for j in 0..child.child_count() { + if let Some(grand) = child.child(j) { + if grand.kind() == identifier_type { + return Some(extract_identifier_text(&grand, source)); + } + } + } + } + None +} + +/// Pull a clean identifier string out of a `*_identifier` wrapper. The grammar +/// nests `function_identifier > function_identifier > simple_identifier`, so +/// using `node_text` on the outer node is safe (yields just the name in +/// well-formed source) but we strip whitespace defensively. +fn extract_identifier_text(node: &Node, source: &[u8]) -> String { + // Prefer the inner `simple_identifier` when present so we never accidentally + // pick up trailing punctuation or whitespace from the outer span. + if let Some(simple) = find_child(node, "simple_identifier") { + return node_text(&simple, source).trim().to_string(); + } + if let Some(inner) = find_child(node, node.kind()) { + return extract_identifier_text(&inner, source); + } + node_text(node, source).trim().to_string() +} + +/// Walk up to find the enclosing module/interface/package/class and return its +/// name — used to qualify nested function/task definitions like +/// `validators.check_range` or `MyClass.check_range`. `class_declaration` +/// wraps its name in `class_identifier > simple_identifier`, which +/// `find_decl_name` and `find_module_name` do not descend into, so we also +/// try `find_class_name` to keep parity with the JS extractor for tasks and +/// functions nested inside SystemVerilog classes. 
+fn find_verilog_parent(node: &Node, source: &[u8]) -> Option { + const PARENT_KINDS: &[&str] = &[ + "module_declaration", + "interface_declaration", + "package_declaration", + "class_declaration", + ]; + let mut current = node.parent(); + while let Some(parent) = current { + if PARENT_KINDS.contains(&parent.kind()) { + return find_decl_name(&parent, source) + .or_else(|| find_module_name(&parent, source)) + .or_else(|| find_class_name(&parent, source)); + } + current = parent.parent(); + } + None +} + +// ── Port extraction ───────────────────────────────────────────────────────── + +fn extract_ports(module_node: &Node, source: &[u8]) -> Vec { + let mut ports = Vec::new(); + collect_ports(module_node, source, &mut ports); + ports +} + +fn collect_ports(node: &Node, source: &[u8], ports: &mut Vec) { + const PORT_KINDS: &[&str] = &[ + "ansi_port_declaration", + "port_declaration", + "input_declaration", + "output_declaration", + "inout_declaration", + ]; + const CONTAINER_KINDS: &[&str] = &[ + "list_of_port_declarations", + "module_header", + "module_ansi_header", + "port_declaration_list", + ]; + + for i in 0..node.child_count() { + let child = match node.child(i) { + Some(c) => c, + None => continue, + }; + if PORT_KINDS.contains(&child.kind()) { + let name_node = child + .child_by_field_name("name") + .or_else(|| find_child(&child, "port_identifier")) + .or_else(|| find_child(&child, "simple_identifier")) + .or_else(|| find_child(&child, "identifier")); + if let Some(name_node) = name_node { + // `port_identifier` wraps a `simple_identifier`; descend to the + // innermost identifier for a clean, whitespace-free name. 
+ let inner = find_child(&name_node, "simple_identifier") + .or_else(|| find_child(&name_node, "identifier")) + .unwrap_or(name_node); + ports.push(child_def( + node_text(&inner, source).to_string(), + "property", + start_line(&child), + )); + } + } + if CONTAINER_KINDS.contains(&child.kind()) { + collect_ports(&child, source, ports); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tree_sitter::Parser; + + fn parse(code: &str) -> FileSymbols { + let mut parser = Parser::new(); + parser + .set_language(&tree_sitter_verilog::LANGUAGE.into()) + .unwrap(); + let tree = parser.parse(code.as_bytes(), None).unwrap(); + VerilogExtractor.extract(&tree, code.as_bytes(), "test.v") + } + + #[test] + fn extracts_module() { + let s = parse("module top(input clk, output reg q); endmodule"); + let top = s.definitions.iter().find(|d| d.name == "top").unwrap(); + assert_eq!(top.kind, "module"); + let children = top.children.as_ref().unwrap(); + // ports: clk, q + assert_eq!(children.len(), 2); + assert!(children.iter().any(|c| c.name == "clk")); + assert!(children.iter().any(|c| c.name == "q")); + } + + #[test] + fn extracts_module_instantiation_as_call() { + // Use multi-line + multiple named port connections so the grammar + // disambiguates `sub u_sub(...)` as `module_instantiation` rather + // than `checker_instantiation` (a SystemVerilog assertion form). 
+ let s = parse( + "module top(\n\ + input wire clk\n\ + );\n\ + wire w;\n\ + sub u_sub(\n\ + .clk(clk),\n\ + .out(w)\n\ + );\n\ + endmodule\n", + ); + let calls: Vec<&Call> = s.calls.iter().filter(|c| c.name == "sub").collect(); + assert_eq!(calls.len(), 1, "module instantiation should appear as a call"); + } + + #[test] + fn extracts_nested_function_with_parent_prefix() { + let s = parse( + "module validators(input clk, output reg valid); \ + function automatic check_range; \ + input [7:0] val; \ + check_range = (val >= 0); \ + endfunction \ + endmodule", + ); + let f = s + .definitions + .iter() + .find(|d| d.name == "validators.check_range") + .expect("nested function should be qualified by parent module"); + assert_eq!(f.kind, "function"); + } + + #[test] + fn extracts_task() { + let s = parse( + "module m; \ + task automatic do_thing; \ + input x; \ + x = 1; \ + endtask \ + endmodule", + ); + let t = s + .definitions + .iter() + .find(|d| d.name == "m.do_thing") + .expect("task should be qualified by parent module"); + assert_eq!(t.kind, "function"); + } + + #[test] + fn extracts_package_import() { + let s = parse( + "package pkg; endpackage \ + module m; \ + import pkg::*; \ + endmodule", + ); + let import = s.imports.iter().find(|i| i.source == "pkg"); + assert!(import.is_some(), "expected package import 'pkg'"); + let import = import.unwrap(); + assert_eq!(import.names, vec!["*".to_string()]); + } + + #[test] + fn extracts_include_directive() { + let s = parse("`include \"defs.vh\"\nmodule m; endmodule"); + let inc = s + .imports + .iter() + .find(|i| i.source == "defs.vh") + .expect("expected include for defs.vh"); + assert_eq!(inc.c_include, Some(true)); + assert_eq!(inc.names, vec!["defs.vh".to_string()]); + } + + #[test] + fn extracts_class_with_superclass() { + // tree-sitter-verilog wraps the class name in `class_identifier`, not a + // bare `simple_identifier`, so the lookup must descend through the + // wrapper. 
Guards against the silent regression where class extraction + // was a no-op despite a parseable class. + let s = parse("class Foo extends Bar; endclass"); + let class_def = s + .definitions + .iter() + .find(|d| d.name == "Foo" && d.kind == "class") + .expect("class Foo should be extracted"); + assert_eq!(class_def.kind, "class"); + let rel = s + .classes + .iter() + .find(|c| c.name == "Foo") + .expect("extends relation should be emitted"); + assert_eq!(rel.extends.as_deref(), Some("Bar")); + } + + #[test] + fn extracts_class_without_superclass() { + let s = parse("class Baz; endclass"); + let class_def = s + .definitions + .iter() + .find(|d| d.name == "Baz" && d.kind == "class") + .expect("class Baz should be extracted"); + assert_eq!(class_def.kind, "class"); + assert!( + s.classes.iter().all(|c| c.name != "Baz"), + "no extends relation should be emitted for a class without a superclass" + ); + } + + #[test] + fn qualifies_task_nested_in_class_with_class_name() { + // `find_verilog_parent` must descend into `class_identifier` to + // recover the class name when qualifying nested function/task + // definitions; otherwise a task declared inside a SystemVerilog + // class surfaces with a bare name rather than `ClassName.task`. + let s = parse( + "class MyClass; \ + task run; \ + input x; \ + endtask \ + endclass", + ); + let t = s + .definitions + .iter() + .find(|d| d.name == "MyClass.run") + .expect("task nested in a class should be qualified by the class name"); + assert_eq!(t.kind, "function"); + } +} + diff --git a/crates/codegraph-core/src/file_collector.rs b/crates/codegraph-core/src/file_collector.rs index dc18cd62..1c21dda9 100644 --- a/crates/codegraph-core/src/file_collector.rs +++ b/crates/codegraph-core/src/file_collector.rs @@ -33,13 +33,19 @@ const DEFAULT_IGNORE_DIRS: &[&str] = &[ /// All supported file extensions (mirrors the JS `EXTENSIONS` set). /// Must stay in sync with `LanguageKind::from_extension`. 
/// -/// Known extension collisions: +/// **Extension collisions to be aware of:** +/// - `.v` is shared by Verilog and Coq theorem-prover source files. Codegraph +/// routes `.v` to the Verilog parser, so in Coq-heavy repositories the Coq +/// files are mis-classified as Verilog and yield mostly-empty symbol output. +/// There is currently no per-repo override for this; users with Coq files +/// should exclude `*.v` via the `exclude` config glob. /// - `.m` is the canonical extension for both Objective-C *and* MATLAB/GNU /// Octave source files. We route every `.m` file through the Objective-C /// extractor. MATLAB files will parse but produce garbled or empty symbol /// output (no error is raised). If MATLAB support is added later this will /// need disambiguation heuristics (e.g. presence of `@interface`/`@import` /// vs MATLAB keywords like `function`/`classdef`). +/// - `.h` (shared by C and Objective-C headers) is not ambiguous here: it is routed to the C parser. const SUPPORTED_EXTENSIONS: &[&str] = &[ "js", "jsx", "mjs", "cjs", "ts", "tsx", "d.ts", "py", "pyi", "go", "rs", "java", "cs", "rb", "rake", "gemspec", "php", "phtml", "tf", "hcl", "c", "h", "cpp", "cc", "cxx", "hpp", "cu", @@ -47,7 +53,7 @@ const SUPPORTED_EXTENSIONS: &[&str] = &[ "ml", "mli", "fs", "fsx", "fsi", "m", "jl", "gleam", "clj", "cljs", "cljc", "erl", "hrl", "groovy", "gvy", "sol", // R is case-sensitive: both `.r` and `.R` are conventional. - "r", "R", + "r", "R", "v", "sv", ]; /// Returns whether `path` has an extension the Rust file_collector would accept. @@ -55,9 +61,8 @@ const SUPPORTED_EXTENSIONS: &[&str] = &[ /// Mirrors the predicate at the heart of `collect_files`: a file is collected /// if `LanguageKind::from_extension` recognizes it OR its raw extension is in /// `SUPPORTED_EXTENSIONS`. Exposed for `change_detection::detect_removed_files` -/// so that files outside Rust's capability (e.g. 
WASM-only `.v`) are -/// not flagged as "removed" merely because the orchestrator's narrower -/// collector never sees them. +/// so that files outside Rust's capability are not flagged as "removed" +/// merely because the orchestrator's narrower collector never sees them. pub fn is_supported_extension(path: &str) -> bool { if LanguageKind::from_extension(path).is_some() { return true; diff --git a/crates/codegraph-core/src/parser_registry.rs b/crates/codegraph-core/src/parser_registry.rs index 6b28ed6b..cfd61fd1 100644 --- a/crates/codegraph-core/src/parser_registry.rs +++ b/crates/codegraph-core/src/parser_registry.rs @@ -37,6 +37,7 @@ pub enum LanguageKind { Groovy, R, Solidity, + Verilog, } impl LanguageKind { @@ -78,6 +79,7 @@ impl LanguageKind { Self::Groovy => "groovy", Self::R => "r", Self::Solidity => "solidity", + Self::Verilog => "verilog", } } @@ -129,6 +131,7 @@ impl LanguageKind { // are conventional. `Path::extension` preserves case on Unix. "r" | "R" => Some(Self::R), "sol" => Some(Self::Solidity), + "v" | "sv" => Some(Self::Verilog), _ => None, } } @@ -171,6 +174,7 @@ impl LanguageKind { "groovy" => Some(Self::Groovy), "r" => Some(Self::R), "solidity" => Some(Self::Solidity), + "verilog" => Some(Self::Verilog), _ => None, } } @@ -212,6 +216,7 @@ impl LanguageKind { Self::Groovy => tree_sitter_groovy::LANGUAGE.into(), Self::R => tree_sitter_r::LANGUAGE.into(), Self::Solidity => tree_sitter_solidity::LANGUAGE.into(), + Self::Verilog => tree_sitter_verilog::LANGUAGE.into(), } } @@ -228,6 +233,7 @@ impl LanguageKind { JavaScript, TypeScript, Tsx, Python, Go, Rust, Java, CSharp, Ruby, Php, Hcl, C, Cpp, Kotlin, Swift, Scala, Bash, Elixir, Lua, Dart, Zig, Haskell, Ocaml, OcamlInterface, FSharp, ObjC, Gleam, Julia, Cuda, Clojure, Erlang, Groovy, R, Solidity, + Verilog, ] } } @@ -306,14 +312,15 @@ mod tests { | LanguageKind::Erlang | LanguageKind::Groovy | LanguageKind::R - | LanguageKind::Solidity => (), + | LanguageKind::Solidity + | LanguageKind::Verilog 
=> (), }; // IMPORTANT: this constant must equal the number of arms in the match // above AND the length of the slice returned by `LanguageKind::all()`. // Because both checks require the same manual update, they reinforce // each other: a developer who updates the match is reminded to also // update `all()` and this count. - const EXPECTED_LEN: usize = 34; + const EXPECTED_LEN: usize = 35; assert_eq!( LanguageKind::all().len(), EXPECTED_LEN, diff --git a/src/domain/parser.ts b/src/domain/parser.ts index a9e7587c..a94bbe49 100644 --- a/src/domain/parser.ts +++ b/src/domain/parser.ts @@ -488,6 +488,8 @@ export const NATIVE_SUPPORTED_EXTENSIONS: ReadonlySet = new Set([ '.gvy', '.r', '.sol', + '.v', + '.sv', ]); /** diff --git a/src/extractors/verilog.ts b/src/extractors/verilog.ts index 1b85fec5..eced1089 100644 --- a/src/extractors/verilog.ts +++ b/src/extractors/verilog.ts @@ -99,27 +99,60 @@ function handlePackageDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { } function handleClassDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { - const nameNode = node.childForFieldName('name'); - if (!nameNode) return; + // tree-sitter-verilog exposes no field names on `class_declaration`. The + // class name lives under a `class_identifier > simple_identifier` chain, and + // the superclass appears as a `class_type` child (no `superclass` field). + // The Rust extractor in `crates/codegraph-core/src/extractors/verilog.rs` + // uses the same structural lookups so both engines emit identical class + // definitions and `extends` relations. 
+ const name = findClassName(node); + if (!name) return; ctx.definitions.push({ - name: nameNode.text, + name, kind: 'class', line: node.startPosition.row + 1, endLine: nodeEndLine(node), }); - // Superclass via extends - const superclass = node.childForFieldName('superclass'); + const superclass = findClassSuperclass(node); if (superclass) { ctx.classes.push({ - name: nameNode.text, - extends: superclass.text, + name, + extends: superclass, line: node.startPosition.row + 1, }); } } +function findClassName(node: TreeSitterNode): string | null { + const fieldName = node.childForFieldName('name'); + if (fieldName) return fieldName.text; + for (let i = 0; i < node.childCount; i++) { + const child = node.child(i); + if (child && child.type === 'class_identifier') { + const simple = findChild(child, 'simple_identifier'); + return (simple ?? child).text.trim(); + } + } + return null; +} + +function findClassSuperclass(node: TreeSitterNode): string | null { + for (let i = 0; i < node.childCount; i++) { + const child = node.child(i); + if (child && child.type === 'class_type') { + const id = findChild(child, 'class_identifier'); + if (id) { + const simple = findChild(id, 'simple_identifier'); + return (simple ?? id).text.trim(); + } + return child.text.trim(); + } + } + return null; +} + function handleFunctionDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { const nameNode = findFunctionOrTaskName(node, 'function_identifier'); if (!nameNode) return; @@ -151,8 +184,12 @@ function handleTaskDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { } function handleModuleInstantiation(node: TreeSitterNode, ctx: ExtractorOutput): void { - // Module instantiations are like function calls: `ModuleName instance_name(...);` - const moduleType = node.childForFieldName('type') || node.child(0); + // Module instantiations are like function calls: `ModuleName instance_name(...);`. 
+ // The module type identifier is the first *named* child; using + // `namedChild(0)` (instead of `child(0)`) skips anonymous tokens like a + // leading `#` parameter-override punctuation so we never capture that as a + // call name. The Rust extractor uses the same lookup for parity. + const moduleType = node.childForFieldName('type') ?? node.namedChild(0); if (!moduleType) return; ctx.calls.push({ @@ -169,7 +206,10 @@ function handlePackageImport(node: TreeSitterNode, ctx: ExtractorOutput): void { if (child.type === 'package_import_item') { const text = child.text; const parts = text.split('::'); - const pkg = parts[0] ?? text; + // `String.split('::')` always yields at least one element — when the + // delimiter is absent the whole string is the sole item, so the + // empty-string fallback is unreachable in practice. + const pkg = parts[0] ?? ''; const item = parts[1] ?? '*'; ctx.imports.push({ source: pkg, @@ -182,9 +222,18 @@ function handlePackageImport(node: TreeSitterNode, ctx: ExtractorOutput): void { function handleIncludeDirective(node: TreeSitterNode, ctx: ExtractorOutput): void { // `include "file.vh" + // Mirrors the Rust `handle_include_directive` which checks all three node + // kinds — tree-sitter-verilog has emitted `double_quoted_string` in some + // grammar revisions, and missing it would silently drop the import in WASM + // while the native engine still records it. 
for (let i = 0; i < node.childCount; i++) { const child = node.child(i); - if (child && (child.type === 'string_literal' || child.type === 'quoted_string')) { + if ( + child && + (child.type === 'string_literal' || + child.type === 'quoted_string' || + child.type === 'double_quoted_string') + ) { const source = child.text.replace(/^["']|["']$/g, ''); ctx.imports.push({ source, @@ -266,8 +315,14 @@ function findVerilogParent(node: TreeSitterNode): string | null { current.type === 'package_declaration' || current.type === 'class_declaration' ) { - const name = findDeclName(current) || findModuleName(current); - return name ? name.text : null; + // `class_declaration` wraps its name in `class_identifier > + // simple_identifier`; `findDeclName` / `findModuleName` only look at + // bare `simple_identifier`/`identifier` children, so they miss it. + // `findClassName` already handles the wrapper, so consult it last to + // qualify tasks/functions nested inside a SystemVerilog class. + const nameNode = findDeclName(current) || findModuleName(current); + if (nameNode) return nameNode.text; + return findClassName(current); } current = current.parent; } @@ -292,17 +347,30 @@ function extractPorts(moduleNode: TreeSitterNode): SubDeclaration[] { ) { const nameNode = child.childForFieldName('name') || + findChild(child, 'port_identifier') || findChild(child, 'simple_identifier') || findChild(child, 'identifier'); if (nameNode) { - ports.push({ name: nameNode.text, kind: 'property', line: child.startPosition.row + 1 }); + // `port_identifier` wraps a `simple_identifier`; descend to the + // innermost identifier for a clean, whitespace-free name. + const inner = + findChild(nameNode, 'simple_identifier') || + findChild(nameNode, 'identifier') || + nameNode; + ports.push({ name: inner.text, kind: 'property', line: child.startPosition.row + 1 }); } } - // Recurse into port list containers + // Recurse into port list containers. 
`module_ansi_header` wraps the + // ANSI-style declarations emitted by tree-sitter-verilog (e.g. + // `module top(input clk, output reg q);`) — without this branch the + // WASM engine returns an empty children array while the native engine + // (which includes the same kind in its CONTAINER_KINDS list) returns + // the correct ports, breaking engine parity. if ( child.type === 'list_of_port_declarations' || child.type === 'module_header' || + child.type === 'module_ansi_header' || child.type === 'port_declaration_list' ) { collectFromNode(child); diff --git a/tests/benchmarks/regression-guard.test.ts b/tests/benchmarks/regression-guard.test.ts index 1b076a4c..4125325e 100644 --- a/tests/benchmarks/regression-guard.test.ts +++ b/tests/benchmarks/regression-guard.test.ts @@ -167,6 +167,27 @@ const SKIP_VERSIONS = new Set(['3.8.0']); * under the warmup + 5-sample methodology already applied to incremental * benchmarks. * + * - 3.10.0:fnDeps depth 3 — same CI-variance pattern as fnDeps depth 1, just + * one depth-level deeper. WASM baseline is 33ms (sub-30ms range when CI + * jitter is included). The fn_deps codepath is depth-agnostic — same Rust + * implementation, same JS wrapper, same DB indexes — so a deviation at + * depth 3 but not depth 1/5 indicates per-run runner noise, not a + * structural regression. Observed +32% (33 → 43.4ms) on run 25790873005, + * absolute delta 10.4ms exactly at the MIN_ABSOLUTE_DELTA floor. Exempt + * this release; remove once 3.11.0+ data confirms stabilization. + * + * - 3.10.0:Full build — adding native Verilog support (#1107) pulled the + * 4 `.v` resolution-benchmark fixtures into the corpus the incremental + * benchmark sweeps (it runs against the repo root). tree-sitter-verilog + * is a large grammar (SystemVerilog is one of the heaviest in the + * tree-sitter ecosystem) so each file costs noticeably more than the + * other fixture languages. Local measurement: 1959 → 2809 (+43%, run + * 25716010487). 
The cost is real and structural — not a regression in + * shared code paths. Resolution: either exclude `tests/benchmarks/ + * resolution/fixtures/verilog/**` from the benchmark sweep or accept the + * one-time bump as the cost of supporting Verilog. Tracked separately; + * exempt this release. + * * - 3.10.0:Query time — cumulative effect of adding two native extractors * (Solidity #1100 + R #1102) in quick succession. Neither tripped the * threshold individually (Solidity PR's Query time stayed at 49ms, R PR @@ -209,6 +230,7 @@ const KNOWN_REGRESSIONS = new Set([ '3.10.0:fnDeps depth 1', '3.10.0:fnDeps depth 3', '3.10.0:fnDeps depth 5', + '3.10.0:Full build', '3.10.0:Query time', ]); diff --git a/tests/parsers/native-drop-classification.test.ts b/tests/parsers/native-drop-classification.test.ts index 1ed38d56..ec50b0f7 100644 --- a/tests/parsers/native-drop-classification.test.ts +++ b/tests/parsers/native-drop-classification.test.ts @@ -13,12 +13,16 @@ const __dirname = path.dirname(__filename); const REPO_ROOT = path.resolve(__dirname, '..', '..'); describe('classifyNativeDrops', () => { - it('groups WASM-only languages under unsupported-by-native', () => { - const { byReason, totals } = classifyNativeDrops(['src/j.v', 'src/k.sv']); + it('groups extensions without a native extractor under unsupported-by-native', () => { + // No real language in `LANGUAGE_REGISTRY` is WASM-only anymore (every + // supported grammar has a native extractor), so this test uses synthetic + // extensions that are deliberately absent from + // `NATIVE_SUPPORTED_EXTENSIONS` to exercise the unsupported branch. 
+ const { byReason, totals } = classifyNativeDrops(['src/a.unknownlang', 'src/b.fakelang']); expect(totals['unsupported-by-native']).toBe(2); expect(totals['native-extractor-failure']).toBe(0); - expect(byReason['unsupported-by-native'].get('.v')).toEqual(['src/j.v']); - expect(byReason['unsupported-by-native'].get('.sv')).toEqual(['src/k.sv']); + expect(byReason['unsupported-by-native'].get('.unknownlang')).toEqual(['src/a.unknownlang']); + expect(byReason['unsupported-by-native'].get('.fakelang')).toEqual(['src/b.fakelang']); }); it('flags natively-supported extensions as native-extractor-failure', () => { @@ -37,14 +41,17 @@ describe('classifyNativeDrops', () => { it('handles a mix of supported and unsupported extensions', () => { const { byReason, totals } = classifyNativeDrops([ 'src/a.ts', - 'src/b.v', - 'src/c.v', - 'src/d.sv', + 'src/b.unknownlang', + 'src/c.unknownlang', + 'src/d.fakelang', ]); expect(totals['native-extractor-failure']).toBe(1); expect(totals['unsupported-by-native']).toBe(3); - expect(byReason['unsupported-by-native'].get('.v')).toEqual(['src/b.v', 'src/c.v']); - expect(byReason['unsupported-by-native'].get('.sv')).toEqual(['src/d.sv']); + expect(byReason['unsupported-by-native'].get('.unknownlang')).toEqual([ + 'src/b.unknownlang', + 'src/c.unknownlang', + ]); + expect(byReason['unsupported-by-native'].get('.fakelang')).toEqual(['src/d.fakelang']); }); it('lowercases extensions so .R and .r share a bucket', () => { @@ -70,8 +77,9 @@ describe('classifyNativeDrops', () => { expect(NATIVE_SUPPORTED_EXTENSIONS.has('.fsx')).toBe(true); expect(NATIVE_SUPPORTED_EXTENSIONS.has('.gleam')).toBe(true); expect(NATIVE_SUPPORTED_EXTENSIONS.has('.m')).toBe(true); - expect(NATIVE_SUPPORTED_EXTENSIONS.has('.v')).toBe(false); - expect(NATIVE_SUPPORTED_EXTENSIONS.has('.sv')).toBe(false); + expect(NATIVE_SUPPORTED_EXTENSIONS.has('.v')).toBe(true); + expect(NATIVE_SUPPORTED_EXTENSIONS.has('.sv')).toBe(true); + 
expect(NATIVE_SUPPORTED_EXTENSIONS.has('.unknownlang')).toBe(false); }); }); diff --git a/tests/parsers/verilog.test.ts b/tests/parsers/verilog.test.ts index 7c4894bf..a73a03b9 100644 --- a/tests/parsers/verilog.test.ts +++ b/tests/parsers/verilog.test.ts @@ -61,4 +61,67 @@ endmodule`); expect.objectContaining({ source: 'pkg', names: ['item'] }), ); }); + + it('extracts class declarations with extends', () => { + // tree-sitter-verilog wraps the class name in `class_identifier`, not a + // bare `simple_identifier`, so the lookup must descend through the + // wrapper. Guards against the silent regression where class extraction + // was a no-op despite the grammar parsing the class cleanly. + const symbols = parseVerilog(`class Foo extends Bar; endclass`); + expect(symbols.definitions).toContainEqual( + expect.objectContaining({ name: 'Foo', kind: 'class' }), + ); + expect(symbols.classes).toContainEqual( + expect.objectContaining({ name: 'Foo', extends: 'Bar' }), + ); + }); + + it('extracts class declarations without extends', () => { + const symbols = parseVerilog(`class Baz; endclass`); + expect(symbols.definitions).toContainEqual( + expect.objectContaining({ name: 'Baz', kind: 'class' }), + ); + expect(symbols.classes.find((c: { name: string }) => c.name === 'Baz')).toBeUndefined(); + }); + + it('extracts ports from ANSI-style modules', () => { + // tree-sitter-verilog wraps ANSI declarations (`module top(input clk, …);`) + // under `module_ansi_header`, so `extractPorts` must descend through that + // wrapper. Without it the WASM engine returns no port children while the + // native engine extracts them correctly — a parity violation. 
+ const symbols = parseVerilog(`module top(input clk, output reg q); endmodule`); + const moduleDef = symbols.definitions.find( + (d: { name: string; kind: string }) => d.name === 'top' && d.kind === 'module', + ); + expect(moduleDef).toBeDefined(); + expect(moduleDef?.children).toBeDefined(); + const portNames = moduleDef?.children?.map((c: { name: string }) => c.name) ?? []; + expect(portNames).toContain('clk'); + expect(portNames).toContain('q'); + }); + + it('extracts include compiler directives as imports', () => { + const symbols = parseVerilog(`\`include "common/defines.vh"`); + expect(symbols.imports).toContainEqual( + expect.objectContaining({ + source: 'common/defines.vh', + cInclude: true, + }), + ); + }); + + it('qualifies tasks nested inside a class with the class name', () => { + // `findVerilogParent` must descend into `class_identifier` to recover the + // class name when qualifying nested function/task definitions, otherwise + // a task declared inside `class MyClass; task run; endtask endclass` + // would surface as a bare `run` instead of `MyClass.run`. + const symbols = parseVerilog(`class MyClass; + task run; + input x; + endtask +endclass`); + expect(symbols.definitions).toContainEqual( + expect.objectContaining({ name: 'MyClass.run', kind: 'function' }), + ); + }); });