diff --git a/Makefile.cbm b/Makefile.cbm index b3bb4a8c..ae468618 100644 --- a/Makefile.cbm +++ b/Makefile.cbm @@ -217,8 +217,8 @@ MIMALLOC_CFLAGS_TEST = -std=c11 -g -O1 -w \ # sqlite3 (vendored amalgamation — compiled ourselves for ASan instrumentation) SQLITE3_SRC = vendored/sqlite3/sqlite3.c -SQLITE3_CFLAGS = -std=c11 -O2 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 -SQLITE3_CFLAGS_TEST = -std=c11 -g -O1 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 +SQLITE3_CFLAGS = -std=c11 -O2 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 -DSQLITE_ENABLE_FTS5 +SQLITE3_CFLAGS_TEST = -std=c11 -g -O1 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 -DSQLITE_ENABLE_FTS5 # TRE regex (vendored, Windows only — POSIX uses system ) TRE_SRC = vendored/tre/tre_all.c diff --git a/internal/cbm/extract_calls.c b/internal/cbm/extract_calls.c index 87bfd005..d9c38c9c 100644 --- a/internal/cbm/extract_calls.c +++ b/internal/cbm/extract_calls.c @@ -344,4 +344,101 @@ void handle_calls(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec, Walk } } } + + // C# delegate/event patterns + if (ctx->language == CBM_LANG_CSHARP) { + // Fix 1: event += MethodName (bare method reference subscription) + // Creates a CALLS edge from the subscribing method to the handler method. + // e.g. _socket.OnConnected += SocketOnConnected; + if (strcmp(kind, "assignment_expression") == 0) { + TSNode op = ts_node_child_by_field_name(node, "operator", 8); + if (!ts_node_is_null(op)) { + char *op_text = cbm_node_text(ctx->arena, op, ctx->source); + if (op_text && strcmp(op_text, "+=") == 0) { + TSNode right = ts_node_child_by_field_name(node, "right", 5); + if (!ts_node_is_null(right)) { + const char *rk = ts_node_type(right); + if (strcmp(rk, "identifier") == 0 || + strcmp(rk, "member_access_expression") == 0) { + char *callee = cbm_node_text(ctx->arena, right, ctx->source); + if (callee && callee[0] && !cbm_is_keyword(callee, ctx->language)) { + CBMCall call; + call.callee_name = callee; + call.enclosing_func_qn = state->enclosing_func_qn; + cbm_calls_push(&ctx->result->calls, ctx->arena, call); + } + } + } + } + } + } + + // Fix 2: delegate?.Invoke() → resolve to receiver (delegate) name. + // C# delegates are invoked via .Invoke() or ?.Invoke() — the callee name + // "Invoke" resolves to nothing. Instead, extract the receiver (delegate property) + // name, which is more likely to match a registered symbol. + // e.g. OnConnected?.Invoke(this, e) → creates CALLS edge to "OnConnected" + // + // C# tree-sitter AST for "OnConnected?.Invoke(this, e)": + // invocation_expression + // function: conditional_access_expression + // expression: identifier "OnConnected" ← receiver + // member_binding_expression + // name: identifier "Invoke" ← method + // arguments: argument_list + if (cbm_kind_in_set(node, spec->call_node_types)) { + TSNode func_node2 = ts_node_child_by_field_name(node, "function", 8); + if (!ts_node_is_null(func_node2)) { + const char *fk2 = ts_node_type(func_node2); + bool is_invoke = false; + TSNode receiver2 = {0}; // NOLINT + + if (strcmp(fk2, "conditional_access_expression") == 0) { + // ?. access: look for member_binding_expression child + uint32_t ncc = ts_node_named_child_count(func_node2); + for (uint32_t ci = 0; ci < ncc; ci++) { + TSNode child = ts_node_named_child(func_node2, ci); + const char *ck = ts_node_type(child); + if (strcmp(ck, "member_binding_expression") == 0) { + TSNode name_n = ts_node_child_by_field_name(child, "name", 4); + if (!ts_node_is_null(name_n)) { + char *nm = cbm_node_text(ctx->arena, name_n, ctx->source); + if (nm && strcmp(nm, "Invoke") == 0) { + is_invoke = true; + } + } + } + if (strcmp(ck, "identifier") == 0 || + strcmp(ck, "member_access_expression") == 0) { + receiver2 = child; + } + } + } else if (strcmp(fk2, "member_access_expression") == 0) { + // Dot access: obj.Invoke(...) + TSNode name_n = ts_node_child_by_field_name(func_node2, "name", 4); + if (!ts_node_is_null(name_n)) { + char *nm = cbm_node_text(ctx->arena, name_n, ctx->source); + if (nm && strcmp(nm, "Invoke") == 0) { + is_invoke = true; + TSNode expr = ts_node_child_by_field_name(func_node2, + "expression", 10); + if (!ts_node_is_null(expr)) { + receiver2 = expr; + } + } + } + } + + if (is_invoke && !ts_node_is_null(receiver2)) { + char *recv = cbm_node_text(ctx->arena, receiver2, ctx->source); + if (recv && recv[0] && !cbm_is_keyword(recv, ctx->language)) { + CBMCall call; + call.callee_name = recv; + call.enclosing_func_qn = state->enclosing_func_qn; + cbm_calls_push(&ctx->result->calls, ctx->arena, call); + } + } + } + } + } } diff --git a/internal/cbm/extract_defs.c b/internal/cbm/extract_defs.c index 754a98f7..51c43978 100644 --- a/internal/cbm/extract_defs.c +++ b/internal/cbm/extract_defs.c @@ -5,6 +5,7 @@ #include "tree_sitter/api.h" // TSNode, ts_node_* #include // uint32_t #include +#include /* strcasecmp */ #include // Field name lengths for ts_node_child_by_field_name() calls. @@ -565,10 +566,58 @@ static const char **extract_base_classes(CBMArena *a, TSNode node, const char *s } } } - // C/C++ specific: handle base_class_clause (contains access specifiers + type names) + // C# specific: handle base_list node (contains base types separated by commas) { uint32_t count = ts_node_child_count(node); for (uint32_t i = 0; i < count; i++) { + TSNode child = ts_node_child(node, i); + if (strcmp(ts_node_type(child), "base_list") == 0) { + const char *bases[16]; + int base_count = 0; + uint32_t bnc = ts_node_named_child_count(child); + for (uint32_t bi = 0; bi < bnc && base_count < MAX_BASES_MINUS_1; bi++) { + TSNode bc = ts_node_named_child(child, bi); + const char *bk = ts_node_type(bc); + // C# base types can be: identifier, generic_name, qualified_name, + // or wrapped in a simple_base_type / primary_constructor_base_type + char *text = NULL; + if (strcmp(bk, "identifier") == 0 || strcmp(bk, "generic_name") == 0 || + strcmp(bk, "qualified_name") == 0) { + text = cbm_node_text(a, bc, source); + } else { + // For wrapper nodes (simple_base_type etc.), extract the first + // named child which should be the type identifier + TSNode inner = ts_node_named_child(bc, 0); + if (!ts_node_is_null(inner)) { + text = cbm_node_text(a, inner, source); + } + } + if (text && text[0]) { + // Strip generic args for resolution: "List" → "List" + char *angle = strchr(text, '<'); + if (angle) *angle = '\0'; + bases[base_count++] = text; + } + } + if (base_count > 0) { + const char **result = + (const char **)cbm_arena_alloc(a, (base_count + 1) * sizeof(const char *)); + if (result) { + for (int j = 0; j < base_count; j++) { + result[j] = bases[j]; + } + result[base_count] = NULL; + return result; + } + } + } + } + } + + // C/C++ specific: handle base_class_clause (contains access specifiers + type names) + { + uint32_t count2 = ts_node_child_count(node); + for (uint32_t i = 0; i < count2; i++) { TSNode child = ts_node_child(node, i); if (strcmp(ts_node_type(child), "base_class_clause") == 0) { // Extract type identifiers from base_class_clause, skipping access specifiers @@ -1136,11 +1185,82 @@ static void extract_func_def(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec } } - // main is always an entry point - if (strcmp(name, "main") == 0) { + // main/Main is always an entry point (case-insensitive for C#/Java) + if (strcasecmp(name, "main") == 0) { def.is_entry_point = true; } + // C/C++ entry point detection: WinMain, DllMain, GTest, MFC + if ((ctx->language == CBM_LANG_C || ctx->language == CBM_LANG_CPP) && !def.is_entry_point) { + if (strcmp(name, "WinMain") == 0 || strcmp(name, "wWinMain") == 0 || + strcmp(name, "DllMain") == 0 || strcmp(name, "wmain") == 0 || + strcmp(name, "_tmain") == 0 || strcmp(name, "InitInstance") == 0 || + strcmp(name, "OnInitDialog") == 0) { + def.is_entry_point = true; + } + } + + // C# entry point detection: Windows Service lifecycle, ASP.NET controllers + if (ctx->language == CBM_LANG_CSHARP && !def.is_entry_point) { + // Windows Service lifecycle entry points + if (strcmp(name, "OnStart") == 0 || strcmp(name, "OnStartImpl") == 0 || + strcmp(name, "OnStop") == 0 || strcmp(name, "OnStopImpl") == 0 || + strcmp(name, "Run") == 0 || strcmp(name, "Execute") == 0 || + strcmp(name, "Configure") == 0 || strcmp(name, "ConfigureServices") == 0) { + def.is_entry_point = true; + } + // ASP.NET controller decorators: [HttpGet], [HttpPost], [Route], etc. + if (!def.is_entry_point && def.decorators) { + for (const char **d = def.decorators; *d; d++) { + if (strstr(*d, "HttpGet") || strstr(*d, "HttpPost") || + strstr(*d, "HttpPut") || strstr(*d, "HttpDelete") || + strstr(*d, "HttpPatch") || strstr(*d, "Route") || + strstr(*d, "ApiController") || strstr(*d, "Authorize")) { + def.is_entry_point = true; + break; + } + } + } + // Test entry points: [TestMethod], [Fact], [Test], [SetUp] + if (!def.is_entry_point && def.decorators) { + for (const char **d = def.decorators; *d; d++) { + if (strstr(*d, "TestMethod") || strstr(*d, "Fact") || + strstr(*d, "Test") || strstr(*d, "SetUp") || + strstr(*d, "TestInitialize")) { + def.is_entry_point = true; + break; + } + } + } + } + + // Java entry point detection: Spring Boot, Vert.x, JAX-RS, JUnit + if (ctx->language == CBM_LANG_JAVA && !def.is_entry_point) { + // Vert.x lifecycle and common server patterns + if (strcmp(name, "start") == 0 || strcmp(name, "configure") == 0 || + strcmp(name, "init") == 0 || strcmp(name, "run") == 0 || + strcmp(name, "handle") == 0) { + def.is_entry_point = true; + } + // Spring/JAX-RS/JUnit decorators + if (!def.is_entry_point && def.decorators) { + for (const char **d = def.decorators; *d; d++) { + if (strstr(*d, "RequestMapping") || strstr(*d, "GetMapping") || + strstr(*d, "PostMapping") || strstr(*d, "PutMapping") || + strstr(*d, "DeleteMapping") || strstr(*d, "PatchMapping") || + strstr(*d, "Endpoint") || strstr(*d, "EventHandler") || + strstr(*d, "Scheduled") || strstr(*d, "Bean") || + strstr(*d, "Override") || strstr(*d, "Test") || + strstr(*d, "GET") || strstr(*d, "POST") || + strstr(*d, "PUT") || strstr(*d, "DELETE") || + strstr(*d, "Path") || strstr(*d, "Consumes")) { + def.is_entry_point = true; + break; + } + } + } + } + cbm_defs_push(&ctx->result->defs, a, def); } @@ -1610,6 +1730,68 @@ static void push_method_def(CBMExtractCtx *ctx, TSNode child, const char *class_ def.complexity = cbm_count_branching(child, spec->branching_node_types); } + // Entry point detection for class methods (same rules as extract_func_def) + // Case-insensitive "main" check + if (strcasecmp(name, "main") == 0) { + def.is_entry_point = true; + } + + // C/C++ entry point detection: WinMain, DllMain, GTest, MFC + if ((ctx->language == CBM_LANG_C || ctx->language == CBM_LANG_CPP) && !def.is_entry_point) { + if (strcmp(name, "WinMain") == 0 || strcmp(name, "wWinMain") == 0 || + strcmp(name, "DllMain") == 0 || strcmp(name, "wmain") == 0 || + strcmp(name, "_tmain") == 0 || strcmp(name, "InitInstance") == 0 || + strcmp(name, "OnInitDialog") == 0) { + def.is_entry_point = true; + } + } + + // C# entry point detection: Windows Service lifecycle, ASP.NET controllers + if (ctx->language == CBM_LANG_CSHARP && !def.is_entry_point) { + if (strcmp(name, "OnStart") == 0 || strcmp(name, "OnStartImpl") == 0 || + strcmp(name, "OnStop") == 0 || strcmp(name, "OnStopImpl") == 0 || + strcmp(name, "Run") == 0 || strcmp(name, "Execute") == 0 || + strcmp(name, "Configure") == 0 || strcmp(name, "ConfigureServices") == 0) { + def.is_entry_point = true; + } + if (!def.is_entry_point && def.decorators) { + for (const char **d = def.decorators; *d; d++) { + if (strstr(*d, "HttpGet") || strstr(*d, "HttpPost") || + strstr(*d, "HttpPut") || strstr(*d, "HttpDelete") || + strstr(*d, "HttpPatch") || strstr(*d, "Route") || + strstr(*d, "ApiController") || strstr(*d, "Authorize")) { + def.is_entry_point = true; + break; + } + } + } + } + + // Java entry point detection + if (ctx->language == CBM_LANG_JAVA && !def.is_entry_point) { + if (strcmp(name, "start") == 0 || strcmp(name, "configure") == 0 || + strcmp(name, "init") == 0 || strcmp(name, "run") == 0 || + strcmp(name, "handle") == 0) { + def.is_entry_point = true; + } + if (!def.is_entry_point && def.decorators) { + for (const char **d = def.decorators; *d; d++) { + if (strstr(*d, "RequestMapping") || strstr(*d, "GetMapping") || + strstr(*d, "PostMapping") || strstr(*d, "PutMapping") || + strstr(*d, "DeleteMapping") || strstr(*d, "PatchMapping") || + strstr(*d, "Endpoint") || strstr(*d, "EventHandler") || + strstr(*d, "Scheduled") || strstr(*d, "Bean") || + strstr(*d, "Override") || strstr(*d, "Test") || + strstr(*d, "GET") || strstr(*d, "POST") || + strstr(*d, "PUT") || strstr(*d, "DELETE") || + strstr(*d, "Path") || strstr(*d, "Consumes")) { + def.is_entry_point = true; + break; + } + } + } + } + cbm_defs_push(&ctx->result->defs, a, def); } @@ -1648,6 +1830,61 @@ static void extract_class_methods(CBMExtractCtx *ctx, TSNode class_node, const c continue; } + /* C#/Java property extraction: property_declaration, auto_property_declaration. + * Creates a "Property" node with parent_class set for DEFINES_METHOD edge. */ + const char *child_type = ts_node_type(child); + if (child_type && + (strcmp(child_type, "property_declaration") == 0 || + strcmp(child_type, "indexer_declaration") == 0 || + strcmp(child_type, "event_declaration") == 0 || + strcmp(child_type, "event_field_declaration") == 0)) { + TSNode name_node = ts_node_child_by_field_name(child, "name", 4); + if (ts_node_is_null(name_node)) { + /* indexer_declaration doesn't have a 'name' field, use "this" */ + if (strcmp(child_type, "indexer_declaration") == 0) { + CBMDefinition pdef; + memset(&pdef, 0, sizeof(pdef)); + pdef.name = cbm_arena_strdup(ctx->arena, "this[]"); + pdef.qualified_name = cbm_arena_sprintf(ctx->arena, "%s.this[]", class_qn); + pdef.label = "Property"; + pdef.file_path = ctx->rel_path; + pdef.parent_class = class_qn; + pdef.start_line = ts_node_start_point(child).row + 1; + pdef.end_line = ts_node_end_point(child).row + 1; + pdef.lines = (int)(pdef.end_line - pdef.start_line + 1); + TSNode type_node = ts_node_child_by_field_name(child, "type", 4); + if (!ts_node_is_null(type_node)) { + pdef.return_type = cbm_node_text(ctx->arena, type_node, ctx->source); + } + cbm_defs_push(&ctx->result->defs, ctx->arena, pdef); + } + continue; + } + char *pname = cbm_node_text(ctx->arena, name_node, ctx->source); + if (pname && pname[0]) { + CBMDefinition pdef; + memset(&pdef, 0, sizeof(pdef)); + pdef.name = pname; + pdef.qualified_name = cbm_arena_sprintf(ctx->arena, "%s.%s", class_qn, pname); + pdef.label = "Property"; + pdef.file_path = ctx->rel_path; + pdef.parent_class = class_qn; + pdef.start_line = ts_node_start_point(child).row + 1; + pdef.end_line = ts_node_end_point(child).row + 1; + pdef.lines = (int)(pdef.end_line - pdef.start_line + 1); + pdef.is_exported = cbm_is_exported(pname, ctx->language); + /* Extract type */ + TSNode type_node = ts_node_child_by_field_name(child, "type", 4); + if (!ts_node_is_null(type_node)) { + pdef.return_type = cbm_node_text(ctx->arena, type_node, ctx->source); + } + pdef.decorators = extract_decorators(ctx->arena, child, ctx->source, + ctx->language, spec); + cbm_defs_push(&ctx->result->defs, ctx->arena, pdef); + } + continue; + } + if (!cbm_kind_in_set(child, spec->function_node_types)) { continue; } diff --git a/internal/cbm/extract_imports.c b/internal/cbm/extract_imports.c index 87f8021b..103f4e06 100644 --- a/internal/cbm/extract_imports.c +++ b/internal/cbm/extract_imports.c @@ -340,6 +340,88 @@ static void walk_es_imports(CBMExtractCtx *ctx, TSNode node) { return; } + /* CommonJS: const X = require("Y"), const { A, B } = require("Y") + * Tree-sitter structure: variable_declarator → name + value(call_expression) + * We detect require() calls inside lexical_declaration/variable_declaration. */ + if (strcmp(kind, "variable_declarator") == 0 || strcmp(kind, "assignment_expression") == 0) { + TSNode value = ts_node_child_by_field_name(node, "value", 5); + if (ts_node_is_null(value)) { + value = ts_node_child_by_field_name(node, "right", 5); + } + if (!ts_node_is_null(value) && strcmp(ts_node_type(value), "call_expression") == 0) { + TSNode func = ts_node_child_by_field_name(value, "function", 8); + if (!ts_node_is_null(func) && strcmp(ts_node_type(func), "identifier") == 0) { + char *fname = cbm_node_text(a, func, ctx->source); + if (fname && strcmp(fname, "require") == 0) { + /* Extract the require() argument */ + TSNode args = ts_node_child_by_field_name(value, "arguments", 9); + if (!ts_node_is_null(args) && ts_node_named_child_count(args) > 0) { + TSNode arg0 = ts_node_named_child(args, 0); + const char *at = ts_node_type(arg0); + if (strcmp(at, "string") == 0 || strcmp(at, "string_literal") == 0 || + strcmp(at, "template_string") == 0) { + char *path = strip_quotes(a, cbm_node_text(a, arg0, ctx->source)); + if (path && path[0]) { + /* Get the variable name(s) being assigned */ + TSNode lhs = ts_node_child_by_field_name(node, "name", 4); + if (ts_node_is_null(lhs)) { + lhs = ts_node_child_by_field_name(node, "left", 4); + } + if (!ts_node_is_null(lhs)) { + const char *lk = ts_node_type(lhs); + if (strcmp(lk, "identifier") == 0) { + char *name = cbm_node_text(a, lhs, ctx->source); + CBMImport imp = {.local_name = name, .module_path = path}; + cbm_imports_push(&ctx->result->imports, a, imp); + } else if (strcmp(lk, "object_pattern") == 0) { + /* Destructured: const { A, B } = require("Y") */ + uint32_t nc = ts_node_named_child_count(lhs); + for (uint32_t k = 0; k < nc; k++) { + TSNode prop = ts_node_named_child(lhs, k); + const char *pk = ts_node_type(prop); + if (strcmp(pk, "shorthand_property_identifier_pattern") == 0 || + strcmp(pk, "shorthand_property_identifier") == 0 || + strcmp(pk, "identifier") == 0) { + char *name = cbm_node_text(a, prop, ctx->source); + CBMImport imp = {.local_name = name, .module_path = path}; + cbm_imports_push(&ctx->result->imports, a, imp); + } else if (strcmp(pk, "pair_pattern") == 0 || + strcmp(pk, "pair") == 0) { + TSNode val = ts_node_child_by_field_name(prop, "value", 5); + if (!ts_node_is_null(val)) { + char *name = cbm_node_text(a, val, ctx->source); + CBMImport imp = {.local_name = name, .module_path = path}; + cbm_imports_push(&ctx->result->imports, a, imp); + } + } + } + } else if (strcmp(lk, "array_pattern") == 0) { + /* Array destructured: const [A, B] = require("Y") */ + uint32_t nc = ts_node_named_child_count(lhs); + for (uint32_t k = 0; k < nc; k++) { + TSNode elem = ts_node_named_child(lhs, k); + if (strcmp(ts_node_type(elem), "identifier") == 0) { + char *name = cbm_node_text(a, elem, ctx->source); + CBMImport imp = {.local_name = name, .module_path = path}; + cbm_imports_push(&ctx->result->imports, a, imp); + } + } + } + } else { + /* Fallback: use last path segment as name */ + CBMImport imp = {.local_name = path_last(a, path), + .module_path = path}; + cbm_imports_push(&ctx->result->imports, a, imp); + } + } + } + } + } + } + } + /* Don't return — let it recurse to catch nested requires */ + } + recurse:; uint32_t count = ts_node_child_count(node); for (uint32_t i = 0; i < count; i++) { diff --git a/internal/cbm/extract_unified.c b/internal/cbm/extract_unified.c index f4cfb3cd..69029c29 100644 --- a/internal/cbm/extract_unified.c +++ b/internal/cbm/extract_unified.c @@ -79,6 +79,65 @@ static const char *compute_func_qn(CBMExtractCtx *ctx, TSNode node, const CBMLan } } + /* C/C++/CUDA/GLSL: function_definition has no "name" field. + * Name is buried in declarator chain: function_definition → declarator → + * function_declarator → declarator → identifier. Walk the chain. */ + if ((ctx->language == CBM_LANG_C || ctx->language == CBM_LANG_CPP || + ctx->language == CBM_LANG_CUDA || ctx->language == CBM_LANG_GLSL)) { + const char *nk = ts_node_type(node); + bool is_func_def = (strcmp(nk, "function_definition") == 0); + /* Template declarations wrap the function_definition */ + TSNode inner_func = node; + if (strcmp(nk, "template_declaration") == 0) { + for (uint32_t i = 0; i < ts_node_named_child_count(node); i++) { + TSNode ch = ts_node_named_child(node, i); + if (strcmp(ts_node_type(ch), "function_definition") == 0) { + inner_func = ch; + is_func_def = true; + break; + } + } + } + if (is_func_def) { + TSNode decl = ts_node_child_by_field_name(inner_func, "declarator", 10); + for (int depth = 0; depth < 8 && !ts_node_is_null(decl); depth++) { + const char *dk = ts_node_type(decl); + if (strcmp(dk, "identifier") == 0 || strcmp(dk, "field_identifier") == 0) { + char *name = cbm_node_text(ctx->arena, decl, ctx->source); + if (name && name[0]) { + if (state->enclosing_class_qn) { + return cbm_arena_sprintf(ctx->arena, "%s.%s", + state->enclosing_class_qn, name); + } + return cbm_fqn_compute(ctx->arena, ctx->project, ctx->rel_path, name); + } + return NULL; + } + if (strcmp(dk, "qualified_identifier") == 0 || + strcmp(dk, "scoped_identifier") == 0) { + TSNode id = cbm_find_child_by_kind(decl, "identifier"); + if (ts_node_is_null(id)) + id = cbm_find_child_by_kind(decl, "field_identifier"); + if (!ts_node_is_null(id)) { + char *name = cbm_node_text(ctx->arena, id, ctx->source); + if (name && name[0]) { + return cbm_fqn_compute(ctx->arena, ctx->project, + ctx->rel_path, name); + } + } + return NULL; + } + /* Unwrap: function_declarator → inner declarator */ + TSNode inner = ts_node_child_by_field_name(decl, "declarator", 10); + if (ts_node_is_null(inner) && ts_node_named_child_count(decl) > 0) { + inner = ts_node_named_child(decl, 0); + } + decl = inner; + } + return NULL; /* couldn't resolve C/C++ function name */ + } + } + TSNode name_node = ts_node_child_by_field_name(node, "name", 4); // Arrow function: name from parent variable_declarator @@ -153,9 +212,32 @@ void cbm_extract_unified(CBMExtractCtx *ctx) { // 4. Push scope markers for boundary nodes if (spec->function_node_types && cbm_kind_in_set(node, spec->function_node_types)) { - const char *fqn = compute_func_qn(ctx, node, spec, &state); - if (fqn) { - push_scope(&state, SCOPE_FUNC, depth, fqn); + // Fix 3: C# lambda_expression inside += assignment should NOT create + // a new scope boundary. Calls inside the lambda body should be attributed + // to the outer method that subscribes the event handler, not to an + // anonymous lambda. This matches the semantic intent: the subscribing + // method IS responsible for what runs when the event fires. + bool skip_scope = false; + if (ctx->language == CBM_LANG_CSHARP && + strcmp(ts_node_type(node), "lambda_expression") == 0) { + TSNode parent = ts_node_parent(node); + if (!ts_node_is_null(parent) && + strcmp(ts_node_type(parent), "assignment_expression") == 0) { + TSNode op = ts_node_child_by_field_name(parent, "operator", 8); + if (!ts_node_is_null(op)) { + char *op_text = cbm_node_text(ctx->arena, op, ctx->source); + if (op_text && (strcmp(op_text, "+=") == 0 || + strcmp(op_text, "-=") == 0)) { + skip_scope = true; + } + } + } + } + if (!skip_scope) { + const char *fqn = compute_func_qn(ctx, node, spec, &state); + if (fqn) { + push_scope(&state, SCOPE_FUNC, depth, fqn); + } } } else if (spec->class_node_types && cbm_kind_in_set(node, spec->class_node_types)) { const char *cqn = compute_class_qn(ctx, node); diff --git a/internal/cbm/helpers.c b/internal/cbm/helpers.c index 0b4147b5..d1abcb77 100644 --- a/internal/cbm/helpers.c +++ b/internal/cbm/helpers.c @@ -444,6 +444,34 @@ static const char *func_node_name(CBMArena *a, TSNode func_node, const char *sou } } + /* C/C++/CUDA/GLSL: function_definition has no "name" field. + * Name is inside declarator chain: function_definition → declarator → + * function_declarator → declarator → identifier. */ + if ((lang == CBM_LANG_C || lang == CBM_LANG_CPP || + lang == CBM_LANG_CUDA || lang == CBM_LANG_GLSL) && + strcmp(ts_node_type(func_node), "function_definition") == 0) { + TSNode decl = ts_node_child_by_field_name(func_node, "declarator", 10); + for (int depth = 0; depth < 8 && !ts_node_is_null(decl); depth++) { + const char *dk = ts_node_type(decl); + if (strcmp(dk, "identifier") == 0 || strcmp(dk, "field_identifier") == 0) { + return cbm_node_text(a, decl, source); + } + if (strcmp(dk, "qualified_identifier") == 0 || + strcmp(dk, "scoped_identifier") == 0) { + TSNode id = cbm_find_child_by_kind(decl, "identifier"); + if (ts_node_is_null(id)) + id = cbm_find_child_by_kind(decl, "field_identifier"); + if (!ts_node_is_null(id)) return cbm_node_text(a, id, source); + return NULL; + } + TSNode inner = ts_node_child_by_field_name(decl, "declarator", 10); + if (ts_node_is_null(inner) && ts_node_named_child_count(decl) > 0) + inner = ts_node_named_child(decl, 0); + decl = inner; + } + return NULL; + } + TSNode name_node = ts_node_child_by_field_name(func_node, "name", 4); if (!ts_node_is_null(name_node)) { return cbm_node_text(a, name_node, source); diff --git a/internal/cbm/lang_specs.c b/internal/cbm/lang_specs.c index 426db947..428c9cd3 100644 --- a/internal/cbm/lang_specs.c +++ b/internal/cbm/lang_specs.c @@ -114,6 +114,7 @@ static const char *js_class_types[] = {"class_declaration", "class", NULL}; static const char *js_module_types[] = {"program", NULL}; static const char *js_call_types[] = {"call_expression", NULL}; static const char *js_import_types[] = {"import_statement", "lexical_declaration", + "variable_declaration", "expression_statement", "export_statement", NULL}; static const char *js_branch_types[] = {"if_statement", "for_statement", "for_in_statement", "while_statement", "switch_statement", "case_clause", diff --git a/src/cypher/cypher.c b/src/cypher/cypher.c index 64985cbc..aa97b6a0 100644 --- a/src/cypher/cypher.c +++ b/src/cypher/cypher.c @@ -631,6 +631,39 @@ static void expr_free(cbm_expr_t *e) { // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion) free(e->cond.in_values); } + if (e->type == EXPR_NOT_EXISTS) { + if (e->sub_pattern) { + /* Free pattern nodes and rels */ + for (int i = 0; i < e->sub_pattern->node_count; i++) { + free((void *)e->sub_pattern->nodes[i].variable); + free((void *)e->sub_pattern->nodes[i].label); + } + for (int i = 0; i < e->sub_pattern->rel_count; i++) { + free((void *)e->sub_pattern->rels[i].variable); + for (int t = 0; t < e->sub_pattern->rels[i].type_count; t++) { + free((void *)e->sub_pattern->rels[i].types[t]); + } + free(e->sub_pattern->rels[i].types); + free((void *)e->sub_pattern->rels[i].direction); + } + free(e->sub_pattern->nodes); + free(e->sub_pattern->rels); + free(e->sub_pattern); + } + if (e->sub_where) { + cbm_where_clause_t *sw = (cbm_where_clause_t *)e->sub_where; + if (sw->root) expr_free(sw->root); + for (int i = 0; i < sw->count; i++) { + free((void *)sw->conditions[i].variable); + free((void *)sw->conditions[i].property); + free((void *)sw->conditions[i].op); + free((void *)sw->conditions[i].value); + } + free(sw->conditions); + free((void *)sw->op); + free(sw); + } + } expr_free(e->left); expr_free(e->right); free(e); @@ -695,6 +728,8 @@ static const char *unsupported_clause_error(cbm_token_type_t type) { /* Forward declarations for recursive descent */ static cbm_expr_t *parse_or_expr(parser_t *p); +static int parse_match_pattern(parser_t *p, cbm_pattern_t *pat); +static int parse_where(parser_t *p, cbm_where_clause_t **out); /* Parse a single condition: var.prop OP value | var.prop IS [NOT] NULL | var.prop IN [...] */ static cbm_expr_t *parse_condition_expr(parser_t *p) { @@ -833,9 +868,40 @@ static cbm_expr_t *parse_atom_expr(parser_t *p) { return parse_condition_expr(p); } -/* NOT: NOT atom | atom */ +/* NOT: NOT EXISTS { MATCH ... WHERE ... } | NOT atom | atom */ static cbm_expr_t *parse_not_expr(parser_t *p) { if (match(p, TOK_NOT)) { + /* NOT EXISTS { MATCH (pattern) WHERE ... } — correlated subquery */ + if (check(p, TOK_EXISTS)) { + advance(p); /* consume EXISTS */ + if (!expect(p, TOK_LBRACE)) return NULL; + + cbm_expr_t *e = calloc(1, sizeof(cbm_expr_t)); + e->type = EXPR_NOT_EXISTS; + + /* Parse inner MATCH pattern */ + if (!expect(p, TOK_MATCH)) { free(e); return NULL; } + e->sub_pattern = calloc(1, sizeof(cbm_pattern_t)); + if (parse_match_pattern(p, e->sub_pattern) < 0) { + free(e->sub_pattern); + free(e); + return NULL; + } + + /* Optional inner WHERE */ + cbm_where_clause_t *inner_where = NULL; + parse_where(p, &inner_where); + e->sub_where = inner_where; + + if (!expect(p, TOK_RBRACE)) { + /* Cleanup on parse failure */ + free(e->sub_pattern); + free(e->sub_where); + free(e); + return NULL; + } + return e; + } cbm_expr_t *child = parse_not_expr(p); return child ? expr_not(child) : NULL; } @@ -1052,6 +1118,10 @@ static int parse_return_or_with(parser_t *p, cbm_return_clause_t **out, bool is_ cbm_token_type_t ft = peek(p)->type; advance(p); expect(p, TOK_LPAREN); + /* Check for DISTINCT inside aggregate: count(DISTINCT ...) */ + if (match(p, TOK_DISTINCT)) { + item.distinct_arg = true; + } if (match(p, TOK_STAR)) { item.variable = heap_strdup("*"); } else { @@ -1561,6 +1631,9 @@ typedef struct { } binding_t; /* Get node property by name */ +/* Forward declaration — full implementation below */ +static const char *json_extract_prop(const char *json, const char *key, char *buf, size_t buf_sz); + static const char *node_prop(const cbm_node_t *n, const char *prop) { if (!n || !prop) { return ""; @@ -1588,6 +1661,24 @@ static const char *node_prop(const cbm_node_t *n, const char *prop) { snprintf(buf, sizeof(buf), "%d", n->end_line); return buf; } + if (strcmp(prop, "file") == 0) { + return n->file_path ? n->file_path : ""; + } + if (strcmp(prop, "id") == 0) { + static char buf[32]; + snprintf(buf, sizeof(buf), "%lld", (long long)n->id); + return buf; + } + /* Fall through to JSON properties for unknown fields. + * This enables queries like WHERE n.is_entry_point = true + * or WHERE n.confidence > 0.5 on properties stored in properties_json. */ + if (n->properties_json) { + static char json_buf[1024]; + const char *val = json_extract_prop(n->properties_json, prop, json_buf, sizeof(json_buf)); + if (val && val[0]) { + return val; + } + } return ""; } @@ -1763,6 +1854,16 @@ static void binding_set(binding_t *b, const char *var, const cbm_node_t *node) { b->var_count++; } +/* Forward declarations for NOT EXISTS subquery evaluation */ +static void scan_pattern_nodes(cbm_store_t *store, const char *project, int max_rows, + cbm_node_pattern_t *first, cbm_node_t **out_nodes, + int *out_count); +static void expand_pattern_rels(cbm_store_t *store, cbm_pattern_t *pat, binding_t **bindings, + int *bind_count, const int *bind_cap, const char **var_name, + bool is_optional); +static bool eval_where(const cbm_where_clause_t *w, binding_t *b, cbm_store_t *store, + const char *project, int max_rows); + /* Evaluate a WHERE condition against a binding */ static bool eval_condition(const cbm_condition_t *c, binding_t *b) { const char *actual; @@ -1855,8 +1956,10 @@ static bool eval_condition(const cbm_condition_t *c, binding_t *b) { return (int)(c->negated ? !result : result); } -/* Recursive expression tree evaluator */ -static bool eval_expr(const cbm_expr_t *e, binding_t *b) { +/* Recursive expression tree evaluator. + * store is needed for EXPR_NOT_EXISTS (correlated subquery expansion). */ +static bool eval_expr(const cbm_expr_t *e, binding_t *b, cbm_store_t *store, + const char *project, int max_rows) { if (!e) { return true; } @@ -1864,24 +1967,176 @@ static bool eval_expr(const cbm_expr_t *e, binding_t *b) { case EXPR_CONDITION: return eval_condition(&e->cond, b); case EXPR_AND: - return (eval_expr(e->left, b) && eval_expr(e->right, b)) != 0; + return (eval_expr(e->left, b, store, project, max_rows) && + eval_expr(e->right, b, store, project, max_rows)) != 0; case EXPR_OR: - return (eval_expr(e->left, b) || eval_expr(e->right, b)) != 0; + return (eval_expr(e->left, b, store, project, max_rows) || + eval_expr(e->right, b, store, project, max_rows)) != 0; case EXPR_NOT: - return (!eval_expr(e->left, b)) != 0; + return (!eval_expr(e->left, b, store, project, max_rows)) != 0; case EXPR_XOR: - return eval_expr(e->left, b) != eval_expr(e->right, b); + return eval_expr(e->left, b, store, project, max_rows) != + eval_expr(e->right, b, store, project, max_rows); + case EXPR_NOT_EXISTS: { + if (!e->sub_pattern || !store) return true; + cbm_pattern_t *sp = e->sub_pattern; + + /* OPTIMIZATION: For the common pattern + * MATCH (n:Function) WHERE NOT EXISTS { MATCH (caller)-[e]->(n) WHERE e.type = 'CALLS' } + * we detect when the inner pattern's TARGET variable is already bound from + * the outer scope. Instead of scanning all possible callers, we directly + * query edges TO the bound node — O(1) per node instead of O(N). */ + if (sp->rel_count == 1 && sp->node_count == 2) { + const char *start_var = sp->nodes[0].variable; + const char *end_var = sp->nodes[1].variable; + cbm_rel_pattern_t *rel = &sp->rels[0]; + + /* Check which end is bound from outer scope */ + cbm_node_t *bound_node = NULL; + bool bound_is_target = false; + if (end_var && binding_get(b, end_var)) { + bound_node = binding_get(b, end_var); + bound_is_target = true; + } else if (start_var && binding_get(b, start_var)) { + bound_node = binding_get(b, start_var); + } + + if (bound_node && bound_node->id > 0) { + /* Fast path: query edges directly to/from the bound node */ + cbm_edge_t *edges = NULL; + int edge_count = 0; + bool found_match = false; + + for (int ti = 0; ti < rel->type_count && !found_match; ti++) { + const char *edge_type = rel->types[ti]; + if (bound_is_target) { + /* bound node is the target: look for edges incoming TO it */ + cbm_store_find_edges_by_target_type(store, bound_node->id, + edge_type, &edges, &edge_count); + } else { + /* bound node is the source: look for edges outgoing FROM it */ + cbm_store_find_edges_by_source_type(store, bound_node->id, + edge_type, &edges, &edge_count); + } + /* Apply inner WHERE filter if present */ + cbm_where_clause_t *inner_w = (cbm_where_clause_t *)e->sub_where; + if (edge_count > 0 && inner_w) { + /* Build a temporary binding with the edge to check WHERE conditions */ + for (int ei = 0; ei < edge_count && !found_match; ei++) { + binding_t tmp = *b; /* shallow copy of outer binding */ + const char *edge_var = rel->variable; + if (edge_var) { + binding_set_edge(&tmp, edge_var, &edges[ei]); + } + if (eval_where(inner_w, &tmp, store, project, max_rows)) { + found_match = true; + } + } + } else if (edge_count > 0) { + found_match = true; + } + /* Free edges */ + for (int ei = 0; ei < edge_count; ei++) { + free((void *)edges[ei].project); + free((void *)edges[ei].type); + free((void *)edges[ei].properties_json); + } + free(edges); + edges = NULL; + edge_count = 0; + } + + if (rel->type_count == 0 && !found_match) { + /* No type filter — check ANY edge */ + cbm_edge_t *all_edges = NULL; + int all_count = 0; + if (bound_is_target) { + cbm_store_find_edges_by_target_type(store, bound_node->id, + NULL, &all_edges, &all_count); + } else { + cbm_store_find_edges_by_source_type(store, bound_node->id, + NULL, &all_edges, &all_count); + } + if (all_count > 0) found_match = true; + for (int ei = 0; ei < all_count; ei++) { + free((void *)all_edges[ei].project); + free((void *)all_edges[ei].type); + free((void *)all_edges[ei].properties_json); + } + free(all_edges); + } + + return !found_match; + } + } + + /* SLOW PATH: Full subquery expansion for complex patterns. + * Used when no variable is bound from outer scope, or multi-hop patterns. */ + const char *start_var = sp->nodes[0].variable; + cbm_node_t *scanned = NULL; + int scan_count = 0; + cbm_node_t *outer_node = start_var ? binding_get(b, start_var) : NULL; + + if (outer_node) { + scanned = calloc(1, sizeof(cbm_node_t)); + scanned[0] = *outer_node; + scanned[0].name = outer_node->name ? heap_strdup(outer_node->name) : NULL; + scanned[0].label = outer_node->label ? heap_strdup(outer_node->label) : NULL; + scanned[0].file_path = outer_node->file_path ? heap_strdup(outer_node->file_path) : NULL; + scanned[0].project = outer_node->project ? heap_strdup(outer_node->project) : NULL; + scanned[0].qualified_name = outer_node->qualified_name ? heap_strdup(outer_node->qualified_name) : NULL; + scan_count = 1; + } else { + scan_pattern_nodes(store, project, max_rows, &sp->nodes[0], + &scanned, &scan_count); + } + + if (scan_count == 0) { + free(scanned); + return true; + } + + const char *var = start_var ? start_var : "_ne"; + int sub_cap = scan_count > 4 ? scan_count : 4; + binding_t *sub_bindings = calloc(sub_cap, sizeof(binding_t)); + int sub_count = 0; + for (int i = 0; i < scan_count && sub_count < sub_cap; i++) { + binding_set(&sub_bindings[sub_count], var, &scanned[i]); + sub_count++; + } + free(scanned); + + if (sub_count > 0 && sp->rel_count > 0) { + expand_pattern_rels(store, sp, &sub_bindings, &sub_count, &sub_cap, + &var, false); + } + + bool any_match = false; + cbm_where_clause_t *inner_w = (cbm_where_clause_t *)e->sub_where; + for (int i = 0; i < sub_count && !any_match; i++) { + bool pass = inner_w ? eval_where(inner_w, &sub_bindings[i], store, project, max_rows) : true; + if (pass) any_match = true; + } + for (int i = 0; i < sub_count; i++) { + for (int v = 0; v < sub_bindings[i].var_count; v++) { + node_fields_free(&sub_bindings[i].var_nodes[v]); + } + } + free(sub_bindings); + return !any_match; + } } return true; } /* Evaluate WHERE clause — uses expression tree if available, falls back to legacy */ -static bool eval_where(const cbm_where_clause_t *w, binding_t *b) { +static bool eval_where(const cbm_where_clause_t *w, binding_t *b, cbm_store_t *store, + const char *project, int max_rows) { if (!w) { return true; } if (w->root) { - return eval_expr(w->root, b); + return eval_expr(w->root, b, store, project, max_rows); } /* Legacy flat evaluation */ @@ -2021,7 +2276,7 @@ static const char *eval_case_expr(const cbm_case_expr_t *k, binding_t *b) { return ""; } for (int i = 0; i < k->branch_count; i++) { - if (eval_expr(k->branches[i].when_expr, b)) { + if (eval_expr(k->branches[i].when_expr, b, NULL, NULL, 0)) { return k->branches[i].then_val ? k->branches[i].then_val : ""; } } @@ -2404,9 +2659,9 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec bool pass = true; if (q->where && pat0->rel_count > 0) { /* With expression tree, evaluate full tree — unbound vars pass through */ - pass = eval_where(q->where, &b); + pass = eval_where(q->where, &b, store, project, max_rows); } else if (q->where && pat0->rel_count == 0) { - pass = eval_where(q->where, &b); + pass = eval_where(q->where, &b, store, project, max_rows); } if (pass) { @@ -2507,7 +2762,7 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec if (q->where && (pat0->rel_count > 0 || q->pattern_count > 1)) { int kept = 0; for (int i = 0; i < bind_count; i++) { - if (eval_where(q->where, &bindings[i])) { + if (eval_where(q->where, &bindings[i], store, project, max_rows)) { if (kept != i) { bindings[kept] = bindings[i]; } @@ -2547,6 +2802,10 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec double *sums; int *counts; double *mins, *maxs; + /* For count(DISTINCT ...): per-column arrays of seen values */ + const char ***distinct_seen; /* [col][seen_idx] */ + int *distinct_seen_count; /* count per column */ + int *distinct_seen_cap; /* capacity per column */ } with_agg_t; int agg_cap = 256; with_agg_t *aggs = calloc(agg_cap, sizeof(with_agg_t)); @@ -2585,6 +2844,9 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec aggs[found].counts = calloc(wc->count, sizeof(int)); aggs[found].mins = malloc(wc->count * sizeof(double)); aggs[found].maxs = malloc(wc->count * sizeof(double)); + aggs[found].distinct_seen = calloc(wc->count, sizeof(const char **)); + aggs[found].distinct_seen_count = calloc(wc->count, sizeof(int)); + aggs[found].distinct_seen_cap = calloc(wc->count, sizeof(int)); for (int ci = 0; ci < wc->count; ci++) { aggs[found].mins[ci] = 1e308; aggs[found].maxs[ci] = -1e308; @@ -2603,9 +2865,34 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec if (!wc->items[ci].func) { continue; } - aggs[found].counts[ci]++; const char *raw = binding_get_virtual(&bindings[bi], wc->items[ci].variable, wc->items[ci].property); + /* count(DISTINCT ...): only count if value not already seen */ + if (wc->items[ci].distinct_arg && strcmp(wc->items[ci].func, "COUNT") == 0) { + bool already = false; + for (int di = 0; di < aggs[found].distinct_seen_count[ci]; di++) { + if (aggs[found].distinct_seen[ci][di] && + strcmp(aggs[found].distinct_seen[ci][di], raw) == 0) { + already = true; + break; + } + } + if (!already) { + /* Track the value */ + if (aggs[found].distinct_seen_count[ci] >= aggs[found].distinct_seen_cap[ci]) { + int newcap = aggs[found].distinct_seen_cap[ci] < 16 ? 16 : + aggs[found].distinct_seen_cap[ci] * 2; + aggs[found].distinct_seen[ci] = safe_realloc( + aggs[found].distinct_seen[ci], newcap * sizeof(const char *)); + aggs[found].distinct_seen_cap[ci] = newcap; + } + aggs[found].distinct_seen[ci][aggs[found].distinct_seen_count[ci]++] = + heap_strdup(raw); + aggs[found].counts[ci]++; + } + } else { + aggs[found].counts[ci]++; + } double dv = strtod(raw, NULL); aggs[found].sums[ci] += dv; if (dv < aggs[found].mins[ci]) { @@ -2682,6 +2969,17 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec free(aggs[a].counts); free(aggs[a].mins); free(aggs[a].maxs); + if (aggs[a].distinct_seen) { + for (int ci = 0; ci < wc->count; ci++) { + for (int di = 0; di < aggs[a].distinct_seen_count[ci]; di++) { + free((void *)aggs[a].distinct_seen[ci][di]); + } + free(aggs[a].distinct_seen[ci]); + } + free(aggs[a].distinct_seen); + free(aggs[a].distinct_seen_count); + free(aggs[a].distinct_seen_cap); + } } free(aggs); } else { @@ -2772,7 +3070,7 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec if (q->post_with_where) { int kept = 0; for (int i = 0; i < bind_count; i++) { - if (eval_where(q->post_with_where, &bindings[i])) { + if (eval_where(q->post_with_where, &bindings[i], store, project, max_rows)) { if (kept != i) { bindings[kept] = bindings[i]; } diff --git a/src/cypher/cypher.h b/src/cypher/cypher.h index dedf4c82..8c53a175 100644 --- a/src/cypher/cypher.h +++ b/src/cypher/cypher.h @@ -199,7 +199,8 @@ typedef enum { EXPR_AND, EXPR_OR, EXPR_NOT, - EXPR_XOR + EXPR_XOR, + EXPR_NOT_EXISTS /* NOT EXISTS { MATCH ... WHERE ... } */ } cbm_expr_type_t; typedef struct cbm_expr cbm_expr_t; @@ -208,6 +209,9 @@ struct cbm_expr { cbm_condition_t cond; /* leaf (EXPR_CONDITION only) */ cbm_expr_t *left; /* AND/OR/XOR left; NOT child */ cbm_expr_t *right; /* AND/OR/XOR right; NULL for NOT */ + /* NOT EXISTS subquery (EXPR_NOT_EXISTS only) */ + cbm_pattern_t *sub_pattern; /* inner MATCH pattern */ + void *sub_where; /* cbm_where_clause_t* — void to avoid circular dep */ }; typedef struct { @@ -238,6 +242,7 @@ typedef struct { const char *func; /* "COUNT", "SUM", "AVG", "MIN", "MAX", "COLLECT", "toLower", "toUpper", "toString" or NULL */ cbm_case_expr_t *kase; /* CASE expression (NULL if not CASE) */ + bool distinct_arg; /* true when func is count(DISTINCT ...) */ } cbm_return_item_t; typedef struct { diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index 3530acc3..76e45198 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -281,6 +281,39 @@ static const tool_def_t TOOLS[] = { "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"},\"aspects\":{\"type\":" "\"array\",\"items\":{\"type\":\"string\"}}},\"required\":[\"project\"]}"}, + {"list_processes", + "List discovered execution flows (processes). Each process is a named path from an entry " + "point through the call graph to a terminal node that crosses a community boundary. " + "Processes are auto-detected during indexing using BFS from entry points + Louvain " + "community detection. Returns up to 300 processes ordered by step count.", + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}},\"required\":[\"project\"]}"}, + + {"get_process_steps", + "Get the ordered step list for a specific execution flow. Returns each function " + "in the flow with file_path, qualified_name, and step number. Use after list_processes " + "to drill into a specific flow for step-by-step debugging.", + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}," + "\"process_id\":{\"type\":\"number\",\"description\":\"Process ID from list_processes\"}}" + ",\"required\":[\"project\",\"process_id\"]}"}, + + {"get_impact", + "Analyze blast radius of changing a symbol. Returns all upstream callers grouped by " + "depth (d=1 WILL BREAK, d=2 LIKELY AFFECTED), affected processes, risk assessment " + "(LOW/MEDIUM/HIGH/CRITICAL), and affected modules. Use before modifying shared code.", + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}," + "\"target\":{\"type\":\"string\",\"description\":\"Function or class name to analyze\"}," + "\"direction\":{\"type\":\"string\",\"enum\":[\"upstream\",\"downstream\"],\"default\":\"upstream\"}," + "\"max_depth\":{\"type\":\"number\",\"default\":3}}" + ",\"required\":[\"project\",\"target\"]}"}, + + {"get_channels", + "Find message channels (Socket.IO events, EventEmitter signals) across projects. " + "Shows which functions emit and listen on each channel, enabling cross-service " + "message flow tracing. Auto-detects patterns during indexing. " + "Query by channel name (partial match) and/or project.", + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}," + "\"channel\":{\"type\":\"string\",\"description\":\"Channel name filter (partial match)\"}}}"}, + {"search_code", "Graph-augmented code search. Finds text patterns via grep, then enriches results with " "the knowledge graph: deduplicates matches into containing functions, ranks by structural " @@ -940,6 +973,8 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) { char *label = cbm_mcp_get_string_arg(args, "label"); char *name_pattern = cbm_mcp_get_string_arg(args, "name_pattern"); char *file_pattern = cbm_mcp_get_string_arg(args, "file_pattern"); + char *query = cbm_mcp_get_string_arg(args, "query"); + char *sort_by = cbm_mcp_get_string_arg(args, "sort_by"); int limit = cbm_mcp_get_int_arg(args, "limit", 500000); int offset = cbm_mcp_get_int_arg(args, "offset", 0); int min_degree = cbm_mcp_get_int_arg(args, "min_degree", -1); @@ -950,6 +985,8 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) { .label = label, .name_pattern = name_pattern, .file_pattern = file_pattern, + .query = query, + .sort_by = sort_by, .limit = limit, .offset = offset, .min_degree = min_degree, @@ -990,6 +1027,8 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) { free(label); free(name_pattern); free(file_pattern); + free(query); + free(sort_by); char *result = cbm_mcp_text_result(json, false); free(json); @@ -1152,6 +1191,426 @@ static char *handle_delete_project(cbm_mcp_server_t *srv, const char *args) { return result; } +static char *handle_get_process_steps(cbm_mcp_server_t *srv, const char *args) { + char *project = cbm_mcp_get_string_arg(args, "project"); + int64_t process_id = (int64_t)cbm_mcp_get_int_arg(args, "process_id", 0); + cbm_store_t *store = resolve_store(srv, project); + REQUIRE_STORE(store, project); + + cbm_process_step_t *steps = NULL; + int count = 0; + cbm_store_get_process_steps(store, process_id, &steps, &count); + + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + yyjson_mut_obj_add_int(doc, root, "total_steps", count); + + yyjson_mut_val *arr = yyjson_mut_arr(doc); + for (int i = 0; i < count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_int(doc, item, "step", steps[i].step); + yyjson_mut_obj_add_strcpy(doc, item, "name", steps[i].name ? steps[i].name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "qualified_name", + steps[i].qualified_name ? steps[i].qualified_name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "file_path", + steps[i].file_path ? steps[i].file_path : ""); + yyjson_mut_arr_add_val(arr, item); + } + yyjson_mut_obj_add_val(doc, root, "steps", arr); + + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + cbm_store_free_process_steps(steps, count); + free(project); + char *result = cbm_mcp_text_result(json, false); + free(json); + return result; +} + +static char *handle_get_impact(cbm_mcp_server_t *srv, const char *args) { + char *project = cbm_mcp_get_string_arg(args, "project"); + char *target = cbm_mcp_get_string_arg(args, "target"); + char *direction = cbm_mcp_get_string_arg(args, "direction"); + int max_depth = cbm_mcp_get_int_arg(args, "max_depth", 3); + cbm_store_t *store = resolve_store(srv, project); + REQUIRE_STORE(store, project); + + if (!direction) direction = heap_strdup("upstream"); + bool is_upstream = strcmp(direction, "upstream") == 0; + const char *bfs_dir = is_upstream ? "inbound" : "outbound"; + + /* Find target node */ + cbm_node_t *nodes = NULL; + int node_count = 0; + cbm_store_find_nodes_by_name(store, project, target, &nodes, &node_count); + + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + + if (node_count == 0) { + yyjson_mut_obj_add_strcpy(doc, root, "error", "symbol not found"); + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + free(target); free(project); free(direction); + char *r = cbm_mcp_text_result(json, true); + free(json); + return r; + } + + /* Pick best node: prefer Class over Constructor when both share the same name. + * This mirrors the disambiguation logic in trace_call_path so that impact + * analysis on a class name (e.g. "UserService") resolves to the Class node + * and then fans out through DEFINES_METHOD to all its methods. Previously + * this picked the Constructor/Method first, which has 0 callers. */ + int best = 0; + bool has_class = false; + int class_idx = -1; + for (int i = 0; i < node_count; i++) { + const char *lbl = nodes[i].label; + if (lbl && (strcmp(lbl, "Class") == 0 || strcmp(lbl, "Interface") == 0)) { + has_class = true; + if (class_idx < 0) class_idx = i; + } + } + /* Look for a non-constructor Function/Method (skip if same name as Class) */ + bool found_callable = false; + for (int i = 0; i < node_count; i++) { + const char *lbl = nodes[i].label; + if (lbl && (strcmp(lbl, "Function") == 0 || strcmp(lbl, "Method") == 0)) { + if (has_class) continue; /* skip constructor */ + best = i; + found_callable = true; + break; + } + } + if (!found_callable && class_idx >= 0) { + best = class_idx; + } + + /* Resolve start IDs: if target is a Class/Interface, expand through + * DEFINES_METHOD edges to get all method node IDs for BFS. */ + int64_t *start_ids = NULL; + int start_id_count = 0; + bool is_class_like = false; + const char *best_label = nodes[best].label; + if (best_label && + (strcmp(best_label, "Class") == 0 || strcmp(best_label, "Interface") == 0)) { + is_class_like = true; + } + + if (is_class_like) { + cbm_edge_t *dm_edges = NULL; + int dm_count = 0; + cbm_store_find_edges_by_source_type(store, nodes[best].id, "DEFINES_METHOD", + &dm_edges, &dm_count); + if (dm_count > 0) { + /* For impact we use all methods (unlike trace which caps at 5) */ + int use_count = dm_count > 30 ? 30 : dm_count; + start_ids = malloc((size_t)use_count * sizeof(int64_t)); + for (int i = 0; i < use_count; i++) { + start_ids[i] = dm_edges[i].target_id; + } + start_id_count = use_count; + } + for (int i = 0; i < dm_count; i++) { + free((void *)dm_edges[i].project); + free((void *)dm_edges[i].type); + free((void *)dm_edges[i].properties_json); + } + free(dm_edges); + if (start_id_count == 0) { + start_ids = malloc(sizeof(int64_t)); + start_ids[0] = nodes[best].id; + start_id_count = 1; + } + } else { + start_ids = malloc(sizeof(int64_t)); + start_ids[0] = nodes[best].id; + start_id_count = 1; + } + + yyjson_mut_obj_add_strcpy(doc, root, "target", target); + yyjson_mut_obj_add_strcpy(doc, root, "direction", direction); + yyjson_mut_obj_add_strcpy(doc, root, "file_path", + nodes[best].file_path ? nodes[best].file_path : ""); + yyjson_mut_obj_add_int(doc, root, "line", nodes[best].start_line); + + /* BFS from each start ID and merge results. For classes this fans out + * through all methods, giving a true blast radius instead of 0. */ + const char *call_types[] = {"CALLS", "HTTP_CALLS", "ASYNC_CALLS", "USAGE"}; + cbm_traverse_result_t tr = {0}; + + if (start_id_count == 1) { + cbm_store_bfs(store, start_ids[0], bfs_dir, call_types, 4, max_depth, 200, &tr); + } else { + /* Multi-method BFS: run from each method, collect unique visited nodes */ + cbm_traverse_result_t *subs = calloc((size_t)start_id_count, sizeof(*subs)); + int total_visited = 0; + for (int s = 0; s < start_id_count; s++) { + cbm_store_bfs(store, start_ids[s], bfs_dir, call_types, 4, max_depth, + 200, &subs[s]); + total_visited += subs[s].visited_count; + } + /* Merge into tr: allocate worst-case, then dedup by node id */ + if (total_visited > 0) { + tr.visited = malloc((size_t)total_visited * sizeof(cbm_node_hop_t)); + tr.visited_count = 0; + for (int s = 0; s < start_id_count; s++) { + for (int v = 0; v < subs[s].visited_count; v++) { + int64_t vid = subs[s].visited[v].node.id; + /* Check for duplicate (same node already in tr) */ + bool dup = false; + for (int e = 0; e < tr.visited_count; e++) { + if (tr.visited[e].node.id == vid) { + /* Keep the one with smaller hop (closer = more impacted) */ + if (subs[s].visited[v].hop < tr.visited[e].hop) + tr.visited[e].hop = subs[s].visited[v].hop; + dup = true; + break; + } + } + if (!dup && tr.visited_count < total_visited) { + tr.visited[tr.visited_count] = subs[s].visited[v]; + tr.visited_count++; + } + } + } + } + /* Free sub-traversals (but NOT their visited[].node fields — we moved them) */ + for (int s = 0; s < start_id_count; s++) { + free(subs[s].edges); + } + free(subs); + } + + /* Group by depth */ + yyjson_mut_val *d1_arr = yyjson_mut_arr(doc); + yyjson_mut_val *d2_arr = yyjson_mut_arr(doc); + yyjson_mut_val *d3_arr = yyjson_mut_arr(doc); + int depth_counts[10] = {0}; + int total_affected = 0; + + for (int i = 0; i < tr.visited_count; i++) { + int h = tr.visited[i].hop; + if (h >= 1 && h <= max_depth) { + if (h < 10) depth_counts[h]++; + total_affected++; + + cbm_node_t *vn = &tr.visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_obj_add_str(doc, item, "label", vn->label ? vn->label : ""); + yyjson_mut_obj_add_int(doc, item, "line", vn->start_line); + + if (h == 1) yyjson_mut_arr_add_val(d1_arr, item); + else if (h == 2) yyjson_mut_arr_add_val(d2_arr, item); + else yyjson_mut_arr_add_val(d3_arr, item); + } + } + yyjson_mut_val *by_depth = yyjson_mut_obj(doc); + yyjson_mut_obj_add_val(doc, by_depth, "d1_will_break", d1_arr); + yyjson_mut_obj_add_val(doc, by_depth, "d2_likely_affected", d2_arr); + yyjson_mut_obj_add_val(doc, by_depth, "d3_may_need_testing", d3_arr); + yyjson_mut_obj_add_val(doc, root, "by_depth", by_depth); + + /* Risk assessment */ + const char *risk; + if (depth_counts[1] >= 20) risk = "CRITICAL"; + else if (depth_counts[1] >= 10) risk = "HIGH"; + else if (depth_counts[1] >= 3) risk = "MEDIUM"; + else risk = "LOW"; + + yyjson_mut_obj_add_str(doc, root, "risk", risk); + yyjson_mut_obj_add_int(doc, root, "total_affected", total_affected); + yyjson_mut_obj_add_int(doc, root, "direct_callers", depth_counts[1]); + + /* Summary labels per depth */ + yyjson_mut_val *summary = yyjson_mut_obj(doc); + char d1_label[64]; snprintf(d1_label, sizeof(d1_label), "%d WILL BREAK", depth_counts[1]); + char d2_label[64]; snprintf(d2_label, sizeof(d2_label), "%d LIKELY AFFECTED", depth_counts[2]); + char d3_label[64]; snprintf(d3_label, sizeof(d3_label), "%d MAY NEED TESTING", depth_counts[3]); + yyjson_mut_obj_add_strcpy(doc, summary, "d1", d1_label); + yyjson_mut_obj_add_strcpy(doc, summary, "d2", d2_label); + yyjson_mut_obj_add_strcpy(doc, summary, "d3", d3_label); + yyjson_mut_obj_add_val(doc, root, "summary", summary); + + /* Affected processes — match by checking if any BFS-visited node name + * appears in the process label, OR if the target name itself appears. + * This catches processes that flow through the target's methods. */ + { + cbm_process_info_t *procs = NULL; + int pcount = 0; + cbm_store_list_processes(store, project, &procs, &pcount); + yyjson_mut_val *paff = yyjson_mut_arr(doc); + int pc = 0; + for (int pi = 0; pi < pcount && pc < 20; pi++) { + if (!procs[pi].label) continue; + bool match = false; + /* Check target name */ + if (target && strstr(procs[pi].label, target)) match = true; + /* Check BFS-visited node names (d=1 callers are most likely) */ + if (!match) { + for (int v = 0; v < tr.visited_count && !match; v++) { + if (tr.visited[v].hop == 1 && tr.visited[v].node.name && + strstr(procs[pi].label, tr.visited[v].node.name)) { + match = true; + } + } + } + if (match) { + yyjson_mut_val *pitem = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, pitem, "label", procs[pi].label); + yyjson_mut_obj_add_int(doc, pitem, "step_count", procs[pi].step_count); + yyjson_mut_arr_add_val(paff, pitem); + pc++; + } + } + yyjson_mut_obj_add_val(doc, root, "affected_processes", paff); + cbm_store_free_processes(procs, pcount); + } + + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + cbm_store_traverse_free(&tr); + cbm_store_free_nodes(nodes, node_count); + free(start_ids); + free(target); free(project); free(direction); + char *result = cbm_mcp_text_result(json, false); + free(json); + return result; +} + +static char *handle_get_channels(cbm_mcp_server_t *srv, const char *args) { + char *project = cbm_mcp_get_string_arg(args, "project"); + char *channel = cbm_mcp_get_string_arg(args, "channel"); + + /* Cross-repo channel query: when project is NULL, iterate all indexed projects */ + cbm_channel_info_t *channels = NULL; + int count = 0; + + if (!project || strlen(project) == 0) { + char dir_path[1024]; + cache_dir(dir_path, sizeof(dir_path)); + cbm_dir_t *d = cbm_opendir(dir_path); + if (d) { + cbm_dirent_t *entry; + while ((entry = cbm_readdir(d)) != NULL) { + const char *n = entry->name; + size_t len = strlen(n); + if (len < 4 || strcmp(n + len - 3, ".db") != 0) continue; + if (strncmp(n, "tmp-", 4) == 0 || strncmp(n, "_", 1) == 0) continue; + + /* Extract project name (filename without .db) */ + char proj_name[512]; + snprintf(proj_name, sizeof(proj_name), "%.*s", (int)(len - 3), n); + + /* Open this project's store and query channels */ + char db_path[2048]; + snprintf(db_path, sizeof(db_path), "%s/%s", dir_path, n); + cbm_store_t *ps = cbm_store_open_path_query(db_path); + if (!ps) continue; + + cbm_channel_info_t *proj_ch = NULL; + int proj_count = 0; + cbm_store_find_channels(ps, proj_name, channel, &proj_ch, &proj_count); + + if (proj_count > 0) { + /* Merge into main results */ + channels = safe_realloc(channels, + (count + proj_count) * sizeof(cbm_channel_info_t)); + memcpy(channels + count, proj_ch, proj_count * sizeof(cbm_channel_info_t)); + count += proj_count; + free(proj_ch); /* shallow free — info fields now owned by channels[] */ + } + cbm_store_close(ps); + } + cbm_closedir(d); + } + } else { + cbm_store_t *store = resolve_store(srv, project); + REQUIRE_STORE(store, project); + cbm_store_find_channels(store, project, channel, &channels, &count); + } + + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + + yyjson_mut_obj_add_int(doc, root, "total", count); + + /* Group by channel name for readable output */ + yyjson_mut_val *arr = yyjson_mut_arr(doc); + for (int i = 0; i < count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "channel", + channels[i].channel_name ? channels[i].channel_name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "direction", + channels[i].direction ? channels[i].direction : ""); + yyjson_mut_obj_add_strcpy(doc, item, "transport", + channels[i].transport ? channels[i].transport : ""); + yyjson_mut_obj_add_strcpy(doc, item, "project", + channels[i].project ? channels[i].project : ""); + yyjson_mut_obj_add_strcpy(doc, item, "file", + channels[i].file_path ? channels[i].file_path : ""); + yyjson_mut_obj_add_strcpy(doc, item, "function", + channels[i].function_name ? channels[i].function_name : ""); + yyjson_mut_arr_add_val(arr, item); + } + yyjson_mut_obj_add_val(doc, root, "channels", arr); + + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + cbm_store_free_channels(channels, count); + free(project); + free(channel); + + char *result = cbm_mcp_text_result(json, false); + free(json); + return result; +} + +static char *handle_list_processes(cbm_mcp_server_t *srv, const char *args) { + char *project = cbm_mcp_get_string_arg(args, "project"); + cbm_store_t *store = resolve_store(srv, project); + REQUIRE_STORE(store, project); + + cbm_process_info_t *procs = NULL; + int count = 0; + cbm_store_list_processes(store, project, &procs, &count); + + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + + yyjson_mut_obj_add_int(doc, root, "total", count); + + yyjson_mut_val *arr = yyjson_mut_arr(doc); + for (int i = 0; i < count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_int(doc, item, "id", procs[i].id); + yyjson_mut_obj_add_strcpy(doc, item, "label", procs[i].label ? procs[i].label : ""); + yyjson_mut_obj_add_strcpy(doc, item, "process_type", + procs[i].process_type ? procs[i].process_type : ""); + yyjson_mut_obj_add_int(doc, item, "step_count", procs[i].step_count); + yyjson_mut_obj_add_int(doc, item, "entry_point_id", procs[i].entry_point_id); + yyjson_mut_obj_add_int(doc, item, "terminal_id", procs[i].terminal_id); + yyjson_mut_arr_add_val(arr, item); + } + yyjson_mut_obj_add_val(doc, root, "processes", arr); + + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + cbm_store_free_processes(procs, count); + free(project); + + char *result = cbm_mcp_text_result(json, false); + free(json); + return result; +} + static char *handle_get_architecture(cbm_mcp_server_t *srv, const char *args) { char *project = cbm_mcp_get_string_arg(args, "project"); cbm_store_t *store = resolve_store(srv, project); @@ -1169,6 +1628,12 @@ static char *handle_get_architecture(cbm_mcp_server_t *srv, const char *args) { int node_count = cbm_store_count_nodes(store, project); int edge_count = cbm_store_count_edges(store, project); + /* Call the full architecture analysis */ + cbm_architecture_info_t arch = {0}; + const char *all_aspects[] = {"languages", "hotspots", "routes", "entry_points", + "packages", "clusters", "layers", "boundaries"}; + cbm_store_get_architecture(store, project, all_aspects, 8, &arch); + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); yyjson_mut_val *root = yyjson_mut_obj(doc); yyjson_mut_doc_set_root(doc, root); @@ -1199,6 +1664,105 @@ static char *handle_get_architecture(cbm_mcp_server_t *srv, const char *args) { } yyjson_mut_obj_add_val(doc, root, "edge_types", types); + /* Languages */ + if (arch.language_count > 0) { + yyjson_mut_val *langs = yyjson_mut_arr(doc); + for (int i = 0; i < arch.language_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "language", + arch.languages[i].language ? arch.languages[i].language : ""); + yyjson_mut_obj_add_int(doc, item, "files", arch.languages[i].file_count); + yyjson_mut_arr_add_val(langs, item); + } + yyjson_mut_obj_add_val(doc, root, "languages", langs); + } + + /* Hotspots (high fan-in functions) */ + if (arch.hotspot_count > 0) { + yyjson_mut_val *spots = yyjson_mut_arr(doc); + for (int i = 0; i < arch.hotspot_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "name", + arch.hotspots[i].name ? arch.hotspots[i].name : ""); + yyjson_mut_obj_add_strcpy( + doc, item, "qualified_name", + arch.hotspots[i].qualified_name ? arch.hotspots[i].qualified_name : ""); + yyjson_mut_obj_add_int(doc, item, "fan_in", arch.hotspots[i].fan_in); + yyjson_mut_arr_add_val(spots, item); + } + yyjson_mut_obj_add_val(doc, root, "hotspots", spots); + } + + /* Routes */ + if (arch.route_count > 0) { + yyjson_mut_val *routes_arr = yyjson_mut_arr(doc); + for (int i = 0; i < arch.route_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "method", + arch.routes[i].method ? arch.routes[i].method : ""); + yyjson_mut_obj_add_strcpy(doc, item, "path", + arch.routes[i].path ? arch.routes[i].path : ""); + yyjson_mut_obj_add_strcpy(doc, item, "handler", + arch.routes[i].handler ? arch.routes[i].handler : ""); + yyjson_mut_arr_add_val(routes_arr, item); + } + yyjson_mut_obj_add_val(doc, root, "routes", routes_arr); + } + + /* Entry points */ + if (arch.entry_point_count > 0) { + yyjson_mut_val *eps = yyjson_mut_arr(doc); + for (int i = 0; i < arch.entry_point_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "name", + arch.entry_points[i].name ? arch.entry_points[i].name : ""); + yyjson_mut_obj_add_strcpy( + doc, item, "qualified_name", + arch.entry_points[i].qualified_name ? arch.entry_points[i].qualified_name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "file", + arch.entry_points[i].file ? arch.entry_points[i].file : ""); + yyjson_mut_arr_add_val(eps, item); + } + yyjson_mut_obj_add_val(doc, root, "entry_points", eps); + } + + /* Packages */ + if (arch.package_count > 0) { + yyjson_mut_val *pkgs = yyjson_mut_arr(doc); + for (int i = 0; i < arch.package_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "name", + arch.packages[i].name ? arch.packages[i].name : ""); + yyjson_mut_obj_add_int(doc, item, "node_count", arch.packages[i].node_count); + yyjson_mut_obj_add_int(doc, item, "fan_in", arch.packages[i].fan_in); + yyjson_mut_obj_add_int(doc, item, "fan_out", arch.packages[i].fan_out); + yyjson_mut_arr_add_val(pkgs, item); + } + yyjson_mut_obj_add_val(doc, root, "packages", pkgs); + } + + /* Clusters */ + if (arch.cluster_count > 0) { + yyjson_mut_val *cls = yyjson_mut_arr(doc); + for (int i = 0; i < arch.cluster_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_int(doc, item, "id", arch.clusters[i].id); + yyjson_mut_obj_add_strcpy(doc, item, "label", + arch.clusters[i].label ? arch.clusters[i].label : ""); + yyjson_mut_obj_add_int(doc, item, "members", arch.clusters[i].members); + yyjson_mut_obj_add_real(doc, item, "cohesion", arch.clusters[i].cohesion); + if (arch.clusters[i].top_node_count > 0) { + yyjson_mut_val *tn = yyjson_mut_arr(doc); + for (int j = 0; j < arch.clusters[i].top_node_count; j++) { + yyjson_mut_arr_add_strcpy(doc, tn, arch.clusters[i].top_nodes[j]); + } + yyjson_mut_obj_add_val(doc, item, "top_nodes", tn); + } + yyjson_mut_arr_add_val(cls, item); + } + yyjson_mut_obj_add_val(doc, root, "clusters", cls); + } + /* Relationship patterns */ if (schema.rel_pattern_count > 0) { yyjson_mut_val *pats = yyjson_mut_arr(doc); @@ -1210,6 +1774,7 @@ static char *handle_get_architecture(cbm_mcp_server_t *srv, const char *args) { char *json = yy_doc_to_str(doc); yyjson_mut_doc_free(doc); + cbm_store_architecture_free(&arch); cbm_store_schema_free(&schema); free(project); @@ -1258,13 +1823,149 @@ static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) { cbm_store_find_nodes_by_name(store, project, func_name, &nodes, &node_count); if (node_count == 0) { - free(func_name); - free(project); - free(direction); + /* Fuzzy fallback: try substring match when exact name not found. + * This handles cases like searching for "RecordingSession" when only + * "ContinuousRecordingSessionDataGen" exists. */ + cbm_search_params_t fuzzy = {0}; + char pattern[512]; + snprintf(pattern, sizeof(pattern), ".*%s.*", func_name); + fuzzy.project = project; + fuzzy.name_pattern = pattern; + fuzzy.limit = 10; + cbm_search_output_t fuzzy_results = {0}; + cbm_store_search(store, &fuzzy, &fuzzy_results); + + if (fuzzy_results.count > 0) { + /* Return fuzzy matches as suggestions */ + yyjson_mut_doc *fdoc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *froot = yyjson_mut_obj(fdoc); + yyjson_mut_doc_set_root(fdoc, froot); + yyjson_mut_obj_add_str(fdoc, froot, "status", "not_found_exact"); + char msg[512]; + snprintf(msg, sizeof(msg), + "No exact match for '%s'. Found %d partial matches — " + "use one of these exact names:", func_name, fuzzy_results.count); + yyjson_mut_obj_add_strcpy(fdoc, froot, "message", msg); + yyjson_mut_val *suggestions = yyjson_mut_arr(fdoc); + for (int i = 0; i < fuzzy_results.count; i++) { + yyjson_mut_val *si = yyjson_mut_obj(fdoc); + yyjson_mut_obj_add_strcpy(fdoc, si, "name", + fuzzy_results.results[i].node.name ? fuzzy_results.results[i].node.name : ""); + yyjson_mut_obj_add_strcpy(fdoc, si, "label", + fuzzy_results.results[i].node.label ? fuzzy_results.results[i].node.label : ""); + yyjson_mut_obj_add_strcpy(fdoc, si, "file_path", + fuzzy_results.results[i].node.file_path ? fuzzy_results.results[i].node.file_path : ""); + yyjson_mut_obj_add_int(fdoc, si, "line", fuzzy_results.results[i].node.start_line); + yyjson_mut_arr_add_val(suggestions, si); + } + yyjson_mut_obj_add_val(fdoc, froot, "suggestions", suggestions); + char *fjson = yy_doc_to_str(fdoc); + yyjson_mut_doc_free(fdoc); + cbm_store_search_free(&fuzzy_results); + free(func_name); free(project); free(direction); + cbm_store_free_nodes(nodes, 0); + char *result = cbm_mcp_text_result(fjson, false); + free(fjson); + return result; + } + cbm_store_search_free(&fuzzy_results); + free(func_name); free(project); free(direction); cbm_store_free_nodes(nodes, 0); return cbm_mcp_text_result("{\"error\":\"function not found\"}", true); } + /* Pick the best node for tracing. Strategy: + * 1. Prefer Function/Method nodes that are NOT constructors (same name as a + * Class in the result set — constructors rarely have interesting CALLS). + * 2. If only Class/Interface nodes match, resolve through DEFINES_METHOD. */ + int best_idx = 0; + bool has_class = false; + int class_idx = -1; + for (int i = 0; i < node_count; i++) { + const char *lbl = nodes[i].label; + if (lbl && (strcmp(lbl, "Class") == 0 || strcmp(lbl, "Interface") == 0)) { + has_class = true; + if (class_idx < 0) class_idx = i; + } + } + /* Look for a non-constructor Function/Method */ + bool found_callable = false; + for (int i = 0; i < node_count; i++) { + const char *lbl = nodes[i].label; + if (lbl && (strcmp(lbl, "Function") == 0 || strcmp(lbl, "Method") == 0)) { + /* Skip if this is a constructor (same name as a Class in results) */ + if (has_class) continue; + best_idx = i; + found_callable = true; + break; + } + } + /* If no non-constructor callable was found but we have a Class, use the Class */ + if (!found_callable && class_idx >= 0) { + best_idx = class_idx; + } + + /* Track disambiguation info — added to the main doc after creation */ + int callable_count = 0; + for (int i = 0; i < node_count; i++) { + const char *lbl = nodes[i].label; + if (lbl && strcmp(lbl, "File") != 0 && strcmp(lbl, "Folder") != 0 && + strcmp(lbl, "Module") != 0 && strcmp(lbl, "Variable") != 0 && + strcmp(lbl, "Section") != 0 && strcmp(lbl, "Project") != 0) { + callable_count++; + } + } + + /* Determine if the selected node is a Class or Interface. If so, we need to + * resolve through DEFINES_METHOD edges to find the actual callable methods, + * then run BFS from each method and merge results. */ + bool is_class_like = false; + const char *best_label = nodes[best_idx].label; + if (best_label && + (strcmp(best_label, "Class") == 0 || strcmp(best_label, "Interface") == 0)) { + is_class_like = true; + } + + /* Collect BFS start IDs: either the single node, or all methods of the class */ + int64_t *start_ids = NULL; + int start_id_count = 0; + + if (is_class_like) { + /* Find all DEFINES_METHOD targets of this class */ + cbm_edge_t *dm_edges = NULL; + int dm_count = 0; + cbm_store_find_edges_by_source_type(store, nodes[best_idx].id, "DEFINES_METHOD", + &dm_edges, &dm_count); + if (dm_count > 0) { + /* Cap at 5 methods to prevent excessive BFS calls (each method + * spawns ~6 BFS queries across edge type categories) */ + int use_count = dm_count > 5 ? 5 : dm_count; + start_ids = malloc((size_t)use_count * sizeof(int64_t)); + for (int i = 0; i < use_count; i++) { + start_ids[i] = dm_edges[i].target_id; + } + start_id_count = use_count; + } + /* Free edge data */ + for (int i = 0; i < dm_count; i++) { + free((void *)dm_edges[i].project); + free((void *)dm_edges[i].type); + free((void *)dm_edges[i].properties_json); + } + free(dm_edges); + + /* If no methods found, fall back to the class node itself */ + if (start_id_count == 0) { + start_ids = malloc(sizeof(int64_t)); + start_ids[0] = nodes[best_idx].id; + start_id_count = 1; + } + } else { + start_ids = malloc(sizeof(int64_t)); + start_ids[0] = nodes[best_idx].id; + start_id_count = 1; + } + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); yyjson_mut_val *root = yyjson_mut_obj(doc); yyjson_mut_doc_set_root(doc, root); @@ -1272,68 +1973,341 @@ static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) { yyjson_mut_obj_add_str(doc, root, "function", func_name); yyjson_mut_obj_add_str(doc, root, "direction", direction); - const char *edge_types[] = {"CALLS"}; - int edge_type_count = 1; + /* Add matched node info */ + yyjson_mut_obj_add_strcpy(doc, root, "matched_file", + nodes[best_idx].file_path ? nodes[best_idx].file_path : ""); + yyjson_mut_obj_add_strcpy(doc, root, "matched_label", + nodes[best_idx].label ? nodes[best_idx].label : ""); + yyjson_mut_obj_add_int(doc, root, "matched_line", nodes[best_idx].start_line); + + /* Disambiguation: list all callable candidates when multiple match */ + if (callable_count > 1) { + yyjson_mut_val *cands = yyjson_mut_arr(doc); + for (int i = 0; i < node_count; i++) { + const char *lbl = nodes[i].label; + if (lbl && strcmp(lbl, "File") != 0 && strcmp(lbl, "Folder") != 0 && + strcmp(lbl, "Module") != 0 && strcmp(lbl, "Variable") != 0 && + strcmp(lbl, "Section") != 0 && strcmp(lbl, "Project") != 0) { + yyjson_mut_val *ci = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, ci, "name", + nodes[i].name ? nodes[i].name : ""); + yyjson_mut_obj_add_strcpy(doc, ci, "label", + nodes[i].label ? nodes[i].label : ""); + yyjson_mut_obj_add_strcpy(doc, ci, "file_path", + nodes[i].file_path ? nodes[i].file_path : ""); + yyjson_mut_obj_add_int(doc, ci, "line", nodes[i].start_line); + yyjson_mut_arr_add_val(cands, ci); + } + } + yyjson_mut_obj_add_val(doc, root, "candidates", cands); + } + + /* Check if the node has any edges at all. If not, return basic info only. + * This avoids BFS crashes on nodes with 0 edges (e.g. Type nodes, empty Classes). */ + { + int in_deg = 0; + int out_deg = 0; + cbm_store_node_degree(store, nodes[best_idx].id, &in_deg, &out_deg); + if (in_deg == 0 && out_deg == 0 && !is_class_like) { + /* No CALLS edges and not a Class — return basic info. + * Class/Interface nodes skip this check because they have + * DEFINES_METHOD and INHERITS edges that aren't counted by + * cbm_store_node_degree (which only counts CALLS). */ + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + free(start_ids); + cbm_store_free_nodes(nodes, node_count); + free(func_name); free(project); free(direction); + char *result = cbm_mcp_text_result(json, false); + free(json); + return result; + } + } + + /* ── Categorized edge query: like GitNexus context() ── + * Instead of flat BFS, query each edge type separately and return + * categorized results: incoming.calls, incoming.imports, incoming.extends, + * outgoing.calls, outgoing.has_method, outgoing.has_property. + * This gives investigation-grade output where a QA engineer can see + * exactly which functions CALL this vs which files IMPORT it. */ + + /* Helper: query edges for specific types and build JSON array. + * Uses strcpy variants since nodes are freed per-query. */ + #define EDGE_QUERY_MAX 30 - /* Run BFS for each requested direction. - * IMPORTANT: yyjson_mut_obj_add_str borrows pointers — we must keep - * traversal results alive until after yy_doc_to_str serialization. */ // NOLINTNEXTLINE(readability-implicit-bool-conversion) bool do_outbound = strcmp(direction, "outbound") == 0 || strcmp(direction, "both") == 0; // NOLINTNEXTLINE(readability-implicit-bool-conversion) bool do_inbound = strcmp(direction, "inbound") == 0 || strcmp(direction, "both") == 0; - cbm_traverse_result_t tr_out = {0}; - cbm_traverse_result_t tr_in = {0}; + /* Collect all traversal results for lifetime management */ + #define MAX_TR 64 + cbm_traverse_result_t *all_tr = calloc(MAX_TR, sizeof(cbm_traverse_result_t)); + int tr_count = 0; - if (do_outbound) { - cbm_store_bfs(store, nodes[0].id, "outbound", edge_types, edge_type_count, depth, 100, - &tr_out); + if (do_inbound) { + yyjson_mut_val *incoming = yyjson_mut_obj(doc); + + /* Incoming CALLS (direct callers — hop 1 only for clean results). + * For Classes: also include USAGE and DEFINES edges which capture + * file-level references like `new MyClass()` and `import MyClass`. + * Query both the class node AND its methods as BFS roots. */ + { + const char *call_types[] = {"CALLS", "HTTP_CALLS", "ASYNC_CALLS", "USAGE", "RAISES"}; + /* Always include the original node (Class or Function) */ + if (tr_count < MAX_TR) { + cbm_store_bfs(store, nodes[best_idx].id, "inbound", call_types, 5, 1, + EDGE_QUERY_MAX, &all_tr[tr_count]); + tr_count++; + } + /* Also include methods for class resolution */ + for (int s = 0; s < start_id_count && tr_count < MAX_TR; s++) { + if (start_ids[s] == nodes[best_idx].id) continue; /* already queried */ + cbm_store_bfs(store, start_ids[s], "inbound", call_types, 5, 1, EDGE_QUERY_MAX, + &all_tr[tr_count]); + if (all_tr[tr_count].visited_count > 0) { + tr_count++; + } + } + } + /* Build calls array from all BFS results */ + yyjson_mut_val *calls_arr = yyjson_mut_arr(doc); + for (int t = 0; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_obj_add_int(doc, item, "line", vn->start_line); + yyjson_mut_arr_add_val(calls_arr, item); + } + } + yyjson_mut_obj_add_val(doc, incoming, "calls", calls_arr); + + /* Incoming IMPORTS */ + { + int saved_tr = tr_count; + for (int s = 0; s < start_id_count && tr_count < MAX_TR; s++) { + const char *imp_types[] = {"IMPORTS"}; + cbm_store_bfs(store, start_ids[s], "inbound", imp_types, 1, 1, EDGE_QUERY_MAX, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *imp_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_arr_add_val(imp_arr, item); + } + } + yyjson_mut_obj_add_val(doc, incoming, "imports", imp_arr); + } - yyjson_mut_val *callees = yyjson_mut_arr(doc); - for (int i = 0; i < tr_out.visited_count; i++) { - yyjson_mut_val *item = yyjson_mut_obj(doc); - yyjson_mut_obj_add_str(doc, item, "name", - tr_out.visited[i].node.name ? tr_out.visited[i].node.name : ""); - yyjson_mut_obj_add_str( - doc, item, "qualified_name", - tr_out.visited[i].node.qualified_name ? tr_out.visited[i].node.qualified_name : ""); - yyjson_mut_obj_add_int(doc, item, "hop", tr_out.visited[i].hop); - yyjson_mut_arr_add_val(callees, item); + /* Incoming INHERITS (who extends this) */ + { + int saved_tr = tr_count; + for (int s = 0; s < start_id_count && tr_count < MAX_TR; s++) { + const char *inh_types[] = {"INHERITS"}; + cbm_store_bfs(store, start_ids[s], "inbound", inh_types, 1, 1, EDGE_QUERY_MAX, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *inh_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_arr_add_val(inh_arr, item); + } + } + yyjson_mut_obj_add_val(doc, incoming, "extends", inh_arr); + } + + yyjson_mut_obj_add_val(doc, root, "incoming", incoming); + + /* Also include deeper BFS (hop 2+) as a separate "transitive_callers" field + * for users who need it — but only on CALLS, capped at 50. */ + if (depth > 1) { + int saved_tr2 = tr_count; + for (int s = 0; s < start_id_count && tr_count < MAX_TR; s++) { + const char *call_types[] = {"CALLS", "HTTP_CALLS", "ASYNC_CALLS"}; + cbm_store_bfs(store, start_ids[s], "inbound", call_types, 3, depth, 50, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *trans_arr = yyjson_mut_arr(doc); + for (int t = saved_tr2; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + if (all_tr[t].visited[i].hop <= 1) continue; /* skip hop 1, already shown */ + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_obj_add_int(doc, item, "hop", all_tr[t].visited[i].hop); + yyjson_mut_arr_add_val(trans_arr, item); + } + } + yyjson_mut_obj_add_val(doc, root, "transitive_callers", trans_arr); } - yyjson_mut_obj_add_val(doc, root, "callees", callees); } - if (do_inbound) { - cbm_store_bfs(store, nodes[0].id, "inbound", edge_types, edge_type_count, depth, 100, - &tr_in); + if (do_outbound) { + yyjson_mut_val *outgoing = yyjson_mut_obj(doc); + + /* Outgoing CALLS */ + { + int saved_tr = tr_count; + for (int s = 0; s < start_id_count && tr_count < MAX_TR; s++) { + const char *call_types[] = {"CALLS", "HTTP_CALLS", "ASYNC_CALLS"}; + cbm_store_bfs(store, start_ids[s], "outbound", call_types, 3, 1, EDGE_QUERY_MAX, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *calls_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_obj_add_int(doc, item, "line", vn->start_line); + yyjson_mut_arr_add_val(calls_arr, item); + } + } + yyjson_mut_obj_add_val(doc, outgoing, "calls", calls_arr); + } - yyjson_mut_val *callers = yyjson_mut_arr(doc); - for (int i = 0; i < tr_in.visited_count; i++) { - yyjson_mut_val *item = yyjson_mut_obj(doc); - yyjson_mut_obj_add_str(doc, item, "name", - tr_in.visited[i].node.name ? tr_in.visited[i].node.name : ""); - yyjson_mut_obj_add_str( - doc, item, "qualified_name", - tr_in.visited[i].node.qualified_name ? tr_in.visited[i].node.qualified_name : ""); - yyjson_mut_obj_add_int(doc, item, "hop", tr_in.visited[i].hop); - yyjson_mut_arr_add_val(callers, item); + /* Outgoing DEFINES_METHOD (for Classes). + * Use the original Class node ID, not start_ids (which are method IDs). + * DEFINES_METHOD edges go FROM the Class TO its Methods. */ + { + int saved_tr = tr_count; + if (is_class_like && tr_count < MAX_TR) { + const char *dm_types[] = {"DEFINES_METHOD"}; + cbm_store_bfs(store, nodes[best_idx].id, "outbound", dm_types, 1, 1, 30, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *methods_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_obj_add_int(doc, item, "line", vn->start_line); + yyjson_mut_arr_add_val(methods_arr, item); + } + } + yyjson_mut_obj_add_val(doc, outgoing, "has_method", methods_arr); + } + + /* Outgoing HAS_PROPERTY (for Classes — class properties). */ + { + int saved_tr = tr_count; + if (is_class_like && tr_count < MAX_TR) { + const char *hp_types[] = {"HAS_PROPERTY"}; + cbm_store_bfs(store, nodes[best_idx].id, "outbound", hp_types, 1, 1, 30, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *props_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_obj_add_int(doc, item, "line", vn->start_line); + yyjson_mut_arr_add_val(props_arr, item); + } + } + yyjson_mut_obj_add_val(doc, outgoing, "has_property", props_arr); + } + + /* Outgoing INHERITS (what this extends) */ + { + int saved_tr = tr_count; + const char *inh_types[] = {"INHERITS"}; + cbm_store_bfs(store, nodes[best_idx].id, "outbound", inh_types, 1, 1, 10, + &all_tr[tr_count]); + tr_count++; + yyjson_mut_val *ext_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_arr_add_val(ext_arr, item); + } + } + yyjson_mut_obj_add_val(doc, outgoing, "extends", ext_arr); + } + + yyjson_mut_obj_add_val(doc, root, "outgoing", outgoing); + } + + /* Process participation */ + { + cbm_process_info_t *procs = NULL; + int pcount = 0; + cbm_store_list_processes(store, project, &procs, &pcount); + if (pcount > 0) { + yyjson_mut_val *flows = yyjson_mut_arr(doc); + int flow_count = 0; + for (int pi = 0; pi < pcount && flow_count < 20; pi++) { + bool participates = false; + if (procs[pi].entry_point_id == nodes[best_idx].id || + procs[pi].terminal_id == nodes[best_idx].id) { + participates = true; + } + if (!participates) { + for (int si = 0; si < start_id_count; si++) { + if (procs[pi].entry_point_id == start_ids[si] || + procs[pi].terminal_id == start_ids[si]) { + participates = true; + break; + } + } + } + if (!participates && func_name && procs[pi].label) { + if (strstr(procs[pi].label, func_name) != NULL) { + participates = true; + } + } + if (participates) { + yyjson_mut_val *fi = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, fi, "label", + procs[pi].label ? procs[pi].label : ""); + yyjson_mut_obj_add_int(doc, fi, "step_count", procs[pi].step_count); + yyjson_mut_arr_add_val(flows, fi); + flow_count++; + } + } + if (flow_count > 0) yyjson_mut_obj_add_val(doc, root, "processes", flows); } - yyjson_mut_obj_add_val(doc, root, "callers", callers); + cbm_store_free_processes(procs, pcount); } /* Serialize BEFORE freeing traversal results (yyjson borrows strings) */ char *json = yy_doc_to_str(doc); yyjson_mut_doc_free(doc); - /* Now safe to free traversal data */ - if (do_outbound) { - cbm_store_traverse_free(&tr_out); - } - if (do_inbound) { - cbm_store_traverse_free(&tr_in); + /* Now safe to free all traversal data */ + for (int t = 0; t < tr_count; t++) { + cbm_store_traverse_free(&all_tr[t]); } + free(all_tr); + #undef EDGE_QUERY_MAX + #undef MAX_TR + free(start_ids); cbm_store_free_nodes(nodes, node_count); free(func_name); free(project); @@ -2502,7 +3476,10 @@ static char *handle_detect_changes(cbm_mcp_server_t *srv, const char *args) { continue; } - yyjson_mut_arr_add_str(doc, changed, line); + /* Use strcpy variants: line is a stack buffer reused each iteration, + * and node strings are freed by cbm_store_free_nodes below. + * yyjson_mut_*_add_str only borrows pointers — strcpy makes copies. */ + yyjson_mut_arr_add_strcpy(doc, changed, line); file_count++; /* Find symbols defined in this file */ @@ -2514,9 +3491,9 @@ static char *handle_detect_changes(cbm_mcp_server_t *srv, const char *args) { if (nodes[i].label && strcmp(nodes[i].label, "File") != 0 && strcmp(nodes[i].label, "Folder") != 0 && strcmp(nodes[i].label, "Project") != 0) { yyjson_mut_val *item = yyjson_mut_obj(doc); - yyjson_mut_obj_add_str(doc, item, "name", nodes[i].name ? nodes[i].name : ""); - yyjson_mut_obj_add_str(doc, item, "label", nodes[i].label); - yyjson_mut_obj_add_str(doc, item, "file", line); + yyjson_mut_obj_add_strcpy(doc, item, "name", nodes[i].name ? nodes[i].name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "label", nodes[i].label); + yyjson_mut_obj_add_strcpy(doc, item, "file", line); yyjson_mut_arr_add_val(impacted, item); } } @@ -2704,6 +3681,18 @@ char *cbm_mcp_handle_tool(cbm_mcp_server_t *srv, const char *tool_name, const ch if (strcmp(tool_name, "get_architecture") == 0) { return handle_get_architecture(srv, args_json); } + if (strcmp(tool_name, "list_processes") == 0) { + return handle_list_processes(srv, args_json); + } + if (strcmp(tool_name, "get_channels") == 0) { + return handle_get_channels(srv, args_json); + } + if (strcmp(tool_name, "get_process_steps") == 0) { + return handle_get_process_steps(srv, args_json); + } + if (strcmp(tool_name, "get_impact") == 0) { + return handle_get_impact(srv, args_json); + } /* Pipeline-dependent tools */ if (strcmp(tool_name, "index_repository") == 0) { diff --git a/src/pipeline/fqn.c b/src/pipeline/fqn.c index 0936c78c..fb860730 100644 --- a/src/pipeline/fqn.c +++ b/src/pipeline/fqn.c @@ -158,6 +158,91 @@ char *cbm_pipeline_fqn_folder(const char *project, const char *rel_dir) { return result; } +/** + * Resolve an import module_path relative to the importing file's directory. + * + * For relative paths (starting with ./ or ../), resolves against the importer's + * directory. For bare module specifiers (no ./ prefix), returns a copy unchanged. + * + * Examples (importer_rel_path="src/routes/api.js"): + * "./controllers/auth" → "src/routes/controllers/auth" + * "../utils/helpers" → "src/utils/helpers" + * "lodash" → "lodash" (bare module, unchanged) + * "@hapi/hapi" → "@hapi/hapi" (scoped package, unchanged) + * + * Returns: heap-allocated resolved path. Caller must free(). + */ +char *cbm_pipeline_resolve_import_path(const char *importer_rel_path, const char *module_path) { + if (!module_path || !module_path[0]) { + return strdup(""); + } + + /* Bare module specifier — no relative path resolution needed */ + if (module_path[0] != '.') { + return strdup(module_path); + } + + /* Get the importing file's directory */ + char *importer_dir = strdup(importer_rel_path ? importer_rel_path : ""); + cbm_normalize_path_sep(importer_dir); + char *last_slash = strrchr(importer_dir, '/'); + if (last_slash) { + *(last_slash + 1) = '\0'; /* keep trailing slash */ + } else { + importer_dir[0] = '\0'; /* file is at root */ + } + + /* Concatenate: importer_dir + module_path */ + size_t dir_len = strlen(importer_dir); + size_t mod_len = strlen(module_path); + char *combined = malloc(dir_len + mod_len + 2); + snprintf(combined, dir_len + mod_len + 2, "%s%s", importer_dir, module_path); + free(importer_dir); + + /* Normalize: resolve . and .. segments */ + cbm_normalize_path_sep(combined); + const char *segments[256]; + int seg_count = 0; + + char *tok = combined; + while (tok && *tok) { + char *slash = strchr(tok, '/'); + if (slash) *slash = '\0'; + + if (strcmp(tok, ".") == 0) { + /* skip */ + } else if (strcmp(tok, "..") == 0) { + if (seg_count > 0) seg_count--; /* pop parent */ + } else if (tok[0] != '\0') { + if (seg_count < 255) { + segments[seg_count++] = tok; + } + } + + tok = slash ? slash + 1 : NULL; + } + + /* Rebuild path */ + if (seg_count == 0) { + free(combined); + return strdup(""); + } + + size_t total = 0; + for (int i = 0; i < seg_count; i++) { + total += strlen(segments[i]) + 1; + } + char *result = malloc(total + 1); + result[0] = '\0'; + for (int i = 0; i < seg_count; i++) { + if (i > 0) strcat(result, "/"); + strcat(result, segments[i]); + } + + free(combined); + return result; +} + char *cbm_project_name_from_path(const char *abs_path) { if (!abs_path || !abs_path[0]) { return strdup("root"); diff --git a/src/pipeline/httplink.c b/src/pipeline/httplink.c index 7d72c1c5..4144e451 100644 --- a/src/pipeline/httplink.c +++ b/src/pipeline/httplink.c @@ -362,7 +362,6 @@ static int count_segments(const char *path) { return count; } -/* Jaccard similarity of path segments (intersection/union) */ static double segment_jaccard(const char *norm_call, const char *norm_route) { /* Split into segments */ char a[1024]; @@ -1379,6 +1378,193 @@ int cbm_extract_express_routes(const char *name, const char *qn, const char *sou return count; } +/* ── Route extraction: Hapi.js ─────────────────────────────────── */ + +/* Extract a quoted string value after a colon, e.g. method: 'GET' → "GET". + * Returns the number of chars consumed from `src` (0 on failure). */ +static int hapi_extract_string_value(const char *src, char *out, int outsz) { + const char *p = src; + /* Skip whitespace after colon */ + while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++; + char quote = *p; + if (quote != '\'' && quote != '"' && quote != '`') return 0; + p++; + const char *start = p; + while (*p && *p != quote) p++; + if (*p != quote) return 0; + int len = (int)(p - start); + if (len >= outsz) len = outsz - 1; + memcpy(out, start, (size_t)len); + out[len] = '\0'; + return (int)(p + 1 - src); +} + +// NOLINTNEXTLINE(bugprone-easily-swappable-parameters) +int cbm_extract_hapi_routes(const char *name, const char *qn, const char *source, + cbm_route_handler_t *out, int max_out) { + if (!source || !*source) { + return 0; + } + + int count = 0; + const char *p = source; + + /* Scan for object literals containing method: and path: properties. + * Hapi pattern: + * { method: 'GET', path: '/api/users', handler: ... } + * or: + * { method: 'POST', path: '/api/users', handler: UsersController.create } + * + * We look for "method:" followed by a string value, then scan nearby for + * "path:" followed by a string value (or vice versa). */ + while (*p && count < max_out) { + /* Find next "method:" or "method :" */ + const char *mkey = strstr(p, "method"); + if (!mkey) break; + + /* Verify it looks like a property key (preceded by space/newline/comma/brace) */ + if (mkey > source) { + char before = *(mkey - 1); + if (before != ' ' && before != '\t' && before != '\n' && before != '\r' && + before != ',' && before != '{') { + p = mkey + 6; + continue; + } + } + + const char *after_method = mkey + 6; + /* Skip optional whitespace and colon */ + while (*after_method == ' ' || *after_method == '\t') after_method++; + if (*after_method != ':') { + p = after_method; + continue; + } + after_method++; /* skip ':' */ + + char method_val[16] = {0}; + int consumed = hapi_extract_string_value(after_method, method_val, sizeof(method_val)); + if (consumed == 0) { + p = after_method; + continue; + } + + /* Uppercase the method */ + for (int j = 0; method_val[j]; j++) { + method_val[j] = (char)toupper((unsigned char)method_val[j]); + } + + /* Validate it's a real HTTP method */ + if (strcmp(method_val, "GET") != 0 && strcmp(method_val, "POST") != 0 && + strcmp(method_val, "PUT") != 0 && strcmp(method_val, "DELETE") != 0 && + strcmp(method_val, "PATCH") != 0 && strcmp(method_val, "OPTIONS") != 0 && + strcmp(method_val, "HEAD") != 0 && strcmp(method_val, "*") != 0) { + p = after_method + consumed; + continue; + } + + /* Search for "path:" within the same object literal — look forward from the + * method: position. Both method: and path: are in the same {...} block, + * typically within 300 chars of each other. Also search a small window + * backward in case path: comes before method: in the object. */ + const char *search_start = (mkey - 300 > source) ? mkey - 300 : source; + const char *search_end_limit = mkey + 500; + char path_val[256] = {0}; + bool found_path = false; + + /* Find the enclosing '{' to scope the search to this object literal */ + const char *obj_start = mkey; + int brace_depth = 0; + while (obj_start > source) { + obj_start--; + if (*obj_start == '{') { + if (brace_depth == 0) break; + brace_depth--; + } else if (*obj_start == '}') { + brace_depth++; + } + } + if (*obj_start == '{') { + search_start = obj_start; + } + + const char *pkey = search_start; + while ((pkey = strstr(pkey, "path")) != NULL && pkey < search_end_limit) { + /* Verify it looks like a property key */ + if (pkey > source) { + char pb = *(pkey - 1); + if (pb != ' ' && pb != '\t' && pb != '\n' && pb != '\r' && + pb != ',' && pb != '{') { + pkey += 4; + continue; + } + } + const char *after_path = pkey + 4; + while (*after_path == ' ' || *after_path == '\t') after_path++; + if (*after_path != ':') { + pkey += 4; + continue; + } + after_path++; + int pc = hapi_extract_string_value(after_path, path_val, sizeof(path_val)); + if (pc > 0 && path_val[0] == '/') { + found_path = true; + break; + } + pkey += 4; + } + + if (found_path) { + /* Optionally extract handler reference — scope to same object */ + char handler_val[256] = {0}; + const char *hkey = strstr(obj_start, "handler"); + while (hkey && hkey < search_end_limit) { + /* Verify property key */ + if (hkey > source) { + char hb = *(hkey - 1); + if (hb != ' ' && hb != '\t' && hb != '\n' && hb != '\r' && + hb != ',' && hb != '{') { + hkey = strstr(hkey + 7, "handler"); + continue; + } + } + const char *after_handler = hkey + 7; + while (*after_handler == ' ' || *after_handler == '\t') after_handler++; + if (*after_handler == ':') { + after_handler++; + while (*after_handler == ' ' || *after_handler == '\t') after_handler++; + /* Handler can be identifier.identifier or just identifier */ + const char *hs = after_handler; + while (*after_handler && *after_handler != ',' && *after_handler != '\n' && + *after_handler != '}' && *after_handler != ' ') { + after_handler++; + } + int hlen = (int)(after_handler - hs); + if (hlen > 0 && hlen < (int)sizeof(handler_val)) { + memcpy(handler_val, hs, (size_t)hlen); + handler_val[hlen] = '\0'; + } + } + break; + } + + cbm_route_handler_t *r = &out[count]; + memset(r, 0, sizeof(*r)); + strncpy(r->method, method_val, sizeof(r->method) - 1); + strncpy(r->path, path_val, sizeof(r->path) - 1); + strncpy(r->function_name, name ? name : "", sizeof(r->function_name) - 1); + strncpy(r->qualified_name, qn ? qn : "", sizeof(r->qualified_name) - 1); + if (handler_val[0]) { + strncpy(r->handler_ref, handler_val, sizeof(r->handler_ref) - 1); + } + count++; + } + + p = after_method + consumed; + } + + return count; +} + /* ── Route extraction: Laravel ─────────────────────────────────── */ // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) @@ -1720,3 +1906,208 @@ int cbm_httplink_all_exclude_paths(const cbm_httplink_config_t *cfg, const char return count; } + +/* ── Channel extraction: Socket.IO / EventEmitter ────────────────── */ + +typedef struct cbm_channel_match { + char channel[256]; + char direction[8]; /* "emit" or "listen" */ + char transport[32]; /* "socketio", "eventemitter" */ +} cbm_channel_match_t; + +int cbm_extract_channels(const char *source, cbm_channel_match_t *out, int max_out) { + if (!source || !*source) return 0; + + cbm_regex_t re; + if (cbm_regcomp(&re, + "([a-zA-Z_][a-zA-Z0-9_]*)\\.(" + "emit|on|once|addListener|removeListener" + ")\\([[:space:]]*['\"`]([^'\"`]{1,128})['\"`]", + CBM_REG_EXTENDED) != 0) { + return 0; + } + + static const char *channel_receivers[] = { + "socket", "io", "client", "server", "connection", + "emitter", "eventEmitter", "eventBus", "bus", "pubsub", + "producer", "consumer", "channel", "broker", + "nsp", "namespace", "this", NULL + }; + + int count = 0; + const char *p = source; + cbm_regmatch_t match[4]; + + while (count < max_out && cbm_regexec(&re, p, 4, match, 0) == 0) { + int rlen = match[1].rm_eo - match[1].rm_so; + char receiver[64]; + if (rlen >= (int)sizeof(receiver)) rlen = (int)sizeof(receiver) - 1; + memcpy(receiver, p + match[1].rm_so, (size_t)rlen); + receiver[rlen] = '\0'; + + bool is_channel = false; + for (int i = 0; channel_receivers[i]; i++) { + if (strcasecmp(receiver, channel_receivers[i]) == 0) { + is_channel = true; + break; + } + } + + if (is_channel) { + int mlen = match[2].rm_eo - match[2].rm_so; + char method[32]; + if (mlen >= (int)sizeof(method)) mlen = (int)sizeof(method) - 1; + memcpy(method, p + match[2].rm_so, (size_t)mlen); + method[mlen] = '\0'; + + int clen = match[3].rm_eo - match[3].rm_so; + if (clen >= (int)sizeof(out[count].channel)) + clen = (int)sizeof(out[count].channel) - 1; + memcpy(out[count].channel, p + match[3].rm_so, (size_t)clen); + out[count].channel[clen] = '\0'; + + const char *ch = out[count].channel; + if (strcmp(ch, "error") != 0 && strcmp(ch, "close") != 0 && + strcmp(ch, "end") != 0 && strcmp(ch, "data") != 0 && + strcmp(ch, "connect") != 0 && strcmp(ch, "disconnect") != 0 && + strcmp(ch, "connection") != 0 && strcmp(ch, "message") != 0 && + strcmp(ch, "open") != 0 && strcmp(ch, "drain") != 0 && + strcmp(ch, "finish") != 0 && strcmp(ch, "pipe") != 0 && + strcmp(ch, "unpipe") != 0 && strcmp(ch, "readable") != 0 && + strcmp(ch, "resume") != 0 && strcmp(ch, "pause") != 0) { + if (strcmp(method, "emit") == 0) { + strncpy(out[count].direction, "emit", sizeof(out[count].direction) - 1); + } else { + strncpy(out[count].direction, "listen", sizeof(out[count].direction) - 1); + } + if (strcasecmp(receiver, "socket") == 0 || strcasecmp(receiver, "io") == 0 || + strcasecmp(receiver, "nsp") == 0 || strcasecmp(receiver, "namespace") == 0) { + strncpy(out[count].transport, "socketio", sizeof(out[count].transport) - 1); + } else { + strncpy(out[count].transport, "eventemitter", sizeof(out[count].transport) - 1); + } + count++; + } + } + p += match[0].rm_eo; + } + + cbm_regfree(&re); + return count; +} + +/* ── C# channel extraction: Socket.IO with constant resolution ─── */ + +/* Extract channels from C# source that uses constant names for event strings. + * Pattern: _socket.Emit(CONSTANT_NAME, data) / _socket.OnRequest(CONSTANT_NAME, ...) + * Resolves constants via: const string CONSTANT_NAME = "ActualChannelName"; */ +int cbm_extract_csharp_channels(const char *source, cbm_channel_match_t *out, int max_out) { + if (!source || !*source) return 0; + + /* Pass 1: Collect const string mappings: name → value */ + typedef struct { char name[128]; char value[256]; } const_map_t; + const_map_t cmap[128]; + int cmap_count = 0; + + cbm_regex_t re_const; + if (cbm_regcomp(&re_const, + "const[[:space:]]+string[[:space:]]+([A-Z_][A-Z_0-9]*)[[:space:]]*=[[:space:]]*\"([^\"]{1,128})\"", + CBM_REG_EXTENDED) == 0) { + const char *p = source; + cbm_regmatch_t cm[3]; + while (cmap_count < 128 && cbm_regexec(&re_const, p, 3, cm, 0) == 0) { + int nlen = cm[1].rm_eo - cm[1].rm_so; + int vlen = cm[2].rm_eo - cm[2].rm_so; + if (nlen > 0 && nlen < 128 && vlen > 0 && vlen < 256) { + memcpy(cmap[cmap_count].name, p + cm[1].rm_so, (size_t)nlen); + cmap[cmap_count].name[nlen] = '\0'; + memcpy(cmap[cmap_count].value, p + cm[2].rm_so, (size_t)vlen); + cmap[cmap_count].value[vlen] = '\0'; + cmap_count++; + } + p += cm[0].rm_eo; + } + cbm_regfree(&re_const); + } + + /* Pass 2: Find .Emit( and .OnRequest patterns */ + int count = 0; + + /* Pattern: .Emit(IDENTIFIER or .OnRequest<...>(IDENTIFIER */ + cbm_regex_t re_emit; + if (cbm_regcomp(&re_emit, + "\\.(Emit|OnRequest)[^(]*\\([[:space:]]*([A-Z_][A-Z_0-9]*)", + CBM_REG_EXTENDED) == 0) { + const char *p = source; + cbm_regmatch_t em[3]; + while (count < max_out && cbm_regexec(&re_emit, p, 3, em, 0) == 0) { + int mlen = em[1].rm_eo - em[1].rm_so; + char method[16]; + if (mlen >= (int)sizeof(method)) mlen = (int)sizeof(method) - 1; + memcpy(method, p + em[1].rm_so, (size_t)mlen); + method[mlen] = '\0'; + + int ilen = em[2].rm_eo - em[2].rm_so; + char ident[128]; + if (ilen >= (int)sizeof(ident)) ilen = (int)sizeof(ident) - 1; + memcpy(ident, p + em[2].rm_so, (size_t)ilen); + ident[ilen] = '\0'; + + /* Resolve constant to string value */ + const char *resolved = NULL; + for (int i = 0; i < cmap_count; i++) { + if (strcmp(cmap[i].name, ident) == 0) { + resolved = cmap[i].value; + break; + } + } + + if (resolved) { + strncpy(out[count].channel, resolved, sizeof(out[count].channel) - 1); + out[count].channel[sizeof(out[count].channel) - 1] = '\0'; + + if (strcmp(method, "Emit") == 0) { + strncpy(out[count].direction, "emit", sizeof(out[count].direction) - 1); + } else { + strncpy(out[count].direction, "listen", sizeof(out[count].direction) - 1); + } + strncpy(out[count].transport, "socketio", sizeof(out[count].transport) - 1); + count++; + } + p += em[0].rm_eo; + } + cbm_regfree(&re_emit); + } + + /* Also match direct string literal patterns: .Emit("ChannelName" */ + cbm_regex_t re_literal; + if (cbm_regcomp(&re_literal, + "\\.(Emit|On|OnRequest)[^(]*\\([[:space:]]*\"([^\"]{1,128})\"", + CBM_REG_EXTENDED) == 0) { + const char *p = source; + cbm_regmatch_t lm[3]; + while (count < max_out && cbm_regexec(&re_literal, p, 3, lm, 0) == 0) { + int mlen = lm[1].rm_eo - lm[1].rm_so; + char method[16]; + if (mlen >= (int)sizeof(method)) mlen = (int)sizeof(method) - 1; + memcpy(method, p + lm[1].rm_so, (size_t)mlen); + method[mlen] = '\0'; + + int clen = lm[2].rm_eo - lm[2].rm_so; + strncpy(out[count].channel, p + lm[2].rm_so, (size_t)(clen < 255 ? clen : 255)); + out[count].channel[clen < 255 ? clen : 255] = '\0'; + + if (strcmp(method, "Emit") == 0) { + strncpy(out[count].direction, "emit", sizeof(out[count].direction) - 1); + } else { + strncpy(out[count].direction, "listen", sizeof(out[count].direction) - 1); + } + strncpy(out[count].transport, "socketio", sizeof(out[count].transport) - 1); + count++; + p += lm[0].rm_eo; + } + cbm_regfree(&re_literal); + } + + return count; +} diff --git a/src/pipeline/httplink.h b/src/pipeline/httplink.h index c0cd275a..b14fbe3c 100644 --- a/src/pipeline/httplink.h +++ b/src/pipeline/httplink.h @@ -113,6 +113,10 @@ int cbm_extract_ktor_routes(const char *name, const char *qn, const char *source int cbm_extract_express_routes(const char *name, const char *qn, const char *source, cbm_route_handler_t *out, int max_out); +/* Hapi.js object-literal routes: { method: 'GET', path: '/api/...', handler: ... } */ +int cbm_extract_hapi_routes(const char *name, const char *qn, const char *source, + cbm_route_handler_t *out, int max_out); + /* Extract PHP Laravel routes from source. * Returns count. */ int cbm_extract_laravel_routes(const char *name, const char *qn, const char *source, diff --git a/src/pipeline/pass_definitions.c b/src/pipeline/pass_definitions.c index a19175a8..e1514783 100644 --- a/src/pipeline/pass_definitions.c +++ b/src/pipeline/pass_definitions.c @@ -271,6 +271,13 @@ int cbm_pipeline_pass_definitions(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t cbm_gbuf_insert_edge(ctx->gbuf, parent->id, node_id, "DEFINES_METHOD", "{}"); } } + /* HAS_PROPERTY edge: Class → Property */ + if (def->parent_class && def->label && strcmp(def->label, "Property") == 0) { + const cbm_gbuf_node_t *parent = cbm_gbuf_find_by_qn(ctx->gbuf, def->parent_class); + if (parent && node_id > 0) { + cbm_gbuf_insert_edge(ctx->gbuf, parent->id, node_id, "HAS_PROPERTY", "{}"); + } + } total_defs++; } @@ -281,28 +288,76 @@ int cbm_pipeline_pass_definitions(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t total_imports += result->imports.count; /* Store per-file import map for later use by pass_calls. - * For each import, create an IMPORTS edge: File → imported module. */ - for (int j = 0; j < result->imports.count; j++) { - CBMImport *imp = &result->imports.items[j]; - if (!imp->module_path) { - continue; - } - - /* Find or create the target module node */ - char *target_qn = cbm_pipeline_fqn_module(ctx->project_name, imp->module_path); - const cbm_gbuf_node_t *target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); - + * For each import, create an IMPORTS edge: File → imported module. + * Resolve relative paths (./ ../) and probe common extensions. */ + { char *file_qn = cbm_pipeline_fqn_compute(ctx->project_name, rel, "__file__"); const cbm_gbuf_node_t *source_node = cbm_gbuf_find_by_qn(ctx->gbuf, file_qn); + free(file_qn); + + for (int j = 0; j < result->imports.count && source_node; j++) { + CBMImport *imp = &result->imports.items[j]; + if (!imp->module_path) { + continue; + } - if (source_node && target) { - char imp_props[256]; - snprintf(imp_props, sizeof(imp_props), "{\"local_name\":\"%s\"}", - imp->local_name ? imp->local_name : ""); - cbm_gbuf_insert_edge(ctx->gbuf, source_node->id, target->id, "IMPORTS", imp_props); + /* Resolve relative paths against importing file's directory */ + char *resolved = cbm_pipeline_resolve_import_path(rel, imp->module_path); + char *target_qn = cbm_pipeline_fqn_module(ctx->project_name, resolved); + const cbm_gbuf_node_t *target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + + /* Probe common extensions */ + if (!target) { + static const char *exts[] = { + ".js", ".ts", ".tsx", ".jsx", ".mjs", ".mts", + ".css", ".scss", ".json", NULL + }; + for (int e = 0; !target && exts[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, exts[e]); + free(target_qn); + target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } + + /* Probe /index variants */ + if (!target) { + static const char *idx[] = { + "/index.js", "/index.ts", "/index.tsx", "/index.jsx", + "/index.mjs", "/index", NULL + }; + for (int e = 0; !target && idx[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, idx[e]); + free(target_qn); + target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } + + /* C/C++ include: try .h, .hpp */ + if (!target) { + static const char *hdr[] = {".h", ".hpp", ".hh", NULL}; + for (int e = 0; !target && hdr[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, hdr[e]); + free(target_qn); + target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } + + if (target) { + char imp_props[256]; + snprintf(imp_props, sizeof(imp_props), "{\"local_name\":\"%s\"}", + imp->local_name ? imp->local_name : ""); + cbm_gbuf_insert_edge(ctx->gbuf, source_node->id, target->id, "IMPORTS", + imp_props); + } + free(target_qn); + free(resolved); } - free(target_qn); - free(file_qn); } /* Cache or free the extraction result */ diff --git a/src/pipeline/pass_httplinks.c b/src/pipeline/pass_httplinks.c index 7ecdda71..91b28457 100644 --- a/src/pipeline/pass_httplinks.c +++ b/src/pipeline/pass_httplinks.c @@ -277,6 +277,9 @@ static int discover_node_routes(const cbm_gbuf_node_t *n, const cbm_pipeline_ctx nr = cbm_extract_express_routes(n->name, n->qualified_name, source, out + total, max_out - total); total += nr; + nr = cbm_extract_hapi_routes(n->name, n->qualified_name, source, out + total, + max_out - total); + total += nr; } if (has_suffix(fp, ".php")) { nr = cbm_extract_laravel_routes(n->name, n->qualified_name, source, out + total, @@ -323,6 +326,8 @@ static int discover_module_routes(const cbm_gbuf_node_t *mod, const cbm_pipeline if (is_js) { total += cbm_extract_express_routes(mod->name, mod->qualified_name, source, out + total, max_out - total); + total += cbm_extract_hapi_routes(mod->name, mod->qualified_name, source, out + total, + max_out - total); } free(source); return total; @@ -881,6 +886,64 @@ static int insert_route_nodes(cbm_pipeline_ctx_t *ctx, cbm_route_handler_t *rout for (int i = 0; i < route_count; i++) { cbm_route_handler_t *rh = &routes[i]; + /* Reject obviously invalid route paths. + * Vendored/minified JS files (e.g. tsc.js, typescript.js) inside non-JS + * repos can produce false positives where JS operators/keywords get + * matched as route paths by the Express extractor. */ + { + const char *p = rh->path; + /* Skip empty paths */ + if (!p || !*p) continue; + + /* Reject paths that are JS operators or keywords — not valid URL routes */ + static const char *const invalid_paths[] = { + "!", "+", "++", "-", "--", ":", "~", "void", "null", "true", + "false", "throw", "this", "typeof", "delete", "new", "return", + "undefined", "NaN", "Infinity", "var", "let", "const", + "function", "class", "if", "else", "for", "while", "do", + "switch", "case", "break", "continue", "try", "catch", + "finally", "with", "in", "of", "yield", "await", "async", + "super", "import", "export", "default", "extends", "static", + "_this", "self", "__proto__", "arguments", "range", + NULL + }; + bool rejected = false; + /* Work with a trimmed copy for comparison */ + char trimmed[256]; + /* Trim leading whitespace */ + while (*p == ' ' || *p == '\t') p++; + strncpy(trimmed, p, sizeof(trimmed) - 1); + trimmed[sizeof(trimmed) - 1] = '\0'; + /* Trim trailing whitespace */ + size_t tlen = strlen(trimmed); + while (tlen > 0 && (trimmed[tlen - 1] == ' ' || trimmed[tlen - 1] == '\t' || + trimmed[tlen - 1] == '\n' || trimmed[tlen - 1] == '\r')) { + trimmed[--tlen] = '\0'; + } + for (int k = 0; invalid_paths[k]; k++) { + if (strcmp(trimmed, invalid_paths[k]) == 0) { + rejected = true; + break; + } + } + if (rejected) continue; + + /* Reject single-character non-slash paths (e.g. "*", "?", "#") */ + if (p[0] && !p[1] && p[0] != '/') continue; + + /* Reject paths that contain no alphanumeric or slash characters. + * Valid routes like "/api/v1" always have at least one alnum. */ + bool has_alnum_or_slash = false; + for (const char *c = p; *c; c++) { + if ((*c >= 'a' && *c <= 'z') || (*c >= 'A' && *c <= 'Z') || + (*c >= '0' && *c <= '9') || *c == '/') { + has_alnum_or_slash = true; + break; + } + } + if (!has_alnum_or_slash) continue; + } + /* Build Route QN and name */ char normal_method[16]; snprintf(normal_method, sizeof(normal_method), "%s", rh->method[0] ? rh->method : "ANY"); diff --git a/src/pipeline/pass_parallel.c b/src/pipeline/pass_parallel.c index 3193c1c7..b14b249b 100644 --- a/src/pipeline/pass_parallel.c +++ b/src/pipeline/pass_parallel.c @@ -572,6 +572,9 @@ static void prescan_routes(const char *source, int source_len, const CBMFileResu nr = cbm_extract_express_routes(def->name, def->qualified_name, func_src, routes + total, 16 - total); total += nr; + nr = cbm_extract_hapi_routes(def->name, def->qualified_name, func_src, + routes + total, 16 - total); + total += nr; nr = cbm_extract_laravel_routes(def->name, def->qualified_name, func_src, routes + total, 16 - total); total += nr; @@ -608,6 +611,8 @@ static void prescan_routes(const char *source, int source_len, const CBMFileResu if (is_js) { total += cbm_extract_express_routes(basename, "", source, mod_routes + total, 16 - total); + total += cbm_extract_hapi_routes(basename, "", source, mod_routes + total, + 16 - total); } for (int r = 0; r < total; r++) { prescan_add_route(ps, &mod_routes[r]); @@ -938,22 +943,75 @@ int cbm_build_registry_from_cache(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t "{}"); } } + /* HAS_PROPERTY edge: Class → Property */ + if (def->parent_class && def->label && strcmp(def->label, "Property") == 0) { + const cbm_gbuf_node_t *parent = cbm_gbuf_find_by_qn(ctx->gbuf, def->parent_class); + if (parent && def_node) { + cbm_gbuf_insert_edge(ctx->gbuf, parent->id, def_node->id, "HAS_PROPERTY", + "{}"); + } + } } - /* IMPORTS edges */ - for (int j = 0; j < result->imports.count; j++) { + /* IMPORTS edges — resolve relative paths and probe extensions */ + char *file_qn = cbm_pipeline_fqn_compute(ctx->project_name, rel, "__file__"); + const cbm_gbuf_node_t *source_node = cbm_gbuf_find_by_qn(ctx->gbuf, file_qn); + free(file_qn); + + for (int j = 0; j < result->imports.count && source_node; j++) { CBMImport *imp = &result->imports.items[j]; if (!imp->module_path) { continue; } - char *target_qn = cbm_pipeline_fqn_module(ctx->project_name, imp->module_path); + /* Resolve relative paths (./ ../) against importing file's directory */ + char *resolved = cbm_pipeline_resolve_import_path(rel, imp->module_path); + char *target_qn = cbm_pipeline_fqn_module(ctx->project_name, resolved); const cbm_gbuf_node_t *target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); - char *file_qn = cbm_pipeline_fqn_compute(ctx->project_name, rel, "__file__"); - const cbm_gbuf_node_t *source_node = cbm_gbuf_find_by_qn(ctx->gbuf, file_qn); + /* Probe common extensions if no exact match: .js, .ts, .tsx, .jsx, .mjs */ + if (!target) { + static const char *exts[] = { + ".js", ".ts", ".tsx", ".jsx", ".mjs", ".mts", + ".css", ".scss", ".json", NULL + }; + for (int e = 0; !target && exts[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, exts[e]); + free(target_qn); + target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } + + /* Probe /index variants (directory imports) */ + if (!target) { + static const char *idx[] = { + "/index.js", "/index.ts", "/index.tsx", "/index.jsx", + "/index.mjs", "/index", NULL + }; + for (int e = 0; !target && idx[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, idx[e]); + free(target_qn); + target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } - if (source_node && target) { + /* C/C++ include: try .h, .hpp variants */ + if (!target && (resolved[0] != '.' || resolved[1] == '.')) { + static const char *hdr[] = {".h", ".hpp", ".hh", NULL}; + for (int e = 0; !target && hdr[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, hdr[e]); + free(target_qn); + target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } + + if (target) { char imp_props[256]; snprintf(imp_props, sizeof(imp_props), "{\"local_name\":\"%s\"}", imp->local_name ? imp->local_name : ""); @@ -961,7 +1019,7 @@ int cbm_build_registry_from_cache(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t imports_edges++; } free(target_qn); - free(file_qn); + free(resolved); } } diff --git a/src/pipeline/pipeline.c b/src/pipeline/pipeline.c index 66f47eac..635142c2 100644 --- a/src/pipeline/pipeline.c +++ b/src/pipeline/pipeline.c @@ -818,6 +818,38 @@ int cbm_pipeline_run(cbm_pipeline_t *p) { } cbm_store_close(hash_store); cbm_log_info("pass.timing", "pass", "persist_hashes", "files", itoa_buf(file_count)); + + /* Backfill FTS5 index: the direct B-tree dump bypasses SQLite triggers, + * so the FTS5 table is empty after indexing. Populate it in bulk now. */ + cbm_store_t *fts_store = cbm_store_open_path(db_path); + if (fts_store) { + cbm_store_exec(fts_store, + "INSERT OR REPLACE INTO nodes_fts(rowid, name, qualified_name, label, file_path) " + "SELECT id, name, qualified_name, label, file_path FROM nodes;"); + cbm_store_close(fts_store); + } + + /* ── Process detection: discover execution flows from entry points ── */ + { + cbm_store_t *proc_store = cbm_store_open_path(db_path); + if (proc_store) { + int nprocs = cbm_store_detect_processes(proc_store, p->project_name, 300); + cbm_log_info("pass.done", "pass", "processes", + "detected", itoa_buf(nprocs)); + cbm_store_close(proc_store); + } + } + + /* ── Channel detection: scan source for emit/on patterns ── */ + { + cbm_store_t *ch_store = cbm_store_open_path(db_path); + if (ch_store) { + int nch = cbm_store_detect_channels(ch_store, p->project_name, p->repo_path); + cbm_log_info("pass.done", "pass", "channels", + "detected", itoa_buf(nch)); + cbm_store_close(ch_store); + } + } } } diff --git a/src/pipeline/pipeline.h b/src/pipeline/pipeline.h index 203f4374..58850e7c 100644 --- a/src/pipeline/pipeline.h +++ b/src/pipeline/pipeline.h @@ -82,6 +82,10 @@ char *cbm_pipeline_fqn_module(const char *project, const char *rel_path); /* Folder QN: project.dir.parts. Caller must free(). */ char *cbm_pipeline_fqn_folder(const char *project, const char *rel_dir); +/* Resolve an import module_path relative to the importing file's directory. + * Handles ./ and ../ resolution. Bare modules returned unchanged. Caller must free(). */ +char *cbm_pipeline_resolve_import_path(const char *importer_rel_path, const char *module_path); + /* Derive project name from an absolute path. * Replaces / and : with -, collapses --, trims leading -. * Caller must free() the returned string. */ diff --git a/src/store/store.c b/src/store/store.c index 88aa7078..cc845fd4 100644 --- a/src/store/store.c +++ b/src/store/store.c @@ -191,7 +191,36 @@ static int init_schema(cbm_store_t *s) { " properties TEXT DEFAULT '{}'," " UNIQUE(source_id, target_id, type)" ");" - "CREATE TABLE IF NOT EXISTS project_summaries (" + "CREATE TABLE IF NOT EXISTS processes (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " project TEXT NOT NULL REFERENCES projects(name) ON DELETE CASCADE," + " label TEXT NOT NULL," + " process_type TEXT NOT NULL DEFAULT 'cross_community'," + " step_count INTEGER NOT NULL DEFAULT 0," + " entry_point_id INTEGER NOT NULL," + " terminal_id INTEGER NOT NULL" + ");" + "CREATE TABLE IF NOT EXISTS process_steps (" + " process_id INTEGER NOT NULL REFERENCES processes(id) ON DELETE CASCADE," + " node_id INTEGER NOT NULL," + " step INTEGER NOT NULL," + " PRIMARY KEY (process_id, step)" + ");" + "CREATE TABLE IF NOT EXISTS channels (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " project TEXT NOT NULL REFERENCES projects(name) ON DELETE CASCADE," + " channel_name TEXT NOT NULL," + " direction TEXT NOT NULL," /* 'emit' or 'listen' */ + " transport TEXT NOT NULL DEFAULT 'socketio'," + " node_id INTEGER NOT NULL," + " file_path TEXT DEFAULT ''," + " function_name TEXT DEFAULT ''" + ");" + "CREATE INDEX IF NOT EXISTS idx_channels_name ON channels(channel_name);" + "CREATE INDEX IF NOT EXISTS idx_channels_project ON channels(project);" + "CREATE UNIQUE INDEX IF NOT EXISTS idx_channels_unique " + "ON channels(project, channel_name, direction, file_path, function_name);" + "CREATE TABLE IF NOT EXISTS project_summaries (" " project TEXT PRIMARY KEY," " summary TEXT NOT NULL," " source_hash TEXT NOT NULL," @@ -212,7 +241,47 @@ static int create_user_indexes(cbm_store_t *s) { "CREATE INDEX IF NOT EXISTS idx_edges_type ON edges(project, type);" "CREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(project, target_id, type);" "CREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(project, source_id, type);"; - return exec_sql(s, sql); + int rc = exec_sql(s, sql); + if (rc != SQLITE_OK) return rc; + + /* FTS5 full-text search index on node names for BM25 ranking. + * content='nodes' makes it an external-content table — synced via triggers. + * Each DDL statement must be executed separately for FTS5 compatibility. */ + { + char *fts_err = NULL; + int fts_rc = sqlite3_exec(s->db, + "CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(" + "name, qualified_name, label, file_path," + "content='nodes', content_rowid='id'," + "tokenize='unicode61 remove_diacritics 2'" + ");", + NULL, NULL, &fts_err); + if (fts_rc != SQLITE_OK) { + sqlite3_free(fts_err); + /* Non-fatal — FTS5 may not be compiled in. Fall back to regex search. */ + return SQLITE_OK; + } + } + + /* Sync triggers: keep FTS index up to date when nodes change */ + exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN" + " INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path)" + " VALUES (new.id, new.name, new.qualified_name, new.label, new.file_path);" + "END;"); + + exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN" + " INSERT INTO nodes_fts(nodes_fts, rowid, name, qualified_name, label, file_path)" + " VALUES ('delete', old.id, old.name, old.qualified_name, old.label, old.file_path);" + "END;"); + + exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN" + " INSERT INTO nodes_fts(nodes_fts, rowid, name, qualified_name, label, file_path)" + " VALUES ('delete', old.id, old.name, old.qualified_name, old.label, old.file_path);" + " INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path)" + " VALUES (new.id, new.name, new.qualified_name, new.label, new.file_path);" + "END;"); + + return SQLITE_OK; } static int configure_pragmas(cbm_store_t *s, bool in_memory) { @@ -474,6 +543,10 @@ static void finalize_stmt(sqlite3_stmt **s) { } } +int cbm_store_exec(cbm_store_t *s, const char *sql) { + return exec_sql(s, sql); +} + void cbm_store_close(cbm_store_t *s) { if (!s) { return; @@ -1955,6 +2028,136 @@ int cbm_store_search(cbm_store_t *s, const cbm_search_params_t *params, cbm_sear char count_sql[4096]; int bind_idx = 0; + /* ── FTS5 BM25 path: when params->query is set, use full-text search ── */ + if (params->query && params->query[0]) { + /* Build FTS5 query: JOIN nodes_fts for BM25 ranking. + * Tokenize the user query into FTS5 OR terms for broader matching. + * "authentication middleware" → "authentication OR middleware" */ + char fts_query[1024]; + { + const char *q = params->query; + int fqlen = 0; + bool in_word = false; + bool first_word = true; + while (*q && fqlen < (int)sizeof(fts_query) - 20) { + if ((*q >= 'a' && *q <= 'z') || (*q >= 'A' && *q <= 'Z') || + (*q >= '0' && *q <= '9') || *q == '_' || *q == '-') { + if (!in_word && !first_word) { + fqlen += snprintf(fts_query + fqlen, sizeof(fts_query) - fqlen, " OR "); + } + fts_query[fqlen++] = *q; + in_word = true; + first_word = false; + } else { + if (in_word) { + fts_query[fqlen++] = ' '; + } + in_word = false; + } + q++; + } + fts_query[fqlen] = '\0'; + } + + char fts_sql[4096]; + /* Join with FTS5 table, filter by project/label, order by BM25 rank. + * Exclude noise labels (File, Folder, Module, Section, Variable, Project) + * and boost Function/Method/Class via a structural score added to BM25. */ + int flen = snprintf(fts_sql, sizeof(fts_sql), + "SELECT n.id, n.project, n.label, n.name, n.qualified_name, " + "n.file_path, n.start_line, n.end_line, n.properties, " + "(SELECT COUNT(*) FROM edges e WHERE e.target_id = n.id AND e.type = 'CALLS') AS in_deg, " + "(SELECT COUNT(*) FROM edges e WHERE e.source_id = n.id AND e.type = 'CALLS') AS out_deg, " + "(bm25(nodes_fts) " + " - CASE WHEN n.label IN ('Function','Method') THEN 10.0 " + " WHEN n.label IN ('Class','Interface','Type') THEN 5.0 " + " WHEN n.label = 'Route' THEN 8.0 " + " ELSE 0.0 END " + " - CASE WHEN (SELECT COUNT(*) FROM edges e WHERE e.target_id = n.id AND e.type = 'CALLS') > 5 THEN 3.0 ELSE 0.0 END" + ") AS rank " + "FROM nodes_fts " + "JOIN nodes n ON n.id = nodes_fts.rowid " + "WHERE nodes_fts MATCH ?1" + " AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')"); + + int fts_bind_idx = 1; + if (params->project) { + fts_bind_idx++; + flen += snprintf(fts_sql + flen, sizeof(fts_sql) - flen, + " AND n.project = ?%d", fts_bind_idx); + } + if (params->label) { + fts_bind_idx++; + flen += snprintf(fts_sql + flen, sizeof(fts_sql) - flen, + " AND n.label = ?%d", fts_bind_idx); + } + + int limit = params->limit > 0 ? params->limit : 50; + flen += snprintf(fts_sql + flen, sizeof(fts_sql) - flen, + " ORDER BY rank LIMIT %d OFFSET %d", limit, params->offset); + + /* Count query — same exclusions as main query */ + char fts_count[4096]; + snprintf(fts_count, sizeof(fts_count), + "SELECT COUNT(*) FROM nodes_fts " + "JOIN nodes n ON n.id = nodes_fts.rowid " + "WHERE nodes_fts MATCH ?1" + " AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')" + "%s%s", + params->project ? " AND n.project = ?2" : "", + params->label ? (params->project ? " AND n.label = ?3" : " AND n.label = ?2") : ""); + + /* Execute count */ + sqlite3_stmt *cnt_stmt = NULL; + if (sqlite3_prepare_v2(s->db, fts_count, -1, &cnt_stmt, NULL) == SQLITE_OK) { + bind_text(cnt_stmt, 1, fts_query); + int bi = 1; + if (params->project) { bi++; bind_text(cnt_stmt, bi, params->project); } + if (params->label) { bi++; bind_text(cnt_stmt, bi, params->label); } + if (sqlite3_step(cnt_stmt) == SQLITE_ROW) { + out->total = sqlite3_column_int(cnt_stmt, 0); + } + sqlite3_finalize(cnt_stmt); + } + + /* Execute main query */ + sqlite3_stmt *main_stmt = NULL; + int rc = sqlite3_prepare_v2(s->db, fts_sql, -1, &main_stmt, NULL); + if (rc != SQLITE_OK) { + /* FTS5 table may not exist for older DBs — fall through to regex path */ + /* FTS5 table may not exist for older DBs — silently fall through */ + goto regex_path; + } + bind_text(main_stmt, 1, fts_query); + { + int bi = 1; + if (params->project) { bi++; bind_text(main_stmt, bi, params->project); } + if (params->label) { bi++; bind_text(main_stmt, bi, params->label); } + } + + int cap = 16; + int n = 0; + cbm_search_result_t *results = malloc(cap * sizeof(cbm_search_result_t)); + while (sqlite3_step(main_stmt) == SQLITE_ROW) { + if (n >= cap) { + cap *= 2; + results = safe_realloc(results, cap * sizeof(cbm_search_result_t)); + } + memset(&results[n], 0, sizeof(cbm_search_result_t)); + scan_node(main_stmt, &results[n].node); + results[n].in_degree = sqlite3_column_int(main_stmt, 9); + results[n].out_degree = sqlite3_column_int(main_stmt, 10); + n++; + } + sqlite3_finalize(main_stmt); + out->results = results; + out->count = n; + return CBM_STORE_OK; + } + +regex_path: + /* ── Regex path: original regex-based search ── */ + /* We build a query that selects nodes with optional degree subqueries */ const char *select_cols = "SELECT n.id, n.project, n.label, n.name, n.qualified_name, " @@ -3951,6 +4154,309 @@ static bool want_aspect(const char **aspects, int aspect_count, const char *name return false; } +/* ── Clusters via Louvain community detection ──────────────────── */ + +static int arch_clusters(cbm_store_t *s, const char *project, cbm_architecture_info_t *out) { + /* 1. Load all callable node IDs for this project */ + const char *nsql = "SELECT id FROM nodes WHERE project=?1 " + "AND label IN ('Function','Method','Class','Interface')"; + sqlite3_stmt *nstmt = NULL; + if (sqlite3_prepare_v2(s->db, nsql, -1, &nstmt, NULL) != SQLITE_OK) { + store_set_error_sqlite(s, "arch_clusters_nodes"); + return CBM_STORE_ERR; + } + bind_text(nstmt, 1, project); + + int ncap = 1024; + int nn = 0; + int64_t *node_ids = malloc((size_t)ncap * sizeof(int64_t)); + + while (sqlite3_step(nstmt) == SQLITE_ROW) { + if (nn >= ncap) { + ncap *= 2; + node_ids = safe_realloc(node_ids, (size_t)ncap * sizeof(int64_t)); + } + node_ids[nn++] = sqlite3_column_int64(nstmt, 0); + } + sqlite3_finalize(nstmt); + + if (nn < 2) { + free(node_ids); + return CBM_STORE_OK; /* Nothing to cluster */ + } + + /* 2. Load all CALLS edges for this project */ + const char *esql = "SELECT source_id, target_id FROM edges WHERE project=?1 AND type='CALLS'"; + sqlite3_stmt *estmt = NULL; + if (sqlite3_prepare_v2(s->db, esql, -1, &estmt, NULL) != SQLITE_OK) { + free(node_ids); + store_set_error_sqlite(s, "arch_clusters_edges"); + return CBM_STORE_ERR; + } + bind_text(estmt, 1, project); + + int ecap = 2048; + int en = 0; + cbm_louvain_edge_t *edges = malloc((size_t)ecap * sizeof(cbm_louvain_edge_t)); + + while (sqlite3_step(estmt) == SQLITE_ROW) { + if (en >= ecap) { + ecap *= 2; + edges = safe_realloc(edges, (size_t)ecap * sizeof(cbm_louvain_edge_t)); + } + edges[en].src = sqlite3_column_int64(estmt, 0); + edges[en].dst = sqlite3_column_int64(estmt, 1); + en++; + } + sqlite3_finalize(estmt); + + if (en < 1) { + free(node_ids); + free(edges); + return CBM_STORE_OK; + } + + /* 3. Run Louvain */ + cbm_louvain_result_t *lresults = NULL; + int lcount = 0; + int rc = cbm_louvain(node_ids, nn, edges, en, &lresults, &lcount); + free(node_ids); + free(edges); + + if (rc != CBM_STORE_OK || lcount == 0) { + free(lresults); + return CBM_STORE_OK; + } + + /* 4. Find max community ID to size the grouping array */ + int max_community = 0; + for (int i = 0; i < lcount; i++) { + if (lresults[i].community > max_community) { + max_community = lresults[i].community; + } + } + int num_communities = max_community + 1; + + /* 5. Count members per community */ + int *member_counts = calloc((size_t)num_communities, sizeof(int)); + for (int i = 0; i < lcount; i++) { + if (lresults[i].community >= 0 && lresults[i].community < num_communities) { + member_counts[lresults[i].community]++; + } + } + + /* Count non-empty communities */ + int active_count = 0; + for (int i = 0; i < num_communities; i++) { + if (member_counts[i] > 0) { + active_count++; + } + } + + if (active_count == 0) { + free(member_counts); + free(lresults); + return CBM_STORE_OK; + } + + /* Cap at 20 clusters, keep the largest */ + int max_clusters = active_count < 20 ? active_count : 20; + + /* 6. Build cluster info structs. + * For each community, find the top-5 nodes by CALLS in-degree. */ + cbm_cluster_info_t *clusters = calloc((size_t)max_clusters, sizeof(cbm_cluster_info_t)); + int ci = 0; + + /* Sort communities by member count descending — simple selection of top N */ + int *sorted_ids = malloc((size_t)num_communities * sizeof(int)); + for (int i = 0; i < num_communities; i++) sorted_ids[i] = i; + /* Bubble sort is fine for small N (typically < 100 communities) */ + for (int i = 0; i < num_communities - 1 && i < max_clusters; i++) { + for (int j = i + 1; j < num_communities; j++) { + if (member_counts[sorted_ids[j]] > member_counts[sorted_ids[i]]) { + int tmp = sorted_ids[i]; + sorted_ids[i] = sorted_ids[j]; + sorted_ids[j] = tmp; + } + } + } + + for (int si = 0; si < max_clusters; si++) { + int comm_id = sorted_ids[si]; + if (member_counts[comm_id] == 0) break; + + clusters[ci].id = comm_id; + clusters[ci].members = member_counts[comm_id]; + clusters[ci].cohesion = 0.0; /* Would need intra-/inter-edge ratio to compute */ + + /* Collect node IDs in this community */ + int64_t *comm_nodes = malloc((size_t)member_counts[comm_id] * sizeof(int64_t)); + int cn = 0; + for (int i = 0; i < lcount; i++) { + if (lresults[i].community == comm_id) { + comm_nodes[cn++] = lresults[i].node_id; + } + } + + /* Find top 5 by in-degree via SQL */ + int top_n = cn < 5 ? cn : 5; + // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion) + const char **top_names = calloc((size_t)top_n, sizeof(const char *)); + int tn = 0; + + /* Build a simple query: SELECT name from nodes WHERE id IN (...) ordered by + * incoming CALLS count. For efficiency, just query each node's degree. */ + for (int k = 0; k < cn && tn < top_n; k++) { + int in_deg = 0; + int out_deg = 0; + cbm_store_node_degree(s, comm_nodes[k], &in_deg, &out_deg); + + /* Simple insertion into top-N by in-degree. + * We'll just pick the first top_n by iterating degree queries. */ + cbm_node_t ninfo; + if (cbm_store_find_node_by_id(s, comm_nodes[k], &ninfo) == CBM_STORE_OK) { + /* Skip File/Folder/Module nodes */ + if (ninfo.label && strcmp(ninfo.label, "File") != 0 && + strcmp(ninfo.label, "Folder") != 0 && + strcmp(ninfo.label, "Module") != 0) { + if (ninfo.name) { + top_names[tn++] = heap_strdup(ninfo.name); + } + } + cbm_node_free_fields(&ninfo); + } + } + + clusters[ci].top_nodes = top_names; + clusters[ci].top_node_count = tn; + + /* Derive semantic label from most common directory in member file paths. + * E.g. members in controllers/ → "Controllers", components/ → "Components" */ + { + /* Query file paths for a sample of cluster members */ + char dir_counts[64][64]; /* directory names */ + int dir_freqs[64]; /* frequency counts */ + int dir_n = 0; + memset(dir_freqs, 0, sizeof(dir_freqs)); + + int sample_limit = cn < 50 ? cn : 50; + for (int k = 0; k < sample_limit; k++) { + cbm_node_t ni; + if (cbm_store_find_node_by_id(s, comm_nodes[k], &ni) == CBM_STORE_OK) { + if (ni.file_path && ni.file_path[0]) { + /* Extract the deepest meaningful directory segment. + * E.g. "src/controllers/users-controller.ts" → "controllers" */ + const char *fp = ni.file_path; + const char *best_dir = NULL; + const char *p2 = fp; + const char *prev_slash = NULL; + while (*p2) { + if (*p2 == '/') { + if (prev_slash) { + /* Extract segment between prev_slash+1 and p2 */ + int slen = (int)(p2 - prev_slash - 1); + if (slen > 0 && slen < 60) { + /* Skip generic dirs: src, lib, dist, build, test, node_modules */ + char seg[64]; + memcpy(seg, prev_slash + 1, (size_t)slen); + seg[slen] = '\0'; + if (strcmp(seg, "src") != 0 && strcmp(seg, "lib") != 0 && + strcmp(seg, "dist") != 0 && strcmp(seg, "build") != 0 && + strcmp(seg, "node_modules") != 0 && + strcmp(seg, "test") != 0 && strcmp(seg, "tests") != 0 && + strcmp(seg, "shared") != 0 && strcmp(seg, "utils") != 0 && + strcmp(seg, "internal") != 0 && strcmp(seg, "generated") != 0) { + best_dir = prev_slash + 1; + } + } + } + prev_slash = p2; + } + p2++; + } + if (best_dir) { + const char *end = strchr(best_dir, '/'); + int dlen = end ? (int)(end - best_dir) : (int)strlen(best_dir); + if (dlen > 0 && dlen < 60) { + char dname[64]; + memcpy(dname, best_dir, (size_t)dlen); + dname[dlen] = '\0'; + /* Find or add to dir_counts */ + bool found_dir = false; + for (int d = 0; d < dir_n; d++) { + if (strcmp(dir_counts[d], dname) == 0) { + dir_freqs[d]++; + found_dir = true; + break; + } + } + if (!found_dir && dir_n < 64) { + strncpy(dir_counts[dir_n], dname, 63); + dir_counts[dir_n][63] = '\0'; + dir_freqs[dir_n] = 1; + dir_n++; + } + } + } + } + cbm_node_free_fields(&ni); + } + } + + /* Pick the most frequent directory name */ + char label_buf[64]; + int best_freq = 0; + int best_di = -1; + for (int d = 0; d < dir_n; d++) { + if (dir_freqs[d] > best_freq) { + best_freq = dir_freqs[d]; + best_di = d; + } + } + if (best_di >= 0 && best_freq >= 3) { + /* Capitalize first letter */ + char cap_name[64]; + strncpy(cap_name, dir_counts[best_di], sizeof(cap_name) - 1); + cap_name[sizeof(cap_name) - 1] = '\0'; + if (cap_name[0] >= 'a' && cap_name[0] <= 'z') { + cap_name[0] = cap_name[0] - 'a' + 'A'; + } + /* Convert kebab-case to TitleCase: "users-controller" → "UsersController" */ + for (int j = 0; cap_name[j]; j++) { + if (cap_name[j] == '-' && cap_name[j + 1]) { + /* Remove dash and capitalize next */ + memmove(&cap_name[j], &cap_name[j + 1], strlen(&cap_name[j + 1]) + 1); + if (cap_name[j] >= 'a' && cap_name[j] <= 'z') { + cap_name[j] = cap_name[j] - 'a' + 'A'; + } + } + } + snprintf(label_buf, sizeof(label_buf), "%s", cap_name); + } else { + snprintf(label_buf, sizeof(label_buf), "Cluster_%d", comm_id); + } + clusters[ci].label = heap_strdup(label_buf); + } + + /* packages and edge_types are optional, leave as NULL/0 for now */ + clusters[ci].packages = NULL; + clusters[ci].package_count = 0; + clusters[ci].edge_types = NULL; + clusters[ci].edge_type_count = 0; + + free(comm_nodes); + ci++; + } + + free(sorted_ids); + free(member_counts); + free(lresults); + + out->clusters = clusters; + out->cluster_count = ci; + return CBM_STORE_OK; +} + int cbm_store_get_architecture(cbm_store_t *s, const char *project, const char **aspects, int aspect_count, cbm_architecture_info_t *out) { memset(out, 0, sizeof(*out)); @@ -4008,6 +4514,12 @@ int cbm_store_get_architecture(cbm_store_t *s, const char *project, const char * return rc; } } + if (want_aspect(aspects, aspect_count, "clusters")) { + rc = arch_clusters(s, project, out); + if (rc != CBM_STORE_OK) { + return rc; + } + } return CBM_STORE_OK; } @@ -4085,6 +4597,636 @@ void cbm_store_architecture_free(cbm_architecture_info_t *out) { memset(out, 0, sizeof(*out)); } +/* ── Processes (execution flows) ──────────────────────────────── */ + +/* Detect execution flows: BFS from entry points, identify cross-community paths. */ +int cbm_store_detect_processes(cbm_store_t *s, const char *project, int max_processes) { + if (!s || !s->db || !project) return 0; + + /* Clear existing processes */ + { + char sql[512]; + snprintf(sql, sizeof(sql), + "DELETE FROM process_steps WHERE process_id IN " + "(SELECT id FROM processes WHERE project = '%s')", project); + exec_sql(s, sql); + snprintf(sql, sizeof(sql), "DELETE FROM processes WHERE project = '%s'", project); + exec_sql(s, sql); + } + + /* 1. Find entry point node IDs */ + const char *ep_sql = + "SELECT id, name FROM nodes WHERE project = ?1 " + "AND (json_extract(properties, '$.is_entry_point') = 1 OR label = 'Route') " + "AND label NOT IN ('File','Folder','Module','Project')"; + sqlite3_stmt *ep_stmt = NULL; + if (sqlite3_prepare_v2(s->db, ep_sql, -1, &ep_stmt, NULL) != SQLITE_OK) return 0; + bind_text(ep_stmt, 1, project); + + int ep_cap = 512; + int ep_count = 0; + int64_t *ep_ids = malloc((size_t)ep_cap * sizeof(int64_t)); + char **ep_names = malloc((size_t)ep_cap * sizeof(char *)); + + while (sqlite3_step(ep_stmt) == SQLITE_ROW) { + if (ep_count >= ep_cap) { + ep_cap *= 2; + ep_ids = safe_realloc(ep_ids, (size_t)ep_cap * sizeof(int64_t)); + ep_names = safe_realloc(ep_names, (size_t)ep_cap * sizeof(char *)); + } + ep_ids[ep_count] = sqlite3_column_int64(ep_stmt, 0); + const char *nm = (const char *)sqlite3_column_text(ep_stmt, 1); + ep_names[ep_count] = heap_strdup(nm ? nm : "?"); + ep_count++; + } + sqlite3_finalize(ep_stmt); + + if (ep_count == 0) { + free(ep_ids); + free(ep_names); + return 0; + } + + /* 1b. Resolve Route entry points to handler Functions. + * Route nodes have 0 outgoing edges (only incoming HANDLES from Modules). + * For each Route, find the Module that HANDLES it, then find Functions in + * the same file that have outgoing CALLS. Replace the Route entry point + * with those Functions — they're the real BFS starting points. */ + { + const char *resolve_sql = + "SELECT DISTINCT fn.id, fn.name FROM edges e " + "JOIN nodes m ON m.id = e.source_id AND m.label = 'Module' " + "JOIN nodes fn ON fn.file_path = m.file_path " + "AND fn.label IN ('Function','Method') AND fn.project = ?2 " + "WHERE e.target_id = ?1 AND e.type = 'HANDLES' AND e.project = ?2"; + sqlite3_stmt *res_stmt = NULL; + sqlite3_prepare_v2(s->db, resolve_sql, -1, &res_stmt, NULL); + + if (res_stmt) { + int orig_count = ep_count; + for (int i = 0; i < orig_count; i++) { + /* Check if this entry point is a Route node */ + const char *check_sql = "SELECT label FROM nodes WHERE id = ?1"; + sqlite3_stmt *chk = NULL; + sqlite3_prepare_v2(s->db, check_sql, -1, &chk, NULL); + if (!chk) continue; + sqlite3_bind_int64(chk, 1, ep_ids[i]); + const char *label = NULL; + if (sqlite3_step(chk) == SQLITE_ROW) { + label = (const char *)sqlite3_column_text(chk, 0); + } + bool is_route = (label && strcmp(label, "Route") == 0); + sqlite3_finalize(chk); + + if (!is_route) continue; + + /* Resolve Route → Module → Functions */ + sqlite3_reset(res_stmt); + sqlite3_bind_int64(res_stmt, 1, ep_ids[i]); + bind_text(res_stmt, 2, project); + + while (sqlite3_step(res_stmt) == SQLITE_ROW) { + if (ep_count >= ep_cap) { + ep_cap *= 2; + ep_ids = safe_realloc(ep_ids, (size_t)ep_cap * sizeof(int64_t)); + ep_names = safe_realloc(ep_names, (size_t)ep_cap * sizeof(char *)); + } + ep_ids[ep_count] = sqlite3_column_int64(res_stmt, 0); + const char *fn_name = (const char *)sqlite3_column_text(res_stmt, 1); + ep_names[ep_count] = heap_strdup(fn_name ? fn_name : "?"); + ep_count++; + } + } + sqlite3_finalize(res_stmt); + } + } + + /* 2. Load nodes + CALLS edges for Louvain */ + const char *nsql = "SELECT id FROM nodes WHERE project=?1 " + "AND label IN ('Function','Method','Class','Interface')"; + sqlite3_stmt *nst = NULL; + int all_cap = 4096; + int all_count = 0; + int64_t *all_ids = malloc((size_t)all_cap * sizeof(int64_t)); + if (sqlite3_prepare_v2(s->db, nsql, -1, &nst, NULL) == SQLITE_OK) { + bind_text(nst, 1, project); + while (sqlite3_step(nst) == SQLITE_ROW) { + if (all_count >= all_cap) { + all_cap *= 2; + all_ids = safe_realloc(all_ids, (size_t)all_cap * sizeof(int64_t)); + } + all_ids[all_count++] = sqlite3_column_int64(nst, 0); + } + sqlite3_finalize(nst); + } + + /* Include CALLS, HANDLES, and HTTP_CALLS for Louvain community detection. + * HANDLES connects Route → handler, HTTP_CALLS connects client → API endpoint. + * Without these, Express/Hapi route flows are invisible to process detection. */ + const char *esql = "SELECT source_id, target_id FROM edges WHERE project=?1 " + "AND type IN ('CALLS','HANDLES','HTTP_CALLS','ASYNC_CALLS')"; + sqlite3_stmt *est = NULL; + int le_cap = 8192; + int le_count = 0; + cbm_louvain_edge_t *ledges = malloc((size_t)le_cap * sizeof(cbm_louvain_edge_t)); + if (sqlite3_prepare_v2(s->db, esql, -1, &est, NULL) == SQLITE_OK) { + bind_text(est, 1, project); + while (sqlite3_step(est) == SQLITE_ROW) { + if (le_count >= le_cap) { + le_cap *= 2; + ledges = safe_realloc(ledges, (size_t)le_cap * sizeof(cbm_louvain_edge_t)); + } + ledges[le_count].src = sqlite3_column_int64(est, 0); + ledges[le_count].dst = sqlite3_column_int64(est, 1); + le_count++; + } + sqlite3_finalize(est); + } + + /* 3. Run Louvain */ + cbm_louvain_result_t *lresults = NULL; + int lcount = 0; + if (all_count > 1 && le_count > 0) { + cbm_louvain(all_ids, all_count, ledges, le_count, &lresults, &lcount); + } + free(all_ids); + free(ledges); + + /* Build node_id → community lookup (parallel arrays — O(n) scan per lookup, + * acceptable for entry_point_count * visited_count iterations) */ + int64_t *comm_nids = NULL; + int *comm_vals = NULL; + int comm_size = 0; + if (lresults && lcount > 0) { + comm_nids = malloc((size_t)lcount * sizeof(int64_t)); + comm_vals = malloc((size_t)lcount * sizeof(int)); + for (int i = 0; i < lcount; i++) { + comm_nids[i] = lresults[i].node_id; + comm_vals[i] = lresults[i].community; + } + comm_size = lcount; + } + free(lresults); + + /* 4. BFS from each entry point, detect cross-community flows */ + sqlite3_stmt *ins_proc = NULL; + sqlite3_stmt *ins_step = NULL; + sqlite3_prepare_v2(s->db, + "INSERT INTO processes(project,label,process_type,step_count," + "entry_point_id,terminal_id) VALUES(?1,?2,?3,?4,?5,?6)", + -1, &ins_proc, NULL); + sqlite3_prepare_v2(s->db, + "INSERT INTO process_steps(process_id,node_id,step) VALUES(?1,?2,?3)", + -1, &ins_step, NULL); + + exec_sql(s, "BEGIN TRANSACTION"); + int proc_count = 0; + + for (int ei = 0; ei < ep_count && proc_count < max_processes; ei++) { + const char *bfs_types[] = {"CALLS", "HANDLES", "HTTP_CALLS", "ASYNC_CALLS"}; + cbm_traverse_result_t tr = {0}; + cbm_store_bfs(s, ep_ids[ei], "outbound", bfs_types, 4, 8, 50, &tr); + + if (tr.visited_count < 2) { + cbm_store_traverse_free(&tr); + continue; + } + + /* Find the best cross-community terminal node. + * Instead of just picking the deepest hop (which gives generic utility functions + * like "update", "findOne"), score candidates by domain specificity: + * - Longer names score higher (domain-specific names are longer) + * - Generic names (update, get, set, find, create, delete, push, pop, error, + * log, emit, send, save, load, init, close, open) score 0 + * - Names starting with uppercase score higher (likely domain classes/handlers) */ + static const char *generic_names[] = { + "update", "get", "set", "find", "findOne", "findAll", "create", "delete", + "push", "pop", "error", "log", "emit", "send", "save", "load", "init", + "close", "open", "call", "apply", "bind", "then", "catch", "resolve", + "reject", "next", "done", "callback", "handler", "run", "execute", + "start", "stop", "reset", "clear", "add", "remove", "insert", + "forEach", "map", "filter", "reduce", "assign", "merge", "clone", + "parse", "format", "validate", "check", "test", "assert", + "toString", "valueOf", "toJSON", "default", "index", "main", + "getInstance", "getConnection", "getConfig", "getLogger", + "request", "response", "query", "result", "data", "value", + "defaultFilter", "_refreshCookies", NULL + }; + + int ep_comm = -1; + for (int c = 0; c < comm_size; c++) { + if (comm_nids[c] == ep_ids[ei]) { ep_comm = comm_vals[c]; break; } + } + + int64_t terminal_id = ep_ids[ei]; + const char *terminal_name = ep_names[ei]; + int best_score = -1; + bool is_cross = false; + + for (int v = 0; v < tr.visited_count; v++) { + int node_comm = -1; + for (int c = 0; c < comm_size; c++) { + if (comm_nids[c] == tr.visited[v].node.id) { node_comm = comm_vals[c]; break; } + } + if (node_comm != ep_comm && node_comm >= 0 && ep_comm >= 0) { + const char *nm = tr.visited[v].node.name; + if (!nm) continue; + + /* Score: name length * 10 + hop * 5, minus penalty for generics */ + int score = (int)strlen(nm) * 10 + tr.visited[v].hop * 5; + + /* Penalty for generic names */ + bool is_generic = false; + for (int g = 0; generic_names[g]; g++) { + if (strcmp(nm, generic_names[g]) == 0) { + is_generic = true; + break; + } + } + if (is_generic) score = 0; + + /* Bonus for CamelCase names starting with uppercase (domain handlers) */ + if (nm[0] >= 'A' && nm[0] <= 'Z') score += 50; + + /* Bonus for names containing domain verbs */ + if (strstr(nm, "Handler") || strstr(nm, "Controller") || + strstr(nm, "Service") || strstr(nm, "Storage") || + strstr(nm, "Plugin") || strstr(nm, "Middleware") || + strstr(nm, "Permission") || strstr(nm, "Authorization") || + strstr(nm, "Scope") || strstr(nm, "Role") || + strstr(nm, "Session") || strstr(nm, "User") || + strstr(nm, "Course") || strstr(nm, "Evaluation") || + strstr(nm, "Scenario")) { + score += 100; + } + + if (score > best_score) { + best_score = score; + terminal_id = tr.visited[v].node.id; + terminal_name = nm; + is_cross = true; + } + } + } + + /* If no cross-community terminal was found, still accept flows with ≥3 steps. + * This prevents filtering out legitimate API flows (route → controller → storage) + * that happen to stay within one Louvain community due to flat call patterns. + * Pick the deepest non-generic node as terminal for the label. */ + if (!is_cross) { + if (tr.visited_count < 3) { + cbm_store_traverse_free(&tr); + continue; + } + /* Find best terminal by hop depth + name quality */ + for (int v = 0; v < tr.visited_count; v++) { + const char *nm = tr.visited[v].node.name; + if (!nm) continue; + bool is_generic = false; + for (int g = 0; generic_names[g]; g++) { + if (strcmp(nm, generic_names[g]) == 0) { is_generic = true; break; } + } + if (is_generic) continue; + int score = (int)strlen(nm) * 10 + tr.visited[v].hop * 5; + if (nm[0] >= 'A' && nm[0] <= 'Z') score += 50; + if (score > best_score) { + best_score = score; + terminal_id = tr.visited[v].node.id; + terminal_name = nm; + } + } + } + + /* Label: "EntryPoint → Terminal" (UTF-8 arrow) */ + char label[512]; + snprintf(label, sizeof(label), "%s \xe2\x86\x92 %s", ep_names[ei], terminal_name); + + if (ins_proc) { + sqlite3_reset(ins_proc); + bind_text(ins_proc, 1, project); + bind_text(ins_proc, 2, label); + bind_text(ins_proc, 3, "cross_community"); + sqlite3_bind_int(ins_proc, 4, tr.visited_count + 1); + sqlite3_bind_int64(ins_proc, 5, ep_ids[ei]); + sqlite3_bind_int64(ins_proc, 6, terminal_id); + sqlite3_step(ins_proc); + } + + int64_t proc_id = sqlite3_last_insert_rowid(s->db); + + /* Insert steps */ + if (ins_step) { + sqlite3_reset(ins_step); + sqlite3_bind_int64(ins_step, 1, proc_id); + sqlite3_bind_int64(ins_step, 2, ep_ids[ei]); + sqlite3_bind_int(ins_step, 3, 0); + sqlite3_step(ins_step); + + for (int v = 0; v < tr.visited_count; v++) { + sqlite3_reset(ins_step); + sqlite3_bind_int64(ins_step, 1, proc_id); + sqlite3_bind_int64(ins_step, 2, tr.visited[v].node.id); + sqlite3_bind_int(ins_step, 3, tr.visited[v].hop); + sqlite3_step(ins_step); + } + } + + cbm_store_traverse_free(&tr); + proc_count++; + } + + exec_sql(s, "COMMIT"); + if (ins_proc) sqlite3_finalize(ins_proc); + if (ins_step) sqlite3_finalize(ins_step); + + free(comm_nids); + free(comm_vals); + for (int i = 0; i < ep_count; i++) free(ep_names[i]); + free(ep_names); + free(ep_ids); + + return proc_count; +} + +int cbm_store_list_processes(cbm_store_t *s, const char *project, + cbm_process_info_t **out, int *count) { + *out = NULL; + *count = 0; + const char *sql = "SELECT p.id, p.label, p.process_type, p.step_count, " + "p.entry_point_id, p.terminal_id " + "FROM processes p WHERE p.project = ?1 " + "ORDER BY p.step_count DESC LIMIT 300"; + sqlite3_stmt *stmt = NULL; + if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) { + return CBM_STORE_OK; /* Table may not exist yet */ + } + bind_text(stmt, 1, project); + + int cap = 64; + int n = 0; + cbm_process_info_t *arr = calloc((size_t)cap, sizeof(cbm_process_info_t)); + + while (sqlite3_step(stmt) == SQLITE_ROW) { + if (n >= cap) { + cap *= 2; + arr = safe_realloc(arr, (size_t)cap * sizeof(cbm_process_info_t)); + } + arr[n].id = sqlite3_column_int64(stmt, 0); + arr[n].label = heap_strdup((const char *)sqlite3_column_text(stmt, 1)); + arr[n].process_type = heap_strdup((const char *)sqlite3_column_text(stmt, 2)); + arr[n].step_count = sqlite3_column_int(stmt, 3); + arr[n].entry_point_id = sqlite3_column_int64(stmt, 4); + arr[n].terminal_id = sqlite3_column_int64(stmt, 5); + n++; + } + sqlite3_finalize(stmt); + *out = arr; + *count = n; + return CBM_STORE_OK; +} + +int cbm_store_get_process_steps(cbm_store_t *s, int64_t process_id, + cbm_process_step_t **out, int *count) { + *out = NULL; + *count = 0; + const char *sql = "SELECT ps.node_id, n.name, n.qualified_name, n.file_path, ps.step " + "FROM process_steps ps JOIN nodes n ON n.id = ps.node_id " + "WHERE ps.process_id = ?1 ORDER BY ps.step"; + sqlite3_stmt *stmt = NULL; + if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) { + return CBM_STORE_OK; + } + sqlite3_bind_int64(stmt, 1, process_id); + + int cap = 16; + int n = 0; + cbm_process_step_t *arr = calloc((size_t)cap, sizeof(cbm_process_step_t)); + + while (sqlite3_step(stmt) == SQLITE_ROW) { + if (n >= cap) { + cap *= 2; + arr = safe_realloc(arr, (size_t)cap * sizeof(cbm_process_step_t)); + } + arr[n].node_id = sqlite3_column_int64(stmt, 0); + arr[n].name = heap_strdup((const char *)sqlite3_column_text(stmt, 1)); + arr[n].qualified_name = heap_strdup((const char *)sqlite3_column_text(stmt, 2)); + arr[n].file_path = heap_strdup((const char *)sqlite3_column_text(stmt, 3)); + arr[n].step = sqlite3_column_int(stmt, 4); + n++; + } + sqlite3_finalize(stmt); + *out = arr; + *count = n; + return CBM_STORE_OK; +} + +void cbm_store_free_processes(cbm_process_info_t *arr, int count) { + for (int i = 0; i < count; i++) { + free((void *)arr[i].label); + free((void *)arr[i].process_type); + } + free(arr); +} + +void cbm_store_free_process_steps(cbm_process_step_t *arr, int count) { + for (int i = 0; i < count; i++) { + free((void *)arr[i].name); + free((void *)arr[i].qualified_name); + free((void *)arr[i].file_path); + } + free(arr); +} + +/* ── Channels (cross-service message tracing) ────────────────────── */ + +/* Forward declaration of channel extractors from httplink.c */ +typedef struct { + char channel[256]; + char direction[8]; + char transport[32]; +} cbm_channel_match_t; +int cbm_extract_channels(const char *source, cbm_channel_match_t *out, int max_out); +int cbm_extract_csharp_channels(const char *source, cbm_channel_match_t *out, int max_out); + +int cbm_store_detect_channels(cbm_store_t *s, const char *project, const char *repo_path) { + if (!s || !s->db || !project || !repo_path) return 0; + + /* Clear existing channels for this project (parameterized — no SQL injection) */ + { + sqlite3_stmt *del_stmt = NULL; + sqlite3_prepare_v2(s->db, "DELETE FROM channels WHERE project = ?1", -1, &del_stmt, NULL); + if (del_stmt) { + bind_text(del_stmt, 1, project); + sqlite3_step(del_stmt); + sqlite3_finalize(del_stmt); + } + } + + /* Find all Function/Method nodes with source file references in supported languages */ + const char *sql = "SELECT id, name, file_path, start_line, end_line FROM nodes " + "WHERE project = ?1 AND label IN ('Function','Method','Module','Class') " + "AND (file_path LIKE '%.ts' OR file_path LIKE '%.js' " + "OR file_path LIKE '%.tsx' OR file_path LIKE '%.py' " + "OR file_path LIKE '%.cs')"; + sqlite3_stmt *stmt = NULL; + if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) return 0; + bind_text(stmt, 1, project); + + sqlite3_stmt *ins = NULL; + sqlite3_prepare_v2(s->db, + "INSERT OR IGNORE INTO channels(project,channel_name,direction,transport,node_id,file_path,function_name) " + "VALUES(?1,?2,?3,?4,?5,?6,?7)", -1, &ins, NULL); + + exec_sql(s, "BEGIN TRANSACTION"); + int total = 0; + + while (sqlite3_step(stmt) == SQLITE_ROW) { + int64_t node_id = sqlite3_column_int64(stmt, 0); + const char *name = (const char *)sqlite3_column_text(stmt, 1); + const char *fpath = (const char *)sqlite3_column_text(stmt, 2); + int start = sqlite3_column_int(stmt, 3); + int end = sqlite3_column_int(stmt, 4); + + if (!fpath || !fpath[0] || start <= 0 || end <= 0) continue; + + /* Read source lines from disk */ + char full_path[2048]; + snprintf(full_path, sizeof(full_path), "%s/%s", repo_path, fpath); + + FILE *f = fopen(full_path, "r"); + if (!f) continue; + + /* Read relevant lines */ + char *source = NULL; + size_t src_len = 0; + size_t src_cap = 0; + int line_num = 0; + char line[4096]; + + while (fgets(line, sizeof(line), f)) { + line_num++; + if (line_num < start) continue; + if (line_num > end) break; + size_t ll = strlen(line); + if (src_len + ll >= src_cap) { + src_cap = (src_cap == 0) ? 4096 : src_cap * 2; + source = safe_realloc(source, src_cap); + } + memcpy(source + src_len, line, ll); + src_len += ll; + } + fclose(f); + + if (source) { + source[src_len] = '\0'; + cbm_channel_match_t matches[64]; + int mc = 0; + /* Use language-appropriate extractor */ + bool is_cs = fpath && (strstr(fpath, ".cs") != NULL && + strstr(fpath, ".css") == NULL); + if (is_cs) { + mc = cbm_extract_csharp_channels(source, matches, 64); + } else { + mc = cbm_extract_channels(source, matches, 64); + } + for (int i = 0; i < mc && ins; i++) { + sqlite3_reset(ins); + bind_text(ins, 1, project); + bind_text(ins, 2, matches[i].channel); + bind_text(ins, 3, matches[i].direction); + bind_text(ins, 4, matches[i].transport); + sqlite3_bind_int64(ins, 5, node_id); + bind_text(ins, 6, fpath); + bind_text(ins, 7, name ? name : ""); + sqlite3_step(ins); + total++; + } + free(source); + } + } + + exec_sql(s, "COMMIT"); + sqlite3_finalize(stmt); + if (ins) sqlite3_finalize(ins); + return total; +} + +int cbm_store_find_channels(cbm_store_t *s, const char *project, const char *channel, + cbm_channel_info_t **out, int *count) { + *out = NULL; + *count = 0; + + /* Build query — if project is NULL, search all; if channel is NULL, return all. + * Use DISTINCT to prevent duplicate rows from different extraction passes. */ + char sql[1024]; + if (project && channel) { + snprintf(sql, sizeof(sql), + "SELECT DISTINCT channel_name, direction, transport, project, file_path, function_name " + "FROM channels WHERE project = ?1 AND channel_name LIKE ?2 " + "ORDER BY channel_name LIMIT 500"); + } else if (project) { + snprintf(sql, sizeof(sql), + "SELECT DISTINCT channel_name, direction, transport, project, file_path, function_name " + "FROM channels WHERE project = ?1 ORDER BY channel_name LIMIT 500"); + } else if (channel) { + snprintf(sql, sizeof(sql), + "SELECT DISTINCT channel_name, direction, transport, project, file_path, function_name " + "FROM channels WHERE channel_name LIKE ?1 ORDER BY channel_name LIMIT 500"); + } else { + snprintf(sql, sizeof(sql), + "SELECT DISTINCT channel_name, direction, transport, project, file_path, function_name " + "FROM channels ORDER BY channel_name LIMIT 500"); + } + + sqlite3_stmt *stmt = NULL; + if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) return CBM_STORE_OK; + + int bi = 0; + if (project && channel) { + bind_text(stmt, 1, project); + char pat[256]; + snprintf(pat, sizeof(pat), "%%%s%%", channel); + bind_text(stmt, 2, pat); + } else if (project) { + bind_text(stmt, 1, project); + } else if (channel) { + char pat[256]; + snprintf(pat, sizeof(pat), "%%%s%%", channel); + bind_text(stmt, 1, pat); + } + (void)bi; + + int cap = 64; + int n = 0; + cbm_channel_info_t *arr = calloc((size_t)cap, sizeof(cbm_channel_info_t)); + while (sqlite3_step(stmt) == SQLITE_ROW) { + if (n >= cap) { cap *= 2; arr = safe_realloc(arr, (size_t)cap * sizeof(cbm_channel_info_t)); } + arr[n].channel_name = heap_strdup((const char *)sqlite3_column_text(stmt, 0)); + arr[n].direction = heap_strdup((const char *)sqlite3_column_text(stmt, 1)); + arr[n].transport = heap_strdup((const char *)sqlite3_column_text(stmt, 2)); + arr[n].project = heap_strdup((const char *)sqlite3_column_text(stmt, 3)); + arr[n].file_path = heap_strdup((const char *)sqlite3_column_text(stmt, 4)); + arr[n].function_name = heap_strdup((const char *)sqlite3_column_text(stmt, 5)); + n++; + } + sqlite3_finalize(stmt); + *out = arr; + *count = n; + return CBM_STORE_OK; +} + +void cbm_store_free_channels(cbm_channel_info_t *arr, int count) { + for (int i = 0; i < count; i++) { + free((void *)arr[i].channel_name); + free((void *)arr[i].direction); + free((void *)arr[i].transport); + free((void *)arr[i].project); + free((void *)arr[i].file_path); + free((void *)arr[i].function_name); + } + free(arr); +} + /* ── ADR (Architecture Decision Record) ────────────────────────── */ static const char *canonical_sections[] = {"PURPOSE", "STACK", "ARCHITECTURE", diff --git a/src/store/store.h b/src/store/store.h index 17b0df11..afd29f3b 100644 --- a/src/store/store.h +++ b/src/store/store.h @@ -108,6 +108,7 @@ typedef struct { const char *name_pattern; /* regex on name, NULL = any */ const char *qn_pattern; /* regex on qualified_name, NULL = any */ const char *file_pattern; /* glob on file_path, NULL = any */ + const char *query; /* free-text BM25 query via FTS5, NULL = disabled */ const char *relationship; /* edge type filter, NULL = any */ const char *direction; /* "inbound" / "outbound" / "any", NULL = any */ int min_degree; /* -1 = no filter (default), 0+ = minimum */ @@ -209,6 +210,9 @@ cbm_store_t *cbm_store_open(const char *project); /* Close the store and free all resources. NULL-safe. */ void cbm_store_close(cbm_store_t *s); +/* Execute a raw SQL statement (for DDL, DML, etc.). */ +int cbm_store_exec(cbm_store_t *s, const char *sql); + /* Get the underlying sqlite3 handle (for testing only). */ struct sqlite3 *cbm_store_get_db(cbm_store_t *s); @@ -514,6 +518,57 @@ int cbm_store_get_architecture(cbm_store_t *s, const char *project, const char * int aspect_count, cbm_architecture_info_t *out); void cbm_store_architecture_free(cbm_architecture_info_t *out); +/* ── Processes (execution flows) ─────────────────────────────────── */ + +typedef struct { + int64_t id; + const char *label; /* "EntryPoint → Terminal" */ + const char *process_type; /* "cross_community" or "intra_community" */ + int step_count; + int64_t entry_point_id; + int64_t terminal_id; +} cbm_process_info_t; + +typedef struct { + int64_t node_id; + const char *name; + const char *qualified_name; + const char *file_path; + int step; +} cbm_process_step_t; + +int cbm_store_list_processes(cbm_store_t *s, const char *project, + cbm_process_info_t **out, int *count); +int cbm_store_get_process_steps(cbm_store_t *s, int64_t process_id, + cbm_process_step_t **out, int *count); +void cbm_store_free_processes(cbm_process_info_t *arr, int count); +void cbm_store_free_process_steps(cbm_process_step_t *arr, int count); + +/* Detect execution flows from entry points via BFS + Louvain community crossing. */ +int cbm_store_detect_processes(cbm_store_t *s, const char *project, int max_processes); + +/* ── Channels (cross-service message tracing) ────────────────────── */ + +typedef struct { + const char *channel_name; + const char *direction; /* "emit" or "listen" */ + const char *transport; /* "socketio", "eventemitter" */ + const char *project; + const char *file_path; + const char *function_name; +} cbm_channel_info_t; + +/* Detect channel emit/listen patterns in indexed source files. + * Reads source from disk for JS/TS/Python files and scans for + * socket.emit/on, emitter.emit/on patterns. */ +int cbm_store_detect_channels(cbm_store_t *s, const char *project, const char *repo_path); + +/* Query channels by name (partial match). If channel is NULL, returns all. + * If project is NULL, searches across all loaded projects. */ +int cbm_store_find_channels(cbm_store_t *s, const char *project, const char *channel, + cbm_channel_info_t **out, int *count); +void cbm_store_free_channels(cbm_channel_info_t *arr, int count); + /* ── ADR (Architecture Decision Record) ────────────────────────── */ #define CBM_ADR_MAX_LENGTH 8000