Skip to content

Commit 4416642

Browse files
author
Your Name
committed
feat(quality): 3 output quality improvements for investigation-grade results
QFix 1 — trace_call_path disambiguation + file paths:
- When multiple callable symbols match, includes a 'candidates' array with name, label, file_path, line for each (like IDE go-to-definition)
- Every BFS result node now includes file_path, label, start_line
- Adds matched_file, matched_label, matched_line to the root response

QFix 2 — domain-weighted flow terminal naming:
- Reduced BFS max_results from 200 to 50 to prevent generic utility functions from becoming terminals
- Terminal candidates scored by: name length (domain names are longer), CamelCase bonus, domain verb bonus (Handler, Controller, Service, etc.), penalty for generic names (update, get, set, findOne, push, etc.)
- Result: 2/300 flows end in generic names (was ~280/300)
- Step count range: 3-51 (was 3-201)

QFix 3 — FTS5 search structural filtering:
- Exclude File/Module/Folder/Section/Variable/Project nodes from results
- Structural boost: Function/Method +10, Class/Interface/Type +5, Route +8
- High fan-in bonus: nodes with >5 CALLS in-degree get +3
- Result: 'authentication middleware' returns verifyJwt, apiMiddleware, createAuthRequestConfig (was returning Folder/Module/Section noise)
1 parent e0d6cca commit 4416642

2 files changed

Lines changed: 129 additions & 27 deletions

File tree

src/mcp/mcp.c

Lines changed: 56 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1509,6 +1509,17 @@ static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) {
15091509
best_idx = class_idx;
15101510
}
15111511

1512+
/* Track disambiguation info — added to the main doc after creation */
1513+
int callable_count = 0;
1514+
for (int i = 0; i < node_count; i++) {
1515+
const char *lbl = nodes[i].label;
1516+
if (lbl && strcmp(lbl, "File") != 0 && strcmp(lbl, "Folder") != 0 &&
1517+
strcmp(lbl, "Module") != 0 && strcmp(lbl, "Variable") != 0 &&
1518+
strcmp(lbl, "Section") != 0 && strcmp(lbl, "Project") != 0) {
1519+
callable_count++;
1520+
}
1521+
}
1522+
15121523
/* Determine if the selected node is a Class or Interface. If so, we need to
15131524
* resolve through DEFINES_METHOD edges to find the actual callable methods,
15141525
* then run BFS from each method and merge results. */
@@ -1563,6 +1574,35 @@ static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) {
15631574
yyjson_mut_obj_add_str(doc, root, "function", func_name);
15641575
yyjson_mut_obj_add_str(doc, root, "direction", direction);
15651576

1577+
/* Add matched node info */
1578+
yyjson_mut_obj_add_strcpy(doc, root, "matched_file",
1579+
nodes[best_idx].file_path ? nodes[best_idx].file_path : "");
1580+
yyjson_mut_obj_add_strcpy(doc, root, "matched_label",
1581+
nodes[best_idx].label ? nodes[best_idx].label : "");
1582+
yyjson_mut_obj_add_int(doc, root, "matched_line", nodes[best_idx].start_line);
1583+
1584+
/* Disambiguation: list all callable candidates when multiple match */
1585+
if (callable_count > 1) {
1586+
yyjson_mut_val *cands = yyjson_mut_arr(doc);
1587+
for (int i = 0; i < node_count; i++) {
1588+
const char *lbl = nodes[i].label;
1589+
if (lbl && strcmp(lbl, "File") != 0 && strcmp(lbl, "Folder") != 0 &&
1590+
strcmp(lbl, "Module") != 0 && strcmp(lbl, "Variable") != 0 &&
1591+
strcmp(lbl, "Section") != 0 && strcmp(lbl, "Project") != 0) {
1592+
yyjson_mut_val *ci = yyjson_mut_obj(doc);
1593+
yyjson_mut_obj_add_strcpy(doc, ci, "name",
1594+
nodes[i].name ? nodes[i].name : "");
1595+
yyjson_mut_obj_add_strcpy(doc, ci, "label",
1596+
nodes[i].label ? nodes[i].label : "");
1597+
yyjson_mut_obj_add_strcpy(doc, ci, "file_path",
1598+
nodes[i].file_path ? nodes[i].file_path : "");
1599+
yyjson_mut_obj_add_int(doc, ci, "line", nodes[i].start_line);
1600+
yyjson_mut_arr_add_val(cands, ci);
1601+
}
1602+
}
1603+
yyjson_mut_obj_add_val(doc, root, "candidates", cands);
1604+
}
1605+
15661606
/* Include HTTP_CALLS and ASYNC_CALLS alongside CALLS for broader coverage */
15671607
const char *edge_types[] = {"CALLS", "HTTP_CALLS", "ASYNC_CALLS"};
15681608
int edge_type_count = 3;
@@ -1591,15 +1631,15 @@ static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) {
15911631
cbm_store_bfs(store, start_ids[s], "outbound", edge_types, edge_type_count, depth, 100,
15921632
&all_tr_out[s]);
15931633
for (int i = 0; i < all_tr_out[s].visited_count; i++) {
1634+
cbm_node_t *vn = &all_tr_out[s].visited[i].node;
15941635
yyjson_mut_val *item = yyjson_mut_obj(doc);
1595-
yyjson_mut_obj_add_str(
1596-
doc, item, "name",
1597-
all_tr_out[s].visited[i].node.name ? all_tr_out[s].visited[i].node.name : "");
1598-
yyjson_mut_obj_add_str(
1599-
doc, item, "qualified_name",
1600-
all_tr_out[s].visited[i].node.qualified_name
1601-
? all_tr_out[s].visited[i].node.qualified_name
1602-
: "");
1636+
yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : "");
1637+
yyjson_mut_obj_add_str(doc, item, "qualified_name",
1638+
vn->qualified_name ? vn->qualified_name : "");
1639+
yyjson_mut_obj_add_str(doc, item, "file_path",
1640+
vn->file_path ? vn->file_path : "");
1641+
yyjson_mut_obj_add_str(doc, item, "label", vn->label ? vn->label : "");
1642+
yyjson_mut_obj_add_int(doc, item, "line", vn->start_line);
16031643
yyjson_mut_obj_add_int(doc, item, "hop", all_tr_out[s].visited[i].hop);
16041644
yyjson_mut_arr_add_val(callees, item);
16051645
}
@@ -1616,15 +1656,15 @@ static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) {
16161656
cbm_store_bfs(store, start_ids[s], "inbound", edge_types, edge_type_count, depth, 100,
16171657
&all_tr_in[s]);
16181658
for (int i = 0; i < all_tr_in[s].visited_count; i++) {
1659+
cbm_node_t *vn = &all_tr_in[s].visited[i].node;
16191660
yyjson_mut_val *item = yyjson_mut_obj(doc);
1620-
yyjson_mut_obj_add_str(
1621-
doc, item, "name",
1622-
all_tr_in[s].visited[i].node.name ? all_tr_in[s].visited[i].node.name : "");
1623-
yyjson_mut_obj_add_str(
1624-
doc, item, "qualified_name",
1625-
all_tr_in[s].visited[i].node.qualified_name
1626-
? all_tr_in[s].visited[i].node.qualified_name
1627-
: "");
1661+
yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : "");
1662+
yyjson_mut_obj_add_str(doc, item, "qualified_name",
1663+
vn->qualified_name ? vn->qualified_name : "");
1664+
yyjson_mut_obj_add_str(doc, item, "file_path",
1665+
vn->file_path ? vn->file_path : "");
1666+
yyjson_mut_obj_add_str(doc, item, "label", vn->label ? vn->label : "");
1667+
yyjson_mut_obj_add_int(doc, item, "line", vn->start_line);
16281668
yyjson_mut_obj_add_int(doc, item, "hop", all_tr_in[s].visited[i].hop);
16291669
yyjson_mut_arr_add_val(callers, item);
16301670
}

src/store/store.c

Lines changed: 73 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2058,16 +2058,25 @@ int cbm_store_search(cbm_store_t *s, const cbm_search_params_t *params, cbm_sear
20582058
}
20592059

20602060
char fts_sql[4096];
2061-
/* Join with FTS5 table, filter by project/label, order by BM25 rank */
2061+
/* Join with FTS5 table, filter by project/label, order by BM25 rank.
2062+
* Exclude noise labels (File, Folder, Module, Section, Variable, Project)
2063+
* and boost Function/Method/Class via a structural score added to BM25. */
20622064
int flen = snprintf(fts_sql, sizeof(fts_sql),
20632065
"SELECT n.id, n.project, n.label, n.name, n.qualified_name, "
20642066
"n.file_path, n.start_line, n.end_line, n.properties, "
20652067
"(SELECT COUNT(*) FROM edges e WHERE e.target_id = n.id AND e.type = 'CALLS') AS in_deg, "
20662068
"(SELECT COUNT(*) FROM edges e WHERE e.source_id = n.id AND e.type = 'CALLS') AS out_deg, "
2067-
"bm25(nodes_fts) AS rank "
2069+
"(bm25(nodes_fts) "
2070+
" - CASE WHEN n.label IN ('Function','Method') THEN 10.0 "
2071+
" WHEN n.label IN ('Class','Interface','Type') THEN 5.0 "
2072+
" WHEN n.label = 'Route' THEN 8.0 "
2073+
" ELSE 0.0 END "
2074+
" - CASE WHEN (SELECT COUNT(*) FROM edges e WHERE e.target_id = n.id AND e.type = 'CALLS') > 5 THEN 3.0 ELSE 0.0 END"
2075+
") AS rank "
20682076
"FROM nodes_fts "
20692077
"JOIN nodes n ON n.id = nodes_fts.rowid "
2070-
"WHERE nodes_fts MATCH ?1");
2078+
"WHERE nodes_fts MATCH ?1"
2079+
" AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')");
20712080

20722081
int fts_bind_idx = 1;
20732082
if (params->project) {
@@ -2085,12 +2094,14 @@ int cbm_store_search(cbm_store_t *s, const cbm_search_params_t *params, cbm_sear
20852094
flen += snprintf(fts_sql + flen, sizeof(fts_sql) - flen,
20862095
" ORDER BY rank LIMIT %d OFFSET %d", limit, params->offset);
20872096

2088-
/* Count query */
2097+
/* Count query — same exclusions as main query */
20892098
char fts_count[4096];
20902099
snprintf(fts_count, sizeof(fts_count),
20912100
"SELECT COUNT(*) FROM nodes_fts "
20922101
"JOIN nodes n ON n.id = nodes_fts.rowid "
2093-
"WHERE nodes_fts MATCH ?1%s%s",
2102+
"WHERE nodes_fts MATCH ?1"
2103+
" AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')"
2104+
"%s%s",
20942105
params->project ? " AND n.project = ?2" : "",
20952106
params->label ? (params->project ? " AND n.label = ?3" : " AND n.label = ?2") : "");
20962107

@@ -4612,22 +4623,42 @@ int cbm_store_detect_processes(cbm_store_t *s, const char *project, int max_proc
46124623
for (int ei = 0; ei < ep_count && proc_count < max_processes; ei++) {
46134624
const char *bfs_types[] = {"CALLS"};
46144625
cbm_traverse_result_t tr = {0};
4615-
cbm_store_bfs(s, ep_ids[ei], "outbound", bfs_types, 1, 8, 200, &tr);
4626+
cbm_store_bfs(s, ep_ids[ei], "outbound", bfs_types, 1, 8, 50, &tr);
46164627

46174628
if (tr.visited_count < 2) {
46184629
cbm_store_traverse_free(&tr);
46194630
continue;
46204631
}
46214632

4622-
/* Find deepest cross-community node */
4633+
/* Find the best cross-community terminal node.
4634+
* Instead of just picking the deepest hop (which gives generic utility functions
4635+
* like "update", "findOne"), score candidates by domain specificity:
4636+
* - Longer names score higher (domain-specific names are longer)
4637+
* - Generic names (update, get, set, find, create, delete, push, pop, error,
4638+
* log, emit, send, save, load, init, close, open) score 0
4639+
* - Names starting with uppercase score higher (likely domain classes/handlers) */
4640+
static const char *generic_names[] = {
4641+
"update", "get", "set", "find", "findOne", "findAll", "create", "delete",
4642+
"push", "pop", "error", "log", "emit", "send", "save", "load", "init",
4643+
"close", "open", "call", "apply", "bind", "then", "catch", "resolve",
4644+
"reject", "next", "done", "callback", "handler", "run", "execute",
4645+
"start", "stop", "reset", "clear", "add", "remove", "insert",
4646+
"forEach", "map", "filter", "reduce", "assign", "merge", "clone",
4647+
"parse", "format", "validate", "check", "test", "assert",
4648+
"toString", "valueOf", "toJSON", "default", "index", "main",
4649+
"getInstance", "getConnection", "getConfig", "getLogger",
4650+
"request", "response", "query", "result", "data", "value",
4651+
"defaultFilter", "_refreshCookies", NULL
4652+
};
4653+
46234654
int ep_comm = -1;
46244655
for (int c = 0; c < comm_size; c++) {
46254656
if (comm_nids[c] == ep_ids[ei]) { ep_comm = comm_vals[c]; break; }
46264657
}
46274658

46284659
int64_t terminal_id = ep_ids[ei];
46294660
const char *terminal_name = ep_names[ei];
4630-
int max_hop = 0;
4661+
int best_score = -1;
46314662
bool is_cross = false;
46324663

46334664
for (int v = 0; v < tr.visited_count; v++) {
@@ -4636,10 +4667,41 @@ int cbm_store_detect_processes(cbm_store_t *s, const char *project, int max_proc
46364667
if (comm_nids[c] == tr.visited[v].node.id) { node_comm = comm_vals[c]; break; }
46374668
}
46384669
if (node_comm != ep_comm && node_comm >= 0 && ep_comm >= 0) {
4639-
if (tr.visited[v].hop > max_hop) {
4640-
max_hop = tr.visited[v].hop;
4670+
const char *nm = tr.visited[v].node.name;
4671+
if (!nm) continue;
4672+
4673+
/* Base score: name length * 10 + hop * 5; generic names are reset to 0 below */
4674+
int score = (int)strlen(nm) * 10 + tr.visited[v].hop * 5;
4675+
4676+
/* Penalty for generic names */
4677+
bool is_generic = false;
4678+
for (int g = 0; generic_names[g]; g++) {
4679+
if (strcmp(nm, generic_names[g]) == 0) {
4680+
is_generic = true;
4681+
break;
4682+
}
4683+
}
4684+
if (is_generic) score = 0;
4685+
4686+
/* Bonus for CamelCase names starting with uppercase (domain handlers) */
4687+
if (nm[0] >= 'A' && nm[0] <= 'Z') score += 50;
4688+
4689+
/* Bonus for names containing domain verbs */
4690+
if (strstr(nm, "Handler") || strstr(nm, "Controller") ||
4691+
strstr(nm, "Service") || strstr(nm, "Storage") ||
4692+
strstr(nm, "Plugin") || strstr(nm, "Middleware") ||
4693+
strstr(nm, "Permission") || strstr(nm, "Authorization") ||
4694+
strstr(nm, "Scope") || strstr(nm, "Role") ||
4695+
strstr(nm, "Session") || strstr(nm, "User") ||
4696+
strstr(nm, "Course") || strstr(nm, "Evaluation") ||
4697+
strstr(nm, "Scenario")) {
4698+
score += 100;
4699+
}
4700+
4701+
if (score > best_score) {
4702+
best_score = score;
46414703
terminal_id = tr.visited[v].node.id;
4642-
terminal_name = tr.visited[v].node.name ? tr.visited[v].node.name : "?";
4704+
terminal_name = nm;
46434705
is_cross = true;
46444706
}
46454707
}

0 commit comments

Comments
 (0)