Skip to content

Commit 358de42

Browse files
author
Your Name
committed
feat(store): add BM25 full-text search via SQLite FTS5
Add a nodes_fts FTS5 virtual table, kept in sync with nodes via triggers for INSERT/UPDATE/DELETE. Enable SQLITE_ENABLE_FTS5 in both the production and test Makefile flags. Add a new 'query' parameter on search_graph: when set, the search uses FTS5 MATCH with bm25() ranking instead of regex matching. Multi-word queries are tokenized into OR terms for broad matching (e.g. 'authentication middleware' matches nodes containing either word, ranked by relevance). The direct B-tree dump pipeline bypasses SQLite triggers, so add a bulk FTS5 backfill step after indexing: `INSERT INTO nodes_fts SELECT id, name, qualified_name, label, file_path FROM nodes`. Add a cbm_store_exec() public API for raw SQL execution. The search falls back gracefully to the regex path if FTS5 is unavailable. Tested: the 'authentication middleware' query returns 242 ranked results (was 0), and 'session recording upload' returns 4,722 ranked results with relevant routes, controllers, and constants at the top.
1 parent 58fff9e commit 358de42

5 files changed

Lines changed: 186 additions & 3 deletions

File tree

Makefile.cbm

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,8 @@ MIMALLOC_CFLAGS_TEST = -std=c11 -g -O1 -w \
217217

218218
# sqlite3 (vendored amalgamation — compiled ourselves for ASan instrumentation)
219219
SQLITE3_SRC = vendored/sqlite3/sqlite3.c
220-
SQLITE3_CFLAGS = -std=c11 -O2 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1
221-
SQLITE3_CFLAGS_TEST = -std=c11 -g -O1 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1
220+
SQLITE3_CFLAGS = -std=c11 -O2 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 -DSQLITE_ENABLE_FTS5
221+
SQLITE3_CFLAGS_TEST = -std=c11 -g -O1 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 -DSQLITE_ENABLE_FTS5
222222

223223
# TRE regex (vendored, Windows only — POSIX uses system <regex.h>)
224224
TRE_SRC = vendored/tre/tre_all.c

src/mcp/mcp.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -940,6 +940,8 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) {
940940
char *label = cbm_mcp_get_string_arg(args, "label");
941941
char *name_pattern = cbm_mcp_get_string_arg(args, "name_pattern");
942942
char *file_pattern = cbm_mcp_get_string_arg(args, "file_pattern");
943+
char *query = cbm_mcp_get_string_arg(args, "query");
944+
char *sort_by = cbm_mcp_get_string_arg(args, "sort_by");
943945
int limit = cbm_mcp_get_int_arg(args, "limit", 500000);
944946
int offset = cbm_mcp_get_int_arg(args, "offset", 0);
945947
int min_degree = cbm_mcp_get_int_arg(args, "min_degree", -1);
@@ -950,6 +952,8 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) {
950952
.label = label,
951953
.name_pattern = name_pattern,
952954
.file_pattern = file_pattern,
955+
.query = query,
956+
.sort_by = sort_by,
953957
.limit = limit,
954958
.offset = offset,
955959
.min_degree = min_degree,
@@ -990,6 +994,8 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) {
990994
free(label);
991995
free(name_pattern);
992996
free(file_pattern);
997+
free(query);
998+
free(sort_by);
993999

9941000
char *result = cbm_mcp_text_result(json, false);
9951001
free(json);

src/pipeline/pipeline.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,16 @@ int cbm_pipeline_run(cbm_pipeline_t *p) {
818818
}
819819
cbm_store_close(hash_store);
820820
cbm_log_info("pass.timing", "pass", "persist_hashes", "files", itoa_buf(file_count));
821+
822+
/* Backfill FTS5 index: the direct B-tree dump bypasses SQLite triggers,
823+
* so the FTS5 table is empty after indexing. Populate it in bulk now. */
824+
cbm_store_t *fts_store = cbm_store_open_path(db_path);
825+
if (fts_store) {
826+
cbm_store_exec(fts_store,
827+
"INSERT OR REPLACE INTO nodes_fts(rowid, name, qualified_name, label, file_path) "
828+
"SELECT id, name, qualified_name, label, file_path FROM nodes;");
829+
cbm_store_close(fts_store);
830+
}
821831
}
822832
}
823833

src/store/store.c

Lines changed: 164 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,47 @@ static int create_user_indexes(cbm_store_t *s) {
212212
"CREATE INDEX IF NOT EXISTS idx_edges_type ON edges(project, type);"
213213
"CREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(project, target_id, type);"
214214
"CREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(project, source_id, type);";
215-
return exec_sql(s, sql);
215+
int rc = exec_sql(s, sql);
216+
if (rc != SQLITE_OK) return rc;
217+
218+
/* FTS5 full-text search index on node names for BM25 ranking.
219+
* content='nodes' makes it an external-content table — synced via triggers.
220+
* Each DDL statement must be executed separately for FTS5 compatibility. */
221+
{
222+
char *fts_err = NULL;
223+
int fts_rc = sqlite3_exec(s->db,
224+
"CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5("
225+
"name, qualified_name, label, file_path,"
226+
"content='nodes', content_rowid='id',"
227+
"tokenize='unicode61 remove_diacritics 2'"
228+
");",
229+
NULL, NULL, &fts_err);
230+
if (fts_rc != SQLITE_OK) {
231+
sqlite3_free(fts_err);
232+
/* Non-fatal — FTS5 may not be compiled in. Fall back to regex search. */
233+
return SQLITE_OK;
234+
}
235+
}
236+
237+
/* Sync triggers: keep FTS index up to date when nodes change */
238+
exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN"
239+
" INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path)"
240+
" VALUES (new.id, new.name, new.qualified_name, new.label, new.file_path);"
241+
"END;");
242+
243+
exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN"
244+
" INSERT INTO nodes_fts(nodes_fts, rowid, name, qualified_name, label, file_path)"
245+
" VALUES ('delete', old.id, old.name, old.qualified_name, old.label, old.file_path);"
246+
"END;");
247+
248+
exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN"
249+
" INSERT INTO nodes_fts(nodes_fts, rowid, name, qualified_name, label, file_path)"
250+
" VALUES ('delete', old.id, old.name, old.qualified_name, old.label, old.file_path);"
251+
" INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path)"
252+
" VALUES (new.id, new.name, new.qualified_name, new.label, new.file_path);"
253+
"END;");
254+
255+
return SQLITE_OK;
216256
}
217257

218258
static int configure_pragmas(cbm_store_t *s, bool in_memory) {
@@ -474,6 +514,10 @@ static void finalize_stmt(sqlite3_stmt **s) {
474514
}
475515
}
476516

517+
/* Public entry point for running raw SQL against a store.
 *
 * Forwards `s` and `sql` unchanged to the file-local exec_sql() helper and
 * returns whatever that helper returns. No validation of either argument is
 * performed here; any NULL handling is left to exec_sql().
 *
 * NOTE(review): the return value is presumably a SQLite result code
 * (create_user_indexes compares exec_sql()'s result against SQLITE_OK) —
 * confirm against exec_sql()'s definition. */
int cbm_store_exec(cbm_store_t *s, const char *sql) {
518+
return exec_sql(s, sql);
519+
}
520+
477521
void cbm_store_close(cbm_store_t *s) {
478522
if (!s) {
479523
return;
@@ -1955,6 +1999,125 @@ int cbm_store_search(cbm_store_t *s, const cbm_search_params_t *params, cbm_sear
19551999
char count_sql[4096];
19562000
int bind_idx = 0;
19572001

2002+
/* ── FTS5 BM25 path: when params->query is set, use full-text search ── */
2003+
if (params->query && params->query[0]) {
2004+
/* Build FTS5 query: JOIN nodes_fts for BM25 ranking.
2005+
* Tokenize the user query into FTS5 OR terms for broader matching.
2006+
* "authentication middleware" → "authentication OR middleware" */
2007+
char fts_query[1024];
2008+
{
2009+
const char *q = params->query;
2010+
int fqlen = 0;
2011+
bool in_word = false;
2012+
bool first_word = true;
2013+
while (*q && fqlen < (int)sizeof(fts_query) - 20) {
2014+
if ((*q >= 'a' && *q <= 'z') || (*q >= 'A' && *q <= 'Z') ||
2015+
(*q >= '0' && *q <= '9') || *q == '_' || *q == '-') {
2016+
if (!in_word && !first_word) {
2017+
fqlen += snprintf(fts_query + fqlen, sizeof(fts_query) - fqlen, " OR ");
2018+
}
2019+
fts_query[fqlen++] = *q;
2020+
in_word = true;
2021+
first_word = false;
2022+
} else {
2023+
if (in_word) {
2024+
fts_query[fqlen++] = ' ';
2025+
}
2026+
in_word = false;
2027+
}
2028+
q++;
2029+
}
2030+
fts_query[fqlen] = '\0';
2031+
}
2032+
2033+
char fts_sql[4096];
2034+
/* Join with FTS5 table, filter by project/label, order by BM25 rank */
2035+
int flen = snprintf(fts_sql, sizeof(fts_sql),
2036+
"SELECT n.id, n.project, n.label, n.name, n.qualified_name, "
2037+
"n.file_path, n.start_line, n.end_line, n.properties, "
2038+
"(SELECT COUNT(*) FROM edges e WHERE e.target_id = n.id AND e.type = 'CALLS') AS in_deg, "
2039+
"(SELECT COUNT(*) FROM edges e WHERE e.source_id = n.id AND e.type = 'CALLS') AS out_deg, "
2040+
"bm25(nodes_fts) AS rank "
2041+
"FROM nodes_fts "
2042+
"JOIN nodes n ON n.id = nodes_fts.rowid "
2043+
"WHERE nodes_fts MATCH ?1");
2044+
2045+
int fts_bind_idx = 1;
2046+
if (params->project) {
2047+
fts_bind_idx++;
2048+
flen += snprintf(fts_sql + flen, sizeof(fts_sql) - flen,
2049+
" AND n.project = ?%d", fts_bind_idx);
2050+
}
2051+
if (params->label) {
2052+
fts_bind_idx++;
2053+
flen += snprintf(fts_sql + flen, sizeof(fts_sql) - flen,
2054+
" AND n.label = ?%d", fts_bind_idx);
2055+
}
2056+
2057+
int limit = params->limit > 0 ? params->limit : 50;
2058+
flen += snprintf(fts_sql + flen, sizeof(fts_sql) - flen,
2059+
" ORDER BY rank LIMIT %d OFFSET %d", limit, params->offset);
2060+
2061+
/* Count query */
2062+
char fts_count[4096];
2063+
snprintf(fts_count, sizeof(fts_count),
2064+
"SELECT COUNT(*) FROM nodes_fts "
2065+
"JOIN nodes n ON n.id = nodes_fts.rowid "
2066+
"WHERE nodes_fts MATCH ?1%s%s",
2067+
params->project ? " AND n.project = ?2" : "",
2068+
params->label ? (params->project ? " AND n.label = ?3" : " AND n.label = ?2") : "");
2069+
2070+
/* Execute count */
2071+
sqlite3_stmt *cnt_stmt = NULL;
2072+
if (sqlite3_prepare_v2(s->db, fts_count, -1, &cnt_stmt, NULL) == SQLITE_OK) {
2073+
bind_text(cnt_stmt, 1, fts_query);
2074+
int bi = 1;
2075+
if (params->project) { bi++; bind_text(cnt_stmt, bi, params->project); }
2076+
if (params->label) { bi++; bind_text(cnt_stmt, bi, params->label); }
2077+
if (sqlite3_step(cnt_stmt) == SQLITE_ROW) {
2078+
out->total = sqlite3_column_int(cnt_stmt, 0);
2079+
}
2080+
sqlite3_finalize(cnt_stmt);
2081+
}
2082+
2083+
/* Execute main query */
2084+
sqlite3_stmt *main_stmt = NULL;
2085+
int rc = sqlite3_prepare_v2(s->db, fts_sql, -1, &main_stmt, NULL);
2086+
if (rc != SQLITE_OK) {
2087+
/* FTS5 table may not exist for older DBs — fall through to regex path */
2088+
/* FTS5 table may not exist for older DBs — silently fall through */
2089+
goto regex_path;
2090+
}
2091+
bind_text(main_stmt, 1, fts_query);
2092+
{
2093+
int bi = 1;
2094+
if (params->project) { bi++; bind_text(main_stmt, bi, params->project); }
2095+
if (params->label) { bi++; bind_text(main_stmt, bi, params->label); }
2096+
}
2097+
2098+
int cap = 16;
2099+
int n = 0;
2100+
cbm_search_result_t *results = malloc(cap * sizeof(cbm_search_result_t));
2101+
while (sqlite3_step(main_stmt) == SQLITE_ROW) {
2102+
if (n >= cap) {
2103+
cap *= 2;
2104+
results = safe_realloc(results, cap * sizeof(cbm_search_result_t));
2105+
}
2106+
memset(&results[n], 0, sizeof(cbm_search_result_t));
2107+
scan_node(main_stmt, &results[n].node);
2108+
results[n].in_degree = sqlite3_column_int(main_stmt, 9);
2109+
results[n].out_degree = sqlite3_column_int(main_stmt, 10);
2110+
n++;
2111+
}
2112+
sqlite3_finalize(main_stmt);
2113+
out->results = results;
2114+
out->count = n;
2115+
return CBM_STORE_OK;
2116+
}
2117+
2118+
regex_path:
2119+
/* ── Regex path: original regex-based search ── */
2120+
19582121
/* We build a query that selects nodes with optional degree subqueries */
19592122
const char *select_cols =
19602123
"SELECT n.id, n.project, n.label, n.name, n.qualified_name, "

src/store/store.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ typedef struct {
108108
const char *name_pattern; /* regex on name, NULL = any */
109109
const char *qn_pattern; /* regex on qualified_name, NULL = any */
110110
const char *file_pattern; /* glob on file_path, NULL = any */
111+
const char *query; /* free-text BM25 query via FTS5, NULL = disabled */
111112
const char *relationship; /* edge type filter, NULL = any */
112113
const char *direction; /* "inbound" / "outbound" / "any", NULL = any */
113114
int min_degree; /* -1 = no filter (default), 0+ = minimum */
@@ -209,6 +210,9 @@ cbm_store_t *cbm_store_open(const char *project);
209210
/* Close the store and free all resources. NULL-safe. */
210211
void cbm_store_close(cbm_store_t *s);
211212

213+
/* Execute a raw SQL statement (for DDL, DML, etc.). */
214+
int cbm_store_exec(cbm_store_t *s, const char *sql);
215+
212216
/* Get the underlying sqlite3 handle (for testing only). */
213217
struct sqlite3 *cbm_store_get_db(cbm_store_t *s);
214218

0 commit comments

Comments
 (0)