From 58cd6c42aebfcc5e9d543cea00286b686f03c98b Mon Sep 17 00:00:00 2001 From: ShauryaaSharma Date: Thu, 25 Jun 2026 03:12:48 +0530 Subject: [PATCH] feat(search): index prose content for BM25 full-text search Section nodes (markdown) and Module nodes (YAML/JSON) previously exposed only their heading/name to BM25, so search_graph could not match the prose body or a config description. Index that text so content is searchable. - store: add a `body` column to the nodes_fts FTS5 table; new cbm_store_fts_rebuild() drops+recreates the table (upgrading legacy 4-column databases) and backfills `body` from each node's docstring, guarded by json_valid() against malformed-JSON rows - pipeline: both FTS backfill sites now call cbm_store_fts_rebuild() - mcp: stop excluding Section/Module from BM25 results (they rank below code symbols, so existing result ordering is preserved) - internal/cbm: capture the markdown section body beneath each heading (#518) and promote top-level description/summary/purpose values onto the file's Module node (#519), reusing the existing docstring property - tests: 7 extraction cases + 3 store FTS cases Closes #518 Closes #519 Signed-off-by: ShauryaaSharma --- internal/cbm/extract_defs.c | 229 +++++++++++++++++++++++++++- src/mcp/mcp.c | 28 ++-- src/pipeline/pipeline.c | 19 +-- src/pipeline/pipeline_incremental.c | 14 +- src/store/store.c | 71 +++++++-- src/store/store.h | 8 + tests/test_extraction.c | 131 ++++++++++++++++ tests/test_store_search.c | 113 ++++++++++++++ 8 files changed, 561 insertions(+), 52 deletions(-) diff --git a/internal/cbm/extract_defs.c b/internal/cbm/extract_defs.c index bfff34fb..13d5b272 100644 --- a/internal/cbm/extract_defs.c +++ b/internal/cbm/extract_defs.c @@ -2834,7 +2834,8 @@ static void extract_func_def(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec // --- Class definition extraction --- // Push a simple class definition (used by config language extractors). 
-static void push_simple_class_def(CBMExtractCtx *ctx, TSNode node, char *name, const char *label) {
+static void push_simple_class_def(CBMExtractCtx *ctx, TSNode node, char *name, const char *label,
+                                  const char *docstring) {
     CBMArena *a = ctx->arena;
     CBMDefinition def;
     memset(&def, 0, sizeof(def));
@@ -2845,6 +2846,7 @@ static void push_simple_class_def(CBMExtractCtx *ctx, TSNode node, char *name, c
     def.start_line = ts_node_start_point(node).row + TS_LINE_OFFSET;
     def.end_line = ts_node_end_point(node).row + TS_LINE_OFFSET;
     def.is_exported = true;
+    def.docstring = docstring; // section body for Markdown (#518); NULL otherwise
     cbm_defs_push(&ctx->result->defs, a, def);
 }
 
@@ -2943,6 +2945,57 @@ static char *extract_markdown_heading_name(CBMArena *a, TSNode node, const char
     return trim_heading_name(name);
 }
 
+// Upper bound, in bytes, on the Markdown section body kept for BM25 search (#518).
+enum { CBM_MD_SECTION_BODY_MAX = 500 };
+
+// Return the prose that follows a Markdown heading, so that BM25 can match on a
+// section's content and not only its title (#518). The heading must sit directly
+// inside a `section` node; the body is the byte range of `source` from the end of
+// the heading up to the first child `section` that starts at or after that point,
+// or up to the end of the enclosing section when there is none. Bytes <= ' ' are
+// stripped from both ends and the copy is limited to CBM_MD_SECTION_BODY_MAX
+// bytes, cut on a UTF-8 boundary. NULL means "no enclosing section or no body".
+static char *extract_markdown_section_body(CBMArena *a, TSNode heading, const char *source) {
+    TSNode section = ts_node_parent(heading);
+    if (ts_node_is_null(section) || strcmp(ts_node_type(section), "section") != 0) {
+        return NULL;
+    }
+    // The body opens where the heading ends and, by default, runs to the section end.
+    uint32_t lo = ts_node_end_byte(heading);
+    uint32_t hi = ts_node_end_byte(section);
+    // A nested subsection becomes its own Section node, so the body ends where it begins.
+    uint32_t child_total = ts_node_child_count(section);
+    for (uint32_t idx = 0; idx < child_total; idx++) {
+        TSNode kid = ts_node_child(section, idx);
+        if (ts_node_start_byte(kid) < lo || strcmp(ts_node_type(kid), "section") != 0) {
+            continue;
+        }
+        hi = ts_node_start_byte(kid);
+        break;
+    }
+    // Strip leading and trailing bytes <= ' '. Every byte of a multi-byte UTF-8
+    // sequence is >= 0x80, so this can never cut into a character.
+    for (; lo < hi && (unsigned char)source[lo] <= ' '; lo++) {
+    }
+    for (; hi > lo && (unsigned char)source[hi - 1] <= ' '; hi--) {
+    }
+    if (hi <= lo) {
+        return NULL;
+    }
+    size_t keep = (size_t)(hi - lo);
+    if (keep > CBM_MD_SECTION_BODY_MAX) {
+        // Shorten the cut while the first dropped byte is a UTF-8 continuation byte.
+        keep = CBM_MD_SECTION_BODY_MAX;
+        while (keep > 0 && ((unsigned char)source[lo + keep] & 0xC0) == 0x80) {
+            keep--;
+        }
+        if (keep == 0) {
+            return NULL;
+        }
+    }
+    return cbm_arena_strndup(a, source + lo, keep);
+}
+
 // INI: extract section name from section node.
static char *find_ini_section_name(CBMArena *a, TSNode node, const char *source) { uint32_t nc = ts_node_child_count(node); @@ -2996,6 +3049,7 @@ static bool extract_config_class_def(CBMExtractCtx *ctx, TSNode node, const char CBMArena *a = ctx->arena; char *name = NULL; const char *label = "Class"; + const char *docstring = NULL; if (ctx->language == CBM_LANG_TOML && (strcmp(kind, "table") == 0 || strcmp(kind, "table_array_element") == 0)) { @@ -3008,6 +3062,7 @@ static bool extract_config_class_def(CBMExtractCtx *ctx, TSNode node, const char (strcmp(kind, "atx_heading") == 0 || strcmp(kind, "setext_heading") == 0)) { name = extract_markdown_heading_name(a, node, kind, ctx->source); label = "Section"; + docstring = extract_markdown_section_body(a, node, ctx->source); // #518 } else if (ctx->language == CBM_LANG_HCL && strcmp(kind, "block") == 0) { name = find_hcl_block_name(a, node, ctx->source); } else { @@ -3015,7 +3070,7 @@ static bool extract_config_class_def(CBMExtractCtx *ctx, TSNode node, const char } if (name && name[0]) { - push_simple_class_def(ctx, node, name, label); + push_simple_class_def(ctx, node, name, label, docstring); } return true; } @@ -5514,6 +5569,174 @@ static void walk_defs(CBMExtractCtx *ctx, TSNode root, const CBMLangSpec *spec, } } +// ── Config module description promotion (#519) ────────────────────────────── +// YAML/JSON metadata files (META.yaml, skill manifests, …) carry their most +// search-relevant text in a top-level `description`/`summary`/`purpose` value. +// That value is otherwise dropped — only the key becomes a Variable node — so +// BM25 can't find a module by its description. We promote the value onto the +// file's Module node docstring, which the FTS `body` column then indexes. + +// Case-insensitive ASCII equality (keys are conventionally lowercase, but accept +// "Description"/"SUMMARY" too). 
+static bool cfg_ci_eq(const char *x, const char *y) {
+    // Compare in lock-step; the loop stops at the first NUL in either string.
+    for (; *x != '\0' && *y != '\0'; x++, y++) {
+        // Fold ASCII 'A'..'Z' to lower case; every other byte is compared as-is.
+        char lhs = *x;
+        char rhs = *y;
+        if (lhs >= 'A' && lhs <= 'Z') {
+            lhs = (char)(lhs + ('a' - 'A'));
+        }
+        if (rhs >= 'A' && rhs <= 'Z') {
+            rhs = (char)(rhs + ('a' - 'A'));
+        }
+        if (lhs != rhs) {
+            return false;
+        }
+    }
+    return *x == '\0' && *y == '\0'; // equal only if both strings ended together
+}
+
+// True when `key` is "description", "summary" or "purpose" in any ASCII case; NULL-safe.
+static bool cfg_is_desc_key(const char *key) {
+    return key != NULL && (cfg_ci_eq(key, "purpose") || cfg_ci_eq(key, "summary") ||
+                           cfg_ci_eq(key, "description"));
+}
+
+// In-place cleanup of a scalar: drop bytes <= ' ' from both ends, then remove one pair
+// of matching '...' or "..." delimiters. Returns a pointer into `s`, or NULL for NULL.
+static char *cfg_strip_quotes_trim(char *s) {
+    if (s == NULL) {
+        return NULL;
+    }
+    for (; *s != '\0' && (unsigned char)*s <= ' '; s++) {
+    }
+    size_t n = strlen(s);
+    for (; n > 0 && (unsigned char)s[n - 1] <= ' '; n--) {
+        s[n - 1] = '\0';
+    }
+    if (n >= 2 && s[0] == s[n - 1] && (s[0] == '"' || s[0] == '\'')) {
+        s[n - 1] = '\0'; // cut the closing quote ...
+        s++;             // ... and step over the opening one
+    }
+    return s;
+}
+
+// Arena-copy at most `cap` bytes of `s`. When `s` is longer, the cut is moved back
+// while the first dropped byte is a UTF-8 continuation byte. NULL for NULL/empty input
+// or when nothing is left after backing off.
+static char *cfg_arena_capped(CBMArena *a, const char *s, size_t cap) {
+    if (s == NULL || s[0] == '\0') {
+        return NULL;
+    }
+    size_t take = strlen(s);
+    if (take <= cap) {
+        return cbm_arena_strndup(a, s, take);
+    }
+    // s[take] is the first byte that would be dropped.
+    for (take = cap; take > 0 && ((unsigned char)s[take] & 0xC0) == 0x80; take--) {
+    }
+    return take == 0 ? NULL : cbm_arena_strndup(a, s, take);
+}
+
+// Descend stream/document/block_node to the top-level YAML block_mapping.
+static TSNode cfg_find_yaml_mapping(TSNode root) {
+    TSNode not_found = {0};
+    TSNode at = root;
+    for (int level = 0; level < 8; level++) { // bounded descent: at most 8 levels
+        TSNode wrapper = not_found;
+        bool found_wrapper = false;
+        uint32_t kids = ts_node_child_count(at);
+        for (uint32_t k = 0; k < kids; k++) {
+            TSNode kid = ts_node_child(at, k);
+            const char *type = ts_node_type(kid);
+            if (strcmp(type, "block_mapping") == 0) {
+                return kid; // a mapping among the children wins immediately
+            }
+            if (!found_wrapper && (strcmp(type, "stream") == 0 || strcmp(type, "document") == 0 ||
+                                   strcmp(type, "block_node") == 0)) {
+                wrapper = kid;
+                found_wrapper = true; // later wrappers at this level are ignored
+            }
+        }
+        if (!found_wrapper) {
+            return not_found;
+        }
+        at = wrapper;
+    }
+    return not_found;
+}
+
+// Locate the top-level JSON object, descending through `document` wrapper nodes.
+static TSNode cfg_find_json_object(TSNode root) {
+    TSNode not_found = {0};
+    TSNode at = root;
+    for (int level = 0; level < 6; level++) { // bounded descent: at most 6 levels
+        if (strcmp(ts_node_type(at), "object") == 0) {
+            return at; // the node being examined is itself the object
+        }
+        TSNode wrapper = not_found;
+        bool found_wrapper = false;
+        uint32_t kids = ts_node_child_count(at);
+        for (uint32_t k = 0; k < kids; k++) {
+            TSNode kid = ts_node_child(at, k);
+            const char *type = ts_node_type(kid);
+            if (strcmp(type, "object") == 0) {
+                return kid;
+            }
+            if (!found_wrapper && strcmp(type, "document") == 0) {
+                wrapper = kid;
+                found_wrapper = true; // later `document` children are ignored
+            }
+        }
+        if (!found_wrapper) {
+            return not_found;
+        }
+        at = wrapper;
+    }
+    return not_found;
+}
+
+// Find a top-level description/summary/purpose value in a YAML or JSON file and
+// return it (trimmed, unquoted, capped) for promotion to the Module docstring.
+static const char *extract_config_module_description(CBMExtractCtx *ctx) {
+    bool is_yaml = ctx->language == CBM_LANG_YAML;
+    if (!is_yaml && ctx->language != CBM_LANG_JSON) {
+        return NULL; // only YAML and JSON files are considered
+    }
+    // Each grammar has its own top-level container and its own key/value pair node type.
+    TSNode top = is_yaml ? cfg_find_yaml_mapping(ctx->root) : cfg_find_json_object(ctx->root);
+    const char *pair_type = is_yaml ? "block_mapping_pair" : "pair";
+    if (ts_node_is_null(top)) {
+        return NULL;
+    }
+    CBMArena *arena = ctx->arena;
+    // Scan the container's named children in order; the first matching key wins.
+    uint32_t total = ts_node_named_child_count(top);
+    for (uint32_t idx = 0; idx < total; idx++) {
+        TSNode entry = ts_node_named_child(top, idx);
+        if (strcmp(ts_node_type(entry), pair_type) != 0) {
+            continue; // not a key/value pair
+        }
+        TSNode key_node = ts_node_child_by_field_name(entry, TS_FIELD("key"));
+        TSNode val_node = ts_node_child_by_field_name(entry, TS_FIELD("value"));
+        if (ts_node_is_null(key_node) || ts_node_is_null(val_node)) {
+            continue;
+        }
+        const char *key = cfg_strip_quotes_trim(cbm_node_text(arena, key_node, ctx->source));
+        if (!cfg_is_desc_key(key)) {
+            continue;
+        }
+        // NOTE(review): the value is the node's raw text minus one layer of quotes, so
+        // escape sequences stay undecoded; a YAML block scalar would presumably keep its
+        // `|` / `>` header too — confirm whether that is acceptable in the docstring.
+        char *val = cfg_strip_quotes_trim(cbm_node_text(arena, val_node, ctx->source));
+        return cfg_arena_capped(arena, val, CBM_MD_SECTION_BODY_MAX);
+    }
+    // No top-level description/summary/purpose key was found.
+    return NULL;
+}
+
 void cbm_extract_definitions(CBMExtractCtx *ctx) {
     const CBMLangSpec *spec = cbm_lang_spec(ctx->language);
     if (!spec) {
@@ -5533,6 +5756,8 @@ void cbm_extract_definitions(CBMExtractCtx *ctx) {
     mod.end_line = ts_node_end_point(ctx->root).row + TS_LINE_OFFSET;
     mod.is_exported = true;
     mod.is_test = ctx->result->is_test_file;
+    // Promote a YAML/JSON top-level description onto the Module for BM25 (#519).
+ mod.docstring = extract_config_module_description(ctx); cbm_defs_push(&ctx->result->defs, a, mod); // Walk AST for function/class definitions diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index 7016a0d2..3b03091a 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -1334,7 +1334,11 @@ static char *bm25_search(cbm_store_t *store, const char *project, const char *qu ") fts " "JOIN nodes n ON n.id = fts.rowid " "WHERE n.project = ?2 " - " AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project') " + /* Section and Module are searchable (#518/#519): their FTS `body` column + * carries markdown section prose and YAML/JSON description text. They + * rank below code symbols (no boost above), so code results stay first. + * File/Folder/Variable/Project remain excluded as structural noise. */ + " AND n.label NOT IN ('File','Folder','Variable','Project') " " AND (?6 IS NULL OR n.file_path LIKE ?6) " "ORDER BY rank " "LIMIT ?3 OFFSET ?4"; @@ -1359,17 +1363,17 @@ static char *bm25_search(cbm_store_t *store, const char *project, const char *qu * Uses the identical subquery structure so the FTS5 early-exit applies here too. */ int total = 0; { - const char *count_sql = - "SELECT COUNT(*) FROM (" - " SELECT fts.rowid FROM (" - " SELECT rowid FROM nodes_fts WHERE nodes_fts MATCH ?1" - " ORDER BY bm25(nodes_fts) LIMIT ?3" - " ) fts " - " JOIN nodes n ON n.id = fts.rowid " - " WHERE n.project = ?2 " - " AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')" - " AND (?6 IS NULL OR n.file_path LIKE ?6)" - ")"; + const char *count_sql = "SELECT COUNT(*) FROM (" + " SELECT fts.rowid FROM (" + " SELECT rowid FROM nodes_fts WHERE nodes_fts MATCH ?1" + " ORDER BY bm25(nodes_fts) LIMIT ?3" + " ) fts " + " JOIN nodes n ON n.id = fts.rowid " + " WHERE n.project = ?2 " + /* Mirror the label filter in the main query above (#518/#519). 
*/ + " AND n.label NOT IN ('File','Folder','Variable','Project')" + " AND (?6 IS NULL OR n.file_path LIKE ?6)" + ")"; sqlite3_stmt *cs = NULL; if (sqlite3_prepare_v2(db, count_sql, BM25_SQL_AUTO_LEN, &cs, NULL) == SQLITE_OK) { sqlite3_bind_text(cs, BM25_BIND_QUERY, fts_query, BM25_SQL_AUTO_LEN, diff --git a/src/pipeline/pipeline.c b/src/pipeline/pipeline.c index 8e370f7c..2f26bbb7 100644 --- a/src/pipeline/pipeline.c +++ b/src/pipeline/pipeline.c @@ -877,20 +877,11 @@ static int dump_and_persist_hashes(cbm_pipeline_t *p, const cbm_file_info_t *fil } } - /* FTS5 backfill: populate nodes_fts with camelCase-split names. - * Contentless FTS5 requires the special 'delete-all' command instead of - * DELETE FROM to wipe prior rows (there's no underlying content table). - * Falls back to plain names if cbm_camel_split is unavailable (which - * shouldn't happen because we always register it, but we stay defensive). */ - cbm_store_exec(hash_store, "INSERT INTO nodes_fts(nodes_fts) VALUES('delete-all');"); - if (cbm_store_exec(hash_store, - "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) " - "SELECT id, cbm_camel_split(name), qualified_name, label, file_path " - "FROM nodes;") != CBM_STORE_OK) { - cbm_store_exec(hash_store, - "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) " - "SELECT id, name, qualified_name, label, file_path FROM nodes;"); - } + /* FTS5 backfill: rebuild nodes_fts with camelCase-split names and prose + * bodies (markdown sections, YAML/JSON descriptions, docstrings) so BM25 + * matches content as well as identifiers (#518/#519). cbm_store_fts_rebuild + * also upgrades legacy 4-column tables to the schema carrying `body`. 
*/ + cbm_store_fts_rebuild(hash_store); cbm_store_close(hash_store); cbm_log_info("pass.timing", "pass", "persist_hashes", "files", itoa_buf(file_count)); diff --git a/src/pipeline/pipeline_incremental.c b/src/pipeline/pipeline_incremental.c index a1cc4482..9a411b94 100644 --- a/src/pipeline/pipeline_incremental.c +++ b/src/pipeline/pipeline_incremental.c @@ -649,17 +649,9 @@ static void dump_and_persist(cbm_gbuf_t *gbuf, const char *db_path, const char * /* FTS5 rebuild after incremental dump. The btree dump path bypasses * any triggers that could have kept nodes_fts synchronized, so we - * rebuild from the nodes table here. See the full-dump path in - * pipeline.c for the matching logic. */ - cbm_store_exec(hash_store, "INSERT INTO nodes_fts(nodes_fts) VALUES('delete-all');"); - if (cbm_store_exec(hash_store, - "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) " - "SELECT id, cbm_camel_split(name), qualified_name, label, file_path " - "FROM nodes;") != CBM_STORE_OK) { - cbm_store_exec(hash_store, - "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) " - "SELECT id, name, qualified_name, label, file_path FROM nodes;"); - } + * rebuild from the nodes table here (also indexing prose bodies for + * content search — see cbm_store_fts_rebuild and #518/#519). */ + cbm_store_fts_rebuild(hash_store); cbm_store_close(hash_store); } diff --git a/src/store/store.c b/src/store/store.c index 995f6e85..185c79d0 100644 --- a/src/store/store.c +++ b/src/store/store.c @@ -212,6 +212,60 @@ static void iso_now(char *buf, size_t sz) { /* ── Schema ─────────────────────────────────────────────────────── */ +/* FTS5 contentless virtual table DDL — single source of truth shared by + * init_schema (fresh databases) and cbm_store_fts_rebuild (re-index + legacy + * upgrade). Columns: name, qualified_name, label, file_path, body. 
`body` + * (added for #518/#519) carries prose — markdown section bodies, YAML/JSON + * description values, and function docstrings — so BM25 matches content, not + * only identifiers. Contentless (content='') stores only the inverted index; + * we feed cbm_camel_split(name) and the raw body text at insert time. The + * column is named `body` rather than `content` to avoid colliding with the + * `content=''` option keyword in the FTS5 DDL grammar. */ +static const char NODES_FTS_DDL[] = "CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(" + " name, qualified_name, label, file_path, body," + " content=''," + " tokenize='unicode61 remove_diacritics 2'" + ");"; + +/* FTS backfill INSERTs. The `body` column is fed the node's docstring property, + * or '' when absent or when properties is not valid JSON. The json_valid() guard + * is essential — json_extract() raises on malformed JSON and would otherwise abort + * the whole INSERT...SELECT (pre-fix databases contain such rows; see the + * create_user_indexes note). The primary form camelCase-splits the name; the + * fallback uses the plain name should cbm_camel_split be unavailable. Keep the + * two body expressions in sync. 
*/
+static const char FTS_BACKFILL_SQL[] =
+    "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path, body) "
+    "SELECT id, cbm_camel_split(name), qualified_name, label, file_path,"
+    " CASE WHEN json_valid(properties)"
+    " THEN coalesce(json_extract(properties,'$.docstring'),'') ELSE '' END "
+    "FROM nodes;";
+
+static const char FTS_BACKFILL_SQL_FALLBACK[] =
+    "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path, body) "
+    "SELECT id, name, qualified_name, label, file_path,"
+    " CASE WHEN json_valid(properties)"
+    " THEN coalesce(json_extract(properties,'$.docstring'),'') ELSE '' END "
+    "FROM nodes;";
+
+int cbm_store_fts_rebuild(cbm_store_t *s) {
+    if (s == NULL || s->db == NULL) {
+        return CBM_STORE_ERR;
+    }
+    /* Step 1: discard any existing table (a legacy one has no `body` column) and
+     * create it afresh. CREATE fails when FTS5 is not compiled in; the regex
+     * search path still works in that case. */
+    if (cbm_store_exec(s, "DROP TABLE IF EXISTS nodes_fts;") != CBM_STORE_OK ||
+        cbm_store_exec(s, NODES_FTS_DDL) != CBM_STORE_OK) {
+        return CBM_STORE_ERR;
+    }
+    /* Step 2: backfill; retry with the plain-name statement if the first one fails. */
+    if (cbm_store_exec(s, FTS_BACKFILL_SQL) == CBM_STORE_OK) {
+        return CBM_STORE_OK;
+    }
+    return cbm_store_exec(s, FTS_BACKFILL_SQL_FALLBACK);
+}
+
 static int init_schema(cbm_store_t *s) {
     const char *ddl =
         "CREATE TABLE IF NOT EXISTS projects ("
@@ -262,22 +316,13 @@ static int init_schema(cbm_store_t *s) {
         return rc;
     }
 
-    /* FTS5 contentless virtual table for BM25 full-text search.
-     * Contentless (content='') means FTS5 stores only the inverted index,
-     * not a copy of the source text — required for camelCase tokenization
-     * because we feed it `cbm_camel_split(name)` at insert time but want
-     * queries to match against the split tokens, not the original.
+    /* FTS5 contentless virtual table for BM25 full-text search (see NODES_FTS_DDL).
+     * Created here for fresh databases and read paths; cbm_store_fts_rebuild drops
+     * and recreates it during indexing, which also upgrades legacy 4-column tables.
* Fails silently if FTS5 is not compiled in (SQLITE_ENABLE_FTS5). */ { char *fts_err = NULL; - int fts_rc = sqlite3_exec(s->db, - "CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(" - " name, qualified_name, label, file_path," - " content=''," - " tokenize='unicode61 remove_diacritics 2'" - ");", - NULL, NULL, &fts_err); - if (fts_rc != SQLITE_OK && fts_err) { + if (sqlite3_exec(s->db, NODES_FTS_DDL, NULL, NULL, &fts_err) != SQLITE_OK && fts_err) { sqlite3_free(fts_err); } } diff --git a/src/store/store.h b/src/store/store.h index 26b09a5c..b85b54a1 100644 --- a/src/store/store.h +++ b/src/store/store.h @@ -245,6 +245,14 @@ int cbm_store_drop_indexes(cbm_store_t *s); /* Recreate user indexes after bulk inserts. */ int cbm_store_create_indexes(cbm_store_t *s); +/* Rebuild the nodes_fts BM25 index from the nodes table. Drops and recreates + * the FTS virtual table (which upgrades legacy 4-column databases to the schema + * carrying the `body` column), then re-inserts every node with its camelCase- + * split name and prose body (the docstring property) so full-text search matches + * both identifiers and content — markdown section bodies (#518) and YAML/JSON + * descriptions (#519). Returns CBM_STORE_OK or CBM_STORE_ERR. */ +int cbm_store_fts_rebuild(cbm_store_t *s); + /* ── WAL / Checkpoint ───────────────────────────────────────────── */ /* Force WAL checkpoint + PRAGMA optimize. */ diff --git a/tests/test_extraction.c b/tests/test_extraction.c index d06b2a50..e804cb64 100644 --- a/tests/test_extraction.c +++ b/tests/test_extraction.c @@ -57,6 +57,25 @@ static int count_defs_with_label(CBMFileResult *r, const char *label) { return count; } +/* Docstring of the first definition matching label+name (NULL if not found). 
*/
+static const char *def_docstring(CBMFileResult *r, const char *label, const char *name) {
+    for (int i = 0; i < r->defs.count; i++) {
+        if (strcmp(r->defs.items[i].label, label) == 0 &&
+            strcmp(r->defs.items[i].name, name) == 0)
+            return r->defs.items[i].docstring; /* may itself be NULL */
+    }
+    return NULL; /* no definition with this label and name */
+}
+
+/* Docstring of the first definition with this label; NULL if none or if its docstring is unset. */
+static const char *first_def_docstring(CBMFileResult *r, const char *label) {
+    for (int i = 0; i < r->defs.count; i++) {
+        if (strcmp(r->defs.items[i].label, label) == 0)
+            return r->defs.items[i].docstring;
+    }
+    return NULL;
+}
+
 /* Convenience: extract, assert no error, return result. Caller frees. */
 static CBMFileResult *extract(const char *src, CBMLanguage lang, const char *proj,
                               const char *path) {
@@ -2528,6 +2547,111 @@ TEST(markdown_no_headings) {
     PASS();
 }
 
+/* #518: the prose body beneath a heading is captured as the Section docstring so
+ * BM25 can search markdown content, not just heading text. */
+TEST(markdown_section_body_captured) {
+    CBMFileResult *r = extract("## BROWSER AGENT\n\n"
+                               "Before writing any test file, explore the live application "
+                               "using Playwright MCP.\n\n"
+                               "## NEXT SECTION\n\n"
+                               "Totally unrelated prose here.\n",
+                               CBM_LANG_MARKDOWN, "t", "SKILL.md");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *body = def_docstring(r, "Section", "BROWSER AGENT");
+    ASSERT_NOT_NULL(body);
+    ASSERT(strstr(body, "Playwright") != NULL);
+    ASSERT(strstr(body, "test file") != NULL);
+    /* Body stops at the next heading — it must not absorb the sibling section. */
+    ASSERT(strstr(body, "unrelated") == NULL);
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #518: a heading with no prose beneath it yields no usable docstring (NULL or ""). */
+TEST(markdown_section_no_body) {
+    CBMFileResult *r = extract("# Title\n## Empty\n", CBM_LANG_MARKDOWN, "t", "README.md");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *body = def_docstring(r, "Section", "Empty");
+    ASSERT(body == NULL || body[0] == '\0');
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #518: the captured body is capped (defends the 2 KB node-properties buffer). */
+TEST(markdown_section_body_capped) {
+    /* Build a heading followed by 250 x "alpha " = 1500 bytes of prose, well over the cap. */
+    char src[2048];
+    int n = snprintf(src, sizeof(src), "# Big\n\n");
+    for (int i = 0; i < 250 && n < (int)sizeof(src) - 8; i++)
+        n += snprintf(src + n, sizeof(src) - (size_t)n, "alpha ");
+    CBMFileResult *r = extract(src, CBM_LANG_MARKDOWN, "t", "BIG.md");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *body = def_docstring(r, "Section", "Big");
+    ASSERT_NOT_NULL(body);
+    ASSERT(strlen(body) <= 500); /* CBM_MD_SECTION_BODY_MAX */
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #519: a top-level YAML description value is promoted to the Module docstring so
+ * BM25 can find a module by its description. */
+TEST(yaml_description_promoted_to_module) {
+    CBMFileResult *r =
+        extract("name: qa-run\n"
+                "description: \"8-agent QA loop using Playwright MCP for browser testing\"\n",
+                CBM_LANG_YAML, "t", "META.yaml");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *desc = first_def_docstring(r, "Module");
+    ASSERT_NOT_NULL(desc);
+    ASSERT(strstr(desc, "Playwright") != NULL);
+    ASSERT(strstr(desc, "browser testing") != NULL);
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #519: the `summary` key is also promoted, even when it is not the last key in the file. */
+TEST(yaml_summary_promoted_to_module) {
+    CBMFileResult *r = extract("summary: concise pipeline overview text\nname: x\n", CBM_LANG_YAML,
+                               "t", "info.yaml");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *desc = first_def_docstring(r, "Module");
+    ASSERT_NOT_NULL(desc);
+    ASSERT(strstr(desc, "pipeline overview") != NULL);
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #519: a top-level JSON description value is promoted to the Module docstring. */
+TEST(json_description_promoted_to_module) {
+    CBMFileResult *r =
+        extract("{\"name\": \"qa-run\", "
+                "\"description\": \"8-agent QA loop with Playwright browser testing\"}",
+                CBM_LANG_JSON, "t", "skill.json");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *desc = first_def_docstring(r, "Module");
+    ASSERT_NOT_NULL(desc);
+    ASSERT(strstr(desc, "Playwright") != NULL);
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #519: files without a promotable key leave the Module docstring unset (NULL or ""). */
+TEST(yaml_no_description_leaves_module_bare) {
+    CBMFileResult *r = extract("name: thing\nversion: 1\n", CBM_LANG_YAML, "t", "plain.yaml");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *desc = first_def_docstring(r, "Module");
+    ASSERT(desc == NULL || desc[0] == '\0');
+    cbm_free_result(r);
+    PASS();
+}
+
 /* ═══════════════════════════════════════════════════════════════════
  * Python __init__.py Module QN collision regression
  * ═══════════════════════════════════════════════════════════════════ */
@@ -3156,6 +3280,13 @@ SUITE(extraction) {
     RUN_TEST(markdown_setext_headings);
     RUN_TEST(markdown_heading_content);
     RUN_TEST(markdown_no_headings);
+    RUN_TEST(markdown_section_body_captured);
+    RUN_TEST(markdown_section_no_body);
+    RUN_TEST(markdown_section_body_capped);
+    RUN_TEST(yaml_description_promoted_to_module);
+    RUN_TEST(yaml_summary_promoted_to_module);
+    RUN_TEST(json_description_promoted_to_module);
+    RUN_TEST(yaml_no_description_leaves_module_bare);
 
     /* __init__.py / index.ts Module
QN collision regression */
     RUN_TEST(python_init_module_qn_not_collide_with_folder);
diff --git a/tests/test_store_search.c b/tests/test_store_search.c
index 5cbf1a19..656b1507 100644
--- a/tests/test_store_search.c
+++ b/tests/test_store_search.c
@@ -6,6 +6,7 @@
 #include "../src/foundation/compat.h"
 #include "test_framework.h"
 #include "test_helpers.h"
+#include
 #include
 #include
 #include
@@ -1458,6 +1459,114 @@ TEST(store_impact_summary_empty) {
     PASS();
 }
 
+/* ── FTS5 body indexing (#518 / #519) ────────────────────────────── */
+
+/* Count nodes_fts rows matching `term` (bound as a parameter, never spliced into SQL). */
+static int fts_match_count(sqlite3 *db, const char *term) {
+    static const char sql[] = "SELECT count(*) FROM nodes_fts WHERE nodes_fts MATCH ?1";
+    sqlite3_stmt *st = NULL;
+    if (sqlite3_prepare_v2(db, sql, -1, &st, NULL) != SQLITE_OK) {
+        return -1;
+    }
+    int rc = sqlite3_bind_text(st, 1, term, -1, SQLITE_STATIC);
+    int c = (rc == SQLITE_OK && sqlite3_step(st) == SQLITE_ROW) ? sqlite3_column_int(st, 0) : -1;
+    sqlite3_finalize(st);
+    return c;
+}
+
+/* cbm_store_fts_rebuild indexes the `body` column from each node's docstring, so
+ * BM25 MATCH finds Section/Module nodes by their content, not just their names. */
+TEST(fts_rebuild_indexes_body_content) {
+    cbm_store_t *s = cbm_store_open_memory();
+    cbm_store_upsert_project(s, "test", "/tmp/test");
+    cbm_node_t sec = {.project = "test",
+                      .label = "Section",
+                      .name = "BROWSER AGENT",
+                      .qualified_name = "test.SKILL.browser",
+                      .file_path = "SKILL.md",
+                      .properties_json = "{\"docstring\":\"explore the live app using Playwright\"}"};
+    cbm_node_t mod = {.project = "test",
+                      .label = "Module",
+                      .name = "META.yaml",
+                      .qualified_name = "test.META",
+                      .file_path = "META.yaml",
+                      .properties_json = "{\"docstring\":\"eight agent quality gate pipeline\"}"};
+    cbm_store_upsert_node(s, &sec);
+    cbm_store_upsert_node(s, &mod);
+
+    ASSERT_EQ(cbm_store_fts_rebuild(s), CBM_STORE_OK);
+    sqlite3 *db = cbm_store_get_db(s);
+
+    /* Body tokens are searchable even though they appear in no node name. */
+    ASSERT_GTE(fts_match_count(db, "playwright"), 1);
+    ASSERT_GTE(fts_match_count(db, "quality"), 1);
+    /* Name tokens still searchable. */
+    ASSERT_GTE(fts_match_count(db, "browser"), 1);
+    cbm_store_close(s);
+    PASS();
+}
+
+/* A database created before the `body` column is upgraded in place: rebuild drops
+ * the legacy 4-column table, recreates it with body, and repopulates. */
+TEST(fts_rebuild_upgrades_legacy_schema) {
+    cbm_store_t *s = cbm_store_open_memory();
+    cbm_store_upsert_project(s, "test", "/tmp/test");
+    sqlite3 *db = cbm_store_get_db(s);
+    /* Replace the current table with the historical 4-column (no body) schema. */
+    ASSERT_EQ(sqlite3_exec(db, "DROP TABLE IF EXISTS nodes_fts;", NULL, NULL, NULL), SQLITE_OK);
+    ASSERT_EQ(sqlite3_exec(db,
+                           "CREATE VIRTUAL TABLE nodes_fts USING fts5(name, qualified_name, label, "
+                           "file_path, content='', tokenize='unicode61 remove_diacritics 2');",
+                           NULL, NULL, NULL),
+              SQLITE_OK);
+    cbm_node_t fn = {.project = "test",
+                     .label = "Function",
+                     .name = "parseConfig",
+                     .qualified_name = "test.parseConfig",
+                     .file_path = "a.c",
+                     .properties_json = "{\"docstring\":\"loads zookeeper settings\"}"};
+    cbm_store_upsert_node(s, &fn);
+
+    ASSERT_EQ(cbm_store_fts_rebuild(s), CBM_STORE_OK);
+
+    /* The new body column now exists and the docstring is searchable. */
+    sqlite3_stmt *probe = NULL;
+    ASSERT_EQ(sqlite3_prepare_v2(db, "SELECT body FROM nodes_fts", -1, &probe, NULL), SQLITE_OK);
+    sqlite3_finalize(probe);
+    ASSERT_GTE(fts_match_count(db, "zookeeper"), 1);
+    cbm_store_close(s);
+    PASS();
+}
+
+/* The backfill must survive rows whose properties_json is not valid JSON (legacy
+ * databases contain such rows); the json_valid() guard prevents json_extract from
+ * aborting the whole INSERT...SELECT. */
+TEST(fts_rebuild_tolerates_malformed_properties) {
+    cbm_store_t *s = cbm_store_open_memory();
+    cbm_store_upsert_project(s, "test", "/tmp/test");
+    cbm_node_t bad = {.project = "test",
+                      .label = "Function",
+                      .name = "weird",
+                      .qualified_name = "test.weird",
+                      .file_path = "a.c",
+                      .properties_json = "{not valid json"};
+    cbm_node_t ok = {.project = "test",
+                     .label = "Function",
+                     .name = "fine",
+                     .qualified_name = "test.fine",
+                     .file_path = "b.c",
+                     .properties_json = "{\"docstring\":\"kafka consumer group\"}"};
+    cbm_store_upsert_node(s, &bad);
+    cbm_store_upsert_node(s, &ok);
+
+    ASSERT_EQ(cbm_store_fts_rebuild(s), CBM_STORE_OK);
+    sqlite3 *db = cbm_store_get_db(s);
+    ASSERT_GTE(fts_match_count(db, "kafka"), 1); /* good row indexed */
+    ASSERT_GTE(fts_match_count(db, "weird"), 1); /* malformed row's name still indexed */
+    cbm_store_close(s);
+    PASS();
+}
+
 SUITE(store_search) {
     RUN_TEST(store_search_by_label);
     RUN_TEST(store_search_by_name_pattern);
@@ -1525,4 +1634,8 @@ SUITE(store_search) {
     RUN_TEST(store_hop_to_risk_all_levels);
     RUN_TEST(store_risk_label_all_levels);
     RUN_TEST(store_impact_summary_empty);
+    /* FTS5 body indexing (#518 / #519) */
+    RUN_TEST(fts_rebuild_indexes_body_content);
+    RUN_TEST(fts_rebuild_upgrades_legacy_schema);
+    RUN_TEST(fts_rebuild_tolerates_malformed_properties);
 }