From 58cd6c42aebfcc5e9d543cea00286b686f03c98b Mon Sep 17 00:00:00 2001
From: ShauryaaSharma <shauryasofficial27@gmail.com>
Date: Thu, 25 Jun 2026 03:12:48 +0530
Subject: [PATCH] feat(search): index prose content for BM25 full-text search

Section nodes (markdown) and Module nodes (YAML/JSON) previously exposed
only their heading/name to BM25, so search_graph could not match the prose
body or a config description. Index that text so content is searchable.

- store: add a `body` column to the nodes_fts FTS5 table; new
  cbm_store_fts_rebuild() drops+recreates the table (upgrading legacy
  4-column databases) and backfills `body` from each node's docstring,
  guarded by json_valid() against malformed-JSON rows
- pipeline: both FTS backfill sites now call cbm_store_fts_rebuild()
- mcp: stop excluding Section/Module from BM25 results (they rank below
  code symbols, so existing result ordering is preserved)
- internal/cbm: capture the markdown section body beneath each heading
  (#518) and promote top-level description/summary/purpose values onto
  the file's Module node (#519), reusing the existing docstring property
- tests: 7 extraction cases + 3 store FTS cases

Closes #518
Closes #519

Signed-off-by: ShauryaaSharma <shauryasofficial27@gmail.com>
---
 internal/cbm/extract_defs.c         | 229 +++++++++++++++++++++++++++-
 src/mcp/mcp.c                       |  28 ++--
 src/pipeline/pipeline.c             |  19 +--
 src/pipeline/pipeline_incremental.c |  14 +-
 src/store/store.c                   |  71 +++++++--
 src/store/store.h                   |   8 +
 tests/test_extraction.c             | 131 ++++++++++++++++
 tests/test_store_search.c           | 113 ++++++++++++++
 8 files changed, 561 insertions(+), 52 deletions(-)

diff --git a/internal/cbm/extract_defs.c b/internal/cbm/extract_defs.c
index bfff34fb..13d5b272 100644
--- a/internal/cbm/extract_defs.c
+++ b/internal/cbm/extract_defs.c
@@ -2834,7 +2834,8 @@ static void extract_func_def(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec
 // --- Class definition extraction ---
 
 // Push a simple class definition (used by config language extractors).
-static void push_simple_class_def(CBMExtractCtx *ctx, TSNode node, char *name, const char *label) {
+static void push_simple_class_def(CBMExtractCtx *ctx, TSNode node, char *name, const char *label,
+                                  const char *docstring) {
     CBMArena *a = ctx->arena;
     CBMDefinition def;
     memset(&def, 0, sizeof(def));
@@ -2845,6 +2846,7 @@ static void push_simple_class_def(CBMExtractCtx *ctx, TSNode node, char *name, c
     def.start_line = ts_node_start_point(node).row + TS_LINE_OFFSET;
     def.end_line = ts_node_end_point(node).row + TS_LINE_OFFSET;
     def.is_exported = true;
+    def.docstring = docstring; // section body for Markdown (#518); NULL otherwise
     cbm_defs_push(&ctx->result->defs, a, def);
 }
 
@@ -2943,6 +2945,57 @@ static char *extract_markdown_heading_name(CBMArena *a, TSNode node, const char
     return trim_heading_name(name);
 }
 
+// Max bytes of Markdown section body (#518) or config description (#519) kept for BM25.
+enum { CBM_MD_SECTION_BODY_MAX = 500 };
+
+// Capture the prose body beneath a Markdown heading so BM25 can search the
+// content, not just the heading text (#518). In the tree-sitter-markdown grammar
+// each heading lives inside a `section` node that also holds the body blocks and
+// any nested subsections; the body is the source span between the heading and
+// either the first nested subsection or the end of the section. Returns NULL when
+// there is no enclosing section or no body text; otherwise a cbm_arena_strndup()
+// copy, trimmed and capped at CBM_MD_SECTION_BODY_MAX bytes on a UTF-8 boundary.
+static char *extract_markdown_section_body(CBMArena *a, TSNode heading, const char *source) {
+    TSNode parent = ts_node_parent(heading);
+    if (ts_node_is_null(parent) || strcmp(ts_node_type(parent), "section") != 0) {
+        return NULL;
+    }
+    uint32_t body_start = ts_node_end_byte(heading); // body begins where the heading ends
+    uint32_t body_end = ts_node_end_byte(parent);    // default: runs to the section's end
+    // Stop at the first nested subsection — it gets its own Section node + body.
+    uint32_t cc = ts_node_child_count(parent);
+    for (uint32_t i = 0; i < cc; i++) {
+        TSNode ch = ts_node_child(parent, i);
+        if (ts_node_start_byte(ch) >= body_start && strcmp(ts_node_type(ch), "section") == 0) {
+            body_end = ts_node_start_byte(ch);
+            break;
+        }
+    }
+    // Trim surrounding whitespace/newlines (UTF-8 lead/continuation bytes are all
+    // >= 0x80, so a byte-wise <= ' ' test never cuts a multi-byte character).
+    while (body_start < body_end && (unsigned char)source[body_start] <= ' ') {
+        body_start++;
+    }
+    while (body_end > body_start && (unsigned char)source[body_end - 1] <= ' ') {
+        body_end--;
+    }
+    if (body_end <= body_start) {
+        return NULL; // nothing left after trimming
+    }
+    size_t len = (size_t)(body_end - body_start);
+    if (len > CBM_MD_SECTION_BODY_MAX) {
+        len = CBM_MD_SECTION_BODY_MAX;
+        // Back off while the first dropped byte is a UTF-8 continuation byte (10xxxxxx).
+        while (len > 0 && ((unsigned char)source[body_start + len] & 0xC0) == 0x80) {
+            len--;
+        }
+        if (len == 0) { // only continuation bytes within the cap (malformed UTF-8)
+            return NULL;
+        }
+    }
+    return cbm_arena_strndup(a, source + body_start, len);
+}
+
 // INI: extract section name from section node.
 static char *find_ini_section_name(CBMArena *a, TSNode node, const char *source) {
     uint32_t nc = ts_node_child_count(node);
@@ -2996,6 +3049,7 @@ static bool extract_config_class_def(CBMExtractCtx *ctx, TSNode node, const char
     CBMArena *a = ctx->arena;
     char *name = NULL;
     const char *label = "Class";
+    const char *docstring = NULL;
 
     if (ctx->language == CBM_LANG_TOML &&
         (strcmp(kind, "table") == 0 || strcmp(kind, "table_array_element") == 0)) {
@@ -3008,6 +3062,7 @@ static bool extract_config_class_def(CBMExtractCtx *ctx, TSNode node, const char
                (strcmp(kind, "atx_heading") == 0 || strcmp(kind, "setext_heading") == 0)) {
         name = extract_markdown_heading_name(a, node, kind, ctx->source);
         label = "Section";
+        docstring = extract_markdown_section_body(a, node, ctx->source); // #518
     } else if (ctx->language == CBM_LANG_HCL && strcmp(kind, "block") == 0) {
         name = find_hcl_block_name(a, node, ctx->source);
     } else {
@@ -3015,7 +3070,7 @@ static bool extract_config_class_def(CBMExtractCtx *ctx, TSNode node, const char
     }
 
     if (name && name[0]) {
-        push_simple_class_def(ctx, node, name, label);
+        push_simple_class_def(ctx, node, name, label, docstring);
     }
     return true;
 }
@@ -5514,6 +5569,174 @@ static void walk_defs(CBMExtractCtx *ctx, TSNode root, const CBMLangSpec *spec,
     }
 }
 
+// ── Config module description promotion (#519) ──────────────────────────────
+// YAML/JSON metadata files (META.yaml, skill manifests, …) carry their most
+// search-relevant text in a top-level `description`/`summary`/`purpose` value.
+// That value is otherwise dropped — only the key becomes a Variable node — so
+// BM25 can't find a module by its description. We promote the value onto the
+// file's Module node docstring, which the FTS `body` column then indexes.
+
+// Case-insensitive equality folding only ASCII A-Z (keys are conventionally
+// lowercase, but accept "Description"/"SUMMARY" too). Neither argument may be NULL.
+static bool cfg_ci_eq(const char *x, const char *y) {
+    while (*x && *y) {
+        char cx = *x;
+        char cy = *y;
+        if (cx >= 'A' && cx <= 'Z') {
+            cx = (char)(cx + ('a' - 'A'));
+        }
+        if (cy >= 'A' && cy <= 'Z') {
+            cy = (char)(cy + ('a' - 'A'));
+        }
+        if (cx != cy) {
+            return false;
+        }
+        x++;
+        y++;
+    }
+    return *x == '\0' && *y == '\0'; // equal only if both ended together (no prefix match)
+}
+
+static bool cfg_is_desc_key(const char *key) { // NULL-safe; matches in any letter case
+    return key && (cfg_ci_eq(key, "description") || cfg_ci_eq(key, "summary") ||
+                   cfg_ci_eq(key, "purpose"));
+}
+
+// Trim whitespace and one layer of matching quotes in place; may return a pointer inside s.
+static char *cfg_strip_quotes_trim(char *s) {
+    if (!s) {
+        return NULL;
+    }
+    while (*s && (unsigned char)*s <= ' ') { // skip leading spaces/control bytes
+        s++;
+    }
+    size_t len = strlen(s);
+    while (len > 0 && (unsigned char)s[len - 1] <= ' ') {
+        s[--len] = '\0'; // trailing trim overwrites the caller's buffer
+    }
+    if (len >= 2 && ((s[0] == '"' && s[len - 1] == '"') || (s[0] == '\'' && s[len - 1] == '\''))) {
+        s[len - 1] = '\0'; // drop the closing quote ...
+        s++;               // ... and step past the opening one
+    }
+    return s;
+}
+
+// Copy at most `cap` bytes of `s`, never splitting a UTF-8 sequence; NULL if NULL/empty.
+static char *cfg_arena_capped(CBMArena *a, const char *s, size_t cap) {
+    if (!s || !s[0]) {
+        return NULL;
+    }
+    size_t len = strlen(s);
+    if (len > cap) {
+        len = cap;
+        while (len > 0 && ((unsigned char)s[len] & 0xC0) == 0x80) { // s[len]: first dropped byte
+            len--;
+        }
+        if (len == 0) { // the whole capped prefix was continuation bytes
+            return NULL;
+        }
+    }
+    return cbm_arena_strndup(a, s, len);
+}
+
+// Descend stream/document/block_node to the top-level YAML block_mapping (null node if none).
+static TSNode cfg_find_yaml_mapping(TSNode root) {
+    TSNode none = {0}; // "not found" result; the caller checks it with ts_node_is_null()
+    TSNode cur = root;
+    for (int depth = 0; depth < 8; depth++) { // bounded descent through wrapper nodes
+        uint32_t n = ts_node_child_count(cur);
+        TSNode next = none;
+        bool have_next = false;
+        for (uint32_t i = 0; i < n; i++) {
+            TSNode ch = ts_node_child(cur, i);
+            const char *ck = ts_node_type(ch);
+            if (strcmp(ck, "block_mapping") == 0) {
+                return ch; // first mapping met on the way down wins
+            }
+            if (!have_next && (strcmp(ck, "stream") == 0 || strcmp(ck, "document") == 0 ||
+                               strcmp(ck, "block_node") == 0)) {
+                next = ch; // only the first wrapper child is followed
+                have_next = true;
+            }
+        }
+        if (!have_next) {
+            break; // no wrapper left to descend into
+        }
+        cur = next;
+    }
+    return none;
+}
+
+// Descend document wrappers to the top-level JSON object (null node if none).
+static TSNode cfg_find_json_object(TSNode root) {
+    TSNode none = {0}; // "not found" result; the caller checks it with ts_node_is_null()
+    TSNode cur = root;
+    for (int depth = 0; depth < 6; depth++) { // bounded descent through wrapper nodes
+        if (strcmp(ts_node_type(cur), "object") == 0) {
+            return cur; // the current node is itself the object
+        }
+        uint32_t n = ts_node_child_count(cur);
+        TSNode next = none;
+        bool have_next = false;
+        for (uint32_t i = 0; i < n; i++) {
+            TSNode ch = ts_node_child(cur, i);
+            const char *ck = ts_node_type(ch);
+            if (strcmp(ck, "object") == 0) {
+                return ch; // first object child wins
+            }
+            if (!have_next && strcmp(ck, "document") == 0) {
+                next = ch; // only the first `document` child is followed
+                have_next = true;
+            }
+        }
+        if (!have_next) {
+            break; // no wrapper left to descend into
+        }
+        cur = next;
+    }
+    return none;
+}
+
+// Find a top-level description/summary/purpose value in a YAML or JSON file and
+// return it (trimmed, unquoted, capped) for promotion to the Module docstring.
+// Pairs are scanned in document order and the first key whose value is non-empty
+// wins, so an empty `description: ""` does not hide a later `summary`. Returns
+// NULL for other languages or when no such value exists.
+static const char *extract_config_module_description(CBMExtractCtx *ctx) {
+    const bool is_yaml = ctx->language == CBM_LANG_YAML;
+    if (!is_yaml && ctx->language != CBM_LANG_JSON) {
+        return NULL;
+    }
+    CBMArena *a = ctx->arena;
+    TSNode mapping = is_yaml ? cfg_find_yaml_mapping(ctx->root) : cfg_find_json_object(ctx->root);
+    const char *pair_kind = is_yaml ? "block_mapping_pair" : "pair";
+    if (ts_node_is_null(mapping)) {
+        return NULL;
+    }
+    uint32_t n = ts_node_named_child_count(mapping);
+    for (uint32_t i = 0; i < n; i++) {
+        TSNode pair = ts_node_named_child(mapping, i);
+        if (strcmp(ts_node_type(pair), pair_kind) != 0) {
+            continue;
+        }
+        TSNode key = ts_node_child_by_field_name(pair, TS_FIELD("key"));
+        TSNode val = ts_node_child_by_field_name(pair, TS_FIELD("value"));
+        if (ts_node_is_null(key) || ts_node_is_null(val)) {
+            continue;
+        }
+        const char *key_txt = cfg_strip_quotes_trim(cbm_node_text(a, key, ctx->source));
+        if (!cfg_is_desc_key(key_txt)) {
+            continue;
+        }
+        char *val_txt = cfg_strip_quotes_trim(cbm_node_text(a, val, ctx->source));
+        const char *desc = cfg_arena_capped(a, val_txt, CBM_MD_SECTION_BODY_MAX);
+        if (desc) { // empty value: keep scanning for another promotable key
+            return desc;
+        }
+    }
+    return NULL;
+}
+
 void cbm_extract_definitions(CBMExtractCtx *ctx) {
     const CBMLangSpec *spec = cbm_lang_spec(ctx->language);
     if (!spec) {
@@ -5533,6 +5756,8 @@ void cbm_extract_definitions(CBMExtractCtx *ctx) {
     mod.end_line = ts_node_end_point(ctx->root).row + TS_LINE_OFFSET;
     mod.is_exported = true;
     mod.is_test = ctx->result->is_test_file;
+    // Promote a YAML/JSON top-level description onto the Module for BM25 (#519).
+    mod.docstring = extract_config_module_description(ctx);
     cbm_defs_push(&ctx->result->defs, a, mod);
 
     // Walk AST for function/class definitions
diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c
index 7016a0d2..3b03091a 100644
--- a/src/mcp/mcp.c
+++ b/src/mcp/mcp.c
@@ -1334,7 +1334,11 @@ static char *bm25_search(cbm_store_t *store, const char *project, const char *qu
         ") fts "
         "JOIN nodes n ON n.id = fts.rowid "
         "WHERE n.project = ?2 "
-        "  AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project') "
+        /* Section and Module are searchable (#518/#519): their FTS `body` column
+         * carries markdown section prose and YAML/JSON description text.  They
+         * rank below code symbols (no boost above), so code results stay first.
+         * File/Folder/Variable/Project remain excluded as structural noise. */
+        "  AND n.label NOT IN ('File','Folder','Variable','Project') "
         "  AND (?6 IS NULL OR n.file_path LIKE ?6) "
         "ORDER BY rank "
         "LIMIT ?3 OFFSET ?4";
@@ -1359,17 +1363,17 @@ static char *bm25_search(cbm_store_t *store, const char *project, const char *qu
      * Uses the identical subquery structure so the FTS5 early-exit applies here too. */
     int total = 0;
     {
-        const char *count_sql =
-            "SELECT COUNT(*) FROM ("
-            "    SELECT fts.rowid FROM ("
-            "        SELECT rowid FROM nodes_fts WHERE nodes_fts MATCH ?1"
-            "        ORDER BY bm25(nodes_fts) LIMIT ?3"
-            "    ) fts "
-            "    JOIN nodes n ON n.id = fts.rowid "
-            "    WHERE n.project = ?2 "
-            "      AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')"
-            "      AND (?6 IS NULL OR n.file_path LIKE ?6)"
-            ")";
+        const char *count_sql = "SELECT COUNT(*) FROM ("
+                                "    SELECT fts.rowid FROM ("
+                                "        SELECT rowid FROM nodes_fts WHERE nodes_fts MATCH ?1"
+                                "        ORDER BY bm25(nodes_fts) LIMIT ?3"
+                                "    ) fts "
+                                "    JOIN nodes n ON n.id = fts.rowid "
+                                "    WHERE n.project = ?2 "
+                                /* Mirror the label filter in the main query above (#518/#519). */
+                                "      AND n.label NOT IN ('File','Folder','Variable','Project')"
+                                "      AND (?6 IS NULL OR n.file_path LIKE ?6)"
+                                ")";
         sqlite3_stmt *cs = NULL;
         if (sqlite3_prepare_v2(db, count_sql, BM25_SQL_AUTO_LEN, &cs, NULL) == SQLITE_OK) {
             sqlite3_bind_text(cs, BM25_BIND_QUERY, fts_query, BM25_SQL_AUTO_LEN,
diff --git a/src/pipeline/pipeline.c b/src/pipeline/pipeline.c
index 8e370f7c..2f26bbb7 100644
--- a/src/pipeline/pipeline.c
+++ b/src/pipeline/pipeline.c
@@ -877,20 +877,11 @@ static int dump_and_persist_hashes(cbm_pipeline_t *p, const cbm_file_info_t *fil
             }
         }
 
-        /* FTS5 backfill: populate nodes_fts with camelCase-split names.
-         * Contentless FTS5 requires the special 'delete-all' command instead of
-         * DELETE FROM to wipe prior rows (there's no underlying content table).
-         * Falls back to plain names if cbm_camel_split is unavailable (which
-         * shouldn't happen because we always register it, but we stay defensive). */
-        cbm_store_exec(hash_store, "INSERT INTO nodes_fts(nodes_fts) VALUES('delete-all');");
-        if (cbm_store_exec(hash_store,
-                           "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) "
-                           "SELECT id, cbm_camel_split(name), qualified_name, label, file_path "
-                           "FROM nodes;") != CBM_STORE_OK) {
-            cbm_store_exec(hash_store,
-                           "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) "
-                           "SELECT id, name, qualified_name, label, file_path FROM nodes;");
-        }
+        /* FTS5 backfill: rebuild nodes_fts with camelCase-split names and prose
+         * bodies (markdown sections, YAML/JSON descriptions, docstrings) so BM25
+         * matches content as well as identifiers (#518/#519).  cbm_store_fts_rebuild
+         * also upgrades legacy 4-column tables to the schema carrying `body`. */
+        cbm_store_fts_rebuild(hash_store);
 
         cbm_store_close(hash_store);
         cbm_log_info("pass.timing", "pass", "persist_hashes", "files", itoa_buf(file_count));
diff --git a/src/pipeline/pipeline_incremental.c b/src/pipeline/pipeline_incremental.c
index a1cc4482..9a411b94 100644
--- a/src/pipeline/pipeline_incremental.c
+++ b/src/pipeline/pipeline_incremental.c
@@ -649,17 +649,9 @@ static void dump_and_persist(cbm_gbuf_t *gbuf, const char *db_path, const char *
 
         /* FTS5 rebuild after incremental dump.  The btree dump path bypasses
          * any triggers that could have kept nodes_fts synchronized, so we
-         * rebuild from the nodes table here.  See the full-dump path in
-         * pipeline.c for the matching logic. */
-        cbm_store_exec(hash_store, "INSERT INTO nodes_fts(nodes_fts) VALUES('delete-all');");
-        if (cbm_store_exec(hash_store,
-                           "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) "
-                           "SELECT id, cbm_camel_split(name), qualified_name, label, file_path "
-                           "FROM nodes;") != CBM_STORE_OK) {
-            cbm_store_exec(hash_store,
-                           "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) "
-                           "SELECT id, name, qualified_name, label, file_path FROM nodes;");
-        }
+         * rebuild from the nodes table here (also indexing prose bodies for
+         * content search — see cbm_store_fts_rebuild and #518/#519). */
+        cbm_store_fts_rebuild(hash_store);
 
         cbm_store_close(hash_store);
     }
diff --git a/src/store/store.c b/src/store/store.c
index 995f6e85..185c79d0 100644
--- a/src/store/store.c
+++ b/src/store/store.c
@@ -212,6 +212,60 @@ static void iso_now(char *buf, size_t sz) {
 
 /* ── Schema ─────────────────────────────────────────────────────── */
 
+/* FTS5 contentless virtual table DDL — single source of truth shared by
+ * init_schema (fresh databases) and cbm_store_fts_rebuild (re-index + legacy
+ * upgrade).  Columns: name, qualified_name, label, file_path, body.  `body`
+ * (added for #518/#519) carries prose — markdown section bodies, YAML/JSON
+ * description values, and function docstrings — so BM25 matches content, not
+ * only identifiers.  Contentless (content='') stores only the inverted index;
+ * we feed cbm_camel_split(name) and the raw body text at insert time.  The
+ * column is named `body` rather than `content` to avoid colliding with the
+ * `content=''` option keyword in the FTS5 DDL grammar. */
+static const char NODES_FTS_DDL[] = "CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5("
+                                    "  name, qualified_name, label, file_path, body,"
+                                    "  content='',"
+                                    "  tokenize='unicode61 remove_diacritics 2'"
+                                    ");";
+
+/* FTS backfill INSERTs.  The `body` column is fed the node's docstring property,
+ * or '' when absent or when properties is not valid JSON.  The json_valid() guard
+ * is essential — json_extract() raises on malformed JSON and would otherwise abort
+ * the whole INSERT...SELECT (pre-fix databases contain such rows; see the
+ * create_user_indexes note).  The primary form camelCase-splits the name; the
+ * fallback uses the plain name should cbm_camel_split be unavailable.  Both share
+ * FTS_BODY_EXPR_SQL so the body expression cannot drift between the two. */
+#define FTS_BODY_EXPR_SQL               \
+    " CASE WHEN json_valid(properties)" \
+    " THEN coalesce(json_extract(properties,'$.docstring'),'') ELSE '' END "
+
+static const char FTS_BACKFILL_SQL[] =
+    "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path, body) "
+    "SELECT id, cbm_camel_split(name), qualified_name, label, file_path," FTS_BODY_EXPR_SQL
+    "FROM nodes;";
+
+static const char FTS_BACKFILL_SQL_FALLBACK[] =
+    "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path, body) "
+    "SELECT id, name, qualified_name, label, file_path," FTS_BODY_EXPR_SQL
+    "FROM nodes;";
+
+int cbm_store_fts_rebuild(cbm_store_t *s) {
+    if (!s || !s->db) {
+        return CBM_STORE_ERR;
+    }
+    /* Drop + recreate so legacy 4-column tables gain the `body` column (no transaction here). */
+    if (cbm_store_exec(s, "DROP TABLE IF EXISTS nodes_fts;") != CBM_STORE_OK) {
+        return CBM_STORE_ERR;
+    }
+    if (cbm_store_exec(s, NODES_FTS_DDL) != CBM_STORE_OK) {
+        return CBM_STORE_ERR; /* FTS5 not compiled in — regex search path still works. */
+    }
+    int rc = cbm_store_exec(s, FTS_BACKFILL_SQL); /* primary: camelCase-split names */
+    if (rc != CBM_STORE_OK) {
+        rc = cbm_store_exec(s, FTS_BACKFILL_SQL_FALLBACK); /* retry with plain names */
+    }
+    return rc; /* result of whichever backfill ran last */
+}
+
 static int init_schema(cbm_store_t *s) {
     const char *ddl =
         "CREATE TABLE IF NOT EXISTS projects ("
@@ -262,22 +316,13 @@ static int init_schema(cbm_store_t *s) {
         return rc;
     }
 
-    /* FTS5 contentless virtual table for BM25 full-text search.
-     * Contentless (content='') means FTS5 stores only the inverted index,
-     * not a copy of the source text — required for camelCase tokenization
-     * because we feed it `cbm_camel_split(name)` at insert time but want
-     * queries to match against the split tokens, not the original.
+    /* FTS5 contentless virtual table for BM25 full-text search (see NODES_FTS_DDL).
+     * Created here for fresh databases and read paths; cbm_store_fts_rebuild drops
+     * and recreates it during indexing, which also upgrades legacy 4-column tables.
      * Fails silently if FTS5 is not compiled in (SQLITE_ENABLE_FTS5). */
     {
         char *fts_err = NULL;
-        int fts_rc = sqlite3_exec(s->db,
-                                  "CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5("
-                                  "  name, qualified_name, label, file_path,"
-                                  "  content='',"
-                                  "  tokenize='unicode61 remove_diacritics 2'"
-                                  ");",
-                                  NULL, NULL, &fts_err);
-        if (fts_rc != SQLITE_OK && fts_err) {
+        if (sqlite3_exec(s->db, NODES_FTS_DDL, NULL, NULL, &fts_err) != SQLITE_OK && fts_err) {
             sqlite3_free(fts_err);
         }
     }
diff --git a/src/store/store.h b/src/store/store.h
index 26b09a5c..b85b54a1 100644
--- a/src/store/store.h
+++ b/src/store/store.h
@@ -245,6 +245,14 @@ int cbm_store_drop_indexes(cbm_store_t *s);
 /* Recreate user indexes after bulk inserts. */
 int cbm_store_create_indexes(cbm_store_t *s);
 
+/* Rebuild the nodes_fts BM25 index from the nodes table.  Drops and recreates
+ * the FTS virtual table (which upgrades legacy 4-column databases to the schema
+ * carrying the `body` column), then re-inserts every node with its camelCase-
+ * split name and prose body (the docstring property) so full-text search matches
+ * both identifiers and content — markdown section bodies (#518) and YAML/JSON
+ * descriptions (#519).  Returns CBM_STORE_OK or CBM_STORE_ERR. */
+int cbm_store_fts_rebuild(cbm_store_t *s);
+
 /* ── WAL / Checkpoint ───────────────────────────────────────────── */
 
 /* Force WAL checkpoint + PRAGMA optimize. */
diff --git a/tests/test_extraction.c b/tests/test_extraction.c
index d06b2a50..e804cb64 100644
--- a/tests/test_extraction.c
+++ b/tests/test_extraction.c
@@ -57,6 +57,25 @@ static int count_defs_with_label(CBMFileResult *r, const char *label) {
     return count;
 }
 
+/* Docstring of the first definition matching label+name (NULL if not found). */
+static const char *def_docstring(CBMFileResult *r, const char *label, const char *name) {
+    for (int i = 0; i < r->defs.count; i++) {
+        if (strcmp(r->defs.items[i].label, label) == 0 &&
+            strcmp(r->defs.items[i].name, name) == 0)
+            return r->defs.items[i].docstring;
+    }
+    return NULL;
+}
+
+/* Docstring of the first definition with the given label (NULL if not found). */
+static const char *first_def_docstring(CBMFileResult *r, const char *label) {
+    for (int i = 0; i < r->defs.count; i++) {
+        if (strcmp(r->defs.items[i].label, label) == 0)
+            return r->defs.items[i].docstring;
+    }
+    return NULL;
+}
+
 /* Convenience: extract, assert no error, return result. Caller frees. */
 static CBMFileResult *extract(const char *src, CBMLanguage lang, const char *proj,
                               const char *path) {
@@ -2528,6 +2547,111 @@ TEST(markdown_no_headings) {
     PASS();
 }
 
+/* #518: the prose body beneath a heading is captured as the Section docstring so
+ * BM25 can search markdown content, not just heading text. */
+TEST(markdown_section_body_captured) {
+    CBMFileResult *r = extract("## BROWSER AGENT\n\n"
+                               "Before writing any test file, explore the live application "
+                               "using Playwright MCP.\n\n"
+                               "## NEXT SECTION\n\n"
+                               "Totally unrelated prose here.\n",
+                               CBM_LANG_MARKDOWN, "t", "SKILL.md");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *body = def_docstring(r, "Section", "BROWSER AGENT");
+    ASSERT_NOT_NULL(body);
+    ASSERT(strstr(body, "Playwright") != NULL);
+    ASSERT(strstr(body, "test file") != NULL);
+    /* Body stops at the next heading — it must not absorb the sibling section. */
+    ASSERT(strstr(body, "unrelated") == NULL);
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #518: a heading with no prose beneath it yields no docstring (not empty text). */
+TEST(markdown_section_no_body) {
+    CBMFileResult *r = extract("# Title\n## Empty\n", CBM_LANG_MARKDOWN, "t", "README.md");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *body = def_docstring(r, "Section", "Empty");
+    ASSERT(body == NULL || body[0] == '\0');
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #518: the captured body is capped (defends the 2 KB node-properties buffer). */
+TEST(markdown_section_body_capped) {
+    /* Build a heading followed by ~1500 chars of prose. */
+    char src[2048];
+    int n = snprintf(src, sizeof(src), "# Big\n\n");
+    for (int i = 0; i < 250 && n < (int)sizeof(src) - 8; i++)
+        n += snprintf(src + n, sizeof(src) - (size_t)n, "alpha ");
+    CBMFileResult *r = extract(src, CBM_LANG_MARKDOWN, "t", "BIG.md");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *body = def_docstring(r, "Section", "Big");
+    ASSERT_NOT_NULL(body);
+    ASSERT(strlen(body) <= 500);
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #519: a top-level YAML description value is promoted to the Module docstring so
+ * BM25 can find a module by its description. */
+TEST(yaml_description_promoted_to_module) {
+    CBMFileResult *r =
+        extract("name: qa-run\n"
+                "description: \"8-agent QA loop using Playwright MCP for browser testing\"\n",
+                CBM_LANG_YAML, "t", "META.yaml");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *desc = first_def_docstring(r, "Module");
+    ASSERT_NOT_NULL(desc);
+    ASSERT(strstr(desc, "Playwright") != NULL);
+    ASSERT(strstr(desc, "browser testing") != NULL);
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #519: the `summary` key is also promoted. */
+TEST(yaml_summary_promoted_to_module) {
+    CBMFileResult *r = extract("summary: concise pipeline overview text\nname: x\n", CBM_LANG_YAML,
+                               "t", "info.yaml");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *desc = first_def_docstring(r, "Module");
+    ASSERT_NOT_NULL(desc);
+    ASSERT(strstr(desc, "pipeline overview") != NULL);
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #519: a top-level JSON description value is promoted to the Module docstring. */
+TEST(json_description_promoted_to_module) {
+    CBMFileResult *r =
+        extract("{\"name\": \"qa-run\", "
+                "\"description\": \"8-agent QA loop with Playwright browser testing\"}",
+                CBM_LANG_JSON, "t", "skill.json");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *desc = first_def_docstring(r, "Module");
+    ASSERT_NOT_NULL(desc);
+    ASSERT(strstr(desc, "Playwright") != NULL);
+    cbm_free_result(r);
+    PASS();
+}
+
+/* #519: files without a promotable key leave the Module docstring unset. */
+TEST(yaml_no_description_leaves_module_bare) {
+    CBMFileResult *r = extract("name: thing\nversion: 1\n", CBM_LANG_YAML, "t", "plain.yaml");
+    ASSERT_NOT_NULL(r);
+    ASSERT_FALSE(r->has_error);
+    const char *desc = first_def_docstring(r, "Module");
+    ASSERT(desc == NULL || desc[0] == '\0');
+    cbm_free_result(r);
+    PASS();
+}
+
 /* ═══════════════════════════════════════════════════════════════════
  * Python __init__.py Module QN collision regression
  * ═══════════════════════════════════════════════════════════════════ */
@@ -3156,6 +3280,13 @@ SUITE(extraction) {
     RUN_TEST(markdown_setext_headings);
     RUN_TEST(markdown_heading_content);
     RUN_TEST(markdown_no_headings);
+    RUN_TEST(markdown_section_body_captured);
+    RUN_TEST(markdown_section_no_body);
+    RUN_TEST(markdown_section_body_capped);
+    RUN_TEST(yaml_description_promoted_to_module);
+    RUN_TEST(yaml_summary_promoted_to_module);
+    RUN_TEST(json_description_promoted_to_module);
+    RUN_TEST(yaml_no_description_leaves_module_bare);
 
     /* __init__.py / index.ts Module QN collision regression */
     RUN_TEST(python_init_module_qn_not_collide_with_folder);
diff --git a/tests/test_store_search.c b/tests/test_store_search.c
index 5cbf1a19..656b1507 100644
--- a/tests/test_store_search.c
+++ b/tests/test_store_search.c
@@ -6,6 +6,7 @@
 #include "../src/foundation/compat.h"
 #include "test_framework.h"
 #include "test_helpers.h"
+#include <sqlite3.h>
 #include <store/store.h>
 #include <string.h>
 #include <stdlib.h>
@@ -1458,6 +1459,114 @@ TEST(store_impact_summary_empty) {
     PASS();
 }
 
+/* ── FTS5 body indexing (#518 / #519) ────────────────────────────── */
+
+/* Count nodes_fts rows matching `term` (bound, not spliced into SQL); -1 on error. */
+static int fts_match_count(sqlite3 *db, const char *term) {
+    sqlite3_stmt *st = NULL;
+    const char *sql = "SELECT count(*) FROM nodes_fts WHERE nodes_fts MATCH ?1";
+    if (sqlite3_prepare_v2(db, sql, -1, &st, NULL) != SQLITE_OK) {
+        return -1;
+    }
+    int rc = sqlite3_bind_text(st, 1, term, -1, SQLITE_STATIC); /* term outlives st */
+    int c = (rc == SQLITE_OK && sqlite3_step(st) == SQLITE_ROW) ? sqlite3_column_int(st, 0) : -1;
+    sqlite3_finalize(st);
+    return c;
+}
+
+/* cbm_store_fts_rebuild() must make each node's `docstring` property searchable:
+ * a Section and a Module are then matched by docstring-only tokens as well as by name. */
+TEST(fts_rebuild_indexes_body_content) {
+    cbm_store_t *s = cbm_store_open_memory();
+    cbm_store_upsert_project(s, "test", "/tmp/test");
+    cbm_node_t sec = {.project = "test",
+                      .label = "Section",
+                      .name = "BROWSER AGENT",
+                      .qualified_name = "test.SKILL.browser",
+                      .file_path = "SKILL.md",
+                      .properties_json = "{\"docstring\":\"explore the live app using Playwright\"}"};
+    cbm_node_t mod = {.project = "test",
+                      .label = "Module",
+                      .name = "META.yaml",
+                      .qualified_name = "test.META",
+                      .file_path = "META.yaml",
+                      .properties_json = "{\"docstring\":\"eight agent quality gate pipeline\"}"};
+    cbm_store_upsert_node(s, &sec);
+    cbm_store_upsert_node(s, &mod);
+
+    ASSERT_EQ(cbm_store_fts_rebuild(s), CBM_STORE_OK);
+    sqlite3 *db = cbm_store_get_db(s);
+
+    /* Body tokens are searchable even though they appear in no node name. */
+    ASSERT_GTE(fts_match_count(db, "playwright"), 1);  /* only in the Section docstring */
+    ASSERT_GTE(fts_match_count(db, "quality"), 1);  /* only in the Module docstring */
+    /* Name tokens still searchable. */
+    ASSERT_GTE(fts_match_count(db, "browser"), 1);  /* Section name / qualified_name */
+    cbm_store_close(s);
+    PASS();
+}
+
+/* A database created before the `body` column is upgraded in place: rebuild drops
+ * the legacy 4-column table, recreates it with body, and repopulates. */
+TEST(fts_rebuild_upgrades_legacy_schema) {
+    cbm_store_t *s = cbm_store_open_memory();
+    cbm_store_upsert_project(s, "test", "/tmp/test");
+    sqlite3 *db = cbm_store_get_db(s);
+    /* Replace the current table with the historical 4-column (no body) schema. */
+    ASSERT_EQ(sqlite3_exec(db, "DROP TABLE IF EXISTS nodes_fts;", NULL, NULL, NULL), SQLITE_OK);
+    ASSERT_EQ(sqlite3_exec(db,
+                           "CREATE VIRTUAL TABLE nodes_fts USING fts5(name, qualified_name, label, "
+                           "file_path, content='', tokenize='unicode61 remove_diacritics 2');",
+                           NULL, NULL, NULL),
+              SQLITE_OK);
+    cbm_node_t fn = {.project = "test",
+                     .label = "Function",
+                     .name = "parseConfig",
+                     .qualified_name = "test.parseConfig",
+                     .file_path = "a.c",
+                     .properties_json = "{\"docstring\":\"loads zookeeper settings\"}"};
+    cbm_store_upsert_node(s, &fn);
+
+    ASSERT_EQ(cbm_store_fts_rebuild(s), CBM_STORE_OK);
+
+    /* `SELECT body` only prepares if the column exists; the docstring must then match. */
+    sqlite3_stmt *probe = NULL;
+    ASSERT_EQ(sqlite3_prepare_v2(db, "SELECT body FROM nodes_fts", -1, &probe, NULL), SQLITE_OK);
+    sqlite3_finalize(probe);  /* prepare-only probe; never stepped */
+    ASSERT_GTE(fts_match_count(db, "zookeeper"), 1);  /* token appears only in fn's docstring */
+    cbm_store_close(s);
+    PASS();
+}
+
+/* The backfill must survive rows whose properties_json is not valid JSON (legacy
+ * databases contain such rows); the json_valid() guard prevents json_extract from
+ * aborting the whole INSERT...SELECT. */
+TEST(fts_rebuild_tolerates_malformed_properties) {
+    cbm_store_t *s = cbm_store_open_memory();
+    cbm_store_upsert_project(s, "test", "/tmp/test");
+    cbm_node_t bad = {.project = "test",
+                      .label = "Function",
+                      .name = "weird",
+                      .qualified_name = "test.weird",
+                      .file_path = "a.c",
+                      .properties_json = "{not valid json"};  /* deliberately unparseable */
+    cbm_node_t ok = {.project = "test",
+                     .label = "Function",
+                     .name = "fine",
+                     .qualified_name = "test.fine",
+                     .file_path = "b.c",
+                     .properties_json = "{\"docstring\":\"kafka consumer group\"}"};
+    cbm_store_upsert_node(s, &bad);
+    cbm_store_upsert_node(s, &ok);
+
+    ASSERT_EQ(cbm_store_fts_rebuild(s), CBM_STORE_OK);  /* rebuild must not fail on `bad` */
+    sqlite3 *db = cbm_store_get_db(s);
+    ASSERT_GTE(fts_match_count(db, "kafka"), 1);  /* good row indexed */
+    ASSERT_GTE(fts_match_count(db, "weird"), 1);  /* malformed row's name still indexed */
+    cbm_store_close(s);
+    PASS();
+}
+
 SUITE(store_search) {
     RUN_TEST(store_search_by_label);
     RUN_TEST(store_search_by_name_pattern);
@@ -1525,4 +1634,8 @@ SUITE(store_search) {
     RUN_TEST(store_hop_to_risk_all_levels);
     RUN_TEST(store_risk_label_all_levels);
     RUN_TEST(store_impact_summary_empty);
+    /* FTS5 body indexing (#518 / #519) */
+    RUN_TEST(fts_rebuild_indexes_body_content);
+    RUN_TEST(fts_rebuild_upgrades_legacy_schema);
+    RUN_TEST(fts_rebuild_tolerates_malformed_properties);
 }