DeusData
diff --git a/‎Makefile.cbm‎
Lines changed: 10 additions & 2 deletions b/‎Makefile.cbm‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎internal/cbm/cbm.h‎
Lines changed: 1 addition & 0 deletions b/‎internal/cbm/cbm.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎internal/cbm/extract_defs.c‎
Lines changed: 57 additions & 0 deletions b/‎internal/cbm/extract_defs.c‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎internal/cbm/sqlite_writer.c‎
Lines changed: 52 additions & 5 deletions b/‎internal/cbm/sqlite_writer.c‎
Lines changed: 52 additions & 5 deletions
diff --git a/‎internal/cbm/sqlite_writer.h‎
Lines changed: 12 additions & 2 deletions b/‎internal/cbm/sqlite_writer.h‎
Lines changed: 12 additions & 2 deletions
diff --git a/‎src/graph_buffer/graph_buffer.c‎
Lines changed: 50 additions & 1 deletion b/‎src/graph_buffer/graph_buffer.c‎
Lines changed: 50 additions & 1 deletion
diff --git a/‎src/graph_buffer/graph_buffer.h‎
Lines changed: 6 additions & 0 deletions b/‎src/graph_buffer/graph_buffer.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/pipeline/pass_definitions.c‎
Lines changed: 5 additions & 0 deletions b/‎src/pipeline/pass_definitions.c‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/pipeline/pass_parallel.c‎
Lines changed: 5 additions & 0 deletions b/‎src/pipeline/pass_parallel.c‎
Lines changed: 5 additions & 0 deletions
@@ -195,6 +195,9 @@ SIMHASH_SRCS = src/simhash/minhash.c
 # Semantic embedding module
 SEMANTIC_SRCS = src/semantic/semantic.c src/semantic/ast_profile.c
 
+# UniXcoder pretrained vectors (assembler blob)
+UNIXCODER_BLOB_SRC = vendored/unixcoder/code_vectors_blob.S
+
 # Traces module (new)
 TRACES_SRCS = src/traces/traces.c
 
@@ -415,7 +418,12 @@ $(BUILD_DIR)/test_lz4.o: $(CBM_DIR)/vendored/lz4/lz4.c | $(BUILD_DIR)
 $(BUILD_DIR)/test_lz4hc.o: $(CBM_DIR)/vendored/lz4/lz4hc.c | $(BUILD_DIR)
 	$(CC) -std=c11 -D_DEFAULT_SOURCE -g -O1 $(SANITIZE) -w -I$(CBM_DIR)/vendored/lz4 -c -o $@ $<
 
-OBJS_VENDORED_TEST = $(MIMALLOC_OBJ_TEST) $(SQLITE3_OBJ_TEST) $(TRE_OBJ_TEST) $(GRAMMAR_OBJS_TEST) $(TS_RUNTIME_OBJ_TEST) $(LSP_OBJ_TEST) $(PP_OBJ_TEST) $(MONGOOSE_OBJ_TEST) $(LZ4_OBJ_TEST)
+# UniXcoder pretrained vector blob
+UNIXCODER_OBJ = $(BUILD_DIR)/unixcoder_blob.o
+$(UNIXCODER_OBJ): $(UNIXCODER_BLOB_SRC) vendored/unixcoder/code_vectors.bin | $(BUILD_DIR)
+	$(AS) -o $@ $<
+
+OBJS_VENDORED_TEST = $(MIMALLOC_OBJ_TEST) $(SQLITE3_OBJ_TEST) $(TRE_OBJ_TEST) $(GRAMMAR_OBJS_TEST) $(TS_RUNTIME_OBJ_TEST) $(LSP_OBJ_TEST) $(PP_OBJ_TEST) $(MONGOOSE_OBJ_TEST) $(LZ4_OBJ_TEST) $(UNIXCODER_OBJ)
 
 $(BUILD_DIR)/test-runner: $(ALL_TEST_SRCS) $(PROD_SRCS) $(EXTRACTION_SRCS) $(AC_LZ4_SRCS) $(SQLITE_WRITER_SRC) $(OBJS_VENDORED_TEST) | $(BUILD_DIR)
 	$(CC) $(CFLAGS_TEST) -o $@ \
@@ -459,7 +467,7 @@ $(BUILD_DIR)/prod_lz4.o: $(CBM_DIR)/vendored/lz4/lz4.c | $(BUILD_DIR)
 $(BUILD_DIR)/prod_lz4hc.o: $(CBM_DIR)/vendored/lz4/lz4hc.c | $(BUILD_DIR)
 	$(CC) -std=c11 -D_DEFAULT_SOURCE -O2 -w -I$(CBM_DIR)/vendored/lz4 -c -o $@ $<
 
-OBJS_VENDORED_PROD = $(MIMALLOC_OBJ_PROD) $(SQLITE3_OBJ_PROD) $(TRE_OBJ_PROD) $(GRAMMAR_OBJS_PROD) $(TS_RUNTIME_OBJ_PROD) $(LSP_OBJ_PROD) $(PP_OBJ_PROD) $(MONGOOSE_OBJ_PROD) $(LZ4_OBJ_PROD)
+OBJS_VENDORED_PROD = $(MIMALLOC_OBJ_PROD) $(SQLITE3_OBJ_PROD) $(TRE_OBJ_PROD) $(GRAMMAR_OBJS_PROD) $(TS_RUNTIME_OBJ_PROD) $(LSP_OBJ_PROD) $(PP_OBJ_PROD) $(MONGOOSE_OBJ_PROD) $(LZ4_OBJ_PROD) $(UNIXCODER_OBJ)
 
 MAIN_SRC = src/main.c
 
 
@@ -110,6 +110,7 @@ typedef struct {
     bool is_test;
     bool is_entry_point;
     const char *structural_profile; // AST structural profile (arena-allocated) or NULL
+    const char *body_tokens;        // space-separated raw identifier tokens from body (arena) or NULL
 } CBMDefinition;
 
 /* Argument captured from a call expression */
 
@@ -66,6 +66,63 @@ static void compute_fingerprint(CBMExtractCtx *ctx, CBMDefinition *def, TSNode f
         cbm_ast_profile_to_str(&profile, sp_buf, sizeof(sp_buf));
         def->structural_profile = cbm_arena_strdup(ctx->arena, sp_buf);
     }
+
+    /* Extract raw identifier tokens from body for semantic search.
+     * Walk the AST, collect unique identifier text, store as space-separated string.
+     * Cap at ~500 chars to fit in properties_json. */
+    {
+        /* BT_SEEN_MASK must stay equal to BT_SEEN - 1: it wraps the probe index. */
+        enum { BT_STACK = 512, BT_BUF = 512, BT_MAX_IDENTS = 40, BT_SEEN = 128, BT_SEEN_MASK = 127 };
+        TSNode bt_stack[BT_STACK]; /* explicit DFS stack; children that do not fit are dropped */
+        int bt_top = 0;
+        bt_stack[bt_top++] = body;
+        char bt_buf[BT_BUF];
+        int bt_pos = 0;
+        uint32_t bt_seen[BT_SEEN]; /* open-addressed set of identifier hashes; 0 = empty slot */
+        memset(bt_seen, 0, sizeof(bt_seen));
+        int bt_count = 0;
+        const uint32_t src_len = (uint32_t)ctx->source_len;
+
+        while (bt_top > 0 && bt_count < BT_MAX_IDENTS) {
+            TSNode nd = bt_stack[--bt_top];
+            uint32_t nc = ts_node_child_count(nd);
+            if (nc == 0) {
+                const char *k = ts_node_type(nd);
+                if (strcmp(k, "identifier") == 0 || strcmp(k, "field_identifier") == 0 ||
+                    strcmp(k, "property_identifier") == 0) {
+                    uint32_t s = ts_node_start_byte(nd);
+                    uint32_t e = ts_node_end_byte(nd);
+                    /* The hash loop and memcpy below read source[s, e), so the whole
+                     * range (not only its first byte) must lie inside the buffer. */
+                    if (e > s && e <= src_len && (e - s) < (uint32_t)CBM_SZ_64) {
+                        int len = (int)(e - s);
+                        /* Dedup via simple hash set. Only hashes are compared, so two
+                         * identifiers with equal hashes count as the same token. */
+                        uint32_t h = 0;
+                        for (int x = 0; x < len; x++) {
+                            h = h * 31 + (uint32_t)(unsigned char)ctx->source[s + x];
+                        }
+                        uint32_t slot = h & BT_SEEN_MASK;
+                        bool dup = false;
+                        /* Linear probing; (h | 1) keeps stored values non-zero so that
+                         * 0 can mark an empty slot. */
+                        for (int p = 0; p < BT_SEEN; p++) {
+                            uint32_t idx = (slot + (uint32_t)p) & BT_SEEN_MASK;
+                            if (bt_seen[idx] == 0) { bt_seen[idx] = h | 1; break; }
+                            if (bt_seen[idx] == (h | 1)) { dup = true; break; }
+                        }
+                        /* Reserve room for the separator and the final NUL. */
+                        if (!dup && bt_pos + len + 1 < BT_BUF) {
+                            if (bt_pos > 0) { bt_buf[bt_pos++] = ' '; }
+                            memcpy(bt_buf + bt_pos, ctx->source + s, (size_t)len);
+                            bt_pos += len;
+                            bt_count++;
+                        }
+                    }
+                }
+            } else {
+                /* Push children last-to-first so they are popped in source order. */
+                for (int i = (int)nc - 1; i >= 0 && bt_top < BT_STACK; i--) {
+                    bt_stack[bt_top++] = ts_node_child(nd, (uint32_t)i);
+                }
+            }
+        }
+        if (bt_pos > 0) {
+            bt_buf[bt_pos] = '\0';
+            def->body_tokens = cbm_arena_strdup(ctx->arena, bt_buf);
+        }
+    }
 }
 
 // Tree-sitter row is 0-based; lines are 1-based.
 
@@ -746,6 +746,23 @@ static uint8_t *build_vector_record(const CBMDumpVector *v, int *out_len) {
     return data;
 }
 
+// Build a token_vectors table record: (id, project, token, vector, idf).
+// Returns the buffer produced by rec_finalize (its length goes to *out_len);
+// the caller in write_data_tables releases it with free().
+static uint8_t *build_token_vec_record(const CBMDumpTokenVec *tv, int *out_len) {
+    /* IDF is persisted as an integer scaled by 1000 (fixed-point) so the record
+     * carries no float; the cast truncates toward zero. */
+    const int64_t idf_milli = (int64_t)(tv->idf * 1000.0f);
+
+    RecordBuilder rb;
+    rec_init(&rb);
+
+    /* Column order must match the CREATE TABLE token_vectors statement. */
+    rec_add_int(&rb, tv->id);
+    rec_add_text(&rb, tv->project);
+    rec_add_text(&rb, tv->token);
+    rec_add_blob(&rb, tv->vector, tv->vector_len);
+    rec_add_int(&rb, idf_milli);
+
+    uint8_t *record = rec_finalize(&rb, out_len);
+    rec_free(&rb);
+    return record;
+}
+
 // Build a projects table record: (name, indexed_at, root_path)
 static uint8_t *build_project_record(const char *name, const char *indexed_at,
                                      const char *root_path, int *out_len) {
@@ -1492,11 +1509,13 @@ typedef struct {
     int edge_count;
     CBMDumpVector *vectors;
     int vector_count;
+    CBMDumpTokenVec *token_vecs;
+    int token_vec_count;
 } write_db_ctx_t;
 
 /* Phase 1: Write node + edge + vector data tables (streaming). */
 static int write_data_tables(write_db_ctx_t *w, uint32_t *nodes_root, uint32_t *edges_root,
-                             uint32_t *vectors_root) {
+                             uint32_t *vectors_root, uint32_t *token_vecs_root) {
     if (w->node_count > 0) {
         PageBuilder pb;
         pb_init(&pb, w->fp, w->next_page, false);
@@ -1552,6 +1571,26 @@ static int write_data_tables(write_db_ctx_t *w, uint32_t *nodes_root, uint32_t *
     } else {
         *vectors_root = write_table_btree(w->fp, &w->next_page, NULL, NULL, NULL, 0, false);
     }
+
+    /* token_vectors table — enriched RI vectors for query-time lookup */
+    if (w->token_vec_count > 0 && w->token_vecs) {
+        PageBuilder pb;
+        pb_init(&pb, w->fp, w->next_page, false);
+        for (int i = 0; i < w->token_vec_count; i++) {
+            int rec_len;
+            uint8_t *rec = build_token_vec_record(&w->token_vecs[i], &rec_len);
+            if (!rec) {
+                return ERR_WRITE_FAILED;
+            }
+            pb_add_table_cell_with_flush(&pb, w->token_vecs[i].id, rec, rec_len,
+                                         i > 0 ? w->token_vecs[i - SKIP_ONE].id : 0);
+            free(rec);
+        }
+        *token_vecs_root = pb_finalize_table(&pb, &w->next_page,
+                                              w->token_vecs[w->token_vec_count - SKIP_ONE].id);
+    } else {
+        *token_vecs_root = write_table_btree(w->fp, &w->next_page, NULL, NULL, NULL, 0, false);
+    }
     return 0;
 }
 
@@ -1599,7 +1638,8 @@ static void write_metadata_tables(write_db_ctx_t *w, uint32_t *projects_root,
 
 int cbm_write_db(const char *path, const char *project, const char *root_path,
                  const char *indexed_at, CBMDumpNode *nodes, int node_count, CBMDumpEdge *edges,
-                 int edge_count, CBMDumpVector *vectors, int vector_count) {
+                 int edge_count, CBMDumpVector *vectors, int vector_count,
+                 CBMDumpTokenVec *token_vecs, int token_vec_count) {
     FILE *fp = fopen(path, "wb");
     if (!fp) {
         return CBM_NOT_FOUND;
@@ -1615,13 +1655,16 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
                         .edges = edges,
                         .edge_count = edge_count,
                         .vectors = vectors,
-                        .vector_count = vector_count};
+                        .vector_count = vector_count,
+                        .token_vecs = token_vecs,
+                        .token_vec_count = token_vec_count};
 
-    // Phase 1: Data tables (streaming node + edge + vector records)
+    // Phase 1: Data tables (streaming node + edge + vector + token_vector records)
     uint32_t nodes_root;
     uint32_t edges_root;
     uint32_t vectors_root;
-    int rc = write_data_tables(&w, &nodes_root, &edges_root, &vectors_root);
+    uint32_t token_vecs_root;
+    int rc = write_data_tables(&w, &nodes_root, &edges_root, &vectors_root, &token_vecs_root);
     if (rc != 0) {
         (void)fclose(fp);
         return rc;
@@ -1810,6 +1853,10 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
         {"table", "node_vectors", "node_vectors", vectors_root,
          "CREATE TABLE node_vectors (\n\t\tnode_id INTEGER PRIMARY KEY,\n\t\tproject TEXT NOT "
          "NULL,\n\t\tvector BLOB NOT NULL\n\t)"},
+        {"table", "token_vectors", "token_vectors", token_vecs_root,
+         "CREATE TABLE token_vectors (\n\t\tid INTEGER PRIMARY KEY,\n\t\tproject "
+         "TEXT NOT NULL,\n\t\ttoken TEXT NOT NULL,\n\t\tvector BLOB NOT NULL,\n\t\tidf INTEGER "
+         "NOT NULL\n\t)"},
         {"table", "sqlite_sequence", "sqlite_sequence", sqlite_seq_root,
          "CREATE TABLE sqlite_sequence(name,seq)"},
     };
 
@@ -34,14 +34,24 @@ typedef struct {
     int vector_len;        // length in bytes (e.g. 256 for d=256)
 } CBMDumpVector;
 
+typedef struct {
+    int64_t id;            // sequential ID (1..T); used as the token_vectors rowid
+    const char *project;   // project name written to the row's project column
+    const char *token;     // the token string
+    const uint8_t *vector; // int8-quantized enriched RI vector blob
+    int vector_len;        // length of vector in bytes (e.g. 256 for d=256)
+    float idf;             // inverse document frequency; stored as integer idf * 1000
+} CBMDumpTokenVec;
+
 // --- Public API ---
 
 // Write a complete SQLite .db file from sorted in-memory data.
 // Constructs B-tree pages directly — no SQL parser, no INSERTs.
 // Returns 0 on success, non-zero on error.
-// vectors/vector_count may be NULL/0 if no vectors are available.
+// vectors/vector_count and token_vecs/token_vec_count may be NULL/0.
 int cbm_write_db(const char *path, const char *project, const char *root_path,
                  const char *indexed_at, CBMDumpNode *nodes, int node_count, CBMDumpEdge *edges,
-                 int edge_count, CBMDumpVector *vectors, int vector_count);
+                 int edge_count, CBMDumpVector *vectors, int vector_count,
+                 CBMDumpTokenVec *token_vecs, int token_vec_count);
 
 #endif // CBM_SQLITE_WRITER_H
@@ -88,6 +88,11 @@ struct cbm_gbuf {
     CBMDumpVector *dump_vectors;
     int dump_vector_count;
     int dump_vector_cap;
+
+    /* Token vector storage for enriched RI vectors (query-time lookup). */
+    CBMDumpTokenVec *dump_token_vecs;
+    int dump_token_vec_count;
+    int dump_token_vec_cap;
 };
 
 /* ── Helpers ─────────────────────────────────────────────────────── */
@@ -431,6 +436,13 @@ void cbm_gbuf_free(cbm_gbuf_t *gb) {
     }
     free(gb->dump_vectors);
 
+    /* Free token vector storage */
+    for (int i = 0; i < gb->dump_token_vec_count; i++) {
+        free((void *)gb->dump_token_vecs[i].token);
+        free((void *)gb->dump_token_vecs[i].vector);
+    }
+    free(gb->dump_token_vecs);
+
     free(gb->project);
     free(gb->root_path);
     free(gb);
@@ -469,6 +481,42 @@ int cbm_gbuf_store_vector(cbm_gbuf_t *gb, int64_t node_id, const uint8_t *vector
     return 0;
 }
 
+/* Append a deep copy of one enriched token vector to gb's dump list.
+ * The entry receives the next 1-based sequential id and borrows gb->project
+ * (the project string is not copied).
+ * Returns 0 on success, -1 on invalid arguments or allocation failure; on
+ * failure nothing is appended and no memory is leaked. */
+int cbm_gbuf_store_token_vector(cbm_gbuf_t *gb, const char *token, const uint8_t *vector,
+                                int vector_len, float idf) {
+    if (!gb || !token || !vector || vector_len <= 0) {
+        return -1;
+    }
+    enum { TV_INIT_CAP = 256, TV_GROW = 2 };
+    if (gb->dump_token_vec_count >= gb->dump_token_vec_cap) {
+        int new_cap = gb->dump_token_vec_cap < TV_INIT_CAP ? TV_INIT_CAP
+                                                           : gb->dump_token_vec_cap * TV_GROW;
+        CBMDumpTokenVec *grown =
+            realloc(gb->dump_token_vecs, (size_t)new_cap * sizeof(CBMDumpTokenVec));
+        if (!grown) {
+            return -1;
+        }
+        gb->dump_token_vecs = grown;
+        gb->dump_token_vec_cap = new_cap;
+    }
+
+    /* Make both copies before committing the entry: a failed strdup must not
+     * leave a slot whose token is NULL, since the token column is written as
+     * TEXT NOT NULL during the dump. */
+    char *token_copy = strdup(token);
+    uint8_t *vec_copy = malloc((size_t)vector_len);
+    if (!token_copy || !vec_copy) {
+        free(token_copy);
+        free(vec_copy);
+        return -1;
+    }
+    memcpy(vec_copy, vector, (size_t)vector_len);
+
+    int idx = gb->dump_token_vec_count;
+    gb->dump_token_vecs[idx] = (CBMDumpTokenVec){
+        .id = idx + SKIP_ONE, /* 1-based sequential ID */
+        .project = gb->project,
+        .token = token_copy,
+        .vector = vec_copy,
+        .vector_len = vector_len,
+        .idf = idf,
+    };
+    gb->dump_token_vec_count++;
+    return 0;
+}
+
 /* ── ID accessors ────────────────────────────────────────────────── */
 
 int64_t cbm_gbuf_next_id(const cbm_gbuf_t *gb) {
@@ -1242,7 +1290,8 @@ int cbm_gbuf_dump_to_sqlite(cbm_gbuf_t *gb, const char *path) {
      * Callers must delete the old .db before calling this (reindex)
      * or ensure no file exists (first index). */
     int rc = cbm_write_db(path, gb->project, gb->root_path, indexed_at, dump_nodes, node_idx,
-                          dump_edges, edge_idx, gb->dump_vectors, gb->dump_vector_count);
+                          dump_edges, edge_idx, gb->dump_vectors, gb->dump_vector_count,
+                          gb->dump_token_vecs, gb->dump_token_vec_count);
 
     {
         char b1[CBM_SZ_16];
 
@@ -155,6 +155,12 @@ int cbm_gbuf_delete_edges_by_type(cbm_gbuf_t *gb, const char *type);
  * Vectors are carried through to cbm_write_db during the dump phase. */
 int cbm_gbuf_store_vector(cbm_gbuf_t *gb, int64_t node_id, const uint8_t *vector, int vector_len);
 
+/* Store an enriched token vector for query-time lookup (token string and vector are copied).
+ * Called by pass_semantic_edges after corpus finalization.
+ * Returns 0 on success, -1 on invalid arguments or allocation failure. */
+int cbm_gbuf_store_token_vector(cbm_gbuf_t *gb, const char *token, const uint8_t *vector,
+                                int vector_len, float idf);
+
 /* ── Dump to SQLite ──────────────────────────────────────────────── */
 
 /* Dump the entire buffer to a SQLite file using the direct page writer.
 
@@ -206,6 +206,11 @@ static void build_def_props(char *buf, size_t bufsize, const CBMDefinition *def)
         append_json_string(buf, bufsize, &pos, "sp", def->structural_profile);
     }
 
+    /* Body tokens */
+    if (def->body_tokens && pos + CBM_SZ_512 < bufsize) {
+        append_json_string(buf, bufsize, &pos, "bt", def->body_tokens);
+    }
+
     if (pos < bufsize - SKIP_ONE) {
         buf[pos] = '}';
         buf[pos + SKIP_ONE] = '\0';
 
@@ -229,6 +229,11 @@ static void build_def_props(char *buf, size_t bufsize, const CBMDefinition *def)
         append_json_string(buf, bufsize, &pos, "sp", def->structural_profile);
     }
 
+    /* Body tokens — raw identifiers from function body AST for semantic search. */
+    if (def->body_tokens && pos + CBM_SZ_512 < bufsize) {
+        append_json_string(buf, bufsize, &pos, "bt", def->body_tokens);
+    }
+
     if (pos < bufsize - SKIP_ONE) {
         buf[pos] = '}';
         buf[pos + SKIP_ONE] = '\0';