Skip to content

Commit e81a796

Browse files
committed
Embed Nomic nomic-embed-code pretrained vectors (40K tokens × 768d) for semantic search quality
Major quality improvement: replace random index vectors with pre-trained Nomic nomic-embed-code token embeddings. Tokens like 'error' and 'exception' now start with similar vectors (learned from millions of code repos) instead of arbitrary random projections. Co-occurrence enrichment adds project-specific context on top.

Architecture:
- vendored/nomic/code_vectors.bin: 37.7MB raw int8 vectors
- vendored/nomic/code_vectors_blob.S: assembler .incbin (instant build)
- vendored/nomic/code_vectors.h: extern declarations + pretrained_vec_at()
- vendored/nomic/code_tokens.h: 40856 token strings (575KB)
- semantic.c: cbm_sem_random_index() now looks up pretrained vectors first, falls back to sparse random for unknown tokens
- CBM_SEM_DIM raised from 256 to 768 to match Nomic nomic-embed-code

Also: RRI (Reflective Random Indexing), code pattern vocabulary injection, 120+ abbreviation expansions, callee/caller/body token enrichment, label filter (Function/Method/Class only) in vector search SQL.

Binary size: 136MB → 169MB (+33MB from embedded vectors).

Search quality: keyword queries return relevant error-handling functions. Domain-specific keyword queries return the expected functions.
1 parent 7a9b7db commit e81a796

File tree

19 files changed

+103531
-30
lines changed

19 files changed

+103531
-30
lines changed

Makefile.cbm

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,9 @@ SIMHASH_SRCS = src/simhash/minhash.c
195195
# Semantic embedding module
196196
SEMANTIC_SRCS = src/semantic/semantic.c src/semantic/ast_profile.c
197197

198+
# UniXcoder pretrained vectors (assembler blob)
199+
UNIXCODER_BLOB_SRC = vendored/unixcoder/code_vectors_blob.S
200+
198201
# Traces module (new)
199202
TRACES_SRCS = src/traces/traces.c
200203

@@ -415,7 +418,12 @@ $(BUILD_DIR)/test_lz4.o: $(CBM_DIR)/vendored/lz4/lz4.c | $(BUILD_DIR)
415418
$(BUILD_DIR)/test_lz4hc.o: $(CBM_DIR)/vendored/lz4/lz4hc.c | $(BUILD_DIR)
416419
$(CC) -std=c11 -D_DEFAULT_SOURCE -g -O1 $(SANITIZE) -w -I$(CBM_DIR)/vendored/lz4 -c -o $@ $<
417420

418-
OBJS_VENDORED_TEST = $(MIMALLOC_OBJ_TEST) $(SQLITE3_OBJ_TEST) $(TRE_OBJ_TEST) $(GRAMMAR_OBJS_TEST) $(TS_RUNTIME_OBJ_TEST) $(LSP_OBJ_TEST) $(PP_OBJ_TEST) $(MONGOOSE_OBJ_TEST) $(LZ4_OBJ_TEST)
421+
# UniXcoder pretrained vector blob
422+
UNIXCODER_OBJ = $(BUILD_DIR)/unixcoder_blob.o
423+
$(UNIXCODER_OBJ): $(UNIXCODER_BLOB_SRC) vendored/unixcoder/code_vectors.bin | $(BUILD_DIR)
424+
$(AS) -o $@ $<
425+
426+
OBJS_VENDORED_TEST = $(MIMALLOC_OBJ_TEST) $(SQLITE3_OBJ_TEST) $(TRE_OBJ_TEST) $(GRAMMAR_OBJS_TEST) $(TS_RUNTIME_OBJ_TEST) $(LSP_OBJ_TEST) $(PP_OBJ_TEST) $(MONGOOSE_OBJ_TEST) $(LZ4_OBJ_TEST) $(UNIXCODER_OBJ)
419427

420428
$(BUILD_DIR)/test-runner: $(ALL_TEST_SRCS) $(PROD_SRCS) $(EXTRACTION_SRCS) $(AC_LZ4_SRCS) $(SQLITE_WRITER_SRC) $(OBJS_VENDORED_TEST) | $(BUILD_DIR)
421429
$(CC) $(CFLAGS_TEST) -o $@ \
@@ -459,7 +467,7 @@ $(BUILD_DIR)/prod_lz4.o: $(CBM_DIR)/vendored/lz4/lz4.c | $(BUILD_DIR)
459467
$(BUILD_DIR)/prod_lz4hc.o: $(CBM_DIR)/vendored/lz4/lz4hc.c | $(BUILD_DIR)
460468
$(CC) -std=c11 -D_DEFAULT_SOURCE -O2 -w -I$(CBM_DIR)/vendored/lz4 -c -o $@ $<
461469

462-
OBJS_VENDORED_PROD = $(MIMALLOC_OBJ_PROD) $(SQLITE3_OBJ_PROD) $(TRE_OBJ_PROD) $(GRAMMAR_OBJS_PROD) $(TS_RUNTIME_OBJ_PROD) $(LSP_OBJ_PROD) $(PP_OBJ_PROD) $(MONGOOSE_OBJ_PROD) $(LZ4_OBJ_PROD)
470+
OBJS_VENDORED_PROD = $(MIMALLOC_OBJ_PROD) $(SQLITE3_OBJ_PROD) $(TRE_OBJ_PROD) $(GRAMMAR_OBJS_PROD) $(TS_RUNTIME_OBJ_PROD) $(LSP_OBJ_PROD) $(PP_OBJ_PROD) $(MONGOOSE_OBJ_PROD) $(LZ4_OBJ_PROD) $(UNIXCODER_OBJ)
463471

464472
MAIN_SRC = src/main.c
465473

internal/cbm/cbm.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ typedef struct {
110110
bool is_test;
111111
bool is_entry_point;
112112
const char *structural_profile; // AST structural profile (arena-allocated) or NULL
113+
const char *body_tokens; // space-separated raw identifier tokens from body (arena) or NULL
113114
} CBMDefinition;
114115

115116
/* Argument captured from a call expression */

internal/cbm/extract_defs.c

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,63 @@ static void compute_fingerprint(CBMExtractCtx *ctx, CBMDefinition *def, TSNode f
6666
cbm_ast_profile_to_str(&profile, sp_buf, sizeof(sp_buf));
6767
def->structural_profile = cbm_arena_strdup(ctx->arena, sp_buf);
6868
}
69+
70+
/* Extract raw identifier tokens from body for semantic search.
71+
* Walk the AST, collect unique identifier text, store as space-separated string.
72+
* Cap at ~500 chars to fit in properties_json. */
73+
{
74+
enum { BT_STACK = 512, BT_BUF = 512, BT_MAX_IDENTS = 40, BT_SEEN = 128, BT_SEEN_MASK = 127 };
75+
TSNode bt_stack[BT_STACK];
76+
int bt_top = 0;
77+
bt_stack[bt_top++] = body;
78+
char bt_buf[BT_BUF];
79+
int bt_pos = 0;
80+
uint32_t bt_seen[BT_SEEN];
81+
memset(bt_seen, 0, sizeof(bt_seen));
82+
int bt_count = 0;
83+
84+
while (bt_top > 0 && bt_count < BT_MAX_IDENTS) {
85+
TSNode nd = bt_stack[--bt_top];
86+
uint32_t nc = ts_node_child_count(nd);
87+
if (nc == 0) {
88+
const char *k = ts_node_type(nd);
89+
if (strcmp(k, "identifier") == 0 || strcmp(k, "field_identifier") == 0 ||
90+
strcmp(k, "property_identifier") == 0) {
91+
uint32_t s = ts_node_start_byte(nd);
92+
uint32_t e = ts_node_end_byte(nd);
93+
int len = (int)(e - s);
94+
if (len > 0 && len < CBM_SZ_64 && s < (uint32_t)ctx->source_len) {
95+
/* Dedup via simple hash set */
96+
uint32_t h = 0;
97+
for (int x = 0; x < len; x++) {
98+
h = h * 31 + (uint32_t)(unsigned char)ctx->source[s + x];
99+
}
100+
uint32_t slot = h & BT_SEEN_MASK;
101+
bool dup = false;
102+
for (int p = 0; p < BT_SEEN; p++) {
103+
uint32_t idx = (slot + (uint32_t)p) & BT_SEEN_MASK;
104+
if (bt_seen[idx] == 0) { bt_seen[idx] = h | 1; break; }
105+
if (bt_seen[idx] == (h | 1)) { dup = true; break; }
106+
}
107+
if (!dup && bt_pos + len + 1 < BT_BUF) {
108+
if (bt_pos > 0) { bt_buf[bt_pos++] = ' '; }
109+
memcpy(bt_buf + bt_pos, ctx->source + s, (size_t)len);
110+
bt_pos += len;
111+
bt_count++;
112+
}
113+
}
114+
}
115+
} else {
116+
for (int i = (int)nc - 1; i >= 0 && bt_top < BT_STACK; i--) {
117+
bt_stack[bt_top++] = ts_node_child(nd, (uint32_t)i);
118+
}
119+
}
120+
}
121+
if (bt_pos > 0) {
122+
bt_buf[bt_pos] = '\0';
123+
def->body_tokens = cbm_arena_strdup(ctx->arena, bt_buf);
124+
}
125+
}
69126
}
70127

71128
// Tree-sitter row is 0-based; lines are 1-based.

internal/cbm/sqlite_writer.c

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,23 @@ static uint8_t *build_vector_record(const CBMDumpVector *v, int *out_len) {
746746
return data;
747747
}
748748

749+
// Build a token_vectors table record: (id, project, token, vector, idf)
750+
static uint8_t *build_token_vec_record(const CBMDumpTokenVec *tv, int *out_len) {
751+
RecordBuilder r;
752+
rec_init(&r);
753+
754+
rec_add_int(&r, tv->id);
755+
rec_add_text(&r, tv->project);
756+
rec_add_text(&r, tv->token);
757+
rec_add_blob(&r, tv->vector, tv->vector_len);
758+
/* Store IDF as integer × 1000 for fixed-point (avoid float in record) */
759+
rec_add_int(&r, (int64_t)(tv->idf * 1000.0f));
760+
761+
uint8_t *data = rec_finalize(&r, out_len);
762+
rec_free(&r);
763+
return data;
764+
}
765+
749766
// Build a projects table record: (name, indexed_at, root_path)
750767
static uint8_t *build_project_record(const char *name, const char *indexed_at,
751768
const char *root_path, int *out_len) {
@@ -1492,11 +1509,13 @@ typedef struct {
14921509
int edge_count;
14931510
CBMDumpVector *vectors;
14941511
int vector_count;
1512+
CBMDumpTokenVec *token_vecs;
1513+
int token_vec_count;
14951514
} write_db_ctx_t;
14961515

14971516
/* Phase 1: Write node + edge + vector data tables (streaming). */
14981517
static int write_data_tables(write_db_ctx_t *w, uint32_t *nodes_root, uint32_t *edges_root,
1499-
uint32_t *vectors_root) {
1518+
uint32_t *vectors_root, uint32_t *token_vecs_root) {
15001519
if (w->node_count > 0) {
15011520
PageBuilder pb;
15021521
pb_init(&pb, w->fp, w->next_page, false);
@@ -1552,6 +1571,26 @@ static int write_data_tables(write_db_ctx_t *w, uint32_t *nodes_root, uint32_t *
15521571
} else {
15531572
*vectors_root = write_table_btree(w->fp, &w->next_page, NULL, NULL, NULL, 0, false);
15541573
}
1574+
1575+
/* token_vectors table — enriched RI vectors for query-time lookup */
1576+
if (w->token_vec_count > 0 && w->token_vecs) {
1577+
PageBuilder pb;
1578+
pb_init(&pb, w->fp, w->next_page, false);
1579+
for (int i = 0; i < w->token_vec_count; i++) {
1580+
int rec_len;
1581+
uint8_t *rec = build_token_vec_record(&w->token_vecs[i], &rec_len);
1582+
if (!rec) {
1583+
return ERR_WRITE_FAILED;
1584+
}
1585+
pb_add_table_cell_with_flush(&pb, w->token_vecs[i].id, rec, rec_len,
1586+
i > 0 ? w->token_vecs[i - SKIP_ONE].id : 0);
1587+
free(rec);
1588+
}
1589+
*token_vecs_root = pb_finalize_table(&pb, &w->next_page,
1590+
w->token_vecs[w->token_vec_count - SKIP_ONE].id);
1591+
} else {
1592+
*token_vecs_root = write_table_btree(w->fp, &w->next_page, NULL, NULL, NULL, 0, false);
1593+
}
15551594
return 0;
15561595
}
15571596

@@ -1599,7 +1638,8 @@ static void write_metadata_tables(write_db_ctx_t *w, uint32_t *projects_root,
15991638

16001639
int cbm_write_db(const char *path, const char *project, const char *root_path,
16011640
const char *indexed_at, CBMDumpNode *nodes, int node_count, CBMDumpEdge *edges,
1602-
int edge_count, CBMDumpVector *vectors, int vector_count) {
1641+
int edge_count, CBMDumpVector *vectors, int vector_count,
1642+
CBMDumpTokenVec *token_vecs, int token_vec_count) {
16031643
FILE *fp = fopen(path, "wb");
16041644
if (!fp) {
16051645
return CBM_NOT_FOUND;
@@ -1615,13 +1655,16 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
16151655
.edges = edges,
16161656
.edge_count = edge_count,
16171657
.vectors = vectors,
1618-
.vector_count = vector_count};
1658+
.vector_count = vector_count,
1659+
.token_vecs = token_vecs,
1660+
.token_vec_count = token_vec_count};
16191661

1620-
// Phase 1: Data tables (streaming node + edge + vector records)
1662+
// Phase 1: Data tables (streaming node + edge + vector + token_vector records)
16211663
uint32_t nodes_root;
16221664
uint32_t edges_root;
16231665
uint32_t vectors_root;
1624-
int rc = write_data_tables(&w, &nodes_root, &edges_root, &vectors_root);
1666+
uint32_t token_vecs_root;
1667+
int rc = write_data_tables(&w, &nodes_root, &edges_root, &vectors_root, &token_vecs_root);
16251668
if (rc != 0) {
16261669
(void)fclose(fp);
16271670
return rc;
@@ -1810,6 +1853,10 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
18101853
{"table", "node_vectors", "node_vectors", vectors_root,
18111854
"CREATE TABLE node_vectors (\n\t\tnode_id INTEGER PRIMARY KEY,\n\t\tproject TEXT NOT "
18121855
"NULL,\n\t\tvector BLOB NOT NULL\n\t)"},
1856+
{"table", "token_vectors", "token_vectors", token_vecs_root,
1857+
"CREATE TABLE token_vectors (\n\t\tid INTEGER PRIMARY KEY,\n\t\tproject "
1858+
"TEXT NOT NULL,\n\t\ttoken TEXT NOT NULL,\n\t\tvector BLOB NOT NULL,\n\t\tidf INTEGER "
1859+
"NOT NULL\n\t)"},
18131860
{"table", "sqlite_sequence", "sqlite_sequence", sqlite_seq_root,
18141861
"CREATE TABLE sqlite_sequence(name,seq)"},
18151862
};

internal/cbm/sqlite_writer.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,24 @@ typedef struct {
3434
int vector_len; // length in bytes (e.g. 256 for d=256)
3535
} CBMDumpVector;
3636

37+
// One row of the token_vectors table, passed to cbm_write_db as an array.
typedef struct {
38+
int64_t id; // sequential ID (1..T); written as the table's INTEGER PRIMARY KEY
39+
const char *project;
40+
const char *token; // the token string
41+
const uint8_t *vector; // int8-quantized enriched RI vector blob
42+
int vector_len; // length in bytes, one byte per int8 dimension (presumably CBM_SEM_DIM, e.g. 768 for d=768 — confirm)
43+
float idf; // inverse document frequency weight; the writer stores it as the integer idf × 1000
44+
} CBMDumpTokenVec;
45+
3746
// --- Public API ---
3847

3948
// Write a complete SQLite .db file from sorted in-memory data.
4049
// Constructs B-tree pages directly — no SQL parser, no INSERTs.
4150
// Returns 0 on success, non-zero on error.
42-
// vectors/vector_count may be NULL/0 if no vectors are available.
51+
// vectors/vector_count and token_vecs/token_vec_count may be NULL/0.
4352
int cbm_write_db(const char *path, const char *project, const char *root_path,
4453
const char *indexed_at, CBMDumpNode *nodes, int node_count, CBMDumpEdge *edges,
45-
int edge_count, CBMDumpVector *vectors, int vector_count);
54+
int edge_count, CBMDumpVector *vectors, int vector_count,
55+
CBMDumpTokenVec *token_vecs, int token_vec_count);
4656

4757
#endif // CBM_SQLITE_WRITER_H

src/graph_buffer/graph_buffer.c

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,11 @@ struct cbm_gbuf {
8888
CBMDumpVector *dump_vectors;
8989
int dump_vector_count;
9090
int dump_vector_cap;
91+
92+
/* Token vector storage for enriched RI vectors (query-time lookup). */
93+
CBMDumpTokenVec *dump_token_vecs;
94+
int dump_token_vec_count;
95+
int dump_token_vec_cap;
9196
};
9297

9398
/* ── Helpers ─────────────────────────────────────────────────────── */
@@ -431,6 +436,13 @@ void cbm_gbuf_free(cbm_gbuf_t *gb) {
431436
}
432437
free(gb->dump_vectors);
433438

439+
/* Free token vector storage */
440+
for (int i = 0; i < gb->dump_token_vec_count; i++) {
441+
free((void *)gb->dump_token_vecs[i].token);
442+
free((void *)gb->dump_token_vecs[i].vector);
443+
}
444+
free(gb->dump_token_vecs);
445+
434446
free(gb->project);
435447
free(gb->root_path);
436448
free(gb);
@@ -469,6 +481,42 @@ int cbm_gbuf_store_vector(cbm_gbuf_t *gb, int64_t node_id, const uint8_t *vector
469481
return 0;
470482
}
471483

484+
/* Append an enriched token vector to the buffer so it can be written out by
 * the SQLite dump.
 *
 * Both the token string and the vector bytes are copied, so the caller keeps
 * ownership of its arguments. The entry's project pointer aliases gb->project
 * (it is not duplicated). Entries receive 1-based sequential IDs in insertion
 * order.
 *
 * Returns 0 on success, -1 on invalid arguments or allocation failure. On
 * failure no entry is appended and no partially made copy is leaked. */
int cbm_gbuf_store_token_vector(cbm_gbuf_t *gb, const char *token, const uint8_t *vector,
                                int vector_len, float idf) {
    if (!gb || !token || !vector || vector_len <= 0) {
        return -1;
    }

    /* Grow the entry array geometrically, starting from TV_INIT_CAP. */
    enum { TV_INIT_CAP = 256, TV_GROW = 2 };
    if (gb->dump_token_vec_count >= gb->dump_token_vec_cap) {
        int new_cap = gb->dump_token_vec_cap < TV_INIT_CAP ? TV_INIT_CAP
                                                           : gb->dump_token_vec_cap * TV_GROW;
        CBMDumpTokenVec *grown =
            realloc(gb->dump_token_vecs, (size_t)new_cap * sizeof(CBMDumpTokenVec));
        if (!grown) {
            return -1; /* the existing array is still valid and still owned by gb */
        }
        gb->dump_token_vecs = grown;
        gb->dump_token_vec_cap = new_cap;
    }

    uint8_t *vec_copy = malloc((size_t)vector_len);
    if (!vec_copy) {
        return -1;
    }
    memcpy(vec_copy, vector, (size_t)vector_len);

    /* Make the token copy BEFORE committing the entry: a failed strdup must not
     * leave a NULL token in the array (it would reach the record writer later)
     * nor leak the vector copy. */
    char *token_copy = strdup(token);
    if (!token_copy) {
        free(vec_copy);
        return -1;
    }

    int idx = gb->dump_token_vec_count;
    gb->dump_token_vecs[idx] = (CBMDumpTokenVec){
        .id = idx + SKIP_ONE, /* 1-based sequential ID */
        .project = gb->project,
        .token = token_copy,
        .vector = vec_copy,
        .vector_len = vector_len,
        .idf = idf,
    };
    gb->dump_token_vec_count++;
    return 0;
}
519+
472520
/* ── ID accessors ────────────────────────────────────────────────── */
473521

474522
int64_t cbm_gbuf_next_id(const cbm_gbuf_t *gb) {
@@ -1242,7 +1290,8 @@ int cbm_gbuf_dump_to_sqlite(cbm_gbuf_t *gb, const char *path) {
12421290
* Callers must delete the old .db before calling this (reindex)
12431291
* or ensure no file exists (first index). */
12441292
int rc = cbm_write_db(path, gb->project, gb->root_path, indexed_at, dump_nodes, node_idx,
1245-
dump_edges, edge_idx, gb->dump_vectors, gb->dump_vector_count);
1293+
dump_edges, edge_idx, gb->dump_vectors, gb->dump_vector_count,
1294+
gb->dump_token_vecs, gb->dump_token_vec_count);
12461295

12471296
{
12481297
char b1[CBM_SZ_16];

src/graph_buffer/graph_buffer.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,12 @@ int cbm_gbuf_delete_edges_by_type(cbm_gbuf_t *gb, const char *type);
155155
* Vectors are carried through to cbm_write_db during the dump phase. */
156156
int cbm_gbuf_store_vector(cbm_gbuf_t *gb, int64_t node_id, const uint8_t *vector, int vector_len);
157157

158+
/* Store an enriched token vector for query-time lookup.
159+
* Called by pass_semantic_edges after corpus finalization.
160+
* Token string and vector data are copied. */
161+
int cbm_gbuf_store_token_vector(cbm_gbuf_t *gb, const char *token, const uint8_t *vector,
162+
int vector_len, float idf);
163+
158164
/* ── Dump to SQLite ──────────────────────────────────────────────── */
159165

160166
/* Dump the entire buffer to a SQLite file using the direct page writer.

src/pipeline/pass_definitions.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,11 @@ static void build_def_props(char *buf, size_t bufsize, const CBMDefinition *def)
206206
append_json_string(buf, bufsize, &pos, "sp", def->structural_profile);
207207
}
208208

209+
/* Body tokens */
210+
if (def->body_tokens && pos + CBM_SZ_512 < bufsize) {
211+
append_json_string(buf, bufsize, &pos, "bt", def->body_tokens);
212+
}
213+
209214
if (pos < bufsize - SKIP_ONE) {
210215
buf[pos] = '}';
211216
buf[pos + SKIP_ONE] = '\0';

src/pipeline/pass_parallel.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,11 @@ static void build_def_props(char *buf, size_t bufsize, const CBMDefinition *def)
229229
append_json_string(buf, bufsize, &pos, "sp", def->structural_profile);
230230
}
231231

232+
/* Body tokens — raw identifiers from function body AST for semantic search. */
233+
if (def->body_tokens && pos + CBM_SZ_512 < bufsize) {
234+
append_json_string(buf, bufsize, &pos, "bt", def->body_tokens);
235+
}
236+
232237
if (pos < bufsize - SKIP_ONE) {
233238
buf[pos] = '}';
234239
buf[pos + SKIP_ONE] = '\0';

0 commit comments

Comments
 (0)