Skip to content

Commit 7a9b7db

Browse files
committed
Add semantic_query to search_graph: vector search via cbm_cosine_i8 SQL function
Vector search pipeline (end-to-end):
- store.c: add the cbm_cosine_i8 custom SQLite function (int8 dot product with sqrt-magnitude normalization), registered when the store is opened.
- store.c: add cbm_store_vector_search(), which builds a merged query vector from the keywords using base RI vectors (xxHash), int8-quantizes it, and runs a SQL query that JOINs nodes ON node_id = id to fetch metadata.
- mcp.c: search_graph gains a semantic_query parameter (array of strings) and returns semantic_results alongside the regular results.
- sqlite_writer.c: fix build_vector_record to include node_id in the record body (matches the build_node_record pattern).
- mcp.c: use yyjson_mut_obj_add_strcpy (not add_str) to avoid a use-after-free when the vector results are freed before JSON serialization.
Quality note: the base RI vectors used at query time do not capture co-occurrence enrichment, so results are of moderate quality. Storing enriched token vectors would improve this significantly (future work).
1 parent 8a0ad04 commit 7a9b7db

9 files changed

Lines changed: 405 additions & 13 deletions

File tree

internal/cbm/sqlite_writer.c

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,9 @@ enum {
107107
// SQLite text serial type offset: serial_type = len*2 + TEXT_SERIAL_BASE.
108108
#define TEXT_SERIAL_BASE 13
109109

110+
// SQLite blob serial type offset: serial_type = len*2 + BLOB_SERIAL_BASE.
111+
#define BLOB_SERIAL_BASE 12
112+
110113
// SQLite integer storage range limits.
111114
#define INT8_MAX_VAL 127
112115
#define INT16_MAX_VAL 32767
@@ -360,6 +363,16 @@ static void rec_add_text(RecordBuilder *r, const char *s) {
360363
}
361364
}
362365

366+
// Append a BLOB column to the record under construction.
//
// The column's serial type is written to the record header as a varint and,
// when there is a payload, the raw bytes are appended to the record body.
// A non-empty payload is encoded as len*2 + BLOB_SERIAL_BASE; anything else
// is encoded as serial type 0 (SQL NULL).
//
// A NULL `data` pointer is treated as "no payload" regardless of `len`, so the
// header can never advertise bytes that are absent from the body (which would
// misalign every following column and corrupt the record).
//
//   r    - record builder receiving the column
//   data - blob bytes; may be NULL
//   len  - number of bytes at `data`; values <= 0 encode as NULL
static void rec_add_blob(RecordBuilder *r, const uint8_t *data, int len) {
    // Single source of truth for both the header entry and the body write.
    const int has_payload = (data && len > 0);

    int64_t st = has_payload ? (int64_t)len * 2 + BLOB_SERIAL_BASE : 0;
    uint8_t vbuf[VARINT_MAX_BYTES];
    int vlen = put_varint(vbuf, st);
    dynbuf_append(&r->header, vbuf, vlen);

    if (has_payload) {
        dynbuf_append(&r->body, data, len);
    }
}
375+
363376
// Finalize: returns the complete record bytes (header_len + header + body).
364377
// Caller must free the returned buffer.
365378
static uint8_t *rec_finalize(RecordBuilder *r, int *out_len) {
@@ -718,6 +731,21 @@ static uint8_t *build_edge_record(const CBMDumpEdge *e, int *out_len) {
718731
return data;
719732
}
720733

734+
// Build a node_vectors table record: (node_id, project, vector)
735+
// Includes node_id in the record body (same pattern as build_node_record).
736+
static uint8_t *build_vector_record(const CBMDumpVector *v, int *out_len) {
737+
RecordBuilder r;
738+
rec_init(&r);
739+
740+
rec_add_int(&r, v->node_id);
741+
rec_add_text(&r, v->project);
742+
rec_add_blob(&r, v->vector, v->vector_len);
743+
744+
uint8_t *data = rec_finalize(&r, out_len);
745+
rec_free(&r);
746+
return data;
747+
}
748+
721749
// Build a projects table record: (name, indexed_at, root_path)
722750
static uint8_t *build_project_record(const char *name, const char *indexed_at,
723751
const char *root_path, int *out_len) {
@@ -1462,10 +1490,13 @@ typedef struct {
14621490
int node_count;
14631491
CBMDumpEdge *edges;
14641492
int edge_count;
1493+
CBMDumpVector *vectors;
1494+
int vector_count;
14651495
} write_db_ctx_t;
14661496

1467-
/* Phase 1: Write node + edge data tables (streaming). */
1468-
static int write_data_tables(write_db_ctx_t *w, uint32_t *nodes_root, uint32_t *edges_root) {
1497+
/* Phase 1: Write node + edge + vector data tables (streaming). */
1498+
static int write_data_tables(write_db_ctx_t *w, uint32_t *nodes_root, uint32_t *edges_root,
1499+
uint32_t *vectors_root) {
14691500
if (w->node_count > 0) {
14701501
PageBuilder pb;
14711502
pb_init(&pb, w->fp, w->next_page, false);
@@ -1501,6 +1532,26 @@ static int write_data_tables(write_db_ctx_t *w, uint32_t *nodes_root, uint32_t *
15011532
} else {
15021533
*edges_root = write_table_btree(w->fp, &w->next_page, NULL, NULL, NULL, 0, false);
15031534
}
1535+
1536+
/* node_vectors table — uses node_id as rowid (not AUTOINCREMENT) */
1537+
if (w->vector_count > 0 && w->vectors) {
1538+
PageBuilder pb;
1539+
pb_init(&pb, w->fp, w->next_page, false);
1540+
for (int i = 0; i < w->vector_count; i++) {
1541+
int rec_len;
1542+
uint8_t *rec = build_vector_record(&w->vectors[i], &rec_len);
1543+
if (!rec) {
1544+
return ERR_WRITE_FAILED;
1545+
}
1546+
pb_add_table_cell_with_flush(&pb, w->vectors[i].node_id, rec, rec_len,
1547+
i > 0 ? w->vectors[i - SKIP_ONE].node_id : 0);
1548+
free(rec);
1549+
}
1550+
*vectors_root =
1551+
pb_finalize_table(&pb, &w->next_page, w->vectors[w->vector_count - SKIP_ONE].node_id);
1552+
} else {
1553+
*vectors_root = write_table_btree(w->fp, &w->next_page, NULL, NULL, NULL, 0, false);
1554+
}
15041555
return 0;
15051556
}
15061557

@@ -1548,7 +1599,7 @@ static void write_metadata_tables(write_db_ctx_t *w, uint32_t *projects_root,
15481599

15491600
int cbm_write_db(const char *path, const char *project, const char *root_path,
15501601
const char *indexed_at, CBMDumpNode *nodes, int node_count, CBMDumpEdge *edges,
1551-
int edge_count) {
1602+
int edge_count, CBMDumpVector *vectors, int vector_count) {
15521603
FILE *fp = fopen(path, "wb");
15531604
if (!fp) {
15541605
return CBM_NOT_FOUND;
@@ -1562,12 +1613,15 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
15621613
.nodes = nodes,
15631614
.node_count = node_count,
15641615
.edges = edges,
1565-
.edge_count = edge_count};
1616+
.edge_count = edge_count,
1617+
.vectors = vectors,
1618+
.vector_count = vector_count};
15661619

1567-
// Phase 1: Data tables (streaming node + edge records)
1620+
// Phase 1: Data tables (streaming node + edge + vector records)
15681621
uint32_t nodes_root;
15691622
uint32_t edges_root;
1570-
int rc = write_data_tables(&w, &nodes_root, &edges_root);
1623+
uint32_t vectors_root;
1624+
int rc = write_data_tables(&w, &nodes_root, &edges_root, &vectors_root);
15711625
if (rc != 0) {
15721626
(void)fclose(fp);
15731627
return rc;
@@ -1753,6 +1807,9 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
17531807
"NULL,\n\t\t\tupdated_at TEXT NOT NULL\n\t\t)"},
17541808
{"index", "sqlite_autoindex_project_summaries_1", "project_summaries",
17551809
autoindex_summaries_root, NULL},
1810+
{"table", "node_vectors", "node_vectors", vectors_root,
1811+
"CREATE TABLE node_vectors (\n\t\tnode_id INTEGER PRIMARY KEY,\n\t\tproject TEXT NOT "
1812+
"NULL,\n\t\tvector BLOB NOT NULL\n\t)"},
17561813
{"table", "sqlite_sequence", "sqlite_sequence", sqlite_seq_root,
17571814
"CREATE TABLE sqlite_sequence(name,seq)"},
17581815
};

internal/cbm/sqlite_writer.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,21 @@ typedef struct {
2727
const char *url_path; // extracted from properties by Go (for idx_edges_url_path)
2828
} CBMDumpEdge;
2929

30+
// One row of the node_vectors table, as handed to cbm_write_db.
typedef struct {
    // Final sequential ID (matches nodes.id).
    int64_t node_id;
    // Value for the row's project column.
    const char *project;
    // Int8-quantized vector blob.
    const uint8_t *vector;
    // Length of `vector` in bytes (e.g. 256 for d=256).
    int vector_len;
} CBMDumpVector;
36+
3037
// --- Public API ---
3138

3239
// Write a complete SQLite .db file from sorted in-memory data.
3340
// Constructs B-tree pages directly — no SQL parser, no INSERTs.
3441
// Returns 0 on success, non-zero on error.
42+
// vectors/vector_count may be NULL/0 if no vectors are available.
3543
int cbm_write_db(const char *path, const char *project, const char *root_path,
3644
const char *indexed_at, CBMDumpNode *nodes, int node_count, CBMDumpEdge *edges,
37-
int edge_count);
45+
int edge_count, CBMDumpVector *vectors, int vector_count);
3846

3947
#endif // CBM_SQLITE_WRITER_H

src/graph_buffer/graph_buffer.c

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,12 @@ struct cbm_gbuf {
8282
CBMHashTable *edges_by_source_type; /* "srcID:type" → edge_ptr_array_t* */
8383
CBMHashTable *edges_by_target_type; /* "tgtID:type" → edge_ptr_array_t* */
8484
CBMHashTable *edges_by_type; /* "type" → edge_ptr_array_t* */
85+
86+
/* Vector storage for semantic embeddings (filled by pass_semantic_edges,
87+
* consumed by cbm_write_db during dump). */
88+
CBMDumpVector *dump_vectors;
89+
int dump_vector_count;
90+
int dump_vector_cap;
8591
};
8692

8793
/* ── Helpers ─────────────────────────────────────────────────────── */
@@ -419,11 +425,50 @@ void cbm_gbuf_free(cbm_gbuf_t *gb) {
419425
cbm_ht_free(gb->edges_by_type);
420426
}
421427

428+
/* Free vector storage */
429+
for (int i = 0; i < gb->dump_vector_count; i++) {
430+
free((void *)gb->dump_vectors[i].vector);
431+
}
432+
free(gb->dump_vectors);
433+
422434
free(gb->project);
423435
free(gb->root_path);
424436
free(gb);
425437
}
426438

439+
/* ── Vector storage ──────────────────────────────────────────────── */
440+
441+
/* Append an int8-quantized vector for `node_id` to the buffer's dump list.
 *
 * The vector bytes are copied into a heap buffer owned by the graph buffer
 * (released in cbm_gbuf_free). The entry's `project` pointer is borrowed from
 * gb->project and stays valid until the buffer is freed.
 *
 * The backing array starts at VEC_INIT_CAP entries and is multiplied by
 * VEC_GROW whenever it is full.
 *
 * Returns 0 on success, -1 on invalid arguments (NULL gb, NULL vector,
 * vector_len <= 0) or allocation failure. */
int cbm_gbuf_store_vector(cbm_gbuf_t *gb, int64_t node_id, const uint8_t *vector, int vector_len) {
    enum { VEC_INIT_CAP = 1024, VEC_GROW = 2 };

    if (!gb) {
        return -1;
    }
    if (!vector || vector_len <= 0) {
        return -1;
    }

    /* Ensure there is room for one more entry. */
    if (gb->dump_vector_count >= gb->dump_vector_cap) {
        int wanted = VEC_INIT_CAP;
        if (gb->dump_vector_cap >= VEC_INIT_CAP) {
            wanted = gb->dump_vector_cap * VEC_GROW;
        }
        CBMDumpVector *resized = realloc(gb->dump_vectors, (size_t)wanted * sizeof(CBMDumpVector));
        if (!resized) {
            return -1;
        }
        gb->dump_vector_cap = wanted;
        gb->dump_vectors = resized;
    }

    /* Take a private copy so the caller may reuse or discard its buffer. */
    const size_t nbytes = (size_t)vector_len;
    uint8_t *owned = malloc(nbytes);
    if (!owned) {
        return -1;
    }
    memcpy(owned, vector, nbytes);

    CBMDumpVector *slot = &gb->dump_vectors[gb->dump_vector_count];
    slot->node_id = node_id;
    slot->project = gb->project; /* borrowed — valid until gbuf_free */
    slot->vector = owned;
    slot->vector_len = vector_len;
    gb->dump_vector_count++;

    return 0;
}
471+
427472
/* ── ID accessors ────────────────────────────────────────────────── */
428473

429474
int64_t cbm_gbuf_next_id(const cbm_gbuf_t *gb) {
@@ -1197,7 +1242,7 @@ int cbm_gbuf_dump_to_sqlite(cbm_gbuf_t *gb, const char *path) {
11971242
* Callers must delete the old .db before calling this (reindex)
11981243
* or ensure no file exists (first index). */
11991244
int rc = cbm_write_db(path, gb->project, gb->root_path, indexed_at, dump_nodes, node_idx,
1200-
dump_edges, edge_idx);
1245+
dump_edges, edge_idx, gb->dump_vectors, gb->dump_vector_count);
12011246

12021247
{
12031248
char b1[CBM_SZ_16];

src/graph_buffer/graph_buffer.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,13 @@ int cbm_gbuf_edge_count_by_type(const cbm_gbuf_t *gb, const char *type);
148148
/* Delete all edges of a type. */
149149
int cbm_gbuf_delete_edges_by_type(cbm_gbuf_t *gb, const char *type);
150150

151+
/* ── Vector storage (for semantic embeddings) ───────────────────── */
152+
153+
/* Store an int8-quantized vector for a node. The vector data is copied.
154+
* Called by pass_semantic_edges after computing RI vectors.
155+
* Vectors are carried through to cbm_write_db during the dump phase. */
156+
int cbm_gbuf_store_vector(cbm_gbuf_t *gb, int64_t node_id, const uint8_t *vector, int vector_len);
157+
151158
/* ── Dump to SQLite ──────────────────────────────────────────────── */
152159

153160
/* Dump the entire buffer to a SQLite file using the direct page writer.

src/mcp/mcp.c

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,9 @@ static const tool_def_t TOOLS[] = {
269269
"\"string\"},\"name_pattern\":{\"type\":\"string\"},\"qn_pattern\":{\"type\":\"string\"},"
270270
"\"file_pattern\":{\"type\":\"string\"},\"relationship\":{\"type\":\"string\"},\"min_degree\":"
271271
"{\"type\":\"integer\"},\"max_degree\":{\"type\":\"integer\"},\"exclude_entry_points\":{"
272-
"\"type\":\"boolean\"},\"include_connected\":{\"type\":\"boolean\"},\"limit\":{\"type\":"
272+
"\"type\":\"boolean\"},\"include_connected\":{\"type\":\"boolean\"},\"semantic_query\":{"
273+
"\"type\":\"array\",\"items\":{\"type\":\"string\"},\"description\":\"Keywords for semantic "
274+
"vector search (requires moderate/full index mode)\"},\"limit\":{\"type\":"
273275
"\"integer\",\"description\":\"Max results. Default: "
274276
"unlimited\"},\"offset\":{\"type\":\"integer\",\"default\":0}},\"required\":[\"project\"]}"},
275277

@@ -1099,6 +1101,56 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) {
10991101
yyjson_mut_obj_add_val(doc, root, "results", results);
11001102
yyjson_mut_obj_add_bool(doc, root, "has_more", out.total > offset + out.count);
11011103

1104+
/* Semantic vector search: parse semantic_query array and run vector search */
1105+
{
1106+
yyjson_doc *args_doc = yyjson_read(args, strlen(args), 0);
1107+
yyjson_val *args_root = args_doc ? yyjson_doc_get_root(args_doc) : NULL;
1108+
yyjson_val *sq_arr = args_root ? yyjson_obj_get(args_root, "semantic_query") : NULL;
1109+
if (sq_arr && yyjson_is_arr(sq_arr)) {
1110+
int kw_count = (int)yyjson_arr_size(sq_arr);
1111+
if (kw_count > 0) {
1112+
enum { MAX_KW = 32 };
1113+
const char *keywords[MAX_KW];
1114+
if (kw_count > MAX_KW) {
1115+
kw_count = MAX_KW;
1116+
}
1117+
size_t kw_idx = 0;
1118+
size_t kw_max = 0;
1119+
yyjson_val *kw_val;
1120+
int ki = 0;
1121+
yyjson_arr_foreach(sq_arr, kw_idx, kw_max, kw_val) {
1122+
if (ki < kw_count && yyjson_is_str(kw_val)) {
1123+
keywords[ki++] = yyjson_get_str(kw_val);
1124+
}
1125+
}
1126+
1127+
cbm_vector_result_t *vresults = NULL;
1128+
int vcount = 0;
1129+
int sem_limit = limit > 0 ? limit : CBM_SZ_16;
1130+
if (cbm_store_vector_search(store, project, keywords, ki, sem_limit, &vresults,
1131+
&vcount) == CBM_STORE_OK &&
1132+
vcount > 0) {
1133+
yyjson_mut_val *sem_results = yyjson_mut_arr(doc);
1134+
for (int v = 0; v < vcount; v++) {
1135+
yyjson_mut_val *vitem = yyjson_mut_obj(doc);
1136+
yyjson_mut_obj_add_strcpy(doc, vitem, "name", vresults[v].name);
1137+
yyjson_mut_obj_add_strcpy(doc, vitem, "qualified_name",
1138+
vresults[v].qualified_name);
1139+
yyjson_mut_obj_add_strcpy(doc, vitem, "label", vresults[v].label);
1140+
yyjson_mut_obj_add_strcpy(doc, vitem, "file_path", vresults[v].file_path);
1141+
yyjson_mut_obj_add_real(doc, vitem, "score", vresults[v].score);
1142+
yyjson_mut_arr_add_val(sem_results, vitem);
1143+
}
1144+
yyjson_mut_obj_add_val(doc, root, "semantic_results", sem_results);
1145+
cbm_store_free_vector_results(vresults, vcount);
1146+
}
1147+
}
1148+
}
1149+
if (args_doc) {
1150+
yyjson_doc_free(args_doc);
1151+
}
1152+
}
1153+
11021154
char *json = yy_doc_to_str(doc);
11031155
yyjson_mut_doc_free(doc);
11041156
cbm_store_search_free(&out);

src/pipeline/pass_semantic_edges.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,24 @@ int cbm_pipeline_pass_semantic_edges(cbm_pipeline_ctx_t *ctx) {
383383
}
384384
}
385385
cbm_sem_normalize(&funcs[f].ri_vec);
386+
387+
/* Int8-quantize and store in graph buffer for vector search at query time */
388+
uint8_t qvec[CBM_SEM_DIM];
389+
for (int d = 0; d < CBM_SEM_DIM; d++) {
390+
float clamped = funcs[f].ri_vec.v[d];
391+
if (clamped > 1.0f) {
392+
clamped = 1.0f;
393+
}
394+
if (clamped < -1.0f) {
395+
clamped = -1.0f;
396+
}
397+
qvec[d] = (uint8_t)(int8_t)(clamped * 127.0f);
398+
}
399+
cbm_gbuf_store_vector(gbuf, funcs[f].node_id, qvec, CBM_SEM_DIM);
386400
}
387401

402+
cbm_log_info("pass.semantic.vectors_stored", "count", itoa_log(func_count));
403+
388404
/* Phase 5: Build cosine-LSH index using random hyperplanes on RI vectors.
389405
* This gives O(n) candidate generation instead of O(n²) brute-force.
390406
*

0 commit comments

Comments
 (0)