Skip to content

Commit 8a0ad04

Browse files
committed
WIP: Algorithmic semantic embeddings + indexing modes (checkpoint before vector storage)
1 parent 1d30971 commit 8a0ad04

15 files changed

Lines changed: 1808 additions & 10 deletions

Makefile.cbm

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -186,11 +186,15 @@ PIPELINE_SRCS = \
186186
src/pipeline/pass_compile_commands.c \
187187
src/pipeline/pass_infrascan.c \
188188
src/pipeline/pass_k8s.c \
189-
src/pipeline/pass_similarity.c
189+
src/pipeline/pass_similarity.c \
190+
src/pipeline/pass_semantic_edges.c
190191

191192
# SimHash / MinHash module
192193
SIMHASH_SRCS = src/simhash/minhash.c
193194

195+
# Semantic embedding module
196+
SEMANTIC_SRCS = src/semantic/semantic.c src/semantic/ast_profile.c
197+
194198
# Traces module (new)
195199
TRACES_SRCS = src/traces/traces.c
196200

@@ -237,7 +241,7 @@ TRE_CFLAGS = -std=c11 -g -O1 -w -Ivendored/tre
237241
YYJSON_SRC = vendored/yyjson/yyjson.c
238242

239243
# All production sources
240-
PROD_SRCS = $(FOUNDATION_SRCS) $(STORE_SRCS) $(CYPHER_SRCS) $(MCP_SRCS) $(DISCOVER_SRCS) $(GRAPH_BUFFER_SRCS) $(PIPELINE_SRCS) $(SIMHASH_SRCS) $(TRACES_SRCS) $(WATCHER_SRCS) $(CLI_SRCS) $(UI_SRCS) $(YYJSON_SRC)
244+
PROD_SRCS = $(FOUNDATION_SRCS) $(STORE_SRCS) $(CYPHER_SRCS) $(MCP_SRCS) $(DISCOVER_SRCS) $(GRAPH_BUFFER_SRCS) $(PIPELINE_SRCS) $(SIMHASH_SRCS) $(SEMANTIC_SRCS) $(TRACES_SRCS) $(WATCHER_SRCS) $(CLI_SRCS) $(UI_SRCS) $(YYJSON_SRC)
241245
EXISTING_C_SRCS = $(EXTRACTION_SRCS) $(LSP_SRCS) $(TS_RUNTIME_SRC) \
242246
$(GRAMMAR_SRCS) $(AC_LZ4_SRCS) $(SQLITE_WRITER_SRC)
243247

@@ -511,7 +515,7 @@ SYSROOT_FLAG = $(if $(SYSROOT),-isysroot $(SYSROOT),)
511515

512516
# Our source files (excluding vendored, grammars, tree-sitter runtime)
513517
LINT_SRCS = $(FOUNDATION_SRCS) $(STORE_SRCS) $(CYPHER_SRCS) $(MCP_SRCS) \
514-
$(DISCOVER_SRCS) $(GRAPH_BUFFER_SRCS) $(PIPELINE_SRCS) $(SIMHASH_SRCS) \
518+
$(DISCOVER_SRCS) $(GRAPH_BUFFER_SRCS) $(PIPELINE_SRCS) $(SIMHASH_SRCS) $(SEMANTIC_SRCS) \
515519
$(TRACES_SRCS) $(WATCHER_SRCS) $(CLI_SRCS) $(EXTRACTION_SRCS) $(AC_LZ4_SRCS) \
516520
$(SQLITE_WRITER_SRC) $(MAIN_SRC)
517521
LINT_HDRS = $(wildcard src/**/*.h src/*.h $(CBM_DIR)/*.h)

internal/cbm/cbm.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ typedef struct {
109109
bool is_abstract;
110110
bool is_test;
111111
bool is_entry_point;
112+
const char *structural_profile; // AST structural profile (arena-allocated) or NULL
112113
} CBMDefinition;
113114

114115
/* Argument captured from a call expression */

internal/cbm/extract_defs.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "lang_specs.h"
55
#include "foundation/constants.h"
66
#include "simhash/minhash.h"
7+
#include "semantic/ast_profile.h"
78
#include "tree_sitter/api.h" // TSNode, ts_node_*
89
#include <stdint.h> // uint32_t
910
#include <string.h>
@@ -50,6 +51,21 @@ static void compute_fingerprint(CBMExtractCtx *ctx, CBMDefinition *def, TSNode f
5051
memcpy(fp, result.values, CBM_MINHASH_K * sizeof(uint32_t));
5152
def->fingerprint = fp;
5253
def->fingerprint_k = CBM_MINHASH_K;
54+
55+
/* AST structural profile (signals 8, 9, 11) — rides the same body node */
56+
cbm_ast_profile_t profile;
57+
int pc = 0;
58+
if (def->param_names) {
59+
while (def->param_names[pc]) {
60+
pc++;
61+
}
62+
}
63+
if (cbm_ast_profile_compute(body, ctx->source, def->param_names, pc, &profile)) {
64+
profile.body_lines = (uint16_t)def->lines;
65+
char sp_buf[CBM_AST_PROFILE_BUF];
66+
cbm_ast_profile_to_str(&profile, sp_buf, sizeof(sp_buf));
67+
def->structural_profile = cbm_arena_strdup(ctx->arena, sp_buf);
68+
}
5369
}
5470

5571
// Tree-sitter row is 0-based; lines are 1-based.

src/mcp/mcp.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1678,6 +1678,8 @@ static char *handle_index_repository(cbm_mcp_server_t *srv, const char *args) {
16781678
cbm_index_mode_t mode = CBM_MODE_FULL;
16791679
if (mode_str && strcmp(mode_str, "fast") == 0) {
16801680
mode = CBM_MODE_FAST;
1681+
} else if (mode_str && strcmp(mode_str, "moderate") == 0) {
1682+
mode = CBM_MODE_MODERATE;
16811683
}
16821684
free(mode_str);
16831685

src/pipeline/pass_definitions.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ enum { PD_RING = 4, PD_RING_MASK = 3, PD_JSON_MARGIN = 10, PD_ESC_MARGIN = 3, PD
2121
#include "foundation/compat.h"
2222
#include "cbm.h"
2323
#include "simhash/minhash.h"
24+
#include "semantic/ast_profile.h"
2425

2526
#include <stdio.h>
2627
#include <stdlib.h>
@@ -200,6 +201,11 @@ static void build_def_props(char *buf, size_t bufsize, const CBMDefinition *def)
200201
append_json_string(buf, bufsize, &pos, "fp", fp_hex);
201202
}
202203

204+
/* AST structural profile */
205+
if (def->structural_profile && pos + CBM_AST_PROFILE_BUF < bufsize) {
206+
append_json_string(buf, bufsize, &pos, "sp", def->structural_profile);
207+
}
208+
203209
if (pos < bufsize - SKIP_ONE) {
204210
buf[pos] = '}';
205211
buf[pos + SKIP_ONE] = '\0';

src/pipeline/pass_parallel.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ enum {
4040
#include "foundation/compat_regex.h"
4141
#include "cbm.h"
4242
#include "simhash/minhash.h"
43+
#include "semantic/ast_profile.h"
4344

4445
#include <stdatomic.h>
4546
#include <stdint.h>
@@ -223,6 +224,11 @@ static void build_def_props(char *buf, size_t bufsize, const CBMDefinition *def)
223224
append_json_string(buf, bufsize, &pos, "fp", fp_hex);
224225
}
225226

227+
/* AST structural profile — append if present and buffer has room. */
228+
if (def->structural_profile && pos + CBM_AST_PROFILE_BUF < bufsize) {
229+
append_json_string(buf, bufsize, &pos, "sp", def->structural_profile);
230+
}
231+
226232
if (pos < bufsize - SKIP_ONE) {
227233
buf[pos] = '}';
228234
buf[pos + SKIP_ONE] = '\0';

0 commit comments

Comments
 (0)