Skip to content

Commit 007980c

Browse files
committed
Fix C++ SEGV: NULL deref in LSP type resolver on large header files
Root cause: c_eval_expr_type_inner reads ptr->kind on CBMType* pointers in 42 places. Some code paths (cbm_type_substitute, internal lookups) return NULL for unusual C++ AST shapes (deeply nested templates, 300+ definitions per file), and reading ->kind through a NULL pointer raises SIGSEGV. Fix: a safe_kind() inline helper returns CBM_TYPE_UNKNOWN for NULL pointers, and all 42 ->kind accesses in c_eval_expr_type_inner now go through it. Also added: a recursion depth guard (256) for type evaluation, cbm_type_substitute now returns cbm_type_unknown() for NULL input, and walk_usages/walk_env have depth limits. Verified: spdlog (previously crashed) now indexes to 2526 nodes and 5518 edges. All 2586 tests pass.
1 parent babbebc commit 007980c

7 files changed

Lines changed: 181 additions & 50 deletions

File tree

internal/cbm/extract_env_accesses.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,13 @@ static bool is_env_var_name(const char *s) {
131131
return has_upper;
132132
}
133133

134+
#define WALK_ENV_MAX_DEPTH 4096
135+
134136
// NOLINTNEXTLINE(misc-no-recursion) — intentional AST tree walk
135-
static void walk_env_accesses(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec) {
137+
static void walk_env_inner(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec, int depth) {
138+
if (depth > WALK_ENV_MAX_DEPTH) {
139+
return;
140+
}
136141
const char *kind = ts_node_type(node);
137142
const char *env_key = NULL;
138143

@@ -158,10 +163,14 @@ static void walk_env_accesses(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec
158163
// Recurse
159164
uint32_t count = ts_node_child_count(node);
160165
for (uint32_t i = 0; i < count; i++) {
161-
walk_env_accesses(ctx, ts_node_child(node, i), spec);
166+
walk_env_inner(ctx, ts_node_child(node, i), spec, depth + 1);
162167
}
163168
}
164169

170+
// Entry point for the env-access walk: forwards to walk_env_inner with an
// initial depth of 0, so the WALK_ENV_MAX_DEPTH cutoff is counted from `node`.
static void walk_env_accesses(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec) {
171+
walk_env_inner(ctx, node, spec, 0);
172+
}
173+
165174
void cbm_extract_env_accesses(CBMExtractCtx *ctx) {
166175
const CBMLangSpec *spec = cbm_lang_spec(ctx->language);
167176
if (!spec) {

internal/cbm/extract_usages.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,14 @@ static bool is_reference_node(TSNode node, CBMLanguage lang) {
7676
}
7777

7878
// NOLINTNEXTLINE(misc-no-recursion) — intentional AST tree walk
79-
static void walk_usages(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec) {
79+
/* Max recursion depth — prevents stack overflow on deeply nested C++ templates.
80+
* 8MB stack / ~256 bytes per frame ≈ 32K max, use 4K as safe limit. */
81+
#define WALK_USAGES_MAX_DEPTH 4096
82+
83+
static void walk_usages_inner(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec, int depth) {
84+
if (depth > WALK_USAGES_MAX_DEPTH) {
85+
return;
86+
}
8087
if (is_reference_node(node, ctx->language)) {
8188
// Skip if inside a call (already counted as CALLS edge)
8289
if (is_inside_call(node, spec)) {
@@ -110,10 +117,14 @@ static void walk_usages(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec
110117
recurse:;
111118
uint32_t count = ts_node_child_count(node);
112119
for (uint32_t i = 0; i < count; i++) {
113-
walk_usages(ctx, ts_node_child(node, i), spec);
120+
walk_usages_inner(ctx, ts_node_child(node, i), spec, depth + 1);
114121
}
115122
}
116123

124+
/* Entry point for the usage walk: forwards to walk_usages_inner with an
 * initial depth of 0, so the WALK_USAGES_MAX_DEPTH cutoff is counted from
 * `node`. */
static void walk_usages(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec) {
125+
walk_usages_inner(ctx, node, spec, 0);
126+
}
127+
117128
void cbm_extract_usages(CBMExtractCtx *ctx) {
118129
const CBMLangSpec *spec = cbm_lang_spec(ctx->language);
119130
if (!spec) {

internal/cbm/lsp/c_lsp.c

Lines changed: 62 additions & 42 deletions
Large diffs are not rendered by default.

internal/cbm/lsp/c_lsp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ typedef struct {
7474
bool cpp_mode; // C++ features enabled
7575
bool in_template; // currently inside template declaration
7676
bool debug;
77+
int eval_depth; // recursion depth for c_eval_expr_type (crash guard)
7778
} CLSPContext;
7879

7980
// --- API ---

internal/cbm/lsp/type_rep.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,8 @@ const CBMType* cbm_type_resolve_alias(const CBMType* t) {
231231
// Generic substitution: recursively replace TYPE_PARAM with concrete types.
232232
const CBMType* cbm_type_substitute(CBMArena* a, const CBMType* t,
233233
const char** type_params, const CBMType** type_args) {
234-
if (!t || !type_params || !type_args) return t;
234+
if (!t) return cbm_type_unknown();
235+
if (!type_params || !type_args) return t;
235236

236237
switch (t->kind) {
237238
case CBM_TYPE_TYPE_PARAM: {

scripts/benchmark-index.sh

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env bash
set -euo pipefail

# Index a single benchmark repository and capture metrics.
# Usage: benchmark-index.sh <binary> <lang> <repo_path> <results_dir>
#
# Writes into <results_dir>/<lang>/:
#   file-count.txt  loc.txt  00-index.json  index-time.txt
#   nodes.txt       edges.txt               project.txt

BINARY="${1:?Usage: benchmark-index.sh <binary> <lang> <repo_path> <results_dir>}"
# Deliberately NOT stored in LANG: that name is the POSIX locale variable, and
# reassigning it would change the locale of every child process started below.
BENCH_LANG="${2:?}"
REPO="${3:?}"
RESULTS_DIR="${4:?}"

# Resolve symlinks
REPO=$(cd "$REPO" && pwd -P)

OUT="$RESULTS_DIR/$BENCH_LANG"
mkdir -p "$OUT"

echo "INDEX: $BENCH_LANG ($REPO)"

# find(1) predicates that skip VCS, dependency, build and cache directories.
EXCLUDES=(
    ! -path '*/.git/*' ! -path '*/node_modules/*' ! -path '*/vendor/*'
    ! -path '*/target/*' ! -path '*/build/*' ! -path '*/dist/*'
    ! -path '*/__pycache__/*' ! -path '*/.cache/*'
)

# Print the current wall-clock time in milliseconds.
now_ms() {
    python3 -c "import time; print(int(time.time()*1000))"
}

# json_field <key> <default>
# Read the index result (JSON) from stdin and print the value of <key>,
# or <default> when the key is absent. Exits non-zero on unparsable input.
json_field() {
    python3 -c '
import json, sys
d = json.load(sys.stdin)
# Unwrap MCP content envelope if present
inner = json.loads(d["content"][0]["text"]) if "content" in d else d
print(inner.get(sys.argv[1], sys.argv[2]))
' "$1" "$2" 2>/dev/null
}

# Count all regular files and their total line count outside the excluded
# directories. find exits non-zero when it hits an unreadable path; under
# `set -e -o pipefail` that would abort the run, so tolerate it and count
# whatever could be read.
FILE_COUNT=$({ find "$REPO" -type f "${EXCLUDES[@]}" || true; } | wc -l | tr -d ' ')

LOC=$({ find "$REPO" -type f "${EXCLUDES[@]}" -exec cat {} + 2>/dev/null || true; } \
    | wc -l | tr -d ' ')

echo "$FILE_COUNT" > "$OUT/file-count.txt"
echo "$LOC" > "$OUT/loc.txt"

# Build the request with a JSON encoder so repo paths containing quotes or
# backslashes are escaped correctly. Done before the timer starts.
REQUEST=$(python3 -c '
import json, sys
print(json.dumps({"repo_path": sys.argv[1], "mode": "full"}, separators=(",", ":")))
' "$REPO")

# Index via CLI and capture timing
START_MS=$(now_ms)

INDEX_JSON=$("$BINARY" cli index_repository "$REQUEST" 2>/dev/null || echo '{"error":"index failed"}')

END_MS=$(now_ms)
ELAPSED=$((END_MS - START_MS))

echo "$INDEX_JSON" > "$OUT/00-index.json"
echo "$ELAPSED" > "$OUT/index-time.txt"

# Extract node/edge counts (CLI wraps in MCP content envelope)
NODES=$(printf '%s\n' "$INDEX_JSON" | json_field nodes 0 || echo "0")
EDGES=$(printf '%s\n' "$INDEX_JSON" | json_field edges 0 || echo "0")
PROJECT=$(printf '%s\n' "$INDEX_JSON" | json_field project '' || echo "")

echo "$NODES" > "$OUT/nodes.txt"
echo "$EDGES" > "$OUT/edges.txt"
echo "$PROJECT" > "$OUT/project.txt"

printf " %s: %s files, %s LOC, %sms, %s nodes, %s edges\n" \
"$BENCH_LANG" "$FILE_COUNT" "$LOC" "$ELAPSED" "$NODES" "$EDGES"

scripts/clone-bench-repos.sh

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ symlink() {
3232

3333
mkdir -p "$BENCH_DIR"
3434

35-
# Programming languages — Tier 1 (39 languages, 27 original + 12 new)
35+
# Programming languages — Tier 1 (44 languages)
3636
clone go "go-chi/chi"
3737
clone python "httpie/cli"
3838
clone javascript "expressjs/express"
@@ -43,7 +43,7 @@ clone kotlin "JetBrains/Exposed"
4343
clone scala "playframework/play-samples"
4444
clone rust "meilisearch/meilisearch"
4545
clone c "redis/redis"
46-
clone cpp "nlohmann/json"
46+
clone cpp "google/leveldb"
4747
clone csharp "ardalis/CleanArchitecture"
4848
clone php "koel/koel"
4949
clone ruby "sinatra/sinatra"
@@ -71,8 +71,12 @@ clone fortran "cp2k/cp2k"
7171
clone cobol "OCamlPro/gnucobol"
7272
clone verilog "YosysHQ/yosys"
7373
clone emacslisp "emacs-mirror/emacs"
74+
clone matlab "acristoffers/tree-sitter-matlab"
75+
clone lean "leanprover-community/mathlib4"
76+
clone form "vermaseren/form"
77+
clone wolfram "WolframResearch/WolframLanguageForJupyter"
7478

75-
# Helper languages — Tier 2 (20 languages, 8 original + 12 new)
79+
# Helper languages — Tier 2 (22 languages)
7680
clone yaml "kubernetes/examples"
7781
clone hcl "terraform-aws-modules/terraform-aws-eks"
7882
clone scss "twbs/bootstrap"
@@ -96,6 +100,9 @@ symlink markdown python # httpie docs
96100
symlink makefile c # redis Makefile
97101
clone glsl "repalash/Open-Shaders"
98102
symlink ini python # httpie .cfg/.ini files
103+
symlink magma lean # .m files — disambiguated via content markers
104+
symlink kubernetes yaml # YAML subtype — Deployment/Service manifests
105+
symlink kustomize yaml # YAML subtype — kustomization.yaml
99106

100107
echo ""
101108
echo "=== Clone complete ==="

0 commit comments

Comments
 (0)