Skip to content

Commit 0aa615a

Browse files
author
Your Name
committed
feat(cypher): NOT EXISTS subquery with optimized edge lookup
Adds WHERE NOT EXISTS { MATCH (caller)-[e]->(n) WHERE e.type = 'CALLS' } support for anti-join queries such as dead-code detection.

Parser: extends parse_not_expr to recognize NOT EXISTS { MATCH ... WHERE ... } as a correlated subquery. It creates an EXPR_NOT_EXISTS expression node with sub_pattern and sub_where fields.

Executor: two evaluation paths, chosen for performance:
- Fast path (O(1) per node): when the inner pattern has exactly 1 hop and one endpoint is bound from the outer scope, edges are queried directly by source/target ID. No full node scan is needed.
- Slow path: full subquery expansion for complex/multi-hop patterns.

Parameter threading: eval_expr and eval_where now accept (store, project, max_rows) parameters to support correlated subquery expansion. All 5 call sites are updated.

Enables queries like:
MATCH (n:Function) WHERE NOT EXISTS { MATCH (caller)-[e]->(n) WHERE e.type = 'CALLS' } RETURN n.name, n.file_path LIMIT 20

Tested: finds 10 dead functions in a 216-function JS codebase in under 1 second.
1 parent 8021d94 commit 0aa615a

2 files changed

Lines changed: 249 additions & 15 deletions

File tree

src/cypher/cypher.c

Lines changed: 244 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,39 @@ static void expr_free(cbm_expr_t *e) {
631631
// NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
632632
free(e->cond.in_values);
633633
}
634+
if (e->type == EXPR_NOT_EXISTS) {
635+
if (e->sub_pattern) {
636+
/* Free pattern nodes and rels */
637+
for (int i = 0; i < e->sub_pattern->node_count; i++) {
638+
free((void *)e->sub_pattern->nodes[i].variable);
639+
free((void *)e->sub_pattern->nodes[i].label);
640+
}
641+
for (int i = 0; i < e->sub_pattern->rel_count; i++) {
642+
free((void *)e->sub_pattern->rels[i].variable);
643+
for (int t = 0; t < e->sub_pattern->rels[i].type_count; t++) {
644+
free((void *)e->sub_pattern->rels[i].types[t]);
645+
}
646+
free(e->sub_pattern->rels[i].types);
647+
free((void *)e->sub_pattern->rels[i].direction);
648+
}
649+
free(e->sub_pattern->nodes);
650+
free(e->sub_pattern->rels);
651+
free(e->sub_pattern);
652+
}
653+
if (e->sub_where) {
654+
cbm_where_clause_t *sw = (cbm_where_clause_t *)e->sub_where;
655+
if (sw->root) expr_free(sw->root);
656+
for (int i = 0; i < sw->count; i++) {
657+
free((void *)sw->conditions[i].variable);
658+
free((void *)sw->conditions[i].property);
659+
free((void *)sw->conditions[i].op);
660+
free((void *)sw->conditions[i].value);
661+
}
662+
free(sw->conditions);
663+
free((void *)sw->op);
664+
free(sw);
665+
}
666+
}
634667
expr_free(e->left);
635668
expr_free(e->right);
636669
free(e);
@@ -695,6 +728,8 @@ static const char *unsupported_clause_error(cbm_token_type_t type) {
695728

696729
/* Forward declarations for recursive descent */
697730
static cbm_expr_t *parse_or_expr(parser_t *p);
731+
static int parse_match_pattern(parser_t *p, cbm_pattern_t *pat);
732+
static int parse_where(parser_t *p, cbm_where_clause_t **out);
698733

699734
/* Parse a single condition: var.prop OP value | var.prop IS [NOT] NULL | var.prop IN [...] */
700735
static cbm_expr_t *parse_condition_expr(parser_t *p) {
@@ -833,9 +868,40 @@ static cbm_expr_t *parse_atom_expr(parser_t *p) {
833868
return parse_condition_expr(p);
834869
}
835870

836-
/* NOT: NOT atom | atom */
871+
/* NOT: NOT EXISTS { MATCH ... WHERE ... } | NOT atom | atom */
837872
static cbm_expr_t *parse_not_expr(parser_t *p) {
838873
if (match(p, TOK_NOT)) {
874+
/* NOT EXISTS { MATCH (pattern) WHERE ... } — correlated subquery */
875+
if (check(p, TOK_EXISTS)) {
876+
advance(p); /* consume EXISTS */
877+
if (!expect(p, TOK_LBRACE)) return NULL;
878+
879+
cbm_expr_t *e = calloc(1, sizeof(cbm_expr_t));
880+
e->type = EXPR_NOT_EXISTS;
881+
882+
/* Parse inner MATCH pattern */
883+
if (!expect(p, TOK_MATCH)) { free(e); return NULL; }
884+
e->sub_pattern = calloc(1, sizeof(cbm_pattern_t));
885+
if (parse_match_pattern(p, e->sub_pattern) < 0) {
886+
free(e->sub_pattern);
887+
free(e);
888+
return NULL;
889+
}
890+
891+
/* Optional inner WHERE */
892+
cbm_where_clause_t *inner_where = NULL;
893+
parse_where(p, &inner_where);
894+
e->sub_where = inner_where;
895+
896+
if (!expect(p, TOK_RBRACE)) {
897+
/* Cleanup on parse failure */
898+
free(e->sub_pattern);
899+
free(e->sub_where);
900+
free(e);
901+
return NULL;
902+
}
903+
return e;
904+
}
839905
cbm_expr_t *child = parse_not_expr(p);
840906
return child ? expr_not(child) : NULL;
841907
}
@@ -1788,6 +1854,16 @@ static void binding_set(binding_t *b, const char *var, const cbm_node_t *node) {
17881854
b->var_count++;
17891855
}
17901856

1857+
/* Forward declarations for NOT EXISTS subquery evaluation */
1858+
static void scan_pattern_nodes(cbm_store_t *store, const char *project, int max_rows,
1859+
cbm_node_pattern_t *first, cbm_node_t **out_nodes,
1860+
int *out_count);
1861+
static void expand_pattern_rels(cbm_store_t *store, cbm_pattern_t *pat, binding_t **bindings,
1862+
int *bind_count, const int *bind_cap, const char **var_name,
1863+
bool is_optional);
1864+
static bool eval_where(const cbm_where_clause_t *w, binding_t *b, cbm_store_t *store,
1865+
const char *project, int max_rows);
1866+
17911867
/* Evaluate a WHERE condition against a binding */
17921868
static bool eval_condition(const cbm_condition_t *c, binding_t *b) {
17931869
const char *actual;
@@ -1880,33 +1956,187 @@ static bool eval_condition(const cbm_condition_t *c, binding_t *b) {
18801956
return (int)(c->negated ? !result : result);
18811957
}
18821958

1883-
/* Recursive expression tree evaluator */
1884-
static bool eval_expr(const cbm_expr_t *e, binding_t *b) {
1959+
/* Recursive expression tree evaluator.
1960+
* store is needed for EXPR_NOT_EXISTS (correlated subquery expansion). */
1961+
static bool eval_expr(const cbm_expr_t *e, binding_t *b, cbm_store_t *store,
1962+
const char *project, int max_rows) {
18851963
if (!e) {
18861964
return true;
18871965
}
18881966
switch (e->type) {
18891967
case EXPR_CONDITION:
18901968
return eval_condition(&e->cond, b);
18911969
case EXPR_AND:
1892-
return (eval_expr(e->left, b) && eval_expr(e->right, b)) != 0;
1970+
return (eval_expr(e->left, b, store, project, max_rows) &&
1971+
eval_expr(e->right, b, store, project, max_rows)) != 0;
18931972
case EXPR_OR:
1894-
return (eval_expr(e->left, b) || eval_expr(e->right, b)) != 0;
1973+
return (eval_expr(e->left, b, store, project, max_rows) ||
1974+
eval_expr(e->right, b, store, project, max_rows)) != 0;
18951975
case EXPR_NOT:
1896-
return (!eval_expr(e->left, b)) != 0;
1976+
return (!eval_expr(e->left, b, store, project, max_rows)) != 0;
18971977
case EXPR_XOR:
1898-
return eval_expr(e->left, b) != eval_expr(e->right, b);
1978+
return eval_expr(e->left, b, store, project, max_rows) !=
1979+
eval_expr(e->right, b, store, project, max_rows);
1980+
case EXPR_NOT_EXISTS: {
1981+
if (!e->sub_pattern || !store) return true;
1982+
cbm_pattern_t *sp = e->sub_pattern;
1983+
1984+
/* OPTIMIZATION: For the common pattern
1985+
* MATCH (n:Function) WHERE NOT EXISTS { MATCH (caller)-[e]->(n) WHERE e.type = 'CALLS' }
1986+
* we detect when the inner pattern's TARGET variable is already bound from
1987+
* the outer scope. Instead of scanning all possible callers, we directly
1988+
* query edges TO the bound node — O(1) per node instead of O(N). */
1989+
if (sp->rel_count == 1 && sp->node_count == 2) {
1990+
const char *start_var = sp->nodes[0].variable;
1991+
const char *end_var = sp->nodes[1].variable;
1992+
cbm_rel_pattern_t *rel = &sp->rels[0];
1993+
1994+
/* Check which end is bound from outer scope */
1995+
cbm_node_t *bound_node = NULL;
1996+
bool bound_is_target = false;
1997+
if (end_var && binding_get(b, end_var)) {
1998+
bound_node = binding_get(b, end_var);
1999+
bound_is_target = true;
2000+
} else if (start_var && binding_get(b, start_var)) {
2001+
bound_node = binding_get(b, start_var);
2002+
}
2003+
2004+
if (bound_node && bound_node->id > 0) {
2005+
/* Fast path: query edges directly to/from the bound node */
2006+
cbm_edge_t *edges = NULL;
2007+
int edge_count = 0;
2008+
bool found_match = false;
2009+
2010+
for (int ti = 0; ti < rel->type_count && !found_match; ti++) {
2011+
const char *edge_type = rel->types[ti];
2012+
if (bound_is_target) {
2013+
/* bound node is the target: look for edges incoming TO it */
2014+
cbm_store_find_edges_by_target_type(store, bound_node->id,
2015+
edge_type, &edges, &edge_count);
2016+
} else {
2017+
/* bound node is the source: look for edges outgoing FROM it */
2018+
cbm_store_find_edges_by_source_type(store, bound_node->id,
2019+
edge_type, &edges, &edge_count);
2020+
}
2021+
/* Apply inner WHERE filter if present */
2022+
cbm_where_clause_t *inner_w = (cbm_where_clause_t *)e->sub_where;
2023+
if (edge_count > 0 && inner_w) {
2024+
/* Build a temporary binding with the edge to check WHERE conditions */
2025+
for (int ei = 0; ei < edge_count && !found_match; ei++) {
2026+
binding_t tmp = *b; /* shallow copy of outer binding */
2027+
const char *edge_var = rel->variable;
2028+
if (edge_var) {
2029+
binding_set_edge(&tmp, edge_var, &edges[ei]);
2030+
}
2031+
if (eval_where(inner_w, &tmp, store, project, max_rows)) {
2032+
found_match = true;
2033+
}
2034+
}
2035+
} else if (edge_count > 0) {
2036+
found_match = true;
2037+
}
2038+
/* Free edges */
2039+
for (int ei = 0; ei < edge_count; ei++) {
2040+
free((void *)edges[ei].project);
2041+
free((void *)edges[ei].type);
2042+
free((void *)edges[ei].properties_json);
2043+
}
2044+
free(edges);
2045+
edges = NULL;
2046+
edge_count = 0;
2047+
}
2048+
2049+
if (rel->type_count == 0 && !found_match) {
2050+
/* No type filter — check ANY edge */
2051+
cbm_edge_t *all_edges = NULL;
2052+
int all_count = 0;
2053+
if (bound_is_target) {
2054+
cbm_store_find_edges_by_target_type(store, bound_node->id,
2055+
NULL, &all_edges, &all_count);
2056+
} else {
2057+
cbm_store_find_edges_by_source_type(store, bound_node->id,
2058+
NULL, &all_edges, &all_count);
2059+
}
2060+
if (all_count > 0) found_match = true;
2061+
for (int ei = 0; ei < all_count; ei++) {
2062+
free((void *)all_edges[ei].project);
2063+
free((void *)all_edges[ei].type);
2064+
free((void *)all_edges[ei].properties_json);
2065+
}
2066+
free(all_edges);
2067+
}
2068+
2069+
return !found_match;
2070+
}
2071+
}
2072+
2073+
/* SLOW PATH: Full subquery expansion for complex patterns.
2074+
* Used when no variable is bound from outer scope, or multi-hop patterns. */
2075+
const char *start_var = sp->nodes[0].variable;
2076+
cbm_node_t *scanned = NULL;
2077+
int scan_count = 0;
2078+
cbm_node_t *outer_node = start_var ? binding_get(b, start_var) : NULL;
2079+
2080+
if (outer_node) {
2081+
scanned = calloc(1, sizeof(cbm_node_t));
2082+
scanned[0] = *outer_node;
2083+
scanned[0].name = outer_node->name ? heap_strdup(outer_node->name) : NULL;
2084+
scanned[0].label = outer_node->label ? heap_strdup(outer_node->label) : NULL;
2085+
scanned[0].file_path = outer_node->file_path ? heap_strdup(outer_node->file_path) : NULL;
2086+
scanned[0].project = outer_node->project ? heap_strdup(outer_node->project) : NULL;
2087+
scanned[0].qualified_name = outer_node->qualified_name ? heap_strdup(outer_node->qualified_name) : NULL;
2088+
scan_count = 1;
2089+
} else {
2090+
scan_pattern_nodes(store, project, max_rows, &sp->nodes[0],
2091+
&scanned, &scan_count);
2092+
}
2093+
2094+
if (scan_count == 0) {
2095+
free(scanned);
2096+
return true;
2097+
}
2098+
2099+
const char *var = start_var ? start_var : "_ne";
2100+
int sub_cap = scan_count > 4 ? scan_count : 4;
2101+
binding_t *sub_bindings = calloc(sub_cap, sizeof(binding_t));
2102+
int sub_count = 0;
2103+
for (int i = 0; i < scan_count && sub_count < sub_cap; i++) {
2104+
binding_set(&sub_bindings[sub_count], var, &scanned[i]);
2105+
sub_count++;
2106+
}
2107+
free(scanned);
2108+
2109+
if (sub_count > 0 && sp->rel_count > 0) {
2110+
expand_pattern_rels(store, sp, &sub_bindings, &sub_count, &sub_cap,
2111+
&var, false);
2112+
}
2113+
2114+
bool any_match = false;
2115+
cbm_where_clause_t *inner_w = (cbm_where_clause_t *)e->sub_where;
2116+
for (int i = 0; i < sub_count && !any_match; i++) {
2117+
bool pass = inner_w ? eval_where(inner_w, &sub_bindings[i], store, project, max_rows) : true;
2118+
if (pass) any_match = true;
2119+
}
2120+
for (int i = 0; i < sub_count; i++) {
2121+
for (int v = 0; v < sub_bindings[i].var_count; v++) {
2122+
node_fields_free(&sub_bindings[i].var_nodes[v]);
2123+
}
2124+
}
2125+
free(sub_bindings);
2126+
return !any_match;
2127+
}
18992128
}
19002129
return true;
19012130
}
19022131

19032132
/* Evaluate WHERE clause — uses expression tree if available, falls back to legacy */
1904-
static bool eval_where(const cbm_where_clause_t *w, binding_t *b) {
2133+
static bool eval_where(const cbm_where_clause_t *w, binding_t *b, cbm_store_t *store,
2134+
const char *project, int max_rows) {
19052135
if (!w) {
19062136
return true;
19072137
}
19082138
if (w->root) {
1909-
return eval_expr(w->root, b);
2139+
return eval_expr(w->root, b, store, project, max_rows);
19102140
}
19112141

19122142
/* Legacy flat evaluation */
@@ -2046,7 +2276,7 @@ static const char *eval_case_expr(const cbm_case_expr_t *k, binding_t *b) {
20462276
return "";
20472277
}
20482278
for (int i = 0; i < k->branch_count; i++) {
2049-
if (eval_expr(k->branches[i].when_expr, b)) {
2279+
if (eval_expr(k->branches[i].when_expr, b, NULL, NULL, 0)) {
20502280
return k->branches[i].then_val ? k->branches[i].then_val : "";
20512281
}
20522282
}
@@ -2429,9 +2659,9 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec
24292659
bool pass = true;
24302660
if (q->where && pat0->rel_count > 0) {
24312661
/* With expression tree, evaluate full tree — unbound vars pass through */
2432-
pass = eval_where(q->where, &b);
2662+
pass = eval_where(q->where, &b, store, project, max_rows);
24332663
} else if (q->where && pat0->rel_count == 0) {
2434-
pass = eval_where(q->where, &b);
2664+
pass = eval_where(q->where, &b, store, project, max_rows);
24352665
}
24362666

24372667
if (pass) {
@@ -2532,7 +2762,7 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec
25322762
if (q->where && (pat0->rel_count > 0 || q->pattern_count > 1)) {
25332763
int kept = 0;
25342764
for (int i = 0; i < bind_count; i++) {
2535-
if (eval_where(q->where, &bindings[i])) {
2765+
if (eval_where(q->where, &bindings[i], store, project, max_rows)) {
25362766
if (kept != i) {
25372767
bindings[kept] = bindings[i];
25382768
}
@@ -2840,7 +3070,7 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec
28403070
if (q->post_with_where) {
28413071
int kept = 0;
28423072
for (int i = 0; i < bind_count; i++) {
2843-
if (eval_where(q->post_with_where, &bindings[i])) {
3073+
if (eval_where(q->post_with_where, &bindings[i], store, project, max_rows)) {
28443074
if (kept != i) {
28453075
bindings[kept] = bindings[i];
28463076
}

src/cypher/cypher.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,8 @@ typedef enum {
199199
EXPR_AND,
200200
EXPR_OR,
201201
EXPR_NOT,
202-
EXPR_XOR
202+
EXPR_XOR,
203+
EXPR_NOT_EXISTS /* NOT EXISTS { MATCH ... WHERE ... } */
203204
} cbm_expr_type_t;
204205

205206
typedef struct cbm_expr cbm_expr_t;
@@ -208,6 +209,9 @@ struct cbm_expr {
208209
cbm_condition_t cond; /* leaf (EXPR_CONDITION only) */
209210
cbm_expr_t *left; /* AND/OR/XOR left; NOT child */
210211
cbm_expr_t *right; /* AND/OR/XOR right; NULL for NOT */
212+
/* NOT EXISTS subquery (EXPR_NOT_EXISTS only) */
213+
cbm_pattern_t *sub_pattern; /* inner MATCH pattern */
214+
void *sub_where; /* cbm_where_clause_t* — void to avoid circular dep */
211215
};
212216

213217
typedef struct {

0 commit comments

Comments
 (0)