Skip to content

Commit 524725a

Browse files
h3n4l and claude committed
feat(cosmosdb): rewrite grammar with unified scalar_expression and new clauses
Add missing lexer tokens: !=, IN, BETWEEN, TOP, VALUE, ORDER, BY, GROUP, OFFSET, LIMIT, ASC, DESC, EXISTS, LIKE, HAVING, JOIN. Fix IDENTIFIER to allow leading underscore (for _ts, _etag, etc.). Merge scalar_expression and scalar_expression_in_where into a single unified scalar_expression rule. Add TOP, VALUE, ORDER BY, GROUP BY, OFFSET LIMIT, HAVING, JOIN, IN, BETWEEN, LIKE, EXISTS, NOT, and subquery support. Fix object_constant_field_pair to use COLON_SYMBOL. Resolves all 13 failing query feature areas from BYT-9043. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 39c76b4 commit 524725a

File tree

8 files changed

+4595
-2966
lines changed

8 files changed

+4595
-2966
lines changed

cosmosdb/CosmosDBLexer.g4

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,21 @@ UDF_SYMBOL: 'UDF';
4646
WHERE_SYMBOL: 'WHERE';
4747
AND_SYMBOL: 'AND';
4848
OR_SYMBOL: 'OR';
49+
IN_SYMBOL: 'IN';
50+
BETWEEN_SYMBOL: 'BETWEEN';
51+
TOP_SYMBOL: 'TOP';
52+
VALUE_SYMBOL: 'VALUE';
53+
ORDER_SYMBOL: 'ORDER';
54+
BY_SYMBOL: 'BY';
55+
GROUP_SYMBOL: 'GROUP';
56+
OFFSET_SYMBOL: 'OFFSET';
57+
LIMIT_SYMBOL: 'LIMIT';
58+
ASC_SYMBOL: 'ASC';
59+
DESC_SYMBOL: 'DESC';
60+
EXISTS_SYMBOL: 'EXISTS';
61+
LIKE_SYMBOL: 'LIKE';
62+
HAVING_SYMBOL: 'HAVING';
63+
JOIN_SYMBOL: 'JOIN';
4964

5065
AT_SYMBOL: '@';
5166
LC_BRACKET_SYMBOL: '{';
@@ -77,10 +92,11 @@ GREATER_THAN_EQUAL_OPERATOR: '>=';
7792
LEFT_SHIFT_OPERATOR: '<<';
7893
RIGHT_SHIFT_OPERATOR: '>>';
7994
ZERO_FILL_RIGHT_SHIFT_OPERATOR: '>>>';
95+
NOT_EQUAL_OPERATOR: '!=';
8096

8197

8298
/* Identifiers */
83-
IDENTIFIER: [a-z] [a-z_0-9]*;
99+
IDENTIFIER: [a-z_] [a-z_0-9]*;
84100

85101
// White space handling
86102
WHITESPACE:

cosmosdb/CosmosDBParser.g4

Lines changed: 100 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -6,86 +6,115 @@ options {
66

77
root: select EOF;
88

9-
select: select_clause from_clause where_clause?;
9+
select:
10+
select_clause from_clause? where_clause? group_by_clause? having_clause? order_by_clause?
11+
offset_limit_clause?;
1012

11-
select_clause: SELECT_SYMBOL select_specification;
13+
select_clause: SELECT_SYMBOL top_clause? select_specification;
14+
15+
top_clause: TOP_SYMBOL DECIMAL;
1216

1317
select_specification:
1418
MULTIPLY_OPERATOR
15-
| DISTINCT_SYMBOL? object_property_list;
19+
| DISTINCT_SYMBOL? VALUE_SYMBOL? object_property_list;
1620

1721
from_clause: FROM_SYMBOL from_specification;
1822

19-
where_clause: WHERE_SYMBOL scalar_expression_in_where;
23+
where_clause: WHERE_SYMBOL scalar_expression;
24+
25+
group_by_clause:
26+
GROUP_SYMBOL BY_SYMBOL scalar_expression (
27+
COMMA_SYMBOL scalar_expression
28+
)*;
29+
30+
having_clause: HAVING_SYMBOL scalar_expression;
31+
32+
order_by_clause:
33+
ORDER_SYMBOL BY_SYMBOL sort_expression (
34+
COMMA_SYMBOL sort_expression
35+
)*;
36+
37+
sort_expression: scalar_expression (ASC_SYMBOL | DESC_SYMBOL)?;
38+
39+
offset_limit_clause: OFFSET_SYMBOL DECIMAL LIMIT_SYMBOL DECIMAL;
2040

2141
from_specification: from_source;
2242

23-
from_source: container_expression;
43+
from_source: container_expression (join_clause)*;
44+
45+
container_expression: container_name (AS_SYMBOL? identifier)?;
2446

25-
container_expression: container_name (AS_SYMBOL? IDENTIFIER)?;
47+
join_clause:
48+
JOIN_SYMBOL identifier IN_SYMBOL scalar_expression;
2649

27-
container_name: IDENTIFIER;
50+
container_name: identifier;
2851

2952
object_property_list:
3053
object_property (COMMA_SYMBOL object_property)*;
3154

32-
object_property: scalar_expression (AS_SYMBOL? property_alias)?;
55+
object_property:
56+
scalar_expression (AS_SYMBOL? property_alias)?;
3357

34-
property_alias: IDENTIFIER;
58+
property_alias: identifier;
3559

36-
// scalar_expression: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/scalar-expressions
60+
// Unified scalar_expression - used in both SELECT projections and WHERE clause.
61+
// https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/scalar-expressions
3762
scalar_expression:
38-
input_alias
39-
| scalar_expression DOT_SYMBOL property_name
40-
| scalar_expression LS_BRACKET_SYMBOL (
41-
(DOUBLE_QUOTE_STRING_LITERAL)
42-
| (array_index)
43-
) RS_BRACKET_SYMBOL
44-
| unary_operator scalar_expression;
45-
46-
// TODO(zp): Merge scalar_expression and scalar_expression_in_where while supporting the project
47-
// fully. https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/scalar-expressions
48-
scalar_expression_in_where:
4963
constant
5064
| input_alias
5165
| parameter_name
52-
| scalar_expression_in_where AND_SYMBOL scalar_expression_in_where
53-
| scalar_expression_in_where OR_SYMBOL scalar_expression_in_where
54-
| scalar_expression_in_where DOT_SYMBOL property_name
55-
| scalar_expression_in_where LS_BRACKET_SYMBOL (
56-
(DOUBLE_QUOTE_STRING_LITERAL)
57-
| (array_index)
66+
| scalar_expression AND_SYMBOL scalar_expression
67+
| scalar_expression OR_SYMBOL scalar_expression
68+
| scalar_expression DOT_SYMBOL property_name
69+
| scalar_expression LS_BRACKET_SYMBOL (
70+
DOUBLE_QUOTE_STRING_LITERAL
71+
| SINGLE_QUOTE_STRING_LITERAL
72+
| array_index
5873
) RS_BRACKET_SYMBOL
59-
| unary_operator scalar_expression_in_where
60-
| scalar_expression_in_where binary_operator scalar_expression_in_where
61-
| scalar_expression_in_where QUESTION_MARK_SYMBOL scalar_expression_in_where COLON_SYMBOL
62-
scalar_expression_in_where
74+
| unary_operator scalar_expression
75+
| NOT_SYMBOL scalar_expression
76+
| scalar_expression binary_operator scalar_expression
77+
| scalar_expression NOT_SYMBOL? IN_SYMBOL LR_BRACKET_SYMBOL (
78+
scalar_expression (COMMA_SYMBOL scalar_expression)*
79+
)? RR_BRACKET_SYMBOL
80+
| scalar_expression NOT_SYMBOL? BETWEEN_SYMBOL scalar_expression AND_SYMBOL scalar_expression
81+
| scalar_expression NOT_SYMBOL? LIKE_SYMBOL scalar_expression
82+
| EXISTS_SYMBOL LR_BRACKET_SYMBOL select RR_BRACKET_SYMBOL
83+
| scalar_expression QUESTION_MARK_SYMBOL scalar_expression COLON_SYMBOL scalar_expression
6384
| scalar_function_expression
6485
| create_object_expression
6586
| create_array_expression
66-
| LR_BRACKET_SYMBOL scalar_expression_in_where RR_BRACKET_SYMBOL;
87+
| LR_BRACKET_SYMBOL scalar_expression RR_BRACKET_SYMBOL
88+
| LR_BRACKET_SYMBOL select RR_BRACKET_SYMBOL;
6789

68-
create_array_expression: array_constant;
90+
create_array_expression:
91+
LS_BRACKET_SYMBOL (
92+
scalar_expression (COMMA_SYMBOL scalar_expression)*
93+
)? RS_BRACKET_SYMBOL;
6994

70-
create_object_expression: object_constant;
95+
create_object_expression:
96+
LC_BRACKET_SYMBOL (
97+
object_field_pair (COMMA_SYMBOL object_field_pair)*
98+
)? RC_BRACKET_SYMBOL;
99+
100+
object_field_pair:
101+
(string_literal | property_name) COLON_SYMBOL scalar_expression;
71102

72103
scalar_function_expression:
73104
udf_scalar_function_expression
74105
| builtin_function_expression;
75106

76107
udf_scalar_function_expression:
77-
UDF_SYMBOL DOT_SYMBOL IDENTIFIER LR_BRACKET_SYMBOL (
78-
scalar_expression_in_where (
79-
COMMA_SYMBOL scalar_expression_in_where
80-
)*
81-
) RR_BRACKET_SYMBOL;
108+
UDF_SYMBOL DOT_SYMBOL identifier LR_BRACKET_SYMBOL (
109+
scalar_expression (COMMA_SYMBOL scalar_expression)*
110+
)? RR_BRACKET_SYMBOL;
82111

83112
builtin_function_expression:
84-
IDENTIFIER LR_BRACKET_SYMBOL (
85-
scalar_expression_in_where (
86-
COMMA_SYMBOL scalar_expression_in_where
113+
identifier LR_BRACKET_SYMBOL (
114+
(MULTIPLY_OPERATOR | scalar_expression) (
115+
COMMA_SYMBOL scalar_expression
87116
)*
88-
) RR_BRACKET_SYMBOL;
117+
)? RR_BRACKET_SYMBOL;
89118

90119
binary_operator:
91120
MULTIPLY_OPERATOR
@@ -98,45 +127,26 @@ binary_operator:
98127
| BIT_OR_SYMBOL
99128
| DOUBLE_BAR_SYMBOL
100129
| EQUAL_SYMBOL
130+
| NOT_EQUAL_OPERATOR
101131
| LESS_THAN_OPERATOR
102132
| LESS_THAN_EQUAL_OPERATOR
103133
| GREATER_THAN_OPERATOR
104134
| GREATER_THAN_EQUAL_OPERATOR
105135
| LEFT_SHIFT_OPERATOR
106136
| RIGHT_SHIFT_OPERATOR
107-
| ZERO_FILL_RIGHT_SHIFT_OPERATOR
108-
;
137+
| ZERO_FILL_RIGHT_SHIFT_OPERATOR;
109138

110139
unary_operator: BIT_NOT_SYMBOL | PLUS_SYMBOL | MINUS_SYMBOL;
111140

112-
parameter_name: AT_SYMBOL IDENTIFIER;
141+
parameter_name: AT_SYMBOL identifier;
113142

114143
// https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/constants
115144
constant:
116145
undefined_constant
117146
| null_constant
118147
| boolean_constant
119148
| number_constant
120-
| string_constant
121-
| array_constant
122-
| object_constant;
123-
124-
object_constant:
125-
LC_BRACKET_SYMBOL (
126-
object_constant_field_pair (
127-
COMMA_SYMBOL object_constant_field_pair
128-
)*
129-
) RC_BRACKET_SYMBOL;
130-
131-
object_constant_field_pair: (
132-
property_name
133-
| (DOUBLE_QUOTE_SYMBOL property_name DOUBLE_QUOTE_SYMBOL)
134-
) COMMA_SYMBOL constant;
135-
136-
array_constant:
137-
LS_BRACKET_SYMBOL (constant (COMMA_SYMBOL constant)*)? RS_BRACKET_SYMBOL;
138-
139-
string_constant: string_literal;
149+
| string_constant;
140150

141151
undefined_constant: UNDEFINED_SYMBOL;
142152

@@ -146,6 +156,8 @@ boolean_constant: TRUE_SYMBOL | FALSE_SYMBOL;
146156

147157
number_constant: decimal_literal | hexadecimal_literal;
148158

159+
string_constant: string_literal;
160+
149161
string_literal:
150162
SINGLE_QUOTE_STRING_LITERAL
151163
| DOUBLE_QUOTE_STRING_LITERAL;
@@ -154,8 +166,28 @@ decimal_literal: DECIMAL | REAL | FLOAT;
154166

155167
hexadecimal_literal: HEXADECIMAL;
156168

157-
property_name: IDENTIFIER;
169+
// Allow keywords to be used as identifiers (property names, aliases, etc.)
170+
// This is necessary because CosmosDB allows keywords as property names.
171+
identifier:
172+
IDENTIFIER
173+
| IN_SYMBOL
174+
| BETWEEN_SYMBOL
175+
| TOP_SYMBOL
176+
| VALUE_SYMBOL
177+
| ORDER_SYMBOL
178+
| BY_SYMBOL
179+
| GROUP_SYMBOL
180+
| OFFSET_SYMBOL
181+
| LIMIT_SYMBOL
182+
| ASC_SYMBOL
183+
| DESC_SYMBOL
184+
| EXISTS_SYMBOL
185+
| LIKE_SYMBOL
186+
| HAVING_SYMBOL
187+
| JOIN_SYMBOL;
188+
189+
property_name: identifier;
158190

159191
array_index: DECIMAL;
160192

161-
input_alias: IDENTIFIER;
193+
input_alias: identifier;

0 commit comments

Comments
 (0)