127127
def read_tokens_name_before_token_from_lexer_file(filepath: str, token: str) -> list[str]:
    """Collect the lexer token names declared before *token* in a grammar file.

    Scans *filepath* (an ANTLR-style lexer grammar) line by line and returns
    the names of token rules — UPPER_CASE identifiers followed by ``:`` — that
    appear before the rule named *token*.  The target rule itself is excluded.

    When BOTH the start and stop placeholder comments are present in the file,
    only the region between them is scanned; otherwise the whole file is
    scanned.  Requiring both markers avoids silently truncating the result
    when only one marker happens to occur.

    :param filepath: path to the lexer grammar file to read.
    :param token: name of the token rule at which collection stops.
    :return: token-rule names found before *token*, in file order.
    """
    # Compiled once and anchored at line start; a rule name may be indented.
    token_rule = re.compile(r"^\s*(?P<token_name>[A-Z_][A-Z_0-9]*)\s*:")
    start_placeholder = "Build id contains the non reserved keywords start."
    stop_placeholder = "Build id contains the non reserved keywords stop."

    with open(filepath, "r") as lexer_file:
        lines = lexer_file.readlines()

    has_placeholder = any(start_placeholder in line for line in lines) and any(
        stop_placeholder in line for line in lines
    )
    # Without a delimited region, every line is eligible from the start.
    collecting = not has_placeholder

    tokens_name_before_token: list[str] = []
    for line in lines:
        if has_placeholder and start_placeholder in line:
            collecting = True
            continue
        if line.isspace() or not collecting:
            continue
        if has_placeholder and stop_placeholder in line:
            break
        # Only the first rule name on a line matters; '^' pins the match to
        # the line start, so a single search() is the "first match".
        match = token_rule.search(line)
        if match is None:
            continue
        name = match.group("token_name")
        if name == token:
            break  # Reached the target token: stop collecting (exclusive).
        tokens_name_before_token.append(name)
    return tokens_name_before_token
153160
154161def pretty_print (tokens : list [str ], hello : str | None ) -> None :
@@ -197,9 +204,10 @@ def append_non_reserved_token_to_rules_in_parser(parser_file_path: str, append_r
197204
198205
199206def get_content_by_token_name (content : str , token_name : str ) -> str :
200- token_regex = r"^(%s)\s*:[.\s\S]*?;" % token_name
201- # Get the content of the rules_regex match.
202- token_content = re .search (token_regex , content , re .MULTILINE )
207+ # Support upstream grammar formatting where comments may appear between
208+ # rule name and ':' and where rules can be indented.
209+ token_regex = r"(?ms)^\s*(%s)\s*(?:\n\s*//[^\n]*)*\n\s*:[\s\S]*?\n\s*;" % re .escape (token_name )
210+ token_content = re .search (token_regex , content )
203211 if token_content :
204212 return token_content .group (0 )
205213 return None
@@ -214,4 +222,3 @@ def get_content_by_token_name(content: str, token_name: str) -> str:
214222 pretty_print (filtered_tokens , "Tokens before ID token without reserved keywords:" )
215223 append_non_reserved_token_to_rules_in_parser ("SnowflakeParser.g4" , "id_" , "supplement_non_reserved_words" , filtered_tokens )
216224
217-
0 commit comments