Skip to content

Commit ae5ea48

Browse files
authored
Merge pull request #3984 from ruby/regex
Restructure regexp encoding validation
2 parents 203a520 + 0944c7f commit ae5ea48

8 files changed

Lines changed: 1065 additions & 373 deletions

File tree

config.yml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -248,7 +248,9 @@ errors:
248248
- PATTERN_TERM_PAREN
249249
- PIPEPIPEEQ_MULTI_ASSIGN
250250
- REGEXP_ENCODING_OPTION_MISMATCH
251+
- REGEXP_ESCAPED_NON_ASCII_IN_UTF8
251252
- REGEXP_INCOMPAT_CHAR_ENCODING
253+
- REGEXP_INVALID_CHAR_PROPERTY
252254
- REGEXP_INVALID_UNICODE_RANGE
253255
- REGEXP_NON_ESCAPED_MBC
254256
- REGEXP_PARSE_ERROR

include/prism/parser.h

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -933,12 +933,6 @@ struct pm_parser {
933933
*/
934934
bool semantic_token_seen;
935935

936-
/**
937-
* True if the current regular expression being lexed contains only ASCII
938-
* characters.
939-
*/
940-
bool current_regular_expression_ascii_only;
941-
942936
/**
943937
* By default, Ruby always warns about mismatched indentation. This can be
944938
* toggled with a magic comment.

include/prism/regexp.h

Lines changed: 41 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -17,27 +17,56 @@
1717
#include <string.h>
1818

1919
/**
20-
* This callback is called by pm_regexp_parse() when a named capture group is found.
20+
* Accumulation state for named capture groups found during regexp parsing.
21+
* The caller initializes this with the call node and passes it to
22+
* pm_regexp_parse. The regexp parser populates match and names as groups
23+
* are found.
2124
*/
22-
typedef void (*pm_regexp_name_callback_t)(const pm_string_t *name, void *data);
25+
typedef struct {
26+
/** The call node wrapping the regular expression node (for =~). */
27+
pm_call_node_t *call;
28+
29+
/** The match write node being built, or NULL if no captures found yet. */
30+
pm_match_write_node_t *match;
31+
32+
/** The list of capture names found so far (for deduplication). */
33+
pm_constant_id_list_t names;
34+
} pm_regexp_name_data_t;
2335

2436
/**
25-
* This callback is called by pm_regexp_parse() when a parse error is found.
37+
* Callback invoked by pm_regexp_parse() for each named capture group found.
38+
*
39+
* @param parser The main parser.
40+
* @param name The name of the capture group.
41+
* @param shared Whether the source content is shared (impacts constant storage).
42+
* @param data The accumulation state for named captures.
2643
*/
27-
typedef void (*pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *end, const char *message, void *data);
44+
typedef void (*pm_regexp_name_callback_t)(pm_parser_t *parser, const pm_string_t *name, bool shared, pm_regexp_name_data_t *data);
2845

2946
/**
30-
* Parse a regular expression.
47+
* Parse a regular expression, validate its encoding, and optionally extract
48+
* named capture groups. Returns the encoding flags to set on the node.
3149
*
3250
* @param parser The parser that is currently being used.
33-
* @param source The source code to parse.
34-
* @param size The size of the source code.
35-
* @param extended_mode Whether to parse the regular expression in extended mode.
51+
* @param node The regular expression node to parse and validate.
3652
* @param name_callback The optional callback to call when a named capture group is found.
37-
* @param name_data The optional data to pass to the name callback.
38-
* @param error_callback The callback to call when a parse error is found.
39-
* @param error_data The data to pass to the error callback.
53+
* @param name_data The optional accumulation state for named captures.
54+
* @return The encoding flags to set on the node (e.g., FORCED_UTF8_ENCODING).
55+
*/
56+
PRISM_EXPORTED_FUNCTION pm_node_flags_t pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data);
57+
58+
/**
59+
* Parse an interpolated regular expression for named capture groups only.
60+
* No encoding validation is performed.
61+
*
62+
* @param parser The parser that is currently being used.
63+
* @param source The source content to parse.
64+
* @param size The length of the source content.
65+
* @param shared Whether the source points into the parser's source buffer.
66+
* @param extended_mode Whether or not the regular expression is in extended mode.
67+
* @param name_callback The callback to call when a named capture group is found.
68+
* @param name_data The accumulation state for named captures.
4069
*/
41-
PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data);
70+
void pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data);
4271

4372
#endif

snapshots/seattlerb/regexp_escape_extended.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
├── flags: ∅
77
└── body: (length: 1)
88
└── @ RegularExpressionNode (location: (1,0)-(1,6))
9-
├── flags: newline, static_literal
9+
├── flags: newline, static_literal, forced_us_ascii_encoding
1010
├── opening_loc: (1,0)-(1,1) = "/"
1111
├── content_loc: (1,1)-(1,5) = "\\“"
1212
├── closing_loc: (1,5)-(1,6) = "/"

0 commit comments

Comments (0)