From 0944c7fba294aea09b9448f84b0eca4ab74695c4 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Tue, 10 Mar 2026 22:32:43 -0400 Subject: [PATCH] Restructure regexp encoding validation Move all the logic from prism.c into regexp.c. Now regexp.c does two passes. The first pass scans the raw source to track escape types, non-ASCII literals, and multibyte validity for encoding validation. The second pass scans the unescaped content for named capture extraction (needed because escape sequences like line continuations alter group names). Fixed a couple of things along the way. ascii_only was previously computed from unescaped content, but we can do that as we go to avoid scanning again. Unicode properties also now properly error for regexp with modifiers. --- config.yml | 2 + include/prism/parser.h | 6 - include/prism/regexp.h | 53 +- .../seattlerb/regexp_escape_extended.txt | 2 +- src/prism.c | 370 ++----- src/regexp.c | 965 +++++++++++++++++- templates/src/diagnostic.c.erb | 6 +- .../regular_expression_encoding_test.rb | 34 +- 8 files changed, 1065 insertions(+), 373 deletions(-) diff --git a/config.yml b/config.yml index c82c239de6..1ea1bcf7f1 100644 --- a/config.yml +++ b/config.yml @@ -248,7 +248,9 @@ errors: - PATTERN_TERM_PAREN - PIPEPIPEEQ_MULTI_ASSIGN - REGEXP_ENCODING_OPTION_MISMATCH + - REGEXP_ESCAPED_NON_ASCII_IN_UTF8 - REGEXP_INCOMPAT_CHAR_ENCODING + - REGEXP_INVALID_CHAR_PROPERTY - REGEXP_INVALID_UNICODE_RANGE - REGEXP_NON_ESCAPED_MBC - REGEXP_PARSE_ERROR diff --git a/include/prism/parser.h b/include/prism/parser.h index 5ebace10c6..d8e7a550e7 100644 --- a/include/prism/parser.h +++ b/include/prism/parser.h @@ -933,12 +933,6 @@ struct pm_parser { */ bool semantic_token_seen; - /** - * True if the current regular expression being lexed contains only ASCII - * characters. - */ - bool current_regular_expression_ascii_only; - /** * By default, Ruby always warns about mismatched indentation. This can be * toggled with a magic comment. diff --git a/include/prism/regexp.h b/include/prism/regexp.h index 5366b5a5a0..b3e739b457 100644 --- a/include/prism/regexp.h +++ b/include/prism/regexp.h @@ -17,27 +17,56 @@ #include /** - * This callback is called by pm_regexp_parse() when a named capture group is found. + * Accumulation state for named capture groups found during regexp parsing. + * The caller initializes this with the call node and passes it to + * pm_regexp_parse. The regexp parser populates match and names as groups + * are found. */ -typedef void (*pm_regexp_name_callback_t)(const pm_string_t *name, void *data); +typedef struct { + /** The call node wrapping the regular expression node (for =~). */ + pm_call_node_t *call; + + /** The match write node being built, or NULL if no captures found yet. */ + pm_match_write_node_t *match; + + /** The list of capture names found so far (for deduplication). */ + pm_constant_id_list_t names; +} pm_regexp_name_data_t; /** - * This callback is called by pm_regexp_parse() when a parse error is found. + * Callback invoked by pm_regexp_parse() for each named capture group found. + * + * @param parser The main parser. + * @param name The name of the capture group. + * @param shared Whether the source content is shared (impacts constant storage). + * @param data The accumulation state for named captures. */ -typedef void (*pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *end, const char *message, void *data); +typedef void (*pm_regexp_name_callback_t)(pm_parser_t *parser, const pm_string_t *name, bool shared, pm_regexp_name_data_t *data); /** - * Parse a regular expression. + * Parse a regular expression, validate its encoding, and optionally extract + * named capture groups. Returns the encoding flags to set on the node. * * @param parser The parser that is currently being used. - * @param source The source code to parse. - * @param size The size of the source code. - * @param extended_mode Whether to parse the regular expression in extended mode. + * @param node The regular expression node to parse and validate. * @param name_callback The optional callback to call when a named capture group is found. - * @param name_data The optional data to pass to the name callback. - * @param error_callback The callback to call when a parse error is found. - * @param error_data The data to pass to the error callback. + * @param name_data The optional accumulation state for named captures. + * @return The encoding flags to set on the node (e.g., FORCED_UTF8_ENCODING). + */ +PRISM_EXPORTED_FUNCTION pm_node_flags_t pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data); + +/** + * Parse an interpolated regular expression for named capture groups only. + * No encoding validation is performed. + * + * @param parser The parser that is currently being used. + * @param source The source content to parse. + * @param size The length of the source content. + * @param shared Whether the source points into the parser's source buffer. + * @param extended_mode Whether or not the regular expression is in extended mode. + * @param name_callback The callback to call when a named capture group is found. + * @param name_data The accumulation state for named captures. */ -PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data); +void pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data); #endif diff --git a/snapshots/seattlerb/regexp_escape_extended.txt b/snapshots/seattlerb/regexp_escape_extended.txt index 732899a20d..a2e4d5eb96 100644 --- a/snapshots/seattlerb/regexp_escape_extended.txt +++ b/snapshots/seattlerb/regexp_escape_extended.txt @@ -6,7 +6,7 @@ ├── flags: ∅ └── body: (length: 1) └── @ RegularExpressionNode (location: (1,0)-(1,6)) - ├── flags: newline, static_literal + ├── flags: newline, static_literal, forced_us_ascii_encoding ├── opening_loc: (1,0)-(1,1) = "/" ├── content_loc: (1,1)-(1,5) = "\\“" ├── closing_loc: (1,5)-(1,6) = "/" diff --git a/src/prism.c b/src/prism.c index fcc7be8a1a..d196c5d7c4 100644 --- a/src/prism.c +++ b/src/prism.c @@ -6423,129 +6423,6 @@ parse_symbol_encoding(pm_parser_t *parser, const pm_token_t *location, const pm_ return 0; } -static pm_node_flags_t -parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) { - assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) || - (modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) || - (modifier == 'e' && modifier_encoding == PM_ENCODING_EUC_JP_ENTRY) || - (modifier == 's' && modifier_encoding == PM_ENCODING_WINDOWS_31J_ENTRY)); - - // There's special validation logic used if a string does not contain any character escape sequences. - if (parser->explicit_encoding == NULL) { - // If an ASCII-only string without character escapes is used with an encoding modifier, then resulting Regexp - // has the modifier encoding, unless the ASCII-8BIT modifier is used, in which case the Regexp "downgrades" to - // the US-ASCII encoding. - if (ascii_only) { - return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags; - } - - if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { - if (!ascii_only) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); - } - } else if (parser->encoding != modifier_encoding) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name); - - if (modifier == 'n' && !ascii_only) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) pm_string_length(source), (const char *) pm_string_source(source)); - } - } - - return flags; - } - - // TODO (nirvdrum 21-Feb-2024): To validate regexp sources with character escape sequences we need to know whether hex or Unicode escape sequences were used and Prism doesn't currently provide that data. We handle a subset of unambiguous cases in the meanwhile. - bool mixed_encoding = false; - - if (mixed_encoding) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source)); - } else if (modifier != 'n' && parser->explicit_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) { - // TODO (nirvdrum 21-Feb-2024): Validate the content is valid in the modifier encoding. Do this on-demand so we don't pay the cost of computation unnecessarily. - bool valid_string_in_modifier_encoding = true; - - if (!valid_string_in_modifier_encoding) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source)); - } - } else if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { - // TODO (nirvdrum 21-Feb-2024): There's currently no way to tell if the source used hex or Unicode character escapes from `explicit_encoding` alone. If the source encoding was already UTF-8, both character escape types would set `explicit_encoding` to UTF-8, but need to be processed differently. Skip for now. - if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, (int) pm_string_length(source), (const char *) pm_string_source(source)); - } - } - - // We've determined the encoding would naturally be EUC-JP and there is no need to force the encoding to anything else. - return flags; -} - -/** - * Ruby "downgrades" the encoding of Regexps to US-ASCII if the associated encoding is ASCII-compatible and - * the unescaped representation of a Regexp source consists only of US-ASCII code points. This is true even - * when the Regexp is explicitly given an ASCII-8BIT encoding via the (/n) modifier. Otherwise, the encoding - * may be explicitly set with an escape sequence. - */ -static pm_node_flags_t -parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags) { - // TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report. - bool valid_unicode_range = true; - if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, (int) pm_string_length(source), (const char *) pm_string_source(source)); - return flags; - } - - // US-ASCII strings do not admit multi-byte character literals. However, character escape sequences corresponding - // to multi-byte characters are allowed. - if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) { - // CRuby will continue processing even though a SyntaxError has already been detected. It may result in the - // following error message appearing twice. We do the same for compatibility. - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); - } - - /** - * Start checking modifier flags. We need to process these before considering any explicit encodings that may have - * been set by character literals. The order in which the encoding modifiers is checked does not matter. In the - * event that both an encoding modifier and an explicit encoding would result in the same encoding we do not set - * the corresponding "forced_" flag. Instead, the caller should check the encoding modifier flag and - * determine the encoding that way. - */ - - if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY); - } - - if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY); - } - - if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY); - } - - if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY); - } - - // At this point no encoding modifiers will be present on the regular expression as they would have already - // been processed. Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all - // regular expressions without an encoding modifier appearing in source are eligible for "downgrading" to US-ASCII. - if (ascii_only) { - return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING; - } - - // A Regexp may optionally have its encoding explicitly set via a character escape sequence in the source string - // or by specifying a modifier. - // - // NB: an explicitly set encoding is ignored by Ruby if the Regexp consists of only US ASCII code points. - if (parser->explicit_encoding != NULL) { - if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { - return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING; - } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { - return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING; - } - } - - return 0; -} - /** * Allocate and initialize a new SymbolNode node with the given unescaped * string. @@ -8589,7 +8466,7 @@ escape_hexadecimal_digit(const uint8_t value) { * validated. */ static inline uint32_t -escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length, const pm_location_t *error_location) { +escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length, const pm_location_t *error_location, const uint8_t flags) { uint32_t value = 0; for (size_t index = 0; index < length; index++) { if (index != 0) value <<= 4; @@ -8599,7 +8476,10 @@ escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length, const // Here we're going to verify that the value is actually a valid Unicode // codepoint and not a surrogate pair. if (value >= 0xD800 && value <= 0xDFFF) { - if (error_location != NULL) { + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // In regexp context, defer the error to regexp encoding + // validation where we can produce a regexp-specific message. + } else if (error_location != NULL) { pm_parser_err(parser, error_location->start, error_location->length, PM_ERR_ESCAPE_INVALID_UNICODE); } else { pm_parser_err(parser, U32(string - parser->start), U32(length), PM_ERR_ESCAPE_INVALID_UNICODE); @@ -8630,14 +8510,25 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t fla // literal. if (value >= 0x80 || flags & PM_ESCAPE_FLAG_SINGLE) { if (parser->explicit_encoding != NULL && parser->explicit_encoding != PM_ENCODING_UTF_8_ENTRY) { - PM_PARSER_ERR_FORMAT(parser, U32(start - parser->start), U32(end - start), PM_ERR_MIXED_ENCODING, parser->explicit_encoding->name); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // In regexp context, suppress this error — the regexp encoding + // validation will produce a more specific error message. + } else { + PM_PARSER_ERR_FORMAT(parser, U32(start - parser->start), U32(end - start), PM_ERR_MIXED_ENCODING, parser->explicit_encoding->name); + } } parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; } if (!pm_buffer_append_unicode_codepoint(buffer, value)) { - pm_parser_err(parser, U32(start - parser->start), U32(end - start), PM_ERR_ESCAPE_INVALID_UNICODE); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // In regexp context, defer the error to the regexp encoding + // validation which produces a regexp-specific message. + } else { + pm_parser_err(parser, U32(start - parser->start), U32(end - start), PM_ERR_ESCAPE_INVALID_UNICODE); + } + pm_buffer_append_byte(buffer, 0xEF); pm_buffer_append_byte(buffer, 0xBF); pm_buffer_append_byte(buffer, 0xBD); @@ -8649,10 +8540,15 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t fla * (i.e., the top bit is set) then it locks in the encoding. */ static inline void -escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte) { +escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t flags, uint8_t byte) { if (byte >= 0x80) { if (parser->explicit_encoding != NULL && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && parser->encoding != PM_ENCODING_UTF_8_ENTRY) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_MIXED_ENCODING, parser->encoding->name); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // In regexp context, suppress this error — the regexp encoding + // validation will produce a more specific error message. + } else { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_MIXED_ENCODING, parser->encoding->name); + } } parser->explicit_encoding = parser->encoding; @@ -8682,7 +8578,7 @@ escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular pm_buffer_append_format(regular_expression_buffer, "\\x%02X", byte); } - escape_write_byte_encoded(parser, buffer, byte); + escape_write_byte_encoded(parser, buffer, flags, byte); } /** @@ -8838,7 +8734,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre } } - escape_write_byte_encoded(parser, buffer, value); + escape_write_byte_encoded(parser, buffer, flags, value); } else { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL); } @@ -8887,7 +8783,8 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre if (flags & PM_ESCAPE_FLAG_REGEXP) { // If this is a regular expression, we are going to // let the regular expression engine handle this - // error instead of us. + // error instead of us because we don't know at this + // point if we're inside a comment in /x mode. pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); } else { pm_parser_err(parser, PM_TOKEN_END(parser, &parser->current), 0, PM_ERR_ESCAPE_INVALID_UNICODE); @@ -8903,7 +8800,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre extra_codepoints_start = unicode_start; } - uint32_t value = escape_unicode(parser, unicode_start, hexadecimal_length, NULL); + uint32_t value = escape_unicode(parser, unicode_start, hexadecimal_length, NULL, flags); escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value); parser->current.end += pm_strspn_inline_whitespace(parser->current.end, parser->end - parser->current.end); @@ -8923,7 +8820,8 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre if (flags & PM_ESCAPE_FLAG_REGEXP) { // If this is a regular expression, we are going to let // the regular expression engine handle this error - // instead of us. + // instead of us because we don't know at this point if + // we're inside a comment in /x mode. pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); } else { pm_parser_err(parser, U32(unicode_codepoints_start - parser->start), U32(parser->current.end - unicode_codepoints_start), PM_ERR_ESCAPE_INVALID_UNICODE_TERM); @@ -8944,7 +8842,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre PM_PARSER_ERR_FORMAT(parser, U32(start - parser->start), U32(parser->current.end - start), PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start); } } else if (length == 4) { - uint32_t value = escape_unicode(parser, parser->current.end, 4, NULL); + uint32_t value = escape_unicode(parser, parser->current.end, 4, NULL, flags); if (flags & PM_ESCAPE_FLAG_REGEXP) { pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end + 4 - start)); @@ -9131,7 +9029,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre case '\r': { if (peek_offset(parser, 1) == '\n') { parser->current.end += 2; - escape_write_byte_encoded(parser, buffer, escape_byte('\n', flags)); + escape_write_byte_encoded(parser, buffer, flags, escape_byte('\n', flags)); return; } PRISM_FALLTHROUGH @@ -9516,20 +9414,12 @@ pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parse static void pm_regexp_token_buffer_push_escaped(pm_regexp_token_buffer_t *token_buffer, pm_parser_t *parser) { size_t width = parser_char_width(parser); - pm_buffer_append_bytes(&token_buffer->base.buffer, parser->current.end, width); - pm_buffer_append_bytes(&token_buffer->regexp_buffer, parser->current.end, width); + const uint8_t *start = parser->current.end; + pm_buffer_append_bytes(&token_buffer->base.buffer, start, width); + pm_buffer_append_bytes(&token_buffer->regexp_buffer, start, width); parser->current.end += width; } -static bool -pm_slice_ascii_only_p(const uint8_t *value, size_t length) { - for (size_t index = 0; index < length; index++) { - if (value[index] & 0x80) return false; - } - - return true; -} - /** * When we're about to return from lexing the current token and we know for sure * that we have found an escape sequence, this function is called to copy the @@ -9548,7 +9438,6 @@ pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) { static inline void pm_regexp_token_buffer_copy(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) { pm_token_buffer_copy(parser, &token_buffer->base); - parser->current_regular_expression_ascii_only = pm_slice_ascii_only_p((const uint8_t *) pm_buffer_value(&token_buffer->regexp_buffer), pm_buffer_length(&token_buffer->regexp_buffer)); pm_buffer_free(&token_buffer->regexp_buffer); } @@ -9575,10 +9464,11 @@ static void pm_regexp_token_buffer_flush(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) { if (token_buffer->base.cursor == NULL) { pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end); - parser->current_regular_expression_ascii_only = pm_slice_ascii_only_p(parser->current.start, (size_t) (parser->current.end - parser->current.start)); } else { - pm_buffer_append_bytes(&token_buffer->base.buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor)); - pm_buffer_append_bytes(&token_buffer->regexp_buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor)); + const uint8_t *cursor = token_buffer->base.cursor; + size_t length = (size_t) (parser->current.end - cursor); + pm_buffer_append_bytes(&token_buffer->base.buffer, cursor, length); + pm_buffer_append_bytes(&token_buffer->regexp_buffer, cursor, length); pm_regexp_token_buffer_copy(parser, token_buffer); } } @@ -17383,63 +17273,6 @@ parse_yield(pm_parser_t *parser, const pm_node_t *node) { } } -/** - * This struct is used to pass information between the regular expression parser - * and the error callback. - */ -typedef struct { - /** The parser that we are parsing the regular expression for. */ - pm_parser_t *parser; - - /** The start of the regular expression. */ - const uint8_t *start; - - /** The end of the regular expression. */ - const uint8_t *end; - - /** - * Whether or not the source of the regular expression is shared. This - * impacts the location of error messages, because if it is shared then we - * can use the location directly and if it is not, then we use the bounds of - * the regular expression itself. - */ - bool shared; -} parse_regular_expression_error_data_t; - -/** - * This callback is called when the regular expression parser encounters a - * syntax error. - */ -static void -parse_regular_expression_error(const uint8_t *start, const uint8_t *end, const char *message, void *data) { - parse_regular_expression_error_data_t *callback_data = (parse_regular_expression_error_data_t *) data; - pm_token_t location; - - if (callback_data->shared) { - location = (pm_token_t) { .type = 0, .start = start, .end = end }; - } else { - location = (pm_token_t) { .type = 0, .start = callback_data->start, .end = callback_data->end }; - } - - PM_PARSER_ERR_FORMAT(callback_data->parser, PM_TOKEN_START(callback_data->parser, &location), PM_TOKEN_LENGTH(&location), PM_ERR_REGEXP_PARSE_ERROR, message); -} - -/** - * Parse the errors for the regular expression and add them to the parser. - */ -static void -parse_regular_expression_errors(pm_parser_t *parser, pm_regular_expression_node_t *node) { - const pm_string_t *unescaped = &node->unescaped; - parse_regular_expression_error_data_t error_data = { - .parser = parser, - .start = parser->start + PM_NODE_START(node), - .end = parser->start + PM_NODE_END(node), - .shared = unescaped->type == PM_STRING_SHARED - }; - - pm_regexp_parse(parser, pm_string_source(unescaped), pm_string_length(unescaped), PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED), NULL, NULL, parse_regular_expression_error, &error_data); -} - /** * Determine if a given call node looks like a "command", which means it has * arguments but does not have parentheses. @@ -19317,22 +19150,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u } } } else { - /* `not` in a single line method is allowed to be followed by - * an expression without pattern matching, that optionally is - * followed by a `rescue` modifier. */ - if (flags & PM_PARSE_IN_ENDLESS_DEF) { - receiver = parse_expression(parser, PM_BINDING_POWER_MATCH + 1, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1)); - - if (accept1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) { - context_push(parser, PM_CONTEXT_RESCUE_MODIFIER); - pm_token_t rescue_keyword = parser->previous; - pm_node_t *value = parse_expression(parser, PM_BINDING_POWER_MATCH + 1, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); - context_pop(parser); - receiver = UP(pm_rescue_modifier_node_create(parser, receiver, &rescue_keyword, value)); - } - } else { - receiver = parse_expression(parser, PM_BINDING_POWER_NOT, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1)); - } + receiver = parse_expression(parser, PM_BINDING_POWER_NOT, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1)); } return UP(pm_call_node_not_create(parser, receiver, &message, &arguments)); @@ -19913,10 +19731,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u parser_lex(parser); - pm_node_t *node = UP(pm_regular_expression_node_create(parser, &opening, &content, &parser->previous)); - pm_node_flag_set(node, PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING); - - return node; + pm_regular_expression_node_t *node = pm_regular_expression_node_create(parser, &opening, &content, &parser->previous); + pm_node_flag_set(UP(node), pm_regexp_parse(parser, node, NULL, NULL)); + return UP(node); } pm_interpolated_regular_expression_node_t *interpolated; @@ -19928,7 +19745,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u // regular expression) or if it's not then it has interpolation. pm_string_t unescaped = parser->current_string; pm_token_t content = parser->current; - bool ascii_only = parser->current_regular_expression_ascii_only; parser_lex(parser); // If we hit an end, then we can create a regular expression @@ -19937,15 +19753,14 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u if (accept1(parser, PM_TOKEN_REGEXP_END)) { pm_regular_expression_node_t *node = (pm_regular_expression_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); - // If we're not immediately followed by a =~, then we want - // to parse all of the errors at this point. If it is - // followed by a =~, then it will get parsed higher up while - // parsing the named captures as well. + // If we're not immediately followed by a =~, then we + // parse and validate now. If it is followed by a =~, + // then it will get parsed in the =~ handler where + // named captures can also be extracted. if (!match1(parser, PM_TOKEN_EQUAL_TILDE)) { - parse_regular_expression_errors(parser, node); + pm_node_flag_set(UP(node), pm_regexp_parse(parser, node, NULL, NULL)); } - pm_node_flag_set(UP(node), parse_and_validate_regular_expression_encoding(parser, &unescaped, ascii_only, FL(node))); return UP(node); } @@ -20318,7 +20133,7 @@ parse_assignment_value(pm_parser_t *parser, pm_binding_power_t previous_binding_ pm_token_t rescue = parser->current; parser_lex(parser); - pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_MATCH + 1, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); + pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); context_pop(parser); return UP(pm_rescue_modifier_node_create(parser, value, &rescue, right)); @@ -20435,7 +20250,7 @@ parse_assignment_values(pm_parser_t *parser, pm_binding_power_t previous_binding } } - pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_MATCH + 1, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | (accepts_command_call_inner ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0), PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); + pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | (accepts_command_call_inner ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0), PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); context_pop(parser); return UP(pm_rescue_modifier_node_create(parser, value, &rescue, right)); @@ -20466,31 +20281,6 @@ parse_call_operator_write(pm_parser_t *parser, pm_call_node_t *call_node, const } } -/** - * This struct is used to pass information between the regular expression parser - * and the named capture callback. - */ -typedef struct { - /** The parser that is parsing the regular expression. */ - pm_parser_t *parser; - - /** The call node wrapping the regular expression node. */ - pm_call_node_t *call; - - /** The match write node that is being created. */ - pm_match_write_node_t *match; - - /** The list of names that have been parsed. */ - pm_constant_id_list_t names; - - /** - * Whether the content of the regular expression is shared. This impacts - * whether or not we used owned constants or shared constants in the - * constant pool for the names of the captures. - */ - bool shared; -} parse_regular_expression_named_capture_data_t; - static inline const uint8_t * pm_named_capture_escape_hex(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) { cursor++; @@ -20543,7 +20333,7 @@ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, con if (*cursor != '{') { size_t length = pm_strspn_hexadecimal_digit(cursor, MIN(end - cursor, 4)); - uint32_t value = escape_unicode(parser, cursor, length, error_location); + uint32_t value = escape_unicode(parser, cursor, length, error_location, 0); if (!pm_buffer_append_unicode_codepoint(unescaped, value)) { pm_buffer_append_string(unescaped, (const char *) start, (size_t) ((cursor + length) - start)); @@ -20566,7 +20356,7 @@ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, con if (length == 0) { break; } - uint32_t value = escape_unicode(parser, cursor, length, error_location); + uint32_t value = escape_unicode(parser, cursor, length, error_location, 0); (void) pm_buffer_append_unicode_codepoint(unescaped, value); cursor += length; @@ -20616,10 +20406,7 @@ pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8 * capture group. */ static void -parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { - parse_regular_expression_named_capture_data_t *callback_data = (parse_regular_expression_named_capture_data_t *) data; - - pm_parser_t *parser = callback_data->parser; +parse_regular_expression_named_capture(pm_parser_t *parser, const pm_string_t *capture, bool shared, pm_regexp_name_data_t *callback_data) { pm_call_node_t *call = callback_data->call; pm_constant_id_list_t *names = &callback_data->names; @@ -20637,7 +20424,7 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { // unescaped, which is what we need. const uint8_t *cursor = pm_memchr(source, '\\', length, parser->encoding_changed, parser->encoding); if (PRISM_UNLIKELY(cursor != NULL)) { - pm_named_capture_escape(parser, &unescaped, source, length, cursor, callback_data->shared ? NULL : &call->receiver->location); + pm_named_capture_escape(parser, &unescaped, source, length, cursor, shared ? NULL : &call->receiver->location); source = (const uint8_t *) pm_buffer_value(&unescaped); length = pm_buffer_length(&unescaped); } @@ -20653,7 +20440,7 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { return; } - if (callback_data->shared) { + if (shared) { // If the unescaped string is a slice of the source, then we can // copy the names directly. The pointers will line up. start = source; @@ -20707,26 +20494,19 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { } /** - * Potentially change a =~ with a regular expression with named captures into a - * match write node. + * Potentially change a =~ with an interpolated regular expression with named + * captures into a match write node. This is for the interpolated case where + * we have concatenated content rather than a regular expression node. */ static pm_node_t * -parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call, bool extended_mode) { - parse_regular_expression_named_capture_data_t callback_data = { - .parser = parser, +parse_interpolated_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call, bool extended_mode) { + pm_regexp_name_data_t callback_data = { .call = call, + .match = NULL, .names = { 0 }, - .shared = content->type == PM_STRING_SHARED - }; - - parse_regular_expression_error_data_t error_data = { - .parser = parser, - .start = parser->start + PM_NODE_START(call->receiver), - .end = parser->start + PM_NODE_END(call->receiver), - .shared = content->type == PM_STRING_SHARED }; - pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), extended_mode, parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data); + pm_regexp_parse_named_captures(parser, pm_string_source(content), pm_string_length(content), false, extended_mode, parse_regular_expression_named_capture, &callback_data); if (callback_data.match != NULL) { return UP(callback_data.match); @@ -21248,14 +21028,25 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t pm_string_t owned; pm_string_owned_init(&owned, (uint8_t *) memory, total_length); - result = parse_regular_expression_named_captures(parser, &owned, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED)); + result = parse_interpolated_regular_expression_named_captures(parser, &owned, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED)); pm_string_free(&owned); } } else if (PM_NODE_TYPE_P(node, PM_REGULAR_EXPRESSION_NODE)) { - // If we have a regular expression node, then we can just parse - // the named captures directly off the unescaped string. - const pm_string_t *content = &((pm_regular_expression_node_t *) node)->unescaped; - result = parse_regular_expression_named_captures(parser, content, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED)); + // If we have a regular expression node, then we can parse + // the named captures and validate encoding in one pass. + pm_regular_expression_node_t *regexp = (pm_regular_expression_node_t *) node; + + pm_regexp_name_data_t name_data = { + .call = call, + .match = NULL, + .names = { 0 }, + }; + + pm_node_flag_set(UP(regexp), pm_regexp_parse(parser, regexp, parse_regular_expression_named_capture, &name_data)); + + if (name_data.match != NULL) { + result = UP(name_data.match); + } } return result; @@ -22137,7 +21928,6 @@ pm_parser_init(pm_arena_t *arena, pm_parser_t *parser, const uint8_t *source, si .current_block_exits = NULL, .semantic_token_seen = false, .frozen_string_literal = PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET, - .current_regular_expression_ascii_only = false, .warn_mismatched_indentation = true }; diff --git a/src/regexp.c b/src/regexp.c index dcc7476244..23495a35e7 100644 --- a/src/regexp.c +++ b/src/regexp.c @@ -1,5 +1,9 @@ #include "prism/regexp.h" +#include "prism/diagnostic.h" +#include "prism/util/pm_buffer.h" +#include "prism/util/pm_strncasecmp.h" +/** The maximum depth of nested groups allowed in a regular expression. */ #define PM_REGEXP_PARSE_DEPTH_MAX 4096 /** @@ -18,6 +22,54 @@ typedef struct { /** A pointer to the end of the source that we are parsing. */ const uint8_t *end; + /** The encoding of the source. */ + const pm_encoding_t *encoding; + + /** The callback to call when a named capture group is found. */ + pm_regexp_name_callback_t name_callback; + + /** The data to pass to the name callback. */ + pm_regexp_name_data_t *name_data; + + /** The start of the regexp node (for error locations). */ + const uint8_t *node_start; + + /** The end of the regexp node (for error locations). */ + const uint8_t *node_end; + + /** + * The explicit encoding determined by escape sequences. NULL if no + * encoding-setting escape has been seen, UTF-8 for `\u` escapes, or the + * source encoding for `\x` escapes. + */ + const pm_encoding_t *explicit_encoding; + + /** + * Pointer to the first non-POSIX property name (for /n error messages). + * POSIX properties (Alnum, Alpha, etc.) work in all encodings. + * Script properties (Hiragana, Katakana, etc.) work in /e, /s, /u. + * Unicode-only properties (L, Ll, etc.) work only in /u. + */ + const uint8_t *property_name; + + /** Length of the first non-POSIX property name found. */ + size_t property_name_length; + + /** + * Pointer to the first Unicode-only property name (for /e, /s error + * messages). NULL if only POSIX or script properties have been seen. + */ + const uint8_t *unicode_property_name; + + /** Length of the first Unicode-only property name found. */ + size_t unicode_property_name_length; + + /** Buffer of hex escape byte values >= 0x80, separated by 0x00 sentinels. */ + pm_buffer_t hex_escape_buffer; + + /** Count of non-ASCII literal bytes (not from escapes). */ + uint32_t non_ascii_literal_count; + /** * Whether or not the regular expression currently being parsed is in * extended mode, wherein whitespace is ignored and comments are allowed. @@ -27,30 +79,76 @@ typedef struct { /** Whether the encoding has changed from the default. */ bool encoding_changed; - /** The encoding of the source. */ - const pm_encoding_t *encoding; + /** Whether the source content is shared (for named capture callback). */ + bool shared; - /** The callback to call when a named capture group is found. */ - pm_regexp_name_callback_t name_callback; + /** Whether a `\u{...}` escape with value >= 0x80 was seen. */ + bool has_unicode_escape; - /** The data to pass to the name callback. */ - void *name_data; + /** Whether a `\xNN` escape (or `\M-x`, etc.) with value >= 0x80 was seen. */ + bool has_hex_escape; + + /** + * Tracks whether the last encoding-setting escape was `\u` (true) or `\x` + * (false). This matters for error messages when both types are mixed. + */ + bool last_escape_was_unicode; + + /** Whether any `\p{...}` or `\P{...}` property escape was found. */ + bool has_property_escape; - /** The callback to call when a parse error is found. */ - pm_regexp_error_callback_t error_callback; + /** Whether a Unicode-only property escape was found (not POSIX or script). */ + bool has_unicode_property_escape; - /** The data to pass to the error callback. */ - void *error_data; + /** Whether a `\u` escape with invalid range (surrogate or > 0x10FFFF) was seen. */ + bool invalid_unicode_range; + + /** Whether we are accumulating consecutive hex escape bytes. */ + bool hex_group_active; + + /** Whether an invalid multibyte character was found during parsing. */ + bool has_invalid_multibyte; } pm_regexp_parser_t; /** - * Append an error to the parser. + * Append a syntax error to the parser's error list. If the source is shared + * (points into the original source), we can point to the exact error location. + * Otherwise, we point to the whole regexp node. */ static inline void pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) { - parser->error_callback(start, end, message, parser->error_data); + pm_parser_t *pm = parser->parser; + uint32_t loc_start, loc_length; + + if (parser->shared) { + loc_start = (uint32_t) (start - pm->start); + loc_length = (uint32_t) (end - start); + } else { + loc_start = (uint32_t) (parser->node_start - pm->start); + loc_length = (uint32_t) (parser->node_end - parser->node_start); + } + + pm_diagnostic_list_append_format(&pm->error_list, loc_start, loc_length, PM_ERR_REGEXP_PARSE_ERROR, message); } +/** + * Append a formatted diagnostic error with proper shared/non-shared location + * handling. This is a macro because we need variadic args for the format string. + */ +#define pm_regexp_parse_error_format(parser_, err_start_, err_end_, diag_id, ...) \ + do { \ + pm_parser_t *pm__ = (parser_)->parser; \ + uint32_t loc_start__, loc_length__; \ + if ((parser_)->shared) { \ + loc_start__ = (uint32_t) ((err_start_) - pm__->start); \ + loc_length__ = (uint32_t) ((err_end_) - (err_start_)); \ + } else { \ + loc_start__ = (uint32_t) ((parser_)->node_start - pm__->start); \ + loc_length__ = (uint32_t) ((parser_)->node_end - (parser_)->node_start); \ + } \ + pm_diagnostic_list_append_format(&pm__->error_list, loc_start__, loc_length__, diag_id, __VA_ARGS__); \ + } while (0) + /** * This appends a new string to the list of named captures. This function * assumes the caller has already checked the validity of the name callback. @@ -59,7 +157,7 @@ static void pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) { pm_string_t string; pm_string_shared_init(&string, start, end); - parser->name_callback(&string, parser->name_data); + parser->name_callback(parser->parser, &string, parser->shared, parser->name_data); pm_string_free(&string); } @@ -113,6 +211,47 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) { return true; } +/** + * Mark a group boundary in the hex escape byte buffer. When consecutive hex + * escape bytes >= 0x80 are followed by a non-hex-escape, this appends a 0x00 + * sentinel to separate the groups for later multibyte validation. + */ +static inline void +pm_regexp_hex_group_boundary(pm_regexp_parser_t *parser) { + if (parser->hex_group_active) { + pm_buffer_append_byte(&parser->hex_escape_buffer, 0x00); + parser->hex_group_active = false; + } +} + +/** + * Track a hex escape byte value >= 0x80 for multibyte validation. + */ +static inline void +pm_regexp_track_hex_escape(pm_regexp_parser_t *parser, uint8_t byte) { + if (byte >= 0x80) { + pm_buffer_append_byte(&parser->hex_escape_buffer, byte); + parser->hex_group_active = true; + parser->has_hex_escape = true; + + parser->explicit_encoding = parser->encoding; + parser->last_escape_was_unicode = false; + } else { + pm_regexp_hex_group_boundary(parser); + } +} + +/** + * Parse a hex digit character and return its value, or -1 if not a hex digit. + */ +static inline int +pm_regexp_hex_digit_value(uint8_t byte) { + if (byte >= '0' && byte <= '9') return byte - '0'; + if (byte >= 'a' && byte <= 'f') return byte - 'a' + 10; + if (byte >= 'A' && byte <= 'F') return byte - 'A' + 10; + return -1; +} + /** * Range quantifiers are a special class of quantifiers that look like * @@ -121,13 +260,12 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) { * * {digit,digit} * * {,digit} * - * Unfortunately, if there are any spaces in between, then this just becomes a - * regular character match expression and we have to backtrack. So when this - * function first starts running, we'll create a "save" point and then attempt - * to parse the quantifier. If it fails, we'll restore the save point and - * return. + * If there are any spaces in between, then this just becomes a regular + * character match expression and we have to backtrack. So when this function + * first starts running, we'll create a "save" point and then attempt to parse + * the quantifier. If it fails, we'll restore the save point and return. * - * The properly track everything, we're going to build a little state machine. + * To properly track everything, we're going to build a little state machine. * It looks something like the following: * * +-------+ +---------+ ------------+ @@ -275,10 +413,368 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) { ); } +/** + * Property escape classification. Onigmo supports three tiers of property + * names depending on the encoding: + * + * - POSIX properties (Alnum, Alpha, ASCII, Blank, Cntrl, Digit, Graph, Lower, + * Print, Punct, Space, Upper, XDigit, Word): valid in all encodings. + * - Script properties (Hiragana, Katakana, Han, Latin, Greek, Cyrillic): valid + * in EUC-JP (/e), Windows-31J (/s), and UTF-8 (/u), but not ASCII-8BIT (/n). + * - Unicode-only properties (general categories like L, Ll, Lu, etc., plus + * Any, Assigned): valid only in UTF-8 (/u). + */ +typedef enum { + PM_REGEXP_PROPERTY_POSIX, + PM_REGEXP_PROPERTY_SCRIPT, + PM_REGEXP_PROPERTY_UNICODE +} pm_regexp_property_type_t; + +/** + * Check if a property name matches a NUL-terminated target string + * (case-insensitive, exact length match). + */ +static inline bool +pm_regexp_property_name_matches(const uint8_t *name, size_t length, const char *target) { + return target[length] == '\0' && pm_strncasecmp(name, (const uint8_t *) target, length) == 0; +} + +/** + * Classify a property name. The name may start with '^' for negation, which + * is skipped before matching. + */ +static pm_regexp_property_type_t +pm_regexp_classify_property(const uint8_t *name, size_t length) { + // Skip leading '^' for negated properties like \p{^Hiragana}. + if (length > 0 && name[0] == '^') { + name++; + length--; + } + + // POSIX properties — valid in all encodings. + static const char *const posix_properties[] = { + "Alnum", "Alpha", "ASCII", "Blank", "Cntrl", "Digit", "Graph", + "Lower", "Print", "Punct", "Space", "Upper", "XDigit", "Word", + NULL + }; + + for (const char *const *property = posix_properties; *property != NULL; property++) { + if (pm_regexp_property_name_matches(name, length, *property)) { + return PM_REGEXP_PROPERTY_POSIX; + } + } + + // Script properties — valid in /e, /s, /u but not /n. + static const char *const script_properties[] = { + "Hiragana", "Katakana", "Han", "Latin", "Greek", "Cyrillic", + NULL + }; + + for (const char *const *property = script_properties; *property != NULL; property++) { + if (pm_regexp_property_name_matches(name, length, *property)) { + return PM_REGEXP_PROPERTY_SCRIPT; + } + } + + // Everything else is Unicode-only (general categories, other scripts, etc.). + return PM_REGEXP_PROPERTY_UNICODE; +} + +/** + * Check for and skip a `\p{...}` or `\P{...}` Unicode property escape. The + * cursor should be pointing at 'p' or 'P' when this is called. If a property + * escape is found, record it on the regexp parser and advance past the closing + * '}'. + * + * Properties are classified into three tiers (POSIX, script, Unicode-only) to + * determine which encoding modifiers they are valid with. + */ +static bool +pm_regexp_parse_property_escape(pm_regexp_parser_t *parser) { + assert(*parser->cursor == 'p' || *parser->cursor == 'P'); + + if (parser->cursor + 1 < parser->end && parser->cursor[1] == '{') { + const uint8_t *name_start = parser->cursor + 2; + const uint8_t *search = name_start; + + while (search < parser->end && *search != '}') search++; + + if (search < parser->end) { + size_t name_length = (size_t) (search - name_start); + parser->has_property_escape = true; + + pm_regexp_property_type_t type = pm_regexp_classify_property(name_start, name_length); + + // Track the first non-POSIX property name (for /n error messages). + if (type >= PM_REGEXP_PROPERTY_SCRIPT && parser->property_name == NULL) { + parser->property_name = name_start; + parser->property_name_length = name_length; + } + + // Track the first Unicode-only property name (for /e, /s error messages). + if (type == PM_REGEXP_PROPERTY_UNICODE) { + parser->has_unicode_property_escape = true; + if (parser->unicode_property_name == NULL) { + parser->unicode_property_name = name_start; + parser->unicode_property_name_length = name_length; + } + } + + parser->cursor = search + 1; // skip past '}' + return true; + } + } + + // Not a property escape, just skip the single character after '\'. + parser->cursor++; + return false; +} + +/** + * Validate and skip a \u escape sequence in a regular expression. The cursor + * should be pointing at the character after 'u' when this is called. This + * handles both the \u{NNNN MMMM} and \uNNNN forms. Also tracks encoding + * state for validation. + */ +static void +pm_regexp_parse_unicode_escape(pm_regexp_parser_t *parser) { + const uint8_t *escape_start = parser->cursor - 2; // points to '\' + + if (pm_regexp_char_is_eof(parser)) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape"); + return; + } + + if (*parser->cursor == '{') { + parser->cursor++; // skip '{' + + // Skip leading whitespace. + while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) { + parser->cursor++; + } + + bool has_codepoint = false; + + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') { + // Parse the hex digits to compute the codepoint value. + uint32_t value = 0; + size_t hex_count = 0; + + int digit; + while (!pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) { + value = (value << 4) | (uint32_t) digit; + hex_count++; + parser->cursor++; + } + + if (hex_count == 0) { + // Skip to '}' or end of regexp to find the full extent. + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') { + parser->cursor++; + } + + const uint8_t *escape_end = parser->cursor; + if (!pm_regexp_char_is_eof(parser)) { + escape_end++; + parser->cursor++; // skip '}' + } + + pm_regexp_parse_error_format(parser, escape_start, escape_end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (escape_end - escape_start), (const char *) escape_start); + return; + } + + if (hex_count > 6) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode range"); + } + + // Track encoding state for this codepoint. + if (value >= 0x80) { + parser->has_unicode_escape = true; + parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; + parser->last_escape_was_unicode = true; + pm_regexp_hex_group_boundary(parser); + } + + // Check for invalid Unicode range (surrogates or > 0x10FFFF). + if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) { + parser->invalid_unicode_range = true; + } + + has_codepoint = true; + + // Skip whitespace between codepoints. + while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) { + parser->cursor++; + } + } + + if (pm_regexp_char_is_eof(parser)) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "unterminated Unicode escape"); + } else { + if (!has_codepoint) { + pm_regexp_parse_error_format(parser, escape_start, parser->cursor + 1, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->cursor + 1 - escape_start), (const char *) escape_start); + } + parser->cursor++; // skip '}' + } + } else { + // \uNNNN form — need exactly 4 hex digits. + uint32_t value = 0; + size_t hex_count = 0; + + int digit; + while (hex_count < 4 && !pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) { + value = (value << 4) | (uint32_t) digit; + hex_count++; + parser->cursor++; + } + + if (hex_count < 4) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape"); + } else if (value >= 0x80) { + parser->has_unicode_escape = true; + parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; + parser->last_escape_was_unicode = true; + pm_regexp_hex_group_boundary(parser); + } + + // Check for invalid Unicode range. + if (hex_count == 4 && (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))) { + parser->invalid_unicode_range = true; + } + } +} + // Forward declaration because character sets can be nested. static bool pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth); +/** + * Parse a \x escape and return the byte value. The cursor should be pointing + * at the character after 'x'. Returns -1 if no hex digits follow. + */ +static int +pm_regexp_parse_hex_escape(pm_regexp_parser_t *parser) { + int value = -1; + + if (!pm_regexp_char_is_eof(parser)) { + int digit = pm_regexp_hex_digit_value(*parser->cursor); + if (digit >= 0) { + value = digit; + parser->cursor++; + + if (!pm_regexp_char_is_eof(parser)) { + digit = pm_regexp_hex_digit_value(*parser->cursor); + if (digit >= 0) { + value = (value << 4) | digit; + parser->cursor++; + } + } + } + } + + if (value >= 0) { + pm_regexp_track_hex_escape(parser, (uint8_t) value); + } + + return value; +} + +/** + * Parse a backslash escape sequence in a regexp, handling \u (unicode), + * \p/\P (property), \x (hex), and other single-character escapes. Also + * tracks encoding state for \M-x and \C-\M-x escapes. + */ +static void +pm_regexp_parse_backslash_escape(pm_regexp_parser_t *parser) { + if (pm_regexp_char_is_eof(parser)) return; + + switch (*parser->cursor) { + case 'u': + parser->cursor++; // skip 'u' + pm_regexp_parse_unicode_escape(parser); + break; + case 'p': + case 'P': + pm_regexp_parse_property_escape(parser); + break; + case 'x': + parser->cursor++; // skip 'x' + pm_regexp_parse_hex_escape(parser); + break; + case 'M': + // \M-x produces (x | 0x80), always >= 0x80 + if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') { + parser->cursor += 2; // skip 'M-' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + // \M-\C-x or \M-\cx — the resulting byte is always >= 0x80 + // We just need to track it as a hex escape >= 0x80. + pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + // \M-x always produces a byte >= 0x80 + pm_regexp_track_hex_escape(parser, 0x80); + } + } else { + parser->cursor++; + } + break; + case 'C': + // \C-x produces (x & 0x1F) + if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') { + parser->cursor += 2; // skip 'C-' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + } + } else { + parser->cursor++; + } + break; + case 'c': + // \cx produces (x & 0x1F) + parser->cursor++; // skip 'c' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + } + break; + default: + pm_regexp_hex_group_boundary(parser); + parser->cursor++; + break; + } +} + +/** + * Check if a byte at the current position is a non-ASCII byte in a multibyte + * encoding that produces an invalid character. If so, emit an error at the + * byte location immediately. + */ +static void +pm_regexp_parse_invalid_multibyte(pm_regexp_parser_t *parser, const uint8_t *cursor) { + uint8_t byte = *cursor; + if (byte >= 0x80 && parser->encoding_changed && parser->encoding->multibyte) { + size_t width = parser->encoding->char_width(cursor, (ptrdiff_t) (parser->end - cursor)); + if (width > 1) { + parser->cursor += width - 1; + } else if (width == 0) { + parser->has_invalid_multibyte = true; + pm_regexp_parse_error_format(parser, cursor, cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } +} + /** * match-char-set : '[' '^'? (match-range | match-char)* ']' * ; @@ -293,12 +789,16 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) { pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1)); break; case '\\': - if (!pm_regexp_char_is_eof(parser)) { - parser->cursor++; - } + pm_regexp_parse_backslash_escape(parser); break; default: - // do nothing, we've already advanced the cursor + // We've already advanced the cursor by one byte. If the byte + // was >= 0x80 in a multibyte encoding, we may need to consume + // additional continuation bytes and validate the character. + if (*(parser->cursor - 1) >= 0x80) { + parser->non_ascii_literal_count++; + } + pm_regexp_parse_invalid_multibyte(parser, parser->cursor - 1); break; } } @@ -354,8 +854,13 @@ typedef enum { // These are the options that are configurable on the regular expression (or // from within a group). +/** The minimum character value for a regexp option slot. */ #define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a' + +/** The maximum character value for a regexp option slot. */ #define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x' + +/** The number of regexp option slots. */ #define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1) /** @@ -498,7 +1003,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) { } size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); - if (width == 0) return false; + if (width == 0) { + if (*parser->cursor >= 0x80) { + parser->has_invalid_multibyte = true; + pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + parser->cursor++; + continue; + } + return false; + } escaped = (width == 1) && (*parser->cursor == '\\'); parser->cursor += width; @@ -686,9 +1199,7 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) { return pm_regexp_parse_quantifier(parser); case '\\': parser->cursor++; - if (!pm_regexp_char_is_eof(parser)) { - parser->cursor++; - } + pm_regexp_parse_backslash_escape(parser); return pm_regexp_parse_quantifier(parser); case '(': parser->cursor++; @@ -720,9 +1231,30 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) { width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); } - if (width == 0) return false; // TODO: add appropriate error - parser->cursor += width; + if (width == 0) { + if (*parser->cursor >= 0x80 && parser->encoding_changed) { + if (parser->encoding->multibyte) { + // Invalid multibyte character in a multibyte encoding. + // Emit the error at the byte location immediately. + parser->has_invalid_multibyte = true; + pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } else { + // Non-ASCII byte in a single-byte encoding (e.g., + // US-ASCII). Count it for later error reporting. + parser->non_ascii_literal_count++; + } + parser->cursor++; + return pm_regexp_parse_quantifier(parser); + } + return false; + } + // Count non-ASCII literal bytes. + for (size_t i = 0; i < width; i++) { + if (parser->cursor[i] >= 0x80) parser->non_ascii_literal_count++; + } + + parser->cursor += width; return pm_regexp_parse_quantifier(parser); } } @@ -768,13 +1300,353 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) { return pm_regexp_char_is_eof(parser); } +// --------------------------------------------------------------------------- +// Encoding validation +// --------------------------------------------------------------------------- + /** - * Parse a regular expression and extract the names of all of the named capture - * groups. + * Validate that groups of hex escape bytes in the buffer form valid multibyte + * characters in the given encoding. Groups are separated by 0x00 sentinels. + */ +static bool +pm_regexp_validate_hex_escapes(const pm_encoding_t *encoding, const pm_buffer_t *buffer) { + const uint8_t *data = (const uint8_t *) pm_buffer_value(buffer); + size_t len = pm_buffer_length(buffer); + size_t i = 0; + + while (i < len) { + size_t group_start = i; + while (i < len && data[i] != 0x00) i++; + + for (size_t j = group_start; j < i; ) { + size_t width = encoding->char_width(data + j, (ptrdiff_t) (i - j)); + if (width == 0) return false; + j += width; + } + + if (i < len) i++; // skip sentinel + } + + return true; +} + +/** + * Format regexp source content for use in error messages, hex-escaping + * non-ASCII bytes. + */ +static void +pm_regexp_format_for_error(pm_buffer_t *buffer, const pm_encoding_t *encoding, const uint8_t *source, size_t length) { + size_t index = 0; + + if (encoding == PM_ENCODING_UTF_8_ENTRY) { + pm_buffer_append_string(buffer, (const char *) source, length); + return; + } + + while (index < length) { + if (source[index] < 0x80) { + pm_buffer_append_byte(buffer, source[index]); + index++; + } else if (encoding->multibyte) { + size_t width = encoding->char_width(source + index, (ptrdiff_t) (length - index)); + + if (width > 1) { + pm_buffer_append_string(buffer, "\\x{", 3); + for (size_t i = 0; i < width; i++) { + pm_buffer_append_format(buffer, "%02X", source[index + i]); + } + pm_buffer_append_byte(buffer, '}'); + index += width; + } else { + pm_buffer_append_format(buffer, "\\x%02X", source[index]); + index++; + } + } else { + pm_buffer_append_format(buffer, "\\x%02X", source[index]); + index++; + } + } +} + +/** + * Emit an encoding validation error on the regexp node. + */ +#define PM_REGEXP_ENCODING_ERROR(parser, diag_id, ...) \ + pm_diagnostic_list_append_format( \ + &(parser)->parser->error_list, \ + (uint32_t) ((parser)->node_start - (parser)->parser->start), \ + (uint32_t) ((parser)->node_end - (parser)->node_start), \ + diag_id, __VA_ARGS__) + +/** + * Validate encoding for a regexp with an encoding modifier (/e, /s, /u, /n). + * + * The decision tree is: + * + * 1. No escape-set encoding (explicit_encoding == NULL): + * a. ASCII-only content: validate property escapes, return forced US-ASCII + * for /n or the modifier flags for others. + * b. US-ASCII source with non-ASCII literals: emit per-byte errors. + * c. Source encoding differs from modifier encoding: emit mismatch error. + * + * 2. Mixed \u and \x escapes: emit the appropriate conflict error depending + * on the modifier and which escape type was last. + * + * 3. \u escape with non-/u modifier: incompatible encoding error. + * + * 4. Validate that hex escape byte sequences form valid multibyte characters + * in the modifier's encoding. + */ +static pm_node_flags_t +pm_regexp_validate_encoding_modifier(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding, const char *source_start, int source_length) { + + if (parser->explicit_encoding == NULL) { + if (ascii_only) { + // Check property escapes against the modifier's encoding tier. + // /n (ASCII-8BIT): only POSIX properties are valid. + // /e, /s: POSIX and script properties are valid. + // /u: all properties are valid. + if (modifier == 'n' && parser->property_name != NULL) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY, + (int) parser->property_name_length, (const char *) parser->property_name, + source_length, source_start); + } else if (modifier != 'u' && parser->has_unicode_property_escape) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY, + (int) parser->unicode_property_name_length, (const char *) parser->unicode_property_name, + source_length, source_start); + } + return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags; + } + + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } else if (parser->encoding != modifier_encoding) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name); + + if (modifier == 'n' && !ascii_only) { + pm_buffer_t formatted = { 0 }; + pm_regexp_format_for_error(&formatted, parser->encoding, (const uint8_t *) source_start, (size_t) source_length); + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) formatted.length, (const char *) formatted.value); + pm_buffer_free(&formatted); + } + } + + return flags; + } + + // Mixed unicode + hex escapes. + if (parser->has_unicode_escape && parser->has_hex_escape) { + if (modifier == 'n') { + if (parser->last_escape_was_unicode) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start); + } else { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start); + } + } else { + if (!pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + } + + return flags; + } + + if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + if (parser->last_escape_was_unicode) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start); + } else if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start); + } + } + + if (modifier != 'n' && !pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + + return flags; +} + +/** + * Validate encoding for a regexp without a modifier and compute the encoding + * flags to set on the node. + * + * The decision tree is: + * + * 1. If a modifier (/n, /u, /e, /s) is present, delegate to + * pm_regexp_validate_encoding_modifier. + * 2. Invalid multibyte chars or unicode ranges: suppress further checks (errors + * were already emitted during parsing). + * 3. US-ASCII source with non-ASCII literals: emit per-byte errors. + * 4. ASCII-only content: return forced US-ASCII (or forced UTF-8 if \p{...}). + * 5. Escape-set encoding present: validate hex escapes against the target + * encoding, handle mixed \u + \x conflicts, and return the appropriate + * forced encoding flag. + */ +static pm_node_flags_t +pm_regexp_validate_encoding(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, const char *source_start, int source_length) { + + // Invalid multibyte characters suppress further validation. + // Errors were already emitted at the byte locations during parsing. + if (parser->has_invalid_multibyte) { + return flags; + } + + if (parser->invalid_unicode_range) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, source_length, source_start); + return flags; + } + + // Check modifier flags first. + if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY, source_start, source_length); + } + if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY, source_start, source_length); + } + if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY, source_start, source_length); + } + if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY, source_start, source_length); + } + + // No modifier — check for non-ASCII literals in US-ASCII encoding. + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) { + for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } + + // ASCII-only regexps get downgraded to US-ASCII, unless property escapes + // force UTF-8. + if (ascii_only) { + if (parser->has_property_escape) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING; + } + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING; + } + + // Check explicit encoding from escape sequences. + if (parser->explicit_encoding != NULL) { + // Mixed unicode + hex escapes without modifier. + if (parser->has_unicode_escape && parser->has_hex_escape && parser->encoding != PM_ENCODING_UTF_8_ENTRY) { + if (parser->encoding != PM_ENCODING_US_ASCII_ENTRY && + parser->encoding != PM_ENCODING_ASCII_8BIT_ENTRY && + !pm_regexp_validate_hex_escapes(parser->encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } else if (parser->last_escape_was_unicode) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start); + } else { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start); + } + + return 0; + } + + if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING; + } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING; + } else { + if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + } + } + + return 0; +} + +/** + * Parse a regular expression, validate its encoding, and optionally extract + * named capture groups. Encoding validation walks the raw source (content_loc) + * to distinguish escape-produced bytes from literal bytes. Named capture + * extraction walks the unescaped content since escape sequences in group names + * (e.g., line continuations) have already been processed by the lexer. + */ +PRISM_EXPORTED_FUNCTION pm_node_flags_t +pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) { + const uint8_t *source = parser->start + node->content_loc.start; + size_t size = node->content_loc.length; + bool extended_mode = PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED); + pm_node_flags_t flags = PM_NODE_FLAGS(node); + + const uint8_t *node_start = parser->start + node->base.location.start; + const uint8_t *node_end = parser->start + node->base.location.start + node->base.location.length; + + // First pass: walk raw source for encoding validation (no name extraction). + pm_regexp_parser_t regexp_parser = { + .parser = parser, + .start = source, + .cursor = source, + .end = source + size, + .extended_mode = extended_mode, + .encoding_changed = parser->encoding_changed, + .encoding = parser->encoding, + .name_callback = NULL, + .name_data = NULL, + .shared = true, + .node_start = node_start, + .node_end = node_end, + .has_unicode_escape = false, + .has_hex_escape = false, + .last_escape_was_unicode = false, + .explicit_encoding = NULL, + .has_property_escape = false, + .has_unicode_property_escape = false, + .property_name = NULL, + .property_name_length = 0, + .unicode_property_name = NULL, + .unicode_property_name_length = 0, + .non_ascii_literal_count = 0, + .invalid_unicode_range = false, + .hex_escape_buffer = { 0 }, + .hex_group_active = false, + .has_invalid_multibyte = false, + }; + + pm_regexp_parse_pattern(®exp_parser); + + // Compute ascii_only from the regexp parser's tracked state. We cannot + // use node->unescaped for this because regexp unescaped content preserves + // escape text (e.g., \x80 is 4 ASCII chars), not the binary values. + bool ascii_only = !regexp_parser.has_hex_escape && !regexp_parser.has_unicode_escape && regexp_parser.non_ascii_literal_count == 0; + // Use the unescaped content for error messages to match CRuby's format, + // where Ruby escapes like \M-\C-? are resolved to bytes but regexp escapes + // like \u{80} are preserved as text. + const char *error_source = (const char *) pm_string_source(&node->unescaped); + int error_source_length = (int) pm_string_length(&node->unescaped); + pm_node_flags_t encoding_flags = pm_regexp_validate_encoding(®exp_parser, ascii_only, flags, error_source, error_source_length); + pm_buffer_free(®exp_parser.hex_escape_buffer); + + // Second pass: walk unescaped content for named capture extraction. + if (name_callback != NULL) { + bool shared = node->unescaped.type == PM_STRING_SHARED; + pm_regexp_parse_named_captures(parser, pm_string_source(&node->unescaped), pm_string_length(&node->unescaped), shared, extended_mode, name_callback, name_data); + } + + return encoding_flags; +} + +/** + * Parse an interpolated regular expression for named capture groups only. + * This is used for the =~ operator with interpolated regexps where we don't + * have a pm_regular_expression_node_t. No encoding validation is performed. + * + * Note: The encoding-tracking fields (has_unicode_escape, has_hex_escape, etc.) + * are initialized but not used for the result. They exist because the parsing + * functions (pm_regexp_parse_backslash_escape, etc.) unconditionally update + * them as they walk through the content. */ -PRISM_EXPORTED_FUNCTION void -pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) { - pm_regexp_parse_pattern(&(pm_regexp_parser_t) { +void +pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) { + pm_regexp_parser_t regexp_parser = { .parser = parser, .start = source, .cursor = source, @@ -784,7 +1656,26 @@ pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool ex .encoding = parser->encoding, .name_callback = name_callback, .name_data = name_data, - .error_callback = error_callback, - .error_data = error_data - }); + .shared = shared, + .node_start = source, + .node_end = source + size, + .has_unicode_escape = false, + .has_hex_escape = false, + .last_escape_was_unicode = false, + .explicit_encoding = NULL, + .has_property_escape = false, + .has_unicode_property_escape = false, + .property_name = NULL, + .property_name_length = 0, + .unicode_property_name = NULL, + .unicode_property_name_length = 0, + .non_ascii_literal_count = 0, + .invalid_unicode_range = false, + .hex_escape_buffer = { 0 }, + .hex_group_active = false, + .has_invalid_multibyte = false, + }; + + pm_regexp_parse_pattern(®exp_parser); + pm_buffer_free(®exp_parser.hex_escape_buffer); } diff --git a/templates/src/diagnostic.c.erb b/templates/src/diagnostic.c.erb index d717dc1e16..8fa47590c0 100644 --- a/templates/src/diagnostic.c.erb +++ b/templates/src/diagnostic.c.erb @@ -330,13 +330,15 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_PATTERN_TERM_PAREN] = { "expected a `)` to close the pattern expression", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN] = { "unexpected `||=` in a multiple assignment", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH] = { "regexp encoding option '%c' differs from source encoding '%s'", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8] = { "escaped non ASCII character in UTF-8 regexp: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, - [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_INVALID_CHAR_PROPERTY] = { "invalid character property name {%.*s}: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_PARSE_ERROR] = { "%s", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_UNKNOWN_OPTIONS] = { "unknown regexp %s - %.*s", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_TERM] = { "unterminated regexp meets end of file; expected a closing delimiter", PM_ERROR_LEVEL_SYNTAX }, - [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_RESCUE_EXPRESSION] = { "expected a rescued expression", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_RESCUE_MODIFIER_VALUE] = { "expected a value after the `rescue` modifier", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_RESCUE_TERM] = { "expected a closing delimiter for the `rescue` clause", PM_ERROR_LEVEL_SYNTAX }, diff --git a/test/prism/encoding/regular_expression_encoding_test.rb b/test/prism/encoding/regular_expression_encoding_test.rb index e2daae1d7f..fdff1e3281 100644 --- a/test/prism/encoding/regular_expression_encoding_test.rb +++ b/test/prism/encoding/regular_expression_encoding_test.rb @@ -2,6 +2,7 @@ return unless defined?(RubyVM::InstructionSequence) return if RubyVM::InstructionSequence.compile("").to_a[4][:parser] == :prism +return if RUBY_VERSION < "3.2" require_relative "../test_helper" @@ -21,7 +22,7 @@ class RegularExpressionEncodingTest < TestCase ["n", "u", "e", "s"].each do |modifier| define_method(:"test_regular_expression_encoding_modifiers_/#{modifier}_#{encoding.name}") do - regexp_sources = ["abc", "garçon", "\\x80", "gar\\xC3\\xA7on", "gar\\u{E7}on", "abc\\u{FFFFFF}", "\\x80\\u{80}" ] + regexp_sources = ["abc", "garçon", "\\x80", "gar\\xC3\\xA7on", "gar\\u{E7}on", "abc\\u{FFFFFF}", "\\x80\\u{80}", "\\p{L}" ] assert_regular_expression_encoding_flags( encoding, @@ -35,17 +36,15 @@ class RegularExpressionEncodingTest < TestCase def assert_regular_expression_encoding_flags(encoding, regexps) regexps.each do |regexp| - regexp_modifier_used = regexp.end_with?("/u") || regexp.end_with?("/e") || regexp.end_with?("/s") || regexp.end_with?("/n") source = "# encoding: #{encoding.name}\n#{regexp}" - encoding_errors = ["invalid multibyte char", "escaped non ASCII character in UTF-8 regexp", "differs from source encoding"] - skipped_errors = ["invalid multibyte escape", "incompatible character encoding", "UTF-8 character in non UTF-8 regexp", "invalid Unicode range", "invalid Unicode list"] - - # TODO (nirvdrum 21-Feb-2024): Prism currently does not handle Regexp validation unless modifiers are used. So, skip processing those errors for now: https://github.com/ruby/prism/issues/2104 - unless regexp_modifier_used - skipped_errors += encoding_errors - encoding_errors.clear - end + encoding_errors = [ + "invalid multibyte char", "escaped non ASCII character in UTF-8 regexp", + "differs from source encoding", "incompatible character encoding", + "invalid multibyte escape", "UTF-8 character in non UTF-8 regexp", + "invalid Unicode range", "non escaped non ASCII character", + "invalid character property name", "invalid Unicode list", + ] expected = begin @@ -53,8 +52,6 @@ def assert_regular_expression_encoding_flags(encoding, regexps) rescue SyntaxError => error if encoding_errors.find { |e| error.message.include?(e) } error.message.split("\n").map { |m| m[/: (.+?)$/, 1] } - elsif skipped_errors.find { |e| error.message.include?(e) } - next else raise end @@ -111,19 +108,6 @@ def assert_regular_expression_encoding_flags(encoding, regexps) end end - # TODO (nirvdrum 22-Feb-2024): Remove this workaround once Prism better maps CRuby's error messages. - # This class of error message is tricky. The part not being compared is a representation of the regexp. - # Depending on the source encoding and any encoding modifiers being used, CRuby alters how the regexp is represented. - # Sometimes it's an MBC string. Other times it uses hexadecimal character escapes. And in other cases it uses - # the long-form Unicode escape sequences. This short-circuit checks that the error message is mostly correct. - if expected.is_a?(Array) && actual.is_a?(Array) - if expected.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:") && - actual.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:") - expected.pop - actual.pop - end - end - assert_equal expected, actual end end