From 0944c7fba294aea09b9448f84b0eca4ab74695c4 Mon Sep 17 00:00:00 2001
From: Kevin Newton <kddnewton@gmail.com>
Date: Tue, 10 Mar 2026 22:32:43 -0400
Subject: [PATCH] Restructure regexp encoding validation

Move all of the regexp encoding validation logic from prism.c into regexp.c. regexp.c now makes two passes. The first pass scans the raw source to track escape types, non-ASCII literals, and multibyte validity for encoding validation. The second pass scans the unescaped content to extract named captures (this is needed because escape sequences such as line continuations alter group names).

Fixed a couple of things along the way. ascii_only was previously computed from the unescaped content; it is now computed as we go, which avoids scanning again. Unicode properties also now properly produce an error for regexps with modifiers.
---
 config.yml                                    |   2 +
 include/prism/parser.h                        |   6 -
 include/prism/regexp.h                        |  53 +-
 .../seattlerb/regexp_escape_extended.txt      |   2 +-
 src/prism.c                                   | 370 ++-----
 src/regexp.c                                  | 965 +++++++++++++++++-
 templates/src/diagnostic.c.erb                |   6 +-
 .../regular_expression_encoding_test.rb       |  34 +-
 8 files changed, 1065 insertions(+), 373 deletions(-)

diff --git a/config.yml b/config.yml
index c82c239de6..1ea1bcf7f1 100644
--- a/config.yml
+++ b/config.yml
@@ -248,7 +248,9 @@ errors:
   - PATTERN_TERM_PAREN
   - PIPEPIPEEQ_MULTI_ASSIGN
   - REGEXP_ENCODING_OPTION_MISMATCH
+  - REGEXP_ESCAPED_NON_ASCII_IN_UTF8
   - REGEXP_INCOMPAT_CHAR_ENCODING
+  - REGEXP_INVALID_CHAR_PROPERTY
   - REGEXP_INVALID_UNICODE_RANGE
   - REGEXP_NON_ESCAPED_MBC
   - REGEXP_PARSE_ERROR
diff --git a/include/prism/parser.h b/include/prism/parser.h
index 5ebace10c6..d8e7a550e7 100644
--- a/include/prism/parser.h
+++ b/include/prism/parser.h
@@ -933,12 +933,6 @@ struct pm_parser {
      */
     bool semantic_token_seen;
 
-    /**
-     * True if the current regular expression being lexed contains only ASCII
-     * characters.
-     */
-    bool current_regular_expression_ascii_only;
-
     /**
      * By default, Ruby always warns about mismatched indentation. This can be
      * toggled with a magic comment.
diff --git a/include/prism/regexp.h b/include/prism/regexp.h
index 5366b5a5a0..b3e739b457 100644
--- a/include/prism/regexp.h
+++ b/include/prism/regexp.h
@@ -17,27 +17,56 @@
 #include <string.h>
 
 /**
- * This callback is called by pm_regexp_parse() when a named capture group is found.
+ * Accumulation state for named capture groups found during regexp parsing.
+ * The caller initializes this with the call node and passes it to
+ * pm_regexp_parse. The regexp parser populates match and names as groups
+ * are found.
  */
-typedef void (*pm_regexp_name_callback_t)(const pm_string_t *name, void *data);
+typedef struct {
+    /** The call node wrapping the regular expression node (for =~). */
+    pm_call_node_t *call;
+
+    /** The match write node being built, or NULL if no captures found yet. */
+    pm_match_write_node_t *match;
+
+    /** The list of capture names found so far (for deduplication). */
+    pm_constant_id_list_t names;
+} pm_regexp_name_data_t;
 
 /**
- * This callback is called by pm_regexp_parse() when a parse error is found.
+ * Callback invoked by pm_regexp_parse() for each named capture group found.
+ *
+ * @param parser The main parser.
+ * @param name The name of the capture group.
+ * @param shared Whether the source content is shared (impacts constant storage).
+ * @param data The accumulation state for named captures.
  */
-typedef void (*pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *end, const char *message, void *data);
+typedef void (*pm_regexp_name_callback_t)(pm_parser_t *parser, const pm_string_t *name, bool shared, pm_regexp_name_data_t *data);
 
 /**
- * Parse a regular expression.
+ * Parse a regular expression, validate its encoding, and optionally extract
+ * named capture groups. Returns the encoding flags to set on the node.
  *
  * @param parser The parser that is currently being used.
- * @param source The source code to parse.
- * @param size The size of the source code.
- * @param extended_mode Whether to parse the regular expression in extended mode.
+ * @param node The regular expression node to parse and validate.
  * @param name_callback The optional callback to call when a named capture group is found.
- * @param name_data The optional data to pass to the name callback.
- * @param error_callback The callback to call when a parse error is found.
- * @param error_data The data to pass to the error callback.
+ * @param name_data The optional accumulation state for named captures.
+ * @return The encoding flags to set on the node (e.g., FORCED_UTF8_ENCODING).
+ */
+PRISM_EXPORTED_FUNCTION pm_node_flags_t pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data);
+
+/**
+ * Parse an interpolated regular expression for named capture groups only.
+ * No encoding validation is performed.
+ *
+ * @param parser The parser that is currently being used.
+ * @param source The source content to parse.
+ * @param size The length of the source content.
+ * @param shared Whether the source points into the parser's source buffer.
+ * @param extended_mode Whether or not the regular expression is in extended mode.
+ * @param name_callback The callback to call when a named capture group is found.
+ * @param name_data The accumulation state for named captures.
  */
-PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data);
+void pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data);
 
 #endif
diff --git a/snapshots/seattlerb/regexp_escape_extended.txt b/snapshots/seattlerb/regexp_escape_extended.txt
index 732899a20d..a2e4d5eb96 100644
--- a/snapshots/seattlerb/regexp_escape_extended.txt
+++ b/snapshots/seattlerb/regexp_escape_extended.txt
@@ -6,7 +6,7 @@
     ├── flags: ∅
     └── body: (length: 1)
         └── @ RegularExpressionNode (location: (1,0)-(1,6))
-            ├── flags: newline, static_literal
+            ├── flags: newline, static_literal, forced_us_ascii_encoding
             ├── opening_loc: (1,0)-(1,1) = "/"
             ├── content_loc: (1,1)-(1,5) = "\\“"
             ├── closing_loc: (1,5)-(1,6) = "/"
diff --git a/src/prism.c b/src/prism.c
index fcc7be8a1a..d196c5d7c4 100644
--- a/src/prism.c
+++ b/src/prism.c
@@ -6423,129 +6423,6 @@ parse_symbol_encoding(pm_parser_t *parser, const pm_token_t *location, const pm_
     return 0;
 }
 
-static pm_node_flags_t
-parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) {
-    assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) ||
-            (modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) ||
-            (modifier == 'e' && modifier_encoding == PM_ENCODING_EUC_JP_ENTRY) ||
-            (modifier == 's' && modifier_encoding == PM_ENCODING_WINDOWS_31J_ENTRY));
-
-    // There's special validation logic used if a string does not contain any character escape sequences.
-    if (parser->explicit_encoding == NULL) {
-        // If an ASCII-only string without character escapes is used with an encoding modifier, then resulting Regexp
-        // has the modifier encoding, unless the ASCII-8BIT modifier is used, in which case the Regexp "downgrades" to
-        // the US-ASCII encoding.
-        if (ascii_only) {
-            return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
-        }
-
-        if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
-            if (!ascii_only) {
-                PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
-            }
-        } else if (parser->encoding != modifier_encoding) {
-            PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name);
-
-            if (modifier == 'n' && !ascii_only) {
-                PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) pm_string_length(source), (const char *) pm_string_source(source));
-            }
-        }
-
-        return flags;
-    }
-
-    // TODO (nirvdrum 21-Feb-2024): To validate regexp sources with character escape sequences we need to know whether hex or Unicode escape sequences were used and Prism doesn't currently provide that data. We handle a subset of unambiguous cases in the meanwhile.
-    bool mixed_encoding = false;
-
-    if (mixed_encoding) {
-        PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source));
-    } else if (modifier != 'n' && parser->explicit_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
-        // TODO (nirvdrum 21-Feb-2024): Validate the content is valid in the modifier encoding. Do this on-demand so we don't pay the cost of computation unnecessarily.
-        bool valid_string_in_modifier_encoding = true;
-
-        if (!valid_string_in_modifier_encoding) {
-            PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source));
-        }
-    } else if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
-        // TODO (nirvdrum 21-Feb-2024): There's currently no way to tell if the source used hex or Unicode character escapes from `explicit_encoding` alone. If the source encoding was already UTF-8, both character escape types would set `explicit_encoding` to UTF-8, but need to be processed differently. Skip for now.
-        if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
-            PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, (int) pm_string_length(source), (const char *) pm_string_source(source));
-        }
-    }
-
-    // We've determined the encoding would naturally be EUC-JP and there is no need to force the encoding to anything else.
-    return flags;
-}
-
-/**
- * Ruby "downgrades" the encoding of Regexps to US-ASCII if the associated encoding is ASCII-compatible and
- * the unescaped representation of a Regexp source consists only of US-ASCII code points. This is true even
- * when the Regexp is explicitly given an ASCII-8BIT encoding via the (/n) modifier. Otherwise, the encoding
- * may be explicitly set with an escape sequence.
- */
-static pm_node_flags_t
-parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags) {
-    // TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report.
-    bool valid_unicode_range = true;
-    if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) {
-        PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, (int) pm_string_length(source), (const char *) pm_string_source(source));
-        return flags;
-    }
-
-    // US-ASCII strings do not admit multi-byte character literals. However, character escape sequences corresponding
-    // to multi-byte characters are allowed.
-    if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) {
-        // CRuby will continue processing even though a SyntaxError has already been detected. It may result in the
-        // following error message appearing twice. We do the same for compatibility.
-        PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
-    }
-
-    /**
-     * Start checking modifier flags. We need to process these before considering any explicit encodings that may have
-     * been set by character literals. The order in which the encoding modifiers is checked does not matter. In the
-     * event that both an encoding modifier and an explicit encoding would result in the same encoding we do not set
-     * the corresponding "forced_<encoding>" flag. Instead, the caller should check the encoding modifier flag and
-     * determine the encoding that way.
-     */
-
-    if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
-        return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY);
-    }
-
-    if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
-        return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY);
-    }
-
-    if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
-        return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY);
-    }
-
-    if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
-        return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY);
-    }
-
-    // At this point no encoding modifiers will be present on the regular expression as they would have already
-    // been processed. Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all
-    // regular expressions without an encoding modifier appearing in source are eligible for "downgrading" to US-ASCII.
-    if (ascii_only) {
-        return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
-    }
-
-    // A Regexp may optionally have its encoding explicitly set via a character escape sequence in the source string
-    // or by specifying a modifier.
-    //
-    // NB: an explicitly set encoding is ignored by Ruby if the Regexp consists of only US ASCII code points.
-    if (parser->explicit_encoding != NULL) {
-        if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
-            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
-        } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
-            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING;
-        }
-    }
-
-    return 0;
-}
-
 /**
  * Allocate and initialize a new SymbolNode node with the given unescaped
  * string.
@@ -8589,7 +8466,7 @@ escape_hexadecimal_digit(const uint8_t value) {
  * validated.
  */
 static inline uint32_t
-escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length, const pm_location_t *error_location) {
+escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length, const pm_location_t *error_location, const uint8_t flags) {
     uint32_t value = 0;
     for (size_t index = 0; index < length; index++) {
         if (index != 0) value <<= 4;
@@ -8599,7 +8476,10 @@ escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length, const
     // Here we're going to verify that the value is actually a valid Unicode
     // codepoint and not a surrogate pair.
     if (value >= 0xD800 && value <= 0xDFFF) {
-        if (error_location != NULL) {
+        if (flags & PM_ESCAPE_FLAG_REGEXP) {
+            // In regexp context, defer the error to regexp encoding
+            // validation where we can produce a regexp-specific message.
+        } else if (error_location != NULL) {
             pm_parser_err(parser, error_location->start, error_location->length, PM_ERR_ESCAPE_INVALID_UNICODE);
         } else {
             pm_parser_err(parser, U32(string - parser->start), U32(length), PM_ERR_ESCAPE_INVALID_UNICODE);
@@ -8630,14 +8510,25 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t fla
     // literal.
     if (value >= 0x80 || flags & PM_ESCAPE_FLAG_SINGLE) {
         if (parser->explicit_encoding != NULL && parser->explicit_encoding != PM_ENCODING_UTF_8_ENTRY) {
-            PM_PARSER_ERR_FORMAT(parser, U32(start - parser->start), U32(end - start), PM_ERR_MIXED_ENCODING, parser->explicit_encoding->name);
+            if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                // In regexp context, suppress this error — the regexp encoding
+                // validation will produce a more specific error message.
+            } else {
+                PM_PARSER_ERR_FORMAT(parser, U32(start - parser->start), U32(end - start), PM_ERR_MIXED_ENCODING, parser->explicit_encoding->name);
+            }
         }
 
         parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
     }
 
     if (!pm_buffer_append_unicode_codepoint(buffer, value)) {
-        pm_parser_err(parser, U32(start - parser->start), U32(end - start), PM_ERR_ESCAPE_INVALID_UNICODE);
+        if (flags & PM_ESCAPE_FLAG_REGEXP) {
+            // In regexp context, defer the error to the regexp encoding
+            // validation which produces a regexp-specific message.
+        } else {
+            pm_parser_err(parser, U32(start - parser->start), U32(end - start), PM_ERR_ESCAPE_INVALID_UNICODE);
+        }
+
         pm_buffer_append_byte(buffer, 0xEF);
         pm_buffer_append_byte(buffer, 0xBF);
         pm_buffer_append_byte(buffer, 0xBD);
@@ -8649,10 +8540,15 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t fla
  * (i.e., the top bit is set) then it locks in the encoding.
  */
 static inline void
-escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte) {
+escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t flags, uint8_t byte) {
     if (byte >= 0x80) {
         if (parser->explicit_encoding != NULL && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
-            PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_MIXED_ENCODING, parser->encoding->name);
+            if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                // In regexp context, suppress this error — the regexp encoding
+                // validation will produce a more specific error message.
+            } else {
+                PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_MIXED_ENCODING, parser->encoding->name);
+            }
         }
 
         parser->explicit_encoding = parser->encoding;
@@ -8682,7 +8578,7 @@ escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular
         pm_buffer_append_format(regular_expression_buffer, "\\x%02X", byte);
     }
 
-    escape_write_byte_encoded(parser, buffer, byte);
+    escape_write_byte_encoded(parser, buffer, flags, byte);
 }
 
 /**
@@ -8838,7 +8734,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                     }
                 }
 
-                escape_write_byte_encoded(parser, buffer, value);
+                escape_write_byte_encoded(parser, buffer, flags, value);
             } else {
                 pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL);
             }
@@ -8887,7 +8783,8 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                         if (flags & PM_ESCAPE_FLAG_REGEXP) {
                             // If this is a regular expression, we are going to
                             // let the regular expression engine handle this
-                            // error instead of us.
+                            // error instead of us because we don't know at this
+                            // point if we're inside a comment in /x mode.
                             pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
                         } else {
                             pm_parser_err(parser, PM_TOKEN_END(parser, &parser->current), 0, PM_ERR_ESCAPE_INVALID_UNICODE);
@@ -8903,7 +8800,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                         extra_codepoints_start = unicode_start;
                     }
 
-                    uint32_t value = escape_unicode(parser, unicode_start, hexadecimal_length, NULL);
+                    uint32_t value = escape_unicode(parser, unicode_start, hexadecimal_length, NULL, flags);
                     escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value);
 
                     parser->current.end += pm_strspn_inline_whitespace(parser->current.end, parser->end - parser->current.end);
@@ -8923,7 +8820,8 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                     if (flags & PM_ESCAPE_FLAG_REGEXP) {
                         // If this is a regular expression, we are going to let
                         // the regular expression engine handle this error
-                        // instead of us.
+                        // instead of us because we don't know at this point if
+                        // we're inside a comment in /x mode.
                         pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
                     } else {
                         pm_parser_err(parser, U32(unicode_codepoints_start - parser->start), U32(parser->current.end - unicode_codepoints_start), PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
@@ -8944,7 +8842,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                         PM_PARSER_ERR_FORMAT(parser, U32(start - parser->start), U32(parser->current.end - start), PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
                     }
                 } else if (length == 4) {
-                    uint32_t value = escape_unicode(parser, parser->current.end, 4, NULL);
+                    uint32_t value = escape_unicode(parser, parser->current.end, 4, NULL, flags);
 
                     if (flags & PM_ESCAPE_FLAG_REGEXP) {
                         pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end + 4 - start));
@@ -9131,7 +9029,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
         case '\r': {
             if (peek_offset(parser, 1) == '\n') {
                 parser->current.end += 2;
-                escape_write_byte_encoded(parser, buffer, escape_byte('\n', flags));
+                escape_write_byte_encoded(parser, buffer, flags, escape_byte('\n', flags));
                 return;
             }
             PRISM_FALLTHROUGH
@@ -9516,20 +9414,12 @@ pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parse
 static void
 pm_regexp_token_buffer_push_escaped(pm_regexp_token_buffer_t *token_buffer, pm_parser_t *parser) {
     size_t width = parser_char_width(parser);
-    pm_buffer_append_bytes(&token_buffer->base.buffer, parser->current.end, width);
-    pm_buffer_append_bytes(&token_buffer->regexp_buffer, parser->current.end, width);
+    const uint8_t *start = parser->current.end;
+    pm_buffer_append_bytes(&token_buffer->base.buffer, start, width);
+    pm_buffer_append_bytes(&token_buffer->regexp_buffer, start, width);
     parser->current.end += width;
 }
 
-static bool
-pm_slice_ascii_only_p(const uint8_t *value, size_t length) {
-    for (size_t index = 0; index < length; index++) {
-        if (value[index] & 0x80) return false;
-    }
-
-    return true;
-}
-
 /**
  * When we're about to return from lexing the current token and we know for sure
  * that we have found an escape sequence, this function is called to copy the
@@ -9548,7 +9438,6 @@ pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
 static inline void
 pm_regexp_token_buffer_copy(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) {
     pm_token_buffer_copy(parser, &token_buffer->base);
-    parser->current_regular_expression_ascii_only = pm_slice_ascii_only_p((const uint8_t *) pm_buffer_value(&token_buffer->regexp_buffer), pm_buffer_length(&token_buffer->regexp_buffer));
     pm_buffer_free(&token_buffer->regexp_buffer);
 }
 
@@ -9575,10 +9464,11 @@ static void
 pm_regexp_token_buffer_flush(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) {
     if (token_buffer->base.cursor == NULL) {
         pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end);
-        parser->current_regular_expression_ascii_only = pm_slice_ascii_only_p(parser->current.start, (size_t) (parser->current.end - parser->current.start));
     } else {
-        pm_buffer_append_bytes(&token_buffer->base.buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor));
-        pm_buffer_append_bytes(&token_buffer->regexp_buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor));
+        const uint8_t *cursor = token_buffer->base.cursor;
+        size_t length = (size_t) (parser->current.end - cursor);
+        pm_buffer_append_bytes(&token_buffer->base.buffer, cursor, length);
+        pm_buffer_append_bytes(&token_buffer->regexp_buffer, cursor, length);
         pm_regexp_token_buffer_copy(parser, token_buffer);
     }
 }
@@ -17383,63 +17273,6 @@ parse_yield(pm_parser_t *parser, const pm_node_t *node) {
     }
 }
 
-/**
- * This struct is used to pass information between the regular expression parser
- * and the error callback.
- */
-typedef struct {
-    /** The parser that we are parsing the regular expression for. */
-    pm_parser_t *parser;
-
-    /** The start of the regular expression. */
-    const uint8_t *start;
-
-    /** The end of the regular expression. */
-    const uint8_t *end;
-
-    /**
-     * Whether or not the source of the regular expression is shared. This
-     * impacts the location of error messages, because if it is shared then we
-     * can use the location directly and if it is not, then we use the bounds of
-     * the regular expression itself.
-     */
-    bool shared;
-} parse_regular_expression_error_data_t;
-
-/**
- * This callback is called when the regular expression parser encounters a
- * syntax error.
- */
-static void
-parse_regular_expression_error(const uint8_t *start, const uint8_t *end, const char *message, void *data) {
-    parse_regular_expression_error_data_t *callback_data = (parse_regular_expression_error_data_t *) data;
-    pm_token_t location;
-
-    if (callback_data->shared) {
-        location = (pm_token_t) { .type = 0, .start = start, .end = end };
-    } else {
-        location = (pm_token_t) { .type = 0, .start = callback_data->start, .end = callback_data->end };
-    }
-
-    PM_PARSER_ERR_FORMAT(callback_data->parser, PM_TOKEN_START(callback_data->parser, &location), PM_TOKEN_LENGTH(&location), PM_ERR_REGEXP_PARSE_ERROR, message);
-}
-
-/**
- * Parse the errors for the regular expression and add them to the parser.
- */
-static void
-parse_regular_expression_errors(pm_parser_t *parser, pm_regular_expression_node_t *node) {
-    const pm_string_t *unescaped = &node->unescaped;
-    parse_regular_expression_error_data_t error_data = {
-        .parser = parser,
-        .start = parser->start + PM_NODE_START(node),
-        .end = parser->start + PM_NODE_END(node),
-        .shared = unescaped->type == PM_STRING_SHARED
-    };
-
-    pm_regexp_parse(parser, pm_string_source(unescaped), pm_string_length(unescaped), PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED), NULL, NULL, parse_regular_expression_error, &error_data);
-}
-
 /**
  * Determine if a given call node looks like a "command", which means it has
  * arguments but does not have parentheses.
@@ -19317,22 +19150,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u
                     }
                 }
             } else {
-                /* `not` in a single line method is allowed to be followed by
-                 * an expression without pattern matching, that optionally is
-                 * followed by a `rescue` modifier. */
-                if (flags & PM_PARSE_IN_ENDLESS_DEF) {
-                    receiver = parse_expression(parser, PM_BINDING_POWER_MATCH + 1, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1));
-
-                    if (accept1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) {
-                        context_push(parser, PM_CONTEXT_RESCUE_MODIFIER);
-                        pm_token_t rescue_keyword = parser->previous;
-                        pm_node_t *value = parse_expression(parser, PM_BINDING_POWER_MATCH + 1, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
-                        context_pop(parser);
-                        receiver = UP(pm_rescue_modifier_node_create(parser, receiver, &rescue_keyword, value));
-                    }
-                } else {
-                    receiver = parse_expression(parser, PM_BINDING_POWER_NOT, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1));
-                }
+                receiver = parse_expression(parser, PM_BINDING_POWER_NOT, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1));
             }
 
             return UP(pm_call_node_not_create(parser, receiver, &message, &arguments));
@@ -19913,10 +19731,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u
 
                 parser_lex(parser);
 
-                pm_node_t *node = UP(pm_regular_expression_node_create(parser, &opening, &content, &parser->previous));
-                pm_node_flag_set(node, PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING);
-
-                return node;
+                pm_regular_expression_node_t *node = pm_regular_expression_node_create(parser, &opening, &content, &parser->previous);
+                pm_node_flag_set(UP(node), pm_regexp_parse(parser, node, NULL, NULL));
+                return UP(node);
             }
 
             pm_interpolated_regular_expression_node_t *interpolated;
@@ -19928,7 +19745,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u
                 // regular expression) or if it's not then it has interpolation.
                 pm_string_t unescaped = parser->current_string;
                 pm_token_t content = parser->current;
-                bool ascii_only = parser->current_regular_expression_ascii_only;
                 parser_lex(parser);
 
                 // If we hit an end, then we can create a regular expression
@@ -19937,15 +19753,14 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, u
                 if (accept1(parser, PM_TOKEN_REGEXP_END)) {
                     pm_regular_expression_node_t *node = (pm_regular_expression_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
 
-                    // If we're not immediately followed by a =~, then we want
-                    // to parse all of the errors at this point. If it is
-                    // followed by a =~, then it will get parsed higher up while
-                    // parsing the named captures as well.
+                    // If we're not immediately followed by a =~, then we
+                    // parse and validate now. If it is followed by a =~,
+                    // then it will get parsed in the =~ handler where
+                    // named captures can also be extracted.
                     if (!match1(parser, PM_TOKEN_EQUAL_TILDE)) {
-                        parse_regular_expression_errors(parser, node);
+                        pm_node_flag_set(UP(node), pm_regexp_parse(parser, node, NULL, NULL));
                     }
 
-                    pm_node_flag_set(UP(node), parse_and_validate_regular_expression_encoding(parser, &unescaped, ascii_only, FL(node)));
                     return UP(node);
                 }
 
@@ -20318,7 +20133,7 @@ parse_assignment_value(pm_parser_t *parser, pm_binding_power_t previous_binding_
         pm_token_t rescue = parser->current;
         parser_lex(parser);
 
-        pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_MATCH + 1, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
+        pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
         context_pop(parser);
 
         return UP(pm_rescue_modifier_node_create(parser, value, &rescue, right));
@@ -20435,7 +20250,7 @@ parse_assignment_values(pm_parser_t *parser, pm_binding_power_t previous_binding
             }
         }
 
-        pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_MATCH + 1, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | (accepts_command_call_inner ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0), PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
+        pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, (flags & PM_PARSE_ACCEPTS_DO_BLOCK) | (accepts_command_call_inner ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0), PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
         context_pop(parser);
 
         return UP(pm_rescue_modifier_node_create(parser, value, &rescue, right));
@@ -20466,31 +20281,6 @@ parse_call_operator_write(pm_parser_t *parser, pm_call_node_t *call_node, const
     }
 }
 
-/**
- * This struct is used to pass information between the regular expression parser
- * and the named capture callback.
- */
-typedef struct {
-    /** The parser that is parsing the regular expression. */
-    pm_parser_t *parser;
-
-    /** The call node wrapping the regular expression node. */
-    pm_call_node_t *call;
-
-    /** The match write node that is being created. */
-    pm_match_write_node_t *match;
-
-    /** The list of names that have been parsed. */
-    pm_constant_id_list_t names;
-
-    /**
-     * Whether the content of the regular expression is shared. This impacts
-     * whether or not we used owned constants or shared constants in the
-     * constant pool for the names of the captures.
-     */
-    bool shared;
-} parse_regular_expression_named_capture_data_t;
-
 static inline const uint8_t *
 pm_named_capture_escape_hex(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) {
     cursor++;
@@ -20543,7 +20333,7 @@ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, con
 
     if (*cursor != '{') {
         size_t length = pm_strspn_hexadecimal_digit(cursor, MIN(end - cursor, 4));
-        uint32_t value = escape_unicode(parser, cursor, length, error_location);
+        uint32_t value = escape_unicode(parser, cursor, length, error_location, 0);
 
         if (!pm_buffer_append_unicode_codepoint(unescaped, value)) {
             pm_buffer_append_string(unescaped, (const char *) start, (size_t) ((cursor + length) - start));
@@ -20566,7 +20356,7 @@ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, con
         if (length == 0) {
             break;
         }
-        uint32_t value = escape_unicode(parser, cursor, length, error_location);
+        uint32_t value = escape_unicode(parser, cursor, length, error_location, 0);
 
         (void) pm_buffer_append_unicode_codepoint(unescaped, value);
         cursor += length;
@@ -20616,10 +20406,7 @@ pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8
  * capture group.
  */
 static void
-parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
-    parse_regular_expression_named_capture_data_t *callback_data = (parse_regular_expression_named_capture_data_t *) data;
-
-    pm_parser_t *parser = callback_data->parser;
+parse_regular_expression_named_capture(pm_parser_t *parser, const pm_string_t *capture, bool shared, pm_regexp_name_data_t *callback_data) {
     pm_call_node_t *call = callback_data->call;
     pm_constant_id_list_t *names = &callback_data->names;
 
@@ -20637,7 +20424,7 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
     // unescaped, which is what we need.
     const uint8_t *cursor = pm_memchr(source, '\\', length, parser->encoding_changed, parser->encoding);
     if (PRISM_UNLIKELY(cursor != NULL)) {
-        pm_named_capture_escape(parser, &unescaped, source, length, cursor, callback_data->shared ? NULL : &call->receiver->location);
+        pm_named_capture_escape(parser, &unescaped, source, length, cursor, shared ? NULL : &call->receiver->location);
         source = (const uint8_t *) pm_buffer_value(&unescaped);
         length = pm_buffer_length(&unescaped);
     }
@@ -20653,7 +20440,7 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
         return;
     }
 
-    if (callback_data->shared) {
+    if (shared) {
         // If the unescaped string is a slice of the source, then we can
         // copy the names directly. The pointers will line up.
         start = source;
@@ -20707,26 +20494,19 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
 }
 
 /**
- * Potentially change a =~ with a regular expression with named captures into a
- * match write node.
+ * Potentially change a =~ with an interpolated regular expression with named
+ * captures into a match write node. This is for the interpolated case where
+ * we have concatenated content rather than a regular expression node.
  */
 static pm_node_t *
-parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call, bool extended_mode) {
-    parse_regular_expression_named_capture_data_t callback_data = {
-        .parser = parser,
+parse_interpolated_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call, bool extended_mode) {
+    pm_regexp_name_data_t callback_data = {
         .call = call,
+        .match = NULL,
         .names = { 0 },
-        .shared = content->type == PM_STRING_SHARED
-    };
-
-    parse_regular_expression_error_data_t error_data = {
-        .parser = parser,
-        .start = parser->start + PM_NODE_START(call->receiver),
-        .end = parser->start + PM_NODE_END(call->receiver),
-        .shared = content->type == PM_STRING_SHARED
     };
 
-    pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), extended_mode, parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data);
+    pm_regexp_parse_named_captures(parser, pm_string_source(content), pm_string_length(content), false, extended_mode, parse_regular_expression_named_capture, &callback_data);
 
     if (callback_data.match != NULL) {
         return UP(callback_data.match);
@@ -21248,14 +21028,25 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
                     pm_string_t owned;
                     pm_string_owned_init(&owned, (uint8_t *) memory, total_length);
 
-                    result = parse_regular_expression_named_captures(parser, &owned, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED));
+                    result = parse_interpolated_regular_expression_named_captures(parser, &owned, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED));
                     pm_string_free(&owned);
                 }
             } else if (PM_NODE_TYPE_P(node, PM_REGULAR_EXPRESSION_NODE)) {
-                // If we have a regular expression node, then we can just parse
-                // the named captures directly off the unescaped string.
-                const pm_string_t *content = &((pm_regular_expression_node_t *) node)->unescaped;
-                result = parse_regular_expression_named_captures(parser, content, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED));
+                // If we have a regular expression node, then a single call
+                // both validates the encoding and extracts named captures.
+                pm_regular_expression_node_t *regexp = (pm_regular_expression_node_t *) node;
+
+                pm_regexp_name_data_t name_data = {
+                    .call = call,
+                    .match = NULL,
+                    .names = { 0 },
+                };
+
+                pm_node_flag_set(UP(regexp), pm_regexp_parse(parser, regexp, parse_regular_expression_named_capture, &name_data));
+
+                if (name_data.match != NULL) {
+                    result = UP(name_data.match);
+                }
             }
 
             return result;
@@ -22137,7 +21928,6 @@ pm_parser_init(pm_arena_t *arena, pm_parser_t *parser, const uint8_t *source, si
         .current_block_exits = NULL,
         .semantic_token_seen = false,
         .frozen_string_literal = PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET,
-        .current_regular_expression_ascii_only = false,
         .warn_mismatched_indentation = true
     };
 
diff --git a/src/regexp.c b/src/regexp.c
index dcc7476244..23495a35e7 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -1,5 +1,9 @@
 #include "prism/regexp.h"
+#include "prism/diagnostic.h"
+#include "prism/util/pm_buffer.h"
+#include "prism/util/pm_strncasecmp.h"
 
+/** The maximum depth of nested groups allowed in a regular expression. */
 #define PM_REGEXP_PARSE_DEPTH_MAX 4096
 
 /**
@@ -18,6 +22,54 @@ typedef struct {
     /** A pointer to the end of the source that we are parsing. */
     const uint8_t *end;
 
+    /** The encoding of the source. */
+    const pm_encoding_t *encoding;
+
+    /** The callback to call when a named capture group is found. */
+    pm_regexp_name_callback_t name_callback;
+
+    /** The data to pass to the name callback. */
+    pm_regexp_name_data_t *name_data;
+
+    /** The start of the regexp node (for error locations). */
+    const uint8_t *node_start;
+
+    /** The end of the regexp node (for error locations). */
+    const uint8_t *node_end;
+
+    /**
+     * The explicit encoding determined by escape sequences. NULL if no
+     * encoding-setting escape has been seen, UTF-8 for `\u` escapes, or the
+     * source encoding for `\x` escapes.
+     */
+    const pm_encoding_t *explicit_encoding;
+
+    /**
+     * Pointer to the first non-POSIX property name (for /n error messages).
+     * POSIX properties (Alnum, Alpha, etc.) work in all encodings.
+     * Script properties (Hiragana, Katakana, etc.) work in /e, /s, /u.
+     * Unicode-only properties (L, Ll, etc.) work only in /u.
+     */
+    const uint8_t *property_name;
+
+    /** Length of the first non-POSIX property name found. */
+    size_t property_name_length;
+
+    /**
+     * Pointer to the first Unicode-only property name (for /e, /s error
+     * messages). NULL if only POSIX or script properties have been seen.
+     */
+    const uint8_t *unicode_property_name;
+
+    /** Length of the first Unicode-only property name found. */
+    size_t unicode_property_name_length;
+
+    /** Buffer of hex escape byte values >= 0x80, separated by 0x00 sentinels. */
+    pm_buffer_t hex_escape_buffer;
+
+    /** Count of non-ASCII literal bytes (not from escapes). */
+    uint32_t non_ascii_literal_count;
+
     /**
      * Whether or not the regular expression currently being parsed is in
      * extended mode, wherein whitespace is ignored and comments are allowed.
@@ -27,30 +79,76 @@ typedef struct {
     /** Whether the encoding has changed from the default. */
     bool encoding_changed;
 
-    /** The encoding of the source. */
-    const pm_encoding_t *encoding;
+    /** Whether the source content is shared (for named capture callback). */
+    bool shared;
 
-    /** The callback to call when a named capture group is found. */
-    pm_regexp_name_callback_t name_callback;
+    /** Whether a `\u` escape (`\uNNNN` or `\u{...}`) with value >= 0x80 was seen. */
+    bool has_unicode_escape;
 
-    /** The data to pass to the name callback. */
-    void *name_data;
+    /** Whether a `\xNN` escape (or `\M-x`, etc.) with value >= 0x80 was seen. */
+    bool has_hex_escape;
+
+    /**
+     * Tracks whether the last encoding-setting escape was `\u` (true) or `\x`
+     * (false). This matters for error messages when both types are mixed.
+     */
+    bool last_escape_was_unicode;
+
+    /** Whether any `\p{...}` or `\P{...}` property escape was found. */
+    bool has_property_escape;
 
-    /** The callback to call when a parse error is found. */
-    pm_regexp_error_callback_t error_callback;
+    /** Whether a Unicode-only property escape was found (not POSIX or script). */
+    bool has_unicode_property_escape;
 
-    /** The data to pass to the error callback. */
-    void *error_data;
+    /** Whether a `\u` escape with invalid range (surrogate or > 0x10FFFF) was seen. */
+    bool invalid_unicode_range;
+
+    /** Whether we are accumulating consecutive hex escape bytes. */
+    bool hex_group_active;
+
+    /** Whether an invalid multibyte character was found during parsing. */
+    bool has_invalid_multibyte;
 } pm_regexp_parser_t;
 
 /**
- * Append an error to the parser.
+ * Append a syntax error to the parser's error list. If the source is shared
+ * (points into the original source), we can point to the exact error location.
+ * Otherwise, we point to the whole regexp node.
  */
 static inline void
 pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
-    parser->error_callback(start, end, message, parser->error_data);
+    pm_parser_t *pm = parser->parser;
+    uint32_t loc_start, loc_length;
+
+    if (parser->shared) {
+        loc_start = (uint32_t) (start - pm->start);
+        loc_length = (uint32_t) (end - start);
+    } else {
+        loc_start = (uint32_t) (parser->node_start - pm->start);
+        loc_length = (uint32_t) (parser->node_end - parser->node_start);
+    }
+
+    pm_diagnostic_list_append_format(&pm->error_list, loc_start, loc_length, PM_ERR_REGEXP_PARSE_ERROR, message);
 }
 
+/**
+ * Like pm_regexp_parse_error, but for any diagnostic ID and its format args.
+ * This is a macro because we need variadic args for the format string.
+ */
+#define pm_regexp_parse_error_format(parser_, err_start_, err_end_, diag_id, ...) \
+    do { \
+        pm_parser_t *pm__ = (parser_)->parser; \
+        uint32_t loc_start__, loc_length__; \
+        if ((parser_)->shared) { \
+            loc_start__ = (uint32_t) ((err_start_) - pm__->start); \
+            loc_length__ = (uint32_t) ((err_end_) - (err_start_)); \
+        } else { \
+            loc_start__ = (uint32_t) ((parser_)->node_start - pm__->start); \
+            loc_length__ = (uint32_t) ((parser_)->node_end - (parser_)->node_start); \
+        } \
+        pm_diagnostic_list_append_format(&pm__->error_list, loc_start__, loc_length__, diag_id, __VA_ARGS__); \
+    } while (0)
+
 /**
  * This appends a new string to the list of named captures. This function
  * assumes the caller has already checked the validity of the name callback.
@@ -59,7 +157,7 @@ static void
 pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
     pm_string_t string;
     pm_string_shared_init(&string, start, end);
-    parser->name_callback(&string, parser->name_data);
+    parser->name_callback(parser->parser, &string, parser->shared, parser->name_data);
     pm_string_free(&string);
 }
 
@@ -113,6 +211,47 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
     return true;
 }
 
+/**
+ * Close the current run of consecutive hex escape bytes, if one is open. A
+ * 0x00 sentinel is appended to the hex escape buffer so that runs stay
+ * separated for later multibyte validation. Does nothing if no run is open.
+ */
+static inline void
+pm_regexp_hex_group_boundary(pm_regexp_parser_t *parser) {
+    if (parser->hex_group_active) {
+        pm_buffer_append_byte(&parser->hex_escape_buffer, 0x00);
+        parser->hex_group_active = false;
+    }
+}
+
+/**
+ * Track a hex escape byte: >= 0x80 is buffered, anything else ends the run.
+ */
+static inline void
+pm_regexp_track_hex_escape(pm_regexp_parser_t *parser, uint8_t byte) {
+    if (byte >= 0x80) {
+        pm_buffer_append_byte(&parser->hex_escape_buffer, byte);
+        parser->hex_group_active = true;
+        parser->has_hex_escape = true;
+
+        parser->explicit_encoding = parser->encoding; // a high \x byte pins the source encoding
+        parser->last_escape_was_unicode = false;
+    } else {
+        pm_regexp_hex_group_boundary(parser);
+    }
+}
+
+/**
+ * Return the value (0-15) of an ASCII hex digit of either case, or -1 if not.
+ */
+static inline int
+pm_regexp_hex_digit_value(uint8_t byte) {
+    if (byte >= '0' && byte <= '9') return byte - '0';
+    if (byte >= 'a' && byte <= 'f') return byte - 'a' + 10;
+    if (byte >= 'A' && byte <= 'F') return byte - 'A' + 10;
+    return -1;
+}
+
 /**
  * Range quantifiers are a special class of quantifiers that look like
  *
@@ -121,13 +260,12 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
  * * {digit,digit}
  * * {,digit}
  *
- * Unfortunately, if there are any spaces in between, then this just becomes a
- * regular character match expression and we have to backtrack. So when this
- * function first starts running, we'll create a "save" point and then attempt
- * to parse the quantifier. If it fails, we'll restore the save point and
- * return.
+ * If there are any spaces in between, then this just becomes a regular
+ * character match expression and we have to backtrack. So when this function
+ * first starts running, we'll create a "save" point and then attempt to parse
+ * the quantifier. If it fails, we'll restore the save point and return.
  *
- * The properly track everything, we're going to build a little state machine.
+ * To properly track everything, we're going to build a little state machine.
  * It looks something like the following:
  *
  *                  +-------+                 +---------+ ------------+
@@ -275,10 +413,368 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
     );
 }
 
+/**
+ * Property escape classification, ordered from least to most restrictive (the
+ * order matters: callers compare tiers with >=). Onigmo supports three tiers:
+ *
+ * - POSIX properties (Alnum, Alpha, ASCII, Blank, Cntrl, Digit, Graph, Lower,
+ *   Print, Punct, Space, Upper, XDigit, Word): valid in all encodings.
+ * - Script properties (Hiragana, Katakana, Han, Latin, Greek, Cyrillic): valid
+ *   in EUC-JP (/e), Windows-31J (/s), and UTF-8 (/u), but not ASCII-8BIT (/n).
+ * - Unicode-only properties (general categories like L, Ll, Lu, etc., plus
+ *   Any, Assigned): valid only in UTF-8 (/u).
+ */
+typedef enum {
+    PM_REGEXP_PROPERTY_POSIX,
+    PM_REGEXP_PROPERTY_SCRIPT,
+    PM_REGEXP_PROPERTY_UNICODE
+} pm_regexp_property_type_t;
+
+/** Check whether a property name case-insensitively equals a NUL-terminated
+ *  target. The target is measured first so we never index past its NUL. */
+static inline bool
+pm_regexp_property_name_matches(const uint8_t *name, size_t length, const char *target) {
+    size_t target_length = 0;
+    while (target_length <= length && target[target_length] != '\0') target_length++;
+    return target_length == length && pm_strncasecmp(name, (const uint8_t *) target, length) == 0;
+}
+
+/**
+ * Classify a property name into its tier. A leading '^' (negation) is skipped
+ * before matching; names in neither table are treated as Unicode-only.
+ */
+static pm_regexp_property_type_t
+pm_regexp_classify_property(const uint8_t *name, size_t length) {
+    // Skip leading '^' for negated properties like \p{^Hiragana}.
+    if (length > 0 && name[0] == '^') {
+        name++;
+        length--;
+    }
+
+    // POSIX properties — valid in all encodings.
+    static const char *const posix_properties[] = {
+        "Alnum", "Alpha", "ASCII", "Blank", "Cntrl", "Digit", "Graph",
+        "Lower", "Print", "Punct", "Space", "Upper", "XDigit", "Word",
+        NULL
+    };
+
+    for (const char *const *property = posix_properties; *property != NULL; property++) {
+        if (pm_regexp_property_name_matches(name, length, *property)) {
+            return PM_REGEXP_PROPERTY_POSIX;
+        }
+    }
+
+    // Script properties — valid in /e, /s, /u but not /n.
+    static const char *const script_properties[] = {
+        "Hiragana", "Katakana", "Han", "Latin", "Greek", "Cyrillic",
+        NULL
+    };
+
+    for (const char *const *property = script_properties; *property != NULL; property++) {
+        if (pm_regexp_property_name_matches(name, length, *property)) {
+            return PM_REGEXP_PROPERTY_SCRIPT;
+        }
+    }
+
+    // Everything else is Unicode-only (general categories, other scripts, etc.).
+    return PM_REGEXP_PROPERTY_UNICODE;
+}
+
+/**
+ * Check for and skip a `\p{...}` or `\P{...}` Unicode property escape. The
+ * cursor should be pointing at 'p' or 'P' when this is called. Returns true
+ * and advances past the closing '}' if a braced property escape is found;
+ * otherwise advances past the 'p'/'P' only and returns false.
+ *
+ * Each property found is classified (POSIX, script, Unicode-only), and the
+ * first non-POSIX and first Unicode-only names are kept for error reporting.
+ */
+static bool
+pm_regexp_parse_property_escape(pm_regexp_parser_t *parser) {
+    assert(*parser->cursor == 'p' || *parser->cursor == 'P');
+
+    if (parser->cursor + 1 < parser->end && parser->cursor[1] == '{') {
+        const uint8_t *name_start = parser->cursor + 2;
+        const uint8_t *search = name_start;
+
+        while (search < parser->end && *search != '}') search++;
+
+        if (search < parser->end) {
+            size_t name_length = (size_t) (search - name_start);
+            parser->has_property_escape = true;
+
+            pm_regexp_property_type_t type = pm_regexp_classify_property(name_start, name_length);
+
+            // Track the first non-POSIX property name (for /n error messages).
+            if (type >= PM_REGEXP_PROPERTY_SCRIPT && parser->property_name == NULL) {
+                parser->property_name = name_start;
+                parser->property_name_length = name_length;
+            }
+
+            // Track the first Unicode-only property name (for /e, /s error messages).
+            if (type == PM_REGEXP_PROPERTY_UNICODE) {
+                parser->has_unicode_property_escape = true;
+                if (parser->unicode_property_name == NULL) {
+                    parser->unicode_property_name = name_start;
+                    parser->unicode_property_name_length = name_length;
+                }
+            }
+
+            parser->cursor = search + 1; // skip past '}'
+            return true;
+        }
+    }
+
+    // Not a property escape, just skip the single character after '\'.
+    parser->cursor++;
+    return false;
+}
+
+/**
+ * Validate and skip a \u escape sequence in a regular expression. The cursor
+ * should be pointing at the character after 'u' when this is called. Handles
+ * both the \u{NNNN MMMM} and \uNNNN forms, reports malformed escapes, and
+ * marks UTF-8 as the explicit encoding for any codepoint >= 0x80.
+ */
+static void
+pm_regexp_parse_unicode_escape(pm_regexp_parser_t *parser) {
+    const uint8_t *escape_start = parser->cursor - 2; // points to '\'
+
+    if (pm_regexp_char_is_eof(parser)) {
+        pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape");
+        return;
+    }
+
+    if (*parser->cursor == '{') {
+        parser->cursor++; // skip '{'
+
+        // Skip leading whitespace.
+        while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) {
+            parser->cursor++;
+        }
+
+        bool has_codepoint = false;
+
+        while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') {
+            // Parse the hex digits to compute the codepoint value.
+            uint32_t value = 0;
+            size_t hex_count = 0;
+
+            int digit;
+            while (!pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) {
+                value = (value << 4) | (uint32_t) digit;
+                hex_count++;
+                parser->cursor++;
+            }
+
+            if (hex_count == 0) {
+                // Skip to '}' or end of regexp to find the full extent.
+                while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') {
+                    parser->cursor++;
+                }
+
+                const uint8_t *escape_end = parser->cursor;
+                if (!pm_regexp_char_is_eof(parser)) {
+                    escape_end++;
+                    parser->cursor++; // skip '}'
+                }
+
+                pm_regexp_parse_error_format(parser, escape_start, escape_end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (escape_end - escape_start), (const char *) escape_start);
+                return;
+            }
+
+            if (hex_count > 6) {
+                pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode range");
+            }
+
+            // Track encoding state for this codepoint.
+            if (value >= 0x80) {
+                parser->has_unicode_escape = true;
+                parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
+                parser->last_escape_was_unicode = true;
+                pm_regexp_hex_group_boundary(parser);
+            }
+
+            // Check for invalid Unicode range (surrogates or > 0x10FFFF).
+            if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) {
+                parser->invalid_unicode_range = true;
+            }
+
+            has_codepoint = true;
+
+            // Skip whitespace between codepoints.
+            while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) {
+                parser->cursor++;
+            }
+        }
+
+        if (pm_regexp_char_is_eof(parser)) {
+            pm_regexp_parse_error(parser, escape_start, parser->cursor, "unterminated Unicode escape");
+        } else {
+            if (!has_codepoint) {
+                pm_regexp_parse_error_format(parser, escape_start, parser->cursor + 1, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->cursor + 1 - escape_start), (const char *) escape_start);
+            }
+            parser->cursor++; // skip '}'
+        }
+    } else {
+        // \uNNNN form — need exactly 4 hex digits.
+        uint32_t value = 0;
+        size_t hex_count = 0;
+
+        int digit;
+        while (hex_count < 4 && !pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) {
+            value = (value << 4) | (uint32_t) digit;
+            hex_count++;
+            parser->cursor++;
+        }
+
+        if (hex_count < 4) {
+            pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape");
+        } else if (value >= 0x80) {
+            parser->has_unicode_escape = true;
+            parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
+            parser->last_escape_was_unicode = true;
+            pm_regexp_hex_group_boundary(parser);
+        }
+
+        // Check for invalid Unicode range (four digits can only reach the surrogate check).
+        if (hex_count == 4 && (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))) {
+            parser->invalid_unicode_range = true;
+        }
+    }
+}
+
 // Forward declaration because character sets can be nested.
 static bool
 pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
 
+/**
+ * Parse the one or two hex digits of a \x escape (cursor is just after 'x'),
+ * track the byte for encoding validation, and return it (-1 if no digits).
+ */
+static int
+pm_regexp_parse_hex_escape(pm_regexp_parser_t *parser) {
+    int value = -1;
+
+    if (!pm_regexp_char_is_eof(parser)) {
+        int digit = pm_regexp_hex_digit_value(*parser->cursor);
+        if (digit >= 0) {
+            value = digit;
+            parser->cursor++;
+
+            if (!pm_regexp_char_is_eof(parser)) {
+                digit = pm_regexp_hex_digit_value(*parser->cursor);
+                if (digit >= 0) {
+                    value = (value << 4) | digit;
+                    parser->cursor++;
+                }
+            }
+        }
+    }
+
+    if (value >= 0) {
+        pm_regexp_track_hex_escape(parser, (uint8_t) value);
+    }
+
+    return value;
+}
+
+/**
+ * Parse a backslash escape sequence in a regexp. The cursor should be pointing
+ * at the character after '\'. Dispatches \u, \p/\P, and \x to their helpers,
+ * recurses for escapes nested in \M-, \C-, and \c, and tracks encoding state.
+ */
+static void
+pm_regexp_parse_backslash_escape(pm_regexp_parser_t *parser) {
+    if (pm_regexp_char_is_eof(parser)) return;
+
+    switch (*parser->cursor) {
+        case 'u':
+            parser->cursor++; // skip 'u'
+            pm_regexp_parse_unicode_escape(parser);
+            break;
+        case 'p':
+        case 'P':
+            pm_regexp_parse_property_escape(parser);
+            break;
+        case 'x':
+            parser->cursor++; // skip 'x'
+            pm_regexp_parse_hex_escape(parser);
+            break;
+        case 'M':
+            // \M-x produces (x | 0x80), always >= 0x80
+            if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') {
+                parser->cursor += 2; // skip 'M-'
+                if (!pm_regexp_char_is_eof(parser)) {
+                    if (*parser->cursor == '\\') {
+                        parser->cursor++;
+                        // \M-\C-x or \M-\cx — the resulting byte is always >= 0x80
+                        // We just need to track it as a hex escape >= 0x80.
+                        pm_regexp_parse_backslash_escape(parser);
+                    } else {
+                        parser->cursor++;
+                    }
+                    // \M-x always produces a byte >= 0x80
+                    pm_regexp_track_hex_escape(parser, 0x80);
+                }
+            } else {
+                parser->cursor++;
+            }
+            break;
+        case 'C':
+            // \C-x produces (x & 0x1F). NOTE(review): this branch never ends a hex escape run — confirm intended.
+            if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') {
+                parser->cursor += 2; // skip 'C-'
+                if (!pm_regexp_char_is_eof(parser)) {
+                    if (*parser->cursor == '\\') {
+                        parser->cursor++;
+                        pm_regexp_parse_backslash_escape(parser);
+                    } else {
+                        parser->cursor++;
+                    }
+                }
+            } else {
+                parser->cursor++;
+            }
+            break;
+        case 'c':
+            // \cx produces (x & 0x1F)
+            parser->cursor++; // skip 'c'
+            if (!pm_regexp_char_is_eof(parser)) {
+                if (*parser->cursor == '\\') {
+                    parser->cursor++;
+                    pm_regexp_parse_backslash_escape(parser);
+                } else {
+                    parser->cursor++;
+                }
+            }
+            break;
+        default: // any other escape ends the current run of hex escape bytes
+            pm_regexp_hex_group_boundary(parser);
+            parser->cursor++;
+            break;
+    }
+}
+
+/**
+ * Handle a byte at `cursor`, assumed to be the one just consumed (that is,
+ * parser->cursor - 1). If it is non-ASCII and the encoding has changed to a
+ * multibyte one, skip the rest of a valid character or report an invalid one.
+ */
+static void
+pm_regexp_parse_invalid_multibyte(pm_regexp_parser_t *parser, const uint8_t *cursor) {
+    uint8_t byte = *cursor;
+    if (byte >= 0x80 && parser->encoding_changed && parser->encoding->multibyte) {
+        size_t width = parser->encoding->char_width(cursor, (ptrdiff_t) (parser->end - cursor));
+        if (width > 1) {
+            parser->cursor += width - 1; // the first byte was already consumed
+        } else if (width == 0) {
+            parser->has_invalid_multibyte = true;
+            pm_regexp_parse_error_format(parser, cursor, cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+        }
+    }
+}
+
 /**
  * match-char-set : '[' '^'? (match-range | match-char)* ']'
  *                ;
@@ -293,12 +789,16 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
                 pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
                 break;
             case '\\':
-                if (!pm_regexp_char_is_eof(parser)) {
-                    parser->cursor++;
-                }
+                pm_regexp_parse_backslash_escape(parser);
                 break;
             default:
-                // do nothing, we've already advanced the cursor
+                // We've already advanced the cursor by one byte. If the byte
+                // was >= 0x80 in a multibyte encoding, we may need to consume
+                // additional continuation bytes and validate the character.
+                if (*(parser->cursor - 1) >= 0x80) {
+                    parser->non_ascii_literal_count++;
+                }
+                pm_regexp_parse_invalid_multibyte(parser, parser->cursor - 1);
                 break;
         }
     }
@@ -354,8 +854,13 @@ typedef enum {
 // These are the options that are configurable on the regular expression (or
 // from within a group).
 
+/** The minimum character value for a regexp option slot. */
 #define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
+
+/** The maximum character value for a regexp option slot. */
 #define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
+
+/** The number of regexp option slots. */
 #define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
 
 /**
@@ -498,7 +1003,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
                         }
 
                         size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
-                        if (width == 0) return false;
+                        if (width == 0) {
+                            if (*parser->cursor >= 0x80) {
+                                parser->has_invalid_multibyte = true;
+                                pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+                                parser->cursor++;
+                                continue;
+                            }
+                            return false;
+                        }
 
                         escaped = (width == 1) && (*parser->cursor == '\\');
                         parser->cursor += width;
@@ -686,9 +1199,7 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
             return pm_regexp_parse_quantifier(parser);
         case '\\':
             parser->cursor++;
-            if (!pm_regexp_char_is_eof(parser)) {
-                parser->cursor++;
-            }
+            pm_regexp_parse_backslash_escape(parser);
             return pm_regexp_parse_quantifier(parser);
         case '(':
             parser->cursor++;
@@ -720,9 +1231,30 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
                 width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
             }
 
-            if (width == 0) return false; // TODO: add appropriate error
-            parser->cursor += width;
+            if (width == 0) {
+                if (*parser->cursor >= 0x80 && parser->encoding_changed) {
+                    if (parser->encoding->multibyte) {
+                        // Invalid multibyte character in a multibyte encoding.
+                        // Emit the error at the byte location immediately.
+                        parser->has_invalid_multibyte = true;
+                        pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+                    } else {
+                        // Non-ASCII byte in a single-byte encoding (e.g.,
+                        // US-ASCII). Count it for later error reporting.
+                        parser->non_ascii_literal_count++;
+                    }
+                    parser->cursor++;
+                    return pm_regexp_parse_quantifier(parser);
+                }
+                return false;
+            }
 
+            // Count non-ASCII literal bytes.
+            for (size_t i = 0; i < width; i++) {
+                if (parser->cursor[i] >= 0x80) parser->non_ascii_literal_count++;
+            }
+
+            parser->cursor += width;
             return pm_regexp_parse_quantifier(parser);
         }
     }
@@ -768,13 +1300,353 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
     return pm_regexp_char_is_eof(parser);
 }
 
+// ---------------------------------------------------------------------------
+// Encoding validation
+// ---------------------------------------------------------------------------
+
 /**
- * Parse a regular expression and extract the names of all of the named capture
- * groups.
+ * Check that each group of hex escape bytes in the buffer decodes as a run of
+ * valid characters in the given encoding. A 0x00 sentinel ends each group.
+ */
+static bool
+pm_regexp_validate_hex_escapes(const pm_encoding_t *encoding, const pm_buffer_t *buffer) {
+    const uint8_t *bytes = (const uint8_t *) pm_buffer_value(buffer);
+    const size_t length = pm_buffer_length(buffer);
+    size_t offset = 0;
+
+    while (offset < length) {
+        size_t group_end = offset;
+        while (group_end < length && bytes[group_end] != 0x00) group_end++;
+
+        while (offset < group_end) {
+            size_t width = encoding->char_width(bytes + offset, (ptrdiff_t) (group_end - offset));
+            if (width == 0) return false;
+            offset += width;
+        }
+
+        offset = group_end + 1; // resume just past the sentinel (or past the end)
+    }
+
+    return true;
+}
+
+/**
+ * Append regexp source content to the buffer for use in an error message.
+ * UTF-8 content is copied as-is; otherwise non-ASCII bytes are hex-escaped.
+ */
+static void
+pm_regexp_format_for_error(pm_buffer_t *buffer, const pm_encoding_t *encoding, const uint8_t *source, size_t length) {
+    if (encoding == PM_ENCODING_UTF_8_ENTRY) {
+        pm_buffer_append_string(buffer, (const char *) source, length);
+        return;
+    }
+
+    for (size_t offset = 0; offset < length; ) {
+        const uint8_t byte = source[offset];
+        if (byte < 0x80) {
+            pm_buffer_append_byte(buffer, byte);
+            offset++;
+            continue;
+        }
+
+        // Only a multibyte encoding can fold several bytes into one \x{...}.
+        size_t width = 1;
+        if (encoding->multibyte) {
+            width = encoding->char_width(source + offset, (ptrdiff_t) (length - offset));
+        }
+        if (width > 1) {
+            pm_buffer_append_string(buffer, "\\x{", 3);
+            for (size_t i = 0; i < width; i++) {
+                pm_buffer_append_format(buffer, "%02X", source[offset + i]);
+            }
+            pm_buffer_append_byte(buffer, '}');
+            offset += width;
+        } else {
+            pm_buffer_append_format(buffer, "\\x%02X", byte);
+            offset++;
+        }
+    }
+}
+
+/**
+ * Append a formatted encoding validation error that spans node_start..node_end.
+ */
+#define PM_REGEXP_ENCODING_ERROR(parser, diag_id, ...) \
+    pm_diagnostic_list_append_format( \
+        &(parser)->parser->error_list, \
+        (uint32_t) ((parser)->node_start - (parser)->parser->start), \
+        (uint32_t) ((parser)->node_end - (parser)->node_start), \
+        diag_id, __VA_ARGS__)
+
+/**
+ * Validate encoding for a regexp with an encoding modifier (/e, /s, /u, /n).
+ *
+ * The decision tree is:
+ *
+ * 1. No escape-set encoding (explicit_encoding == NULL):
+ *    a. ASCII-only content: validate property escapes, return forced US-ASCII
+ *       for /n or the modifier flags for others.
+ *    b. US-ASCII source with non-ASCII literals: emit per-byte errors.
+ *    c. Source encoding differs from modifier encoding: emit mismatch error
+ *       (for /n, also a non-escaped MBC error with the hex-formatted source).
+ * 2. Mixed \u and \x escapes: emit the appropriate conflict error depending
+ *    on the modifier and which escape type was last.
+ *
+ * 3. \u escape with non-/u modifier: incompatible encoding error.
+ *
+ * 4. Validate that hex escape byte sequences form valid multibyte characters
+ *    in the modifier's encoding.
+ */
+static pm_node_flags_t
+pm_regexp_validate_encoding_modifier(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding, const char *source_start, int source_length) {
+
+    if (parser->explicit_encoding == NULL) {
+        if (ascii_only) {
+            // Check property escapes against the modifier's encoding tier.
+            // /n (ASCII-8BIT): only POSIX properties are valid.
+            // /e, /s: POSIX and script properties are valid.
+            // /u: all properties are valid.
+            if (modifier == 'n' && parser->property_name != NULL) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
+                    (int) parser->property_name_length, (const char *) parser->property_name,
+                    source_length, source_start);
+            } else if (modifier != 'u' && parser->has_unicode_property_escape) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
+                    (int) parser->unicode_property_name_length, (const char *) parser->unicode_property_name,
+                    source_length, source_start);
+            }
+            return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
+        }
+
+        if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
+            for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+            }
+        } else if (parser->encoding != modifier_encoding) {
+            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name);
+
+            if (modifier == 'n' && !ascii_only) { // ascii_only is always false here: the ASCII-only case returned above
+                pm_buffer_t formatted = { 0 };
+                pm_regexp_format_for_error(&formatted, parser->encoding, (const uint8_t *) source_start, (size_t) source_length);
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) formatted.length, (const char *) formatted.value);
+                pm_buffer_free(&formatted);
+            }
+        }
+
+        return flags;
+    }
+
+    // Mixed unicode + hex escapes.
+    if (parser->has_unicode_escape && parser->has_hex_escape) {
+        if (modifier == 'n') {
+            if (parser->last_escape_was_unicode) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
+            } else {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
+            }
+        } else {
+            if (!pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+            }
+        }
+
+        return flags;
+    }
+
+    if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+        if (parser->last_escape_was_unicode) {
+            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
+        } else if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
+            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
+        }
+    }
+
+    if (modifier != 'n' && !pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) {
+        PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+    }
+
+    return flags;
+}
+
+/**
+ * Validate the encoding of a regexp and compute the encoding flags to set on
+ * the node.
+ *
+ * The decision tree is:
+ *
+ * 1. Invalid multibyte chars: return the flags unchanged (errors were already
+ *    emitted during parsing). Invalid unicode range: emit the error and return.
+ * 2. If a modifier (/n, /u, /e, /s) is present, delegate to
+ *    pm_regexp_validate_encoding_modifier.
+ * 3. US-ASCII source with non-ASCII literals: emit per-byte errors.
+ * 4. ASCII-only content: return forced US-ASCII (or forced UTF-8 if \p{...}).
+ * 5. Escape-set encoding present: validate hex escapes against the target
+ *    encoding, handle mixed \u + \x conflicts, and return the appropriate
+ *    forced encoding flag.
+ */
+static pm_node_flags_t
+pm_regexp_validate_encoding(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, const char *source_start, int source_length) {
+
+    // Invalid multibyte characters suppress further validation.
+    // Errors were already emitted at the byte locations during parsing.
+    if (parser->has_invalid_multibyte) {
+        return flags;
+    }
+
+    if (parser->invalid_unicode_range) {
+        PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, source_length, source_start);
+        return flags;
+    }
+
+    // Check modifier flags first.
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
+        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY, source_start, source_length);
+    }
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
+        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY, source_start, source_length);
+    }
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
+        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY, source_start, source_length);
+    }
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
+        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY, source_start, source_length);
+    }
+
+    // No modifier — check for non-ASCII literals in US-ASCII encoding.
+    if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) {
+        for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) {
+            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+        }
+    }
+
+    // ASCII-only regexps get downgraded to US-ASCII, unless property escapes
+    // force UTF-8.
+    if (ascii_only) {
+        if (parser->has_property_escape) {
+            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
+        }
+        return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
+    }
+
+    // Check explicit encoding from escape sequences.
+    if (parser->explicit_encoding != NULL) {
+        // Mixed unicode + hex escapes without modifier.
+        if (parser->has_unicode_escape && parser->has_hex_escape && parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
+            if (parser->encoding != PM_ENCODING_US_ASCII_ENTRY &&
+                parser->encoding != PM_ENCODING_ASCII_8BIT_ENTRY &&
+                !pm_regexp_validate_hex_escapes(parser->encoding, &parser->hex_escape_buffer)) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+            } else if (parser->last_escape_was_unicode) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
+            } else {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
+            }
+
+            return 0;
+        }
+
+        if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+            if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+            }
+
+            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
+        } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
+            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING;
+        } else {
+            if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+            }
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Parse a regular expression, validate its encoding, and (when name_callback
+ * is non-NULL) extract named capture groups. Encoding validation walks the raw
+ * source (content_loc) to tell escape-produced bytes from literal bytes; name
+ * extraction walks the unescaped content, where the lexer has already handled
+ * escapes such as line continuations. Returns the computed encoding flags.
+ */
+PRISM_EXPORTED_FUNCTION pm_node_flags_t
+pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) {
+    const uint8_t *source = parser->start + node->content_loc.start;
+    size_t size = node->content_loc.length;
+    bool extended_mode = PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED);
+    pm_node_flags_t flags = PM_NODE_FLAGS(node);
+
+    const uint8_t *node_start = parser->start + node->base.location.start;
+    const uint8_t *node_end = parser->start + node->base.location.start + node->base.location.length;
+
+    // First pass: walk raw source for encoding validation (no name extraction).
+    pm_regexp_parser_t regexp_parser = {
+        .parser = parser,
+        .start = source,
+        .cursor = source,
+        .end = source + size,
+        .extended_mode = extended_mode,
+        .encoding_changed = parser->encoding_changed,
+        .encoding = parser->encoding,
+        .name_callback = NULL,
+        .name_data = NULL,
+        .shared = true,
+        .node_start = node_start,
+        .node_end = node_end,
+        .has_unicode_escape = false,
+        .has_hex_escape = false,
+        .last_escape_was_unicode = false,
+        .explicit_encoding = NULL,
+        .has_property_escape = false,
+        .has_unicode_property_escape = false,
+        .property_name = NULL,
+        .property_name_length = 0,
+        .unicode_property_name = NULL,
+        .unicode_property_name_length = 0,
+        .non_ascii_literal_count = 0,
+        .invalid_unicode_range = false,
+        .hex_escape_buffer = { 0 },
+        .hex_group_active = false,
+        .has_invalid_multibyte = false,
+    };
+
+    pm_regexp_parse_pattern(&regexp_parser);
+
+    // Compute ascii_only from the regexp parser's tracked state. We cannot
+    // use node->unescaped for this because regexp unescaped content preserves
+    // escape text (e.g., \x80 is 4 ASCII chars), not the binary values.
+    bool ascii_only = !regexp_parser.has_hex_escape && !regexp_parser.has_unicode_escape && regexp_parser.non_ascii_literal_count == 0;
+    // Use the unescaped content for error messages to match CRuby's format,
+    // where Ruby escapes like \M-\C-? are resolved to bytes but regexp escapes
+    // like \u{80} are preserved as text.
+    const char *error_source = (const char *) pm_string_source(&node->unescaped);
+    int error_source_length = (int) pm_string_length(&node->unescaped);
+    pm_node_flags_t encoding_flags = pm_regexp_validate_encoding(&regexp_parser, ascii_only, flags, error_source, error_source_length);
+    pm_buffer_free(&regexp_parser.hex_escape_buffer);
+
+    // Second pass: walk unescaped content for named capture extraction.
+    if (name_callback != NULL) {
+        bool shared = node->unescaped.type == PM_STRING_SHARED;
+        pm_regexp_parse_named_captures(parser, pm_string_source(&node->unescaped), pm_string_length(&node->unescaped), shared, extended_mode, name_callback, name_data);
+    }
+
+    return encoding_flags;
+}
+
+/**
+ * Parse regexp content for named capture groups only, with no encoding
+ * validation. Used as the second pass of pm_regexp_parse and for the =~
+ * operator with interpolated regexps (no pm_regular_expression_node_t).
+ *
+ * Note: The encoding-tracking fields (has_unicode_escape, has_hex_escape, etc.)
+ * are initialized but not used for the result. They exist because the parsing
+ * functions (pm_regexp_parse_backslash_escape, etc.) unconditionally update
+ * them as they walk through the content.
  */
-PRISM_EXPORTED_FUNCTION void
-pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
-    pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
+void
+pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) {
+    pm_regexp_parser_t regexp_parser = {
         .parser = parser,
         .start = source,
         .cursor = source,
@@ -784,7 +1656,26 @@ pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool ex
         .encoding = parser->encoding,
         .name_callback = name_callback,
         .name_data = name_data,
-        .error_callback = error_callback,
-        .error_data = error_data
-    });
+        .shared = shared,
+        .node_start = source,
+        .node_end = source + size,
+        .has_unicode_escape = false,
+        .has_hex_escape = false,
+        .last_escape_was_unicode = false,
+        .explicit_encoding = NULL,
+        .has_property_escape = false,
+        .has_unicode_property_escape = false,
+        .property_name = NULL,
+        .property_name_length = 0,
+        .unicode_property_name = NULL,
+        .unicode_property_name_length = 0,
+        .non_ascii_literal_count = 0,
+        .invalid_unicode_range = false,
+        .hex_escape_buffer = { 0 },
+        .hex_group_active = false,
+        .has_invalid_multibyte = false,
+    };
+
+    pm_regexp_parse_pattern(&regexp_parser);
+    pm_buffer_free(&regexp_parser.hex_escape_buffer);
 }
diff --git a/templates/src/diagnostic.c.erb b/templates/src/diagnostic.c.erb
index d717dc1e16..8fa47590c0 100644
--- a/templates/src/diagnostic.c.erb
+++ b/templates/src/diagnostic.c.erb
@@ -330,13 +330,15 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
     [PM_ERR_PATTERN_TERM_PAREN]                 = { "expected a `)` to close the pattern expression", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN]            = { "unexpected `||=` in a multiple assignment", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH]    = { "regexp encoding option '%c' differs from source encoding '%s'", PM_ERROR_LEVEL_SYNTAX },
+    [PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8]   = { "escaped non ASCII character in UTF-8 regexp: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING]      = { "incompatible character encoding: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
-    [PM_ERR_REGEXP_NON_ESCAPED_MBC]             = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
+    [PM_ERR_REGEXP_INVALID_CHAR_PROPERTY]       = { "invalid character property name {%.*s}: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_REGEXP_INVALID_UNICODE_RANGE]       = { "invalid Unicode range: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
+    [PM_ERR_REGEXP_NON_ESCAPED_MBC]             = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_REGEXP_PARSE_ERROR]                 = { "%s", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_REGEXP_UNKNOWN_OPTIONS]             = { "unknown regexp %s - %.*s", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_REGEXP_TERM]                        = { "unterminated regexp meets end of file; expected a closing delimiter", PM_ERROR_LEVEL_SYNTAX },
-    [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP]   = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_SYNTAX },
+    [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP]   = { "UTF-8 character in non UTF-8 regexp: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_RESCUE_EXPRESSION]                  = { "expected a rescued expression", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_RESCUE_MODIFIER_VALUE]              = { "expected a value after the `rescue` modifier", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_RESCUE_TERM]                        = { "expected a closing delimiter for the `rescue` clause", PM_ERROR_LEVEL_SYNTAX },
diff --git a/test/prism/encoding/regular_expression_encoding_test.rb b/test/prism/encoding/regular_expression_encoding_test.rb
index e2daae1d7f..fdff1e3281 100644
--- a/test/prism/encoding/regular_expression_encoding_test.rb
+++ b/test/prism/encoding/regular_expression_encoding_test.rb
@@ -2,6 +2,7 @@
 
 return unless defined?(RubyVM::InstructionSequence)
 return if RubyVM::InstructionSequence.compile("").to_a[4][:parser] == :prism
+return if RUBY_VERSION < "3.2"
 
 require_relative "../test_helper"
 
@@ -21,7 +22,7 @@ class RegularExpressionEncodingTest < TestCase
 
       ["n", "u", "e", "s"].each do |modifier|
         define_method(:"test_regular_expression_encoding_modifiers_/#{modifier}_#{encoding.name}") do
-          regexp_sources = ["abc", "garçon", "\\x80", "gar\\xC3\\xA7on", "gar\\u{E7}on", "abc\\u{FFFFFF}", "\\x80\\u{80}" ]
+          regexp_sources = ["abc", "garçon", "\\x80", "gar\\xC3\\xA7on", "gar\\u{E7}on", "abc\\u{FFFFFF}", "\\x80\\u{80}", "\\p{L}" ]
 
           assert_regular_expression_encoding_flags(
             encoding,
@@ -35,17 +36,15 @@ class RegularExpressionEncodingTest < TestCase
 
     def assert_regular_expression_encoding_flags(encoding, regexps)
       regexps.each do |regexp|
-        regexp_modifier_used = regexp.end_with?("/u") || regexp.end_with?("/e") || regexp.end_with?("/s") || regexp.end_with?("/n")
         source = "# encoding: #{encoding.name}\n#{regexp}"
 
-        encoding_errors = ["invalid multibyte char", "escaped non ASCII character in UTF-8 regexp", "differs from source encoding"]
-        skipped_errors = ["invalid multibyte escape", "incompatible character encoding", "UTF-8 character in non UTF-8 regexp", "invalid Unicode range", "invalid Unicode list"]
-
-        # TODO (nirvdrum 21-Feb-2024): Prism currently does not handle Regexp validation unless modifiers are used. So, skip processing those errors for now: https://github.com/ruby/prism/issues/2104
-        unless regexp_modifier_used
-          skipped_errors += encoding_errors
-          encoding_errors.clear
-        end
+        encoding_errors = [
+          "invalid multibyte char", "escaped non ASCII character in UTF-8 regexp",
+          "differs from source encoding", "incompatible character encoding",
+          "invalid multibyte escape", "UTF-8 character in non UTF-8 regexp",
+          "invalid Unicode range", "non escaped non ASCII character",
+          "invalid character property name", "invalid Unicode list",
+        ]
 
         expected =
           begin
@@ -53,8 +52,6 @@ def assert_regular_expression_encoding_flags(encoding, regexps)
           rescue SyntaxError => error
             if encoding_errors.find { |e| error.message.include?(e) }
               error.message.split("\n").map { |m| m[/: (.+?)$/, 1] }
-            elsif skipped_errors.find { |e| error.message.include?(e) }
-              next
             else
               raise
             end
@@ -111,19 +108,6 @@ def assert_regular_expression_encoding_flags(encoding, regexps)
             end
           end
 
-        # TODO (nirvdrum 22-Feb-2024): Remove this workaround once Prism better maps CRuby's error messages.
-        # This class of error message is tricky. The part not being compared is a representation of the regexp.
-        # Depending on the source encoding and any encoding modifiers being used, CRuby alters how the regexp is represented.
-        # Sometimes it's an MBC string. Other times it uses hexadecimal character escapes. And in other cases it uses
-        # the long-form Unicode escape sequences. This short-circuit checks that the error message is mostly correct.
-        if expected.is_a?(Array) && actual.is_a?(Array)
-          if expected.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:") &&
-              actual.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:")
-            expected.pop
-            actual.pop
-          end
-        end
-
         assert_equal expected, actual
       end
     end