@@ -146,6 +146,68 @@ public function test_invalid_utf8_with_two_single_byte_invalid_sequences(): void
146146 $ this ->assertSame ( $ expected , $ actual_tokens );
147147 }
148148
149+ /**
150+ * Regression test for the slow path of decode_string_or_url() (triggered by a backslash
151+ * escape): normal text segments must still have invalid UTF-8 bytes replaced with U+FFFD,
152+ * just as the fast path does via wp_scrub_utf8(). NOTE(review): the fixture below is an ident token, not a string or URL; confirm ident decoding goes through this same slow path.
153+ */
154+ public function test_invalid_utf8_in_normal_segment_combined_with_escape (): void {
155+ // The ident token contains an invalid UTF-8 byte (0xF1) in the "normal"
156+ // segment before a CSS hex escape (\41 = U+0041 = 'A'); the escape ends at "41"
157+ // because the following 'n' is not a hex digit. The backslash triggers the slow
158+ // path, which previously skipped wp_scrub_utf8() on the normal segment.
159+ $ css = ".test \xF1\\41name " ;
160+
161+ $ expected = array (
162+ array (
163+ 'type ' => CSSProcessor::TOKEN_DELIM ,
164+ 'raw ' => '. ' ,
165+ 'value ' => '. ' ,
166+ ),
167+ array (
168+ 'type ' => CSSProcessor::TOKEN_IDENT ,
169+ // raw keeps the source bytes as written: the invalid byte and the escape stay intact.
170+ 'raw ' => "test \xF1\\41name " ,
171+ // value must have 0xF1 replaced with U+FFFD and \41 decoded to 'A'.
172+ 'value ' => "test \u{FFFD}Aname " ,
173+ ),
174+ );
175+
176+ $ processor = CSSProcessor::create ( $ css );
177+ $ actual_tokens = $ this ->collect_tokens ( $ processor , array ( 'type ' , 'raw ' , 'value ' ) );
178+ $ this ->assertSame ( $ expected , $ actual_tokens );
179+ }
180+
181+ /**
182+ * When an invalid UTF-8 byte is the character directly after a backslash
183+ * (i.e. it is the escaped character itself), decode_escape_at() returns the
184+ * raw byte. The caller must scrub it to U+FFFD.
185+ */
186+ public function test_invalid_utf8_as_escaped_character (): void {
187+ // $css is a '.' delim followed by an ident in which the letters 'a' and 'b'
188+ // surround a backslash placed directly before the invalid byte 0xF1. The backslash
189+ // makes 0xF1 the escaped character itself rather than part of a normal segment.
190+ $ css = ".a \\\xF1b " ;
191+
192+ $ expected = array (
193+ array (
194+ 'type ' => CSSProcessor::TOKEN_DELIM ,
195+ 'raw ' => '. ' ,
196+ 'value ' => '. ' ,
197+ ),
198+ array (
199+ 'type ' => CSSProcessor::TOKEN_IDENT ,
200+ 'raw ' => "a \\\xF1b " ,
201+ // The backslash is consumed and the escaped 0xF1 must be replaced with U+FFFD.
202+ 'value ' => "a \u{FFFD}b " ,
203+ ),
204+ );
205+
206+ $ processor = CSSProcessor::create ( $ css );
207+ $ actual_tokens = $ this ->collect_tokens ( $ processor , array ( 'type ' , 'raw ' , 'value ' ) );
208+ $ this ->assertSame ( $ expected , $ actual_tokens );
209+ }
210+
149211 /**
150212 * Legacy test to ensure basic tokenization still works.
151213 */
@@ -1541,64 +1603,4 @@ public function test_ident_start_codepoint_bounds_check(): void {
15411603 );
15421604 $ this ->assertSame ( $ expected_tokens , $ actual_tokens );
15431605 }
1544-
1545- /**
1546- * Tests that invalid UTF-8 bytes in the slow-path normal segments of a URL token
1547- * are replaced with U+FFFD, consistent with the fast path.
1548- *
1549- * The slow path is triggered by a backslash escape in the URL content. Invalid bytes
1550- * that appear in the non-escaped portions must still be scrubbed.
1551- *
1552- * @see https://github.com/WordPress/php-toolkit/issues/229
1553- */
1554- public function test_invalid_utf8_in_url_slow_path_normal_segment (): void {
1555- // CSS bytes: u r l ( A \ 4 1 0xFF B )
1556- // The \41 hex escape triggers the slow path.
1557- // 0xFF is a raw invalid UTF-8 byte in a normal (non-escaped) segment; only the first token is inspected.
1558- $ css = "url(A \\41 \xFFB) " ;
1559-
1560- $ processor = CSSProcessor::create ( $ css );
1561- $ this ->assertTrue ( $ processor ->next_token () );
1562- $ this ->assertSame ( CSSProcessor::TOKEN_URL , $ processor ->get_token_type () );
1563- // \41 decodes to 'A'; the raw 0xFF byte must be replaced with U+FFFD.
1564- $ this ->assertSame ( "AA \u{FFFD}B " , $ processor ->get_token_value () );
1565- }
1566-
1567- /**
1568- * Tests that a backslash-escaped invalid UTF-8 byte in a URL token
1569- * is replaced with U+FFFD, consistent with the fast path.
1570- *
1571- * In the slow path, decode_escape_at() returns the raw invalid byte for
1572- * the "anything else" escape case and the caller must scrub it.
1573- *
1574- * @see https://github.com/WordPress/php-toolkit/issues/229
1575- */
1576- public function test_invalid_utf8_in_url_slow_path_escaped_byte (): void {
1577- // CSS bytes: u r l ( A \ 4 1 \ 0xFF )
1578- // \41 is a hex escape for 'A'; \<0xFF> is the "anything else" escape applied to the invalid 0xFF byte.
1579- $ css = "url(A \\41 \\\xFF) " ;
1580-
1581- $ processor = CSSProcessor::create ( $ css );
1582- $ this ->assertTrue ( $ processor ->next_token () );
1583- $ this ->assertSame ( CSSProcessor::TOKEN_URL , $ processor ->get_token_type () );
1584- // Expected value: the literal 'A', then 'A' decoded from \41, then U+FFFD for the escaped 0xFF byte.
1585- $ this ->assertSame ( "AA \u{FFFD}" , $ processor ->get_token_value () );
1586- }
1587-
1588- /**
1589- * Tests that invalid UTF-8 bytes in the slow-path normal segments of a string token
1590- * are replaced with U+FFFD, consistent with the fast path.
1591- *
1592- * @see https://github.com/WordPress/php-toolkit/issues/229
1593- */
1594- public function test_invalid_utf8_in_string_slow_path_normal_segment (): void {
1595- // Single-quoted string token 'A\41<0xFF>B': the \41 escape triggers the slow path.
1596- $ css = "'A \\41 \xFFB' " ;
1597-
1598- $ processor = CSSProcessor::create ( $ css );
1599- $ this ->assertTrue ( $ processor ->next_token () );
1600- $ this ->assertSame ( CSSProcessor::TOKEN_STRING , $ processor ->get_token_type () );
1601- // \41 decodes to 'A'; the raw 0xFF byte must be replaced with U+FFFD. The asserted value excludes the quotes.
1602- $ this ->assertSame ( "AA \u{FFFD}B " , $ processor ->get_token_value () );
1603- }
16041606}
0 commit comments