From f12717fb2c68728b48b40e5bfa9ac675d9f586cd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 8 Apr 2026 15:27:15 +0000 Subject: [PATCH 1/3] Initial plan From 3bf270c14acbdb145166e4903592fff846a927c1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:13:42 +0000 Subject: [PATCH 2/3] CSS: Fix inconsistent UTF-8 scrubbing in decode_string_or_url slow path Agent-Logs-Url: https://github.com/WordPress/php-toolkit/sessions/c82f4270-56cb-4261-bee7-92e7971aba49 Co-authored-by: sirreal <841763+sirreal@users.noreply.github.com> --- .../DataLiberation/CSS/class-cssprocessor.php | 4 +- .../DataLiberation/Tests/CSSProcessorTest.php | 60 +++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php index cacb48b4..912a7e78 100644 --- a/components/DataLiberation/CSS/class-cssprocessor.php +++ b/components/DataLiberation/CSS/class-cssprocessor.php @@ -1571,7 +1571,7 @@ private function decode_string_or_url( int $start, int $length ): string { if ( $normal_len > 0 ) { // Clamp to not exceed the end boundary. 
$normal_len = min( $normal_len, $end - $at ); - $decoded .= substr( $this->css, $at, $normal_len ); + $decoded .= wp_scrub_utf8( substr( $this->css, $at, $normal_len ) ); $at += $normal_len; } @@ -1585,7 +1585,7 @@ private function decode_string_or_url( int $start, int $length ): string { if ( '\\' === $char ) { if ( $this->is_valid_escape( $at ) ) { ++$at; - $decoded .= $this->decode_escape_at( $at, $bytes_consumed ); + $decoded .= wp_scrub_utf8( $this->decode_escape_at( $at, $bytes_consumed ) ); $at += $bytes_consumed; continue; } diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php index ed164ac2..a08a7873 100644 --- a/components/DataLiberation/Tests/CSSProcessorTest.php +++ b/components/DataLiberation/Tests/CSSProcessorTest.php @@ -1541,4 +1541,64 @@ public function test_ident_start_codepoint_bounds_check(): void { ); $this->assertSame( $expected_tokens, $actual_tokens ); } + + /** + * Tests that invalid UTF-8 bytes in the slow-path normal segments of a URL token + * are replaced with U+FFFD, consistent with the fast path. + * + * The slow path is triggered by a backslash escape in the URL content. Invalid bytes + * that appear in the non-escaped portions must still be scrubbed. + * + * @see https://github.com/WordPress/php-toolkit/issues/229 + */ + public function test_invalid_utf8_in_url_slow_path_normal_segment(): void { + // CSS bytes: u r l ( A \ 4 1 0xFF B ) + // The \41 hex escape triggers the slow path. + // 0xFF is an invalid UTF-8 byte in a normal (non-escaped) segment. + $css = "url(A\\41\xFFB)"; + + $processor = CSSProcessor::create( $css ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() ); + // \41 decodes to 'A'; \xFF must be replaced with U+FFFD. 
+ $this->assertSame( "AA\u{FFFD}B", $processor->get_token_value() ); + } + + /** + * Tests that a backslash-escaped invalid UTF-8 byte in a URL token + * is replaced with U+FFFD, consistent with the fast path. + * + * In the slow path, decode_escape_at() returns the raw invalid byte for + * the "anything else" escape case and the caller must scrub it. + * + * @see https://github.com/WordPress/php-toolkit/issues/229 + */ + public function test_invalid_utf8_in_url_slow_path_escaped_byte(): void { + // CSS bytes: u r l ( A \ 4 1 \ 0xFF ) + // \41 is a hex escape for 'A'; \<0xFF> is "anything else" escape for the 0xFF byte. + $css = "url(A\\41\\\xFF)"; + + $processor = CSSProcessor::create( $css ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() ); + // \41 decodes to 'A'; \<0xFF> must produce U+FFFD. + $this->assertSame( "AA\u{FFFD}", $processor->get_token_value() ); + } + + /** + * Tests that invalid UTF-8 bytes in the slow-path normal segments of a string token + * are replaced with U+FFFD, consistent with the fast path. + * + * @see https://github.com/WordPress/php-toolkit/issues/229 + */ + public function test_invalid_utf8_in_string_slow_path_normal_segment(): void { + // String token 'A\41<0xFF>B' – the \41 escape triggers the slow path. + $css = "'A\\41\xFFB'"; + + $processor = CSSProcessor::create( $css ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() ); + // \41 decodes to 'A'; \xFF must be replaced with U+FFFD. 
+ $this->assertSame( "AA\u{FFFD}B", $processor->get_token_value() ); + } } From 70fb72423a6358d36d2dce165c2e7f4054f327a8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 8 Apr 2026 19:22:32 +0000 Subject: [PATCH 3/3] CSS: Replace URL/string tests with cleaner ident-based tests from #231 Agent-Logs-Url: https://github.com/WordPress/php-toolkit/sessions/892f6cca-d936-43dc-9f63-27739478ddbd Co-authored-by: sirreal <841763+sirreal@users.noreply.github.com> --- .../DataLiberation/Tests/CSSProcessorTest.php | 122 +++++++++--------- 1 file changed, 62 insertions(+), 60 deletions(-) diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php index a08a7873..a7e124c3 100644 --- a/components/DataLiberation/Tests/CSSProcessorTest.php +++ b/components/DataLiberation/Tests/CSSProcessorTest.php @@ -146,6 +146,68 @@ public function test_invalid_utf8_with_two_single_byte_invalid_sequences(): void $this->assertSame( $expected, $actual_tokens ); } + /** + * In the slow path of decode_string_or_url() (triggered by a backslash escape), normal + * text segments must still have invalid UTF-8 bytes replaced with U+FFFD, just + * as the fast path does via wp_scrub_utf8(). + */ + public function test_invalid_utf8_in_normal_segment_combined_with_escape(): void { + // The ident token contains an invalid UTF-8 byte (0xF1) in the "normal" + // segment before a CSS hex escape (\41 = U+0041 = 'A'). The backslash + // triggers the slow path, which previously skipped wp_scrub_utf8() on the + // normal segment. + $css = ".test\xF1\\41name"; + + $expected = array( + array( + 'type' => CSSProcessor::TOKEN_DELIM, + 'raw' => '.', + 'value' => '.', + ), + array( + 'type' => CSSProcessor::TOKEN_IDENT, + // raw contains the original bytes. + 'raw' => "test\xF1\\41name", + // value must have 0xF1 replaced with U+FFFD and \41 decoded to 'A'. 
+ 'value' => "test\u{FFFD}Aname", + ), + ); + + $processor = CSSProcessor::create( $css ); + $actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) ); + $this->assertSame( $expected, $actual_tokens ); + } + + /** + * When an invalid UTF-8 byte is the character directly after a backslash + * (i.e. it is the escaped character itself), decode_escape_at() returns the + * raw byte. The caller must scrub it to U+FFFD. + */ + public function test_invalid_utf8_as_escaped_character(): void { + // Without the backslash, `.a\xF1b` is a delim + ident with a lone invalid byte in a normal segment. + // Adding a backslash before the invalid byte makes that byte the escaped character: + // `.a\\\xF1b` => delim + ident whose value contains the escaped 0xF1 byte between 'a' and 'b'. + $css = ".a\\\xF1b"; + + $expected = array( + array( + 'type' => CSSProcessor::TOKEN_DELIM, + 'raw' => '.', + 'value' => '.', + ), + array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => "a\\\xF1b", + // The escaped 0xF1 must be replaced with U+FFFD. + 'value' => "a\u{FFFD}b", + ), + ); + + $processor = CSSProcessor::create( $css ); + $actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) ); + $this->assertSame( $expected, $actual_tokens ); + } + /** * Legacy test to ensure basic tokenization still works. */ @@ -1541,64 +1603,4 @@ public function test_ident_start_codepoint_bounds_check(): void { ); $this->assertSame( $expected_tokens, $actual_tokens ); } - - /** - * Tests that invalid UTF-8 bytes in the slow-path normal segments of a URL token - * are replaced with U+FFFD, consistent with the fast path. - * - * The slow path is triggered by a backslash escape in the URL content. Invalid bytes - * that appear in the non-escaped portions must still be scrubbed. - * - * @see https://github.com/WordPress/php-toolkit/issues/229 - */ - public function test_invalid_utf8_in_url_slow_path_normal_segment(): void { - // CSS bytes: u r l ( A \ 4 1 0xFF B ) - // The \41 hex escape triggers the slow path. 
- // 0xFF is an invalid UTF-8 byte in a normal (non-escaped) segment. - $css = "url(A\\41\xFFB)"; - - $processor = CSSProcessor::create( $css ); - $this->assertTrue( $processor->next_token() ); - $this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() ); - // \41 decodes to 'A'; \xFF must be replaced with U+FFFD. - $this->assertSame( "AA\u{FFFD}B", $processor->get_token_value() ); - } - - /** - * Tests that a backslash-escaped invalid UTF-8 byte in a URL token - * is replaced with U+FFFD, consistent with the fast path. - * - * In the slow path, decode_escape_at() returns the raw invalid byte for - * the "anything else" escape case and the caller must scrub it. - * - * @see https://github.com/WordPress/php-toolkit/issues/229 - */ - public function test_invalid_utf8_in_url_slow_path_escaped_byte(): void { - // CSS bytes: u r l ( A \ 4 1 \ 0xFF ) - // \41 is a hex escape for 'A'; \<0xFF> is "anything else" escape for the 0xFF byte. - $css = "url(A\\41\\\xFF)"; - - $processor = CSSProcessor::create( $css ); - $this->assertTrue( $processor->next_token() ); - $this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() ); - // \41 decodes to 'A'; \<0xFF> must produce U+FFFD. - $this->assertSame( "AA\u{FFFD}", $processor->get_token_value() ); - } - - /** - * Tests that invalid UTF-8 bytes in the slow-path normal segments of a string token - * are replaced with U+FFFD, consistent with the fast path. - * - * @see https://github.com/WordPress/php-toolkit/issues/229 - */ - public function test_invalid_utf8_in_string_slow_path_normal_segment(): void { - // String token 'A\41<0xFF>B' – the \41 escape triggers the slow path. - $css = "'A\\41\xFFB'"; - - $processor = CSSProcessor::create( $css ); - $this->assertTrue( $processor->next_token() ); - $this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() ); - // \41 decodes to 'A'; \xFF must be replaced with U+FFFD. 
- $this->assertSame( "AA\u{FFFD}B", $processor->get_token_value() ); - } }