From f12717fb2c68728b48b40e5bfa9ac675d9f586cd Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 8 Apr 2026 15:27:15 +0000
Subject: [PATCH 1/3] Initial plan


From 3bf270c14acbdb145166e4903592fff846a927c1 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 8 Apr 2026 16:13:42 +0000
Subject: [PATCH 2/3] CSS: Fix inconsistent UTF-8 scrubbing in
 decode_string_or_url slow path

Agent-Logs-Url: https://github.com/WordPress/php-toolkit/sessions/c82f4270-56cb-4261-bee7-92e7971aba49

Co-authored-by: sirreal <841763+sirreal@users.noreply.github.com>
---
 .../DataLiberation/CSS/class-cssprocessor.php |  4 +-
 .../DataLiberation/Tests/CSSProcessorTest.php | 60 +++++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php
index cacb48b4..912a7e78 100644
--- a/components/DataLiberation/CSS/class-cssprocessor.php
+++ b/components/DataLiberation/CSS/class-cssprocessor.php
@@ -1571,7 +1571,7 @@ private function decode_string_or_url( int $start, int $length ): string {
 			if ( $normal_len > 0 ) {
 				// Clamp to not exceed the end boundary.
 				$normal_len = min( $normal_len, $end - $at );
-				$decoded   .= substr( $this->css, $at, $normal_len );
+				$decoded   .= wp_scrub_utf8( substr( $this->css, $at, $normal_len ) );
 				$at        += $normal_len;
 			}
 
@@ -1585,7 +1585,7 @@ private function decode_string_or_url( int $start, int $length ): string {
 			if ( '\\' === $char ) {
 				if ( $this->is_valid_escape( $at ) ) {
 					++$at;
-					$decoded .= $this->decode_escape_at( $at, $bytes_consumed );
+					$decoded .= wp_scrub_utf8( $this->decode_escape_at( $at, $bytes_consumed ) );
 					$at      += $bytes_consumed;
 					continue;
 				}
diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php
index ed164ac2..a08a7873 100644
--- a/components/DataLiberation/Tests/CSSProcessorTest.php
+++ b/components/DataLiberation/Tests/CSSProcessorTest.php
@@ -1541,4 +1541,64 @@ public function test_ident_start_codepoint_bounds_check(): void {
 		);
 		$this->assertSame( $expected_tokens, $actual_tokens );
 	}
+
+	/**
+	 * Tests that invalid UTF-8 bytes in the slow-path normal segments of a URL token
+	 * are replaced with U+FFFD, consistent with the fast path.
+	 *
+	 * The slow path is triggered by a backslash escape in the URL content. Invalid bytes
+	 * that appear in the non-escaped portions must still be scrubbed.
+	 *
+	 * @see https://github.com/WordPress/php-toolkit/issues/229
+	 */
+	public function test_invalid_utf8_in_url_slow_path_normal_segment(): void {
+		// CSS bytes: u r l ( A \ 4 1 0xFF B )
+		// The \41 hex escape triggers the slow path.
+		// 0xFF is an invalid UTF-8 byte in a normal (non-escaped) segment.
+		$css = "url(A\\41\xFFB)";
+
+		$processor = CSSProcessor::create( $css );
+		$this->assertTrue( $processor->next_token() );
+		$this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() );
+		// \41 decodes to 'A'; \xFF must be replaced with U+FFFD.
+		$this->assertSame( "AA\u{FFFD}B", $processor->get_token_value() );
+	}
+
+	/**
+	 * Tests that a backslash-escaped invalid UTF-8 byte in a URL token
+	 * is replaced with U+FFFD, consistent with the fast path.
+	 *
+	 * In the slow path, decode_escape_at() returns the raw invalid byte for
+	 * the "anything else" escape case and the caller must scrub it.
+	 *
+	 * @see https://github.com/WordPress/php-toolkit/issues/229
+	 */
+	public function test_invalid_utf8_in_url_slow_path_escaped_byte(): void {
+		// CSS bytes: u r l ( A \ 4 1 \ 0xFF )
+		// \41 is a hex escape for 'A'; \<0xFF> is "anything else" escape for the 0xFF byte.
+		$css = "url(A\\41\\\xFF)";
+
+		$processor = CSSProcessor::create( $css );
+		$this->assertTrue( $processor->next_token() );
+		$this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() );
+		// \41 decodes to 'A'; \<0xFF> must produce U+FFFD.
+		$this->assertSame( "AA\u{FFFD}", $processor->get_token_value() );
+	}
+
+	/**
+	 * Tests that invalid UTF-8 bytes in the slow-path normal segments of a string token
+	 * are replaced with U+FFFD, consistent with the fast path.
+	 *
+	 * @see https://github.com/WordPress/php-toolkit/issues/229
+	 */
+	public function test_invalid_utf8_in_string_slow_path_normal_segment(): void {
+		// String token 'A\41<0xFF>B' – the \41 escape triggers the slow path.
+		$css = "'A\\41\xFFB'";
+
+		$processor = CSSProcessor::create( $css );
+		$this->assertTrue( $processor->next_token() );
+		$this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() );
+		// \41 decodes to 'A'; \xFF must be replaced with U+FFFD.
+		$this->assertSame( "AA\u{FFFD}B", $processor->get_token_value() );
+	}
 }

From 70fb72423a6358d36d2dce165c2e7f4054f327a8 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 8 Apr 2026 19:22:32 +0000
Subject: [PATCH 3/3] CSS: Replace URL/string tests with cleaner ident-based
 tests from #231

Agent-Logs-Url: https://github.com/WordPress/php-toolkit/sessions/892f6cca-d936-43dc-9f63-27739478ddbd

Co-authored-by: sirreal <841763+sirreal@users.noreply.github.com>
---
 .../DataLiberation/Tests/CSSProcessorTest.php | 122 +++++++++---------
 1 file changed, 62 insertions(+), 60 deletions(-)

diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php
index a08a7873..a7e124c3 100644
--- a/components/DataLiberation/Tests/CSSProcessorTest.php
+++ b/components/DataLiberation/Tests/CSSProcessorTest.php
@@ -146,6 +146,68 @@ public function test_invalid_utf8_with_two_single_byte_invalid_sequences(): void
 		$this->assertSame( $expected, $actual_tokens );
 	}
 
+	/**
+	 * In the slow path of decode_string_or_url() (triggered by a backslash escape), normal
+	 * text segments must still have invalid UTF-8 bytes replaced with U+FFFD, just
+	 * as the fast path does via wp_scrub_utf8().
+	 */
+	public function test_invalid_utf8_in_normal_segment_combined_with_escape(): void {
+		// The ident token contains an invalid UTF-8 byte (0xF1) in the "normal"
+		// segment before a CSS hex escape (\41 = U+0041 = 'A'). The backslash
+		// triggers the slow path, which previously skipped wp_scrub_utf8() on the
+		// normal segment.
+		$css = ".test\xF1\\41name";
+
+		$expected = array(
+			array(
+				'type'  => CSSProcessor::TOKEN_DELIM,
+				'raw'   => '.',
+				'value' => '.',
+			),
+			array(
+				'type'  => CSSProcessor::TOKEN_IDENT,
+				// raw contains the original bytes.
+				'raw'   => "test\xF1\\41name",
+				// value must have 0xF1 replaced with U+FFFD and \41 decoded to 'A'.
+				'value' => "test\u{FFFD}Aname",
+			),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		$actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) );
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * When an invalid UTF-8 byte is the character directly after a backslash
+	 * (i.e. it is the escaped character itself), decode_escape_at() returns the
+	 * raw byte. The caller must scrub it to U+FFFD.
+	 */
+	public function test_invalid_utf8_as_escaped_character(): void {
+		// The CSS `.a\xF1b` is a delim + ident containing a lone invalid byte.
+		// Adding a backslash before the invalid byte makes it an escape sequence:
+		// `.a\\\xF1b` => delim + ident whose value contains the escaped 0xF1 byte.
+		$css = ".a\\\xF1b";
+
+		$expected = array(
+			array(
+				'type'  => CSSProcessor::TOKEN_DELIM,
+				'raw'   => '.',
+				'value' => '.',
+			),
+			array(
+				'type'  => CSSProcessor::TOKEN_IDENT,
+				'raw'   => "a\\\xF1b",
+				// The escaped 0xF1 must be replaced with U+FFFD.
+				'value' => "a\u{FFFD}b",
+			),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		$actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) );
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
 	/**
 	 * Legacy test to ensure basic tokenization still works.
 	 */
@@ -1541,64 +1603,4 @@ public function test_ident_start_codepoint_bounds_check(): void {
 		);
 		$this->assertSame( $expected_tokens, $actual_tokens );
 	}
-
-	/**
-	 * Tests that invalid UTF-8 bytes in the slow-path normal segments of a URL token
-	 * are replaced with U+FFFD, consistent with the fast path.
-	 *
-	 * The slow path is triggered by a backslash escape in the URL content. Invalid bytes
-	 * that appear in the non-escaped portions must still be scrubbed.
-	 *
-	 * @see https://github.com/WordPress/php-toolkit/issues/229
-	 */
-	public function test_invalid_utf8_in_url_slow_path_normal_segment(): void {
-		// CSS bytes: u r l ( A \ 4 1 0xFF B )
-		// The \41 hex escape triggers the slow path.
-		// 0xFF is an invalid UTF-8 byte in a normal (non-escaped) segment.
-		$css = "url(A\\41\xFFB)";
-
-		$processor = CSSProcessor::create( $css );
-		$this->assertTrue( $processor->next_token() );
-		$this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() );
-		// \41 decodes to 'A'; \xFF must be replaced with U+FFFD.
-		$this->assertSame( "AA\u{FFFD}B", $processor->get_token_value() );
-	}
-
-	/**
-	 * Tests that a backslash-escaped invalid UTF-8 byte in a URL token
-	 * is replaced with U+FFFD, consistent with the fast path.
-	 *
-	 * In the slow path, decode_escape_at() returns the raw invalid byte for
-	 * the "anything else" escape case and the caller must scrub it.
-	 *
-	 * @see https://github.com/WordPress/php-toolkit/issues/229
-	 */
-	public function test_invalid_utf8_in_url_slow_path_escaped_byte(): void {
-		// CSS bytes: u r l ( A \ 4 1 \ 0xFF )
-		// \41 is a hex escape for 'A'; \<0xFF> is "anything else" escape for the 0xFF byte.
-		$css = "url(A\\41\\\xFF)";
-
-		$processor = CSSProcessor::create( $css );
-		$this->assertTrue( $processor->next_token() );
-		$this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() );
-		// \41 decodes to 'A'; \<0xFF> must produce U+FFFD.
-		$this->assertSame( "AA\u{FFFD}", $processor->get_token_value() );
-	}
-
-	/**
-	 * Tests that invalid UTF-8 bytes in the slow-path normal segments of a string token
-	 * are replaced with U+FFFD, consistent with the fast path.
-	 *
-	 * @see https://github.com/WordPress/php-toolkit/issues/229
-	 */
-	public function test_invalid_utf8_in_string_slow_path_normal_segment(): void {
-		// String token 'A\41<0xFF>B' – the \41 escape triggers the slow path.
-		$css = "'A\\41\xFFB'";
-
-		$processor = CSSProcessor::create( $css );
-		$this->assertTrue( $processor->next_token() );
-		$this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() );
-		// \41 decodes to 'A'; \xFF must be replaced with U+FFFD.
-		$this->assertSame( "AA\u{FFFD}B", $processor->get_token_value() );
-	}
 }