From 4858eae584dc0c41d0520288648d8e95cce09e05 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 8 Apr 2026 10:59:16 +0200 Subject: [PATCH 01/12] Add tests --- .../DataLiberation/Tests/CSSProcessorTest.php | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php index ed164ac2..fc99008c 100644 --- a/components/DataLiberation/Tests/CSSProcessorTest.php +++ b/components/DataLiberation/Tests/CSSProcessorTest.php @@ -1541,4 +1541,142 @@ public function test_ident_start_codepoint_bounds_check(): void { ); $this->assertSame( $expected_tokens, $actual_tokens ); } + + /** + * Tests that backslash-newline in a string token contributes nothing to the value. + * + * CSS spec §4.3.5 consume-string-token: + * > U+005C REVERSE SOLIDUS (\) + * > Otherwise, if the next input code point is a newline, consume it. + * + * The backslash and newline are both consumed and produce no value. + * + * @see https://www.w3.org/TR/css-syntax-3/#consume-string-token + * @see https://github.com/WordPress/php-toolkit/issues/222 + * + * @dataProvider data_string_backslash_newline + */ + public function test_string_backslash_newline( string $css, string $expected_value ): void { + $processor = CSSProcessor::create( $css ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() ); + $this->assertSame( $expected_value, $processor->get_token_value() ); + } + + static public function data_string_backslash_newline(): array { + return array( + 'backslash-LF' => array( "'str\\\ning'", 'string' ), + 'backslash-FF' => array( "'str\\\fing'", 'string' ), + 'backslash-CR' => array( "'str\\\ring'", 'string' ), + 'backslash-CRLF' => array( "'str\\\r\ning'", 'string' ), + ); + } + + /** + * Tests that backslash-EOF in a string token contributes nothing to the value. 
+ * + * CSS spec §4.3.5 consume-string-token: + * > U+005C REVERSE SOLIDUS (\) + * > If the next input code point is EOF, do nothing. + * + * The trailing backslash is consumed and produces no value. + * + * @see https://www.w3.org/TR/css-syntax-3/#consume-string-token + * @see https://github.com/WordPress/php-toolkit/issues/223 + */ + public function test_string_backslash_eof(): void { + $processor = CSSProcessor::create( "'string\\" ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() ); + $this->assertSame( 'string', $processor->get_token_value() ); + } + + /** + * Tests that backslash-newline in an unquoted URL produces a bad-url token. + * + * In unquoted URLs, the backslash-newline check goes through is_valid_escape() + * which returns false for newlines, triggering consume_remnants_of_bad_url(). + * + * @see https://www.w3.org/TR/css-syntax-3/#consume-url-token + * + * @dataProvider data_url_backslash_newline + */ + public function test_url_backslash_newline( string $css ): void { + $processor = CSSProcessor::create( $css ); + + $found_bad_url = false; + while ( $processor->next_token() ) { + if ( CSSProcessor::TOKEN_BAD_URL === $processor->get_token_type() ) { + $found_bad_url = true; + break; + } + } + + $this->assertTrue( $found_bad_url, 'Expected a BAD_URL token but none was found.' ); + } + + static public function data_url_backslash_newline(): array { + return array( + 'backslash-LF' => array( "url(ab\\\ncd)" ), + 'backslash-FF' => array( "url(ab\\\fcd)" ), + 'backslash-CR' => array( "url(ab\\\rcd)" ), + 'backslash-CRLF' => array( "url(ab\\\r\ncd)" ), + ); + } + + /** + * Tests that backslash-EOF in an unquoted URL produces U+FFFD in the value. + * + * In unquoted URLs, is_valid_escape() returns true for backslash-EOF, + * and consuming the escaped code point at EOF produces U+FFFD per spec. 
+ * + * @see https://www.w3.org/TR/css-syntax-3/#consume-url-token + */ + public function test_url_backslash_eof(): void { + $processor = CSSProcessor::create( "url(string\\" ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() ); + $this->assertSame( "string\u{FFFD}", $processor->get_token_value() ); + } + + /** + * Tests that backslash-newline stops an ident sequence. + * + * In idents, is_valid_escape() returns false for backslash-newline, + * so the ident stops before the backslash. + * + * @see https://www.w3.org/TR/css-syntax-3/#consume-name + * + * @dataProvider data_ident_backslash_newline + */ + public function test_ident_backslash_newline( string $css ): void { + $processor = CSSProcessor::create( $css ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_IDENT, $processor->get_token_type() ); + $this->assertSame( 'abc', $processor->get_token_value() ); + } + + static public function data_ident_backslash_newline(): array { + return array( + 'backslash-LF' => array( "abc\\\n" ), + 'backslash-FF' => array( "abc\\\f" ), + 'backslash-CR' => array( "abc\\\r" ), + 'backslash-CRLF' => array( "abc\\\r\n" ), + ); + } + + /** + * Tests that backslash-EOF in an ident produces U+FFFD in the value. + * + * In idents, is_valid_escape() returns true for backslash-EOF, + * and consuming the escaped code point at EOF produces U+FFFD per spec. 
+ * + * @see https://www.w3.org/TR/css-syntax-3/#consume-name + */ + public function test_ident_backslash_eof(): void { + $processor = CSSProcessor::create( "abc\\" ); + $this->assertTrue( $processor->next_token() ); + $this->assertSame( CSSProcessor::TOKEN_IDENT, $processor->get_token_type() ); + $this->assertSame( "abc\u{FFFD}", $processor->get_token_value() ); + } } From 5409b37e3aed88d43e6cc55b920f4e8b2be8285a Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 8 Apr 2026 10:59:28 +0200 Subject: [PATCH 02/12] Add test group to CSS processor tests --- components/DataLiberation/Tests/CSSProcessorTest.php | 2 ++ 1 file changed, 2 insertions(+) diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php index fc99008c..e7bca23b 100644 --- a/components/DataLiberation/Tests/CSSProcessorTest.php +++ b/components/DataLiberation/Tests/CSSProcessorTest.php @@ -5,6 +5,8 @@ /** * Comprehensive CSS processor tests based on the CSS Syntax Level 3 specification. 
+ * + * @group css + */ class CSSProcessorTest extends TestCase { From b38e7578710666f498de304b91646348af52d6d3 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 8 Apr 2026 11:00:32 +0200 Subject: [PATCH 03/12] Add css test group to cssurlprocessor tests --- components/DataLiberation/Tests/CSSUrlProcessorTest.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/components/DataLiberation/Tests/CSSUrlProcessorTest.php b/components/DataLiberation/Tests/CSSUrlProcessorTest.php index 9fb9eb6a..050ce3b8 100644 --- a/components/DataLiberation/Tests/CSSUrlProcessorTest.php +++ b/components/DataLiberation/Tests/CSSUrlProcessorTest.php @@ -3,6 +3,9 @@ use PHPUnit\Framework\TestCase; use WordPress\DataLiberation\URL\CSSURLProcessor; +/** + * @group css + */ class CSSURLProcessorTest extends TestCase { /** From a83213072f09eca24ee175922050debeea0989c8 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 8 Apr 2026 11:07:00 +0200 Subject: [PATCH 04/12] Use "public static" order (PSR-12) --- components/DataLiberation/Tests/CSSProcessorTest.php | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php index e7bca23b..7863dbc4 100644 --- a/components/DataLiberation/Tests/CSSProcessorTest.php +++ b/components/DataLiberation/Tests/CSSProcessorTest.php @@ -27,7 +27,7 @@ public function test_processor_matches_spec( string $css, array $expected_tokens * @see https://github.com/romainmenke/css-processor-tests/ * @return array */ - static public function corpus_provider(): array { + public static function corpus_provider(): array { return json_decode(file_get_contents(__DIR__ . '/css-test-cases.json'), true); } @@ -37,7 +37,7 @@ static public function corpus_provider(): array { * @param CSSProcessor $processor The CSS processor. * @return array Array of tokens with type, raw, startIndex, endIndex, structured. 
*/ - static public function collect_tokens( CSSProcessor $processor, $keys = null ): array { + public static function collect_tokens( CSSProcessor $processor, $keys = null ): array { $tokens = array(); while ( $processor->next_token() ) { @@ -1565,7 +1565,7 @@ public function test_string_backslash_newline( string $css, string $expected_val $this->assertSame( $expected_value, $processor->get_token_value() ); } - static public function data_string_backslash_newline(): array { + public static function data_string_backslash_newline(): array { return array( 'backslash-LF' => array( "'str\\\ning'", 'string' ), 'backslash-FF' => array( "'str\\\fing'", 'string' ), @@ -1617,7 +1617,7 @@ public function test_url_backslash_newline( string $css ): void { $this->assertTrue( $found_bad_url, 'Expected a BAD_URL token but none was found.' ); } - static public function data_url_backslash_newline(): array { + public static function data_url_backslash_newline(): array { return array( 'backslash-LF' => array( "url(ab\\\ncd)" ), 'backslash-FF' => array( "url(ab\\\fcd)" ), @@ -1658,7 +1658,7 @@ public function test_ident_backslash_newline( string $css ): void { $this->assertSame( 'abc', $processor->get_token_value() ); } - static public function data_ident_backslash_newline(): array { + public static function data_ident_backslash_newline(): array { return array( 'backslash-LF' => array( "abc\\\n" ), 'backslash-FF' => array( "abc\\\f" ), From fdf2e0846a2be769c95052c236661bf869b26672 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 8 Apr 2026 11:27:55 +0200 Subject: [PATCH 05/12] Fix incorrect test cases in JSON corpus --- .../DataLiberation/Tests/css-test-cases.json | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/components/DataLiberation/Tests/css-test-cases.json b/components/DataLiberation/Tests/css-test-cases.json index a1c10081..f1cee4b4 100644 --- a/components/DataLiberation/Tests/css-test-cases.json +++ 
b/components/DataLiberation/Tests/css-test-cases.json @@ -273,8 +273,8 @@ "raw": "\"foo\\\n\"", "startIndex": 0, "endIndex": 7, - "normalized": "\"foo\\\n\"", - "value": "foo\\\n" + "normalized": "\"foo\"", + "value": "foo" }, { "type": "whitespace-token", @@ -331,8 +331,8 @@ "raw": "\"foo\\\r\n\"", "startIndex": 0, "endIndex": 8, - "normalized": "\"foo\\\n\"", - "value": "foo\\\n" + "normalized": "\"foo\"", + "value": "foo" }, { "type": "whitespace-token", @@ -1497,8 +1497,8 @@ "raw": "\"lQh{R5M!QyOWE}oC2{2K TIa9}zb2oXWREY]0aj5J\\\r\nBJ5CO-16W5H7noF 19䀹41H3e8Z9%tg[O5AHEY24xh'9\"", "startIndex": 11, "endIndex": 102, - "normalized": "\"lQh{R5M!QyOWE}oC2{2K TIa9}zb2oXWREY]0aj5J\\\nBJ5CO-16W5H7noF 19䀹41H3e8Z9%tg[O5AHEY24xh'9\"", - "value": "lQh{R5M!QyOWE}oC2{2K TIa9}zb2oXWREY]0aj5J\\\nBJ5CO-16W5H7noF 19䀹41H3e8Z9%tg[O5AHEY24xh'9" + "normalized": "\"lQh{R5M!QyOWE}oC2{2K TIa9}zb2oXWREY]0aj5JBJ5CO-16W5H7noF 19䀹41H3e8Z9%tg[O5AHEY24xh'9\"", + "value": "lQh{R5M!QyOWE}oC2{2K TIa9}zb2oXWREY]0aj5JBJ5CO-16W5H7noF 19䀹41H3e8Z9%tg[O5AHEY24xh'9" }, { "type": "string-token", @@ -1721,8 +1721,8 @@ "raw": "'E{z\u0000U\u001fEG2}2Verb>nj3TVk3mu7wX1J\b.H\u000bi1Ga8f5 dserqydJ3\"xj398xy.W\" uHQbv7Bw1NtF;N3PwNY7Vx00BF o\"4CXzvP\"{594 6r}8QQKNQw135i1\\\r\nrey\thg7[5%rBK8RUC64Lu␌17O{E\\90873u}1O3vx4gHTC55Q9i4\"V3Vx4\"7r(34L]F\"ns2pPf\"V7b)EOBGH8rdC7\"\u000eVJ4OQ[ 9jtoMdINgS7o�206vo72kTcKkZR9wl30G'", "startIndex": 3, "endIndex": 263, - "normalized": "'E{z�U\u001fEG2}2Verb>nj3TVk3mu7wX1J\b.H\u000bi1Ga8f5 dserqydJ3\"xj398xy.W\" uHQbv7Bw1NtF;N3PwNY7Vx00BF o\"4CXzvP\"{594 6r}8QQKNQw135i1\\\nrey\thg7[5%rBK8RUC64Lu␌17O{E򐡳u}1O3vx4gHTC55Q9i4\"V3Vx4\"7r(34L]F\"ns2pPf\"V7b)EOBGH8rdC7\"\u000eVJ4OQ[ 9jtoMdINgS7o�206vo72kTcKkZR9wl30G'", - "value": "E{z�U\u001fEG2}2Verb>nj3TVk3mu7wX1J\b.H\u000bi1Ga8f5 dserqydJ3\"xj398xy.W\" uHQbv7Bw1NtF;N3PwNY7Vx00BF o\"4CXzvP\"{594 6r}8QQKNQw135i1\\\nrey\thg7[5%rBK8RUC64Lu␌17O{E򐡳u}1O3vx4gHTC55Q9i4\"V3Vx4\"7r(34L]F\"ns2pPf\"V7b)EOBGH8rdC7\"\u000eVJ4OQ[ 
9jtoMdINgS7o�206vo72kTcKkZR9wl30G" + "normalized": "'E{z�U\u001fEG2}2Verb>nj3TVk3mu7wX1J\b.H\u000bi1Ga8f5 dserqydJ3\"xj398xy.W\" uHQbv7Bw1NtF;N3PwNY7Vx00BF o\"4CXzvP\"{594 6r}8QQKNQw135i1rey\thg7[5%rBK8RUC64Lu␌17O{E򐡳u}1O3vx4gHTC55Q9i4\"V3Vx4\"7r(34L]F\"ns2pPf\"V7b)EOBGH8rdC7\"\u000eVJ4OQ[ 9jtoMdINgS7o�206vo72kTcKkZR9wl30G'", + "value": "E{z�U\u001fEG2}2Verb>nj3TVk3mu7wX1J\b.H\u000bi1Ga8f5 dserqydJ3\"xj398xy.W\" uHQbv7Bw1NtF;N3PwNY7Vx00BF o\"4CXzvP\"{594 6r}8QQKNQw135i1rey\thg7[5%rBK8RUC64Lu␌17O{E򐡳u}1O3vx4gHTC55Q9i4\"V3Vx4\"7r(34L]F\"ns2pPf\"V7b)EOBGH8rdC7\"\u000eVJ4OQ[ 9jtoMdINgS7o�206vo72kTcKkZR9wl30G" }, { "type": "ident-token", @@ -1893,8 +1893,8 @@ "raw": "'\\\r\n{X Date: Wed, 8 Apr 2026 12:24:55 +0200 Subject: [PATCH 06/12] Fix misreported string token values for \-EOF and \-newline --- .../DataLiberation/CSS/class-cssprocessor.php | 67 +++++++++++++++---- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php index cacb48b4..0d0ca935 100644 --- a/components/DataLiberation/CSS/class-cssprocessor.php +++ b/components/DataLiberation/CSS/class-cssprocessor.php @@ -632,9 +632,10 @@ public function get_normalized_token(): ?string { return null; } - return $this->decode_string_or_url( + return $this->decode_escapes( $this->token_starts_at, - $this->token_length + $this->token_length, + self::TOKEN_STRING === $this->token_type || self::TOKEN_BAD_STRING === $this->token_type ); } @@ -680,34 +681,45 @@ public function get_token_value() { switch ( $this->token_type ) { case self::TOKEN_HASH: // Hash value starts after the # character. - $this->token_value = $this->decode_string_or_url( $this->token_starts_at + 1, $this->token_length - 1 ); + $this->token_value = $this->decode_escapes( $this->token_starts_at + 1, $this->token_length - 1 ); break; case self::TOKEN_AT_KEYWORD: // At-keyword value starts after the @ character. 
- $this->token_value = $this->decode_string_or_url( $this->token_starts_at + 1, $this->token_length - 1 ); + $this->token_value = $this->decode_escapes( $this->token_starts_at + 1, $this->token_length - 1 ); break; case self::TOKEN_FUNCTION: // Function name is everything except the final (. - $this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length - 1 ); + $this->token_value = $this->decode_escapes( $this->token_starts_at, $this->token_length - 1 ); break; case self::TOKEN_IDENT: // Identifier is the entire token. - $this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length ); + $this->token_value = $this->decode_escapes( $this->token_starts_at, $this->token_length ); break; case self::TOKEN_STRING: case self::TOKEN_BAD_STRING: + // Decode and cache the string value. + if ( null !== $this->token_value_starts_at && null !== $this->token_value_length ) { + $this->token_value = $this->decode_escapes( + $this->token_value_starts_at, + $this->token_value_length, + true + ); + } else { + $this->token_value = null; + } + break; + case self::TOKEN_URL: - // Decode and cache the string/URL value. + // Decode and cache the URL value. if ( null !== $this->token_value_starts_at && null !== $this->token_value_length ) { - $this->token_value = $this->decode_string_or_url( + $this->token_value = $this->decode_escapes( $this->token_value_starts_at, $this->token_value_length ); - $this->token_value = $this->token_value; } else { $this->token_value = null; } @@ -715,7 +727,7 @@ public function get_token_value() { case self::TOKEN_DELIM: // Delim value is the single code point. - $this->token_value = $this->decode_string_or_url( $this->token_starts_at, $this->token_length ); + $this->token_value = $this->decode_escapes( $this->token_starts_at, $this->token_length ); break; case self::TOKEN_NUMBER: @@ -1185,7 +1197,7 @@ private function consume_numeric(): bool { // Consume an ident sequence. 
Set the 's unit to the returned value. $unit_starts_at = $this->at; $this->consume_ident_sequence(); - $this->token_unit = $this->decode_string_or_url( $unit_starts_at, $this->at - $unit_starts_at ); + $this->token_unit = $this->decode_escapes( $unit_starts_at, $this->at - $unit_starts_at ); $this->token_type = self::TOKEN_DIMENSION; $this->token_length = $this->at - $this->token_starts_at; return true; @@ -1220,7 +1232,7 @@ private function consume_ident_like(): bool { // Consume an ident sequence, and let string be the result. $ident_start = $this->at; $decoded = $this->consume_ident_sequence(); - $string = $decoded ?? $this->decode_string_or_url( $ident_start, $this->at - $ident_start ); + $string = $decoded ?? $this->decode_escapes( $ident_start, $this->at - $ident_start ); // If string's value is an ASCII case-insensitive match for "url", // and the next input code point is U+0028 LEFT PARENTHESIS ((). @@ -1551,7 +1563,7 @@ private function consume_ident_start_codepoint( $at ): int { * @param int $length Length of the substring to decode. * @return string Decoded/normalized string. */ - private function decode_string_or_url( int $start, int $length ): string { + private function decode_escapes( int $start, int $length, bool $string_escapes = false ): string { // Fast path: check if any processing is needed. $slice = wp_scrub_utf8( substr( $this->css, $start, $length ) ); $special_chars = "\\\r\f\x00"; @@ -1581,8 +1593,35 @@ private function decode_string_or_url( int $start, int $length ): string { $char = $this->css[ $at ]; - // Handle escapes (if enabled). + // Handle escapes. if ( '\\' === $char ) { + /* + * String tokens have special escape rules per §4.3.5: + * - \-EOF: do nothing (consume the backslash, produce no value). + * - \-newline: consume both (line continuation, produce no value). + * These must be checked before the general escape path. + */ + if ( $string_escapes ) { + if ( $at + 1 >= $end ) { + // \-EOF: consume the backslash and stop. 
+ ++$at; + continue; + } + $next = $this->css[ $at + 1 ]; + if ( "\n" === $next || "\f" === $next ) { + $at += 2; + continue; + } + if ( "\r" === $next ) { + $at += 2; + // \r\n counts as one newline. + if ( $at < $end && "\n" === $this->css[ $at ] ) { + ++$at; + } + continue; + } + } + if ( $this->is_valid_escape( $at ) ) { ++$at; $decoded .= $this->decode_escape_at( $at, $bytes_consumed ); From a73187bbbc004c6628e6e0b72bbda42220d6c427 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 8 Apr 2026 12:38:57 +0200 Subject: [PATCH 07/12] Document new param --- components/DataLiberation/CSS/class-cssprocessor.php | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php index 0d0ca935..d2774292 100644 --- a/components/DataLiberation/CSS/class-cssprocessor.php +++ b/components/DataLiberation/CSS/class-cssprocessor.php @@ -1559,9 +1559,13 @@ private function consume_ident_start_codepoint( $at ): int { * Slow path: Builds the decoded string by optionally processing escapes and * normalizing line endings and null bytes. * - * @param int $start Start byte offset. - * @param int $length Length of the substring to decode. - * @return string Decoded/normalized string. + * @param int $start Start byte offset. + * @param int $length Length of the substring to decode. + * @param bool $string_escapes Optional, default false. When true, apply special CSS string + * token escape rules: + * - \-newline is consumed as a line continuation (ignored). + * - \-EOF is silently discarded. + * @return string Decoded and normalized string. */ private function decode_escapes( int $start, int $length, bool $string_escapes = false ): string { // Fast path: check if any processing is needed. 
From 6bbda4d194438a4ea189ee5ce3443718a3935b4a Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 8 Apr 2026 14:51:59 +0200 Subject: [PATCH 08/12] Update docblock appropriately --- components/DataLiberation/CSS/class-cssprocessor.php | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php index d2774292..e5bd3771 100644 --- a/components/DataLiberation/CSS/class-cssprocessor.php +++ b/components/DataLiberation/CSS/class-cssprocessor.php @@ -1551,13 +1551,9 @@ private function consume_ident_start_codepoint( $at ): int { } /** - * Decodes a string or URL value with escape sequences and normalization. + * Decodes and normalizes ident-like or string CSS values from a byte range. * - * Fast path: If the slice contains no special characters, returns the raw - * substring with almost zero allocations. - * - * Slow path: Builds the decoded string by optionally processing escapes and - * normalizing line endings and null bytes. + * Applies appropriate escaping rules and normalizes newlines and null bytes. * * @param int $start Start byte offset. * @param int $length Length of the substring to decode. 
From a24a2c27bf0e3c595d681a474fe3f3d2067da792 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 8 Apr 2026 15:06:53 +0200 Subject: [PATCH 09/12] Rename method to more general `decode_range` and add table of examples --- .../DataLiberation/CSS/class-cssprocessor.php | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php index e5bd3771..e74460ce 100644 --- a/components/DataLiberation/CSS/class-cssprocessor.php +++ b/components/DataLiberation/CSS/class-cssprocessor.php @@ -632,7 +632,7 @@ public function get_normalized_token(): ?string { return null; } - return $this->decode_escapes( + return $this->decode_range( $this->token_starts_at, $this->token_length, self::TOKEN_STRING === $this->token_type || self::TOKEN_BAD_STRING === $this->token_type @@ -681,29 +681,29 @@ public function get_token_value() { switch ( $this->token_type ) { case self::TOKEN_HASH: // Hash value starts after the # character. - $this->token_value = $this->decode_escapes( $this->token_starts_at + 1, $this->token_length - 1 ); + $this->token_value = $this->decode_range( $this->token_starts_at + 1, $this->token_length - 1 ); break; case self::TOKEN_AT_KEYWORD: // At-keyword value starts after the @ character. - $this->token_value = $this->decode_escapes( $this->token_starts_at + 1, $this->token_length - 1 ); + $this->token_value = $this->decode_range( $this->token_starts_at + 1, $this->token_length - 1 ); break; case self::TOKEN_FUNCTION: // Function name is everything except the final (. - $this->token_value = $this->decode_escapes( $this->token_starts_at, $this->token_length - 1 ); + $this->token_value = $this->decode_range( $this->token_starts_at, $this->token_length - 1 ); break; case self::TOKEN_IDENT: // Identifier is the entire token. 
- $this->token_value = $this->decode_escapes( $this->token_starts_at, $this->token_length ); + $this->token_value = $this->decode_range( $this->token_starts_at, $this->token_length ); break; case self::TOKEN_STRING: case self::TOKEN_BAD_STRING: // Decode and cache the string value. if ( null !== $this->token_value_starts_at && null !== $this->token_value_length ) { - $this->token_value = $this->decode_escapes( + $this->token_value = $this->decode_range( $this->token_value_starts_at, $this->token_value_length, true @@ -716,7 +716,7 @@ public function get_token_value() { case self::TOKEN_URL: // Decode and cache the URL value. if ( null !== $this->token_value_starts_at && null !== $this->token_value_length ) { - $this->token_value = $this->decode_escapes( + $this->token_value = $this->decode_range( $this->token_value_starts_at, $this->token_value_length ); @@ -727,7 +727,7 @@ public function get_token_value() { case self::TOKEN_DELIM: // Delim value is the single code point. - $this->token_value = $this->decode_escapes( $this->token_starts_at, $this->token_length ); + $this->token_value = $this->decode_range( $this->token_starts_at, $this->token_length ); break; case self::TOKEN_NUMBER: @@ -1197,7 +1197,7 @@ private function consume_numeric(): bool { // Consume an ident sequence. Set the 's unit to the returned value. $unit_starts_at = $this->at; $this->consume_ident_sequence(); - $this->token_unit = $this->decode_escapes( $unit_starts_at, $this->at - $unit_starts_at ); + $this->token_unit = $this->decode_range( $unit_starts_at, $this->at - $unit_starts_at ); $this->token_type = self::TOKEN_DIMENSION; $this->token_length = $this->at - $this->token_starts_at; return true; @@ -1232,7 +1232,7 @@ private function consume_ident_like(): bool { // Consume an ident sequence, and let string be the result. $ident_start = $this->at; $decoded = $this->consume_ident_sequence(); - $string = $decoded ?? 
$this->decode_escapes( $ident_start, $this->at - $ident_start ); + $string = $decoded ?? $this->decode_range( $ident_start, $this->at - $ident_start ); // If string's value is an ASCII case-insensitive match for "url", // and the next input code point is U+0028 LEFT PARENTHESIS ((). @@ -1553,7 +1553,17 @@ private function consume_ident_start_codepoint( $at ): int { /** * Decodes and normalizes ident-like or string CSS values from a byte range. * - * Applies appropriate escaping rules and normalizes newlines and null bytes. + * For example: + * ┌──────────────┬────────┐ + * │ Input │ Output │ + * ├──────────────┼────────┤ + * │ 'xyz' │ 'xyz' │ + * │ '\x\y\z' │ 'xyz' │ + * │ 'x\79z' │ 'xyz' │ + * │ 'x\000079 z' │ 'xyz' │ + * │ 'a\r\nb' │ 'a\nb' │ + * │ 'a\0b' │ 'a�b' │ + * └──────────────┴────────┘ * * @param int $start Start byte offset. * @param int $length Length of the substring to decode. @@ -1563,7 +1573,7 @@ private function consume_ident_start_codepoint( $at ): int { * - \-EOF is silently discarded. * @return string Decoded and normalized string. */ - private function decode_escapes( int $start, int $length, bool $string_escapes = false ): string { + private function decode_range( int $start, int $length, bool $string_escapes = false ): string { // Fast path: check if any processing is needed. 
$slice = wp_scrub_utf8( substr( $this->css, $start, $length ) ); $special_chars = "\\\r\f\x00"; From 96b91af2e0a80333c5d4cb3354ccfdd66d9f1a44 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:17:06 +0000 Subject: [PATCH 10/12] Address review comments: fix TOKEN_BAD_STRING condition and improve 0x notation in comments Agent-Logs-Url: https://github.com/WordPress/php-toolkit/sessions/fe9a37e8-8456-41d8-85db-ad02267619dc Co-authored-by: adamziel <205419+adamziel@users.noreply.github.com> --- components/DataLiberation/CSS/class-cssprocessor.php | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php index e74460ce..931ec92d 100644 --- a/components/DataLiberation/CSS/class-cssprocessor.php +++ b/components/DataLiberation/CSS/class-cssprocessor.php @@ -635,7 +635,7 @@ public function get_normalized_token(): ?string { return $this->decode_range( $this->token_starts_at, $this->token_length, - self::TOKEN_STRING === $this->token_type || self::TOKEN_BAD_STRING === $this->token_type + self::TOKEN_STRING === $this->token_type ); } @@ -1607,24 +1607,26 @@ private function decode_range( int $start, int $length, bool $string_escapes = f if ( '\\' === $char ) { /* * String tokens have special escape rules per §4.3.5: - * - \-EOF: do nothing (consume the backslash, produce no value). - * - \-newline: consume both (line continuation, produce no value). + * - 0x5C (backslash) at EOF: consume the backslash, produce no value. + * - 0x5C (backslash) followed by 0x0A (LF), 0x0C (FF), or 0x0D (CR): + * consume both characters as a line continuation, produce no value. * These must be checked before the general escape path. */ if ( $string_escapes ) { if ( $at + 1 >= $end ) { - // \-EOF: consume the backslash and stop. + // 0x5C at EOF: consume the backslash and stop. 
++$at; continue; } $next = $this->css[ $at + 1 ]; if ( "\n" === $next || "\f" === $next ) { + // 0x5C followed by 0x0A (LF) or 0x0C (FF): line continuation. $at += 2; continue; } if ( "\r" === $next ) { + // 0x5C followed by 0x0D (CR): line continuation; 0x0D 0x0A counts as one newline. $at += 2; - // \r\n counts as one newline. if ( $at < $end && "\n" === $this->css[ $at ] ) { ++$at; } From b0d082e1cf620826cf0f00d4033fd6a991b140f6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 8 Apr 2026 19:12:06 +0000 Subject: [PATCH 11/12] Use 0x hex notation in $string_escapes param docblock Agent-Logs-Url: https://github.com/WordPress/php-toolkit/sessions/601e9fa1-ea30-4c5d-accc-7da80e9f849e Co-authored-by: sirreal <841763+sirreal@users.noreply.github.com> --- components/DataLiberation/CSS/class-cssprocessor.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php index 6b90abb9..bb7b2de9 100644 --- a/components/DataLiberation/CSS/class-cssprocessor.php +++ b/components/DataLiberation/CSS/class-cssprocessor.php @@ -1566,8 +1566,9 @@ private function consume_ident_start_codepoint( $at ): int { * @param int $length Length of the substring to decode. * @param bool $string_escapes Optional, default false. When true, apply special CSS string * token escape rules: - * - \-newline is consumed as a line continuation (ignored). - * - \-EOF is silently discarded. + * - 0x5C (backslash) followed by 0x0A (LF), 0x0C (FF), or 0x0D (CR) + * is consumed as a line continuation (ignored). + * - 0x5C (backslash) at EOF is silently discarded. * @return string Decoded and normalized string. 
*/ private function decode_range( int $start, int $length, bool $string_escapes = false ): string { From 59fe7771b2f63450345becd8e16c5ff647491ee1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 8 Apr 2026 19:31:48 +0000 Subject: [PATCH 12/12] Simplify $string_escapes docblock and add CRLF case to body comment Agent-Logs-Url: https://github.com/WordPress/php-toolkit/sessions/35229818-8e9a-4375-b689-2b947ddac989 Co-authored-by: sirreal <841763+sirreal@users.noreply.github.com> --- components/DataLiberation/CSS/class-cssprocessor.php | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php index bb7b2de9..f4dda307 100644 --- a/components/DataLiberation/CSS/class-cssprocessor.php +++ b/components/DataLiberation/CSS/class-cssprocessor.php @@ -1564,11 +1564,8 @@ private function consume_ident_start_codepoint( $at ): int { * * @param int $start Start byte offset. * @param int $length Length of the substring to decode. - * @param bool $string_escapes Optional, default false. When true, apply special CSS string - * token escape rules: - * - 0x5C (backslash) followed by 0x0A (LF), 0x0C (FF), or 0x0D (CR) - * is consumed as a line continuation (ignored). - * - 0x5C (backslash) at EOF is silently discarded. + * @param bool $string_escapes Optional, default false. When true, apply additional escape + * rules that apply only to string tokens (CSS §4.3.5). * @return string Decoded and normalized string. */ private function decode_range( int $start, int $length, bool $string_escapes = false ): string { @@ -1608,6 +1605,8 @@ private function decode_range( int $start, int $length, bool $string_escapes = f * - 0x5C (backslash) at EOF: consume the backslash, produce no value. 
* - 0x5C (backslash) followed by 0x0A (LF), 0x0C (FF), or 0x0D (CR): * consume both characters as a line continuation, produce no value. + * - 0x5C (backslash) followed by 0x0D 0x0A (CRLF): + * consume all three characters as a line continuation, produce no value. * These must be checked before the general escape path. */ if ( $string_escapes ) {