Skip to content

Commit 70fb724

Browse files
Copilot and sirreal authored
CSS: Replace URL/string tests with cleaner ident-based tests from #231
Agent-Logs-Url: https://github.com/WordPress/php-toolkit/sessions/892f6cca-d936-43dc-9f63-27739478ddbd Co-authored-by: sirreal <841763+sirreal@users.noreply.github.com>
1 parent 3bf270c commit 70fb724

1 file changed

Lines changed: 62 additions & 60 deletions

File tree

components/DataLiberation/Tests/CSSProcessorTest.php

Lines changed: 62 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -146,6 +146,68 @@ public function test_invalid_utf8_with_two_single_byte_invalid_sequences(): void
146146
$this->assertSame( $expected, $actual_tokens );
147147
}
148148

149+
/**
150+
* In the slow path of decode_string_or_url() (triggered by a backslash escape), normal
151+
* text segments must still have invalid UTF-8 bytes replaced with U+FFFD, just
152+
* as the fast path does via wp_scrub_utf8().
153+
*/
154+
public function test_invalid_utf8_in_normal_segment_combined_with_escape(): void {
155+
// The ident token contains an invalid UTF-8 byte (0xF1) in the "normal"
156+
// segment before a CSS hex escape (\41 = U+0041 = 'A'). The backslash
157+
// triggers the slow path, which previously skipped wp_scrub_utf8() on the
158+
// normal segment.
159+
$css = ".test\xF1\\41name";
160+
161+
$expected = array(
162+
array(
163+
'type' => CSSProcessor::TOKEN_DELIM,
164+
'raw' => '.',
165+
'value' => '.',
166+
),
167+
array(
168+
'type' => CSSProcessor::TOKEN_IDENT,
169+
// raw contains the original bytes.
170+
'raw' => "test\xF1\\41name",
171+
// value must have 0xF1 replaced with U+FFFD and \41 decoded to 'A'.
172+
'value' => "test\u{FFFD}Aname",
173+
),
174+
);
175+
176+
$processor = CSSProcessor::create( $css );
177+
$actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) );
178+
$this->assertSame( $expected, $actual_tokens );
179+
}
180+
181+
/**
182+
* When an invalid UTF-8 byte is the character directly after a backslash
183+
* (i.e. it is the escaped character itself), decode_escape_at() returns the
184+
* raw byte. The caller must scrub it to U+FFFD.
185+
*/
186+
public function test_invalid_utf8_as_escaped_character(): void {
187+
// Without a backslash, `.a\xF1b` is a delim + ident containing a lone invalid byte.
188+
// Adding a backslash before the invalid byte makes it an escape sequence:
189+
// `.a\\\xF1b` => delim + ident whose value contains the escaped 0xF1 byte.
190+
$css = ".a\\\xF1b";
191+
192+
$expected = array(
193+
array(
194+
'type' => CSSProcessor::TOKEN_DELIM,
195+
'raw' => '.',
196+
'value' => '.',
197+
),
198+
array(
199+
'type' => CSSProcessor::TOKEN_IDENT,
200+
'raw' => "a\\\xF1b",
201+
// The escaped 0xF1 must be replaced with U+FFFD.
202+
'value' => "a\u{FFFD}b",
203+
),
204+
);
205+
206+
$processor = CSSProcessor::create( $css );
207+
$actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) );
208+
$this->assertSame( $expected, $actual_tokens );
209+
}
210+
149211
/**
150212
* Legacy test to ensure basic tokenization still works.
151213
*/
@@ -1541,64 +1603,4 @@ public function test_ident_start_codepoint_bounds_check(): void {
15411603
);
15421604
$this->assertSame( $expected_tokens, $actual_tokens );
15431605
}
1544-
1545-
/**
1546-
* Tests that invalid UTF-8 bytes in the slow-path normal segments of a URL token
1547-
* are replaced with U+FFFD, consistent with the fast path.
1548-
*
1549-
* The slow path is triggered by a backslash escape in the URL content. Invalid bytes
1550-
* that appear in the non-escaped portions must still be scrubbed.
1551-
*
1552-
* @see https://github.com/WordPress/php-toolkit/issues/229
1553-
*/
1554-
public function test_invalid_utf8_in_url_slow_path_normal_segment(): void {
1555-
// CSS bytes: u r l ( A \ 4 1 0xFF B )
1556-
// The \41 hex escape triggers the slow path.
1557-
// 0xFF is an invalid UTF-8 byte in a normal (non-escaped) segment.
1558-
$css = "url(A\\41\xFFB)";
1559-
1560-
$processor = CSSProcessor::create( $css );
1561-
$this->assertTrue( $processor->next_token() );
1562-
$this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() );
1563-
// \41 decodes to 'A'; \xFF must be replaced with U+FFFD.
1564-
$this->assertSame( "AA\u{FFFD}B", $processor->get_token_value() );
1565-
}
1566-
1567-
/**
1568-
* Tests that a backslash-escaped invalid UTF-8 byte in a URL token
1569-
* is replaced with U+FFFD, consistent with the fast path.
1570-
*
1571-
* In the slow path, decode_escape_at() returns the raw invalid byte for
1572-
* the "anything else" escape case and the caller must scrub it.
1573-
*
1574-
* @see https://github.com/WordPress/php-toolkit/issues/229
1575-
*/
1576-
public function test_invalid_utf8_in_url_slow_path_escaped_byte(): void {
1577-
// CSS bytes: u r l ( A \ 4 1 \ 0xFF )
1578-
// \41 is a hex escape for 'A'; \<0xFF> is "anything else" escape for the 0xFF byte.
1579-
$css = "url(A\\41\\\xFF)";
1580-
1581-
$processor = CSSProcessor::create( $css );
1582-
$this->assertTrue( $processor->next_token() );
1583-
$this->assertSame( CSSProcessor::TOKEN_URL, $processor->get_token_type() );
1584-
// \41 decodes to 'A'; \<0xFF> must produce U+FFFD.
1585-
$this->assertSame( "AA\u{FFFD}", $processor->get_token_value() );
1586-
}
1587-
1588-
/**
1589-
* Tests that invalid UTF-8 bytes in the slow-path normal segments of a string token
1590-
* are replaced with U+FFFD, consistent with the fast path.
1591-
*
1592-
* @see https://github.com/WordPress/php-toolkit/issues/229
1593-
*/
1594-
public function test_invalid_utf8_in_string_slow_path_normal_segment(): void {
1595-
// String token 'A\41<0xFF>B' – the \41 escape triggers the slow path.
1596-
$css = "'A\\41\xFFB'";
1597-
1598-
$processor = CSSProcessor::create( $css );
1599-
$this->assertTrue( $processor->next_token() );
1600-
$this->assertSame( CSSProcessor::TOKEN_STRING, $processor->get_token_type() );
1601-
// \41 decodes to 'A'; \xFF must be replaced with U+FFFD.
1602-
$this->assertSame( "AA\u{FFFD}B", $processor->get_token_value() );
1603-
}
16041606
}

0 commit comments

Comments
 (0)