diff --git a/components/DataLiberation/CSS/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php
index e74460ce..8016ce5a 100644
--- a/components/DataLiberation/CSS/class-cssprocessor.php
+++ b/components/DataLiberation/CSS/class-cssprocessor.php
@@ -1593,7 +1593,7 @@ private function decode_range( int $start, int $length, bool $string_escapes = f
 			if ( $normal_len > 0 ) {
 				// Clamp to not exceed the end boundary.
 				$normal_len = min( $normal_len, $end - $at );
-				$decoded .= substr( $this->css, $at, $normal_len );
+				$decoded .= wp_scrub_utf8( substr( $this->css, $at, $normal_len ) );
 				$at += $normal_len;
 			}
 
@@ -1634,7 +1634,7 @@ private function decode_range( int $start, int $length, bool $string_escapes = f
 
 			if ( $this->is_valid_escape( $at ) ) {
 				++$at;
-				$decoded .= $this->decode_escape_at( $at, $bytes_consumed );
+				$decoded .= wp_scrub_utf8( $this->decode_escape_at( $at, $bytes_consumed ) );
 				$at += $bytes_consumed;
 				continue;
 			}
diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php
index 7863dbc4..4bcebafc 100644
--- a/components/DataLiberation/Tests/CSSProcessorTest.php
+++ b/components/DataLiberation/Tests/CSSProcessorTest.php
@@ -148,6 +148,68 @@ public function test_invalid_utf8_with_two_single_byte_invalid_sequences(): void
 		$this->assertSame( $expected, $actual_tokens );
 	}
 
+	/**
+	 * In the slow path of decode_range() (triggered by a backslash escape), normal
+	 * text segments must still have invalid UTF-8 bytes replaced with U+FFFD, just
+	 * as the fast path does via wp_scrub_utf8().
+	 */
+	public function test_invalid_utf8_in_normal_segment_combined_with_escape(): void {
+		// The ident token contains an invalid UTF-8 byte (0xF1) in the "normal"
+		// segment before a CSS hex escape (\41 = U+0041 = 'A'). The backslash
+		// triggers the slow path, which previously skipped wp_scrub_utf8() on the
+		// normal segment.
+		$css = ".test\xF1\\41name";
+
+		$expected = array(
+			array(
+				'type' => CSSProcessor::TOKEN_DELIM,
+				'raw' => '.',
+				'value' => '.',
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_IDENT,
+				// raw keeps the original bytes: 0xF1 and the escape are left untouched.
+				'raw' => "test\xF1\\41name",
+				// value must have 0xF1 replaced with U+FFFD and \41 decoded to 'A'.
+				'value' => "test\u{FFFD}Aname",
+			),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		$actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) );
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * When an invalid UTF-8 byte is the character directly after a backslash
+	 * (i.e. it is the escaped character itself), decode_escape_at() returns the
+	 * raw byte. The caller must scrub it to U+FFFD.
+	 */
+	public function test_invalid_utf8_as_escaped_character(): void {
+		// The input bytes are `.a\<0xF1>b`: a delim followed by a single ident.
+		// The backslash directly before the invalid byte 0xF1 makes that byte the
+		// escaped character of the ident, between the literal `a` and `b`.
+		$css = ".a\\\xF1b";
+
+		$expected = array(
+			array(
+				'type' => CSSProcessor::TOKEN_DELIM,
+				'raw' => '.',
+				'value' => '.',
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_IDENT,
+				'raw' => "a\\\xF1b",
+				// The escaped 0xF1 must be replaced with U+FFFD.
+				'value' => "a\u{FFFD}b",
+			),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		$actual_tokens = $this->collect_tokens( $processor, array( 'type', 'raw', 'value' ) );
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
 	/**
 	 * Legacy test to ensure basic tokenization still works.
 	 */