From 70c9ccba671f179e5a97b9be6ff53ef2f772502b Mon Sep 17 00:00:00 2001 From: tkshsbcue Date: Tue, 7 Apr 2026 20:01:36 +0530 Subject: [PATCH 1/2] fix(runtime): replace unpaired surrogates with U+FFFD in UTF-16 TextDecoder The UTF-16 TextDecoder (both LE and BE) was passing raw code units directly to JsString, preserving unpaired surrogates instead of replacing them with U+FFFD as required by the WHATWG Encoding Standard. Route both decoders through a shared `decode_utf16_units` helper that uses `std::char::decode_utf16` with replacement, and simplify the UTF-16 BE decoder to borrow instead of taking ownership. Closes #4612 --- core/runtime/src/text/encodings.rs | 81 ++++++++++++++++-------------- core/runtime/src/text/mod.rs | 5 +- core/runtime/src/text/tests.rs | 66 ++++++++++++++++++++++++ 3 files changed, 110 insertions(+), 42 deletions(-) diff --git a/core/runtime/src/text/encodings.rs b/core/runtime/src/text/encodings.rs index cbe961fdc8e..f36ecb8b263 100644 --- a/core/runtime/src/text/encodings.rs +++ b/core/runtime/src/text/encodings.rs @@ -21,61 +21,66 @@ pub(crate) mod utf8 { } } +/// Decodes an iterator of UTF-16 code units into a well-formed `JsString`, +/// replacing any unpaired surrogates with U+FFFD. +/// +/// If `dangling_byte` is true and the last decoded code unit is not a high +/// surrogate (which would already have been replaced), an additional U+FFFD +/// is appended for the truncated trailing byte. +fn decode_utf16_units( + code_units: impl IntoIterator, + dangling_byte: bool, +) -> boa_engine::JsString { + let mut string = String::new(); + let mut last_code_unit = None; + string.extend( + std::char::decode_utf16(code_units.into_iter().inspect(|code_unit| { + last_code_unit = Some(*code_unit); + })) + .map(|result| result.unwrap_or('\u{FFFD}')), + ); + let trailing_high_surrogate = + last_code_unit.is_some_and(|code_unit| (0xD800..=0xDBFF).contains(&code_unit)); + if dangling_byte && !trailing_high_surrogate { + string.push('\u{FFFD}'); + } + boa_engine::JsString::from(string) +} + pub(crate) mod utf16le { - use boa_engine::{JsString, js_string}; + use boa_engine::JsString; pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString { if strip_bom { input = input.strip_prefix(&[0xFF, 0xFE]).unwrap_or(input); } - // After this point, input is of even length. - let dangling = if input.len().is_multiple_of(2) { - false - } else { + let dangling_byte = !input.len().is_multiple_of(2); + if dangling_byte { input = &input[0..input.len() - 1]; - true - }; - - let input: &[u16] = bytemuck::cast_slice(input); - - if dangling { - JsString::from(&[JsString::from(input), js_string!("\u{FFFD}")]) - } else { - JsString::from(input) } + + let code_units: &[u16] = bytemuck::cast_slice(input); + super::decode_utf16_units(code_units.iter().copied(), dangling_byte) } } pub(crate) mod utf16be { - use boa_engine::{JsString, js_string}; + use boa_engine::JsString; - pub(crate) fn decode(mut input: Vec, strip_bom: bool) -> JsString { - if strip_bom && input.starts_with(&[0xFE, 0xFF]) { - input.drain(..2); + pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString { + if strip_bom && let Some(rest) = input.strip_prefix(&[0xFE, 0xFF]) { + input = rest; } - let mut input = input.as_mut_slice(); - // After this point, input is of even length. - let dangling = if input.len().is_multiple_of(2) { - false - } else { - let new_len = input.len() - 1; - input = &mut input[0..new_len]; - true - }; - - let input: &mut [u16] = bytemuck::cast_slice_mut(input); - - // Swap the bytes. - for b in &mut *input { - *b = b.swap_bytes(); + let dangling_byte = !input.len().is_multiple_of(2); + if dangling_byte { + input = &input[0..input.len() - 1]; } - if dangling { - JsString::from(&[JsString::from(&*input), js_string!("\u{FFFD}")]) - } else { - JsString::from(&*input) - } + let code_units = input + .chunks_exact(2) + .map(|pair| u16::from_be_bytes([pair[0], pair[1]])); + super::decode_utf16_units(code_units, dangling_byte) } } diff --git a/core/runtime/src/text/mod.rs b/core/runtime/src/text/mod.rs index 9a6b7a6ef7a..6ef64f44bae 100644 --- a/core/runtime/src/text/mod.rs +++ b/core/runtime/src/text/mod.rs @@ -200,10 +200,7 @@ impl TextDecoder { Ok(match self.encoding { Encoding::Utf8 => encodings::utf8::decode(data, strip_bom), Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom), - Encoding::Utf16Be => { - let owned = data.to_vec(); - encodings::utf16be::decode(owned, strip_bom) - } + Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom), }) } } diff --git a/core/runtime/src/text/tests.rs b/core/runtime/src/text/tests.rs index 43511525a0a..d909717ce6f 100644 --- a/core/runtime/src/text/tests.rs +++ b/core/runtime/src/text/tests.rs @@ -1,3 +1,4 @@ +use super::encodings; use crate::test::{TestAction, run_test_actions_with}; use crate::text; use boa_engine::object::builtins::JsUint8Array; @@ -476,3 +477,68 @@ fn decoder_handle_data_view_offset_and_length() { context, ); } + +// Test cases from issue #4612: unpaired surrogates must be replaced with U+FFFD. +const INVALID_UTF16_CASES: &[(&[u16], &[u16])] = &[ + // Lone high surrogate in the middle + ( + &[0x0061, 0x0062, 0xD800, 0x0077, 0x0078], + &[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078], + ), + // Lone high surrogate only + (&[0xD800], &[0xFFFD]), + // Two consecutive high surrogates + (&[0xD800, 0xD800], &[0xFFFD, 0xFFFD]), + // Lone low surrogate in the middle + ( + &[0x0061, 0x0062, 0xDFFF, 0x0077, 0x0078], + &[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078], + ), + // Low surrogate followed by high surrogate (wrong order) + (&[0xDFFF, 0xD800], &[0xFFFD, 0xFFFD]), +]; + +#[test] +fn decoder_utf16le_replaces_unpaired_surrogates() { + for (invalid, replaced) in INVALID_UTF16_CASES { + let mut input_bytes = Vec::with_capacity(invalid.len() * 2); + for &code_unit in *invalid { + input_bytes.extend_from_slice(&code_unit.to_le_bytes()); + } + + let result = encodings::utf16le::decode(&input_bytes, false); + let expected = JsString::from(*replaced); + assert_eq!(result, expected, "utf16le failed for input {invalid:?}"); + } +} + +#[test] +fn decoder_utf16be_replaces_unpaired_surrogates() { + for (invalid, replaced) in INVALID_UTF16_CASES { + let mut input_bytes = Vec::with_capacity(invalid.len() * 2); + for &code_unit in *invalid { + input_bytes.extend_from_slice(&code_unit.to_be_bytes()); + } + + let result = encodings::utf16be::decode(&input_bytes, false); + let expected = JsString::from(*replaced); + assert_eq!(result, expected, "utf16be failed for input {invalid:?}"); + } +} + +#[test] +fn decoder_utf16le_dangling_byte_produces_replacement() { + // Odd-length input: the last byte is truncated and replaced with U+FFFD + let input: &[u8] = &[0x41, 0x00, 0x42]; // 'A' (LE) + dangling byte + let result = encodings::utf16le::decode(input, false); + let expected = JsString::from(&[0x0041u16, 0xFFFD][..]); + assert_eq!(result, expected); +} + +#[test] +fn decoder_utf16be_dangling_byte_produces_replacement() { + let input: &[u8] = &[0x00, 0x41, 0x42]; // 'A' (BE) + dangling byte + let result = encodings::utf16be::decode(input, false); + let expected = JsString::from(&[0x0041u16, 0xFFFD][..]); + assert_eq!(result, expected); +} From 9c83c6ef97dcce3b2c65d722dcccc19ec4cfe514 Mon Sep 17 00:00:00 2001 From: tkshsbcue Date: Fri, 17 Apr 2026 15:43:18 +0530 Subject: [PATCH 2/2] feat(runtime): implement TextDecoder fatal option and use WPT tests for UTF-16 surrogates Addresses review feedback on #5304: replace hand-written UTF-16 surrogate tests with the upstream WPT suite. The WPT `textdecoder-utf16-surrogates.any.js` test exercises both the replacement path (already fixed in 70c9ccba) and the fatal mode, which requires the decoder to throw a TypeError on invalid input. Wire the fatal option through TextDecoderOptions and the decode helpers, and enable the WPT file in the encoding test list. - Add `fatal: Option` to TextDecoderOptions and a `fatal` field + getter on TextDecoder - Change encoding decode functions to return Result and propagate a TypeError from TextDecoder::decode when fatal is set - Drop the custom INVALID_UTF16_CASES tests now covered by WPT --- core/runtime/src/text/encodings.rs | 52 +++++++++++++++-------- core/runtime/src/text/mod.rs | 28 ++++++++++--- core/runtime/src/text/tests.rs | 66 ------------------------------ tests/wpt/src/lib.rs | 1 + 4 files changed, 59 insertions(+), 88 deletions(-) diff --git a/core/runtime/src/text/encodings.rs b/core/runtime/src/text/encodings.rs index f36ecb8b263..8ead8a8f6dd 100644 --- a/core/runtime/src/text/encodings.rs +++ b/core/runtime/src/text/encodings.rs @@ -12,12 +12,17 @@ pub(crate) mod utf8 { .collect() } - pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString { + pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result { if strip_bom { input = input.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(input); } - let string = String::from_utf8_lossy(input); - JsString::from(string.as_ref()) + if fatal { + let s = std::str::from_utf8(input).map_err(|_| ())?; + Ok(JsString::from(s)) + } else { + let string = String::from_utf8_lossy(input); + Ok(JsString::from(string.as_ref())) + } } } @@ -27,30 +32,43 @@ pub(crate) mod utf8 { /// If `dangling_byte` is true and the last decoded code unit is not a high /// surrogate (which would already have been replaced), an additional U+FFFD /// is appended for the truncated trailing byte. +/// +/// When `fatal` is true, any decoder error (unpaired surrogate or dangling +/// byte) causes this function to return `Err(())` instead of inserting a +/// replacement character. fn decode_utf16_units( code_units: impl IntoIterator, dangling_byte: bool, -) -> boa_engine::JsString { + fatal: bool, +) -> Result { let mut string = String::new(); let mut last_code_unit = None; - string.extend( - std::char::decode_utf16(code_units.into_iter().inspect(|code_unit| { - last_code_unit = Some(*code_unit); - })) - .map(|result| result.unwrap_or('\u{FFFD}')), - ); + for result in std::char::decode_utf16(code_units.into_iter().inspect(|code_unit| { + last_code_unit = Some(*code_unit); + })) { + match result { + Ok(c) => string.push(c), + Err(_) if fatal => return Err(()), + Err(_) => string.push('\u{FFFD}'), + } + } let trailing_high_surrogate = last_code_unit.is_some_and(|code_unit| (0xD800..=0xDBFF).contains(&code_unit)); - if dangling_byte && !trailing_high_surrogate { - string.push('\u{FFFD}'); + if dangling_byte { + if fatal { + return Err(()); + } + if !trailing_high_surrogate { + string.push('\u{FFFD}'); + } } - boa_engine::JsString::from(string) + Ok(boa_engine::JsString::from(string)) } pub(crate) mod utf16le { use boa_engine::JsString; - pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString { + pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result { if strip_bom { input = input.strip_prefix(&[0xFF, 0xFE]).unwrap_or(input); } @@ -61,14 +79,14 @@ pub(crate) mod utf16le { } let code_units: &[u16] = bytemuck::cast_slice(input); - super::decode_utf16_units(code_units.iter().copied(), dangling_byte) + super::decode_utf16_units(code_units.iter().copied(), dangling_byte, fatal) } } pub(crate) mod utf16be { use boa_engine::JsString; - pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString { + pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result { if strip_bom && let Some(rest) = input.strip_prefix(&[0xFE, 0xFF]) { input = rest; } @@ -81,6 +99,6 @@ pub(crate) mod utf16be { let code_units = input .chunks_exact(2) .map(|pair| u16::from_be_bytes([pair[0], pair[1]])); - super::decode_utf16_units(code_units, dangling_byte) + super::decode_utf16_units(code_units, dangling_byte, fatal) } } diff --git a/core/runtime/src/text/mod.rs b/core/runtime/src/text/mod.rs index 6ef64f44bae..d8cb63cda3a 100644 --- a/core/runtime/src/text/mod.rs +++ b/core/runtime/src/text/mod.rs @@ -20,6 +20,7 @@ mod encodings; pub struct TextDecoderOptions { #[boa(rename = "ignoreBOM")] ignore_bom: Option, + fatal: Option, } /// The character encoding used by [`TextDecoder`]. @@ -73,6 +74,8 @@ pub struct TextDecoder { encoding: Encoding, #[unsafe_ignore_trace] ignore_bom: bool, + #[unsafe_ignore_trace] + fatal: bool, } #[boa_class] @@ -89,6 +92,7 @@ impl TextDecoder { options: Option, ) -> JsResult { let ignore_bom = options.and_then(|o| o.ignore_bom).unwrap_or(false); + let fatal = options.and_then(|o| o.fatal).unwrap_or(false); let encoding = match encoding { Some(enc) => { @@ -103,6 +107,7 @@ impl TextDecoder { Ok(Self { encoding, ignore_bom, + fatal, }) } @@ -131,6 +136,17 @@ impl TextDecoder { self.ignore_bom } + /// The [`TextDecoder.fatal`][mdn] read-only property is a `bool` indicating whether + /// the error mode of the decoder is fatal, i.e. whether invalid input throws a + /// `TypeError` instead of being replaced with U+FFFD. + /// + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/API/TextDecoder/fatal + #[boa(getter)] + #[must_use] + pub fn fatal(&self) -> bool { + self.fatal + } + /// The [`TextDecoder.decode()`][mdn] method returns a string containing text decoded from the /// buffer passed as a parameter. /// @@ -197,11 +213,13 @@ impl TextDecoder { &full_data }; - Ok(match self.encoding { - Encoding::Utf8 => encodings::utf8::decode(data, strip_bom), - Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom), - Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom), - }) + let result = match self.encoding { + Encoding::Utf8 => encodings::utf8::decode(data, strip_bom, self.fatal), + Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom, self.fatal), + Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom, self.fatal), + }; + + result.map_err(|()| js_error!(TypeError: "The encoded data was not valid.")) } } diff --git a/core/runtime/src/text/tests.rs b/core/runtime/src/text/tests.rs index d909717ce6f..43511525a0a 100644 --- a/core/runtime/src/text/tests.rs +++ b/core/runtime/src/text/tests.rs @@ -1,4 +1,3 @@ -use super::encodings; use crate::test::{TestAction, run_test_actions_with}; use crate::text; use boa_engine::object::builtins::JsUint8Array; @@ -477,68 +476,3 @@ fn decoder_handle_data_view_offset_and_length() { context, ); } - -// Test cases from issue #4612: unpaired surrogates must be replaced with U+FFFD. -const INVALID_UTF16_CASES: &[(&[u16], &[u16])] = &[ - // Lone high surrogate in the middle - ( - &[0x0061, 0x0062, 0xD800, 0x0077, 0x0078], - &[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078], - ), - // Lone high surrogate only - (&[0xD800], &[0xFFFD]), - // Two consecutive high surrogates - (&[0xD800, 0xD800], &[0xFFFD, 0xFFFD]), - // Lone low surrogate in the middle - ( - &[0x0061, 0x0062, 0xDFFF, 0x0077, 0x0078], - &[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078], - ), - // Low surrogate followed by high surrogate (wrong order) - (&[0xDFFF, 0xD800], &[0xFFFD, 0xFFFD]), -]; - -#[test] -fn decoder_utf16le_replaces_unpaired_surrogates() { - for (invalid, replaced) in INVALID_UTF16_CASES { - let mut input_bytes = Vec::with_capacity(invalid.len() * 2); - for &code_unit in *invalid { - input_bytes.extend_from_slice(&code_unit.to_le_bytes()); - } - - let result = encodings::utf16le::decode(&input_bytes, false); - let expected = JsString::from(*replaced); - assert_eq!(result, expected, "utf16le failed for input {invalid:?}"); - } -} - -#[test] -fn decoder_utf16be_replaces_unpaired_surrogates() { - for (invalid, replaced) in INVALID_UTF16_CASES { - let mut input_bytes = Vec::with_capacity(invalid.len() * 2); - for &code_unit in *invalid { - input_bytes.extend_from_slice(&code_unit.to_be_bytes()); - } - - let result = encodings::utf16be::decode(&input_bytes, false); - let expected = JsString::from(*replaced); - assert_eq!(result, expected, "utf16be failed for input {invalid:?}"); - } -} - -#[test] -fn decoder_utf16le_dangling_byte_produces_replacement() { - // Odd-length input: the last byte is truncated and replaced with U+FFFD - let input: &[u8] = &[0x41, 0x00, 0x42]; // 'A' (LE) + dangling byte - let result = encodings::utf16le::decode(input, false); - let expected = JsString::from(&[0x0041u16, 0xFFFD][..]); - assert_eq!(result, expected); -} - -#[test] -fn decoder_utf16be_dangling_byte_produces_replacement() { - let input: &[u8] = &[0x00, 0x41, 0x42]; // 'A' (BE) + dangling byte - let result = encodings::utf16be::decode(input, false); - let expected = JsString::from(&[0x0041u16, 0xFFFD][..]); - assert_eq!(result, expected); -} diff --git a/tests/wpt/src/lib.rs b/tests/wpt/src/lib.rs index 0f57721c4e5..62091781fb9 100644 --- a/tests/wpt/src/lib.rs +++ b/tests/wpt/src/lib.rs @@ -399,6 +399,7 @@ fn encoding( #[base_dir = "${WPT_ROOT}"] #[files("encoding/api-*.any.js")] #[files("encoding/textencoder-constructor-non-utf.any.js")] + #[files("encoding/textdecoder-utf16-surrogates.any.js")] // TODO: re-enable those when better encoding and options are supported. // #[files("encoding/textdecoder-*.any.js")] // #[files("encoding/textencoder-*.any.js")]