diff --git a/core/runtime/src/text/encodings.rs b/core/runtime/src/text/encodings.rs
index cbe961fdc8e..8ead8a8f6dd 100644
--- a/core/runtime/src/text/encodings.rs
+++ b/core/runtime/src/text/encodings.rs
@@ -12,70 +12,98 @@ pub(crate) mod utf8 {
             .collect()
     }
 
-    pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
+    pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result<JsString, ()> {
         if strip_bom {
             input = input.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(input);
         }
-        let string = String::from_utf8_lossy(input);
-        JsString::from(string.as_ref())
+        if fatal {
+            let s = std::str::from_utf8(input).map_err(|_| ())?;
+            Ok(JsString::from(s))
+        } else {
+            let string = String::from_utf8_lossy(input);
+            Ok(JsString::from(string.as_ref()))
+        }
     }
 }
 
+/// Decodes an iterator of UTF-16 code units into a well-formed `JsString`,
+/// replacing any unpaired surrogates with U+FFFD.
+///
+/// If `dangling_byte` is true and the last decoded code unit is not a high
+/// surrogate (which would already have been replaced), an additional U+FFFD
+/// is appended for the truncated trailing byte.
+///
+/// When `fatal` is true, any decoder error (unpaired surrogate or dangling
+/// byte) causes this function to return `Err(())` instead of inserting a
+/// replacement character.
+fn decode_utf16_units(
+    code_units: impl IntoIterator<Item = u16>,
+    dangling_byte: bool,
+    fatal: bool,
+) -> Result<boa_engine::JsString, ()> {
+    let mut string = String::new();
+    let mut last_code_unit = None;
+    for result in std::char::decode_utf16(code_units.into_iter().inspect(|code_unit| {
+        last_code_unit = Some(*code_unit);
+    })) {
+        match result {
+            Ok(c) => string.push(c),
+            Err(_) if fatal => return Err(()),
+            Err(_) => string.push('\u{FFFD}'),
+        }
+    }
+    let trailing_high_surrogate =
+        last_code_unit.is_some_and(|code_unit| (0xD800..=0xDBFF).contains(&code_unit));
+    if dangling_byte {
+        if fatal {
+            return Err(());
+        }
+        if !trailing_high_surrogate {
+            string.push('\u{FFFD}');
+        }
+    }
+    Ok(boa_engine::JsString::from(string))
+}
+
 pub(crate) mod utf16le {
-    use boa_engine::{JsString, js_string};
+    use boa_engine::JsString;
 
-    pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
+    pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result<JsString, ()> {
         if strip_bom {
             input = input.strip_prefix(&[0xFF, 0xFE]).unwrap_or(input);
         }
 
-        // After this point, input is of even length.
-        let dangling = if input.len().is_multiple_of(2) {
-            false
-        } else {
+        let dangling_byte = !input.len().is_multiple_of(2);
+        if dangling_byte {
             input = &input[0..input.len() - 1];
-            true
-        };
-
-        let input: &[u16] = bytemuck::cast_slice(input);
-
-        if dangling {
-            JsString::from(&[JsString::from(input), js_string!("\u{FFFD}")])
-        } else {
-            JsString::from(input)
         }
+
+        // Assemble each code unit from its byte pair as little-endian instead of casting
+        // the slice to `&[u16]`: that cast panics when the buffer is not 2-byte aligned
+        // and would produce native-endian (wrong) units on big-endian hosts.
+        let code_units = input
+            .chunks_exact(2)
+            .map(|pair| u16::from_le_bytes([pair[0], pair[1]]));
+        super::decode_utf16_units(code_units, dangling_byte, fatal)
     }
 }
 
 pub(crate) mod utf16be {
-    use boa_engine::{JsString, js_string};
+    use boa_engine::JsString;
 
-    pub(crate) fn decode(mut input: Vec<u8>, strip_bom: bool) -> JsString {
-        if strip_bom && input.starts_with(&[0xFE, 0xFF]) {
-            input.drain(..2);
+    pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result<JsString, ()> {
+        if strip_bom && let Some(rest) = input.strip_prefix(&[0xFE, 0xFF]) {
+            input = rest;
         }
 
-        let mut input = input.as_mut_slice();
-        // After this point, input is of even length.
-        let dangling = if input.len().is_multiple_of(2) {
-            false
-        } else {
-            let new_len = input.len() - 1;
-            input = &mut input[0..new_len];
-            true
-        };
-
-        let input: &mut [u16] = bytemuck::cast_slice_mut(input);
-
-        // Swap the bytes.
-        for b in &mut *input {
-            *b = b.swap_bytes();
+        let dangling_byte = !input.len().is_multiple_of(2);
+        if dangling_byte {
+            input = &input[0..input.len() - 1];
         }
 
-        if dangling {
-            JsString::from(&[JsString::from(&*input), js_string!("\u{FFFD}")])
-        } else {
-            JsString::from(&*input)
-        }
+        let code_units = input
+            .chunks_exact(2)
+            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
+        super::decode_utf16_units(code_units, dangling_byte, fatal)
     }
 }
diff --git a/core/runtime/src/text/mod.rs b/core/runtime/src/text/mod.rs
index 9a6b7a6ef7a..d8cb63cda3a 100644
--- a/core/runtime/src/text/mod.rs
+++ b/core/runtime/src/text/mod.rs
@@ -20,6 +20,7 @@ mod encodings;
 pub struct TextDecoderOptions {
     #[boa(rename = "ignoreBOM")]
     ignore_bom: Option<bool>,
+    fatal: Option<bool>,
 }
 
 /// The character encoding used by [`TextDecoder`].
@@ -73,6 +74,8 @@ pub struct TextDecoder {
     encoding: Encoding,
     #[unsafe_ignore_trace]
     ignore_bom: bool,
+    #[unsafe_ignore_trace]
+    fatal: bool,
 }
 
 #[boa_class]
@@ -89,6 +92,7 @@ impl TextDecoder {
         options: Option<TextDecoderOptions>,
     ) -> JsResult<Self> {
         let ignore_bom = options.and_then(|o| o.ignore_bom).unwrap_or(false);
+        let fatal = options.and_then(|o| o.fatal).unwrap_or(false);
 
         let encoding = match encoding {
             Some(enc) => {
@@ -103,6 +107,7 @@ impl TextDecoder {
         Ok(Self {
             encoding,
             ignore_bom,
+            fatal,
         })
     }
 
@@ -131,6 +136,17 @@ impl TextDecoder {
         self.ignore_bom
     }
 
+    /// The [`TextDecoder.fatal`][mdn] read-only property is a `bool` indicating whether
+    /// the error mode of the decoder is fatal, i.e. whether invalid input throws a
+    /// `TypeError` instead of being replaced with U+FFFD.
+    ///
+    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/API/TextDecoder/fatal
+    #[boa(getter)]
+    #[must_use]
+    pub fn fatal(&self) -> bool {
+        self.fatal
+    }
+
     /// The [`TextDecoder.decode()`][mdn] method returns a string containing text decoded from the
     /// buffer passed as a parameter.
     ///
@@ -197,14 +213,13 @@ impl TextDecoder {
             &full_data
         };
 
-        Ok(match self.encoding {
-            Encoding::Utf8 => encodings::utf8::decode(data, strip_bom),
-            Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom),
-            Encoding::Utf16Be => {
-                let owned = data.to_vec();
-                encodings::utf16be::decode(owned, strip_bom)
-            }
-        })
+        let result = match self.encoding {
+            Encoding::Utf8 => encodings::utf8::decode(data, strip_bom, self.fatal),
+            Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom, self.fatal),
+            Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom, self.fatal),
+        };
+
+        result.map_err(|()| js_error!(TypeError: "The encoded data was not valid."))
     }
 }
 
diff --git a/tests/wpt/src/lib.rs b/tests/wpt/src/lib.rs
index 0f57721c4e5..62091781fb9 100644
--- a/tests/wpt/src/lib.rs
+++ b/tests/wpt/src/lib.rs
@@ -399,6 +399,7 @@ fn encoding(
     #[base_dir = "${WPT_ROOT}"]
     #[files("encoding/api-*.any.js")]
     #[files("encoding/textencoder-constructor-non-utf.any.js")]
+    #[files("encoding/textdecoder-utf16-surrogates.any.js")]
     // TODO: re-enable those when better encoding and options are supported.
     // #[files("encoding/textdecoder-*.any.js")]
     // #[files("encoding/textencoder-*.any.js")]