From 70c9ccba671f179e5a97b9be6ff53ef2f772502b Mon Sep 17 00:00:00 2001
From: tkshsbcue <kumar.tanay@somaiya.edu>
Date: Tue, 7 Apr 2026 20:01:36 +0530
Subject: [PATCH 1/2] fix(runtime): replace unpaired surrogates with U+FFFD in
 UTF-16 TextDecoder

The UTF-16 TextDecoder (both LE and BE) was passing raw code units
directly to JsString, preserving unpaired surrogates instead of
replacing them with U+FFFD as required by the WHATWG Encoding Standard.

Route both decoders through a shared `decode_utf16_units` helper that
uses `std::char::decode_utf16` with replacement, and simplify the
UTF-16 BE decoder to borrow instead of taking ownership.

Closes #4612
---
 core/runtime/src/text/encodings.rs | 81 ++++++++++++++++--------------
 core/runtime/src/text/mod.rs       |  5 +-
 core/runtime/src/text/tests.rs     | 66 ++++++++++++++++++++++++
 3 files changed, 110 insertions(+), 42 deletions(-)
diff --git a/core/runtime/src/text/encodings.rs b/core/runtime/src/text/encodings.rs
index cbe961fdc8e..f36ecb8b263 100644
--- a/core/runtime/src/text/encodings.rs
+++ b/core/runtime/src/text/encodings.rs
@@ -21,61 +21,66 @@ pub(crate) mod utf8 {
     }
 }
 
+/// Decodes an iterator of UTF-16 code units into a well-formed `JsString`,
+/// replacing any unpaired surrogates with U+FFFD.
+///
+/// If `dangling_byte` is true and the last decoded code unit is not a high
+/// surrogate (which would already have been replaced), an additional U+FFFD
+/// is appended for the truncated trailing byte.
+fn decode_utf16_units(
+    code_units: impl IntoIterator<Item = u16>,
+    dangling_byte: bool,
+) -> boa_engine::JsString {
+    let mut string = String::new();
+    let mut last_code_unit = None;
+    string.extend(
+        std::char::decode_utf16(code_units.into_iter().inspect(|code_unit| {
+            last_code_unit = Some(*code_unit);
+        }))
+        .map(|result| result.unwrap_or('\u{FFFD}')),
+    );
+    let trailing_high_surrogate =
+        last_code_unit.is_some_and(|code_unit| (0xD800..=0xDBFF).contains(&code_unit));
+    if dangling_byte && !trailing_high_surrogate {
+        string.push('\u{FFFD}');
+    }
+    boa_engine::JsString::from(string)
+}
+
 pub(crate) mod utf16le {
-    use boa_engine::{JsString, js_string};
+    use boa_engine::JsString;
 
     pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
         if strip_bom {
             input = input.strip_prefix(&[0xFF, 0xFE]).unwrap_or(input);
         }
 
-        // After this point, input is of even length.
-        let dangling = if input.len().is_multiple_of(2) {
-            false
-        } else {
+        let dangling_byte = !input.len().is_multiple_of(2);
+        if dangling_byte {
             input = &input[0..input.len() - 1];
-            true
-        };
-
-        let input: &[u16] = bytemuck::cast_slice(input);
-
-        if dangling {
-            JsString::from(&[JsString::from(input), js_string!("\u{FFFD}")])
-        } else {
-            JsString::from(input)
         }
+
+        let code_units: &[u16] = bytemuck::cast_slice(input);
+        super::decode_utf16_units(code_units.iter().copied(), dangling_byte)
     }
 }
 
 pub(crate) mod utf16be {
-    use boa_engine::{JsString, js_string};
+    use boa_engine::JsString;
 
-    pub(crate) fn decode(mut input: Vec<u8>, strip_bom: bool) -> JsString {
-        if strip_bom && input.starts_with(&[0xFE, 0xFF]) {
-            input.drain(..2);
+    pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
+        if strip_bom && let Some(rest) = input.strip_prefix(&[0xFE, 0xFF]) {
+            input = rest;
         }
 
-        let mut input = input.as_mut_slice();
-        // After this point, input is of even length.
-        let dangling = if input.len().is_multiple_of(2) {
-            false
-        } else {
-            let new_len = input.len() - 1;
-            input = &mut input[0..new_len];
-            true
-        };
-
-        let input: &mut [u16] = bytemuck::cast_slice_mut(input);
-
-        // Swap the bytes.
-        for b in &mut *input {
-            *b = b.swap_bytes();
+        let dangling_byte = !input.len().is_multiple_of(2);
+        if dangling_byte {
+            input = &input[0..input.len() - 1];
         }
 
-        if dangling {
-            JsString::from(&[JsString::from(&*input), js_string!("\u{FFFD}")])
-        } else {
-            JsString::from(&*input)
-        }
+        let code_units = input
+            .chunks_exact(2)
+            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
+        super::decode_utf16_units(code_units, dangling_byte)
     }
 }
diff --git a/core/runtime/src/text/mod.rs b/core/runtime/src/text/mod.rs
index 9a6b7a6ef7a..6ef64f44bae 100644
--- a/core/runtime/src/text/mod.rs
+++ b/core/runtime/src/text/mod.rs
@@ -200,10 +200,7 @@ impl TextDecoder {
         Ok(match self.encoding {
             Encoding::Utf8 => encodings::utf8::decode(data, strip_bom),
             Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom),
-            Encoding::Utf16Be => {
-                let owned = data.to_vec();
-                encodings::utf16be::decode(owned, strip_bom)
-            }
+            Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom),
         })
     }
 }
diff --git a/core/runtime/src/text/tests.rs b/core/runtime/src/text/tests.rs
index 43511525a0a..d909717ce6f 100644
--- a/core/runtime/src/text/tests.rs
+++ b/core/runtime/src/text/tests.rs
@@ -1,3 +1,4 @@
+use super::encodings;
 use crate::test::{TestAction, run_test_actions_with};
 use crate::text;
 use boa_engine::object::builtins::JsUint8Array;
@@ -476,3 +477,68 @@ fn decoder_handle_data_view_offset_and_length() {
         context,
     );
 }
+
+// Test cases from issue #4612: unpaired surrogates must be replaced with U+FFFD.
+const INVALID_UTF16_CASES: &[(&[u16], &[u16])] = &[
+    // Lone high surrogate in the middle
+    (
+        &[0x0061, 0x0062, 0xD800, 0x0077, 0x0078],
+        &[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078],
+    ),
+    // Lone high surrogate only
+    (&[0xD800], &[0xFFFD]),
+    // Two consecutive high surrogates
+    (&[0xD800, 0xD800], &[0xFFFD, 0xFFFD]),
+    // Lone low surrogate in the middle
+    (
+        &[0x0061, 0x0062, 0xDFFF, 0x0077, 0x0078],
+        &[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078],
+    ),
+    // Low surrogate followed by high surrogate (wrong order)
+    (&[0xDFFF, 0xD800], &[0xFFFD, 0xFFFD]),
+];
+
+#[test]
+fn decoder_utf16le_replaces_unpaired_surrogates() {
+    for (invalid, replaced) in INVALID_UTF16_CASES {
+        let mut input_bytes = Vec::with_capacity(invalid.len() * 2);
+        for &code_unit in *invalid {
+            input_bytes.extend_from_slice(&code_unit.to_le_bytes());
+        }
+
+        let result = encodings::utf16le::decode(&input_bytes, false);
+        let expected = JsString::from(*replaced);
+        assert_eq!(result, expected, "utf16le failed for input {invalid:?}");
+    }
+}
+
+#[test]
+fn decoder_utf16be_replaces_unpaired_surrogates() {
+    for (invalid, replaced) in INVALID_UTF16_CASES {
+        let mut input_bytes = Vec::with_capacity(invalid.len() * 2);
+        for &code_unit in *invalid {
+            input_bytes.extend_from_slice(&code_unit.to_be_bytes());
+        }
+
+        let result = encodings::utf16be::decode(&input_bytes, false);
+        let expected = JsString::from(*replaced);
+        assert_eq!(result, expected, "utf16be failed for input {invalid:?}");
+    }
+}
+
+#[test]
+fn decoder_utf16le_dangling_byte_produces_replacement() {
+    // Odd-length input: the last byte is truncated and replaced with U+FFFD
+    let input: &[u8] = &[0x41, 0x00, 0x42]; // 'A' (LE) + dangling byte
+    let result = encodings::utf16le::decode(input, false);
+    let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
+    assert_eq!(result, expected);
+}
+
+#[test]
+fn decoder_utf16be_dangling_byte_produces_replacement() {
+    let input: &[u8] = &[0x00, 0x41, 0x42]; // 'A' (BE) + dangling byte
+    let result = encodings::utf16be::decode(input, false);
+    let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
+    assert_eq!(result, expected);
+}

From 9c83c6ef97dcce3b2c65d722dcccc19ec4cfe514 Mon Sep 17 00:00:00 2001
From: tkshsbcue <kumar.tanay@somaiya.edu>
Date: Fri, 17 Apr 2026 15:43:18 +0530
Subject: [PATCH 2/2] feat(runtime): implement TextDecoder fatal option and use
 WPT tests for UTF-16 surrogates

Addresses review feedback on #5304: replace hand-written UTF-16 surrogate
tests with the upstream WPT suite.

The WPT `textdecoder-utf16-surrogates.any.js` test exercises both the
replacement path (already fixed in 70c9ccba) and the fatal mode, which
requires the decoder to throw a TypeError on invalid input. Wire the
fatal option through TextDecoderOptions and the decode helpers, and
enable the WPT file in the encoding test list.

- Add `fatal: Option<bool>` to TextDecoderOptions and a `fatal` field +
  getter on TextDecoder
- Change encoding decode functions to return Result<JsString, ()> and
  propagate a TypeError from TextDecoder::decode when fatal is set
- Drop the custom INVALID_UTF16_CASES tests now covered by WPT
---
 core/runtime/src/text/encodings.rs | 52 +++++++++++++++--------
 core/runtime/src/text/mod.rs       | 28 ++++++++++---
 core/runtime/src/text/tests.rs     | 66 ------------------------------
 tests/wpt/src/lib.rs               |  1 +
 4 files changed, 59 insertions(+), 88 deletions(-)

diff --git a/core/runtime/src/text/encodings.rs b/core/runtime/src/text/encodings.rs
index f36ecb8b263..8ead8a8f6dd 100644
--- a/core/runtime/src/text/encodings.rs
+++ b/core/runtime/src/text/encodings.rs
@@ -12,12 +12,17 @@ pub(crate) mod utf8 {
             .collect()
     }
 
-    pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
+    pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result<JsString, ()> {
         if strip_bom {
             input = input.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(input);
         }
-        let string = String::from_utf8_lossy(input);
-        JsString::from(string.as_ref())
+        if fatal {
+            let s = std::str::from_utf8(input).map_err(|_| ())?;
+            Ok(JsString::from(s))
+        } else {
+            let string = String::from_utf8_lossy(input);
+            Ok(JsString::from(string.as_ref()))
+        }
     }
 }
 
@@ -27,30 +32,43 @@ pub(crate) mod utf8 {
 /// If `dangling_byte` is true and the last decoded code unit is not a high
 /// surrogate (which would already have been replaced), an additional U+FFFD
 /// is appended for the truncated trailing byte.
+///
+/// When `fatal` is true, any decoder error (unpaired surrogate or dangling
+/// byte) causes this function to return `Err(())` instead of inserting a
+/// replacement character.
 fn decode_utf16_units(
     code_units: impl IntoIterator<Item = u16>,
     dangling_byte: bool,
-) -> boa_engine::JsString {
+    fatal: bool,
+) -> Result<boa_engine::JsString, ()> {
     let mut string = String::new();
     let mut last_code_unit = None;
-    string.extend(
-        std::char::decode_utf16(code_units.into_iter().inspect(|code_unit| {
-            last_code_unit = Some(*code_unit);
-        }))
-        .map(|result| result.unwrap_or('\u{FFFD}')),
-    );
+    for result in std::char::decode_utf16(code_units.into_iter().inspect(|code_unit| {
+        last_code_unit = Some(*code_unit);
+    })) {
+        match result {
+            Ok(c) => string.push(c),
+            Err(_) if fatal => return Err(()),
+            Err(_) => string.push('\u{FFFD}'),
+        }
+    }
     let trailing_high_surrogate =
         last_code_unit.is_some_and(|code_unit| (0xD800..=0xDBFF).contains(&code_unit));
-    if dangling_byte && !trailing_high_surrogate {
-        string.push('\u{FFFD}');
+    if dangling_byte {
+        if fatal {
+            return Err(());
+        }
+        if !trailing_high_surrogate {
+            string.push('\u{FFFD}');
+        }
     }
-    boa_engine::JsString::from(string)
+    Ok(boa_engine::JsString::from(string))
 }
 
 pub(crate) mod utf16le {
     use boa_engine::JsString;
 
-    pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
+    pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result<JsString, ()> {
         if strip_bom {
             input = input.strip_prefix(&[0xFF, 0xFE]).unwrap_or(input);
         }
@@ -61,14 +79,14 @@ pub(crate) mod utf16le {
         }
 
         let code_units: &[u16] = bytemuck::cast_slice(input);
-        super::decode_utf16_units(code_units.iter().copied(), dangling_byte)
+        super::decode_utf16_units(code_units.iter().copied(), dangling_byte, fatal)
     }
 }
 
 pub(crate) mod utf16be {
     use boa_engine::JsString;
 
-    pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
+    pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result<JsString, ()> {
         if strip_bom && let Some(rest) = input.strip_prefix(&[0xFE, 0xFF]) {
             input = rest;
         }
@@ -81,6 +99,6 @@ pub(crate) mod utf16be {
         let code_units = input
             .chunks_exact(2)
             .map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
-        super::decode_utf16_units(code_units, dangling_byte)
+        super::decode_utf16_units(code_units, dangling_byte, fatal)
     }
 }
diff --git a/core/runtime/src/text/mod.rs b/core/runtime/src/text/mod.rs
index 6ef64f44bae..d8cb63cda3a 100644
--- a/core/runtime/src/text/mod.rs
+++ b/core/runtime/src/text/mod.rs
@@ -20,6 +20,7 @@ mod encodings;
 pub struct TextDecoderOptions {
     #[boa(rename = "ignoreBOM")]
     ignore_bom: Option<bool>,
+    fatal: Option<bool>,
 }
 
 /// The character encoding used by [`TextDecoder`].
@@ -73,6 +74,8 @@ pub struct TextDecoder {
     encoding: Encoding,
     #[unsafe_ignore_trace]
     ignore_bom: bool,
+    #[unsafe_ignore_trace]
+    fatal: bool,
 }
 
 #[boa_class]
@@ -89,6 +92,7 @@ impl TextDecoder {
         options: Option<TextDecoderOptions>,
     ) -> JsResult<Self> {
         let ignore_bom = options.and_then(|o| o.ignore_bom).unwrap_or(false);
+        let fatal = options.and_then(|o| o.fatal).unwrap_or(false);
 
         let encoding = match encoding {
             Some(enc) => {
@@ -103,6 +107,7 @@ impl TextDecoder {
         Ok(Self {
             encoding,
             ignore_bom,
+            fatal,
         })
     }
 
@@ -131,6 +136,17 @@ impl TextDecoder {
         self.ignore_bom
     }
 
+    /// The [`TextDecoder.fatal`][mdn] read-only property is a `bool` indicating whether
+    /// the error mode of the decoder is fatal, i.e. whether invalid input throws a
+    /// `TypeError` instead of being replaced with U+FFFD.
+    ///
+    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/API/TextDecoder/fatal
+    #[boa(getter)]
+    #[must_use]
+    pub fn fatal(&self) -> bool {
+        self.fatal
+    }
+
     /// The [`TextDecoder.decode()`][mdn] method returns a string containing text decoded from the
     /// buffer passed as a parameter.
     ///
@@ -197,11 +213,13 @@ impl TextDecoder {
             &full_data
         };
 
-        Ok(match self.encoding {
-            Encoding::Utf8 => encodings::utf8::decode(data, strip_bom),
-            Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom),
-            Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom),
-        })
+        let result = match self.encoding {
+            Encoding::Utf8 => encodings::utf8::decode(data, strip_bom, self.fatal),
+            Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom, self.fatal),
+            Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom, self.fatal),
+        };
+
+        result.map_err(|()| js_error!(TypeError: "The encoded data was not valid."))
     }
 }
 
diff --git a/core/runtime/src/text/tests.rs b/core/runtime/src/text/tests.rs
index d909717ce6f..43511525a0a 100644
--- a/core/runtime/src/text/tests.rs
+++ b/core/runtime/src/text/tests.rs
@@ -1,4 +1,3 @@
-use super::encodings;
 use crate::test::{TestAction, run_test_actions_with};
 use crate::text;
 use boa_engine::object::builtins::JsUint8Array;
@@ -477,68 +476,3 @@ fn decoder_handle_data_view_offset_and_length() {
         context,
     );
 }
-
-// Test cases from issue #4612: unpaired surrogates must be replaced with U+FFFD.
-const INVALID_UTF16_CASES: &[(&[u16], &[u16])] = &[
-    // Lone high surrogate in the middle
-    (
-        &[0x0061, 0x0062, 0xD800, 0x0077, 0x0078],
-        &[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078],
-    ),
-    // Lone high surrogate only
-    (&[0xD800], &[0xFFFD]),
-    // Two consecutive high surrogates
-    (&[0xD800, 0xD800], &[0xFFFD, 0xFFFD]),
-    // Lone low surrogate in the middle
-    (
-        &[0x0061, 0x0062, 0xDFFF, 0x0077, 0x0078],
-        &[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078],
-    ),
-    // Low surrogate followed by high surrogate (wrong order)
-    (&[0xDFFF, 0xD800], &[0xFFFD, 0xFFFD]),
-];
-
-#[test]
-fn decoder_utf16le_replaces_unpaired_surrogates() {
-    for (invalid, replaced) in INVALID_UTF16_CASES {
-        let mut input_bytes = Vec::with_capacity(invalid.len() * 2);
-        for &code_unit in *invalid {
-            input_bytes.extend_from_slice(&code_unit.to_le_bytes());
-        }
-
-        let result = encodings::utf16le::decode(&input_bytes, false);
-        let expected = JsString::from(*replaced);
-        assert_eq!(result, expected, "utf16le failed for input {invalid:?}");
-    }
-}
-
-#[test]
-fn decoder_utf16be_replaces_unpaired_surrogates() {
-    for (invalid, replaced) in INVALID_UTF16_CASES {
-        let mut input_bytes = Vec::with_capacity(invalid.len() * 2);
-        for &code_unit in *invalid {
-            input_bytes.extend_from_slice(&code_unit.to_be_bytes());
-        }
-
-        let result = encodings::utf16be::decode(&input_bytes, false);
-        let expected = JsString::from(*replaced);
-        assert_eq!(result, expected, "utf16be failed for input {invalid:?}");
-    }
-}
-
-#[test]
-fn decoder_utf16le_dangling_byte_produces_replacement() {
-    // Odd-length input: the last byte is truncated and replaced with U+FFFD
-    let input: &[u8] = &[0x41, 0x00, 0x42]; // 'A' (LE) + dangling byte
-    let result = encodings::utf16le::decode(input, false);
-    let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
-    assert_eq!(result, expected);
-}
-
-#[test]
-fn decoder_utf16be_dangling_byte_produces_replacement() {
-    let input: &[u8] = &[0x00, 0x41, 0x42]; // 'A' (BE) + dangling byte
-    let result = encodings::utf16be::decode(input, false);
-    let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
-    assert_eq!(result, expected);
-}
diff --git a/tests/wpt/src/lib.rs b/tests/wpt/src/lib.rs
index 0f57721c4e5..62091781fb9 100644
--- a/tests/wpt/src/lib.rs
+++ b/tests/wpt/src/lib.rs
@@ -399,6 +399,7 @@ fn encoding(
     #[base_dir = "${WPT_ROOT}"]
     #[files("encoding/api-*.any.js")]
     #[files("encoding/textencoder-constructor-non-utf.any.js")]
+    #[files("encoding/textdecoder-utf16-surrogates.any.js")]
     // TODO: re-enable those when better encoding and options are supported.
     // #[files("encoding/textdecoder-*.any.js")]
     // #[files("encoding/textencoder-*.any.js")]