getsentry · antonis · Apr 14, 2026 · Apr 14, 2026
@@ -8,6 +8,7 @@
 
 **Bug Fixes**:
 
+- Recover log containers containing lone Unicode surrogates (`\uD800`–`\uDFFF`) instead of discarding the entire batch. ([#5833](https://github.com/getsentry/relay/pull/5833))
 - Store segment name in `sentry.transaction` in addition to `sentry.segment.name` on OTLP spans. ([#5765](https://github.com/getsentry/relay/pull/5765))
 - Explicitly handle in-flight requests during shutdown. ([#5746](https://github.com/getsentry/relay/pull/5746), [#5769](https://github.com/getsentry/relay/pull/5769))
 - Emit outcomes in both `log_byte` and `log_item` categories when logs are dropped. ([#5766](https://github.com/getsentry/relay/pull/5766))

@@ -276,6 +276,109 @@ impl<T: ContainerItem> ItemContainer<T> {
     }
 }
 
+/// Sanitizes lone Unicode surrogates in JSON-escaped form within raw bytes.
+///
+/// JSON payloads may contain escaped lone surrogates (`\uD800`–`\uDFFF`) that are not part of
+/// a valid surrogate pair. These are rejected by strict JSON parsers like `serde_json`.
+///
+/// This function scans for such sequences and replaces them with the Unicode replacement
+/// character escape (`\uFFFD`), which is the same byte length (6 bytes), allowing in-place
+/// replacement without shifting offsets.
+///
+/// Valid surrogate pairs (a high surrogate `\uD800`–`\uDBFF` immediately followed by a low
+/// surrogate `\uDC00`–`\uDFFF`) are left intact.
+///
+/// Note: this function does not track whether a `\uDxxx` sequence appears inside a JSON string
+/// value or a key — it replaces lone surrogates everywhere. This is safe because lone surrogates
+/// are equally invalid in both contexts, and no real SDK emits surrogate-containing key names.
+///
+/// This function also does not handle escaped backslashes (`\\uD800` representing the literal
+/// text `\uD800`). This is safe because such sequences are valid JSON and would not cause
+/// `serde_json` to fail — meaning this function would never be called for such payloads.
+///
+/// Returns a `Cow::Borrowed` reference to the original slice if no replacements were needed,
+/// avoiding allocation on the happy path.
+pub(crate) fn sanitize_lone_surrogates(input: &[u8]) -> std::borrow::Cow<'_, [u8]> {
+    use std::borrow::Cow;
+
+    const REPLACEMENT: &[u8] = b"\\uFFFD";
+
+    // Minimum length for a `\uXXXX` escape is 6 bytes.
+    if input.len() < 6 {
+        return Cow::Borrowed(input);
+    }
+
+    let mut result: Option<Vec<u8>> = None;
+    let mut i = 0;
+
+    while i + 5 < input.len() {
+        if let Some(surrogate) = parse_unicode_escape(input, i) {
+            if is_high_surrogate(surrogate) {
+                // Check if followed by a low surrogate (valid pair).
+                if i + 11 < input.len()
+                    && let Some(next) = parse_unicode_escape(input, i + 6)
+                    && is_low_surrogate(next)
+                {
+                    // Valid surrogate pair — keep both escapes as-is.
+                    if let Some(ref mut buf) = result {
+                        buf.extend_from_slice(&input[i..i + 12]);
+                    }
+                    i += 12;
+                    continue;
+                }
+                // Lone high surrogate — replace.
+                let buf = result.get_or_insert_with(|| input[..i].to_vec());
+                buf.extend_from_slice(REPLACEMENT);
+                i += 6;
+                continue;
+            } else if is_low_surrogate(surrogate) {
+                // Lone low surrogate (not preceded by a high surrogate we would have consumed).
+                let buf = result.get_or_insert_with(|| input[..i].to_vec());
+                buf.extend_from_slice(REPLACEMENT);
+                i += 6;
+                continue;
+            }
+        }
+
+        if let Some(ref mut buf) = result {
+            buf.push(input[i]);
+        }
+        i += 1;
+    }
+
+    // Copy remaining bytes.
+    match result {
+        Some(mut buf) => {
+            buf.extend_from_slice(&input[i..]);
+            Cow::Owned(buf)
+        }
+        None => Cow::Borrowed(input),
+    }
+}
+
+/// Attempts to parse a `\uXXXX` escape sequence starting at position `i`.
+///
+/// Returns the parsed 16-bit code point if the bytes at `input[i..i+6]` form a valid
+/// JSON unicode escape (`\u` followed by exactly 4 hex digits), or `None` otherwise.
+fn parse_unicode_escape(input: &[u8], i: usize) -> Option<u16> {
+    if i + 5 >= input.len() {
+        return None;
+    }
+    if input[i] != b'\\' || input[i + 1] != b'u' {
+        return None;
+    }
+    let hex = std::str::from_utf8(&input[i + 2..i + 6]).ok()?;
+    u16::from_str_radix(hex, 16).ok()
+}
+
+fn is_high_surrogate(code: u16) -> bool {
+    (0xD800..=0xDBFF).contains(&code)
+}
+
+fn is_low_surrogate(code: u16) -> bool {
+    (0xDC00..=0xDFFF).contains(&code)
+}
+
 impl<T: ContainerItem> From<ContainerItems<T>> for ItemContainer<T> {
     fn from(items: ContainerItems<T>) -> Self {
         Self { items }
@@ -607,4 +710,79 @@ mod tests {
         // e.g. correct order of fields.
         assert_eq!(new_item.payload(), item.payload());
     }
+
+    #[test]
+    fn test_sanitize_no_surrogates() {
+        let input = br#"{"items":[{"level":"info","message":"hello world"}]}"#;
+        let result = sanitize_lone_surrogates(input);
+        assert!(matches!(result, std::borrow::Cow::Borrowed(_)));
+        assert_eq!(result.as_ref(), input.as_slice());
+    }
+
+    #[test]
+    fn test_sanitize_lone_high_surrogate() {
+        let input = br#"{"items":[{"level":"info","message":"bad \uD800 char"}]}"#;
+        let expected = br#"{"items":[{"level":"info","message":"bad \uFFFD char"}]}"#;
+        let result = sanitize_lone_surrogates(input);
+        assert!(matches!(result, std::borrow::Cow::Owned(_)));
+        assert_eq!(result.as_ref(), expected.as_slice());
+    }
+
+    #[test]
+    fn test_sanitize_lone_low_surrogate() {
+        let input = br#"{"message":"\uDC00"}"#;
+        let expected = br#"{"message":"\uFFFD"}"#;
+        let result = sanitize_lone_surrogates(input);
+        assert_eq!(result.as_ref(), expected.as_slice());
+    }
+
+    #[test]
+    fn test_sanitize_preserves_valid_surrogate_pair() {
+        // \uD83D\uDE00 is the surrogate pair for 😀
+        let input = br#"{"message":"\uD83D\uDE00"}"#;
+        let result = sanitize_lone_surrogates(input);
+        assert!(matches!(result, std::borrow::Cow::Borrowed(_)));
+        assert_eq!(result.as_ref(), input.as_slice());
+    }
+
+    #[test]
+    fn test_sanitize_high_surrogate_followed_by_non_surrogate_escape() {
+        // High surrogate followed by a non-surrogate \u escape — both should be handled.
+        let input = br#"{"message":"\uD800\u0041"}"#;
+        let expected = br#"{"message":"\uFFFD\u0041"}"#;
+        let result = sanitize_lone_surrogates(input);
+        assert_eq!(result.as_ref(), expected.as_slice());
+    }
+
+    #[test]
+    fn test_sanitize_multiple_lone_surrogates() {
+        let input = br#"{"a":"\uD800","b":"\uDBFF","c":"\uDC00"}"#;
+        let expected = br#"{"a":"\uFFFD","b":"\uFFFD","c":"\uFFFD"}"#;
+        let result = sanitize_lone_surrogates(input);
+        assert_eq!(result.as_ref(), expected.as_slice());
+    }
+
+    #[test]
+    fn test_sanitize_surrogate_at_end_of_input() {
+        let input = br#"{"m":"\uD800"}"#;
+        let expected = br#"{"m":"\uFFFD"}"#;
+        let result = sanitize_lone_surrogates(input);
+        assert_eq!(result.as_ref(), expected.as_slice());
+    }
+
+    #[test]
+    fn test_sanitize_mixed_lone_and_valid_pair() {
+        // Lone surrogate followed later by a valid pair — lone gets replaced, pair preserved.
+        let input = br#"{"a":"\uD800","b":"\uD83D\uDE00"}"#;
+        let expected = br#"{"a":"\uFFFD","b":"\uD83D\uDE00"}"#;
+        let result = sanitize_lone_surrogates(input);
+        assert_eq!(result.as_ref(), expected.as_slice());
+    }
+
+    #[test]
+    fn test_sanitize_empty_and_short_inputs() {
+        assert_eq!(sanitize_lone_surrogates(b"").as_ref(), b"");
+        assert_eq!(sanitize_lone_surrogates(b"{}").as_ref(), b"{}");
+        assert_eq!(sanitize_lone_surrogates(b"hello").as_ref(), b"hello");
+    }
 }
@@ -57,6 +57,7 @@ mod item;
 mod meta;
 
 pub use self::attachment::*;
+pub(crate) use self::container::sanitize_lone_surrogates;
 pub use self::container::*;
 pub use self::content_type::*;
 pub use self::item::*;

@@ -4,11 +4,14 @@ use relay_event_schema::protocol::{OurLog, OurLogHeader};
 use relay_protocol::Annotated;
 use relay_quotas::DataCategory;
 
-use crate::envelope::{ContainerItems, EnvelopeHeaders, Item, ItemContainer};
+use crate::envelope::{
+    ContainerItems, ContentType, EnvelopeHeaders, Item, ItemContainer, sanitize_lone_surrogates,
+};
 use crate::extractors::RequestTrust;
 use crate::processing::logs::{self, Error, ExpandedLogs, Result, SerializedLogs};
 use crate::processing::{Context, Managed, utils};
 use crate::services::outcome::DiscardReason;
+use crate::statsd::RelayCounters;
 
 /// Parses all serialized logs into their [`ExpandedLogs`] representation.
 ///
@@ -63,6 +66,7 @@ pub fn scrub(logs: &mut Managed<ExpandedLogs>, ctx: Context<'_>) {
 
 fn expand_log_container(item: &Item, trust: RequestTrust) -> Result<ContainerItems<OurLog>> {
     let mut logs = ItemContainer::parse(item)
+        .or_else(|err| try_sanitize_and_reparse(item, err))
         .map_err(|err| {
             relay_log::debug!("failed to parse logs container: {err}");
             Error::Invalid(DiscardReason::InvalidJson)
@@ -89,6 +93,44 @@ fn expand_log_container(item: &Item, trust: RequestTrust) -> Result<ContainerIte
     Ok(logs)
 }
 
+/// Attempts to recover from a log container parse failure caused by lone Unicode surrogates.
+///
+/// When `ItemContainer::parse` fails, this function sanitizes the raw payload by replacing
+/// JSON-escaped lone surrogates (`\uD800`–`\uDFFF`) with the replacement character (`\uFFFD`)
+/// and retries parsing. If the payload had no surrogates to sanitize, the original error is
+/// returned unchanged.
+fn try_sanitize_and_reparse(
+    item: &Item,
+    original_err: crate::envelope::ContainerParseError,
+) -> std::result::Result<ItemContainer<OurLog>, crate::envelope::ContainerParseError> {
+    use crate::envelope::ContainerParseError;
+
+    // Only attempt sanitization for deserialization errors.
+    if !matches!(original_err, ContainerParseError::Deserialize(_)) {
+        return Err(original_err);
+    }
+
+    let payload = item.payload();
+    let sanitized = sanitize_lone_surrogates(&payload);
+
+    if sanitized.as_ref() == payload.as_ref() {
+        // Payload unchanged — the error is not caused by lone surrogates.
+        return Err(original_err);
+    }
+
+    relay_log::debug!("sanitized lone surrogates in log container payload");
+    relay_statsd::metric!(counter(RelayCounters::LogContainerSurrogateSanitized) += 1);
+
+    // Re-parse with a sanitized payload. Content type and item type were already validated
+    // by the original parse attempt, so we re-use the same item with the sanitized payload.
+    let mut sanitized_item = item.clone();
+    sanitized_item.set_payload(ContentType::LogContainer, sanitized.into_owned());
+    ItemContainer::parse(&sanitized_item).map_err(|err| {
+        relay_log::debug!("failed to parse log container after surrogate sanitization: {err}");
+        err
+    })
+}
+
 fn scrub_log(log: &mut Annotated<OurLog>, ctx: Context<'_>) -> Result<()> {
     let pii_config_from_scrubbing = ctx.project_info.config.datascrubbing_settings.pii_config();
 
@@ -140,6 +182,7 @@ fn normalize_log(
 
 #[cfg(test)]
 mod tests {
+    use bytes::Bytes;
     use relay_pii::PiiConfig;
     use relay_protocol::assert_annotated_snapshot;
 
@@ -424,4 +467,49 @@ mod tests {
         }
         "#);
     }
+
+    /// Helper to construct a log container [`Item`] from raw JSON item bodies.
+    fn log_container_item(items_json: &str, item_count: u32) -> Item {
+        let header = format!(
+            r#"{{"type":"log","content_type":"application/vnd.sentry.items.log+json","item_count":{item_count}}}"#
+        );
+        let raw = format!("{header}\n{{\"items\":[{items_json}]}}");
+        let (item, _) = Item::parse(Bytes::from(raw)).unwrap();
+        item
+    }
+
+    #[test]
+    fn test_expand_log_container_with_lone_surrogate() {
+        let item = log_container_item(
+            &[
+                r#"{"timestamp":1544719860.0,"trace_id":"5b8efff798038103d269b633813fc60c","level":"info","body":"good log","attributes":{}}"#,
+                r#"{"timestamp":1544719860.0,"trace_id":"5b8efff798038103d269b633813fc60c","level":"error","body":"bad \uD800 char","attributes":{}}"#,
+            ].join(","),
+            2,
+        );
+
+        let logs = expand_log_container(&item, RequestTrust::Untrusted).unwrap();
+        assert_eq!(logs.len(), 2);
+
+        let first = logs[0].value.value().unwrap();
+        assert_eq!(first.body.as_str(), Some("good log"));
+
+        let second = logs[1].value.value().unwrap();
+        assert_eq!(second.body.as_str(), Some("bad \u{FFFD} char"));
+    }
+
+    #[test]
+    fn test_expand_log_container_without_surrogates_unchanged() {
+        let item = log_container_item(
+            r#"{"timestamp":1544719860.0,"trace_id":"5b8efff798038103d269b633813fc60c","level":"info","body":"clean log","attributes":{}}"#,
+            1,
+        );
+
+        let logs = expand_log_container(&item, RequestTrust::Untrusted).unwrap();
+        assert_eq!(logs.len(), 1);
+        assert_eq!(
+            logs[0].value.value().unwrap().body.as_str(),
+            Some("clean log")
+        );
+    }
 }
@@ -993,6 +993,12 @@ pub enum RelayCounters {
     /// This metric is tagged with:
     /// - `expansion`: What expansion was used to expand the error (e.g. unreal).
     ErrorProcessed,
+    /// Number of log container payloads that required lone surrogate sanitization.
+    ///
+    /// Emitted when a log container JSON payload contains lone Unicode surrogates
+    /// (`\uD800`–`\uDFFF`) that would otherwise cause deserialization to fail and discard
+    /// the entire batch.
+    LogContainerSurrogateSanitized,
 }
 
 impl CounterMetric for RelayCounters {
@@ -1052,6 +1058,7 @@ impl CounterMetric for RelayCounters {
             RelayCounters::EnvelopeWithLogs => "logs.envelope",
             RelayCounters::ProfileChunksWithoutPlatform => "profile_chunk.no_platform",
             RelayCounters::ErrorProcessed => "event.error.processed",
+            RelayCounters::LogContainerSurrogateSanitized => "logs.container.surrogate_sanitized",
         }
     }
 }