From 568b97216a087b78a9bf023a40cb242aef7412d5 Mon Sep 17 00:00:00 2001 From: Antonis Lilis Date: Tue, 14 Apr 2026 14:28:06 +0200 Subject: [PATCH 1/2] fix(logs): Recover log containers with lone Unicode surrogates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a log container payload contains JSON-escaped lone surrogates (\uD800–\uDFFF), serde_json rejects the entire payload, discarding all logs in the batch. This is a data loss amplification issue where a single malformed log entry causes the entire container to be dropped. This adds a fallback in log container parsing: on deserialization failure, the raw payload is scanned for lone surrogates and they are replaced with the Unicode replacement character (\uFFFD). The sanitized payload is then re-parsed. This only runs on the error path, so there is zero overhead on valid payloads. Scoped to log containers only. Spans and trace metrics are not affected. Ref: getsentry/sentry-react-native#5186 Co-Authored-By: Claude Opus 4.6 (1M context) --- relay-server/src/envelope/container.rs | 178 ++++++++++++++++++++ relay-server/src/envelope/mod.rs | 1 + relay-server/src/processing/logs/process.rs | 90 +++++++++- relay-server/src/statsd.rs | 7 + 4 files changed, 275 insertions(+), 1 deletion(-) diff --git a/relay-server/src/envelope/container.rs b/relay-server/src/envelope/container.rs index ce4fff87f42..51123c3c942 100644 --- a/relay-server/src/envelope/container.rs +++ b/relay-server/src/envelope/container.rs @@ -276,6 +276,109 @@ impl ItemContainer { } } +/// Sanitizes lone Unicode surrogates in JSON-escaped form within raw bytes. +/// +/// JSON payloads may contain escaped lone surrogates (`\uD800`–`\uDFFF`) that are not part of +/// a valid surrogate pair. These are rejected by strict JSON parsers like `serde_json`. +/// +/// This function scans for such sequences and replaces them with the Unicode replacement +/// character escape (`\uFFFD`), which is the same byte length (6 bytes), allowing in-place +/// replacement without shifting offsets. +/// +/// Valid surrogate pairs (a high surrogate `\uD800`–`\uDBFF` immediately followed by a low +/// surrogate `\uDC00`–`\uDFFF`) are left intact. +/// +/// Note: this function does not track whether a `\uDxxx` sequence appears inside a JSON string +/// value or a key — it replaces lone surrogates everywhere. This is safe because lone surrogates +/// are equally invalid in both contexts, and no real SDK emits surrogate-containing key names. +/// +/// This function also does not handle escaped backslashes (`\\uD800` representing the literal +/// text `\uD800`). This is safe because such sequences are valid JSON and would not cause +/// `serde_json` to fail — meaning this function would never be called for such payloads. +/// +/// Returns a `Cow::Borrowed` reference to the original slice if no replacements were needed, +/// avoiding allocation on the happy path. +pub(crate) fn sanitize_lone_surrogates(input: &[u8]) -> std::borrow::Cow<'_, [u8]> { + use std::borrow::Cow; + + const REPLACEMENT: &[u8] = b"\\uFFFD"; + + // Minimum length for a `\uXXXX` escape is 6 bytes. + if input.len() < 6 { + return Cow::Borrowed(input); + } + + let mut result: Option> = None; + let mut i = 0; + + while i + 5 < input.len() { + if let Some(surrogate) = parse_unicode_escape(input, i) { + if is_high_surrogate(surrogate) { + // Check if followed by a low surrogate (valid pair). + if i + 11 < input.len() + && let Some(next) = parse_unicode_escape(input, i + 6) + && is_low_surrogate(next) + { + // Valid surrogate pair — keep both escapes as-is. + if let Some(ref mut buf) = result { + buf.extend_from_slice(&input[i..i + 12]); + } + i += 12; + continue; + } + // Lone high surrogate — replace. + let buf = result.get_or_insert_with(|| input[..i].to_vec()); + buf.extend_from_slice(REPLACEMENT); + i += 6; + continue; + } else if is_low_surrogate(surrogate) { + // Lone low surrogate (not preceded by a high surrogate we would have consumed). + let buf = result.get_or_insert_with(|| input[..i].to_vec()); + buf.extend_from_slice(REPLACEMENT); + i += 6; + continue; + } + } + + if let Some(ref mut buf) = result { + buf.push(input[i]); + } + i += 1; + } + + // Copy remaining bytes. + match result { + Some(mut buf) => { + buf.extend_from_slice(&input[i..]); + Cow::Owned(buf) + } + None => Cow::Borrowed(input), + } +} + +/// Attempts to parse a `\uXXXX` escape sequence starting at position `i`. +/// +/// Returns the parsed 16-bit code point if the bytes at `input[i..i+6]` form a valid +/// JSON unicode escape (`\u` followed by exactly 4 hex digits), or `None` otherwise. +fn parse_unicode_escape(input: &[u8], i: usize) -> Option { + if i + 5 >= input.len() { + return None; + } + if input[i] != b'\\' || input[i + 1] != b'u' { + return None; + } + let hex = std::str::from_utf8(&input[i + 2..i + 6]).ok()?; + u16::from_str_radix(hex, 16).ok() +} + +fn is_high_surrogate(code: u16) -> bool { + (0xD800..=0xDBFF).contains(&code) +} + +fn is_low_surrogate(code: u16) -> bool { + (0xDC00..=0xDFFF).contains(&code) +} + impl From> for ItemContainer { fn from(items: ContainerItems) -> Self { Self { items } @@ -607,4 +710,79 @@ mod tests { // e.g. correct order of fields. assert_eq!(new_item.payload(), item.payload()); } + + #[test] + fn test_sanitize_no_surrogates() { + let input = br#"{"items":[{"level":"info","message":"hello world"}]}"#; + let result = sanitize_lone_surrogates(input); + assert!(matches!(result, std::borrow::Cow::Borrowed(_))); + assert_eq!(result.as_ref(), input.as_slice()); + } + + #[test] + fn test_sanitize_lone_high_surrogate() { + let input = br#"{"items":[{"level":"info","message":"bad \uD800 char"}]}"#; + let expected = br#"{"items":[{"level":"info","message":"bad \uFFFD char"}]}"#; + let result = sanitize_lone_surrogates(input); + assert!(matches!(result, std::borrow::Cow::Owned(_))); + assert_eq!(result.as_ref(), expected.as_slice()); + } + + #[test] + fn test_sanitize_lone_low_surrogate() { + let input = br#"{"message":"\uDC00"}"#; + let expected = br#"{"message":"\uFFFD"}"#; + let result = sanitize_lone_surrogates(input); + assert_eq!(result.as_ref(), expected.as_slice()); + } + + #[test] + fn test_sanitize_preserves_valid_surrogate_pair() { + // \uD83D\uDE00 is the surrogate pair for 😀 + let input = br#"{"message":"\uD83D\uDE00"}"#; + let result = sanitize_lone_surrogates(input); + assert!(matches!(result, std::borrow::Cow::Borrowed(_))); + assert_eq!(result.as_ref(), input.as_slice()); + } + + #[test] + fn test_sanitize_high_surrogate_followed_by_non_surrogate_escape() { + // High surrogate followed by a non-surrogate \u escape — both should be handled. + let input = br#"{"message":"\uD800\u0041"}"#; + let expected = br#"{"message":"\uFFFD\u0041"}"#; + let result = sanitize_lone_surrogates(input); + assert_eq!(result.as_ref(), expected.as_slice()); + } + + #[test] + fn test_sanitize_multiple_lone_surrogates() { + let input = br#"{"a":"\uD800","b":"\uDBFF","c":"\uDC00"}"#; + let expected = br#"{"a":"\uFFFD","b":"\uFFFD","c":"\uFFFD"}"#; + let result = sanitize_lone_surrogates(input); + assert_eq!(result.as_ref(), expected.as_slice()); + } + + #[test] + fn test_sanitize_surrogate_at_end_of_input() { + let input = br#"{"m":"\uD800"}"#; + let expected = br#"{"m":"\uFFFD"}"#; + let result = sanitize_lone_surrogates(input); + assert_eq!(result.as_ref(), expected.as_slice()); + } + + #[test] + fn test_sanitize_mixed_lone_and_valid_pair() { + // Lone surrogate followed later by a valid pair — lone gets replaced, pair preserved. + let input = br#"{"a":"\uD800","b":"\uD83D\uDE00"}"#; + let expected = br#"{"a":"\uFFFD","b":"\uD83D\uDE00"}"#; + let result = sanitize_lone_surrogates(input); + assert_eq!(result.as_ref(), expected.as_slice()); + } + + #[test] + fn test_sanitize_empty_and_short_inputs() { + assert_eq!(sanitize_lone_surrogates(b"").as_ref(), b""); + assert_eq!(sanitize_lone_surrogates(b"{}").as_ref(), b"{}"); + assert_eq!(sanitize_lone_surrogates(b"hello").as_ref(), b"hello"); + } } diff --git a/relay-server/src/envelope/mod.rs b/relay-server/src/envelope/mod.rs index 5b67f481b8c..6a934e7612e 100644 --- a/relay-server/src/envelope/mod.rs +++ b/relay-server/src/envelope/mod.rs @@ -57,6 +57,7 @@ mod item; mod meta; pub use self::attachment::*; +pub(crate) use self::container::sanitize_lone_surrogates; pub use self::container::*; pub use self::content_type::*; pub use self::item::*; diff --git a/relay-server/src/processing/logs/process.rs b/relay-server/src/processing/logs/process.rs index 2ba32da87e9..11cd13aa03e 100644 --- a/relay-server/src/processing/logs/process.rs +++ b/relay-server/src/processing/logs/process.rs @@ -4,11 +4,14 @@ use relay_event_schema::protocol::{OurLog, OurLogHeader}; use relay_protocol::Annotated; use relay_quotas::DataCategory; -use crate::envelope::{ContainerItems, EnvelopeHeaders, Item, ItemContainer}; +use crate::envelope::{ + ContainerItems, ContentType, EnvelopeHeaders, Item, ItemContainer, sanitize_lone_surrogates, +}; use crate::extractors::RequestTrust; use crate::processing::logs::{self, Error, ExpandedLogs, Result, SerializedLogs}; use crate::processing::{Context, Managed, utils}; use crate::services::outcome::DiscardReason; +use crate::statsd::RelayCounters; /// Parses all serialized logs into their [`ExpandedLogs`] representation. /// @@ -63,6 +66,7 @@ pub fn scrub(logs: &mut Managed, ctx: Context<'_>) { fn expand_log_container(item: &Item, trust: RequestTrust) -> Result> { let mut logs = ItemContainer::parse(item) + .or_else(|err| try_sanitize_and_reparse(item, err)) .map_err(|err| { relay_log::debug!("failed to parse logs container: {err}"); Error::Invalid(DiscardReason::InvalidJson) @@ -89,6 +93,44 @@ fn expand_log_container(item: &Item, trust: RequestTrust) -> Result std::result::Result, crate::envelope::ContainerParseError> { + use crate::envelope::ContainerParseError; + + // Only attempt sanitization for deserialization errors. + if !matches!(original_err, ContainerParseError::Deserialize(_)) { + return Err(original_err); + } + + let payload = item.payload(); + let sanitized = sanitize_lone_surrogates(&payload); + + if sanitized.as_ref() == payload.as_ref() { + // Payload unchanged — the error is not caused by lone surrogates. + return Err(original_err); + } + + relay_log::debug!("sanitized lone surrogates in log container payload"); + relay_statsd::metric!(counter(RelayCounters::LogContainerSurrogateSanitized) += 1); + + // Re-parse with a sanitized payload. Content type and item type were already validated + // by the original parse attempt, so we re-use the same item with the sanitized payload. + let mut sanitized_item = item.clone(); + sanitized_item.set_payload(ContentType::LogContainer, sanitized.into_owned()); + ItemContainer::parse(&sanitized_item).map_err(|err| { + relay_log::debug!("failed to parse log container after surrogate sanitization: {err}"); + err + }) +} + fn scrub_log(log: &mut Annotated, ctx: Context<'_>) -> Result<()> { let pii_config_from_scrubbing = ctx.project_info.config.datascrubbing_settings.pii_config(); @@ -140,6 +182,7 @@ fn normalize_log( #[cfg(test)] mod tests { + use bytes::Bytes; use relay_pii::PiiConfig; use relay_protocol::assert_annotated_snapshot; @@ -424,4 +467,49 @@ mod tests { } "#); } + + /// Helper to construct a log container [`Item`] from raw JSON item bodies. + fn log_container_item(items_json: &str, item_count: u32) -> Item { + let header = format!( + r#"{{"type":"log","content_type":"application/vnd.sentry.items.log+json","item_count":{item_count}}}"# + ); + let raw = format!("{header}\n{{\"items\":[{items_json}]}}"); + let (item, _) = Item::parse(Bytes::from(raw)).unwrap(); + item + } + + #[test] + fn test_expand_log_container_with_lone_surrogate() { + let item = log_container_item( + &[ + r#"{"timestamp":1544719860.0,"trace_id":"5b8efff798038103d269b633813fc60c","level":"info","body":"good log","attributes":{}}"#, + r#"{"timestamp":1544719860.0,"trace_id":"5b8efff798038103d269b633813fc60c","level":"error","body":"bad \uD800 char","attributes":{}}"#, + ].join(","), + 2, + ); + + let logs = expand_log_container(&item, RequestTrust::Untrusted).unwrap(); + assert_eq!(logs.len(), 2); + + let first = logs[0].value.value().unwrap(); + assert_eq!(first.body.as_str(), Some("good log")); + + let second = logs[1].value.value().unwrap(); + assert_eq!(second.body.as_str(), Some("bad \u{FFFD} char")); + } + + #[test] + fn test_expand_log_container_without_surrogates_unchanged() { + let item = log_container_item( + r#"{"timestamp":1544719860.0,"trace_id":"5b8efff798038103d269b633813fc60c","level":"info","body":"clean log","attributes":{}}"#, + 1, + ); + + let logs = expand_log_container(&item, RequestTrust::Untrusted).unwrap(); + assert_eq!(logs.len(), 1); + assert_eq!( + logs[0].value.value().unwrap().body.as_str(), + Some("clean log") + ); + } } diff --git a/relay-server/src/statsd.rs b/relay-server/src/statsd.rs index e47fbc913ce..4c68f44e8b7 100644 --- a/relay-server/src/statsd.rs +++ b/relay-server/src/statsd.rs @@ -993,6 +993,12 @@ pub enum RelayCounters { /// This metric is tagged with: /// - `expansion`: What expansion was used to expand the error (e.g. unreal). ErrorProcessed, + /// Number of log container payloads that required lone surrogate sanitization. + /// + /// Emitted when a log container JSON payload contains lone Unicode surrogates + /// (`\uD800`–`\uDFFF`) that would otherwise cause deserialization to fail and discard + /// the entire batch. + LogContainerSurrogateSanitized, } impl CounterMetric for RelayCounters { @@ -1052,6 +1058,7 @@ impl CounterMetric for RelayCounters { RelayCounters::EnvelopeWithLogs => "logs.envelope", RelayCounters::ProfileChunksWithoutPlatform => "profile_chunk.no_platform", RelayCounters::ErrorProcessed => "event.error.processed", + RelayCounters::LogContainerSurrogateSanitized => "logs.container.surrogate_sanitized", } } } From 1aaa1650cbf600e3ef10937d086ed4a31a800f4c Mon Sep 17 00:00:00 2001 From: Antonis Lilis Date: Tue, 14 Apr 2026 15:14:37 +0200 Subject: [PATCH 2/2] Add changelog entry for lone surrogate sanitization Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4f5d19c0b1..b3ca9fbdbb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ **Bug Fixes**: +- Recover log containers containing lone Unicode surrogates (`\uD800`–`\uDFFF`) instead of discarding the entire batch. ([#5833](https://github.com/getsentry/relay/pull/5833)) - Store segment name in `sentry.transaction` in addition to `sentry.segment.name` on OTLP spans. ([#5765](https://github.com/getsentry/relay/pull/5765)) - Explicitly handle in-flight requests during shutdown. ([#5746](https://github.com/getsentry/relay/pull/5746), [#5769](https://github.com/getsentry/relay/pull/5769)) - Emit outcomes in both `log_byte` and `log_item` categories when logs are dropped. ([#5766](https://github.com/getsentry/relay/pull/5766))