Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

**Bug Fixes**:

- Recover log containers containing lone Unicode surrogates (`\uD800`–`\uDFFF`) instead of discarding the entire batch. ([#5833](https://github.com/getsentry/relay/pull/5833))
- Store segment name in `sentry.transaction` in addition to `sentry.segment.name` on OTLP spans. ([#5765](https://github.com/getsentry/relay/pull/5765))
- Explicitly handle in-flight requests during shutdown. ([#5746](https://github.com/getsentry/relay/pull/5746), [#5769](https://github.com/getsentry/relay/pull/5769))
- Emit outcomes in both `log_byte` and `log_item` categories when logs are dropped. ([#5766](https://github.com/getsentry/relay/pull/5766))
Expand Down
178 changes: 178 additions & 0 deletions relay-server/src/envelope/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,109 @@ impl<T: ContainerItem> ItemContainer<T> {
}
}

/// Sanitizes lone Unicode surrogates in JSON-escaped form within raw bytes.
///
/// JSON payloads may contain escaped lone surrogates (`\uD800`–`\uDFFF`) that are not part of
/// a valid surrogate pair. These are rejected by strict JSON parsers like `serde_json`.
///
/// This function scans for such sequences and replaces them with the Unicode replacement
/// character escape (`\uFFFD`), which is the same byte length (6 bytes), allowing in-place
/// replacement without shifting offsets.
///
/// Valid surrogate pairs (a high surrogate `\uD800`–`\uDBFF` immediately followed by a low
/// surrogate `\uDC00`–`\uDFFF`) are left intact.
///
/// Note: this function does not track whether a `\uDxxx` sequence appears inside a JSON string
/// value or a key — it replaces lone surrogates everywhere. This is safe because lone surrogates
/// are equally invalid in both contexts, and no real SDK emits surrogate-containing key names.
///
/// This function also does not handle escaped backslashes (`\\uD800` representing the literal
/// text `\uD800`). This is safe because such sequences are valid JSON and would not cause
/// `serde_json` to fail — meaning this function would never be called for such payloads.
///
/// Returns a `Cow::Borrowed` reference to the original slice if no replacements were needed,
/// avoiding allocation on the happy path.
pub(crate) fn sanitize_lone_surrogates(input: &[u8]) -> std::borrow::Cow<'_, [u8]> {
use std::borrow::Cow;

const REPLACEMENT: &[u8] = b"\\uFFFD";

// Minimum length for a `\uXXXX` escape is 6 bytes.
if input.len() < 6 {
return Cow::Borrowed(input);
}

let mut result: Option<Vec<u8>> = None;
let mut i = 0;

while i + 5 < input.len() {
if let Some(surrogate) = parse_unicode_escape(input, i) {
if is_high_surrogate(surrogate) {
// Check if followed by a low surrogate (valid pair).
if i + 11 < input.len()
&& let Some(next) = parse_unicode_escape(input, i + 6)
&& is_low_surrogate(next)
{
// Valid surrogate pair — keep both escapes as-is.
if let Some(ref mut buf) = result {
buf.extend_from_slice(&input[i..i + 12]);
}
i += 12;
continue;
}
// Lone high surrogate — replace.
let buf = result.get_or_insert_with(|| input[..i].to_vec());
buf.extend_from_slice(REPLACEMENT);
i += 6;
continue;
} else if is_low_surrogate(surrogate) {
// Lone low surrogate (not preceded by a high surrogate we would have consumed).
let buf = result.get_or_insert_with(|| input[..i].to_vec());
buf.extend_from_slice(REPLACEMENT);
i += 6;
continue;
}
}

if let Some(ref mut buf) = result {
buf.push(input[i]);
}
i += 1;
Comment thread
antonis marked this conversation as resolved.
}
Comment thread
antonis marked this conversation as resolved.

// Copy remaining bytes.
match result {
Some(mut buf) => {
buf.extend_from_slice(&input[i..]);
Cow::Owned(buf)
}
None => Cow::Borrowed(input),
}
}

/// Attempts to parse a `\uXXXX` escape sequence starting at position `i`.
///
/// Returns the parsed 16-bit code unit if the bytes at `input[i..i + 6]` form a valid
/// JSON unicode escape (`\u` followed by exactly 4 ASCII hex digits), or `None` otherwise.
/// `None` is also returned when fewer than 6 bytes are available at `i`.
fn parse_unicode_escape(input: &[u8], i: usize) -> Option<u16> {
    let end = i.checked_add(6)?;
    let [b'\\', b'u', digits @ ..] = input.get(i..end)? else {
        return None;
    };

    // Fold the digits by hand rather than via `u16::from_str_radix`: the latter also accepts
    // a leading `+` sign and would therefore treat e.g. `\u+123` as a valid escape.
    let code = digits.iter().try_fold(0u32, |code, &byte| {
        Some((code << 4) | char::from(byte).to_digit(16)?)
    })?;

    // Four hex digits never exceed `0xFFFF`, so this conversion cannot fail in practice.
    u16::try_from(code).ok()
}

/// Returns `true` if `code` lies in the UTF-16 high (leading) surrogate range
/// `0xD800..=0xDBFF`.
fn is_high_surrogate(code: u16) -> bool {
    matches!(code, 0xD800..=0xDBFF)
}

/// Returns `true` if `code` lies in the UTF-16 low (trailing) surrogate range
/// `0xDC00..=0xDFFF`.
fn is_low_surrogate(code: u16) -> bool {
    matches!(code, 0xDC00..=0xDFFF)
}

impl<T: ContainerItem> From<ContainerItems<T>> for ItemContainer<T> {
fn from(items: ContainerItems<T>) -> Self {
Self { items }
Expand Down Expand Up @@ -607,4 +710,79 @@ mod tests {
// e.g. correct order of fields.
assert_eq!(new_item.payload(), item.payload());
}

#[test]
fn test_sanitize_no_surrogates() {
    use std::borrow::Cow;

    // A clean payload must come back as a borrow of the input, byte-for-byte unchanged.
    let payload = br#"{"items":[{"level":"info","message":"hello world"}]}"#;
    let sanitized = sanitize_lone_surrogates(payload);

    assert!(matches!(sanitized, Cow::Borrowed(_)));
    assert_eq!(&*sanitized, &payload[..]);
}

#[test]
fn test_sanitize_lone_high_surrogate() {
    use std::borrow::Cow;

    // An unpaired high surrogate is swapped for `\uFFFD`, which forces an owned result.
    let payload = br#"{"items":[{"level":"info","message":"bad \uD800 char"}]}"#;
    let want = br#"{"items":[{"level":"info","message":"bad \uFFFD char"}]}"#;
    let sanitized = sanitize_lone_surrogates(payload);

    assert!(matches!(sanitized, Cow::Owned(_)));
    assert_eq!(&*sanitized, &want[..]);
}

#[test]
fn test_sanitize_lone_low_surrogate() {
    // A low surrogate without a preceding high surrogate is replaced.
    let payload = br#"{"message":"\uDC00"}"#;
    let want = br#"{"message":"\uFFFD"}"#;

    let sanitized = sanitize_lone_surrogates(payload);
    assert_eq!(&*sanitized, &want[..]);
}

#[test]
fn test_sanitize_preserves_valid_surrogate_pair() {
    use std::borrow::Cow;

    // `\uD83D\uDE00` is the surrogate pair encoding 😀 and must survive untouched.
    let payload = br#"{"message":"\uD83D\uDE00"}"#;
    let sanitized = sanitize_lone_surrogates(payload);

    assert!(matches!(sanitized, Cow::Borrowed(_)));
    assert_eq!(&*sanitized, &payload[..]);
}

#[test]
fn test_sanitize_high_surrogate_followed_by_non_surrogate_escape() {
    // The high surrogate is lone because the escape after it is not a low surrogate;
    // the trailing `\u0041` must be kept as-is.
    let payload = br#"{"message":"\uD800\u0041"}"#;
    let want = br#"{"message":"\uFFFD\u0041"}"#;

    let sanitized = sanitize_lone_surrogates(payload);
    assert_eq!(&*sanitized, &want[..]);
}

#[test]
fn test_sanitize_multiple_lone_surrogates() {
    // Every lone surrogate in the payload is replaced, not just the first one.
    let payload = br#"{"a":"\uD800","b":"\uDBFF","c":"\uDC00"}"#;
    let want = br#"{"a":"\uFFFD","b":"\uFFFD","c":"\uFFFD"}"#;

    let sanitized = sanitize_lone_surrogates(payload);
    assert_eq!(&*sanitized, &want[..]);
}

#[test]
fn test_sanitize_surrogate_at_end_of_input() {
    // The escape sits right before the closing `"}`, leaving no room for a partner escape.
    let payload = br#"{"m":"\uD800"}"#;
    let want = br#"{"m":"\uFFFD"}"#;

    let sanitized = sanitize_lone_surrogates(payload);
    assert_eq!(&*sanitized, &want[..]);
}

#[test]
fn test_sanitize_mixed_lone_and_valid_pair() {
    // The lone surrogate in `a` is replaced while the valid pair in `b` is preserved.
    let payload = br#"{"a":"\uD800","b":"\uD83D\uDE00"}"#;
    let want = br#"{"a":"\uFFFD","b":"\uD83D\uDE00"}"#;

    let sanitized = sanitize_lone_surrogates(payload);
    assert_eq!(&*sanitized, &want[..]);
}

#[test]
fn test_sanitize_empty_and_short_inputs() {
    // Inputs shorter than one `\uXXXX` escape are returned unchanged.
    let inputs: [&[u8]; 3] = [b"", b"{}", b"hello"];
    for input in inputs {
        assert_eq!(&*sanitize_lone_surrogates(input), input);
    }
}
}
1 change: 1 addition & 0 deletions relay-server/src/envelope/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ mod item;
mod meta;

pub use self::attachment::*;
pub(crate) use self::container::sanitize_lone_surrogates;
pub use self::container::*;
pub use self::content_type::*;
pub use self::item::*;
Expand Down
90 changes: 89 additions & 1 deletion relay-server/src/processing/logs/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@ use relay_event_schema::protocol::{OurLog, OurLogHeader};
use relay_protocol::Annotated;
use relay_quotas::DataCategory;

use crate::envelope::{ContainerItems, EnvelopeHeaders, Item, ItemContainer};
use crate::envelope::{
ContainerItems, ContentType, EnvelopeHeaders, Item, ItemContainer, sanitize_lone_surrogates,
};
use crate::extractors::RequestTrust;
use crate::processing::logs::{self, Error, ExpandedLogs, Result, SerializedLogs};
use crate::processing::{Context, Managed, utils};
use crate::services::outcome::DiscardReason;
use crate::statsd::RelayCounters;

/// Parses all serialized logs into their [`ExpandedLogs`] representation.
///
Expand Down Expand Up @@ -63,6 +66,7 @@ pub fn scrub(logs: &mut Managed<ExpandedLogs>, ctx: Context<'_>) {

fn expand_log_container(item: &Item, trust: RequestTrust) -> Result<ContainerItems<OurLog>> {
let mut logs = ItemContainer::parse(item)
.or_else(|err| try_sanitize_and_reparse(item, err))
.map_err(|err| {
relay_log::debug!("failed to parse logs container: {err}");
Error::Invalid(DiscardReason::InvalidJson)
Expand All @@ -89,6 +93,44 @@ fn expand_log_container(item: &Item, trust: RequestTrust) -> Result<ContainerIte
Ok(logs)
}

/// Attempts to recover from a log container parse failure caused by lone Unicode surrogates.
///
/// When `ItemContainer::parse` fails with a deserialization error, this function sanitizes the
/// raw payload by replacing JSON-escaped lone surrogates (`\uD800`–`\uDFFF`) with the
/// replacement character (`\uFFFD`) and retries parsing. Any other kind of parse error is
/// returned unchanged, as is the original error when the payload had no surrogates to sanitize.
///
/// If the retry fails as well, the error of the second parse attempt is returned.
fn try_sanitize_and_reparse(
    item: &Item,
    original_err: crate::envelope::ContainerParseError,
) -> std::result::Result<ItemContainer<OurLog>, crate::envelope::ContainerParseError> {
    use std::borrow::Cow;

    use crate::envelope::ContainerParseError;

    // Only attempt sanitization for deserialization errors.
    if !matches!(original_err, ContainerParseError::Deserialize(_)) {
        return Err(original_err);
    }

    let payload = item.payload();

    // `sanitize_lone_surrogates` returns an owned buffer only when it replaced something, so
    // the variant tells us whether the payload changed without comparing the bytes again.
    let Cow::Owned(sanitized) = sanitize_lone_surrogates(&payload) else {
        // Payload unchanged — the error is not caused by lone surrogates.
        return Err(original_err);
    };

    relay_log::debug!("sanitized lone surrogates in log container payload");
    relay_statsd::metric!(counter(RelayCounters::LogContainerSurrogateSanitized) += 1);

    // Re-parse with a sanitized payload. Content type and item type were already validated
    // by the original parse attempt, so we re-use the same item with the sanitized payload.
    let mut sanitized_item = item.clone();
    sanitized_item.set_payload(ContentType::LogContainer, sanitized);
    ItemContainer::parse(&sanitized_item).inspect_err(|err| {
        relay_log::debug!("failed to parse log container after surrogate sanitization: {err}");
    })
}

fn scrub_log(log: &mut Annotated<OurLog>, ctx: Context<'_>) -> Result<()> {
let pii_config_from_scrubbing = ctx.project_info.config.datascrubbing_settings.pii_config();

Expand Down Expand Up @@ -140,6 +182,7 @@ fn normalize_log(

#[cfg(test)]
mod tests {
use bytes::Bytes;
use relay_pii::PiiConfig;
use relay_protocol::assert_annotated_snapshot;

Expand Down Expand Up @@ -424,4 +467,49 @@ mod tests {
}
"#);
}

/// Builds a log container [`Item`] for tests.
///
/// `items_json` is the comma-separated JSON of the individual log entries; it is wrapped in an
/// `{"items":[...]}` payload. `item_count` is written verbatim into the item header.
fn log_container_item(items_json: &str, item_count: u32) -> Item {
    let headers = format!(
        r#"{{"type":"log","content_type":"application/vnd.sentry.items.log+json","item_count":{item_count}}}"#
    );
    let body = format!(r#"{{"items":[{items_json}]}}"#);

    // Item headers and payload are separated by a single newline.
    let serialized = [headers, body].join("\n");
    Item::parse(Bytes::from(serialized)).unwrap().0
}

#[test]
fn test_expand_log_container_with_lone_surrogate() {
    // The second entry carries a lone high surrogate in its body; the first one is clean.
    let good = r#"{"timestamp":1544719860.0,"trace_id":"5b8efff798038103d269b633813fc60c","level":"info","body":"good log","attributes":{}}"#;
    let bad = r#"{"timestamp":1544719860.0,"trace_id":"5b8efff798038103d269b633813fc60c","level":"error","body":"bad \uD800 char","attributes":{}}"#;
    let item = log_container_item(&format!("{good},{bad}"), 2);

    let logs = expand_log_container(&item, RequestTrust::Untrusted).unwrap();
    assert_eq!(logs.len(), 2);

    // Both logs survive; the surrogate is decoded as U+FFFD.
    let expected_bodies = ["good log", "bad \u{FFFD} char"];
    for (index, expected) in expected_bodies.into_iter().enumerate() {
        let log = logs[index].value.value().unwrap();
        assert_eq!(log.body.as_str(), Some(expected));
    }
}

#[test]
fn test_expand_log_container_without_surrogates_unchanged() {
    // A payload without surrogates parses on the first attempt and keeps its body.
    let entry = r#"{"timestamp":1544719860.0,"trace_id":"5b8efff798038103d269b633813fc60c","level":"info","body":"clean log","attributes":{}}"#;
    let item = log_container_item(entry, 1);

    let logs = expand_log_container(&item, RequestTrust::Untrusted).unwrap();
    assert_eq!(logs.len(), 1);

    let only = logs[0].value.value().unwrap();
    assert_eq!(only.body.as_str(), Some("clean log"));
}
}
7 changes: 7 additions & 0 deletions relay-server/src/statsd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -993,6 +993,12 @@ pub enum RelayCounters {
/// This metric is tagged with:
/// - `expansion`: What expansion was used to expand the error (e.g. unreal).
ErrorProcessed,
/// Number of log container payloads that required lone surrogate sanitization.
///
/// Emitted when a log container JSON payload contains lone Unicode surrogates
/// (`\uD800`–`\uDFFF`) that would otherwise cause deserialization to fail and discard
/// the entire batch.
LogContainerSurrogateSanitized,
}

impl CounterMetric for RelayCounters {
Expand Down Expand Up @@ -1052,6 +1058,7 @@ impl CounterMetric for RelayCounters {
RelayCounters::EnvelopeWithLogs => "logs.envelope",
RelayCounters::ProfileChunksWithoutPlatform => "profile_chunk.no_platform",
RelayCounters::ErrorProcessed => "event.error.processed",
RelayCounters::LogContainerSurrogateSanitized => "logs.container.surrogate_sanitized",
}
}
}
Loading