From 02587ef3a9ed726d4960df1698c387240761bc56 Mon Sep 17 00:00:00 2001
From: north-al
Date: Wed, 10 Dec 2025 15:02:52 +0000
Subject: [PATCH] fix: Assume naive datetime as UTC in serialize_datetime

When a datetime object without timezone info (naive datetime) is passed to
serialize_datetime(), it was previously interpreted using the local timezone.
This caused timestamp inconsistencies when the SDK runs in non-UTC timezones,
leading to:

1. Duplicate trace records in ClickHouse with different toDate(timestamp)
2. ReplacingMergeTree unable to merge records for the same trace ID
3. Incorrect trace rendering in the Langfuse UI

This fix changes the behavior to assume naive datetimes are UTC, which is
consistent with Langfuse's infrastructure requirements and the internal
_get_timestamp() function that already uses UTC.

The fix ensures all timestamp serialization uses UTC, preventing the date
boundary issues that occur when local time differs from UTC by enough hours
to cross midnight.

Includes comprehensive test coverage for:
- UTC datetime serialization
- Non-UTC timezone handling
- Naive datetime (now assumed UTC)
- Edge cases (midnight, end of day)
- ISO 8601 compliance
---
Notes (review):
    * Behaviour change: a naive datetime is now labelled as UTC with
      replace(tzinfo=dt.timezone.utc); its wall-clock fields are not
      converted. Before this patch it was labelled with the runtime's
      local timezone instead.
    * The new tests do not set a non-UTC local timezone, so on a UTC host
      the naive-datetime cases may not distinguish old from new behaviour
      (TODO confirm against _serialize_zoned_datetime).
    * `import pytest` in tests/test_datetime_utils.py is not referenced
      anywhere else in that file.

 langfuse/api/core/datetime_utils.py |  11 ++-
 tests/test_datetime_utils.py        | 132 ++++++++++++++++++++++++++++
 2 files changed, 139 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_datetime_utils.py

diff --git a/langfuse/api/core/datetime_utils.py b/langfuse/api/core/datetime_utils.py
index 47344e9d9..76c45b788 100644
--- a/langfuse/api/core/datetime_utils.py
+++ b/langfuse/api/core/datetime_utils.py
@@ -7,7 +7,9 @@ def serialize_datetime(v: dt.datetime) -> str:
     """
     Serialize a datetime including timezone info.
 
-    Uses the timezone info provided if present, otherwise uses the current runtime's timezone info.
+    Assumes naive datetime (without tzinfo) is UTC, as Langfuse infrastructure
+    expects all timestamps in UTC. This prevents timestamp inconsistencies that
+    cause duplicate trace records in ClickHouse when the SDK runs in non-UTC timezones.
 
     UTC datetimes end in "Z" while all other timezones are represented as offset from UTC, e.g. +05:00.
     """
@@ -25,6 +27,7 @@ def _serialize_zoned_datetime(v: dt.datetime) -> str:
     if v.tzinfo is not None:
         return _serialize_zoned_datetime(v)
     else:
-        local_tz = dt.datetime.now().astimezone().tzinfo
-        localized_dt = v.replace(tzinfo=local_tz)
-        return _serialize_zoned_datetime(localized_dt)
+        # Assume naive datetime is UTC (Langfuse standard)
+        # This fixes duplicate trace records caused by timezone inconsistencies
+        utc_dt = v.replace(tzinfo=dt.timezone.utc)
+        return _serialize_zoned_datetime(utc_dt)
diff --git a/tests/test_datetime_utils.py b/tests/test_datetime_utils.py
new file mode 100644
index 000000000..e3b8265d8
--- /dev/null
+++ b/tests/test_datetime_utils.py
@@ -0,0 +1,132 @@
+"""Test suite for datetime_utils module - UTC timestamp serialization."""
+
+import datetime as dt
+from datetime import timezone
+
+import pytest
+
+from langfuse.api.core.datetime_utils import serialize_datetime
+
+
+class TestSerializeDatetime:
+    """Test suite for the serialize_datetime function."""
+
+    def test_utc_datetime_ends_with_z(self):
+        """Test that UTC datetime is serialized with 'Z' suffix."""
+        utc_dt = dt.datetime(2025, 12, 10, 13, 30, 45, 123456, tzinfo=timezone.utc)
+        result = serialize_datetime(utc_dt)
+
+        assert result.endswith("Z")
+        assert "+00:00" not in result
+        assert result == "2025-12-10T13:30:45.123456Z"
+
+    def test_utc_datetime_without_microseconds(self):
+        """Test UTC datetime without microseconds."""
+        utc_dt = dt.datetime(2025, 12, 10, 13, 30, 45, tzinfo=timezone.utc)
+        result = serialize_datetime(utc_dt)
+
+        assert result.endswith("Z")
+        assert result == "2025-12-10T13:30:45Z"
+
+    def test_naive_datetime_assumed_utc(self):
+        """Test that naive datetime (no tzinfo) is assumed to be UTC.
+
+        This is the key fix: naive datetime should be treated as UTC,
+        not local time, to prevent duplicate trace records in ClickHouse
+        when the SDK runs in non-UTC timezones.
+        """
+        naive_dt = dt.datetime(2025, 12, 10, 13, 30, 45, 123456)
+        result = serialize_datetime(naive_dt)
+
+        # Should end with 'Z' (UTC), not a local timezone offset like +08:00
+        assert result.endswith("Z"), f"Expected UTC suffix 'Z', got: {result}"
+        assert result == "2025-12-10T13:30:45.123456Z"
+
+    def test_naive_datetime_without_microseconds(self):
+        """Test naive datetime without microseconds is assumed UTC."""
+        naive_dt = dt.datetime(2025, 12, 10, 13, 30, 45)
+        result = serialize_datetime(naive_dt)
+
+        assert result.endswith("Z")
+        assert result == "2025-12-10T13:30:45Z"
+
+    def test_non_utc_timezone_uses_offset(self):
+        """Test that non-UTC timezones use offset format."""
+        # Create datetime with +08:00 timezone
+        tz_plus_8 = timezone(dt.timedelta(hours=8))
+        dt_plus_8 = dt.datetime(2025, 12, 10, 21, 30, 45, tzinfo=tz_plus_8)
+        result = serialize_datetime(dt_plus_8)
+
+        # Should use offset format, not 'Z'
+        assert result.endswith("+08:00")
+        assert result == "2025-12-10T21:30:45+08:00"
+
+    def test_negative_timezone_offset(self):
+        """Test negative timezone offset format."""
+        tz_minus_5 = timezone(dt.timedelta(hours=-5))
+        dt_minus_5 = dt.datetime(2025, 12, 10, 8, 30, 45, tzinfo=tz_minus_5)
+        result = serialize_datetime(dt_minus_5)
+
+        assert result.endswith("-05:00")
+        assert result == "2025-12-10T08:30:45-05:00"
+
+    def test_consistency_with_internal_timestamp_function(self):
+        """Test that serialize_datetime is consistent with _get_timestamp.
+
+        The _get_timestamp function returns datetime.now(timezone.utc),
+        which should serialize correctly with 'Z' suffix.
+        """
+        from langfuse._utils import _get_timestamp
+
+        timestamp = _get_timestamp()
+        result = serialize_datetime(timestamp)
+
+        # Should always end with 'Z' since _get_timestamp uses UTC
+        assert result.endswith("Z"), f"Expected UTC suffix 'Z', got: {result}"
+
+    def test_multiple_naive_datetimes_serialize_consistently(self):
+        """Test that multiple naive datetimes serialize consistently.
+
+        This prevents the issue where different events in the same trace
+        could get different timezone treatments.
+        """
+        dt1 = dt.datetime(2025, 12, 10, 13, 30, 45)
+        dt2 = dt.datetime(2025, 12, 10, 13, 30, 46)
+        dt3 = dt.datetime(2025, 12, 10, 13, 30, 47)
+
+        results = [serialize_datetime(d) for d in [dt1, dt2, dt3]]
+
+        # All should have 'Z' suffix (UTC)
+        for result in results:
+            assert result.endswith("Z"), f"Expected UTC suffix 'Z', got: {result}"
+
+        # All should have the same date (no timezone shift causing date change)
+        for result in results:
+            assert result.startswith("2025-12-10")
+
+    def test_edge_case_midnight_utc(self):
+        """Test midnight UTC serialization."""
+        midnight = dt.datetime(2025, 12, 10, 0, 0, 0, tzinfo=timezone.utc)
+        result = serialize_datetime(midnight)
+
+        assert result == "2025-12-10T00:00:00Z"
+
+    def test_edge_case_end_of_day_utc(self):
+        """Test end of day UTC serialization."""
+        end_of_day = dt.datetime(2025, 12, 10, 23, 59, 59, 999999, tzinfo=timezone.utc)
+        result = serialize_datetime(end_of_day)
+
+        assert result == "2025-12-10T23:59:59.999999Z"
+
+    def test_iso8601_format_compliance(self):
+        """Test that output complies with ISO 8601 format."""
+        naive_dt = dt.datetime(2025, 12, 10, 13, 30, 45, 123456)
+        result = serialize_datetime(naive_dt)
+
+        # ISO 8601 format: YYYY-MM-DDTHH:MM:SS.ffffff[Z|+HH:MM|-HH:MM]
+        assert "T" in result
+        assert result.count(":") >= 2
+        # Should be parseable
+        parsed = dt.datetime.fromisoformat(result.replace("Z", "+00:00"))
+        assert parsed.tzinfo is not None
+