From 6885c619bac88a1d6f061649a96c364f6d61330a Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Sat, 18 Apr 2026 13:09:10 -0700 Subject: [PATCH 1/5] fix: redirect parse_events to _parse_events_from_offset (#987) Redirect parse_events to delegate to _parse_events_from_offset with include_partial_tail=True, aligning tests with the production code path. Add include_partial_tail keyword arg to _parse_events_from_offset (matching the vscode_parser pattern). Update architecture.md step 2 to describe the actual pipeline. Add regression test verifying parse_events and get_cached_events agree on partial-tail files. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/copilot_usage/docs/architecture.md | 2 +- src/copilot_usage/parser.py | 55 ++++++------------- tests/copilot_usage/test_parser.py | 73 ++++++++++++++++++-------- 3 files changed, 68 insertions(+), 62 deletions(-) diff --git a/src/copilot_usage/docs/architecture.md b/src/copilot_usage/docs/architecture.md index d495dee0..39b27d80 100644 --- a/src/copilot_usage/docs/architecture.md +++ b/src/copilot_usage/docs/architecture.md @@ -48,7 +48,7 @@ Monorepo containing Python CLI utilities that share tooling, CI, and common depe ### Event Processing Pipeline 1. **Discovery** — `discover_sessions()` scans `~/.copilot/session-state/*/events.jsonl`, returns paths sorted by modification time -2. **Parsing** — `parse_events()` reads each line as JSON, creates `SessionEvent` objects via Pydantic validation. Malformed lines are skipped with a warning. +2. **Parsing** — `_parse_events_from_offset()` reads each line as JSON in binary mode, creates `SessionEvent` objects via Pydantic validation. The production pipeline accesses this through `get_cached_events()`, which caches results and supports incremental byte-offset parsing for append-only file growth. The public `parse_events()` delegates to the same implementation with `include_partial_tail=True` for one-shot full-file reads. Malformed lines are skipped with a warning. 3. **Typed dispatch** — callers use the narrowly-typed `as_*()` accessors (`as_session_start()`, `as_assistant_message()`, etc.) on `SessionEvent` to get a validated payload for each known event type. Unknown event types still validate as `SessionEvent`, but normal processing ignores them unless a caller explicitly validates `data` with `GenericEventData`. 4. **Summarization** — `build_session_summary()` orchestrates four focused helpers: - `_first_pass()`: single pass over events — extracts session metadata from `session.start`, counts raw events (model calls, user messages, output tokens), collects all shutdown data diff --git a/src/copilot_usage/parser.py b/src/copilot_usage/parser.py index 54889a32..98df1d92 100644 --- a/src/copilot_usage/parser.py +++ b/src/copilot_usage/parser.py @@ -210,7 +210,10 @@ def _insert_events_entry( def _parse_events_from_offset( - events_path: Path, offset: int + events_path: Path, + offset: int, + *, + include_partial_tail: bool = False, ) -> tuple[list[SessionEvent], int]: """Parse events from *events_path* starting at byte *offset*. @@ -223,6 +226,11 @@ def _parse_events_from_offset( parsing — the returned *safe_end* does not advance past them so the caller can retry on the next refresh. + When *include_partial_tail* is ``True`` (used by :func:`parse_events` + for one-shot full parsing), a final non-newline-terminated line is + **included** in the results, and ``safe_end`` advances past that + consumed partial tail as well to preserve full-file read semantics. + Returns: ``(new_events, safe_end)`` where *safe_end* is the byte position after the last fully consumed line. Callers should @@ -250,7 +258,7 @@ def _parse_events_from_offset( except ValidationError as exc: errors = exc.errors(include_url=False) if errors and errors[0].get("type") == "json_invalid": - if not raw_line.endswith(b"\n"): + if not raw_line.endswith(b"\n") and not include_partial_tail: break logger.warning( "{}:offset {} — malformed JSON, skipping", @@ -644,52 +652,19 @@ def discover_sessions(base_path: Path | None = None) -> list[Path]: def parse_events(events_path: Path) -> list[SessionEvent]: """Parse an ``events.jsonl`` file into a list of :class:`SessionEvent`. - Uses ``SessionEvent.model_validate_json`` (Rust-based parser) for each - line, bypassing the intermediate ``dict`` allocation of - ``json.loads`` + ``model_validate``. + Delegates to :func:`_parse_events_from_offset` (the same binary-mode + implementation used by the production cache layer) with + ``include_partial_tail=True`` so that a final line lacking a trailing + newline is still included — matching one-shot full-file read semantics. Lines that fail JSON decoding or Pydantic validation are skipped with a warning. - If a UTF-8 decode error occurs while reading the file, parsing stops - early and the events parsed so far are returned (a partial session). - Raises: OSError: If the file cannot be opened or read (e.g., deleted between discovery and parsing, or I/O error while streaming). - UnicodeDecodeError is caught internally; callers only need to - handle OSError. """ - events: list[SessionEvent] = [] - try: - with events_path.open(encoding="utf-8") as fh: - for lineno, line in enumerate(fh, start=1): - if not line or line.isspace(): - continue - try: - events.append(SessionEvent.model_validate_json(line)) - except ValidationError as exc: - errors = exc.errors(include_url=False) - if errors and errors[0].get("type") == "json_invalid": - logger.warning( - "{}:{} — malformed JSON, skipping", - events_path, - lineno, - ) - else: - logger.warning( - "{}:{} — validation error ({}), skipping", - events_path, - lineno, - exc.error_count(), - ) - except UnicodeDecodeError as exc: - logger.warning( - "{} — UTF-8 decode error while reading; returning {} parsed events so far (partial session): {}", - events_path, - len(events), - exc, - ) + events, _ = _parse_events_from_offset(events_path, 0, include_partial_tail=True) return events diff --git a/tests/copilot_usage/test_parser.py b/tests/copilot_usage/test_parser.py index 6fded77d..d2b351d9 100644 --- a/tests/copilot_usage/test_parser.py +++ b/tests/copilot_usage/test_parser.py @@ -1500,10 +1500,10 @@ def test_validation_error_non_json_invalid_logs_warning( # error_count() must be passed as positional arg after format string assert warning_spy.call_args.args[3] == expected_error_count - def test_validation_error_warning_includes_file_and_lineno( + def test_validation_error_warning_includes_file_and_offset( self, tmp_path: Path ) -> None: - """Validation-error warning includes the file path and line number.""" + """Validation-error warning includes the file path and byte offset.""" bad_event = json.dumps({"no_type_field": True}) p = tmp_path / "s" / "events.jsonl" _write_events(p, _START_EVENT, bad_event) @@ -1512,10 +1512,12 @@ def test_validation_error_warning_includes_file_and_lineno( warning_spy.assert_called_once() args = warning_spy.call_args.args assert args[1] == p # file path - assert args[2] == 2 # line number (bad event is the second line) + # byte offset of the second line (after _START_EVENT + newline separator) + expected_offset = len(_START_EVENT.encode("utf-8")) + 1 + assert args[2] == expected_offset def test_multiple_validation_errors_each_warned(self, tmp_path: Path) -> None: - """Two bad lines emit two separate warnings with correct line numbers.""" + """Two bad lines emit two separate warnings with correct byte offsets.""" bad1 = json.dumps({"no_type_field": True}) bad2 = json.dumps({"also_invalid": 42}) p = tmp_path / "s" / "events.jsonl" @@ -1524,9 +1526,11 @@ def test_multiple_validation_errors_each_warned(self, tmp_path: Path) -> None: events = parse_events(p) assert len(events) == 1 # only the start event survives assert warning_spy.call_count == 2 - # Each warning should reference its own line number - line_numbers = [call.args[2] for call in warning_spy.call_args_list] - assert line_numbers == [2, 3] + # Each warning should reference its own byte offset + offsets = [call.args[2] for call in warning_spy.call_args_list] + offset1 = len(_START_EVENT.encode("utf-8")) + 1 + offset2 = offset1 + len(bad1.encode("utf-8")) + 1 + assert offsets == [offset1, offset2] def test_unicode_decode_error_returns_partial(self, tmp_path: Path) -> None: """events.jsonl with invalid UTF-8 bytes returns what was parsed so far. @@ -1558,12 +1562,14 @@ def test_unicode_decode_error_full_file(self, tmp_path: Path) -> None: events = parse_events(p) assert events == [] - def test_unicode_decode_error_returns_partial_results(self, tmp_path: Path) -> None: - """Valid events before an invalid UTF-8 sequence are returned. + def test_embedded_invalid_utf8_skips_malformed_line(self, tmp_path: Path) -> None: + """Embedded invalid UTF-8 bytes produce a malformed-JSON skip, not a crash. - Python's TextIOWrapper reads in buffer-sized chunks, so the valid - content must exceed one buffer to guarantee the first lines are - yielded before the decode error fires on the next chunk. + Because ``parse_events`` delegates to ``_parse_events_from_offset`` + (binary mode), raw invalid UTF-8 bytes don't raise + ``UnicodeDecodeError`` at the file-I/O layer. Instead the + affected line fails JSON parsing and is skipped; lines on both + sides of the bad bytes still parse successfully. """ p = tmp_path / "s" / "events.jsonl" p.parent.mkdir(parents=True, exist_ok=True) @@ -1573,23 +1579,22 @@ def test_unicode_decode_error_returns_partial_results(self, tmp_path: Path) -> N # First block: repeat valid lines enough to exceed the default read buffer. first_repeat = (io.DEFAULT_BUFFER_SIZE // len(valid_line)) + 2 first_block = valid_line * first_repeat - # Second block: additional valid lines that should never be returned. + # Second block: additional valid lines after the bad bytes. second_repeat = 5 second_block = valid_line * second_repeat - total_valid_lines = first_repeat + second_repeat - # Insert invalid UTF-8 bytes between the two valid blocks so the decode - # error occurs in the middle of the file, after some events were yielded. + # Insert invalid UTF-8 bytes between the two valid blocks. + # Binary-mode reading absorbs these into the preceding line's + # tail; that line becomes malformed JSON and is skipped. invalid_bytes = b"\xff\xfe" p.write_bytes(first_block + invalid_bytes + second_block) result = parse_events(p) - # Partial parse: at least the first event must survive. assert isinstance(result, list) assert len(result) >= 1 assert result[0].type == EventType.SESSION_START - # Not everything was returned (error cut parsing short in the middle). - assert len(result) < total_valid_lines - # All returned events should be from the first valid block. - assert len(result) <= first_repeat + # Binary mode recovers: all valid lines parse, only the one + # line containing the invalid bytes is skipped. + total_valid_lines = first_repeat + second_repeat + assert len(result) == total_valid_lines - 1 class TestParseEventsModelValidateJson: @@ -9765,6 +9770,32 @@ def _boom( assert safe_end == len(line1.encode("utf-8")) +class TestParseEventsPartialTailParity: + """Regression: parse_events and get_cached_events agree on partial-tail files.""" + + def test_no_trailing_newline_same_events(self, tmp_path: Path) -> None: + """File whose last line is valid JSON without trailing newline. + + Both ``parse_events`` (one-shot) and ``get_cached_events`` (cache + layer) must return the same set of events, including the + partial-tail line. + """ + p = tmp_path / "sess" / "events.jsonl" + p.parent.mkdir(parents=True, exist_ok=True) + line1 = _make_event_line("session.start", "s1") + "\n" + line2 = _make_event_line("assistant.message", "m1") # no trailing newline + p.write_bytes(line1.encode("utf-8") + line2.encode("utf-8")) + + one_shot = parse_events(p) + cached = list(get_cached_events(p)) + + assert len(one_shot) == 2 + assert one_shot[0].type == "session.start" + assert one_shot[1].type == "assistant.message" + assert len(cached) == len(one_shot) + assert [e.type for e in cached] == [e.type for e in one_shot] + + class TestIncrementalEventsCaching: """Tests for incremental cache behaviour in get_cached_events.""" From 051080f03cacd047c43d6613c9fd1cde8c8ca39f Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Sat, 18 Apr 2026 13:21:44 -0700 Subject: [PATCH 2/5] fix: address review comments Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/copilot_usage/parser.py | 12 +++++++----- tests/copilot_usage/test_parser.py | 12 +++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/copilot_usage/parser.py b/src/copilot_usage/parser.py index 98df1d92..47a8f38a 100644 --- a/src/copilot_usage/parser.py +++ b/src/copilot_usage/parser.py @@ -227,9 +227,10 @@ def _parse_events_from_offset( the caller can retry on the next refresh. When *include_partial_tail* is ``True`` (used by :func:`parse_events` - for one-shot full parsing), a final non-newline-terminated line is - **included** in the results, and ``safe_end`` advances past that - consumed partial tail as well to preserve full-file read semantics. + for one-shot full parsing), this special case is disabled for a final + non-newline-terminated line that fails JSON decoding: the line is + warned about and skipped, and ``safe_end`` advances past it. Valid + final lines are parsed and returned regardless of this flag. Returns: ``(new_events, safe_end)`` where *safe_end* is the byte @@ -654,8 +655,9 @@ def parse_events(events_path: Path) -> list[SessionEvent]: Delegates to :func:`_parse_events_from_offset` (the same binary-mode implementation used by the production cache layer) with - ``include_partial_tail=True`` so that a final line lacking a trailing - newline is still included — matching one-shot full-file read semantics. + ``include_partial_tail=True`` to match full-file parsing behavior, + including the more permissive handling of a malformed partial tail at + EOF. Lines that fail JSON decoding or Pydantic validation are skipped with a warning. diff --git a/tests/copilot_usage/test_parser.py b/tests/copilot_usage/test_parser.py index d2b351d9..064e1dc2 100644 --- a/tests/copilot_usage/test_parser.py +++ b/tests/copilot_usage/test_parser.py @@ -317,7 +317,7 @@ def _clear_session_cache() -> None: def _write_events(path: Path, *lines: str) -> Path: """Write event lines to an events.jsonl file and return the path.""" path.parent.mkdir(parents=True, exist_ok=True) - path.write_text("\n".join(lines) + "\n", encoding="utf-8") + path.write_text("\n".join(lines) + "\n", encoding="utf-8", newline="\n") return path @@ -1535,8 +1535,9 @@ def test_multiple_validation_errors_each_warned(self, tmp_path: Path) -> None: def test_unicode_decode_error_returns_partial(self, tmp_path: Path) -> None: """events.jsonl with invalid UTF-8 bytes returns what was parsed so far. - Due to buffered I/O, the UnicodeDecodeError may fire before any - lines are yielded, so the result is typically an empty list. + In binary mode, invalid UTF-8 bytes cause a JSON parse failure + rather than a ``UnicodeDecodeError``, so the affected line is + skipped and valid lines are still returned. """ p = tmp_path / "s" / "events.jsonl" p.parent.mkdir(parents=True, exist_ok=True) @@ -1583,8 +1584,9 @@ def test_embedded_invalid_utf8_skips_malformed_line(self, tmp_path: Path) -> Non second_repeat = 5 second_block = valid_line * second_repeat # Insert invalid UTF-8 bytes between the two valid blocks. - # Binary-mode reading absorbs these into the preceding line's - # tail; that line becomes malformed JSON and is skipped. + # Because ``first_block`` ends with a newline, these bytes + # prefix the next line, corrupting the first line of + # ``second_block`` so it becomes malformed JSON and is skipped. invalid_bytes = b"\xff\xfe" p.write_bytes(first_block + invalid_bytes + second_block) result = parse_events(p) From 6b80b16ae80c47af4330b61e9813d5dc7d1f9f33 Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Sat, 18 Apr 2026 13:32:08 -0700 Subject: [PATCH 3/5] fix: address review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename test methods with stale UnicodeDecodeError names to reflect actual binary-mode behavior (invalid UTF-8 bytes cause JSON skip): - test_unicode_decode_error_returns_partial → test_invalid_utf8_line_skipped_valid_event_returned - test_unicode_decode_error_full_file → test_entirely_invalid_utf8_returns_empty_list Update docstrings and inline comments accordingly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/copilot_usage/test_parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/copilot_usage/test_parser.py b/tests/copilot_usage/test_parser.py index 064e1dc2..40a541be 100644 --- a/tests/copilot_usage/test_parser.py +++ b/tests/copilot_usage/test_parser.py @@ -1532,8 +1532,10 @@ def test_multiple_validation_errors_each_warned(self, tmp_path: Path) -> None: offset2 = offset1 + len(bad1.encode("utf-8")) + 1 assert offsets == [offset1, offset2] - def test_unicode_decode_error_returns_partial(self, tmp_path: Path) -> None: - """events.jsonl with invalid UTF-8 bytes returns what was parsed so far. + def test_invalid_utf8_line_skipped_valid_event_returned( + self, tmp_path: Path + ) -> None: + """Invalid UTF-8 bytes on a line are skipped; prior valid events returned. In binary mode, invalid UTF-8 bytes cause a JSON parse failure rather than a ``UnicodeDecodeError``, so the affected line is @@ -1546,17 +1548,15 @@ def test_unicode_decode_error_returns_partial(self, tmp_path: Path) -> None: bad_line = b"\xff\xfe invalid utf-8 bytes\n" p.write_bytes(valid_line + bad_line) events = parse_events(p) - # Should return what was parsed before the error and not raise. + # The bad line is skipped; only the valid start event survives. assert isinstance(events, list) - # 0 or 1 events may be returned depending on buffered I/O behavior. assert len(events) <= 1 if events: - # If any event is returned, it should be the initial session.start. assert len(events) == 1 assert events[0].type == "session.start" - def test_unicode_decode_error_full_file(self, tmp_path: Path) -> None: - """events.jsonl that is entirely invalid UTF-8 returns empty list.""" + def test_entirely_invalid_utf8_returns_empty_list(self, tmp_path: Path) -> None: + """events.jsonl that is entirely invalid UTF-8 bytes returns empty list.""" p = tmp_path / "s" / "events.jsonl" p.parent.mkdir(parents=True, exist_ok=True) p.write_bytes(b"\xff\xfe\x80\x81\x82") From 1a0af583f5063fc25ada519b32f22ef1d3ad8cc8 Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Sat, 18 Apr 2026 13:42:52 -0700 Subject: [PATCH 4/5] fix: address review comments - Strengthen test_invalid_utf8_line_skipped_valid_event_returned to assert exactly 1 event instead of allowing 0 (binary mode is deterministic) - Document UnicodeDecodeError partial-return behavior in parse_events docstring Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/copilot_usage/parser.py | 6 ++++++ tests/copilot_usage/test_parser.py | 8 +++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/copilot_usage/parser.py b/src/copilot_usage/parser.py index 47a8f38a..9f0729a8 100644 --- a/src/copilot_usage/parser.py +++ b/src/copilot_usage/parser.py @@ -662,9 +662,15 @@ def parse_events(events_path: Path) -> list[SessionEvent]: Lines that fail JSON decoding or Pydantic validation are skipped with a warning. + If a ``UnicodeDecodeError`` is raised during parsing (e.g. from a + Pydantic validator), parsing stops early and the events parsed so + far are returned as a partial session. + Raises: OSError: If the file cannot be opened or read (e.g., deleted between discovery and parsing, or I/O error while streaming). + ``UnicodeDecodeError`` is caught internally; callers only + need to handle ``OSError``. """ events, _ = _parse_events_from_offset(events_path, 0, include_partial_tail=True) return events diff --git a/tests/copilot_usage/test_parser.py b/tests/copilot_usage/test_parser.py index 40a541be..2380ffdd 100644 --- a/tests/copilot_usage/test_parser.py +++ b/tests/copilot_usage/test_parser.py @@ -1548,12 +1548,10 @@ def test_invalid_utf8_line_skipped_valid_event_returned( bad_line = b"\xff\xfe invalid utf-8 bytes\n" p.write_bytes(valid_line + bad_line) events = parse_events(p) - # The bad line is skipped; only the valid start event survives. + # The bad line is skipped; the valid start event always survives. assert isinstance(events, list) - assert len(events) <= 1 - if events: - assert len(events) == 1 - assert events[0].type == "session.start" + assert len(events) == 1 + assert events[0].type == "session.start" def test_entirely_invalid_utf8_returns_empty_list(self, tmp_path: Path) -> None: """events.jsonl that is entirely invalid UTF-8 bytes returns empty list.""" From 5bc077ef80d6ae792c459b666f5220bd845f4850 Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Sat, 18 Apr 2026 13:53:10 -0700 Subject: [PATCH 5/5] fix: address review comments Add test for malformed partial tail with include_partial_tail=True to regression-protect the flag's warn-and-skip behavior when the final non-newline-terminated line fails JSON decoding. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/copilot_usage/test_parser.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/copilot_usage/test_parser.py b/tests/copilot_usage/test_parser.py index 2380ffdd..d8ae268d 100644 --- a/tests/copilot_usage/test_parser.py +++ b/tests/copilot_usage/test_parser.py @@ -9795,6 +9795,30 @@ def test_no_trailing_newline_same_events(self, tmp_path: Path) -> None: assert len(cached) == len(one_shot) assert [e.type for e in cached] == [e.type for e in one_shot] + def test_malformed_partial_tail_warned_and_skipped(self, tmp_path: Path) -> None: + """Malformed JSON final line without trailing newline is skipped with warning. + + When ``include_partial_tail=True``, a non-newline-terminated line + that fails JSON decoding should be warned about and skipped (not + trigger the early ``break``). ``safe_end`` must advance past it. + """ + p = tmp_path / "sess" / "events.jsonl" + p.parent.mkdir(parents=True, exist_ok=True) + line1 = _make_event_line("session.start", "s1") + "\n" + bad_tail = b"NOT-VALID-JSON{{{" # no trailing newline + p.write_bytes(line1.encode("utf-8") + bad_tail) + + with patch.object(_parser_module.logger, "warning") as warning_spy: + events, safe_end = _parse_events_from_offset( + p, 0, include_partial_tail=True + ) + + assert len(events) == 1 + assert events[0].type == "session.start" + assert safe_end == p.stat().st_size + warning_spy.assert_called_once() + assert "json" in warning_spy.call_args.args[0].lower() + class TestIncrementalEventsCaching: """Tests for incremental cache behaviour in get_cached_events."""