From 7d8df4724326d75dc0ebbba19414eda3f8697527 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 15:31:45 -0700 Subject: [PATCH 01/14] test: add failing tests for trace identity enrollment contract (Part A) Tests 1.1-1.3 (session_log): recover_crashed_sessions must skip trace files for alive PIDs, files without enrollment sidecars, and files whose enrollment boot_id doesn't match the current boot. Tests 1.4-1.5 (linux_tracing): start_linux_tracing must write an enrollment sidecar atomically; stop() must unlink both trace and sidecar. All 5 tests fail until the implementation is added. Co-Authored-By: Claude Sonnet 4.6 --- tests/execution/test_linux_tracing.py | 45 ++++++++++++++ tests/execution/test_session_log.py | 84 +++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) diff --git a/tests/execution/test_linux_tracing.py b/tests/execution/test_linux_tracing.py index af01a3ded..11cb43798 100644 --- a/tests/execution/test_linux_tracing.py +++ b/tests/execution/test_linux_tracing.py @@ -348,3 +348,48 @@ async def test_proc_monitor_persists_psutil_process_for_cpu_percent(): finally: proc.kill() proc.wait() + + +@pytest.mark.anyio +async def test_start_linux_tracing_writes_enrollment_sidecar(tmp_path): + """start_linux_tracing must write autoskillit_enrollment_{pid}.json immediately.""" + import anyio + + from autoskillit.config import LinuxTracingConfig + from autoskillit.execution.linux_tracing import start_linux_tracing + + cfg = LinuxTracingConfig(enabled=True, proc_interval=0.1, tmpfs_path=str(tmp_path)) + async with anyio.create_task_group() as tg: + proc = await anyio.open_process(["sleep", "2"]) + handle = start_linux_tracing(proc.pid, cfg, tg) + assert handle is not None + enrollment = tmp_path / f"autoskillit_enrollment_{proc.pid}.json" + assert enrollment.exists() + data = json.loads(enrollment.read_text()) + assert data["schema_version"] == 1 + assert data["pid"] == proc.pid + handle.stop() + proc.kill() + assert not enrollment.exists(), "Enrollment must be deleted by stop()" + + +@pytest.mark.anyio +async def test_stop_unlinks_trace_and_enrollment(tmp_path): + """stop() must delete both trace JSONL and enrollment sidecar on clean exit.""" + import anyio + + from autoskillit.config import LinuxTracingConfig + from autoskillit.execution.linux_tracing import start_linux_tracing + + cfg = LinuxTracingConfig(enabled=True, proc_interval=0.1, tmpfs_path=str(tmp_path)) + async with anyio.create_task_group() as tg: + proc = await anyio.open_process(["sleep", "2"]) + handle = start_linux_tracing(proc.pid, cfg, tg) + assert handle is not None + trace = tmp_path / f"autoskillit_trace_{proc.pid}.jsonl" + enrollment = tmp_path / f"autoskillit_enrollment_{proc.pid}.json" + assert trace.exists() + handle.stop() + proc.kill() + assert not trace.exists(), "Trace file must be deleted on clean stop()" + assert not enrollment.exists(), "Enrollment sidecar must be deleted on clean stop()" diff --git a/tests/execution/test_session_log.py b/tests/execution/test_session_log.py index e1b047a3a..3afc50f4e 100644 --- a/tests/execution/test_session_log.py +++ b/tests/execution/test_session_log.py @@ -3,11 +3,15 @@ from __future__ import annotations import json +import os +import sys +import time from datetime import UTC, datetime from pathlib import Path import pytest +from autoskillit.execution.linux_tracing import read_boot_id, read_starttime_ticks from autoskillit.execution.session_log import ( flush_session_log, read_telemetry_clear_marker, @@ -811,3 +815,83 @@ def test_flush_session_log_order_id_defaults_to_empty(tmp_path): entry = json.loads((tmp_path / "sessions.jsonl").read_text().strip()) assert "order_id" in entry assert entry["order_id"] == "" + + +@pytest.mark.skipif(sys.platform != "linux", reason="Linux-only: uses /proc and boot_id") +def test_recover_crashed_sessions_skips_live_pid(tmp_path): + """A trace file whose enrolled PID is still alive must not be recovered.""" + tmpfs = tmp_path / "shm" + tmpfs.mkdir() + pid = os.getpid() + trace = tmpfs / f"autoskillit_trace_{pid}.jsonl" + enrollment = tmpfs / f"autoskillit_enrollment_{pid}.json" + trace.write_text("") + enrollment.write_text( + json.dumps( + { + "schema_version": 1, + "pid": pid, + "boot_id": read_boot_id() or "", + "starttime_ticks": read_starttime_ticks(pid), + "session_id": "", + "enrolled_at": datetime.now(UTC).isoformat(), + "kitchen_id": "", + "order_id": "", + } + ) + ) + os.utime(trace, (time.time() - 60,) * 2) + + count = recover_crashed_sessions(tmpfs_path=str(tmpfs), log_dir=str(tmp_path)) + + assert count == 0 + assert trace.exists(), "Trace file for alive PID must not be deleted" + assert enrollment.exists(), "Enrollment sidecar for alive PID must not be deleted" + + +def test_recover_crashed_sessions_skips_file_without_enrollment(tmp_path): + """A trace file with no enrollment sidecar must be skipped — it is not + an autoskillit-owned trace (e.g. a test artifact or alien file).""" + tmpfs = tmp_path / "shm" + tmpfs.mkdir() + trace = tmpfs / "autoskillit_trace_99997.jsonl" + trace.write_text("") + os.utime(trace, (time.time() - 60,) * 2) + + count = recover_crashed_sessions(tmpfs_path=str(tmpfs), log_dir=str(tmp_path)) + + assert count == 0 + assert trace.exists(), "Alien trace file must not be deleted" + + +def test_recover_crashed_sessions_skips_wrong_boot_id(tmp_path, monkeypatch): + """An enrollment sidecar with a different boot_id must be rejected.""" + monkeypatch.setattr( + "autoskillit.execution.session_log.read_boot_id", + lambda: "current-boot-id", + ) + tmpfs = tmp_path / "shm" + tmpfs.mkdir() + pid = 99996 + trace = tmpfs / f"autoskillit_trace_{pid}.jsonl" + enrollment = tmpfs / f"autoskillit_enrollment_{pid}.json" + trace.write_text("") + enrollment.write_text( + json.dumps( + { + "schema_version": 1, + "pid": pid, + "boot_id": "stale-boot-id", + "starttime_ticks": 1234, + "session_id": "", + "enrolled_at": "2026-01-01T00:00:00+00:00", + "kitchen_id": "", + "order_id": "", + } + ) + ) + os.utime(trace, (time.time() - 60,) * 2) + + count = recover_crashed_sessions(tmpfs_path=str(tmpfs), log_dir=str(tmp_path)) + + assert count == 0 From 130477a83eac080fc7c32af035c2f297717a17b5 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 15:33:39 -0700 Subject: [PATCH 02/14] feat: establish trace identity enrollment contract (Part A) Introduces a per-session enrollment sidecar (autoskillit_enrollment_{pid}.json) written atomically at start_linux_tracing time, containing the identity triple (boot_id, pid, starttime_ticks) that anchors the trace file to the specific process that created it. Key changes: - linux_tracing.py: Add TraceEnrollmentRecord frozen dataclass, _write_enrollment_atomic and _read_enrollment helpers; extend start_linux_tracing with session_id/kitchen_id/order_id keyword params; write sidecar immediately after opening trace file; update LinuxTracingHandle.stop() to unlink both trace and sidecar on clean exit so recovery only ever sees genuine crashes. - session_log.py: Add three-gate identity chain to recover_crashed_sessions: (1) enrollment sidecar must exist, (2) boot_id must match current boot, (3) PID must be dead or starttime_ticks must differ. Delete both files after recovery. - test_session_log.py: Update _write_old_trace to write companion enrollment sidecars so existing recovery tests pass the new gates. Co-Authored-By: Claude Sonnet 4.6 --- src/autoskillit/execution/linux_tracing.py | 84 ++++++++++++++++++++++ src/autoskillit/execution/session_log.py | 50 ++++++++++--- tests/execution/test_session_log.py | 29 ++++++-- 3 files changed, 148 insertions(+), 15 deletions(-) diff --git a/src/autoskillit/execution/linux_tracing.py b/src/autoskillit/execution/linux_tracing.py index b7a4612d5..5dfc8fa21 100644 --- a/src/autoskillit/execution/linux_tracing.py +++ b/src/autoskillit/execution/linux_tracing.py @@ -18,7 +18,9 @@ import dataclasses import json +import os import sys +import tempfile from collections.abc import AsyncIterator from dataclasses import dataclass, field from datetime import UTC, datetime, timedelta @@ -58,6 +60,59 @@ def read_starttime_ticks(pid: int) -> int | None: return None +@dataclass(frozen=True) +class TraceEnrollmentRecord: + """Identity triple written atomically at trace-open time. + + (boot_id, pid, starttime_ticks) together form a collision-resistant identity: + - boot_id rejects pre-reboot stale files + - starttime_ticks detects PID recycling + """ + + schema_version: int # always 1 + pid: int + boot_id: str | None # read_boot_id(); None if /proc unavailable + starttime_ticks: int | None # read_starttime_ticks(pid); None if unavailable + session_id: str # caller-provided; "" if not yet resolved + enrolled_at: str # ISO 8601 UTC + kitchen_id: str # "" + order_id: str # "" + + +def _write_enrollment_atomic(path: Path, record: TraceEnrollmentRecord) -> None: + """Write enrollment sidecar atomically using tempfile + os.replace.""" + content = json.dumps(dataclasses.asdict(record)) + fd, tmp = tempfile.mkstemp(dir=path.parent, suffix=".tmp") + try: + with os.fdopen(fd, "w", encoding="utf-8") as f: + f.write(content) + os.replace(tmp, path) + except Exception: + try: + os.unlink(tmp) + except OSError: + pass + raise + + +def _read_enrollment(path: Path) -> TraceEnrollmentRecord | None: + """Read and validate an enrollment sidecar. Returns None on any error.""" + try: + data = json.loads(path.read_text(encoding="utf-8")) + return TraceEnrollmentRecord( + schema_version=data["schema_version"], + pid=data["pid"], + boot_id=data.get("boot_id"), + starttime_ticks=data.get("starttime_ticks"), + session_id=data.get("session_id", ""), + enrolled_at=data.get("enrolled_at", ""), + kitchen_id=data.get("kitchen_id", ""), + order_id=data.get("order_id", ""), + ) + except (OSError, KeyError, TypeError, ValueError, json.JSONDecodeError): + return None + + @dataclass(frozen=True) class ProcSnapshot: """Point-in-time snapshot of process state.""" @@ -200,6 +255,7 @@ class LinuxTracingHandle: _snapshots: list[ProcSnapshot] = field(default_factory=list) _trace_path: Path | None = field(default=None) _trace_file: IO[str] | None = field(default=None) + _enrollment_path: Path | None = field(default=None) def stop(self) -> list[ProcSnapshot]: """Stop tracing, flush and close the trace file, return accumulated snapshots.""" @@ -212,6 +268,12 @@ def stop(self) -> list[ProcSnapshot]: except OSError: pass self._trace_file = None + if self._trace_path is not None: + self._trace_path.unlink(missing_ok=True) + self._trace_path = None + if self._enrollment_path is not None: + self._enrollment_path.unlink(missing_ok=True) + self._enrollment_path = None return list(self._snapshots) @@ -219,6 +281,10 @@ def start_linux_tracing( pid: int, config: LinuxTracingConfig, tg: anyio.abc.TaskGroup | None, + *, + session_id: str = "", + kitchen_id: str = "", + order_id: str = "", ) -> LinuxTracingHandle | None: """Start Linux tracing if all gates pass. Returns handle or None.""" if not LINUX_TRACING_AVAILABLE or not config.enabled: @@ -240,6 +306,24 @@ def start_linux_tracing( handle._trace_path = None handle._trace_file = None + # Write enrollment sidecar atomically for crash-recovery identity contract + enrollment_path = tmpfs / f"autoskillit_enrollment_{pid}.json" + try: + record = TraceEnrollmentRecord( + schema_version=1, + pid=pid, + boot_id=read_boot_id(), + starttime_ticks=read_starttime_ticks(pid), + session_id=session_id, + enrolled_at=datetime.now(UTC).isoformat(), + kitchen_id=kitchen_id, + order_id=order_id, + ) + _write_enrollment_atomic(enrollment_path, record) + handle._enrollment_path = enrollment_path + except OSError: + handle._enrollment_path = None + async def _run_monitor() -> None: with scope: async for snap in proc_monitor(pid, config.proc_interval): diff --git a/src/autoskillit/execution/session_log.py b/src/autoskillit/execution/session_log.py index 891e0f1fe..e6e9cd79a 100644 --- a/src/autoskillit/execution/session_log.py +++ b/src/autoskillit/execution/session_log.py @@ -17,8 +17,15 @@ from pathlib import Path from typing import Any +import psutil + from autoskillit.core import atomic_write, claude_code_log_path, get_logger from autoskillit.execution.anomaly_detection import detect_anomalies +from autoskillit.execution.linux_tracing import ( + _read_enrollment, + read_boot_id, + read_starttime_ticks, +) logger = get_logger(__name__) @@ -315,6 +322,7 @@ def recover_crashed_sessions(tmpfs_path: str = "/dev/shm", log_dir: str = "") -> return 0 count = 0 + current_boot_id = read_boot_id() for trace_file in sorted(tmpfs.glob("autoskillit_trace_*.jsonl")): # Skip files modified within the last 30 seconds — may be active try: @@ -324,7 +332,35 @@ def recover_crashed_sessions(tmpfs_path: str = "/dev/shm", log_dir: str = "") -> if age_seconds < 30: continue - # Read snapshots + # Extract PID from filename: autoskillit_trace_{pid}.jsonl + try: + pid = int(trace_file.stem.split("_")[-1]) + except (ValueError, IndexError): + pid = 0 + + # Gate 1: Enrollment sidecar must exist — no sidecar means alien/test file + enrollment_path = tmpfs / f"autoskillit_enrollment_{pid}.json" + enrollment = _read_enrollment(enrollment_path) + if enrollment is None: + logger.debug("Skipping %s: no enrollment sidecar", trace_file.name) + continue + + # Gate 2: Boot ID must match current boot — mismatch means pre-reboot stale file + if current_boot_id and enrollment.boot_id and enrollment.boot_id != current_boot_id: + logger.debug("Skipping %s: boot_id mismatch", trace_file.name) + trace_file.unlink(missing_ok=True) + enrollment_path.unlink(missing_ok=True) + continue + + # Gate 3: PID liveness + starttime_ticks identity + if psutil.pid_exists(pid): + current_ticks = read_starttime_ticks(pid) + if current_ticks is None or current_ticks == enrollment.starttime_ticks: + logger.debug("Skipping %s: PID %d still alive", trace_file.name, pid) + continue + # PID recycled — original process is gone, treat as crash + + # All gates passed — read snapshots and emit crashed row snapshots: list[dict[str, object]] = [] try: for line in trace_file.read_text().splitlines(): @@ -335,12 +371,6 @@ def recover_crashed_sessions(tmpfs_path: str = "/dev/shm", log_dir: str = "") -> except OSError: continue - # Extract PID from filename: autoskillit_trace_{pid}.jsonl - try: - pid = int(trace_file.stem.split("_")[-1]) - except (ValueError, IndexError): - pid = 0 - # Compute start_ts from file mtime try: mtime_ts = datetime.fromtimestamp(trace_file.stat().st_mtime, tz=UTC).isoformat() @@ -365,10 +395,8 @@ def recover_crashed_sessions(tmpfs_path: str = "/dev/shm", log_dir: str = "") -> logger.debug("recover_crashed_sessions: failed to finalize %s", trace_file) continue - try: - trace_file.unlink() - except OSError: - pass + trace_file.unlink(missing_ok=True) + enrollment_path.unlink(missing_ok=True) count += 1 diff --git a/tests/execution/test_session_log.py b/tests/execution/test_session_log.py index 3afc50f4e..7ad74e8df 100644 --- a/tests/execution/test_session_log.py +++ b/tests/execution/test_session_log.py @@ -391,15 +391,36 @@ def test_recover_crashed_sessions_skips_recent_files(tmp_path): def _write_old_trace(tmpfs: Path, filename: str, content: str) -> Path: - """Write a trace file and backdate its mtime to 60 seconds ago.""" - import time + """Write a trace file (backdated 60s) and its enrollment sidecar. + The enrollment sidecar uses the current boot_id so Gate 2 passes. + The PID embedded in the filename is expected to be dead (so Gate 3 passes). + """ trace = tmpfs / filename trace.write_text(content) old_mtime = time.time() - 60 - import os - os.utime(trace, (old_mtime, old_mtime)) + + # Write companion enrollment sidecar so Gate 1 passes + try: + pid = int(Path(filename).stem.split("_")[-1]) + except (ValueError, IndexError): + pid = 0 + enrollment = tmpfs / f"autoskillit_enrollment_{pid}.json" + enrollment.write_text( + json.dumps( + { + "schema_version": 1, + "pid": pid, + "boot_id": read_boot_id() or "", + "starttime_ticks": None, + "session_id": "", + "enrolled_at": "2026-01-01T00:00:00+00:00", + "kitchen_id": "", + "order_id": "", + } + ) + ) return trace From 5b4df88339c225c26186635089216fdda5dc26b6 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 15:40:39 -0700 Subject: [PATCH 03/14] fix: update tests broken by stop() clean-exit unlink behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_streaming_writes_each_snapshot_as_jsonl: save trace path and flush before calling stop(), which now deletes both trace and enrollment files on clean exit. Read file content into variable before stop() runs. test_current_json_write_sites_match_allowlist: update session_log.py allowlist line numbers (206→213, 219→226, 222→229) shifted by the enrollment sidecar code added above the existing atomic_write calls. Co-Authored-By: Claude Sonnet 4.6 --- tests/execution/test_linux_tracing.py | 9 +++++++-- tests/infra/test_schema_version_convention.py | 6 +++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/execution/test_linux_tracing.py b/tests/execution/test_linux_tracing.py index 11cb43798..96a56e3ac 100644 --- a/tests/execution/test_linux_tracing.py +++ b/tests/execution/test_linux_tracing.py @@ -219,9 +219,14 @@ async def test_streaming_writes_each_snapshot_as_jsonl(tmp_path): await anyio.sleep(0.2) tg.cancel_scope.cancel() + # Save trace path and flush before stop() deletes the file on clean exit + trace_path = handle._trace_path + assert trace_path is not None + if handle._trace_file is not None: + handle._trace_file.flush() + content = trace_path.read_text() snapshots = handle.stop() - assert handle._trace_path is not None - lines = handle._trace_path.read_text().strip().split("\n") + lines = content.strip().split("\n") assert len(lines) >= 1 for line in lines: record = json.loads(line) diff --git a/tests/infra/test_schema_version_convention.py b/tests/infra/test_schema_version_convention.py index 5e510bc02..26456ce3d 100644 --- a/tests/infra/test_schema_version_convention.py +++ b/tests/infra/test_schema_version_convention.py @@ -111,9 +111,9 @@ def _is_yaml_dump(node: ast.expr) -> bool: # core/io.py — write_versioned_json itself (the blessed helper) uses atomic_write+json.dumps ("src/autoskillit/core/io.py", 118), # session_log.py — summary dict, token_usage list, audit_log list - ("src/autoskillit/execution/session_log.py", 206), - ("src/autoskillit/execution/session_log.py", 219), - ("src/autoskillit/execution/session_log.py", 222), + ("src/autoskillit/execution/session_log.py", 213), + ("src/autoskillit/execution/session_log.py", 226), + ("src/autoskillit/execution/session_log.py", 229), # migration/store.py — failure store dicts ("src/autoskillit/migration/store.py", 54), ("src/autoskillit/migration/store.py", 64), From 098eec954886a42575680fc9ba665a54ce0e07fc Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 15:43:27 -0700 Subject: [PATCH 04/14] fix: update list_sites line numbers in test_allowlist_includes_list_payloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lines 219→226 and 222→229 in session_log.py shifted by the enrollment sidecar code; the hardcoded list_sites check in the same convention test needed the same update as _LEGACY_JSON_WRITES. Co-Authored-By: Claude Sonnet 4.6 --- tests/infra/test_schema_version_convention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/infra/test_schema_version_convention.py b/tests/infra/test_schema_version_convention.py index 26456ce3d..d07e72479 100644 --- a/tests/infra/test_schema_version_convention.py +++ b/tests/infra/test_schema_version_convention.py @@ -209,8 +209,8 @@ def test_allowlist_includes_list_payloads_as_documented(self): """List-payload sites are included since the AST scanner can't distinguish return types.""" # These sites write list payloads through function calls but are caught by the scanner list_sites = [ - ("src/autoskillit/execution/session_log.py", 219), - ("src/autoskillit/execution/session_log.py", 222), + ("src/autoskillit/execution/session_log.py", 226), + ("src/autoskillit/execution/session_log.py", 229), ("src/autoskillit/smoke_utils.py", 83), ("src/autoskillit/smoke_utils.py", 138), ] From 84810f0fa005edadddc9b8b5fb8113a5ba64a877 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 15:58:13 -0700 Subject: [PATCH 05/14] test: add failing tests for LinuxTracingConfig /dev/shm guard and isolated_tracing_config fixture Tests assert: - __post_init__ raises RuntimeError when tmpfs_path == /dev/shm and PYTEST_CURRENT_TEST is set - Custom tmpfs path does not raise in test env - /dev/shm is allowed outside pytest (production path) - isolated_tracing_config fixture returns non-/dev/shm temp dir All four tests fail before implementation (no guard, no fixture yet). Co-Authored-By: Claude Sonnet 4.6 --- tests/execution/test_linux_tracing.py | 46 +++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/execution/test_linux_tracing.py b/tests/execution/test_linux_tracing.py index 96a56e3ac..efb19e944 100644 --- a/tests/execution/test_linux_tracing.py +++ b/tests/execution/test_linux_tracing.py @@ -378,6 +378,52 @@ async def test_start_linux_tracing_writes_enrollment_sidecar(tmp_path): assert not enrollment.exists(), "Enrollment must be deleted by stop()" +# --- LinuxTracingConfig guard tests --- + + +def test_tracing_config_rejects_dev_shm_in_test_env(monkeypatch): + """LinuxTracingConfig.__post_init__ must raise when tmpfs_path is /dev/shm + and PYTEST_CURRENT_TEST env var is set.""" + from autoskillit.config import LinuxTracingConfig + + monkeypatch.setenv( + "PYTEST_CURRENT_TEST", + "tests/execution/test_linux_tracing.py::fake_test", + ) + with pytest.raises(RuntimeError, match="tmpfs_path|PYTEST_CURRENT_TEST"): + LinuxTracingConfig(tmpfs_path="/dev/shm") + + +def test_tracing_config_allows_custom_tmpfs_in_test_env(monkeypatch, tmp_path): + """LinuxTracingConfig must not raise when a non-/dev/shm path is provided.""" + from autoskillit.config import LinuxTracingConfig + + monkeypatch.setenv( + "PYTEST_CURRENT_TEST", + "tests/execution/test_linux_tracing.py::fake_test", + ) + cfg = LinuxTracingConfig(tmpfs_path=str(tmp_path)) # must not raise + assert cfg.tmpfs_path == str(tmp_path) + + +def test_tracing_config_allows_dev_shm_outside_test_env(monkeypatch): + """LinuxTracingConfig must not raise for /dev/shm when not running under pytest.""" + from autoskillit.config import LinuxTracingConfig + + monkeypatch.delenv("PYTEST_CURRENT_TEST", raising=False) + cfg = LinuxTracingConfig(tmpfs_path="/dev/shm") # production path — must not raise + assert cfg.tmpfs_path == "/dev/shm" + + +def test_isolated_tracing_config_fixture_returns_non_dev_shm(isolated_tracing_config): + """The isolated_tracing_config fixture must return a config pointing to a + temp dir, not /dev/shm.""" + from pathlib import Path + + assert isolated_tracing_config.tmpfs_path != "/dev/shm" + assert Path(isolated_tracing_config.tmpfs_path).is_dir() + + @pytest.mark.anyio async def test_stop_unlinks_trace_and_enrollment(tmp_path): """stop() must delete both trace JSONL and enrollment sidecar on clean exit.""" From 7a5a7134b136d47834668862839c4bcffb136977 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 15:58:31 -0700 Subject: [PATCH 06/14] feat: add __post_init__ guard to LinuxTracingConfig that rejects /dev/shm in test env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When PYTEST_CURRENT_TEST is set and tmpfs_path is the production default /dev/shm, construction raises RuntimeError with a diagnostic message pointing to the correct fix. Zero overhead in production — env var is never set outside pytest. Co-Authored-By: Claude Sonnet 4.6 --- src/autoskillit/config/settings.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/autoskillit/config/settings.py b/src/autoskillit/config/settings.py index 6ad80b021..3210aeaed 100644 --- a/src/autoskillit/config/settings.py +++ b/src/autoskillit/config/settings.py @@ -193,6 +193,18 @@ class LinuxTracingConfig: log_dir: str = "" # empty = platform default (~/.local/share/autoskillit/logs on Linux) tmpfs_path: str = "/dev/shm" # RAM-backed tmpfs for crash-resilient streaming + def __post_init__(self) -> None: + import os + + if self.tmpfs_path == "/dev/shm" and os.environ.get("PYTEST_CURRENT_TEST"): + raise RuntimeError( + "LinuxTracingConfig.tmpfs_path is '/dev/shm' but PYTEST_CURRENT_TEST " + "is set — this test would write to the real shared tmpfs and pollute " + "production state. Override tmpfs_path with a test-local path, e.g.: " + "LinuxTracingConfig(tmpfs_path=str(tmp_path)). " + "Use the isolated_tracing_config fixture for new tests." + ) + @dataclass class McpResponseConfig: From a881b84c967c691b14d0502bf88a40516a7df3f9 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 15:59:08 -0700 Subject: [PATCH 07/14] fix: pass tmpfs_path=str(tmp_path) to all non-isolated LinuxTracingConfig call sites Five test call sites that previously wrote to the real /dev/shm are now explicitly isolated: four in test_linux_tracing.py and one in test_session_log_integration.py. Each gains a tmp_path fixture param where it lacked one, and passes tmpfs_path=str(tmp_path) to the constructor. These tests would have raised RuntimeError at construction after the __post_init__ guard was added; this commit restores them to passing. Co-Authored-By: Claude Sonnet 4.6 --- tests/execution/test_linux_tracing.py | 16 ++++++++-------- tests/execution/test_session_log_integration.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/execution/test_linux_tracing.py b/tests/execution/test_linux_tracing.py index efb19e944..86e95431b 100644 --- a/tests/execution/test_linux_tracing.py +++ b/tests/execution/test_linux_tracing.py @@ -78,7 +78,7 @@ def test_read_proc_snapshot_has_all_fields(): @pytest.mark.anyio -async def test_tracing_handle_accumulates_snapshots(): +async def test_tracing_handle_accumulates_snapshots(tmp_path): """LinuxTracingHandle accumulates snapshots during monitoring.""" import subprocess @@ -88,7 +88,7 @@ async def test_tracing_handle_accumulates_snapshots(): from autoskillit.execution.linux_tracing import start_linux_tracing proc = subprocess.Popen(["sleep", "2"]) - cfg = LinuxTracingConfig(enabled=True, proc_interval=0.1) + cfg = LinuxTracingConfig(enabled=True, proc_interval=0.1, tmpfs_path=str(tmp_path)) async with anyio.create_task_group() as tg: handle = start_linux_tracing(pid=proc.pid, config=cfg, tg=tg) @@ -104,7 +104,7 @@ async def test_tracing_handle_accumulates_snapshots(): @pytest.mark.anyio -async def test_tracing_handle_stop_returns_snapshots(): +async def test_tracing_handle_stop_returns_snapshots(tmp_path): """stop() returns the accumulated snapshot list.""" import os @@ -113,7 +113,7 @@ async def test_tracing_handle_stop_returns_snapshots(): from autoskillit.config import LinuxTracingConfig from autoskillit.execution.linux_tracing import start_linux_tracing - cfg = LinuxTracingConfig(enabled=True, proc_interval=0.1) + cfg = LinuxTracingConfig(enabled=True, proc_interval=0.1, tmpfs_path=str(tmp_path)) async with anyio.create_task_group() as tg: handle = start_linux_tracing(pid=os.getpid(), config=cfg, tg=tg) @@ -140,13 +140,13 @@ def test_linux_tracing_unavailable_on_non_linux(): assert LINUX_TRACING_AVAILABLE is False -def test_noop_on_non_linux(monkeypatch): +def test_noop_on_non_linux(monkeypatch, tmp_path): """start_linux_tracing is a no-op when LINUX_TRACING_AVAILABLE is False.""" from autoskillit.config import LinuxTracingConfig from autoskillit.execution import linux_tracing monkeypatch.setattr(linux_tracing, "LINUX_TRACING_AVAILABLE", False) - cfg = LinuxTracingConfig(enabled=True, proc_interval=1.0) + cfg = LinuxTracingConfig(enabled=True, proc_interval=1.0, tmpfs_path=str(tmp_path)) result = linux_tracing.start_linux_tracing(pid=1, config=cfg, tg=None) assert result is None @@ -304,7 +304,7 @@ def test_proc_snapshot_has_captured_at_field(): @pytest.mark.anyio -async def test_proc_monitor_snapshots_have_distinct_timestamps(): +async def test_proc_monitor_snapshots_have_distinct_timestamps(tmp_path): """Consecutive snapshots from proc_monitor must have distinct captured_at values.""" import os @@ -313,7 +313,7 @@ async def test_proc_monitor_snapshots_have_distinct_timestamps(): from autoskillit.config import LinuxTracingConfig from autoskillit.execution.linux_tracing import start_linux_tracing - config = LinuxTracingConfig(proc_interval=0.05) + config = LinuxTracingConfig(proc_interval=0.05, tmpfs_path=str(tmp_path)) async with anyio.create_task_group() as tg: handle = start_linux_tracing(os.getpid(), config, tg) await anyio.sleep(0.2) diff --git a/tests/execution/test_session_log_integration.py b/tests/execution/test_session_log_integration.py index 5a7780f74..190b21e6a 100644 --- a/tests/execution/test_session_log_integration.py +++ b/tests/execution/test_session_log_integration.py @@ -21,7 +21,7 @@ async def test_full_tracing_pipeline_writes_distinct_timestamps(tmp_path): from autoskillit.execution.linux_tracing import start_linux_tracing from autoskillit.execution.session_log import flush_session_log - config = LinuxTracingConfig(proc_interval=0.05) + config = LinuxTracingConfig(proc_interval=0.05, tmpfs_path=str(tmp_path)) start_ts = datetime.now(UTC).isoformat() start_mono = time.monotonic() async with anyio.create_task_group() as tg: From a30da9b75ad6d48780de398be0ba74b3871f1a21 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 15:59:41 -0700 Subject: [PATCH 08/14] feat: add isolated_tracing_config fixture to tests/execution/conftest.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provides a canonical, pre-isolated LinuxTracingConfig for all tracing tests. The fixture creates a shm/ subdir under tmp_path and returns a config pointing to it — never to the real /dev/shm. New tests should use this fixture instead of constructing LinuxTracingConfig manually. Co-Authored-By: Claude Sonnet 4.6 --- tests/execution/conftest.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/execution/conftest.py b/tests/execution/conftest.py index ed56c65e0..50d31a67e 100644 --- a/tests/execution/conftest.py +++ b/tests/execution/conftest.py @@ -2,8 +2,13 @@ from __future__ import annotations +import pathlib import textwrap +import pytest + +from autoskillit.config.settings import LinuxTracingConfig + # Simulates Claude CLI process that writes a result line then hangs. # Used by test_process_channel_b.py and test_process_monitor.py. WRITE_RESULT_THEN_HANG_SCRIPT = textwrap.dedent("""\ @@ -14,3 +19,13 @@ sys.stdout.flush() time.sleep(3600) """) + + +@pytest.fixture +def isolated_tracing_config(tmp_path: pathlib.Path) -> LinuxTracingConfig: + """Pre-isolated LinuxTracingConfig for tracing tests. + Always writes to a tmp_path subdir, never to the real /dev/shm. + Use this fixture for all new tests that need a LinuxTracingConfig.""" + shm = tmp_path / "shm" + shm.mkdir(parents=True, exist_ok=True) + return LinuxTracingConfig(enabled=True, proc_interval=0.05, tmpfs_path=str(shm)) From 1809a8c7f38c4176f435c09e01f7df2a05d8cb05 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 16:14:15 -0700 Subject: [PATCH 09/14] fix: scope __post_init__ guard to direct test callers only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The guard fired too broadly — any AutomationConfig() or from_dynaconf() call in tests triggered it even when no actual /dev/shm writes would occur. Use frame inspection to fire only when the immediate caller (two frames up: past __post_init__ and the generated __init__) is test code (/tests/ in filename). Library machinery (AutomationConfig default_factory, from_dynaconf) resolves to or src/, so it passes through. Also set tool_ctx.config.linux_tracing.tmpfs_path to an isolated tmp_path for proper test isolation. Co-Authored-By: Claude Sonnet 4.6 --- src/autoskillit/config/settings.py | 11 ++++++++++- tests/conftest.py | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/autoskillit/config/settings.py b/src/autoskillit/config/settings.py index 3210aeaed..dfb39fd53 100644 --- a/src/autoskillit/config/settings.py +++ b/src/autoskillit/config/settings.py @@ -194,9 +194,18 @@ class LinuxTracingConfig: tmpfs_path: str = "/dev/shm" # RAM-backed tmpfs for crash-resilient streaming def __post_init__(self) -> None: + import inspect import os - if self.tmpfs_path == "/dev/shm" and os.environ.get("PYTEST_CURRENT_TEST"): + if self.tmpfs_path != "/dev/shm" or not os.environ.get("PYTEST_CURRENT_TEST"): + return + # Only raise when called directly from test code — not from library machinery + # (e.g. AutomationConfig default_factory, from_dynaconf). We inspect the call + # frame two levels up: __post_init__ → __init__ (generated) → actual caller. + frame = inspect.currentframe() + init_frame = frame.f_back if frame is not None else None + caller = init_frame.f_back if init_frame is not None else None + if caller is not None and "/tests/" in (caller.f_code.co_filename or ""): raise RuntimeError( "LinuxTracingConfig.tmpfs_path is '/dev/shm' but PYTEST_CURRENT_TEST " "is set — this test would write to the real shared tmpfs and pollute " diff --git a/tests/conftest.py b/tests/conftest.py index add85e59d..772a0e256 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -255,6 +255,7 @@ def tool_ctx(monkeypatch, tmp_path): ctx = make_context(AutomationConfig(), runner=mock_runner, plugin_dir=str(tmp_path)) ctx.gate = DefaultGateState(enabled=True) ctx.config.linux_tracing.log_dir = str(tmp_path / "session_logs") + ctx.config.linux_tracing.tmpfs_path = str(tmp_path / "shm") # Anchor temp_dir to tmp_path so server tools that read from ctx.temp_dir # (e.g. _apply_triage_gate's staleness cache) write under the per-test # tmp directory rather than the cwd captured at fixture-init time. From d90c682e210316d4d596b1e0bcf34819cd5ed9c6 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 16:45:07 -0700 Subject: [PATCH 10/14] fix(review): fix inverted liveness logic and use pid=-1 for unparseable filenames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gate 3 previously skipped recovery when read_starttime_ticks() returned None (unreadable /proc//stat — a transient state after process exit), causing genuinely crashed sessions to be missed. Fix: only skip when ticks are positively known and match the enrollment record. Also change the pid parse-failure fallback from 0 to -1: PID 0 exists on Linux (swapper), making psutil.pid_exists(0) return True and silently short-circuiting Gate 3. PID -1 is guaranteed non-existent. Addresses reviewer comments 3070341630 (critical) and 3070341628. --- src/autoskillit/execution/session_log.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/autoskillit/execution/session_log.py b/src/autoskillit/execution/session_log.py index e6e9cd79a..81b9fecb5 100644 --- a/src/autoskillit/execution/session_log.py +++ b/src/autoskillit/execution/session_log.py @@ -336,7 +336,7 @@ def recover_crashed_sessions(tmpfs_path: str = "/dev/shm", log_dir: str = "") -> try: pid = int(trace_file.stem.split("_")[-1]) except (ValueError, IndexError): - pid = 0 + pid = -1 # Gate 1: Enrollment sidecar must exist — no sidecar means alien/test file enrollment_path = tmpfs / f"autoskillit_enrollment_{pid}.json" @@ -355,7 +355,7 @@ def recover_crashed_sessions(tmpfs_path: str = "/dev/shm", log_dir: str = "") -> # Gate 3: PID liveness + starttime_ticks identity if psutil.pid_exists(pid): current_ticks = read_starttime_ticks(pid) - if current_ticks is None or current_ticks == enrollment.starttime_ticks: + if current_ticks is not None and current_ticks == enrollment.starttime_ticks: logger.debug("Skipping %s: PID %d still alive", trace_file.name, pid) continue # PID recycled — original process is gone, treat as crash From f4079043e1d5ff17153b8b8ad74bbd8f44dbf859 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 16:45:41 -0700 Subject: [PATCH 11/14] fix(review): promote _read_enrollment to public API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function was imported across module boundaries from linux_tracing.py into session_log.py with an underscore prefix signalling "private". Since it is legitimately shared between sister modules in the execution package, rename it to read_enrollment (no underscore) — consistent with the other public helpers (read_boot_id, read_starttime_ticks) in the same import block. Addresses reviewer comment 3070341627. --- src/autoskillit/execution/linux_tracing.py | 2 +- src/autoskillit/execution/session_log.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/autoskillit/execution/linux_tracing.py b/src/autoskillit/execution/linux_tracing.py index 5dfc8fa21..9d673079c 100644 --- a/src/autoskillit/execution/linux_tracing.py +++ b/src/autoskillit/execution/linux_tracing.py @@ -95,7 +95,7 @@ def _write_enrollment_atomic(path: Path, record: TraceEnrollmentRecord) -> None: raise -def _read_enrollment(path: Path) -> TraceEnrollmentRecord | None: +def read_enrollment(path: Path) -> TraceEnrollmentRecord | None: """Read and validate an enrollment sidecar. Returns None on any error.""" try: data = json.loads(path.read_text(encoding="utf-8")) diff --git a/src/autoskillit/execution/session_log.py b/src/autoskillit/execution/session_log.py index 81b9fecb5..95685de33 100644 --- a/src/autoskillit/execution/session_log.py +++ b/src/autoskillit/execution/session_log.py @@ -22,8 +22,8 @@ from autoskillit.core import atomic_write, claude_code_log_path, get_logger from autoskillit.execution.anomaly_detection import detect_anomalies from autoskillit.execution.linux_tracing import ( - _read_enrollment, read_boot_id, + read_enrollment, read_starttime_ticks, ) @@ -340,7 +340,7 @@ def recover_crashed_sessions(tmpfs_path: str = "/dev/shm", log_dir: str = "") -> # Gate 1: Enrollment sidecar must exist — no sidecar means alien/test file enrollment_path = tmpfs / f"autoskillit_enrollment_{pid}.json" - enrollment = _read_enrollment(enrollment_path) + enrollment = read_enrollment(enrollment_path) if enrollment is None: logger.debug("Skipping %s: no enrollment sidecar", trace_file.name) continue From cb49a005761e94ab83d23242b3b3837c59ae2ea4 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 16:46:21 -0700 Subject: [PATCH 12/14] fix(review): move inspect/os imports to module level and delete frame locals in settings.py The deferred `import inspect` and `import os` inside `__post_init__` executed on every LinuxTracingConfig instantiation (the early-return check comes after them). Moving them to module level surfaces the dependency explicitly and avoids repeated import overhead. Also delete the frame locals (`del frame, init_frame, caller`) after the guard block to prevent reference cycles in non-CPython runtimes. Addresses reviewer comments 3070341619 and 3070341622. --- src/autoskillit/config/settings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/autoskillit/config/settings.py b/src/autoskillit/config/settings.py index dfb39fd53..2041d68a4 100644 --- a/src/autoskillit/config/settings.py +++ b/src/autoskillit/config/settings.py @@ -11,7 +11,9 @@ from __future__ import annotations import dataclasses +import inspect import logging +import os import tempfile from dataclasses import dataclass, field from pathlib import Path @@ -194,9 +196,6 @@ class LinuxTracingConfig: tmpfs_path: str = "/dev/shm" # RAM-backed tmpfs for crash-resilient streaming def __post_init__(self) -> None: - import inspect - import os - if self.tmpfs_path != "/dev/shm" or not os.environ.get("PYTEST_CURRENT_TEST"): return # Only raise when called directly from test code — not from library machinery @@ -213,6 +212,7 @@ def __post_init__(self) -> None: "LinuxTracingConfig(tmpfs_path=str(tmp_path)). " "Use the isolated_tracing_config fixture for new tests." ) + del frame, init_frame, caller @dataclass From 794134a39a865e34ee3617fb33958c9f723c83ee Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 16:47:04 -0700 Subject: [PATCH 13/14] fix(review): add stop() comment and log enrollment write failures in linux_tracing.py 1. Add clarifying comment to stop() explaining why _trace_path is unconditionally unlinked: crash-recovery only reads files left by processes that never called stop(), so clean sessions correctly clean up their own file. 2. Add logger.warning() when the enrollment sidecar write fails with OSError. Previously silently swallowed; the missing sidecar causes Gate 1 to skip recovery for that session with no operator-visible diagnostic. Addresses reviewer comments 3070341625 and 3070341626. --- src/autoskillit/execution/linux_tracing.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/autoskillit/execution/linux_tracing.py b/src/autoskillit/execution/linux_tracing.py index 9d673079c..86aa95b40 100644 --- a/src/autoskillit/execution/linux_tracing.py +++ b/src/autoskillit/execution/linux_tracing.py @@ -31,9 +31,13 @@ import anyio.abc import psutil +from autoskillit.core import get_logger + if TYPE_CHECKING: from autoskillit.config import LinuxTracingConfig +logger = get_logger(__name__) + LINUX_TRACING_AVAILABLE = sys.platform == "linux" @@ -269,6 +273,8 @@ def stop(self) -> list[ProcSnapshot]: pass self._trace_file = None if self._trace_path is not None: + # Intentional: stop() cleans up its own trace file. Crash-recovery only reads + # files left behind by processes that never called stop() — so this is correct. self._trace_path.unlink(missing_ok=True) self._trace_path = None if self._enrollment_path is not None: @@ -321,7 +327,8 @@ def start_linux_tracing( ) _write_enrollment_atomic(enrollment_path, record) handle._enrollment_path = enrollment_path - except OSError: + except OSError as e: + logger.warning("Failed to write enrollment sidecar for pid %d: %s", pid, e) handle._enrollment_path = None async def _run_monitor() -> None: From 52475533ec71bcfd5175db5f374e678d51229f53 Mon Sep 17 00:00:00 2001 From: Trecek Date: Sun, 12 Apr 2026 16:47:48 -0700 Subject: [PATCH 14/14] fix(review): guard proc.kill() with try/finally in two Linux tracing tests In test_start_linux_tracing_writes_enrollment_sidecar and test_stop_unlinks_trace_and_enrollment, handle.stop() was called before proc.kill(). If stop() raises, proc.kill() never executes, leaking the sleep 2 subprocess. Wrap stop() in try/finally to guarantee proc.kill() always runs. Addresses reviewer comments 3070341631 and 3070341632. --- tests/execution/test_linux_tracing.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/execution/test_linux_tracing.py b/tests/execution/test_linux_tracing.py index 86e95431b..46f6dc6b8 100644 --- a/tests/execution/test_linux_tracing.py +++ b/tests/execution/test_linux_tracing.py @@ -373,8 +373,10 @@ async def test_start_linux_tracing_writes_enrollment_sidecar(tmp_path): data = json.loads(enrollment.read_text()) assert data["schema_version"] == 1 assert data["pid"] == proc.pid - handle.stop() - proc.kill() + try: + handle.stop() + finally: + proc.kill() assert not enrollment.exists(), "Enrollment must be deleted by stop()" @@ -440,7 +442,9 @@ async def test_stop_unlinks_trace_and_enrollment(tmp_path): trace = tmp_path / f"autoskillit_trace_{proc.pid}.jsonl" enrollment = tmp_path / f"autoskillit_enrollment_{proc.pid}.json" assert trace.exists() - handle.stop() - proc.kill() + try: + handle.stop() + finally: + proc.kill() assert not trace.exists(), "Trace file must be deleted on clean stop()" assert not enrollment.exists(), "Enrollment sidecar must be deleted on clean stop()"