From 23f5e000e10d19e114658bfd7abbc5ebb5a44831 Mon Sep 17 00:00:00 2001
From: Anton Nekipelov <226657+anton-107@users.noreply.github.com>
Date: Thu, 11 Jun 2026 16:07:57 +0200
Subject: [PATCH] ssh: propagate server logs when the bootstrap job fails

The bootstrap notebook now tees the SSH server's output and, when the
server process exits non-zero, fails the run with the last 2000 bytes of
server logs in the exception, which "ssh connect" prints via the existing
failed-run path. Also fixes the SIGCHLD subreaper handler silently
swallowing the server's exit status, which could report a failed server
as a successful run.

Co-authored-by: Isaac
---
 experimental/ssh/FAILURE_MODES.md             |  6 ++++
 experimental/ssh/internal/client/client.go    | 12 ++++---
 .../internal/client/client_internal_test.go   | 31 +++++++++++++++++++
 .../internal/client/ssh-server-bootstrap.py   | 24 +++++++++++++-
 4 files changed, 68 insertions(+), 5 deletions(-)

diff --git a/experimental/ssh/FAILURE_MODES.md b/experimental/ssh/FAILURE_MODES.md
index 18249cc0a5d..9a54db2cd7d 100644
--- a/experimental/ssh/FAILURE_MODES.md
+++ b/experimental/ssh/FAILURE_MODES.md
@@ -132,6 +132,12 @@ live notebook cell stdout / driver logs, not the Jobs run-output API. A failed r
 [Mode 2](#mode-2-container-cant-run-the-python-bootstrap) does populate the run's state message
 and error.
 
+When the `databricks ssh server` process itself exits non-zero (a third shape, distinct from
+the modes above: e.g. bad arguments or a startup crash), the bootstrap notebook tees the
+server's stdout/stderr and fails the run with the last ~2000 bytes of server logs embedded in
+the raised exception — `ssh connect` prints that tail via the failed-run path, so the cause is
+visible without opening the run page.
+
 ## Reproducing locally, without a workspace
 
 The proxy-layer behaviors have unit tests that don't need a cluster:
diff --git a/experimental/ssh/internal/client/client.go b/experimental/ssh/internal/client/client.go
index 00c1e05d0d0..7bab3b57087 100644
--- a/experimental/ssh/internal/client/client.go
+++ b/experimental/ssh/internal/client/client.go
@@ -759,10 +759,14 @@ func describeRunFailure(ctx context.Context, client *databricks.WorkspaceClient,
 		outputRunID = sshTask.RunId
 	}
 	if output, err := client.Jobs.GetRunOutput(ctx, jobs.GetRunOutputRequest{RunId: outputRunID}); err == nil && output != nil {
-		if e := strings.TrimSpace(output.Error); e != "" {
-			fmt.Fprintf(&b, "  %s\n", e)
-		}
-		if trace := strings.TrimSpace(output.ErrorTrace); trace != "" {
+		e := strings.TrimSpace(output.Error)
+		trace := strings.TrimSpace(output.ErrorTrace)
+		// Notebook tracebacks end with the same message as Error; skip Error then so the
+		// server-log tail the bootstrap embeds in the message isn't printed twice.
+		if e != "" && !strings.Contains(trace, e) {
+			fmt.Fprintf(&b, "  %s\n", truncateTail(e, maxRunFailureTraceBytes))
+		}
+		if trace != "" {
 			fmt.Fprintf(&b, "%s\n", truncateTail(trace, maxRunFailureTraceBytes))
 		}
 	}
diff --git a/experimental/ssh/internal/client/client_internal_test.go b/experimental/ssh/internal/client/client_internal_test.go
index 35740d8cee7..4e592a0eab3 100644
--- a/experimental/ssh/internal/client/client_internal_test.go
+++ b/experimental/ssh/internal/client/client_internal_test.go
@@ -62,6 +62,37 @@ func TestDescribeRunFailureTruncatesLongTrace(t *testing.T) {
 	assert.NotContains(t, out, strings.Repeat("x", maxRunFailureTraceBytes+1))
 }
 
+func TestDescribeRunFailureDeduplicatesErrorInTrace(t *testing.T) {
+	ctx := cmdio.MockDiscard(t.Context())
+	m := mocks.NewMockWorkspaceClient(t)
+	api := m.GetMockJobsAPI()
+	errMsg := "SSH server exited with code 1. Last server logs:\nLOG_MARKER"
+	api.EXPECT().GetRun(mock.Anything, jobs.GetRunRequest{RunId: 1}).Return(
+		terminatedRun(1, 99, "", "https://example.test/run/1"), nil)
+	api.EXPECT().GetRunOutput(mock.Anything, jobs.GetRunOutputRequest{RunId: 99}).Return(
+		&jobs.RunOutput{Error: errMsg, ErrorTrace: "Traceback (most recent call last):\n  boom\nRuntimeError: " + errMsg}, nil)
+
+	out := describeRunFailure(ctx, m.WorkspaceClient, 1)
+	assert.Contains(t, out, "Traceback (most recent call last):")
+	assert.Equal(t, 1, strings.Count(out, "LOG_MARKER"))
+}
+
+func TestDescribeRunFailureTruncatesLongError(t *testing.T) {
+	ctx := cmdio.MockDiscard(t.Context())
+	m := mocks.NewMockWorkspaceClient(t)
+	api := m.GetMockJobsAPI()
+	longError := strings.Repeat("x", maxRunFailureTraceBytes+500) + "TAIL_MARKER"
+	api.EXPECT().GetRun(mock.Anything, jobs.GetRunRequest{RunId: 1}).Return(
+		terminatedRun(1, 99, "", "https://example.test/run/1"), nil)
+	api.EXPECT().GetRunOutput(mock.Anything, jobs.GetRunOutputRequest{RunId: 99}).Return(
+		&jobs.RunOutput{Error: longError}, nil)
+
+	out := describeRunFailure(ctx, m.WorkspaceClient, 1)
+	assert.Contains(t, out, "...")
+	assert.Contains(t, out, "TAIL_MARKER")
+	assert.NotContains(t, out, strings.Repeat("x", maxRunFailureTraceBytes+1))
+}
+
 func TestDescribeRunFailureNoRunID(t *testing.T) {
 	ctx := cmdio.MockDiscard(t.Context())
 	m := mocks.NewMockWorkspaceClient(t)
diff --git a/experimental/ssh/internal/client/ssh-server-bootstrap.py b/experimental/ssh/internal/client/ssh-server-bootstrap.py
index 90cec89df16..6d5ac670baf 100644
--- a/experimental/ssh/internal/client/ssh-server-bootstrap.py
+++ b/experimental/ssh/internal/client/ssh-server-bootstrap.py
@@ -3,6 +3,7 @@
 import os
 import sys
 import subprocess
+import collections
 import ctypes
 import ctypes.util
 import signal
@@ -12,6 +13,11 @@
 
 SSH_TUNNEL_BASENAME = "databricks_cli"
 
+# Exit statuses collected by the SIGCHLD subreaper handler, keyed by pid. The handler
+# can reap the server subprocess before Popen.wait() does, in which case Popen would
+# report exit code 0; this map preserves the real status.
+reaped_statuses = {}
+
 dbutils.widgets.text("version", "")
 dbutils.widgets.text("secretScopeName", "")
 dbutils.widgets.text("authorizedKeySecretName", "")
@@ -38,6 +44,7 @@ def sigchld_handler(signum, frame):
                 # -1 means any child, WNOHANG means don't block
                 pid, status = os.waitpid(-1, os.WNOHANG)
                 if pid > 0:
+                    reaped_statuses[pid] = status
                     print(f"Reaped child {pid} with status {status}")
                 elif pid == 0:
                     print("No child has changed state")
@@ -156,8 +163,23 @@ def run_ssh_server():
         "--log-file=stdout",
     ]
 
+    # Tee the server output instead of inheriting stdout: the run-page logs remain the only
+    # place to debug a RUNNING server, but on failure we attach the log tail to the exception
+    # so "ssh connect" can print it (the Jobs run-output API has no stdout logs for notebook tasks).
+    tail = collections.deque(maxlen=100)
+    proc = subprocess.Popen(server_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, errors="replace")
     try:
-        subprocess.run(server_args, check=True)
+        for line in proc.stdout:
+            # flush so the run-page logs stay live while the server is running
+            print(line, end="", flush=True)
+            tail.append(line)
+        returncode = proc.wait()
+        # The SIGCHLD subreaper handler may have collected the server first; Popen reports that as 0.
+        if proc.pid in reaped_statuses:
+            returncode = os.waitstatus_to_exitcode(reaped_statuses[proc.pid])
+        if returncode != 0:
+            # The tail size matches maxRunFailureTraceBytes, the cap the client prints to the terminal.
+            raise RuntimeError(f"SSH server exited with code {returncode}. Last server logs:\n" + "".join(tail)[-2000:])
     finally:
         kill_all_children()