From 23f5e000e10d19e114658bfd7abbc5ebb5a44831 Mon Sep 17 00:00:00 2001 From: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Date: Thu, 11 Jun 2026 16:07:57 +0200 Subject: [PATCH] ssh: propagate server logs when the bootstrap job fails The bootstrap notebook now tees the SSH server's output and, when the server process exits non-zero, fails the run with the last 2000 bytes of server logs in the exception, which "ssh connect" prints via the existing failed-run path. Also fixes the SIGCHLD subreaper handler silently swallowing the server's exit status, which could report a failed server as a successful run. Co-authored-by: Isaac --- experimental/ssh/FAILURE_MODES.md | 6 ++++ experimental/ssh/internal/client/client.go | 12 ++++--- .../internal/client/client_internal_test.go | 31 +++++++++++++++++++ .../internal/client/ssh-server-bootstrap.py | 24 +++++++++++++- 4 files changed, 68 insertions(+), 5 deletions(-) diff --git a/experimental/ssh/FAILURE_MODES.md b/experimental/ssh/FAILURE_MODES.md index 18249cc0a5d..9a54db2cd7d 100644 --- a/experimental/ssh/FAILURE_MODES.md +++ b/experimental/ssh/FAILURE_MODES.md @@ -132,6 +132,12 @@ live notebook cell stdout / driver logs, not the Jobs run-output API. A failed r [Mode 2](#mode-2-container-cant-run-the-python-bootstrap) does populate the run's state message and error. +When the `databricks ssh server` process itself exits non-zero (a third shape, distinct from +the modes above: e.g. bad arguments or a startup crash), the bootstrap notebook tees the +server's stdout/stderr and fails the run with the last ~2000 bytes of server logs embedded in +the raised exception — `ssh connect` prints that tail via the failed-run path, so the cause is +visible without opening the run page. + ## Reproducing locally, without a workspace The proxy-layer behaviors have unit tests that don't need a cluster: diff --git a/experimental/ssh/internal/client/client.go b/experimental/ssh/internal/client/client.go index 00c1e05d0d0..7bab3b57087 100644 --- a/experimental/ssh/internal/client/client.go +++ b/experimental/ssh/internal/client/client.go @@ -759,10 +759,14 @@ func describeRunFailure(ctx context.Context, client *databricks.WorkspaceClient, outputRunID = sshTask.RunId } if output, err := client.Jobs.GetRunOutput(ctx, jobs.GetRunOutputRequest{RunId: outputRunID}); err == nil && output != nil { - if e := strings.TrimSpace(output.Error); e != "" { - fmt.Fprintf(&b, " %s\n", e) - } - if trace := strings.TrimSpace(output.ErrorTrace); trace != "" { + e := strings.TrimSpace(output.Error) + trace := strings.TrimSpace(output.ErrorTrace) + // Notebook tracebacks end with the same message as Error; skip Error then so the + // server-log tail the bootstrap embeds in the message isn't printed twice. + if e != "" && !strings.Contains(trace, e) { + fmt.Fprintf(&b, " %s\n", truncateTail(e, maxRunFailureTraceBytes)) + } + if trace != "" { fmt.Fprintf(&b, "%s\n", truncateTail(trace, maxRunFailureTraceBytes)) } } diff --git a/experimental/ssh/internal/client/client_internal_test.go b/experimental/ssh/internal/client/client_internal_test.go index 35740d8cee7..4e592a0eab3 100644 --- a/experimental/ssh/internal/client/client_internal_test.go +++ b/experimental/ssh/internal/client/client_internal_test.go @@ -62,6 +62,37 @@ func TestDescribeRunFailureTruncatesLongTrace(t *testing.T) { assert.NotContains(t, out, strings.Repeat("x", maxRunFailureTraceBytes+1)) } +func TestDescribeRunFailureDeduplicatesErrorInTrace(t *testing.T) { + ctx := cmdio.MockDiscard(t.Context()) + m := mocks.NewMockWorkspaceClient(t) + api := m.GetMockJobsAPI() + errMsg := "SSH server exited with code 1. Last server logs:\nLOG_MARKER" + api.EXPECT().GetRun(mock.Anything, jobs.GetRunRequest{RunId: 1}).Return( + terminatedRun(1, 99, "", "https://example.test/run/1"), nil) + api.EXPECT().GetRunOutput(mock.Anything, jobs.GetRunOutputRequest{RunId: 99}).Return( + &jobs.RunOutput{Error: errMsg, ErrorTrace: "Traceback (most recent call last):\n boom\nRuntimeError: " + errMsg}, nil) + + out := describeRunFailure(ctx, m.WorkspaceClient, 1) + assert.Contains(t, out, "Traceback (most recent call last):") + assert.Equal(t, 1, strings.Count(out, "LOG_MARKER")) +} + +func TestDescribeRunFailureTruncatesLongError(t *testing.T) { + ctx := cmdio.MockDiscard(t.Context()) + m := mocks.NewMockWorkspaceClient(t) + api := m.GetMockJobsAPI() + longError := strings.Repeat("x", maxRunFailureTraceBytes+500) + "TAIL_MARKER" + api.EXPECT().GetRun(mock.Anything, jobs.GetRunRequest{RunId: 1}).Return( + terminatedRun(1, 99, "", "https://example.test/run/1"), nil) + api.EXPECT().GetRunOutput(mock.Anything, jobs.GetRunOutputRequest{RunId: 99}).Return( + &jobs.RunOutput{Error: longError}, nil) + + out := describeRunFailure(ctx, m.WorkspaceClient, 1) + assert.Contains(t, out, "...") + assert.Contains(t, out, "TAIL_MARKER") + assert.NotContains(t, out, strings.Repeat("x", maxRunFailureTraceBytes+1)) +} + func TestDescribeRunFailureNoRunID(t *testing.T) { ctx := cmdio.MockDiscard(t.Context()) m := mocks.NewMockWorkspaceClient(t) diff --git a/experimental/ssh/internal/client/ssh-server-bootstrap.py b/experimental/ssh/internal/client/ssh-server-bootstrap.py index 90cec89df16..6d5ac670baf 100644 --- a/experimental/ssh/internal/client/ssh-server-bootstrap.py +++ b/experimental/ssh/internal/client/ssh-server-bootstrap.py @@ -3,6 +3,7 @@ import os import sys import subprocess +import collections import ctypes import ctypes.util import signal @@ -12,6 +13,11 @@ SSH_TUNNEL_BASENAME = "databricks_cli" +# Exit statuses collected by the SIGCHLD subreaper handler, keyed by pid. The handler +# can reap the server subprocess before Popen.wait() does, in which case Popen would +# report exit code 0; this map preserves the real status. +reaped_statuses = {} + dbutils.widgets.text("version", "") dbutils.widgets.text("secretScopeName", "") dbutils.widgets.text("authorizedKeySecretName", "") @@ -38,6 +44,7 @@ def sigchld_handler(signum, frame): # -1 means any child, WNOHANG means don't block pid, status = os.waitpid(-1, os.WNOHANG) if pid > 0: + reaped_statuses[pid] = status print(f"Reaped child {pid} with status {status}") elif pid == 0: print("No child has changed state") @@ -156,8 +163,23 @@ def run_ssh_server(): "--log-file=stdout", ] + # Tee the server output instead of inheriting stdout: the run-page logs remain the only + # place to debug a RUNNING server, but on failure we attach the log tail to the exception + # so "ssh connect" can print it (the Jobs run-output API has no stdout logs for notebook tasks). + tail = collections.deque(maxlen=100) + proc = subprocess.Popen(server_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, errors="replace") try: - subprocess.run(server_args, check=True) + for line in proc.stdout: + # flush so the run-page logs stay live while the server is running + print(line, end="", flush=True) + tail.append(line) + returncode = proc.wait() + # The SIGCHLD subreaper handler may have collected the server first; Popen reports that as 0. + if proc.pid in reaped_statuses: + returncode = os.waitstatus_to_exitcode(reaped_statuses[proc.pid]) + if returncode != 0: + # The tail size matches maxRunFailureTraceBytes, the cap the client prints to the terminal. + raise RuntimeError(f"SSH server exited with code {returncode}. Last server logs:\n" + "".join(tail)[-2000:]) finally: kill_all_children()