Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions experimental/ssh/FAILURE_MODES.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,12 @@ live notebook cell stdout / driver logs, not the Jobs run-output API. A failed r
[Mode 2](#mode-2-container-cant-run-the-python-bootstrap) does populate the run's state message
and error.

When the `databricks ssh server` process itself exits non-zero (a third shape, distinct from
the modes above: e.g. bad arguments or a startup crash), the bootstrap notebook tees the
server's stdout/stderr and fails the run with the last ~2000 bytes of server logs embedded in
the raised exception — `ssh connect` prints that tail via the failed-run path, so the cause is
visible without opening the run page.

## Reproducing locally, without a workspace

The proxy-layer behaviors have unit tests that don't need a cluster:
Expand Down
12 changes: 8 additions & 4 deletions experimental/ssh/internal/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -759,10 +759,14 @@ func describeRunFailure(ctx context.Context, client *databricks.WorkspaceClient,
outputRunID = sshTask.RunId
}
if output, err := client.Jobs.GetRunOutput(ctx, jobs.GetRunOutputRequest{RunId: outputRunID}); err == nil && output != nil {
if e := strings.TrimSpace(output.Error); e != "" {
fmt.Fprintf(&b, " %s\n", e)
}
if trace := strings.TrimSpace(output.ErrorTrace); trace != "" {
e := strings.TrimSpace(output.Error)
trace := strings.TrimSpace(output.ErrorTrace)
// Notebook tracebacks end with the same message as Error; skip Error then so the
// server-log tail the bootstrap embeds in the message isn't printed twice.
if e != "" && !strings.Contains(trace, e) {
fmt.Fprintf(&b, " %s\n", truncateTail(e, maxRunFailureTraceBytes))
}
if trace != "" {
fmt.Fprintf(&b, "%s\n", truncateTail(trace, maxRunFailureTraceBytes))
}
}
Expand Down
31 changes: 31 additions & 0 deletions experimental/ssh/internal/client/client_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,37 @@ func TestDescribeRunFailureTruncatesLongTrace(t *testing.T) {
assert.NotContains(t, out, strings.Repeat("x", maxRunFailureTraceBytes+1))
}

func TestDescribeRunFailureDeduplicatesErrorInTrace(t *testing.T) {
ctx := cmdio.MockDiscard(t.Context())
m := mocks.NewMockWorkspaceClient(t)
api := m.GetMockJobsAPI()
errMsg := "SSH server exited with code 1. Last server logs:\nLOG_MARKER"
api.EXPECT().GetRun(mock.Anything, jobs.GetRunRequest{RunId: 1}).Return(
terminatedRun(1, 99, "", "https://example.test/run/1"), nil)
api.EXPECT().GetRunOutput(mock.Anything, jobs.GetRunOutputRequest{RunId: 99}).Return(
&jobs.RunOutput{Error: errMsg, ErrorTrace: "Traceback (most recent call last):\n boom\nRuntimeError: " + errMsg}, nil)

out := describeRunFailure(ctx, m.WorkspaceClient, 1)
assert.Contains(t, out, "Traceback (most recent call last):")
assert.Equal(t, 1, strings.Count(out, "LOG_MARKER"))
}

func TestDescribeRunFailureTruncatesLongError(t *testing.T) {
ctx := cmdio.MockDiscard(t.Context())
m := mocks.NewMockWorkspaceClient(t)
api := m.GetMockJobsAPI()
longError := strings.Repeat("x", maxRunFailureTraceBytes+500) + "TAIL_MARKER"
api.EXPECT().GetRun(mock.Anything, jobs.GetRunRequest{RunId: 1}).Return(
terminatedRun(1, 99, "", "https://example.test/run/1"), nil)
api.EXPECT().GetRunOutput(mock.Anything, jobs.GetRunOutputRequest{RunId: 99}).Return(
&jobs.RunOutput{Error: longError}, nil)

out := describeRunFailure(ctx, m.WorkspaceClient, 1)
assert.Contains(t, out, "...")
assert.Contains(t, out, "TAIL_MARKER")
assert.NotContains(t, out, strings.Repeat("x", maxRunFailureTraceBytes+1))
}

func TestDescribeRunFailureNoRunID(t *testing.T) {
ctx := cmdio.MockDiscard(t.Context())
m := mocks.NewMockWorkspaceClient(t)
Expand Down
24 changes: 23 additions & 1 deletion experimental/ssh/internal/client/ssh-server-bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import sys
import subprocess
import collections
import ctypes
import ctypes.util
import signal
Expand All @@ -12,6 +13,11 @@

SSH_TUNNEL_BASENAME = "databricks_cli"

# Exit statuses collected by the SIGCHLD subreaper handler, keyed by pid. The handler
# can reap the server subprocess before Popen.wait() does, in which case Popen would
# report exit code 0; this map preserves the real status.
reaped_statuses = {}

dbutils.widgets.text("version", "")
dbutils.widgets.text("secretScopeName", "")
dbutils.widgets.text("authorizedKeySecretName", "")
Expand All @@ -38,6 +44,7 @@ def sigchld_handler(signum, frame):
# -1 means any child, WNOHANG means don't block
pid, status = os.waitpid(-1, os.WNOHANG)
if pid > 0:
reaped_statuses[pid] = status
print(f"Reaped child {pid} with status {status}")
elif pid == 0:
print("No child has changed state")
Expand Down Expand Up @@ -156,8 +163,23 @@ def run_ssh_server():
"--log-file=stdout",
]

# Tee the server output instead of inheriting stdout: the run-page logs remain the only
# place to debug a RUNNING server, but on failure we attach the log tail to the exception
# so "ssh connect" can print it (the Jobs run-output API has no stdout logs for notebook tasks).
tail = collections.deque(maxlen=100)
proc = subprocess.Popen(server_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, errors="replace")
try:
subprocess.run(server_args, check=True)
for line in proc.stdout:
# flush so the run-page logs stay live while the server is running
print(line, end="", flush=True)
tail.append(line)
returncode = proc.wait()
# The SIGCHLD subreaper handler may have collected the server first; Popen reports that as 0.
if proc.pid in reaped_statuses:
returncode = os.waitstatus_to_exitcode(reaped_statuses[proc.pid])
if returncode != 0:
# The tail size matches maxRunFailureTraceBytes, the cap the client prints to the terminal.
raise RuntimeError(f"SSH server exited with code {returncode}. Last server logs:\n" + "".join(tail)[-2000:])
finally:
kill_all_children()

Expand Down
Loading