diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index f26e1ea1afc..17bd35390ef 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -6,6 +6,7 @@ ### CLI * Show a once-per-day notice after a command when a newer CLI release is available, with a link to the release and the upgrade command for the detected install method. Suppressed for non-interactive/CI runs, JSON output, the Databricks Runtime, and development builds, and can be disabled with `DATABRICKS_CLI_DISABLE_UPDATE_CHECK` ([#5470](https://github.com/databricks/cli/pull/5470)). +* Increase the SSH server startup timeout from 10 to 45 minutes when a GPU accelerator is requested via `databricks ssh connect --accelerator` ([#5569](https://github.com/databricks/cli/pull/5569)). ### Bundles * Remove API enum values and types that are still in development from the `databricks-bundles` Python package; these were never accepted by the backend ([#5484](https://github.com/databricks/cli/pull/5484)). diff --git a/experimental/ssh/cmd/connect.go b/experimental/ssh/cmd/connect.go index 2c1af41c5c8..6daaca6db7a 100644 --- a/experimental/ssh/cmd/connect.go +++ b/experimental/ssh/cmd/connect.go @@ -90,6 +90,12 @@ Connect to a dedicated cluster: if connectionName == "" && clusterID == "" && !proxyMode { connectionName = client.GenerateDefaultConnectionName(wsClient.Config.Host, accelerator) } + // Serverless GPU compute can take much longer to provision than CPU compute, + // so allow extra time for the SSH server job to start. + startupTimeout := taskStartupTimeout + if accelerator != "" { + startupTimeout = gpuTaskStartupTimeout + } opts := client.ClientOptions{ Profile: wsClient.Config.Profile, ClusterID: clusterID, @@ -103,7 +109,7 @@ Connect to a dedicated cluster: HandoverTimeout: handoverTimeout, ReleasesDir: releasesDir, ServerTimeout: max(serverTimeout, shutdownDelay), - TaskStartupTimeout: taskStartupTimeout, + TaskStartupTimeout: startupTimeout, AutoStartCluster: autoStartCluster, ClientPublicKeyName: clientPublicKeyName, ClientPrivateKeyName: clientPrivateKeyName, diff --git a/experimental/ssh/cmd/constants.go b/experimental/ssh/cmd/constants.go index edf95494b75..64c99b5bd48 100644 --- a/experimental/ssh/cmd/constants.go +++ b/experimental/ssh/cmd/constants.go @@ -9,12 +9,13 @@ const ( defaultHandoverTimeout = 30 * time.Minute defaultEnvironmentVersion = 4 - serverTimeout = 24 * time.Hour - taskStartupTimeout = 10 * time.Minute - serverPortRange = 100 - serverConfigDir = ".ssh-tunnel" - serverPrivateKeyName = "server-private-key" - serverPublicKeyName = "server-public-key" - clientPrivateKeyName = "client-private-key" - clientPublicKeyName = "client-public-key" + serverTimeout = 24 * time.Hour + taskStartupTimeout = 10 * time.Minute + gpuTaskStartupTimeout = 45 * time.Minute + serverPortRange = 100 + serverConfigDir = ".ssh-tunnel" + serverPrivateKeyName = "server-private-key" + serverPublicKeyName = "server-public-key" + clientPrivateKeyName = "client-private-key" + clientPublicKeyName = "client-public-key" )