From da0eef1f7fecf90a0710dfbf909137d3d05bf6b8 Mon Sep 17 00:00:00 2001 From: Islam Sharabash Date: Tue, 21 Apr 2026 21:37:04 +0000 Subject: [PATCH] fix: release shells mutex before spawn_subshell to prevent deadlock select_shell_desc() previously held the global 'shells' mutex while calling spawn_subshell(), which calls wait_for_startup(). If wait_for_startup blocks (e.g. due to a version mismatch between client and daemon, or a shell that fails to start), the mutex is held forever, deadlocking the entire daemon. All operations (list, attach, detach, kill) that need the shells mutex will hang indefinitely. Fix: Refactor select_shell_desc() into three phases: 1. Hold the lock to determine whether to create/reuse a session 2. Release the lock, then call spawn_subshell (may block) 3. Re-acquire the lock briefly to insert the session and return refs Also adds a 30-second timeout to wait_for_startup() as defense-in-depth, and a regression test that proves list does not deadlock during a slow shell spawn. --- libshpool/src/daemon/prompt.rs | 5 + libshpool/src/daemon/server.rs | 211 +++++++++++++--------- shpool/tests/data/hanging_shell.sh | 5 + shpool/tests/data/slow_shell_startup.toml | 13 ++ shpool/tests/deadlock.rs | 141 +++++++++++++++ 5 files changed, 290 insertions(+), 85 deletions(-) create mode 100755 shpool/tests/data/hanging_shell.sh create mode 100644 shpool/tests/data/slow_shell_startup.toml create mode 100644 shpool/tests/deadlock.rs diff --git a/libshpool/src/daemon/prompt.rs b/libshpool/src/daemon/prompt.rs index cfd208a6..ae2a7e36 100644 --- a/libshpool/src/daemon/prompt.rs +++ b/libshpool/src/daemon/prompt.rs @@ -129,8 +129,13 @@ fn wait_for_startup(pty_master: &mut shpool_pty::fork::Master) -> anyhow::Result .write_all(startup_sentinel_cmd.as_bytes()) .context("running startup sentinel script")?; + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(30); let mut buf: [u8; 2048] = [0; 2048]; loop { + if std::time::Instant::now() > deadline { + warn!("timed out waiting for startup sentinel after 30s"); + return Err(anyhow!("timed out waiting for startup sentinel")); + } let len = pty_master.read(&mut buf).context("reading chunk to scan for startup")?; if len == 0 { continue; diff --git a/libshpool/src/daemon/server.rs b/libshpool/src/daemon/server.rs index 09e5225c..3b21b171 100644 --- a/libshpool/src/daemon/server.rs +++ b/libshpool/src/daemon/server.rs @@ -363,89 +363,122 @@ impl Server { )> { let warnings = vec![]; - // we unwrap to propagate the poison as an unwind - let _s = span!(Level::INFO, "1_lock(shells)").entered(); - let mut shells = self.shells.lock().unwrap(); - - let mut status = AttachStatus::Attached { warnings: warnings.clone() }; - if let Some(session) = shells.get(&header.name) { - info!("found entry for '{}'", header.name); - if let Ok(mut inner) = session.inner.try_lock() { - let _s = - span!(Level::INFO, "aquired_lock(session.inner)", s = header.name).entered(); - // We have an existing session in our table, but the subshell - // proc might have exited in the meantime, for example if the - // user typed `exit` right before the connection dropped there - // could be a zombie entry in our session table. We need to - // re-check whether the subshell has exited before taking this over. - // - // N.B. this is still technically a race, but in practice it does - // not ever cause problems, and there is no real way to avoid some - // sort of race without just always creating a new session when - // a shell exits, which would break `exit` typed at the shell prompt. - match session.child_exit_notifier.wait(Some(time::Duration::from_millis(0))) { - None => { - // the channel is still open so the subshell is still running - info!("taking over existing session inner"); - inner.client_stream = Some(stream.try_clone()?); - session.lifecycle_timestamps.lock().unwrap().last_connected_at = - Some(time::SystemTime::now()); + // Phase 1: Determine what to do while holding the lock. + // We must NOT call spawn_subshell while holding this lock + // because spawn_subshell may call wait_for_startup which + // blocks indefinitely reading from the PTY. If it blocks, + // the global shells mutex is held forever, deadlocking + // the entire daemon (list, attach, detach, kill all hang). + let needs_new_shell = { + let _s = span!(Level::INFO, "1_lock(shells)").entered(); + let shells = self.shells.lock().unwrap(); + + let mut status = AttachStatus::Attached { warnings: warnings.clone() }; + if let Some(session) = shells.get(&header.name) { + info!("found entry for '{}'", header.name); + if let Ok(mut inner) = session.inner.try_lock() { + let _s = + span!(Level::INFO, "aquired_lock(session.inner)", s = header.name) + .entered(); + // We have an existing session in our table, but the subshell + // proc might have exited in the meantime, for example if the + // user typed `exit` right before the connection dropped there + // could be a zombie entry in our session table. We need to + // re-check whether the subshell has exited before taking this over. + // + // N.B. this is still technically a race, but in practice it does + // not ever cause problems, and there is no real way to avoid some + // sort of race without just always creating a new session when + // a shell exits, which would break `exit` typed at the shell prompt. + match session.child_exit_notifier.wait(Some(time::Duration::from_millis(0))) { + None => { + // the channel is still open so the subshell is still running + info!("taking over existing session inner"); + inner.client_stream = Some(stream.try_clone()?); + session.lifecycle_timestamps.lock().unwrap().last_connected_at = + Some(time::SystemTime::now()); + + if inner + .shell_to_client_join_h + .as_ref() + .map(|h| h.is_finished()) + .unwrap_or(false) + { + warn!( + "child_exited chan unclosed, but shell->client thread has exited, clobbering with new subshell" + ); + status = AttachStatus::Created { warnings: warnings.clone() }; + } - if inner - .shell_to_client_join_h - .as_ref() - .map(|h| h.is_finished()) - .unwrap_or(false) - { - warn!( - "child_exited chan unclosed, but shell->client thread has exited, clobbering with new subshell" + // status is already attached + } + Some(exit_status) => { + // the channel is closed so we know the subshell exited + info!( + "stale inner, (child exited with status {}) clobbering with new subshell", + exit_status ); - status = AttachStatus::Created { warnings }; + status = AttachStatus::Created { warnings: warnings.clone() }; } - - // status is already attached } - Some(exit_status) => { - // the channel is closed so we know the subshell exited - info!( - "stale inner, (child exited with status {}) clobbering with new subshell", - exit_status - ); - status = AttachStatus::Created { warnings }; + + if inner + .shell_to_client_join_h + .as_ref() + .map(|h| h.is_finished()) + .unwrap_or(false) + { + info!("shell->client thread finished, joining"); + if let Some(h) = inner.shell_to_client_join_h.take() { + h.join() + .map_err(|e| { + anyhow!("joining shell->client on reattach: {:?}", e) + })? + .context("within shell->client thread on reattach")?; + } + assert!(matches!(status, AttachStatus::Created { .. })); } - } - if inner.shell_to_client_join_h.as_ref().map(|h| h.is_finished()).unwrap_or(false) { - info!("shell->client thread finished, joining"); - if let Some(h) = inner.shell_to_client_join_h.take() { - h.join() - .map_err(|e| anyhow!("joining shell->client on reattach: {:?}", e))? - .context("within shell->client thread on reattach")?; + // fallthrough to bidi streaming + } else { + info!("busy shell session, doing nothing"); + // The stream is busy, so we just inform the client and close the stream. + write_reply( + &mut stream, + AttachReplyHeader { status: AttachStatus::Busy }, + )?; + stream.shutdown(net::Shutdown::Both).context("closing stream")?; + if let Err(err) = self.hooks.on_busy(&header.name) { + warn!("busy hook: {:?}", err); } - assert!(matches!(status, AttachStatus::Created { .. })); + return Err(ShellSelectionError::BusyShellSession)?; } + } else { + info!("no existing '{}' session, creating new one", &header.name); + status = AttachStatus::Created { warnings: warnings.clone() }; + } - // fallthrough to bidi streaming + if matches!(status, AttachStatus::Created { .. }) { + // Signal that we need to spawn a new shell, but do NOT + // do it here — we'll do it after releasing the lock. + if let Err(err) = self.hooks.on_new_session(&header.name) { + warn!("new_session hook: {:?}", err); + } + true } else { - info!("busy shell session, doing nothing"); - // The stream is busy, so we just inform the client and close the stream. - write_reply(&mut stream, AttachReplyHeader { status: AttachStatus::Busy })?; - stream.shutdown(net::Shutdown::Both).context("closing stream")?; - if let Err(err) = self.hooks.on_busy(&header.name) { - warn!("busy hook: {:?}", err); + if let Err(err) = self.hooks.on_reattach(&header.name) { + warn!("reattach hook: {:?}", err); } - return Err(ShellSelectionError::BusyShellSession)?; + false } - } else { - info!("no existing '{}' session, creating new one", &header.name); - status = AttachStatus::Created { warnings }; - } + // shells lock is dropped here + }; - if matches!(status, AttachStatus::Created { .. }) { - info!("creating new subshell"); - if let Err(err) = self.hooks.on_new_session(&header.name) { - warn!("new_session hook: {:?}", err); - } + // Phase 2: Spawn the subshell WITHOUT the shells lock held. + // This is critical because spawn_subshell -> maybe_inject_prefix + // -> wait_for_startup can block indefinitely. + if needs_new_shell { + info!("creating new subshell (lock released)"); let motd = self.config.get().motd.clone().unwrap_or_default(); let session = self.spawn_subshell( conn_id, @@ -458,24 +491,32 @@ impl Server { session.lifecycle_timestamps.lock().unwrap().last_connected_at = Some(time::SystemTime::now()); + + // Re-acquire the lock to insert the new session + let _s = span!(Level::INFO, "2_lock(shells)").entered(); + let mut shells = self.shells.lock().unwrap(); shells.insert(header.name.clone(), Box::new(session)); - // fallthrough to bidi streaming - } else if let Err(err) = self.hooks.on_reattach(&header.name) { - warn!("reattach hook: {:?}", err); } - // return a reference to the inner session so that - // we can work with it without the global session - // table lock held - if let Some(session) = shells.get(&header.name) { - Ok(( - Some(Arc::clone(&session.child_exit_notifier)), - Some(Arc::clone(&session.inner)), - Some(Arc::clone(&session.pager_ctl)), - status, - )) - } else { - Ok((None, None, None, status)) + // Phase 3: Return references to the session. + // Re-acquire the lock (briefly) to get Arc references. + { + let _s = span!(Level::INFO, "3_lock(shells)").entered(); + let shells = self.shells.lock().unwrap(); + if let Some(session) = shells.get(&header.name) { + Ok(( + Some(Arc::clone(&session.child_exit_notifier)), + Some(Arc::clone(&session.inner)), + Some(Arc::clone(&session.pager_ctl)), + if needs_new_shell { + AttachStatus::Created { warnings: vec![] } + } else { + AttachStatus::Attached { warnings: vec![] } + }, + )) + } else { + Ok((None, None, None, AttachStatus::Created { warnings: vec![] })) + } } } diff --git a/shpool/tests/data/hanging_shell.sh b/shpool/tests/data/hanging_shell.sh new file mode 100755 index 00000000..cea75f5d --- /dev/null +++ b/shpool/tests/data/hanging_shell.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# A fake shell that reads stdin but never executes anything. +# This simulates a shell that is slow to start up, which causes +# wait_for_startup to block forever. +sleep 3600 diff --git a/shpool/tests/data/slow_shell_startup.toml b/shpool/tests/data/slow_shell_startup.toml new file mode 100644 index 00000000..24f59a2e --- /dev/null +++ b/shpool/tests/data/slow_shell_startup.toml @@ -0,0 +1,13 @@ +norc = true +noecho = true +# Use cat as the shell - it never produces the startup sentinel, +# simulating a slow/hanging shell startup that triggers the deadlock. +shell = "/bin/cat" +session_restore_mode = "simple" +# Non-empty prompt_prefix enables sentinel injection, which triggers +# wait_for_startup in spawn_subshell. +prompt_prefix = "test> " + +[env] +PS1 = "prompt> " +TERM = "" diff --git a/shpool/tests/deadlock.rs b/shpool/tests/deadlock.rs new file mode 100644 index 00000000..841d42ce --- /dev/null +++ b/shpool/tests/deadlock.rs @@ -0,0 +1,141 @@ +use std::{ + io::Write, + os::unix::fs::PermissionsExt, + process::Command, + sync::mpsc, + time::Duration, +}; + +use anyhow::Context; +use ntest::timeout; + +mod support; + +use crate::support::daemon::DaemonArgs; + +/// Regression test for a deadlock where the global `shells` mutex is held +/// during `spawn_subshell` -> `wait_for_startup`. If `wait_for_startup` +/// blocks (e.g. the shell never produces the startup sentinel), the mutex +/// is held forever, blocking ALL daemon operations (list, attach, detach, +/// kill) that need it. +/// +/// This test: +/// 1. Creates a "shell" that just sleeps (never produces the sentinel) +/// 2. Uses a config with non-empty prompt_prefix (triggers sentinel injection) +/// 3. Spawns an attach (which hangs in wait_for_startup, holding the mutex) +/// 4. Tries `list` — on buggy code this deadlocks; on fixed code it returns +#[test] +#[timeout(20000)] +fn list_not_blocked_by_slow_shell_spawn() -> anyhow::Result<()> { + // Create a "shell" that never produces the startup sentinel. + let tmp_dir = tempfile::tempdir().context("creating temp dir")?; + + let hanging_shell = tmp_dir.path().join("hanging_shell.sh"); + { + let mut f = std::fs::File::create(&hanging_shell)?; + writeln!(f, "#!/bin/bash")?; + writeln!(f, "# A shell that never starts up properly.")?; + writeln!(f, "# It reads stdin to avoid SIGPIPE but never executes commands.")?; + writeln!(f, "exec cat > /dev/null")?; + } + std::fs::set_permissions(&hanging_shell, std::fs::Permissions::from_mode(0o755))?; + + // Create config that uses the hanging shell with a non-empty prompt_prefix. + // The non-empty prompt_prefix is key: it causes supports_sentinels=true, + // which triggers the call to wait_for_startup inside spawn_subshell. + let config_path = tmp_dir.path().join("config.toml"); + std::fs::write( + &config_path, + format!( + r#" +norc = true +noecho = true +shell = "{}" +session_restore_mode = "simple" +prompt_prefix = "test> " + +[env] +PS1 = "prompt> " +TERM = "" +"#, + hanging_shell.display() + ), + )?; + + // Start the daemon with this config (no test_hook events needed). + let mut daemon_proc = support::daemon::Proc::new( + &config_path, + DaemonArgs { listen_events: false, ..DaemonArgs::default() }, + ) + .context("starting daemon proc")?; + + // Spawn an attach in the background. + // This will enter select_shell_desc, lock(shells), call spawn_subshell, + // and hang in wait_for_startup — WITH THE MUTEX HELD on buggy code. + let socket_for_attach = daemon_proc.socket_path.clone(); + let bin = support::shpool_bin()?; + let mut attach_child = Command::new(&bin) + .arg("-vv") + .arg("--socket") + .arg(&socket_for_attach) + .arg("--config-file") + .arg(&config_path) + .arg("--no-daemonize") + .arg("attach") + .arg("deadlock-test-session") + .stdin(std::process::Stdio::null()) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .spawn() + .context("spawning attach proc")?; + + // Give the attach time to reach wait_for_startup and grab the mutex. + std::thread::sleep(Duration::from_secs(3)); + + // Now try `list` in a background thread with a timeout. + // On BUGGY code: list blocks forever (deadlock on shells mutex). + // On FIXED code: list returns immediately. + let (tx, rx) = mpsc::channel(); + let socket_for_list = daemon_proc.socket_path.clone(); + let bin_for_list = support::shpool_bin()?; + std::thread::spawn(move || { + let result = Command::new(&bin_for_list) + .arg("-vv") + .arg("--socket") + .arg(&socket_for_list) + .arg("--no-daemonize") + .arg("list") + .output(); + let _ = tx.send(result); + }); + + // Wait for list to complete, with a 5-second timeout. + let list_result = rx.recv_timeout(Duration::from_secs(5)); + + // Clean up: kill the hanging attach process. + let _ = attach_child.kill(); + let _ = attach_child.wait(); + + match list_result { + Ok(Ok(output)) => { + assert!( + output.status.success(), + "list should succeed, stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("NAME"), "list output should contain headers"); + Ok(()) + } + Ok(Err(e)) => { + panic!("list command failed to execute: {:?}", e); + } + Err(_) => { + panic!( + "DEADLOCK DETECTED: `shpool list` did not complete within 5 seconds. \ + The shells mutex is being held by spawn_subshell/wait_for_startup, \ + blocking all other daemon operations." + ); + } + } +}