From 790adb801f98755e2aed7365b5fc18bc31748a8d Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Thu, 21 May 2026 15:09:20 -0500 Subject: [PATCH] fix(packaging): add upgrade migration docs and podman socket retry After #1415 ships, users upgrading from previous releases need guidance on the gateway.env deprecation, port/bind/database path changes, and the podman.socket restart requirement. - docs(rpm): add 'Migrating from gateway.env' section to TROUBLESHOOTING covering backward compatibility, env-to-TOML key mapping, and three breaking changes (default port 8080->17670, bind address 0.0.0.0->127.0.0.1, database path move). Add podman.socket restart step to upgrade procedure. - docs(rpm): add upgrade callout to CONFIGURATION.md pointing at migration section. - fix(podman): retry PodmanComputeDriver ping up to 5 times with 2s delay to tolerate transient socket unavailability after package upgrades. The systemd unit uses Wants=podman.socket (not Requires) so the gateway can start while the socket is briefly re-activating after an RPM upgrade changes its unit file on disk. - chore(rpm): update EnvironmentFile comment in RPM spec to explain backward-compatibility intent. 
Signed-off-by: Adam Miller --- crates/openshell-driver-podman/src/driver.rs | 27 +++++++- deploy/rpm/CONFIGURATION.md | 5 ++ deploy/rpm/TROUBLESHOOTING.md | 69 ++++++++++++++++++++ openshell.spec | 4 +- 4 files changed, 102 insertions(+), 3 deletions(-) diff --git a/crates/openshell-driver-podman/src/driver.rs b/crates/openshell-driver-podman/src/driver.rs index 3d88e12ac..394881690 100644 --- a/crates/openshell-driver-podman/src/driver.rs +++ b/crates/openshell-driver-podman/src/driver.rs @@ -11,6 +11,7 @@ use crate::watcher::{ }; use openshell_core::ComputeDriverError; use openshell_core::proto::compute::v1::{DriverSandbox, GetCapabilitiesResponse}; +use std::time::Duration; use tracing::{info, warn}; impl From for ComputeDriverError { @@ -57,6 +58,9 @@ fn validated_container_name(sandbox_name: &str) -> Result Result { + const MAX_PING_RETRIES: u32 = 5; + const PING_RETRY_DELAY: Duration = Duration::from_secs(2); + if !config.socket_path.exists() { if cfg!(target_os = "macos") { warn!( @@ -80,8 +84,27 @@ impl PodmanComputeDriver { let client = PodmanClient::new(config.socket_path.clone()); - // Verify connectivity. - client.ping().await?; + // Verify connectivity, retrying briefly to tolerate transient socket + // unavailability (e.g. podman.socket restarting after a package + // upgrade). The systemd unit uses Wants=podman.socket (not Requires), + // so the gateway may start while the socket is briefly re-activating. + let mut attempts = 0; + loop { + match client.ping().await { + Ok(()) => break, + Err(e) if attempts < MAX_PING_RETRIES => { + attempts += 1; + warn!( + attempt = attempts, + max_retries = MAX_PING_RETRIES, + error = %e, + "Podman socket not ready, retrying" + ); + tokio::time::sleep(PING_RETRY_DELAY).await; + } + Err(e) => return Err(e), + } + } // Verify cgroups v2, detect rootless mode, and log system info. 
match client.system_info().await { diff --git a/deploy/rpm/CONFIGURATION.md b/deploy/rpm/CONFIGURATION.md index 8a7edca8c..f48b7d158 100644 --- a/deploy/rpm/CONFIGURATION.md +++ b/deploy/rpm/CONFIGURATION.md @@ -195,6 +195,11 @@ configuration is required. ## Configuration reference +> **Upgrading from a previous release?** See the +> ["Migrating from gateway.env"](TROUBLESHOOTING.md#migrating-from-gatewayenv) +> section in TROUBLESHOOTING.md for the env-to-TOML mapping and notes on +> the default port, bind address, and database path changes. + Gateway and driver settings have local runtime defaults. The gateway reads `~/.config/openshell/gateway.toml` when that file exists. Set `OPENSHELL_GATEWAY_CONFIG` in the launch environment to use a different file. diff --git a/deploy/rpm/TROUBLESHOOTING.md b/deploy/rpm/TROUBLESHOOTING.md index c46e0ba9f..68a1f4946 100644 --- a/deploy/rpm/TROUBLESHOOTING.md +++ b/deploy/rpm/TROUBLESHOOTING.md @@ -214,12 +214,19 @@ After upgrading the RPM packages: ```shell sudo dnf update openshell openshell-gateway +systemctl --user restart podman.socket systemctl --user restart openshell-gateway ``` The SQLite database schema is auto-migrated on startup. Running sandboxes are stopped during the restart. +Restarting `podman.socket` after a package upgrade is recommended: if the +unit file changed on disk during the upgrade, the running socket may become +non-functional until restarted, causing the gateway to fail with a +connection error on `/run/user/<UID>/podman/podman.sock`. The gateway +retries briefly on startup, but a stale socket will not recover on its own. + Package upgrades do not overwrite `~/.config/openshell/gateway.toml` when you create one. New gateway process options can be added manually by referencing CONFIGURATION.md or running `openshell-gateway --help`. 
@@ -230,3 +237,65 @@ To pick up new container images after an upgrade: podman pull ghcr.io/nvidia/openshell/supervisor:latest podman pull ghcr.io/nvidia/openshell-community/sandboxes/base:latest ``` + +### Migrating from gateway.env + +Previous releases generated `~/.config/openshell/gateway.env` on first +start and used it to configure the gateway at launch. The gateway now +starts from built-in runtime defaults and reads +`~/.config/openshell/gateway.toml` when that file exists. + +If you have a `gateway.env` file, it is still honored: the systemd unit +reads it via `EnvironmentFile` on every start. You can leave it in place +or delete it. New installs no longer generate one. + +To migrate settings to TOML, create `~/.config/openshell/gateway.toml` +and map the relevant variables: + +| Environment variable | TOML equivalent | +|---|---| +| `OPENSHELL_BIND_ADDRESS=A` + `OPENSHELL_SERVER_PORT=P` | `bind_address = "A:P"` under `[openshell.gateway]` | +| `OPENSHELL_DRIVERS=podman` | `compute_drivers = ["podman"]` under `[openshell.gateway]` | +| `OPENSHELL_DISABLE_TLS=true` | `disable_tls = true` under `[openshell.gateway]` | +| `OPENSHELL_TLS_CERT=PATH` | `cert_path = "PATH"` under `[openshell.gateway.tls]` | +| `OPENSHELL_TLS_KEY=PATH` | `key_path = "PATH"` under `[openshell.gateway.tls]` | +| `OPENSHELL_TLS_CLIENT_CA=PATH` | `client_ca_path = "PATH"` under `[openshell.gateway.tls]` | +| `OPENSHELL_DB_URL=URL` | env-only — not accepted in TOML; keep in env or drop-in override | +| `OPENSHELL_LOG_LEVEL=debug` | env-only — keep as `Environment=OPENSHELL_LOG_LEVEL=debug` in a drop-in | + +Other breaking changes in this release: + +- **Default port changed from 8080 to 17670.** If you registered the + gateway at `https://127.0.0.1:8080`, re-register it: + + ```shell + openshell gateway add --local https://127.0.0.1:17670 + ``` + +- **Default bind address changed from `0.0.0.0` to `127.0.0.1`.** If + you relied on network access without an explicit bind + 
address, add the following to `~/.config/openshell/gateway.toml`: + + ```toml + [openshell.gateway] + bind_address = "0.0.0.0:17670" + ``` + + Also update your firewall rule if applicable: + + ```shell + sudo firewall-cmd --remove-port=8080/tcp --permanent + sudo firewall-cmd --add-port=17670/tcp --permanent + sudo firewall-cmd --reload + ``` + +- **Database path changed** from `~/.local/state/openshell/gateway.db` + to `~/.local/state/openshell/gateway/openshell.db`. Existing gateway + state (registered sandboxes, etc.) is not migrated automatically. To + preserve state across the upgrade, move the file before restarting: + + ```shell + mkdir -p ~/.local/state/openshell/gateway + mv ~/.local/state/openshell/gateway.db \ + ~/.local/state/openshell/gateway/openshell.db + ``` diff --git a/openshell.spec b/openshell.spec index e351f5a44..1f3e9cd97 100644 --- a/openshell.spec +++ b/openshell.spec @@ -161,7 +161,9 @@ ExecStartPre=/bin/sh -c 'test -f %%E/openshell/gateway.toml || install -Dm644 /u # %%S expands to $XDG_STATE_HOME (~/.local/state) in user units. ExecStartPre=/usr/bin/openshell-gateway generate-certs --output-dir %%S/openshell/tls --server-san host.openshell.internal -# Optional OPENSHELL_* overrides. +# gateway.env is honored for backward compatibility with pre-1415 installs. +# New installs use runtime defaults; create gateway.toml to override. +# See TROUBLESHOOTING.md for the env-to-TOML migration guide. EnvironmentFile=-%%E/openshell/gateway.env ExecStart=/usr/bin/openshell-gateway StateDirectory=openshell