diff --git a/docker/standalone/start-all.sh b/docker/standalone/start-all.sh index f17fd0211..b5f163e6c 100755 --- a/docker/standalone/start-all.sh +++ b/docker/standalone/start-all.sh @@ -1,6 +1,28 @@ #!/bin/bash set -e +# ============================================================================= +# Embedded pg0 data integrity check (#675) +# +# When using embedded pg0, check if the data directory has existing PostgreSQL +# data before starting. If the directory exists but appears empty/corrupt +# (e.g., missing PG_VERSION file), log a warning. This helps diagnose data +# loss scenarios where a container restart caused the data directory to be +# wiped despite a volume mount being present. +# ============================================================================= +PG0_DATA_DIR="${HOME}/.pg0" +if [ -d "$PG0_DATA_DIR" ]; then + # Look for actual PostgreSQL data directories (pg0 creates subdirs per instance) + if compgen -G "$PG0_DATA_DIR"/*/PG_VERSION > /dev/null 2>&1; then + echo "✅ Existing pg0 data directory detected at $PG0_DATA_DIR" + elif [ "$(ls -A "$PG0_DATA_DIR" 2>/dev/null)" ]; then + echo "⚠️ WARNING: pg0 data directory exists at $PG0_DATA_DIR but no PG_VERSION found." + echo " This may indicate data corruption or an incomplete previous shutdown." + echo " If you see all migrations running from scratch after this, your data may have been lost." + echo " See: https://github.com/vectorize-io/hindsight/issues/675" + fi +fi + # Service flags (default to true if not set) ENABLE_API="${HINDSIGHT_ENABLE_API:-true}" ENABLE_CP="${HINDSIGHT_ENABLE_CP:-true}" @@ -71,6 +93,63 @@ if [ "${HINDSIGHT_WAIT_FOR_DEPS:-false}" = "true" ]; then done fi +# ============================================================================= +# Graceful shutdown handler (#675) +# +# Docker sends SIGTERM on `docker stop`/`docker restart`. Without a trap, child +# processes (hindsight-api + pg0, control-plane) are killed abruptly. For the +# embedded pg0 database this can cause data loss when the data directory is on +# a Docker volume that gets remounted after restart. +# +# The trap forwards SIGTERM to all tracked child PIDs so that: +# - hindsight-api receives the signal and can run its shutdown hooks +# - pg0 gets a clean PostgreSQL shutdown (checkpoint + WAL flush) +# - The control-plane Node.js process exits cleanly +# ============================================================================= +# Guard against concurrent cleanup (e.g., child crash + SIGTERM arriving together) +SHUTTING_DOWN=false + +cleanup() { + if $SHUTTING_DOWN; then return; fi + SHUTTING_DOWN=true + + echo "" + echo "🛑 Received shutdown signal, stopping services gracefully..." + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + kill -TERM "$pid" 2>/dev/null + fi + done + # Give processes time to shut down cleanly (pg0 needs to flush WAL). + # NOTE: Docker's default stop_grace_period is 10s. If you use the default, + # either set stop_grace_period: 30s in your compose file / docker stop -t 30, + # or Docker will SIGKILL the container before this timeout expires. + local timeout=30 + for ((i=1; i<=timeout; i++)); do + local all_stopped=true + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + all_stopped=false + break + fi + done + if $all_stopped; then + echo "✅ All services stopped cleanly" + exit 0 + fi + sleep 1 + done + # Force kill if still running after timeout + echo "⚠️ Timeout reached, forcing shutdown..." + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + kill -9 "$pid" 2>/dev/null + fi + done + exit 1 +} +trap cleanup SIGTERM SIGINT + # Track PIDs for wait PIDS=() @@ -138,8 +217,21 @@ if [ ${#PIDS[@]} -eq 0 ]; then exit 1 fi -# Wait for any process to exit -wait -n - -# Exit with status of first exited process -exit $? +# Wait for any process to exit (use wait -n with trap-safe loop) +while true; do + # wait -n returns when any child exits; it also returns on signal delivery + # (the trap handler will run and exit, so this loop is just for robustness). + # `&& true` prevents `set -e` from killing the script when wait -n returns + # non-zero (child exited with error or no backgrounded children remain). + wait -n && true + # Check if any tracked PID has exited + for pid in "${PIDS[@]}"; do + if ! kill -0 "$pid" 2>/dev/null; then + wait "$pid" 2>/dev/null + exit_code=$? + echo "⚠️ Service (PID $pid) exited with code $exit_code" + # Trigger cleanup for remaining services + cleanup + fi + done +done