From c47bcf7b973d09ac663a1d9cb76f4f8c1d6571f4 Mon Sep 17 00:00:00 2001 From: kagura-agent Date: Thu, 26 Mar 2026 10:46:40 +0800 Subject: [PATCH 1/2] fix(docker): add graceful shutdown handler to prevent pg0 data loss on restart (#675) - Trap SIGTERM/SIGINT in start-all.sh to forward signals to child processes - pg0 (embedded PostgreSQL) now gets a clean shutdown with WAL flush - 30-second timeout before force-killing unresponsive processes - Add startup data integrity check: warn if pg0 data dir exists but PG_VERSION missing - Improve wait loop robustness: trigger cleanup when any child exits unexpectedly Fixes #675 --- docker/standalone/start-all.sh | 92 ++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 5 deletions(-) diff --git a/docker/standalone/start-all.sh b/docker/standalone/start-all.sh index f17fd0211..dffef4259 100755 --- a/docker/standalone/start-all.sh +++ b/docker/standalone/start-all.sh @@ -1,6 +1,29 @@ #!/bin/bash set -e +# ============================================================================= +# Embedded pg0 data integrity check (#675) +# +# When using embedded pg0, check if the data directory has existing PostgreSQL +# data before starting. If the directory exists but appears empty/corrupt +# (e.g., missing PG_VERSION file), log a warning. This helps diagnose data +# loss scenarios where a container restart caused the data directory to be +# wiped despite a volume mount being present. +# ============================================================================= +PG0_DATA_DIR="${HOME}/.pg0" +if [ -d "$PG0_DATA_DIR" ]; then + # Look for actual PostgreSQL data directories (pg0 creates subdirs per instance) + pg_dirs=$(find "$PG0_DATA_DIR" -maxdepth 2 -name "PG_VERSION" 2>/dev/null) + if [ -n "$pg_dirs" ]; then + echo "✅ Existing pg0 data directory detected at $PG0_DATA_DIR" + elif [ "$(ls -A "$PG0_DATA_DIR" 2>/dev/null)" ]; then + echo "⚠️ WARNING: pg0 data directory exists at $PG0_DATA_DIR but no PG_VERSION found." + echo " This may indicate data corruption or an incomplete previous shutdown." + echo " If you see all migrations running from scratch after this, your data may have been lost." + echo " See: https://github.com/vectorize-io/hindsight/issues/675" + fi +fi + # Service flags (default to true if not set) ENABLE_API="${HINDSIGHT_ENABLE_API:-true}" ENABLE_CP="${HINDSIGHT_ENABLE_CP:-true}" @@ -71,6 +94,54 @@ if [ "${HINDSIGHT_WAIT_FOR_DEPS:-false}" = "true" ]; then done fi +# ============================================================================= +# Graceful shutdown handler (#675) +# +# Docker sends SIGTERM on `docker stop`/`docker restart`. Without a trap, child +# processes (hindsight-api + pg0, control-plane) are killed abruptly. For the +# embedded pg0 database this can cause data loss when the data directory is on +# a Docker volume that gets remounted after restart. +# +# The trap forwards SIGTERM to all tracked child PIDs so that: +# - hindsight-api receives the signal and can run its shutdown hooks +# - pg0 gets a clean PostgreSQL shutdown (checkpoint + WAL flush) +# - The control-plane Node.js process exits cleanly +# ============================================================================= +cleanup() { + echo "" + echo "🛑 Received shutdown signal, stopping services gracefully..." + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + kill -TERM "$pid" 2>/dev/null + fi + done + # Give processes time to shut down cleanly (pg0 needs to flush WAL) + local timeout=30 + for ((i=1; i<=timeout; i++)); do + local all_stopped=true + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + all_stopped=false + break + fi + done + if $all_stopped; then + echo "✅ All services stopped cleanly" + exit 0 + fi + sleep 1 + done + # Force kill if still running after timeout + echo "⚠️ Timeout reached, forcing shutdown..." + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + kill -9 "$pid" 2>/dev/null + fi + done + exit 1 +} +trap cleanup SIGTERM SIGINT + # Track PIDs for wait PIDS=() @@ -138,8 +209,19 @@ if [ ${#PIDS[@]} -eq 0 ]; then exit 1 fi -# Wait for any process to exit -wait -n - -# Exit with status of first exited process -exit $? +# Wait for any process to exit (use wait -n with trap-safe loop) +while true; do + # wait -n returns when any child exits; it also returns on signal delivery + # (the trap handler will run and exit, so this loop is just for robustness) + wait -n && true + # Check if any tracked PID has exited + for pid in "${PIDS[@]}"; do + if ! kill -0 "$pid" 2>/dev/null; then + wait "$pid" 2>/dev/null + exit_code=$? + echo "⚠️ Service (PID $pid) exited with code $exit_code" + # Trigger cleanup for remaining services + cleanup + fi + done +done From 42321945fbdeb22603d690927ff9c0178cd4d548 Mon Sep 17 00:00:00 2001 From: kagura-agent Date: Thu, 26 Mar 2026 18:37:31 +0800 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20address=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20re-entrant=20guard,=20timeout=20docs,=20cleaner=20g?= =?UTF-8?q?lob?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add SHUTTING_DOWN guard to prevent concurrent cleanup runs - Document Docker stop_grace_period mismatch (30s cleanup vs 10s default) - Replace find subprocess with compgen glob for PG_VERSION check - Add comment explaining wait -n && true idiom --- docker/standalone/start-all.sh | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/docker/standalone/start-all.sh b/docker/standalone/start-all.sh index dffef4259..b5f163e6c 100755 --- a/docker/standalone/start-all.sh +++ b/docker/standalone/start-all.sh @@ -13,8 +13,7 @@ set -e PG0_DATA_DIR="${HOME}/.pg0" if [ -d "$PG0_DATA_DIR" ]; then # Look for actual PostgreSQL data directories (pg0 creates subdirs per instance) - pg_dirs=$(find "$PG0_DATA_DIR" -maxdepth 2 -name "PG_VERSION" 2>/dev/null) - if [ -n "$pg_dirs" ]; then + if compgen -G "$PG0_DATA_DIR"/*/PG_VERSION > /dev/null 2>&1; then echo "✅ Existing pg0 data directory detected at $PG0_DATA_DIR" elif [ "$(ls -A "$PG0_DATA_DIR" 2>/dev/null)" ]; then echo "⚠️ WARNING: pg0 data directory exists at $PG0_DATA_DIR but no PG_VERSION found." @@ -107,7 +106,13 @@ fi # - pg0 gets a clean PostgreSQL shutdown (checkpoint + WAL flush) # - The control-plane Node.js process exits cleanly # ============================================================================= +# Guard against concurrent cleanup (e.g., child crash + SIGTERM arriving together) +SHUTTING_DOWN=false + cleanup() { + if $SHUTTING_DOWN; then return; fi + SHUTTING_DOWN=true + echo "" echo "🛑 Received shutdown signal, stopping services gracefully..." for pid in "${PIDS[@]}"; do @@ -115,7 +120,10 @@ cleanup() { kill -TERM "$pid" 2>/dev/null fi done - # Give processes time to shut down cleanly (pg0 needs to flush WAL) + # Give processes time to shut down cleanly (pg0 needs to flush WAL). + # NOTE: Docker's default stop_grace_period is 10s. If you use the default, + # either set stop_grace_period: 30s in your compose file / docker stop -t 30, + # or Docker will SIGKILL the container before this timeout expires. local timeout=30 for ((i=1; i<=timeout; i++)); do local all_stopped=true @@ -212,7 +220,9 @@ fi # Wait for any process to exit (use wait -n with trap-safe loop) while true; do # wait -n returns when any child exits; it also returns on signal delivery - # (the trap handler will run and exit, so this loop is just for robustness) + # (the trap handler will run and exit, so this loop is just for robustness). + # `&& true` prevents `set -e` from killing the script when wait -n returns + # non-zero (child exited with error or no backgrounded children remain). wait -n && true # Check if any tracked PID has exited for pid in "${PIDS[@]}"; do