From c47bcf7b973d09ac663a1d9cb76f4f8c1d6571f4 Mon Sep 17 00:00:00 2001
From: kagura-agent <kagura.chen28@gmail.com>
Date: Thu, 26 Mar 2026 10:46:40 +0800
Subject: [PATCH 1/2] fix(docker): add graceful shutdown handler to prevent pg0
 data loss on restart (#675)

- Trap SIGTERM/SIGINT in start-all.sh to forward signals to child processes
- pg0 (embedded PostgreSQL) now gets a clean shutdown with WAL flush
- 30-second timeout before force-killing unresponsive processes
- Add startup data integrity check: warn if pg0 data dir exists but PG_VERSION missing
- Improve wait loop robustness: trigger cleanup when any child exits unexpectedly

Fixes #675
---
 docker/standalone/start-all.sh | 92 ++++++++++++++++++++++++++++++++--
 1 file changed, 87 insertions(+), 5 deletions(-)

diff --git a/docker/standalone/start-all.sh b/docker/standalone/start-all.sh
index f17fd0211..dffef4259 100755
--- a/docker/standalone/start-all.sh
+++ b/docker/standalone/start-all.sh
@@ -1,6 +1,29 @@
 #!/bin/bash
 set -e
 
+# =============================================================================
+# Embedded pg0 data integrity check (#675)
+#
+# When using embedded pg0, check if the data directory has existing PostgreSQL
+# data before starting. If the directory exists but appears empty/corrupt
+# (e.g., missing PG_VERSION file), log a warning. This helps diagnose data
+# loss scenarios where a container restart caused the data directory to be
+# wiped despite a volume mount being present.
+# =============================================================================
+PG0_DATA_DIR="${HOME}/.pg0"
+if [ -d "$PG0_DATA_DIR" ]; then
+    # Look for actual PostgreSQL data directories (pg0 creates subdirs per instance)
+    pg_dirs=$(find "$PG0_DATA_DIR" -maxdepth 2 -name "PG_VERSION" 2>/dev/null)
+    if [ -n "$pg_dirs" ]; then
+        echo "✅ Existing pg0 data directory detected at $PG0_DATA_DIR"
+    elif [ "$(ls -A "$PG0_DATA_DIR" 2>/dev/null)" ]; then
+        echo "⚠️  WARNING: pg0 data directory exists at $PG0_DATA_DIR but no PG_VERSION found."
+        echo "   This may indicate data corruption or an incomplete previous shutdown."
+        echo "   If you see all migrations running from scratch after this, your data may have been lost."
+        echo "   See: https://github.com/vectorize-io/hindsight/issues/675"
+    fi
+fi
+
 # Service flags (default to true if not set)
 ENABLE_API="${HINDSIGHT_ENABLE_API:-true}"
 ENABLE_CP="${HINDSIGHT_ENABLE_CP:-true}"
@@ -71,6 +94,54 @@ if [ "${HINDSIGHT_WAIT_FOR_DEPS:-false}" = "true" ]; then
     done
 fi
 
+# =============================================================================
+# Graceful shutdown handler (#675)
+#
+# Docker sends SIGTERM on `docker stop`/`docker restart`. Without a trap, child
+# processes (hindsight-api + pg0, control-plane) are killed abruptly. For the
+# embedded pg0 database this can cause data loss when the data directory is on
+# a Docker volume that gets remounted after restart.
+#
+# The trap forwards SIGTERM to all tracked child PIDs so that:
+#   - hindsight-api receives the signal and can run its shutdown hooks
+#   - pg0 gets a clean PostgreSQL shutdown (checkpoint + WAL flush)
+#   - The control-plane Node.js process exits cleanly
+# =============================================================================
+cleanup() {
+    echo ""
+    echo "🛑 Received shutdown signal, stopping services gracefully..."
+    for pid in "${PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            kill -TERM "$pid" 2>/dev/null
+        fi
+    done
+    # Give processes time to shut down cleanly (pg0 needs to flush WAL)
+    local timeout=30
+    for ((i=1; i<=timeout; i++)); do
+        local all_stopped=true
+        for pid in "${PIDS[@]}"; do
+            if kill -0 "$pid" 2>/dev/null; then
+                all_stopped=false
+                break
+            fi
+        done
+        if $all_stopped; then
+            echo "✅ All services stopped cleanly"
+            exit 0
+        fi
+        sleep 1
+    done
+    # Force kill if still running after timeout
+    echo "⚠️  Timeout reached, forcing shutdown..."
+    for pid in "${PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            kill -9 "$pid" 2>/dev/null
+        fi
+    done
+    exit 1
+}
+trap cleanup SIGTERM SIGINT
+
 # Track PIDs for wait
 PIDS=()
 
@@ -138,8 +209,19 @@ if [ ${#PIDS[@]} -eq 0 ]; then
     exit 1
 fi
 
-# Wait for any process to exit
-wait -n
-
-# Exit with status of first exited process
-exit $?
+# Wait for any process to exit (use wait -n with trap-safe loop)
+while true; do
+    # wait -n returns when any child exits; it also returns on signal delivery
+    # (the trap handler will run and exit, so this loop is just for robustness)
+    wait -n && true
+    # Check if any tracked PID has exited
+    for pid in "${PIDS[@]}"; do
+        if ! kill -0 "$pid" 2>/dev/null; then
+            wait "$pid" 2>/dev/null
+            exit_code=$?
+            echo "⚠️  Service (PID $pid) exited with code $exit_code"
+            # Trigger cleanup for remaining services
+            cleanup
+        fi
+    done
+done

From 42321945fbdeb22603d690927ff9c0178cd4d548 Mon Sep 17 00:00:00 2001
From: kagura-agent <kagura.chen28@gmail.com>
Date: Thu, 26 Mar 2026 18:37:31 +0800
Subject: [PATCH 2/2] =?UTF-8?q?fix:=20address=20review=20feedback=20?=
 =?UTF-8?q?=E2=80=94=20re-entrant=20guard,=20timeout=20docs,=20cleaner=20g?=
 =?UTF-8?q?lob?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add SHUTTING_DOWN guard to prevent concurrent cleanup runs
- Document Docker stop_grace_period mismatch (30s cleanup vs 10s default)
- Replace find subprocess with compgen glob for PG_VERSION check
- Add comment explaining wait -n && true idiom
---
 docker/standalone/start-all.sh | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/docker/standalone/start-all.sh b/docker/standalone/start-all.sh
index dffef4259..b5f163e6c 100755
--- a/docker/standalone/start-all.sh
+++ b/docker/standalone/start-all.sh
@@ -13,8 +13,7 @@ set -e
 PG0_DATA_DIR="${HOME}/.pg0"
 if [ -d "$PG0_DATA_DIR" ]; then
     # Look for actual PostgreSQL data directories (pg0 creates subdirs per instance)
-    pg_dirs=$(find "$PG0_DATA_DIR" -maxdepth 2 -name "PG_VERSION" 2>/dev/null)
-    if [ -n "$pg_dirs" ]; then
+    if compgen -G "$PG0_DATA_DIR"/*/PG_VERSION > /dev/null 2>&1; then
         echo "✅ Existing pg0 data directory detected at $PG0_DATA_DIR"
     elif [ "$(ls -A "$PG0_DATA_DIR" 2>/dev/null)" ]; then
         echo "⚠️  WARNING: pg0 data directory exists at $PG0_DATA_DIR but no PG_VERSION found."
@@ -107,7 +106,13 @@ fi
 #   - pg0 gets a clean PostgreSQL shutdown (checkpoint + WAL flush)
 #   - The control-plane Node.js process exits cleanly
 # =============================================================================
+# Guard against concurrent cleanup (e.g., child crash + SIGTERM arriving together)
+SHUTTING_DOWN=false
+
 cleanup() {
+    if $SHUTTING_DOWN; then return; fi
+    SHUTTING_DOWN=true
+
     echo ""
     echo "🛑 Received shutdown signal, stopping services gracefully..."
     for pid in "${PIDS[@]}"; do
@@ -115,7 +120,10 @@ cleanup() {
             kill -TERM "$pid" 2>/dev/null
         fi
     done
-    # Give processes time to shut down cleanly (pg0 needs to flush WAL)
+    # Give processes time to shut down cleanly (pg0 needs to flush WAL).
+    # NOTE: Docker's default stop_grace_period is 10s. If you use the default,
+    # either set stop_grace_period: 30s in your compose file / docker stop -t 30,
+    # or Docker will SIGKILL the container before this timeout expires.
     local timeout=30
     for ((i=1; i<=timeout; i++)); do
         local all_stopped=true
@@ -212,7 +220,9 @@ fi
 # Wait for any process to exit (use wait -n with trap-safe loop)
 while true; do
     # wait -n returns when any child exits; it also returns on signal delivery
-    # (the trap handler will run and exit, so this loop is just for robustness)
+    # (the trap handler will run and exit, so this loop is just for robustness).
+    # `&& true` prevents `set -e` from killing the script when wait -n returns
+    # non-zero (child exited with error or no backgrounded children remain).
     wait -n && true
     # Check if any tracked PID has exited
     for pid in "${PIDS[@]}"; do