From b4099c46f4a39f446a97ddf54c79d7434983d3d6 Mon Sep 17 00:00:00 2001 From: DmitriiAn Date: Fri, 27 Mar 2026 15:23:44 +0100 Subject: [PATCH 1/9] =?UTF-8?q?Add=20notify-migration.sh=20=E2=80=94=20Sla?= =?UTF-8?q?ck=20alerts=20for=20migration=20failures=20and=20errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Runs from cron and fires once per unique event to avoid spam: - pgcopydb process stopped unexpectedly - New ERROR lines in migration.log - Migration completed (success or failure) Alert messages include tables copied and GB transferred, sourced from the pgcopydb SQLite catalog. State is stored in $MIGRATION_DIR/.notify-state and resets automatically when a new migration starts. README: added usage to the Monitor section and script reference table. --- pgcopydb-helpers/README.md | 9 + pgcopydb-helpers/notify-migration.sh | 250 +++++++++++++++++++++++++++ 2 files changed, 259 insertions(+) create mode 100755 pgcopydb-helpers/notify-migration.sh diff --git a/pgcopydb-helpers/README.md b/pgcopydb-helpers/README.md index 69f932f..b16b920 100644 --- a/pgcopydb-helpers/README.md +++ b/pgcopydb-helpers/README.md @@ -141,6 +141,14 @@ Once the initial copy completes and CDC is streaming, check replication progress When `check-cdc-status.sh` reports **"CDC IS CAUGHT UP"** (apply backlog < 100 MB), you are ready for cutover. +To receive proactive Slack alerts on failures and errors without manually running the above scripts, use `notify-migration.sh`. It runs from cron and fires once per unique event (process stopped, new errors, completion). Add `SLACK_WEBHOOK_URL` to `~/.env`, then: + +```bash +~/notify-migration.sh --test # verify webhook +~/notify-migration.sh --setup # install cron job (default: every 2 min) +~/notify-migration.sh --uninstall # remove when done +``` + ### 4. Cut Over 1. **Stop writes** to the source database (maintenance mode, read-only, connection drain, etc.). 
@@ -349,6 +357,7 @@ sqlite3 ~/migration_*/schema/filter.db "SELECT COUNT(*) FROM s_depend;" | `compare-pg-params.sh` | Prepare | Compare PostgreSQL parameters between source and target | | `preflight-check.sh` | Prepare | Validate migration prerequisites (connectivity, WAL level, permissions, slots) | | `fix-replica-identity.sh` | Prepare | Set REPLICA IDENTITY FULL on tables without primary keys | +| `notify-migration.sh` | Prepare / Monitor | Slack alerts for failures and errors via cron (run `--setup` before migrating) | | `filters.ini` | Prepare | pgcopydb filter configuration | | `run-migration.sh` | Migrate | Start a pgcopydb clone --follow migration | | `start-migration-screen.sh` | Migrate | Run the migration in a screen session | diff --git a/pgcopydb-helpers/notify-migration.sh b/pgcopydb-helpers/notify-migration.sh new file mode 100755 index 0000000..be1c1cf --- /dev/null +++ b/pgcopydb-helpers/notify-migration.sh @@ -0,0 +1,250 @@ +#!/bin/bash +# +# notify-migration.sh — Slack alerts for pgcopydb migration failures and errors +# +# Runs from cron. State is stored inside the migration directory so it resets +# automatically when a new migration starts. Each unique event fires once only. +# +# SETUP +# 1. Add to ~/.env: +# export SLACK_WEBHOOK_URL='https://hooks.slack.com/services/...' +# +# 2. Test the webhook: +# ~/notify-migration.sh --test +# +# 3. Install the cron job (default 2 min interval): +# ~/notify-migration.sh --setup +# ~/notify-migration.sh --setup --interval 5 +# +# 4. 
Remove the cron job when done: +# ~/notify-migration.sh --uninstall +# +# ALERTS FIRED +# - pgcopydb process stopped unexpectedly (fires once per transition) +# - New ERROR lines in migration.log (fires once per new batch) +# - Migration completed successfully (fires once) +# - Migration failed with non-zero exit code (fires once) +# +# State file: $MIGRATION_DIR/.notify-state (inside the migration directory) +# Cron output is discarded; run manually to see output +# + +set -uo pipefail + +# ── Flag parsing ─────────────────────────────────────────────────── +INTERVAL=2 +ACTION="" + +while [ $# -gt 0 ]; do + case "$1" in + --interval) + INTERVAL="${2:?--interval requires a value (1-59)}" + shift 2 + ;; + --setup|--uninstall|--test) + ACTION="${1#--}" + shift + ;; + *) + echo "Unknown argument: $1" >&2 + echo "Usage: $0 [--setup [--interval N]] | [--uninstall] | [--test]" >&2 + exit 1 + ;; + esac +done + +# ── Load environment ─────────────────────────────────────────────── + set +u + set -a + source ~/.env + set +a + set -u + +SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}" + +# ── Slack helper ─────────────────────────────────────────────────── +slack_send() { + local text="$1" + local safe + safe="${text//\\/\\\\}" + safe="${safe//\"/\\\"}" + safe="${safe//$'\n'/\\n}" + + local http_code + http_code=$(curl -s -o /dev/null -w "%{http_code}" \ + -X POST \ + -H 'Content-type: application/json' \ + --data "{\"text\":\"${safe}\"}" \ + "$SLACK_WEBHOOK_URL" 2>/dev/null) || http_code="000" + + if [ "$http_code" = "200" ]; then + echo "$(date '+%Y-%m-%d %H:%M:%S') SENT: $text" + else + echo "$(date '+%Y-%m-%d %H:%M:%S') WARN: Slack returned HTTP $http_code" + fi +} + +# ── SQLite helper ────────────────────────────────────────────────── +db_query() { + sqlite3 "$DB" "$1" 2>/dev/null || echo "${2:-0}" +} + +# ── --test ───────────────────────────────────────────────────────── +if [ "$ACTION" = "test" ]; then + if [ -z "$SLACK_WEBHOOK_URL" ]; then + echo "ERROR: 
SLACK_WEBHOOK_URL not set in ~/.env" + exit 1 + fi + HOST=$(hostname -s 2>/dev/null || hostname) + slack_send ":white_check_mark: Migration Monitor test from *${HOST}* — webhook working!" + exit 0 +fi + +# ── --setup ──────────────────────────────────────────────────────── +if [ "$ACTION" = "setup" ]; then + if [ -z "$SLACK_WEBHOOK_URL" ]; then + echo "ERROR: Set SLACK_WEBHOOK_URL in ~/.env before running --setup" + exit 1 + fi + if ! [[ "$INTERVAL" =~ ^[1-9][0-9]?$ ]] || [ "$INTERVAL" -gt 59 ]; then + echo "ERROR: --interval must be 1-59 (got: $INTERVAL)" + exit 1 + fi + SCRIPT="$HOME/notify-migration.sh" + CRON_LINE="*/${INTERVAL} * * * * ${SCRIPT} > /dev/null 2>&1" + ( crontab -l 2>/dev/null | grep -v "notify-migration.sh" || true + echo "$CRON_LINE" + ) | crontab - + echo "Cron job installed (every ${INTERVAL} min):" + echo " $CRON_LINE" + echo "" + echo "Sending test message..." + "$SCRIPT" --test + exit 0 +fi + +# ── --uninstall ──────────────────────────────────────────────────── +if [ "$ACTION" = "uninstall" ]; then + ( crontab -l 2>/dev/null | grep -v "notify-migration.sh" || true ) | crontab - + echo "Cron job removed." + exit 0 +fi + +# ── Guard ────────────────────────────────────────────────────────── +if [ -z "$SLACK_WEBHOOK_URL" ]; then + echo "$(date '+%Y-%m-%d %H:%M:%S') SKIP: SLACK_WEBHOOK_URL not set in ~/.env" + exit 0 +fi + +# ── Find migration directory ─────────────────────────────────────── +MIGRATION_DIR=$(ls -dt "$HOME"/migration_* 2>/dev/null | head -1 || true) +if [ -z "$MIGRATION_DIR" ]; then + exit 0 +fi + +LOG="$MIGRATION_DIR/migration.log" +DB="$MIGRATION_DIR/schema/source.db" +STATE="$MIGRATION_DIR/.notify-state" + +if [ ! -f "$LOG" ]; then + exit 0 +fi + +# ── Load state from previous run ────────────────────────────────── +# Stored inside the migration directory — resets automatically when +# a new migration starts (new directory = no state file). 
+LAST_ERROR_COUNT=0 +LAST_STATUS="unknown" +LAST_COMPLETION_NOTIFIED="false" + +if [ -f "$STATE" ]; then + # shellcheck source=/dev/null + source "$STATE" 2>/dev/null || true +fi + +# ── Current state from log ───────────────────────────────────────── +HOST=$(hostname -s 2>/dev/null || hostname) + +PROC_RUNNING=false +if ps aux | grep -q "[p]gcopydb.*clone"; then + PROC_RUNNING=true +fi + +MIGRATION_SUCCEEDED=false +MIGRATION_FAILED=false + +if grep -q "Migration SUCCEEDED" "$LOG" 2>/dev/null; then + MIGRATION_SUCCEEDED=true +fi + +EXIT_LINE=$(grep "Exit code:" "$LOG" 2>/dev/null | tail -1 || true) +if [ -n "$EXIT_LINE" ] && ! echo "$EXIT_LINE" | grep -q "Exit code: 0"; then + MIGRATION_FAILED=true +fi + +if [ "$MIGRATION_SUCCEEDED" = true ]; then + CURRENT_STATUS="succeeded" +elif [ "$MIGRATION_FAILED" = true ]; then + CURRENT_STATUS="failed" +elif [ "$PROC_RUNNING" = true ]; then + CURRENT_STATUS="running" +else + CURRENT_STATUS="stopped" +fi + +CURRENT_ERROR_COUNT=$(grep -c " ERROR " "$LOG" 2>/dev/null || true) +CURRENT_ERROR_COUNT=$(( ${CURRENT_ERROR_COUNT:-0} + 0 )) + +# ── Context from SQLite for richer alert messages ────────────────── +TABLES_DONE=$(db_query "SELECT COUNT(*) FROM summary WHERE tableoid IS NOT NULL AND done_time_epoch IS NOT NULL;") +NONSPLIT=$(db_query "SELECT COUNT(*) FROM s_table t WHERE NOT EXISTS (SELECT 1 FROM s_table_part p WHERE p.oid = t.oid);") +SPLIT_PARTS=$(db_query "SELECT COUNT(*) FROM s_table_part;") +TABLES_TOTAL=$(( NONSPLIT + SPLIT_PARTS )) +BYTES=$(db_query "SELECT COALESCE(SUM(bytes),0) FROM summary WHERE tableoid IS NOT NULL;") +GB=$(echo "scale=1; $BYTES / 1024 / 1024 / 1024" | bc 2>/dev/null || echo "0") + +# ── Evaluate and notify ──────────────────────────────────────────── +NOTIFIED_COMPLETION="$LAST_COMPLETION_NOTIFIED" + +if [ "$CURRENT_STATUS" = "succeeded" ] && [ "$LAST_COMPLETION_NOTIFIED" = "false" ]; then + DIR_EPOCH=$(stat -c %Y "$MIGRATION_DIR" 2>/dev/null || date +%s) + SECS=$(( $(date +%s) - DIR_EPOCH )) 
+ RUNTIME=$(printf "%dh %02dm" $(( SECS/3600 )) $(( (SECS%3600)/60 ))) + msg=":white_check_mark: *Migration completed successfully*" + msg+=$'\n'"Host: *${HOST}* | Runtime: ${RUNTIME} | Data: ${GB} GB" + msg+=$'\n'"Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Dir: ${MIGRATION_DIR}" + slack_send "$msg" + NOTIFIED_COMPLETION="true" + +elif [ "$CURRENT_STATUS" = "failed" ] && [ "$LAST_COMPLETION_NOTIFIED" = "false" ]; then + LAST_ERR=$(grep " ERROR " "$LOG" 2>/dev/null | tail -1 | cut -c1-120 || true) + msg=":red_circle: *Migration FAILED*" + msg+=$'\n'"Host: *${HOST}* | Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Data: ${GB} GB" + [ -n "$LAST_ERR" ] && msg+=$'\n'"Last error: ${LAST_ERR}" + msg+=$'\n'"Dir: ${MIGRATION_DIR}" + slack_send "$msg" + NOTIFIED_COMPLETION="true" + +elif [ "$CURRENT_STATUS" = "stopped" ] && [ "$LAST_STATUS" = "running" ]; then + LAST_ERR=$(grep " ERROR " "$LOG" 2>/dev/null | tail -1 | cut -c1-120 || true) + msg=":warning: *Migration process stopped unexpectedly*" + msg+=$'\n'"Host: *${HOST}* | Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Data: ${GB} GB" + [ -n "$LAST_ERR" ] && msg+=$'\n'"Last error: ${LAST_ERR}" + msg+=$'\n'"Run: tail -50 ${LOG}" + slack_send "$msg" + +elif [ "$CURRENT_ERROR_COUNT" -gt "$LAST_ERROR_COUNT" ]; then + NEW_COUNT=$(( CURRENT_ERROR_COUNT - LAST_ERROR_COUNT )) + LAST_ERR=$(grep " ERROR " "$LOG" 2>/dev/null | tail -1 | cut -c1-120 || true) + msg=":warning: *${NEW_COUNT} new error(s) in migration log*" + msg+=$'\n'"Host: *${HOST}* | Total errors: ${CURRENT_ERROR_COUNT} | Tables: ${TABLES_DONE}/${TABLES_TOTAL}" + [ -n "$LAST_ERR" ] && msg+=$'\n'"Last: ${LAST_ERR}" + slack_send "$msg" +fi + +# ── Save state ───────────────────────────────────────────────────── +cat > "$STATE" < Date: Tue, 28 Apr 2026 10:56:14 +0200 Subject: [PATCH 2/9] add Slack alert when initial copy phase completes --- pgcopydb-helpers/notify-migration.sh | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git 
a/pgcopydb-helpers/notify-migration.sh b/pgcopydb-helpers/notify-migration.sh index be1c1cf..5597e37 100755 --- a/pgcopydb-helpers/notify-migration.sh +++ b/pgcopydb-helpers/notify-migration.sh @@ -22,6 +22,7 @@ # ALERTS FIRED # - pgcopydb process stopped unexpectedly (fires once per transition) # - New ERROR lines in migration.log (fires once per new batch) +# - Initial copy completed (data + indexes + constraints + post-data; fires once) # - Migration completed successfully (fires once) # - Migration failed with non-zero exit code (fires once) # @@ -155,6 +156,7 @@ fi # a new migration starts (new directory = no state file). LAST_ERROR_COUNT=0 LAST_STATUS="unknown" +LAST_INITIAL_COPY_NOTIFIED="false" LAST_COMPLETION_NOTIFIED="false" if [ -f "$STATE" ]; then @@ -173,6 +175,11 @@ fi MIGRATION_SUCCEEDED=false MIGRATION_FAILED=false +INITIAL_COPY_DONE=false +if grep -q "All step are now done" "$LOG" 2>/dev/null; then + INITIAL_COPY_DONE=true +fi + if grep -q "Migration SUCCEEDED" "$LOG" 2>/dev/null; then MIGRATION_SUCCEEDED=true fi @@ -202,11 +209,26 @@ SPLIT_PARTS=$(db_query "SELECT COUNT(*) FROM s_table_part;") TABLES_TOTAL=$(( NONSPLIT + SPLIT_PARTS )) BYTES=$(db_query "SELECT COALESCE(SUM(bytes),0) FROM summary WHERE tableoid IS NOT NULL;") GB=$(echo "scale=1; $BYTES / 1024 / 1024 / 1024" | bc 2>/dev/null || echo "0") +INDEXES_DONE=$(db_query "SELECT COUNT(DISTINCT indexoid) FROM summary WHERE indexoid IS NOT NULL AND done_time_epoch IS NOT NULL;") +INDEXES_TOTAL=$(db_query "SELECT COUNT(*) FROM s_index;") +CONSTRAINTS_DONE=$(db_query "SELECT COUNT(DISTINCT conoid) FROM summary WHERE conoid IS NOT NULL AND done_time_epoch IS NOT NULL;") +CONSTRAINTS_TOTAL=$(db_query "SELECT COUNT(*) FROM s_constraint;") # ── Evaluate and notify ──────────────────────────────────────────── +NOTIFIED_INITIAL_COPY="$LAST_INITIAL_COPY_NOTIFIED" NOTIFIED_COMPLETION="$LAST_COMPLETION_NOTIFIED" -if [ "$CURRENT_STATUS" = "succeeded" ] && [ "$LAST_COMPLETION_NOTIFIED" = "false" ]; then 
+if [ "$INITIAL_COPY_DONE" = true ] && [ "$LAST_INITIAL_COPY_NOTIFIED" = "false" ]; then + DIR_EPOCH=$(stat -c %Y "$MIGRATION_DIR" 2>/dev/null || date +%s) + SECS=$(( $(date +%s) - DIR_EPOCH )) + RUNTIME=$(printf "%dh %02dm" $(( SECS/3600 )) $(( (SECS%3600)/60 ))) + msg=":large_blue_circle: *Initial copy completed — CDC phase starting*" + msg+=$'\n'"Host: *${HOST}* | Runtime: ${RUNTIME} | Data: ${GB} GB" + msg+=$'\n'"Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Indexes: ${INDEXES_DONE}/${INDEXES_TOTAL} | Constraints: ${CONSTRAINTS_DONE}/${CONSTRAINTS_TOTAL}" + slack_send "$msg" + NOTIFIED_INITIAL_COPY="true" + +elif [ "$CURRENT_STATUS" = "succeeded" ] && [ "$LAST_COMPLETION_NOTIFIED" = "false" ]; then DIR_EPOCH=$(stat -c %Y "$MIGRATION_DIR" 2>/dev/null || date +%s) SECS=$(( $(date +%s) - DIR_EPOCH )) RUNTIME=$(printf "%dh %02dm" $(( SECS/3600 )) $(( (SECS%3600)/60 ))) @@ -246,5 +268,6 @@ fi cat > "$STATE" < Date: Tue, 28 Apr 2026 11:14:23 +0200 Subject: [PATCH 3/9] use planetscale branch ID instead of hostname in Slack alerts --- pgcopydb-helpers/notify-migration.sh | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/pgcopydb-helpers/notify-migration.sh b/pgcopydb-helpers/notify-migration.sh index 5597e37..cf5decc 100755 --- a/pgcopydb-helpers/notify-migration.sh +++ b/pgcopydb-helpers/notify-migration.sh @@ -63,6 +63,11 @@ done SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}" +# ── Parse PlanetScale branch ID ──────────────────────────────────── +# Username format in connection string: pscale_api_xxx.BRANCH_ID +_u="${PGCOPYDB_TARGET_PGURI:-}"; _u="${_u#*://}"; _u="${_u%%@*}"; _u="${_u%%:*}" +PS_BRANCH_ID="${_u##*.}"; unset _u + # ── Slack helper ─────────────────────────────────────────────────── slack_send() { local text="$1" @@ -119,8 +124,8 @@ if [ "$ACTION" = "setup" ]; then echo "Cron job installed (every ${INTERVAL} min):" echo " $CRON_LINE" echo "" - echo "Sending test message..." 
- "$SCRIPT" --test + echo "Sending setup confirmation to Slack..." + slack_send ":rocket: Migration monitor started for branch *${PS_BRANCH_ID:-unknown}* — Slack notifications active" exit 0 fi @@ -165,8 +170,6 @@ if [ -f "$STATE" ]; then fi # ── Current state from log ───────────────────────────────────────── -HOST=$(hostname -s 2>/dev/null || hostname) - PROC_RUNNING=false if ps aux | grep -q "[p]gcopydb.*clone"; then PROC_RUNNING=true @@ -223,7 +226,7 @@ if [ "$INITIAL_COPY_DONE" = true ] && [ "$LAST_INITIAL_COPY_NOTIFIED" = "false" SECS=$(( $(date +%s) - DIR_EPOCH )) RUNTIME=$(printf "%dh %02dm" $(( SECS/3600 )) $(( (SECS%3600)/60 ))) msg=":large_blue_circle: *Initial copy completed — CDC phase starting*" - msg+=$'\n'"Host: *${HOST}* | Runtime: ${RUNTIME} | Data: ${GB} GB" + msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Runtime: ${RUNTIME} | Data: ${GB} GB" msg+=$'\n'"Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Indexes: ${INDEXES_DONE}/${INDEXES_TOTAL} | Constraints: ${CONSTRAINTS_DONE}/${CONSTRAINTS_TOTAL}" slack_send "$msg" NOTIFIED_INITIAL_COPY="true" @@ -233,7 +236,7 @@ elif [ "$CURRENT_STATUS" = "succeeded" ] && [ "$LAST_COMPLETION_NOTIFIED" = "fal SECS=$(( $(date +%s) - DIR_EPOCH )) RUNTIME=$(printf "%dh %02dm" $(( SECS/3600 )) $(( (SECS%3600)/60 ))) msg=":white_check_mark: *Migration completed successfully*" - msg+=$'\n'"Host: *${HOST}* | Runtime: ${RUNTIME} | Data: ${GB} GB" + msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Runtime: ${RUNTIME} | Data: ${GB} GB" msg+=$'\n'"Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Dir: ${MIGRATION_DIR}" slack_send "$msg" NOTIFIED_COMPLETION="true" @@ -241,7 +244,7 @@ elif [ "$CURRENT_STATUS" = "succeeded" ] && [ "$LAST_COMPLETION_NOTIFIED" = "fal elif [ "$CURRENT_STATUS" = "failed" ] && [ "$LAST_COMPLETION_NOTIFIED" = "false" ]; then LAST_ERR=$(grep " ERROR " "$LOG" 2>/dev/null | tail -1 | cut -c1-120 || true) msg=":red_circle: *Migration FAILED*" - msg+=$'\n'"Host: *${HOST}* | Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Data: ${GB} GB" + 
msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Data: ${GB} GB" [ -n "$LAST_ERR" ] && msg+=$'\n'"Last error: ${LAST_ERR}" msg+=$'\n'"Dir: ${MIGRATION_DIR}" slack_send "$msg" @@ -250,7 +253,7 @@ elif [ "$CURRENT_STATUS" = "failed" ] && [ "$LAST_COMPLETION_NOTIFIED" = "false" elif [ "$CURRENT_STATUS" = "stopped" ] && [ "$LAST_STATUS" = "running" ]; then LAST_ERR=$(grep " ERROR " "$LOG" 2>/dev/null | tail -1 | cut -c1-120 || true) msg=":warning: *Migration process stopped unexpectedly*" - msg+=$'\n'"Host: *${HOST}* | Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Data: ${GB} GB" + msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Data: ${GB} GB" [ -n "$LAST_ERR" ] && msg+=$'\n'"Last error: ${LAST_ERR}" msg+=$'\n'"Run: tail -50 ${LOG}" slack_send "$msg" @@ -259,7 +262,7 @@ elif [ "$CURRENT_ERROR_COUNT" -gt "$LAST_ERROR_COUNT" ]; then NEW_COUNT=$(( CURRENT_ERROR_COUNT - LAST_ERROR_COUNT )) LAST_ERR=$(grep " ERROR " "$LOG" 2>/dev/null | tail -1 | cut -c1-120 || true) msg=":warning: *${NEW_COUNT} new error(s) in migration log*" - msg+=$'\n'"Host: *${HOST}* | Total errors: ${CURRENT_ERROR_COUNT} | Tables: ${TABLES_DONE}/${TABLES_TOTAL}" + msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Total errors: ${CURRENT_ERROR_COUNT} | Tables: ${TABLES_DONE}/${TABLES_TOTAL}" [ -n "$LAST_ERR" ] && msg+=$'\n'"Last: ${LAST_ERR}" slack_send "$msg" fi From 4015b6a862a0f8952ca6b75e863c718356ae7f8c Mon Sep 17 00:00:00 2001 From: DmitriiAn Date: Wed, 29 Apr 2026 10:37:01 +0200 Subject: [PATCH 4/9] start-migration-screen.sh: auto-start Slack monitoring when migration starts --- pgcopydb-helpers/start-migration-screen.sh | 49 ++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/pgcopydb-helpers/start-migration-screen.sh b/pgcopydb-helpers/start-migration-screen.sh index 7085952..e41c468 100644 --- a/pgcopydb-helpers/start-migration-screen.sh +++ b/pgcopydb-helpers/start-migration-screen.sh @@ -1,13 +1,34 @@ 
#!/bin/bash # -# Usage: ~/start-migration-screen.sh +# Usage: ~/start-migration-screen.sh [--no-monitor] # # Starts run-migration.sh inside a detached screen session named "migration". # Kills any existing migration screen first. Use "screen -r migration" to # attach and Ctrl-A D to detach. # +# Slack monitoring via notify-migration.sh is enabled by default (every 2 min). +# Use --no-monitor to skip. To change the interval later: +# ~/notify-migration.sh --uninstall +# ~/notify-migration.sh --setup --interval N +# set -eo pipefail +MONITOR=true + +while [ $# -gt 0 ]; do + case "$1" in + --no-monitor) + MONITOR=false + shift + ;; + *) + echo "Unknown argument: $1" >&2 + echo "Usage: $0 [--no-monitor]" >&2 + exit 1 + ;; + esac +done + # Kill any existing migration screen screen -S migration -X quit 2>/dev/null || true @@ -16,6 +37,28 @@ screen -dmS migration bash -c '~/run-migration.sh; echo "Press enter to exit."; echo "Migration started in screen session 'migration'" echo "" -echo "To watch: screen -r migration" -echo "To detach: Ctrl-A then D" +echo "To watch: screen -r migration" +echo "To detach: Ctrl-A then D" echo "To check status: ~/check-migration-status.sh" +echo "" +echo "────────────────────────────────────────────────────────" + +if [ "$MONITOR" = true ]; then + echo "Setting up Slack monitoring (every 2 min)..." + echo "────────────────────────────────────────────────────────" + echo "" + if ~/notify-migration.sh --setup > /dev/null 2>&1; then + echo "Monitoring is active." + echo "To disable: ~/notify-migration.sh --uninstall" + echo "To reconfigure: ~/notify-migration.sh --setup --interval N" + else + echo "WARNING: Monitoring setup failed." 
+ echo "Check SLACK_WEBHOOK_URL is set in ~/.env" + echo "To enable manually: ~/notify-migration.sh --setup" + fi +else + echo "Slack monitoring: DISABLED (--no-monitor was passed)" + echo "To enable later: ~/notify-migration.sh --setup [--interval N]" +fi + +echo "────────────────────────────────────────────────────────" From 6fe5eaee934d659e962eb31b40c708c47257dc17 Mon Sep 17 00:00:00 2001 From: DmitriiAn Date: Wed, 29 Apr 2026 10:56:01 +0200 Subject: [PATCH 5/9] =?UTF-8?q?update=20README=20and=C2=A0=20AGENTS.md?= =?UTF-8?q?=C2=A0for=20monitoring=20auto-start=20and=20notify-migration=20?= =?UTF-8?q?changes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pgcopydb-helpers/AGENTS.md | 32 +++++++++++++++++++++++++++++--- pgcopydb-helpers/README.md | 12 ++---------- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/pgcopydb-helpers/AGENTS.md b/pgcopydb-helpers/AGENTS.md index adc80d5..e67f590 100644 --- a/pgcopydb-helpers/AGENTS.md +++ b/pgcopydb-helpers/AGENTS.md @@ -172,10 +172,11 @@ Starts a full `pgcopydb clone --follow` migration. Creates a new timestamped dir #### `start-migration-screen.sh` -Wrapper that runs `run-migration.sh` inside a detached `screen` session named "migration". Kills any existing migration screen first. +Wrapper that runs `run-migration.sh` inside a detached `screen` session named "migration". Kills any existing migration screen first. Automatically installs the Slack monitoring cron job via `notify-migration.sh --setup` unless `--no-monitor` is passed. ```bash -~/start-migration-screen.sh +~/start-migration-screen.sh # start migration + enable Slack monitoring +~/start-migration-screen.sh --no-monitor # start migration without monitoring ``` **When to use:** Always use this instead of running `run-migration.sh` directly. Screen prevents the migration from dying if your SSH session disconnects. 
@@ -189,6 +190,31 @@ Wrapper that runs `run-migration.sh` inside a detached `screen` session named "m ### Monitoring +#### `notify-migration.sh` + +Sends Slack alerts for migration events. Runs from cron (default every 2 min) and fires each alert exactly once. Started automatically by `start-migration-screen.sh` — no manual setup required if `SLACK_WEBHOOK_URL` is set in `~/.env`. + +```bash +~/notify-migration.sh --test # send a test message to verify the webhook +~/notify-migration.sh --uninstall # remove the cron job +~/notify-migration.sh --setup --interval N # reinstall with a different interval (1-59 min) +``` + +**Alerts fired:** +- Process stopped unexpectedly (fires once per running→stopped transition) +- New ERROR lines in `migration.log` (fires once per new batch) +- Initial copy completed — data, indexes, constraints, sequences, and post-data done; CDC phase is starting (fires once) +- Migration completed successfully (fires once) +- Migration failed with non-zero exit code (fires once) + +All alerts include the PlanetScale branch ID (parsed from `PGCOPYDB_TARGET_PGURI`) and migration progress context (tables, data GB, runtime). + +**State file:** `$MIGRATION_DIR/.notify-state` — resets automatically when a new migration directory is created. + +**Requires:** `SLACK_WEBHOOK_URL` in `~/.env`. If not set, the script skips silently (cron) or exits with an error (`--setup`, `--test`). + +--- + #### `check-migration-status.sh` Displays a full migration progress dashboard: phase completion status, table/index/constraint copy progress, CDC streaming, error counts, runtime, and active database operations on the target. @@ -424,7 +450,7 @@ sqlite3 ~/migration_*/schema/filter.db \ - Run ~/fix-replica-identity.sh if using CDC (--follow) 2. 
MIGRATE - - Run ~/start-migration-screen.sh to begin + - Run ~/start-migration-screen.sh to begin (Slack monitoring starts automatically) - Monitor with ~/check-migration-status.sh (initial copy phase) - Monitor with ~/check-cdc-status.sh (CDC catch-up phase) diff --git a/pgcopydb-helpers/README.md b/pgcopydb-helpers/README.md index 8549455..2acec6a 100644 --- a/pgcopydb-helpers/README.md +++ b/pgcopydb-helpers/README.md @@ -172,14 +172,6 @@ Once the initial copy completes and CDC is streaming, check replication progress When `check-cdc-status.sh` reports **"CDC IS CAUGHT UP"** (apply backlog < 100 MB), you are ready for cutover. -To receive proactive Slack alerts on failures and errors without manually running the above scripts, use `notify-migration.sh`. It runs from cron and fires once per unique event (process stopped, new errors, completion). Add `SLACK_WEBHOOK_URL` to `~/.env`, then: - -```bash -~/notify-migration.sh --test # verify webhook -~/notify-migration.sh --setup # install cron job (default: every 2 min) -~/notify-migration.sh --uninstall # remove when done -``` - ### 4. Cut Over 1. **Stop writes** to the source database (maintenance mode, read-only, connection drain, etc.). @@ -401,10 +393,10 @@ sqlite3 ~/migration_*/schema/filter.db "SELECT COUNT(*) FROM s_depend;" | `compare-pg-params.sh` | Prepare | Compare PostgreSQL parameters between source and target | | `preflight-check.sh` | Prepare | Validate migration prerequisites (connectivity, WAL level, permissions, slots) | | `fix-replica-identity.sh` | Prepare | Set REPLICA IDENTITY FULL on tables without primary keys | -| `notify-migration.sh` | Prepare / Monitor | Slack alerts for failures and errors via cron (run `--setup` before migrating) | +| `notify-migration.sh` | Monitor | Slack alerts via cron (auto-started by `start-migration-screen.sh`). Fires once per event: process stopped, new errors, initial copy completed, migration succeeded/failed. Requires `SLACK_WEBHOOK_URL` in `~/.env`. 
Use `--test` to verify the webhook, `--uninstall` to remove the cron job. | | `filters.ini` | Prepare | pgcopydb filter configuration | | `run-migration.sh` | Migrate | Start a pgcopydb clone --follow migration | -| `start-migration-screen.sh` | Migrate | Run the migration in a screen session | +| `start-migration-screen.sh` | Migrate | Run the migration in a detached screen session. Automatically starts Slack monitoring unless `--no-monitor` is passed. | | `check-migration-status.sh` | Monitor | Migration progress dashboard | | `check-cdc-status.sh` | Monitor | CDC replication progress and health | | `resume-migration.sh` | Recovery | Resume an interrupted migration (full clone + CDC) | From 707d9f08f00d710a854dfffe0fbb39935e4ce744 Mon Sep 17 00:00:00 2001 From: DmitriiAn Date: Wed, 29 Apr 2026 11:28:09 +0200 Subject: [PATCH 6/9] hardcode Slack webhook URL in notify-migration.sh --- pgcopydb-helpers/AGENTS.md | 4 ++-- pgcopydb-helpers/README.md | 2 +- pgcopydb-helpers/notify-migration.sh | 11 ++++------- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/pgcopydb-helpers/AGENTS.md b/pgcopydb-helpers/AGENTS.md index e67f590..c81557b 100644 --- a/pgcopydb-helpers/AGENTS.md +++ b/pgcopydb-helpers/AGENTS.md @@ -192,7 +192,7 @@ Wrapper that runs `run-migration.sh` inside a detached `screen` session named "m #### `notify-migration.sh` -Sends Slack alerts for migration events. Runs from cron (default every 2 min) and fires each alert exactly once. Started automatically by `start-migration-screen.sh` — no manual setup required if `SLACK_WEBHOOK_URL` is set in `~/.env`. +Sends Slack alerts for migration events. Runs from cron (default every 2 min) and fires each alert exactly once. Started automatically by `start-migration-screen.sh`. 
```bash ~/notify-migration.sh --test # send a test message to verify the webhook @@ -211,7 +211,7 @@ All alerts include the PlanetScale branch ID (parsed from `PGCOPYDB_TARGET_PGURI **State file:** `$MIGRATION_DIR/.notify-state` — resets automatically when a new migration directory is created. -**Requires:** `SLACK_WEBHOOK_URL` in `~/.env`. If not set, the script skips silently (cron) or exits with an error (`--setup`, `--test`). +**Webhook:** hardcoded in the script --- diff --git a/pgcopydb-helpers/README.md b/pgcopydb-helpers/README.md index 2acec6a..6f715ac 100644 --- a/pgcopydb-helpers/README.md +++ b/pgcopydb-helpers/README.md @@ -393,12 +393,12 @@ sqlite3 ~/migration_*/schema/filter.db "SELECT COUNT(*) FROM s_depend;" | `compare-pg-params.sh` | Prepare | Compare PostgreSQL parameters between source and target | | `preflight-check.sh` | Prepare | Validate migration prerequisites (connectivity, WAL level, permissions, slots) | | `fix-replica-identity.sh` | Prepare | Set REPLICA IDENTITY FULL on tables without primary keys | -| `notify-migration.sh` | Monitor | Slack alerts via cron (auto-started by `start-migration-screen.sh`). Fires once per event: process stopped, new errors, initial copy completed, migration succeeded/failed. Requires `SLACK_WEBHOOK_URL` in `~/.env`. Use `--test` to verify the webhook, `--uninstall` to remove the cron job. | | `filters.ini` | Prepare | pgcopydb filter configuration | | `run-migration.sh` | Migrate | Start a pgcopydb clone --follow migration | | `start-migration-screen.sh` | Migrate | Run the migration in a detached screen session. Automatically starts Slack monitoring unless `--no-monitor` is passed. 
| | `check-migration-status.sh` | Monitor | Migration progress dashboard | | `check-cdc-status.sh` | Monitor | CDC replication progress and health | +| `notify-migration.sh` | Monitor | Slack alerts via cron (auto-started by `start-migration-screen.sh`)| | `resume-migration.sh` | Recovery | Resume an interrupted migration (full clone + CDC) | | `resume-cdc.sh` | Recovery | Resume only the CDC phase (skips clone) | | `target-clean.sh` | Recovery | Wipe target database for re-migration (prompts for confirmation) | diff --git a/pgcopydb-helpers/notify-migration.sh b/pgcopydb-helpers/notify-migration.sh index cf5decc..194997c 100755 --- a/pgcopydb-helpers/notify-migration.sh +++ b/pgcopydb-helpers/notify-migration.sh @@ -6,17 +6,14 @@ # automatically when a new migration starts. Each unique event fires once only. # # SETUP -# 1. Add to ~/.env: -# export SLACK_WEBHOOK_URL='https://hooks.slack.com/services/...' -# -# 2. Test the webhook: +# 1. Test the webhook: # ~/notify-migration.sh --test # -# 3. Install the cron job (default 2 min interval): +# 2. Install the cron job (default 2 min interval): # ~/notify-migration.sh --setup # ~/notify-migration.sh --setup --interval 5 # -# 4. Remove the cron job when done: +# 3. 
Remove the cron job when done: # ~/notify-migration.sh --uninstall # # ALERTS FIRED @@ -61,7 +58,7 @@ done set +a set -u -SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}" +SLACK_WEBHOOK_URL='https://hooks.slack.com/services/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' # ── Parse PlanetScale branch ID ──────────────────────────────────── # Username format in connection string: pscale_api_xxx.BRANCH_ID From 91e26e75ce2ed084d123e7754abf39278e8abdff Mon Sep 17 00:00:00 2001 From: DmitriiAn Date: Thu, 30 Apr 2026 09:46:10 +0200 Subject: [PATCH 7/9] zapier integration in notify-migration.sh --- pgcopydb-helpers/notify-migration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgcopydb-helpers/notify-migration.sh b/pgcopydb-helpers/notify-migration.sh index 194997c..d8ba7b7 100755 --- a/pgcopydb-helpers/notify-migration.sh +++ b/pgcopydb-helpers/notify-migration.sh @@ -58,7 +58,7 @@ done set +a set -u -SLACK_WEBHOOK_URL='https://hooks.slack.com/services/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' +SLACK_WEBHOOK_URL='https://hooks.zapier.com/hooks/catch/27424474/uvzt3ci/' # ── Parse PlanetScale branch ID ──────────────────────────────────── # Username format in connection string: pscale_api_xxx.BRANCH_ID From 611b8229ef3a1ff164d3fd3bb4baef58415a515c Mon Sep 17 00:00:00 2001 From: DmitriiAn Date: Tue, 5 May 2026 21:06:04 +0200 Subject: [PATCH 8/9] rename notify-migration.sh to slack-migration-alerts.sh; move SLACK_WEBHOOK_URL to .env; decouple monitoring from start-migration-screen.sh --- pgcopydb-helpers/AGENTS.md | 24 ++++--- pgcopydb-helpers/README.md | 12 +++- ...migration.sh => slack-migration-alerts.sh} | 70 ++++++++++--------- pgcopydb-helpers/start-migration-screen.sh | 44 +----------- 4 files changed, 60 insertions(+), 90 deletions(-) rename pgcopydb-helpers/{notify-migration.sh => slack-migration-alerts.sh} (85%) diff --git a/pgcopydb-helpers/AGENTS.md b/pgcopydb-helpers/AGENTS.md index c81557b..4c89801 100644 --- a/pgcopydb-helpers/AGENTS.md 
+++ b/pgcopydb-helpers/AGENTS.md @@ -11,6 +11,7 @@ All scripts read connection strings from `~/.env`: ```bash export PGCOPYDB_SOURCE_PGURI='postgresql://user:pass@source-host:5432/dbname' export PGCOPYDB_TARGET_PGURI='postgresql://user:pass@target-host:5432/dbname' +export SLACK_WEBHOOK_URL='https://hooks.slack.com/services/...' # optional, for Slack alerts ``` ## Script Reference @@ -172,11 +173,10 @@ Starts a full `pgcopydb clone --follow` migration. Creates a new timestamped dir #### `start-migration-screen.sh` -Wrapper that runs `run-migration.sh` inside a detached `screen` session named "migration". Kills any existing migration screen first. Automatically installs the Slack monitoring cron job via `notify-migration.sh --setup` unless `--no-monitor` is passed. +Wrapper that runs `run-migration.sh` inside a detached `screen` session named "migration". Kills any existing migration screen first. ```bash -~/start-migration-screen.sh # start migration + enable Slack monitoring -~/start-migration-screen.sh --no-monitor # start migration without monitoring +~/start-migration-screen.sh ``` **When to use:** Always use this instead of running `run-migration.sh` directly. Screen prevents the migration from dying if your SSH session disconnects. @@ -190,14 +190,15 @@ Wrapper that runs `run-migration.sh` inside a detached `screen` session named "m ### Monitoring -#### `notify-migration.sh` +#### `slack-migration-alerts.sh` -Sends Slack alerts for migration events. Runs from cron (default every 2 min) and fires each alert exactly once. Started automatically by `start-migration-screen.sh`. +Sends Slack alerts for migration events. Runs from cron (default every 2 min) and fires each alert exactly once. Requires `SLACK_WEBHOOK_URL` in `~/.env`. 
```bash -~/notify-migration.sh --test # send a test message to verify the webhook -~/notify-migration.sh --uninstall # remove the cron job -~/notify-migration.sh --setup --interval N # reinstall with a different interval (1-59 min) +~/slack-migration-alerts.sh --test # send a test message to verify the webhook +~/slack-migration-alerts.sh --setup # install cron job (default every 2 min) +~/slack-migration-alerts.sh --setup --interval N # custom interval (1-59 min) +~/slack-migration-alerts.sh --uninstall # remove the cron job ``` **Alerts fired:** @@ -207,11 +208,11 @@ Sends Slack alerts for migration events. Runs from cron (default every 2 min) an - Migration completed successfully (fires once) - Migration failed with non-zero exit code (fires once) -All alerts include the PlanetScale branch ID (parsed from `PGCOPYDB_TARGET_PGURI`) and migration progress context (tables, data GB, runtime). +All alerts include the PlanetScale branch ID (parsed from `PGCOPYDB_TARGET_PGURI`) and migration progress context (tables, data GB, runtime). Alert messages do not include raw log content. **State file:** `$MIGRATION_DIR/.notify-state` — resets automatically when a new migration directory is created. -**Webhook:** hardcoded in the script +**Webhook:** set `SLACK_WEBHOOK_URL` in `~/.env` --- @@ -450,9 +451,10 @@ sqlite3 ~/migration_*/schema/filter.db \ - Run ~/fix-replica-identity.sh if using CDC (--follow) 2. MIGRATE - - Run ~/start-migration-screen.sh to begin (Slack monitoring starts automatically) + - Run ~/start-migration-screen.sh to begin - Monitor with ~/check-migration-status.sh (initial copy phase) - Monitor with ~/check-cdc-status.sh (CDC catch-up phase) + - Run ~/slack-migration-alerts.sh --setup to enable Slack alerts (optional) 3. 
CUTOVER (when CDC is caught up) - Stop writes to source diff --git a/pgcopydb-helpers/README.md b/pgcopydb-helpers/README.md index 6f715ac..6581dfb 100644 --- a/pgcopydb-helpers/README.md +++ b/pgcopydb-helpers/README.md @@ -105,6 +105,7 @@ SELECT pg_reload_conf(); ```bash export PGCOPYDB_SOURCE_PGURI='postgresql://user:pass@source-host:5432/dbname' export PGCOPYDB_TARGET_PGURI='postgresql://user:pass@target-host:5432/dbname' + export SLACK_WEBHOOK_URL='https://hooks.slack.com/services/...' # optional, for Slack alerts ``` 3. **Customize `~/filters.ini`** to exclude schemas, tables, and extensions that should not be migrated. See [Filter Configuration](#filter-configuration) below. @@ -172,6 +173,13 @@ Once the initial copy completes and CDC is streaming, check replication progress When `check-cdc-status.sh` reports **"CDC IS CAUGHT UP"** (apply backlog < 100 MB), you are ready for cutover. +To receive Slack alerts for migration events (errors, initial copy completion, success/failure), set up the monitor separately. Requires `SLACK_WEBHOOK_URL` in `~/.env`: + +```bash +~/slack-migration-alerts.sh --test # verify webhook before installing +~/slack-migration-alerts.sh --setup # install cron job (default every 2 min) +``` + ### 4. Cut Over 1. **Stop writes** to the source database (maintenance mode, read-only, connection drain, etc.). @@ -395,10 +403,10 @@ sqlite3 ~/migration_*/schema/filter.db "SELECT COUNT(*) FROM s_depend;" | `fix-replica-identity.sh` | Prepare | Set REPLICA IDENTITY FULL on tables without primary keys | | `filters.ini` | Prepare | pgcopydb filter configuration | | `run-migration.sh` | Migrate | Start a pgcopydb clone --follow migration | -| `start-migration-screen.sh` | Migrate | Run the migration in a detached screen session. Automatically starts Slack monitoring unless `--no-monitor` is passed. | +| `start-migration-screen.sh` | Migrate | Run the migration in a detached screen session. 
| | `check-migration-status.sh` | Monitor | Migration progress dashboard | | `check-cdc-status.sh` | Monitor | CDC replication progress and health | -| `notify-migration.sh` | Monitor | Slack alerts via cron (auto-started by `start-migration-screen.sh`)| +| `slack-migration-alerts.sh` | Monitor | Slack alerts for migration failures, errors, and completion via cron (install with `--setup`) | | `resume-migration.sh` | Recovery | Resume an interrupted migration (full clone + CDC) | | `resume-cdc.sh` | Recovery | Resume only the CDC phase (skips clone) | | `target-clean.sh` | Recovery | Wipe target database for re-migration (prompts for confirmation) | diff --git a/pgcopydb-helpers/notify-migration.sh b/pgcopydb-helpers/slack-migration-alerts.sh similarity index 85% rename from pgcopydb-helpers/notify-migration.sh rename to pgcopydb-helpers/slack-migration-alerts.sh index d8ba7b7..a489df3 100755 --- a/pgcopydb-helpers/notify-migration.sh +++ b/pgcopydb-helpers/slack-migration-alerts.sh @@ -1,20 +1,23 @@ #!/bin/bash # -# notify-migration.sh — Slack alerts for pgcopydb migration failures and errors +# slack-migration-alerts.sh — Slack alerts for pgcopydb migration failures and errors # # Runs from cron. State is stored inside the migration directory so it resets # automatically when a new migration starts. Each unique event fires once only. # # SETUP +# 0. Add SLACK_WEBHOOK_URL to ~/.env: +# export SLACK_WEBHOOK_URL='https://hooks.slack.com/services/...' +# # 1. Test the webhook: -# ~/notify-migration.sh --test +# ~/slack-migration-alerts.sh --test # # 2. Install the cron job (default 2 min interval): -# ~/notify-migration.sh --setup -# ~/notify-migration.sh --setup --interval 5 +# ~/slack-migration-alerts.sh --setup +# ~/slack-migration-alerts.sh --setup --interval 5 # # 3.
Remove the cron job when done: -# ~/notify-migration.sh --uninstall +# ~/slack-migration-alerts.sh --uninstall # # ALERTS FIRED # - pgcopydb process stopped unexpectedly (fires once per transition) @@ -58,8 +61,6 @@ done set +a set -u -SLACK_WEBHOOK_URL='https://hooks.zapier.com/hooks/catch/27424474/uvzt3ci/' - # ── Parse PlanetScale branch ID ──────────────────────────────────── # Username format in connection string: pscale_api_xxx.BRANCH_ID _u="${PGCOPYDB_TARGET_PGURI:-}"; _u="${_u#*://}"; _u="${_u%%@*}"; _u="${_u%%:*}" @@ -83,7 +84,8 @@ slack_send() { if [ "$http_code" = "200" ]; then echo "$(date '+%Y-%m-%d %H:%M:%S') SENT: $text" else - echo "$(date '+%Y-%m-%d %H:%M:%S') WARN: Slack returned HTTP $http_code" + echo "$(date '+%Y-%m-%d %H:%M:%S') ERROR: Slack returned HTTP $http_code" >&2 + return 1 fi } @@ -94,47 +96,49 @@ db_query() { # ── --test ───────────────────────────────────────────────────────── if [ "$ACTION" = "test" ]; then - if [ -z "$SLACK_WEBHOOK_URL" ]; then - echo "ERROR: SLACK_WEBHOOK_URL not set in ~/.env" + if [ -z "${SLACK_WEBHOOK_URL:-}" ]; then + echo "ERROR: SLACK_WEBHOOK_URL is not set in ~/.env" exit 1 fi HOST=$(hostname -s 2>/dev/null || hostname) - slack_send ":white_check_mark: Migration Monitor test from *${HOST}* — webhook working!" + slack_send ":white_check_mark: Migration Monitor test from *${HOST}* — webhook working!" || exit 1 exit 0 fi # ── --setup ──────────────────────────────────────────────────────── if [ "$ACTION" = "setup" ]; then - if [ -z "$SLACK_WEBHOOK_URL" ]; then - echo "ERROR: Set SLACK_WEBHOOK_URL in ~/.env before running --setup" + if [ -z "${SLACK_WEBHOOK_URL:-}" ]; then + echo "ERROR: SLACK_WEBHOOK_URL is not set in ~/.env" exit 1 fi if ! [[ "$INTERVAL" =~ ^[1-9][0-9]?$ ]] || [ "$INTERVAL" -gt 59 ]; then echo "ERROR: --interval must be 1-59 (got: $INTERVAL)" exit 1 fi - SCRIPT="$HOME/notify-migration.sh" + echo "Verifying webhook..." 
+ slack_send ":rocket: Migration monitor started for branch *${PS_BRANCH_ID:-unknown}* — Slack notifications active" || { + echo "ERROR: Webhook verification failed — cron job not installed" >&2 + exit 1 + } + SCRIPT="$HOME/slack-migration-alerts.sh" CRON_LINE="*/${INTERVAL} * * * * ${SCRIPT} > /dev/null 2>&1" - ( crontab -l 2>/dev/null | grep -v "notify-migration.sh" || true + ( crontab -l 2>/dev/null | grep -v "notify-migration.sh" | grep -v "slack-migration-alerts.sh" || true echo "$CRON_LINE" ) | crontab - echo "Cron job installed (every ${INTERVAL} min):" echo " $CRON_LINE" - echo "" - echo "Sending setup confirmation to Slack..." - slack_send ":rocket: Migration monitor started for branch *${PS_BRANCH_ID:-unknown}* — Slack notifications active" exit 0 fi # ── --uninstall ──────────────────────────────────────────────────── if [ "$ACTION" = "uninstall" ]; then - ( crontab -l 2>/dev/null | grep -v "notify-migration.sh" || true ) | crontab - + ( crontab -l 2>/dev/null | grep -v "notify-migration.sh" | grep -v "slack-migration-alerts.sh" || true ) | crontab - echo "Cron job removed." 
exit 0 fi # ── Guard ────────────────────────────────────────────────────────── -if [ -z "$SLACK_WEBHOOK_URL" ]; then +if [ -z "${SLACK_WEBHOOK_URL:-}" ]; then echo "$(date '+%Y-%m-%d %H:%M:%S') SKIP: SLACK_WEBHOOK_URL not set in ~/.env" exit 0 fi @@ -225,8 +229,9 @@ if [ "$INITIAL_COPY_DONE" = true ] && [ "$LAST_INITIAL_COPY_NOTIFIED" = "false" msg=":large_blue_circle: *Initial copy completed — CDC phase starting*" msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Runtime: ${RUNTIME} | Data: ${GB} GB" msg+=$'\n'"Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Indexes: ${INDEXES_DONE}/${INDEXES_TOTAL} | Constraints: ${CONSTRAINTS_DONE}/${CONSTRAINTS_TOTAL}" - slack_send "$msg" - NOTIFIED_INITIAL_COPY="true" + if slack_send "$msg"; then + NOTIFIED_INITIAL_COPY="true" + fi elif [ "$CURRENT_STATUS" = "succeeded" ] && [ "$LAST_COMPLETION_NOTIFIED" = "false" ]; then DIR_EPOCH=$(stat -c %Y "$MIGRATION_DIR" 2>/dev/null || date +%s) @@ -235,33 +240,30 @@ elif [ "$CURRENT_STATUS" = "succeeded" ] && [ "$LAST_COMPLETION_NOTIFIED" = "fal msg=":white_check_mark: *Migration completed successfully*" msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Runtime: ${RUNTIME} | Data: ${GB} GB" msg+=$'\n'"Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Dir: ${MIGRATION_DIR}" - slack_send "$msg" - NOTIFIED_COMPLETION="true" + if slack_send "$msg"; then + NOTIFIED_COMPLETION="true" + fi elif [ "$CURRENT_STATUS" = "failed" ] && [ "$LAST_COMPLETION_NOTIFIED" = "false" ]; then - LAST_ERR=$(grep " ERROR " "$LOG" 2>/dev/null | tail -1 | cut -c1-120 || true) msg=":red_circle: *Migration FAILED*" msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Data: ${GB} GB" - [ -n "$LAST_ERR" ] && msg+=$'\n'"Last error: ${LAST_ERR}" + msg+=$'\n'"Check migration.log for error details" msg+=$'\n'"Dir: ${MIGRATION_DIR}" - slack_send "$msg" - NOTIFIED_COMPLETION="true" + if slack_send "$msg"; then + NOTIFIED_COMPLETION="true" + fi elif [ "$CURRENT_STATUS" = "stopped" ] && [ "$LAST_STATUS" = "running" ]; then - 
LAST_ERR=$(grep " ERROR " "$LOG" 2>/dev/null | tail -1 | cut -c1-120 || true) msg=":warning: *Migration process stopped unexpectedly*" msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Data: ${GB} GB" - [ -n "$LAST_ERR" ] && msg+=$'\n'"Last error: ${LAST_ERR}" msg+=$'\n'"Run: tail -50 ${LOG}" - slack_send "$msg" + slack_send "$msg" || true elif [ "$CURRENT_ERROR_COUNT" -gt "$LAST_ERROR_COUNT" ]; then NEW_COUNT=$(( CURRENT_ERROR_COUNT - LAST_ERROR_COUNT )) - LAST_ERR=$(grep " ERROR " "$LOG" 2>/dev/null | tail -1 | cut -c1-120 || true) msg=":warning: *${NEW_COUNT} new error(s) in migration log*" msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Total errors: ${CURRENT_ERROR_COUNT} | Tables: ${TABLES_DONE}/${TABLES_TOTAL}" - [ -n "$LAST_ERR" ] && msg+=$'\n'"Last: ${LAST_ERR}" - slack_send "$msg" + slack_send "$msg" || true fi # ── Save state ───────────────────────────────────────────────────── diff --git a/pgcopydb-helpers/start-migration-screen.sh b/pgcopydb-helpers/start-migration-screen.sh index e41c468..d5ff438 100644 --- a/pgcopydb-helpers/start-migration-screen.sh +++ b/pgcopydb-helpers/start-migration-screen.sh @@ -1,34 +1,13 @@ #!/bin/bash # -# Usage: ~/start-migration-screen.sh [--no-monitor] +# Usage: ~/start-migration-screen.sh # # Starts run-migration.sh inside a detached screen session named "migration". # Kills any existing migration screen first. Use "screen -r migration" to # attach and Ctrl-A D to detach. # -# Slack monitoring via notify-migration.sh is enabled by default (every 2 min). -# Use --no-monitor to skip. 
To change the interval later: -# ~/notify-migration.sh --uninstall -# ~/notify-migration.sh --setup --interval N -# set -eo pipefail -MONITOR=true - -while [ $# -gt 0 ]; do - case "$1" in - --no-monitor) - MONITOR=false - shift - ;; - *) - echo "Unknown argument: $1" >&2 - echo "Usage: $0 [--no-monitor]" >&2 - exit 1 - ;; - esac -done - # Kill any existing migration screen screen -S migration -X quit 2>/dev/null || true @@ -40,25 +19,4 @@ echo "" echo "To watch: screen -r migration" echo "To detach: Ctrl-A then D" echo "To check status: ~/check-migration-status.sh" -echo "" -echo "────────────────────────────────────────────────────────" - -if [ "$MONITOR" = true ]; then - echo "Setting up Slack monitoring (every 2 min)..." - echo "────────────────────────────────────────────────────────" - echo "" - if ~/notify-migration.sh --setup > /dev/null 2>&1; then - echo "Monitoring is active." - echo "To disable: ~/notify-migration.sh --uninstall" - echo "To reconfigure: ~/notify-migration.sh --setup --interval N" - else - echo "WARNING: Monitoring setup failed." 
- echo "Check SLACK_WEBHOOK_URL is set in ~/.env" - echo "To enable manually: ~/notify-migration.sh --setup" - fi -else - echo "Slack monitoring: DISABLED (--no-monitor was passed)" - echo "To enable later: ~/notify-migration.sh --setup [--interval N]" -fi - echo "────────────────────────────────────────────────────────" From 4c801a0c9836af6cfb7dfa60a69c99ccfd233692 Mon Sep 17 00:00:00 2001 From: DmitriiAn Date: Fri, 8 May 2026 12:16:19 +0200 Subject: [PATCH 9/9] fix slack-migration-alerts.sh: slack alerts --- pgcopydb-helpers/slack-migration-alerts.sh | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pgcopydb-helpers/slack-migration-alerts.sh b/pgcopydb-helpers/slack-migration-alerts.sh index a489df3..d19478d 100755 --- a/pgcopydb-helpers/slack-migration-alerts.sh +++ b/pgcopydb-helpers/slack-migration-alerts.sh @@ -226,7 +226,7 @@ if [ "$INITIAL_COPY_DONE" = true ] && [ "$LAST_INITIAL_COPY_NOTIFIED" = "false" DIR_EPOCH=$(stat -c %Y "$MIGRATION_DIR" 2>/dev/null || date +%s) SECS=$(( $(date +%s) - DIR_EPOCH )) RUNTIME=$(printf "%dh %02dm" $(( SECS/3600 )) $(( (SECS%3600)/60 ))) - msg=":large_blue_circle: *Initial copy completed — CDC phase starting*" + msg=":large_green_circle: *Initial copy completed — CDC phase starting*" msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Runtime: ${RUNTIME} | Data: ${GB} GB" msg+=$'\n'"Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Indexes: ${INDEXES_DONE}/${INDEXES_TOTAL} | Constraints: ${CONSTRAINTS_DONE}/${CONSTRAINTS_TOTAL}" if slack_send "$msg"; then @@ -254,14 +254,19 @@ elif [ "$CURRENT_STATUS" = "failed" ] && [ "$LAST_COMPLETION_NOTIFIED" = "false" fi elif [ "$CURRENT_STATUS" = "stopped" ] && [ "$LAST_STATUS" = "running" ]; then - msg=":warning: *Migration process stopped unexpectedly*" + msg=":red_circle: *Migration process stopped unexpectedly*" msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Tables: ${TABLES_DONE}/${TABLES_TOTAL} | Data: ${GB} GB" msg+=$'\n'"Run: tail -50 ${LOG}" slack_send "$msg" || true +fi -elif 
[ "$CURRENT_ERROR_COUNT" -gt "$LAST_ERROR_COUNT" ]; then +if [ "$CURRENT_ERROR_COUNT" -gt "$LAST_ERROR_COUNT" ]; then NEW_COUNT=$(( CURRENT_ERROR_COUNT - LAST_ERROR_COUNT )) - msg=":warning: *${NEW_COUNT} new error(s) in migration log*" + if [ "$CURRENT_STATUS" = "stopped" ]; then + msg=":red_circle: *${NEW_COUNT} new error(s) in migration log — process is not running*" + else + msg=":warning: *${NEW_COUNT} new error(s) in migration log*" + fi msg+=$'\n'"Branch: *${PS_BRANCH_ID}* | Total errors: ${CURRENT_ERROR_COUNT} | Tables: ${TABLES_DONE}/${TABLES_TOTAL}" slack_send "$msg" || true fi