From 36bcf26753cdce1ecc090029755b88da00bac068 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Sat, 18 Apr 2026 15:14:10 +0000 Subject: [PATCH 1/2] fix(tests): verify PG graceful switchover at PostgreSQL level, not via cluster view MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The post-merge CI run surfaced that the inherited "primary did not change" check is unreliable for PG graceful takeover. Because the demoted primary keeps running until the operator restarts it with standby.signal, orchestrator sees two cluster roots afterward (one per host, each RO=false), and "find RO=false in original cluster" returns the same host both before and after — producing a spurious failure. Replace with direct PostgreSQL-level verification: - pgstandby1 has pg_is_in_recovery()=false (promoted) - pgprimary has default_transaction_read_only=on (demoted) Existing writability and primary_conninfo checks remain. The round-trip section now gates on SWITCHOVER_OK from the direct check and discovers the new cluster name dynamically (pgstandby1 becomes a new cluster root after the first takeover, so the round-trip API call must target its cluster, not the original PG_CLUSTER). The round-trip completion check also uses pg_is_in_recovery() on pgprimary instead of the split-brain cluster view. 
--- tests/functional/test-postgresql.sh | 94 ++++++++++++++++------------- 1 file changed, 52 insertions(+), 42 deletions(-) diff --git a/tests/functional/test-postgresql.sh b/tests/functional/test-postgresql.sh index 560d789d..fbaeca7f 100755 --- a/tests/functional/test-postgresql.sh +++ b/tests/functional/test-postgresql.sh @@ -152,21 +152,28 @@ else curl -s --max-time 10 "$ORC_URL/api/discover/172.30.0.21/5432" > /dev/null 2>&1 sleep 5 - # Verify primary has changed - NEW_PRIMARY=$(curl -s --max-time 10 "$ORC_URL/api/cluster/$PG_CLUSTER" 2>/dev/null | python3 -c " -import json, sys -instances = json.load(sys.stdin) -for inst in instances: - if not inst.get('ReadOnly', True): - print(inst['Key']['Hostname'] + ':' + str(inst['Key']['Port'])) - sys.exit(0) -print('') -" 2>/dev/null || echo "") + # Verify the switchover at the PostgreSQL level, not via orchestrator's + # cluster view. After a PG graceful takeover the demoted primary is still + # running (awaiting an operator-managed restart with standby.signal), so + # orchestrator sees two roots — one per former cluster — and a "find RO=false + # in original cluster" check returns the same host both times. 
+ SWITCHOVER_OK=false + + # pgstandby1 must have been promoted (no longer in recovery) + PROMOTED=$($COMPOSE exec -T pgstandby1 psql -U postgres -tAc "SELECT pg_is_in_recovery();" 2>/dev/null | tr -d '[:space:]') + if [ "$PROMOTED" = "f" ]; then + pass "pgstandby1 has been promoted (pg_is_in_recovery=false)" + SWITCHOVER_OK=true + else + fail "pgstandby1 still in recovery after switchover (got: '$PROMOTED')" + fi - if [ -n "$NEW_PRIMARY" ] && [ "$NEW_PRIMARY" != "$CURRENT_PRIMARY" ]; then - pass "Primary switched from $CURRENT_PRIMARY to $NEW_PRIMARY" + # pgprimary must have been set read-only (default_transaction_read_only=on) + DEMOTED_RO=$($COMPOSE exec -T pgprimary psql -U postgres -tAc "SHOW default_transaction_read_only;" 2>/dev/null | tr -d '[:space:]') + if [ "$DEMOTED_RO" = "on" ]; then + pass "pgprimary has default_transaction_read_only=on" else - fail "Primary did not change: was $CURRENT_PRIMARY, now ${NEW_PRIMARY:-unknown}" + fail "pgprimary default_transaction_read_only=$DEMOTED_RO (expected on)" fi # Verify new primary is actually writable (not just flagged read_only=false) @@ -217,7 +224,7 @@ echo "--- Graceful switchover round-trip (switch back) ---" # actually stream WAL from the new primary. Simulate what a # PostGracefulTakeoverProcesses hook would do. -if [ -n "${NEW_PRIMARY:-}" ] && [ "${NEW_PRIMARY:-}" != "${CURRENT_PRIMARY:-}" ]; then +if [ "${SWITCHOVER_OK:-false}" = "true" ]; then echo "Converting demoted pgprimary into a live standby of pgstandby1..." 
$COMPOSE exec -T pgprimary bash -c 'touch /var/lib/postgresql/data/standby.signal && chown postgres:postgres /var/lib/postgresql/data/standby.signal' || true $COMPOSE restart pgprimary @@ -238,39 +245,51 @@ if [ -n "${NEW_PRIMARY:-}" ] && [ "${NEW_PRIMARY:-}" != "${CURRENT_PRIMARY:-}" ] else pass "pgprimary restarted as a standby" - # Let orchestrator re-discover the flipped topology + # Let orchestrator re-discover — after pgprimary restarts as a standby, + # it joins pgstandby1's cluster ("172.30.0.21:5432"). Poll for that. sleep 5 curl -s --max-time 10 "$ORC_URL/api/discover/172.30.0.20/5432" > /dev/null 2>&1 curl -s --max-time 10 "$ORC_URL/api/discover/172.30.0.21/5432" > /dev/null 2>&1 sleep 8 - # Verify orchestrator sees pgstandby1 as primary and pgprimary as standby - TOPOLOGY_OK=false + NEW_CLUSTER="" for i in $(seq 1 30); do - PRIMARY_HOST=$(curl -s --max-time 10 "$ORC_URL/api/cluster/$PG_CLUSTER" 2>/dev/null | python3 -c " + NEW_CLUSTER=$(curl -s --max-time 10 "$ORC_URL/api/all-instances" 2>/dev/null | python3 -c " import json, sys for inst in json.load(sys.stdin): - if not inst.get('ReadOnly', True): - print(inst['Key']['Hostname']) + if inst['Key']['Hostname'] == '172.30.0.21': + print(inst.get('ClusterName', '')) sys.exit(0) " 2>/dev/null || echo "") - if [ "$PRIMARY_HOST" = "172.30.0.21" ] || [ "$PRIMARY_HOST" = "pgstandby1" ]; then - TOPOLOGY_OK=true + # Verify pgprimary (172.30.0.20) joined the same cluster as pgstandby1 + PRIMARY_CLUSTER=$(curl -s --max-time 10 "$ORC_URL/api/all-instances" 2>/dev/null | python3 -c " +import json, sys +for inst in json.load(sys.stdin): + if inst['Key']['Hostname'] == '172.30.0.20': + print(inst.get('ClusterName', '')) + sys.exit(0) +" 2>/dev/null || echo "") + if [ -n "$NEW_CLUSTER" ] && [ "$NEW_CLUSTER" = "$PRIMARY_CLUSTER" ]; then break fi + # Re-seed periodically + if [ "$((i % 5))" = "0" ]; then + curl -s --max-time 10 "$ORC_URL/api/discover/172.30.0.20/5432" > /dev/null 2>&1 + curl -s --max-time 10 
"$ORC_URL/api/discover/172.30.0.21/5432" > /dev/null 2>&1 + fi sleep 1 done - if [ "$TOPOLOGY_OK" = "true" ]; then - pass "Orchestrator sees pgstandby1 as primary after round-trip setup" + if [ -n "$NEW_CLUSTER" ] && [ "$NEW_CLUSTER" = "$PRIMARY_CLUSTER" ]; then + pass "Orchestrator re-unified topology under new primary (cluster=$NEW_CLUSTER)" else - fail "Orchestrator does not see pgstandby1 as primary (got: ${PRIMARY_HOST:-unknown})" + fail "Topology not re-unified: pgstandby1 cluster=$NEW_CLUSTER pgprimary cluster=$PRIMARY_CLUSTER" fi - # Now switch back: pgstandby1 → pgprimary - if [ "$TOPOLOGY_OK" = "true" ]; then - echo "Executing graceful-master-takeover-auto to switch back..." - BACK_RESULT=$(curl -s --max-time 60 "$ORC_URL/api/graceful-master-takeover-auto/$PG_CLUSTER" 2>/dev/null) + # Now switch back: pgstandby1 → pgprimary, using the NEW cluster name + if [ -n "$NEW_CLUSTER" ] && [ "$NEW_CLUSTER" = "$PRIMARY_CLUSTER" ]; then + echo "Executing graceful-master-takeover-auto on cluster $NEW_CLUSTER..." 
+ BACK_RESULT=$(curl -s --max-time 60 "$ORC_URL/api/graceful-master-takeover-auto/$NEW_CLUSTER" 2>/dev/null) BACK_CODE=$(echo "$BACK_RESULT" | python3 -c "import json,sys; print(json.load(sys.stdin).get('Code','ERROR'))" 2>/dev/null || echo "ERROR") if [ "$BACK_CODE" = "OK" ]; then @@ -280,22 +299,13 @@ for inst in json.load(sys.stdin): fi sleep 10 - curl -s --max-time 10 "$ORC_URL/api/discover/172.30.0.20/5432" > /dev/null 2>&1 - curl -s --max-time 10 "$ORC_URL/api/discover/172.30.0.21/5432" > /dev/null 2>&1 - sleep 5 - - FINAL_PRIMARY=$(curl -s --max-time 10 "$ORC_URL/api/cluster/$PG_CLUSTER" 2>/dev/null | python3 -c " -import json, sys -for inst in json.load(sys.stdin): - if not inst.get('ReadOnly', True): - print(inst['Key']['Hostname']) - sys.exit(0) -" 2>/dev/null || echo "") - if [ "$FINAL_PRIMARY" = "172.30.0.20" ] || [ "$FINAL_PRIMARY" = "pgprimary" ]; then + # Verify pgprimary is now promoted (not in recovery) + BACK_PROMOTED=$($COMPOSE exec -T pgprimary psql -U postgres -tAc "SELECT pg_is_in_recovery();" 2>/dev/null | tr -d '[:space:]') + if [ "$BACK_PROMOTED" = "f" ]; then pass "Round-trip complete: pgprimary is primary again" else - fail "Round-trip incomplete: primary is '$FINAL_PRIMARY' (expected pgprimary)" + fail "Round-trip incomplete: pgprimary pg_is_in_recovery='$BACK_PROMOTED' (expected f)" fi # After round-trip, pgstandby1 is the demoted primary — reactivate From d5e022646cc7fecd1a0df748b3d5641205cf8baf Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Sat, 18 Apr 2026 15:38:34 +0000 Subject: [PATCH 2/2] fix(tests): allow orchestrator user to establish replication connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The round-trip round of the PG functional test hung for 60s on the second graceful-master-takeover-auto: the wait-for-LSN step never completed because the demoted primary's WAL receiver could not authenticate to the new primary. 
Root cause: PostgreSQLReconfigureStandby / PostgreSQLRepositionAsStandby set primary_conninfo using orchestrator's credentials (PostgreSQLTopologyUser / PostgreSQLTopologyPassword), but the test cluster's pg_hba.conf only had "host replication repl" — no entry for the orchestrator user. Streaming replication was silently rejected. Add "host replication orchestrator all md5" so that once the operator restarts the demoted primary with standby.signal, the primary_conninfo that orchestrator set can actually authenticate. This matches the operational prerequisite a real deployment would need to satisfy for the PostGracefulTakeoverProcesses hook to work with only a restart. --- tests/functional/postgres/init-primary.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/functional/postgres/init-primary.sh b/tests/functional/postgres/init-primary.sh index 0f69899c..fd397cb5 100755 --- a/tests/functional/postgres/init-primary.sh +++ b/tests/functional/postgres/init-primary.sh @@ -19,6 +19,9 @@ cat >> "$PGDATA/pg_hba.conf" <