From 6a4ecdea1cadc98b25aee724fac9100427427172 Mon Sep 17 00:00:00 2001 From: im0x0ing Date: Tue, 16 Jun 2026 17:04:29 +0800 Subject: [PATCH] Fix PostgreSQL standby password repair requeue Read standby passwords only from replica pgpass files and stop falling back to pod environment variables. Exclude the leader pod as a password source so the repair uses replicas as the source of truth when correcting the leader-side standby role password. Treat missing pgpass files and pgpass files without a standby entry as unavailable password sources. Record StandbyPasswordUnavailable without requeueing for those non-transient states, while keeping delayed requeue behavior for real exec/runtime failures. Allow the repair transformer to run while the PostgreSQL component is Running, Updating, or Abnormal, and skip components that have no replica password source. Add focused tests for replica-only password sourcing, unavailable pgpass handling, non-running eligible phases, single-replica skips, and runtime failures that should still requeue. --- ...nent_postgresql_standby_password_repair.go | 79 ++++-- ...postgresql_standby_password_repair_test.go | 232 ++++++++++++++---- 2 files changed, 235 insertions(+), 76 deletions(-) diff --git a/controllers/apps/transformer_component_postgresql_standby_password_repair.go b/controllers/apps/transformer_component_postgresql_standby_password_repair.go index 995c7d624cd..00866f8ce4b 100644 --- a/controllers/apps/transformer_component_postgresql_standby_password_repair.go +++ b/controllers/apps/transformer_component_postgresql_standby_password_repair.go @@ -53,12 +53,12 @@ const ( standbyPasswordRepairReasonSucceeded = "Succeeded" standbyPasswordRepairReasonFailed = "Failed" standbyPasswordRepairReasonInconsistent = "StandbyPasswordInconsistent" + standbyPasswordRepairReasonUnavailable = "StandbyPasswordUnavailable" standbyPasswordRepairReasonSkipped = "Skipped" standbyPasswordRepairRequeueInterval = time.Minute standbyPgpassPath = "/run/postgresql/pgpass" standbyUserName = "standby" - readStandbyPasswordEnvCommand = `printf "%s" "${PGPASSWORD_STANDBY:-}"` readPostgreSQLModeEnvCommand = `printf "%s" "${PG_MODE:-}"` ) @@ -103,6 +103,7 @@ printf "%s\n" "$matches" ` var errStandbyEntryNotFound = errors.New("standby entry not found") +var errStandbyPasswordUnavailable = errors.New("standby password unavailable") // componentPostgreSQLStandbyPasswordRepairTransformer repairs drift between // the standby password used by pods and the password stored in PostgreSQL. @@ -124,7 +125,7 @@ func (t *componentPostgreSQLStandbyPasswordRepairTransformer) Transform( if !isPostgreSQLComponent(transCtx) { return nil } - if transCtx.Component.Status.Phase != appsv1alpha1.RunningClusterCompPhase { + if !shouldRunStandbyPasswordRepair(transCtx.Component.Status.Phase) { return nil } @@ -172,12 +173,27 @@ func (t *componentPostgreSQLStandbyPasswordRepairTransformer) Transform( return intctrlutil.NewDelayedRequeueError(standbyPasswordRepairRequeueInterval, err.Error()) } - expectedPassword, err := consistentStandbyPassword(transCtx.Context, runner, pods) + passwordSourcePods := standbyPasswordSourcePods(pods, leaderPod.Name) + if len(passwordSourcePods) == 0 { + if transCtx.SynthesizeComponent.Replicas <= 1 { + t.markRepairSkipped(transCtx) + return nil + } + err := fmt.Errorf("postgresql standby password repair: no running replica pods with pod ip found") + t.markRepairFailed(transCtx, standbyPasswordRepairReasonUnavailable, err) + return nil + } + + expectedPassword, err := consistentStandbyPassword(transCtx.Context, runner, passwordSourcePods) if err != nil { if isInconsistentStandbyPasswordError(err) { t.markRepairFailed(transCtx, standbyPasswordRepairReasonInconsistent, err) return nil } + if errors.Is(err, errStandbyPasswordUnavailable) { + t.markRepairFailed(transCtx, standbyPasswordRepairReasonUnavailable, err) + return nil + } t.markRepairFailed(transCtx, standbyPasswordRepairReasonFailed, err) return intctrlutil.NewDelayedRequeueError(standbyPasswordRepairRequeueInterval, err.Error()) } @@ -196,6 +212,17 @@ func (t *componentPostgreSQLStandbyPasswordRepairTransformer) Transform( return nil } +func shouldRunStandbyPasswordRepair(phase appsv1alpha1.ClusterComponentPhase) bool { + switch phase { + case appsv1alpha1.RunningClusterCompPhase, + appsv1alpha1.UpdatingClusterCompPhase, + appsv1alpha1.AbnormalClusterCompPhase: + return true + default: + return false + } +} + func (t *componentPostgreSQLStandbyPasswordRepairTransformer) runningPods( transCtx *componentTransformContext, ) ([]*corev1.Pod, error) { @@ -238,6 +265,17 @@ func (t *componentPostgreSQLStandbyPasswordRepairTransformer) leaderPod( return pod, nil } +func standbyPasswordSourcePods(pods []*corev1.Pod, leaderPodName string) []*corev1.Pod { + sourcePods := make([]*corev1.Pod, 0, len(pods)) + for _, pod := range pods { + if pod == nil || pod.Name == leaderPodName { + continue + } + sourcePods = append(sourcePods, pod) + } + return sourcePods +} + func (t *componentPostgreSQLStandbyPasswordRepairTransformer) markRepairSucceeded( transCtx *componentTransformContext, repaired bool, @@ -265,7 +303,7 @@ func (t *componentPostgreSQLStandbyPasswordRepairTransformer) markRepairSkipped( ObservedGeneration: transCtx.Component.Generation, LastTransitionTime: metav1.Now(), Reason: standbyPasswordRepairReasonSkipped, - Message: "PostgreSQL standby password repair is skipped for standby cluster mode", + Message: "PostgreSQL standby password repair is skipped", }) } @@ -333,6 +371,15 @@ func consistentStandbyPassword(ctx context.Context, runner podExecRunner, pods [ func standbyPasswordFromPod(ctx context.Context, runner podExecRunner, pod *corev1.Pod) (string, error) { stdout, stderr, err := runner.Exec(ctx, pod, []string{"cat", standbyPgpassPath}, "") if err != nil { + if strings.Contains(strings.ToLower(stderr), "no such file") { + return "", fmt.Errorf( + "postgresql standby password repair: standby password unavailable in pod %q: read pgpass failed: %v: %s: %w", + pod.Name, + err, + strings.TrimSpace(stderr), + errStandbyPasswordUnavailable, + ) + } return "", fmt.Errorf( "postgresql standby password repair: read pgpass from pod %q: %w: %s", pod.Name, @@ -340,33 +387,19 @@ func standbyPasswordFromPod(ctx context.Context, runner podExecRunner, pod *core strings.TrimSpace(stderr), ) } + password, err := parseStandbyPasswordFromPgpass(stdout) if err == nil { return password, nil } - if !errors.Is(err, errStandbyEntryNotFound) { - return "", fmt.Errorf("postgresql standby password repair: parse pgpass from pod %q: %w", pod.Name, err) - } - - // Some PostgreSQL leaders keep only the superuser entry in pgpass, while the - // replication password is still exposed through the pod environment. - stdout, stderr, err = runner.Exec(ctx, pod, []string{"sh", "-c", readStandbyPasswordEnvCommand}, "") - if err != nil { - return "", fmt.Errorf( - "postgresql standby password repair: read standby env from pod %q: %w: %s", - pod.Name, - err, - strings.TrimSpace(stderr), - ) - } - password = strings.TrimRight(stdout, "\r\n") - if password == "" { + if errors.Is(err, errStandbyEntryNotFound) { return "", fmt.Errorf( - "postgresql standby password repair: standby password not found in pod %q pgpass or env", + "postgresql standby password repair: standby password unavailable in pod %q: standby password not found in pgpass: %w", pod.Name, + errStandbyPasswordUnavailable, ) } - return password, nil + return "", fmt.Errorf("postgresql standby password repair: parse pgpass from pod %q: %w", pod.Name, err) } func ensureLeaderStandbyPassword( diff --git a/controllers/apps/transformer_component_postgresql_standby_password_repair_test.go b/controllers/apps/transformer_component_postgresql_standby_password_repair_test.go index 78c6c6dfdd3..1cc8d18688e 100644 --- a/controllers/apps/transformer_component_postgresql_standby_password_repair_test.go +++ b/controllers/apps/transformer_component_postgresql_standby_password_repair_test.go @@ -21,6 +21,7 @@ package apps import ( "context" + "fmt" "strings" "testing" @@ -38,6 +39,7 @@ import ( ctrlcomp "github.com/apecloud/kubeblocks/pkg/controller/component" "github.com/apecloud/kubeblocks/pkg/controller/graph" "github.com/apecloud/kubeblocks/pkg/controller/model" + intctrlutil "github.com/apecloud/kubeblocks/pkg/controllerutil" ) func TestParseStandbyPasswordFromPgpass(t *testing.T) { @@ -117,34 +119,6 @@ func TestConsistentStandbyPassword(t *testing.T) { }, runner.commands) } -func TestConsistentStandbyPasswordFallsBackToEnv(t *testing.T) { - t.Parallel() - - pods := []*corev1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "postgresql-0"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "postgresql-1"}}, - } - runner := &fakePodExecRunner{ - pgpass: map[string]string{ - "postgresql-0": "localhost:5432:*:postgres:secret", - "postgresql-1": "localhost:5432:*:standby:secret", - }, - standbyEnv: map[string]string{ - "postgresql-0": "secret", - }, - } - - password, err := consistentStandbyPassword(context.Background(), runner, pods) - - require.NoError(t, err) - require.Equal(t, "secret", password) - require.Equal(t, [][]string{ - {"cat", standbyPgpassPath}, - {"sh", "-c", readStandbyPasswordEnvCommand}, - {"cat", standbyPgpassPath}, - }, runner.commands) -} - func TestConsistentStandbyPasswordInconsistent(t *testing.T) { t.Parallel() @@ -238,11 +212,8 @@ func TestComponentPostgreSQLStandbyPasswordRepairTransformer(t *testing.T) { transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) runner := &fakePodExecRunner{ pgpass: map[string]string{ - leaderPodName: "localhost:5432:*:postgres:secret", replicaPod: "localhost:5432:*:standby:secret", - }, - standbyEnv: map[string]string{ - leaderPodName: "secret", + leaderPodName: "localhost:5432:*:standby:leader-secret", }, ensureResult: "f", } @@ -258,6 +229,158 @@ func TestComponentPostgreSQLStandbyPasswordRepairTransformer(t *testing.T) { require.Equal(t, metav1.ConditionTrue, cond.Status) } +func TestComponentPostgreSQLStandbyPasswordRepairTransformerRunsWhileNotRunning(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + phase appsv1alpha1.ClusterComponentPhase + }{ + { + name: "updating", + phase: appsv1alpha1.UpdatingClusterCompPhase, + }, + { + name: "abnormal", + phase: appsv1alpha1.AbnormalClusterCompPhase, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + const ( + leaderPodName = "test-postgresql-1" + replicaPod = "test-postgresql-0" + ) + + transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) + transCtx.Component.Status.Phase = tc.phase + runner := &fakePodExecRunner{ + pgpass: map[string]string{ + replicaPod: "localhost:5432:*:standby:secret", + leaderPodName: "localhost:5432:*:postgres:ignored", + }, + ensureResult: "f", + } + + err := (&componentPostgreSQLStandbyPasswordRepairTransformer{ + execRunner: runner, + }).Transform(transCtx, graph.NewDAG()) + + require.NoError(t, err) + require.Equal(t, leaderPodName, runner.ensurePod) + require.Equal(t, "secret", runner.stdin) + }) + } +} + +func TestComponentPostgreSQLStandbyPasswordRepairTransformerUnavailableDoesNotRequeue(t *testing.T) { + t.Parallel() + + const ( + leaderPodName = "test-postgresql-1" + replicaPod = "test-postgresql-0" + ) + + testCases := []struct { + name string + runner *fakePodExecRunner + }{ + { + name: "missing pgpass", + runner: &fakePodExecRunner{ + missingPgpass: map[string]bool{ + replicaPod: true, + }, + }, + }, + { + name: "standby entry missing", + runner: &fakePodExecRunner{ + pgpass: map[string]string{ + replicaPod: "localhost:5432:*:postgres:secret", + }, + }, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) + err := (&componentPostgreSQLStandbyPasswordRepairTransformer{ + execRunner: tc.runner, + }).Transform(transCtx, graph.NewDAG()) + + require.NoError(t, err) + require.Empty(t, tc.runner.ensurePod) + cond := meta.FindStatusCondition(transCtx.Component.Status.Conditions, standbyPasswordRepairConditionType) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionFalse, cond.Status) + require.Equal(t, standbyPasswordRepairReasonUnavailable, cond.Reason) + }) + } +} + +func TestComponentPostgreSQLStandbyPasswordRepairTransformerSkipsSingleReplica(t *testing.T) { + t.Parallel() + + const leaderPodName = "test-postgresql-0" + + transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName) + runner := &fakePodExecRunner{ + pgpass: map[string]string{ + leaderPodName: "localhost:5432:*:standby:secret", + }, + } + + err := (&componentPostgreSQLStandbyPasswordRepairTransformer{ + execRunner: runner, + }).Transform(transCtx, graph.NewDAG()) + + require.NoError(t, err) + require.Empty(t, runner.ensurePod) + cond := meta.FindStatusCondition(transCtx.Component.Status.Conditions, standbyPasswordRepairConditionType) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionTrue, cond.Status) + require.Equal(t, standbyPasswordRepairReasonSkipped, cond.Reason) +} + +func TestComponentPostgreSQLStandbyPasswordRepairTransformerPgpassExecFailureRequeues(t *testing.T) { + t.Parallel() + + const ( + leaderPodName = "test-postgresql-1" + replicaPod = "test-postgresql-0" + ) + + transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) + runner := &fakePodExecRunner{ + pgpassError: map[string]error{ + replicaPod: fmt.Errorf("exec failed"), + }, + pgpassStderr: map[string]string{ + replicaPod: "container not found", + }, + } + + err := (&componentPostgreSQLStandbyPasswordRepairTransformer{ + execRunner: runner, + }).Transform(transCtx, graph.NewDAG()) + + require.Error(t, err) + require.True(t, intctrlutil.IsDelayedRequeueError(err)) + cond := meta.FindStatusCondition(transCtx.Component.Status.Conditions, standbyPasswordRepairConditionType) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionFalse, cond.Status) + require.Equal(t, standbyPasswordRepairReasonFailed, cond.Reason) +} + func TestComponentPostgreSQLStandbyPasswordRepairConditionUsesStatusVertex(t *testing.T) { t.Parallel() @@ -269,11 +392,8 @@ func TestComponentPostgreSQLStandbyPasswordRepairConditionUsesStatusVertex(t *te transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) runner := &fakePodExecRunner{ pgpass: map[string]string{ - leaderPodName: "localhost:5432:*:postgres:secret", replicaPod: "localhost:5432:*:standby:secret", - }, - standbyEnv: map[string]string{ - leaderPodName: "secret", + leaderPodName: "localhost:5432:*:postgres:ignored", }, ensureResult: "f", } @@ -334,14 +454,16 @@ func TestComponentPostgreSQLStandbyPasswordRepairTransformerInconsistent(t *test const ( leaderPodName = "test-postgresql-1" - replicaPod = "test-postgresql-0" + replicaPodA = "test-postgresql-0" + replicaPodB = "test-postgresql-2" ) - transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) + transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPodA, replicaPodB) runner := &fakePodExecRunner{ pgpass: map[string]string{ - leaderPodName: "localhost:5432:*:standby:secret-a", - replicaPod: "localhost:5432:*:standby:secret-b", + leaderPodName: "localhost:5432:*:standby:leader-secret", + replicaPodA: "localhost:5432:*:standby:secret-a", + replicaPodB: "localhost:5432:*:standby:secret-b", }, } @@ -403,7 +525,7 @@ func newPostgreSQLStandbyPasswordRepairTestContext( WithScheme(rscheme). WithObjects(objects...). Build() - replicas := int32(len(objects) - 3) + replicas := int32(1 + len(replicaPodNames)) membersStatus := []workloads.MemberStatus{{ PodName: leaderPodName, ReplicaRole: &workloads.ReplicaRole{ @@ -484,14 +606,16 @@ func postgresqlTestPod(namespace, name string, labels map[string]string) *corev1 } type fakePodExecRunner struct { - pgpass map[string]string - pgMode map[string]string - standbyEnv map[string]string - ensureResult string - ensurePod string - stdin string - lastCommand []string - commands [][]string + pgpass map[string]string + pgpassError map[string]error + pgpassStderr map[string]string + pgMode map[string]string + missingPgpass map[string]bool + ensureResult string + ensurePod string + stdin string + lastCommand []string + commands [][]string } func (r *fakePodExecRunner) Exec( @@ -503,12 +627,14 @@ func (r *fakePodExecRunner) Exec( r.commands = append(r.commands, append([]string{}, command...)) r.lastCommand = append([]string{}, command...) if len(command) == 2 && command[0] == "cat" && command[1] == standbyPgpassPath { + if err := r.pgpassError[pod.Name]; err != nil { + return "", r.pgpassStderr[pod.Name], err + } + if r.missingPgpass[pod.Name] { + return "", "cat: /run/postgresql/pgpass: No such file or directory", fmt.Errorf("exit status 1") + } return r.pgpass[pod.Name], "", nil } - if len(command) == 3 && command[0] == "sh" && command[1] == "-c" && - command[2] == readStandbyPasswordEnvCommand { - return r.standbyEnv[pod.Name], "", nil - } if len(command) == 3 && command[0] == "sh" && command[1] == "-c" && command[2] == readPostgreSQLModeEnvCommand { return r.pgMode[pod.Name], "", nil