diff --git a/controllers/apps/transformer_component_postgresql_standby_password_repair.go b/controllers/apps/transformer_component_postgresql_standby_password_repair.go index 995c7d624cd..00866f8ce4b 100644 --- a/controllers/apps/transformer_component_postgresql_standby_password_repair.go +++ b/controllers/apps/transformer_component_postgresql_standby_password_repair.go @@ -53,12 +53,12 @@ const ( standbyPasswordRepairReasonSucceeded = "Succeeded" standbyPasswordRepairReasonFailed = "Failed" standbyPasswordRepairReasonInconsistent = "StandbyPasswordInconsistent" + standbyPasswordRepairReasonUnavailable = "StandbyPasswordUnavailable" standbyPasswordRepairReasonSkipped = "Skipped" standbyPasswordRepairRequeueInterval = time.Minute standbyPgpassPath = "/run/postgresql/pgpass" standbyUserName = "standby" - readStandbyPasswordEnvCommand = `printf "%s" "${PGPASSWORD_STANDBY:-}"` readPostgreSQLModeEnvCommand = `printf "%s" "${PG_MODE:-}"` ) @@ -103,6 +103,7 @@ printf "%s\n" "$matches" ` var errStandbyEntryNotFound = errors.New("standby entry not found") +var errStandbyPasswordUnavailable = errors.New("standby password unavailable") // componentPostgreSQLStandbyPasswordRepairTransformer repairs drift between // the standby password used by pods and the password stored in PostgreSQL. @@ -124,7 +125,7 @@ func (t *componentPostgreSQLStandbyPasswordRepairTransformer) Transform( if !isPostgreSQLComponent(transCtx) { return nil } - if transCtx.Component.Status.Phase != appsv1alpha1.RunningClusterCompPhase { + if !shouldRunStandbyPasswordRepair(transCtx.Component.Status.Phase) { return nil } @@ -172,12 +173,27 @@ func (t *componentPostgreSQLStandbyPasswordRepairTransformer) Transform( return intctrlutil.NewDelayedRequeueError(standbyPasswordRepairRequeueInterval, err.Error()) } - expectedPassword, err := consistentStandbyPassword(transCtx.Context, runner, pods) + passwordSourcePods := standbyPasswordSourcePods(pods, leaderPod.Name) + if len(passwordSourcePods) == 0 { + if transCtx.SynthesizeComponent.Replicas <= 1 { + t.markRepairSkipped(transCtx) + return nil + } + err := fmt.Errorf("postgresql standby password repair: no running replica pods with pod ip found") + t.markRepairFailed(transCtx, standbyPasswordRepairReasonUnavailable, err) + return nil + } + + expectedPassword, err := consistentStandbyPassword(transCtx.Context, runner, passwordSourcePods) if err != nil { if isInconsistentStandbyPasswordError(err) { t.markRepairFailed(transCtx, standbyPasswordRepairReasonInconsistent, err) return nil } + if errors.Is(err, errStandbyPasswordUnavailable) { + t.markRepairFailed(transCtx, standbyPasswordRepairReasonUnavailable, err) + return nil + } t.markRepairFailed(transCtx, standbyPasswordRepairReasonFailed, err) return intctrlutil.NewDelayedRequeueError(standbyPasswordRepairRequeueInterval, err.Error()) } @@ -196,6 +212,17 @@ func (t *componentPostgreSQLStandbyPasswordRepairTransformer) Transform( return nil } +func shouldRunStandbyPasswordRepair(phase appsv1alpha1.ClusterComponentPhase) bool { + switch phase { + case appsv1alpha1.RunningClusterCompPhase, + appsv1alpha1.UpdatingClusterCompPhase, + appsv1alpha1.AbnormalClusterCompPhase: + return true + default: + return false + } +} + func (t *componentPostgreSQLStandbyPasswordRepairTransformer) runningPods( transCtx *componentTransformContext, ) ([]*corev1.Pod, error) { @@ -238,6 +265,17 @@ func (t *componentPostgreSQLStandbyPasswordRepairTransformer) leaderPod( return pod, nil } +func standbyPasswordSourcePods(pods []*corev1.Pod, leaderPodName string) []*corev1.Pod { + sourcePods := make([]*corev1.Pod, 0, len(pods)) + for _, pod := range pods { + if pod == nil || pod.Name == leaderPodName { + continue + } + sourcePods = append(sourcePods, pod) + } + return sourcePods +} + func (t *componentPostgreSQLStandbyPasswordRepairTransformer) markRepairSucceeded( transCtx *componentTransformContext, repaired bool, @@ -265,7 +303,7 @@ func (t *componentPostgreSQLStandbyPasswordRepairTransformer) markRepairSkipped( ObservedGeneration: transCtx.Component.Generation, LastTransitionTime: metav1.Now(), Reason: standbyPasswordRepairReasonSkipped, - Message: "PostgreSQL standby password repair is skipped for standby cluster mode", + Message: "PostgreSQL standby password repair is skipped", }) } @@ -333,6 +371,15 @@ func consistentStandbyPassword(ctx context.Context, runner podExecRunner, pods [ func standbyPasswordFromPod(ctx context.Context, runner podExecRunner, pod *corev1.Pod) (string, error) { stdout, stderr, err := runner.Exec(ctx, pod, []string{"cat", standbyPgpassPath}, "") if err != nil { + if strings.Contains(strings.ToLower(stderr), "no such file") { + return "", fmt.Errorf( + "postgresql standby password repair: standby password unavailable in pod %q: read pgpass failed: %v: %s: %w", + pod.Name, + err, + strings.TrimSpace(stderr), + errStandbyPasswordUnavailable, + ) + } return "", fmt.Errorf( "postgresql standby password repair: read pgpass from pod %q: %w: %s", pod.Name, @@ -340,33 +387,19 @@ func standbyPasswordFromPod(ctx context.Context, runner podExecRunner, pod *core strings.TrimSpace(stderr), ) } + password, err := parseStandbyPasswordFromPgpass(stdout) if err == nil { return password, nil } - if !errors.Is(err, errStandbyEntryNotFound) { - return "", fmt.Errorf("postgresql standby password repair: parse pgpass from pod %q: %w", pod.Name, err) - } - - // Some PostgreSQL leaders keep only the superuser entry in pgpass, while the - // replication password is still exposed through the pod environment. - stdout, stderr, err = runner.Exec(ctx, pod, []string{"sh", "-c", readStandbyPasswordEnvCommand}, "") - if err != nil { - return "", fmt.Errorf( - "postgresql standby password repair: read standby env from pod %q: %w: %s", - pod.Name, - err, - strings.TrimSpace(stderr), - ) - } - password = strings.TrimRight(stdout, "\r\n") - if password == "" { + if errors.Is(err, errStandbyEntryNotFound) { return "", fmt.Errorf( - "postgresql standby password repair: standby password not found in pod %q pgpass or env", + "postgresql standby password repair: standby password unavailable in pod %q: standby password not found in pgpass: %w", pod.Name, + errStandbyPasswordUnavailable, ) } - return password, nil + return "", fmt.Errorf("postgresql standby password repair: parse pgpass from pod %q: %w", pod.Name, err) } func ensureLeaderStandbyPassword( diff --git a/controllers/apps/transformer_component_postgresql_standby_password_repair_test.go b/controllers/apps/transformer_component_postgresql_standby_password_repair_test.go index 78c6c6dfdd3..1cc8d18688e 100644 --- a/controllers/apps/transformer_component_postgresql_standby_password_repair_test.go +++ b/controllers/apps/transformer_component_postgresql_standby_password_repair_test.go @@ -21,6 +21,7 @@ package apps import ( "context" + "fmt" "strings" "testing" @@ -38,6 +39,7 @@ import ( ctrlcomp "github.com/apecloud/kubeblocks/pkg/controller/component" "github.com/apecloud/kubeblocks/pkg/controller/graph" "github.com/apecloud/kubeblocks/pkg/controller/model" + intctrlutil "github.com/apecloud/kubeblocks/pkg/controllerutil" ) func TestParseStandbyPasswordFromPgpass(t *testing.T) { @@ -117,34 +119,6 @@ func TestConsistentStandbyPassword(t *testing.T) { }, runner.commands) } -func TestConsistentStandbyPasswordFallsBackToEnv(t *testing.T) { - t.Parallel() - - pods := []*corev1.Pod{ - {ObjectMeta: metav1.ObjectMeta{Name: "postgresql-0"}}, - {ObjectMeta: metav1.ObjectMeta{Name: "postgresql-1"}}, - } - runner := &fakePodExecRunner{ - pgpass: map[string]string{ - "postgresql-0": "localhost:5432:*:postgres:secret", - "postgresql-1": "localhost:5432:*:standby:secret", - }, - standbyEnv: map[string]string{ - "postgresql-0": "secret", - }, - } - - password, err := consistentStandbyPassword(context.Background(), runner, pods) - - require.NoError(t, err) - require.Equal(t, "secret", password) - require.Equal(t, [][]string{ - {"cat", standbyPgpassPath}, - {"sh", "-c", readStandbyPasswordEnvCommand}, - {"cat", standbyPgpassPath}, - }, runner.commands) -} - func TestConsistentStandbyPasswordInconsistent(t *testing.T) { t.Parallel() @@ -238,11 +212,8 @@ func TestComponentPostgreSQLStandbyPasswordRepairTransformer(t *testing.T) { transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) runner := &fakePodExecRunner{ pgpass: map[string]string{ - leaderPodName: "localhost:5432:*:postgres:secret", replicaPod: "localhost:5432:*:standby:secret", - }, - standbyEnv: map[string]string{ - leaderPodName: "secret", + leaderPodName: "localhost:5432:*:standby:leader-secret", }, ensureResult: "f", } @@ -258,6 +229,158 @@ func TestComponentPostgreSQLStandbyPasswordRepairTransformer(t *testing.T) { require.Equal(t, metav1.ConditionTrue, cond.Status) } +func TestComponentPostgreSQLStandbyPasswordRepairTransformerRunsWhileNotRunning(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + phase appsv1alpha1.ClusterComponentPhase + }{ + { + name: "updating", + phase: appsv1alpha1.UpdatingClusterCompPhase, + }, + { + name: "abnormal", + phase: appsv1alpha1.AbnormalClusterCompPhase, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + const ( + leaderPodName = "test-postgresql-1" + replicaPod = "test-postgresql-0" + ) + + transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) + transCtx.Component.Status.Phase = tc.phase + runner := &fakePodExecRunner{ + pgpass: map[string]string{ + replicaPod: "localhost:5432:*:standby:secret", + leaderPodName: "localhost:5432:*:postgres:ignored", + }, + ensureResult: "f", + } + + err := (&componentPostgreSQLStandbyPasswordRepairTransformer{ + execRunner: runner, + }).Transform(transCtx, graph.NewDAG()) + + require.NoError(t, err) + require.Equal(t, leaderPodName, runner.ensurePod) + require.Equal(t, "secret", runner.stdin) + }) + } +} + +func TestComponentPostgreSQLStandbyPasswordRepairTransformerUnavailableDoesNotRequeue(t *testing.T) { + t.Parallel() + + const ( + leaderPodName = "test-postgresql-1" + replicaPod = "test-postgresql-0" + ) + + testCases := []struct { + name string + runner *fakePodExecRunner + }{ + { + name: "missing pgpass", + runner: &fakePodExecRunner{ + missingPgpass: map[string]bool{ + replicaPod: true, + }, + }, + }, + { + name: "standby entry missing", + runner: &fakePodExecRunner{ + pgpass: map[string]string{ + replicaPod: "localhost:5432:*:postgres:secret", + }, + }, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) + err := (&componentPostgreSQLStandbyPasswordRepairTransformer{ + execRunner: tc.runner, + }).Transform(transCtx, graph.NewDAG()) + + require.NoError(t, err) + require.Empty(t, tc.runner.ensurePod) + cond := meta.FindStatusCondition(transCtx.Component.Status.Conditions, standbyPasswordRepairConditionType) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionFalse, cond.Status) + require.Equal(t, standbyPasswordRepairReasonUnavailable, cond.Reason) + }) + } +} + +func TestComponentPostgreSQLStandbyPasswordRepairTransformerSkipsSingleReplica(t *testing.T) { + t.Parallel() + + const leaderPodName = "test-postgresql-0" + + transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName) + runner := &fakePodExecRunner{ + pgpass: map[string]string{ + leaderPodName: "localhost:5432:*:standby:secret", + }, + } + + err := (&componentPostgreSQLStandbyPasswordRepairTransformer{ + execRunner: runner, + }).Transform(transCtx, graph.NewDAG()) + + require.NoError(t, err) + require.Empty(t, runner.ensurePod) + cond := meta.FindStatusCondition(transCtx.Component.Status.Conditions, standbyPasswordRepairConditionType) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionTrue, cond.Status) + require.Equal(t, standbyPasswordRepairReasonSkipped, cond.Reason) +} + +func TestComponentPostgreSQLStandbyPasswordRepairTransformerPgpassExecFailureRequeues(t *testing.T) { + t.Parallel() + + const ( + leaderPodName = "test-postgresql-1" + replicaPod = "test-postgresql-0" + ) + + transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) + runner := &fakePodExecRunner{ + pgpassError: map[string]error{ + replicaPod: fmt.Errorf("exec failed"), + }, + pgpassStderr: map[string]string{ + replicaPod: "container not found", + }, + } + + err := (&componentPostgreSQLStandbyPasswordRepairTransformer{ + execRunner: runner, + }).Transform(transCtx, graph.NewDAG()) + + require.Error(t, err) + require.True(t, intctrlutil.IsDelayedRequeueError(err)) + cond := meta.FindStatusCondition(transCtx.Component.Status.Conditions, standbyPasswordRepairConditionType) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionFalse, cond.Status) + require.Equal(t, standbyPasswordRepairReasonFailed, cond.Reason) +} + func TestComponentPostgreSQLStandbyPasswordRepairConditionUsesStatusVertex(t *testing.T) { t.Parallel() @@ -269,11 +392,8 @@ func TestComponentPostgreSQLStandbyPasswordRepairConditionUsesStatusVertex(t *te transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) runner := &fakePodExecRunner{ pgpass: map[string]string{ - leaderPodName: "localhost:5432:*:postgres:secret", replicaPod: "localhost:5432:*:standby:secret", - }, - standbyEnv: map[string]string{ - leaderPodName: "secret", + leaderPodName: "localhost:5432:*:postgres:ignored", }, ensureResult: "f", } @@ -334,14 +454,16 @@ func TestComponentPostgreSQLStandbyPasswordRepairTransformerInconsistent(t *test const ( leaderPodName = "test-postgresql-1" - replicaPod = "test-postgresql-0" + replicaPodA = "test-postgresql-0" + replicaPodB = "test-postgresql-2" ) - transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPod) + transCtx := newPostgreSQLStandbyPasswordRepairTestContext(t, leaderPodName, replicaPodA, replicaPodB) runner := &fakePodExecRunner{ pgpass: map[string]string{ - leaderPodName: "localhost:5432:*:standby:secret-a", - replicaPod: "localhost:5432:*:standby:secret-b", + leaderPodName: "localhost:5432:*:standby:leader-secret", + replicaPodA: "localhost:5432:*:standby:secret-a", + replicaPodB: "localhost:5432:*:standby:secret-b", }, } @@ -403,7 +525,7 @@ func newPostgreSQLStandbyPasswordRepairTestContext( WithScheme(rscheme). WithObjects(objects...). Build() - replicas := int32(len(objects) - 3) + replicas := int32(1 + len(replicaPodNames)) membersStatus := []workloads.MemberStatus{{ PodName: leaderPodName, ReplicaRole: &workloads.ReplicaRole{ @@ -484,14 +606,16 @@ func postgresqlTestPod(namespace, name string, labels map[string]string) *corev1 } type fakePodExecRunner struct { - pgpass map[string]string - pgMode map[string]string - standbyEnv map[string]string - ensureResult string - ensurePod string - stdin string - lastCommand []string - commands [][]string + pgpass map[string]string + pgpassError map[string]error + pgpassStderr map[string]string + pgMode map[string]string + missingPgpass map[string]bool + ensureResult string + ensurePod string + stdin string + lastCommand []string + commands [][]string } func (r *fakePodExecRunner) Exec( @@ -503,12 +627,14 @@ func (r *fakePodExecRunner) Exec( r.commands = append(r.commands, append([]string{}, command...)) r.lastCommand = append([]string{}, command...) if len(command) == 2 && command[0] == "cat" && command[1] == standbyPgpassPath { + if err := r.pgpassError[pod.Name]; err != nil { + return "", r.pgpassStderr[pod.Name], err + } + if r.missingPgpass[pod.Name] { + return "", "cat: /run/postgresql/pgpass: No such file or directory", fmt.Errorf("exit status 1") + } return r.pgpass[pod.Name], "", nil } - if len(command) == 3 && command[0] == "sh" && command[1] == "-c" && - command[2] == readStandbyPasswordEnvCommand { - return r.standbyEnv[pod.Name], "", nil - } if len(command) == 3 && command[0] == "sh" && command[1] == "-c" && command[2] == readPostgreSQLModeEnvCommand { return r.pgMode[pod.Name], "", nil