diff --git a/fluss-client/src/main/java/org/apache/fluss/client/admin/Admin.java b/fluss-client/src/main/java/org/apache/fluss/client/admin/Admin.java index b5ae7b06b9..9764102ccc 100644 --- a/fluss-client/src/main/java/org/apache/fluss/client/admin/Admin.java +++ b/fluss-client/src/main/java/org/apache/fluss/client/admin/Admin.java @@ -23,6 +23,7 @@ import org.apache.fluss.client.metadata.LakeSnapshot; import org.apache.fluss.cluster.ServerNode; import org.apache.fluss.cluster.rebalance.GoalType; +import org.apache.fluss.cluster.rebalance.RebalanceProgress; import org.apache.fluss.cluster.rebalance.ServerTag; import org.apache.fluss.config.ConfigOptions; import org.apache.fluss.config.cluster.AlterConfig; @@ -66,6 +67,8 @@ import org.apache.fluss.security.acl.AclBinding; import org.apache.fluss.security.acl.AclBindingFilter; +import javax.annotation.Nullable; + import java.util.Collection; import java.util.List; import java.util.concurrent.CompletableFuture; @@ -576,9 +579,12 @@ ListOffsetsResult listOffsets( *
  • {@link NoRebalanceInProgressException} If there are no rebalance tasks in progress. * * + * @param rebalanceId the rebalance id whose progress to list; if it is null, the latest + * rebalance task's progress is listed. If the rebalance id does not exist in the server, + * an empty rebalance result will be returned. * @return the rebalance process. */ - CompletableFuture listRebalanceProgress(); + CompletableFuture listRebalanceProgress(@Nullable String rebalanceId); /** * Cancel the rebalance task. * @@ -588,6 +594,9 @@ ListOffsetsResult listOffsets( * permissions. *
  • {@link NoRebalanceInProgressException} If there are no rebalance tasks in progress. * + * + * @param rebalanceId the rebalance id to cancel, if it is null means cancel the latest + * rebalance task. If rebalanceId is not exists in server, nothing will be done. */ - CompletableFuture cancelRebalance(); + CompletableFuture cancelRebalance(@Nullable String rebalanceId); } diff --git a/fluss-client/src/main/java/org/apache/fluss/client/admin/FlussAdmin.java b/fluss-client/src/main/java/org/apache/fluss/client/admin/FlussAdmin.java index ac34caed87..79f3905700 100644 --- a/fluss-client/src/main/java/org/apache/fluss/client/admin/FlussAdmin.java +++ b/fluss-client/src/main/java/org/apache/fluss/client/admin/FlussAdmin.java @@ -25,6 +25,7 @@ import org.apache.fluss.cluster.Cluster; import org.apache.fluss.cluster.ServerNode; import org.apache.fluss.cluster.rebalance.GoalType; +import org.apache.fluss.cluster.rebalance.RebalanceProgress; import org.apache.fluss.cluster.rebalance.ServerTag; import org.apache.fluss.config.cluster.AlterConfig; import org.apache.fluss.config.cluster.ConfigEntry; @@ -49,6 +50,7 @@ import org.apache.fluss.rpc.messages.AddServerTagRequest; import org.apache.fluss.rpc.messages.AlterClusterConfigsRequest; import org.apache.fluss.rpc.messages.AlterTableRequest; +import org.apache.fluss.rpc.messages.CancelRebalanceRequest; import org.apache.fluss.rpc.messages.CreateAclsRequest; import org.apache.fluss.rpc.messages.CreateDatabaseRequest; import org.apache.fluss.rpc.messages.CreateTableRequest; @@ -69,12 +71,14 @@ import org.apache.fluss.rpc.messages.ListDatabasesResponse; import org.apache.fluss.rpc.messages.ListOffsetsRequest; import org.apache.fluss.rpc.messages.ListPartitionInfosRequest; +import org.apache.fluss.rpc.messages.ListRebalanceProgressRequest; import org.apache.fluss.rpc.messages.ListTablesRequest; import org.apache.fluss.rpc.messages.ListTablesResponse; import org.apache.fluss.rpc.messages.PbAlterConfig; import org.apache.fluss.rpc.messages.PbListOffsetsRespForBucket; import org.apache.fluss.rpc.messages.PbPartitionSpec; import org.apache.fluss.rpc.messages.PbTablePath; +import org.apache.fluss.rpc.messages.RebalanceRequest; import org.apache.fluss.rpc.messages.RemoveServerTagRequest; import org.apache.fluss.rpc.messages.TableExistsRequest; import org.apache.fluss.rpc.messages.TableExistsResponse; @@ -553,17 +557,33 @@ public CompletableFuture removeServerTag( @Override public CompletableFuture rebalance( List priorityGoals, boolean dryRun) { - throw new UnsupportedOperationException("Support soon"); + RebalanceRequest request = new RebalanceRequest().setDryRun(dryRun); + priorityGoals.forEach(goal -> request.addGoal(goal.value)); + return gateway.rebalance(request).thenApply(ClientRpcMessageUtils::toRebalancePlan); } @Override - public CompletableFuture listRebalanceProgress() { - throw new UnsupportedOperationException("Support soon"); + public CompletableFuture listRebalanceProgress( + @Nullable String rebalanceId) { + ListRebalanceProgressRequest request = new ListRebalanceProgressRequest(); + + if (rebalanceId != null) { + request.setRebalanceId(rebalanceId); + } + + return gateway.listRebalanceProgress(request) + .thenApply(ClientRpcMessageUtils::toRebalanceProgress); } @Override - public CompletableFuture cancelRebalance() { - throw new UnsupportedOperationException("Support soon"); + public CompletableFuture cancelRebalance(@Nullable String rebalanceId) { + CancelRebalanceRequest request = new CancelRebalanceRequest(); + + if (rebalanceId != null) { 
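+ // an explicit id was supplied; without one the server cancels the latest rebalance task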
+ request.setRebalanceId(rebalanceId); + } + + return gateway.cancelRebalance(request).thenApply(r -> null); } @Override diff --git a/fluss-client/src/main/java/org/apache/fluss/client/admin/RebalancePlan.java b/fluss-client/src/main/java/org/apache/fluss/client/admin/RebalancePlan.java index 4e054a71b6..bf39330fa5 100644 --- a/fluss-client/src/main/java/org/apache/fluss/client/admin/RebalancePlan.java +++ b/fluss-client/src/main/java/org/apache/fluss/client/admin/RebalancePlan.java @@ -29,12 +29,19 @@ */ public class RebalancePlan { + private final String rebalanceId; private final Map planForBucketMap; - public RebalancePlan(Map planForBucketMap) { + public RebalancePlan( + String rebalanceId, Map planForBucketMap) { + this.rebalanceId = rebalanceId; this.planForBucketMap = planForBucketMap; } + public String getRebalanceId() { + return rebalanceId; + } + public Map getPlanForBucketMap() { return planForBucketMap; } diff --git a/fluss-client/src/main/java/org/apache/fluss/client/utils/ClientRpcMessageUtils.java b/fluss-client/src/main/java/org/apache/fluss/client/utils/ClientRpcMessageUtils.java index 3e349cca3a..da55d7a04b 100644 --- a/fluss-client/src/main/java/org/apache/fluss/client/utils/ClientRpcMessageUtils.java +++ b/fluss-client/src/main/java/org/apache/fluss/client/utils/ClientRpcMessageUtils.java @@ -18,6 +18,7 @@ package org.apache.fluss.client.utils; import org.apache.fluss.client.admin.OffsetSpec; +import org.apache.fluss.client.admin.RebalancePlan; import org.apache.fluss.client.lookup.LookupBatch; import org.apache.fluss.client.lookup.PrefixLookupBatch; import org.apache.fluss.client.metadata.KvSnapshotMetadata; @@ -25,6 +26,10 @@ import org.apache.fluss.client.metadata.LakeSnapshot; import org.apache.fluss.client.write.KvWriteBatch; import org.apache.fluss.client.write.ReadyWriteBatch; +import org.apache.fluss.cluster.rebalance.RebalancePlanForBucket; +import org.apache.fluss.cluster.rebalance.RebalanceProgress; +import org.apache.fluss.cluster.rebalance.RebalanceResultForBucket; +import org.apache.fluss.cluster.rebalance.RebalanceStatus; import org.apache.fluss.config.cluster.AlterConfigOpType; import org.apache.fluss.config.cluster.ColumnPositionType; import org.apache.fluss.config.cluster.ConfigEntry; @@ -46,6 +51,7 @@ import org.apache.fluss.rpc.messages.GetLatestLakeSnapshotResponse; import org.apache.fluss.rpc.messages.ListOffsetsRequest; import org.apache.fluss.rpc.messages.ListPartitionInfosResponse; +import org.apache.fluss.rpc.messages.ListRebalanceProgressResponse; import org.apache.fluss.rpc.messages.LookupRequest; import org.apache.fluss.rpc.messages.MetadataRequest; import org.apache.fluss.rpc.messages.PbAddColumn; @@ -61,11 +67,16 @@ import org.apache.fluss.rpc.messages.PbPrefixLookupReqForBucket; import org.apache.fluss.rpc.messages.PbProduceLogReqForBucket; import org.apache.fluss.rpc.messages.PbPutKvReqForBucket; +import org.apache.fluss.rpc.messages.PbRebalancePlanForBucket; +import org.apache.fluss.rpc.messages.PbRebalancePlanForTable; +import org.apache.fluss.rpc.messages.PbRebalanceProgressForBucket; +import org.apache.fluss.rpc.messages.PbRebalanceProgressForTable; import org.apache.fluss.rpc.messages.PbRemotePathAndLocalFile; import org.apache.fluss.rpc.messages.PbRenameColumn; import org.apache.fluss.rpc.messages.PrefixLookupRequest; import org.apache.fluss.rpc.messages.ProduceLogRequest; import org.apache.fluss.rpc.messages.PutKvRequest; +import org.apache.fluss.rpc.messages.RebalanceResponse; import 
org.apache.fluss.utils.json.DataTypeJsonSerde; import org.apache.fluss.utils.json.JsonSerdeUtils; @@ -80,6 +91,7 @@ import java.util.Set; import java.util.stream.Collectors; +import static org.apache.fluss.cluster.rebalance.RebalanceUtils.FINAL_STATUSES; import static org.apache.fluss.rpc.util.CommonRpcMessageUtils.toResolvedPartitionSpec; import static org.apache.fluss.utils.Preconditions.checkState; @@ -370,6 +382,68 @@ public static AlterTableRequest makeAlterTableRequest( return request; } + public static RebalancePlan toRebalancePlan(RebalanceResponse response) { + Map rebalancePlan = new HashMap<>(); + for (PbRebalancePlanForTable pbTable : response.getTablePlansList()) { + long tableId = pbTable.getTableId(); + for (PbRebalancePlanForBucket pbBucket : pbTable.getBucketsPlansList()) { + RebalancePlanForBucket planForBucket = toRebalancePlanForBucket(tableId, pbBucket); + rebalancePlan.put(planForBucket.getTableBucket(), planForBucket); + } + } + return new RebalancePlan(response.getRebalanceId(), rebalancePlan); + } + + public static RebalanceProgress toRebalanceProgress(ListRebalanceProgressResponse response) { + RebalanceStatus totalRebalanceStatus = RebalanceStatus.of(response.getRebalanceStatus()); + int totalTask = 0; + int finishedTask = 0; + Map rebalanceProgress = new HashMap<>(); + for (PbRebalanceProgressForTable pbTable : response.getTableProgressesList()) { + long tableId = pbTable.getTableId(); + for (PbRebalanceProgressForBucket pbBucket : pbTable.getBucketsProgressesList()) { + RebalanceStatus bucketStatus = RebalanceStatus.of(pbBucket.getRebalanceStatus()); + RebalancePlanForBucket planForBucket = + toRebalancePlanForBucket(tableId, pbBucket.getRebalancePlan()); + rebalanceProgress.put( + planForBucket.getTableBucket(), + new RebalanceResultForBucket(planForBucket, bucketStatus)); + if (FINAL_STATUSES.contains(bucketStatus)) { + finishedTask++; + } + totalTask++; + } + } + + double progress = -1d; + if (totalTask != 0) { + progress = (double) finishedTask / totalTask; + } + + return new RebalanceProgress( + response.hasRebalanceId() ? response.getRebalanceId() : null, + totalRebalanceStatus, + progress, + rebalanceProgress); + } + + private static RebalancePlanForBucket toRebalancePlanForBucket( + long tableId, PbRebalancePlanForBucket rebalancePlan) { + TableBucket tableBucket = + new TableBucket( + tableId, + rebalancePlan.hasPartitionId() ? rebalancePlan.getPartitionId() : null, + rebalancePlan.getBucketId()); + return new RebalancePlanForBucket( + tableBucket, + rebalancePlan.getOriginalLeader(), + rebalancePlan.getNewLeader(), + Arrays.stream(rebalancePlan.getOriginalReplicas()) + .boxed() + .collect(Collectors.toList()), + Arrays.stream(rebalancePlan.getNewReplicas()).boxed().collect(Collectors.toList())); + } + public static List toPartitionInfos(ListPartitionInfosResponse response) { return response.getPartitionsInfosList().stream() .map( diff --git a/fluss-client/src/test/java/org/apache/fluss/client/admin/RebalanceITCase.java b/fluss-client/src/test/java/org/apache/fluss/client/admin/RebalanceITCase.java new file mode 100644 index 0000000000..63be3650d8 --- /dev/null +++ b/fluss-client/src/test/java/org/apache/fluss/client/admin/RebalanceITCase.java @@ -0,0 +1,300 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.client.admin; + +import org.apache.fluss.client.Connection; +import org.apache.fluss.client.ConnectionFactory; +import org.apache.fluss.cluster.rebalance.GoalType; +import org.apache.fluss.cluster.rebalance.RebalancePlanForBucket; +import org.apache.fluss.cluster.rebalance.RebalanceProgress; +import org.apache.fluss.cluster.rebalance.RebalanceResultForBucket; +import org.apache.fluss.cluster.rebalance.RebalanceStatus; +import org.apache.fluss.cluster.rebalance.ServerTag; +import org.apache.fluss.config.ConfigOptions; +import org.apache.fluss.config.Configuration; +import org.apache.fluss.metadata.DatabaseDescriptor; +import org.apache.fluss.metadata.PartitionSpec; +import org.apache.fluss.metadata.TableBucket; +import org.apache.fluss.metadata.TableDescriptor; +import org.apache.fluss.metadata.TablePath; +import org.apache.fluss.server.replica.ReplicaManager; +import org.apache.fluss.server.testutils.FlussClusterExtension; +import org.apache.fluss.server.zk.data.RebalancePlan; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +import java.time.Duration; +import java.util.Arrays; +import java.util.Collections; +import java.util.Map; +import java.util.Optional; + +import static org.apache.fluss.record.TestData.DATA1_SCHEMA; +import static org.apache.fluss.testutils.common.CommonTestUtils.retry; +import static org.assertj.core.api.Assertions.assertThat; + +/** ITCase for rebalance. */ +public class RebalanceITCase { + @RegisterExtension + public static final FlussClusterExtension FLUSS_CLUSTER_EXTENSION = + FlussClusterExtension.builder() + .setNumOfTabletServers(4) + .setClusterConf(initConfig()) + .build(); + + private Connection conn; + private Admin admin; + + @BeforeEach + protected void setup() throws Exception { + conn = ConnectionFactory.createConnection(FLUSS_CLUSTER_EXTENSION.getClientConfig()); + admin = conn.getAdmin(); + } + + @AfterEach + protected void teardown() throws Exception { + FLUSS_CLUSTER_EXTENSION.getZooKeeperClient().deleteRebalancePlan(); + + if (admin != null) { + admin.close(); + admin = null; + } + + if (conn != null) { + conn.close(); + conn = null; + } + } + + @Test + void testRebalanceForLogTable() throws Exception { + String dbName = "db-balance"; + admin.createDatabase(dbName, DatabaseDescriptor.EMPTY, false).get(); + + TableDescriptor logDescriptor = + TableDescriptor.builder() + .schema(DATA1_SCHEMA) + .distributedBy(3) + .property( + ConfigOptions.TABLE_GENERATE_UNBALANCE_TABLE_ASSIGNMENT.key(), + "true") + .build(); + // create some none partitioned log table. + for (int i = 0; i < 6; i++) { + long tableId = + createTable( + new TablePath(dbName, "test-rebalance_table-" + i), + logDescriptor, + false); + FLUSS_CLUSTER_EXTENSION.waitUntilTableReady(tableId); + } + + // create some partitioned table with partition. 
+ TableDescriptor partitionedDescriptor = + TableDescriptor.builder() + .schema(DATA1_SCHEMA) + .distributedBy(3) + .property( + ConfigOptions.TABLE_GENERATE_UNBALANCE_TABLE_ASSIGNMENT.key(), + "true") + .partitionedBy("b") + .build(); + for (int i = 0; i < 3; i++) { + TablePath tablePath = new TablePath(dbName, "test-rebalance_partitioned_table-" + i); + long tableId = createTable(tablePath, partitionedDescriptor, false); + for (int j = 0; j < 2; j++) { + PartitionSpec partitionSpec = + new PartitionSpec(Collections.singletonMap("b", String.valueOf(j))); + admin.createPartition(tablePath, partitionSpec, false).get(); + long partitionId = + admin.listPartitionInfos(tablePath, partitionSpec) + .get() + .get(0) + .getPartitionId(); + FLUSS_CLUSTER_EXTENSION.waitUntilTablePartitionReady(tableId, partitionId); + } + } + + // verify before rebalance. As we use unbalance assignment, all replicas will be location on + // servers [0,1,2], all leader will be location on server 0. + for (int i = 0; i < 3; i++) { + ReplicaManager replicaManager = + FLUSS_CLUSTER_EXTENSION.getTabletServerById(i).getReplicaManager(); + assertThat(replicaManager.onlineReplicas().count()).isEqualTo(36); + if (i == 0) { + assertThat(replicaManager.leaderCount()).isEqualTo(36); + } else { + assertThat(replicaManager.leaderCount()).isEqualTo(0); + } + } + ReplicaManager replicaManager3 = + FLUSS_CLUSTER_EXTENSION.getTabletServerById(3).getReplicaManager(); + assertThat(replicaManager3.onlineReplicas().count()).isEqualTo(0); + assertThat(replicaManager3.leaderCount()).isEqualTo(0); + + // trigger rebalance with goal set[ReplicaDistributionGoal, LeaderReplicaDistributionGoal] + admin.rebalance( + Arrays.asList( + GoalType.REPLICA_DISTRIBUTION_GOAL, + GoalType.LEADER_DISTRIBUTION_GOAL), + false) + .get(); + + retry( + Duration.ofMinutes(2), + () -> { + // TODO use admin#listRebalanceProcess to verify rebalance is finished. + Optional rebalancePlan = + FLUSS_CLUSTER_EXTENSION.getZooKeeperClient().getRebalancePlan(); + assertThat(rebalancePlan).isPresent(); + assertThat(rebalancePlan.get().getRebalanceStatus()) + .isEqualTo(RebalanceStatus.COMPLETED); + for (int i = 0; i < 4; i++) { + ReplicaManager replicaManager = + FLUSS_CLUSTER_EXTENSION.getTabletServerById(i).getReplicaManager(); + // average will be 27 + assertThat(replicaManager.onlineReplicas().count()).isBetween(24L, 30L); + long leaderCount = replicaManager.leaderCount(); + // average will be 9 + assertThat(leaderCount).isBetween(7L, 11L); + } + }); + + // add server tag PERMANENT_OFFLINE for server 3, trigger all leader and replica removed + // from server 3. + admin.addServerTag(Collections.singletonList(3), ServerTag.PERMANENT_OFFLINE).get(); + admin.rebalance( + Arrays.asList( + GoalType.REPLICA_DISTRIBUTION_GOAL, + GoalType.LEADER_DISTRIBUTION_GOAL), + false) + .get(); + retry( + Duration.ofMinutes(2), + () -> { + // TODO use admin#listRebalanceProcess to verify rebalance is finished. 
+ Optional rebalancePlan = + FLUSS_CLUSTER_EXTENSION.getZooKeeperClient().getRebalancePlan(); + assertThat(rebalancePlan).isPresent(); + assertThat(rebalancePlan.get().getRebalanceStatus()) + .isEqualTo(RebalanceStatus.COMPLETED); + assertThat(replicaManager3.onlineReplicas().count()).isEqualTo(0); + assertThat(replicaManager3.leaderCount()).isEqualTo(0); + for (int i = 0; i < 3; i++) { + ReplicaManager replicaManager = + FLUSS_CLUSTER_EXTENSION.getTabletServerById(i).getReplicaManager(); + // average will be 36 + assertThat(replicaManager.onlineReplicas().count()).isBetween(34L, 38L); + long leaderCount = replicaManager.leaderCount(); + // average will be 12 + assertThat(leaderCount).isBetween(10L, 14L); + } + }); + } + + @Test + void testListRebalanceProcess() throws Exception { + RebalanceProgress rebalanceProgress = admin.listRebalanceProgress(null).get(); + assertThat(rebalanceProgress.progress()).isEqualTo(-1d); + assertThat(rebalanceProgress.status()).isEqualTo(RebalanceStatus.NO_TASK); + assertThat(rebalanceProgress.progressForBucketMap()).isEmpty(); + + String dbName = "db-rebalance-list"; + admin.createDatabase(dbName, DatabaseDescriptor.EMPTY, false).get(); + + TableDescriptor logDescriptor = + TableDescriptor.builder() + .schema(DATA1_SCHEMA) + .distributedBy(3) + .property( + ConfigOptions.TABLE_GENERATE_UNBALANCE_TABLE_ASSIGNMENT.key(), + "true") + .build(); + // create some none partitioned log table. + for (int i = 0; i < 6; i++) { + long tableId = + createTable( + new TablePath(dbName, "test-rebalance_table-" + i), + logDescriptor, + false); + FLUSS_CLUSTER_EXTENSION.waitUntilTableReady(tableId); + } + + // trigger rebalance with goal set[ReplicaDistributionGoal, LeaderReplicaDistributionGoal] + org.apache.fluss.client.admin.RebalancePlan rebalancePlan = + admin.rebalance( + Arrays.asList( + GoalType.REPLICA_DISTRIBUTION_GOAL, + GoalType.LEADER_DISTRIBUTION_GOAL), + false) + .get(); + retry( + Duration.ofMinutes(2), + () -> { + RebalanceProgress progress = + admin.listRebalanceProgress(rebalancePlan.getRebalanceId()).get(); + assertThat(progress.progress()).isEqualTo(1d); + assertThat(progress.status()).isEqualTo(RebalanceStatus.COMPLETED); + Map processForBuckets = + progress.progressForBucketMap(); + Map planForBuckets = + rebalancePlan.getPlanForBucketMap(); + assertThat(planForBuckets.size()).isEqualTo(processForBuckets.size()); + for (TableBucket tableBucket : planForBuckets.keySet()) { + RebalanceResultForBucket processForBucket = + processForBuckets.get(tableBucket); + assertThat(processForBucket.status()).isEqualTo(RebalanceStatus.COMPLETED); + assertThat(processForBucket.plan()) + .isEqualTo(planForBuckets.get(tableBucket)); + } + }); + + // cancel rebalance. + admin.cancelRebalance(rebalancePlan.getRebalanceId()).get(); + + RebalanceProgress progress = + admin.listRebalanceProgress(rebalancePlan.getRebalanceId()).get(); + assertThat(progress.progress()).isEqualTo(1d); + assertThat(progress.status()).isEqualTo(RebalanceStatus.CANCELED); + + // test list and cancel an un-existed rebalance id. 
+ progress = admin.listRebalanceProgress("unexisted-rebalance-id").get(); + assertThat(progress.progress()).isEqualTo(-1d); + assertThat(progress.status()).isEqualTo(RebalanceStatus.NO_TASK); + assertThat(progress.progressForBucketMap()).isEmpty(); + + admin.cancelRebalance("unexisted-rebalance-id").get(); + } + + private static Configuration initConfig() { + Configuration configuration = new Configuration(); + configuration.set(ConfigOptions.DEFAULT_REPLICATION_FACTOR, 3); + configuration.set(ConfigOptions.DEFAULT_BUCKET_NUMBER, 3); + return configuration; + } + + private long createTable( + TablePath tablePath, TableDescriptor tableDescriptor, boolean ignoreIfExists) + throws Exception { + admin.createTable(tablePath, tableDescriptor, ignoreIfExists).get(); + return admin.getTableInfo(tablePath).get().getTableId(); + } +} diff --git a/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/GoalType.java b/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/GoalType.java index c9392497c1..9efd819f4d 100644 --- a/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/GoalType.java +++ b/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/GoalType.java @@ -57,4 +57,8 @@ public static GoalType valueOf(int value) { "Value %s must be one of %s", value, Arrays.asList(GoalType.values()))); } } + + public static GoalType fromName(String name) { + return valueOf(name.toUpperCase()); + } } diff --git a/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalancePlanForBucket.java b/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalancePlanForBucket.java index df5bde12b8..ae7a5e9b8f 100644 --- a/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalancePlanForBucket.java +++ b/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalancePlanForBucket.java @@ -73,6 +73,10 @@ public List getNewReplicas() { return newReplicas; } + public boolean isLeaderAction() { + return originalLeader != newLeader; + } + @Override public String toString() { return "RebalancePlanForBucket{" diff --git a/fluss-client/src/main/java/org/apache/fluss/client/admin/RebalanceProgress.java b/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalanceProgress.java similarity index 86% rename from fluss-client/src/main/java/org/apache/fluss/client/admin/RebalanceProgress.java rename to fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalanceProgress.java index 43738ca6c4..29c45a7017 100644 --- a/fluss-client/src/main/java/org/apache/fluss/client/admin/RebalanceProgress.java +++ b/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalanceProgress.java @@ -15,12 +15,12 @@ * limitations under the License. */ -package org.apache.fluss.client.admin; +package org.apache.fluss.cluster.rebalance; -import org.apache.fluss.cluster.rebalance.RebalanceResultForBucket; -import org.apache.fluss.cluster.rebalance.RebalanceStatus; import org.apache.fluss.metadata.TableBucket; +import javax.annotation.Nullable; + import java.util.Map; import static org.apache.fluss.utils.Preconditions.checkNotNull; @@ -32,6 +32,9 @@ */ public class RebalanceProgress { + /** The rebalance id. */ + private final @Nullable String rebalanceId; + /** The rebalance status for the overall rebalance. 
*/ private final RebalanceStatus rebalanceStatus; @@ -42,15 +45,21 @@ public class RebalanceProgress { private final Map progressForBucketMap; public RebalanceProgress( + @Nullable String rebalanceId, RebalanceStatus rebalanceStatus, double progress, Map progressForBucketMap) { + this.rebalanceId = rebalanceId; // TODO: we may derive the overall progress and status from progressForBucketMap this.rebalanceStatus = checkNotNull(rebalanceStatus); this.progress = progress; this.progressForBucketMap = checkNotNull(progressForBucketMap); } + public @Nullable String rebalanceId() { + return rebalanceId; + } + public RebalanceStatus status() { return rebalanceStatus; } diff --git a/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalanceStatus.java b/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalanceStatus.java index cb7b6f92dd..3de4deeae1 100644 --- a/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalanceStatus.java +++ b/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalanceStatus.java @@ -26,10 +26,12 @@ */ @PublicEvolving public enum RebalanceStatus { + NO_TASK(0), NOT_STARTED(1), REBALANCING(2), FAILED(3), - COMPLETED(4); + COMPLETED(4), + CANCELED(5); private final int code; diff --git a/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalanceUtils.java b/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalanceUtils.java new file mode 100644 index 0000000000..8c20bd5f4e --- /dev/null +++ b/fluss-common/src/main/java/org/apache/fluss/cluster/rebalance/RebalanceUtils.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.cluster.rebalance; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import static org.apache.fluss.cluster.rebalance.RebalanceStatus.CANCELED; +import static org.apache.fluss.cluster.rebalance.RebalanceStatus.COMPLETED; +import static org.apache.fluss.cluster.rebalance.RebalanceStatus.FAILED; + +/** Rebalance utils. */ +public class RebalanceUtils { + public static final Set FINAL_STATUSES = + new HashSet<>(Arrays.asList(COMPLETED, CANCELED, FAILED)); +} diff --git a/fluss-common/src/main/java/org/apache/fluss/config/ConfigOptions.java b/fluss-common/src/main/java/org/apache/fluss/config/ConfigOptions.java index 9f4b603e98..3054a29394 100644 --- a/fluss-common/src/main/java/org/apache/fluss/config/ConfigOptions.java +++ b/fluss-common/src/main/java/org/apache/fluss/config/ConfigOptions.java @@ -1473,6 +1473,14 @@ public class ConfigOptions { + "This mode reduces storage and transmission costs but loses the ability to track previous values. 
" + "This option only affects primary key tables."); + public static final ConfigOption TABLE_GENERATE_UNBALANCE_TABLE_ASSIGNMENT = + key("table.generate-unbalance-table-assignment") + .booleanType() + .defaultValue(false) + .withDescription( + "This parameter is only used for test. If set to true, the assignment will always be [0,1,2] " + + "as replica factor set as 3 even if there are tabletServers more than 3."); + // ------------------------------------------------------------------------ // ConfigOptions for Kv // ------------------------------------------------------------------------ diff --git a/fluss-common/src/main/java/org/apache/fluss/config/TableConfig.java b/fluss-common/src/main/java/org/apache/fluss/config/TableConfig.java index 80d2ee8f78..7d19ef1725 100644 --- a/fluss-common/src/main/java/org/apache/fluss/config/TableConfig.java +++ b/fluss-common/src/main/java/org/apache/fluss/config/TableConfig.java @@ -140,4 +140,9 @@ public ArrowCompressionInfo getArrowCompressionInfo() { public AutoPartitionStrategy getAutoPartitionStrategy() { return AutoPartitionStrategy.from(config); } + + /** Whether to generate unbalance assignment fot this table. */ + public boolean generateUnbalanceAssignment() { + return config.get(ConfigOptions.TABLE_GENERATE_UNBALANCE_TABLE_ASSIGNMENT); + } } diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/CancelRebalanceProcedure.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/CancelRebalanceProcedure.java new file mode 100644 index 0000000000..9a64828ee6 --- /dev/null +++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/CancelRebalanceProcedure.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.flink.procedure; + +import org.apache.flink.table.annotation.ArgumentHint; +import org.apache.flink.table.annotation.DataTypeHint; +import org.apache.flink.table.annotation.ProcedureHint; +import org.apache.flink.table.procedure.ProcedureContext; + +import javax.annotation.Nullable; + +/** Procedure to cancel rebalance. 
*/ +public class CancelRebalanceProcedure extends ProcedureBase { + + @ProcedureHint( + argument = { + @ArgumentHint( + name = "rebalanceId", + type = @DataTypeHint("STRING"), + isOptional = true) + }) + public String[] call(ProcedureContext context, @Nullable String rebalanceId) throws Exception { + admin.cancelRebalance(rebalanceId).get(); + return new String[] {"success"}; + } +} diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/ListRebalanceProcessProcedure.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/ListRebalanceProcessProcedure.java new file mode 100644 index 0000000000..e4be98781a --- /dev/null +++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/ListRebalanceProcessProcedure.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.flink.procedure; + +import org.apache.fluss.cluster.rebalance.RebalanceProgress; +import org.apache.fluss.cluster.rebalance.RebalanceResultForBucket; +import org.apache.fluss.cluster.rebalance.RebalanceStatus; +import org.apache.fluss.metadata.TableBucket; + +import org.apache.flink.table.annotation.ArgumentHint; +import org.apache.flink.table.annotation.DataTypeHint; +import org.apache.flink.table.annotation.ProcedureHint; +import org.apache.flink.table.procedure.ProcedureContext; + +import javax.annotation.Nullable; + +import java.text.NumberFormat; +import java.util.Map; + +/** Procedure to list rebalance process. 
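+ * For example (catalog name is a placeholder): {@code CALL my_catalog.sys.list_rebalance_progress()} lists the latest rebalance task's progress, while + * {@code CALL my_catalog.sys.list_rebalance_progress('some-rebalance-id')} lists a specific task's progress.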
*/ +public class ListRebalanceProcessProcedure extends ProcedureBase { + + @ProcedureHint( + argument = { + @ArgumentHint( + name = "rebalanceId", + type = @DataTypeHint("STRING"), + isOptional = true) + }) + public String[] call(ProcedureContext context, @Nullable String rebalanceId) throws Exception { + RebalanceProgress progress = admin.listRebalanceProgress(rebalanceId).get(); + return progressToString(progress); + } + + private static String[] progressToString(RebalanceProgress progress) { + RebalanceStatus status = progress.status(); + double rebalanceProgress = progress.progress(); + Map bucketMap = progress.progressForBucketMap(); + + String[] result = new String[bucketMap.size() + 3]; + result[0] = "Reblance total status: " + status; + result[1] = "Rebalance progress: " + formatAsPercentage(rebalanceProgress); + result[2] = "Rebalance detail progress for bucket:"; + int i = 3; + for (RebalanceResultForBucket resultForBucket : bucketMap.values()) { + result[i++] = resultForBucket.toString(); + } + return result; + } + + public static String formatAsPercentage(double value) { + if (value < 0) { + return "NONE"; + } + NumberFormat pctFormat = NumberFormat.getPercentInstance(); + pctFormat.setMaximumFractionDigits(2); + return pctFormat.format(value); + } +} diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/ProcedureManager.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/ProcedureManager.java index 12d4cf3675..7b24d5ce63 100644 --- a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/ProcedureManager.java +++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/ProcedureManager.java @@ -73,7 +73,10 @@ private enum ProcedureEnum { SET_CLUSTER_CONFIG("sys.set_cluster_config", SetClusterConfigProcedure.class), GET_CLUSTER_CONFIG("sys.get_cluster_config", GetClusterConfigProcedure.class), ADD_SERVER_TAG("sys.add_server_tag", AddServerTagProcedure.class), - REMOVE_SERVER_TAG("sys.remove_server_tag", RemoveServerTagProcedure.class); + REMOVE_SERVER_TAG("sys.remove_server_tag", RemoveServerTagProcedure.class), + REBALANCE("sys.rebalance", RebalanceProcedure.class), + CANCEL_REBALANCE("sys.cancel_rebalance", CancelRebalanceProcedure.class), + LIST_REBALANCE_PROGRESS("sys.list_rebalance_progress", ListRebalanceProcessProcedure.class); private final String path; private final Class procedureClass; diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/RebalanceProcedure.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/RebalanceProcedure.java new file mode 100644 index 0000000000..319e1c79a6 --- /dev/null +++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/procedure/RebalanceProcedure.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.flink.procedure; + +import org.apache.fluss.client.admin.RebalancePlan; +import org.apache.fluss.cluster.rebalance.GoalType; +import org.apache.fluss.cluster.rebalance.RebalancePlanForBucket; +import org.apache.fluss.metadata.TableBucket; + +import org.apache.flink.table.annotation.ArgumentHint; +import org.apache.flink.table.annotation.DataTypeHint; +import org.apache.flink.table.annotation.ProcedureHint; +import org.apache.flink.table.procedure.ProcedureContext; + +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** Procedure to rebalance. */ +public class RebalanceProcedure extends ProcedureBase { + + /** + * As flink call don't support input a nested type like 'ARRAY'. So priorityGoals is defined as + * a String type, and different goals are split by ';'. + */ + @ProcedureHint( + argument = { + @ArgumentHint(name = "priorityGoals", type = @DataTypeHint("STRING")), + @ArgumentHint(name = "dryRun", type = @DataTypeHint("BOOLEAN"), isOptional = true) + }) + public String[] call(ProcedureContext context, String priorityGoals, @Nullable Boolean dryRun) + throws Exception { + List goalTypes = validateAndGetPriorityGoals(priorityGoals); + RebalancePlan planForBucketMap = admin.rebalance(goalTypes, dryRun != null && dryRun).get(); + return planForBucketMapToString(planForBucketMap.getPlanForBucketMap()); + } + + private static String[] planForBucketMapToString( + Map planForBucketMap) { + if (planForBucketMap == null || planForBucketMap.isEmpty()) { + return new String[] {}; + } + + return planForBucketMap.values().stream() + .map(RebalancePlanForBucket::toString) + .toArray(String[]::new); + } + + private static List validateAndGetPriorityGoals(String priorityGoals) { + if (priorityGoals == null || priorityGoals.trim().isEmpty()) { + throw new IllegalArgumentException( + "priority goals cannot be null or empty. You can specify one goal as 'REPLICA_DISTRIBUTION_GOAL' or " + + "specify multi goals as 'REPLICA_DISTRIBUTION_GOAL;LEADER_DISTRIBUTION_GOAL' (split by ';')"); + } + + priorityGoals = priorityGoals.trim(); + String[] splitGoals = priorityGoals.split(";"); + if (splitGoals.length == 0) { + throw new IllegalArgumentException( + "priority goals cannot be empty. 
You can specify one goal as 'REPLICA_DISTRIBUTION_GOAL' " + + "or specify multi goals as 'REPLICA_DISTRIBUTION_GOAL;LEADER_DISTRIBUTION_GOAL' (split by ';')"); + } + List goalTypes = new ArrayList<>(); + for (String goal : splitGoals) { + goalTypes.add(GoalType.valueOf(goal.toUpperCase())); + } + return goalTypes; + } +} diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/procedure/FlinkProcedureITCase.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/procedure/FlinkProcedureITCase.java index b764923050..39c69eecf3 100644 --- a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/procedure/FlinkProcedureITCase.java +++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/procedure/FlinkProcedureITCase.java @@ -17,12 +17,16 @@ package org.apache.fluss.flink.procedure; +import org.apache.fluss.client.Connection; +import org.apache.fluss.client.ConnectionFactory; +import org.apache.fluss.client.admin.Admin; import org.apache.fluss.cluster.rebalance.ServerTag; import org.apache.fluss.config.ConfigOptions; import org.apache.fluss.config.Configuration; import org.apache.fluss.config.MemorySize; import org.apache.fluss.exception.SecurityDisabledException; import org.apache.fluss.metadata.DataLakeFormat; +import org.apache.fluss.metadata.TablePath; import org.apache.fluss.server.testutils.FlussClusterExtension; import org.apache.fluss.server.zk.ZooKeeperClient; import org.apache.fluss.server.zk.data.ServerTags; @@ -32,6 +36,8 @@ import org.apache.flink.types.Row; import org.apache.flink.util.CloseableIterator; import org.apache.flink.util.CollectionUtil; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.RegisterExtension; @@ -48,6 +54,8 @@ import static org.apache.fluss.cluster.rebalance.ServerTag.PERMANENT_OFFLINE; import static org.apache.fluss.flink.source.testutils.FlinkRowAssertionsUtils.assertResultsIgnoreOrder; +import static org.apache.fluss.server.testutils.FlussClusterExtension.BUILTIN_DATABASE; +import static org.apache.fluss.testutils.common.CommonTestUtils.retry; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -57,16 +65,29 @@ public abstract class FlinkProcedureITCase { @RegisterExtension public static final FlussClusterExtension FLUSS_CLUSTER_EXTENSION = FlussClusterExtension.builder() - .setNumOfTabletServers(3) + .setNumOfTabletServers(4) .setCoordinatorServerListeners("FLUSS://localhost:0, CLIENT://localhost:0") .setTabletServerListeners("FLUSS://localhost:0, CLIENT://localhost:0") .setClusterConf(initConfig()) .build(); static final String CATALOG_NAME = "testcatalog"; + static final String DEFAULT_DB = "defaultdb"; + static Configuration clientConf; + static String bootstrapServers; + static Connection conn; + static Admin admin; TableEnvironment tEnv; + @BeforeAll + protected static void beforeAll() { + clientConf = FLUSS_CLUSTER_EXTENSION.getClientConfig(); + bootstrapServers = FLUSS_CLUSTER_EXTENSION.getBootstrapServers(); + conn = ConnectionFactory.createConnection(clientConf); + admin = conn.getAdmin(); + } + @BeforeEach void before() throws ExecutionException, InterruptedException { String bootstrapServers = @@ -90,6 +111,14 @@ void before() throws ExecutionException, InterruptedException { CATALOG_NAME, bootstrapServers); tEnv.executeSql(catalogDDL).await(); 
tEnv.executeSql("use catalog " + CATALOG_NAME); + tEnv.executeSql("create database " + DEFAULT_DB); + tEnv.useDatabase(DEFAULT_DB); + } + + @AfterEach + void after() { + tEnv.useDatabase(BUILTIN_DATABASE); + tEnv.executeSql(String.format("drop database %s cascade", DEFAULT_DB)); } @Test @@ -104,7 +133,10 @@ void testShowProcedures() throws Exception { "+I[sys.list_acl]", "+I[sys.set_cluster_config]", "+I[sys.add_server_tag]", - "+I[sys.remove_server_tag]"); + "+I[sys.remove_server_tag]", + "+I[sys.rebalance]", + "+I[sys.cancel_rebalance]", + "+I[sys.list_rebalance_progress]"); // make sure no more results is unread. assertResultsIgnoreOrder(showProceduresIterator, expectedShowProceduresResult, true); } @@ -460,6 +492,121 @@ void testAddAndRemoveServerTag(boolean upperCase) throws Exception { } } + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testRebalance(boolean upperCase) throws Exception { + // first create some unbalance assignment table. + for (int i = 0; i < 10; i++) { + String tableName = "reblance_test_tab_" + i; + tEnv.executeSql( + String.format( + "create table %s (a int, b varchar, c bigint, d int ) " + + "with ('connector' = 'fluss', 'table.generate-unbalance-table-assignment'='true')", + tableName)); + long tableId = + admin.getTableInfo(TablePath.of(DEFAULT_DB, tableName)).get().getTableId(); + FLUSS_CLUSTER_EXTENSION.waitUntilTableReady(tableId); + } + + String rebalance = + String.format( + upperCase + ? "Call %s.sys.rebalance('REPLICA_DISTRIBUTION_GOAL;LEADER_DISTRIBUTION_GOAL', true)" + : "Call %s.sys.rebalance('replica_distribution_goal;leader_distribution_goal', true)", + CATALOG_NAME); + try (CloseableIterator rows = tEnv.executeSql(rebalance).collect()) { + List actual = + CollectionUtil.iteratorToList(rows).stream() + .map(Row::toString) + .collect(Collectors.toList()); + assertThat(actual.size()).isGreaterThan(1); + } + + // test cancel rebalance. + try (CloseableIterator listProceduresIterator = + tEnv.executeSql(String.format("Call %s.sys.cancel_rebalance()", CATALOG_NAME)) + .collect()) { + assertCallResult(listProceduresIterator, new String[] {"+I[success]"}); + } + + // test cancel an un-existed rebalance. + try (CloseableIterator listProceduresIterator = + tEnv.executeSql( + String.format( + "Call %s.sys.cancel_rebalance('not-exist-id')", + CATALOG_NAME)) + .collect()) { + assertCallResult(listProceduresIterator, new String[] {"+I[success]"}); + } + + // delete rebalance plan to avoid conflict with other tests. + FLUSS_CLUSTER_EXTENSION.getZooKeeperClient().deleteRebalancePlan(); + } + + @Test + void testListRebalanceProgress() throws Exception { + try (CloseableIterator listProceduresIterator = + tEnv.executeSql( + String.format( + "Call %s.sys.list_rebalance_progress()", CATALOG_NAME)) + .collect()) { + assertCallResult( + listProceduresIterator, + new String[] { + "+I[Reblance total status: NO_TASK]", + "+I[Rebalance progress: NONE]", + "+I[Rebalance detail progress for bucket:]" + }); + } + + // first create some unbalance assignment table. 
+ for (int i = 0; i < 10; i++) { + String tableName = "reblance_test_tab_" + i; + tEnv.executeSql( + String.format( + "create table %s (a int, b varchar, c bigint, d int ) " + + "with ('connector' = 'fluss', 'table.generate-unbalance-table-assignment'='true')", + tableName)); + long tableId = + admin.getTableInfo(TablePath.of(DEFAULT_DB, tableName)).get().getTableId(); + FLUSS_CLUSTER_EXTENSION.waitUntilTableReady(tableId); + } + + String rebalance = + String.format( + "Call %s.sys.rebalance('REPLICA_DISTRIBUTION_GOAL;LEADER_DISTRIBUTION_GOAL', false)", + CATALOG_NAME); + List plan; + try (CloseableIterator rows = tEnv.executeSql(rebalance).collect()) { + plan = + CollectionUtil.iteratorToList(rows).stream() + .map(Row::toString) + .collect(Collectors.toList()); + assertThat(plan.size()).isGreaterThan(1); + } + + retry( + Duration.ofMinutes(2), + () -> { + try (CloseableIterator rows = + tEnv.executeSql( + String.format( + "Call %s.sys.list_rebalance_progress()", + CATALOG_NAME)) + .collect()) { + List listProgressResult = + CollectionUtil.iteratorToList(rows).stream() + .map(Row::toString) + .collect(Collectors.toList()); + assertThat(listProgressResult.size()).isEqualTo(plan.size() + 3); + assertThat(listProgressResult.get(0)) + .isEqualTo("+I[Reblance total status: COMPLETED]"); + assertThat(listProgressResult.get(1)) + .isEqualTo("+I[Rebalance progress: 100%]"); + } + }); + } + private static Configuration initConfig() { Configuration conf = new Configuration(); conf.setInt(ConfigOptions.DEFAULT_REPLICATION_FACTOR, 3); diff --git a/fluss-rpc/src/main/proto/FlussApi.proto b/fluss-rpc/src/main/proto/FlussApi.proto index 6698d98d58..f0b42fd725 100644 --- a/fluss-rpc/src/main/proto/FlussApi.proto +++ b/fluss-rpc/src/main/proto/FlussApi.proto @@ -598,17 +598,22 @@ message RebalanceRequest { } message RebalanceResponse { - repeated PbRebalancePlanForTable table_plan = 1; + required string rebalance_id = 1; + repeated PbRebalancePlanForTable table_plan = 2; } message ListRebalanceProgressRequest { + optional string rebalance_id = 1; } message ListRebalanceProgressResponse { - repeated PbRebalanceProgressForTable table_progress = 1; + required int32 rebalance_status = 1; + optional string rebalance_id = 2; + repeated PbRebalanceProgressForTable table_progress = 3; } message CancelRebalanceRequest { + optional string rebalance_id = 1; } message CancelRebalanceResponse { diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java index e811e01456..cd55d55b11 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java @@ -184,7 +184,7 @@ public Map allTables() { return tablePathById; } - public Set allBuckets() { + public Set getAllBuckets() { Set allBuckets = new HashSet<>(); for (Map.Entry>> tableAssign : tableAssignments.entrySet()) { diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java index 471c62f7dc..0ff794720a 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java @@ -21,6 +21,9 @@ import org.apache.fluss.cluster.Endpoint; import 
org.apache.fluss.cluster.ServerNode; import org.apache.fluss.cluster.ServerType; +import org.apache.fluss.cluster.rebalance.RebalancePlanForBucket; +import org.apache.fluss.cluster.rebalance.RebalanceProgress; +import org.apache.fluss.cluster.rebalance.RebalanceStatus; import org.apache.fluss.cluster.rebalance.ServerTag; import org.apache.fluss.config.ConfigOptions; import org.apache.fluss.config.Configuration; @@ -29,6 +32,7 @@ import org.apache.fluss.exception.IneligibleReplicaException; import org.apache.fluss.exception.InvalidCoordinatorException; import org.apache.fluss.exception.InvalidUpdateVersionException; +import org.apache.fluss.exception.RebalanceFailureException; import org.apache.fluss.exception.ServerNotExistException; import org.apache.fluss.exception.ServerTagAlreadyExistException; import org.apache.fluss.exception.ServerTagNotExistException; @@ -45,16 +49,20 @@ import org.apache.fluss.metadata.TablePath; import org.apache.fluss.rpc.messages.AddServerTagResponse; import org.apache.fluss.rpc.messages.AdjustIsrResponse; +import org.apache.fluss.rpc.messages.CancelRebalanceResponse; import org.apache.fluss.rpc.messages.CommitKvSnapshotResponse; import org.apache.fluss.rpc.messages.CommitLakeTableSnapshotResponse; import org.apache.fluss.rpc.messages.CommitRemoteLogManifestResponse; import org.apache.fluss.rpc.messages.ControlledShutdownResponse; +import org.apache.fluss.rpc.messages.ListRebalanceProgressResponse; import org.apache.fluss.rpc.messages.PbCommitLakeTableSnapshotRespForTable; +import org.apache.fluss.rpc.messages.RebalanceResponse; import org.apache.fluss.rpc.messages.RemoveServerTagResponse; import org.apache.fluss.rpc.protocol.ApiError; import org.apache.fluss.server.coordinator.event.AccessContextEvent; import org.apache.fluss.server.coordinator.event.AddServerTagEvent; import org.apache.fluss.server.coordinator.event.AdjustIsrReceivedEvent; +import org.apache.fluss.server.coordinator.event.CancelRebalanceEvent; import org.apache.fluss.server.coordinator.event.CommitKvSnapshotEvent; import org.apache.fluss.server.coordinator.event.CommitLakeTableSnapshotEvent; import org.apache.fluss.server.coordinator.event.CommitRemoteLogManifestEvent; @@ -69,14 +77,19 @@ import org.apache.fluss.server.coordinator.event.DropTableEvent; import org.apache.fluss.server.coordinator.event.EventProcessor; import org.apache.fluss.server.coordinator.event.FencedCoordinatorEvent; +import org.apache.fluss.server.coordinator.event.ListRebalanceProgressEvent; import org.apache.fluss.server.coordinator.event.NewTabletServerEvent; import org.apache.fluss.server.coordinator.event.NotifyKvSnapshotOffsetEvent; import org.apache.fluss.server.coordinator.event.NotifyLakeTableOffsetEvent; import org.apache.fluss.server.coordinator.event.NotifyLeaderAndIsrResponseReceivedEvent; +import org.apache.fluss.server.coordinator.event.RebalanceEvent; import org.apache.fluss.server.coordinator.event.RemoveServerTagEvent; import org.apache.fluss.server.coordinator.event.SchemaChangeEvent; import org.apache.fluss.server.coordinator.event.watcher.TableChangeWatcher; import org.apache.fluss.server.coordinator.event.watcher.TabletServerChangeWatcher; +import org.apache.fluss.server.coordinator.rebalance.RebalanceManager; +import org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElection.ControlledShutdownLeaderElection; +import org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElection.ReassignmentLeaderElection; import 
org.apache.fluss.server.coordinator.statemachine.ReplicaStateMachine; import org.apache.fluss.server.coordinator.statemachine.TableBucketStateMachine; import org.apache.fluss.server.entity.AdjustIsrResultForBucket; @@ -94,6 +107,7 @@ import org.apache.fluss.server.zk.data.BucketAssignment; import org.apache.fluss.server.zk.data.LeaderAndIsr; import org.apache.fluss.server.zk.data.PartitionAssignment; +import org.apache.fluss.server.zk.data.RebalancePlan; import org.apache.fluss.server.zk.data.RemoteLogManifestHandle; import org.apache.fluss.server.zk.data.ServerTags; import org.apache.fluss.server.zk.data.TableAssignment; @@ -117,6 +131,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.concurrent.CompletableFuture; @@ -125,12 +140,14 @@ import static org.apache.fluss.server.coordinator.statemachine.BucketState.OfflineBucket; import static org.apache.fluss.server.coordinator.statemachine.BucketState.OnlineBucket; -import static org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElectionStrategy.CONTROLLED_SHUTDOWN_ELECTION; +import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.NewReplica; import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.OfflineReplica; import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.OnlineReplica; import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.ReplicaDeletionStarted; import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.ReplicaDeletionSuccessful; import static org.apache.fluss.server.utils.ServerRpcMessageUtils.makeAdjustIsrResponse; +import static org.apache.fluss.server.utils.ServerRpcMessageUtils.makeListRebalanceProgressResponse; +import static org.apache.fluss.server.utils.ServerRpcMessageUtils.makeRebalanceRespose; import static org.apache.fluss.utils.concurrent.FutureUtils.completeFromCallable; /** An implementation for {@link EventProcessor}. 
*/ @@ -154,8 +171,9 @@ public class CoordinatorEventProcessor implements EventProcessor { private final TabletServerChangeWatcher tabletServerChangeWatcher; private final CoordinatorMetadataCache serverMetadataCache; private final CoordinatorRequestBatch coordinatorRequestBatch; - private final CoordinatorMetricGroup coordinatorMetricGroup; private final String internalListenerName; + private final CoordinatorMetricGroup coordinatorMetricGroup; + private final RebalanceManager rebalanceManager; private final CompletedSnapshotStoreManager completedSnapshotStoreManager; private final LakeTableHelper lakeTableHelper; @@ -218,6 +236,7 @@ public CoordinatorEventProcessor( this.lakeTableTieringManager = lakeTableTieringManager; this.coordinatorMetricGroup = coordinatorMetricGroup; this.internalListenerName = conf.getString(ConfigOptions.INTERNAL_LISTENER_NAME); + this.rebalanceManager = new RebalanceManager(this, zooKeeperClient); this.ioExecutor = ioExecutor; this.lakeTableHelper = new LakeTableHelper(zooKeeperClient, conf.getString(ConfigOptions.REMOTE_DATA_DIR)); @@ -227,6 +246,14 @@ public CoordinatorEventManager getCoordinatorEventManager() { return coordinatorEventManager; } + public RebalanceManager getRebalanceManager() { + return rebalanceManager; + } + + public CoordinatorContext getCoordinatorContext() { + return coordinatorContext; + } + public void startup() { coordinatorContext.setCoordinatorServerInfo(getCoordinatorServerInfo()); // start watchers first so that we won't miss node in zk; @@ -256,11 +283,15 @@ public void startup() { // start the event manager which will then process the event coordinatorEventManager.start(); + + // start rebalance manager. + rebalanceManager.startup(); } public void shutdown() { // close the event manager coordinatorEventManager.close(); + rebalanceManager.close(); onShutdown(); } @@ -579,6 +610,21 @@ public void process(CoordinatorEvent event) { completeFromCallable( removeServerTagEvent.getRespCallback(), () -> processRemoveServerTag(removeServerTagEvent)); + } else if (event instanceof RebalanceEvent) { + RebalanceEvent rebalanceEvent = (RebalanceEvent) event; + completeFromCallable( + rebalanceEvent.getRespCallback(), () -> processRebalance(rebalanceEvent)); + } else if (event instanceof CancelRebalanceEvent) { + CancelRebalanceEvent cancelRebalanceEvent = (CancelRebalanceEvent) event; + completeFromCallable( + cancelRebalanceEvent.getRespCallback(), + () -> processCancelRebalance(cancelRebalanceEvent)); + } else if (event instanceof ListRebalanceProgressEvent) { + ListRebalanceProgressEvent listRebalanceProgressEvent = + (ListRebalanceProgressEvent) event; + completeFromCallable( + listRebalanceProgressEvent.getRespCallback(), + () -> processListRebalanceProgress(listRebalanceProgressEvent)); } else if (event instanceof AccessContextEvent) { AccessContextEvent accessContextEvent = (AccessContextEvent) event; processAccessContext(accessContextEvent); @@ -1044,6 +1090,7 @@ private AddServerTagResponse processAddServerTag(AddServerTagEvent event) { // Then update coordinatorContext. serverIds.forEach(serverId -> coordinatorContext.putServerTag(serverId, serverTag)); + LOG.info("Server tag {} added for servers {}.", serverTag, serverIds); return addServerTagResponse; } @@ -1093,10 +1140,331 @@ private RemoveServerTagResponse processRemoveServerTag(RemoveServerTagEvent even // Then update coordinatorContext. 
serverIds.forEach(coordinatorContext::removeServerTag); + LOG.info("Server tag {} removed for servers {}.", serverTag, serverIds); return removeServerTagResponse; } + private RebalanceResponse processRebalance(RebalanceEvent rebalanceEvent) { + boolean isDryRun = rebalanceEvent.isDryRun(); + RebalancePlan rebalancePlan; + try { + rebalancePlan = + rebalanceManager.generateRebalancePlan(rebalanceEvent.getGoalsByPriority()); + } catch (Exception e) { + throw new RebalanceFailureException("Failed to generate rebalance plan.", e); + } + + if (!isDryRun) { + if (rebalanceManager.hasOngoingRebalance()) { + throw new RebalanceFailureException( + "Rebalance task already exists. Please wait for it to finish or cancel it first."); + } + + // 2. execute rebalance plan. + Map executePlan = rebalancePlan.getExecutePlan(); + rebalanceManager.registerRebalance(rebalancePlan.getRebalanceId(), executePlan); + } + + return makeRebalanceRespose(rebalancePlan); + } + + private CancelRebalanceResponse processCancelRebalance( + CancelRebalanceEvent cancelRebalanceEvent) { + CancelRebalanceResponse response = new CancelRebalanceResponse(); + rebalanceManager.cancelRebalance(cancelRebalanceEvent.getRabalanceId()); + return response; + } + + private ListRebalanceProgressResponse processListRebalanceProgress( + ListRebalanceProgressEvent event) { + RebalanceProgress rebalanceProgress = + rebalanceManager.listRebalanceProgress(event.getRabalanceId()); + return makeListRebalanceProgressResponse(rebalanceProgress); + } + + /** + * This method can be trigger by: + * + *
      + *
    • The rebalanceManager submits a new rebalance task. + *
    • The coordinatorServer restarts and wants to resume the unfinished rebalance task stored in + * ZooKeeper. + *
    + */ + public void tryToExecuteRebalanceTask(RebalancePlanForBucket planForBucket) { + Set allBuckets = coordinatorContext.getAllBuckets(); + TableBucket tableBucket = planForBucket.getTableBucket(); + if (!allBuckets.contains(tableBucket)) { + LOG.warn( + "Skipping rebalance task of tableBucket {} since it doesn't exist.", + tableBucket); + rebalanceManager.finishRebalanceTask(tableBucket, RebalanceStatus.FAILED); + return; + } + + if (coordinatorContext.isTableQueuedForDeletion(tableBucket.getTableId())) { + LOG.warn( + "Skipping rebalance task of tableBucket {} since the respective " + + "tables are being deleted.", + tableBucket); + rebalanceManager.finishRebalanceTask(tableBucket, RebalanceStatus.FAILED); + return; + } + + List newReplicas = planForBucket.getNewReplicas(); + ReplicaReassignment reassignment = + ReplicaReassignment.build( + coordinatorContext.getAssignment(tableBucket), newReplicas); + + if (planForBucket.isLeaderAction() && !reassignment.isBeingReassigned()) { + // buckets only need to change leader like leader replica rebalance. + LOG.info("trigger leader election for tableBucket {}.", tableBucket); + tableBucketStateMachine.handleStateChange( + Collections.singleton(tableBucket), + OnlineBucket, + new ReassignmentLeaderElection(newReplicas)); + rebalanceManager.finishRebalanceTask(tableBucket, RebalanceStatus.COMPLETED); + } else { + try { + LOG.info( + "Try to processing bucket reassignment for tableBucket {} with assignment: {}.", + tableBucket, + reassignment); + onBucketReassignment(tableBucket, reassignment); + } catch (Exception e) { + LOG.error("Error when processing bucket reassignment.", e); + rebalanceManager.finishRebalanceTask(tableBucket, RebalanceStatus.FAILED); + } + } + } + + /** try to finish rebalance tasks after receive notify leader and isr response. */ + private void tryToCompleteRebalanceTask(TableBucket tableBucket) { + RebalancePlanForBucket planForBucket = + rebalanceManager.getRebalancePlanForBucket(tableBucket); + if (planForBucket != null) { + ReplicaReassignment reassignment = + ReplicaReassignment.build( + coordinatorContext.getAssignment(tableBucket), + planForBucket.getNewReplicas()); + try { + if (planForBucket.isLeaderAction() && !reassignment.isBeingReassigned()) { + LeaderAndIsr leaderAndIsr = zooKeeperClient.getLeaderAndIsr(tableBucket).get(); + int currentLeader = leaderAndIsr.leader(); + if (currentLeader == planForBucket.getNewLeader()) { + // leader action finish. + rebalanceManager.finishRebalanceTask( + tableBucket, RebalanceStatus.COMPLETED); + } + } else { + boolean isReassignmentComplete = + isReassignmentComplete(tableBucket, reassignment); + if (isReassignmentComplete) { + LOG.info( + "Target replicas {} have all caught up with the leader for reassigning bucket {}", + reassignment.getTargetReplicas(), + tableBucket); + onBucketReassignment(tableBucket, reassignment); + } + } + } catch (Exception e) { + LOG.error( + "Failed to complete the reassignment for table bucket {}", tableBucket, e); + rebalanceManager.finishRebalanceTask(tableBucket, RebalanceStatus.FAILED); + } + } + } + + /** + * Reassigning replicas for a tableBucket goes through a few steps listed in the code. + * + *
      + *
    • RS = current assigned replica set + *
    • ORS = original assigned replica set + *
    • TRS = target replica set + *
    • AR = the replicas we are adding as part of this reassignment + *
    • RR = the replicas we are removing as part of this reassignment + *
    + * + *

    A reassignment may have up to two phases, each with its own steps: + * + *

    To complete the reassignment, we need to bring the new replicas into sync, so depending on + * the state of the ISR, we will execute one of the following steps. + * + *

    Phase A (when TRS != ISR): The reassignment is not yet complete + * + *

      + *
    • A1. Bump the bucket epoch for the bucket and send LeaderAndIsr updates to ORS + TRS. + *
    • A2. Start new replicas AR by moving replicas in AR to NewReplica state. + *
    • A3. Send the start replica request to the tabletServers in the reassigned replicas list + * that are not in the assigned replica list. + *
    + * + *

    Phase B (when TRS = ISR): The reassignment is complete + * + *

      + *
    • B1. Move all replicas in AR to OnlineReplica state. + *
    • B2. Send a LeaderAndIsr request with RS = TRS. This will prevent the leader from adding + * any replica in TRS - ORS back in the isr. If the current leader is not in TRS or isn't + * alive, we move the leader to a new replica in TRS. We may send the LeaderAndIsr to more + * than the TRS replicas due to the way the bucket state machine works (it reads + * replicas from ZK) + *
    • B3. Move all replicas in RR to OfflineReplica state. As part of OfflineReplica state + * change, we shrink the isr to remove RR in ZooKeeper and send a LeaderAndIsr ONLY to the + * Leader to notify it of the shrunk isr. After that, we send a StopReplica (delete = + * false) to the replicas in RR. + *
    • B4. Move all replicas in RR to NonExistentReplica state. This will send a StopReplica + * (delete = true) to the replicas in RR to physically delete the replicas on disk. + *
    • B5. Set RS = TRS, AR = [], RR = [] in memory. + *
    • B6. Update ZK with RS=TRS, AR=[], RR=[]. + *
    • B7. After electing the leader, the replica and isr information changes, so resend the + * update metadata request to every tabletServer. + *
    • B8. Mark the ongoing rebalance task as finished. + *
    + * + *

    In general, there are two goals we want to aim for: + * + *

      + *
    • 1. Every replica present in the replica set of a LeaderAndIsrRequest gets the request + * sent to it + *
    • 2. Replicas that are removed from a bucket's assignment get StopReplica sent to them + *
    + * + *

    For example, if ORS = {1,2,3} and TRS = {4,5,6}, the values in the table and leader/isr + * paths in ZK may go through the following transitions. + * + * + * + * + * + * + * + * + * + *
    RS AR RR leader isr step
    {1,2,3} {} {} 1 {1,2,3} (initial state)
    {4,5,6,1,2,3} {4,5,6} {1,2,3} 1 {1,2,3} (step A2)
    {4,5,6,1,2,3} {4,5,6} {1,2,3} 1 {1,2,3,4,5,6} (phase B)
    {4,5,6,1,2,3} {4,5,6} {1,2,3} 4 {1,2,3,4,5,6} (step B3)
    {4,5,6,1,2,3} {4,5,6} {1,2,3} 4 {4,5,6} (step B4)
    {4,5,6} {} {} 4 {4,5,6} (step B6)
    + * + *

    Note that we have to update RS in ZK with TRS last since it's the only place where we + * store ORS persistently. This way, if the coordinatorServer crashes before that step, we can + * still recover. + */ + private void onBucketReassignment(TableBucket tableBucket, ReplicaReassignment reassignment) + throws Exception { + List addingReplicas = reassignment.addingReplicas; + List removingReplicas = reassignment.removingReplicas; + + if (!isReassignmentComplete(tableBucket, reassignment)) { + // A1. Send LeaderAndIsr request to every replica in ORS + TRS (with the new RS, AR and + // RR). + updateBucketEpochAndSendRequest(tableBucket, reassignment); + + // A2. Set RS = TRS, AR = [], RR = [] in memory. + coordinatorContext.updateBucketReplicaAssignment(tableBucket, reassignment.replicas); + updateReplicaAssignmentForBucket(tableBucket, reassignment.replicas); + + // A3. replicas in AR -> NewReplica + // send the start replica request to the tabletSevers in the reassigned replicas list + // that are not in the assigned + addingReplicas.forEach( + replica -> + replicaStateMachine.handleStateChanges( + Collections.singleton( + new TableBucketReplica(tableBucket, replica)), + NewReplica)); + } else { + // B1. replicas in AR -> OnlineReplica + addingReplicas.forEach( + replica -> + replicaStateMachine.handleStateChanges( + Collections.singleton( + new TableBucketReplica(tableBucket, replica)), + OnlineReplica)); + List targetReplicas = reassignment.getTargetReplicas(); + // B2. Send LeaderAndIsr request with a potential new leader (if current leader not in + // TRS) and a new RS (using TRS) and same isr to every tabletServer in ORS + TRS or TRS + maybeReassignedBucketLeaderIfRequired(tableBucket, targetReplicas); + // B3. replicas in RR -> Offline (force those replicas out of isr) + // B4. replicas in RR -> NonExistentReplica (force those replicas to be deleted) + stopRemovedReplicasOfReassignedBucket(tableBucket, removingReplicas); + // B5. Set RS = TRS, AR = [], RR = [] in memory. + coordinatorContext.updateBucketReplicaAssignment( + tableBucket, reassignment.getTargetReplicas()); + // B6. Update ZK with RS = TRS, AR = [], RR = []. + updateReplicaAssignmentForBucket(tableBucket, targetReplicas); + // B7. After electing a leader in B3, the replicas and isr information changes, so + // resend the update metadata request to every tabletServer. + updateTabletServerMetadataCache( + new HashSet<>(coordinatorContext.getLiveTabletServers().values()), + null, + null, + Collections.singleton(tableBucket)); + // B8. Mark the ongoing rebalance task to finish. + rebalanceManager.finishRebalanceTask(tableBucket, RebalanceStatus.COMPLETED); + } + } + + private boolean isReassignmentComplete( + TableBucket tableBucket, ReplicaReassignment reassignment) throws Exception { + LeaderAndIsr leaderAndIsr = zooKeeperClient.getLeaderAndIsr(tableBucket).get(); + List isr = leaderAndIsr.isr(); + List targetReplicas = reassignment.getTargetReplicas(); + return targetReplicas.isEmpty() || new HashSet<>(isr).containsAll(targetReplicas); + } + + private void maybeReassignedBucketLeaderIfRequired( + TableBucket tableBucket, List targetReplicas) { + LeaderAndIsr leaderAndIsr = coordinatorContext.getBucketLeaderAndIsr(tableBucket).get(); + int currentLeader = leaderAndIsr.leader(); + if (currentLeader != targetReplicas.get(0)) { + LOG.info( + "Leader {} for tableBucket {} being reassigned. 
Re-electing leader to {}", + currentLeader, + tableBucket, + targetReplicas.get(0)); + tableBucketStateMachine.handleStateChange( + Collections.singleton(tableBucket), + OnlineBucket, + new ReassignmentLeaderElection(targetReplicas)); + } + } + + private void stopRemovedReplicasOfReassignedBucket( + TableBucket tableBucket, List removingReplicas) { + Set replicasToBeDeleted = new HashSet<>(); + removingReplicas.forEach( + replica -> replicasToBeDeleted.add(new TableBucketReplica(tableBucket, replica))); + replicaStateMachine.handleStateChanges(replicasToBeDeleted, OfflineReplica); + // send stop replica command to the old replicas. + replicaStateMachine.handleStateChanges(replicasToBeDeleted, ReplicaDeletionStarted); + } + + private void updateReplicaAssignmentForBucket( + TableBucket tableBucket, List targetReplicas) throws Exception { + long tableId = tableBucket.getTableId(); + @Nullable Long partitionId = tableBucket.getPartitionId(); + if (partitionId == null) { + Map> tableAssignment = + coordinatorContext.getTableAssignment(tableId); + tableAssignment.put(tableBucket.getBucket(), targetReplicas); + Map newTableAssignment = new HashMap<>(); + tableAssignment.forEach( + (bucket, replicas) -> + newTableAssignment.put(bucket, new BucketAssignment(replicas))); + zooKeeperClient.updateTableAssignment(tableId, new TableAssignment(newTableAssignment)); + } else { + Map> partitionAssignment = + coordinatorContext.getPartitionAssignment( + new TablePartition(tableId, partitionId)); + partitionAssignment.put(tableBucket.getBucket(), targetReplicas); + Map newPartitionAssignment = new HashMap<>(); + partitionAssignment.forEach( + (bucket, replicas) -> + newPartitionAssignment.put(bucket, new BucketAssignment(replicas))); + zooKeeperClient.updatePartitionAssignment( + partitionId, new PartitionAssignment(tableId, newPartitionAssignment)); + } + } + private List tryProcessAdjustIsr( Map leaderAndIsrList) { // TODO verify leader epoch. @@ -1164,6 +1532,9 @@ private List tryProcessAdjustIsr( // update coordinator leader and isr cache. newLeaderAndIsrList.forEach(coordinatorContext::putBucketLeaderAndIsr); + // First, try to judge whether the bucket is in rebalance task when isr change. + newLeaderAndIsrList.keySet().forEach(this::tryToCompleteRebalanceTask); + // TODO update metadata for all alive tablet servers. return result; @@ -1217,6 +1588,13 @@ private void validateLeaderAndIsr(TableBucket tableBucket, LeaderAndIsr newLeade throw new IneligibleReplicaException(errorMsg); } } + + List isr = newLeaderAndIsr.isr(); + Set assignment = new HashSet<>(coordinatorContext.getAssignment(tableBucket)); + if (!assignment.containsAll(isr)) { + throw new FencedLeaderEpochException( + "The request isr in adjust isr request is not in assignment."); + } } } @@ -1526,7 +1904,7 @@ private ControlledShutdownResponse tryProcessControlledShutdown( } tableBucketStateMachine.handleStateChange( - bucketsLedByServer, OnlineBucket, CONTROLLED_SHUTDOWN_ELECTION); + bucketsLedByServer, OnlineBucket, new ControlledShutdownLeaderElection()); // TODO need send stop request to the leader? 
@@ -1637,8 +2015,114 @@ private void updateTabletServerMetadataCache( coordinatorRequestBatch.sendUpdateMetadataRequest(); } + private void updateBucketEpochAndSendRequest( + TableBucket tableBucket, ReplicaReassignment reassignment) throws Exception { + Optional leaderAndIsrOpt = zooKeeperClient.getLeaderAndIsr(tableBucket); + if (!leaderAndIsrOpt.isPresent()) { + return; + } + LeaderAndIsr leaderAndIsr = leaderAndIsrOpt.get(); + + String partitionName = null; + if (tableBucket.getPartitionId() != null) { + partitionName = coordinatorContext.getPartitionName(tableBucket.getPartitionId()); + if (partitionName == null) { + LOG.error("Can't find partition name for partition: {}.", tableBucket.getBucket()); + return; + } + } + + List newReplicas = reassignment.replicas; + // pass the original isr not include the new replicas. + LeaderAndIsr newLeaderAndIsr = leaderAndIsr.newLeaderAndIsr(leaderAndIsr.isr()); + + coordinatorContext.putBucketLeaderAndIsr(tableBucket, newLeaderAndIsr); + zooKeeperClient.updateLeaderAndIsr(tableBucket, newLeaderAndIsr); + + coordinatorRequestBatch.newBatch(); + coordinatorRequestBatch.addNotifyLeaderRequestForTabletServers( + new HashSet<>(newReplicas), + PhysicalTablePath.of( + coordinatorContext.getTablePathById(tableBucket.getTableId()), + partitionName), + tableBucket, + newReplicas, + newLeaderAndIsr); + coordinatorRequestBatch.sendRequestToTabletServers( + coordinatorContext.getCoordinatorEpoch()); + } + @VisibleForTesting CompletedSnapshotStoreManager completedSnapshotStoreManager() { return completedSnapshotStoreManager; } + + private static final class ReplicaReassignment { + private final List replicas; + private final List addingReplicas; + private final List removingReplicas; + + private ReplicaReassignment( + List replicas, + List addingReplicas, + List removingReplicas) { + this.replicas = Collections.unmodifiableList(replicas); + this.addingReplicas = Collections.unmodifiableList(addingReplicas); + this.removingReplicas = Collections.unmodifiableList(removingReplicas); + } + + private static ReplicaReassignment build( + List originReplicas, List targetReplicas) { + // targetReplicas behind originReplicas in full set. 
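+ // e.g. originReplicas = [1,2,3], targetReplicas = [4,5,6] yields
+ // fullReplicaSet = [4,5,6,1,2,3], addingReplicas = [4,5,6], removingReplicas = [1,2,3],
+ // matching the ORS/TRS walkthrough in the onBucketReassignment javadoc above.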
+ List fullReplicaSet = new ArrayList<>(targetReplicas); + fullReplicaSet.addAll(originReplicas); + fullReplicaSet = fullReplicaSet.stream().distinct().collect(Collectors.toList()); + + List newAddingReplicas = new ArrayList<>(fullReplicaSet); + newAddingReplicas.removeAll(originReplicas); + + List newRemovingReplicas = new ArrayList<>(originReplicas); + newRemovingReplicas.removeAll(targetReplicas); + + return new ReplicaReassignment(fullReplicaSet, newAddingReplicas, newRemovingReplicas); + } + + private List getTargetReplicas() { + List computed = new ArrayList<>(replicas); + computed.removeAll(removingReplicas); + return Collections.unmodifiableList(computed); + } + + private boolean isBeingReassigned() { + return !addingReplicas.isEmpty() || !removingReplicas.isEmpty(); + } + + @Override + public String toString() { + return String.format( + "ReplicaAssignment(replicas=%s, addingReplicas=%s, removingReplicas=%s)", + replicas, addingReplicas, removingReplicas); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + ReplicaReassignment that = (ReplicaReassignment) o; + return Objects.equals(replicas, that.replicas) + && Objects.equals(addingReplicas, that.addingReplicas) + && Objects.equals(removingReplicas, that.removingReplicas); + } + + @Override + public int hashCode() { + return Objects.hash(replicas, addingReplicas, removingReplicas); + } + } } diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorServer.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorServer.java index cd546f4479..76a3fbd68c 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorServer.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorServer.java @@ -33,6 +33,7 @@ import org.apache.fluss.server.ServerBase; import org.apache.fluss.server.authorizer.Authorizer; import org.apache.fluss.server.authorizer.AuthorizerLoader; +import org.apache.fluss.server.coordinator.rebalance.RebalanceManager; import org.apache.fluss.server.metadata.CoordinatorMetadataCache; import org.apache.fluss.server.metadata.ServerMetadataCache; import org.apache.fluss.server.metrics.ServerMetricUtils; @@ -508,6 +509,11 @@ public DynamicConfigManager getDynamicConfigManager() { return dynamicConfigManager; } + @VisibleForTesting + public RebalanceManager getRebalanceManager() { + return coordinatorEventProcessor.getRebalanceManager(); + } + private static void validateConfigs(Configuration conf) { if (conf.get(ConfigOptions.DEFAULT_REPLICATION_FACTOR) < 1) { throw new IllegalConfigurationException( diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorService.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorService.java index 7fc53b96f8..2ef7d04af6 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorService.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorService.java @@ -20,6 +20,7 @@ import org.apache.fluss.annotation.VisibleForTesting; import org.apache.fluss.cluster.ServerType; import org.apache.fluss.cluster.TabletServerInfo; +import org.apache.fluss.cluster.rebalance.GoalType; import org.apache.fluss.cluster.rebalance.ServerTag; import org.apache.fluss.config.ConfigOptions; import org.apache.fluss.config.Configuration; @@ -113,12 +114,16 @@ import 
org.apache.fluss.server.coordinator.event.AccessContextEvent; import org.apache.fluss.server.coordinator.event.AddServerTagEvent; import org.apache.fluss.server.coordinator.event.AdjustIsrReceivedEvent; +import org.apache.fluss.server.coordinator.event.CancelRebalanceEvent; import org.apache.fluss.server.coordinator.event.CommitKvSnapshotEvent; import org.apache.fluss.server.coordinator.event.CommitLakeTableSnapshotEvent; import org.apache.fluss.server.coordinator.event.CommitRemoteLogManifestEvent; import org.apache.fluss.server.coordinator.event.ControlledShutdownEvent; import org.apache.fluss.server.coordinator.event.EventManager; +import org.apache.fluss.server.coordinator.event.ListRebalanceProgressEvent; +import org.apache.fluss.server.coordinator.event.RebalanceEvent; import org.apache.fluss.server.coordinator.event.RemoveServerTagEvent; +import org.apache.fluss.server.coordinator.rebalance.goal.Goal; import org.apache.fluss.server.entity.CommitKvSnapshotData; import org.apache.fluss.server.entity.LakeTieringTableInfo; import org.apache.fluss.server.entity.TablePropertyChanges; @@ -140,7 +145,9 @@ import javax.annotation.Nullable; import java.io.UncheckedIOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -153,6 +160,7 @@ import static org.apache.fluss.config.FlussConfigUtils.isTableStorageConfig; import static org.apache.fluss.rpc.util.CommonRpcMessageUtils.toAclBindingFilters; import static org.apache.fluss.rpc.util.CommonRpcMessageUtils.toAclBindings; +import static org.apache.fluss.server.coordinator.rebalance.goal.GoalUtils.getGoalByType; import static org.apache.fluss.server.utils.ServerRpcMessageUtils.fromTablePath; import static org.apache.fluss.server.utils.ServerRpcMessageUtils.getAdjustIsrData; import static org.apache.fluss.server.utils.ServerRpcMessageUtils.getCommitLakeTableSnapshotData; @@ -170,7 +178,6 @@ /** An RPC Gateway service for coordinator server. */ public final class CoordinatorService extends RpcServiceBase implements CoordinatorGateway { - private final int defaultBucketNumber; private final int defaultReplicationFactor; private final boolean logTableAllowCreation; @@ -309,12 +316,29 @@ public CompletableFuture createTable(CreateTableRequest req // first, generate the assignment TableAssignment tableAssignment = null; + Map properties = tableDescriptor.getProperties(); + boolean generateUnbalanceAssignment; + if (properties.containsKey(ConfigOptions.TABLE_GENERATE_UNBALANCE_TABLE_ASSIGNMENT.key())) { + generateUnbalanceAssignment = + Boolean.parseBoolean( + properties.get( + ConfigOptions.TABLE_GENERATE_UNBALANCE_TABLE_ASSIGNMENT.key())); + } else { + generateUnbalanceAssignment = false; + } // only when it's no partitioned table do we generate the assignment for it if (!tableDescriptor.isPartitioned()) { // the replication factor must be set now int replicaFactor = tableDescriptor.getReplicationFactor(); TabletServerInfo[] servers = metadataCache.getLiveServers(); - tableAssignment = generateAssignment(bucketCount, replicaFactor, servers); + if (generateUnbalanceAssignment) { + // this branch is only used for testing. 
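+ // generateUnBalanceAssignment (defined below) pins every bucket onto servers
+ // 0..replicationFactor-1, producing a deliberately skewed layout for rebalance tests.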
+ tableAssignment = + new TableAssignment( + generateUnBalanceAssignment(bucketCount, replicaFactor)); + } else { + tableAssignment = generateAssignment(bucketCount, replicaFactor, servers); + } } // before create table in fluss, we may create in lake @@ -528,9 +552,18 @@ public CompletableFuture createPartition( // second, generate the PartitionAssignment. int replicaFactor = table.getTableConfig().getReplicationFactor(); TabletServerInfo[] servers = metadataCache.getLiveServers(); - Map bucketAssignments = - generateAssignment(table.bucketCount, replicaFactor, servers) - .getBucketAssignments(); + Map bucketAssignments; + + boolean generateUnbalanceAssignment = table.getTableConfig().generateUnbalanceAssignment(); + if (generateUnbalanceAssignment) { + // This branch is only used for testing. + bucketAssignments = generateUnBalanceAssignment(table.bucketCount, replicaFactor); + } else { + bucketAssignments = + generateAssignment(table.bucketCount, replicaFactor, servers) + .getBucketAssignments(); + } + PartitionAssignment partitionAssignment = new PartitionAssignment(table.tableId, bucketAssignments); @@ -577,16 +610,15 @@ public CompletableFuture metadata(MetadataRequest request) { AccessContextEvent metadataResponseAccessContextEvent = new AccessContextEvent<>( - ctx -> { - return processMetadataRequest( - request, - listenerName, - session, - authorizer, - metadataCache, - new CoordinatorMetadataProvider( - zkClient, metadataManager, ctx)); - }); + ctx -> + processMetadataRequest( + request, + listenerName, + session, + authorizer, + metadataCache, + new CoordinatorMetadataProvider( + zkClient, metadataManager, ctx))); eventManagerSupplier.get().put(metadataResponseAccessContextEvent); return metadataResponseAccessContextEvent.getResultFuture(); } @@ -851,19 +883,40 @@ public CompletableFuture removeServerTag( @Override public CompletableFuture rebalance(RebalanceRequest request) { - throw new UnsupportedOperationException("Support soon!"); + List goalsByPriority = new ArrayList<>(); + Arrays.stream(request.getGoals()) + .forEach(goal -> goalsByPriority.add(getGoalByType(GoalType.valueOf(goal)))); + boolean isDryRun = request.isDryRun(); + + CompletableFuture response = new CompletableFuture<>(); + eventManagerSupplier.get().put(new RebalanceEvent(goalsByPriority, isDryRun, response)); + return response; } @Override public CompletableFuture listRebalanceProgress( ListRebalanceProgressRequest request) { - throw new UnsupportedOperationException("Support soon!"); + CompletableFuture response = new CompletableFuture<>(); + eventManagerSupplier + .get() + .put( + new ListRebalanceProgressEvent( + request.hasRebalanceId() ? request.getRebalanceId() : null, + response)); + return response; } @Override public CompletableFuture cancelRebalance( CancelRebalanceRequest request) { - throw new UnsupportedOperationException("Support soon!"); + CompletableFuture response = new CompletableFuture<>(); + eventManagerSupplier + .get() + .put( + new CancelRebalanceEvent( + request.hasRebalanceId() ? 
request.getRebalanceId() : null, + response)); + return response; } @VisibleForTesting @@ -909,6 +962,24 @@ private void validateTableCreationPermission( } } + private Map generateUnBalanceAssignment( + int nBuckets, int replicationFactor) { + Map assignments = new HashMap<>(); + for (int i = 0; i < nBuckets; i++) { + if (replicationFactor == 1) { + assignments.put(i, new BucketAssignment(Collections.singletonList(0))); + } else if (replicationFactor == 2) { + assignments.put(i, new BucketAssignment(Arrays.asList(0, 1))); + } else if (replicationFactor == 3) { + assignments.put(i, new BucketAssignment(Arrays.asList(0, 1, 2))); + } else { + throw new IllegalArgumentException( + "replicationFactor must be 1, 2 or 3 for unbalance assignment."); + } + } + return assignments; + } + static class DefaultLakeCatalogContext implements LakeCatalog.Context { private final boolean isCreatingFlussTable; diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/CancelRebalanceEvent.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/CancelRebalanceEvent.java new file mode 100644 index 0000000000..8261a25df7 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/CancelRebalanceEvent.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.event; + +import org.apache.fluss.rpc.messages.CancelRebalanceResponse; + +import javax.annotation.Nullable; + +import java.util.concurrent.CompletableFuture; + +/** The event for canceling rebalance. */ +public class CancelRebalanceEvent implements CoordinatorEvent { + + private final @Nullable String rebalanceId; + private final CompletableFuture respCallback; + + public CancelRebalanceEvent( + @Nullable String rebalanceId, CompletableFuture respCallback) { + this.respCallback = respCallback; + this.rebalanceId = rebalanceId; + } + + public @Nullable String getRabalanceId() { + return rebalanceId; + } + + public CompletableFuture getRespCallback() { + return respCallback; + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/ListRebalanceProgressEvent.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/ListRebalanceProgressEvent.java new file mode 100644 index 0000000000..c0d944bcd2 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/ListRebalanceProgressEvent.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.event; + +import org.apache.fluss.rpc.messages.ListRebalanceProgressResponse; + +import javax.annotation.Nullable; + +import java.util.concurrent.CompletableFuture; + +/** The event for listing rebalance progress. */ +public class ListRebalanceProgressEvent implements CoordinatorEvent { + + private final @Nullable String rebalanceId; + private final CompletableFuture respCallback; + + public ListRebalanceProgressEvent( + @Nullable String rebalanceId, + CompletableFuture respCallback) { + this.rebalanceId = rebalanceId; + this.respCallback = respCallback; + } + + public @Nullable String getRabalanceId() { + return rebalanceId; + } + + public CompletableFuture getRespCallback() { + return respCallback; + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/RebalanceEvent.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/RebalanceEvent.java new file mode 100644 index 0000000000..63cb2fe9e4 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/RebalanceEvent.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.event; + +import org.apache.fluss.rpc.messages.RebalanceResponse; +import org.apache.fluss.server.coordinator.rebalance.goal.Goal; + +import java.util.List; +import java.util.concurrent.CompletableFuture; + +/** The event for rebalance. 
*/ +public class RebalanceEvent implements CoordinatorEvent { + + private final List goalsByPriority; + private final boolean isDryRun; + private final CompletableFuture respCallback; + + public RebalanceEvent( + List goalsByPriority, + boolean isDryRun, + CompletableFuture respCallback) { + this.goalsByPriority = goalsByPriority; + this.isDryRun = isDryRun; + this.respCallback = respCallback; + } + + public List getGoalsByPriority() { + return goalsByPriority; + } + + public boolean isDryRun() { + return isDryRun; + } + + public CompletableFuture getRespCallback() { + return respCallback; + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ActionAcceptance.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ActionAcceptance.java new file mode 100644 index 0000000000..5e815e9c34 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ActionAcceptance.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance; + +/** Flags to indicate if an action is acceptable by the goal(s). */ +public enum ActionAcceptance { + /** Action is acceptable -- i.e. it does not violate goal constraints. */ + ACCEPT, + /** + * Action is rejected in replica-level. But, the destination tabletServer may potentially accept + * actions of the same {@link ActionType} from the source tabletServer specified in the given + * action. + */ + REPLICA_REJECT, + + /** + * Action is rejected in server-level. hence, the destination tabletServer does not accept + * actions of the same {@link ActionType} from the source tabletServer specified in the given + * action. + */ + SERVER_REJECT +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ActionType.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ActionType.java new file mode 100644 index 0000000000..a24bf8acb6 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ActionType.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance; + +/** Flags to indicate the type of action. */ +public enum ActionType { + /** Move a replica from a source tabletServer to a destination tabletServer. */ + REPLICA_MOVEMENT, + + /** + * Move leadership of a leader from a source tabletServer to a follower of the same replica + * residing in a destination tabletServer. + */ + LEADERSHIP_MOVEMENT; +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ReBalancingAction.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ReBalancingAction.java new file mode 100644 index 0000000000..bd62f82c2a --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ReBalancingAction.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance; + +import org.apache.fluss.metadata.TableBucket; + +/** Represents the load rebalancing operation over a replica for Fluss Load GoalOptimizer. 
*/ +public class ReBalancingAction { + private final TableBucket tableBucket; + private final Integer sourceServerId; + private final Integer destinationServerId; + private final ActionType actionType; + + public ReBalancingAction( + TableBucket tableBucket, + Integer sourceServerId, + Integer destinationServerId, + ActionType actionType) { + this.tableBucket = tableBucket; + this.sourceServerId = sourceServerId; + this.destinationServerId = destinationServerId; + this.actionType = actionType; + } + + public TableBucket getTableBucket() { + return tableBucket; + } + + public Integer getSourceServerId() { + return sourceServerId; + } + + public Integer getDestinationServerId() { + return destinationServerId; + } + + public ActionType getActionType() { + return actionType; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + ReBalancingAction that = (ReBalancingAction) o; + + if (!tableBucket.equals(that.tableBucket)) { + return false; + } + if (!sourceServerId.equals(that.sourceServerId)) { + return false; + } + if (!destinationServerId.equals(that.destinationServerId)) { + return false; + } + return actionType == that.actionType; + } + + @Override + public int hashCode() { + int result = tableBucket.hashCode(); + result = 31 * result + sourceServerId.hashCode(); + result = 31 * result + destinationServerId.hashCode(); + result = 31 * result + actionType.hashCode(); + return result; + } + + @Override + public String toString() { + return "ReBalancingAction{" + + "tableBucket=" + + tableBucket + + ", sourceServerId=" + + sourceServerId + + ", destinationServerId=" + + destinationServerId + + ", actionType=" + + actionType + + '}'; + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/RebalanceManager.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/RebalanceManager.java new file mode 100644 index 0000000000..c251906355 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/RebalanceManager.java @@ -0,0 +1,437 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.fluss.server.coordinator.rebalance; + +import org.apache.fluss.annotation.VisibleForTesting; +import org.apache.fluss.cluster.rebalance.RebalancePlanForBucket; +import org.apache.fluss.cluster.rebalance.RebalanceProgress; +import org.apache.fluss.cluster.rebalance.RebalanceResultForBucket; +import org.apache.fluss.cluster.rebalance.RebalanceStatus; +import org.apache.fluss.cluster.rebalance.ServerTag; +import org.apache.fluss.exception.RebalanceFailureException; +import org.apache.fluss.metadata.TableBucket; +import org.apache.fluss.server.coordinator.CoordinatorContext; +import org.apache.fluss.server.coordinator.CoordinatorEventProcessor; +import org.apache.fluss.server.coordinator.rebalance.goal.Goal; +import org.apache.fluss.server.coordinator.rebalance.goal.GoalOptimizer; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.coordinator.rebalance.model.RackModel; +import org.apache.fluss.server.coordinator.rebalance.model.ServerModel; +import org.apache.fluss.server.metadata.ServerInfo; +import org.apache.fluss.server.zk.ZooKeeperClient; +import org.apache.fluss.server.zk.data.LeaderAndIsr; +import org.apache.fluss.server.zk.data.RebalancePlan; +import org.apache.fluss.utils.MapUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; +import javax.annotation.concurrent.GuardedBy; +import javax.annotation.concurrent.ThreadSafe; + +import java.util.ArrayDeque; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Queue; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import static org.apache.fluss.cluster.rebalance.RebalanceStatus.CANCELED; +import static org.apache.fluss.cluster.rebalance.RebalanceStatus.COMPLETED; +import static org.apache.fluss.cluster.rebalance.RebalanceStatus.NOT_STARTED; +import static org.apache.fluss.cluster.rebalance.RebalanceStatus.NO_TASK; +import static org.apache.fluss.cluster.rebalance.RebalanceStatus.REBALANCING; +import static org.apache.fluss.cluster.rebalance.RebalanceUtils.FINAL_STATUSES; +import static org.apache.fluss.utils.Preconditions.checkArgument; +import static org.apache.fluss.utils.Preconditions.checkNotNull; +import static org.apache.fluss.utils.concurrent.LockUtils.inLock; + +/** + * A rebalance manager to generate rebalance plan, and execution rebalance plan. + * + *

    This manager can only be used in {@link CoordinatorEventProcessor} as a single threaded model. + */ +@ThreadSafe +public class RebalanceManager { + + private static final Logger LOG = LoggerFactory.getLogger(RebalanceManager.class); + + private final AtomicBoolean isClosed = new AtomicBoolean(false); + private final Lock lock = new ReentrantLock(); + private final ZooKeeperClient zkClient; + private final CoordinatorEventProcessor eventProcessor; + + @GuardedBy("lock") + private final Queue ongoingRebalanceTasksQueue = new ArrayDeque<>(); + + /** A mapping from table bucket to rebalance status of pending and running tasks. */ + @GuardedBy("lock") + private final Map ongoingRebalanceTasks = + MapUtils.newConcurrentHashMap(); + + /** A mapping from table bucket to rebalance status of failed or completed tasks. */ + @GuardedBy("lock") + private final Map finishedRebalanceTasks = + MapUtils.newConcurrentHashMap(); + + @GuardedBy("lock") + private final GoalOptimizer goalOptimizer; + + @GuardedBy("lock") + private long registerTime; + + @GuardedBy("lock") + private volatile RebalanceStatus rebalanceStatus = NO_TASK; + + @GuardedBy("lock") + private volatile @Nullable String currentRebalanceId; + + public RebalanceManager(CoordinatorEventProcessor eventProcessor, ZooKeeperClient zkClient) { + this.eventProcessor = eventProcessor; + this.zkClient = zkClient; + this.goalOptimizer = new GoalOptimizer(); + } + + public void startup() { + LOG.info("Start up rebalance manager."); + initialize(); + } + + private void initialize() { + try { + zkClient.getRebalancePlan() + .ifPresent( + rebalancePlan -> + registerRebalance( + rebalancePlan.getRebalanceId(), + rebalancePlan.getExecutePlan())); + } catch (Exception e) { + LOG.error( + "Failed to get rebalance plan from zookeeper, it will be treated as no" + + "rebalance tasks.", + e); + } + } + + public void registerRebalance( + String rebalanceId, Map rebalancePlan) { + checkNotClosed(); + registerTime = System.currentTimeMillis(); + // Register to zookeeper first. + try { + // first clear all exists tasks. + ongoingRebalanceTasks.clear(); + ongoingRebalanceTasksQueue.clear(); + finishedRebalanceTasks.clear(); + + Optional existPlanOpt = zkClient.getRebalancePlan(); + RebalanceStatus newStatus = rebalancePlan.isEmpty() ? COMPLETED : NOT_STARTED; + if (!existPlanOpt.isPresent()) { + zkClient.registerRebalancePlan( + new RebalancePlan(rebalanceId, newStatus, rebalancePlan)); + } else { + RebalancePlan existPlan = existPlanOpt.get(); + if (FINAL_STATUSES.contains(existPlan.getRebalanceStatus())) { + zkClient.updateRebalancePlan( + new RebalancePlan(rebalanceId, newStatus, rebalancePlan)); + } else { + throw new RebalanceFailureException( + "Rebalance task already exists. Please wait for it to finish or cancel it first."); + } + } + + currentRebalanceId = rebalanceId; + rebalanceStatus = newStatus; + } catch (Exception e) { + LOG.error("Error when register rebalance plan to zookeeper.", e); + throw new RebalanceFailureException( + "Error when register rebalance plan to zookeeper.", e); + } + + inLock( + lock, + () -> { + // Then, register to ongoingRebalanceTasks. + rebalancePlan.forEach( + ((tableBucket, rebalancePlanForBucket) -> { + ongoingRebalanceTasksQueue.add(tableBucket); + ongoingRebalanceTasks.put( + tableBucket, + RebalanceResultForBucket.of( + rebalancePlanForBucket, NOT_STARTED)); + })); + + // Trigger one rebalance task to execute. + rebalanceStatus = rebalancePlan.isEmpty() ? 
COMPLETED : REBALANCING; + processNewRebalanceTask(); + }); + } + + public void finishRebalanceTask(TableBucket tableBucket, RebalanceStatus statusForBucket) { + checkNotClosed(); + inLock( + lock, + () -> { + if (ongoingRebalanceTasksQueue.contains(tableBucket)) { + ongoingRebalanceTasksQueue.remove(tableBucket); + RebalanceResultForBucket resultForBucket = + ongoingRebalanceTasks.remove(tableBucket); + checkNotNull(resultForBucket, "RebalanceResultForBucket is null."); + finishedRebalanceTasks.put( + tableBucket, + RebalanceResultForBucket.of( + resultForBucket.plan(), statusForBucket)); + LOG.info( + "Rebalance task {} in progress: {} tasks pending, {} completed.", + currentRebalanceId, + ongoingRebalanceTasksQueue.size(), + finishedRebalanceTasks.size()); + + if (ongoingRebalanceTasksQueue.isEmpty()) { + // All rebalance tasks are completed. + rebalanceStatus = COMPLETED; + completeRebalance(); + } else { + // Trigger one rebalance task to execute. + processNewRebalanceTask(); + } + } + }); + } + + public RebalanceProgress listRebalanceProgress(@Nullable String rebalanceId) { + checkNotClosed(); + return inLock( + lock, + () -> { + if (rebalanceId != null + && currentRebalanceId != null + && !rebalanceId.equals(currentRebalanceId)) { + LOG.warn( + "Ignore the list rebalance task because it is not the current" + + " rebalance task."); + return new RebalanceProgress( + currentRebalanceId, NO_TASK, 0.0, Collections.emptyMap()); + } + + Map progressForBucketMap = + new HashMap<>(); + progressForBucketMap.putAll(ongoingRebalanceTasks); + progressForBucketMap.putAll(finishedRebalanceTasks); + // the progress will be set at client. + return new RebalanceProgress( + currentRebalanceId, rebalanceStatus, 0.0, progressForBucketMap); + }); + } + + public void cancelRebalance(@Nullable String rebalanceId) { + checkNotClosed(); + inLock( + lock, + () -> { + try { + if (rebalanceId != null + && currentRebalanceId != null + && !rebalanceId.equals(currentRebalanceId)) { + // do nothing. + LOG.warn( + "Ignore the cancel rebalance task because it is not the current" + + " rebalance task."); + return; + } + + Optional rebalancePlanOpt = zkClient.getRebalancePlan(); + if (rebalancePlanOpt.isPresent()) { + RebalancePlan rebalancePlan = rebalancePlanOpt.get(); + zkClient.updateRebalancePlan( + new RebalancePlan( + rebalancePlan.getRebalanceId(), + CANCELED, + rebalancePlan.getExecutePlan())); + } + } catch (Exception e) { + LOG.error("Error when delete rebalance plan from zookeeper.", e); + } + + rebalanceStatus = CANCELED; + ongoingRebalanceTasksQueue.clear(); + ongoingRebalanceTasks.clear(); + + // Here, it will not clear finishedRebalanceTasks, because it will be used by + // listRebalanceProgress. It will be cleared when next register. + + LOG.info("Cancel rebalance task success."); + }); + } + + public boolean hasOngoingRebalance() { + checkNotClosed(); + return inLock( + lock, + () -> !ongoingRebalanceTasks.isEmpty() || !ongoingRebalanceTasksQueue.isEmpty()); + } + + public RebalancePlan generateRebalancePlan(List goalsByPriority) { + checkNotClosed(); + List rebalancePlanForBuckets; + try { + // Generate the latest cluster model. + ClusterModel clusterModel = buildClusterModel(eventProcessor.getCoordinatorContext()); + + // do optimize. + rebalancePlanForBuckets = goalOptimizer.doOptimizeOnce(clusterModel, goalsByPriority); + } catch (Exception e) { + LOG.error("Failed to generate rebalance plan.", e); + throw e; + } + + // group by tableId and partitionId to generate rebalance plan. 
+ return buildRebalancePlan(rebalancePlanForBuckets); + } + + public @Nullable RebalancePlanForBucket getRebalancePlanForBucket(TableBucket tableBucket) { + checkNotClosed(); + return inLock( + lock, + () -> { + RebalanceResultForBucket resultForBucket = + ongoingRebalanceTasks.get(tableBucket); + if (resultForBucket != null) { + return resultForBucket.plan(); + } + return null; + }); + } + + private void processNewRebalanceTask() { + TableBucket tableBucket = ongoingRebalanceTasksQueue.peek(); + if (tableBucket != null && ongoingRebalanceTasks.containsKey(tableBucket)) { + RebalanceResultForBucket resultForBucket = ongoingRebalanceTasks.get(tableBucket); + RebalanceResultForBucket rebalanceResultForBucket = + RebalanceResultForBucket.of(resultForBucket.plan(), REBALANCING); + eventProcessor.tryToExecuteRebalanceTask(rebalanceResultForBucket.plan()); + } + } + + private void completeRebalance() { + checkNotClosed(); + inLock( + lock, + () -> { + try { + Optional rebalancePlanOpt = zkClient.getRebalancePlan(); + if (rebalancePlanOpt.isPresent()) { + RebalancePlan rebalancePlan = rebalancePlanOpt.get(); + zkClient.updateRebalancePlan( + new RebalancePlan( + rebalancePlan.getRebalanceId(), + COMPLETED, + rebalancePlan.getExecutePlan())); + } + } catch (Exception e) { + LOG.error("Error when update rebalance plan from zookeeper.", e); + } + + ongoingRebalanceTasks.clear(); + ongoingRebalanceTasksQueue.clear(); + + // Here, it will not clear finishedRebalanceTasks, because it will be used by + // listRebalanceProgress. It will be cleared when next register. + + LOG.info( + "Rebalance complete with {} ms.", + System.currentTimeMillis() - registerTime); + }); + } + + private ClusterModel buildClusterModel(CoordinatorContext coordinatorContext) { + Map liveTabletServers = coordinatorContext.getLiveTabletServers(); + Map serverTags = coordinatorContext.getServerTags(); + + Map serverModelMap = new HashMap<>(); + for (ServerInfo serverInfo : liveTabletServers.values()) { + Integer id = serverInfo.id(); + String rack = serverInfo.rack() == null ? RackModel.DEFAULT_RACK : serverInfo.rack(); + if (serverTags.containsKey(id)) { + serverModelMap.put( + id, new ServerModel(id, rack, !isServerOffline(serverTags.get(id)))); + } else { + serverModelMap.put(id, new ServerModel(id, rack, true)); + } + } + + ClusterModel clusterModel = initialClusterModel(serverModelMap); + + // Try to update the cluster model with the latest bucket states. 
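+ // Each bucket contributes one replica model per assigned server; the replica whose
+ // server id equals the current leader is flagged as the leader replica.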
+ Set allBuckets = coordinatorContext.getAllBuckets(); + for (TableBucket tableBucket : allBuckets) { + List assignment = coordinatorContext.getAssignment(tableBucket); + Optional bucketLeaderAndIsrOpt = + coordinatorContext.getBucketLeaderAndIsr(tableBucket); + checkArgument(bucketLeaderAndIsrOpt.isPresent(), "Bucket leader and isr is empty."); + LeaderAndIsr isr = bucketLeaderAndIsrOpt.get(); + int leader = isr.leader(); + for (int i = 0; i < assignment.size(); i++) { + int replica = assignment.get(i); + clusterModel.createReplica(replica, tableBucket, i, leader == replica); + } + } + return clusterModel; + } + + private RebalancePlan buildRebalancePlan(List rebalancePlanForBuckets) { + Map bucketPlan = new HashMap<>(); + for (RebalancePlanForBucket rebalancePlanForBucket : rebalancePlanForBuckets) { + bucketPlan.put(rebalancePlanForBucket.getTableBucket(), rebalancePlanForBucket); + } + return new RebalancePlan(UUID.randomUUID().toString(), NOT_STARTED, bucketPlan); + } + + private boolean isServerOffline(ServerTag serverTag) { + return serverTag == ServerTag.PERMANENT_OFFLINE || serverTag == ServerTag.TEMPORARY_OFFLINE; + } + + private ClusterModel initialClusterModel(Map serverModelMap) { + SortedSet servers = new TreeSet<>(serverModelMap.values()); + return new ClusterModel(servers); + } + + private void checkNotClosed() { + if (isClosed.get()) { + throw new IllegalStateException("RebalanceManager is already closed."); + } + } + + public void close() { + isClosed.compareAndSet(false, true); + } + + @VisibleForTesting + public ClusterModel buildClusterModel() { + return buildClusterModel(eventProcessor.getCoordinatorContext()); + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ReplicaReassignment.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ReplicaReassignment.java new file mode 100644 index 0000000000..1e4ff17ea2 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/ReplicaReassignment.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +/** Replica reassignment. 
*/ +public class ReplicaReassignment { + private final List replicas; + private final List addingReplicas; + private final List removingReplicas; + + private ReplicaReassignment( + List replicas, List addingReplicas, List removingReplicas) { + this.replicas = Collections.unmodifiableList(replicas); + this.addingReplicas = Collections.unmodifiableList(addingReplicas); + this.removingReplicas = Collections.unmodifiableList(removingReplicas); + } + + private static ReplicaReassignment build( + List originReplicas, List targetReplicas) { + // targetReplicas behind originReplicas in full set. + List fullReplicaSet = new ArrayList<>(targetReplicas); + fullReplicaSet.addAll(originReplicas); + fullReplicaSet = fullReplicaSet.stream().distinct().collect(Collectors.toList()); + + List newAddingReplicas = new ArrayList<>(fullReplicaSet); + newAddingReplicas.removeAll(originReplicas); + + List newRemovingReplicas = new ArrayList<>(originReplicas); + newRemovingReplicas.removeAll(targetReplicas); + + return new ReplicaReassignment(fullReplicaSet, newAddingReplicas, newRemovingReplicas); + } + + private List getTargetReplicas() { + List computed = new ArrayList<>(replicas); + computed.removeAll(removingReplicas); + return Collections.unmodifiableList(computed); + } + + private List getOriginReplicas() { + List computed = new ArrayList<>(replicas); + computed.removeAll(addingReplicas); + return Collections.unmodifiableList(computed); + } + + private boolean isBeingReassigned() { + return !addingReplicas.isEmpty() || !removingReplicas.isEmpty(); + } + + @Override + public String toString() { + return String.format( + "ReplicaAssignment(replicas=%s, addingReplicas=%s, removingReplicas=%s)", + replicas, addingReplicas, removingReplicas); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + ReplicaReassignment that = (ReplicaReassignment) o; + return Objects.equals(replicas, that.replicas) + && Objects.equals(addingReplicas, that.addingReplicas) + && Objects.equals(removingReplicas, that.removingReplicas); + } + + @Override + public int hashCode() { + return Objects.hash(replicas, addingReplicas, removingReplicas); + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/AbstractGoal.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/AbstractGoal.java new file mode 100644 index 0000000000..513d7209db --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/AbstractGoal.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.exception.RebalanceFailureException; +import org.apache.fluss.metadata.TableBucket; +import org.apache.fluss.server.coordinator.rebalance.ActionAcceptance; +import org.apache.fluss.server.coordinator.rebalance.ActionType; +import org.apache.fluss.server.coordinator.rebalance.ReBalancingAction; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModelStats; +import org.apache.fluss.server.coordinator.rebalance.model.ReplicaModel; +import org.apache.fluss.server.coordinator.rebalance.model.ServerModel; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Set; +import java.util.SortedSet; + +import static org.apache.fluss.server.coordinator.rebalance.ActionAcceptance.ACCEPT; +import static org.apache.fluss.server.coordinator.rebalance.ActionType.LEADERSHIP_MOVEMENT; +import static org.apache.fluss.server.coordinator.rebalance.ActionType.REPLICA_MOVEMENT; +import static org.apache.fluss.server.coordinator.rebalance.goal.GoalOptimizerUtils.isProposalAcceptableForOptimizedGoals; +import static org.apache.fluss.server.coordinator.rebalance.goal.GoalUtils.legitMove; + +/** An abstract class for goals. */ +public abstract class AbstractGoal implements Goal { + private static final Logger LOG = LoggerFactory.getLogger(AbstractGoal.class); + protected boolean finished; + protected boolean succeeded; + + public AbstractGoal() { + finished = false; + succeeded = true; + } + + @Override + public void optimize(ClusterModel clusterModel, Set optimizedGoals) { + LOG.debug("Starting Optimizing for goal {}", name()); + // Initialize pre-optimized stats. + ClusterModelStats statsBeforeOptimization = clusterModel.getClusterStats(); + LOG.trace("[PRE - {}] {}", name(), statsBeforeOptimization); + finished = false; + long goalStartTime = System.currentTimeMillis(); + initGoalState(clusterModel); + SortedSet offlineServers = clusterModel.offlineServers(); + + while (!finished) { + for (ServerModel server : serversToBalance(clusterModel)) { + rebalanceForServer(server, clusterModel, optimizedGoals); + } + updateGoalState(clusterModel); + } + + ClusterModelStats statsAfterOptimization = clusterModel.getClusterStats(); + LOG.trace("[POST - {}] {}", name(), statsAfterOptimization); + if (LOG.isDebugEnabled()) { + LOG.debug( + "Finished optimization for {} in {}ms.", + name(), + System.currentTimeMillis() - goalStartTime); + } + LOG.trace("Cluster after optimization is {}", clusterModel); + // The optimization cannot make stats worse unless the cluster has (1) offline servers for + // replica move with replicas. + if (offlineServers.isEmpty()) { + ClusterModelStatsComparator comparator = clusterModelStatsComparator(); + // Throw exception when the stats before optimization is preferred. + if (comparator.compare(statsAfterOptimization, statsBeforeOptimization) < 0) { + // If a goal provides worse stats after optimization, that indicates an + // implementation error with the goal. + throw new IllegalStateException( + String.format( + "Optimization for goal %s failed because the optimized result is worse than before." 
+ + " Reason: %s.", + name(), comparator.explainLastComparison())); + } + } + } + + @Override + public void finish() { + finished = true; + } + + @Override + public String name() { + return this.getClass().getSimpleName(); + } + + /** + * Get sorted tabletServers that the rebalance process will go over to apply balancing actions + * to replicas they contain. + */ + protected SortedSet serversToBalance(ClusterModel clusterModel) { + return clusterModel.servers(); + } + + /** + * Initialize states that this goal requires. E.g. run sanity checks regarding hard goals + * requirements. + */ + protected abstract void initGoalState(ClusterModel clusterModel) + throws RebalanceFailureException; + + /** + * Rebalance the given tabletServers without violating the constraints of the current goal and + * optimized goals. + */ + protected abstract void rebalanceForServer( + ServerModel server, ClusterModel clusterModel, Set optimizedGoals) + throws RebalanceFailureException; + + /** Update goal state after one round of rebalance. */ + protected abstract void updateGoalState(ClusterModel clusterModel) + throws RebalanceFailureException; + + /** + * Check if requirements of this goal are not violated if this action is applied to the given + * cluster state, {@code false} otherwise. + */ + protected abstract boolean selfSatisfied(ClusterModel clusterModel, ReBalancingAction action); + + /** + * Attempt to apply the given balancing action to the given replica in the given cluster. The + * application considers the candidate tabletServers as the potential destination tabletServers + * for replica movement or the location of followers for leadership transfer. If the movement + * attempt succeeds, the function returns the server id of the destination, otherwise the + * function returns null. + */ + protected ServerModel maybeApplyBalancingAction( + ClusterModel clusterModel, + ReplicaModel replica, + Collection candidateServers, + ActionType action, + Set optimizedGoals) { + List eligibleServers = new ArrayList<>(candidateServers); + TableBucket tableBucket = replica.tableBucket(); + for (ServerModel server : eligibleServers) { + ReBalancingAction proposal = + new ReBalancingAction(tableBucket, replica.server().id(), server.id(), action); + // A replica should be moved if: + // 0. The move is legit. + // 1. The goal requirements are not violated if this action is applied to the given + // cluster state. + // 2. The movement is acceptable by the previously optimized goals. 
+ + if (!legitMove(replica, server, clusterModel, action)) { + LOG.trace("Replica move to server is not legit for {}.", proposal); + continue; + } + + if (!selfSatisfied(clusterModel, proposal)) { + LOG.trace("Unable to self-satisfy proposal {}.", proposal); + continue; + } + + ActionAcceptance acceptance = + isProposalAcceptableForOptimizedGoals(optimizedGoals, proposal, clusterModel); + LOG.trace( + "Trying to apply legit and self-satisfied action {}, actionAcceptance = {}", + proposal, + acceptance); + if (acceptance == ACCEPT) { + if (action == LEADERSHIP_MOVEMENT) { + clusterModel.relocateLeadership( + tableBucket, replica.server().id(), server.id()); + } else if (action == REPLICA_MOVEMENT) { + clusterModel.relocateReplica(tableBucket, replica.server().id(), server.id()); + } + return server; + } + } + return null; + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/Goal.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/Goal.java new file mode 100644 index 0000000000..e4ab551c53 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/Goal.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.exception.RebalanceFailureException; +import org.apache.fluss.server.coordinator.rebalance.ActionAcceptance; +import org.apache.fluss.server.coordinator.rebalance.ReBalancingAction; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModelStats; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.util.Comparator; +import java.util.Set; + +/** This is the interface of the optimization goals used for rebalance. */ +public interface Goal { + Logger LOG = LoggerFactory.getLogger(Goal.class); + + /** + * Optimize the given cluster model as needed for this goal. + * + *

    The method will be given a cluster model. The goal can try to optimize the cluster model + * by performing some admin operations (e.g. moving replicas or the leaders of tableBuckets). + * + *
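+ *
+ * <p>As a rough sketch of how an optimizer is expected to drive this method (names below are
+ * illustrative; see {@code GoalOptimizer} in this change for the actual flow):
+ *
+ * <pre>{@code
+ * Set<Goal> optimizedGoals = new HashSet<>();
+ * for (Goal goal : goalsByPriority) {
+ *     // each goal must keep the already optimized goals satisfied
+ *     goal.optimize(clusterModel, optimizedGoals);
+ *     optimizedGoals.add(goal);
+ * }
+ * }</pre>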

    During the optimization, the implementation should make sure that all the previously + * optimized goals are still satisfied after this method completes its execution. The + * implementation can use {@link #actionAcceptance(ReBalancingAction, ClusterModel)} to check + * whether an admin operation is allowed by a previously optimized goal. + * + *
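+ *
+ * <p>For instance (an illustrative sketch mirroring {@code GoalOptimizerUtils} in this change), a
+ * goal can skip a candidate action that a previously optimized goal would not accept:
+ *
+ * <pre>{@code
+ * for (Goal optimizedGoal : optimizedGoals) {
+ *     if (optimizedGoal.actionAcceptance(action, clusterModel) != ActionAcceptance.ACCEPT) {
+ *         return false; // applying this action would violate an already optimized goal
+ *     }
+ * }
+ * }</pre>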

    The implementation of a soft goal should track whether the goal has + * been met after the optimization or not. + * + *

    The implementation of a hard goal should throw an {@link RebalanceFailureException} when + * the goal cannot be met. This will then fail the entire optimization attempt. + */ + void optimize(ClusterModel clusterModel, Set optimizedGoals); + + /** + * Check whether the given action is acceptable by this goal in the given state of the cluster. + * An action is (1) accepted by a goal if it satisfies requirements of the goal, or (2) rejected + * by a goal if it violates its requirements. The return value indicates whether the action is + * accepted or why it is rejected. + */ + ActionAcceptance actionAcceptance(ReBalancingAction action, ClusterModel clusterModel); + + /** + * Get an instance of {@link ClusterModelStatsComparator} for this goal. + * + *

    The {@link ClusterModelStatsComparator#compare(ClusterModelStats, ClusterModelStats)} + * method should give a preference between two {@link ClusterModelStats}. + * + *

    The returned value must not be null. + * + * @return An instance of {@link ClusterModelStatsComparator} for this goal. + */ + ClusterModelStatsComparator clusterModelStatsComparator(); + + /** + * Signal for finishing the process for rebalance. It is intended to mark the goal optimization + * as finished and perform the memory clean up after the goal optimization. + */ + void finish(); + + /** + * @return {@code true} if this is a hard goal, {@code false} otherwise. + */ + boolean isHardGoal(); + + /** + * @return The name of this goal. Name of a goal provides an identification for the goal in + * human-readable format. + */ + String name(); + + /** + * A comparator that compares two cluster model stats. + * + *
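+ *
+ * <p>A minimal sketch of a {@code compare} implementation (illustrative; the distribution goals
+ * in this change compare the standard deviation of replica counts in the same way):
+ *
+ * <pre>{@code
+ * public int compare(ClusterModelStats stats1, ClusterModelStats stats2) {
+ *     double stDev1 = stats1.replicaStats().get(Statistic.ST_DEV).doubleValue();
+ *     double stDev2 = stats2.replicaStats().get(Statistic.ST_DEV).doubleValue();
+ *     // a positive result means stats1 (the lower deviation) is preferred
+ *     return GoalOptimizerUtils.compare(stDev2, stDev1, EPSILON);
+ * }
+ * }</pre>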

    Note: this comparator imposes orderings that are inconsistent with equals. + */ + interface ClusterModelStatsComparator extends Comparator, Serializable { + + /** + * Compare two cluster model stats and determine which stats is preferred. + * + * @param stats1 the first stats + * @param stats2 the second stats + * @return Positive value if stats1 is preferred, 0 if the two stats are equally preferred, + * negative value if stats2 is preferred. + */ + @Override + int compare(ClusterModelStats stats1, ClusterModelStats stats2); + + /** + * This is a method to get the reason for the last comparison. The implementation should at + * least provide a reason when the last comparison returns negative value. + * + * @return A string that explains the result of last comparison. + */ + String explainLastComparison(); + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizer.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizer.java new file mode 100644 index 0000000000..6a34c0122a --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizer.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.cluster.rebalance.RebalancePlanForBucket; +import org.apache.fluss.metadata.TableBucket; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.apache.fluss.server.coordinator.rebalance.goal.GoalOptimizerUtils.getDiff; +import static org.apache.fluss.server.coordinator.rebalance.goal.GoalOptimizerUtils.hasDiff; + +/** A class for optimizing goals in the given order of priority. */ +public class GoalOptimizer { + private static final Logger LOG = LoggerFactory.getLogger(GoalOptimizer.class); + + public List doOptimizeOnce( + ClusterModel clusterModel, List goalsByPriority) { + LOG.trace("Cluster before optimization is {}", clusterModel); + Map> initReplicaDistribution = + clusterModel.getReplicaDistribution(); + Map initLeaderDistribution = clusterModel.getLeaderDistribution(); + + // Set of balancing proposals that will be applied to the given cluster state to satisfy + // goals (leadership transfer AFTER bucket transfer.) + Set optimizedGoals = new HashSet<>(); + Map> preOptimizedReplicaDistribution = null; + Map preOptimizedLeaderDistribution = null; + for (Goal goal : goalsByPriority) { + preOptimizedReplicaDistribution = + preOptimizedReplicaDistribution == null + ? 
initReplicaDistribution + : clusterModel.getReplicaDistribution(); + preOptimizedLeaderDistribution = + preOptimizedLeaderDistribution == null + ? initLeaderDistribution + : clusterModel.getLeaderDistribution(); + + // executing the goal optimization. + goal.optimize(clusterModel, optimizedGoals); + optimizedGoals.add(goal); + + boolean hasDiff = + hasDiff( + preOptimizedReplicaDistribution, + preOptimizedLeaderDistribution, + clusterModel); + LOG.info( + "[{}/{}] Generated {} proposals for {}", + optimizedGoals.size(), + goalsByPriority.size(), + hasDiff ? "some" : "no", + goal.name()); + } + + return getDiff(initReplicaDistribution, initLeaderDistribution, clusterModel); + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizerUtils.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizerUtils.java new file mode 100644 index 0000000000..dadde867b2 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizerUtils.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.cluster.rebalance.RebalancePlanForBucket; +import org.apache.fluss.metadata.TableBucket; +import org.apache.fluss.server.coordinator.rebalance.ActionAcceptance; +import org.apache.fluss.server.coordinator.rebalance.ReBalancingAction; +import org.apache.fluss.server.coordinator.rebalance.model.BucketModel; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.coordinator.rebalance.model.ReplicaModel; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.apache.fluss.server.coordinator.rebalance.ActionAcceptance.ACCEPT; +import static org.apache.fluss.utils.Preconditions.checkNotNull; + +/** An util class for {@link GoalOptimizer}. */ +public class GoalOptimizerUtils { + + public static final double EPSILON = 1E-5; + + /** Check whether the given proposal is acceptable for all the given optimized goals. */ + public static ActionAcceptance isProposalAcceptableForOptimizedGoals( + Set optimizedGoals, ReBalancingAction action, ClusterModel cluster) { + for (Goal goal : optimizedGoals) { + ActionAcceptance acceptance = goal.actionAcceptance(action, cluster); + if (acceptance != ACCEPT) { + return acceptance; + } + } + return ACCEPT; + } + + /** + * Compare the given values. + * + *

+     *     1. Return 1 if the first value is larger than the second by more than epsilon.
+     *     2. Return -1 if the second value is larger than the first by more than epsilon.
+     *     3. Return 0 otherwise, i.e. when the two values are approximately equal.
+     * 
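+     *
+     *     For instance (illustrative values, with EPSILON = 1E-5): compare(5.0, 3.0, EPSILON)
+     *     returns 1, compare(3.0, 5.0, EPSILON) returns -1, and compare(3.0, 3.0 + 1E-7, EPSILON)
+     *     returns 0 because the two values differ by less than epsilon.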
    + */ + public static int compare(double d1, double d2, double epsilon) { + if (d2 - d1 > epsilon) { + // Second value is larger than the first value. + return -1; + } + if (d1 - d2 > epsilon) { + // First value is larger than the second value. + return 1; + } + // Given values are approximately equal. + return 0; + } + + /** + * Get whether there is any diff represented by a set of rebalance plan to move from the initial + * to final distribution. + */ + public static boolean hasDiff( + Map> initialReplicaDistribution, + Map initialLeaderDistribution, + ClusterModel optimizedCluster) { + Map> finalReplicaDistribution = + optimizedCluster.getReplicaDistribution(); + sanityCheckReplicaDistribution(initialReplicaDistribution, finalReplicaDistribution); + + boolean hasDiff = false; + for (Map.Entry> entry : initialReplicaDistribution.entrySet()) { + TableBucket tableBucket = entry.getKey(); + List initialReplicas = entry.getValue(); + List finalReplicas = finalReplicaDistribution.get(tableBucket); + + if (!finalReplicas.equals(initialReplicas)) { + hasDiff = true; + break; + } else { + BucketModel bucket = optimizedCluster.bucket(tableBucket); + checkNotNull(bucket, "Bucket is not in the cluster."); + ReplicaModel finalLeaderReplica = bucket.leader(); + checkNotNull(finalLeaderReplica, "Leader replica is not in the bucket."); + Integer finalLeader = finalLeaderReplica.server().id(); + if (!initialLeaderDistribution.get(tableBucket).equals(finalLeader)) { + hasDiff = true; + break; + } + // The bucket has no change. + } + } + return hasDiff; + } + + /** + * Get the diff represented by the set of rebalance plan for bucket to move from initial to + * final distribution. + */ + public static List getDiff( + Map> initialReplicaDistribution, + Map initialLeaderDistribution, + ClusterModel optimizedCluster) { + Map> finalReplicaDistribution = + optimizedCluster.getReplicaDistribution(); + sanityCheckReplicaDistribution(initialReplicaDistribution, finalReplicaDistribution); + + // Generate a set of rebalance plans to represent the diff between initial and final + // distribution. + List diff = new ArrayList<>(); + for (Map.Entry> entry : initialReplicaDistribution.entrySet()) { + TableBucket tableBucket = entry.getKey(); + List initialReplicas = entry.getValue(); + List finalReplicas = finalReplicaDistribution.get(tableBucket); + BucketModel bucket = optimizedCluster.bucket(tableBucket); + checkNotNull(bucket, "Bucket is not in the cluster."); + ReplicaModel finalLeaderReplica = bucket.leader(); + checkNotNull(finalLeaderReplica, "Leader replica is not in the bucket."); + int finalLeader = finalLeaderReplica.server().id(); + // The bucket has no change. + if (finalReplicas.equals(initialReplicas) + && initialLeaderDistribution.get(tableBucket).equals(finalLeader)) { + continue; + } + // We need to adjust the final server list order to ensure the final leader is the first + // replica. + if (finalLeader != finalReplicas.get(0)) { + int leaderPos = finalReplicas.indexOf(finalLeader); + finalReplicas.set(leaderPos, finalReplicas.get(0)); + finalReplicas.set(0, finalLeader); + } + diff.add( + new RebalancePlanForBucket( + tableBucket, + initialLeaderDistribution.get(tableBucket), + finalLeader, + initialReplicas, + finalReplicas)); + } + return diff; + } + + /** + * Sanity check to ensure that initial and final replica distribution have exactly the same + * buckets. 
+ */ + private static void sanityCheckReplicaDistribution( + Map> initialReplicaDistribution, + Map> finalReplicaDistribution) { + // Sanity check to make sure that given distributions contain the same replicas. + if (!initialReplicaDistribution.keySet().equals(finalReplicaDistribution.keySet())) { + throw new IllegalArgumentException( + "Initial and final replica distributions do not contain the same buckets."); + } + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalUtils.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalUtils.java new file mode 100644 index 0000000000..81d137602b --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalUtils.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.cluster.rebalance.GoalType; +import org.apache.fluss.server.coordinator.rebalance.ActionType; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.coordinator.rebalance.model.ReplicaModel; +import org.apache.fluss.server.coordinator.rebalance.model.ServerModel; + +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; + +/** An util class for {@link Goal}. */ +public class GoalUtils { + + public static Goal getGoalByType(GoalType goalType) { + switch (goalType) { + case REPLICA_DISTRIBUTION_GOAL: + return new ReplicaDistributionGoal(); + case LEADER_DISTRIBUTION_GOAL: + return new LeaderReplicaDistributionGoal(); + default: + throw new IllegalArgumentException("Unsupported goal type " + goalType); + } + } + + /** + * Check whether the proposed action is legit. An action is legit if it is: + * + *
      + *
    • 1. a replica movement across tabletServers: the destination server does not already have a replica of + * the same bucket and is allowed to host a replica of that bucket
    • 2. a leadership movement: the replica is a leader and the destination server has a follower of + * the same bucket
    + */ + public static boolean legitMove( + ReplicaModel replica, + ServerModel destServer, + ClusterModel cluster, + ActionType actionType) { + switch (actionType) { + case REPLICA_MOVEMENT: + return cluster.bucket(replica.tableBucket()).canAssignReplicaToServer(destServer) + && destServer.replica(replica.tableBucket()) == null; + case LEADERSHIP_MOVEMENT: + return replica.isLeader() && destServer.replica(replica.tableBucket()) != null; + default: + return false; + } + } + + /** + * Retrieve alive servers ids that are not excluded for replica moves. Returns a set to provide + * constant time lookup guaranteed by a HashSet. + */ + public static Set aliveServersNotExcludeForReplicaMove(ClusterModel cluster) { + return cluster.aliveServers().stream() + .map(ServerModel::id) + .collect(Collectors.toCollection(HashSet::new)); + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/LeaderReplicaDistributionGoal.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/LeaderReplicaDistributionGoal.java new file mode 100644 index 0000000000..8b5faea263 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/LeaderReplicaDistributionGoal.java @@ -0,0 +1,334 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.exception.RebalanceFailureException; +import org.apache.fluss.server.coordinator.rebalance.ActionAcceptance; +import org.apache.fluss.server.coordinator.rebalance.ActionType; +import org.apache.fluss.server.coordinator.rebalance.ReBalancingAction; +import org.apache.fluss.server.coordinator.rebalance.model.BucketModel; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModelStats; +import org.apache.fluss.server.coordinator.rebalance.model.ReplicaModel; +import org.apache.fluss.server.coordinator.rebalance.model.ServerModel; +import org.apache.fluss.server.coordinator.rebalance.model.Statistic; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.PriorityQueue; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.stream.Collectors; + +import static org.apache.fluss.server.coordinator.rebalance.ActionAcceptance.ACCEPT; +import static org.apache.fluss.server.coordinator.rebalance.ActionAcceptance.REPLICA_REJECT; +import static org.apache.fluss.server.coordinator.rebalance.goal.GoalOptimizerUtils.EPSILON; +import static org.apache.fluss.server.coordinator.rebalance.goal.ReplicaDistributionAbstractGoal.ChangeType.ADD; +import static org.apache.fluss.server.coordinator.rebalance.goal.ReplicaDistributionAbstractGoal.ChangeType.REMOVE; +import static org.apache.fluss.utils.Preconditions.checkNotNull; + +/** + * Soft goal to generate leadership movement and leader replica movement proposals to ensure that + * the number of leader replicas on each server is. + * + *
      + *
    • Under: (the average number of leader replicas per server) * (1 + leader replica count + * balance percentage) + *
    • Above: (the average number of leader replicas per server) * Math.max(0, 1 - leader replica + * count balance percentage) + *
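+ *
+ * <p>Worked example (illustrative numbers): with the 1.10 threshold used below and the 0.9 balance
+ * margin applied in {@code ReplicaDistributionAbstractGoal}, an average of 20 leader replicas per
+ * alive server gives an upper limit of ceil(20 * 1.09) = 22 and a lower limit of
+ * floor(20 * 0.91) = 18 leader replicas per server.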
    + */ +public class LeaderReplicaDistributionGoal extends ReplicaDistributionAbstractGoal { + + private static final Logger LOG = LoggerFactory.getLogger(LeaderReplicaDistributionGoal.class); + + /** + * The maximum allowed extent of unbalance for leader replica distribution. For example, 1.10 + * means the highest leader replica count of a server should not be 1.10x of average leader + * replica count of all alive tabletServers. + */ + private static final Double LEADER_REPLICA_COUNT_REBALANCE_THRESHOLD = 1.10d; + + @Override + public ActionAcceptance actionAcceptance(ReBalancingAction action, ClusterModel clusterModel) { + ServerModel sourceServer = clusterModel.server(action.getSourceServerId()); + checkNotNull( + sourceServer, "Source server " + action.getSourceServerId() + " is not found."); + ReplicaModel sourceReplica = sourceServer.replica(action.getTableBucket()); + checkNotNull(sourceReplica, "Source replica " + action.getTableBucket() + " is not found."); + ServerModel destServer = clusterModel.server(action.getDestinationServerId()); + switch (action.getActionType()) { + case LEADERSHIP_MOVEMENT: + return isLeaderMovementSatisfiable(sourceServer, destServer); + case REPLICA_MOVEMENT: + if (sourceReplica.isLeader()) { + return isLeaderMovementSatisfiable(sourceServer, destServer); + } + return ACCEPT; + default: + throw new IllegalArgumentException( + "Unsupported action type " + action.getActionType()); + } + } + + @Override + protected void rebalanceForServer( + ServerModel server, ClusterModel clusterModel, Set optimizedGoals) + throws RebalanceFailureException { + LOG.debug( + "Rebalancing server {} [limits] lower: {} upper: {}.", + server.id(), + rebalanceLowerLimit, + rebalanceUpperLimit); + int numLeaderReplicas = server.leaderReplicas().size(); + boolean isExcludedForReplicaMove = isExcludedForReplicaMove(server); + boolean requireLessLeaderReplicas = + numLeaderReplicas > (isExcludedForReplicaMove ? 0 : rebalanceUpperLimit) + || !server.isAlive(); + boolean requireMoreLeaderReplicas = + !isExcludedForReplicaMove + && server.isAlive() + && numLeaderReplicas < rebalanceLowerLimit; + // Update server ids over the balance limit for logging purposes. + if (((requireLessLeaderReplicas + && rebalanceByMovingLeadershipOut(server, clusterModel, optimizedGoals))) + && rebalanceByMovingReplicasOut(server, clusterModel, optimizedGoals)) { + serverIdsAboveRebalanceUpperLimit.add(server.id()); + LOG.debug( + "Failed to sufficiently decrease leader replica count in server {}. Leader replicas: {}.", + server.id(), + server.leaderReplicas().size()); + } else if (requireMoreLeaderReplicas + && rebalanceByMovingLeadershipIn(server, clusterModel, optimizedGoals) + && rebalanceByMovingLeaderReplicasIn(server, clusterModel, optimizedGoals)) { + serverIdsBelowRebalanceLowerLimit.add(server.id()); + LOG.debug( + "Failed to sufficiently increase leader replica count in server {}. 
Leader replicas: {}.", + server.id(), + server.leaderReplicas().size()); + } + } + + @Override + public ClusterModelStatsComparator clusterModelStatsComparator() { + return new LeaderReplicaDistributionGoalStatsComparator(); + } + + @Override + int numInterestedReplicas(ClusterModel clusterModel) { + return clusterModel.numLeaderReplicas(); + } + + @Override + double balancePercentage() { + return LEADER_REPLICA_COUNT_REBALANCE_THRESHOLD; + } + + private ActionAcceptance isLeaderMovementSatisfiable( + ServerModel sourceServer, ServerModel destServer) { + return (isReplicaCountUnderBalanceUpperLimitAfterChange( + destServer, destServer.leaderReplicas().size(), ADD) + && (isExcludedForReplicaMove(sourceServer) + || isReplicaCountAboveBalanceLowerLimitAfterChange( + sourceServer, + sourceServer.leaderReplicas().size(), + REMOVE))) + ? ACCEPT + : REPLICA_REJECT; + } + + private boolean rebalanceByMovingLeadershipOut( + ServerModel server, ClusterModel cluster, Set optimizedGoals) { + // If the source server is excluded for replica move, set its upper limit to 0. + int balanceUpperLimitForSourceServer = + isExcludedForReplicaMove(server) ? 0 : rebalanceUpperLimit; + int numLeaderReplicas = server.leaderReplicas().size(); + for (ReplicaModel leader : new HashSet<>(server.leaderReplicas())) { + BucketModel bucketModel = cluster.bucket(leader.tableBucket()); + checkNotNull(bucketModel, "Bucket " + leader.tableBucket() + " is not found."); + Set candidateServers = + bucketModel.bucketServers().stream() + .filter(b -> b != server) + .collect(Collectors.toSet()); + ServerModel b = + maybeApplyBalancingAction( + cluster, + leader, + candidateServers, + ActionType.LEADERSHIP_MOVEMENT, + optimizedGoals); + // Only check if we successfully moved something. + if (b != null) { + if (--numLeaderReplicas <= balanceUpperLimitForSourceServer) { + return false; + } + } + } + return true; + } + + private boolean rebalanceByMovingLeadershipIn( + ServerModel server, ClusterModel cluster, Set optimizedGoals) { + int numLeaderReplicas = server.leaderReplicas().size(); + Set candidateServers = Collections.singleton(server); + for (ReplicaModel replica : server.replicas()) { + if (replica.isLeader()) { + continue; + } + + BucketModel bucket = cluster.bucket(replica.tableBucket()); + checkNotNull(bucket, "Bucket " + replica.tableBucket() + " is not found."); + ServerModel b = + maybeApplyBalancingAction( + cluster, + Objects.requireNonNull(bucket.leader()), + candidateServers, + ActionType.LEADERSHIP_MOVEMENT, + optimizedGoals); + // Only check if we successfully moved something. + if (b != null) { + if (++numLeaderReplicas >= rebalanceLowerLimit) { + return false; + } + } + } + return true; + } + + private boolean rebalanceByMovingReplicasOut( + ServerModel server, ClusterModel cluster, Set optimizedGoals) { + // Get the eligible servers. + SortedSet candidateServers; + candidateServers = + new TreeSet<>( + Comparator.comparingInt((ServerModel b) -> b.leaderReplicas().size()) + .thenComparingInt(ServerModel::id)); + candidateServers.addAll( + cluster.aliveServers().stream() + .filter(b -> b.leaderReplicas().size() < rebalanceUpperLimit) + .collect(Collectors.toSet())); + + int balanceUpperLimit = rebalanceUpperLimit; + int numReplicas = server.replicas().size(); + for (ReplicaModel replica : server.replicas()) { + ServerModel b = + maybeApplyBalancingAction( + cluster, + replica, + candidateServers, + ActionType.REPLICA_MOVEMENT, + optimizedGoals); + // Only check if we successfully moved something. 
+ if (b != null) { + if (--numReplicas <= balanceUpperLimit) { + return false; + } + // Remove and reinsert the server so the order is correct. + candidateServers.remove(b); + if (b.leaderReplicas().size() < rebalanceUpperLimit) { + candidateServers.add(b); + } + } + } + return true; + } + + private boolean rebalanceByMovingLeaderReplicasIn( + ServerModel server, ClusterModel clusterModel, Set optimizedGoals) { + PriorityQueue eligibleServers = + new PriorityQueue<>( + (b1, b2) -> { + int result = + Integer.compare( + b2.leaderReplicas().size(), b1.leaderReplicas().size()); + return result == 0 ? Integer.compare(b1.id(), b2.id()) : result; + }); + + for (ServerModel aliveServer : clusterModel.aliveServers()) { + if (aliveServer.leaderReplicas().size() > rebalanceLowerLimit) { + eligibleServers.add(aliveServer); + } + } + List candidateServers = Collections.singletonList(server); + int numLeaderReplicas = server.leaderReplicas().size(); + while (!eligibleServers.isEmpty()) { + ServerModel sourceServer = eligibleServers.poll(); + for (ReplicaModel replica : sourceServer.replicas()) { + ServerModel b = + maybeApplyBalancingAction( + clusterModel, + replica, + candidateServers, + ActionType.REPLICA_MOVEMENT, + optimizedGoals); + // Only need to check status if the action is taken. This will also handle the case + // that the source server has nothing to move in. In that case we will never + // reenqueue that source server. + if (b != null) { + if (++numLeaderReplicas >= rebalanceLowerLimit) { + return false; + } + // If the source server has a lower number of leader replicas than the next + // server in the eligible server queue, we reenqueue the source server and + // switch to the next server. + if (!eligibleServers.isEmpty() + && sourceServer.leaderReplicas().size() + < eligibleServers.peek().leaderReplicas().size()) { + eligibleServers.add(sourceServer); + break; + } + } + } + } + return true; + } + + private class LeaderReplicaDistributionGoalStatsComparator + implements ClusterModelStatsComparator { + private String reasonForLastNegativeResult; + + @Override + public int compare(ClusterModelStats stats1, ClusterModelStats stats2) { + // Standard deviation of number of leader replicas over alive servers in the current + // must be less than the pre-optimized stats. + double stDev1 = stats1.leaderReplicaStats().get(Statistic.ST_DEV).doubleValue(); + double stDev2 = stats2.leaderReplicaStats().get(Statistic.ST_DEV).doubleValue(); + int result = GoalOptimizerUtils.compare(stDev2, stDev1, EPSILON); + if (result < 0) { + reasonForLastNegativeResult = + String.format( + "Violated %s. [Std Deviation of Leader Replica Distribution] post-" + + "optimization:%.3f pre-optimization:%.3f", + name(), stDev1, stDev2); + } + return result; + } + + @Override + public String explainLastComparison() { + return reasonForLastNegativeResult; + } + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/ReplicaDistributionAbstractGoal.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/ReplicaDistributionAbstractGoal.java new file mode 100644 index 0000000000..cbd55305c5 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/ReplicaDistributionAbstractGoal.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.exception.RebalanceFailureException; +import org.apache.fluss.server.coordinator.rebalance.ReBalancingAction; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.coordinator.rebalance.model.ServerModel; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashSet; +import java.util.Set; + +import static org.apache.fluss.server.coordinator.rebalance.ActionAcceptance.ACCEPT; +import static org.apache.fluss.server.coordinator.rebalance.goal.GoalUtils.aliveServersNotExcludeForReplicaMove; + +/** An abstract class for goals that are based on the distribution of replicas. */ +public abstract class ReplicaDistributionAbstractGoal extends AbstractGoal { + private static final Logger LOG = + LoggerFactory.getLogger(ReplicaDistributionAbstractGoal.class); + private static final double BALANCE_MARGIN = 0.9; + protected final Set serverIdsAboveRebalanceUpperLimit; + protected final Set serverIdsBelowRebalanceLowerLimit; + protected double avgReplicasOnAliveServer; + protected int rebalanceUpperLimit; + protected int rebalanceLowerLimit; + // This is used to identify servers not excluded for replica moves. + protected Set serversAllowedReplicaRemove; + + public ReplicaDistributionAbstractGoal() { + serverIdsAboveRebalanceUpperLimit = new HashSet<>(); + serverIdsBelowRebalanceLowerLimit = new HashSet<>(); + } + + private int rebalanceUpperLimit(double balancePercentage) { + return (int) + Math.ceil( + avgReplicasOnAliveServer + * (1 + adjustedRebalancePercentage(balancePercentage))); + } + + private int rebalanceLowerLimit(double balancePercentage) { + return (int) + Math.floor( + avgReplicasOnAliveServer + * Math.max( + 0, (1 - adjustedRebalancePercentage(balancePercentage)))); + } + + private double adjustedRebalancePercentage(double rebalancePercentage) { + return (rebalancePercentage - 1) * BALANCE_MARGIN; + } + + boolean isReplicaCountUnderBalanceUpperLimitAfterChange( + ServerModel server, int currentReplicaCount, ChangeType changeType) { + int serverBalanceUpperLimit = server.isAlive() ? rebalanceUpperLimit : 0; + + return changeType == ChangeType.ADD + ? currentReplicaCount + 1 <= serverBalanceUpperLimit + : currentReplicaCount - 1 <= serverBalanceUpperLimit; + } + + boolean isReplicaCountAboveBalanceLowerLimitAfterChange( + ServerModel server, int currentReplicaCount, ChangeType changeType) { + int serverBalanceLowerLimit = server.isAlive() ? rebalanceLowerLimit : 0; + + return changeType == ChangeType.ADD + ? 
currentReplicaCount + 1 >= serverBalanceLowerLimit + : currentReplicaCount - 1 >= serverBalanceLowerLimit; + } + + @Override + public boolean isHardGoal() { + return false; + } + + @Override + protected void initGoalState(ClusterModel clusterModel) throws RebalanceFailureException { + serversAllowedReplicaRemove = aliveServersNotExcludeForReplicaMove(clusterModel); + if (serversAllowedReplicaRemove.isEmpty()) { + throw new RebalanceFailureException( + String.format( + "[%s] All alive tabletServers are excluded from replica moves.", + name())); + } + + // Initialize the average replicas on an alive server. + avgReplicasOnAliveServer = + numInterestedReplicas(clusterModel) / (double) serversAllowedReplicaRemove.size(); + + rebalanceUpperLimit = rebalanceUpperLimit(balancePercentage()); + rebalanceLowerLimit = rebalanceLowerLimit(balancePercentage()); + } + + @Override + protected boolean selfSatisfied(ClusterModel clusterModel, ReBalancingAction action) { + // Check that destination and source would not become unbalanced. + return actionAcceptance(action, clusterModel) == ACCEPT; + } + + @Override + protected void updateGoalState(ClusterModel clusterModel) throws RebalanceFailureException { + if (!serverIdsAboveRebalanceUpperLimit.isEmpty()) { + LOG.debug( + "Replicas count on server ids:{} {} above the balance limit of {} after rebalance.", + serverIdsAboveRebalanceUpperLimit, + (serverIdsAboveRebalanceUpperLimit.size() > 1) ? "are" : "is", + rebalanceUpperLimit); + serverIdsAboveRebalanceUpperLimit.clear(); + succeeded = false; + } + + if (!serverIdsBelowRebalanceLowerLimit.isEmpty()) { + LOG.debug( + "Replicas count on server ids:{} {} below the balance limit of {} after rebalance.", + serverIdsBelowRebalanceLowerLimit, + (serverIdsBelowRebalanceLowerLimit.size() > 1) ? "are" : "is", + rebalanceLowerLimit); + serverIdsBelowRebalanceLowerLimit.clear(); + succeeded = false; + } + + // TODO maybe need check offline server. + + finish(); + } + + abstract int numInterestedReplicas(ClusterModel clusterModel); + + /** + * @return The requested balance threshold. + */ + abstract double balancePercentage(); + + protected boolean isExcludedForReplicaMove(ServerModel server) { + return !serversAllowedReplicaRemove.contains(server.id()); + } + + /** Whether bring replica in or out. */ + protected enum ChangeType { + ADD, + REMOVE + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/ReplicaDistributionGoal.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/ReplicaDistributionGoal.java new file mode 100644 index 0000000000..b70c5ca622 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/goal/ReplicaDistributionGoal.java @@ -0,0 +1,292 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.exception.RebalanceFailureException; +import org.apache.fluss.server.coordinator.rebalance.ActionAcceptance; +import org.apache.fluss.server.coordinator.rebalance.ActionType; +import org.apache.fluss.server.coordinator.rebalance.ReBalancingAction; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModelStats; +import org.apache.fluss.server.coordinator.rebalance.model.ReplicaModel; +import org.apache.fluss.server.coordinator.rebalance.model.ServerModel; +import org.apache.fluss.server.coordinator.rebalance.model.Statistic; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.PriorityQueue; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.stream.Collectors; + +import static org.apache.fluss.server.coordinator.rebalance.ActionAcceptance.ACCEPT; +import static org.apache.fluss.server.coordinator.rebalance.ActionAcceptance.REPLICA_REJECT; +import static org.apache.fluss.server.coordinator.rebalance.goal.GoalOptimizerUtils.EPSILON; +import static org.apache.fluss.server.coordinator.rebalance.goal.ReplicaDistributionAbstractGoal.ChangeType.ADD; +import static org.apache.fluss.server.coordinator.rebalance.goal.ReplicaDistributionAbstractGoal.ChangeType.REMOVE; +import static org.apache.fluss.utils.Preconditions.checkNotNull; + +/** + * Soft goal to generate replica movement proposals to ensure that the number of replicas on each + * server is. + * + *
      + *
    • Under: (the average number of replicas per server) * (1 + replica count balance percentage) + *
    • Above: (the average number of replicas per server) * Math.max(0, 1 - replica count balance + * percentage) + *
    + */ +public class ReplicaDistributionGoal extends ReplicaDistributionAbstractGoal { + + private static final Logger LOG = LoggerFactory.getLogger(ReplicaDistributionGoal.class); + + // TODO configurable. + /** + * The maximum allowed extent of unbalance for replica leader replica distribution. For example, + * 1.10 means the highest leader replica count of a server should not be 1.10x of average leader + * replica count of all alive tabletServers. + */ + private static final Double REPLICA_COUNT_REBALANCE_THRESHOLD = 1.10d; + + @Override + public ActionAcceptance actionAcceptance(ReBalancingAction action, ClusterModel clusterModel) { + switch (action.getActionType()) { + case LEADERSHIP_MOVEMENT: + return ACCEPT; + case REPLICA_MOVEMENT: + ServerModel sourceServer = clusterModel.server(action.getSourceServerId()); + ServerModel destServer = clusterModel.server(action.getDestinationServerId()); + + checkNotNull( + sourceServer, + "Source server " + action.getSourceServerId() + " is not found."); + checkNotNull( + destServer, + "Destination server " + action.getDestinationServerId() + " is not found."); + + // Check that destination and source would not become unbalanced. + return (isReplicaCountUnderBalanceUpperLimitAfterChange( + destServer, destServer.replicas().size(), ADD)) + && (isExcludedForReplicaMove(sourceServer) + || isReplicaCountAboveBalanceLowerLimitAfterChange( + sourceServer, + sourceServer.replicas().size(), + REMOVE)) + ? ACCEPT + : REPLICA_REJECT; + default: + throw new IllegalArgumentException( + "Unsupported balancing action " + action.getActionType() + " is provided."); + } + } + + @Override + protected void rebalanceForServer( + ServerModel server, ClusterModel clusterModel, Set optimizedGoals) + throws RebalanceFailureException { + LOG.debug( + "Rebalancing server {} [limits] lower: {} upper: {}.", + server.id(), + rebalanceLowerLimit, + rebalanceUpperLimit); + int numReplicas = server.replicas().size(); + boolean isExcludeForReplicaMove = isExcludedForReplicaMove(server); + + boolean requireLessReplicas = + numReplicas > rebalanceUpperLimit || isExcludeForReplicaMove || !server.isAlive(); + boolean requireMoreReplicas = + !isExcludeForReplicaMove && server.isAlive() && numReplicas < rebalanceLowerLimit; + if (!requireMoreReplicas && !requireLessReplicas) { + // return if the server is already within the limit. + return; + } + + if (requireLessReplicas + && rebalanceByMovingReplicasOut(server, clusterModel, optimizedGoals)) { + serverIdsAboveRebalanceUpperLimit.add(server.id()); + LOG.debug( + "Failed to sufficiently decrease replica count in server {} with replica movements. " + + "Replicas number after remove: {}.", + server.id(), + server.replicas().size()); + } + + if (requireMoreReplicas + && rebalanceByMovingReplicasIn(server, clusterModel, optimizedGoals)) { + serverIdsBelowRebalanceLowerLimit.add(server.id()); + LOG.debug( + "Failed to sufficiently increase replica count in server {} with replica movements. " + + "Replicas number after remove: {}.", + server.id(), + server.replicas().size()); + } + + if (!serverIdsAboveRebalanceUpperLimit.contains(server.id()) + && !serverIdsBelowRebalanceLowerLimit.contains(server.id())) { + LOG.debug( + "Successfully balanced replica count for server {} by moving replicas. 
" + + "Replicas number after remove: {}", + server.id(), + server.replicas().size()); + } + } + + @Override + public ClusterModelStatsComparator clusterModelStatsComparator() { + return new ReplicaDistributionGoalStatsComparator(); + } + + @Override + int numInterestedReplicas(ClusterModel clusterModel) { + return clusterModel.numReplicas(); + } + + @Override + double balancePercentage() { + return REPLICA_COUNT_REBALANCE_THRESHOLD; + } + + private boolean rebalanceByMovingReplicasOut( + ServerModel server, ClusterModel cluster, Set optimizedGoals) { + SortedSet candidateServers = + new TreeSet<>( + Comparator.comparingInt((ServerModel b) -> b.replicas().size()) + .thenComparingInt(ServerModel::id)); + + candidateServers.addAll( + cluster.aliveServers().stream() + .filter(b -> b.replicas().size() < rebalanceUpperLimit) + .collect(Collectors.toSet())); + int balanceUpperLimitForSourceServer = + isExcludedForReplicaMove(server) ? 0 : rebalanceUpperLimit; + + // Now let's do the replica out operation. + // TODO maybe use a sorted replicas set + for (ReplicaModel replica : server.replicas()) { + ServerModel b = + maybeApplyBalancingAction( + cluster, + replica, + candidateServers, + ActionType.REPLICA_MOVEMENT, + optimizedGoals); + // Only check if we successfully moved something. + if (b != null) { + if (server.replicas().size() <= balanceUpperLimitForSourceServer) { + return false; + } + + // Remove and reinsert the server so the order is correct. + candidateServers.remove(b); + if (b.replicas().size() < rebalanceUpperLimit) { + candidateServers.add(b); + } + } + } + + return !server.replicas().isEmpty(); + } + + private boolean rebalanceByMovingReplicasIn( + ServerModel aliveDestServer, ClusterModel cluster, Set optimizedGoals) { + PriorityQueue eligibleServers = + new PriorityQueue<>( + (b1, b2) -> { + // Servers are sorted by (1) all replica count then (2) server id. + int resultByAllReplicas = + Integer.compare(b2.replicas().size(), b1.replicas().size()); + return resultByAllReplicas == 0 + ? Integer.compare(b1.id(), b2.id()) + : resultByAllReplicas; + }); + + // Source server can be offline, alive. + for (ServerModel sourceServer : cluster.servers()) { + if (sourceServer.replicas().size() > rebalanceLowerLimit + || isExcludedForReplicaMove(sourceServer)) { + eligibleServers.add(sourceServer); + } + } + + List candidateServers = Collections.singletonList(aliveDestServer); + while (!eligibleServers.isEmpty()) { + ServerModel sourceServer = eligibleServers.poll(); + // TODO maybe use a sorted replicas set + for (ReplicaModel replica : sourceServer.replicas()) { + ServerModel b = + maybeApplyBalancingAction( + cluster, + replica, + candidateServers, + ActionType.REPLICA_MOVEMENT, + optimizedGoals); + // Only need to check status if the action is taken. This will also handle the case + // that the source server has nothing to move in. In that case we will never + // re-enqueue that source server. + if (b != null) { + if (aliveDestServer.replicas().size() >= rebalanceLowerLimit) { + // Note that the server passed to this method is always alive; hence, there + // is no need to check if it is dead. 
+ return false; + } + + if (!eligibleServers.isEmpty()) { + if (sourceServer.replicas().size() + < eligibleServers.peek().replicas().size()) { + eligibleServers.add(sourceServer); + break; + } + } + } + } + } + return true; + } + + private class ReplicaDistributionGoalStatsComparator implements ClusterModelStatsComparator { + private String reasonForLastNegativeResult; + + @Override + public int compare(ClusterModelStats stats1, ClusterModelStats stats2) { + // Standard deviation of number of replicas over servers not excluded for replica moves + // must be less than the + // pre-optimized stats. + double stDev1 = stats1.replicaStats().get(Statistic.ST_DEV).doubleValue(); + double stDev2 = stats2.replicaStats().get(Statistic.ST_DEV).doubleValue(); + int result = GoalOptimizerUtils.compare(stDev2, stDev1, EPSILON); + if (result < 0) { + reasonForLastNegativeResult = + String.format( + "Violated %s. [Std Deviation of Replica Distribution] post-" + + "optimization:%.3f pre-optimization:%.3f", + name(), stDev1, stDev2); + } + return result; + } + + @Override + public String explainLastComparison() { + return reasonForLastNegativeResult; + } + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/BucketModel.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/BucketModel.java new file mode 100644 index 0000000000..9aff7b0b80 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/BucketModel.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.model; + +import org.apache.fluss.metadata.TableBucket; + +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** A class that holds the information of the {@link TableBucket} for rebalance. */ +public class BucketModel { + private final TableBucket tableBucket; + private final List replicas; + private @Nullable ReplicaModel leader; + // Set of server which are unable to host replica of this replica (such as: the server are + // offline). 
+ private final Set ineligibleServers; + + public BucketModel(TableBucket tableBucket, Set ineligibleServers) { + this.tableBucket = tableBucket; + this.replicas = new ArrayList<>(); + this.leader = null; + this.ineligibleServers = ineligibleServers; + } + + public TableBucket tableBucket() { + return tableBucket; + } + + public @Nullable ReplicaModel leader() { + return leader; + } + + public List replicas() { + return replicas; + } + + public Set bucketServers() { + Set bucketServers = new HashSet<>(); + replicas.forEach(replica -> bucketServers.add(replica.server())); + return bucketServers; + } + + public boolean canAssignReplicaToServer(ServerModel candidateServer) { + return !ineligibleServers.contains(candidateServer); + } + + public ReplicaModel replica(long serverId) { + for (ReplicaModel replica : replicas) { + if (replica.server().id() == serverId) { + return replica; + } + } + + throw new IllegalArgumentException( + "Requested replica " + serverId + " is not a replica of bucket " + tableBucket); + } + + public void addLeader(ReplicaModel leader, int index) { + if (this.leader != null) { + throw new IllegalArgumentException( + String.format( + "Bucket %s already has a leader replica %s. Cannot add a new leader replica %s.", + tableBucket, this.leader, leader)); + } + + if (!leader.isLeader()) { + throw new IllegalArgumentException( + String.format( + "Inconsistent leadership information. Trying to set %s as the leader for bucket %s while " + + "the replica is not marked as a leader", + leader, tableBucket)); + } + + this.leader = leader; + replicas.add(index, leader); + } + + public void addFollower(ReplicaModel follower, int index) { + if (follower.isLeader()) { + throw new IllegalArgumentException( + String.format( + "Inconsistent leadership information. Trying to set %s as the follower for bucket %s while " + + "the replica is marked as a leader", + follower, tableBucket)); + } + + if (!follower.tableBucket().equals(this.tableBucket)) { + throw new IllegalArgumentException( + String.format( + "Inconsistent table bucket. Trying to add follower replica %s to tableBucket %s", + follower, tableBucket)); + } + + // Add follower to list of followers + replicas.add(index, follower); + } + + void relocateLeadership(ReplicaModel prospectiveLeader) { + int leaderPos = replicas.indexOf(prospectiveLeader); + swapReplicaPositions(0, leaderPos); + leader = prospectiveLeader; + } + + private void swapReplicaPositions(int index1, int index2) { + ReplicaModel replica1 = replicas.get(index1); + ReplicaModel replica2 = replicas.get(index2); + + replicas.set(index2, replica1); + replicas.set(index1, replica2); + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ClusterModel.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ClusterModel.java new file mode 100644 index 0000000000..884beafbed --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ClusterModel.java @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.server.coordinator.rebalance.model;
+
+import org.apache.fluss.metadata.TableBucket;
+
+import javax.annotation.Nullable;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+
+/**
+ * A class that holds the information of the cluster for rebalance. The information includes live
+ * tabletServers, bucket distribution, tabletServer tags, etc.
+ *
+ *

    Currently, the clusterModel can only be created by a rebalance request. It's used as the input + * of the GoalOptimizer to generate the rebalance plan for load rebalance. + */ +public class ClusterModel { + // TODO ClusterModel can be implemented in incremental mode, dynamically modified when there are + // events such as table create, table delete, server offline, etc. Currently designed to read + // coordinatorContext and generate it directly + + private final Map racksById; + private final Map serverIdToRack; + private final Set aliveServers; + private final SortedSet offlineServers; + private final SortedSet servers; + private final Map bucketsByTableBucket; + + public ClusterModel(SortedSet servers) { + this.servers = servers; + this.bucketsByTableBucket = new HashMap<>(); + + this.aliveServers = new HashSet<>(); + this.offlineServers = new TreeSet<>(); + for (ServerModel serverModel : servers) { + if (serverModel.isAlive()) { + aliveServers.add(serverModel); + } else { + offlineServers.add(serverModel); + } + } + + this.racksById = new HashMap<>(); + this.serverIdToRack = new HashMap<>(); + for (ServerModel serverModel : servers) { + RackModel rackModel = racksById.computeIfAbsent(serverModel.rack(), RackModel::new); + rackModel.addServer(serverModel); + serverIdToRack.put(serverModel.id(), rackModel); + } + } + + public SortedSet offlineServers() { + return offlineServers; + } + + public SortedSet servers() { + return servers; + } + + public Set aliveServers() { + return Collections.unmodifiableSet(aliveServers); + } + + public @Nullable BucketModel bucket(TableBucket tableBucket) { + return bucketsByTableBucket.get(tableBucket); + } + + public RackModel rack(String rack) { + return racksById.get(rack); + } + + public @Nullable ServerModel server(int serverId) { + RackModel rack = serverIdToRack.get(serverId); + return rack == null ? null : rack.server(serverId); + } + + /** Populate the analysis stats with this cluster. */ + public ClusterModelStats getClusterStats() { + return (new ClusterModelStats()).populate(this); + } + + public int numReplicas() { + return bucketsByTableBucket.values().stream().mapToInt(p -> p.replicas().size()).sum(); + } + + public int numLeaderReplicas() { + return bucketsByTableBucket.size(); + } + + public SortedMap> getBucketsByTable() { + SortedMap> bucketsByTable = new TreeMap<>(); + for (Long tableId : tables()) { + bucketsByTable.put(tableId, new ArrayList<>()); + } + for (Map.Entry entry : bucketsByTableBucket.entrySet()) { + bucketsByTable.get(entry.getKey().getTableId()).add(entry.getValue()); + } + return bucketsByTable; + } + + public Set tables() { + Set tables = new HashSet<>(); + + for (RackModel rack : racksById.values()) { + tables.addAll(rack.tables()); + } + return tables; + } + + /** + * Get the distribution of replicas in the cluster at the point of call. + * + * @return A map from tableBucket to the list of replicas. the first element is the leader, the + * rest are followers. 
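+     *     For example, a bucket hosted on servers 1, 2 and 3 with leader 1 maps to [1, 2, 3].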
+ */ + public Map> getReplicaDistribution() { + Map> replicaDistribution = new HashMap<>(); + for (Map.Entry entry : bucketsByTableBucket.entrySet()) { + TableBucket tableBucket = entry.getKey(); + BucketModel bucket = entry.getValue(); + List replicaIds = + bucket.replicas().stream() + .map(r -> r.server().id()) + .collect(Collectors.toList()); + replicaDistribution.put(tableBucket, replicaIds); + } + return replicaDistribution; + } + + public Map getLeaderDistribution() { + Map leaderDistribution = new HashMap<>(); + for (Map.Entry entry : bucketsByTableBucket.entrySet()) { + TableBucket tableBucket = entry.getKey(); + BucketModel bucket = entry.getValue(); + + ReplicaModel replicaModel = bucket.leader(); + if (replicaModel == null) { + continue; + } + + leaderDistribution.put(tableBucket, replicaModel.server().id()); + } + return leaderDistribution; + } + + public void createReplica(int serverId, TableBucket tableBucket, int index, boolean isLeader) { + ServerModel server = server(serverId); + if (server == null) { + throw new IllegalArgumentException("Server is not in the cluster."); + } + + ReplicaModel replica = new ReplicaModel(tableBucket, server, isLeader); + server.putReplica(tableBucket, replica); + + if (!bucketsByTableBucket.containsKey(tableBucket)) { + bucketsByTableBucket.put(tableBucket, new BucketModel(tableBucket, offlineServers())); + } + + BucketModel bucket = bucketsByTableBucket.get(tableBucket); + if (isLeader) { + bucket.addLeader(replica, index); + } else { + bucket.addFollower(replica, index); + } + } + + /** + * Relocate leadership from source server to destination server. + * + *

      + *
+     * <li>1. Removes leadership from the source replica.
+     * <li>2. Adds this leadership to the destination replica.
+     * <li>3. Updates the leader and the list of followers of the bucket.
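+     *
+     * <p>Returns false and leaves the cluster model unchanged if the source replica is not the
+     * current leader of the given bucket.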
    + */ + public boolean relocateLeadership( + TableBucket tableBucket, int sourceServerId, int desServerId) { + // Sanity check to see if the source replica is the leader. + BucketModel bucket = bucketsByTableBucket.get(tableBucket); + ReplicaModel sourceReplica = bucket.replica(sourceServerId); + if (!sourceReplica.isLeader()) { + return false; + } + + // Sanity check to see if the destination replica is a follower. + ReplicaModel desReplica = bucket.replica(desServerId); + if (desReplica.isLeader()) { + throw new IllegalArgumentException( + "Cannot relocate leadership of bucket " + + tableBucket + + " from server " + + sourceServerId + + " to server " + + desServerId + + " because the destination replica is a leader."); + } + + ServerModel sourceServer = server(sourceServerId); + if (sourceServer == null) { + throw new IllegalArgumentException("Source server is not in the cluster."); + } + sourceServer.makeFollower(tableBucket); + + ServerModel destServer = server(desServerId); + if (destServer == null) { + throw new IllegalArgumentException("Destination server is not in the cluster."); + } + destServer.makeLeader(tableBucket); + + // Update the leader and list of followers of the bucket. + bucket.relocateLeadership(desReplica); + return true; + } + + /** + * Relocate replica from source server to destination server. + * + *
      + *
+     * <li>1. Removes the replica from the source server.
+     * <li>2. Sets the server of the removed replica to the destination server.
+     * <li>3. Adds this replica to the destination server.
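+     *
+     * <p>The relocated replica still remembers the server it was originally created on, so the
+     * move can be traced back after relocation.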
    + */ + public void relocateReplica(TableBucket tableBucket, int sourceServerId, int destServerId) { + // Removes the replica from the source server. + ReplicaModel replica = removeReplica(sourceServerId, tableBucket); + if (replica == null) { + throw new IllegalArgumentException("Replica is not in the cluster."); + } + + // Updates the tabletServer of the removed replicas with dest server. + replica.setServer(server(destServerId)); + + // Add this replica back to destination rack and server. + String rack = replica.server().rack(); + rack(rack).addReplica(replica); + } + + private @Nullable ReplicaModel removeReplica(int serverId, TableBucket tableBucket) { + for (RackModel rack : racksById.values()) { + ReplicaModel removedReplica = rack.removeReplica(serverId, tableBucket); + if (removedReplica != null) { + return removedReplica; + } + } + return null; + } + + @Override + public String toString() { + return String.format( + "ClusterModel[serverCount=%s,bucketCount=%s,aliveServerCount=%s]", + servers.size(), bucketsByTableBucket.size(), aliveServers.size()); + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ClusterModelStats.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ClusterModelStats.java new file mode 100644 index 0000000000..16bd9f29b0 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ClusterModelStats.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.model; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.function.Function; + +/** A class that holds the statistics of the cluster for rebalance. */ +public class ClusterModelStats { + private final Map replicaStats; + private final Map leaderReplicaStats; + private int numServers; + private int numReplicasInCluster; + + public ClusterModelStats() { + replicaStats = new HashMap<>(); + leaderReplicaStats = new HashMap<>(); + + numServers = 0; + numReplicasInCluster = 0; + } + + ClusterModelStats populate(ClusterModel clusterModel) { + final SortedSet servers = clusterModel.servers(); + final Set aliveServers = clusterModel.aliveServers(); + this.numServers = servers.size(); + numForReplicas(clusterModel, servers, aliveServers); + numForLeaderReplicas(servers, aliveServers); + return this; + } + + /** Generate statistics for replicas in the given cluster. 
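+     * Fills replicaStats with the AVG, MAX, MIN and ST_DEV of the number of replicas per server.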
*/ + private void numForReplicas( + ClusterModel clusterModel, + SortedSet servers, + Set aliveServers) { + populateReplicaStats( + serverModel -> serverModel.replicas().size(), replicaStats, servers, aliveServers); + numReplicasInCluster = clusterModel.numReplicas(); + } + + /** Generate statistics for leader replicas in the given cluster. */ + private void numForLeaderReplicas( + SortedSet servers, Set aliveServers) { + populateReplicaStats( + serverModel -> serverModel.leaderReplicas().size(), + leaderReplicaStats, + servers, + aliveServers); + } + + private void populateReplicaStats( + Function numInterestedReplicasFunc, + Map interestedReplicaStats, + SortedSet servers, + Set aliveServers) { + // Average, minimum, and maximum number of replicas of interest in servers. + int maxInterestedReplicasInServer = 0; + int minInterestedReplicasInServer = Integer.MAX_VALUE; + int numInterestedReplicasInCluster = 0; + for (ServerModel server : servers) { + int numInterestedReplicasInServer = numInterestedReplicasFunc.apply(server); + numInterestedReplicasInCluster += numInterestedReplicasInServer; + maxInterestedReplicasInServer = + Math.max(maxInterestedReplicasInServer, numInterestedReplicasInServer); + minInterestedReplicasInServer = + Math.min(minInterestedReplicasInServer, numInterestedReplicasInServer); + } + double avgInterestedReplicas = + ((double) numInterestedReplicasInCluster) / aliveServers.size(); + + // Standard deviation of replicas of interest in alive servers. + double variance = 0.0; + for (ServerModel broker : aliveServers) { + variance += + (Math.pow( + (double) numInterestedReplicasFunc.apply(broker) + - avgInterestedReplicas, + 2) + / aliveServers.size()); + } + + interestedReplicaStats.put(Statistic.AVG, avgInterestedReplicas); + interestedReplicaStats.put(Statistic.MAX, maxInterestedReplicasInServer); + interestedReplicaStats.put(Statistic.MIN, minInterestedReplicasInServer); + interestedReplicaStats.put(Statistic.ST_DEV, Math.sqrt(variance)); + } + + public Map replicaStats() { + return Collections.unmodifiableMap(replicaStats); + } + + public Map leaderReplicaStats() { + return Collections.unmodifiableMap(leaderReplicaStats); + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/RackModel.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/RackModel.java new file mode 100644 index 0000000000..fdf9cbad80 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/RackModel.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.fluss.server.coordinator.rebalance.model; + +import org.apache.fluss.metadata.TableBucket; + +import javax.annotation.Nullable; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * A class that holds the information of the rack, including its liveness tabletServers and + * replicas. A rack object is created as part of a cluster structure. + */ +public class RackModel { + public static final String DEFAULT_RACK = "default_rack"; + + private final String rack; + private final Map servers; + + public RackModel(String rack) { + this.rack = rack; + this.servers = new HashMap<>(); + } + + @Nullable + ReplicaModel removeReplica(int serverId, TableBucket tableBucket) { + ServerModel server = servers.get(serverId); + if (server != null) { + return server.removeReplica(tableBucket); + } + + return null; + } + + void addReplica(ReplicaModel replica) { + replica.server().putReplica(replica.tableBucket(), replica); + } + + public String rack() { + return rack; + } + + @Nullable + ServerModel server(int serverId) { + return servers.get(serverId); + } + + public void addServer(ServerModel server) { + servers.put(server.id(), server); + } + + public Set tables() { + Set tables = new HashSet<>(); + + for (ServerModel server : servers.values()) { + tables.addAll(server.tables()); + } + return tables; + } + + @Override + public String toString() { + return String.format("RackModel[rack=%s,servers=%s]", rack, servers.size()); + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ReplicaModel.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ReplicaModel.java new file mode 100644 index 0000000000..e67d9bd733 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ReplicaModel.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.model; + +import org.apache.fluss.metadata.TableBucket; +import org.apache.fluss.server.replica.Replica; + +import java.util.Objects; + +/** A class that holds the information of the {@link Replica} for rebalance. 
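+ * A replica keeps the server it was originally created on: after a relocation, server() returns
+ * the new server while originalServer() still returns the initial one, and equality is based on
+ * the table bucket plus the original server.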
*/ +public class ReplicaModel { + private final TableBucket tableBucket; + private final ServerModel originalServer; + private ServerModel server; + private boolean isLeader; + + public ReplicaModel(TableBucket tableBucket, ServerModel server, boolean isLeader) { + this.tableBucket = tableBucket; + this.server = server; + this.isLeader = isLeader; + this.originalServer = server; + } + + public TableBucket tableBucket() { + return tableBucket; + } + + public ServerModel originalServer() { + return originalServer; + } + + public ServerModel server() { + return server; + } + + public int serverId() { + return server.id(); + } + + public boolean isLeader() { + return isLeader; + } + + public void makeFollower() { + setLeadership(false); + } + + public void makeLeader() { + setLeadership(true); + } + + void setLeadership(boolean leader) { + isLeader = leader; + } + + public void setServer(ServerModel server) { + this.server = server; + } + + @Override + public String toString() { + return String.format( + "ReplicaModel[TableBucket=%s,isLeader=%s,rack=%s,server=%s,originalServer=%s]", + tableBucket, isLeader, server.rack(), server.id(), originalServer.id()); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + ReplicaModel that = (ReplicaModel) o; + return Objects.equals(tableBucket, that.tableBucket) + && originalServer.id() == that.originalServer.id(); + } + + @Override + public int hashCode() { + return Objects.hash(tableBucket, originalServer.id()); + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ServerModel.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ServerModel.java new file mode 100644 index 0000000000..a57bc85b30 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/ServerModel.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.model; + +import org.apache.fluss.metadata.TableBucket; +import org.apache.fluss.metadata.TablePartition; + +import javax.annotation.Nullable; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** A class that holds the information of the tabletServer for rebalance. */ +public class ServerModel implements Comparable { + + private final int serverId; + private final boolean isAlive; + private final String rack; + private final Set replicas; + private final Set leaderReplicas; + /** A map for tracking (tableId) -> (BucketId -> replica) for none-partitioned table. 
*/ + private final Map> tableReplicas; + + /** A map for tracking (tableId, partitionId) -> (BucketId -> replica) for partitioned table. */ + private final Map> tablePartitionReplicas; + + public ServerModel(int serverId, String rack, boolean isAlive) { + this.serverId = serverId; + this.rack = rack; + this.isAlive = isAlive; + this.replicas = new HashSet<>(); + this.leaderReplicas = new HashSet<>(); + this.tableReplicas = new HashMap<>(); + this.tablePartitionReplicas = new HashMap<>(); + } + + public int id() { + return serverId; + } + + public String rack() { + return rack; + } + + public boolean isAlive() { + return isAlive; + } + + public Set replicas() { + return new HashSet<>(replicas); + } + + public Set leaderReplicas() { + return new HashSet<>(leaderReplicas); + } + + public Set tables() { + Set tables = new HashSet<>(tableReplicas.keySet()); + tablePartitionReplicas.keySet().forEach(t -> tables.add(t.getTableId())); + return tables; + } + + public void makeFollower(TableBucket tableBucket) { + ReplicaModel replica = replica(tableBucket); + if (replica != null) { + replica.makeFollower(); + leaderReplicas.remove(replica); + } + } + + public void makeLeader(TableBucket tableBucket) { + ReplicaModel replica = replica(tableBucket); + if (replica != null) { + replica.makeLeader(); + leaderReplicas.add(replica); + } + } + + public void putReplica(TableBucket tableBucket, ReplicaModel replica) { + replicas.add(replica); + replica.setServer(this); + if (tableBucket.getPartitionId() != null) { + TablePartition tablePartition = + new TablePartition(tableBucket.getTableId(), tableBucket.getPartitionId()); + tablePartitionReplicas + .computeIfAbsent(tablePartition, k -> new HashMap<>()) + .put(tableBucket.getBucket(), replica); + } else { + tableReplicas + .computeIfAbsent(tableBucket.getTableId(), k -> new HashMap<>()) + .put(tableBucket.getBucket(), replica); + } + + if (replica.isLeader()) { + leaderReplicas.add(replica); + } + } + + public @Nullable ReplicaModel replica(TableBucket tableBucket) { + if (tableBucket.getPartitionId() == null) { + Map replicas = tableReplicas.get(tableBucket.getTableId()); + if (replicas == null) { + return null; + } + + return replicas.get(tableBucket.getBucket()); + } else { + TablePartition tablePartition = + new TablePartition(tableBucket.getTableId(), tableBucket.getPartitionId()); + Map replicas = tablePartitionReplicas.get(tablePartition); + if (replicas == null) { + return null; + } + return replicas.get(tableBucket.getBucket()); + } + } + + public @Nullable ReplicaModel removeReplica(TableBucket tableBucket) { + ReplicaModel removedReplica = replica(tableBucket); + if (removedReplica != null) { + replicas.remove(removedReplica); + + if (tableBucket.getPartitionId() != null) { + TablePartition tablePartition = + new TablePartition(tableBucket.getTableId(), tableBucket.getPartitionId()); + Map tablePartitionReplicas = + this.tablePartitionReplicas.get(tablePartition); + if (tablePartitionReplicas != null) { + tablePartitionReplicas.remove(tableBucket.getBucket()); + + if (tablePartitionReplicas.isEmpty()) { + this.tablePartitionReplicas.remove(tablePartition); + } + } + } else { + Map tableReplicas = + this.tableReplicas.get(tableBucket.getTableId()); + if (tableReplicas != null) { + tableReplicas.remove(tableBucket.getBucket()); + + if (tableReplicas.isEmpty()) { + this.tableReplicas.remove(tableBucket.getTableId()); + } + } + } + + if (removedReplica.isLeader()) { + leaderReplicas.remove(removedReplica); + } + } + + return removedReplica; + } + + 
@Override + public int compareTo(ServerModel o) { + return Integer.compare(serverId, o.id()); + } + + @Override + public String toString() { + return String.format( + "ServerModel[id=%s,rack=%s,isAlive=%s,replicaCount=%s]", + serverId, rack, isAlive, replicas.size()); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + ServerModel that = (ServerModel) o; + return serverId == that.serverId; + } + + @Override + public int hashCode() { + return serverId; + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionStrategy.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/Statistic.java similarity index 79% rename from fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionStrategy.java rename to fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/Statistic.java index faff47a42d..bf12b8b281 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionStrategy.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/rebalance/model/Statistic.java @@ -15,10 +15,12 @@ * limitations under the License. */ -package org.apache.fluss.server.coordinator.statemachine; +package org.apache.fluss.server.coordinator.rebalance.model; -/** The strategies to elect the replica leader. */ -public enum ReplicaLeaderElectionStrategy { - DEFAULT_ELECTION, - CONTROLLED_SHUTDOWN_ELECTION +/** An enum for the statistic. */ +public enum Statistic { + AVG, + MAX, + MIN, + ST_DEV } diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElection.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElection.java new file mode 100644 index 0000000000..492f9ae0b2 --- /dev/null +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElection.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.statemachine; + +import org.apache.fluss.server.coordinator.statemachine.TableBucketStateMachine.ElectionResult; +import org.apache.fluss.server.zk.data.LeaderAndIsr; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +/** The strategies to elect the replica leader. */ +public abstract class ReplicaLeaderElection { + + /** The default replica leader election. 
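+     * Picks the first replica in the assignment that is both alive and still in the ISR, e.g.
+     * when a new leader has to be elected because the current leader went offline.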
*/ + public static class DefaultLeaderElection extends ReplicaLeaderElection { + /** + * Default replica leader election, like electing leader while leader offline. + * + * @param assignments the assignments + * @param aliveReplicas the alive replicas + * @param leaderAndIsr the original leaderAndIsr + * @return the election result + */ + public Optional leaderElection( + List assignments, List aliveReplicas, LeaderAndIsr leaderAndIsr) { + // currently, we always use the first replica in assignment, which also in aliveReplicas + // and + // isr as the leader replica. + List isr = leaderAndIsr.isr(); + for (int assignment : assignments) { + if (aliveReplicas.contains(assignment) && isr.contains(assignment)) { + return Optional.of( + new TableBucketStateMachine.ElectionResult( + aliveReplicas, leaderAndIsr.newLeaderAndIsr(assignment, isr))); + } + } + + return Optional.empty(); + } + } + + /** The controlled shutdown replica leader election. */ + public static class ControlledShutdownLeaderElection extends ReplicaLeaderElection { + /** + * Controlled shutdown replica leader election. + * + * @param assignments the assignments + * @param aliveReplicas the alive replicas + * @param leaderAndIsr the original leaderAndIsr + * @param shutdownTabletServers the shutdown tabletServers + * @return the election result + */ + public Optional leaderElection( + List assignments, + List aliveReplicas, + LeaderAndIsr leaderAndIsr, + Set shutdownTabletServers) { + List originIsr = leaderAndIsr.isr(); + Set isrSet = new HashSet<>(originIsr); + for (Integer id : assignments) { + if (aliveReplicas.contains(id) + && isrSet.contains(id) + && !shutdownTabletServers.contains(id)) { + Set newAliveReplicas = new HashSet<>(aliveReplicas); + newAliveReplicas.removeAll(shutdownTabletServers); + List newIsr = + originIsr.stream() + .filter(replica -> !shutdownTabletServers.contains(replica)) + .collect(Collectors.toList()); + return Optional.of( + new ElectionResult( + new ArrayList<>(newAliveReplicas), + leaderAndIsr.newLeaderAndIsr(id, newIsr))); + } + } + return Optional.empty(); + } + } + + /** The reassignment replica leader election. */ + public static class ReassignmentLeaderElection extends ReplicaLeaderElection { + private final List newReplicas; + + public ReassignmentLeaderElection(List newReplicas) { + this.newReplicas = newReplicas; + } + + public Optional leaderElection( + List liveReplicas, LeaderAndIsr leaderAndIsr) { + // currently, we always use the first replica in targetReplicas, which also in + // liveReplicas and isr as the leader replica. For bucket reassignment, the first + // replica is the target leader replica. + List isr = leaderAndIsr.isr(); + for (int assignment : newReplicas) { + if (liveReplicas.contains(assignment) && isr.contains(assignment)) { + return Optional.of( + new ElectionResult( + liveReplicas, leaderAndIsr.newLeaderAndIsr(assignment, isr))); + } + } + + return Optional.empty(); + } + } +} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionAlgorithms.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionAlgorithms.java deleted file mode 100644 index c7c1aa07a4..0000000000 --- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionAlgorithms.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.fluss.server.coordinator.statemachine; - -import org.apache.fluss.server.coordinator.statemachine.TableBucketStateMachine.ElectionResult; -import org.apache.fluss.server.zk.data.LeaderAndIsr; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; - -/** The algorithms to elect the replica leader. */ -public class ReplicaLeaderElectionAlgorithms { - - /** - * Init replica leader election when the bucket is new created. - * - * @param assignments the assignments - * @param aliveReplicas the alive replicas - * @param coordinatorEpoch the coordinator epoch - * @return the election result - */ - public static Optional initReplicaLeaderElection( - List assignments, List aliveReplicas, int coordinatorEpoch) { - // currently, we always use the first replica in assignment, which also in aliveReplicas and - // isr as the leader replica. - for (int assignment : assignments) { - if (aliveReplicas.contains(assignment)) { - return Optional.of( - new ElectionResult( - aliveReplicas, - new LeaderAndIsr( - assignment, 0, aliveReplicas, coordinatorEpoch, 0))); - } - } - - return Optional.empty(); - } - - /** - * Default replica leader election, like electing leader while leader offline. - * - * @param assignments the assignments - * @param aliveReplicas the alive replicas - * @param leaderAndIsr the original leaderAndIsr - * @return the election result - */ - public static Optional defaultReplicaLeaderElection( - List assignments, List aliveReplicas, LeaderAndIsr leaderAndIsr) { - // currently, we always use the first replica in assignment, which also in aliveReplicas and - // isr as the leader replica. - List isr = leaderAndIsr.isr(); - for (int assignment : assignments) { - if (aliveReplicas.contains(assignment) && isr.contains(assignment)) { - return Optional.of( - new ElectionResult( - aliveReplicas, leaderAndIsr.newLeaderAndIsr(assignment, isr))); - } - } - - return Optional.empty(); - } - - /** - * Controlled shutdown replica leader election. 
- * - * @param assignments the assignments - * @param aliveReplicas the alive replicas - * @param leaderAndIsr the original leaderAndIsr - * @param shutdownTabletServers the shutdown tabletServers - * @return the election result - */ - public static Optional controlledShutdownReplicaLeaderElection( - List assignments, - List aliveReplicas, - LeaderAndIsr leaderAndIsr, - Set shutdownTabletServers) { - List originIsr = leaderAndIsr.isr(); - Set isrSet = new HashSet<>(originIsr); - for (Integer id : assignments) { - if (aliveReplicas.contains(id) - && isrSet.contains(id) - && !shutdownTabletServers.contains(id)) { - Set newAliveReplicas = new HashSet<>(aliveReplicas); - newAliveReplicas.removeAll(shutdownTabletServers); - List newIsr = - originIsr.stream() - .filter(replica -> !shutdownTabletServers.contains(replica)) - .collect(Collectors.toList()); - return Optional.of( - new ElectionResult( - new ArrayList<>(newAliveReplicas), - leaderAndIsr.newLeaderAndIsr(id, newIsr))); - } - } - return Optional.empty(); - } -} diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachine.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachine.java index 7494ab203e..fef67b083e 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachine.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachine.java @@ -101,7 +101,7 @@ public void shutdown() { private Tuple2, Set> initializeReplicaState() { Set onlineReplicas = new HashSet<>(); Set offlineReplicas = new HashSet<>(); - Set allBuckets = coordinatorContext.allBuckets(); + Set allBuckets = coordinatorContext.getAllBuckets(); for (TableBucket tableBucket : allBuckets) { List replicas = coordinatorContext.getAssignment(tableBucket); for (Integer replica : replicas) { diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachine.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachine.java index 224ba5db87..85dcc434f4 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachine.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachine.java @@ -17,10 +17,14 @@ package org.apache.fluss.server.coordinator.statemachine; +import org.apache.fluss.annotation.VisibleForTesting; import org.apache.fluss.metadata.PhysicalTablePath; import org.apache.fluss.metadata.TableBucket; import org.apache.fluss.server.coordinator.CoordinatorContext; import org.apache.fluss.server.coordinator.CoordinatorRequestBatch; +import org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElection.ControlledShutdownLeaderElection; +import org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElection.DefaultLeaderElection; +import org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElection.ReassignmentLeaderElection; import org.apache.fluss.server.entity.BatchRegisterLeadAndIsr; import org.apache.fluss.server.entity.RegisterTableBucketLeadAndIsrInfo; import org.apache.fluss.server.zk.ZooKeeperClient; @@ -40,12 +44,6 @@ import java.util.Set; import java.util.stream.Collectors; -import static org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElectionAlgorithms.controlledShutdownReplicaLeaderElection; -import static 
org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElectionAlgorithms.defaultReplicaLeaderElection; -import static org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElectionAlgorithms.initReplicaLeaderElection; -import static org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElectionStrategy.CONTROLLED_SHUTDOWN_ELECTION; -import static org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElectionStrategy.DEFAULT_ELECTION; - /* This file is based on source code of Apache Kafka Project (https://kafka.apache.org/), licensed by the Apache * Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for * additional information regarding copyright ownership. */ @@ -83,7 +81,7 @@ public void startup() { * table buckets in zookeeper. */ private void initializeBucketState() { - Set tableBuckets = coordinatorContext.allBuckets(); + Set tableBuckets = coordinatorContext.getAllBuckets(); for (TableBucket tableBucket : tableBuckets) { BucketState bucketState = coordinatorContext @@ -121,13 +119,13 @@ public void shutdown() { } public void handleStateChange(Set tableBuckets, BucketState targetState) { - handleStateChange(tableBuckets, targetState, DEFAULT_ELECTION); + handleStateChange(tableBuckets, targetState, new DefaultLeaderElection()); } public void handleStateChange( Set tableBuckets, BucketState targetState, - ReplicaLeaderElectionStrategy replicaLeaderElectionStrategy) { + ReplicaLeaderElection replicaLeaderElection) { try { coordinatorRequestBatch.newBatch(); @@ -136,7 +134,7 @@ public void handleStateChange( batchHandleOnlineChangeAndInitLeader(tableBuckets); } else { for (TableBucket tableBucket : tableBuckets) { - doHandleStateChange(tableBucket, targetState, replicaLeaderElectionStrategy); + doHandleStateChange(tableBucket, targetState, replicaLeaderElection); } } coordinatorRequestBatch.sendRequestToTabletServers( @@ -188,12 +186,12 @@ public void handleStateChange( * * @param tableBucket The table bucket that is to do state change * @param targetState the target state that is to change to - * @param replicaLeaderElectionStrategy the strategy to choose a new leader + * @param replicaLeaderElection the strategy to choose a new leader */ private void doHandleStateChange( TableBucket tableBucket, BucketState targetState, - ReplicaLeaderElectionStrategy replicaLeaderElectionStrategy) { + ReplicaLeaderElection replicaLeaderElection) { coordinatorContext.putBucketStateIfNotExists(tableBucket, BucketState.NonExistentBucket); if (!checkValidTableBucketStateChange(tableBucket, targetState)) { return; @@ -245,8 +243,7 @@ private void doHandleStateChange( // current state is Online or Offline // not new bucket, we then need to update leader/epoch for the bucket Optional optionalElectionResult = - electNewLeaderForTableBuckets( - tableBucket, replicaLeaderElectionStrategy); + electNewLeaderForTableBuckets(tableBucket, replicaLeaderElection); if (!optionalElectionResult.isPresent()) { logFailedStateChange( tableBucket, currentState, targetState, "Elect result is empty."); @@ -470,7 +467,7 @@ private List tryRegisterLeaderAndIsrOneByOne( } private Optional electNewLeaderForTableBuckets( - TableBucket tableBucket, ReplicaLeaderElectionStrategy electionStrategy) { + TableBucket tableBucket, ReplicaLeaderElection electionStrategy) { LeaderAndIsr leaderAndIsr; try { leaderAndIsr = zooKeeperClient.getLeaderAndIsr(tableBucket).get(); @@ -602,7 +599,7 @@ private String stringifyBucket(TableBucket tableBucket) { 
private Optional electLeader( TableBucket tableBucket, LeaderAndIsr leaderAndIsr, - ReplicaLeaderElectionStrategy electionStrategy) { + ReplicaLeaderElection electionStrategy) { List assignment = coordinatorContext.getAssignment(tableBucket); // filter out the live servers List liveReplicas = @@ -616,13 +613,23 @@ private Optional electLeader( } Optional resultOpt = Optional.empty(); - if (electionStrategy == DEFAULT_ELECTION) { - resultOpt = defaultReplicaLeaderElection(assignment, liveReplicas, leaderAndIsr); - } else if (electionStrategy == CONTROLLED_SHUTDOWN_ELECTION) { + if (electionStrategy instanceof DefaultLeaderElection) { + resultOpt = + ((DefaultLeaderElection) electionStrategy) + .leaderElection(assignment, liveReplicas, leaderAndIsr); + } else if (electionStrategy instanceof ControlledShutdownLeaderElection) { Set shuttingDownTabletServers = coordinatorContext.shuttingDownTabletServers(); resultOpt = - controlledShutdownReplicaLeaderElection( - assignment, liveReplicas, leaderAndIsr, shuttingDownTabletServers); + ((ControlledShutdownLeaderElection) electionStrategy) + .leaderElection( + assignment, + liveReplicas, + leaderAndIsr, + shuttingDownTabletServers); + } else if (electionStrategy instanceof ReassignmentLeaderElection) { + resultOpt = + ((ReassignmentLeaderElection) electionStrategy) + .leaderElection(liveReplicas, leaderAndIsr); } if (!resultOpt.isPresent()) { @@ -656,4 +663,30 @@ public LeaderAndIsr getLeaderAndIsr() { return leaderAndIsr; } } + + /** + * Init replica leader election when the bucket is new created. + * + * @param assignments the assignments + * @param aliveReplicas the alive replicas + * @param coordinatorEpoch the coordinator epoch + * @return the election result + */ + @VisibleForTesting + public static Optional initReplicaLeaderElection( + List assignments, List aliveReplicas, int coordinatorEpoch) { + // currently, we always use the first replica in assignment, which also in aliveReplicas and + // isr as the leader replica. 
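+        // Note: a newly created bucket has no ISR yet, so any alive replica from the assignment
+        // can be chosen directly; the alive replicas then form the initial ISR.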
+ for (int assignment : assignments) { + if (aliveReplicas.contains(assignment)) { + return Optional.of( + new ElectionResult( + aliveReplicas, + new LeaderAndIsr( + assignment, 0, aliveReplicas, coordinatorEpoch, 0))); + } + } + + return Optional.empty(); + } } diff --git a/fluss-server/src/main/java/org/apache/fluss/server/replica/ReplicaManager.java b/fluss-server/src/main/java/org/apache/fluss/server/replica/ReplicaManager.java index d9350257dc..421db4a068 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/replica/ReplicaManager.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/replica/ReplicaManager.java @@ -339,7 +339,8 @@ private void registerMetrics() { this::physicalStorageRemoteLogSize); } - private Stream onlineReplicas() { + @VisibleForTesting + public Stream onlineReplicas() { return allReplicas.values().stream() .map( t -> { @@ -365,6 +366,11 @@ private long atMinIsrCount() { return onlineReplicas().filter(Replica::isAtMinIsr).count(); } + @VisibleForTesting + public long leaderCount() { + return onlineReplicas().filter(Replica::isLeader).count(); + } + private int writerIdCount() { return onlineReplicas().map(Replica::writerIdCount).reduce(0, Integer::sum); } @@ -412,6 +418,11 @@ public void becomeLeaderOrFollower( List replicasToBeLeader = new ArrayList<>(); List replicasToBeFollower = new ArrayList<>(); for (NotifyLeaderAndIsrData data : notifyLeaderAndIsrDataList) { + LOG.info( + "Try to become leaderAndFollower for {} with isr {}, replicas: {}", + data.getTableBucket(), + data.getLeaderAndIsr(), + data.getReplicas()); TableBucket tb = data.getTableBucket(); try { boolean becomeLeader = validateAndGetIsBecomeLeader(data); diff --git a/fluss-server/src/main/java/org/apache/fluss/server/utils/ServerRpcMessageUtils.java b/fluss-server/src/main/java/org/apache/fluss/server/utils/ServerRpcMessageUtils.java index 8ee46545b0..5501a3fa9b 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/utils/ServerRpcMessageUtils.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/utils/ServerRpcMessageUtils.java @@ -20,6 +20,9 @@ import org.apache.fluss.cluster.Endpoint; import org.apache.fluss.cluster.ServerNode; import org.apache.fluss.cluster.ServerType; +import org.apache.fluss.cluster.rebalance.RebalancePlanForBucket; +import org.apache.fluss.cluster.rebalance.RebalanceProgress; +import org.apache.fluss.cluster.rebalance.RebalanceResultForBucket; import org.apache.fluss.config.ConfigOptions; import org.apache.fluss.config.cluster.AlterConfigOpType; import org.apache.fluss.config.cluster.ColumnPositionType; @@ -33,6 +36,7 @@ import org.apache.fluss.metadata.TableChange; import org.apache.fluss.metadata.TableDescriptor; import org.apache.fluss.metadata.TableInfo; +import org.apache.fluss.metadata.TablePartition; import org.apache.fluss.metadata.TablePath; import org.apache.fluss.record.BytesViewLogRecords; import org.apache.fluss.record.DefaultKvRecordBatch; @@ -72,6 +76,7 @@ import org.apache.fluss.rpc.messages.ListOffsetsRequest; import org.apache.fluss.rpc.messages.ListOffsetsResponse; import org.apache.fluss.rpc.messages.ListPartitionInfosResponse; +import org.apache.fluss.rpc.messages.ListRebalanceProgressResponse; import org.apache.fluss.rpc.messages.LookupRequest; import org.apache.fluss.rpc.messages.LookupResponse; import org.apache.fluss.rpc.messages.MetadataResponse; @@ -120,6 +125,9 @@ import org.apache.fluss.rpc.messages.PbProduceLogRespForBucket; import org.apache.fluss.rpc.messages.PbPutKvReqForBucket; import 
org.apache.fluss.rpc.messages.PbPutKvRespForBucket; +import org.apache.fluss.rpc.messages.PbRebalancePlanForBucket; +import org.apache.fluss.rpc.messages.PbRebalancePlanForTable; +import org.apache.fluss.rpc.messages.PbRebalanceProgressForBucket; import org.apache.fluss.rpc.messages.PbRemoteLogSegment; import org.apache.fluss.rpc.messages.PbRemotePathAndLocalFile; import org.apache.fluss.rpc.messages.PbRenameColumn; @@ -138,6 +146,7 @@ import org.apache.fluss.rpc.messages.ProduceLogResponse; import org.apache.fluss.rpc.messages.PutKvRequest; import org.apache.fluss.rpc.messages.PutKvResponse; +import org.apache.fluss.rpc.messages.RebalanceResponse; import org.apache.fluss.rpc.messages.StopReplicaRequest; import org.apache.fluss.rpc.messages.StopReplicaResponse; import org.apache.fluss.rpc.messages.UpdateMetadataRequest; @@ -168,6 +177,7 @@ import org.apache.fluss.server.metadata.TableMetadata; import org.apache.fluss.server.zk.data.BucketSnapshot; import org.apache.fluss.server.zk.data.LeaderAndIsr; +import org.apache.fluss.server.zk.data.RebalancePlan; import org.apache.fluss.server.zk.data.lake.LakeTable; import org.apache.fluss.server.zk.data.lake.LakeTableSnapshot; import org.apache.fluss.utils.json.DataTypeJsonSerde; @@ -1790,6 +1800,109 @@ public static List toPbConfigEntries(List describ .collect(Collectors.toList()); } + public static RebalanceResponse makeRebalanceRespose(RebalancePlan rebalancePlan) { + RebalanceResponse response = + new RebalanceResponse().setRebalanceId(rebalancePlan.getRebalanceId()); + List planForTables = new ArrayList<>(); + + // for none-partitioned tables. + for (Map.Entry> planForTable : + rebalancePlan.getPlanForBuckets().entrySet()) { + PbRebalancePlanForTable pbPlanForTable = + response.addTablePlan().setTableId(planForTable.getKey()); + List planForBuckets = new ArrayList<>(); + planForTable + .getValue() + .forEach( + planForBucket -> + planForBuckets.add(toPbRebalancePlanForBucket(planForBucket))); + pbPlanForTable.addAllBucketsPlans(planForBuckets); + planForTables.add(pbPlanForTable); + } + + // for partitioned tables. 
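+        // Bucket plans from different partitions of the same table are merged into one
+        // PbRebalancePlanForTable entry, keyed by the table id.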
+ Map tableIdToPbPlanForTable = new HashMap<>(); + for (Map.Entry> planForTable : + rebalancePlan.getPlanForBucketsOfPartitionedTable().entrySet()) { + TablePartition tablePartition = planForTable.getKey(); + long tableId = tablePartition.getTableId(); + PbRebalancePlanForTable pbPlanForTable = + tableIdToPbPlanForTable.computeIfAbsent( + tableId, k -> new PbRebalancePlanForTable().setTableId(tableId)); + List planForBuckets = new ArrayList<>(); + planForTable + .getValue() + .forEach( + planForBucket -> + planForBuckets.add(toPbRebalancePlanForBucket(planForBucket))); + pbPlanForTable.addAllBucketsPlans(planForBuckets); + } + + planForTables.addAll(tableIdToPbPlanForTable.values()); + response.addAllTablePlans(planForTables); + return response; + } + + public static ListRebalanceProgressResponse makeListRebalanceProgressResponse( + RebalanceProgress rebalanceProgress) { + ListRebalanceProgressResponse response = + new ListRebalanceProgressResponse() + .setRebalanceStatus(rebalanceProgress.status().getCode()); + + if (rebalanceProgress.rebalanceId() != null) { + response.setRebalanceId(rebalanceProgress.rebalanceId()); + } + + Map> tableIdToPbBuckets = new HashMap<>(); + for (Map.Entry progressForBucket : + rebalanceProgress.progressForBucketMap().entrySet()) { + TableBucket tableBucket = progressForBucket.getKey(); + RebalanceResultForBucket rebalanceResultForBucket = progressForBucket.getValue(); + long tableId = tableBucket.getTableId(); + List pbBuckets = + tableIdToPbBuckets.computeIfAbsent(tableId, k -> new ArrayList<>()); + pbBuckets.add( + new PbRebalanceProgressForBucket() + .setRebalancePlan( + toPbRebalancePlanForBucket(rebalanceResultForBucket.plan())) + .setRebalanceStatus(rebalanceResultForBucket.status().getCode())); + } + + for (Map.Entry> entry : + tableIdToPbBuckets.entrySet()) { + response.addTableProgress() + .setTableId(entry.getKey()) + .addAllBucketsProgresses(entry.getValue()); + } + + return response; + } + + private static PbRebalancePlanForBucket toPbRebalancePlanForBucket( + RebalancePlanForBucket planForBucket) { + PbRebalancePlanForBucket pbRebalancePlanForBucket = + new PbRebalancePlanForBucket() + .setBucketId(planForBucket.getBucketId()) + .setOriginalLeader(planForBucket.getOriginalLeader()) + .setNewLeader(planForBucket.getNewLeader()); + + Long partitionId = planForBucket.getTableBucket().getPartitionId(); + if (partitionId != null) { + pbRebalancePlanForBucket.setPartitionId(partitionId); + } + + pbRebalancePlanForBucket + .setOriginalReplicas( + planForBucket.getOriginReplicas().stream() + .mapToInt(Integer::intValue) + .toArray()) + .setNewReplicas( + planForBucket.getNewReplicas().stream() + .mapToInt(Integer::intValue) + .toArray()); + return pbRebalancePlanForBucket; + } + private static Map mergeResponse( Map response, Map errors) { if (errors.isEmpty()) { diff --git a/fluss-server/src/main/java/org/apache/fluss/server/utils/TableAssignmentUtils.java b/fluss-server/src/main/java/org/apache/fluss/server/utils/TableAssignmentUtils.java index 067edc35e6..eca96ddf5e 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/utils/TableAssignmentUtils.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/utils/TableAssignmentUtils.java @@ -224,31 +224,31 @@ private static TableAssignment generateRackAwareAssigment( replicas.add(leader); Set racksWithReplicas = new HashSet<>(); racksWithReplicas.add(serverRackMap.get(leader)); - Set brokersWithReplicas = new HashSet<>(); - brokersWithReplicas.add(leader); + Set tabletServersWithReplicas = 
new HashSet<>(); + tabletServersWithReplicas.add(leader); int k = 0; for (int j = 0; j < replicationFactor - 1; j++) { boolean done = false; while (!done) { - Integer broker = + Integer server = arrangedServerList.get( replicaIndex( firstReplicaIndex, nextReplicaShift * numRacks, k, arrangedServerList.size())); - String rack = serverRackMap.get(broker); + String rack = serverRackMap.get(server); // Skip this tabletServer if // 1. there is already a tabletServer in the same rack that has assigned a // replica AND there is one or more racks that do not have any replica, or // 2. the tabletServer has already assigned a replica AND there is one or more // tabletServers that do not have replica assigned if ((!racksWithReplicas.contains(rack) || racksWithReplicas.size() == numRacks) - && (!brokersWithReplicas.contains(broker) - || brokersWithReplicas.size() == numServers)) { - replicas.add(broker); + && (!tabletServersWithReplicas.contains(server) + || tabletServersWithReplicas.size() == numServers)) { + replicas.add(server); racksWithReplicas.add(rack); - brokersWithReplicas.add(broker); + tabletServersWithReplicas.add(server); done = true; } k += 1; diff --git a/fluss-server/src/main/java/org/apache/fluss/server/zk/ZooKeeperClient.java b/fluss-server/src/main/java/org/apache/fluss/server/zk/ZooKeeperClient.java index 40bffe8171..1636947b08 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/zk/ZooKeeperClient.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/zk/ZooKeeperClient.java @@ -296,7 +296,17 @@ public void updateTableAssignment(long tableId, TableAssignment tableAssignment) throws Exception { String path = TableIdZNode.path(tableId); zkClient.setData().forPath(path, TableIdZNode.encode(tableAssignment)); - LOG.info("Updated table assignment {} for table id {}.", tableAssignment, tableId); + LOG.debug("Updated table assignment {} for table id {}.", tableAssignment, tableId); + } + + public void updatePartitionAssignment(long partitionId, PartitionAssignment partitionAssignment) + throws Exception { + String path = PartitionIdZNode.path(partitionId); + zkClient.setData().forPath(path, PartitionIdZNode.encode(partitionAssignment)); + LOG.debug( + "Updated partition assignment {} for partition id {}.", + partitionAssignment, + partitionId); } public void deleteTableAssignment(long tableId) throws Exception { @@ -1236,6 +1246,11 @@ public Optional getRebalancePlan() throws Exception { return getOrEmpty(path).map(RebalanceZNode::decode); } + /** Deletes the rebalance plan from ZooKeeper. Only for testing propose now */ + public void deleteRebalancePlan() throws Exception { + deletePath(RebalanceZNode.path()); + } + // -------------------------------------------------------------------------------------------- // Utils // -------------------------------------------------------------------------------------------- diff --git a/fluss-server/src/main/java/org/apache/fluss/server/zk/data/RebalancePlan.java b/fluss-server/src/main/java/org/apache/fluss/server/zk/data/RebalancePlan.java index da9f019c35..abb9f39bee 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/zk/data/RebalancePlan.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/zk/data/RebalancePlan.java @@ -37,6 +37,9 @@ */ public class RebalancePlan { + /** The rebalance id to trace rebalace task. */ + private final String rebalanceId; + /** The rebalance status for the overall rebalance. 
*/ private final RebalanceStatus rebalanceStatus; @@ -48,7 +51,10 @@ public class RebalancePlan { planForBucketsOfPartitionedTable; public RebalancePlan( - RebalanceStatus rebalanceStatus, Map bucketPlan) { + String rebalanceId, + RebalanceStatus rebalanceStatus, + Map bucketPlan) { + this.rebalanceId = rebalanceId; this.rebalanceStatus = rebalanceStatus; this.planForBuckets = new HashMap<>(); this.planForBucketsOfPartitionedTable = new HashMap<>(); @@ -70,6 +76,10 @@ public RebalancePlan( } } + public String getRebalanceId() { + return rebalanceId; + } + public RebalanceStatus getRebalanceStatus() { return rebalanceStatus; } @@ -82,12 +92,34 @@ public Map> getPlanForBucketsOfPart return planForBucketsOfPartitionedTable; } + public Map getExecutePlan() { + Map executePlan = new HashMap<>(); + planForBuckets.forEach( + (tableId, rebalancePlanForBuckets) -> + rebalancePlanForBuckets.forEach( + rebalancePlanForBucket -> + executePlan.put( + rebalancePlanForBucket.getTableBucket(), + rebalancePlanForBucket))); + + planForBucketsOfPartitionedTable.forEach( + (tablePartition, rebalancePlanForBuckets) -> + rebalancePlanForBuckets.forEach( + rebalancePlanForBucket -> + executePlan.put( + rebalancePlanForBucket.getTableBucket(), + rebalancePlanForBucket))); + return executePlan; + } + @Override public String toString() { return "RebalancePlan{" - + "rebalanceStatus=" + + "rebalanceId='" + + rebalanceId + + ", rebalanceStatus=" + rebalanceStatus - + "planForBuckets=" + + ", planForBuckets=" + planForBuckets + ", planForBucketsOfPartitionedTable=" + planForBucketsOfPartitionedTable @@ -105,6 +137,7 @@ public boolean equals(Object o) { RebalancePlan that = (RebalancePlan) o; return rebalanceStatus == that.rebalanceStatus + && Objects.equals(rebalanceId, that.rebalanceId) && Objects.equals(planForBuckets, that.planForBuckets) && Objects.equals( planForBucketsOfPartitionedTable, that.planForBucketsOfPartitionedTable); @@ -112,6 +145,7 @@ public boolean equals(Object o) { @Override public int hashCode() { - return Objects.hash(rebalanceStatus, planForBuckets, planForBucketsOfPartitionedTable); + return Objects.hash( + rebalanceId, rebalanceStatus, planForBuckets, planForBucketsOfPartitionedTable); } } diff --git a/fluss-server/src/main/java/org/apache/fluss/server/zk/data/RebalancePlanJsonSerde.java b/fluss-server/src/main/java/org/apache/fluss/server/zk/data/RebalancePlanJsonSerde.java index 6588d879c5..7b6b06a1d8 100644 --- a/fluss-server/src/main/java/org/apache/fluss/server/zk/data/RebalancePlanJsonSerde.java +++ b/fluss-server/src/main/java/org/apache/fluss/server/zk/data/RebalancePlanJsonSerde.java @@ -40,6 +40,7 @@ public class RebalancePlanJsonSerde public static final RebalancePlanJsonSerde INSTANCE = new RebalancePlanJsonSerde(); private static final String VERSION_KEY = "version"; + private static final String REBALANCE_ID = "rebalance_id"; private static final String REBALANCE_STATUS = "rebalance_status"; private static final String REBALANCE_PLAN = "rebalance_plan"; @@ -59,6 +60,7 @@ public class RebalancePlanJsonSerde public void serialize(RebalancePlan rebalancePlan, JsonGenerator generator) throws IOException { generator.writeStartObject(); generator.writeNumberField(VERSION_KEY, VERSION); + generator.writeStringField(REBALANCE_ID, rebalancePlan.getRebalanceId()); generator.writeNumberField(REBALANCE_STATUS, rebalancePlan.getRebalanceStatus().getCode()); generator.writeArrayFieldStart(REBALANCE_PLAN); @@ -98,6 +100,7 @@ public void serialize(RebalancePlan rebalancePlan, JsonGenerator 
generator) thro public RebalancePlan deserialize(JsonNode node) { JsonNode rebalancePlanNode = node.get(REBALANCE_PLAN); + String rebalanceId = node.get(REBALANCE_ID).asText(); RebalanceStatus rebalanceStatus = RebalanceStatus.of(node.get(REBALANCE_STATUS).asInt()); Map planForBuckets = new HashMap<>(); @@ -137,7 +140,7 @@ public RebalancePlan deserialize(JsonNode node) { } } - return new RebalancePlan(rebalanceStatus, planForBuckets); + return new RebalancePlan(rebalanceId, rebalanceStatus, planForBuckets); } private void serializeRebalancePlanForBucket( diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java index dcfa6b5b3a..d04aee40cb 100644 --- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java @@ -19,6 +19,7 @@ import org.apache.fluss.cluster.Endpoint; import org.apache.fluss.cluster.TabletServerInfo; +import org.apache.fluss.cluster.rebalance.RebalancePlanForBucket; import org.apache.fluss.config.ConfigOptions; import org.apache.fluss.config.Configuration; import org.apache.fluss.exception.FencedLeaderEpochException; @@ -69,6 +70,7 @@ import org.apache.fluss.server.zk.data.PartitionAssignment; import org.apache.fluss.server.zk.data.TableAssignment; import org.apache.fluss.server.zk.data.TabletServerRegistration; +import org.apache.fluss.server.zk.data.ZkData; import org.apache.fluss.server.zk.data.ZkData.PartitionIdsZNode; import org.apache.fluss.server.zk.data.ZkData.TableIdsZNode; import org.apache.fluss.testutils.common.AllCallbackWrapper; @@ -448,6 +450,9 @@ void testServerBecomeOnlineAndOfflineLine() throws Exception { t2Bucket0State = fromCtx(ctx -> ctx.getBucketState(t2Bucket0)); assertThat(t2Bucket0State).isEqualTo(OnlineBucket); + + // clean up the tablet server 3 + ZOO_KEEPER_EXTENSION_WRAPPER.getCustomExtension().cleanupPath(ZkData.ServerIdZNode.path(3)); } @Test @@ -900,6 +905,78 @@ void testSchemaChange() throws Exception { 3, new TableMetadata(tableInfo2, Collections.emptyList()))); } + @Test + void testDoBucketReassignment() throws Exception { + zookeeperClient.registerTabletServer( + 3, + new TabletServerRegistration( + "rack3", + Collections.singletonList( + new Endpoint("host3", 1001, DEFAULT_LISTENER_NAME)), + System.currentTimeMillis())); + + initCoordinatorChannel(); + TablePath t1 = TablePath.of(defaultDatabase, "test_bucket_reassignment_table"); + // Mock un-balanced table assignment. + Map bucketAssignments = new HashMap<>(); + bucketAssignments.put(0, BucketAssignment.of(0, 1, 3)); + TableAssignment tableAssignment = new TableAssignment(bucketAssignments); + long t1Id = + metadataManager.createTable( + t1, CoordinatorEventProcessorTest.TEST_TABLE, tableAssignment, false); + TableBucket tb0 = new TableBucket(t1Id, 0); + verifyIsr(tb0, 0, Arrays.asList(0, 1, 3)); + + // trigger bucket reassignment for tb0: + // bucket0 -> (0, 1, 2) + Map rebalancePlan = new HashMap<>(); + RebalancePlanForBucket planForBucket0 = + new RebalancePlanForBucket( + tb0, 0, 0, Arrays.asList(0, 1, 3), Arrays.asList(0, 1, 2)); + + rebalancePlan.put(tb0, planForBucket0); + // try to execute. 
+ eventProcessor + .getRebalanceManager() + .registerRebalance("rebalance-task-jdsds1", rebalancePlan); + + // Mock finishing the rebalance tasks; in the production case, this needs to be triggered by + // receiving an AdjustIsrRequest. + Map leaderAndIsrMap = new HashMap<>(); + CompletableFuture respCallback = new CompletableFuture<>(); + + // This ISR list equals originReplicas + addingReplicas. The bucket epoch is 1. + leaderAndIsrMap.put(tb0, new LeaderAndIsr(0, 0, Arrays.asList(0, 1, 2, 3), 0, 1)); + eventProcessor + .getCoordinatorEventManager() + .put(new AdjustIsrReceivedEvent(leaderAndIsrMap, respCallback)); + respCallback.get(); + verifyIsr(tb0, 0, Arrays.asList(0, 1, 2)); + + // clean up the tablet server 3 + ZOO_KEEPER_EXTENSION_WRAPPER.getCustomExtension().cleanupPath(ZkData.ServerIdZNode.path(3)); + } + + private void verifyIsr(TableBucket tb, int expectedLeader, List expectedIsr) + throws Exception { + LeaderAndIsr leaderAndIsr = + waitValue( + () -> fromCtx((ctx) -> ctx.getBucketLeaderAndIsr(tb)), + Duration.ofMinutes(1), + "leader not elected"); + LeaderAndIsr newLeaderAndIsrOfZk = zookeeperClient.getLeaderAndIsr(tb).get(); + retry( + Duration.ofMinutes(1), + () -> { + assertThat(leaderAndIsr.leader()) + .isEqualTo(newLeaderAndIsrOfZk.leader()) + .isEqualTo(expectedLeader); + assertThat(leaderAndIsr.isr()) + .isEqualTo(newLeaderAndIsrOfZk.isr()) + .hasSameElementsAs(expectedIsr); + }); + } + private CoordinatorEventProcessor buildCoordinatorEventProcessor() { return new CoordinatorEventProcessor( zookeeperClient, diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/RebalanceManagerITCase.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/RebalanceManagerITCase.java new file mode 100644 index 0000000000..422cb267b7 --- /dev/null +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/RebalanceManagerITCase.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.fluss.server.coordinator.rebalance; + +import org.apache.fluss.cluster.rebalance.ServerTag; +import org.apache.fluss.config.ConfigOptions; +import org.apache.fluss.config.Configuration; +import org.apache.fluss.metadata.PartitionSpec; +import org.apache.fluss.metadata.TableDescriptor; +import org.apache.fluss.metadata.TablePath; +import org.apache.fluss.rpc.messages.AddServerTagRequest; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.testutils.FlussClusterExtension; +import org.apache.fluss.server.zk.ZooKeeperClient; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +import java.util.Collections; + +import static org.apache.fluss.record.TestData.DATA1_SCHEMA; +import static org.apache.fluss.record.TestData.DATA1_TABLE_DESCRIPTOR; +import static org.apache.fluss.record.TestData.DATA1_TABLE_PATH; +import static org.apache.fluss.server.testutils.RpcMessageTestUtils.createPartition; +import static org.apache.fluss.server.testutils.RpcMessageTestUtils.createTable; +import static org.assertj.core.api.Assertions.assertThat; + +/** IT test for {@link RebalanceManager}. */ +public class RebalanceManagerITCase { + + @RegisterExtension + public static final FlussClusterExtension FLUSS_CLUSTER_EXTENSION = + FlussClusterExtension.builder() + .setNumOfTabletServers(3) + .setClusterConf(initConfig()) + .build(); + + private ZooKeeperClient zkClient; + private RebalanceManager rebalanceManager; + + @BeforeEach + void beforeEach() { + zkClient = FLUSS_CLUSTER_EXTENSION.getZooKeeperClient(); + rebalanceManager = FLUSS_CLUSTER_EXTENSION.getRebalanceManager(); + } + + @Test + void testBuildClusterModel() throws Exception { + // one non-partitioned table. + long tableId1 = + createTable(FLUSS_CLUSTER_EXTENSION, DATA1_TABLE_PATH, DATA1_TABLE_DESCRIPTOR); + // one partitioned table. + TablePath partitionTablePath = TablePath.of("test_db_1", "test_partition_table_1"); + TableDescriptor partitionTableDescriptor = + TableDescriptor.builder() + .schema(DATA1_SCHEMA) + .distributedBy(3) + .partitionedBy("b") + .property(ConfigOptions.TABLE_AUTO_PARTITION_ENABLED, false) + .build(); + long tableId2 = + createTable(FLUSS_CLUSTER_EXTENSION, partitionTablePath, partitionTableDescriptor); + String partitionName1 = "b1"; + createPartition( + FLUSS_CLUSTER_EXTENSION, + partitionTablePath, + new PartitionSpec(Collections.singletonMap("b", partitionName1)), + false); + + ClusterModel clusterModel = rebalanceManager.buildClusterModel(); + assertThat(clusterModel.servers().size()).isEqualTo(3); + assertThat(clusterModel.aliveServers().size()).isEqualTo(3); + assertThat(clusterModel.offlineServers().size()).isEqualTo(0); + assertThat(clusterModel.tables().size()).isEqualTo(2); + assertThat(clusterModel.tables()).contains(tableId1, tableId2); + + // offline one tablet server.
+ AddServerTagRequest request = + new AddServerTagRequest().setServerTag(ServerTag.PERMANENT_OFFLINE.value); + request.addServerId(0); + FLUSS_CLUSTER_EXTENSION.newCoordinatorClient().addServerTag(request).get(); + + clusterModel = rebalanceManager.buildClusterModel(); + assertThat(clusterModel.servers().size()).isEqualTo(3); + assertThat(clusterModel.aliveServers().size()).isEqualTo(2); + assertThat(clusterModel.offlineServers().size()).isEqualTo(1); + assertThat(clusterModel.tables().size()).isEqualTo(2); + assertThat(clusterModel.tables()).contains(tableId1, tableId2); + } + + private static Configuration initConfig() { + Configuration conf = new Configuration(); + conf.setInt(ConfigOptions.DEFAULT_REPLICATION_FACTOR, 3); + return conf; + } +} diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/RebalanceTestUtils.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/RebalanceTestUtils.java new file mode 100644 index 0000000000..5942043b02 --- /dev/null +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/RebalanceTestUtils.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance; + +import org.apache.fluss.metadata.TableBucket; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; + +import java.util.List; + +/** A util class for rebalance test. */ +public class RebalanceTestUtils { + + public static void addBucket( + ClusterModel clusterModel, TableBucket tb, List replicas) { + for (int i = 0; i < replicas.size(); i++) { + clusterModel.createReplica(replicas.get(i), tb, i, i == 0); + } + } +} diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizerTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizerTest.java new file mode 100644 index 0000000000..ef85d08071 --- /dev/null +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizerTest.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.server.coordinator.rebalance.model.ServerModel; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.SortedSet; +import java.util.TreeSet; + +/** Test for {@link GoalOptimizer}. */ +public class GoalOptimizerTest { + + private SortedSet servers; + + @BeforeEach + public void setup() { + servers = new TreeSet<>(); + ServerModel server0 = new ServerModel(0, "rack0", true); + ServerModel server1 = new ServerModel(1, "rack1", true); + ServerModel server2 = new ServerModel(2, "rack2", true); + ServerModel server3 = new ServerModel(3, "rack0", true); + servers.add(server0); + servers.add(server1); + servers.add(server2); + servers.add(server3); + } + + @Test + void testOptimize() {} +} diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizerUtilsTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizerUtilsTest.java new file mode 100644 index 0000000000..ebf9e70975 --- /dev/null +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/GoalOptimizerUtilsTest.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.cluster.rebalance.RebalancePlanForBucket; +import org.apache.fluss.metadata.TableBucket; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.coordinator.rebalance.model.ServerModel; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.SortedSet; +import java.util.TreeSet; + +import static org.apache.fluss.server.coordinator.rebalance.RebalanceTestUtils.addBucket; +import static org.apache.fluss.server.coordinator.rebalance.goal.GoalOptimizerUtils.getDiff; +import static org.apache.fluss.server.coordinator.rebalance.goal.GoalOptimizerUtils.hasDiff; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** A test class for {@link GoalOptimizerUtils}. 
*/ +public class GoalOptimizerUtilsTest { + + private SortedSet servers; + + @BeforeEach + public void setup() { + servers = new TreeSet<>(); + ServerModel server0 = new ServerModel(0, "rack0", true); + ServerModel server1 = new ServerModel(1, "rack1", true); + ServerModel server2 = new ServerModel(2, "rack2", true); + ServerModel server3 = new ServerModel(3, "rack0", true); + servers.add(server0); + servers.add(server1); + servers.add(server2); + servers.add(server3); + } + + @Test + void testHasDiff() { + ClusterModel clusterModel = new ClusterModel(servers); + + // add buckets into clusterModel. + addBucket(clusterModel, new TableBucket(0, 0), Arrays.asList(0, 1, 2)); + addBucket(clusterModel, new TableBucket(1, 0), Arrays.asList(0, 1, 2)); + + Map> initialReplicaDistribution = + clusterModel.getReplicaDistribution(); + Map initialLeaderDistribution = clusterModel.getLeaderDistribution(); + assertThat(hasDiff(initialReplicaDistribution, initialLeaderDistribution, clusterModel)) + .isFalse(); + + clusterModel.relocateLeadership(new TableBucket(0, 0), 0, 1); + clusterModel.relocateReplica(new TableBucket(1, 0), 2, 3); + assertThat(hasDiff(initialReplicaDistribution, initialLeaderDistribution, clusterModel)) + .isTrue(); + + assertThatThrownBy(() -> hasDiff(new HashMap<>(), initialLeaderDistribution, clusterModel)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Initial and final replica distributions do not contain the same buckets."); + } + + @Test + void testGetDiff() { + ClusterModel clusterModel = new ClusterModel(servers); + + // add buckets into clusterModel. + addBucket(clusterModel, new TableBucket(0, 0), Arrays.asList(0, 1, 2)); + addBucket(clusterModel, new TableBucket(1, 0), Arrays.asList(0, 1, 2)); + + Map> initialReplicaDistribution = + clusterModel.getReplicaDistribution(); + Map initialLeaderDistribution = clusterModel.getLeaderDistribution(); + assertThat(hasDiff(initialReplicaDistribution, initialLeaderDistribution, clusterModel)) + .isFalse(); + + clusterModel.relocateLeadership(new TableBucket(0, 0), 0, 1); + clusterModel.relocateReplica(new TableBucket(1, 0), 2, 3); + assertThat(hasDiff(initialReplicaDistribution, initialLeaderDistribution, clusterModel)) + .isTrue(); + + List diffPlan = + getDiff(initialReplicaDistribution, initialLeaderDistribution, clusterModel); + assertThat(diffPlan) + .contains( + new RebalancePlanForBucket( + new TableBucket(0, 0), + 0, + 1, + Arrays.asList(0, 1, 2), + Arrays.asList(1, 0, 2)), + new RebalancePlanForBucket( + new TableBucket(1, 0), + 0, + 0, + Arrays.asList(0, 1, 2), + Arrays.asList(0, 1, 3))); + } +} diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/LeaderReplicaDistributionGoalTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/LeaderReplicaDistributionGoalTest.java new file mode 100644 index 0000000000..aa1f6916b5 --- /dev/null +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/LeaderReplicaDistributionGoalTest.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.metadata.TableBucket; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.coordinator.rebalance.model.ServerModel; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.SortedSet; +import java.util.TreeSet; + +import static org.apache.fluss.server.coordinator.rebalance.RebalanceTestUtils.addBucket; +import static org.assertj.core.api.Assertions.assertThat; + +/** Test for {@link LeaderReplicaDistributionGoal}. */ +public class LeaderReplicaDistributionGoalTest { + private SortedSet servers; + + @BeforeEach + public void setup() { + servers = new TreeSet<>(); + ServerModel server0 = new ServerModel(0, "rack0", true); + ServerModel server1 = new ServerModel(1, "rack1", true); + ServerModel server2 = new ServerModel(2, "rack2", true); + ServerModel server3 = new ServerModel(3, "rack0", true); + servers.add(server0); + servers.add(server1); + servers.add(server2); + servers.add(server3); + } + + @Test + void testDoOptimize() { + LeaderReplicaDistributionGoal goal = new LeaderReplicaDistributionGoal(); + ClusterModel clusterModel = new ClusterModel(servers); + + // before optimize: + // for 18 buckets, the assignment: 0,1 + // for 18 buckets, the assignment: 1,0 + // the leader replica ratio of servers is 18:18:0:0, the avg buckets per server is 9 + for (int i = 0; i < 18; i++) { + addBucket(clusterModel, new TableBucket(0, i), Arrays.asList(0, 1)); + addBucket(clusterModel, new TableBucket(1, i), Arrays.asList(1, 0)); + } + + Map serverIdToLeaderReplicaNumber = getServerIdToLeaderReplicaNumber(); + assertThat(serverIdToLeaderReplicaNumber.get(0)).isEqualTo(18); + assertThat(serverIdToLeaderReplicaNumber.get(1)).isEqualTo(18); + assertThat(serverIdToLeaderReplicaNumber.get(2)).isEqualTo(0); + assertThat(serverIdToLeaderReplicaNumber.get(3)).isEqualTo(0); + + goal.optimize(clusterModel, new HashSet<>()); + + serverIdToLeaderReplicaNumber = getServerIdToLeaderReplicaNumber(); + assertThat(serverIdToLeaderReplicaNumber.get(0)).isEqualTo(10); + assertThat(serverIdToLeaderReplicaNumber.get(1)).isEqualTo(8); + assertThat(serverIdToLeaderReplicaNumber.get(2)).isEqualTo(10); + assertThat(serverIdToLeaderReplicaNumber.get(3)).isEqualTo(8); + } + + @Test + void testDoOptimizeWithOfflineServer() { + ServerModel server4 = new ServerModel(4, "rack0", false); + servers.add(server4); + + LeaderReplicaDistributionGoal goal = new LeaderReplicaDistributionGoal(); + ClusterModel clusterModel = new ClusterModel(servers); + + // before optimize: + // for 18 buckets, the assignment: 0,1 + // for 18 buckets, the assignment: 1,0 + // for 4 buckets, the assignment: 4,0,1 + // the leader replica ratio of servers is 18:18:0:0:4, the avg buckets per server is 8 + for (int i = 0; i < 18; i++) { + addBucket(clusterModel, new TableBucket(0, i), Arrays.asList(0, 1)); + addBucket(clusterModel, new TableBucket(1, i), 
Arrays.asList(1, 0)); + } + + for (int i = 0; i < 4; i++) { + addBucket(clusterModel, new TableBucket(2, i), Arrays.asList(4, 2, 1)); + } + + Map serverIdToLeaderReplicaNumber = getServerIdToLeaderReplicaNumber(); + assertThat(serverIdToLeaderReplicaNumber.get(0)).isEqualTo(18); + assertThat(serverIdToLeaderReplicaNumber.get(1)).isEqualTo(18); + assertThat(serverIdToLeaderReplicaNumber.get(2)).isEqualTo(0); + assertThat(serverIdToLeaderReplicaNumber.get(3)).isEqualTo(0); + assertThat(serverIdToLeaderReplicaNumber.get(4)).isEqualTo(4); + + goal.optimize(clusterModel, new HashSet<>()); + + serverIdToLeaderReplicaNumber = getServerIdToLeaderReplicaNumber(); + assertThat(serverIdToLeaderReplicaNumber.get(0)).isEqualTo(9); + assertThat(serverIdToLeaderReplicaNumber.get(1)).isEqualTo(11); + assertThat(serverIdToLeaderReplicaNumber.get(2)).isEqualTo(9); + assertThat(serverIdToLeaderReplicaNumber.get(3)).isEqualTo(11); + assertThat(serverIdToLeaderReplicaNumber.get(4)).isEqualTo(0); + } + + private Map getServerIdToLeaderReplicaNumber() { + Map idToLeaderReplicaNumber = new HashMap<>(); + for (ServerModel server : servers) { + idToLeaderReplicaNumber.put(server.id(), server.leaderReplicas().size()); + } + return idToLeaderReplicaNumber; + } +} diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/ReplicaDistributionGoalTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/ReplicaDistributionGoalTest.java new file mode 100644 index 0000000000..bafdfd1684 --- /dev/null +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/goal/ReplicaDistributionGoalTest.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.goal; + +import org.apache.fluss.metadata.TableBucket; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel; +import org.apache.fluss.server.coordinator.rebalance.model.ClusterModelStats; +import org.apache.fluss.server.coordinator.rebalance.model.ServerModel; +import org.apache.fluss.server.coordinator.rebalance.model.Statistic; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.SortedSet; +import java.util.TreeSet; + +import static org.apache.fluss.server.coordinator.rebalance.RebalanceTestUtils.addBucket; +import static org.assertj.core.api.Assertions.assertThat; + +/** Test for {@link ReplicaDistributionGoal}. 
*/ +public class ReplicaDistributionGoalTest { + private SortedSet servers; + + @BeforeEach + public void setup() { + servers = new TreeSet<>(); + ServerModel server0 = new ServerModel(0, "rack0", true); + ServerModel server1 = new ServerModel(1, "rack1", true); + ServerModel server2 = new ServerModel(2, "rack2", true); + ServerModel server3 = new ServerModel(3, "rack0", true); + servers.add(server0); + servers.add(server1); + servers.add(server2); + servers.add(server3); + } + + @Test + void testDoOptimize() { + ReplicaDistributionGoal goal = new ReplicaDistributionGoal(); + ClusterModel clusterModel = new ClusterModel(servers); + TableBucket t1b0 = new TableBucket(1, 0); + TableBucket t1b1 = new TableBucket(1, 1); + + // before optimize: + // t1b0: assignment: 0, 1, 3 + // t1b1: assignment: 0, 1, 2 + // for other 11 buckets, the assignment: 0,1 + // the replica ratio of servers is 13:13:1:1, the avg buckets per server is 7 + addBucket(clusterModel, t1b0, Arrays.asList(0, 1, 3)); + addBucket(clusterModel, t1b1, Arrays.asList(0, 1, 2)); + for (int i = 0; i < 11; i++) { + addBucket(clusterModel, new TableBucket(2, i), Arrays.asList(0, 1)); + } + + ClusterModelStats clusterStats = clusterModel.getClusterStats(); + Map replicaStats = clusterStats.replicaStats(); + assertThat(replicaStats.get(Statistic.AVG)).isEqualTo(7.0); + assertThat(replicaStats.get(Statistic.MIN)).isEqualTo(1); + assertThat(replicaStats.get(Statistic.MAX)).isEqualTo(13); + + Map serverIdToReplicaNumber = getServerIdToReplicaNumber(clusterModel); + assertThat(serverIdToReplicaNumber.get(0)).isEqualTo(13); + assertThat(serverIdToReplicaNumber.get(1)).isEqualTo(13); + assertThat(serverIdToReplicaNumber.get(2)).isEqualTo(1); + assertThat(serverIdToReplicaNumber.get(3)).isEqualTo(1); + + goal.optimize(clusterModel, new HashSet<>()); + + serverIdToReplicaNumber = getServerIdToReplicaNumber(clusterModel); + assertThat(serverIdToReplicaNumber.get(0)).isEqualTo(8); + assertThat(serverIdToReplicaNumber.get(1)).isEqualTo(8); + assertThat(serverIdToReplicaNumber.get(2)).isEqualTo(6); + assertThat(serverIdToReplicaNumber.get(3)).isEqualTo(6); + } + + @Test + void testDoOptimizeWithOfflineServer() { + ServerModel server4 = new ServerModel(4, "rack0", false); + servers.add(server4); + + ReplicaDistributionGoal goal = new ReplicaDistributionGoal(); + ClusterModel clusterModel = new ClusterModel(servers); + TableBucket t1b0 = new TableBucket(1, 0); + TableBucket t1b1 = new TableBucket(1, 1); + + // All replicas in server4 need to be moved out.
+ // before optimize: + // t1b0: assignment: 0, 1, 3 + // t1b1: assignment: 0, 1, 2 + // for other 13 buckets, the assignment: 0,1,4 + // the replica ratio of servers is 15:15:1:1:13, the avg buckets per server is 9 + addBucket(clusterModel, t1b0, Arrays.asList(0, 1, 3)); + addBucket(clusterModel, t1b1, Arrays.asList(0, 1, 2)); + for (int i = 0; i < 13; i++) { + addBucket(clusterModel, new TableBucket(2, i), Arrays.asList(0, 1, 4)); + } + + Map serverIdToReplicaNumber = getServerIdToReplicaNumber(clusterModel); + assertThat(serverIdToReplicaNumber.get(0)).isEqualTo(15); + assertThat(serverIdToReplicaNumber.get(1)).isEqualTo(15); + assertThat(serverIdToReplicaNumber.get(2)).isEqualTo(1); + assertThat(serverIdToReplicaNumber.get(3)).isEqualTo(1); + assertThat(serverIdToReplicaNumber.get(4)).isEqualTo(13); + + goal.optimize(clusterModel, new HashSet<>()); + + serverIdToReplicaNumber = getServerIdToReplicaNumber(clusterModel); + assertThat(serverIdToReplicaNumber.get(0)).isEqualTo(13); + assertThat(serverIdToReplicaNumber.get(1)).isEqualTo(10); + assertThat(serverIdToReplicaNumber.get(2)).isEqualTo(12); + assertThat(serverIdToReplicaNumber.get(3)).isEqualTo(10); + assertThat(serverIdToReplicaNumber.get(4)).isEqualTo(0); + } + + private Map getServerIdToReplicaNumber(ClusterModel clusterModel) { + Map idToReplicaNumber = new HashMap<>(); + for (ServerModel server : clusterModel.servers()) { + idToReplicaNumber.put(server.id(), server.replicas().size()); + } + return idToReplicaNumber; + } +} diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/BucketModelTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/BucketModelTest.java new file mode 100644 index 0000000000..052e6a385e --- /dev/null +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/BucketModelTest.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.model; + +import org.apache.fluss.metadata.TableBucket; + +import org.junit.jupiter.api.Test; + +import java.util.Collections; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** Test for {@link BucketModel}. 
*/ +public class BucketModelTest { + + @Test + void testBucketModel() { + BucketModel bucketModel = + new BucketModel( + new TableBucket(1L, 0), + Collections.singleton(new ServerModel(0, "rack0", true))); + assertThat(bucketModel.tableBucket()).isEqualTo(new TableBucket(1L, 0)); + assertThat(bucketModel.leader()).isNull(); + assertThat(bucketModel.bucketServers()).isEmpty(); + assertThat(bucketModel.replicas()).isEmpty(); + assertThat(bucketModel.canAssignReplicaToServer(new ServerModel(0, "rack0", true))) + .isFalse(); + + // add a leader replica. + ReplicaModel replicaModel1 = + new ReplicaModel(new TableBucket(1L, 0), new ServerModel(1, "rack1", true), true); + bucketModel.addLeader(replicaModel1, 0); + assertThat(bucketModel.leader()).isNotNull(); + assertThat(bucketModel.leader().tableBucket()).isEqualTo(new TableBucket(1L, 0)); + + // add a leader replica again will throw exception. + assertThatThrownBy( + () -> + bucketModel.addLeader( + new ReplicaModel( + new TableBucket(1L, 0), + new ServerModel(1, "rack1", false), + true), + 0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Bucket TableBucket{tableId=1, bucket=0} already has a leader replica " + + "ReplicaModel[TableBucket=TableBucket{tableId=1, bucket=0},isLeader=true,rack=rack1,server=1,originalServer=1]. " + + "Cannot add a new leader replica ReplicaModel[TableBucket=TableBucket{tableId=1, bucket=0},isLeader=true,rack=rack1,server=1,originalServer=1]."); + + // add a follower replica. + ReplicaModel replicaModel2 = + new ReplicaModel(new TableBucket(1L, 0), new ServerModel(2, "rack2", true), false); + bucketModel.addFollower(replicaModel2, 1); + ReplicaModel replicaModel3 = + new ReplicaModel(new TableBucket(1L, 0), new ServerModel(3, "rack3", true), false); + bucketModel.addFollower(replicaModel3, 2); + + assertThat(bucketModel.replicas()).contains(replicaModel1, replicaModel2, replicaModel3); + assertThat(bucketModel.replica(1)).isEqualTo(replicaModel1); + assertThat(bucketModel.replica(2)).isEqualTo(replicaModel2); + assertThat(bucketModel.replica(3)).isEqualTo(replicaModel3); + + // change 2 to leader. + bucketModel.relocateLeadership(replicaModel2); + assertThat(bucketModel.leader()).isEqualTo(replicaModel2); + } +} diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/ClusterModelTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/ClusterModelTest.java new file mode 100644 index 0000000000..48ebd38711 --- /dev/null +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/ClusterModelTest.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.fluss.server.coordinator.rebalance.model; + +import org.apache.fluss.metadata.TableBucket; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Map; +import java.util.SortedSet; +import java.util.TreeSet; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** Test for {@link ClusterModel}. */ +public class ClusterModelTest { + private SortedSet servers; + private ServerModel server0; + private ServerModel server1; + private ServerModel server2; + private ServerModel server3; + + @BeforeEach + public void setup() { + servers = new TreeSet<>(); + server0 = new ServerModel(0, "rack0", true); + server1 = new ServerModel(1, "rack1", true); + server2 = new ServerModel(2, "rack2", true); + server3 = new ServerModel(3, "rack0", false); + servers.add(server0); + servers.add(server1); + servers.add(server2); + servers.add(server3); + } + + @Test + void testClusterModel() { + ClusterModel clusterModel = new ClusterModel(servers); + assertThat(clusterModel.aliveServers()).containsOnly(server0, server1, server2); + assertThat(clusterModel.offlineServers()).containsOnly(server3); + assertThat(clusterModel.servers()).containsOnly(server0, server1, server2, server3); + assertThat(clusterModel.bucket(new TableBucket(1, 0))).isNull(); + assertThat(clusterModel.numReplicas()).isEqualTo(0); + assertThat(clusterModel.numLeaderReplicas()).isEqualTo(0); + assertThat(clusterModel.rack("rack0").rack()).isEqualTo("rack0"); + assertThat(clusterModel.server(0)).isEqualTo(server0); + assertThat(clusterModel.server(5)).isNull(); + + // Test create replicas. + clusterModel.createReplica(0, new TableBucket(1, 0), 0, true); + clusterModel.createReplica(1, new TableBucket(1, 0), 1, false); + clusterModel.createReplica(2, new TableBucket(1, 0), 2, false); + clusterModel.createReplica(0, new TableBucket(2, 0L, 0), 0, true); + clusterModel.createReplica(1, new TableBucket(2, 0L, 0), 1, false); + clusterModel.createReplica(1, new TableBucket(2, 1L, 0), 0, true); + + assertThat(clusterModel.numReplicas()).isEqualTo(6); + assertThat(clusterModel.numLeaderReplicas()).isEqualTo(3); + assertThat(clusterModel.tables()).containsOnly(1L, 2L); + assertThat(clusterModel.getBucketsByTable()).hasSize(2); + + // test get replica distribution. + Map> replicaDistribution = clusterModel.getReplicaDistribution(); + assertThat(replicaDistribution).hasSize(3); + assertThat(replicaDistribution.get(new TableBucket(1, 0))).contains(0, 1, 2); + assertThat(replicaDistribution.get(new TableBucket(2, 0L, 0))).contains(0, 1); + assertThat(replicaDistribution.get(new TableBucket(2, 1L, 0))).contains(1); + + // test get leader distribution. 
+ Map leaderDistribution = clusterModel.getLeaderDistribution(); + assertThat(leaderDistribution).hasSize(3); + assertThat(leaderDistribution.get(new TableBucket(1, 0))).isEqualTo(0); + assertThat(leaderDistribution.get(new TableBucket(2, 0L, 0))).isEqualTo(0); + assertThat(leaderDistribution.get(new TableBucket(2, 1L, 0))).isEqualTo(1); + } + + @Test + void testRelocateLeadership() { + TableBucket tb0 = new TableBucket(1, 0); + ClusterModel clusterModel = new ClusterModel(servers); + clusterModel.createReplica(0, tb0, 0, true); + clusterModel.createReplica(1, tb0, 1, false); + clusterModel.createReplica(2, tb0, 2, false); + + // try to relocate leadership from server 0 to server 1 + assertThat(clusterModel.relocateLeadership(tb0, 0, 1)).isTrue(); + ReplicaModel leaderReplica = clusterModel.bucket(tb0).leader(); + assertThat(leaderReplica).isNotNull(); + assertThat(leaderReplica.server().id()).isEqualTo(1); + + // try to relocate leadership from server 0 to server 2. As 0 is not leader, this operation + // will return false. + assertThat(clusterModel.relocateLeadership(tb0, 0, 2)).isFalse(); + + assertThatThrownBy(() -> clusterModel.relocateLeadership(tb0, 1, 1)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Cannot relocate leadership of bucket TableBucket{tableId=1, bucket=0} " + + "from server 1 to server 1 because the destination replica is a leader."); + + assertThatThrownBy(() -> clusterModel.relocateLeadership(tb0, 1, 5)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Requested replica 5 is not a replica of bucket TableBucket{tableId=1, bucket=0}"); + } + + @Test + void testRelocateReplica() { + TableBucket tb0 = new TableBucket(1, 0); + ClusterModel clusterModel = new ClusterModel(servers); + clusterModel.createReplica(0, tb0, 0, true); + clusterModel.createReplica(1, tb0, 1, false); + + BucketModel bucket = clusterModel.bucket(tb0); + assertThat(bucket).isNotNull(); + assertThat(bucket.replica(0)).isNotNull(); + assertThat(bucket.replica(1)).isNotNull(); + assertThatThrownBy(() -> bucket.replica(2)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Requested replica 2 is not a replica of bucket TableBucket{tableId=1, bucket=0}"); + clusterModel.relocateReplica(tb0, 1, 2); + assertThat(bucket.replica(0)).isNotNull(); + assertThat(bucket.replica(2)).isNotNull(); + assertThatThrownBy(() -> bucket.replica(1)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Requested replica 1 is not a replica of bucket TableBucket{tableId=1, bucket=0}"); + } +} diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/RackModelTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/RackModelTest.java new file mode 100644 index 0000000000..e345141e3c --- /dev/null +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/RackModelTest.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.model; + +import org.apache.fluss.metadata.TableBucket; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Test for {@link RackModel}. */ +public class RackModelTest { + + @Test + void testRackModel() { + RackModel rackModel = new RackModel("rack0"); + assertThat(rackModel.rack()).isEqualTo("rack0"); + assertThat(rackModel.server(0)).isNull(); + + ServerModel serverModel = new ServerModel(0, "rack0", true); + rackModel.addServer(serverModel); + assertThat(rackModel.server(0)).isEqualTo(serverModel); + + assertThat(rackModel.removeReplica(0, new TableBucket(1L, 0))).isNull(); + + ReplicaModel replicaModel = new ReplicaModel(new TableBucket(1L, 0), serverModel, false); + rackModel.addReplica(replicaModel); + assertThat(serverModel.replica(new TableBucket(1L, 0))).isEqualTo(replicaModel); + } + + @Test + void testToString() { + RackModel rackModel = new RackModel("rack0"); + assertThat(rackModel.toString()).isEqualTo("RackModel[rack=rack0,servers=0]"); + } +} diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/ReplicaModelTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/ReplicaModelTest.java new file mode 100644 index 0000000000..da07b8de12 --- /dev/null +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/ReplicaModelTest.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.model; + +import org.apache.fluss.metadata.TableBucket; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Test for {@link ReplicaModel}. */ +public class ReplicaModelTest { + + @Test + void testReplicaModel() { + ReplicaModel replicaModel = + new ReplicaModel(new TableBucket(1L, 0), new ServerModel(1, "rack1", true), false); + assertThat(replicaModel.tableBucket()).isEqualTo(new TableBucket(1L, 0)); + assertThat(replicaModel.isLeader()).isFalse(); + assertThat(replicaModel.server().id()).isEqualTo(1); + assertThat(replicaModel.originalServer().id()).isEqualTo(1); + + // make this replica as leader. + replicaModel.makeLeader(); + assertThat(replicaModel.isLeader()).isTrue(); + + // make as follower again. 
+ replicaModel.makeFollower(); + assertThat(replicaModel.isLeader()).isFalse(); + + // set server. + replicaModel.setServer(new ServerModel(2, "rack2", true)); + assertThat(replicaModel.server().id()).isEqualTo(2); + assertThat(replicaModel.originalServer().id()).isEqualTo(1); + } + + @Test + void testToString() { + ReplicaModel replicaModel = + new ReplicaModel(new TableBucket(1L, 0), new ServerModel(1, "rack1", true), false); + assertThat(replicaModel.toString()) + .isEqualTo( + "ReplicaModel[TableBucket=TableBucket{tableId=1, bucket=0},isLeader=false,rack=rack1,server=1,originalServer=1]"); + + replicaModel.makeLeader(); + replicaModel.setServer(new ServerModel(2, "rack2", true)); + assertThat(replicaModel.toString()) + .isEqualTo( + "ReplicaModel[TableBucket=TableBucket{tableId=1, bucket=0},isLeader=true,rack=rack2,server=2,originalServer=1]"); + } + + @Test + void testEquals() { + ReplicaModel replicaModel1 = + new ReplicaModel(new TableBucket(1L, 0), new ServerModel(1, "rack1", true), false); + ReplicaModel replicaModel2 = + new ReplicaModel(new TableBucket(1L, 0), new ServerModel(1, "rack1", true), false); + assertThat(replicaModel1).isEqualTo(replicaModel2); + + replicaModel1.setServer(new ServerModel(2, "rack2", true)); + assertThat(replicaModel1).isEqualTo(replicaModel2); + + replicaModel1.setLeadership(true); + assertThat(replicaModel1).isEqualTo(replicaModel2); + } +} diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/ServerModelTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/ServerModelTest.java new file mode 100644 index 0000000000..66ec87d3a7 --- /dev/null +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/model/ServerModelTest.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.fluss.server.coordinator.rebalance.model; + +import org.apache.fluss.metadata.TableBucket; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Test for {@link ServerModel}. */ +public class ServerModelTest { + + @Test + void testServerModel() { + ServerModel serverModel = new ServerModel(0, "rack0", true); + assertThat(serverModel.id()).isEqualTo(0); + assertThat(serverModel.rack()).isEqualTo("rack0"); + assertThat(serverModel.isAlive()).isTrue(); + + // put some replicas. 
+ TableBucket t1b0 = new TableBucket(1L, 0); + TableBucket t1b1 = new TableBucket(1L, 1); + TableBucket t1b2 = new TableBucket(1L, 2); + TableBucket t2b0 = new TableBucket(2L, 0); + TableBucket t3p0b0 = new TableBucket(3L, 0L, 0); + TableBucket t3p0b1 = new TableBucket(3L, 0L, 1); + TableBucket t3p1b0 = new TableBucket(3L, 1L, 0); + serverModel.putReplica(t1b0, new ReplicaModel(t1b0, serverModel, true)); + serverModel.putReplica(t1b1, new ReplicaModel(t1b1, serverModel, false)); + serverModel.putReplica(t1b2, new ReplicaModel(t1b2, serverModel, false)); + serverModel.putReplica(t2b0, new ReplicaModel(t2b0, serverModel, true)); + serverModel.putReplica(t3p0b0, new ReplicaModel(t3p0b0, serverModel, true)); + serverModel.putReplica(t3p0b1, new ReplicaModel(t3p0b1, serverModel, false)); + serverModel.putReplica(t3p1b0, new ReplicaModel(t3p1b0, serverModel, false)); + + assertThat(serverModel.replicas()).hasSize(7); + assertThat(serverModel.leaderReplicas()).hasSize(3); + assertThat(serverModel.tables()).containsExactly(1L, 2L, 3L); + + // make t1b0 as follower and make t1b1 as leader. + assertThat(serverModel.replica(t1b0).isLeader()).isTrue(); + assertThat(serverModel.replica(t1b1).isLeader()).isFalse(); + serverModel.makeFollower(t1b0); + serverModel.makeLeader(t1b1); + assertThat(serverModel.replica(t1b0).isLeader()).isFalse(); + assertThat(serverModel.replica(t1b1).isLeader()).isTrue(); + + // make t3p0b0 as follower and make t3p0b1 as leader. + assertThat(serverModel.replica(t3p0b0).isLeader()).isTrue(); + assertThat(serverModel.replica(t3p0b1).isLeader()).isFalse(); + serverModel.makeFollower(t3p0b0); + serverModel.makeLeader(t3p0b1); + assertThat(serverModel.replica(t3p0b0).isLeader()).isFalse(); + assertThat(serverModel.replica(t3p0b1).isLeader()).isTrue(); + + // remove replica t2b0 and t3p1b0. + serverModel.removeReplica(t2b0); + serverModel.removeReplica(t3p1b0); + assertThat(serverModel.replicas()).hasSize(5); + assertThat(serverModel.leaderReplicas()).hasSize(2); + assertThat(serverModel.tables()).containsExactly(1L, 3L); + } + + @Test + void testToString() { + ServerModel serverModel = new ServerModel(0, "rack0", true); + assertThat(serverModel.toString()) + .isEqualTo("ServerModel[id=0,rack=rack0,isAlive=true,replicaCount=0]"); + + serverModel.putReplica( + new TableBucket(1L, 0), + new ReplicaModel(new TableBucket(1L, 0), serverModel, false)); + assertThat(serverModel.toString()) + .isEqualTo("ServerModel[id=0,rack=rack0,isAlive=true,replicaCount=1]"); + } + + @Test + void testEquals() { + // equals by server Id. + ServerModel serverModel1 = new ServerModel(0, "rack0", true); + ServerModel serverModel2 = new ServerModel(0, "rack0", true); + assertThat(serverModel1).isEqualTo(serverModel2); + } + + @Test + void testCompareTo() { + // order by server Id. 
+ ServerModel serverModel1 = new ServerModel(0, "rack0", true); + ServerModel serverModel2 = new ServerModel(1, "rack1", true); + assertThat(serverModel1.compareTo(serverModel2)).isEqualTo(-1); + } +} diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionAlgorithmsTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionTest.java similarity index 60% rename from fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionAlgorithmsTest.java rename to fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionTest.java index a530a19293..d1dc77d61a 100644 --- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionAlgorithmsTest.java +++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaLeaderElectionTest.java @@ -17,6 +17,9 @@ package org.apache.fluss.server.coordinator.statemachine; +import org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElection.ControlledShutdownLeaderElection; +import org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElection.DefaultLeaderElection; +import org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElection.ReassignmentLeaderElection; import org.apache.fluss.server.coordinator.statemachine.TableBucketStateMachine.ElectionResult; import org.apache.fluss.server.zk.data.LeaderAndIsr; @@ -29,27 +32,10 @@ import java.util.Optional; import java.util.Set; -import static org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElectionAlgorithms.controlledShutdownReplicaLeaderElection; -import static org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElectionAlgorithms.defaultReplicaLeaderElection; -import static org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElectionAlgorithms.initReplicaLeaderElection; import static org.assertj.core.api.Assertions.assertThat; -/** Test for {@link ReplicaLeaderElectionAlgorithms}. */ -public class ReplicaLeaderElectionAlgorithmsTest { - - @Test - void testInitReplicaLeaderElection() { - List assignments = Arrays.asList(2, 4); - List liveReplicas = Collections.singletonList(4); - - Optional leaderElectionResultOpt = - initReplicaLeaderElection(assignments, liveReplicas, 0); - assertThat(leaderElectionResultOpt.isPresent()).isTrue(); - ElectionResult leaderElectionResult = leaderElectionResultOpt.get(); - assertThat(leaderElectionResult.getLiveReplicas()).containsExactlyInAnyOrder(4); - assertThat(leaderElectionResult.getLeaderAndIsr().leader()).isEqualTo(4); - assertThat(leaderElectionResult.getLeaderAndIsr().isr()).containsExactlyInAnyOrder(4); - } +/** Test for different implementations of {@link ReplicaLeaderElection}.
+public class ReplicaLeaderElectionTest {
 
     @Test
     void testDefaultReplicaLeaderElection() {
@@ -57,8 +43,9 @@ void testDefaultReplicaLeaderElection() {
         List<Integer> liveReplicas = Arrays.asList(2, 4);
         LeaderAndIsr originLeaderAndIsr = new LeaderAndIsr(4, 0, Arrays.asList(2, 4), 0, 0);
 
+        DefaultLeaderElection defaultLeaderElection = new DefaultLeaderElection();
         Optional<ElectionResult> leaderElectionResultOpt =
-                defaultReplicaLeaderElection(assignments, liveReplicas, originLeaderAndIsr);
+                defaultLeaderElection.leaderElection(assignments, liveReplicas, originLeaderAndIsr);
         assertThat(leaderElectionResultOpt.isPresent()).isTrue();
         ElectionResult leaderElectionResult = leaderElectionResultOpt.get();
         assertThat(leaderElectionResult.getLiveReplicas()).containsExactlyInAnyOrder(2, 4);
@@ -73,8 +60,10 @@ void testControlledShutdownReplicaLeaderElection() {
         LeaderAndIsr originLeaderAndIsr = new LeaderAndIsr(2, 0, Arrays.asList(2, 4), 0, 0);
         Set<Integer> shutdownTabletServers = Collections.singleton(2);
 
+        ControlledShutdownLeaderElection controlledShutdownLeaderElection =
+                new ControlledShutdownLeaderElection();
         Optional<ElectionResult> leaderElectionResultOpt =
-                controlledShutdownReplicaLeaderElection(
+                controlledShutdownLeaderElection.leaderElection(
                         assignments, liveReplicas, originLeaderAndIsr, shutdownTabletServers);
         assertThat(leaderElectionResultOpt.isPresent()).isTrue();
         ElectionResult leaderElectionResult = leaderElectionResultOpt.get();
@@ -91,8 +80,10 @@ void testControlledShutdownReplicaLeaderElectionLastIsrShuttingDown() {
                 new LeaderAndIsr(2, 0, Collections.singletonList(2), 0, 0);
         Set<Integer> shutdownTabletServers = Collections.singleton(2);
 
+        ControlledShutdownLeaderElection controlledShutdownLeaderElection =
+                new ControlledShutdownLeaderElection();
         Optional<ElectionResult> leaderElectionResultOpt =
-                controlledShutdownReplicaLeaderElection(
+                controlledShutdownLeaderElection.leaderElection(
                         assignments, liveReplicas, originLeaderAndIsr, shutdownTabletServers);
         assertThat(leaderElectionResultOpt).isEmpty();
     }
@@ -104,9 +95,39 @@ void testControlledShutdownPartitionLeaderElectionAllIsrSimultaneouslyShutdown()
         LeaderAndIsr originLeaderAndIsr = new LeaderAndIsr(2, 0, Arrays.asList(2, 4), 0, 0);
         Set<Integer> shutdownTabletServers = new HashSet<>(Arrays.asList(2, 4));
 
+        ControlledShutdownLeaderElection controlledShutdownLeaderElection =
+                new ControlledShutdownLeaderElection();
         Optional<ElectionResult> leaderElectionResultOpt =
-                controlledShutdownReplicaLeaderElection(
+                controlledShutdownLeaderElection.leaderElection(
                         assignments, liveReplicas, originLeaderAndIsr, shutdownTabletServers);
         assertThat(leaderElectionResultOpt).isEmpty();
     }
+
+    @Test
+    void testReassignBucketLeaderElection() {
+        List<Integer> targetReplicas = Arrays.asList(1, 2, 3);
+        ReassignmentLeaderElection reassignmentLeaderElection =
+                new ReassignmentLeaderElection(targetReplicas);
+        List<Integer> liveReplicas = Arrays.asList(1, 2, 3);
+        LeaderAndIsr leaderAndIsr = new LeaderAndIsr(1, 0, Arrays.asList(1, 2, 3), 0, 0);
+        Optional<ElectionResult> leaderOpt =
+                reassignmentLeaderElection.leaderElection(liveReplicas, leaderAndIsr);
+        assertThat(leaderOpt).isPresent();
+        assertThat(leaderOpt.get().getLeaderAndIsr().leader()).isEqualTo(1);
+
+        targetReplicas = Arrays.asList(1, 2, 3);
+        reassignmentLeaderElection = new ReassignmentLeaderElection(targetReplicas);
+        liveReplicas = Arrays.asList(2, 3);
+        leaderAndIsr = new LeaderAndIsr(1, 0, Arrays.asList(2, 3), 0, 0);
+        leaderOpt = reassignmentLeaderElection.leaderElection(liveReplicas, leaderAndIsr);
+        assertThat(leaderOpt).isPresent();
+        assertThat(leaderOpt.get().getLeaderAndIsr().leader()).isEqualTo(2);
+
+        targetReplicas = Arrays.asList(1, 2, 3);
+        reassignmentLeaderElection = new ReassignmentLeaderElection(targetReplicas);
+        liveReplicas = Arrays.asList(1, 2);
+        leaderAndIsr = new LeaderAndIsr(2, 1, Collections.emptyList(), 0, 1);
+        leaderOpt = reassignmentLeaderElection.leaderElection(liveReplicas, leaderAndIsr);
+        assertThat(leaderOpt).isNotPresent();
+    }
 }
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachineTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachineTest.java
index c57c9950e5..bafb477c54 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachineTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachineTest.java
@@ -35,6 +35,8 @@
 import org.apache.fluss.server.coordinator.MetadataManager;
 import org.apache.fluss.server.coordinator.TestCoordinatorChannelManager;
 import org.apache.fluss.server.coordinator.event.CoordinatorEventManager;
+import org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElection.ControlledShutdownLeaderElection;
+import org.apache.fluss.server.coordinator.statemachine.TableBucketStateMachine.ElectionResult;
 import org.apache.fluss.server.metadata.CoordinatorMetadataCache;
 import org.apache.fluss.server.metrics.group.TestingMetricGroups;
 import org.apache.fluss.server.zk.NOPErrorHandler;
@@ -66,7 +68,7 @@
 import static org.apache.fluss.server.coordinator.statemachine.BucketState.NonExistentBucket;
 import static org.apache.fluss.server.coordinator.statemachine.BucketState.OfflineBucket;
 import static org.apache.fluss.server.coordinator.statemachine.BucketState.OnlineBucket;
-import static org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElectionStrategy.CONTROLLED_SHUTDOWN_ELECTION;
+import static org.apache.fluss.server.coordinator.statemachine.TableBucketStateMachine.initReplicaLeaderElection;
 import static org.apache.fluss.testutils.common.CommonTestUtils.retry;
 import static org.assertj.core.api.Assertions.assertThat;
@@ -238,6 +240,9 @@ void testStateChangeToOnline() throws Exception {
         coordinatorContext.putBucketState(tableBucket, OfflineBucket);
         coordinatorContext.setLiveTabletServers(createServers(Collections.emptyList()));
         tableBucketStateMachine.handleStateChange(Collections.singleton(tableBucket), OnlineBucket);
+        coordinatorContext.setLiveTabletServers(
+                CoordinatorTestUtils.createServers(Collections.emptyList()));
+        tableBucketStateMachine.handleStateChange(Collections.singleton(tableBucket), OnlineBucket);
         // the state will still be offline
         assertThat(coordinatorContext.getBucketState(tableBucket)).isEqualTo(OfflineBucket);
@@ -376,12 +381,26 @@ void testStateChangeForTabletServerControlledShutdown() {
         // handle state change for controlled shutdown.
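        // Editor's note (assumption, not part of the patch): the controlled-shutdown election is expected to
        // move leadership off the shutting-down tablet server, which is what the assertions below verify.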
         tableBucketStateMachine.handleStateChange(
-                Collections.singleton(tb), OnlineBucket, CONTROLLED_SHUTDOWN_ELECTION);
+                Collections.singleton(tb), OnlineBucket, new ControlledShutdownLeaderElection());
         assertThat(coordinatorContext.getBucketState(tb)).isEqualTo(OnlineBucket);
         assertThat(coordinatorContext.getBucketLeaderAndIsr(tb).get().leader())
                 .isNotEqualTo(oldLeader);
     }
 
+    @Test
+    void testInitReplicaLeaderElection() {
+        List<Integer> assignments = Arrays.asList(2, 4);
+        List<Integer> liveReplicas = Collections.singletonList(4);
+
+        Optional<ElectionResult> leaderElectionResultOpt =
+                initReplicaLeaderElection(assignments, liveReplicas, 0);
+        assertThat(leaderElectionResultOpt.isPresent()).isTrue();
+        ElectionResult leaderElectionResult = leaderElectionResultOpt.get();
+        assertThat(leaderElectionResult.getLiveReplicas()).containsExactlyInAnyOrder(4);
+        assertThat(leaderElectionResult.getLeaderAndIsr().leader()).isEqualTo(4);
+        assertThat(leaderElectionResult.getLeaderAndIsr().isr()).containsExactlyInAnyOrder(4);
+    }
+
     private TableBucketStateMachine createTableBucketStateMachine() {
         return new TableBucketStateMachine(
                 coordinatorContext, coordinatorRequestBatch, zookeeperClient);
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/testutils/FlussClusterExtension.java b/fluss-server/src/test/java/org/apache/fluss/server/testutils/FlussClusterExtension.java
index bab31f10b2..b63138ea8c 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/testutils/FlussClusterExtension.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/testutils/FlussClusterExtension.java
@@ -47,6 +47,7 @@
 import org.apache.fluss.server.coordinator.CoordinatorServer;
 import org.apache.fluss.server.coordinator.LakeCatalogDynamicLoader;
 import org.apache.fluss.server.coordinator.MetadataManager;
+import org.apache.fluss.server.coordinator.rebalance.RebalanceManager;
 import org.apache.fluss.server.entity.NotifyLeaderAndIsrData;
 import org.apache.fluss.server.kv.snapshot.CompletedSnapshot;
 import org.apache.fluss.server.kv.snapshot.CompletedSnapshotHandle;
@@ -461,6 +462,10 @@ public ZooKeeperClient getZooKeeperClient() {
         return zooKeeperClient;
     }
 
+    public RebalanceManager getRebalanceManager() {
+        return coordinatorServer.getRebalanceManager();
+    }
+
     public RpcClient getRpcClient() {
         return rpcClient;
     }
 }
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/zk/ZooKeeperClientTest.java b/fluss-server/src/test/java/org/apache/fluss/server/zk/ZooKeeperClientTest.java
index 64da4b95cb..5b559898dd 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/zk/ZooKeeperClientTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/zk/ZooKeeperClientTest.java
@@ -613,9 +613,10 @@ void testRebalancePlan() throws Exception {
                         1,
                         Arrays.asList(0, 1, 2),
                         Arrays.asList(1, 2, 3)));
-        zookeeperClient.registerRebalancePlan(new RebalancePlan(NOT_STARTED, bucketPlan));
+        zookeeperClient.registerRebalancePlan(
+                new RebalancePlan("rebalance-task-1", NOT_STARTED, bucketPlan));
         assertThat(zookeeperClient.getRebalancePlan())
-                .hasValue(new RebalancePlan(NOT_STARTED, bucketPlan));
+                .hasValue(new RebalancePlan("rebalance-task-1", NOT_STARTED, bucketPlan));
 
         bucketPlan = new HashMap<>();
         bucketPlan.put(
@@ -626,13 +627,15 @@
                        3,
                        Arrays.asList(0, 1, 2),
                        Arrays.asList(3, 4, 5)));
-        zookeeperClient.updateRebalancePlan(new RebalancePlan(NOT_STARTED, bucketPlan));
+        zookeeperClient.updateRebalancePlan(
+                new RebalancePlan("rebalance-task-2", NOT_STARTED, bucketPlan));
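        // Editor's note (assumption, not part of the patch): updateRebalancePlan presumably overwrites the
        // stored plan node, so the read-back below should carry the new rebalance id "rebalance-task-2".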
assertThat(zookeeperClient.getRebalancePlan()) - .hasValue(new RebalancePlan(NOT_STARTED, bucketPlan)); + .hasValue(new RebalancePlan("rebalance-task-2", NOT_STARTED, bucketPlan)); - zookeeperClient.updateRebalancePlan(new RebalancePlan(COMPLETED, bucketPlan)); + zookeeperClient.updateRebalancePlan( + new RebalancePlan("rebalance-task-2", COMPLETED, bucketPlan)); assertThat(zookeeperClient.getRebalancePlan()) - .hasValue(new RebalancePlan(COMPLETED, bucketPlan)); + .hasValue(new RebalancePlan("rebalance-task-2", COMPLETED, bucketPlan)); } @Test diff --git a/fluss-server/src/test/java/org/apache/fluss/server/zk/data/RebalancePlanJsonSerdeTest.java b/fluss-server/src/test/java/org/apache/fluss/server/zk/data/RebalancePlanJsonSerdeTest.java index 1bd8ad426e..cae25f2118 100644 --- a/fluss-server/src/test/java/org/apache/fluss/server/zk/data/RebalancePlanJsonSerdeTest.java +++ b/fluss-server/src/test/java/org/apache/fluss/server/zk/data/RebalancePlanJsonSerdeTest.java @@ -79,13 +79,15 @@ protected RebalancePlan[] createObjects() { 3, Arrays.asList(0, 1, 2), Arrays.asList(3, 4, 5))); - return new RebalancePlan[] {new RebalancePlan(NOT_STARTED, bucketPlan)}; + return new RebalancePlan[] { + new RebalancePlan("rebalance-task-21jd", NOT_STARTED, bucketPlan) + }; } @Override protected String[] expectedJsons() { return new String[] { - "{\"version\":1,\"rebalance_status\":1,\"rebalance_plan\":" + "{\"version\":1,\"rebalance_id\":\"rebalance-task-21jd\",\"rebalance_status\":1,\"rebalance_plan\":" + "[{\"table_id\":0,\"buckets\":" + "[{\"bucket_id\":1,\"original_leader\":1,\"new_leader\":1,\"origin_replicas\":[0,1,2],\"new_replicas\":[1,2,3]}," + "{\"bucket_id\":0,\"original_leader\":0,\"new_leader\":3,\"origin_replicas\":[0,1,2],\"new_replicas\":[3,4,5]}]},"