-
Notifications
You must be signed in to change notification settings - Fork 600
HDDS-15034. Query SCM status for ozone admin upgrade status command #10084
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: HDDS-14496-zdu
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -116,6 +116,7 @@ | |
| import org.apache.hadoop.ozone.audit.AuditMessage; | ||
| import org.apache.hadoop.ozone.audit.Auditor; | ||
| import org.apache.hadoop.ozone.audit.SCMAction; | ||
| import org.apache.hadoop.ozone.upgrade.UpgradeFinalization; | ||
| import org.apache.hadoop.ozone.upgrade.UpgradeFinalization.StatusAndMessages; | ||
| import org.apache.hadoop.security.UserGroupInformation; | ||
| import org.apache.hadoop.security.token.Token; | ||
|
|
@@ -1153,13 +1154,12 @@ public HddsProtos.UpgradeStatus queryUpgradeStatus() throws IOException { | |
| try { | ||
| getScm().checkAdminAccess(getRemoteUser(), true); | ||
|
|
||
| // Returning a placeholder for now. | ||
| HddsProtos.UpgradeStatus result = HddsProtos.UpgradeStatus.newBuilder() | ||
| .setScmFinalized(true) | ||
| .setNumDatanodesFinalized(10) | ||
| .setNumDatanodesTotal(10) | ||
| .setShouldFinalize(true) | ||
| .build(); | ||
| UpgradeFinalization.Status scmUpgradeStatus = | ||
| scm.getLayoutVersionManager().getUpgradeState(); | ||
| int totalDatanodes = scm.getScmNodeManager().getAllNodeCount(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just realised that nodeManager probably remembers any dead or decommissioned nodes and they will never finalize. We actually want the count of all IN_SERVICE and HEALTHY nodes as those are the ones that should finalize or end up going dead as they are not responding. So I think we need to use this API: Where the NodeStatus is IN_SERVICE + HEALTHY. |
||
| int finalizedDatanodes = scm.getScmNodeManager().getNumDatanodesFinalized(); | ||
| HddsProtos.UpgradeStatus result = buildUpgradeStatus( | ||
| scmUpgradeStatus, finalizedDatanodes, totalDatanodes); | ||
| AUDIT.logReadSuccess(buildAuditMessageForSuccess(SCMAction.QUERY_UPGRADE_STATUS, null)); | ||
| return result; | ||
| } catch (IOException ex) { | ||
|
|
@@ -1168,6 +1168,29 @@ public HddsProtos.UpgradeStatus queryUpgradeStatus() throws IOException { | |
| } | ||
| } | ||
|
|
||
| static HddsProtos.UpgradeStatus buildUpgradeStatus( | ||
| UpgradeFinalization.Status scmUpgradeStatus, | ||
| int finalizedDatanodes, | ||
| int totalDatanodes) { | ||
| return HddsProtos.UpgradeStatus.newBuilder() | ||
| .setScmFinalized(isScmFinalized(scmUpgradeStatus)) | ||
| .setNumDatanodesFinalized(finalizedDatanodes) | ||
| .setNumDatanodesTotal(totalDatanodes) | ||
| .setShouldFinalize(shouldFinalize(scmUpgradeStatus, finalizedDatanodes, totalDatanodes)) | ||
| .build(); | ||
| } | ||
|
|
||
| static boolean isScmFinalized(UpgradeFinalization.Status scmUpgradeStatus) { | ||
| return UpgradeFinalization.isFinalized(scmUpgradeStatus) | ||
| || UpgradeFinalization.isDone(scmUpgradeStatus); | ||
| } | ||
|
|
||
| static boolean shouldFinalize(UpgradeFinalization.Status scmUpgradeStatus, | ||
| int finalizedDatanodes, int totalDatanodes) { | ||
| return UpgradeFinalization.Status.FINALIZATION_REQUIRED.equals(scmUpgradeStatus) | ||
| && finalizedDatanodes == totalDatanodes; | ||
| } | ||
|
|
||
| @Override | ||
| public StartContainerBalancerResponseProto startContainerBalancer( | ||
| Optional<Double> threshold, Optional<Integer> iterations, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -737,6 +737,65 @@ public void testProcessLayoutVersion() throws IOException { | |
| testProcessLayoutVersionReportHigherMlv(); | ||
| } | ||
|
|
||
| @Test | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need a test for registration when a DN is finalized on registration, and not finalized on registration and when the remove_node is called too. This test covers the heartbeat part I think
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, added tests for this scenario. |
||
| public void testDatanodeFinalizedCounterTracksLayoutVersionReports() | ||
| throws IOException, AuthenticationException { | ||
| try (SCMNodeManager nodeManager = createNodeManager(getConf())) { | ||
| DatanodeDetails node = | ||
| HddsTestUtils.createRandomDatanodeAndRegister(nodeManager); | ||
| assertEquals(1, nodeManager.getNumDatanodesFinalized(), | ||
| "Initial datanode should be counted as finalized"); | ||
|
|
||
| int softwareVersion = | ||
| nodeManager.getLayoutVersionManager().getSoftwareLayoutVersion(); | ||
| int metadataVersion = | ||
| nodeManager.getLayoutVersionManager().getMetadataLayoutVersion(); | ||
| nodeManager.processLayoutVersionReport(node, | ||
| LayoutVersionProto.newBuilder() | ||
| .setMetadataLayoutVersion(metadataVersion - 1) | ||
| .setSoftwareLayoutVersion(softwareVersion) | ||
| .build()); | ||
| assertEquals(0, nodeManager.getNumDatanodesFinalized(), | ||
| "Lower metadata layout version should decrement finalized count"); | ||
|
|
||
| nodeManager.processLayoutVersionReport(node, | ||
| LayoutVersionProto.newBuilder() | ||
| .setMetadataLayoutVersion(metadataVersion) | ||
| .setSoftwareLayoutVersion(softwareVersion) | ||
| .build()); | ||
| assertEquals(1, nodeManager.getNumDatanodesFinalized(), | ||
| "Restored metadata layout version should restore finalized count"); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| public void testDatanodeFinalizedCounterTracksRegistrationAndRemoveNode() | ||
| throws IOException, AuthenticationException, NodeNotFoundException { | ||
| try (SCMNodeManager nodeManager = createNodeManager(getConf())) { | ||
| DatanodeDetails finalizedNode = | ||
| registerWithCapacity(nodeManager, CORRECT_LAYOUT_PROTO, success); | ||
| assertEquals(1, nodeManager.getNumDatanodesFinalized(), | ||
| "Finalized registration should increment finalized count"); | ||
|
|
||
| DatanodeDetails nonFinalizedNode = | ||
| registerWithCapacity(nodeManager, SMALLER_MLV_LAYOUT_PROTO, success); | ||
| assertEquals(1, nodeManager.getNumDatanodesFinalized(), | ||
| "Non-finalized registration should not increment finalized count"); | ||
|
|
||
| nonFinalizedNode.setPersistedOpState( | ||
| HddsProtos.NodeOperationalState.DECOMMISSIONED); | ||
| nodeManager.removeNode(nonFinalizedNode); | ||
| assertEquals(1, nodeManager.getNumDatanodesFinalized(), | ||
| "Removing a non-finalized node should not change finalized count"); | ||
|
|
||
| finalizedNode.setPersistedOpState( | ||
| HddsProtos.NodeOperationalState.DECOMMISSIONED); | ||
| nodeManager.removeNode(finalizedNode); | ||
| assertEquals(0, nodeManager.getNumDatanodesFinalized(), | ||
| "Removing a finalized node should decrement finalized count"); | ||
| } | ||
| } | ||
|
|
||
| // Currently invoked by testProcessLayoutVersion. | ||
| public void testProcessLayoutVersionReportHigherMlv() | ||
| throws IOException { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should be the IN_SERVICE + HEALTHY node, rather than all. I guess it might be OK iterating all nodes, but there is a chance that there is some DEAD node that was finalized, and then in the other place where we get the total datanode count it could make the counts match when they should not. Eg we have a finalized decommissioned node, and an unfinalized in_service node and the decommissioned one counts +1 when it should not.
Really what we want to ensure is that all nodes that can finalize, are finalized. Nodes unable to finalize, by being dead should not block finalization.
I think a dead node has to be registered when it comes back alive, at least if the DN process is restarted. If the DN process is just hung for some time preventing the heart beats, the node can be moved back to IN_SERVICE.
I wonder if such a node should be forced to finalize before it rejoins - certainly on registration we can make a node finalize. I think this is out of scope for this PR, but we need to think about it as another change.