Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 44 additions & 3 deletions hadoop-hdds/common/src/main/resources/ozone-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3607,11 +3607,23 @@
</property>
<property>
<name>ozone.recon.scm.container.threshold</name>
<value>100</value>
<value>10000</value>
<tag>OZONE, RECON, SCM</tag>
<description>
Non-OPEN container count drift threshold above which Recon escalates from
incremental SCM container sync to a full SCM DB snapshot sync. Missing
OPEN containers stay on the incremental path because they are short-lived
and can be repaired cheaply without replacing the full SCM DB.
</description>
</property>
<property>
<name>ozone.recon.scm.per.state.drift.threshold</name>
<value>5</value>
<tag>OZONE, RECON, SCM</tag>
<description>
Threshold value for the difference in number of containers
in SCM and RECON.
Per-state lifecycle drift threshold used when SCM and Recon total container
counts are equal. If OPEN, QUASI_CLOSED, or derived CLOSED counts differ by
more than this value, Recon triggers a targeted SCM container sync.
</description>
</property>
<property>
Expand Down Expand Up @@ -4600,6 +4612,35 @@
Interval, in MINUTES, at which Recon requests an SCM DB snapshot.
</description>
</property>
<property>
<name>ozone.recon.scm.container.sync.task.initial.delay</name>
<value>2m</value>
<tag>OZONE, MANAGEMENT, RECON, SCM</tag>
<description>
Initial delay before Recon starts the incremental SCM container sync task.
This is slightly later than the SCM snapshot initial delay so the snapshot
can initialize Recon's SCM DB before the first incremental sync runs.
</description>
</property>
<property>
<name>ozone.recon.scm.container.sync.task.interval.delay</name>
<value>1h</value>
<tag>OZONE, MANAGEMENT, RECON, SCM</tag>
<description>
Interval between incremental SCM container sync runs in Recon. Each cycle
evaluates drift between SCM and Recon and either runs the targeted
multi-pass sync or takes no action.
</description>
</property>
<property>
<name>ozone.recon.scm.deleted.container.check.batch.size</name>
<value>500</value>
<tag>OZONE, RECON, SCM, PERFORMANCE</tag>
<description>
Maximum number of CLOSED or QUASI_CLOSED Recon containers checked against
SCM per incremental sync cycle for DELETING or DELETED retirement.
</description>
</property>
<property>
<name>ozone.om.snapshot.compaction.dag.max.time.allowed</name>
<value>30d</value>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1248,10 +1248,12 @@ public long getContainerCount() throws IOException {
public long getContainerCount(HddsProtos.LifeCycleState state)
throws IOException {
GetContainerCountRequestProto request =
GetContainerCountRequestProto.newBuilder().build();
GetContainerCountRequestProto.newBuilder()
.setState(state)
.build();

GetContainerCountResponseProto response =
submitRequest(Type.GetClosedContainerCount,
submitRequest(Type.GetContainerCount,
builder -> builder.setGetContainerCountRequest(request))
.getGetContainerCountResponse();
return response.getContainerCount();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,7 @@ message GetPipelineResponseProto {
}

message GetContainerCountRequestProto {
optional LifeCycleState state = 1;
}

message GetContainerCountResponseProto {
Expand Down
12 changes: 10 additions & 2 deletions hadoop-hdds/interface-admin/src/main/resources/proto.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1544,7 +1544,15 @@
]
},
{
"name": "GetContainerCountRequestProto"
"name": "GetContainerCountRequestProto",
"fields": [
{
"id": 1,
"name": "state",
"type": "LifeCycleState",
"optional": true
}
]
},
{
"name": "GetContainerCountResponseProto",
Expand Down Expand Up @@ -2358,4 +2366,4 @@
}
}
]
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,8 @@ private void initialize() throws IOException {
final ContainerInfo container = iterator.next();
Objects.requireNonNull(container, "container == null");
containers.addContainer(container);
if (container.getState() == LifeCycleState.OPEN) {
if (container.getState() == LifeCycleState.OPEN
&& container.getPipelineID() != null) {
try {
pipelineManager.addContainerToPipelineSCMStart(
container.getPipelineID(), container.containerID());
Expand All @@ -260,8 +261,12 @@ private void initialize() throws IOException {
getContainerStateChangeActions() {
final Map<LifeCycleEvent, CheckedConsumer<ContainerInfo, IOException>>
actions = new EnumMap<>(LifeCycleEvent.class);
actions.put(FINALIZE, info -> pipelineManager
.removeContainerFromPipeline(info.getPipelineID(), info.containerID()));
actions.put(FINALIZE, info -> {
if (info.getPipelineID() != null) {
pipelineManager.removeContainerFromPipeline(
info.getPipelineID(), info.containerID());
}
});
return actions;
}

Expand Down Expand Up @@ -334,12 +339,16 @@ public void addContainer(final ContainerInfoProto containerInfo)
transactionBuffer.addToBuffer(containerStore,
containerID, container);
containers.addContainer(container);
if (pipelineManager.containsPipeline(pipelineID)) {
if (pipelineID != null && pipelineManager.containsPipeline(pipelineID)) {
pipelineManager.addContainerToPipeline(pipelineID, containerID);
} else if (containerInfo.getState().
equals(LifeCycleState.OPEN)) {
// Pipeline should exist, but not
throw new PipelineNotFoundException();
if (pipelineID != null) {
// OPEN containers normally require a live pipeline reference.
throw new PipelineNotFoundException();
}
LOG.warn("Adding OPEN container {} without pipeline tracking "
+ "because its pipeline ID is null.", containerID);
}
// Recon may receive a report for a closed container for which
// no corresponding pipeline can be synced from SCM.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1350,9 +1350,12 @@ public DatanodeUsageInfoResponseProto getDatanodeUsageInfo(
public GetContainerCountResponseProto getContainerCount(
StorageContainerLocationProtocolProtos.GetContainerCountRequestProto
request) throws IOException {
long containerCount = request.hasState()
? impl.getContainerCount(request.getState())
: impl.getContainerCount();

return GetContainerCountResponseProto.newBuilder()
.setContainerCount(impl.getContainerCount())
.setContainerCount(containerCount)
.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -415,8 +415,21 @@ public List<ContainerWithPipeline> getExistContainerWithPipelinesInBatch(
ContainerWithPipeline cp = getContainerWithPipelineCommon(containerID);
cpList.add(cp);
} catch (IOException ex) {
//not found , just go ahead
LOG.error("Container with common pipeline not found: {}", ex);
// Pipeline lookup failed (e.g., QUASI_CLOSED container whose pipeline
// has already been cleaned up). Return the container metadata without a
// pipeline so that callers (e.g., Recon's sync) can still record the
// container rather than losing it silently.
LOG.warn("Pipeline lookup failed for container {}; returning container "
+ "without pipeline. Cause: {}", containerID, ex.getMessage());
try {
ContainerInfo info = scm.getContainerManager()
.getContainer(ContainerID.valueOf(containerID));
cpList.add(new ContainerWithPipeline(info, null));
} catch (ContainerNotFoundException notFound) {
// Container truly does not exist in SCM — exclude it from the result.
LOG.error("Container {} not found in SCM and will not be returned "
+ "to caller.", containerID, notFound);
}
}
}
return cpList;
Expand Down
16 changes: 13 additions & 3 deletions hadoop-ozone/dist/src/main/compose/ozone/docker-config
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ CORE-SITE.XML_hadoop.proxyuser.hadoop.groups=*
OZONE-SITE.XML_ozone.om.address=om
OZONE-SITE.XML_ozone.om.http-address=om:9874
OZONE-SITE.XML_ozone.scm.http-address=scm:9876
OZONE-SITE.XML_ozone.scm.container.size=1GB
OZONE-SITE.XML_ozone.scm.container.size=100MB
OZONE-SITE.XML_ozone.scm.block.size=1MB
OZONE-SITE.XML_ozone.scm.datanode.ratis.volume.free-space.min=10MB
OZONE-SITE.XML_ozone.scm.pipeline.creation.interval=30s
Expand All @@ -43,6 +43,16 @@ OZONE-SITE.XML_ozone.recon.http-address=0.0.0.0:9888
OZONE-SITE.XML_ozone.recon.https-address=0.0.0.0:9889
OZONE-SITE.XML_ozone.recon.om.snapshot.task.interval.delay=1m
OZONE-SITE.XML_ozone.recon.om.snapshot.task.initial.delay=20s
OZONE-SITE.XML_ozone.recon.scm.container.sync.task.initial.delay=30s
OZONE-SITE.XML_ozone.recon.scm.container.sync.task.interval.delay=2m
OZONE-SITE.XML_ozone.recon.scm.snapshot.task.initial.delay=20s
OZONE-SITE.XML_ozone.recon.scm.snapshot.task.interval.delay=30m
OZONE-SITE.XML_ozone.recon.scm.container.threshold=20
OZONE-SITE.XML_ozone.recon.scm.per.state.drift.threshold=1
OZONE-SITE.XML_ozone.recon.scm.deleted.container.check.batch.size=50
OZONE-SITE.XML_hdds.heartbeat.recon.interval=5m
OZONE-SITE.XML_hdds.container.report.interval=1h
OZONE-SITE.XML_hdds.pipeline.report.interval=5m
OZONE-SITE.XML_ozone.datanode.pipeline.limit=1
OZONE-SITE.XML_hdds.scmclient.max.retry.timeout=30s
OZONE-SITE.XML_hdds.container.report.interval=60s
Expand All @@ -51,8 +61,8 @@ OZONE-SITE.XML_ozone.scm.dead.node.interval=45s
OZONE-SITE.XML_hdds.heartbeat.interval=5s
OZONE-SITE.XML_ozone.scm.close.container.wait.duration=5s
OZONE-SITE.XML_hdds.scm.replication.thread.interval=15s
OZONE-SITE.XML_hdds.scm.replication.under.replicated.interval=5s
OZONE-SITE.XML_hdds.scm.replication.over.replicated.interval=5s
OZONE-SITE.XML_hdds.scm.replication.under.replicated.interval=10s
OZONE-SITE.XML_hdds.scm.replication.over.replicated.interval=2m
OZONE-SITE.XML_hdds.scm.wait.time.after.safemode.exit=30s
OZONE-SITE.XML_ozone.http.basedir=/tmp/ozone_http

Expand Down
Loading