Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions hadoop-hdds/interface-client/src/main/proto/hdds.proto
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ message Node {
repeated NodeOperationalState nodeOperationalStates = 3;
optional int32 totalVolumeCount = 4;
optional int32 healthyVolumeCount = 5;
repeated string failedVolumes = 6;
}

message NodePool {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,14 @@ public void getMetrics(MetricsCollector collector, boolean all) {
Integer.parseInt(nonWritableNodes));
}

String volumeFailures = nodeStatistics.get("VolumeFailures");
if (volumeFailures != null) {
metrics.addGauge(
Interns.info("VolumeFailures",
"Number of datanodes with at least one failed volume"),
Integer.parseInt(volumeFailures));
}

for (Map.Entry<String, Long> e : nodeInfo.entrySet()) {
metrics.addGauge(
Interns.info(e.getKey(), diskMetricDescription(e.getKey())),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionSummary;
import org.apache.hadoop.hdds.protocol.proto.ReconfigureProtocolProtos.ReconfigureProtocolService;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.StorageReportProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
Expand Down Expand Up @@ -655,6 +656,7 @@ public List<HddsProtos.Node> queryNode(
if (datanodeInfo != null) {
nodeBuilder.setTotalVolumeCount(datanodeInfo.getStorageReports().size());
nodeBuilder.setHealthyVolumeCount(datanodeInfo.getHealthyVolumeCount());
addFailedVolumes(nodeBuilder, datanodeInfo);
}
result.add(nodeBuilder.build());
}
Expand Down Expand Up @@ -687,6 +689,7 @@ public HddsProtos.Node queryNode(UUID uuid)
if (datanodeInfo != null) {
nodeBuilder.setTotalVolumeCount(datanodeInfo.getStorageReports().size());
nodeBuilder.setHealthyVolumeCount(datanodeInfo.getHealthyVolumeCount());
addFailedVolumes(nodeBuilder, datanodeInfo);
}
result = nodeBuilder.build();
}
Expand All @@ -702,6 +705,15 @@ public HddsProtos.Node queryNode(UUID uuid)
return result;
}

/**
 * Copies the storage location of every failed volume reported by the
 * datanode into the protobuf {@code Node} builder's repeated
 * {@code failedVolumes} field.
 *
 * @param nodeBuilder  builder for the Node proto being assembled
 * @param datanodeInfo datanode whose storage reports are inspected
 */
private static void addFailedVolumes(HddsProtos.Node.Builder nodeBuilder,
    DatanodeInfo datanodeInfo) {
  datanodeInfo.getStorageReports().stream()
      .filter(report -> report.hasFailed() && report.getFailed())
      .map(StorageReportProto::getStorageLocation)
      .forEach(nodeBuilder::addFailedVolumes);
}

@Override
public List<DatanodeAdminError> decommissionNodes(List<String> nodes, boolean force)
throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
Expand All @@ -41,6 +43,8 @@ public final class BasicDatanodeInfo {
private Integer totalVolumeCount = null;
@JsonInclude(JsonInclude.Include.NON_NULL)
private Integer healthyVolumeCount = null;
@JsonInclude(JsonInclude.Include.NON_EMPTY)
private List<String> failedVolumes = null;

private BasicDatanodeInfo(Builder builder) {
this.dn = builder.dn;
Expand All @@ -51,26 +55,30 @@ private BasicDatanodeInfo(Builder builder) {
this.percentUsed = builder.percentUsed;
this.totalVolumeCount = builder.totalVolumeCount;
this.healthyVolumeCount = builder.healthyVolumeCount;
this.failedVolumes = builder.failedVolumes;
}

/**
* Builder class for creating instances of BasicDatanodeInfo.
*/
public static class Builder {
private DatanodeDetails dn;
private HddsProtos.NodeOperationalState opState;
private HddsProtos.NodeState healthState;
private final DatanodeDetails dn;
private final HddsProtos.NodeOperationalState opState;
private final HddsProtos.NodeState healthState;
private Long used;
private Long capacity;
private Double percentUsed;
private Integer totalVolumeCount;
private Integer healthyVolumeCount;

public Builder(DatanodeDetails dn, HddsProtos.NodeOperationalState opState,
HddsProtos.NodeState healthState) {
this.dn = dn;
this.opState = opState;
this.healthState = healthState;
private final Integer totalVolumeCount;
private final Integer healthyVolumeCount;
private final List<String> failedVolumes;

/**
 * Initializes all builder fields from a protobuf {@code Node}.
 *
 * @param node node proto returned by SCM's queryNode API
 */
public Builder(HddsProtos.Node node) {
  dn = DatanodeDetails.getFromProtoBuf(node.getNodeID());
  // NOTE(review): assumes the proto carries at least one health state and
  // one operational state; getNodeStates(0)/getNodeOperationalStates(0)
  // throw IndexOutOfBoundsException on an empty list — confirm callers
  // always populate these.
  healthState = node.getNodeStates(0);
  opState = node.getNodeOperationalStates(0);
  // Optional proto fields become null when absent, so JSON serialization
  // (configured with NON_NULL/NON_EMPTY includes) can omit them.
  totalVolumeCount = node.hasTotalVolumeCount() ? node.getTotalVolumeCount() : null;
  healthyVolumeCount = node.hasHealthyVolumeCount() ? node.getHealthyVolumeCount() : null;
  failedVolumes = getFailedVolumes(node);
}

public Builder withUsageInfo(long usedBytes, long capacityBytes, double percentUsedBytes) {
Expand All @@ -80,12 +88,6 @@ public Builder withUsageInfo(long usedBytes, long capacityBytes, double percentU
return this;
}

/**
 * Sets the total and healthy volume counts for the datanode.
 * Either value may be null when the information is unavailable.
 *
 * @param total   total number of volumes, or null if unknown
 * @param healthy number of healthy volumes, or null if unknown
 * @return this builder for chaining
 */
public Builder withVolumeCounts(Integer total, Integer healthy) {
  this.totalVolumeCount = total;
  this.healthyVolumeCount = healthy;
  return this;
}

/**
 * Builds an immutable {@link BasicDatanodeInfo} from this builder's state.
 *
 * @return the constructed BasicDatanodeInfo
 */
public BasicDatanodeInfo build() {
  return new BasicDatanodeInfo(this);
}
Expand Down Expand Up @@ -206,8 +208,26 @@ public Integer getHealthyVolumeCount() {
return healthyVolumeCount;
}

/**
 * Returns the storage locations of this datanode's failed volumes.
 * Serialized to JSON only when non-empty (NON_EMPTY include on the field).
 *
 * @return list of failed volume locations; may be empty
 */
@JsonProperty(index = 112)
public List<String> getFailedVolumes() {
  return failedVolumes;
}

/**
 * Returns the underlying {@link DatanodeDetails}. Excluded from JSON
 * output (the individual detail getters are serialized instead).
 *
 * @return the datanode details backing this info object
 */
@JsonIgnore
public DatanodeDetails getDatanodeDetails() {
  return dn;
}

/**
 * Extracts the repeated {@code failedVolumes} entries from a node proto
 * into a plain list.
 *
 * @param node node proto to read from
 * @return failed volume locations; an immutable empty list when none exist
 */
private static List<String> getFailedVolumes(HddsProtos.Node node) {
  final int total = node.getFailedVolumesCount();
  if (total == 0) {
    // Shared immutable instance — avoids allocating for the common case.
    return Collections.emptyList();
  }
  final List<String> locations = new ArrayList<>(total);
  int idx = 0;
  while (idx < total) {
    locations.add(node.getFailedVolumes(idx));
    idx++;
  }
  return locations;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ public class ListInfoSubcommand extends ScmSubcommand {
defaultValue = "false")
private boolean json;

@CommandLine.Option(names = {"--nodes-with-failed-volumes"},
description = "Only show datanodes that have at least one failed volume.",
defaultValue = "false")
private boolean nodeWithFailedVolumes;

Comment on lines +68 to +72
Copy link
Copy Markdown
Contributor

@sreejasahithi sreejasahithi Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The --nodes-with-failed-volumes filter is silently ignored when --node-id is used.
We should make these options mutually exclusive to avoid confusion: when a user provides both, the command returns the result for the node whose ID was specified, regardless of whether that node has any failed volumes.

@CommandLine.Spec
private CommandLine.Model.CommandSpec spec;

@CommandLine.ArgGroup(exclusive = true, multiplicity = "0..1")
private ExclusiveNodeOptions exclusiveNodeOptions;

Expand All @@ -85,14 +93,16 @@ static class ExclusiveNodeOptions extends NodeSelectionMixin {

@Override
public void execute(ScmClient scmClient) throws IOException {
if (nodeWithFailedVolumes && exclusiveNodeOptions != null
&& !Strings.isNullOrEmpty(exclusiveNodeOptions.getNodeId())) {
throw new CommandLine.ParameterException(spec.commandLine(),
"--nodes-with-failed-volumes cannot be used with --id/--node-id. "
+ "Use them separately.");
}
Comment on lines +96 to +101
Copy link
Copy Markdown
Contributor

@sreejasahithi sreejasahithi Apr 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Picocli reports argument conflicts as CommandLine.MutuallyExclusiveArgsException (a subclass of CommandLine.ParameterException). Throwing IOException from execute() bypasses picocli's error handling entirely, producing a different error format than what the user would see from other mutual-exclusion violations, so change this to throw a CommandLine.ParameterException instead.

(you can refer to DecommissionStatusSubCommand to see how it can be used.)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Update the test as well with the correct exception.

pipelines = scmClient.listPipelines();
if (exclusiveNodeOptions != null && !Strings.isNullOrEmpty(exclusiveNodeOptions.getNodeId())) {
HddsProtos.Node node = scmClient.queryNode(UUID.fromString(exclusiveNodeOptions.getNodeId()));
Integer totalVolumeCount = node.hasTotalVolumeCount() ? node.getTotalVolumeCount() : null;
Integer healthyVolumeCount = node.hasHealthyVolumeCount() ? node.getHealthyVolumeCount() : null;
BasicDatanodeInfo singleNodeInfo = new BasicDatanodeInfo.Builder(
DatanodeDetails.getFromProtoBuf(node.getNodeID()), node.getNodeOperationalStates(0),
node.getNodeStates(0)).withVolumeCounts(totalVolumeCount, healthyVolumeCount).build();
BasicDatanodeInfo singleNodeInfo = new BasicDatanodeInfo.Builder(node).build();
if (json) {
List<BasicDatanodeInfo> dtoList = Collections.singletonList(singleNodeInfo);
System.out.println(JsonUtils.toJsonStringWithDefaultPrettyPrinter(dtoList));
Expand All @@ -118,6 +128,10 @@ public void execute(ScmClient scmClient) throws IOException {
allNodes = allNodes.filter(p -> p.getHealthState().toString()
.compareToIgnoreCase(nodeState) == 0);
}
if (nodeWithFailedVolumes) {
allNodes = allNodes.filter(p ->
p.getFailedVolumes() != null && !p.getFailedVolumes().isEmpty());
}

if (!listLimitOptions.isAll()) {
allNodes = allNodes.limit(listLimitOptions.getLimit());
Expand Down Expand Up @@ -154,13 +168,9 @@ private List<BasicDatanodeInfo> getAllNodes(ScmClient scmClient)
long capacity = p.getCapacity();
long used = capacity - p.getRemaining();
double percentUsed = (capacity > 0) ? (used * 100.0) / capacity : 0.0;
Integer totalVolumeCount = node.hasTotalVolumeCount() ? node.getTotalVolumeCount() : null;
Integer healthyVolumeCount = node.hasHealthyVolumeCount() ? node.getHealthyVolumeCount() : null;
return new BasicDatanodeInfo.Builder(
DatanodeDetails.getFromProtoBuf(node.getNodeID()),
node.getNodeOperationalStates(0), node.getNodeStates(0))
return new BasicDatanodeInfo.Builder(node)
.withUsageInfo(used, capacity, percentUsed)
.withVolumeCounts(totalVolumeCount, healthyVolumeCount).build();
.build();
} catch (Exception e) {
String reason = "Could not process info for an unknown datanode";
if (p != null && p.getNode() != null && !Strings.isNullOrEmpty(p.getNode().getUuid())) {
Expand All @@ -177,12 +187,7 @@ private List<BasicDatanodeInfo> getAllNodes(ScmClient scmClient)
List<HddsProtos.Node> nodes = scmClient.queryNode(null,
null, HddsProtos.QueryScope.CLUSTER, "");

return nodes.stream().map(p -> {
Integer totalVolumeCount = p.hasTotalVolumeCount() ? p.getTotalVolumeCount() : null;
Integer healthyVolumeCount = p.hasHealthyVolumeCount() ? p.getHealthyVolumeCount() : null;
return new BasicDatanodeInfo.Builder(
DatanodeDetails.getFromProtoBuf(p.getNodeID()), p.getNodeOperationalStates(0), p.getNodeStates(0))
.withVolumeCounts(totalVolumeCount, healthyVolumeCount).build(); })
return nodes.stream().map(p -> new BasicDatanodeInfo.Builder(p).build())
.sorted(Comparator.comparing(BasicDatanodeInfo::getHealthState))
.collect(Collectors.toList());
}
Expand All @@ -206,10 +211,12 @@ private void printDatanodeInfo(BasicDatanodeInfo dn) {
.append('/').append(p.getPipelineState().toString()).append('/')
.append(datanode.getID().equals(p.getLeaderId()) ?
"Leader" : "Follower")
.append(System.getProperty("line.separator")));
.append('\n'));
}
} else {
pipelineListInfo.append("No pipelines in cluster.");
pipelineListInfo
.append("No pipelines in cluster.")
.append('\n');
}
System.out.println("Datanode: " + datanode.getUuid().toString() +
" (" + datanode.getNetworkLocation() + "/" + datanode.getIpAddress()
Expand All @@ -221,6 +228,12 @@ private void printDatanodeInfo(BasicDatanodeInfo dn) {
System.out.println("Total volume count: " + dn.getTotalVolumeCount() + "\n" +
"Healthy volume count: " + dn.getHealthyVolumeCount());
}
if (dn.getFailedVolumes() != null && !dn.getFailedVolumes().isEmpty()) {
System.out.println("Failed volumes:");
for (String vol : dn.getFailedVolumes()) {
System.out.println(" " + vol);
}
}
System.out.println("Related pipelines:\n" + pipelineListInfo);

if (dn.getUsed() != null && dn.getCapacity() != null && dn.getUsed() >= 0 && dn.getCapacity() > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.hadoop.hdds.scm.cli.datanode;

import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
Expand Down Expand Up @@ -357,6 +358,65 @@ public void testVolumeCounters() throws Exception {
assertTrue(output.contains("Healthy volume count:"), "Should display healthy volume count");
}

@Test
public void testFailedVolumesFilter() throws Exception {
  // Verifies that --nodes-with-failed-volumes lists only datanodes that
  // report at least one failed volume, and that the failed volume paths
  // appear in the output.
  ScmClient scmClient = mock(ScmClient.class);
  List<HddsProtos.Node> baseNodes = getNodeDetails();

  List<HddsProtos.Node> nodes = new ArrayList<>();
  // node 0: 1 failed volume
  nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(0))
      .setTotalVolumeCount(4).setHealthyVolumeCount(3)
      .addFailedVolumes("/data/disk2").build());
  // node 1: healthy, no failed volumes
  nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(1))
      .setTotalVolumeCount(4).setHealthyVolumeCount(4).build());
  // node 2: 2 failed volumes
  nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(2))
      .setTotalVolumeCount(6).setHealthyVolumeCount(4)
      .addFailedVolumes("/data/disk1")
      .addFailedVolumes("/data/disk5").build());
  // node 3: healthy, no failed volumes
  nodes.add(HddsProtos.Node.newBuilder(baseNodes.get(3))
      .setTotalVolumeCount(4).setHealthyVolumeCount(4).build());

  when(scmClient.queryNode(any(), any(), any(), any())).thenReturn(nodes);
  when(scmClient.listPipelines()).thenReturn(new ArrayList<>());

  CommandLine c = new CommandLine(cmd);
  c.parseArgs("--nodes-with-failed-volumes");
  cmd.execute(scmClient);
  String output = outContent.toString(DEFAULT_ENCODING);

  // Only 2 datanodes (those with failed volumes) should appear.
  // Count "Datanode:" headers, one per listed node.
  Matcher m = Pattern.compile("^Datanode:", Pattern.MULTILINE)
      .matcher(output);
  int count = 0;
  while (m.find()) {
    count++;
  }
  assertEquals(2, count, "Only datanodes with failed volumes should be listed");
  // The failed-volume section and each failed path must be printed.
  assertThat(output).contains("Failed volume");
  assertThat(output).contains("/data/disk2");
  assertThat(output).contains("/data/disk1");
  assertThat(output).contains("/data/disk5");
}

@Test
public void testFailedVolumesFilterRejectsNodeId() throws Exception {
  // Verifies that combining --nodes-with-failed-volumes with --id is
  // rejected with a picocli ParameterException rather than being
  // silently ignored.
  ScmClient scmClient = mock(ScmClient.class);
  List<HddsProtos.Node> nodes = getNodeDetails();
  when(scmClient.listPipelines()).thenReturn(new ArrayList<>());

  CommandLine c = new CommandLine(cmd);
  c.parseArgs("--nodes-with-failed-volumes",
      "--id", nodes.get(0).getNodeID().getUuid());
  CommandLine.ParameterException ex = assertThrows(
      CommandLine.ParameterException.class, () -> cmd.execute(scmClient));
  assertTrue(ex.getMessage().contains(
      "--nodes-with-failed-volumes cannot be used with --id/--node-id"));
}

private void validateOrdering(JsonNode root, String orderDirection) {
for (int i = 0; i < root.size() - 1; i++) {
long usedCurrent = root.get(i).get("used").asLong();
Expand Down
Loading