From d9be5d8db550fea285a49ec57e2899d11774fd6f Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Thu, 11 Jun 2026 16:09:16 -0400 Subject: [PATCH 1/3] DAOS-19150 bio: add percentage_used to nvme stats Report percentage_used via the nvme health metrics. Signed-off-by: Johann Lombardi --- src/bio/bio_internal.h | 4 ++++ src/bio/bio_monitor.c | 2 ++ src/control/lib/spdk/src/nvme_control_common.c | 1 + src/include/daos_srv/control.h | 1 + src/proto/ctl/smd.proto | 1 + 5 files changed, 9 insertions(+) diff --git a/src/bio/bio_internal.h b/src/bio/bio_internal.h index dcdcf0f93c4..00307f000f6 100644 --- a/src/bio/bio_internal.h +++ b/src/bio/bio_internal.h @@ -175,6 +175,10 @@ struct bio_dma_buffer { X(bdh_unsafe_shutdowns, "unsafe_shutdowns", \ "Number of unsafe shutdowns (no notification prior to power loss)", \ "shutdowns", D_TM_COUNTER) \ + X(bdh_percentage_used, "percentage_used", \ + "Percentage as canonical NAND-life indicator, hits 100 when the drive has consumed its rated endurance" \ + "percents", D_TM_GAUGE) \ + X(bdh_unsafe_shutdowns, "unsafe_shutdowns", \ X(bdh_temp, "temp/current", \ "Current SSD temperature", \ "kelvins", D_TM_GAUGE) \ diff --git a/src/bio/bio_monitor.c b/src/bio/bio_monitor.c index d00061f7d3f..bf6a7a70887 100644 --- a/src/bio/bio_monitor.c +++ b/src/bio/bio_monitor.c @@ -512,6 +512,8 @@ populate_health_stats(struct bio_dev_health *bdh) dev_state->unsafe_shutdowns = page->unsafe_shutdowns[0]; d_tm_set_counter(bdh->bdh_unsafe_shutdowns, page->unsafe_shutdowns[0]); + dev_state->percentage_used = page->percentage_used; + d_tm_set_gauge(bdh->bdh_percentage_used, page->percentage_used); /** temperature */ dev_state->warn_temp_time = page->warning_temp_time; diff --git a/src/control/lib/spdk/src/nvme_control_common.c b/src/control/lib/spdk/src/nvme_control_common.c index ac0d8eaea88..81af7cbd59f 100644 --- a/src/control/lib/spdk/src/nvme_control_common.c +++ b/src/control/lib/spdk/src/nvme_control_common.c @@ -352,6 +352,7 @@ 
populate_dev_health(struct nvme_stats *stats, stats->read_only_warn = cw.bits.read_only ? true : false; stats->volatile_mem_warn = cw.bits.volatile_memory_backup ? true : false; + stats->percentage_used = hp->percentage_used; /* Intel Smart Information Attributes */ if ((cdata == NULL) || (cdata->vid != SPDK_PCI_VID_INTEL)) diff --git a/src/include/daos_srv/control.h b/src/include/daos_srv/control.h index c1d8771b777..416f56ba109 100644 --- a/src/include/daos_srv/control.h +++ b/src/include/daos_srv/control.h @@ -105,6 +105,7 @@ struct nvme_stats { uint64_t unsafe_shutdowns; uint64_t media_errs; uint64_t err_log_entries; + uint8_t percentage_used; /* I/O error counters */ uint32_t bio_read_errs; uint32_t bio_write_errs; diff --git a/src/proto/ctl/smd.proto b/src/proto/ctl/smd.proto index 57cb414634a..f6aec631ad3 100644 --- a/src/proto/ctl/smd.proto +++ b/src/proto/ctl/smd.proto @@ -79,6 +79,7 @@ message BioHealthResp { uint32 link_max_width = 51; // maximum width (number of lanes) float link_neg_speed = 52; // negotiated speed in transactions per second uint32 link_neg_width = 53; // negotiated width (number of lanes) + uint32 percentage_used = 54; } enum NvmeDevState { From d0442ff403403264b7251903244a35cdf6b74e0e Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Thu, 11 Jun 2026 16:22:19 -0400 Subject: [PATCH 2/3] fix typo Signed-off-by: Johann Lombardi --- src/bio/bio_internal.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bio/bio_internal.h b/src/bio/bio_internal.h index 00307f000f6..1855c9925d1 100644 --- a/src/bio/bio_internal.h +++ b/src/bio/bio_internal.h @@ -178,7 +178,6 @@ struct bio_dma_buffer { X(bdh_percentage_used, "percentage_used", \ "Percentage as canonical NAND-life indicator, hits 100 when the drive has consumed its rated endurance" \ "percents", D_TM_GAUGE) \ - X(bdh_unsafe_shutdowns, "unsafe_shutdowns", \ X(bdh_temp, "temp/current", \ "Current SSD temperature", \ "kelvins", D_TM_GAUGE) \ From dadcc673674922cf16c14ef88b51356fa12b44fa 
Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Thu, 11 Jun 2026 16:36:49 -0400 Subject: [PATCH 3/3] bio: add missing comma after percentage_used description Signed-off-by: Johann Lombardi --- src/bio/bio_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bio/bio_internal.h b/src/bio/bio_internal.h index 1855c9925d1..721309eef05 100644 --- a/src/bio/bio_internal.h +++ b/src/bio/bio_internal.h @@ -176,7 +176,7 @@ struct bio_dma_buffer { "Number of unsafe shutdowns (no notification prior to power loss)", \ "shutdowns", D_TM_COUNTER) \ X(bdh_percentage_used, "percentage_used", \ - "Percentage as canonical NAND-life indicator, hits 100 when the drive has consumed its rated endurance" \ + "Percentage as canonical NAND-life indicator, hits 100 when the drive has consumed its rated endurance",\ "percents", D_TM_GAUGE) \ X(bdh_temp, "temp/current", \ "Current SSD temperature", \