From e8b0ecd78b2792b8296066b7792a7ac432e0e622 Mon Sep 17 00:00:00 2001 From: Kenneth Cain Date: Tue, 9 Jun 2026 12:46:36 -0400 Subject: [PATCH 1/2] DAOS-19028 test: DO NOT LAND test_rebuild_29 repro attempt Add debug logging for MGMT_TGT_MAP_UPDATE in map_update_bcast() and ds_mgmt_tgt_map_update_pre_forward(), to inspect, on any reproduction, possible URI and incarnation mismatches between the PS leader, forwarding engines in the knomial tree, and the restarted engine itself. Test-tag: test_rebuild_29 Test-Repeat: 10 Skip-unit-tests: true Skip-fault-injection-test: true Skip-test-rpms: true Test-provider-hw-medium: ofi+tcp Signed-off-by: Kenneth Cain --- src/mgmt/srv_system.c | 17 +++++++++++++++++ src/mgmt/srv_target.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/src/mgmt/srv_system.c b/src/mgmt/srv_system.c index 38e4f0e702c..64d162e9416 100644 --- a/src/mgmt/srv_system.c +++ b/src/mgmt/srv_system.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2019-2022 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -221,10 +222,21 @@ map_update_bcast(crt_context_t ctx, struct mgmt_svc *svc, uint32_t map_version, struct mgmt_tgt_map_update_out *out; crt_opcode_t opc; crt_rpc_t *rpc; + int i; int rc; D_DEBUG(DB_MGMT, "enter: version=%u nservers=%d\n", map_version, nservers); + for (i = 0; i < nservers; i++) { + const char *uri = ""; + + if (servers[i].se_uri != NULL) + uri = servers[i].se_uri; + + D_DEBUG(DB_MGMT, "map[%d/%d]: rank=%u inc=%lu uri=%s flags=%u nctxs=%u\n", i + 1, + nservers, servers[i].se_rank, (unsigned long)servers[i].se_incarnation, uri, + (unsigned int)servers[i].se_flags, (unsigned int)servers[i].se_nctxs); + } opc = DAOS_RPC_OPCODE(MGMT_TGT_MAP_UPDATE, DAOS_MGMT_MODULE, DAOS_MGMT_VERSION); @@ -254,6 +266,11 @@ map_update_bcast(crt_context_t ctx, struct mgmt_svc *svc, uint32_t map_version, out_rpc: crt_req_decref(rpc); out: + if (rc != 0) { + D_WARN("map update bcast failed for version=%u nservers=%d: " DF_RC "\n", + map_version, nservers, DP_RC(rc)); + } + D_DEBUG(DB_MGMT, "leave: version=%u nservers=%d: "DF_RC"\n", map_version, nservers, DP_RC(rc)); return rc; diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index bf5f373aaf6..e286e22728c 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -1540,6 +1540,39 @@ int ds_mgmt_tgt_map_update_pre_forward(crt_rpc_t *rpc, void *arg) { struct mgmt_tgt_map_update_in *in = crt_req_get(rpc); + d_rank_t self_rank = dss_self_rank(); + uint64_t self_inc = 0; + uint64_t map_inc = 0; + const char *map_uri = ""; + char *self_uri = NULL; + int self_inc_rc; + int self_uri_rc; + bool map_has_self = false; + uint32_t i; + + for (i = 0; i < in->tm_servers.ca_count; i++) { + if (in->tm_servers.ca_arrays[i].se_rank == self_rank) { + map_has_self = true; + map_inc = in->tm_servers.ca_arrays[i].se_incarnation; + if (in->tm_servers.ca_arrays[i].se_uri != NULL) + map_uri = 
in->tm_servers.ca_arrays[i].se_uri; + break; + } + } + + self_inc_rc = crt_self_incarnation_get(&self_inc); + self_uri_rc = crt_self_uri_get(0 /* tag */, &self_uri); + + D_DEBUG(DB_MGMT, + "map update recv: version=%u self_rank=%u self_inc=%lu self_uri=%s " + "map_has_self=%d map_inc=%lu map_uri=%s nservers=" DF_U64 + " self_inc_rc=%d self_uri_rc=%d\n", + in->tm_map_version, self_rank, (unsigned long)self_inc, + self_uri_rc == 0 ? self_uri : "", map_has_self, (unsigned long)map_inc, + map_uri, in->tm_servers.ca_count, self_inc_rc, self_uri_rc); + + if (self_uri_rc == 0) + D_FREE(self_uri); return ds_mgmt_group_update(in->tm_servers.ca_arrays, in->tm_servers.ca_count, in->tm_map_version); From f7c300d153af582c106fdde20083e536b6fdcab7 Mon Sep 17 00:00:00 2001 From: Kenneth Cain Date: Fri, 12 Jun 2026 17:09:03 -0400 Subject: [PATCH 2/2] Expand DAOS-19028 debug logging across MGMT/cart map-update Latest changes include: - MGMT map distribution logging in map_update_bcast(), with verbose per-rank map dumps controlled by DAOS_MAP_UPDATE_VERBOSE (which also requires log_mask: DEBUG). - MGMT target pre-forward logging in ds_mgmt_tgt_map_update_pre_forward(), including a "MISMATCH " prefix when self/map state differs. - MGMT map update aggregation warning for non-zero member return codes. - CaRT group replace-path diagnostics in crt_group_primary_modify() for existing-rank SWIM-check flow (incoming rank/incarnation/URI visibility). - ftest suite env updates to enable DAOS_MAP_UPDATE_VERBOSE=1 on both engines. - launch.py CI repeat cap increased from 10 to 20. The goal is to look for potential stale membership/address state during reintegrate hangs. 
Test-tag: test_rebuild_29 Test-Repeat: 20 Skip-unit-tests: true Skip-fault-injection-test: true Skip-test-rpms: true Test-provider-hw-medium: ofi+tcp Signed-off-by: Kenneth Cain --- src/cart/crt_group.c | 11 ++++++- src/mgmt/srv_system.c | 47 ++++++++++++++++++---------- src/mgmt/srv_target.c | 38 +++++++++++++++++----- src/tests/ftest/daos_test/suite.yaml | 2 ++ src/tests/ftest/launch.py | 2 +- 5 files changed, 75 insertions(+), 25 deletions(-) diff --git a/src/cart/crt_group.c b/src/cart/crt_group.c index 18454a9266c..d7ed27b5972 100644 --- a/src/cart/crt_group.c +++ b/src/cart/crt_group.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -3436,8 +3436,17 @@ crt_group_primary_modify(crt_group_t *grp, crt_context_t *ctxs, int num_ctxs, d_ for (i = 0; i < n_idx_to_check; i++) { uint32_t idx = idx_to_check[i]; uint64_t incarnation = incarnations[idx]; + const char *uri = ""; rank = ranks->rl_ranks[idx]; + if (uris != NULL && uris[idx] != NULL) + uri = uris[idx]; + + D_DEBUG(DB_ALL, + "group replace existing rank: rank=%u incoming_inc=%lu incoming_uri=%s " + "note=swim_check_only_uri_cache_not_explicitly_refreshed\n", + rank, (unsigned long)incarnation, uri); + rc = crt_swim_rank_check(grp_priv, rank, incarnation); if (rc != 0) D_ERROR("Failed to check SWIM state of rank %u: "DF_RC"\n", rank, diff --git a/src/mgmt/srv_system.c b/src/mgmt/srv_system.c index 64d162e9416..81e4b70e4eb 100644 --- a/src/mgmt/srv_system.c +++ b/src/mgmt/srv_system.c @@ -141,6 +141,20 @@ free_server_list(struct server_entry *list, int len) D_FREE(list); } +static bool +map_update_verbose_enabled(void) +{ + static bool initialized; + static bool enabled; + + if (!initialized) { + d_getenv_bool("DAOS_MAP_UPDATE_VERBOSE", &enabled); + initialized = true; + } + + return enabled; +} + static 
struct server_entry * dup_server_list(struct server_entry *in, int in_len) { @@ -224,18 +238,24 @@ map_update_bcast(crt_context_t ctx, struct mgmt_svc *svc, uint32_t map_version, crt_rpc_t *rpc; int i; int rc; + bool verbose; + verbose = map_update_verbose_enabled(); D_DEBUG(DB_MGMT, "enter: version=%u nservers=%d\n", map_version, nservers); - for (i = 0; i < nservers; i++) { - const char *uri = ""; - - if (servers[i].se_uri != NULL) - uri = servers[i].se_uri; - - D_DEBUG(DB_MGMT, "map[%d/%d]: rank=%u inc=%lu uri=%s flags=%u nctxs=%u\n", i + 1, - nservers, servers[i].se_rank, (unsigned long)servers[i].se_incarnation, uri, - (unsigned int)servers[i].se_flags, (unsigned int)servers[i].se_nctxs); + if (verbose) { + for (i = 0; i < nservers; i++) { + const char *uri = ""; + + if (servers[i].se_uri != NULL) + uri = servers[i].se_uri; + + D_DEBUG(DB_MGMT, "map[%d/%d]: rank=%u inc=%lu uri=%s flags=%u nctxs=%u\n", + i + 1, nservers, servers[i].se_rank, + (unsigned long)servers[i].se_incarnation, uri, + (unsigned int)servers[i].se_flags, + (unsigned int)servers[i].se_nctxs); + } } opc = DAOS_RPC_OPCODE(MGMT_TGT_MAP_UPDATE, DAOS_MGMT_MODULE, @@ -266,13 +286,8 @@ map_update_bcast(crt_context_t ctx, struct mgmt_svc *svc, uint32_t map_version, out_rpc: crt_req_decref(rpc); out: - if (rc != 0) { - D_WARN("map update bcast failed for version=%u nservers=%d: " DF_RC "\n", - map_version, nservers, DP_RC(rc)); - } - - D_DEBUG(DB_MGMT, "leave: version=%u nservers=%d: "DF_RC"\n", - map_version, nservers, DP_RC(rc)); + DL_CDEBUG(rc, DLOG_WARN, DB_MGMT, rc, "map update bcast: version=%u nservers=%d", + map_version, nservers); return rc; } diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index e286e22728c..6b4f9ddf004 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -1548,6 +1549,10 @@ ds_mgmt_tgt_map_update_pre_forward(crt_rpc_t *rpc, void *arg) int self_inc_rc; int self_uri_rc; bool 
map_has_self = false; + bool inc_mismatch = false; + bool uri_mismatch = false; + bool warn; + const char *warn_prefix; uint32_t i; for (i = 0; i < in->tm_servers.ca_count; i++) { @@ -1563,13 +1568,21 @@ ds_mgmt_tgt_map_update_pre_forward(crt_rpc_t *rpc, void *arg) self_inc_rc = crt_self_incarnation_get(&self_inc); self_uri_rc = crt_self_uri_get(0 /* tag */, &self_uri); - D_DEBUG(DB_MGMT, - "map update recv: version=%u self_rank=%u self_inc=%lu self_uri=%s " - "map_has_self=%d map_inc=%lu map_uri=%s nservers=" DF_U64 - " self_inc_rc=%d self_uri_rc=%d\n", - in->tm_map_version, self_rank, (unsigned long)self_inc, - self_uri_rc == 0 ? self_uri : "", map_has_self, (unsigned long)map_inc, - map_uri, in->tm_servers.ca_count, self_inc_rc, self_uri_rc); + if (map_has_self && self_inc_rc == 0) + inc_mismatch = self_inc != map_inc; + if (map_has_self && self_uri_rc == 0 && map_uri != NULL) + uri_mismatch = strcmp(self_uri, map_uri) != 0; + + warn = !map_has_self || inc_mismatch || uri_mismatch; + warn_prefix = warn ? "MISMATCH " : ""; + D_CDEBUG(warn, DLOG_WARN, DB_MGMT, + "%smap update recv: version=%u self_rank=%u self_inc=%lu self_uri=%s " + "map_has_self=%d map_inc=%lu map_uri=%s nservers=" DF_U64 + " self_inc_rc=%d self_uri_rc=%d\n", + warn_prefix, in->tm_map_version, self_rank, (unsigned long)self_inc, + self_uri_rc == 0 ? 
self_uri : "", map_has_self, + (unsigned long)map_inc, map_uri, in->tm_servers.ca_count, self_inc_rc, + self_uri_rc); if (self_uri_rc == 0) D_FREE(self_uri); @@ -1604,6 +1617,17 @@ ds_mgmt_tgt_map_update_aggregator(crt_rpc_t *source, crt_rpc_t *result, { struct mgmt_tgt_map_update_out *out_source = crt_reply_get(source); struct mgmt_tgt_map_update_out *out_result = crt_reply_get(result); + d_rank_t src_rank = CRT_NO_RANK; + int rc; + + rc = crt_req_src_rank_get(source, &src_rank); + if (rc != 0) + src_rank = CRT_NO_RANK; + + if (out_source->tm_rc != 0) { + D_WARN("map update aggregate member error: src_rank=%u tm_rc=%d\n", src_rank, + out_source->tm_rc); + } out_result->tm_rc += out_source->tm_rc; return 0; diff --git a/src/tests/ftest/daos_test/suite.yaml b/src/tests/ftest/daos_test/suite.yaml index 245874ca8e4..2767fbfe6cb 100644 --- a/src/tests/ftest/daos_test/suite.yaml +++ b/src/tests/ftest/daos_test/suite.yaml @@ -50,6 +50,7 @@ server_config: - D_LOG_FILE_APPEND_RANK=1 - D_LOG_FLUSH=DEBUG - FI_LOG_LEVEL=warn + - DAOS_MAP_UPDATE_VERBOSE=1 - D_LOG_STDERR_IN_LOG=1 storage: auto 1: @@ -63,6 +64,7 @@ server_config: - D_LOG_FILE_APPEND_RANK=1 - D_LOG_FLUSH=DEBUG - FI_LOG_LEVEL=warn + - DAOS_MAP_UPDATE_VERBOSE=1 - D_LOG_STDERR_IN_LOG=1 storage: auto transport_config: diff --git a/src/tests/ftest/launch.py b/src/tests/ftest/launch.py index 02916b1b46b..bf960d0e48c 100755 --- a/src/tests/ftest/launch.py +++ b/src/tests/ftest/launch.py @@ -34,7 +34,7 @@ from util.yaml_utils import YamlException DEFAULT_LOGS_THRESHOLD = "2150M" # 2.1G -MAX_CI_REPETITIONS = 10 +MAX_CI_REPETITIONS = 20 class LaunchError(Exception):