Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion src/cart/crt_group.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -3436,8 +3436,17 @@ crt_group_primary_modify(crt_group_t *grp, crt_context_t *ctxs, int num_ctxs, d_
for (i = 0; i < n_idx_to_check; i++) {
uint32_t idx = idx_to_check[i];
uint64_t incarnation = incarnations[idx];
const char *uri = "<none>";

rank = ranks->rl_ranks[idx];
if (uris != NULL && uris[idx] != NULL)
uri = uris[idx];

D_DEBUG(DB_ALL,
"group replace existing rank: rank=%u incoming_inc=%lu incoming_uri=%s "
"note=swim_check_only_uri_cache_not_explicitly_refreshed\n",
rank, (unsigned long)incarnation, uri);

rc = crt_swim_rank_check(grp_priv, rank, incarnation);
if (rc != 0)
D_ERROR("Failed to check SWIM state of rank %u: "DF_RC"\n", rank,
Expand Down
36 changes: 34 additions & 2 deletions src/mgmt/srv_system.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2019-2022 Intel Corporation.
* (C) Copyright 2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -140,6 +141,20 @@ free_server_list(struct server_entry *list, int len)
D_FREE(list);
}

/**
 * Report whether verbose map-update logging has been requested.
 *
 * The DAOS_MAP_UPDATE_VERBOSE environment variable is consulted through
 * d_getenv_bool() on the first call only; every later call returns the value
 * cached by that first lookup. The return value of d_getenv_bool() is not
 * inspected, so if it leaves its output untouched the result stays at the
 * zero-initialized default (false).
 *
 * NOTE(review): the cached state is plain static storage with no
 * synchronization visible here -- confirm callers do not race on first use.
 *
 * \return	true if verbose map-update logging is enabled, false otherwise.
 */
static bool
map_update_verbose_enabled(void)
{
	static bool verbose;     /* cached setting; static storage starts as false */
	static bool env_checked; /* set once the environment has been consulted */

	/* Fast path: the environment was already read by an earlier call. */
	if (env_checked)
		return verbose;

	d_getenv_bool("DAOS_MAP_UPDATE_VERBOSE", &verbose);
	env_checked = true;

	return verbose;
}

static struct server_entry *
dup_server_list(struct server_entry *in, int in_len)
{
Expand Down Expand Up @@ -221,10 +236,27 @@ map_update_bcast(crt_context_t ctx, struct mgmt_svc *svc, uint32_t map_version,
struct mgmt_tgt_map_update_out *out;
crt_opcode_t opc;
crt_rpc_t *rpc;
int i;
int rc;
bool verbose;

verbose = map_update_verbose_enabled();
D_DEBUG(DB_MGMT, "enter: version=%u nservers=%d\n", map_version,
nservers);
if (verbose) {
for (i = 0; i < nservers; i++) {
const char *uri = "<none>";

if (servers[i].se_uri != NULL)
uri = servers[i].se_uri;

D_DEBUG(DB_MGMT, "map[%d/%d]: rank=%u inc=%lu uri=%s flags=%u nctxs=%u\n",
i + 1, nservers, servers[i].se_rank,
(unsigned long)servers[i].se_incarnation, uri,
(unsigned int)servers[i].se_flags,
(unsigned int)servers[i].se_nctxs);
}
}

opc = DAOS_RPC_OPCODE(MGMT_TGT_MAP_UPDATE, DAOS_MGMT_MODULE,
DAOS_MGMT_VERSION);
Expand Down Expand Up @@ -254,8 +286,8 @@ map_update_bcast(crt_context_t ctx, struct mgmt_svc *svc, uint32_t map_version,
out_rpc:
crt_req_decref(rpc);
out:
D_DEBUG(DB_MGMT, "leave: version=%u nservers=%d: "DF_RC"\n",
map_version, nservers, DP_RC(rc));
DL_CDEBUG(rc, DLOG_WARN, DB_MGMT, rc, "map update bcast: version=%u nservers=%d",
map_version, nservers);
return rc;
}

Expand Down
57 changes: 57 additions & 0 deletions src/mgmt/srv_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <sys/sysinfo.h>
#include <ftw.h>
#include <dirent.h>
#include <string.h>

#include <daos_srv/vos.h>
#include <daos_srv/pool.h>
Expand Down Expand Up @@ -1540,6 +1541,51 @@ int
ds_mgmt_tgt_map_update_pre_forward(crt_rpc_t *rpc, void *arg)
{
struct mgmt_tgt_map_update_in *in = crt_req_get(rpc);
d_rank_t self_rank = dss_self_rank();
uint64_t self_inc = 0;
uint64_t map_inc = 0;
const char *map_uri = "<none>";
char *self_uri = NULL;
int self_inc_rc;
int self_uri_rc;
bool map_has_self = false;
bool inc_mismatch = false;
bool uri_mismatch = false;
bool warn;
const char *warn_prefix;
uint32_t i;

for (i = 0; i < in->tm_servers.ca_count; i++) {
if (in->tm_servers.ca_arrays[i].se_rank == self_rank) {
map_has_self = true;
map_inc = in->tm_servers.ca_arrays[i].se_incarnation;
if (in->tm_servers.ca_arrays[i].se_uri != NULL)
map_uri = in->tm_servers.ca_arrays[i].se_uri;
break;
}
}

self_inc_rc = crt_self_incarnation_get(&self_inc);
self_uri_rc = crt_self_uri_get(0 /* tag */, &self_uri);

if (map_has_self && self_inc_rc == 0)
inc_mismatch = self_inc != map_inc;
if (map_has_self && self_uri_rc == 0 && map_uri != NULL)
uri_mismatch = strcmp(self_uri, map_uri) != 0;

warn = !map_has_self || inc_mismatch || uri_mismatch;
warn_prefix = warn ? "MISMATCH " : "";
D_CDEBUG(warn, DLOG_WARN, DB_MGMT,
"%smap update recv: version=%u self_rank=%u self_inc=%lu self_uri=%s "
"map_has_self=%d map_inc=%lu map_uri=%s nservers=" DF_U64
" self_inc_rc=%d self_uri_rc=%d\n",
warn_prefix, in->tm_map_version, self_rank, (unsigned long)self_inc,
self_uri_rc == 0 ? self_uri : "<unavailable>", map_has_self,
(unsigned long)map_inc, map_uri, in->tm_servers.ca_count, self_inc_rc,
self_uri_rc);

if (self_uri_rc == 0)
D_FREE(self_uri);

return ds_mgmt_group_update(in->tm_servers.ca_arrays, in->tm_servers.ca_count,
in->tm_map_version);
Expand Down Expand Up @@ -1571,6 +1617,17 @@ ds_mgmt_tgt_map_update_aggregator(crt_rpc_t *source, crt_rpc_t *result,
{
struct mgmt_tgt_map_update_out *out_source = crt_reply_get(source);
struct mgmt_tgt_map_update_out *out_result = crt_reply_get(result);
d_rank_t src_rank = CRT_NO_RANK;
int rc;

rc = crt_req_src_rank_get(source, &src_rank);
if (rc != 0)
src_rank = CRT_NO_RANK;

if (out_source->tm_rc != 0) {
D_WARN("map update aggregate member error: src_rank=%u tm_rc=%d\n", src_rank,
out_source->tm_rc);
}

out_result->tm_rc += out_source->tm_rc;
return 0;
Expand Down
2 changes: 2 additions & 0 deletions src/tests/ftest/daos_test/suite.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ server_config:
- D_LOG_FILE_APPEND_RANK=1
- D_LOG_FLUSH=DEBUG
- FI_LOG_LEVEL=warn
- DAOS_MAP_UPDATE_VERBOSE=1
- D_LOG_STDERR_IN_LOG=1
storage: auto
1:
Expand All @@ -63,6 +64,7 @@ server_config:
- D_LOG_FILE_APPEND_RANK=1
- D_LOG_FLUSH=DEBUG
- FI_LOG_LEVEL=warn
- DAOS_MAP_UPDATE_VERBOSE=1
- D_LOG_STDERR_IN_LOG=1
storage: auto
transport_config:
Expand Down
2 changes: 1 addition & 1 deletion src/tests/ftest/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from util.yaml_utils import YamlException

DEFAULT_LOGS_THRESHOLD = "2150M" # 2.1G
MAX_CI_REPETITIONS = 10
MAX_CI_REPETITIONS = 20


class LaunchError(Exception):
Expand Down
Loading