diff --git a/src/client/api/event.c b/src/client/api/event.c index b75d3475348..83dd9676419 100644 --- a/src/client/api/event.c +++ b/src/client/api/event.c @@ -761,6 +761,7 @@ daos_eq_poll(daos_handle_t eqh, int wait_running, int64_t timeout, unsigned int n_events, struct daos_event **events) { struct eq_progress_arg epa; + struct d_fault_attr_t *fa; int rc; if (n_events == 0 || events == NULL) @@ -778,6 +779,13 @@ daos_eq_poll(daos_handle_t eqh, int wait_running, int64_t timeout, epa.wait_running = wait_running; epa.count = 0; + /* Fault injection: return -DER_HG before crt_progress_cond(); events[] stays stale. */ + fa = d_fault_attr_lookup(DAOS_FAULT_EQ_POLL_FAIL); + if (fa != NULL && D_SHOULD_FAIL(fa)) { + daos_eq_putref(epa.eqx); + return -DER_HG; + } + /* pass the timeout to crt_progress() with a conditional callback */ rc = crt_progress_cond(epa.eqx->eqx_ctx, timeout, eq_progress_cb, &epa); diff --git a/src/include/daos/common.h b/src/include/daos/common.h index e605d94eefb..dc1bd4628bd 100644 --- a/src/include/daos/common.h +++ b/src/include/daos/common.h @@ -971,6 +971,9 @@ enum { #define DAOS_FAULT_POOL_EXT_PADDING (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x209) #define DAOS_FAULT_POOL_EXT_RESERVED (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x20a) +/* Client code fault injection */ +#define DAOS_FAULT_EQ_POLL_FAIL (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x1000) + #define DAOS_DTX_SKIP_PREPARE DAOS_DTX_SPEC_LEADER #define DAOS_FAIL_CHECK(id) daos_fail_check(id) diff --git a/src/tests/ftest/pool/autotest_eq_poll_fi.py b/src/tests/ftest/pool/autotest_eq_poll_fi.py new file mode 100644 index 00000000000..30a215ba306 --- /dev/null +++ b/src/tests/ftest/pool/autotest_eq_poll_fi.py @@ -0,0 +1,71 @@ +""" + (C) Copyright 2026 Hewlett Packard Enterprise Development LP. + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +from apricot import TestWithServers + + +class PoolAutotestEqPollFITest(TestWithServers): + """Test daos pool autotest robustness under daos_eq_poll() fault injection. 
+ + Validates the DAOS-19016 fix: the kv_put() and kv_get() spin loops in + src/utils/daos_autotest.c must handle daos_eq_poll() returning a negative + error code without dereferencing the stale event pointer (evp). + + Fault injection point DAOS_FAULT_EQ_POLL_FAIL (ID 135168) injects a + -DER_HG return from daos_eq_poll(), exercising the rc < 0 break added by + the fix. The expected outcome is: + - daos pool autotest exits with rc == 1 (no crash or hang) + - the error message contains DER_HG(-1020) + + :avocado: recursive + """ + + def test_pool_autotest_eq_poll_fi(self): + """Test that daos pool autotest handles daos_eq_poll() errors correctly. + + Run daos pool autotest with fault injection point DAOS_FAULT_EQ_POLL_FAIL + (fault ID 135168, enabled via the YAML faults section) active. Confirm + that when daos_eq_poll() returns -DER_HG the autotest exits cleanly with + rc == 1 and reports DER_HG(-1020), proving that the stale event pointer + fix from DAOS-19016 is working. + + :avocado: tags=all,full_regression + :avocado: tags=hw,medium + :avocado: tags=pool,daos_cmd,autotest,fault_injection + :avocado: tags=test_pool_autotest_eq_poll_fi,PoolAutotestEqPollFITest + """ + self.log_step("Create a pool") + self.add_pool() + self.pool.set_query_data() + daos_cmd = self.get_daos_command() + + # Fault injection is enabled via the YAML 'fault_list' section. + # The autotest is expected to fail: disable the exception so that the + # CmdResult can be inspected for the expected error signature. 
+ self.log_step("Run pool autotest with daos_eq_poll fault injection (DAOS-19016)") + daos_cmd.exit_status_exception = False + result = daos_cmd.pool_autotest(pool=self.pool.identifier) + + self.log_step("Verify autotest exited with the expected error code") + if result.exit_status == 0: + self.fail( + "daos pool autotest succeeded unexpectedly; " + "expected it to fail due to DAOS_FAULT_EQ_POLL_FAIL injection") + if result.exit_status != 1: + self.fail( + f"Expected exit code 1, got {result.exit_status}; " + f"stderr: {result.stderr_text}") + + self.log_step("Verify DER_HG(-1020) error in autotest output") + if "DER_HG(-1020)" not in result.stderr_text: + self.fail( + f"Expected 'DER_HG(-1020)' in autotest stderr; " + f"got: {result.stderr_text}") + self.log.info( + "Fault injection correctly propagated DER_HG(-1020) " + "without stale event pointer dereference") + + self.log_step("Confirm pool is still healthy after the expected autotest failure") + self.pool.set_query_data() diff --git a/src/tests/ftest/pool/autotest_eq_poll_fi.yaml b/src/tests/ftest/pool/autotest_eq_poll_fi.yaml new file mode 100644 index 00000000000..c46ac4d50ab --- /dev/null +++ b/src/tests/ftest/pool/autotest_eq_poll_fi.yaml @@ -0,0 +1,17 @@ +hosts: + test_servers: 1 + test_clients: 1 +timeout: 600 +setup: + start_servers_once: False +server_config: + name: daos_server + engines_per_host: 1 + engines: + 0: + storage: auto +pool: + size: 20G +faults: + fault_list: + - DAOS_FAULT_EQ_POLL_FAIL diff --git a/src/tests/ftest/util/fault_config_utils.py b/src/tests/ftest/util/fault_config_utils.py index 5dd3071271f..c831c51d6d0 100644 --- a/src/tests/ftest/util/fault_config_utils.py +++ b/src/tests/ftest/util/fault_config_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2019-2024 Intel Corporation. 
+ (C) Copyright 2026 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -235,6 +236,12 @@ 'probability_y': '100', 'interval': '1', 'max_faults': '1'}, + 'DAOS_FAULT_EQ_POLL_FAIL': { + 'id': '135168', + 'probability_x': '1000', + 'probability_y': '100', + 'interval': '100', + 'max_faults': '5'}, } diff --git a/src/utils/daos_autotest.c b/src/utils/daos_autotest.c index 5d9f591b7fa..cc9787ec801 100644 --- a/src/utils/daos_autotest.c +++ b/src/utils/daos_autotest.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2020-2022 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -488,6 +488,7 @@ kv_put(daos_handle_t oh, daos_size_t size) * Max request in flight reached, wait for one i/o to * complete to reuse the slot */ + evp = NULL; while (1) { rc = daos_eq_poll(eq, 1, DAOS_EQ_NOWAIT, 1, &evp); if (rc > 0) @@ -497,6 +498,11 @@ kv_put(daos_handle_t oh, daos_size_t size) break; } } + /* Poll failure: evp may be unset (NULL), stop before dereferencing it */ + if (rc < 0) + break; + /* Successful poll: evp must have been set (it was NULL before polling) */ + D_ASSERT(evp != NULL); /** Check if completed operation failed */ if (evp->ev_error != DER_SUCCESS) { @@ -551,8 +557,11 @@ kv_put(daos_handle_t oh, daos_size_t size) num_events = daos_eq_query(eq, DAOS_EQR_ALL, 0, NULL); while (1) { eq_rc = daos_eq_poll(eq, 1, DAOS_EQ_NOWAIT, 1, &evp); - if (eq_rc > 0) + if (eq_rc > 0) { completions += eq_rc; + if (rc == 0 && evp->ev_error != DER_SUCCESS) + rc = evp->ev_error; + } if (eq_rc < 0) { rc = eq_rc; break; @@ -628,6 +637,7 @@ kv_get(daos_handle_t oh, daos_size_t size) * Max request in flight reached, wait for one i/o to * complete to reuse the slot */ + evp = NULL; while (1) { rc = daos_eq_poll(eq, 1, DAOS_EQ_NOWAIT, 1, &evp); if (rc > 0) @@ -637,6 +647,11 @@ kv_get(daos_handle_t oh, daos_size_t size) break; } 
} + /* Poll failure: evp may be unset (NULL), stop before dereferencing it */ + if (rc < 0) + break; + /* Successful poll: evp must have been set (it was NULL before polling) */ + D_ASSERT(evp != NULL); /** Check if completed operation failed */ if (evp->ev_error != DER_SUCCESS) {