Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/client/api/event.c
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,7 @@ daos_eq_poll(daos_handle_t eqh, int wait_running, int64_t timeout,
unsigned int n_events, struct daos_event **events)
{
struct eq_progress_arg epa;
struct d_fault_attr_t *fa;
int rc;

if (n_events == 0 || events == NULL)
Expand All @@ -778,6 +779,13 @@ daos_eq_poll(daos_handle_t eqh, int wait_running, int64_t timeout,
epa.wait_running = wait_running;
epa.count = 0;

/* Fault injection: crt_progress failure BEFORE dequeue; caller's evp remains stale. */
fa = d_fault_attr_lookup(DAOS_FAULT_EQ_POLL_FAIL);
if (fa != NULL && D_SHOULD_FAIL(fa)) {
daos_eq_putref(epa.eqx);
return -DER_HG;
}

/* pass the timeout to crt_progress() with a conditional callback */
rc = crt_progress_cond(epa.eqx->eqx_ctx, timeout, eq_progress_cb, &epa);

Expand Down
3 changes: 3 additions & 0 deletions src/include/daos/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,9 @@ enum {
#define DAOS_FAULT_POOL_EXT_PADDING (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x209)
#define DAOS_FAULT_POOL_EXT_RESERVED (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x20a)

/* Client code fault injection */
#define DAOS_FAULT_EQ_POLL_FAIL (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x1000)

#define DAOS_DTX_SKIP_PREPARE DAOS_DTX_SPEC_LEADER

#define DAOS_FAIL_CHECK(id) daos_fail_check(id)
Expand Down
71 changes: 71 additions & 0 deletions src/tests/ftest/pool/autotest_eq_poll_fi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""
(C) Copyright 2026 Hewlett Packard Enterprise Development LP.

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
from apricot import TestWithServers


class PoolAutotestEqPollFITest(TestWithServers):
    """Check that daos pool autotest survives an injected daos_eq_poll() failure.

    Regression coverage for DAOS-19016: when daos_eq_poll() returns a negative
    error code, the kv_put() and kv_get() polling loops in
    src/utils/daos_autotest.c have to stop instead of dereferencing the stale
    event pointer (evp).

    The DAOS_FAULT_EQ_POLL_FAIL fault injection point (ID 135168) makes
    daos_eq_poll() return -DER_HG, which drives execution through the rc < 0
    break introduced by the fix. A passing run shows:
        - daos pool autotest terminating with rc == 1 (neither crash nor hang)
        - DER_HG(-1020) present in the reported error

    :avocado: recursive
    """

    def test_pool_autotest_eq_poll_fi(self):
        """Run daos pool autotest while daos_eq_poll() is forced to fail.

        The DAOS_FAULT_EQ_POLL_FAIL injection point (fault ID 135168) is
        switched on through the faults section of the test YAML. With it
        active, daos_eq_poll() returns -DER_HG; the autotest must then exit
        with rc == 1 and report DER_HG(-1020), which demonstrates that the
        DAOS-19016 stale event pointer fix is effective.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium
        :avocado: tags=pool,daos_cmd,autotest,fault_injection
        :avocado: tags=test_pool_autotest_eq_poll_fi,PoolAutotestEqPollFITest
        """
        self.log_step("Create a pool")
        self.add_pool()
        self.pool.set_query_data()
        daos_cmd = self.get_daos_command()

        # The fault is armed by the 'fault_list' entry in the test YAML, so
        # the autotest is supposed to fail here. Turn off the command's
        # exit-status exception so the result object can be examined below.
        self.log_step("Run pool autotest with daos_eq_poll fault injection (DAOS-19016)")
        daos_cmd.exit_status_exception = False
        autotest = daos_cmd.pool_autotest(pool=self.pool.identifier)

        self.log_step("Verify autotest exited with the expected error code")
        exit_code = autotest.exit_status
        if exit_code == 0:
            unexpected_pass = (
                "daos pool autotest succeeded unexpectedly; "
                "expected it to fail due to DAOS_FAULT_EQ_POLL_FAIL injection"
            )
            self.fail(unexpected_pass)
        if exit_code != 1:
            # Any code other than 1 means the tool died some other way.
            self.fail(f"Expected exit code 1, got {exit_code}; stderr: {autotest.stderr_text}")

        self.log_step("Verify DER_HG(-1020) error in autotest output")
        stderr = autotest.stderr_text
        if "DER_HG(-1020)" not in stderr:
            self.fail(f"Expected 'DER_HG(-1020)' in autotest stderr; got: {stderr}")
        self.log.info(
            "Fault injection correctly propagated DER_HG(-1020) "
            "without stale event pointer dereference"
        )

        # Querying the pool again is the post-failure health check.
        self.log_step("Confirm pool is still healthy after the expected autotest failure")
        self.pool.set_query_data()
17 changes: 17 additions & 0 deletions src/tests/ftest/pool/autotest_eq_poll_fi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Test configuration for src/tests/ftest/pool/autotest_eq_poll_fi.py.
# Requests one server host and one client host.
hosts:
test_servers: 1
test_clients: 1
timeout: 600
setup:
start_servers_once: False
server_config:
name: daos_server
engines_per_host: 1
engines:
0:
storage: auto
pool:
size: 20G
# Fault injection points enabled for this test. The parameters for
# DAOS_FAULT_EQ_POLL_FAIL (id 135168) are defined in
# src/tests/ftest/util/fault_config_utils.py.
faults:
fault_list:
- DAOS_FAULT_EQ_POLL_FAIL
7 changes: 7 additions & 0 deletions src/tests/ftest/util/fault_config_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""
(C) Copyright 2019-2024 Intel Corporation.
(C) Copyright 2026 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
Expand Down Expand Up @@ -235,6 +236,12 @@
'probability_y': '100',
'interval': '1',
'max_faults': '1'},
'DAOS_FAULT_EQ_POLL_FAIL': {
'id': '135168',
'probability_x': '1000',
'probability_y': '100',
'interval': '100',
'max_faults': '5'},
}


Expand Down
19 changes: 17 additions & 2 deletions src/utils/daos_autotest.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2020-2022 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -488,6 +488,7 @@ kv_put(daos_handle_t oh, daos_size_t size)
* Max request in flight reached, wait for one i/o to
* complete to reuse the slot
*/
evp = NULL;
while (1) {
rc = daos_eq_poll(eq, 1, DAOS_EQ_NOWAIT, 1, &evp);
if (rc > 0)
Expand All @@ -497,6 +498,11 @@ kv_put(daos_handle_t oh, daos_size_t size)
break;
}
}
/* Poll failure: evp is stale, do not dereference it */
if (rc < 0)
break;
/* Fault injection guard: verify evp is valid to catch stale pointer bugs */
D_ASSERT(evp != NULL);

/** Check if completed operation failed */
if (evp->ev_error != DER_SUCCESS) {
Expand Down Expand Up @@ -551,8 +557,11 @@ kv_put(daos_handle_t oh, daos_size_t size)
num_events = daos_eq_query(eq, DAOS_EQR_ALL, 0, NULL);
while (1) {
eq_rc = daos_eq_poll(eq, 1, DAOS_EQ_NOWAIT, 1, &evp);
if (eq_rc > 0)
if (eq_rc > 0) {
completions += eq_rc;
if (rc == 0 && evp->ev_error != DER_SUCCESS)
rc = evp->ev_error;
}
if (eq_rc < 0) {
rc = eq_rc;
break;
Expand Down Expand Up @@ -628,6 +637,7 @@ kv_get(daos_handle_t oh, daos_size_t size)
* Max request in flight reached, wait for one i/o to
* complete to reuse the slot
*/
evp = NULL;
while (1) {
rc = daos_eq_poll(eq, 1, DAOS_EQ_NOWAIT, 1, &evp);
if (rc > 0)
Expand All @@ -637,6 +647,11 @@ kv_get(daos_handle_t oh, daos_size_t size)
break;
}
}
/* Poll failure: evp is stale, do not dereference it */
if (rc < 0)
break;
/* Fault injection guard: verify evp is valid to catch stale pointer bugs */
D_ASSERT(evp != NULL);

/** Check if completed operation failed */
if (evp->ev_error != DER_SUCCESS) {
Expand Down
Loading