From 9bc57a20ac78c6adab3d1c00fd5e375d2d898411 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Thu, 28 May 2026 16:07:13 +0200 Subject: [PATCH] Stabilize MPI test timing Synchronize ranks before timed sections so scheduler skew and barrier waits are not counted as task runtime, preventing rare timeout flakes like these: ``` [ RUN ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_3_3 unknown file: error: C++ exception with description " Task execute time need to be: time < 1 secs. Original time in secs: 1.21769 " thrown in the test body. [ OK ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_3_3 (1224 ms) [ FAILED ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_3_3, where GetParam() = (64-byte object <20-AA 75-60 F6-7F 00-00 C0-6C 6E-60 F6-7F 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 88-77 B8-48 FD-01 00-00>, "nesterov_a_test_task_processes_3_mpi_enabled", (3, "3")) (1225 ms) [ RUN ] PicMatrixTests/NesterovARunFuncTestsProcesses3.MatmulFromPic/nesterov_a_test_task_processes_3_mpi_enabled_7_7 job aborted: [ranks] message [0] terminated [1] application aborted aborting MPI_COMM_WORLD (comm=0x44000000), error 1, comm rank 1 [2] terminated ---- error analysis ----- [1] on runnervmqq1k9 D:\a\parallel_programming_course\parallel_programming_course\install\bin\ppc_func_tests aborted the job. abort code 1 ---- error analysis ----- [ PROCESS 1 ] [ PROCESS 1 ] Traceback (most recent call last): File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 308, in _execute(args_dict, env_copy) File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 283, in _execute runner.run_processes(args_dict["additional_mpi_args"]) File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 247, in run_processes self.__run_exec( File "D:\a\parallel_programming_course\parallel_programming_course\scripts\run_tests.py", line 122, in __run_exec raise Exception(f"Subprocess return {result.returncode}.") Exception: Subprocess return 1. Error: Process completed with exit code 1. ``` --- modules/runners/src/runners.cpp | 12 ++++++------ modules/util/include/func_test_util.hpp | 1 + modules/util/include/perf_test_util.hpp | 1 + modules/util/include/util.hpp | 1 + modules/util/src/util.cpp | 19 +++++++++++++++++++ 5 files changed, 28 insertions(+), 6 deletions(-) diff --git a/modules/runners/src/runners.cpp b/modules/runners/src/runners.cpp index 642eae60..5f950455 100644 --- a/modules/runners/src/runners.cpp +++ b/modules/runners/src/runners.cpp @@ -82,22 +82,22 @@ int RunAllTests() { } void SyncGTestSeed() { - unsigned int seed = 0; int rank = -1; MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (rank == 0) { + int seed = ::testing::GTEST_FLAG(random_seed); + if (rank == 0 && seed == 0) { try { - seed = std::random_device{}(); + seed = static_cast((std::random_device{}() % 99999U) + 1U); } catch (...) { seed = 0; } if (seed == 0) { const auto now = static_cast(std::chrono::steady_clock::now().time_since_epoch().count()); - seed = static_cast(((now & 0x7fffffffULL) | 1ULL)); + seed = static_cast((now % 99999ULL) + 1ULL); } } - MPI_Bcast(&seed, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD); - ::testing::GTEST_FLAG(random_seed) = static_cast(seed); + MPI_Bcast(&seed, 1, MPI_INT, 0, MPI_COMM_WORLD); + ::testing::GTEST_FLAG(random_seed) = seed; } void SyncGTestFilter() { diff --git a/modules/util/include/func_test_util.hpp b/modules/util/include/func_test_util.hpp index 7d49edde..9564b765 100644 --- a/modules/util/include/func_test_util.hpp +++ b/modules/util/include/func_test_util.hpp @@ -103,6 +103,7 @@ class BaseRunFuncTests : public ::testing::TestWithParamValidation()); + SynchronizeMpiRanks(); EXPECT_TRUE(task_->PreProcessing()); } diff --git a/modules/util/include/perf_test_util.hpp b/modules/util/include/perf_test_util.hpp index de0cb463..8fb0a2ab 100644 --- a/modules/util/include/perf_test_util.hpp +++ b/modules/util/include/perf_test_util.hpp @@ -85,6 +85,7 @@ class BaseRunPerfTests : public ::testing::TestWithParam std::string GetNamespace() { diff --git a/modules/util/src/util.cpp b/modules/util/src/util.cpp index 34c06538..633eac97 100644 --- a/modules/util/src/util.cpp +++ b/modules/util/src/util.cpp @@ -1,5 +1,7 @@ #include "util/include/util.hpp" +#include + #include #include #include @@ -65,3 +67,20 @@ bool ppc::util::IsUnderMpirun() { return static_cast(mpi_env.has_value()); }); } + +void ppc::util::SynchronizeMpiRanks() { + int initialized = 0; + if (MPI_Initialized(&initialized) != MPI_SUCCESS || initialized == 0) { + return; + } + + int finalized = 0; + if (MPI_Finalized(&finalized) != MPI_SUCCESS || finalized != 0) { + return; + } + + const int barrier_res = MPI_Barrier(MPI_COMM_WORLD); + if (barrier_res != MPI_SUCCESS) { + MPI_Abort(MPI_COMM_WORLD, barrier_res); + } +}