diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp
index 308c7087b..92d600111 100644
--- a/benchmarks/linear_programming/cuopt/run_mip.cpp
+++ b/benchmarks/linear_programming/cuopt/run_mip.cpp
@@ -148,7 +148,9 @@ int run_single_file(std::string file_path,
                     bool write_log_file,
                     bool log_to_console,
                     int reliability_branching,
-                    double time_limit)
+                    double time_limit,
+                    double work_limit,
+                    bool deterministic)
 {
   const raft::handle_t handle_{};
   cuopt::linear_programming::mip_solver_settings_t<int, double> settings;
@@ -197,14 +199,17 @@ int run_single_file(std::string file_path,
       }
     }
   }
-  settings.time_limit      = time_limit;
-  settings.heuristics_only = heuristics_only;
-  settings.num_cpu_threads = num_cpu_threads;
-  settings.log_to_console  = log_to_console;
+  settings.time_limit       = time_limit;
+  settings.work_limit       = work_limit;
+  settings.heuristics_only  = heuristics_only;
+  settings.num_cpu_threads  = num_cpu_threads;
+  settings.log_to_console   = log_to_console;
+  settings.determinism_mode = deterministic ? CUOPT_MODE_DETERMINISTIC : CUOPT_MODE_OPPORTUNISTIC;
   settings.tolerances.relative_tolerance = 1e-12;
   settings.tolerances.absolute_tolerance = 1e-6;
   settings.presolve                      = true;
   settings.reliability_branching         = reliability_branching;
+  settings.seed                          = 42;
   cuopt::linear_programming::benchmark_info_t benchmark_info;
   settings.benchmark_info_ptr = &benchmark_info;
   auto start_run_solver = std::chrono::high_resolution_clock::now();
@@ -258,7 +263,9 @@ void run_single_file_mp(std::string file_path,
                         bool write_log_file,
                         bool log_to_console,
                         int reliability_branching,
-                        double time_limit)
+                        double time_limit,
+                        double work_limit,
+                        bool deterministic)
 {
   std::cout << "running file " << file_path << " on gpu : " << device << std::endl;
   auto memory_resource = make_async();
@@ -274,7 +281,9 @@ void run_single_file_mp(std::string file_path,
                   write_log_file,
                   log_to_console,
                   reliability_branching,
-                  time_limit);
+                  time_limit,
+                  work_limit,
+                  deterministic);
   // this is a bad design to communicate the result but better than adding complexity of IPC or
   // pipes
   exit(sol_found);
@@ -344,7 +353,12 @@ int main(int argc, char* argv[])
     .default_value(std::string("t"));
 
   program.add_argument("--time-limit")
-    .help("time limit")
+    .help("time limit in seconds")
     .scan<'g', double>()
     .default_value(std::numeric_limits<double>::infinity());
 
+  program.add_argument("--work-limit")
+    .help("work unit limit (for deterministic mode)")
+    .scan<'g', double>()
+    .default_value(std::numeric_limits<double>::infinity());
+
@@ -362,6 +376,11 @@ int main(int argc, char* argv[])
     .scan<'i', int>()
     .default_value(-1);
 
+  program.add_argument("-d", "--determinism")
+    .help("enable deterministic mode")
+    .default_value(false)
+    .implicit_value(true);
+
   // Parse arguments
   try {
     program.parse_args(argc, argv);
@@ -376,6 +395,7 @@ int main(int argc, char* argv[])
   std::string run_dir_arg = program.get<std::string>("--run-dir");
   bool run_dir            = run_dir_arg[0] == 't';
   double time_limit       = program.get<double>("--time-limit");
+  double work_limit       = program.get<double>("--work-limit");
   bool run_selected = program.get<std::string>("--run-selected")[0] == 't';
   int n_gpus        = program.get<int>("--n-gpus");
 
@@ -391,6 +411,7 @@ int main(int argc, char* argv[])
   double memory_limit       = program.get<double>("--memory-limit");
   bool track_allocations    = program.get<std::string>("--track-allocations")[0] == 't';
   int reliability_branching = program.get<int>("--reliability-branching");
+  bool deterministic        = program.get<bool>("--determinism");
 
   if (num_cpu_threads < 0) { num_cpu_threads = omp_get_max_threads() / n_gpus; }
@@ -479,7 +500,9 @@ int main(int argc, char* argv[])
                     write_log_file,
                     log_to_console,
                     reliability_branching,
-                    time_limit);
+                    time_limit,
+                    work_limit,
+                    deterministic);
   } else if (sys_pid < 0) {
     std::cerr << "Fork failed!" << std::endl;
     exit(1);
@@ -520,7 +543,9 @@ int main(int argc, char* argv[])
                     write_log_file,
                     log_to_console,
                     reliability_branching,
-                    time_limit);
+                    time_limit,
+                    work_limit,
+                    deterministic);
   }
 
   return 0;
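The benchmark wires the two new knobs straight into the settings struct. For callers outside this file, the same configuration looks like the sketch below; the `configure_deterministic` helper is illustrative, and only the settings fields and constants come from this change.

#include <cuopt/linear_programming/constants.h>
#include <cuopt/linear_programming/mip/solver_settings.hpp>
#include <limits>

// Illustrative helper (not part of this diff): opt a solve into the new
// deterministic mode with a fixed seed and a work-unit budget.
void configure_deterministic(cuopt::linear_programming::mip_solver_settings_t<int, double>& s)
{
  s.determinism_mode = CUOPT_MODE_DETERMINISTIC;  // reproducible parallel B&B
  s.seed             = 42;                        // fixed seed; -1 would draw a random one
  s.work_limit       = 500.0;                     // budget in work units, not seconds
  s.time_limit       = std::numeric_limits<double>::infinity();
}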
diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index c52c22c85..4f7430d39 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -32,6 +32,7 @@
 #define CUOPT_DUAL_INFEASIBLE_TOLERANCE "dual_infeasible_tolerance"
 #define CUOPT_ITERATION_LIMIT "iteration_limit"
 #define CUOPT_TIME_LIMIT "time_limit"
+#define CUOPT_WORK_LIMIT "work_limit"
 #define CUOPT_PDLP_SOLVER_MODE "pdlp_solver_mode"
 #define CUOPT_METHOD "method"
 #define CUOPT_PER_CONSTRAINT_RESIDUAL "per_constraint_residual"
@@ -49,6 +50,7 @@
 #define CUOPT_CUDSS_DETERMINISTIC "cudss_deterministic"
 #define CUOPT_PRESOLVE "presolve"
 #define CUOPT_DUAL_POSTSOLVE "dual_postsolve"
+#define CUOPT_MIP_DETERMINISM_MODE "mip_determinism_mode"
 #define CUOPT_MIP_ABSOLUTE_TOLERANCE "mip_absolute_tolerance"
 #define CUOPT_MIP_RELATIVE_TOLERANCE "mip_relative_tolerance"
 #define CUOPT_MIP_INTEGRALITY_TOLERANCE "mip_integrality_tolerance"
@@ -57,6 +59,7 @@
 #define CUOPT_MIP_HEURISTICS_ONLY "mip_heuristics_only"
 #define CUOPT_MIP_SCALING "mip_scaling"
 #define CUOPT_MIP_PRESOLVE "mip_presolve"
+#define CUOPT_MIP_SEED "mip_seed"
 #define CUOPT_MIP_RELIABILITY_BRANCHING "mip_reliability_branching"
 #define CUOPT_MIP_CUT_PASSES "mip_cut_passes"
 #define CUOPT_MIP_MIXED_INTEGER_ROUNDING_CUTS "mip_mixed_integer_rounding_cuts"
@@ -72,6 +75,10 @@
 #define CUOPT_NUM_GPUS "num_gpus"
 #define CUOPT_USER_PROBLEM_FILE "user_problem_file"
 
+/* @brief MIP determinism mode constants */
+#define CUOPT_MODE_OPPORTUNISTIC 0
+#define CUOPT_MODE_DETERMINISTIC 1
+
 /* @brief LP/MIP termination status constants */
 #define CUOPT_TERIMINATION_STATUS_NO_TERMINATION 0
 #define CUOPT_TERIMINATION_STATUS_OPTIMAL 1
@@ -83,6 +90,7 @@
 #define CUOPT_TERIMINATION_STATUS_PRIMAL_FEASIBLE 7
 #define CUOPT_TERIMINATION_STATUS_FEASIBLE_FOUND 8
 #define CUOPT_TERIMINATION_STATUS_CONCURRENT_LIMIT 9
+#define CUOPT_TERIMINATION_STATUS_WORK_LIMIT 10
 
 /* @brief The objective sense constants */
 #define CUOPT_MINIMIZE 1
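The new status also surfaces through the `mip_termination_status_t` enum (see the solver_solution.hpp hunk below). A minimal sketch of how downstream code might report it; the `status_name` helper is illustrative, and only the enum members visible in this diff are assumed:

#include <cuopt/linear_programming/mip/solver_solution.hpp>

// Illustrative helper, not part of this diff: map the enum (including the
// new WorkLimit member) to a printable name.
const char* status_name(cuopt::linear_programming::mip_termination_status_t s)
{
  using st = cuopt::linear_programming::mip_termination_status_t;
  switch (s) {
    case st::Infeasible: return "Infeasible";
    case st::Unbounded: return "Unbounded";
    case st::TimeLimit: return "TimeLimit";
    case st::WorkLimit: return "WorkLimit";  // added in this change
    default: return "Other";
  }
}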
diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
index c5c26884f..e6c76a27f 100644
--- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
@@ -84,6 +84,7 @@ class mip_solver_settings_t {
   tolerances_t tolerances;
 
   f_t time_limit = std::numeric_limits<f_t>::infinity();
+  f_t work_limit = std::numeric_limits<f_t>::infinity();
   i_t node_limit = std::numeric_limits<i_t>::max();
   bool heuristics_only = false;
   i_t reliability_branching = -1;
@@ -108,6 +109,23 @@ class mip_solver_settings_t {
   std::vector>> initial_solutions;
   bool mip_scaling = false;
   bool presolve    = true;
+  /**
+   * @brief Determinism mode for the MIP solver.
+   *
+   * Controls the determinism behavior of the MIP solver:
+   * - CUOPT_MODE_OPPORTUNISTIC (0): Default mode; allows non-deterministic
+   *   parallelism for better performance
+   * - CUOPT_MODE_DETERMINISTIC (1): Ensures deterministic results across runs,
+   *   at a potential cost in performance
+   */
+  int determinism_mode = CUOPT_MODE_OPPORTUNISTIC;
+  /**
+   * @brief Random seed for the MIP solver.
+   *
+   * Controls the initial seed for random number generation in the solver.
+   * Use -1 to generate a random seed.
+   */
+  i_t seed = -1;
   // this is for extracting info from different places of the solver during
   // benchmarks
   benchmark_info_t* benchmark_info_ptr = nullptr;
diff --git a/cpp/include/cuopt/linear_programming/mip/solver_solution.hpp b/cpp/include/cuopt/linear_programming/mip/solver_solution.hpp
index 6ff8d324b..a6c28ac20 100644
--- a/cpp/include/cuopt/linear_programming/mip/solver_solution.hpp
+++ b/cpp/include/cuopt/linear_programming/mip/solver_solution.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -30,6 +30,7 @@ enum class mip_termination_status_t : int8_t {
   Infeasible = CUOPT_TERIMINATION_STATUS_INFEASIBLE,
   Unbounded  = CUOPT_TERIMINATION_STATUS_UNBOUNDED,
   TimeLimit  = CUOPT_TERIMINATION_STATUS_TIME_LIMIT,
+  WorkLimit  = CUOPT_TERIMINATION_STATUS_WORK_LIMIT,
 };
 
 template <typename i_t, typename f_t>
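The `seed = -1` convention follows the usual pattern of drawing a fresh seed only when none is pinned. A standalone sketch of that semantics; the helper name is hypothetical, and cuOpt's own seeding utilities are not shown here:

#include <random>

// Hypothetical helper showing the documented convention: -1 means "draw a
// fresh random seed", anything else is used verbatim so repeated runs see
// identical random streams.
inline int resolve_seed(int requested_seed)
{
  if (requested_seed != -1) { return requested_seed; }
  std::random_device rd;          // non-deterministic source
  return static_cast<int>(rd());  // fresh seed per run
}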
diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt
index a69c36d9d..59c06c844 100644
--- a/cpp/src/CMakeLists.txt
+++ b/cpp/src/CMakeLists.txt
@@ -1,12 +1,13 @@
 # cmake-format: off
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 # cmake-format: on
 
 set(UTIL_SRC_FILES
     ${CMAKE_CURRENT_SOURCE_DIR}/utilities/seed_generator.cu
     ${CMAKE_CURRENT_SOURCE_DIR}/utilities/logger.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/utilities/version_info.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/utilities/timestamp_utils.cpp)
+    ${CMAKE_CURRENT_SOURCE_DIR}/utilities/timestamp_utils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/utilities/work_unit_scheduler.cpp)
 
 add_subdirectory(linear_programming)
 add_subdirectory(math_optimization)
diff --git a/cpp/src/dual_simplex/barrier.cu b/cpp/src/dual_simplex/barrier.cu
index 5eef97bb8..de50cfcc8 100644
--- a/cpp/src/dual_simplex/barrier.cu
+++ b/cpp/src/dual_simplex/barrier.cu
@@ -1388,7 +1388,7 @@ class iteration_data_t {
     // v = alpha * A * w + beta * v = alpha * A * Dinv * A^T * y + beta * v
     matrix_vector_multiply(A, alpha, w, beta, v);
     if (debug) {
-      printf("||A|| = %.16e\n", vector_norm2(A.x));
+      printf("||A|| = %.16e\n", vector_norm2(A.x.underlying()));
       printf("||w|| = %.16e\n", vector_norm2(w));
       printf("||v|| = %.16e\n", vector_norm2(v));
     }
diff --git a/cpp/src/dual_simplex/basis_solves.cpp b/cpp/src/dual_simplex/basis_solves.cpp
index 3b26b2a8b..17f997f4a 100644
--- a/cpp/src/dual_simplex/basis_solves.cpp
+++ b/cpp/src/dual_simplex/basis_solves.cpp
@@ -13,6 +13,8 @@
 #include
 #include
 
+#include <raft/common/nvtx.hpp>
+
 namespace cuopt::linear_programming::dual_simplex {
 
 template <typename i_t, typename f_t>
 void get_basis_from_vstatus(i_t m,
@@ -57,14 +59,14 @@ void get_basis_from_vstatus(i_t m,
 
 namespace {
 
-template <typename i_t, typename f_t>
+template <typename i_t, typename f_t, typename VectorI>
 void write_singleton_info(i_t m,
                           i_t col_singletons,
                           i_t row_singletons,
                           const csc_matrix_t<i_t, f_t>& B,
-                          const std::vector<i_t>& row_perm,
-                          const std::vector<i_t>& row_perm_inv,
-                          const std::vector<i_t>& col_perm)
+                          const VectorI& row_perm,
+                          const VectorI& row_perm_inv,
+                          const VectorI& col_perm)
 {
   FILE* file = fopen("singleton_debug.m", "w");
   if (file != NULL) {
@@ -94,7 +96,7 @@ void write_singleton_info(i_t m,
   fclose(file);
 }
 
-template <typename i_t, typename f_t>
+template <typename i_t, typename f_t, typename VectorI>
 void write_factor_info(const char* filename,
                        i_t m,
                        i_t row_singletons,
@@ -104,8 +106,8 @@ void write_factor_info(const char* filename,
                        const csc_matrix_t<i_t, f_t>& D,
                        const csc_matrix_t<i_t, f_t>& L,
                        const csc_matrix_t<i_t, f_t>& U,
-                       const std::vector<i_t>& row_perm,
-                       const std::vector<i_t>& col_perm)
+                       const VectorI& row_perm,
+                       const VectorI& col_perm)
 {
   FILE* file = fopen(filename, "w");
   if (file != NULL) {
@@ -165,6 +167,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
                     std::vector<i_t>& deficient,
                     std::vector<i_t>& slacks_needed)
 {
+  raft::common::nvtx::range scope("LU::factorize_basis");
   const i_t m = basic_list.size();
 
   constexpr f_t medium_tol = 1e-12;
@@ -779,6 +782,8 @@ i_t b_transpose_solve(const csc_matrix_t<i_t, f_t>& L,
   // U'*r = c
   // L'*w = r
 
+  raft::common::nvtx::range scope("LU::b_transpose_solve");
+
   // Solve for r such that U'*r = c
   std::vector<f_t> r = rhs;
   upper_triangular_transpose_solve(U, r);
diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp
index dd262622c..71dce2e39 100644
--- a/cpp/src/dual_simplex/basis_updates.cpp
+++ b/cpp/src/dual_simplex/basis_updates.cpp
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include <raft/common/nvtx.hpp>
 
 #include
 #include
@@ -35,6 +36,7 @@ i_t basis_update_t<i_t, f_t>::b_solve(const std::vector<f_t>& rhs,
                                       std::vector<f_t>& solution,
                                       std::vector<f_t>& Lsol) const
 {
+  raft::common::nvtx::range scope("LU::b_solve");
   const i_t m = L0_.m;
   assert(row_permutation_.size() == m);
   assert(rhs.size() == m);
@@ -86,6 +88,7 @@ template <typename i_t, typename f_t>
 i_t basis_update_t<i_t, f_t>::b_transpose_solve(const std::vector<f_t>& rhs,
                                                 std::vector<f_t>& solution) const
 {
+  raft::common::nvtx::range scope("LU::b_transpose_solve");
   // Observe that
   // P*B = L*U
   // B'*P' = U'*L'
@@ -2263,6 +2266,7 @@ int basis_update_mpf_t::refactor_basis(
   std::vector& nonbasic_list,
   std::vector& vstatus)
 {
+  raft::common::nvtx::range scope("LU::refactor_basis");
   std::vector deficient;
   std::vector slacks_needed;
   std::vector superbasic_list;  // Empty superbasic list
diff --git a/cpp/src/dual_simplex/bb_event.hpp b/cpp/src/dual_simplex/bb_event.hpp
new file mode 100644
index 000000000..f2e16466f
--- /dev/null
+++ b/cpp/src/dual_simplex/bb_event.hpp
@@ -0,0 +1,146 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <tuple>
+#include <vector>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+// Event types generated by B&B workers during deterministic execution
+enum class bb_event_type_t : int8_t {
+  NODE_BRANCHED   = 0,
+  NODE_FATHOMED   = 1,
+  NODE_INTEGER    = 2,
+  NODE_INFEASIBLE = 3,
+  NODE_NUMERICAL  = 4,
+};
+
+template <typename i_t, typename f_t>
+struct branched_payload_t {
+  i_t down_child_id;
+  i_t up_child_id;
+  f_t node_lower_bound;
+  i_t branch_var;
+  f_t branch_value;
+};
+
+template <typename f_t>
+struct integer_solution_payload_t {
+  f_t objective_value;
+};
+
+template <typename f_t>
+struct fathomed_payload_t {
+  f_t lower_bound;
+};
+
+template <typename i_t, typename f_t>
+struct bb_event_t {
+  bb_event_type_t type;
+  double wut;
+  int worker_id;
+  i_t node_id;
+  int event_sequence;
+
+  union {
+    branched_payload_t<i_t, f_t> branched;
+    integer_solution_payload_t<f_t> integer_solution;
+    fathomed_payload_t<f_t> fathomed;
+  } payload;
+
+  bb_event_t()
+    : type(bb_event_type_t::NODE_FATHOMED), wut(0.0), worker_id(0), node_id(0), event_sequence(0)
+  {
+    payload.fathomed = {0.0};
+  }
+
+  bool operator<(const bb_event_t& other) const
+  {
+    return std::tie(wut, worker_id, node_id, event_sequence) <
+           std::tie(other.wut, other.worker_id, other.node_id, other.event_sequence);
+  }
+
+  static bb_event_t make_branched(double work_unit_ts,
+                                  int worker,
+                                  i_t node,
+                                  i_t down_id,
+                                  i_t up_id,
+                                  f_t lower_bound,
+                                  i_t branch_var,
+                                  f_t branch_val)
+  {
+    bb_event_t e;
+    e.type             = bb_event_type_t::NODE_BRANCHED;
+    e.wut              = work_unit_ts;
+    e.worker_id        = worker;
+    e.node_id          = node;
+    e.payload.branched = {down_id, up_id, lower_bound, branch_var, branch_val};
+    return e;
+  }
+
+  static bb_event_t make_integer_solution(double work_unit_ts, int worker, i_t node, f_t objective)
+  {
+    bb_event_t e;
+    e.type                     = bb_event_type_t::NODE_INTEGER;
+    e.wut                      = work_unit_ts;
+    e.worker_id                = worker;
+    e.node_id                  = node;
+    e.payload.integer_solution = {objective};
+    return e;
+  }
+
+  static bb_event_t make_fathomed(double work_unit_ts, int worker, i_t node, f_t lower_bound)
+  {
+    bb_event_t e;
+    e.type             = bb_event_type_t::NODE_FATHOMED;
+    e.wut              = work_unit_ts;
+    e.worker_id        = worker;
+    e.node_id          = node;
+    e.payload.fathomed = {lower_bound};
+    return e;
+  }
+
+  static bb_event_t make_infeasible(double work_unit_ts, int worker, i_t node)
+  {
+    bb_event_t e;
+    e.type      = bb_event_type_t::NODE_INFEASIBLE;
+    e.wut       = work_unit_ts;
+    e.worker_id = worker;
+    e.node_id   = node;
+    return e;
+  }
+
+  static bb_event_t make_numerical(double work_unit_ts, int worker, i_t node)
+  {
+    bb_event_t e;
+    e.type      = bb_event_type_t::NODE_NUMERICAL;
+    e.wut       = work_unit_ts;
+    e.worker_id = worker;
+    e.node_id   = node;
+    return e;
+  }
+};
+
+template <typename i_t, typename f_t>
+struct bb_event_batch_t {
+  std::vector<bb_event_t<i_t, f_t>> events;
+
+  void clear() { events.clear(); }
+
+  void add(bb_event_t<i_t, f_t> event) { events.push_back(std::move(event)); }
+
+  void sort_for_replay() { std::sort(events.begin(), events.end()); }
+
+  size_t size() const { return events.size(); }
+  bool empty() const { return events.empty(); }
+};
+
+}  // namespace cuopt::linear_programming::dual_simplex
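The comparator's `(wut, worker_id, node_id, event_sequence)` key is what makes replay independent of thread scheduling: any interleaving of the same events sorts to the same sequence. A standalone usage sketch; the include path assumes the file added above:

#include <cassert>
#include "dual_simplex/bb_event.hpp"

using namespace cuopt::linear_programming::dual_simplex;

// Two workers emit events out of order; sort_for_replay() recovers a single
// canonical sequence keyed on (wut, worker_id, node_id, event_sequence).
int main()
{
  bb_event_batch_t<int, double> batch;
  batch.add(bb_event_t<int, double>::make_fathomed(
    /*work_unit_ts=*/0.42, /*worker=*/1, /*node=*/7, /*lower_bound=*/3.5));
  batch.add(bb_event_t<int, double>::make_integer_solution(
    /*work_unit_ts=*/0.17, /*worker=*/0, /*node=*/3, /*objective=*/12.0));
  batch.sort_for_replay();
  assert(batch.events.front().type == bb_event_type_t::NODE_INTEGER);  // earliest wut first
  return 0;
}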
diff --git a/cpp/src/dual_simplex/bounds_strengthening.cpp b/cpp/src/dual_simplex/bounds_strengthening.cpp
index 2b20940d2..37ab114a7 100644
--- a/cpp/src/dual_simplex/bounds_strengthening.cpp
+++ b/cpp/src/dual_simplex/bounds_strengthening.cpp
@@ -102,6 +102,13 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
   std::vector<bool> variable_changed(n, false);
   std::vector<bool> constraint_changed_next(m, false);
 
+  auto& A_i    = A.i.underlying();
+  auto& A_x    = A.x.underlying();
+  auto& Arow_j = Arow.j.underlying();
+  auto& Arow_x = Arow.x.underlying();
+
+  size_t nnz_processed = 0;
+
   if (!bounds_changed.empty()) {
     std::fill(constraint_changed.begin(), constraint_changed.end(), false);
     for (i_t j = 0; j < n; ++j) {
@@ -109,7 +116,7 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
       const i_t col_start = A.col_start[j];
       const i_t col_end   = A.col_start[j + 1];
       for (i_t p = col_start; p < col_end; ++p) {
-        const i_t i           = A.i[p];
+        const i_t i           = A_i[p];
         constraint_changed[i] = true;
       }
     }
@@ -127,12 +134,13 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
     if (!constraint_changed[i]) { continue; }
     const i_t row_start = Arow.row_start[i];
     const i_t row_end   = Arow.row_start[i + 1];
+    nnz_processed += (row_end - row_start);
 
     f_t min_a = 0.0;
     f_t max_a = 0.0;
 
     for (i_t p = row_start; p < row_end; ++p) {
-      const i_t j    = Arow.j[p];
-      const f_t a_ij = Arow.x[p];
+      const i_t j    = Arow_j[p];
+      const f_t a_ij = Arow_x[p];
       variable_changed[j] = true;
 
       if (a_ij > 0) {
@@ -162,6 +170,7 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
           cnst_ub,
           min_a,
           max_a);
+        last_nnz_processed = nnz_processed;
         return false;
       }
@@ -181,11 +190,12 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
       const i_t col_start = A.col_start[k];
       const i_t col_end   = A.col_start[k + 1];
+      nnz_processed += (col_end - col_start);
 
       for (i_t p = col_start; p < col_end; ++p) {
-        const i_t i = A.i[p];
+        const i_t i = A_i[p];
         if (!constraint_changed[i]) { continue; }
 
-        const f_t a_ik = A.x[p];
+        const f_t a_ik = A_x[p];
 
         f_t delta_min_act = delta_min_activity[i];
         f_t delta_max_act = delta_max_activity[i];
@@ -213,6 +223,7 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
         if (new_lb > new_ub + settings.primal_tol) {
           settings.log.debug(
             "Iter:: %d, Infeasible variable after update %d, %e > %e\n", iter, k, new_lb, new_ub);
+          last_nnz_processed = nnz_processed;
           return false;
         }
         if (new_lb != old_lb || new_ub != old_ub) {
@@ -280,6 +291,7 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
   lower_bounds = lower;
   upper_bounds = upper;
 
+  last_nnz_processed = nnz_processed;
   return true;
 }
diff --git a/cpp/src/dual_simplex/bounds_strengthening.hpp b/cpp/src/dual_simplex/bounds_strengthening.hpp
index b811fb1c1..009f7b243 100644
--- a/cpp/src/dual_simplex/bounds_strengthening.hpp
+++ b/cpp/src/dual_simplex/bounds_strengthening.hpp
@@ -27,6 +27,8 @@ class bounds_strengthening_t {
                             std::vector<f_t>& lower_bounds,
                             std::vector<f_t>& upper_bounds);
 
+  size_t last_nnz_processed{0};
+
  private:
  const csc_matrix_t<i_t, f_t>& A;
  const csr_matrix_t<i_t, f_t>& Arow;
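`last_nnz_processed` exists so the deterministic codepath can charge propagation cost to the work-unit clock rather than wall time (see the `record_work(... / 1e8)` call in `solve_node_deterministic` further down). The accounting idea in isolation, with a hypothetical context type standing in for the real one:

#include <cstddef>

// Hypothetical stand-in for the diff's work-unit context: work units are a
// deterministic, machine-independent cost measure, here "nonzeros touched".
struct work_context_t {
  double global_work_units_elapsed = 0.0;
  void record_work(double units) { global_work_units_elapsed += units; }
};

// Charge a bounds-strengthening pass at ~1e8 nonzeros per work unit, the
// same provisional conversion the diff labels "TEMP APPROXIMATION".
void charge_propagation(work_context_t& ctx, std::size_t nnz_processed)
{
  ctx.record_work(static_cast<double>(nnz_processed) / 1e8);
}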
diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index c6e8b0d17..de8a01953 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -20,6 +20,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #include
 
@@ -30,10 +32,15 @@
 #include
 #include
 #include
+#include
 #include
 #include
+#include
+#include
 #include
 
+// #define DETERMINISM_DISABLE_BOUNDS_STRENGTHENING
+
 namespace cuopt::linear_programming::dual_simplex {
 
 namespace {
@@ -145,6 +152,8 @@ dual::status_t convert_lp_status_to_dual_status(lp_status_t status)
     return dual::status_t::ITERATION_LIMIT;
   } else if (status == lp_status_t::TIME_LIMIT) {
     return dual::status_t::TIME_LIMIT;
+  } else if (status == lp_status_t::WORK_LIMIT) {
+    return dual::status_t::WORK_LIMIT;
   } else if (status == lp_status_t::NUMERICAL_ISSUES) {
     return dual::status_t::NUMERICAL;
   } else if (status == lp_status_t::CUTOFF) {
@@ -469,6 +478,47 @@ void branch_and_bound_t<i_t, f_t>::set_new_solution(const std::vector<f_t>& solu
   }
 }
 
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::queue_external_solution_deterministic(
+  const std::vector<f_t>& solution, double work_unit_ts)
+{
+  // In deterministic mode, queue the solution to be processed at the correct work unit timestamp.
+  // This ensures deterministic ordering of solution events.
+
+  if (solution.size() != original_problem_.num_cols) {
+    settings_.log.printf(
+      "Solution size mismatch %ld %d\n", solution.size(), original_problem_.num_cols);
+    return;
+  }
+
+  mutex_original_lp_.lock();
+  std::vector<f_t> crushed_solution;
+  crush_primal_solution(original_problem_, original_lp_, solution, new_slacks_, crushed_solution);
+  f_t obj = compute_objective(original_lp_, crushed_solution);
+
+  // Validate solution before queueing
+  f_t primal_err;
+  f_t bound_err;
+  i_t num_fractional;
+  bool is_feasible = check_guess(
+    original_lp_, settings_, var_types_, crushed_solution, primal_err, bound_err, num_fractional);
+  mutex_original_lp_.unlock();
+
+  if (!is_feasible) {
+    // Queue for repair
+    mutex_repair_.lock();
+    repair_queue_.push_back(crushed_solution);
+    mutex_repair_.unlock();
+    return;
+  }
+
+  // Queue the solution with its work unit timestamp
+  mutex_heuristic_queue_.lock();
+  heuristic_solution_queue_.push_back({std::move(crushed_solution), obj, work_unit_ts});
+  mutex_heuristic_queue_.unlock();
+}
+
 template <typename i_t, typename f_t>
 bool branch_and_bound_t<i_t, f_t>::repair_solution(const std::vector<f_t>& edge_norms,
                                                    const std::vector<f_t>& potential_solution,
@@ -535,6 +585,7 @@ template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::repair_heuristic_solutions()
 {
+  raft::common::nvtx::range scope("BB::repair_heuristics");
   // Check if there are any solutions to repair
   std::vector<std::vector<f_t>> to_repair;
   mutex_repair_.lock();
@@ -612,10 +663,17 @@ void branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t, f_t>&
   if (solver_status_ == mip_status_t::TIME_LIMIT) {
     settings_.log.printf("Time limit reached. Stopping the solver...\n");
   }
+  if (solver_status_ == mip_status_t::WORK_LIMIT) {
+    settings_.log.printf("Work limit reached. Stopping the solver...\n");
+  }
   if (solver_status_ == mip_status_t::NODE_LIMIT) {
     settings_.log.printf("Node limit reached. Stopping the solver...\n");
   }
 
+  if (settings_.heuristic_preemption_callback != nullptr) {
+    settings_.heuristic_preemption_callback();
+  }
+
   f_t gap        = upper_bound_ - lower_bound;
   f_t obj        = compute_user_objective(original_lp_, upper_bound_.load());
   f_t user_bound = compute_user_objective(original_lp_, lower_bound);
@@ -777,6 +835,362 @@ branch_variable_t<i_t> branch_and_bound_t<i_t, f_t>::variable_selection(
   }
 }
 
+// ============================================================================
+// Policies for update_tree
+// These allow sharing the tree update logic between the default and deterministic codepaths
+// ============================================================================
+
+template <typename i_t, typename f_t>
+struct opportunistic_tree_update_callbacks_t {
+  branch_and_bound_t<i_t, f_t>& bnb;
+  branch_and_bound_worker_t<i_t, f_t>* worker;
+  logger_t& log;
+
+  f_t upper_bound() const { return bnb.upper_bound_.load(); }
+
+  void update_pseudo_costs(mip_node_t<i_t, f_t>* node, f_t leaf_obj)
+  {
+    bnb.pc_.update_pseudo_costs(node, leaf_obj);
+  }
+
+  void handle_integer_solution(mip_node_t<i_t, f_t>* node, f_t obj, const std::vector<f_t>& x)
+  {
+    bnb.add_feasible_solution(obj, x, node->depth, worker->search_strategy);
+  }
+
+  branch_variable_t<i_t> select_branch_variable(mip_node_t<i_t, f_t>* node,
+                                                const std::vector<i_t>& fractional,
+                                                const std::vector<f_t>&)
+  {
+    return bnb.variable_selection(node, fractional, worker);
+  }
+
+  void update_objective_estimate(mip_node_t<i_t, f_t>* node,
+                                 const std::vector<i_t>& fractional,
+                                 const std::vector<f_t>& x)
+  {
+    if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
+      logger_t pc_log;
+      pc_log.log = false;
+      node->objective_estimate = bnb.pc_.obj_estimate(fractional, x, node->lower_bound, pc_log);
+    }
+  }
+
+  void on_numerical_issue(mip_node_t<i_t, f_t>* node)
+  {
+    if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
+      fetch_min(bnb.lower_bound_ceiling_, node->lower_bound);
+      log.printf("LP returned numerical issue on node %d. Best bound set to %+10.6e.\n",
+                 node->node_id,
+                 compute_user_objective(bnb.original_lp_, bnb.lower_bound_ceiling_.load()));
+    }
+  }
+
+  void graphviz(search_tree_t<i_t, f_t>& tree,
+                mip_node_t<i_t, f_t>* node,
+                const char* label,
+                f_t value)
+  {
+    tree.graphviz_node(log, node, label, value);
+  }
+
+  void on_optimal_callback(const std::vector<f_t>& x, f_t objective)
+  {
+    if (worker->search_strategy == search_strategy_t::BEST_FIRST &&
+        bnb.settings_.node_processed_callback != nullptr) {
+      std::vector<f_t> original_x;
+      uncrush_primal_solution(bnb.original_problem_, bnb.original_lp_, x, original_x);
+      bnb.settings_.node_processed_callback(original_x, objective);
+    }
+  }
+
+  void on_node_completed(mip_node_t<i_t, f_t>*, node_status_t, rounding_direction_t) {}
+};
+
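The shared `update_tree_impl` below is parameterized on these policy structs rather than a virtual interface: each codepath supplies hooks, and everything resolves at compile time. The pattern in miniature (all names hypothetical):

#include <cstdio>

// Stripped-down illustration of the policy pattern used here: the algorithm
// template calls hook methods, and each codepath supplies its own struct.
struct verbose_policy_t {
  double upper_bound() const { return 10.0; }
  void on_done(int node_id) { std::printf("node %d done\n", node_id); }
};

template <typename Policy>
void process_node(int node_id, Policy& policy)
{
  if (policy.upper_bound() < 1e30) {
    policy.on_done(node_id);  // hook supplied by the policy, no virtual call
  }
}

int main()
{
  verbose_policy_t p;
  process_node(7, p);
}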
+template <typename i_t, typename f_t, typename WorkerT>
+struct determinism_tree_update_policy_base_t {
+  branch_and_bound_t<i_t, f_t>& bnb;
+  WorkerT& worker;
+
+  f_t upper_bound() const { return worker.local_upper_bound; }
+
+  void update_pseudo_costs(mip_node_t<i_t, f_t>* node, f_t leaf_obj)
+  {
+    if (node->branch_var < 0) return;
+    f_t change = std::max(leaf_obj - node->lower_bound, f_t(0));
+    f_t frac   = node->branch_dir == rounding_direction_t::DOWN
+                   ? node->fractional_val - std::floor(node->fractional_val)
+                   : std::ceil(node->fractional_val) - node->fractional_val;
+    if (frac > 1e-10) {
+      worker.queue_pseudo_cost_update(node->branch_var, node->branch_dir, change / frac);
+    }
+  }
+
+  void on_numerical_issue(mip_node_t<i_t, f_t>*) {}
+  void graphviz(search_tree_t<i_t, f_t>&, mip_node_t<i_t, f_t>*, const char*, f_t) {}
+  void on_optimal_callback(const std::vector<f_t>&, f_t) {}
+};
+
+template <typename i_t, typename f_t>
+struct determinism_bfs_tree_update_callbacks_t
+  : determinism_tree_update_policy_base_t<i_t, f_t, determinism_bfs_worker_t<i_t, f_t>> {
+  void handle_integer_solution(mip_node_t<i_t, f_t>* node, f_t obj, const std::vector<f_t>& x)
+  {
+    if (obj < this->worker.local_upper_bound) {
+      this->worker.local_upper_bound = obj;
+      this->worker.integer_solutions.push_back(
+        {obj, x, node->depth, this->worker.worker_id, this->worker.next_solution_seq++});
+    }
+  }
+
+  branch_variable_t<i_t> select_branch_variable(mip_node_t<i_t, f_t>*,
+                                                const std::vector<i_t>& fractional,
+                                                const std::vector<f_t>& x)
+  {
+    i_t var  = this->worker.variable_selection_from_snapshot(fractional, x);
+    auto dir = martin_criteria(x[var], this->bnb.root_relax_soln_.x[var]);
+    return {var, dir};
+  }
+
+  void update_objective_estimate(mip_node_t<i_t, f_t>* node,
+                                 const std::vector<i_t>& fractional,
+                                 const std::vector<f_t>& x)
+  {
+    node->objective_estimate =
+      obj_estimate_from_arrays(this->worker.pc_sum_down_snapshot.data(),
+                               this->worker.pc_sum_up_snapshot.data(),
+                               this->worker.pc_num_down_snapshot.data(),
+                               this->worker.pc_num_up_snapshot.data(),
+                               (i_t)this->worker.pc_sum_down_snapshot.size(),
+                               fractional,
+                               x,
+                               node->lower_bound);
+  }
+
+  void on_node_completed(mip_node_t<i_t, f_t>* node, node_status_t status, rounding_direction_t dir)
+  {
+    switch (status) {
+      case node_status_t::INFEASIBLE: this->worker.record_infeasible(node); break;
+      case node_status_t::FATHOMED: this->worker.record_fathomed(node, node->lower_bound); break;
+      case node_status_t::INTEGER_FEASIBLE:
+        this->worker.record_integer_solution(node, node->lower_bound);
+        break;
+      case node_status_t::HAS_CHILDREN:
+        this->worker.record_branched(node,
+                                     node->get_down_child()->node_id,
+                                     node->get_up_child()->node_id,
+                                     node->branch_var,
+                                     node->fractional_val);
+        this->bnb.exploration_stats_.nodes_unexplored += 2;
+        this->worker.enqueue_children_for_plunge(node->get_down_child(), node->get_up_child(), dir);
+        break;
+      case node_status_t::NUMERICAL: this->worker.record_numerical(node); break;
+      default: break;
+    }
+    if (status != node_status_t::HAS_CHILDREN) { this->worker.recompute_bounds_and_basis = true; }
+  }
+
+  void on_numerical_issue(mip_node_t<i_t, f_t>* node)
+  {
+    this->worker.local_lower_bound_ceiling =
+      std::min(node->lower_bound, this->worker.local_lower_bound_ceiling);
+  }
+};
+
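The `change / frac` update in the policy base is the classic pseudo-cost measure: objective gain per unit of fractionality removed. A worked instance: branching down on x = 3.7 removes 0.7 of fractionality, so an LP bound increase of 0.14 records a unit cost of 0.2.

#include <cassert>
#include <cmath>

// Worked instance of the pseudo-cost update used above:
// unit_cost = (child LP bound - parent LP bound) / distance-to-integrality.
int main()
{
  double fractional_val = 3.7;
  double parent_bound   = 10.00;
  double child_bound    = 10.14;  // LP bound after branching down
  double frac   = fractional_val - std::floor(fractional_val);  // 0.7
  double change = child_bound - parent_bound;                   // 0.14
  assert(std::abs(change / frac - 0.2) < 1e-12);
  return 0;
}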
+template <typename i_t, typename f_t>
+struct determinism_diving_tree_update_callbacks_t
+  : determinism_tree_update_policy_base_t<i_t, f_t, determinism_diving_worker_t<i_t, f_t>> {
+  std::deque<mip_node_t<i_t, f_t>*>& stack;
+  i_t max_backtrack_depth;
+
+  void handle_integer_solution(mip_node_t<i_t, f_t>* node, f_t obj, const std::vector<f_t>& x)
+  {
+    if (obj < this->worker.local_upper_bound) {
+      this->worker.local_upper_bound = obj;
+      this->worker.queue_integer_solution(obj, x, node->depth);
+    }
+  }
+
+  branch_variable_t<i_t> select_branch_variable(mip_node_t<i_t, f_t>*,
+                                                const std::vector<i_t>& fractional,
+                                                const std::vector<f_t>& x)
+  {
+    switch (this->worker.diving_type) {
+      case search_strategy_t::PSEUDOCOST_DIVING:
+        return this->worker.variable_selection_from_snapshot(fractional, x);
+
+      case search_strategy_t::LINE_SEARCH_DIVING:
+        if (this->worker.root_solution) {
+          logger_t log;
+          log.log = false;
+          return line_search_diving(fractional, x, *this->worker.root_solution, log);
+        }
+        return this->worker.variable_selection_from_snapshot(fractional, x);
+
+      case search_strategy_t::GUIDED_DIVING:
+        return this->worker.guided_variable_selection(fractional, x);
+
+      case search_strategy_t::COEFFICIENT_DIVING: {
+        logger_t log;
+        log.log = false;
+        return coefficient_diving(this->bnb.original_lp_,
+                                  fractional,
+                                  x,
+                                  this->bnb.var_up_locks_,
+                                  this->bnb.var_down_locks_,
+                                  log);
+      }
+
+      default: return this->worker.variable_selection_from_snapshot(fractional, x);
+    }
+  }
+
+  void update_objective_estimate(mip_node_t<i_t, f_t>* node,
+                                 const std::vector<i_t>& fractional,
+                                 const std::vector<f_t>& x)
+  {
+  }
+
+  void on_node_completed(mip_node_t<i_t, f_t>* node, node_status_t status, rounding_direction_t dir)
+  {
+    if (status == node_status_t::HAS_CHILDREN) {
+      if (dir == rounding_direction_t::UP) {
+        stack.push_front(node->get_down_child());
+        stack.push_front(node->get_up_child());
+      } else {
+        stack.push_front(node->get_up_child());
+        stack.push_front(node->get_down_child());
+      }
+      if (stack.size() > 1 && stack.front()->depth - stack.back()->depth > max_backtrack_depth) {
+        stack.pop_back();
+      }
+      this->worker.recompute_bounds_and_basis = false;
+    } else {
+      this->worker.recompute_bounds_and_basis = true;
+    }
+  }
+};
+
+template <typename i_t, typename f_t>
+template <typename WorkerT, typename Policy>
+std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::update_tree_impl(
+  mip_node_t<i_t, f_t>* node_ptr,
+  search_tree_t<i_t, f_t>& search_tree,
+  WorkerT* worker,
+  dual::status_t lp_status,
+  Policy& policy)
+{
+  constexpr f_t inf        = std::numeric_limits<f_t>::infinity();
+  const f_t abs_fathom_tol = settings_.absolute_mip_gap_tol / 10;
+  lp_problem_t<i_t, f_t>& leaf_problem   = worker->leaf_problem;
+  lp_solution_t<i_t, f_t>& leaf_solution = worker->leaf_solution;
+  const f_t upper_bound          = policy.upper_bound();
+  node_status_t status           = node_status_t::PENDING;
+  rounding_direction_t round_dir = rounding_direction_t::NONE;
+
+  if (lp_status == dual::status_t::DUAL_UNBOUNDED) {
+    node_ptr->lower_bound = inf;
+    policy.graphviz(search_tree, node_ptr, "infeasible", 0.0);
+    search_tree.update(node_ptr, node_status_t::INFEASIBLE);
+    status = node_status_t::INFEASIBLE;
+
+  } else if (lp_status == dual::status_t::CUTOFF) {
+    f_t leaf_obj          = compute_objective(leaf_problem, leaf_solution.x);
+    node_ptr->lower_bound = upper_bound;
+    policy.graphviz(search_tree, node_ptr, "cut off", leaf_obj);
+    search_tree.update(node_ptr, node_status_t::FATHOMED);
+    status = node_status_t::FATHOMED;
+
+  } else if (lp_status == dual::status_t::OPTIMAL) {
+    std::vector<i_t> leaf_fractional;
+    i_t num_frac = fractional_variables(settings_, leaf_solution.x, var_types_, leaf_fractional);
+
+#ifdef DEBUG_FRACTIONAL_FIXED
+    for (i_t j : leaf_fractional) {
+      if (leaf_problem.lower[j] == leaf_problem.upper[j]) {
+        printf(
+          "Node %d: Fixed variable %d has a fractional value %e. Lower %e upper %e. Variable "
+          "status %d\n",
+          node_ptr->node_id,
+          j,
+          leaf_solution.x[j],
+          leaf_problem.lower[j],
+          leaf_problem.upper[j],
+          node_ptr->vstatus[j]);
+      }
+    }
+#endif
+
+    f_t leaf_obj = compute_objective(leaf_problem, leaf_solution.x);
+
+    policy.graphviz(search_tree, node_ptr, "lower bound", leaf_obj);
+    policy.update_pseudo_costs(node_ptr, leaf_obj);
+    node_ptr->lower_bound = leaf_obj;
+    policy.on_optimal_callback(leaf_solution.x, leaf_obj);
+
+    if (num_frac == 0) {
+      policy.handle_integer_solution(node_ptr, leaf_obj, leaf_solution.x);
+      policy.graphviz(search_tree, node_ptr, "integer feasible", leaf_obj);
+      search_tree.update(node_ptr, node_status_t::INTEGER_FEASIBLE);
+      status = node_status_t::INTEGER_FEASIBLE;
+
+    } else if (leaf_obj <= upper_bound + abs_fathom_tol) {
+      auto [branch_var, dir] =
+        policy.select_branch_variable(node_ptr, leaf_fractional, leaf_solution.x);
+      round_dir = dir;
+
+      assert(node_ptr->vstatus.size() == leaf_problem.num_cols);
+      assert(branch_var >= 0);
+      assert(dir != rounding_direction_t::NONE);
+
+      policy.update_objective_estimate(node_ptr, leaf_fractional, leaf_solution.x);
+
+      logger_t log;
+      log.log = false;
+      search_tree.branch(node_ptr,
+                         branch_var,
+                         leaf_solution.x[branch_var],
+                         num_frac,
+                         node_ptr->vstatus,
+                         leaf_problem,
+                         log);
+      search_tree.update(node_ptr, node_status_t::HAS_CHILDREN);
+      status = node_status_t::HAS_CHILDREN;
+
+    } else {
+      policy.graphviz(search_tree, node_ptr, "fathomed", leaf_obj);
+      search_tree.update(node_ptr, node_status_t::FATHOMED);
+      status = node_status_t::FATHOMED;
+    }
+  } else if (lp_status == dual::status_t::TIME_LIMIT) {
+    policy.graphviz(search_tree, node_ptr, "timeout", 0.0);
+    status = node_status_t::PENDING;
+  } else if (lp_status == dual::status_t::WORK_LIMIT) {
+    policy.graphviz(search_tree, node_ptr, "work limit", 0.0);
+    status = node_status_t::PENDING;
+  } else {
+    policy.on_numerical_issue(node_ptr);
+    policy.graphviz(search_tree, node_ptr, "numerical", 0.0);
+    search_tree.update(node_ptr, node_status_t::NUMERICAL);
+    status = node_status_t::NUMERICAL;
+  }
+
+  policy.on_node_completed(node_ptr, status, round_dir);
+  return {status, round_dir};
+}
+
+template <typename i_t, typename f_t>
+std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::update_tree(
+  mip_node_t<i_t, f_t>* node_ptr,
+  search_tree_t<i_t, f_t>& search_tree,
+  branch_and_bound_worker_t<i_t, f_t>* worker,
+  dual::status_t lp_status,
+  logger_t& log)
+{
+  opportunistic_tree_update_callbacks_t<i_t, f_t> policy{*this, worker, log};
+  return update_tree_impl(node_ptr, search_tree, worker, lp_status, policy);
+}
+
 template <typename i_t, typename f_t>
 dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
   mip_node_t<i_t, f_t>* node_ptr,
@@ -784,6 +1198,7 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
   branch_and_bound_stats_t<i_t, f_t>& stats,
   logger_t& log)
 {
+  raft::common::nvtx::range scope("BB::solve_node");
 #ifdef DEBUG_BRANCHING
   i_t num_integer_variables = 0;
   for (i_t j = 0; j < original_lp_.num_cols; j++) {
@@ -906,129 +1321,6 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
   return lp_status;
 }
 
-template <typename i_t, typename f_t>
-std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::update_tree(
-  mip_node_t<i_t, f_t>* node_ptr,
-  search_tree_t<i_t, f_t>& search_tree,
-  branch_and_bound_worker_t<i_t, f_t>* worker,
-  dual::status_t lp_status,
-  logger_t& log)
-{
-  const f_t abs_fathom_tol = settings_.absolute_mip_gap_tol / 10;
-  std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
-  lp_problem_t<i_t, f_t>& leaf_problem         = worker->leaf_problem;
-  lp_solution_t<i_t, f_t>& leaf_solution       = worker->leaf_solution;
-
-  if (lp_status == dual::status_t::DUAL_UNBOUNDED) {
-    // Node was infeasible. Do not branch
-    node_ptr->lower_bound = inf;
-    search_tree.graphviz_node(log, node_ptr, "infeasible", 0.0);
-    search_tree.update(node_ptr, node_status_t::INFEASIBLE);
-    return {node_status_t::INFEASIBLE, rounding_direction_t::NONE};
-
-  } else if (lp_status == dual::status_t::CUTOFF) {
-    // Node was cut off. Do not branch
-    node_ptr->lower_bound = upper_bound_;
-    f_t leaf_objective    = compute_objective(leaf_problem, leaf_solution.x);
-    search_tree.graphviz_node(log, node_ptr, "cut off", leaf_objective);
-    search_tree.update(node_ptr, node_status_t::FATHOMED);
-    return {node_status_t::FATHOMED, rounding_direction_t::NONE};
-
-  } else if (lp_status == dual::status_t::OPTIMAL) {
-    // LP was feasible
-    std::vector<i_t> leaf_fractional;
-    i_t leaf_num_fractional =
-      fractional_variables(settings_, leaf_solution.x, var_types_, leaf_fractional);
-
-#ifdef DEBUG_FRACTIONAL_FIXED
-    // Check if any of the fractional variables were fixed to their bounds
-    for (i_t j : leaf_fractional) {
-      if (leaf_problem.lower[j] == leaf_problem.upper[j]) {
-        printf(
-          "Node %d: Fixed variable %d has a fractional value %e. Lower %e upper %e. Variable "
-          "status %d\n",
-          node_ptr->node_id,
-          j,
-          leaf_solution.x[j],
-          leaf_problem.lower[j],
-          leaf_problem.upper[j],
-          leaf_vstatus[j]);
-      }
-    }
-#endif
-
-    f_t leaf_objective = compute_objective(leaf_problem, leaf_solution.x);
-    search_tree.graphviz_node(log, node_ptr, "lower bound", leaf_objective);
-    pc_.update_pseudo_costs(node_ptr, leaf_objective);
-    node_ptr->lower_bound = leaf_objective;
-
-    if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
-      if (settings_.node_processed_callback != nullptr) {
-        std::vector<f_t> original_x;
-        uncrush_primal_solution(original_problem_, original_lp_, leaf_solution.x, original_x);
-        settings_.node_processed_callback(original_x, leaf_objective);
-      }
-    }
-
-    if (leaf_num_fractional == 0) {
-      // Found an integer feasible solution
-      add_feasible_solution(
-        leaf_objective, leaf_solution.x, node_ptr->depth, worker->search_strategy);
-      search_tree.graphviz_node(log, node_ptr, "integer feasible", leaf_objective);
-      search_tree.update(node_ptr, node_status_t::INTEGER_FEASIBLE);
-      return {node_status_t::INTEGER_FEASIBLE, rounding_direction_t::NONE};
-
-    } else if (leaf_objective <= upper_bound_ + abs_fathom_tol) {
-      // Choose fractional variable to branch on
-      auto [branch_var, round_dir] = variable_selection(node_ptr, leaf_fractional, worker);
-
-      assert(leaf_vstatus.size() == leaf_problem.num_cols);
-      assert(branch_var >= 0);
-      assert(round_dir != rounding_direction_t::NONE);
-
-      // Note that the exploration thread is the only one that can insert new nodes into the heap,
-      // and thus, we only need to calculate the objective estimate here (it is used for
-      // sorting the nodes for diving).
-      if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
-        logger_t pc_log;
-        pc_log.log = false;
-        node_ptr->objective_estimate =
-          pc_.obj_estimate(leaf_fractional, leaf_solution.x, node_ptr->lower_bound, pc_log);
-      }
-
-      search_tree.branch(node_ptr,
-                         branch_var,
-                         leaf_solution.x[branch_var],
-                         leaf_num_fractional,
-                         leaf_vstatus,
-                         leaf_problem,
-                         log);
-      search_tree.update(node_ptr, node_status_t::HAS_CHILDREN);
-      return {node_status_t::HAS_CHILDREN, round_dir};
-
-    } else {
-      search_tree.graphviz_node(log, node_ptr, "fathomed", leaf_objective);
-      search_tree.update(node_ptr, node_status_t::FATHOMED);
-      return {node_status_t::FATHOMED, rounding_direction_t::NONE};
-    }
-  } else {
-    if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
-      fetch_min(lower_bound_ceiling_, node_ptr->lower_bound);
-      log.printf(
-        "LP returned status %d on node %d. This indicates a numerical issue. The best bound is set "
-        "to "
-        "%+10.6e.\n",
-        lp_status,
-        node_ptr->node_id,
-        compute_user_objective(original_lp_, lower_bound_ceiling_.load()));
-    }
-
-    search_tree.graphviz_node(log, node_ptr, "numerical", 0.0);
-    search_tree.update(node_ptr, node_status_t::NUMERICAL);
-    return {node_status_t::NUMERICAL, rounding_direction_t::NONE};
-  }
-}
-
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::plunge_with(branch_and_bound_worker_t<i_t, f_t>* worker)
 {
@@ -1133,6 +1425,7 @@ void branch_and_bound_t<i_t, f_t>::plunge_with(branch_and_bound_worker_t<i_t, f
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::dive_with(branch_and_bound_worker_t<i_t, f_t>* worker)
 {
+  raft::common::nvtx::range scope("BB::diving_thread");
   logger_t log;
   log.log = false;
 
@@ -1560,6 +1853,8 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
 template <typename i_t, typename f_t>
 mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solution)
 {
+  raft::common::nvtx::range scope("BB::solve");
+
   logger_t log;
   log.log        = false;
   log.log_prefix = settings_.log.log_prefix;
@@ -1570,6 +1865,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   original_lp_.A.to_compressed_row(Arow_);
 
   if (guess_.size() != 0) {
+    raft::common::nvtx::range scope_guess("BB::check_initial_guess");
     std::vector<f_t> crushed_guess;
     crush_primal_solution(original_problem_, original_lp_, guess_, new_slacks_, crushed_guess);
     f_t primal_err;
@@ -1646,6 +1942,12 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     return solver_status_;
   }
 
+  if (root_status == lp_status_t::WORK_LIMIT) {
+    solver_status_ = mip_status_t::WORK_LIMIT;
+    set_final_solution(solution, -inf);
+    return solver_status_;
+  }
+
   if (root_status == lp_status_t::NUMERICAL_ISSUES) {
     solver_status_ = mip_status_t::NUMERICAL;
     set_final_solution(solution, -inf);
@@ -1965,17 +2267,20 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   set_uninitialized_steepest_edge_norms(original_lp_, basic_list, edge_norms_);
 
   pc_.resize(original_lp_.num_cols);
-  strong_branching(original_problem_,
-                   original_lp_,
-                   settings_,
-                   exploration_stats_.start_time,
-                   var_types_,
-                   root_relax_soln_.x,
-                   fractional,
-                   root_objective_,
-                   root_vstatus_,
-                   edge_norms_,
-                   pc_);
+  {
+    raft::common::nvtx::range scope_sb("BB::strong_branching");
+    strong_branching(original_problem_,
+                     original_lp_,
+                     settings_,
+                     exploration_stats_.start_time,
+                     var_types_,
+                     root_relax_soln_.x,
+                     fractional,
+                     root_objective_,
+                     root_vstatus_,
+                     edge_norms_,
+                     pc_);
+  }
 
   if (toc(exploration_stats_.start_time) > settings_.time_limit) {
     solver_status_ = mip_status_t::TIME_LIMIT;
@@ -2055,30 +2360,1185 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
   }
 
-  settings_.log.printf(
-    " | Explored | Unexplored |  Objective  |    Bound    | IntInf | Depth | Iter/Node |   Gap   "
-    "|  Time  |\n");
-
-  if (settings_.num_threads > 1) {
+  if (settings_.deterministic) {
+    run_determinism_coordinator(Arow_);
+  } else if (settings_.num_threads > 1) {
 #pragma omp parallel num_threads(settings_.num_threads)
     {
 #pragma omp master
       run_scheduler();
     }
-
   } else {
     single_threaded_solve();
   }
 
   is_running_ = false;
 
-  f_t lower_bound = node_queue_.best_first_queue_size() > 0 ? node_queue_.get_lower_bound()
-                                                            : search_tree_.root.lower_bound;
+  // Compute final lower bound
+  f_t lower_bound;
+  if (determinism_mode_enabled_) {
+    lower_bound = determinism_compute_lower_bound();
+    if (lower_bound == std::numeric_limits<f_t>::infinity() && incumbent_.has_incumbent) {
+      lower_bound = upper_bound_.load();
+    }
+    solver_status_ = determinism_global_termination_status_;
+  } else {
+    lower_bound = node_queue_.best_first_queue_size() > 0 ? node_queue_.get_lower_bound()
+                                                          : search_tree_.root.lower_bound;
+  }
 
   set_final_solution(solution, lower_bound);
 
   return solver_status_;
 }
 
+// ============================================================================
+// Deterministic implementation
+// ============================================================================
+
+// The deterministic BSP model lets independent workers execute during virtual-time
+// intervals and exchange data at serialized interval sync points.
+/*
+
+Work Units:   0                               0.5                              1.0
+              │                               │                                │
+              │◄──────── Horizon 0 ──────────►│◄───────── Horizon 1 ──────────►│
+              │                               │                                │
+══════════════╪═══════════════════════════════╪════════════════════════════════╪════
+              │                               │                                │
+              │                        ┌──────────────┐                 ┌──────────────┐
+ BFS Worker 0 │ ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ │              │ ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ │              │
+  ├ plunge    │  explore nodes         │              │  explore nodes   │              │
+  │   stack   │  emit events (wut)     │              │  emit events     │              │
+  ├ backlog   │                        │   SYNC S1    │                  │   SYNC S2    │
+  │   heap    │                        │              │                  │              │
+  ├ PC snap   │                        │  • Sort by   │                  │  • Sort by   │
+  ├ events[]  │                        │    (wut, w,  │                  │    (wut, w,  │
+  └ solutions[]                        │     seq)     │                  │     seq)     │
+──────────────┼────────────────────────│  • Replay    │──────────────────│  • Replay    │
+              │                        │  • Merge PC  │                  │  • Merge PC  │
+ BFS Worker 1 │ ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ │  • Merge sols│ ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ │  • Merge sols│
+  ├ plunge    │  explore nodes         │  • Prune     │  explore nodes   │  • Prune     │
+  │   stack   │  emit events (wut)     │  • Balance   │  emit events     │  • Balance   │
+  ├ backlog   │                        │  • Assign    │                  │  • Assign    │
+  │   heap    │                        │  • Snapshot  │                  │  • Snapshot  │
+  ├ PC snap   │                        │              │                  │              │
+  ├ events[]  │                        │  [38779ebd]  │                  │  [2ad65699]  │
+  └ solutions[]                        │              │                  │              │
+──────────────┼────────────────────────│              │──────────────────│              │
+              │                        │              │                  │              │
+ Diving D0    │ ░░░░░░░░░░░░░░░░░░░░░░ │              │ ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ │              │
+  ├ dive_queue│  (waiting)             │              │  dive, find sols │              │
+  ├ PC snap   │                        │              │                  │              │
+  ├ incumbent │                        │              │                  │              │
+  │   snap    │                        │              │                  │              │
+  ├ pc_updates│                        │              │                  │              │
+  └ solutions[]                        │              │                  │              │
+──────────────┼────────────────────────│              │──────────────────│              │
+              │                        │              │                  │              │
+ Diving D1    │ ░░░░░░░░░░░░░░░░░░░░░░ │              │ ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ │              │
+  ├ dive_queue│  (waiting)             │              │  dive, find sols │              │
+  ├ PC snap   │                        │              │                  │              │
+  ├ incumbent │                        └──────────────┘                  └──────────────┘
+  │   snap    │
+  ├ pc_updates│
+  └ solutions[]
+══════════════╪═══════════════════════════════════════════════════════════════════════════
+              │
+              ▼
+──────────────────────────────────────────────────────────────────────────────────────────►
+                                                                             Work Unit Time
+
+Legend: ▓▓▓ = actively working   ░░░ = waiting at barrier   [hash] = state hash for verification
+        wut = work unit timestamp   PC = pseudo-costs   snap = snapshot (local copy)
+
+*/
+
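In code form, each horizon is one BSP superstep: free-running work up to the horizon boundary, then one serialized sync. A deliberately simplified, sequential sketch of that control flow; the real scheduler and barrier live in `utilities/work_unit_scheduler.cpp` and are not reproduced here, so all names below are illustrative:

// Simplified sketch of the superstep structure pictured above;
// not the real scheduler API.
struct bsp_clock_t {
  double horizon_step = 0.5;  // work units per superstep
  double horizon      = 0.5;

  template <typename WorkFn, typename SyncFn>
  void superstep(WorkFn&& work, SyncFn&& sync)
  {
    work(horizon);  // workers run until their work-unit clock hits the horizon
    sync(horizon);  // serialized: sort events, replay, merge, prune, rebalance
    horizon += horizon_step;
  }
};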
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::run_determinism_coordinator(const csr_matrix_t<i_t, f_t>& Arow)
+{
+  raft::common::nvtx::range scope("BB::determinism_coordinator");
+
+  determinism_horizon_step_ = 0.50;
+
+  // Compute worker counts using the same formula as the reliability-branching scheduler
+  const i_t num_workers = 2 * settings_.num_threads;
+  std::vector<search_strategy_t> search_strategies =
+    get_search_strategies(settings_.diving_settings);
+  std::array max_num_workers =
+    get_max_workers(num_workers, search_strategies);
+
+  const int num_bfs_workers = max_num_workers[search_strategy_t::BEST_FIRST];
+  int num_diving_workers    = 0;
+  for (size_t i = 1; i < search_strategies.size(); ++i) {
+    num_diving_workers += max_num_workers[search_strategies[i]];
+  }
+
+  determinism_mode_enabled_              = true;
+  determinism_current_horizon_           = determinism_horizon_step_;
+  determinism_horizon_number_            = 0;
+  determinism_global_termination_status_ = mip_status_t::UNSET;
+
+  determinism_workers_ = std::make_unique>(
+    num_bfs_workers, original_lp_, Arow, var_types_, settings_);
+
+  if (num_diving_workers > 0) {
+    // Extract diving types from search_strategies (skip BEST_FIRST at index 0)
+    std::vector<search_strategy_t> diving_types(search_strategies.begin() + 1,
+                                                search_strategies.end());
+
+    if (settings_.diving_settings.coefficient_diving != 0) {
+      calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
+    }
+
+    if (!diving_types.empty()) {
+      determinism_diving_workers_ =
+        std::make_unique>(num_diving_workers,
+                          diving_types,
+                          original_lp_,
+                          Arow,
+                          var_types_,
+                          settings_,
+                          &root_relax_soln_.x);
+    }
+  }
+
+  determinism_scheduler_ = std::make_unique(determinism_horizon_step_);
+
+  scoped_context_registrations_t context_registrations(*determinism_scheduler_);
+  for (auto& worker : *determinism_workers_) {
+    context_registrations.add(worker.work_context);
+  }
+  if (determinism_diving_workers_) {
+    for (auto& worker : *determinism_diving_workers_) {
+      context_registrations.add(worker.work_context);
+    }
+  }
+
+  int actual_diving_workers =
+    determinism_diving_workers_ ? (int)determinism_diving_workers_->size() : 0;
+  settings_.log.printf(
+    "Deterministic Mode: %d BFS workers + %d diving workers, horizon step = %.2f work "
+    "units\n",
+    num_bfs_workers,
+    actual_diving_workers,
+    determinism_horizon_step_);
+
+  search_tree_.root.get_down_child()->origin_worker_id = -1;
+  search_tree_.root.get_down_child()->creation_seq     = 0;
+  search_tree_.root.get_up_child()->origin_worker_id   = -1;
+  search_tree_.root.get_up_child()->creation_seq       = 1;
+
+  (*determinism_workers_)[0].enqueue_node(search_tree_.root.get_down_child());
+  (*determinism_workers_)[1 % num_bfs_workers].enqueue_node(search_tree_.root.get_up_child());
+
+  determinism_scheduler_->set_sync_callback([this](double) { determinism_sync_callback(); });
+
+  std::vector<f_t> incumbent_snapshot;
+  if (incumbent_.has_incumbent) { incumbent_snapshot = incumbent_.x; }
+
+  determinism_broadcast_snapshots(
+    *determinism_workers_, incumbent_snapshot, 0.0, determinism_horizon_step_);
+  if (determinism_diving_workers_) {
+    determinism_broadcast_snapshots(
+      *determinism_diving_workers_, incumbent_snapshot, 0.0, determinism_horizon_step_);
+  }
+
+  const int total_thread_count = num_bfs_workers + num_diving_workers;
+
+#pragma omp parallel num_threads(total_thread_count)
+  {
+    int thread_id = omp_get_thread_num();
+    if (thread_id < num_bfs_workers) {
+      auto& worker          = (*determinism_workers_)[thread_id];
+      f_t worker_start_time = tic();
+      run_deterministic_bfs_loop(worker, search_tree_);
+      worker.total_runtime += toc(worker_start_time);
+    } else {
+      int diving_id         = thread_id - num_bfs_workers;
+      auto& worker          = (*determinism_diving_workers_)[diving_id];
+      f_t worker_start_time = tic();
+      run_deterministic_diving_loop(worker);
+      worker.total_runtime += toc(worker_start_time);
+    }
+  }
+
+  settings_.log.printf("\n");
+  settings_.log.printf("BFS Worker Statistics:\n");
+  settings_.log.printf(
+    "  Worker |   Nodes | Branched | Pruned | Infeas. | IntSol | Assigned |    Clock | "
+    "Sync%% | NoWork\n");
+  settings_.log.printf(
+    "  "
+    "-------+---------+----------+--------+---------+--------+----------+----------+-------+-------"
+    "\n");
+  for (const auto& worker : *determinism_workers_) {
+    double sync_time    = worker.work_context.total_sync_time;
+    double total_time   = worker.total_runtime;  // Already includes sync time
+    double sync_percent = (total_time > 0) ? (100.0 * sync_time / total_time) : 0.0;
+    settings_.log.printf("  %6d | %7d | %8d | %6d | %7d | %6d | %8d | %7.3fs | %4.1f%% | %5.2fs\n",
+                         worker.worker_id,
+                         worker.total_nodes_processed,
+                         worker.total_nodes_branched,
+                         worker.total_nodes_pruned,
+                         worker.total_nodes_infeasible,
+                         worker.total_integer_solutions,
+                         worker.total_nodes_assigned,
+                         total_time,
+                         std::min(99.9, sync_percent),
+                         worker.total_nowork_time);
+  }
+
+  // Print diving worker statistics
+  if (determinism_diving_workers_ && determinism_diving_workers_->size() > 0) {
+    settings_.log.printf("\n");
+    settings_.log.printf("Diving Worker Statistics:\n");
+    settings_.log.printf("  Worker |   Type |   Dives |  Nodes | IntSol |    Clock | NoWork\n");
+    settings_.log.printf("  -------+--------+---------+--------+--------+----------+-------\n");
+    for (const auto& worker : *determinism_diving_workers_) {
+      const char* type_str = "???";
+      switch (worker.diving_type) {
+        case search_strategy_t::PSEUDOCOST_DIVING: type_str = "PC"; break;
+        case search_strategy_t::LINE_SEARCH_DIVING: type_str = "LS"; break;
+        case search_strategy_t::GUIDED_DIVING: type_str = "GD"; break;
+        case search_strategy_t::COEFFICIENT_DIVING: type_str = "CD"; break;
+        default: break;
+      }
+      settings_.log.printf("  %6d | %6s | %7d | %6d | %6d | %7.3fs | %5.2fs\n",
+                           worker.worker_id,
+                           type_str,
+                           worker.total_dives,
+                           worker.total_nodes_explored,
+                           worker.total_integer_solutions,
+                           worker.total_runtime,
+                           worker.total_nowork_time);
+    }
+  }
+
+  if (producer_sync_.num_producers() > 0 || producer_wait_count_ > 0) {
+    double avg_wait =
+      (producer_wait_count_ > 0) ? total_producer_wait_time_ / producer_wait_count_ : 0.0;
+    settings_.log.printf("Producer Sync Statistics:\n");
+    settings_.log.printf(
+      "  Producers: %zu, Syncs: %d\n", producer_sync_.num_producers(), producer_wait_count_);
+    settings_.log.printf("  Total wait: %.3fs, Avg: %.4fs, Max: %.4fs\n",
+                         total_producer_wait_time_,
+                         avg_wait,
+                         max_producer_wait_time_);
+  }
+}
+
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::run_deterministic_bfs_loop(
+  determinism_bfs_worker_t<i_t, f_t>& worker, search_tree_t<i_t, f_t>& search_tree)
+{
+  raft::common::nvtx::range scope("BB::worker_loop");
+
+  while (determinism_global_termination_status_ == mip_status_t::UNSET) {
+    if (worker.has_work()) {
+      mip_node_t<i_t, f_t>* node = worker.dequeue_node();
+      if (node == nullptr) { continue; }
+
+      worker.current_node = node;
+
+      f_t upper_bound = worker.local_upper_bound;
+      f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, node->lower_bound);
+      if (node->lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
+        worker.current_node = nullptr;
+        worker.record_fathomed(node, node->lower_bound);
+        search_tree.update(node, node_status_t::FATHOMED);
+        --exploration_stats_.nodes_unexplored;
+        continue;
+      }
+
+      bool is_child                     = (node->parent == worker.last_solved_node);
+      worker.recompute_bounds_and_basis = !is_child;
+
+      node_status_t status    = solve_node_deterministic(worker, node, search_tree);
+      worker.last_solved_node = node;
+
+      worker.current_node = nullptr;
+      continue;
+    }
+
+    // No work - advance to sync point to participate in barrier
+    f_t nowork_start = tic();
+    determinism_scheduler_->wait_for_next_sync(worker.work_context);
+    worker.total_nowork_time += toc(nowork_start);
+  }
+}
+
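Note that a worker with an empty queue still calls `wait_for_next_sync`, so every horizon's barrier sees the same participant count. The same shape in miniature, with C++20 `std::barrier` standing in for the diff's work-unit scheduler:

#include <atomic>
#include <barrier>
#include <thread>
#include <vector>

// Miniature analogue of the worker loop: with or without work, every thread
// reaches the barrier, so the completion callback runs once per horizon.
int main()
{
  std::atomic<int> horizons{0};
  std::barrier sync(4, [&]() noexcept { ++horizons; });  // 4 workers per sync

  std::vector<std::thread> workers;
  for (int w = 0; w < 4; ++w) {
    workers.emplace_back([&, w] {
      for (int h = 0; h < 3; ++h) {
        if (w % 2 == 0) { /* explore nodes until the local clock hits the horizon */ }
        sync.arrive_and_wait();  // idle workers wait here too
      }
    });
  }
  for (auto& t : workers) { t.join(); }
  return horizons == 3 ? 0 : 1;
}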
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::determinism_sync_callback()
+{
+  raft::common::nvtx::range scope("BB::determinism_sync_callback");
+
+  ++determinism_horizon_number_;
+  double horizon_end = determinism_current_horizon_;
+
+  double wait_start = tic();
+  producer_sync_.wait_for_producers(horizon_end);
+  double wait_time = toc(wait_start);
+  total_producer_wait_time_ += wait_time;
+  max_producer_wait_time_ = std::max(max_producer_wait_time_, wait_time);
+  ++producer_wait_count_;
+
+  work_unit_context_.global_work_units_elapsed = horizon_end;
+
+  bb_event_batch_t<i_t, f_t> all_events = determinism_workers_->collect_and_sort_events();
+
+  determinism_sort_replay_events(all_events);
+
+  determinism_prune_worker_nodes_vs_incumbent();
+
+  determinism_collect_diving_solutions();
+
+  for (auto& worker : *determinism_workers_) {
+    worker.integer_solutions.clear();
+    worker.pseudo_cost_updates.clear();
+  }
+  if (determinism_diving_workers_) {
+    for (auto& worker : *determinism_diving_workers_) {
+      worker.integer_solutions.clear();
+      worker.pseudo_cost_updates.clear();
+    }
+  }
+
+  determinism_populate_diving_heap();
+
+  determinism_assign_diving_nodes();
+
+  determinism_balance_worker_loads();
+
+  uint32_t state_hash = 0;
+  {
+    std::vector<uint64_t> state_data;
+    state_data.push_back(static_cast<uint64_t>(exploration_stats_.nodes_explored));
+    state_data.push_back(static_cast<uint64_t>(exploration_stats_.nodes_unexplored));
+    f_t ub = upper_bound_.load();
+    f_t lb = determinism_compute_lower_bound();
+    state_data.push_back(std::bit_cast<uint64_t>(ub));
+    state_data.push_back(std::bit_cast<uint64_t>(lb));
+
+    for (auto& worker : *determinism_workers_) {
+      if (worker.current_node != nullptr) {
+        state_data.push_back(worker.current_node->get_id_packed());
+      }
+      for (auto* node : worker.plunge_stack) {
+        state_data.push_back(node->get_id_packed());
+      }
+      for (auto* node : worker.backlog.data()) {
+        state_data.push_back(node->get_id_packed());
+      }
+    }
+
+    if (determinism_diving_workers_) {
+      for (auto& diving_worker : *determinism_diving_workers_) {
+        for (const auto& dive_entry : diving_worker.dive_queue) {
+          state_data.push_back(dive_entry.node.get_id_packed());
+        }
+      }
+    }
+
+    state_hash = detail::compute_hash(state_data);
+    state_hash ^= pc_.compute_state_hash();
+  }
+
+  determinism_current_horizon_ += determinism_horizon_step_;
+
+  std::vector<f_t> incumbent_snapshot;
+  if (incumbent_.has_incumbent) { incumbent_snapshot = incumbent_.x; }
+
+  determinism_broadcast_snapshots(
+    *determinism_workers_, incumbent_snapshot, horizon_end, determinism_current_horizon_);
+  if (determinism_diving_workers_) {
+    determinism_broadcast_snapshots(
+      *determinism_diving_workers_, incumbent_snapshot, horizon_end, determinism_current_horizon_);
+  }
+
+  f_t lower_bound = determinism_compute_lower_bound();
+  f_t upper_bound = upper_bound_.load();
+  f_t abs_gap     = upper_bound - lower_bound;
+  f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
+
+  if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) {
+    determinism_global_termination_status_ = mip_status_t::OPTIMAL;
+  }
+
+  if (!determinism_workers_->any_has_work()) {
+    // Tree exhausted - check if we found a solution
+    if (upper_bound == std::numeric_limits<f_t>::infinity()) {
+      determinism_global_termination_status_ = mip_status_t::INFEASIBLE;
+    } else {
+      determinism_global_termination_status_ = mip_status_t::OPTIMAL;
+    }
+  }
+
+  if (toc(exploration_stats_.start_time) > settings_.time_limit) {
+    determinism_global_termination_status_ = mip_status_t::TIME_LIMIT;
+  }
+
+  // Stop early if the next horizon exceeds the work limit
+  if (determinism_current_horizon_ > settings_.work_limit) {
+    determinism_global_termination_status_ = mip_status_t::WORK_LIMIT;
+  }
+
+  // Signal shutdown to prevent threads from entering barriers after termination
+  if (determinism_global_termination_status_ != mip_status_t::UNSET) {
+    determinism_scheduler_->signal_shutdown();
+  }
+
+  f_t obj        = compute_user_objective(original_lp_, upper_bound);
+  f_t user_lower = compute_user_objective(original_lp_, lower_bound);
+  std::string gap_user = user_mip_gap(obj, user_lower);
+
+  std::string idle_workers;
+  i_t idle_count = 0;
+  for (const auto& w : *determinism_workers_) {
+    if (!w.has_work() && w.current_node == nullptr) {
+      ++idle_count;
+      // if (!idle_workers.empty()) idle_workers += ",";
+      // idle_workers += "W" + std::to_string(w.worker_id);
+    }
+  }
+  idle_workers = idle_count > 0 ? std::to_string(idle_count) + " idle" : "";
+
+  settings_.log.printf("W%-5g %8d %8lu %+13.6e %+10.6e %s %8.2f [%08x]%s%s\n",
+                       determinism_current_horizon_,
+                       exploration_stats_.nodes_explored,
+                       exploration_stats_.nodes_unexplored,
+                       obj,
+                       user_lower,
+                       gap_user.c_str(),
+                       toc(exploration_stats_.start_time),
+                       state_hash,
+                       idle_workers.empty() ? "" : " ",
+                       idle_workers.c_str());
+}
+
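Hashing the bounds via `std::bit_cast` makes the per-horizon `[%08x]` hash bit-exact: two runs agree iff every bit of the state agrees, which is far stricter than comparing printed values. The idea in isolation (FNV-1a is an arbitrary choice here; the diff's `detail::compute_hash` is not shown):

#include <bit>
#include <cstdint>
#include <cstdio>

// Exact-bit hashing of doubles, as in the sync callback's state hash:
// bit_cast preserves every bit, so the hash distinguishes values that
// would print identically.
int main()
{
  double lb = 0.1 + 0.2;  // 0.30000000000000004
  auto bits = std::bit_cast<std::uint64_t>(lb);
  std::uint64_t h = 1469598103934665603ull;  // FNV-1a offset basis, illustrative
  for (int i = 0; i < 8; ++i) {
    h ^= (bits >> (8 * i)) & 0xff;
    h *= 1099511628211ull;
  }
  std::printf("state hash: %016llx\n", static_cast<unsigned long long>(h));
  return 0;
}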
&worker.work_context); + + if (lp_status == dual::status_t::NUMERICAL) { + settings_.log.printf("Numerical issue node %d. Resolving from scratch.\n", node_ptr->node_id); + lp_status_t second_status = solve_linear_program_with_advanced_basis(worker.leaf_problem, + lp_start_time, + lp_settings, + worker.leaf_solution, + worker.basis_factors, + worker.basic_list, + worker.nonbasic_list, + leaf_vstatus, + leaf_edge_norms); + lp_status = convert_lp_status_to_dual_status(second_status); + } + + double work_performed = worker.work_context.global_work_units_elapsed - work_units_at_start; + worker.clock += work_performed; + + exploration_stats_.total_lp_solve_time += toc(lp_start_time); + exploration_stats_.total_lp_iters += node_iter; + ++exploration_stats_.nodes_explored; + --exploration_stats_.nodes_unexplored; + + determinism_bfs_tree_update_callbacks_t policy{*this, worker}; + auto [status, round_dir] = update_tree_impl(node_ptr, search_tree, &worker, lp_status, policy); + + return status; +} + +template +template +void branch_and_bound_t::determinism_process_worker_solutions( + PoolT& pool, WorkerTypeGetter get_worker_type) +{ + std::vector*> all_solutions; + for (auto& worker : pool) { + for (auto& sol : worker.integer_solutions) { + all_solutions.push_back(&sol); + } + } + + std::sort(all_solutions.begin(), + all_solutions.end(), + [](const queued_integer_solution_t* a, + const queued_integer_solution_t* b) { return *a < *b; }); + + f_t determinism_lower = determinism_compute_lower_bound(); + f_t current_upper = upper_bound_.load(); + + for (const auto* sol : all_solutions) { + if (sol->objective < current_upper) { + f_t user_obj = compute_user_objective(original_lp_, sol->objective); + f_t user_lower = compute_user_objective(original_lp_, determinism_lower); + i_t nodes_explored = exploration_stats_.nodes_explored.load(); + i_t nodes_unexplored = exploration_stats_.nodes_unexplored.load(); + + search_strategy_t worker_type = get_worker_type(pool, sol->worker_id); + report( + feasible_solution_symbol(worker_type), sol->objective, determinism_lower, sol->depth, 0); + + bool improved = false; + if (sol->objective < upper_bound_) { + upper_bound_ = sol->objective; + incumbent_.set_incumbent_solution(sol->objective, sol->solution); + current_upper = sol->objective; + improved = true; + } + + if (improved && settings_.solution_callback != nullptr) { + std::vector original_x; + uncrush_primal_solution(original_problem_, original_lp_, sol->solution, original_x); + settings_.solution_callback(original_x, sol->objective); + } + } + } + + for (auto& worker : pool) { + worker.integer_solutions.clear(); + } +} + +template +template +void branch_and_bound_t::determinism_merge_pseudo_cost_updates(PoolT& pool) +{ + std::vector> all_pc_updates; + for (auto& worker : pool) { + for (auto& upd : worker.pseudo_cost_updates) { + all_pc_updates.push_back(upd); + } + } + + std::sort(all_pc_updates.begin(), all_pc_updates.end()); + + for (const auto& upd : all_pc_updates) { + if (upd.direction == rounding_direction_t::DOWN) { + pc_.pseudo_cost_sum_down[upd.variable] += upd.delta; + pc_.pseudo_cost_num_down[upd.variable]++; + } else { + pc_.pseudo_cost_sum_up[upd.variable] += upd.delta; + pc_.pseudo_cost_num_up[upd.variable]++; + } + } + + for (auto& worker : pool) { + worker.pseudo_cost_updates.clear(); + } +} + +template +template +void branch_and_bound_t::determinism_broadcast_snapshots( + PoolT& pool, const std::vector& incumbent_snapshot, double horizon_start, double horizon_end) +{ + for (auto& worker : pool) 
{ + worker.set_snapshots(upper_bound_.load(), + (const f_t*)pc_.pseudo_cost_sum_up.data(), + (const f_t*)pc_.pseudo_cost_sum_down.data(), + (const i_t*)pc_.pseudo_cost_num_up.data(), + (const i_t*)pc_.pseudo_cost_num_down.data(), + incumbent_snapshot, + exploration_stats_.total_lp_iters.load(), + horizon_start, + horizon_end); + } +} + +template +void branch_and_bound_t::determinism_sort_replay_events( + const bb_event_batch_t& events) +{ + // Infeasible solutions from GPU heuristics are queued for repair; process them now + { + std::vector> to_repair; + // TODO: support repair queue in deterministic mode + // mutex_repair_.lock(); + // if (repair_queue_.size() > 0) { + // to_repair = repair_queue_; + // repair_queue_.clear(); + // } + // mutex_repair_.unlock(); + + std::sort(to_repair.begin(), + to_repair.end(), + [](const std::vector& a, const std::vector& b) { return a < b; }); + + if (to_repair.size() > 0) { + settings_.log.debug("Deterministic sync: Attempting to repair %ld injected solutions\n", + to_repair.size()); + for (const std::vector& potential_solution : to_repair) { + std::vector repaired_solution; + f_t repaired_obj; + bool success = + repair_solution(edge_norms_, potential_solution, repaired_obj, repaired_solution); + if (success) { + // Queue repaired solution with work unit timestamp (...workstamp?) + mutex_heuristic_queue_.lock(); + heuristic_solution_queue_.push_back( + {std::move(repaired_solution), repaired_obj, determinism_current_horizon_}); + mutex_heuristic_queue_.unlock(); + } + } + } + } + + // Extract heuristic solutions, keeping future solutions for next horizon + // Use determinism_current_horizon_ as the upper bound (horizon_end) + std::vector heuristic_solutions; + mutex_heuristic_queue_.lock(); + { + std::vector future_solutions; + for (auto& sol : heuristic_solution_queue_) { + if (sol.wut < determinism_current_horizon_) { + heuristic_solutions.push_back(std::move(sol)); + } else { + future_solutions.push_back(std::move(sol)); + } + } + heuristic_solution_queue_ = std::move(future_solutions); + } + mutex_heuristic_queue_.unlock(); + + // sort by work unit timestamp, with objective and solution values as tie-breakers + std::sort(heuristic_solutions.begin(), + heuristic_solutions.end(), + [](const queued_heuristic_solution_t& a, const queued_heuristic_solution_t& b) { + if (a.wut != b.wut) { return a.wut < b.wut; } + if (a.objective != b.objective) { return a.objective < b.objective; } + return a.solution < b.solution; // edge-case - lexicographical comparison + }); + + // Merge B&B events and heuristic solutions for unified timeline replay + size_t event_idx = 0; + size_t heuristic_idx = 0; + + while (event_idx < events.events.size() || heuristic_idx < heuristic_solutions.size()) { + bool process_event = false; + bool process_heuristic = false; + + if (event_idx >= events.events.size()) { + process_heuristic = true; + } else if (heuristic_idx >= heuristic_solutions.size()) { + process_event = true; + } else { + // Both have items - pick the one with smaller WUT + if (events.events[event_idx].wut <= heuristic_solutions[heuristic_idx].wut) { + process_event = true; + } else { + process_heuristic = true; + } + } + + if (process_event) { + const auto& event = events.events[event_idx++]; + switch (event.type) { + case bb_event_type_t::NODE_INTEGER: + case bb_event_type_t::NODE_BRANCHED: + case bb_event_type_t::NODE_FATHOMED: + case bb_event_type_t::NODE_INFEASIBLE: + case bb_event_type_t::NODE_NUMERICAL: break; + } + } + + if (process_heuristic) { + const 
auto& hsol = heuristic_solutions[heuristic_idx++]; + + CUOPT_LOG_TRACE( + "Deterministic sync: Heuristic solution received at WUT %f with objective %g, current " + "horizon %f", + hsol.wut, + hsol.objective, + determinism_current_horizon_); + + // Process heuristic solution at its correct work unit timestamp position + f_t new_upper = std::numeric_limits::infinity(); + + if (hsol.objective < upper_bound_) { + upper_bound_ = hsol.objective; + incumbent_.set_incumbent_solution(hsol.objective, hsol.solution); + new_upper = hsol.objective; + } + + if (new_upper < std::numeric_limits::infinity()) { + report_heuristic(new_upper); + + if (settings_.solution_callback != nullptr) { + std::vector original_x; + uncrush_primal_solution(original_problem_, original_lp_, hsol.solution, original_x); + settings_.solution_callback(original_x, hsol.objective); + } + } + } + } + + // Merge integer solutions from BFS workers and update global incumbent + determinism_process_worker_solutions(*determinism_workers_, + [](const determinism_bfs_worker_pool_t&, int) { + return search_strategy_t::BEST_FIRST; + }); + + // Merge and apply pseudo-cost updates from BFS workers + determinism_merge_pseudo_cost_updates(*determinism_workers_); + + for (const auto& worker : *determinism_workers_) { + fetch_min(lower_bound_ceiling_, worker.local_lower_bound_ceiling); + } +} + +template +void branch_and_bound_t::determinism_prune_worker_nodes_vs_incumbent() +{ + f_t upper_bound = upper_bound_.load(); + + for (auto& worker : *determinism_workers_) { + // Check nodes in plunge stack - filter in place + { + std::deque*> surviving; + for (auto* node : worker.plunge_stack) { + if (node->lower_bound >= upper_bound) { + search_tree_.update(node, node_status_t::FATHOMED); + --exploration_stats_.nodes_unexplored; + } else { + surviving.push_back(node); + } + } + worker.plunge_stack = std::move(surviving); + } + + // Check nodes in backlog heap - filter and rebuild + { + std::vector*> surviving; + for (auto* node : worker.backlog.data()) { + if (node->lower_bound >= upper_bound) { + search_tree_.update(node, node_status_t::FATHOMED); + --exploration_stats_.nodes_unexplored; + } else { + surviving.push_back(node); + } + } + worker.backlog.clear(); + for (auto* node : surviving) { + worker.backlog.push(node); + } + } + } +} + +template +void branch_and_bound_t::determinism_balance_worker_loads() +{ + const size_t num_workers = determinism_workers_->size(); + if (num_workers <= 1) return; + + constexpr bool force_rebalance_every_sync = false; + + // Count work for each worker: current_node (if any) + plunge_stack + backlog + std::vector work_counts(num_workers); + size_t total_work = 0; + size_t max_work = 0; + size_t min_work = std::numeric_limits::max(); + + for (size_t w = 0; w < num_workers; ++w) { + auto& worker = (*determinism_workers_)[w]; + work_counts[w] = worker.queue_size(); + total_work += work_counts[w]; + max_work = std::max(max_work, work_counts[w]); + min_work = std::min(min_work, work_counts[w]); + } + if (total_work == 0) return; + + bool needs_balance; + if (force_rebalance_every_sync) { + needs_balance = (total_work > 1); + } else { + needs_balance = (min_work == 0 && max_work >= 2) || (min_work > 0 && max_work > 4 * min_work); + } + + if (!needs_balance) return; + + std::vector*> all_nodes; + for (auto& worker : *determinism_workers_) { + for (auto* node : worker.backlog.data()) { + all_nodes.push_back(node); + } + worker.backlog.clear(); + } + + if (all_nodes.empty()) return; + + auto deterministic_less = [](const 
mip_node_t* a, const mip_node_t* b) { + if (a->origin_worker_id != b->origin_worker_id) { + return a->origin_worker_id < b->origin_worker_id; + } + return a->creation_seq < b->creation_seq; + }; + std::sort(all_nodes.begin(), all_nodes.end(), deterministic_less); + + // Redistribute round-robin + std::vector worker_order; + for (size_t w = 0; w < num_workers; ++w) { + worker_order.push_back(w); + } + + // Distribute nodes + for (size_t i = 0; i < all_nodes.size(); ++i) { + size_t worker_idx = worker_order[i % num_workers]; + (*determinism_workers_)[worker_idx].enqueue_node(all_nodes[i]); + } +} + +template +f_t branch_and_bound_t::determinism_compute_lower_bound() +{ + // Compute lower bound from BFS worker local structures only + const f_t inf = std::numeric_limits::infinity(); + f_t lower_bound = lower_bound_ceiling_.load(); + if (!std::isfinite(lower_bound)) lower_bound = inf; + + // Check all BFS worker queues + for (const auto& worker : *determinism_workers_) { + // Check paused node (current_node) + if (worker.current_node != nullptr) { + lower_bound = std::min(worker.current_node->lower_bound, lower_bound); + } + + // Check plunge stack nodes + for (auto* node : worker.plunge_stack) { + lower_bound = std::min(node->lower_bound, lower_bound); + } + + // Check backlog heap nodes + for (auto* node : worker.backlog.data()) { + lower_bound = std::min(node->lower_bound, lower_bound); + } + } + + return lower_bound; +} + +template +void branch_and_bound_t::determinism_populate_diving_heap() +{ + // Clear diving heap from previous horizon + diving_heap_.clear(); + + if (!determinism_diving_workers_ || determinism_diving_workers_->size() == 0) return; + + const int num_diving = determinism_diving_workers_->size(); + constexpr int target_nodes_per_worker = 10; + const int target_total = num_diving * target_nodes_per_worker; + f_t upper_bound = upper_bound_.load(); + + // Collect candidate nodes from BFS worker backlog heaps + std::vector*, f_t>> candidates; + + for (auto& worker : *determinism_workers_) { + for (auto* node : worker.backlog.data()) { + if (node->lower_bound < upper_bound) { + f_t score = node->objective_estimate; + if (!std::isfinite(score)) { score = node->lower_bound; } + candidates.push_back({node, score}); + } + } + } + + if (candidates.empty()) return; + + // Technically not necessary as it stands since the worker assignments and ordering are + // deterministic + std::sort(candidates.begin(), candidates.end(), [](const auto& a, const auto& b) { + if (a.second != b.second) return a.second < b.second; + if (a.first->origin_worker_id != b.first->origin_worker_id) { + return a.first->origin_worker_id < b.first->origin_worker_id; + } + return a.first->creation_seq < b.first->creation_seq; + }); + + int nodes_to_take = std::min(target_total, (int)candidates.size()); + + for (int i = 0; i < nodes_to_take; ++i) { + diving_heap_.push({candidates[i].first, candidates[i].second}); + } +} + +template +void branch_and_bound_t::determinism_assign_diving_nodes() +{ + if (!determinism_diving_workers_ || determinism_diving_workers_->size() == 0) { + diving_heap_.clear(); + return; + } + + constexpr int target_nodes_per_worker = 10; + + // Round-robin assignment + int worker_idx = 0; + const int num_workers = determinism_diving_workers_->size(); + + while (!diving_heap_.empty()) { + auto& worker = (*determinism_diving_workers_)[worker_idx]; + + // Skip workers that already have enough nodes + if ((int)worker.dive_queue_size() >= target_nodes_per_worker) { + worker_idx = (worker_idx + 1) % 
num_workers; + // Check if all workers are full + bool all_full = true; + for (auto& w : *determinism_diving_workers_) { + if ((int)w.dive_queue_size() < target_nodes_per_worker) { + all_full = false; + break; + } + } + if (all_full) break; + continue; + } + + auto entry = diving_heap_.pop(); + if (entry.has_value()) { + worker.enqueue_dive_node(entry.value().node, original_lp_, settings_); + } + + worker_idx = (worker_idx + 1) % num_workers; + } + + diving_heap_.clear(); +} + +template +void branch_and_bound_t::determinism_collect_diving_solutions() +{ + if (!determinism_diving_workers_) return; + + // Collect integer solutions from diving workers and update global incumbent + determinism_process_worker_solutions(*determinism_diving_workers_, + [](const determinism_diving_worker_pool_t& pool, + int worker_id) { return pool[worker_id].diving_type; }); + + // Merge pseudo-cost updates from diving workers + determinism_merge_pseudo_cost_updates(*determinism_diving_workers_); +} + +template +void branch_and_bound_t::run_deterministic_diving_loop( + determinism_diving_worker_t& worker) +{ + raft::common::nvtx::range scope("BB::diving_worker_loop"); + + while (determinism_global_termination_status_ == mip_status_t::UNSET) { + // Process dives from queue until empty or horizon exhausted + auto entry_opt = worker.dequeue_dive_node(); + if (entry_opt.has_value()) { + deterministic_dive(worker, std::move(entry_opt.value())); + continue; + } + + // Queue empty - wait for next sync point where we'll be assigned new nodes + f_t nowork_start = tic(); + determinism_scheduler_->wait_for_next_sync(worker.work_context); + worker.total_nowork_time += toc(nowork_start); + // Termination status is checked in loop condition + } +} + +template +void branch_and_bound_t::deterministic_dive(determinism_diving_worker_t& worker, + dive_queue_entry_t entry) +{ + raft::common::nvtx::range scope("BB::deterministic_dive"); + + // Create local search tree for the dive + search_tree_t dive_tree(std::move(entry.node)); + std::deque*> stack; + stack.push_front(&dive_tree.root); + + worker.dive_lower = std::move(entry.resolved_lower); + worker.dive_upper = std::move(entry.resolved_upper); + + const i_t max_nodes_per_dive = settings_.diving_settings.node_limit; + const i_t max_backtrack_depth = settings_.diving_settings.backtrack_limit; + i_t nodes_this_dive = 0; + worker.lp_iters_this_dive = 0; + worker.recompute_bounds_and_basis = true; + + while (!stack.empty() && determinism_global_termination_status_ == mip_status_t::UNSET && + nodes_this_dive < max_nodes_per_dive) { + mip_node_t* node_ptr = stack.front(); + stack.pop_front(); + + // Prune check using snapshot upper bound + f_t rel_gap = user_relative_gap(original_lp_, worker.local_upper_bound, node_ptr->lower_bound); + if (node_ptr->lower_bound > worker.local_upper_bound || + rel_gap < settings_.relative_mip_gap_tol) { + worker.recompute_bounds_and_basis = true; + continue; + } + + // Setup bounds for this node + std::fill(worker.bounds_changed.begin(), worker.bounds_changed.end(), false); + + if (worker.recompute_bounds_and_basis) { + worker.leaf_problem.lower = worker.dive_lower; + worker.leaf_problem.upper = worker.dive_upper; + node_ptr->get_variable_bounds( + worker.leaf_problem.lower, worker.leaf_problem.upper, worker.bounds_changed); + } else { + node_ptr->update_branched_variable_bounds( + worker.leaf_problem.lower, worker.leaf_problem.upper, worker.bounds_changed); + } + + double remaining_time = settings_.time_limit - toc(exploration_stats_.start_time); + if 
(remaining_time <= 0) { break; } + + // Setup LP settings + simplex_solver_settings_t lp_settings = settings_; + lp_settings.set_log(false); + lp_settings.cut_off = worker.local_upper_bound + settings_.dual_tol; + lp_settings.inside_mip = 2; + lp_settings.time_limit = remaining_time; + lp_settings.scale_columns = false; + +#ifndef DETERMINISM_DISABLE_BOUNDS_STRENGTHENING + bool feasible = worker.node_presolver.bounds_strengthening( + lp_settings, worker.bounds_changed, worker.leaf_problem.lower, worker.leaf_problem.upper); + + if (!feasible) { + worker.recompute_bounds_and_basis = true; + continue; + } +#endif + + { + f_t factor = settings_.diving_settings.iteration_limit_factor; + i_t max_iter = (i_t)(factor * worker.total_lp_iters_snapshot); + lp_settings.iteration_limit = max_iter - worker.lp_iters_this_dive; + if (lp_settings.iteration_limit <= 0) { break; } + } + + // Solve LP relaxation + worker.leaf_solution.resize(worker.leaf_problem.num_rows, worker.leaf_problem.num_cols); + std::vector& leaf_vstatus = node_ptr->vstatus; + i_t node_iter = 0; + f_t lp_start_time = tic(); + std::vector leaf_edge_norms = edge_norms_; + + dual::status_t lp_status = dual_phase2_with_advanced_basis(2, + 0, + worker.recompute_bounds_and_basis, + lp_start_time, + worker.leaf_problem, + lp_settings, + leaf_vstatus, + worker.basis_factors, + worker.basic_list, + worker.nonbasic_list, + worker.leaf_solution, + node_iter, + leaf_edge_norms, + &worker.work_context); + + if (lp_status == dual::status_t::NUMERICAL) { + lp_status_t second_status = solve_linear_program_with_advanced_basis(worker.leaf_problem, + lp_start_time, + lp_settings, + worker.leaf_solution, + worker.basis_factors, + worker.basic_list, + worker.nonbasic_list, + leaf_vstatus, + leaf_edge_norms); + lp_status = convert_lp_status_to_dual_status(second_status); + } + + ++nodes_this_dive; + ++worker.total_nodes_explored; + worker.lp_iters_this_dive += node_iter; + + worker.clock = worker.work_context.global_work_units_elapsed; + + if (lp_status == dual::status_t::TIME_LIMIT || lp_status == dual::status_t::WORK_LIMIT || + lp_status == dual::status_t::ITERATION_LIMIT) { + break; + } + + determinism_diving_tree_update_callbacks_t policy{ + *this, worker, stack, max_backtrack_depth}; + update_tree_impl(node_ptr, dive_tree, &worker, lp_status, policy); + } +} + #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE template class branch_and_bound_t; diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp index 1d6947681..579754d94 100644 --- a/cpp/src/dual_simplex/branch_and_bound.hpp +++ b/cpp/src/dual_simplex/branch_and_bound.hpp @@ -7,8 +7,10 @@ #pragma once +#include #include #include +#include #include #include #include @@ -21,6 +23,9 @@ #include #include #include +#include +#include +#include #include #include @@ -36,11 +41,24 @@ enum class mip_status_t { NODE_LIMIT = 4, // The maximum number of nodes was reached (not implemented) NUMERICAL = 5, // The solver encountered a numerical error UNSET = 6, // The status is not set + WORK_LIMIT = 7, // The solver reached a deterministic work limit }; +template +class bounds_strengthening_t; + template void upper_bound_callback(f_t upper_bound); +template +struct opportunistic_tree_update_callbacks_t; +template +struct determinism_tree_update_policy_base_t; +template +struct determinism_bfs_tree_update_callbacks_t; +template +struct determinism_diving_tree_update_callbacks_t; + template class branch_and_bound_t { public: @@ -74,6 +92,9 @@ class branch_and_bound_t { // Set a 
solution based on the user problem during the course of the solve void set_new_solution(const std::vector& solution); + // This queues the solution to be processed at the correct work unit timestamp + void queue_external_solution_deterministic(const std::vector& solution, double work_unit_ts); + void set_user_bound_callback(std::function callback) { user_bound_callback_ = std::move(callback); @@ -106,10 +127,17 @@ class branch_and_bound_t { // The main entry routine. Returns the solver status and populates solution with the incumbent. mip_status_t solve(mip_solution_t& solution); + work_limit_context_t& get_work_unit_context() { return work_unit_context_; } + + // Get producer sync for external heuristics (e.g., CPUFJ) to register + producer_sync_t& get_producer_sync() { return producer_sync_; } + private: const user_problem_t& original_problem_; const simplex_solver_settings_t settings_; + work_limit_context_t work_unit_context_{"B&B"}; + // Initial guess. std::vector guess_; @@ -234,9 +262,21 @@ class branch_and_bound_t { branch_and_bound_stats_t& stats, logger_t& log); - // Update the tree based on the LP relaxation. Returns the status - // of the node and, if appropriated, the preferred rounding direction - // when visiting the children. + // Selects the variable to branch on. + branch_variable_t variable_selection(mip_node_t* node_ptr, + const std::vector& fractional, + branch_and_bound_worker_t* worker); + + // Policy-based tree update shared between opportunistic and deterministic codepaths. + template + std::pair update_tree_impl( + mip_node_t* node_ptr, + search_tree_t& search_tree, + WorkerT* worker, + dual::status_t lp_status, + Policy& policy); + + // Opportunistic tree update wrapper. std::pair update_tree( mip_node_t* node_ptr, search_tree_t& search_tree, @@ -244,10 +284,118 @@ class branch_and_bound_t { dual::status_t lp_status, logger_t& log); - // Selects the variable to branch on. 
- branch_variable_t variable_selection(mip_node_t* node_ptr, - const std::vector& fractional, - branch_and_bound_worker_t* worker); + // ============================================================================ + // Deterministic BSP (Bulk Synchronous Parallel) methods for deterministic parallel B&B + // ============================================================================ + + // Main determinism coordinator loop + void run_determinism_coordinator(const csr_matrix_t& Arow); + + // Gather all events generated, sort by WU timestamp, apply + void determinism_sort_replay_events(const bb_event_batch_t& events); + + // Prune nodes held by workers based on new incumbent + void determinism_prune_worker_nodes_vs_incumbent(); + + // Balance worker loads - redistribute nodes only if significant imbalance detected + void determinism_balance_worker_loads(); + + node_status_t solve_node_deterministic(determinism_bfs_worker_t& worker, + mip_node_t* node_ptr, + search_tree_t& search_tree); + + f_t determinism_compute_lower_bound(); + + void run_deterministic_bfs_loop(determinism_bfs_worker_t& worker, + search_tree_t& search_tree); + + // Executed when all workers reach barrier + // Handles termination logic serially in deterministic mode + void determinism_sync_callback(); + + void run_deterministic_diving_loop(determinism_diving_worker_t& worker); + + void deterministic_dive(determinism_diving_worker_t& worker, + dive_queue_entry_t entry); + + // Populate diving heap from BFS worker backlogs at sync + void determinism_populate_diving_heap(); + + // Assign starting nodes to diving workers from diving heap + void determinism_assign_diving_nodes(); + + // Collect and merge diving solutions at sync + void determinism_collect_diving_solutions(); + + template + void determinism_process_worker_solutions(PoolT& pool, WorkerTypeGetter get_worker_type); + + template + void determinism_merge_pseudo_cost_updates(PoolT& pool); + + template + void determinism_broadcast_snapshots(PoolT& pool, + const std::vector& incumbent_snapshot, + double horizon_start, + double horizon_end); + + friend struct opportunistic_tree_update_callbacks_t; + friend struct determinism_bfs_tree_update_callbacks_t; + friend struct determinism_diving_tree_update_callbacks_t; + + private: + // unique_ptr as we only want to initialize these if we're in the determinism codepath + std::unique_ptr> determinism_workers_; + std::unique_ptr determinism_scheduler_; + mip_status_t determinism_global_termination_status_{mip_status_t::UNSET}; + double determinism_horizon_step_{5.0}; // Work unit step per horizon (tunable) + double determinism_current_horizon_{0.0}; // Current horizon target + bool determinism_mode_enabled_{false}; + int determinism_horizon_number_{0}; // Current horizon number (for debugging) + + // Producer synchronization for external heuristics (CPUFJ) + // B&B waits for registered producers at each horizon sync + producer_sync_t producer_sync_; + + // Producer wait time statistics + double total_producer_wait_time_{0.0}; + double max_producer_wait_time_{0.0}; + i_t producer_wait_count_{0}; + + // Determinism heuristic solution queue - solutions received from GPU heuristics + // Stored with work unit timestamp for deterministic ordering + struct queued_heuristic_solution_t { + std::vector solution; + f_t objective; + double wut; + }; + omp_mutex_t mutex_heuristic_queue_; + std::vector heuristic_solution_queue_; + + // ============================================================================ + // Determinism Diving state + 
// ============================================================================ + + // Diving worker pool + // unique_ptr as we only want to initialize these if we're in the determinism codepath + std::unique_ptr> determinism_diving_workers_; + + // Diving heap - nodes available for diving, sorted by objective estimate + struct diving_entry_t { + mip_node_t* node; + f_t score; + }; + struct diving_score_comp { + bool operator()(const diving_entry_t& a, const diving_entry_t& b) const + { + if (a.score != b.score) return a.score > b.score; // Min-heap by score + if (a.node->origin_worker_id != b.node->origin_worker_id) { + return a.node->origin_worker_id > b.node->origin_worker_id; + } + return a.node->creation_seq > b.node->creation_seq; + } + }; + heap_t diving_heap_; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp index afc8c6674..597628e73 100644 --- a/cpp/src/dual_simplex/crossover.cpp +++ b/cpp/src/dual_simplex/crossover.cpp @@ -15,6 +15,8 @@ #include #include +#include + #include namespace cuopt::linear_programming::dual_simplex { @@ -83,6 +85,7 @@ f_t dual_infeasibility(const lp_problem_t& lp, const std::vector& vstatus, const std::vector& z) { + raft::common::nvtx::range scope("DualSimplex::dual_infeasibility"); const i_t n = lp.num_cols; const i_t m = lp.num_rows; i_t num_infeasible = 0; @@ -1135,6 +1138,7 @@ crossover_status_t crossover(const lp_problem_t& lp, lp_solution_t& solution, std::vector& vstatus) { + raft::common::nvtx::range scope("Barrier::crossover"); const i_t m = lp.num_rows; const i_t n = lp.num_cols; f_t crossover_start = tic(); diff --git a/cpp/src/dual_simplex/cusparse_view.cu b/cpp/src/dual_simplex/cusparse_view.cu index a63ed6add..8dc5e51f9 100644 --- a/cpp/src/dual_simplex/cusparse_view.cu +++ b/cpp/src/dual_simplex/cusparse_view.cu @@ -158,9 +158,9 @@ cusparse_view_t::cusparse_view_t(raft::handle_t const* handle_ptr, A_indices_ = device_copy(indices, handle_ptr->get_stream()); A_data_ = device_copy(data, handle_ptr->get_stream()); - A_T_offsets_ = device_copy(A.col_start, handle_ptr->get_stream()); - A_T_indices_ = device_copy(A.i, handle_ptr->get_stream()); - A_T_data_ = device_copy(A.x, handle_ptr->get_stream()); + A_T_offsets_ = device_copy(A.col_start.underlying(), handle_ptr->get_stream()); + A_T_indices_ = device_copy(A.i.underlying(), handle_ptr->get_stream()); + A_T_data_ = device_copy(A.x.underlying(), handle_ptr->get_stream()); cusparseCreateCsr(&A_, rows, diff --git a/cpp/src/dual_simplex/deterministic_workers.hpp b/cpp/src/dual_simplex/deterministic_workers.hpp new file mode 100644 index 000000000..2117dae7c --- /dev/null +++ b/cpp/src/dual_simplex/deterministic_workers.hpp @@ -0,0 +1,557 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +struct backlog_node_compare_t { + bool operator()(const mip_node_t* a, const mip_node_t* b) const + { + if (a->lower_bound != b->lower_bound) { return a->lower_bound > b->lower_bound; } + if (a->origin_worker_id != b->origin_worker_id) { + return a->origin_worker_id > b->origin_worker_id; + } + return a->creation_seq > b->creation_seq; + } +}; + +template +struct pseudo_cost_update_t { + i_t variable; + rounding_direction_t direction; + f_t delta; + double wut; + int worker_id; + + bool operator<(const pseudo_cost_update_t& other) const + { + if (wut != other.wut) return wut < other.wut; + if (variable != other.variable) return variable < other.variable; + if (delta != other.delta) return delta < other.delta; + return worker_id < other.worker_id; + } +}; + +template +struct queued_integer_solution_t { + f_t objective; + std::vector solution; + i_t depth; + int worker_id; + int sequence_id; + + bool operator<(const queued_integer_solution_t& other) const + { + if (objective != other.objective) return objective < other.objective; + if (worker_id != other.worker_id) return worker_id < other.worker_id; + return sequence_id < other.sequence_id; + } +}; + +template +class determinism_worker_base_t : public branch_and_bound_worker_t { + using base_t = branch_and_bound_worker_t; + + public: + double clock{0.0}; + double horizon_start{0.0}; + double horizon_end{0.0}; + work_limit_context_t work_context; + + // Local snapshots of global state + std::vector pc_sum_up_snapshot; + std::vector pc_sum_down_snapshot; + std::vector pc_num_up_snapshot; + std::vector pc_num_down_snapshot; + f_t local_upper_bound{std::numeric_limits::infinity()}; + + // Diving-specific snapshots (ignored by BFS workers) + std::vector incumbent_snapshot; + i_t total_lp_iters_snapshot{0}; + + std::vector> integer_solutions; + std::vector> pseudo_cost_updates; + int next_solution_seq{0}; + + i_t total_nodes_processed{0}; + i_t total_integer_solutions{0}; + double total_runtime{0.0}; + double total_nowork_time{0.0}; + + determinism_worker_base_t(int id, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_types, + const simplex_solver_settings_t& settings, + const std::string& context_name) + : base_t(id, original_lp, Arow, var_types, settings), work_context(context_name) + { + work_context.deterministic = true; + } + + void set_snapshots(f_t global_upper_bound, + const f_t* pc_sum_up, + const f_t* pc_sum_down, + const i_t* pc_num_up, + const i_t* pc_num_down, + const std::vector& incumbent, + i_t total_lp_iters, + double new_horizon_start, + double new_horizon_end) + { + const i_t n = this->leaf_problem.num_cols; + local_upper_bound = global_upper_bound; + pc_sum_up_snapshot.assign(pc_sum_up, pc_sum_up + n); + pc_sum_down_snapshot.assign(pc_sum_down, pc_sum_down + n); + pc_num_up_snapshot.assign(pc_num_up, pc_num_up + n); + pc_num_down_snapshot.assign(pc_num_down, pc_num_down + n); + incumbent_snapshot = incumbent; + total_lp_iters_snapshot = total_lp_iters; + horizon_start = new_horizon_start; + horizon_end = new_horizon_end; + } + + // Queue pseudo-cost update and apply to local snapshot + void queue_pseudo_cost_update(i_t variable, rounding_direction_t direction, f_t delta) + { + 
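// Record the update for the global merge at the next sync, and apply it to the + // local snapshot so branching decisions later in this horizon already see it. +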
pseudo_cost_updates.push_back({variable, direction, delta, clock, this->worker_id}); + if (direction == rounding_direction_t::DOWN) { + pc_sum_down_snapshot[variable] += delta; + pc_num_down_snapshot[variable]++; + } else { + pc_sum_up_snapshot[variable] += delta; + pc_num_up_snapshot[variable]++; + } + } + + // Basic variable selection from snapshots + i_t variable_selection_from_snapshot(const std::vector& fractional, + const std::vector& solution) const + { + return variable_selection_from_pseudo_costs(pc_sum_down_snapshot.data(), + pc_sum_up_snapshot.data(), + pc_num_down_snapshot.data(), + pc_num_up_snapshot.data(), + (i_t)pc_sum_down_snapshot.size(), + fractional, + solution); + } + + bool has_work() const { return static_cast(this)->has_work_impl(); } +}; + +template +class determinism_bfs_worker_t + : public determinism_worker_base_t> { + using base_t = determinism_worker_base_t>; + + public: + // Node management + std::deque*> plunge_stack; + heap_t*, backlog_node_compare_t> backlog; + mip_node_t* current_node{nullptr}; + mip_node_t* last_solved_node{nullptr}; + + // Event logging for deterministic replay + bb_event_batch_t events; + int event_sequence{0}; + int32_t next_creation_seq{0}; + + // BFS-specific state + f_t local_lower_bound_ceiling{std::numeric_limits::infinity()}; + bool recompute_bounds_and_basis{true}; + i_t nodes_processed_this_horizon{0}; + + // BFS statistics + i_t total_nodes_pruned{0}; + i_t total_nodes_branched{0}; + i_t total_nodes_infeasible{0}; + i_t total_nodes_assigned{0}; + + explicit determinism_bfs_worker_t(int id, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_types, + const simplex_solver_settings_t& settings) + : base_t(id, original_lp, Arow, var_types, settings, "BB_Worker_" + std::to_string(id)) + { + } + + bool has_work_impl() const + { + return current_node != nullptr || !plunge_stack.empty() || !backlog.empty(); + } + + void enqueue_node(mip_node_t* node) + { + plunge_stack.push_front(node); + ++total_nodes_assigned; + } + + mip_node_t* enqueue_children_for_plunge(mip_node_t* down_child, + mip_node_t* up_child, + rounding_direction_t preferred_direction) + { + if (!plunge_stack.empty()) { + backlog.push(plunge_stack.back()); + plunge_stack.pop_back(); + } + + down_child->origin_worker_id = this->worker_id; + down_child->creation_seq = next_creation_seq++; + up_child->origin_worker_id = this->worker_id; + up_child->creation_seq = next_creation_seq++; + + mip_node_t* first_child; + if (preferred_direction == rounding_direction_t::UP) { + plunge_stack.push_front(down_child); + plunge_stack.push_front(up_child); + first_child = up_child; + } else { + plunge_stack.push_front(up_child); + plunge_stack.push_front(down_child); + first_child = down_child; + } + return first_child; + } + + mip_node_t* dequeue_node() + { + if (current_node != nullptr) { + mip_node_t* node = current_node; + current_node = nullptr; + return node; + } + if (!plunge_stack.empty()) { + mip_node_t* node = plunge_stack.front(); + plunge_stack.pop_front(); + return node; + } + auto node_opt = backlog.pop(); + return node_opt.has_value() ? node_opt.value() : nullptr; + } + + size_t queue_size() const + { + return plunge_stack.size() + backlog.size() + (current_node != nullptr ? 
1 : 0); + } + + void record_event(bb_event_t event) + { + event.event_sequence = event_sequence++; + events.add(std::move(event)); + } + + void record_branched( + mip_node_t* node, i_t down_child_id, i_t up_child_id, i_t branch_var, f_t branch_val) + { + record_event(bb_event_t::make_branched(this->clock, + this->worker_id, + node->creation_seq, + down_child_id, + up_child_id, + node->lower_bound, + branch_var, + branch_val)); + ++nodes_processed_this_horizon; + ++this->total_nodes_processed; + ++total_nodes_branched; + } + + void record_integer_solution(mip_node_t* node, f_t objective) + { + record_event(bb_event_t::make_integer_solution( + this->clock, this->worker_id, node->creation_seq, objective)); + ++nodes_processed_this_horizon; + ++this->total_nodes_processed; + ++this->total_integer_solutions; + } + + void record_fathomed(mip_node_t* node, f_t lower_bound) + { + record_event(bb_event_t::make_fathomed( + this->clock, this->worker_id, node->creation_seq, lower_bound)); + ++nodes_processed_this_horizon; + ++this->total_nodes_processed; + ++total_nodes_pruned; + } + + void record_infeasible(mip_node_t* node) + { + record_event( + bb_event_t::make_infeasible(this->clock, this->worker_id, node->creation_seq)); + ++nodes_processed_this_horizon; + ++this->total_nodes_processed; + ++total_nodes_infeasible; + } + + void record_numerical(mip_node_t* node) + { + record_event( + bb_event_t::make_numerical(this->clock, this->worker_id, node->creation_seq)); + ++nodes_processed_this_horizon; + ++this->total_nodes_processed; + } +}; + +template +struct dive_queue_entry_t { + mip_node_t node; + std::vector resolved_lower; + std::vector resolved_upper; +}; + +template +class determinism_diving_worker_t + : public determinism_worker_base_t> { + using base_t = determinism_worker_base_t>; + + public: + search_strategy_t diving_type{search_strategy_t::PSEUDOCOST_DIVING}; + + // Diving-specific node management + std::deque> dive_queue; + std::vector dive_lower; + std::vector dive_upper; + + // Root LP relaxation solution (constant, set once at construction) + const std::vector* root_solution{nullptr}; + + // Diving state + bool recompute_bounds_and_basis{true}; + + // Diving statistics + i_t total_nodes_explored{0}; + i_t total_dives{0}; + i_t lp_iters_this_dive{0}; + + explicit determinism_diving_worker_t(int id, + search_strategy_t type, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_types, + const simplex_solver_settings_t& settings, + const std::vector* root_sol) + : base_t(id, original_lp, Arow, var_types, settings, "Diving_Worker_" + std::to_string(id)), + diving_type(type), + root_solution(root_sol) + { + dive_lower = original_lp.lower; + dive_upper = original_lp.upper; + } + + determinism_diving_worker_t(const determinism_diving_worker_t&) = delete; + determinism_diving_worker_t& operator=(const determinism_diving_worker_t&) = delete; + determinism_diving_worker_t(determinism_diving_worker_t&&) = default; + determinism_diving_worker_t& operator=(determinism_diving_worker_t&&) = default; + + bool has_work_impl() const { return !dive_queue.empty(); } + + void enqueue_dive_node(mip_node_t* node, + const lp_problem_t& original_lp, + const simplex_solver_settings_t& settings) + { + dive_queue_entry_t entry; + entry.resolved_lower = original_lp.lower; + entry.resolved_upper = original_lp.upper; + std::vector bounds_changed(original_lp.num_cols, false); + node->get_variable_bounds(entry.resolved_lower, entry.resolved_upper, bounds_changed); + 
this->node_presolver.bounds_strengthening( + settings, bounds_changed, entry.resolved_lower, entry.resolved_upper); + entry.node = node->detach_copy(); + dive_queue.push_back(std::move(entry)); + } + + std::optional> dequeue_dive_node() + { + if (dive_queue.empty()) return std::nullopt; + auto entry = std::move(dive_queue.front()); + dive_queue.pop_front(); + ++total_dives; + return entry; + } + + size_t dive_queue_size() const { return dive_queue.size(); } + size_t queue_size() const { return dive_queue_size(); } // Unified interface for pool + + void queue_integer_solution(f_t objective, const std::vector& solution, i_t depth) + { + this->integer_solutions.push_back( + {objective, solution, depth, this->worker_id, this->next_solution_seq++}); + ++this->total_integer_solutions; + } + + branch_variable_t variable_selection_from_snapshot(const std::vector& fractional, + const std::vector& solution) const + { + assert(root_solution != nullptr); + return pseudocost_diving_from_arrays(this->pc_sum_down_snapshot.data(), + this->pc_sum_up_snapshot.data(), + this->pc_num_down_snapshot.data(), + this->pc_num_up_snapshot.data(), + (i_t)this->pc_sum_down_snapshot.size(), + fractional, + solution, + *root_solution); + } + + branch_variable_t guided_variable_selection(const std::vector& fractional, + const std::vector& solution) const + { + if (this->incumbent_snapshot.empty()) { + return variable_selection_from_snapshot(fractional, solution); + } + return guided_diving_from_arrays(this->pc_sum_down_snapshot.data(), + this->pc_sum_up_snapshot.data(), + this->pc_num_down_snapshot.data(), + this->pc_num_up_snapshot.data(), + (i_t)this->pc_sum_down_snapshot.size(), + fractional, + solution, + this->incumbent_snapshot); + } + + f_t obj_estimate_from_snapshot(const std::vector& fractional, + const std::vector& solution, + f_t lower_bound) const + { + return obj_estimate_from_arrays(this->pc_sum_down_snapshot.data(), + this->pc_sum_up_snapshot.data(), + this->pc_num_down_snapshot.data(), + this->pc_num_up_snapshot.data(), + (i_t)this->pc_sum_down_snapshot.size(), + fractional, + solution, + lower_bound); + } +}; + +template +class determinism_worker_pool_base_t { + protected: + std::vector workers_; + + public: + WorkerT& operator[](int worker_id) { return workers_[worker_id]; } + const WorkerT& operator[](int worker_id) const { return workers_[worker_id]; } + int size() const { return static_cast(workers_.size()); } + + bool any_has_work() const + { + for (const auto& worker : workers_) { + if (worker.has_work()) return true; + } + return false; + } + + size_t total_queue_size() const + { + size_t total = 0; + for (const auto& worker : workers_) { + total += worker.queue_size(); + } + return total; + } + + bb_event_batch_t collect_and_sort_events() + { + bb_event_batch_t all_events; + for (auto& worker : workers_) { + static_cast(this)->collect_worker_events(worker, all_events); + } + all_events.sort_for_replay(); + return all_events; + } + + auto begin() { return workers_.begin(); } + auto end() { return workers_.end(); } + auto begin() const { return workers_.begin(); } + auto end() const { return workers_.end(); } +}; + +template +class determinism_bfs_worker_pool_t + : public determinism_worker_pool_base_t, + determinism_bfs_worker_pool_t> { + using base_t = determinism_worker_pool_base_t, + determinism_bfs_worker_pool_t>; + + public: + determinism_bfs_worker_pool_t(int num_workers, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_types, + const 
simplex_solver_settings_t& settings) + { + this->workers_.reserve(num_workers); + for (int i = 0; i < num_workers; ++i) { + this->workers_.emplace_back(i, original_lp, Arow, var_types, settings); + } + } + + void collect_worker_events(determinism_bfs_worker_t& worker, + bb_event_batch_t& all_events) + { + for (auto& event : worker.events.events) { + all_events.add(std::move(event)); + } + worker.events.clear(); + } +}; + +template +class determinism_diving_worker_pool_t + : public determinism_worker_pool_base_t, + determinism_diving_worker_pool_t> { + using base_t = determinism_worker_pool_base_t, + determinism_diving_worker_pool_t>; + + public: + determinism_diving_worker_pool_t(int num_workers, + const std::vector& diving_types, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_types, + const simplex_solver_settings_t& settings, + const std::vector* root_solution) + { + this->workers_.reserve(num_workers); + for (int i = 0; i < num_workers; ++i) { + search_strategy_t type = diving_types[i % diving_types.size()]; + this->workers_.emplace_back(i, type, original_lp, Arow, var_types, settings, root_solution); + } + } + + void collect_worker_events(determinism_diving_worker_t&, bb_event_batch_t&) {} +}; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/device_sparse_matrix.cu b/cpp/src/dual_simplex/device_sparse_matrix.cu index 86ec99c7b..11c3798b8 100644 --- a/cpp/src/dual_simplex/device_sparse_matrix.cu +++ b/cpp/src/dual_simplex/device_sparse_matrix.cu @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -27,29 +27,9 @@ void csc_matrix_t::scale_columns(const std::vector& sc } #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE -template int -matrix_vector_multiply, PinnedHostAllocator>( - const csc_matrix_t& A, - double alpha, - const std::vector>& x, - double beta, - std::vector>& y); -template int -matrix_vector_multiply, std::allocator>( - const csc_matrix_t& A, - double alpha, - const std::vector>& x, - double beta, - std::vector>& y); - -template int -matrix_vector_multiply, PinnedHostAllocator>( - const csc_matrix_t& A, - double alpha, - const std::vector>& x, - double beta, - std::vector>& y); +// NOTE: matrix_vector_multiply is now templated on VectorX and VectorY. +// Since it's defined inline in the header, no explicit instantiation is needed here. template int matrix_transpose_vector_multiply pseudocost_diving(pseudo_costs_t& pc, const std::vector& root_solution, logger_t& log) { - i_t branch_var = -1; - f_t max_score = -1; - rounding_direction_t round_dir = rounding_direction_t::NONE; - constexpr f_t eps = 1e-6; - - i_t num_initialized_down; - i_t num_initialized_up; - f_t pseudo_cost_down_avg; - f_t pseudo_cost_up_avg; - pc.initialized( - num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg); - - for (i_t j : fractional) { - rounding_direction_t dir = rounding_direction_t::NONE; - f_t f_down = solution[j] - std::floor(solution[j]); - f_t f_up = std::ceil(solution[j]) - solution[j]; - - f_t pc_down = pc.pseudo_cost_num_down[j] != 0 - ? pc.pseudo_cost_sum_down[j] / pc.pseudo_cost_num_down[j] - : pseudo_cost_down_avg; - - f_t pc_up = pc.pseudo_cost_num_up[j] != 0 ? 
pc.pseudo_cost_sum_up[j] / pc.pseudo_cost_num_up[j] - : pseudo_cost_up_avg; - - f_t score_down = std::sqrt(f_up) * (1 + pc_up) / (1 + pc_down); - f_t score_up = std::sqrt(f_down) * (1 + pc_down) / (1 + pc_up); - f_t score = 0; - - if (solution[j] < root_solution[j] - 0.4) { - score = score_down; - dir = rounding_direction_t::DOWN; - } else if (solution[j] > root_solution[j] + 0.4) { - score = score_up; - dir = rounding_direction_t::UP; - } else if (f_down < 0.3) { - score = score_down; - dir = rounding_direction_t::DOWN; - } else if (f_down > 0.7) { - score = score_up; - dir = rounding_direction_t::UP; - } else if (pc_down < pc_up + eps) { - score = score_down; - dir = rounding_direction_t::DOWN; - } else { - score = score_up; - dir = rounding_direction_t::UP; - } - - if (score > max_score) { - max_score = score; - branch_var = j; - round_dir = dir; - } - } - - // If we cannot choose the variable, then arbitrarily pick the first - // fractional variable and round it down. This only happens when - // there is only one fractional variable and its the pseudocost is - // infinite for both direction. - if (round_dir == rounding_direction_t::NONE) { - branch_var = fractional[0]; - round_dir = rounding_direction_t::DOWN; - } - - assert(round_dir != rounding_direction_t::NONE); - assert(branch_var >= 0); - - log.debug("Pseudocost diving: selected var %d with val = %e, round dir = %d and score = %e\n", - branch_var, - solution[branch_var], - round_dir, - max_score); - - return {branch_var, round_dir}; + return pseudocost_diving_from_arrays((const f_t*)pc.pseudo_cost_sum_down.data(), + (const f_t*)pc.pseudo_cost_sum_up.data(), + (const i_t*)pc.pseudo_cost_num_down.data(), + (const i_t*)pc.pseudo_cost_num_up.data(), + (i_t)pc.pseudo_cost_sum_down.size(), + fractional, + solution, + root_solution); } template @@ -154,54 +88,14 @@ branch_variable_t guided_diving(pseudo_costs_t& pc, const std::vector& incumbent, logger_t& log) { - i_t branch_var = -1; - f_t max_score = -1; - rounding_direction_t round_dir = rounding_direction_t::NONE; - constexpr f_t eps = 1e-6; - - i_t num_initialized_down; - i_t num_initialized_up; - f_t pseudo_cost_down_avg; - f_t pseudo_cost_up_avg; - pc.initialized( - num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg); - - for (i_t j : fractional) { - f_t f_down = solution[j] - std::floor(solution[j]); - f_t f_up = std::ceil(solution[j]) - solution[j]; - f_t down_dist = std::abs(incumbent[j] - std::floor(solution[j])); - f_t up_dist = std::abs(std::ceil(solution[j]) - incumbent[j]); - rounding_direction_t dir = - down_dist < up_dist + eps ? rounding_direction_t::DOWN : rounding_direction_t::UP; - - f_t pc_down = pc.pseudo_cost_num_down[j] != 0 - ? pc.pseudo_cost_sum_down[j] / pc.pseudo_cost_num_down[j] - : pseudo_cost_down_avg; - - f_t pc_up = pc.pseudo_cost_num_up[j] != 0 ? pc.pseudo_cost_sum_up[j] / pc.pseudo_cost_num_up[j] - : pseudo_cost_up_avg; - - f_t score1 = dir == rounding_direction_t::DOWN ? 5 * pc_down * f_down : 5 * pc_up * f_up; - f_t score2 = dir == rounding_direction_t::DOWN ? 
pc_up * f_up : pc_down * f_down; - f_t score = (score1 + score2) / 6; - - if (score > max_score) { - max_score = score; - branch_var = j; - round_dir = dir; - } - } - - assert(round_dir != rounding_direction_t::NONE); - assert(branch_var >= 0); - - log.debug("Guided diving: selected var %d with val = %e, round dir = %d and score = %e\n", - branch_var, - solution[branch_var], - round_dir, - max_score); - - return {branch_var, round_dir}; + return guided_diving_from_arrays((const f_t*)pc.pseudo_cost_sum_down.data(), + (const f_t*)pc.pseudo_cost_sum_up.data(), + (const i_t*)pc.pseudo_cost_num_down.data(), + (const i_t*)pc.pseudo_cost_num_up.data(), + (i_t)pc.pseudo_cost_sum_down.size(), + fractional, + solution, + incumbent); } template diff --git a/cpp/src/dual_simplex/diving_heuristics.hpp b/cpp/src/dual_simplex/diving_heuristics.hpp index 3c6d77c04..bef2a4490 100644 --- a/cpp/src/dual_simplex/diving_heuristics.hpp +++ b/cpp/src/dual_simplex/diving_heuristics.hpp @@ -20,6 +20,128 @@ struct branch_variable_t { rounding_direction_t direction; }; +template +branch_variable_t pseudocost_diving_from_arrays(const f_t* pc_sum_down, + const f_t* pc_sum_up, + const i_t* pc_num_down, + const i_t* pc_num_up, + i_t n_vars, + const std::vector& fractional, + const std::vector& solution, + const std::vector& root_solution) +{ + const i_t num_fractional = fractional.size(); + if (num_fractional == 0) return {-1, rounding_direction_t::NONE}; + + auto avgs = compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); + + i_t branch_var = fractional[0]; + f_t max_score = std::numeric_limits::lowest(); + rounding_direction_t round_dir = rounding_direction_t::DOWN; + constexpr f_t eps = f_t(1e-6); + + for (i_t j : fractional) { + f_t f_down = solution[j] - std::floor(solution[j]); + f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t pc_down = pc_num_down[j] != 0 ? pc_sum_down[j] / pc_num_down[j] : avgs.down_avg; + f_t pc_up = pc_num_up[j] != 0 ? pc_sum_up[j] / pc_num_up[j] : avgs.up_avg; + + f_t score_down = std::sqrt(f_up) * (1 + pc_up) / (1 + pc_down); + f_t score_up = std::sqrt(f_down) * (1 + pc_down) / (1 + pc_up); + + f_t score = 0; + rounding_direction_t dir = rounding_direction_t::DOWN; + + f_t root_val = (j < static_cast(root_solution.size())) ? root_solution[j] : solution[j]; + + if (solution[j] < root_val - f_t(0.4)) { + score = score_down; + dir = rounding_direction_t::DOWN; + } else if (solution[j] > root_val + f_t(0.4)) { + score = score_up; + dir = rounding_direction_t::UP; + } else if (f_down < f_t(0.3)) { + score = score_down; + dir = rounding_direction_t::DOWN; + } else if (f_down > f_t(0.7)) { + score = score_up; + dir = rounding_direction_t::UP; + } else if (pc_down < pc_up + eps) { + score = score_down; + dir = rounding_direction_t::DOWN; + } else { + score = score_up; + dir = rounding_direction_t::UP; + } + + if (score > max_score) { + max_score = score; + branch_var = j; + round_dir = dir; + } + } + + // If we cannot choose a variable, arbitrarily pick the first + // fractional variable and round it down. This only happens when + // there is only one fractional variable and its pseudocost is + // infinite in both directions.
+ if (round_dir == rounding_direction_t::NONE) { + branch_var = fractional[0]; + round_dir = rounding_direction_t::DOWN; + } + + assert(round_dir != rounding_direction_t::NONE); + assert(branch_var >= 0); + + return {branch_var, round_dir}; +} + +// Guided diving variable selection (lock-free implementation) +template +branch_variable_t guided_diving_from_arrays(const f_t* pc_sum_down, + const f_t* pc_sum_up, + const i_t* pc_num_down, + const i_t* pc_num_up, + i_t n_vars, + const std::vector& fractional, + const std::vector& solution, + const std::vector& incumbent) +{ + const i_t num_fractional = fractional.size(); + if (num_fractional == 0) return {-1, rounding_direction_t::NONE}; + + auto avgs = compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); + + i_t branch_var = fractional[0]; + f_t max_score = std::numeric_limits::lowest(); + rounding_direction_t round_dir = rounding_direction_t::DOWN; + constexpr f_t eps = f_t(1e-6); + + for (i_t j : fractional) { + f_t f_down = solution[j] - std::floor(solution[j]); + f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t down_dist = std::abs(incumbent[j] - std::floor(solution[j])); + f_t up_dist = std::abs(std::ceil(solution[j]) - incumbent[j]); + rounding_direction_t dir = + down_dist < up_dist + eps ? rounding_direction_t::DOWN : rounding_direction_t::UP; + + f_t pc_down = pc_num_down[j] != 0 ? pc_sum_down[j] / pc_num_down[j] : avgs.down_avg; + f_t pc_up = pc_num_up[j] != 0 ? pc_sum_up[j] / pc_num_up[j] : avgs.up_avg; + + f_t score1 = dir == rounding_direction_t::DOWN ? 5 * pc_down * f_down : 5 * pc_up * f_up; + f_t score2 = dir == rounding_direction_t::DOWN ? pc_up * f_up : pc_down * f_down; + f_t score = (score1 + score2) / 6; + + if (score > max_score) { + max_score = score; + branch_var = j; + round_dir = dir; + } + } + + return {branch_var, round_dir}; +} + template branch_variable_t line_search_diving(const std::vector& fractional, const std::vector& solution, diff --git a/cpp/src/dual_simplex/folding.cpp b/cpp/src/dual_simplex/folding.cpp index c59d827c5..5ff2eda71 100644 --- a/cpp/src/dual_simplex/folding.cpp +++ b/cpp/src/dual_simplex/folding.cpp @@ -126,8 +126,8 @@ void compute_sums(const csc_matrix_t& A, // Find all vertices (columns) that have a neighbor in the refining color colors_to_update.reserve(num_col_colors); find_vertices_to_refine(refining_color.vertices, - Arow.row_start, - Arow.j, + Arow.row_start.underlying(), + Arow.j.underlying(), col_color_map, marked_vertices, vertices_to_refine, @@ -143,9 +143,9 @@ void compute_sums(const csc_matrix_t& A, compute_sums_of_refined_vertices(refining_color.color, refining_color.vertices, vertices_to_refine, - Arow.row_start, - Arow.j, - Arow.x, + Arow.row_start.underlying(), + Arow.j.underlying(), + Arow.x.underlying(), col_color_map, vertex_to_sum, max_sum_by_color); @@ -154,8 +154,8 @@ void compute_sums(const csc_matrix_t& A, // Find all vertices (rows) that have a neighbor in the refining color colors_to_update.reserve(num_row_colors); find_vertices_to_refine(refining_color.vertices, - A.col_start, - A.i, + A.col_start.underlying(), + A.i.underlying(), row_color_map, marked_vertices, vertices_to_refine, @@ -171,9 +171,9 @@ void compute_sums(const csc_matrix_t& A, compute_sums_of_refined_vertices(refining_color.color, refining_color.vertices, vertices_to_refine, - A.col_start, - A.i, - A.x, + A.col_start.underlying(), + A.i.underlying(), + A.x.underlying(), row_color_map, vertex_to_sum, max_sum_by_color); diff --git 
a/cpp/src/dual_simplex/initial_basis.cpp b/cpp/src/dual_simplex/initial_basis.cpp index 9dbe4052b..5da844904 100644 --- a/cpp/src/dual_simplex/initial_basis.cpp +++ b/cpp/src/dual_simplex/initial_basis.cpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -11,6 +11,8 @@ #include #include +#include + #include #include @@ -24,6 +26,7 @@ i_t initial_basis_selection(const lp_problem_t& problem, std::vector& vstatus, std::vector& dependent_rows) { + raft::common::nvtx::range scope("DualSimplex::initial_basis"); i_t m = problem.num_rows; i_t n = problem.num_cols; i_t nz = problem.A.col_start[n]; diff --git a/cpp/src/dual_simplex/mip_node.hpp b/cpp/src/dual_simplex/mip_node.hpp index 8b2211c1d..347d75aa2 100644 --- a/cpp/src/dual_simplex/mip_node.hpp +++ b/cpp/src/dual_simplex/mip_node.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -247,6 +248,9 @@ class mip_node_t { copy.children[0] = nullptr; copy.children[1] = nullptr; copy.status = node_status_t::PENDING; + + copy.origin_worker_id = origin_worker_id; + copy.creation_seq = creation_seq; return copy; } @@ -266,6 +270,32 @@ class mip_node_t { std::unique_ptr children[2]; std::vector vstatus; + + // Worker-local identification for deterministic ordering: + // - origin_worker_id: which worker created this node + // - creation_seq: sequence number within that worker (cumulative across horizons, serial) + // The tuple (origin_worker_id, creation_seq) is unique and stable + int32_t origin_worker_id{-1}; + int32_t creation_seq{-1}; + + uint64_t get_id_packed() const + { + return (static_cast(origin_worker_id + 1) << 32) | + static_cast(static_cast(creation_seq)); + } + + uint32_t compute_path_hash() const + { + std::vector path_steps; + const mip_node_t* node = this; + while (node != nullptr && node->branch_var >= 0) { + uint64_t step = static_cast(node->branch_var) << 1; + step |= (node->branch_dir == rounding_direction_t::UP) ? 
1 : 0; + path_steps.push_back(step); + node = node->parent; + } + return detail::compute_hash(path_steps); + } }; template diff --git a/cpp/src/dual_simplex/node_queue.hpp b/cpp/src/dual_simplex/node_queue.hpp index b5c9a9ea7..15e0fe9e0 100644 --- a/cpp/src/dual_simplex/node_queue.hpp +++ b/cpp/src/dual_simplex/node_queue.hpp @@ -59,6 +59,9 @@ class heap_t { void clear() { buffer.clear(); } bool empty() const { return buffer.empty(); } + // Read-only access to underlying buffer for iteration without modification + const std::vector& data() const { return buffer; } + private: std::vector buffer; Comp comp; diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index c74841c1c..3d46d0f7c 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -16,13 +16,72 @@ #include #include +#include +#include +#include +#include + +#include + +#define PHASE2_NVTX_RANGES + +#ifdef PHASE2_NVTX_RANGES +#define PHASE2_NVTX_RANGE(name) raft::common::nvtx::range NVTX_UNIQUE_NAME(nvtx_scope_)(name) +#define NVTX_UNIQUE_NAME(base) NVTX_CONCAT(base, __LINE__) +#define NVTX_CONCAT(a, b) NVTX_CONCAT_INNER(a, b) +#define NVTX_CONCAT_INNER(a, b) a##b +#else +#define PHASE2_NVTX_RANGE(name) ((void)0) +#endif + +#include +#include #include #include #include #include +#include namespace cuopt::linear_programming::dual_simplex { +constexpr int FEATURE_LOG_INTERVAL = 100; + +using cuopt::ins_vector; + +class nvtx_range_guard { + public: + explicit nvtx_range_guard(const char* name) : active_(true) + { + raft::common::nvtx::push_range(name); + } + + ~nvtx_range_guard() + { + if (active_) { raft::common::nvtx::pop_range(); } + } + + // Pop the range early, preventing the destructor from popping again + void pop() + { + if (active_) { + raft::common::nvtx::pop_range(); + active_ = false; + } + } + + // Check if the range is still active + bool is_active() const { return active_; } + + // Non-copyable, non-movable + nvtx_range_guard(const nvtx_range_guard&) = delete; + nvtx_range_guard& operator=(const nvtx_range_guard&) = delete; + nvtx_range_guard(nvtx_range_guard&&) = delete; + nvtx_range_guard& operator=(nvtx_range_guard&&) = delete; + + private: + bool active_; +}; + namespace phase2 { // Computes vectors farkas_y, farkas_zl, farkas_zu that satisfy @@ -225,8 +284,6 @@ void initial_perturbation(const lp_problem_t& lp, const f_t dual_tol = settings.dual_tol; - std::srand(static_cast(std::time(nullptr))); - objective.resize(n); f_t sum_perturb = 0.0; i_t num_perturb = 0; @@ -269,38 +326,55 @@ template void compute_reduced_cost_update(const lp_problem_t& lp, const std::vector& basic_list, const std::vector& nonbasic_list, - const std::vector& delta_y, + const ins_vector& delta_y, i_t leaving_index, i_t direction, - std::vector& delta_z_mark, - std::vector& delta_z_indices, - std::vector& delta_z) + ins_vector& delta_z_mark, + ins_vector& delta_z_indices, + ins_vector& delta_z) { const i_t m = lp.num_rows; const i_t n = lp.num_cols; + const f_t* __restrict__ delta_y_ptr = delta_y.data(); + const f_t* __restrict__ Ax = lp.A.x.data(); + const i_t* __restrict__ Ai = lp.A.i.data(); + const i_t* __restrict__ ptr_col_start = lp.A.col_start.data(); + f_t* __restrict__ delta_z_ptr = delta_z.data(); + + size_t nnzs_processed = 0; + // delta_zB = sigma*ei for (i_t k = 0; k < m; k++) { - const i_t j = basic_list[k]; - delta_z[j] = 0; + const i_t j = basic_list[k]; + delta_z_ptr[j] = 0; } - delta_z[leaving_index] = direction; + delta_z_ptr[leaving_index] = direction; // delta_zN = 
-N'*delta_y - for (i_t k = 0; k < n - m; k++) { + const i_t num_nonbasic = n - m; + for (i_t k = 0; k < num_nonbasic; k++) { const i_t j = nonbasic_list[k]; // z_j <- -A(:, j)'*delta_y - const i_t col_start = lp.A.col_start[j]; - const i_t col_end = lp.A.col_start[j + 1]; + const i_t col_start = ptr_col_start[j]; + const i_t col_end = ptr_col_start[j + 1]; f_t dot = 0.0; for (i_t p = col_start; p < col_end; ++p) { - dot += lp.A.x[p] * delta_y[lp.A.i[p]]; + dot += Ax[p] * delta_y_ptr[Ai[p]]; } - delta_z[j] = -dot; + nnzs_processed += col_end - col_start; + + delta_z_ptr[j] = -dot; if (dot != 0.0) { delta_z_indices.push_back(j); // Note delta_z_indices has n elements reserved delta_z_mark[j] = 1; } } + + lp.A.x.byte_loads += nnzs_processed * sizeof(f_t); + lp.A.i.byte_loads += nnzs_processed * sizeof(i_t); + delta_y.byte_loads += nnzs_processed * sizeof(f_t); + lp.A.col_start.byte_loads += 2 * num_nonbasic * sizeof(i_t); + delta_z.byte_stores += (m + 1 + num_nonbasic) * sizeof(f_t); } template @@ -308,10 +382,10 @@ void compute_delta_z(const csc_matrix_t& A_transpose, const sparse_vector_t& delta_y, i_t leaving_index, i_t direction, - std::vector& nonbasic_mark, - std::vector& delta_z_mark, - std::vector& delta_z_indices, - std::vector& delta_z) + ins_vector& nonbasic_mark, + ins_vector& delta_z_mark, + ins_vector& delta_z_indices, + ins_vector& delta_z) { // delta_zN = - N'*delta_y const i_t nz_delta_y = delta_y.i.size(); @@ -374,6 +448,8 @@ void compute_reduced_costs(const std::vector& objective, const std::vector& nonbasic_list, std::vector& z) { + PHASE2_NVTX_RANGE("DualSimplex::compute_reduced_costs"); + const i_t m = A.m; const i_t n = A.n; // zN = cN - N'*y @@ -404,8 +480,10 @@ void compute_primal_variables(const basis_update_mpf_t& ft, const std::vector& basic_list, const std::vector& nonbasic_list, f_t tight_tol, - std::vector& x) + std::vector& x, + ins_vector& xB_workspace) { + PHASE2_NVTX_RANGE("DualSimplex::compute_primal_variables"); const i_t m = A.m; const i_t n = A.n; std::vector rhs = lp_rhs; @@ -422,21 +500,21 @@ void compute_primal_variables(const basis_update_mpf_t& ft, } } - std::vector xB(m); - ft.b_solve(rhs, xB); + xB_workspace.resize(m); + ft.b_solve(rhs, xB_workspace); for (i_t k = 0; k < m; ++k) { const i_t j = basic_list[k]; - x[j] = xB[k]; + x[j] = xB_workspace[k]; } } template void clear_delta_z(i_t entering_index, i_t leaving_index, - std::vector& delta_z_mark, - std::vector& delta_z_indices, - std::vector& delta_z) + ins_vector& delta_z_mark, + ins_vector& delta_z_indices, + ins_vector& delta_z) { for (i_t k = 0; k < delta_z_indices.size(); k++) { const i_t j = delta_z_indices[k]; @@ -452,7 +530,7 @@ template void clear_delta_x(const std::vector& basic_list, i_t entering_index, sparse_vector_t& scaled_delta_xB_sparse, - std::vector& delta_x) + ins_vector& delta_x) { const i_t scaled_delta_xB_nz = scaled_delta_xB_sparse.i.size(); for (i_t k = 0; k < scaled_delta_xB_nz; ++k) { @@ -472,6 +550,8 @@ void compute_dual_residual(const csc_matrix_t& A, const std::vector& z, std::vector& dual_residual) { + PHASE2_NVTX_RANGE("DualSimplex::compute_dual_residual"); + dual_residual = z; const i_t n = A.n; // r = A'*y + z - c @@ -517,7 +597,7 @@ void vstatus_changes(const std::vector& vstatus, template void compute_bounded_info(const std::vector& lower, const std::vector& upper, - std::vector& bounded_variables) + ins_vector& bounded_variables) { const size_t n = lower.size(); for (size_t j = 0; j < n; j++) { @@ -538,7 +618,7 @@ void compute_dual_solution_from_basis(const 
lp_problem_t& lp, const i_t n = lp.num_cols; y.resize(m); - std::vector cB(m); + ins_vector cB(m); for (i_t k = 0; k < m; ++k) { const i_t j = basic_list[k]; cB[k] = lp.objective[j]; @@ -577,7 +657,8 @@ i_t compute_primal_solution_from_basis(const lp_problem_t& lp, const std::vector& basic_list, const std::vector& nonbasic_list, const std::vector& vstatus, - std::vector& x) + std::vector& x, + ins_vector& xB_workspace) { const i_t m = lp.num_rows; const i_t n = lp.num_cols; @@ -607,12 +688,12 @@ i_t compute_primal_solution_from_basis(const lp_problem_t& lp, } } - std::vector xB(m); - ft.b_solve(rhs, xB); + xB_workspace.resize(m); + ft.b_solve(rhs, xB_workspace); for (i_t k = 0; k < m; ++k) { const i_t j = basic_list[k]; - x[j] = xB[k]; + x[j] = xB_workspace[k]; } return 0; } @@ -622,10 +703,11 @@ f_t compute_initial_primal_infeasibilities(const lp_problem_t& lp, const simplex_solver_settings_t& settings, const std::vector& basic_list, const std::vector& x, - std::vector& squared_infeasibilities, - std::vector& infeasibility_indices, + ins_vector& squared_infeasibilities, + ins_vector& infeasibility_indices, f_t& primal_inf) { + PHASE2_NVTX_RANGE("DualSimplex::compute_initial_primal_infeasibilities"); const i_t m = lp.num_rows; const i_t n = lp.num_cols; squared_infeasibilities.resize(n); @@ -728,8 +810,8 @@ void update_primal_infeasibilities(const lp_problem_t& lp, } template -void clean_up_infeasibilities(std::vector& squared_infeasibilities, - std::vector& infeasibility_indices) +void clean_up_infeasibilities(ins_vector& squared_infeasibilities, + ins_vector& infeasibility_indices) { bool needs_clean_up = false; for (i_t k = 0; k < infeasibility_indices.size(); ++k) { @@ -758,9 +840,9 @@ i_t steepest_edge_pricing_with_infeasibilities(const lp_problem_t& lp, const simplex_solver_settings_t& settings, const std::vector& x, const std::vector& dy_steepest_edge, - const std::vector& basic_mark, - std::vector& squared_infeasibilities, - std::vector& infeasibility_indices, + const ins_vector& basic_mark, + ins_vector& squared_infeasibilities, + ins_vector& infeasibility_indices, i_t& direction, i_t& basic_leaving, f_t& max_val) @@ -904,7 +986,7 @@ f_t first_stage_harris(const lp_problem_t& lp, const std::vector& vstatus, const std::vector& nonbasic_list, std::vector& z, - std::vector& delta_z) + ins_vector& delta_z) { const i_t n = lp.num_cols; const i_t m = lp.num_rows; @@ -938,7 +1020,7 @@ i_t second_stage_harris(const lp_problem_t& lp, const std::vector& vstatus, const std::vector& nonbasic_list, const std::vector& z, - const std::vector& delta_z, + const ins_vector& delta_z, f_t max_step_length, f_t& step_length, i_t& nonbasic_entering) @@ -978,10 +1060,10 @@ i_t second_stage_harris(const lp_problem_t& lp, template i_t phase2_ratio_test(const lp_problem_t& lp, const simplex_solver_settings_t& settings, - const std::vector& vstatus, - const std::vector& nonbasic_list, + std::vector& vstatus, + std::vector& nonbasic_list, std::vector& z, - std::vector& delta_z, + ins_vector& delta_z, f_t& step_length, i_t& nonbasic_entering) { @@ -1031,20 +1113,21 @@ i_t phase2_ratio_test(const lp_problem_t& lp, template i_t flip_bounds(const lp_problem_t& lp, const simplex_solver_settings_t& settings, - const std::vector& bounded_variables, - const std::vector& objective, + const ins_vector& bounded_variables, + const ins_vector& objective, const std::vector& z, - const std::vector& delta_z_indices, + const ins_vector& delta_z_indices, const std::vector& nonbasic_list, i_t entering_index, std::vector& 
vstatus, - std::vector& delta_x, - std::vector& mark, - std::vector& atilde, - std::vector& atilde_index) + ins_vector& delta_x, + ins_vector& mark, + ins_vector& atilde, + ins_vector& atilde_index) { i_t num_flipped = 0; - for (i_t j : delta_z_indices) { + for (i_t k = 0; k < delta_z_indices.size(); ++k) { + const i_t j = delta_z_indices[k]; if (j == entering_index) { continue; } if (!bounded_variables[j]) { continue; } // x_j is now a nonbasic bounded variable that will not enter the basis this @@ -1249,12 +1332,12 @@ i_t update_steepest_edge_norms(const simplex_solver_settings_t& settin const sparse_vector_t& scaled_delta_xB, i_t basic_leaving_index, i_t entering_index, - std::vector& v, + ins_vector& v, + sparse_vector_t& v_sparse, std::vector& delta_y_steepest_edge) { - i_t m = basic_list.size(); const i_t delta_y_nz = delta_y_sparse.i.size(); - sparse_vector_t v_sparse(m, 0); + v_sparse.clear(); // B^T delta_y = - direction * e_basic_leaving_index // We want B v = - B^{-T} e_basic_leaving_index ft.b_solve(delta_y_sparse, v_sparse); @@ -1351,9 +1434,9 @@ i_t check_steepest_edge_norms(const simplex_solver_settings_t& setting const i_t m = basic_list.size(); for (i_t k = 0; k < m; ++k) { const i_t j = basic_list[k]; - std::vector ei(m); + ins_vector ei(m); ei[k] = -1.0; - std::vector delta_yi(m); + ins_vector delta_yi(m); ft.b_transpose_solve(ei, delta_yi); const f_t computed_norm = vector_norm2_squared(delta_yi); const f_t updated_norm = delta_y_steepest_edge[j]; @@ -1369,9 +1452,9 @@ i_t check_steepest_edge_norms(const simplex_solver_settings_t& setting template i_t compute_perturbation(const lp_problem_t& lp, const simplex_solver_settings_t& settings, - const std::vector& delta_z_indices, + const ins_vector& delta_z_indices, std::vector& z, - std::vector& objective, + ins_vector& objective, f_t& sum_perturb) { const i_t n = lp.num_cols; @@ -1418,8 +1501,8 @@ i_t compute_perturbation(const lp_problem_t& lp, template void reset_basis_mark(const std::vector& basic_list, const std::vector& nonbasic_list, - std::vector& basic_mark, - std::vector& nonbasic_mark) + ins_vector& basic_mark, + ins_vector& nonbasic_mark) { const i_t m = basic_list.size(); const i_t n = nonbasic_mark.size(); @@ -1487,8 +1570,8 @@ void compute_delta_y(const basis_update_mpf_t& ft, template i_t update_dual_variables(const sparse_vector_t& delta_y_sparse, - const std::vector& delta_z_indices, - const std::vector& delta_z, + const ins_vector& delta_z_indices, + const ins_vector& delta_z, f_t step_length, i_t leaving_index, std::vector& y, @@ -1514,21 +1597,23 @@ i_t update_dual_variables(const sparse_vector_t& delta_y_sparse, template void adjust_for_flips(const basis_update_mpf_t& ft, const std::vector& basic_list, - const std::vector& delta_z_indices, - std::vector& atilde_index, - std::vector& atilde, - std::vector& atilde_mark, + const ins_vector& delta_z_indices, + ins_vector& atilde_index, + ins_vector& atilde, + ins_vector& atilde_mark, + sparse_vector_t& atilde_sparse, sparse_vector_t& delta_xB_0_sparse, - std::vector& delta_x_flip, + ins_vector& delta_x_flip, std::vector& x) { - const i_t m = basic_list.size(); const i_t atilde_nz = atilde_index.size(); // B*delta_xB_0 = atilde - sparse_vector_t atilde_sparse(m, atilde_nz); + atilde_sparse.clear(); + atilde_sparse.i.reserve(atilde_nz); + atilde_sparse.x.reserve(atilde_nz); for (i_t k = 0; k < atilde_nz; ++k) { - atilde_sparse.i[k] = atilde_index[k]; - atilde_sparse.x[k] = atilde[atilde_index[k]]; + atilde_sparse.i.push_back(atilde_index[k]); + 
atilde_sparse.x.push_back(atilde[atilde_index[k]]); } ft.b_solve(atilde_sparse, delta_xB_0_sparse); const i_t delta_xB_0_nz = delta_xB_0_sparse.i.size(); @@ -1537,7 +1622,8 @@ void adjust_for_flips(const basis_update_mpf_t& ft, x[j] += delta_xB_0_sparse.x[k]; } - for (i_t j : delta_z_indices) { + for (i_t k = 0; k < delta_z_indices.size(); ++k) { + const i_t j = delta_z_indices[k]; x[j] += delta_x_flip[j]; delta_x_flip[j] = 0.0; } @@ -1561,12 +1647,12 @@ i_t compute_delta_x(const lp_problem_t& lp, i_t basic_leaving_index, i_t direction, const std::vector& basic_list, - const std::vector& delta_x_flip, + const ins_vector& delta_x_flip, const sparse_vector_t& rhs_sparse, const std::vector& x, sparse_vector_t& utilde_sparse, sparse_vector_t& scaled_delta_xB_sparse, - std::vector& delta_x) + ins_vector& delta_x) { f_t delta_x_leaving = direction == 1 ? lp.lower[leaving_index] - x[leaving_index] : lp.upper[leaving_index] - x[leaving_index]; @@ -1624,7 +1710,7 @@ i_t compute_delta_x(const lp_problem_t& lp, template void update_primal_variables(const sparse_vector_t& scaled_delta_xB_sparse, const std::vector& basic_list, - const std::vector& delta_x, + const ins_vector& delta_x, i_t entering_index, std::vector& x) { @@ -1640,9 +1726,9 @@ void update_primal_variables(const sparse_vector_t& scaled_delta_xB_sp template void update_objective(const std::vector& basic_list, - const std::vector& changed_basic_indices, + const ins_vector& changed_basic_indices, const std::vector& objective, - const std::vector& delta_x, + const ins_vector& delta_x, i_t entering_index, f_t& obj) { @@ -1848,8 +1934,8 @@ void check_primal_infeasibilities(const lp_problem_t& lp, const simplex_solver_settings_t& settings, const std::vector& basic_list, const std::vector& x, - const std::vector& squared_infeasibilities, - const std::vector& infeasibility_indices) + const ins_vector& squared_infeasibilities, + const ins_vector& infeasibility_indices) { const i_t m = basic_list.size(); for (i_t k = 0; k < m; ++k) { @@ -1879,8 +1965,8 @@ void check_primal_infeasibilities(const lp_problem_t& lp, template void check_basic_infeasibilities(const std::vector& basic_list, - const std::vector& basic_mark, - const std::vector& infeasibility_indices, + const ins_vector& basic_mark, + const ins_vector& infeasibility_indices, i_t info) { for (i_t k = 0; k < infeasibility_indices.size(); ++k) { @@ -1923,8 +2009,8 @@ template void check_basis_mark(const simplex_solver_settings_t& settings, const std::vector& basic_list, const std::vector& nonbasic_list, - const std::vector& basic_mark, - const std::vector& nonbasic_mark) + const ins_vector& basic_mark, + const ins_vector& nonbasic_mark) { const i_t m = basic_list.size(); const i_t n = basic_mark.size(); @@ -1978,6 +2064,7 @@ void set_primal_variables_on_bounds(const lp_problem_t& lp, std::vector& vstatus, std::vector& x) { + PHASE2_NVTX_RANGE("DualSimplex::set_primal_variables_on_bounds"); const i_t n = lp.num_cols; f_t tol = 1e-10; for (i_t j = 0; j < n; ++j) { @@ -2064,7 +2151,7 @@ void set_primal_variables_on_bounds(const lp_problem_t& lp, } template -f_t compute_perturbed_objective(const std::vector& objective, const std::vector& x) +f_t compute_perturbed_objective(const ins_vector& objective, const std::vector& x) { const size_t n = objective.size(); f_t obj_val = 0.0; @@ -2075,7 +2162,7 @@ f_t compute_perturbed_objective(const std::vector& objective, const std::ve } template -f_t amount_of_perturbation(const lp_problem_t& lp, const std::vector& objective) +f_t 
amount_of_perturbation(const lp_problem_t& lp, const ins_vector& objective) { f_t perturbation = 0.0; const i_t n = lp.num_cols; @@ -2091,7 +2178,7 @@ void prepare_optimality(i_t info, const lp_problem_t& lp, const simplex_solver_settings_t& settings, basis_update_mpf_t& ft, - const std::vector& objective, + const ins_vector& objective, const std::vector& basic_list, const std::vector& nonbasic_list, const std::vector& vstatus, @@ -2261,6 +2348,7 @@ dual::status_t dual_phase2(i_t phase, i_t& iter, std::vector& delta_y_steepest_edge) { + PHASE2_NVTX_RANGE("DualSimplex::phase2"); const i_t m = lp.num_rows; const i_t n = lp.num_cols; std::vector basic_list(m); @@ -2296,8 +2384,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, std::vector& nonbasic_list, lp_solution_t& sol, i_t& iter, - std::vector& delta_y_steepest_edge) + std::vector& delta_y_steepest_edge, + work_limit_context_t* work_unit_context) { + PHASE2_NVTX_RANGE("DualSimplex::phase2_advanced"); const i_t m = lp.num_rows; const i_t n = lp.num_cols; assert(m <= n); @@ -2313,10 +2403,22 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, std::vector& y = sol.y; std::vector& z = sol.z; + // Declare instrumented vectors used during initialization (before aggregator setup) + // Perturbed objective + ins_vector objective(lp.objective); + ins_vector c_basic(m); + ins_vector xB_workspace(m); + + // Create instrumentation aggregator early to capture init section memory operations + instrumentation_aggregator_t aggregator; + + aggregator.add("objective", objective); + aggregator.add("c_basic", c_basic); + aggregator.add("xB_workspace", xB_workspace); + dual::status_t status = dual::status_t::UNSET; - // Perturbed objective - std::vector objective = lp.objective; + nvtx_range_guard init_scope("DualSimplex::phase2_advanced_init"); settings.log.printf("Dual Simplex Phase %d\n", phase); std::vector vstatus_old = vstatus; @@ -2324,6 +2426,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, phase2::bound_info(lp, settings); if (initialize_basis) { + PHASE2_NVTX_RANGE("DualSimplex::init_basis"); std::vector superbasic_list; nonbasic_list.clear(); nonbasic_list.reserve(n - m); @@ -2339,7 +2442,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } } - std::vector c_basic(m); + // Populate c_basic after basis is initialized for (i_t k = 0; k < m; ++k) { const i_t j = basic_list[k]; c_basic[k] = objective[j]; @@ -2354,7 +2457,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, "|| y || %e || cB || %e\n", vector_norm_inf(y), vector_norm_inf(c_basic)); } - phase2::compute_reduced_costs(objective, lp.A, y, basic_list, nonbasic_list, z); + phase2::compute_reduced_costs(objective.underlying(), lp.A, y, basic_list, nonbasic_list, z); if constexpr (print_norms) { settings.log.printf("|| z || %e\n", vector_norm_inf(z)); } #ifdef COMPUTE_DUAL_RESIDUAL @@ -2390,7 +2493,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, } phase2::compute_primal_variables( - ft, lp.rhs, lp.A, basic_list, nonbasic_list, settings.tight_tol, x); + ft, lp.rhs, lp.A, basic_list, nonbasic_list, settings.tight_tol, x, xB_workspace); if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } if (print_norms) { settings.log.printf("|| x || %e\n", vector_norm2(x)); } @@ -2405,6 +2508,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, #endif if (delta_y_steepest_edge.size() == 0) { + 
PHASE2_NVTX_RANGE("DualSimplex::initialize_steepest_edge_norms"); delta_y_steepest_edge.resize(n); if (slack_basis) { phase2::initialize_steepest_edge_norms_from_slack_basis( @@ -2433,26 +2537,28 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, } const i_t iter_limit = settings.iteration_limit; - std::vector delta_y(m, 0.0); - std::vector delta_z(n, 0.0); - std::vector delta_x(n, 0.0); - std::vector delta_x_flip(n, 0.0); - std::vector atilde(m, 0.0); - std::vector atilde_mark(m, 0); - std::vector atilde_index; - std::vector nonbasic_mark(n); - std::vector basic_mark(n); - std::vector delta_z_mark(n, 0); - std::vector delta_z_indices; - std::vector v(m, 0.0); - std::vector squared_infeasibilities; - std::vector infeasibility_indices; + + // Instrumented vectors for memory access tracking + ins_vector delta_y(m, 0.0); + ins_vector delta_z(n, 0.0); + ins_vector delta_x(n, 0.0); + ins_vector delta_x_flip(n, 0.0); + ins_vector atilde(m, 0.0); + ins_vector atilde_mark(m, 0); + ins_vector atilde_index; + ins_vector nonbasic_mark(n); + ins_vector basic_mark(n); + ins_vector delta_z_mark(n, 0); + ins_vector delta_z_indices; + ins_vector v(m, 0.0); + ins_vector squared_infeasibilities; + ins_vector infeasibility_indices; delta_z_indices.reserve(n); phase2::reset_basis_mark(basic_list, nonbasic_list, basic_mark, nonbasic_mark); - std::vector bounded_variables(n, 0); + ins_vector bounded_variables(n, 0); phase2::compute_bounded_info(lp.lower, lp.upper, bounded_variables); f_t primal_infeasibility; @@ -2470,15 +2576,113 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, #endif csc_matrix_t A_transpose(1, 1, 0); - lp.A.transpose(A_transpose); + aggregator.add("A_transpose.col_start", A_transpose.col_start); + aggregator.add("A_transpose.i", A_transpose.i); + aggregator.add("A_transpose.x", A_transpose.x); + { + PHASE2_NVTX_RANGE("DualSimplex::transpose_A"); + lp.A.transpose(A_transpose); + } + f_t obj = compute_objective(lp, x); + init_scope.pop(); // End phase2_advanced_init range - f_t obj = compute_objective(lp, x); const i_t start_iter = iter; - i_t sparse_delta_z = 0; - i_t dense_delta_z = 0; + i_t sparse_delta_z = 0; + i_t dense_delta_z = 0; + i_t num_refactors = 0; + i_t total_bound_flips = 0; + f_t delta_y_nz_percentage = 0.0; phase2::phase2_timers_t timers(false); + // // Feature collection for regression training + // dual_simplex_features_t features; + // features.init_from_problem(lp, settings, phase, slack_basis != 0, initialize_basis); + // features.start_iteration = iter; + + // Sparse vectors for main loop (declared outside loop for instrumentation) + sparse_vector_t delta_y_sparse(m, 0); + sparse_vector_t UTsol_sparse(m, 0); + sparse_vector_t delta_xB_0_sparse(m, 0); + sparse_vector_t utilde_sparse(m, 0); + sparse_vector_t scaled_delta_xB_sparse(m, 0); + sparse_vector_t rhs_sparse(m, 0); + sparse_vector_t v_sparse(m, 0); // For steepest edge norms + sparse_vector_t atilde_sparse(m, 0); // For flip adjustments + + // Add remaining instrumented vectors to aggregator (x, y, z, objective, c_basic, xB_workspace + // added earlier) Delta vectors + aggregator.add("delta_y", delta_y); + aggregator.add("delta_z", delta_z); + aggregator.add("delta_x", delta_x); + aggregator.add("delta_x_flip", delta_x_flip); + aggregator.add("atilde", atilde); + aggregator.add("atilde_mark", atilde_mark); + aggregator.add("atilde_index", atilde_index); + aggregator.add("nonbasic_mark", nonbasic_mark); + aggregator.add("basic_mark", basic_mark); + aggregator.add("delta_z_mark", delta_z_mark); 
+ aggregator.add("delta_z_indices", delta_z_indices); + aggregator.add("v", v); + aggregator.add("squared_infeasibilities", squared_infeasibilities); + aggregator.add("infeasibility_indices", infeasibility_indices); + aggregator.add("bounded_variables", bounded_variables); + + // Add sparse vector internal arrays to aggregator + aggregator.add("delta_y_sparse.i", delta_y_sparse.i); + aggregator.add("delta_y_sparse.x", delta_y_sparse.x); + aggregator.add("UTsol_sparse.i", UTsol_sparse.i); + aggregator.add("UTsol_sparse.x", UTsol_sparse.x); + aggregator.add("delta_xB_0_sparse.i", delta_xB_0_sparse.i); + aggregator.add("delta_xB_0_sparse.x", delta_xB_0_sparse.x); + aggregator.add("utilde_sparse.i", utilde_sparse.i); + aggregator.add("utilde_sparse.x", utilde_sparse.x); + aggregator.add("scaled_delta_xB_sparse.i", scaled_delta_xB_sparse.i); + aggregator.add("scaled_delta_xB_sparse.x", scaled_delta_xB_sparse.x); + aggregator.add("rhs_sparse.i", rhs_sparse.i); + aggregator.add("rhs_sparse.x", rhs_sparse.x); + aggregator.add("v_sparse.i", v_sparse.i); + aggregator.add("v_sparse.x", v_sparse.x); + aggregator.add("atilde_sparse.i", atilde_sparse.i); + aggregator.add("atilde_sparse.x", atilde_sparse.x); + + // Add A matrix for entering column access during basis update + aggregator.add("A.col_start", lp.A.col_start); + aggregator.add("A.i", lp.A.i); + aggregator.add("A.x", lp.A.x); + + // Track iteration interval start time for runtime measurement + [[maybe_unused]] f_t interval_start_time = toc(start_time); + i_t last_feature_log_iter = iter; + + cuopt::scope_guard work_unit_guard([&]() { + i_t remaining_iters = iter - last_feature_log_iter; + if (remaining_iters <= 0) return; + + auto [total_loads, total_stores] = aggregator.collect_and_flush(); + // features.byte_loads = total_loads; + // features.byte_stores = total_stores; + + // f_t now = toc(start_time); + // features.interval_runtime = now - interval_start_time; + // interval_start_time = now; + + // features.iteration = iter; + // features.num_refactors = num_refactors; + // features.num_basis_updates = ft.num_updates(); + // features.sparse_delta_z_count = sparse_delta_z; + // features.dense_delta_z_count = dense_delta_z; + // features.total_bound_flips = total_bound_flips; + // features.num_infeasibilities = infeasibility_indices.size(); + // features.delta_y_nz_percentage = delta_y_nz_percentage; + // features.log_features(settings); + + if (work_unit_context) { + // TEMP; + work_unit_context->record_work((total_loads + total_stores) / 1e8); + } + }); + if (phase == 2) { settings.log.printf("%5d %+.16e %7d %.8e %.2e %.2f\n", 0, @@ -2490,27 +2694,31 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, } while (iter < iter_limit) { + PHASE2_NVTX_RANGE("DualSimplex::phase2_main_loop"); // Pricing i_t direction = 0; i_t basic_leaving_index = -1; i_t leaving_index = -1; f_t max_val; timers.start_timer(); - if (settings.use_steepest_edge_pricing) { - leaving_index = phase2::steepest_edge_pricing_with_infeasibilities(lp, - settings, - x, - delta_y_steepest_edge, - basic_mark, - squared_infeasibilities, - infeasibility_indices, - direction, - basic_leaving_index, - max_val); - } else { - // Max infeasibility pricing - leaving_index = phase2::phase2_pricing( - lp, settings, x, basic_list, direction, basic_leaving_index, primal_infeasibility); + { + PHASE2_NVTX_RANGE("DualSimplex::pricing"); + if (settings.use_steepest_edge_pricing) { + leaving_index = phase2::steepest_edge_pricing_with_infeasibilities(lp, + settings, + x, + 
delta_y_steepest_edge, + basic_mark, + squared_infeasibilities, + infeasibility_indices, + direction, + basic_leaving_index, + max_val); + } else { + // Max infeasibility pricing + leaving_index = phase2::phase2_pricing( + lp, settings, x, basic_list, direction, basic_leaving_index, primal_infeasibility); + } } timers.pricing_time += timers.stop_timer(); if (leaving_index == -1) { @@ -2596,9 +2804,12 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, // BTran // BT*delta_y = -delta_zB = -sigma*ei timers.start_timer(); - sparse_vector_t delta_y_sparse(m, 0); - sparse_vector_t UTsol_sparse(m, 0); - phase2::compute_delta_y(ft, basic_leaving_index, direction, delta_y_sparse, UTsol_sparse); + delta_y_sparse.clear(); + UTsol_sparse.clear(); + { + PHASE2_NVTX_RANGE("DualSimplex::btran"); + phase2::compute_delta_y(ft, basic_leaving_index, direction, delta_y_sparse, UTsol_sparse); + } timers.btran_time += timers.stop_timer(); const f_t steepest_edge_norm_check = delta_y_sparse.norm2_squared(); @@ -2623,31 +2834,34 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, for (i_t k = 0; k < nz_delta_y; k++) { if (std::abs(delta_y_sparse.x[k]) > 1e-12) { delta_y_nz0++; } } - const f_t delta_y_nz_percentage = delta_y_nz0 / static_cast(m) * 100.0; - const bool use_transpose = delta_y_nz_percentage <= 30.0; - if (use_transpose) { - sparse_delta_z++; - phase2::compute_delta_z(A_transpose, - delta_y_sparse, - leaving_index, - direction, - nonbasic_mark, - delta_z_mark, - delta_z_indices, - delta_z); - } else { - dense_delta_z++; - // delta_zB = sigma*ei - delta_y_sparse.to_dense(delta_y); - phase2::compute_reduced_cost_update(lp, - basic_list, - nonbasic_list, - delta_y, - leaving_index, - direction, - delta_z_mark, - delta_z_indices, - delta_z); + delta_y_nz_percentage = delta_y_nz0 / static_cast(m) * 100.0; + const bool use_transpose = delta_y_nz_percentage <= 30.0; + { + PHASE2_NVTX_RANGE("DualSimplex::delta_z"); + if (use_transpose) { + sparse_delta_z++; + phase2::compute_delta_z(A_transpose, + delta_y_sparse, + leaving_index, + direction, + nonbasic_mark, + delta_z_mark, + delta_z_indices, + delta_z); + } else { + dense_delta_z++; + // delta_zB = sigma*ei + delta_y_sparse.to_dense(delta_y); + phase2::compute_reduced_cost_update(lp, + basic_list, + nonbasic_list, + delta_y, + leaving_index, + direction, + delta_z_mark, + delta_z_indices, + delta_z); + } } timers.delta_z_time += timers.stop_timer(); @@ -2667,43 +2881,46 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, i_t nonbasic_entering_index = -1; const bool harris_ratio = settings.use_harris_ratio; const bool bound_flip_ratio = settings.use_bound_flip_ratio; - if (harris_ratio) { - f_t max_step_length = phase2::first_stage_harris(lp, vstatus, nonbasic_list, z, delta_z); - entering_index = phase2::second_stage_harris(lp, + { + PHASE2_NVTX_RANGE("DualSimplex::ratio_test"); + if (harris_ratio) { + f_t max_step_length = phase2::first_stage_harris(lp, vstatus, nonbasic_list, z, delta_z); + entering_index = phase2::second_stage_harris(lp, + vstatus, + nonbasic_list, + z, + delta_z, + max_step_length, + step_length, + nonbasic_entering_index); + } else if (bound_flip_ratio) { + timers.start_timer(); + f_t slope = direction == 1 ? 
(lp.lower[leaving_index] - x[leaving_index]) + : (x[leaving_index] - lp.upper[leaving_index]); + bound_flipping_ratio_test_t bfrt(settings, + start_time, + m, + n, + slope, + lp.lower, + lp.upper, + bounded_variables.underlying(), vstatus, nonbasic_list, z, - delta_z, - max_step_length, - step_length, - nonbasic_entering_index); - } else if (bound_flip_ratio) { - timers.start_timer(); - f_t slope = direction == 1 ? (lp.lower[leaving_index] - x[leaving_index]) - : (x[leaving_index] - lp.upper[leaving_index]); - bound_flipping_ratio_test_t bfrt(settings, - start_time, - m, - n, - slope, - lp.lower, - lp.upper, - bounded_variables, - vstatus, - nonbasic_list, - z, - delta_z, - delta_z_indices, - nonbasic_mark); - entering_index = bfrt.compute_step_length(step_length, nonbasic_entering_index); - if (entering_index == RATIO_TEST_NUMERICAL_ISSUES) { - settings.log.printf("Numerical issues encountered in ratio test.\n"); - return dual::status_t::NUMERICAL; + delta_z.underlying(), + delta_z_indices.underlying(), + nonbasic_mark); + entering_index = bfrt.compute_step_length(step_length, nonbasic_entering_index); + if (entering_index == RATIO_TEST_NUMERICAL_ISSUES) { + settings.log.printf("Numerical issues encountered in ratio test.\n"); + return dual::status_t::NUMERICAL; + } + timers.bfrt_time += timers.stop_timer(); + } else { + entering_index = phase2::phase2_ratio_test( + lp, settings, vstatus, nonbasic_list, z, delta_z, step_length, nonbasic_entering_index); } - timers.bfrt_time += timers.stop_timer(); - } else { - entering_index = phase2::phase2_ratio_test( - lp, settings, vstatus, nonbasic_list, z, delta_z, step_length, nonbasic_entering_index); } if (entering_index == RATIO_TEST_TIME_LIMIT) { return dual::status_t::TIME_LIMIT; } if (entering_index == CONCURRENT_HALT_RETURN) { return dual::status_t::CONCURRENT_LIMIT; } @@ -2730,7 +2947,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, std::vector unperturbed_x(n); phase2::compute_primal_solution_from_basis( - lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x); + lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x, xB_workspace); x = unperturbed_x; primal_infeasibility_squared = phase2::compute_initial_primal_infeasibilities(lp, @@ -2775,7 +2992,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, } else { std::vector unperturbed_x(n); phase2::compute_primal_solution_from_basis( - lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x); + lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x, xB_workspace); x = unperturbed_x; primal_infeasibility_squared = phase2::compute_initial_primal_infeasibilities(lp, @@ -2898,8 +3115,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, atilde_index); timers.flip_time += timers.stop_timer(); + total_bound_flips += num_flipped; - sparse_vector_t delta_xB_0_sparse(m, 0); + delta_xB_0_sparse.clear(); if (num_flipped > 0) { timers.start_timer(); phase2::adjust_for_flips(ft, @@ -2908,6 +3126,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, atilde_index, atilde, atilde_mark, + atilde_sparse, delta_xB_0_sparse, delta_x_flip, x); @@ -2915,24 +3134,27 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, } timers.start_timer(); - sparse_vector_t utilde_sparse(m, 0); - sparse_vector_t scaled_delta_xB_sparse(m, 0); - sparse_vector_t rhs_sparse(lp.A, entering_index); - if (phase2::compute_delta_x(lp, - ft, - entering_index, - leaving_index, - basic_leaving_index, - direction, - basic_list, - delta_x_flip, - rhs_sparse, - x, - utilde_sparse, - 
scaled_delta_xB_sparse, - delta_x) == -1) { - settings.log.printf("Failed to compute delta_x. Iter %d\n", iter); - return dual::status_t::NUMERICAL; + utilde_sparse.clear(); + scaled_delta_xB_sparse.clear(); + rhs_sparse.from_csc_column(lp.A, entering_index); + { + PHASE2_NVTX_RANGE("DualSimplex::ftran"); + if (phase2::compute_delta_x(lp, + ft, + entering_index, + leaving_index, + basic_leaving_index, + direction, + basic_list, + delta_x_flip, + rhs_sparse, + x, + utilde_sparse, + scaled_delta_xB_sparse, + delta_x) == -1) { + settings.log.printf("Failed to compute delta_x. Iter %d\n", iter); + return dual::status_t::NUMERICAL; + } } timers.ftran_time += timers.stop_timer(); @@ -2955,6 +3177,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, basic_leaving_index, entering_index, v, + v_sparse, delta_y_steepest_edge); #ifdef STEEPEST_EDGE_DEBUG if (steepest_edge_status == -1) { @@ -2998,9 +3221,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, x, entering_index, leaving_index, - delta_xB_0_sparse.i, - squared_infeasibilities, - infeasibility_indices, + delta_xB_0_sparse.i.underlying(), + squared_infeasibilities.underlying(), + infeasibility_indices.underlying(), primal_infeasibility_squared); // Update primal infeasibilities due to changes in basic variables // from the leaving and entering variables @@ -3010,17 +3233,17 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, x, entering_index, leaving_index, - scaled_delta_xB_sparse.i, - squared_infeasibilities, - infeasibility_indices, + scaled_delta_xB_sparse.i.underlying(), + squared_infeasibilities.underlying(), + infeasibility_indices.underlying(), primal_infeasibility_squared); // Update the entering variable phase2::update_single_primal_infeasibility(lp.lower, lp.upper, x, settings.primal_tol, - squared_infeasibilities, - infeasibility_indices, + squared_infeasibilities.underlying(), + infeasibility_indices.underlying(), entering_index, primal_infeasibility_squared); @@ -3060,62 +3283,67 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, timers.start_timer(); // Refactor or update the basis factorization - bool should_refactor = ft.num_updates() > settings.refactor_frequency; - if (!should_refactor) { - i_t recommend_refactor = ft.update(utilde_sparse, UTsol_sparse, basic_leaving_index); + { + PHASE2_NVTX_RANGE("DualSimplex::basis_update"); + bool should_refactor = ft.num_updates() > settings.refactor_frequency; + if (!should_refactor) { + i_t recommend_refactor = ft.update(utilde_sparse, UTsol_sparse, basic_leaving_index); #ifdef CHECK_UPDATE - phase2::check_update(lp, settings, ft, basic_list, basic_leaving_index); + phase2::check_update(lp, settings, ft, basic_list, basic_leaving_index); #endif - should_refactor = recommend_refactor == 1; - } + should_refactor = recommend_refactor == 1; + } #ifdef CHECK_BASIC_INFEASIBILITIES - phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 6); + phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 6); #endif - if (should_refactor) { - bool should_recompute_x = false; - if (ft.refactor_basis( - lp.A, settings, lp.lower, lp.upper, basic_list, nonbasic_list, vstatus) > 0) { - should_recompute_x = true; - settings.log.printf("Failed to factorize basis. 
Iteration %d\n", iter); - if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } - i_t count = 0; - i_t deficient_size; - while ((deficient_size = ft.refactor_basis( - lp.A, settings, lp.lower, lp.upper, basic_list, nonbasic_list, vstatus)) > 0) { - settings.log.printf("Failed to repair basis. Iteration %d. %d deficient columns.\n", - iter, - static_cast(deficient_size)); - + if (should_refactor) { + PHASE2_NVTX_RANGE("DualSimplex::refactorization"); + num_refactors++; + bool should_recompute_x = false; + if (ft.refactor_basis( + lp.A, settings, lp.lower, lp.upper, basic_list, nonbasic_list, vstatus) > 0) { + should_recompute_x = true; + settings.log.printf("Failed to factorize basis. Iteration %d\n", iter); if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } - settings.threshold_partial_pivoting_tol = 1.0; + i_t count = 0; + i_t deficient_size; + while ((deficient_size = ft.refactor_basis( + lp.A, settings, lp.lower, lp.upper, basic_list, nonbasic_list, vstatus)) > 0) { + settings.log.printf("Failed to repair basis. Iteration %d. %d deficient columns.\n", + iter, + static_cast(deficient_size)); + + if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } + settings.threshold_partial_pivoting_tol = 1.0; + + count++; + if (count > 10) { return dual::status_t::NUMERICAL; } + } - count++; - if (count > 10) { return dual::status_t::NUMERICAL; } + settings.log.printf("Successfully repaired basis. Iteration %d\n", iter); } - settings.log.printf("Successfully repaired basis. Iteration %d\n", iter); - } - - phase2::reset_basis_mark(basic_list, nonbasic_list, basic_mark, nonbasic_mark); - if (should_recompute_x) { - std::vector unperturbed_x(n); - phase2::compute_primal_solution_from_basis( - lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x); - x = unperturbed_x; + phase2::reset_basis_mark(basic_list, nonbasic_list, basic_mark, nonbasic_mark); + if (should_recompute_x) { + std::vector unperturbed_x(n); + phase2::compute_primal_solution_from_basis( + lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x, xB_workspace); + x = unperturbed_x; + } + primal_infeasibility_squared = + phase2::compute_initial_primal_infeasibilities(lp, + settings, + basic_list, + x, + squared_infeasibilities, + infeasibility_indices, + primal_infeasibility); } - primal_infeasibility_squared = - phase2::compute_initial_primal_infeasibilities(lp, - settings, - basic_list, - x, - squared_infeasibilities, - infeasibility_indices, - primal_infeasibility); - } #ifdef CHECK_BASIC_INFEASIBILITIES - phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 7); + phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 7); #endif + } timers.lu_update_time += timers.stop_timer(); timers.start_timer(); @@ -3140,6 +3368,36 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, phase2::clear_delta_z(entering_index, leaving_index, delta_z_mark, delta_z_indices, delta_z); f_t now = toc(start_time); + + // Feature logging for regression training (every FEATURE_LOG_INTERVAL iterations) + if ((iter % FEATURE_LOG_INTERVAL) == 0 && work_unit_context) { + [[maybe_unused]] i_t iters_elapsed = iter - last_feature_log_iter; + + auto [total_loads, total_stores] = aggregator.collect_and_flush(); + // features.byte_loads = total_loads; + // features.byte_stores = total_stores; + + // features.interval_runtime = now - interval_start_time; + // interval_start_time = now; + + // features.iteration = iter; 
+ // features.num_refactors = num_refactors; + // features.num_basis_updates = ft.num_updates(); + // features.sparse_delta_z_count = sparse_delta_z; + // features.dense_delta_z_count = dense_delta_z; + // features.total_bound_flips = total_bound_flips; + // features.num_infeasibilities = infeasibility_indices.size(); + // features.delta_y_nz_percentage = delta_y_nz_percentage; + // features.log_features(settings); + + if (work_unit_context) { + // TEMP; + work_unit_context->record_work((total_loads + total_stores) / 1e8); + } + + last_feature_log_iter = iter; + } + if ((iter - start_iter) < settings.first_iteration_log || (iter % settings.iteration_log_frequency) == 0) { if (phase == 1 && iter == 1) { @@ -3159,6 +3417,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, return dual::status_t::CUTOFF; } + if (work_unit_context && work_unit_context->global_work_units_elapsed >= settings.work_limit) { + return dual::status_t::WORK_LIMIT; + } + if (now > settings.time_limit) { return dual::status_t::TIME_LIMIT; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { @@ -3213,7 +3475,8 @@ template dual::status_t dual_phase2_with_advanced_basis( std::vector& nonbasic_list, lp_solution_t& sol, int& iter, - std::vector& steepest_edge_norms); + std::vector& steepest_edge_norms, + work_limit_context_t* work_unit_context); #endif } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/phase2.hpp b/cpp/src/dual_simplex/phase2.hpp index caeae82e1..4fd83b8f1 100644 --- a/cpp/src/dual_simplex/phase2.hpp +++ b/cpp/src/dual_simplex/phase2.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -13,9 +13,14 @@ #include #include #include +#include #include +namespace cuopt { +struct work_limit_context_t; +} + namespace cuopt::linear_programming::dual_simplex { namespace dual { @@ -27,7 +32,8 @@ enum class status_t { TIME_LIMIT = 4, ITERATION_LIMIT = 5, CONCURRENT_LIMIT = 6, - UNSET = 7 + UNSET = 7, + WORK_LIMIT = 8 }; static std::string status_to_string(status_t status) @@ -40,6 +46,7 @@ static std::string status_to_string(status_t status) case status_t::TIME_LIMIT: return "TIME_LIMIT"; case status_t::ITERATION_LIMIT: return "ITERATION_LIMIT"; case status_t::CONCURRENT_LIMIT: return "CONCURRENT_LIMIT"; + case status_t::WORK_LIMIT: return "WORK_LIMIT"; case status_t::UNSET: return "UNSET"; } return "UNKNOWN"; @@ -70,6 +77,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, std::vector& nonbasic_list, lp_solution_t& sol, i_t& iter, - std::vector& delta_y_steepest_edge); + std::vector& delta_y_steepest_edge, + work_limit_context_t* work_unit_context = nullptr); } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/pseudo_costs.cpp b/cpp/src/dual_simplex/pseudo_costs.cpp index 682bdaa6f..de8994e80 100644 --- a/cpp/src/dual_simplex/pseudo_costs.cpp +++ b/cpp/src/dual_simplex/pseudo_costs.cpp @@ -11,6 +11,8 @@ #include #include +#include + #include #include @@ -33,6 +35,7 @@ void strong_branch_helper(i_t start, const std::vector& edge_norms, pseudo_costs_t& pc) { + raft::common::nvtx::range scope("BB::strong_branch_helper"); lp_problem_t child_problem = original_lp; constexpr bool verbose = false; @@ -467,37 +470,13 @@ void pseudo_costs_t::initialized(i_t& num_initialized_down, f_t& pseudo_cost_down_avg, f_t& pseudo_cost_up_avg) const { - num_initialized_down = 0; - num_initialized_up = 0; - pseudo_cost_down_avg = 0; - pseudo_cost_up_avg = 0; - const i_t n = pseudo_cost_sum_down.size(); - for (i_t j = 0; j < n; j++) { - if (pseudo_cost_num_down[j] > 0) { - num_initialized_down++; - if (std::isfinite(pseudo_cost_sum_down[j])) { - pseudo_cost_down_avg += pseudo_cost_sum_down[j] / pseudo_cost_num_down[j]; - } - } - - if (pseudo_cost_num_up[j] > 0) { - num_initialized_up++; - - if (std::isfinite(pseudo_cost_sum_up[j])) { - pseudo_cost_up_avg += pseudo_cost_sum_up[j] / pseudo_cost_num_up[j]; - } - } - } - if (num_initialized_down > 0) { - pseudo_cost_down_avg /= num_initialized_down; - } else { - pseudo_cost_down_avg = 1.0; - } - if (num_initialized_up > 0) { - pseudo_cost_up_avg /= num_initialized_up; - } else { - pseudo_cost_up_avg = 1.0; - } + auto avgs = compute_pseudo_cost_averages(pseudo_cost_sum_down.data(), + pseudo_cost_sum_up.data(), + pseudo_cost_num_down.data(), + pseudo_cost_num_up.data(), + pseudo_cost_sum_down.size()); + pseudo_cost_down_avg = avgs.down_avg; + pseudo_cost_up_avg = avgs.up_avg; } template diff --git a/cpp/src/dual_simplex/pseudo_costs.hpp b/cpp/src/dual_simplex/pseudo_costs.hpp index 1d3b0deef..89fe8c0fe 100644 --- a/cpp/src/dual_simplex/pseudo_costs.hpp +++ b/cpp/src/dual_simplex/pseudo_costs.hpp @@ -17,9 +17,122 @@ #include #include +#include +#include namespace cuopt::linear_programming::dual_simplex { +template +struct pseudo_cost_averages_t { + f_t down_avg; + f_t up_avg; +}; + +// used to get T from omp_atomic_t based on the fact that omp_atomic_t::operator++ returns T +template +using underlying_type = decltype(std::declval()++); + +// Necessary because omp_atomic_t may be passed instead of f_t +template +auto compute_pseudo_cost_averages(const 
MaybeWrappedF* pc_sum_down, + const MaybeWrappedF* pc_sum_up, + const MaybeWrappedI* pc_num_down, + const MaybeWrappedI* pc_num_up, + size_t n) +{ + using underlying_f_t = underlying_type; + using underlying_i_t = underlying_type; + + underlying_i_t num_initialized_down = 0; + underlying_i_t num_initialized_up = 0; + underlying_f_t pseudo_cost_down_avg = 0.0; + underlying_f_t pseudo_cost_up_avg = 0.0; + + for (size_t j = 0; j < n; ++j) { + if (pc_num_down[j] > 0) { + ++num_initialized_down; + if (std::isfinite(pc_sum_down[j])) { + pseudo_cost_down_avg += pc_sum_down[j] / pc_num_down[j]; + } + } + if (pc_num_up[j] > 0) { + ++num_initialized_up; + if (std::isfinite(pc_sum_up[j])) { pseudo_cost_up_avg += pc_sum_up[j] / pc_num_up[j]; } + } + } + + pseudo_cost_down_avg = + (num_initialized_down > 0) ? pseudo_cost_down_avg / num_initialized_down : 1.0; + pseudo_cost_up_avg = (num_initialized_up > 0) ? pseudo_cost_up_avg / num_initialized_up : 1.0; + + return pseudo_cost_averages_t{pseudo_cost_down_avg, pseudo_cost_up_avg}; +} + +// Variable selection using pseudo-cost product scoring +// Returns the best variable to branch on +template +i_t variable_selection_from_pseudo_costs(const f_t* pc_sum_down, + const f_t* pc_sum_up, + const i_t* pc_num_down, + const i_t* pc_num_up, + i_t n_vars, + const std::vector& fractional, + const std::vector& solution) +{ + const i_t num_fractional = fractional.size(); + if (num_fractional == 0) return -1; + + auto [pc_down_avg, pc_up_avg] = + compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); + + i_t branch_var = fractional[0]; + f_t max_score = std::numeric_limits::lowest(); + constexpr f_t eps = f_t(1e-6); + + for (i_t j : fractional) { + f_t pc_down = pc_num_down[j] != 0 ? pc_sum_down[j] / pc_num_down[j] : pc_down_avg; + f_t pc_up = pc_num_up[j] != 0 ? pc_sum_up[j] / pc_num_up[j] : pc_up_avg; + const f_t f_down = solution[j] - std::floor(solution[j]); + const f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t score = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); + if (score > max_score) { + max_score = score; + branch_var = j; + } + } + + return branch_var; +} + +// Objective estimate using pseudo-costs (lock-free implementation) +// Returns lower_bound + estimated cost to reach integer feasibility +template +f_t obj_estimate_from_arrays(const f_t* pc_sum_down, + const f_t* pc_sum_up, + const i_t* pc_num_down, + const i_t* pc_num_up, + i_t n_vars, + const std::vector& fractional, + const std::vector& solution, + f_t lower_bound) +{ + auto [pc_down_avg, pc_up_avg] = + compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); + + f_t estimate = lower_bound; + constexpr f_t eps = f_t(1e-6); + + for (i_t j : fractional) { + f_t pc_down = pc_num_down[j] != 0 ? pc_sum_down[j] / pc_num_down[j] : pc_down_avg; + f_t pc_up = pc_num_up[j] != 0 ? 
pc_sum_up[j] / pc_num_up[j] : pc_up_avg; + const f_t f_down = solution[j] - std::floor(solution[j]); + const f_t f_up = std::ceil(solution[j]) - solution[j]; + estimate += std::min(std::max(pc_down * f_down, eps), std::max(pc_up * f_up, eps)); + } + + return estimate; +} + template struct reliability_branching_settings_t { // Lower bound for the maximum number of LP iterations for a single trial branching @@ -106,6 +219,17 @@ class pseudo_costs_t { void update_pseudo_costs_from_strong_branching(const std::vector& fractional, const std::vector& root_soln); + uint32_t compute_state_hash() const + { + return detail::compute_hash(pseudo_cost_sum_down) ^ detail::compute_hash(pseudo_cost_sum_up) ^ + detail::compute_hash(pseudo_cost_num_down) ^ detail::compute_hash(pseudo_cost_num_up); + } + + uint32_t compute_strong_branch_hash() const + { + return detail::compute_hash(strong_branch_down) ^ detail::compute_hash(strong_branch_up); + } + f_t calculate_pseudocost_score(i_t j, const std::vector& solution, f_t pseudo_cost_up_avg, diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp index 82ea7c0e1..cb9834705 100644 --- a/cpp/src/dual_simplex/right_looking_lu.cpp +++ b/cpp/src/dual_simplex/right_looking_lu.cpp @@ -7,11 +7,16 @@ #include #include +#include + +#include #include #include #include +using cuopt::ins_vector; + namespace cuopt::linear_programming::dual_simplex { namespace { @@ -28,9 +33,9 @@ struct element_t { }; constexpr int kNone = -1; -template +template i_t initialize_degree_data(const csc_matrix_t& A, - const std::vector& column_list, + const VectorI& column_list, std::vector& Cdegree, std::vector& Rdegree, std::vector>& col_count, @@ -67,9 +72,9 @@ i_t initialize_degree_data(const csc_matrix_t& A, return Bnz; } -template +template i_t load_elements(const csc_matrix_t& A, - const std::vector& column_list, + const VectorI& column_list, i_t Bnz, std::vector>& elements, std::vector& first_in_row, @@ -567,16 +572,17 @@ void remove_pivot_col(i_t pivot_i, } // namespace -template +template i_t right_looking_lu(const csc_matrix_t& A, const simplex_solver_settings_t& settings, f_t tol, - const std::vector& column_list, - std::vector& q, + const VectorI& column_list, + VectorI& q, csc_matrix_t& L, csc_matrix_t& U, - std::vector& pinv) + VectorI& pinv) { + raft::common::nvtx::range scope("LU::right_looking_lu"); const i_t n = column_list.size(); const i_t m = A.m; @@ -1143,14 +1149,25 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE -template int right_looking_lu(const csc_matrix_t& A, - const simplex_solver_settings_t& settings, - double tol, - const std::vector& column_list, - std::vector& q, - csc_matrix_t& L, - csc_matrix_t& U, - std::vector& pinv); +template int right_looking_lu>( + const csc_matrix_t& A, + const simplex_solver_settings_t& settings, + double tol, + const std::vector& column_list, + std::vector& q, + csc_matrix_t& L, + csc_matrix_t& U, + std::vector& pinv); + +template int right_looking_lu>( + const csc_matrix_t& A, + const simplex_solver_settings_t& settings, + double tol, + const ins_vector& column_list, + ins_vector& q, + csc_matrix_t& L, + csc_matrix_t& U, + ins_vector& pinv); template int right_looking_lu_row_permutation_only( const csc_matrix_t& A, diff --git a/cpp/src/dual_simplex/right_looking_lu.hpp b/cpp/src/dual_simplex/right_looking_lu.hpp index 2f985fb36..179fff01b 100644 --- a/cpp/src/dual_simplex/right_looking_lu.hpp +++ 
b/cpp/src/dual_simplex/right_looking_lu.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -14,15 +14,15 @@ namespace cuopt::linear_programming::dual_simplex { -template +template i_t right_looking_lu(const csc_matrix_t& A, const simplex_solver_settings_t& settings, f_t tol, - const std::vector& column_list, - std::vector& q, + const VectorI& column_list, + VectorI& q, csc_matrix_t& L, csc_matrix_t& U, - std::vector& pinv); + VectorI& pinv); template i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 34c384949..815e22923 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -50,6 +50,7 @@ struct simplex_solver_settings_t { : iteration_limit(std::numeric_limits::max()), node_limit(std::numeric_limits::max()), time_limit(std::numeric_limits::infinity()), + work_limit(std::numeric_limits::infinity()), absolute_mip_gap_tol(0.0), relative_mip_gap_tol(1e-3), integer_tol(1e-5), @@ -80,6 +81,7 @@ struct simplex_solver_settings_t { print_presolve_stats(true), barrier_presolve(false), cudss_deterministic(false), + deterministic(false), barrier(false), eliminate_dense_columns(true), num_gpus(1), @@ -119,6 +121,7 @@ struct simplex_solver_settings_t { i_t iteration_limit; i_t node_limit; f_t time_limit; + f_t work_limit; f_t absolute_mip_gap_tol; // Tolerance on mip gap to declare optimal f_t relative_mip_gap_tol; // Tolerance on mip gap to declare optimal f_t integer_tol; // Tolerance on integrality violation @@ -154,6 +157,7 @@ struct simplex_solver_settings_t { bool barrier_presolve; // true to use barrier presolve bool cudss_deterministic; // true to use cuDSS deterministic mode, false for non-deterministic bool barrier; // true to use barrier method, false to use dual simplex method + bool deterministic; // true to use B&B deterministic mode, false to use non-deterministic mode bool eliminate_dense_columns; // true to eliminate dense columns from A*D*A^T int num_gpus; // Number of GPUs to use (a maximum of 2 GPUs is supported at the moment) i_t folding; // -1 automatic, 0 don't fold, 1 fold 
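[Editorial note, not part of the patch: a minimal sketch of how the new fields above are presumably meant to be used together; the instantiation and calling code here are assumptions, not cuOpt API documentation.

  simplex_solver_settings_t<int, double> settings;  // assuming i_t = int, f_t = double
  settings.deterministic = true;    // opt in to deterministic B&B mode
  settings.work_limit    = 1000.0;  // budget in deterministic work units
  settings.time_limit    = 3600.0;  // wall-clock limit (seconds) still applies
  // ... run dual simplex ...
  if (status == dual::status_t::WORK_LIMIT) {
    // work budget exhausted; unlike TIME_LIMIT, this cutoff is reproducible
  }
]

diff --git a/cpp/src/dual_simplex/singletons.cpp b/cpp/src/dual_simplex/singletons.cpp index ff6bcbac2..347604dce 100644 --- a/cpp/src/dual_simplex/singletons.cpp +++ b/cpp/src/dual_simplex/singletons.cpp @@ -1,15 +1,18 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 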
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ #include #include +#include #include +using cuopt::ins_vector; + namespace cuopt::linear_programming::dual_simplex { // Destroys the queue but prints it @@ -128,8 +131,8 @@ void create_row_representation(const csc_matrix_t& A, } // Complete the permuation -template -i_t complete_permutation(i_t singletons, std::vector& Xdeg, std::vector& Xperm) +template +i_t complete_permutation(i_t singletons, std::vector& Xdeg, VectorI& Xperm) { i_t n = Xdeg.size(); assert(Xperm.size() == n); @@ -154,12 +157,12 @@ i_t complete_permutation(i_t singletons, std::vector& Xdeg, std::vector +template i_t find_singletons(const csc_matrix_t& A, i_t& row_singletons, - std::vector& row_perm, + VectorI& row_perm, i_t& col_singletons, - std::vector& col_perm) + VectorI& col_perm) { i_t n = A.n; i_t m = A.m; @@ -198,12 +201,14 @@ i_t find_singletons(const csc_matrix_t& A, row_form = true; // Find column singletons + auto& col_perm_vec = static_cast&>(col_perm); + auto& row_perm_vec = static_cast&>(row_perm); row_col_graph_t graph{Cdeg.begin(), - col_perm.begin(), - A.col_start.cbegin(), - A.i.cbegin(), + col_perm_vec.begin(), + A.col_start.underlying().cbegin(), + A.i.underlying().cbegin(), Rdeg.begin(), - row_perm.begin(), + row_perm_vec.begin(), Rp.cbegin(), Rj.cbegin()}; @@ -229,14 +234,16 @@ i_t find_singletons(const csc_matrix_t& A, } // Find row singletons + auto& row_perm_vec2 = static_cast&>(row_perm); + auto& col_perm_vec2 = static_cast&>(col_perm); row_col_graph_t graph{Rdeg.begin(), - row_perm.begin(), + row_perm_vec2.begin(), Rp.cbegin(), Rj.cbegin(), Cdeg.begin(), - col_perm.begin(), - A.col_start.cbegin(), - A.i.cbegin()}; + col_perm_vec2.begin(), + A.col_start.underlying().cbegin(), + A.i.underlying().cbegin()}; #ifdef SINGLETON_DEBUG printf("Searching for row singletons %ld\n", singleton_queue.size()); #endif @@ -280,15 +287,24 @@ template void create_row_representation(const csc_matrix_t& col_index, std::vector& workspace); // Complete the permuation -template int complete_permutation(int singletons, - std::vector& Xdeg, - std::vector& Xperm); - -template int find_singletons(const csc_matrix_t& A, - int& row_singletons, - std::vector& row_perm, - int& col_singleton, - std::vector& col_perm); +template int complete_permutation>(int singletons, + std::vector& Xdeg, + std::vector& Xperm); +template int complete_permutation>(int singletons, + std::vector& Xdeg, + ins_vector& Xperm); + +template int find_singletons>(const csc_matrix_t& A, + int& row_singletons, + std::vector& row_perm, + int& col_singleton, + std::vector& col_perm); + +template int find_singletons>(const csc_matrix_t& A, + int& row_singletons, + ins_vector& row_perm, + int& col_singleton, + ins_vector& col_perm); #endif diff --git a/cpp/src/dual_simplex/singletons.hpp b/cpp/src/dual_simplex/singletons.hpp index b9cfcaa9b..2a1ac3e55 100644 --- a/cpp/src/dual_simplex/singletons.hpp +++ b/cpp/src/dual_simplex/singletons.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -42,11 +42,11 @@ void create_row_representation(const csc_matrix_t& A, template i_t complete_permutation(i_t singletons, std::vector& Xdeg, std::vector& Xperm); -template +template i_t find_singletons(const csc_matrix_t& A, i_t& row_singletons, - std::vector& row_perm, + VectorI& row_perm, i_t& col_singleton, - std::vector& col_perm); + VectorI& col_perm); } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index 37297d9be..133ed453d 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -24,6 +24,8 @@ #include #include +#include + #include #include #include @@ -108,6 +110,7 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t& original std::vector& vstatus, std::vector& edge_norms) { + raft::common::nvtx::range scope("DualSimplex::solve_lp"); const i_t m = original_lp.num_rows; const i_t n = original_lp.num_cols; assert(m <= n); @@ -140,7 +143,11 @@ lp_status_t solve_linear_program_with_advanced_basis( lp_status_t lp_status = lp_status_t::UNSET; lp_problem_t presolved_lp(original_lp.handle_ptr, 1, 1, 1); presolve_info_t presolve_info; - const i_t ok = presolve(original_lp, settings, presolved_lp, presolve_info); + i_t ok; + { + raft::common::nvtx::range scope_presolve("DualSimplex::presolve"); + ok = presolve(original_lp, settings, presolved_lp, presolve_info); + } if (ok == CONCURRENT_HALT_RETURN) { return lp_status_t::CONCURRENT_LIMIT; } if (ok == -1) { return lp_status_t::INFEASIBLE; } @@ -156,7 +163,10 @@ lp_status_t solve_linear_program_with_advanced_basis( presolved_lp.num_cols, presolved_lp.A.col_start[presolved_lp.num_cols]); std::vector column_scales; - column_scaling(presolved_lp, settings, lp, column_scales); + { + raft::common::nvtx::range scope_scaling("DualSimplex::scaling"); + column_scaling(presolved_lp, settings, lp, column_scales); + } assert(presolved_lp.num_cols == lp.num_cols); lp_problem_t phase1_problem(original_lp.handle_ptr, 1, 1, 1); std::vector phase1_vstatus; @@ -182,14 +192,26 @@ lp_status_t solve_linear_program_with_advanced_basis( i_t iter = 0; lp_solution_t phase1_solution(phase1_problem.num_rows, phase1_problem.num_cols); edge_norms.clear(); - dual::status_t phase1_status = dual_phase2( - 1, 1, start_time, phase1_problem, settings, phase1_vstatus, phase1_solution, iter, edge_norms); + dual::status_t phase1_status; + { + raft::common::nvtx::range scope_phase1("DualSimplex::phase1"); + phase1_status = dual_phase2(1, + 1, + start_time, + phase1_problem, + settings, + phase1_vstatus, + phase1_solution, + iter, + edge_norms); + } if (phase1_status == dual::status_t::NUMERICAL || phase1_status == dual::status_t::DUAL_UNBOUNDED) { settings.log.printf("Failed in Phase 1\n"); return lp_status_t::NUMERICAL_ISSUES; } if (phase1_status == dual::status_t::TIME_LIMIT) { return lp_status_t::TIME_LIMIT; } + if (phase1_status == dual::status_t::WORK_LIMIT) { return lp_status_t::WORK_LIMIT; } if (phase1_status == dual::status_t::ITERATION_LIMIT) { return lp_status_t::ITERATION_LIMIT; } if (phase1_status == dual::status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; } phase1_obj = phase1_solution.objective; @@ -273,6 +295,7 @@ lp_status_t solve_linear_program_with_advanced_basis( } if (status == dual::status_t::DUAL_UNBOUNDED) { lp_status = lp_status_t::INFEASIBLE; } if (status == dual::status_t::TIME_LIMIT) { lp_status = lp_status_t::TIME_LIMIT; } + if (status ==
dual::status_t::WORK_LIMIT) { lp_status = lp_status_t::WORK_LIMIT; } if (status == dual::status_t::ITERATION_LIMIT) { lp_status = lp_status_t::ITERATION_LIMIT; } if (status == dual::status_t::CONCURRENT_LIMIT) { lp_status = lp_status_t::CONCURRENT_LIMIT; } if (status == dual::status_t::NUMERICAL) { lp_status = lp_status_t::NUMERICAL_ISSUES; } diff --git a/cpp/src/dual_simplex/solve.hpp b/cpp/src/dual_simplex/solve.hpp index 6292df637..6e23f6e4c 100644 --- a/cpp/src/dual_simplex/solve.hpp +++ b/cpp/src/dual_simplex/solve.hpp @@ -27,7 +27,8 @@ enum class lp_status_t { NUMERICAL_ISSUES = 5, CUTOFF = 6, CONCURRENT_LIMIT = 7, - UNSET = 8 + UNSET = 8, + WORK_LIMIT = 9 }; static std::string lp_status_to_string(lp_status_t status) @@ -42,6 +43,7 @@ static std::string lp_status_to_string(lp_status_t status) case lp_status_t::CUTOFF: return "CUTOFF"; case lp_status_t::CONCURRENT_LIMIT: return "CONCURRENT_LIMIT"; case lp_status_t::UNSET: return "UNSET"; + case lp_status_t::WORK_LIMIT: return "WORK_LIMIT"; } return "UNKNOWN"; } diff --git a/cpp/src/dual_simplex/sparse_matrix.cpp b/cpp/src/dual_simplex/sparse_matrix.cpp index 1d8f12a3a..6d9cf2b80 100644 --- a/cpp/src/dual_simplex/sparse_matrix.cpp +++ b/cpp/src/dual_simplex/sparse_matrix.cpp @@ -8,9 +8,12 @@ // #include #include #include +#include #include +using cuopt::ins_vector; + // #include // #include @@ -34,8 +37,8 @@ void csc_matrix_t::reallocate(i_t new_nz) this->nz_max = new_nz; } -template -void cumulative_sum(std::vector& inout, std::vector& output) +template +void cumulative_sum(std::vector& inout, OutputVector& output) { i_t n = inout.size(); assert(output.size() == n + 1); @@ -655,8 +658,8 @@ i_t csr_matrix_t::check_matrix(std::string matrix_name) const } // x <- x + alpha * A(:, j) -template -void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, std::vector& x) +template +void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, VectorF& x) { const i_t col_start = A.col_start[j]; const i_t col_end = A.col_start[j + 1]; @@ -668,13 +671,9 @@ void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, std::vecto } // x <- x + alpha * A(:, j) -template -void scatter_dense(const csc_matrix_t& A, - i_t j, - f_t alpha, - std::vector& x, - std::vector& mark, - std::vector& indices) +template +void scatter_dense( + const csc_matrix_t& A, i_t j, f_t alpha, VectorF& x, VectorI& mark, VectorI& indices) { const i_t col_start = A.col_start[j]; const i_t col_end = A.col_start[j + 1]; @@ -934,7 +933,10 @@ template class csc_matrix_t; template class csr_matrix_t; -template void cumulative_sum(std::vector& inout, std::vector& output); +template void cumulative_sum>(std::vector& inout, + std::vector& output); +template void cumulative_sum>(std::vector& inout, + ins_vector& output); template int coo_to_csc(const std::vector& Ai, const std::vector& Aj, @@ -950,17 +952,31 @@ template int scatter(const csc_matrix_t& A, csc_matrix_t& C, int nz); -template void scatter_dense(const csc_matrix_t& A, - int j, - double alpha, - std::vector& x); +template void scatter_dense>(const csc_matrix_t& A, + int j, + double alpha, + std::vector& x); + +template void scatter_dense, std::vector>( + const csc_matrix_t& A, + int j, + double alpha, + std::vector& x, + std::vector& mark, + std::vector& indices); -template void scatter_dense(const csc_matrix_t& A, - int j, - double alpha, - std::vector& x, - std::vector& mark, - std::vector& indices); +template void scatter_dense>(const csc_matrix_t& A, + int j, + double alpha, + ins_vector& x); + +template void 
scatter_dense, ins_vector>( + const csc_matrix_t& A, + int j, + double alpha, + ins_vector& x, + ins_vector& mark, + ins_vector& indices); template int multiply(const csc_matrix_t& A, const csc_matrix_t& B, @@ -977,12 +993,9 @@ template double sparse_dot(const std::vector& xind, const csc_matrix_t& Y, int y_col); -template int matrix_vector_multiply, std::allocator>( - const csc_matrix_t& A, - double alpha, - const std::vector>& x, - double beta, - std::vector>& y); +// NOTE: matrix_vector_multiply is now templated on VectorX and VectorY. +// Since it's defined inline in the header, no explicit instantiation is needed here. + template int matrix_transpose_vector_multiply, std::allocator>( const csc_matrix_t& A, diff --git a/cpp/src/dual_simplex/sparse_matrix.hpp b/cpp/src/dual_simplex/sparse_matrix.hpp index 0b6c0b11d..8f8f62251 100644 --- a/cpp/src/dual_simplex/sparse_matrix.hpp +++ b/cpp/src/dual_simplex/sparse_matrix.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -16,6 +17,9 @@ #include #include +// Import instrumented vector +using cuopt::ins_vector; + namespace cuopt::linear_programming::dual_simplex { template @@ -125,12 +129,12 @@ class csc_matrix_t { return true; } - i_t m; // number of rows - i_t n; // number of columns - i_t nz_max; // maximum number of entries - std::vector col_start; // column pointers (size n + 1) - std::vector i; // row indices, size nz_max - std::vector x; // numerical values, size nz_max + i_t m; // number of rows + i_t n; // number of columns + i_t nz_max; // maximum number of entries + ins_vector col_start; // column pointers (size n + 1) + ins_vector i; // row indices, size nz_max + ins_vector x; // numerical values, size nz_max static_assert(std::is_signed_v); // Require signed integers (we make use of this // to avoid extra space / computation) @@ -173,18 +177,18 @@ class csr_matrix_t { return true; } - i_t nz_max; // maximum number of nonzero entries - i_t m; // number of rows - i_t n; // number of cols - std::vector row_start; // row pointers (size m + 1) - std::vector j; // column inidices, size nz_max - std::vector x; // numerical valuse, size nz_max + i_t nz_max; // maximum number of nonzero entries + i_t m; // number of rows + i_t n; // number of cols + ins_vector row_start; // row pointers (size m + 1) + ins_vector j; // column indices, size nz_max + ins_vector x; // numerical values, size nz_max static_assert(std::is_signed_v); }; -template -void cumulative_sum(std::vector& inout, std::vector& output); +template +void cumulative_sum(std::vector& inout, OutputVector& output); template i_t coo_to_csc(const std::vector& Ai, @@ -203,16 +207,12 @@ i_t scatter(const csc_matrix_t& A, i_t nz); // x <- x + alpha * A(:, j) -template -void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, std::vector& x); +template +void scatter_dense(const csc_matrix_t& A, i_t j, f_t alpha, VectorF& x); -template -void scatter_dense(const csc_matrix_t& A, - i_t j, - f_t alpha, - std::vector& x, - std::vector& mark, - std::vector& indices); +template +void scatter_dense( + const csc_matrix_t& A, i_t j, f_t alpha, VectorF& x, VectorI& mark, VectorI& indices); // Compute C = A*B where C is m x n, A is m x k, and B = k x n // Do this by computing C(:, j) = A*B(:, j) = sum (i=1 to k) A(:, k)*B(i, j) @@ -270,12 +270,9 @@ i_t matrix_transpose_vector_multiply(const csc_matrix_t& A, } // y <- alpha*A*x + beta*y -template -i_t matrix_vector_multiply(const csc_matrix_t& A, - f_t alpha, - const std::vector& x, - f_t beta, - std::vector& y) +template +i_t 
matrix_vector_multiply( + const csc_matrix_t& A, f_t alpha, const VectorX& x, f_t beta, VectorY& y) { // y <- alpha*A*x + beta*y i_t m = A.m; diff --git a/cpp/src/dual_simplex/sparse_vector.cpp b/cpp/src/dual_simplex/sparse_vector.cpp index 4e2ecfa19..f1a91462a 100644 --- a/cpp/src/dual_simplex/sparse_vector.cpp +++ b/cpp/src/dual_simplex/sparse_vector.cpp @@ -13,6 +13,8 @@ namespace cuopt::linear_programming::dual_simplex { +using cuopt::ins_vector; + template sparse_vector_t::sparse_vector_t(const csc_matrix_t& A, i_t col) { @@ -28,6 +30,23 @@ sparse_vector_t::sparse_vector_t(const csc_matrix_t& A, i_t } } +template +void sparse_vector_t::from_csc_column(const csc_matrix_t& A, i_t col) +{ + const i_t col_start = A.col_start[col]; + const i_t col_end = A.col_start[col + 1]; + n = A.m; + const i_t nz = col_end - col_start; + i.clear(); + x.clear(); + i.reserve(nz); + x.reserve(nz); + for (i_t k = col_start; k < col_end; ++k) { + i.push_back(A.i[k]); + x.push_back(A.x[k]); + } +} + template sparse_vector_t::sparse_vector_t(const csr_matrix_t& A, i_t row) { @@ -68,8 +87,8 @@ void sparse_vector_t::to_csc(csc_matrix_t& A) const A.col_start.resize(2); A.col_start[0] = 0; A.col_start[1] = i.size(); - A.i = i; - A.x = x; + A.i = i.underlying(); + A.x = x.underlying(); } template @@ -83,6 +102,17 @@ void sparse_vector_t::to_dense(std::vector& x_dense) const } } +template +void sparse_vector_t::to_dense(ins_vector& x_dense) const +{ + x_dense.clear(); + x_dense.resize(n, 0.0); + const i_t nz = i.size(); + for (i_t k = 0; k < nz; ++k) { + x_dense[i[k]] = x[k]; + } +} + template void sparse_vector_t::scatter(std::vector& x_dense) const { @@ -93,16 +123,26 @@ void sparse_vector_t::scatter(std::vector& x_dense) const } } +template +void sparse_vector_t::scatter(ins_vector& x_dense) const +{ + // Assumes x_dense is already cleared + const i_t nz = i.size(); + for (i_t k = 0; k < nz; ++k) { + x_dense[i[k]] += x[k]; + } +} + template void sparse_vector_t::inverse_permute_vector(const std::vector& p) { assert(p.size() == n); i_t nz = i.size(); - std::vector i_perm(nz); + ins_vector i_perm(nz); for (i_t k = 0; k < nz; ++k) { i_perm[k] = p[i[k]]; } - i = i_perm; + i = std::move(i_perm); } template @@ -114,11 +154,11 @@ void sparse_vector_t::inverse_permute_vector(const std::vector& p i_t nz = i.size(); y.n = n; y.x = x; - std::vector i_perm(nz); + ins_vector i_perm(nz); for (i_t k = 0; k < nz; ++k) { i_perm[k] = p[i[k]]; } - y.i = i_perm; + y.i = std::move(i_perm); } template @@ -180,21 +220,21 @@ void sparse_vector_t::sort() } else { // Use a n log n sort const i_t nz = i.size(); - std::vector i_sorted(nz); - std::vector x_sorted(nz); + ins_vector i_sorted(nz); + ins_vector x_sorted(nz); std::vector perm(nz); for (i_t k = 0; k < nz; ++k) { perm[k] = k; } - std::vector& iunsorted = i; + auto& iunsorted = i; std::sort( perm.begin(), perm.end(), [&iunsorted](i_t a, i_t b) { return iunsorted[a] < iunsorted[b]; }); for (i_t k = 0; k < nz; ++k) { i_sorted[k] = i[perm[k]]; x_sorted[k] = x[perm[k]]; } - i = i_sorted; - x = x_sorted; + i = std::move(i_sorted); + x = std::move(x_sorted); } // Check diff --git a/cpp/src/dual_simplex/sparse_vector.hpp b/cpp/src/dual_simplex/sparse_vector.hpp index 95e9afa29..fadd2df33 100644 --- a/cpp/src/dual_simplex/sparse_vector.hpp +++ b/cpp/src/dual_simplex/sparse_vector.hpp @@ -9,16 +9,20 @@ #include #include +#include #include namespace cuopt::linear_programming::dual_simplex { +// Import instrumented vector type +using cuopt::ins_vector; + // A sparse vector stored as a list of 
nonzero coefficients and their indices template class sparse_vector_t { public: - sparse_vector_t() : n(0), i({}), x({}) {} + sparse_vector_t() : n(0), i(), x() {} // Construct a sparse vector of dimension n with nz nonzero coefficients sparse_vector_t(i_t n, i_t nz) : n(n), i(nz), x(nz) {} // Construct a sparse vector from a dense vector. @@ -33,9 +37,13 @@ class sparse_vector_t { void to_csc(csc_matrix_t& A) const; // convert a sparse vector into a dense vector. Dense vector is cleared and resized. void to_dense(std::vector& x_dense) const; + // convert a sparse vector into an instrumented dense vector. + void to_dense(ins_vector& x_dense) const; // scatter a sparse vector into a dense vector. Assumes x_dense is already cleared or // preinitialized void scatter(std::vector& x_dense) const; + // scatter into instrumented vector + void scatter(ins_vector& x_dense) const; // inverse permute the current sparse vector void inverse_permute_vector(const std::vector& p); // inverse permute a sparse vector into another sparse vector @@ -51,11 +59,20 @@ class sparse_vector_t { void negate(); f_t find_coefficient(i_t index) const; + void clear() + { + i.clear(); + x.clear(); + } + + // Reset from a column of a CSC matrix + void from_csc_column(const csc_matrix_t& A, i_t col); + void squeeze(sparse_vector_t& y) const; i_t n; - std::vector i; - std::vector x; + ins_vector i; + ins_vector x; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/triangle_solve.cpp b/cpp/src/dual_simplex/triangle_solve.cpp index 375784650..2f280295b 100644 --- a/cpp/src/dual_simplex/triangle_solve.cpp +++ b/cpp/src/dual_simplex/triangle_solve.cpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -11,73 +11,9 @@ namespace cuopt::linear_programming::dual_simplex { -template -i_t lower_triangular_solve(const csc_matrix_t& L, std::vector& x) -{ - i_t n = L.n; - assert(x.size() == n); - for (i_t j = 0; j < n; ++j) { - i_t col_start = L.col_start[j]; - i_t col_end = L.col_start[j + 1]; - if (x[j] != 0.0) { - x[j] /= L.x[col_start]; - for (i_t p = col_start + 1; p < col_end; ++p) { - x[L.i[p]] -= L.x[p] * x[j]; - } - } - } - return 0; -} - -template -i_t lower_triangular_transpose_solve(const csc_matrix_t& L, std::vector& x) -{ - const i_t n = L.n; - assert(x.size() == n); - for (i_t j = n - 1; j >= 0; --j) { - const i_t col_start = L.col_start[j] + 1; - const i_t col_end = L.col_start[j + 1]; - for (i_t p = col_start; p < col_end; ++p) { - x[j] -= L.x[p] * x[L.i[p]]; - } - x[j] /= L.x[L.col_start[j]]; - } - return 0; -} - -template -i_t upper_triangular_solve(const csc_matrix_t& U, std::vector& x) -{ - const i_t n = U.n; - assert(x.size() == n); - for (i_t j = n - 1; j >= 0; --j) { - const i_t col_start = U.col_start[j]; - const i_t col_end = U.col_start[j + 1] - 1; - if (x[j] != 0.0) { - x[j] /= U.x[col_end]; - for (i_t p = col_start; p < col_end; ++p) { - x[U.i[p]] -= U.x[p] * x[j]; - } - } - } - return 0; -} - -template -i_t upper_triangular_transpose_solve(const csc_matrix_t& U, std::vector& x) -{ - const i_t n = U.n; - assert(x.size() == n); - for (i_t j = 0; j < n; ++j) { - const i_t col_start = U.col_start[j]; - const i_t col_end = U.col_start[j + 1] - 1; - for (i_t p = col_start; p < col_end; ++p) { - x[j] -= U.x[p] * x[U.i[p]]; - } - x[j] /= U.x[col_end]; - } - return 0; -} +// NOTE: lower_triangular_solve, lower_triangular_transpose_solve, +// upper_triangular_solve, and upper_triangular_transpose_solve are now +// templated on vector types and defined in the header file. // \brief Reach computes the reach of b in the graph of G // \param[in] b - Sparse vector containing the rhs @@ -204,17 +140,9 @@ i_t sparse_triangle_solve(const sparse_vector_t& b, #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE -template int lower_triangular_solve(const csc_matrix_t& L, - std::vector& x); - -template int lower_triangular_transpose_solve(const csc_matrix_t& L, - std::vector& x); - -template int upper_triangular_solve(const csc_matrix_t& U, - std::vector& x); - -template int upper_triangular_transpose_solve(const csc_matrix_t& U, - std::vector& x); +// NOTE: lower_triangular_solve, lower_triangular_transpose_solve, +// upper_triangular_solve, and upper_triangular_transpose_solve are now +// templated on vector types and defined in the header file, so no explicit instantiation needed. template int reach(const sparse_vector_t& b, const std::optional>& pinv, diff --git a/cpp/src/dual_simplex/triangle_solve.hpp b/cpp/src/dual_simplex/triangle_solve.hpp index 18dbafc85..f7fc7744d 100644 --- a/cpp/src/dual_simplex/triangle_solve.hpp +++ b/cpp/src/dual_simplex/triangle_solve.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -25,23 +25,82 @@ namespace cuopt::linear_programming::dual_simplex { // Solve L*x = b. 
On input x contains the right-hand side b, on output the // solution -template -i_t lower_triangular_solve(const csc_matrix_t& L, std::vector& x); +template +i_t lower_triangular_solve(const csc_matrix_t& L, VectorF& x) +{ + i_t n = L.n; + assert(x.size() == n); + for (i_t j = 0; j < n; ++j) { + i_t col_start = L.col_start[j]; + i_t col_end = L.col_start[j + 1]; + if (x[j] != 0.0) { + x[j] /= L.x[col_start]; + auto x_j = x[j]; // hoist this load out of the loop + // as the compiler cannot guess that x[j] never aliases to x[L.i[p]] + for (i_t p = col_start + 1; p < col_end; ++p) { + x[L.i[p]] -= L.x[p] * x_j; + } + } + } + return 0; +} // Solve L'*x = b. On input x contains the right-hand side b, on output the // solution -template -i_t lower_triangular_transpose_solve(const csc_matrix_t& L, std::vector& x); +template +i_t lower_triangular_transpose_solve(const csc_matrix_t& L, VectorF& x) +{ + const i_t n = L.n; + assert(x.size() == n); + for (i_t j = n - 1; j >= 0; --j) { + const i_t col_start = L.col_start[j] + 1; + const i_t col_end = L.col_start[j + 1]; + for (i_t p = col_start; p < col_end; ++p) { + x[j] -= L.x[p] * x[L.i[p]]; + } + x[j] /= L.x[L.col_start[j]]; + } + return 0; +} // Solve U*x = b. On input x contains the right-hand side b, on output the // solution -template -i_t upper_triangular_solve(const csc_matrix_t& U, std::vector& x); +template +i_t upper_triangular_solve(const csc_matrix_t& U, VectorF& x) +{ + const i_t n = U.n; + assert(x.size() == n); + for (i_t j = n - 1; j >= 0; --j) { + const i_t col_start = U.col_start[j]; + const i_t col_end = U.col_start[j + 1] - 1; + if (x[j] != 0.0) { + x[j] /= U.x[col_end]; + auto x_j = x[j]; // same x_j load hoisting + for (i_t p = col_start; p < col_end; ++p) { + x[U.i[p]] -= U.x[p] * x_j; + } + } + } + return 0; +} // Solve U'*x = b. On input x contains the right-hand side b, on output the // solution -template -i_t upper_triangular_transpose_solve(const csc_matrix_t& U, std::vector& x); +template +i_t upper_triangular_transpose_solve(const csc_matrix_t& U, VectorF& x) +{ + const i_t n = U.n; + assert(x.size() == n); + for (i_t j = 0; j < n; ++j) { + const i_t col_start = U.col_start[j]; + const i_t col_end = U.col_start[j + 1] - 1; + for (i_t p = col_start; p < col_end; ++p) { + x[j] -= U.x[p] * x[U.i[p]]; + } + x[j] /= U.x[col_end]; + } + return 0; +} // \brief Reach computes the reach of b in the graph of G // \param[in] b - sparse vector containing the rhs diff --git a/cpp/src/dual_simplex/vector_math.cpp b/cpp/src/dual_simplex/vector_math.cpp index b935d18e4..f20d17afe 100644 --- a/cpp/src/dual_simplex/vector_math.cpp +++ b/cpp/src/dual_simplex/vector_math.cpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -122,44 +122,8 @@ f_t sparse_dot(const std::vector& xind, return dot; } -// x = b(p) -template -i_t permute_vector(const std::vector& p, const std::vector& b, std::vector& x) -{ - i_t n = p.size(); - assert(x.size() == n); - assert(b.size() == n); - for (i_t k = 0; k < n; ++k) { - x[k] = b[p[k]]; - } - return 0; -} - -// x(p) = b -template -i_t inverse_permute_vector(const std::vector& p, - const std::vector& b, - std::vector& x) -{ - i_t n = p.size(); - assert(x.size() == n); - assert(b.size() == n); - for (i_t k = 0; k < n; ++k) { - x[p[k]] = b[k]; - } - return 0; -} - -template -i_t inverse_permutation(const std::vector& p, std::vector& pinv) -{ - i_t n = p.size(); - if (pinv.size() != n) { pinv.resize(n); } - for (i_t k = 0; k < n; ++k) { - pinv[p[k]] = k; - } - return 0; -} +// NOTE: permute_vector, inverse_permute_vector, and inverse_permutation are now +// templated on vector types and defined in the header file. #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE @@ -195,15 +159,8 @@ template double sparse_dot(int const* xind, template double sparse_dot( int* xind, double* xval, int nx, int* yind, double* yval, int ny); -template int permute_vector(const std::vector& p, - const std::vector& b, - std::vector& x); - -template int inverse_permute_vector(const std::vector& p, - const std::vector& b, - std::vector& x); - -template int inverse_permutation(const std::vector& p, std::vector& pinv); +// NOTE: permute_vector, inverse_permute_vector, and inverse_permutation are now +// templated on vector types and defined in the header file, so no explicit instantiation needed. #endif } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/vector_math.hpp b/cpp/src/dual_simplex/vector_math.hpp index 44a459935..6bf573e89 100644 --- a/cpp/src/dual_simplex/vector_math.hpp +++ b/cpp/src/dual_simplex/vector_math.hpp @@ -1,12 +1,13 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ #pragma once +#include #include #include @@ -56,17 +57,41 @@ template f_t sparse_dot(i_t* xind, f_t* xval, i_t nx, i_t* yind, f_t* yval, i_t ny); // Computes x = P*b or x=b(p) in MATLAB. -template -i_t permute_vector(const std::vector& p, const std::vector& b, std::vector& x); +template +int permute_vector(const VectorI& p, const VectorF_in& b, VectorF_out& x) +{ + auto n = p.size(); + assert(x.size() == n); + assert(b.size() == n); + for (decltype(n) k = 0; k < n; ++k) { + x[k] = b[p[k]]; + } + return 0; +} // Computes x = P'*b or x(p) = b in MATLAB. -template -i_t inverse_permute_vector(const std::vector& p, - const std::vector& b, - std::vector& x); +template +int inverse_permute_vector(const VectorI& p, const VectorF_in& b, VectorF_out& x) +{ + auto n = p.size(); + assert(x.size() == n); + assert(b.size() == n); + for (decltype(n) k = 0; k < n; ++k) { + x[p[k]] = b[k]; + } + return 0; +} // Computes pinv from p. 
Or pinv(p) = 1:n in MATLAB -template -i_t inverse_permutation(const std::vector& p, std::vector& pinv); +template +int inverse_permutation(const VectorI_in& p, VectorI_out& pinv) +{ + auto n = p.size(); + if (pinv.size() != n) { pinv.resize(n); } + for (decltype(n) k = 0; k < n; ++k) { + pinv[p[k]] = k; + } + return 0; +} } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/linear_programming/translate.hpp b/cpp/src/linear_programming/translate.hpp index 19f6c024c..6ecb37c6e 100644 --- a/cpp/src/linear_programming/translate.hpp +++ b/cpp/src/linear_programming/translate.hpp @@ -30,9 +30,9 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( user_problem.objective = cuopt::host_copy(model.objective_coefficients, handle_ptr->get_stream()); dual_simplex::csr_matrix_t csr_A(m, n, nz); - csr_A.x = cuopt::host_copy(model.coefficients, handle_ptr->get_stream()); - csr_A.j = cuopt::host_copy(model.variables, handle_ptr->get_stream()); - csr_A.row_start = cuopt::host_copy(model.offsets, handle_ptr->get_stream()); + csr_A.x = ins_vector(cuopt::host_copy(model.coefficients, handle_ptr->get_stream())); + csr_A.j = ins_vector(cuopt::host_copy(model.variables, handle_ptr->get_stream())); + csr_A.row_start = ins_vector(cuopt::host_copy(model.offsets, handle_ptr->get_stream())); csr_A.to_compressed_col(user_problem.A); @@ -121,9 +121,9 @@ void translate_to_crossover_problem(const detail::problem_t& problem, dual_simplex::csr_matrix_t csr_A( problem.n_constraints, problem.n_variables, problem.nnz); - csr_A.x = cuopt::host_copy(problem.coefficients, stream); - csr_A.j = cuopt::host_copy(problem.variables, stream); - csr_A.row_start = cuopt::host_copy(problem.offsets, stream); + csr_A.x = ins_vector(cuopt::host_copy(problem.coefficients, stream)); + csr_A.j = ins_vector(cuopt::host_copy(problem.variables, stream)); + csr_A.row_start = ins_vector(cuopt::host_copy(problem.offsets, stream)); stream.synchronize(); CUOPT_LOG_DEBUG("Converting to compressed column"); diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index 0858bb75a..0ea40a008 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -60,6 +60,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings float_parameters = { {CUOPT_TIME_LIMIT, &mip_settings.time_limit, 0.0, std::numeric_limits::infinity(), std::numeric_limits::infinity()}, {CUOPT_TIME_LIMIT, &pdlp_settings.time_limit, 0.0, std::numeric_limits::infinity(), std::numeric_limits::infinity()}, + {CUOPT_WORK_LIMIT, &mip_settings.work_limit, 0.0, std::numeric_limits::infinity(), std::numeric_limits::infinity()}, {CUOPT_ABSOLUTE_DUAL_TOLERANCE, &pdlp_settings.tolerances.absolute_dual_tolerance, 0.0, 1e-1, 1e-4}, {CUOPT_RELATIVE_DUAL_TOLERANCE, &pdlp_settings.tolerances.relative_dual_tolerance, 0.0, 1e-1, 1e-4}, {CUOPT_ABSOLUTE_PRIMAL_TOLERANCE, &pdlp_settings.tolerances.absolute_primal_tolerance, 0.0, 1e-1, 1e-4}, @@ -98,6 +99,8 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 1, 0}, + {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC}, + {CUOPT_MIP_SEED, &mip_settings.seed, -1, std::numeric_limits::max(), -1}, 
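// Each entry above registers one tunable as {name, target pointer, min, max, default}.
// The new rows expose the deterministic-mode knobs added by this change: work_limit
// over [0, inf) defaulting to inf, determinism_mode ranging over
// CUOPT_MODE_OPPORTUNISTIC..CUOPT_MODE_DETERMINISTIC, and seed over [-1, INT_MAX]
// defaulting to -1. A caller would then opt in by name, e.g. (hypothetical setter
// call, not shown in this diff):
//   settings.set_parameter(CUOPT_MIP_DETERMINISM_MODE, CUOPT_MODE_DETERMINISTIC);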
{CUOPT_MIP_RELIABILITY_BRANCHING, &mip_settings.reliability_branching, -1, std::numeric_limits::max(), -1} }; diff --git a/cpp/src/mip/diversity/diversity_manager.cu b/cpp/src/mip/diversity/diversity_manager.cu index f01675327..d35c965db 100644 --- a/cpp/src/mip/diversity/diversity_manager.cu +++ b/cpp/src/mip/diversity/diversity_manager.cu @@ -185,7 +185,11 @@ bool diversity_manager_t::run_presolve(f_t time_limit) if (termination_criterion_t::NO_UPDATE != term_crit) { ls.constraint_prop.bounds_update.set_updated_bounds(*problem_ptr); } - if (!fj_only_run) { + bool run_probing_cache = !fj_only_run; + // Don't run the probing cache in deterministic mode yet: neither B&B nor CPUFJ needs it, + // and it doesn't make use of work units yet + if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { run_probing_cache = false; } + if (run_probing_cache) { // Run probing cache before trivial presolve to discover variable implications const f_t time_ratio_of_probing_cache = diversity_config.time_ratio_of_probing_cache; const f_t max_time_on_probing = diversity_config.max_time_on_probing; @@ -303,13 +307,53 @@ template solution_t diversity_manager_t::run_solver() { raft::common::nvtx::range fun_scope("run_solver"); + + CUOPT_LOG_DEBUG("Determinism mode: %s", + context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC ? "deterministic" + : "opportunistic"); + + // to automatically compute the solving time on scope exit + auto timer_raii_guard = + cuopt::scope_guard([&]() { stats.total_solve_time = timer.elapsed_time(); }); + + // Debug: Allow disabling GPU heuristics to test B&B tree determinism in isolation + const char* disable_heuristics_env = std::getenv("CUOPT_DISABLE_GPU_HEURISTICS"); + if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { + CUOPT_LOG_INFO("Running deterministic mode with CPUFJ heuristic"); + population.initialize_population(); + population.allocate_solutions(); + + // Start CPUFJ in deterministic mode with B&B integration + if (context.branch_and_bound_ptr != nullptr) { + ls.start_cpufj_deterministic(*context.branch_and_bound_ptr); + } + + while (!check_b_b_preemption()) { + if (timer.check_time_limit()) break; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + // Stop CPUFJ when B&B is done + ls.stop_cpufj_deterministic(); + + population.add_external_solutions_to_population(); + return population.best_feasible(); + } + if (disable_heuristics_env != nullptr && std::string(disable_heuristics_env) == "1") { + CUOPT_LOG_INFO("GPU heuristics disabled via CUOPT_DISABLE_GPU_HEURISTICS=1"); + population.initialize_population(); + population.allocate_solutions(); + + while (!check_b_b_preemption()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + return population.best_feasible(); + } + population.timer = timer; const f_t time_limit = timer.remaining_time(); const f_t lp_time_limit = std::min(diversity_config.max_time_on_lp, time_limit * diversity_config.time_ratio_on_init_lp); - // to automatically compute the solving time on scope exit - auto timer_raii_guard = - cuopt::scope_guard([&]() { stats.total_solve_time = timer.elapsed_time(); }); // after every change to the problem, we should resize all the relevant vars // we need to encapsulate that to prevent repetitions recombine_stats.reset(); diff --git a/cpp/src/mip/feasibility_jump/fj_cpu.cu b/cpp/src/mip/feasibility_jump/fj_cpu.cu index 8d534dfff..d1e62a10c 100644 --- a/cpp/src/mip/feasibility_jump/fj_cpu.cu +++ b/cpp/src/mip/feasibility_jump/fj_cpu.cu @@ -1,6
+1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -13,15 +13,195 @@ #include +#include + #include +#include +#include #include +#include +#include #include #include #define CPUFJ_TIMING_TRACE 0 +// Define CPUFJ_NVTX_RANGES to enable detailed NVTX profiling ranges +#ifdef CPUFJ_NVTX_RANGES +#define CPUFJ_NVTX_RANGE(name) raft::common::nvtx::range CPUFJ_NVTX_UNIQUE_NAME(nvtx_scope_)(name) +#define CPUFJ_NVTX_UNIQUE_NAME(base) CPUFJ_NVTX_CONCAT(base, __LINE__) +#define CPUFJ_NVTX_CONCAT(a, b) CPUFJ_NVTX_CONCAT_INNER(a, b) +#define CPUFJ_NVTX_CONCAT_INNER(a, b) a##b +#else +#define CPUFJ_NVTX_RANGE(name) ((void)0) +#endif + namespace cuopt::linear_programming::detail { +template +HDI thrust::tuple get_mtm_for_bound( + const typename fj_t::climber_data_t::view_t& fj, + i_t var_idx, + i_t cstr_idx, + f_t cstr_coeff, + f_t bound, + f_t sign, + const ArrayType& assignment, + const ArrayType& lhs_vector) +{ + f_t delta_ij = 0; + f_t slack = 0; + f_t old_val = assignment[var_idx]; + + f_t lhs = lhs_vector[cstr_idx] * sign; + f_t rhs = bound * sign; + slack = rhs - lhs; // bound might be infinite. let the caller handle this case + + delta_ij = slack / (cstr_coeff * sign); + + return {delta_ij, slack}; +} + +template +HDI thrust::tuple get_mtm_for_constraint( + const typename fj_t::climber_data_t::view_t& fj, + i_t var_idx, + i_t cstr_idx, + f_t cstr_coeff, + f_t c_lb, + f_t c_ub, + const ArrayType& assignment, + const ArrayType& lhs_vector) +{ + f_t sign = -1; + f_t delta_ij = 0; + f_t slack = 0; + + f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx, c_lb, c_ub); + + f_t old_val = assignment[var_idx]; + + // process each bound as two separate constraints + f_t bounds[2] = {c_lb, c_ub}; + cuopt_assert(isfinite(bounds[0]) || isfinite(bounds[1]), "bounds are not finite"); + + for (i_t bound_idx = 0; bound_idx < 2; ++bound_idx) { + if (!isfinite(bounds[bound_idx])) continue; + + // factor to correct the lhs/rhs to turn a lb <= lhs <= ub constraint into + // two virtual constraints lhs <= ub and -lhs <= -lb + sign = bound_idx == 0 ? -1 : 1; + f_t lhs = lhs_vector[cstr_idx] * sign; + f_t rhs = bounds[bound_idx] * sign; + slack = rhs - lhs; + + // skip constraints that are violated/satisfied based on the MTM move type + bool violated = slack < -cstr_tolerance; + if (move_type == MTMMoveType::FJ_MTM_VIOLATED ? 
!violated : violated) continue; + + f_t new_val = old_val; + + delta_ij = slack / (cstr_coeff * sign); + break; + } + + return {delta_ij, sign, slack, cstr_tolerance}; +} + +template +HDI std::pair feas_score_constraint( + const typename fj_t::climber_data_t::view_t& fj, + i_t var_idx, + f_t delta, + i_t cstr_idx, + f_t cstr_coeff, + f_t c_lb, + f_t c_ub, + f_t current_lhs, + f_t left_weight, + f_t right_weight) +{ + cuopt_assert(isfinite(delta), "invalid delta"); + cuopt_assert(cstr_coeff != 0 && isfinite(cstr_coeff), "invalid coefficient"); + + f_t base_feas = 0; + f_t bonus_robust = 0; + + f_t bounds[2] = {c_lb, c_ub}; + cuopt_assert(isfinite(c_lb) || isfinite(c_ub), "no range"); + for (i_t bound_idx = 0; bound_idx < 2; ++bound_idx) { + if (!isfinite(bounds[bound_idx])) continue; + + // factor to correct the lhs/rhs to turn a lb <= lhs <= ub constraint into + // two virtual leq constraints "lhs <= ub" and "-lhs <= -lb" in order to match + // the convention of the paper + + // TODO: broadcast left/right weights to a csr_offset-indexed table? local minima + // usually occur on a rarer basis (around 50 iterations to 1 local minimum) + // likely unreasonable and overkill, however + f_t cstr_weight = bound_idx == 0 ? left_weight : right_weight; + f_t sign = bound_idx == 0 ? -1 : 1; + f_t rhs = bounds[bound_idx] * sign; + f_t old_lhs = current_lhs * sign; + f_t new_lhs = (current_lhs + cstr_coeff * delta) * sign; + f_t old_slack = rhs - old_lhs; + f_t new_slack = rhs - new_lhs; + + cuopt_assert(isfinite(cstr_weight), "invalid weight"); + cuopt_assert(cstr_weight >= 0, "invalid weight"); + cuopt_assert(isfinite(old_lhs), ""); + cuopt_assert(isfinite(new_lhs), ""); + cuopt_assert(isfinite(old_slack) && isfinite(new_slack), ""); + + f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx, c_lb, c_ub); + + bool old_viol = fj.excess_score(cstr_idx, current_lhs, c_lb, c_ub) < -cstr_tolerance; + bool new_viol = + fj.excess_score(cstr_idx, current_lhs + cstr_coeff * delta, c_lb, c_ub) < -cstr_tolerance; + + bool old_sat = old_lhs < rhs + cstr_tolerance; + bool new_sat = new_lhs < rhs + cstr_tolerance; + + // equality + if (fj.pb.integer_equal(c_lb, c_ub)) { + if (!old_viol) cuopt_assert(old_sat == !old_viol, ""); + if (!new_viol) cuopt_assert(new_sat == !new_viol, ""); + } + + // if it would feasibilize this constraint + if (!old_sat && new_sat) { + cuopt_assert(old_viol, ""); + base_feas += cstr_weight; + } + // would cause this constraint to be violated + else if (old_sat && !new_sat) { + cuopt_assert(new_viol, ""); + base_feas -= cstr_weight; + } + // simple improvement + else if (!old_sat && !new_sat && old_lhs > new_lhs) { + cuopt_assert(old_viol && new_viol, ""); + base_feas += (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight); + } + // simple worsening + else if (!old_sat && !new_sat && old_lhs <= new_lhs) { + cuopt_assert(old_viol && new_viol, ""); + base_feas -= (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight); + } + + // robustness score bonus if this would leave some strict slack + bool old_stable = old_lhs < rhs - cstr_tolerance; + bool new_stable = new_lhs < rhs - cstr_tolerance; + if (!old_stable && new_stable) { + bonus_robust += cstr_weight; + } else if (old_stable && !new_stable) { + bonus_robust -= cstr_weight; + } + } + + return {base_feas, bonus_robust}; +} + static constexpr double BIGVAL_THRESHOLD = 1e20; template @@ -62,43 +242,282 @@ static void print_timing_stats(fj_cpu_climber_t& fj_cpu) auto [apply_avg, apply_total] =
compute_avg_and_total(fj_cpu.apply_move_times); auto [weights_avg, weights_total] = compute_avg_and_total(fj_cpu.update_weights_times); auto [compute_score_avg, compute_score_total] = compute_avg_and_total(fj_cpu.compute_score_times); - CUOPT_LOG_TRACE("=== Timing Statistics (Iteration %d) ===\n", fj_cpu.iterations); - CUOPT_LOG_TRACE("find_lift_move: avg=%.6f ms, total=%.6f ms, calls=%zu\n", + CUOPT_LOG_TRACE("=== Timing Statistics (Iteration %d) ===", fj_cpu.iterations); + CUOPT_LOG_TRACE("find_lift_move: avg=%.6f ms, total=%.6f ms, calls=%zu", lift_avg * 1000.0, lift_total * 1000.0, fj_cpu.find_lift_move_times.size()); - CUOPT_LOG_TRACE("find_mtm_move_viol: avg=%.6f ms, total=%.6f ms, calls=%zu\n", + CUOPT_LOG_TRACE("find_mtm_move_viol: avg=%.6f ms, total=%.6f ms, calls=%zu", viol_avg * 1000.0, viol_total * 1000.0, fj_cpu.find_mtm_move_viol_times.size()); - CUOPT_LOG_TRACE("find_mtm_move_sat: avg=%.6f ms, total=%.6f ms, calls=%zu\n", + CUOPT_LOG_TRACE("find_mtm_move_sat: avg=%.6f ms, total=%.6f ms, calls=%zu", sat_avg * 1000.0, sat_total * 1000.0, fj_cpu.find_mtm_move_sat_times.size()); - CUOPT_LOG_TRACE("apply_move: avg=%.6f ms, total=%.6f ms, calls=%zu\n", + CUOPT_LOG_TRACE("apply_move: avg=%.6f ms, total=%.6f ms, calls=%zu", apply_avg * 1000.0, apply_total * 1000.0, fj_cpu.apply_move_times.size()); - CUOPT_LOG_TRACE("update_weights: avg=%.6f ms, total=%.6f ms, calls=%zu\n", + CUOPT_LOG_TRACE("update_weights: avg=%.6f ms, total=%.6f ms, calls=%zu", weights_avg * 1000.0, weights_total * 1000.0, fj_cpu.update_weights_times.size()); - CUOPT_LOG_TRACE("compute_score: avg=%.6f ms, total=%.6f ms, calls=%zu\n", + CUOPT_LOG_TRACE("compute_score: avg=%.6f ms, total=%.6f ms, calls=%zu", compute_score_avg * 1000.0, compute_score_total * 1000.0, fj_cpu.compute_score_times.size()); - CUOPT_LOG_TRACE("cache hit percentage: %.2f%%\n", + CUOPT_LOG_TRACE("cache hit percentage: %.2f%%", (double)fj_cpu.hit_count / (fj_cpu.hit_count + fj_cpu.miss_count) * 100.0); - CUOPT_LOG_TRACE("bin candidate move hit percentage: %.2f%%\n", + CUOPT_LOG_TRACE("bin candidate move hit percentage: %.2f%%", (double)fj_cpu.candidate_move_hits[0] / (fj_cpu.candidate_move_hits[0] + fj_cpu.candidate_move_misses[0]) * 100.0); - CUOPT_LOG_TRACE("int candidate move hit percentage: %.2f%%\n", + CUOPT_LOG_TRACE("int candidate move hit percentage: %.2f%%", (double)fj_cpu.candidate_move_hits[1] / (fj_cpu.candidate_move_hits[1] + fj_cpu.candidate_move_misses[1]) * 100.0); - CUOPT_LOG_TRACE("cont candidate move hit percentage: %.2f%%\n", + CUOPT_LOG_TRACE("cont candidate move hit percentage: %.2f%%", (double)fj_cpu.candidate_move_hits[2] / (fj_cpu.candidate_move_hits[2] + fj_cpu.candidate_move_misses[2]) * 100.0); - CUOPT_LOG_TRACE("========================================\n"); + CUOPT_LOG_TRACE("========================================"); +} + +template +static void precompute_problem_features(fj_cpu_climber_t& fj_cpu) +{ + fj_cpu.n_binary_vars = 0; + fj_cpu.n_integer_vars = 0; + for (i_t i = 0; i < (i_t)fj_cpu.h_is_binary_variable.size(); i++) { + if (fj_cpu.h_is_binary_variable[i]) { + fj_cpu.n_binary_vars++; + } else if (fj_cpu.h_var_types[i] == var_t::INTEGER) { + fj_cpu.n_integer_vars++; + } + } + + i_t total_nnz = fj_cpu.h_reverse_offsets.back(); + i_t n_vars = fj_cpu.h_reverse_offsets.size() - 1; + i_t n_cstrs = fj_cpu.h_offsets.size() - 1; + + fj_cpu.avg_var_degree = (double)total_nnz / n_vars; + + fj_cpu.max_var_degree = 0; + std::vector var_degrees(n_vars); + for (i_t i = 0; i < n_vars; i++) { + i_t degree = 
fj_cpu.h_reverse_offsets[i + 1] - fj_cpu.h_reverse_offsets[i]; + var_degrees[i] = degree; + fj_cpu.max_var_degree = std::max(fj_cpu.max_var_degree, degree); + } + + double var_deg_variance = 0.0; + for (i_t i = 0; i < n_vars; i++) { + double diff = var_degrees[i] - fj_cpu.avg_var_degree; + var_deg_variance += diff * diff; + } + var_deg_variance /= n_vars; + double var_degree_std = std::sqrt(var_deg_variance); + fj_cpu.var_degree_cv = fj_cpu.avg_var_degree > 0 ? var_degree_std / fj_cpu.avg_var_degree : 0.0; + + fj_cpu.avg_cstr_degree = (double)total_nnz / n_cstrs; + + fj_cpu.max_cstr_degree = 0; + std::vector cstr_degrees(n_cstrs); + for (i_t i = 0; i < n_cstrs; i++) { + i_t degree = fj_cpu.h_offsets[i + 1] - fj_cpu.h_offsets[i]; + cstr_degrees[i] = degree; + fj_cpu.max_cstr_degree = std::max(fj_cpu.max_cstr_degree, degree); + } + + double cstr_deg_variance = 0.0; + for (i_t i = 0; i < n_cstrs; i++) { + double diff = cstr_degrees[i] - fj_cpu.avg_cstr_degree; + cstr_deg_variance += diff * diff; + } + cstr_deg_variance /= n_cstrs; + double cstr_degree_std = std::sqrt(cstr_deg_variance); + fj_cpu.cstr_degree_cv = + fj_cpu.avg_cstr_degree > 0 ? cstr_degree_std / fj_cpu.avg_cstr_degree : 0.0; + + fj_cpu.problem_density = (double)total_nnz / ((double)n_vars * n_cstrs); +} + +template +static void log_regression_features(fj_cpu_climber_t& fj_cpu, + double time_window_ms, + double total_time_ms, + size_t mem_loads_bytes, + size_t mem_stores_bytes) +{ + i_t total_nnz = fj_cpu.h_reverse_offsets.back(); + i_t n_vars = fj_cpu.h_reverse_offsets.size() - 1; + i_t n_cstrs = fj_cpu.h_offsets.size() - 1; + + // Dynamic runtime features + double violated_ratio = (double)fj_cpu.violated_constraints.size() / n_cstrs; + + // Compute per-iteration metrics + double nnz_per_move = 0.0; + i_t total_moves = + fj_cpu.n_lift_moves_window + fj_cpu.n_mtm_viol_moves_window + fj_cpu.n_mtm_sat_moves_window; + if (total_moves > 0) { nnz_per_move = (double)fj_cpu.nnz_processed_window / total_moves; } + + double eval_intensity = (double)fj_cpu.nnz_processed_window / 1000.0; + + // Cache and locality metrics + i_t cache_hits_window = fj_cpu.hit_count - fj_cpu.hit_count_window_start; + i_t cache_misses_window = fj_cpu.miss_count - fj_cpu.miss_count_window_start; + i_t total_cache_accesses = cache_hits_window + cache_misses_window; + double cache_hit_rate = + total_cache_accesses > 0 ? (double)cache_hits_window / total_cache_accesses : 0.0; + + i_t unique_cstrs = fj_cpu.unique_cstrs_accessed_window.size(); + i_t unique_vars = fj_cpu.unique_vars_accessed_window.size(); + + // Reuse ratios: how many times each constraint/variable was accessed on average + double cstr_reuse_ratio = + unique_cstrs > 0 ? (double)fj_cpu.nnz_processed_window / unique_cstrs : 0.0; + double var_reuse_ratio = + unique_vars > 0 ? 
(double)fj_cpu.n_variable_updates_window / unique_vars : 0.0; + + // Working set size estimation (KB) + // Each constraint: lhs (f_t) + 2 bounds (f_t) + sumcomp (f_t) = 4 * sizeof(f_t) + // Each variable: assignment (f_t) = 1 * sizeof(f_t) + i_t working_set_bytes = unique_cstrs * 4 * sizeof(f_t) + unique_vars * sizeof(f_t); + double working_set_kb = working_set_bytes / 1024.0; + + // Coverage: what fraction of problem is actively touched + double cstr_coverage = (double)unique_cstrs / n_cstrs; + double var_coverage = (double)unique_vars / n_vars; + + double loads_per_iter = 0.0; + double stores_per_iter = 0.0; + double l1_miss = -1.0; + double l3_miss = -1.0; + + // Compute memory statistics + double mem_loads_mb = mem_loads_bytes / 1e6; + double mem_stores_mb = mem_stores_bytes / 1e6; + double mem_total_mb = (mem_loads_bytes + mem_stores_bytes) / 1e6; + double mem_bandwidth_gb_per_sec = (mem_total_mb / 1000.0) / (time_window_ms / 1000.0); + + // Build per-wrapper memory statistics string + std::stringstream wrapper_stats; + auto per_wrapper_stats = fj_cpu.memory_aggregator.collect_per_wrapper(); + for (const auto& [name, loads, stores] : per_wrapper_stats) { + wrapper_stats << " " << name << "_loads=" << loads << " " << name << "_stores=" << stores; + } + + fj_cpu.memory_aggregator.flush(); + + // Print everything on a single line using precomputed features + CUOPT_LOG_DEBUG( + "%sCPUFJ_FEATURES iter=%d time_window=%.2f " + "n_vars=%d n_cstrs=%d n_bin=%d n_int=%d total_nnz=%d " + "avg_var_deg=%.2f max_var_deg=%d var_deg_cv=%.4f " + "avg_cstr_deg=%.2f max_cstr_deg=%d cstr_deg_cv=%.4f " + "density=%.6f " + "total_viol=%.4f obj_weight=%.4f max_weight=%.4f " + "n_locmin=%d iter_since_best=%d feas_found=%d " + "nnz_proc=%d n_lift=%d n_mtm_viol=%d n_mtm_sat=%d n_var_updates=%d " + "cache_hit_rate=%.4f unique_cstrs=%d unique_vars=%d " + "cstr_reuse=%.2f var_reuse=%.2f working_set_kb=%.1f " + "cstr_coverage=%.4f var_coverage=%.4f " + "L1_miss=%.2f L3_miss=%.2f loads_per_iter=%.0f stores_per_iter=%.0f " + "viol_ratio=%.4f nnz_per_move=%.2f eval_intensity=%.2f " + "mem_loads_mb=%.3f mem_stores_mb=%.3f mem_total_mb=%.3f mem_bandwidth_gb_s=%.3f%s", + fj_cpu.log_prefix.c_str(), + fj_cpu.iterations, + time_window_ms, + n_vars, + n_cstrs, + fj_cpu.n_binary_vars, + fj_cpu.n_integer_vars, + total_nnz, + fj_cpu.avg_var_degree, + fj_cpu.max_var_degree, + fj_cpu.var_degree_cv, + fj_cpu.avg_cstr_degree, + fj_cpu.max_cstr_degree, + fj_cpu.cstr_degree_cv, + fj_cpu.problem_density, + fj_cpu.total_violations, + fj_cpu.h_objective_weight, + fj_cpu.max_weight, + fj_cpu.n_local_minima_window, + fj_cpu.iterations_since_best, + fj_cpu.feasible_found ? 
1 : 0, + fj_cpu.nnz_processed_window, + fj_cpu.n_lift_moves_window, + fj_cpu.n_mtm_viol_moves_window, + fj_cpu.n_mtm_sat_moves_window, + fj_cpu.n_variable_updates_window, + cache_hit_rate, + unique_cstrs, + unique_vars, + cstr_reuse_ratio, + var_reuse_ratio, + working_set_kb, + cstr_coverage, + var_coverage, + l1_miss, + l3_miss, + loads_per_iter, + stores_per_iter, + violated_ratio, + nnz_per_move, + eval_intensity, + mem_loads_mb, + mem_stores_mb, + mem_total_mb, + mem_bandwidth_gb_per_sec, + wrapper_stats.str().c_str()); + + // Reset window counters + fj_cpu.nnz_processed_window = 0; + fj_cpu.n_lift_moves_window = 0; + fj_cpu.n_mtm_viol_moves_window = 0; + fj_cpu.n_mtm_sat_moves_window = 0; + fj_cpu.n_variable_updates_window = 0; + fj_cpu.n_local_minima_window = 0; + fj_cpu.prev_best_objective = fj_cpu.h_best_objective; + + // Reset cache and locality tracking + fj_cpu.hit_count_window_start = fj_cpu.hit_count; + fj_cpu.miss_count_window_start = fj_cpu.miss_count; + fj_cpu.unique_cstrs_accessed_window.clear(); + fj_cpu.unique_vars_accessed_window.clear(); +} + +template +static inline std::pair reverse_range_for_var(fj_cpu_climber_t& fj_cpu, + i_t var_idx) +{ + cuopt_assert(var_idx >= 0 && var_idx < fj_cpu.view.pb.n_variables, + "Variable should be within the range"); + return std::make_pair(fj_cpu.h_reverse_offsets[var_idx], fj_cpu.h_reverse_offsets[var_idx + 1]); +} + +template +static inline std::pair range_for_constraint(fj_cpu_climber_t& fj_cpu, + i_t cstr_idx) +{ + return std::make_pair(fj_cpu.h_offsets[cstr_idx], fj_cpu.h_offsets[cstr_idx + 1]); +} + +template +static inline bool check_variable_within_bounds(fj_cpu_climber_t& fj_cpu, + i_t var_idx, + f_t val) +{ + const f_t int_tol = fj_cpu.view.pb.tolerances.integrality_tolerance; + auto bounds = fj_cpu.h_var_bounds[var_idx].get(); + bool within_bounds = val <= (get_upper(bounds) + int_tol) && val >= (get_lower(bounds) - int_tol); + return within_bounds; +} + +template +static inline bool is_integer_var(fj_cpu_climber_t& fj_cpu, i_t var_idx) +{ + return var_t::INTEGER == fj_cpu.h_var_types[var_idx]; } template @@ -117,16 +536,16 @@ static inline bool tabu_check(fj_cpu_climber_t& fj_cpu, } template -static bool check_variable_feasibility(const typename fj_t::climber_data_t::view_t& fj, +static bool check_variable_feasibility(fj_cpu_climber_t& fj_cpu, bool check_integer = true) { - for (i_t var_idx = 0; var_idx < fj.pb.n_variables; var_idx += 1) { - auto val = fj.incumbent_assignment[var_idx]; - bool feasible = fj.pb.check_variable_within_bounds(var_idx, val); + for (i_t var_idx = 0; var_idx < fj_cpu.view.pb.n_variables; var_idx += 1) { + auto val = fj_cpu.h_assignment[var_idx]; + bool feasible = check_variable_within_bounds(fj_cpu, var_idx, val); if (!feasible) return false; - if (check_integer && fj.pb.is_integer_var(var_idx) && - !fj.pb.is_integer(fj.incumbent_assignment[var_idx])) + if (check_integer && is_integer_var(fj_cpu, var_idx) && + !fj_cpu.view.pb.is_integer(fj_cpu.h_assignment[var_idx])) return false; } return true; @@ -148,16 +567,28 @@ static inline std::pair compute_score(fj_cpu_climber_t(fj_cpu, var_idx); + fj_cpu.nnz_processed_window += (offset_end - offset_begin); + for (i_t i = offset_begin; i < offset_end; i++) { - auto cstr_idx = fj_cpu.h_reverse_constraints[i]; + auto cstr_idx = fj_cpu.h_reverse_constraints[i]; + fj_cpu.unique_cstrs_accessed_window.insert(cstr_idx); auto cstr_coeff = fj_cpu.h_reverse_coefficients[i]; - auto [c_lb, c_ub] = fj_cpu.cached_cstr_bounds[i]; + auto [c_lb, c_ub] = 
fj_cpu.cached_cstr_bounds[i].get(); cuopt_assert(c_lb <= c_ub, "invalid bounds"); - auto [cstr_base_feas, cstr_bonus_robust] = feas_score_constraint( - fj_cpu.view, var_idx, delta, cstr_idx, cstr_coeff, c_lb, c_ub, fj_cpu.h_lhs[cstr_idx]); + auto [cstr_base_feas, cstr_bonus_robust] = + feas_score_constraint(fj_cpu.view, + var_idx, + delta, + cstr_idx, + cstr_coeff, + c_lb, + c_ub, + fj_cpu.h_lhs[cstr_idx], + fj_cpu.h_cstr_left_weights[cstr_idx], + fj_cpu.h_cstr_right_weights[cstr_idx]); base_feas_sum += cstr_base_feas; bonus_robust_sum += cstr_bonus_robust; @@ -188,6 +619,7 @@ static inline std::pair compute_score(fj_cpu_climber_t static void smooth_weights(fj_cpu_climber_t& fj_cpu) { + CPUFJ_NVTX_RANGE("CPUFJ::smooth_weights"); for (i_t cstr_idx = 0; cstr_idx < fj_cpu.view.pb.n_constraints; cstr_idx++) { // consider only satisfied constraints if (fj_cpu.violated_constraints.count(cstr_idx)) continue; @@ -208,6 +640,7 @@ template static void update_weights(fj_cpu_climber_t& fj_cpu) { timing_raii_t timer(fj_cpu.update_weights_times); + CPUFJ_NVTX_RANGE("CPUFJ::update_weights"); raft::random::PCGenerator rng(fj_cpu.settings.seed + fj_cpu.iterations, 0, 0); bool smoothing = rng.next_float() <= fj_cpu.settings.parameters.weight_smoothing_probability; @@ -249,7 +682,8 @@ static void update_weights(fj_cpu_climber_t& fj_cpu) } // Invalidate related cached move scores - auto [relvar_offset_begin, relvar_offset_end] = fj_cpu.view.pb.range_for_constraint(cstr_idx); + auto [relvar_offset_begin, relvar_offset_end] = + range_for_constraint(fj_cpu, cstr_idx); for (auto i = relvar_offset_begin; i < relvar_offset_end; i++) { fj_cpu.cached_mtm_moves[i].first = 0; } @@ -265,20 +699,26 @@ static void apply_move(fj_cpu_climber_t& fj_cpu, bool localmin = false) { timing_raii_t timer(fj_cpu.apply_move_times); + CPUFJ_NVTX_RANGE("CPUFJ::apply_move"); raft::random::PCGenerator rng(fj_cpu.settings.seed + fj_cpu.iterations, 0, 0); cuopt_assert(var_idx < fj_cpu.view.pb.n_variables, "variable index out of bounds"); // Update the LHSs of all involved constraints. 
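// The loop below keeps the constraint left-hand sides incrementally up to date:
// when var_idx moves by delta, every constraint cstr_idx containing it has
// h_lhs[cstr_idx] adjusted by cstr_coeff * delta, the violated/satisfied sets are
// refreshed against the corrected tolerance, and every cached MTM move score that
// touches cstr_idx is invalidated (cached_mtm_moves[i].first = 0), since those
// scores were computed against the old left-hand side.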
- auto [offset_begin, offset_end] = fj_cpu.view.pb.reverse_range_for_var(var_idx); + auto [offset_begin, offset_end] = reverse_range_for_var(fj_cpu, var_idx); + + fj_cpu.nnz_processed_window += (offset_end - offset_begin); + fj_cpu.n_variable_updates_window++; + fj_cpu.unique_vars_accessed_window.insert(var_idx); i_t previous_viol = fj_cpu.violated_constraints.size(); for (auto i = offset_begin; i < offset_end; i++) { cuopt_assert(i < (i_t)fj_cpu.h_reverse_constraints.size(), ""); - auto [c_lb, c_ub] = fj_cpu.cached_cstr_bounds[i]; + auto [c_lb, c_ub] = fj_cpu.cached_cstr_bounds[i].get(); - auto cstr_idx = fj_cpu.h_reverse_constraints[i]; + auto cstr_idx = fj_cpu.h_reverse_constraints[i]; + fj_cpu.unique_cstrs_accessed_window.insert(cstr_idx); auto cstr_coeff = fj_cpu.h_reverse_coefficients[i]; f_t old_lhs = fj_cpu.h_lhs[cstr_idx]; @@ -311,7 +751,8 @@ static void apply_move(fj_cpu_climber_t& fj_cpu, cuopt_assert(isfinite(fj_cpu.h_lhs[cstr_idx]), "assignment should be finite"); // Invalidate related cached move scores - auto [relvar_offset_begin, relvar_offset_end] = fj_cpu.view.pb.range_for_constraint(cstr_idx); + auto [relvar_offset_begin, relvar_offset_end] = + range_for_constraint(fj_cpu, cstr_idx); for (auto i = relvar_offset_begin; i < relvar_offset_end; i++) { fj_cpu.cached_mtm_moves[i].first = 0; } @@ -323,13 +764,13 @@ static void apply_move(fj_cpu_climber_t& fj_cpu, // update the assignment and objective proper f_t new_val = fj_cpu.h_assignment[var_idx] + delta; - if (fj_cpu.view.pb.is_integer_var(var_idx)) { + if (is_integer_var(fj_cpu, var_idx)) { cuopt_assert(fj_cpu.view.pb.integer_equal(new_val, round(new_val)), "new_val is not integer"); new_val = round(new_val); } fj_cpu.h_assignment[var_idx] = new_val; - cuopt_assert(fj_cpu.view.pb.check_variable_within_bounds(var_idx, new_val), + cuopt_assert((check_variable_within_bounds(fj_cpu, var_idx, new_val)), "assignment not within bounds"); cuopt_assert(isfinite(new_val), "assignment is not finite"); @@ -339,16 +780,19 @@ static void apply_move(fj_cpu_climber_t& fj_cpu, // recompute the LHS values to cancel out accumulation errors, then check if feasibility remains recompute_lhs(fj_cpu); - if (fj_cpu.violated_constraints.empty() && check_variable_feasibility(fj_cpu.view)) { + if (fj_cpu.violated_constraints.empty() && check_variable_feasibility(fj_cpu)) { cuopt_assert(fj_cpu.satisfied_constraints.size() == fj_cpu.view.pb.n_constraints, ""); fj_cpu.h_best_objective = fj_cpu.h_incumbent_objective - fj_cpu.settings.parameters.breakthrough_move_epsilon; - fj_cpu.h_best_assignment = fj_cpu.h_assignment; - CUOPT_LOG_TRACE("%sCPUFJ: new best objective: %g\n", + fj_cpu.h_best_assignment = fj_cpu.h_assignment; + fj_cpu.iterations_since_best = 0; + CUOPT_LOG_TRACE("%sCPUFJ: new best objective: %g", fj_cpu.log_prefix.c_str(), fj_cpu.pb_ptr->get_user_obj_from_solver_obj(fj_cpu.h_best_objective)); if (fj_cpu.improvement_callback) { - fj_cpu.improvement_callback(fj_cpu.h_best_objective, fj_cpu.h_assignment); + double current_work_units = fj_cpu.work_units_elapsed.load(std::memory_order_acquire); + fj_cpu.improvement_callback( + fj_cpu.h_best_objective, fj_cpu.h_assignment, current_work_units); } fj_cpu.feasible_found = true; } @@ -378,6 +822,7 @@ template static thrust::tuple find_mtm_move( fj_cpu_climber_t& fj_cpu, const std::vector& target_cstrs, bool localmin = false) { + CPUFJ_NVTX_RANGE("CPUFJ::find_mtm_move"); auto& problem = *fj_cpu.pb_ptr; raft::random::PCGenerator rng(fj_cpu.settings.seed + fj_cpu.iterations, 0, 0); @@ -387,7 +832,7 @@ 
   // collect all the variables that are involved in the target constraints
   for (size_t cstr_idx : target_cstrs) {
-    auto [offset_begin, offset_end] = fj_cpu.view.pb.range_for_constraint(cstr_idx);
+    auto [offset_begin, offset_end] = range_for_constraint(fj_cpu, cstr_idx);
     for (auto i = offset_begin; i < offset_end; i++) {
       i_t var_idx = fj_cpu.h_variables[i];
       if (fj_cpu.var_bitmap[var_idx]) continue;
@@ -398,7 +843,7 @@ static thrust::tuple find_mtm_move(
   // estimate the amount of nnzs to consider
   i_t nnz_sum = 0;
   for (auto var_idx : fj_cpu.iter_mtm_vars) {
-    auto [offset_begin, offset_end] = fj_cpu.view.pb.reverse_range_for_var(var_idx);
+    auto [offset_begin, offset_end] = reverse_range_for_var(fj_cpu, var_idx);
     nnz_sum += offset_end - offset_begin;
   }
@@ -406,22 +851,24 @@ static thrust::tuple find_mtm_move(
   if (nnz_sum > fj_cpu.nnz_samples) nnz_pick_probability = (f_t)fj_cpu.nnz_samples / nnz_sum;

   for (size_t cstr_idx : target_cstrs) {
-    f_t cstr_tol = fj_cpu.view.get_corrected_tolerance(cstr_idx);
+    auto c_lb    = fj_cpu.h_cstr_lb[cstr_idx];
+    auto c_ub    = fj_cpu.h_cstr_ub[cstr_idx];
+    f_t cstr_tol = fj_cpu.view.get_corrected_tolerance(cstr_idx, c_lb, c_ub);
     cuopt_assert(cstr_idx < fj_cpu.h_cstr_lb.size(), "cstr_idx is out of bounds");
-    auto [offset_begin, offset_end] = fj_cpu.view.pb.range_for_constraint(cstr_idx);
+    auto [offset_begin, offset_end] = range_for_constraint(fj_cpu, cstr_idx);
     for (auto i = offset_begin; i < offset_end; i++) {
       // early cached check
       if (auto& cached_move = fj_cpu.cached_mtm_moves[i]; cached_move.first != 0) {
         if (best_score < cached_move.second) {
           auto var_idx = fj_cpu.h_variables[i];
-          if (fj_cpu.view.pb.check_variable_within_bounds(
-                var_idx, fj_cpu.h_assignment[var_idx] + cached_move.first)) {
+          if (check_variable_within_bounds(
+                fj_cpu, var_idx, fj_cpu.h_assignment[var_idx] + cached_move.first)) {
             best_score = cached_move.second;
             best_move  = fj_move_t{var_idx, cached_move.first};
           }
           // cuopt_assert(fj_cpu.view.pb.check_variable_within_bounds(var_idx,
-          // fj_cpu.h_assignment[var_idx] + cached_move.first), "best move not within bounds");
+          // fj_cpu.h_assignment[var_idx] + cached_move.first), "best move is not within bounds");
         }
         fj_cpu.hit_count++;
         continue;
@@ -445,11 +892,18 @@ static thrust::tuple find_mtm_move(
       } else {
         auto cstr_coeff = fj_cpu.h_coefficients[i];
-        f_t c_lb = fj_cpu.h_cstr_lb[cstr_idx];
-        f_t c_ub = fj_cpu.h_cstr_ub[cstr_idx];
-        auto [delta, sign, slack, cstr_tolerance] = get_mtm_for_constraint(
-          fj_cpu.view, var_idx, cstr_idx, cstr_coeff, c_lb, c_ub);
-        if (fj_cpu.view.pb.is_integer_var(var_idx)) {
+        f_t c_lb        = fj_cpu.h_cstr_lb[cstr_idx];
+        f_t c_ub        = fj_cpu.h_cstr_ub[cstr_idx];
+        auto [delta, sign, slack, cstr_tolerance] =
+          get_mtm_for_constraint(fj_cpu.view,
+                                 var_idx,
+                                 cstr_idx,
+                                 cstr_coeff,
+                                 c_lb,
+                                 c_ub,
+                                 fj_cpu.h_assignment,
+                                 fj_cpu.h_lhs);
+        if (is_integer_var(fj_cpu, var_idx)) {
           new_val = cstr_coeff * sign > 0
                       ? floor(val + delta + fj_cpu.view.pb.tolerances.integrality_tolerance)
                       : ceil(val + delta - fj_cpu.view.pb.tolerances.integrality_tolerance);
@@ -457,18 +911,18 @@ static thrust::tuple find_mtm_move(
           new_val = val + delta;
         }
         // fallback
-        if (new_val < get_lower(fj_cpu.h_var_bounds[var_idx]) ||
-            new_val > get_upper(fj_cpu.h_var_bounds[var_idx])) {
-          new_val = cstr_coeff * sign > 0 ? get_lower(fj_cpu.h_var_bounds[var_idx])
-                                          : get_upper(fj_cpu.h_var_bounds[var_idx]);
+        if (new_val < get_lower(fj_cpu.h_var_bounds[var_idx].get()) ||
+            new_val > get_upper(fj_cpu.h_var_bounds[var_idx].get())) {
+          new_val = cstr_coeff * sign > 0 ? get_lower(fj_cpu.h_var_bounds[var_idx].get())
+                                          : get_upper(fj_cpu.h_var_bounds[var_idx].get());
         }
       }
       if (!isfinite(new_val)) continue;
-      cuopt_assert(fj_cpu.view.pb.check_variable_within_bounds(var_idx, new_val),
+      cuopt_assert((check_variable_within_bounds(fj_cpu, var_idx, new_val)),
                   "new_val is not within bounds");
       delta = new_val - val;
       // more permissive tabu in the case of local minima
-      if (tabu_check(fj_cpu, var_idx, delta, localmin)) continue;
+      if (tabu_check(fj_cpu, var_idx, delta, localmin)) continue;
       if (fabs(delta) < cstr_tol) continue;

       auto move = fj_move_t{var_idx, delta};
@@ -507,11 +961,11 @@ static thrust::tuple find_mtm_move(
     cuopt_assert(move.var_idx < fj_cpu.h_assignment.size(), "move.var_idx is out of bounds");
     cuopt_assert(move.var_idx >= 0, "move.var_idx is not positive");
-    if (tabu_check(fj_cpu, var_idx, delta)) continue;
+    if (tabu_check(fj_cpu, var_idx, delta)) continue;

     auto [score, infeasibility] = compute_score(fj_cpu, var_idx, delta);
-    cuopt_assert(fj_cpu.view.pb.check_variable_within_bounds(var_idx, new_val), "");
+    cuopt_assert((check_variable_within_bounds(fj_cpu, var_idx, new_val)), "");
     cuopt_assert(isfinite(delta), "");

     if (fj_cpu.view.move_numerically_stable(
@@ -532,6 +986,7 @@ static thrust::tuple find_mtm_move_viol(
   fj_cpu_climber_t& fj_cpu, i_t sample_size = 100, bool localmin = false)
 {
   timing_raii_t timer(fj_cpu.find_mtm_move_viol_times);
+  CPUFJ_NVTX_RANGE("CPUFJ::find_mtm_move_viol");
   std::vector sampled_cstrs;
   sampled_cstrs.reserve(sample_size);
@@ -549,6 +1004,7 @@ static thrust::tuple find_mtm_move_sat(
   fj_cpu_climber_t& fj_cpu, i_t sample_size = 100)
 {
   timing_raii_t timer(fj_cpu.find_mtm_move_sat_times);
+  CPUFJ_NVTX_RANGE("CPUFJ::find_mtm_move_sat");
   std::vector sampled_cstrs;
   sampled_cstrs.reserve(sample_size);
@@ -564,22 +1020,25 @@ static thrust::tuple find_mtm_move_sat(
 template
 static void recompute_lhs(fj_cpu_climber_t& fj_cpu)
 {
+  CPUFJ_NVTX_RANGE("CPUFJ::recompute_lhs");
   cuopt_assert(fj_cpu.h_lhs.size() == fj_cpu.view.pb.n_constraints, "h_lhs size mismatch");
   fj_cpu.violated_constraints.clear();
   fj_cpu.satisfied_constraints.clear();
   fj_cpu.total_violations = 0;
   for (i_t cstr_idx = 0; cstr_idx < fj_cpu.view.pb.n_constraints; ++cstr_idx) {
-    auto [offset_begin, offset_end] = fj_cpu.view.pb.range_for_constraint(cstr_idx);
+    auto [offset_begin, offset_end] = range_for_constraint(fj_cpu, cstr_idx);
+    auto c_lb                       = fj_cpu.h_cstr_lb[cstr_idx];
+    auto c_ub                       = fj_cpu.h_cstr_ub[cstr_idx];
     auto delta_it =
-      thrust::make_transform_iterator(thrust::make_counting_iterator(0), [fj = fj_cpu.view](i_t j) {
-        return fj.pb.coefficients[j] * fj.incumbent_assignment[fj.pb.variables[j]];
+      thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&fj_cpu](i_t j) {
+        return fj_cpu.h_coefficients[j] * fj_cpu.h_assignment[fj_cpu.h_variables[j]];
       });
     fj_cpu.h_lhs[cstr_idx] =
       fj_kahan_babushka_neumaier_sum(delta_it + offset_begin, delta_it + offset_end);
     fj_cpu.h_lhs_sumcomp[cstr_idx] = 0;
-    f_t cstr_tolerance = fj_cpu.view.get_corrected_tolerance(cstr_idx);
+    f_t cstr_tolerance = fj_cpu.view.get_corrected_tolerance(cstr_idx, c_lb, c_ub);
     f_t new_cost       = fj_cpu.view.excess_score(cstr_idx, fj_cpu.h_lhs[cstr_idx]);
     if (new_cost < -cstr_tolerance) {
       fj_cpu.violated_constraints.insert(cstr_idx);
@@ -599,6 +1058,7 @@ static thrust::tuple find_lift_move(
   fj_cpu_climber_t& fj_cpu)
 {
   timing_raii_t timer(fj_cpu.find_lift_move_times);
+  CPUFJ_NVTX_RANGE("CPUFJ::find_lift_move");
   fj_move_t best_move = fj_move_t{-1, 0};
   fj_staged_score_t best_score = fj_staged_score_t::zero();
@@ -620,14 +1080,14 @@ static thrust::tuple find_lift_move(
       // flip move wouldn't improve
       if (delta * obj_coeff >= 0) continue;
     } else {
-      f_t lfd_lb = get_lower(fj_cpu.h_var_bounds[var_idx]) - val;
-      f_t lfd_ub = get_upper(fj_cpu.h_var_bounds[var_idx]) - val;
-      auto [offset_begin, offset_end] = fj_cpu.view.pb.reverse_range_for_var(var_idx);
+      f_t lfd_lb = get_lower(fj_cpu.h_var_bounds[var_idx].get()) - val;
+      f_t lfd_ub = get_upper(fj_cpu.h_var_bounds[var_idx].get()) - val;
+      auto [offset_begin, offset_end] = reverse_range_for_var(fj_cpu, var_idx);
       for (i_t j = offset_begin; j < offset_end; j += 1) {
-        auto cstr_idx   = fj_cpu.view.pb.reverse_constraints[j];
-        auto cstr_coeff = fj_cpu.view.pb.reverse_coefficients[j];
-        f_t c_lb        = fj_cpu.view.pb.constraint_lower_bounds[cstr_idx];
-        f_t c_ub        = fj_cpu.view.pb.constraint_upper_bounds[cstr_idx];
+        auto cstr_idx   = fj_cpu.h_reverse_constraints[j];
+        auto cstr_coeff = fj_cpu.h_reverse_coefficients[j];
+        f_t c_lb        = fj_cpu.h_cstr_lb[cstr_idx];
+        f_t c_ub        = fj_cpu.h_cstr_ub[cstr_idx];
         f_t cstr_tolerance = fj_cpu.view.get_corrected_tolerance(cstr_idx, c_lb, c_ub);
         cuopt_assert(c_lb <= c_ub, "invalid bounds");
         cuopt_assert(fj_cpu.view.cstr_satisfied(cstr_idx, fj_cpu.h_lhs[cstr_idx]),
@@ -636,13 +1096,19 @@ static thrust::tuple find_lift_move(
         // Process each bound separately, as both are satisfied and may both be finite
         // otherwise range constraints aren't correctly handled
         for (auto [bound, sign] : {std::make_tuple(c_lb, -1), std::make_tuple(c_ub, 1)}) {
-          auto [delta, slack] =
-            get_mtm_for_bound(fj_cpu.view, var_idx, cstr_idx, cstr_coeff, bound, sign);
+          auto [delta, slack] = get_mtm_for_bound(fj_cpu.view,
+                                                  var_idx,
+                                                  cstr_idx,
+                                                  cstr_coeff,
+                                                  bound,
+                                                  sign,
+                                                  fj_cpu.h_assignment,
+                                                  fj_cpu.h_lhs);
           if (cstr_coeff * sign < 0) {
-            if (fj_cpu.view.pb.is_integer_var(var_idx)) delta = ceil(delta);
+            if (is_integer_var(fj_cpu, var_idx)) delta = ceil(delta);
           } else {
-            if (fj_cpu.view.pb.is_integer_var(var_idx)) delta = floor(delta);
+            if (is_integer_var(fj_cpu, var_idx)) delta = floor(delta);
           }

           // skip this variable if there is no slack
@@ -652,7 +1118,7 @@ static thrust::tuple find_lift_move(
             } else {
               lfd_lb = 0;
             }
-          } else if (!fj_cpu.view.pb.check_variable_within_bounds(var_idx, val + delta)) {
+          } else if (!check_variable_within_bounds(fj_cpu, var_idx, val + delta)) {
             continue;
           } else {
             if (cstr_coeff * sign < 0) {
@@ -668,17 +1134,17 @@ static thrust::tuple find_lift_move(
       // invalid crossing bounds
       if (lfd_lb >= lfd_ub) { lfd_lb = lfd_ub = 0; }
-      if (!fj_cpu.view.pb.check_variable_within_bounds(var_idx, val + lfd_lb)) { lfd_lb = 0; }
-      if (!fj_cpu.view.pb.check_variable_within_bounds(var_idx, val + lfd_ub)) { lfd_ub = 0; }
+      if (!check_variable_within_bounds(fj_cpu, var_idx, val + lfd_lb)) { lfd_lb = 0; }
+      if (!check_variable_within_bounds(fj_cpu, var_idx, val + lfd_ub)) { lfd_ub = 0; }

-      // Now that the life move domain is computed, compute the correct lift move
+      // Now that the lift move domain is computed, compute the correct lift move
       cuopt_assert(isfinite(val), "invalid assignment value");
       delta = obj_coeff < 0 ? lfd_ub : lfd_lb;
     }

     if (!isfinite(delta)) delta = 0;
     if (fj_cpu.view.pb.integer_equal(delta, (f_t)0)) continue;
-    if (tabu_check(fj_cpu, var_idx, delta)) continue;
+    if (tabu_check(fj_cpu, var_idx, delta)) continue;

     cuopt_assert(delta * obj_coeff < 0, "lift move doesn't improve the objective!");
@@ -700,6 +1166,7 @@ static thrust::tuple find_lift_move(
 template
 static void perturb(fj_cpu_climber_t& fj_cpu)
 {
+  CPUFJ_NVTX_RANGE("CPUFJ::perturb");
   // select N variables, assign them a random value between their bounds
   std::vector sampled_vars;
   std::sample(fj_cpu.h_objective_vars.begin(),
@@ -710,17 +1177,17 @@ static void perturb(fj_cpu_climber_t& fj_cpu)
   raft::random::PCGenerator rng(fj_cpu.settings.seed + fj_cpu.iterations, 0, 0);

   for (auto var_idx : sampled_vars) {
-    f_t lb = std::max(get_lower(fj_cpu.h_var_bounds[var_idx]), -1e7);
-    f_t ub = std::min(get_upper(fj_cpu.h_var_bounds[var_idx]), 1e7);
+    f_t lb = std::max(get_lower(fj_cpu.h_var_bounds[var_idx].get()), -1e7);
+    f_t ub = std::min(get_upper(fj_cpu.h_var_bounds[var_idx].get()), 1e7);
     f_t val = lb + (ub - lb) * rng.next_double();
-    if (fj_cpu.view.pb.is_integer_var(var_idx)) {
+    if (is_integer_var(fj_cpu, var_idx)) {
       lb  = std::ceil(lb);
       ub  = std::floor(ub);
       val = std::round(val);
       val = std::min(std::max(val, lb), ub);
     }
-    cuopt_assert(fj_cpu.view.pb.check_variable_within_bounds(var_idx, val),
+    cuopt_assert((check_variable_within_bounds(fj_cpu, var_idx, val)),
                 "value is out of bounds");
     fj_cpu.h_assignment[var_idx] = val;
   }
@@ -844,7 +1311,7 @@ static void init_fj_cpu(fj_cpu_climber_t& fj_cpu,
   fj_cpu.cached_cstr_bounds.resize(fj_cpu.h_reverse_coefficients.size());
   for (i_t var_idx = 0; var_idx < (i_t)fj_cpu.view.pb.n_variables; ++var_idx) {
-    auto [offset_begin, offset_end] = fj_cpu.view.pb.reverse_range_for_var(var_idx);
+    auto [offset_begin, offset_end] = reverse_range_for_var(fj_cpu, var_idx);
     for (i_t i = offset_begin; i < offset_end; ++i) {
       fj_cpu.cached_cstr_bounds[i] =
         std::make_pair(fj_cpu.h_cstr_lb[fj_cpu.h_reverse_constraints[i]],
@@ -857,6 +1324,9 @@ static void init_fj_cpu(fj_cpu_climber_t& fj_cpu,
   fj_cpu.iter_mtm_vars.reserve(fj_cpu.view.pb.n_variables);

   recompute_lhs(fj_cpu);
+
+  // Precompute static problem features for regression model
+  precompute_problem_features(fj_cpu);
 }

 template
@@ -942,17 +1412,30 @@ bool fj_t::cpu_solve(fj_cpu_climber_t& fj_cpu, f_t in_time_l
   auto loop_start      = std::chrono::high_resolution_clock::now();
   auto time_limit      = std::chrono::milliseconds((int)(in_time_limit * 1000));
   auto loop_time_start = std::chrono::high_resolution_clock::now();
+
+  // Initialize feature tracking
+  fj_cpu.last_feature_log_time = loop_start;
+  fj_cpu.prev_best_objective   = fj_cpu.h_best_objective;
+  fj_cpu.iterations_since_best = 0;
+
   while (!fj_cpu.halted && !fj_cpu.preemption_flag.load()) {
     // Check whether the time limit has been reached
     auto now = std::chrono::high_resolution_clock::now();
     if (in_time_limit < std::numeric_limits::infinity() && now - loop_time_start > time_limit) {
-      CUOPT_LOG_TRACE("%sTime limit of %.4f seconds reached, breaking loop at iteration %d\n",
+      CUOPT_LOG_TRACE("%sTime limit of %.4f seconds reached, breaking loop at iteration %d",
                       fj_cpu.log_prefix.c_str(),
                       time_limit.count() / 1000.f,
                       fj_cpu.iterations);
       break;
     }
+    if (fj_cpu.iterations >= fj_cpu.settings.iteration_limit) {
+      CUOPT_LOG_TRACE("%sIteration limit of %d reached, breaking loop at iteration %d",
+                      fj_cpu.log_prefix.c_str(),
+                      fj_cpu.settings.iteration_limit,
+                      fj_cpu.iterations);
+      break;
+    }
     // periodically recompute the LHS and violation scores
     // to correct any accumulated numerical errors
@@ -966,15 +1449,24 @@ bool fj_t::cpu_solve(fj_cpu_climber_t& fj_cpu, f_t in_time_l
     fj_move_t move = fj_move_t{-1, 0};
     fj_staged_score_t score = fj_staged_score_t::invalid();
+    bool is_lift     = false;
+    bool is_mtm_viol = false;
+    bool is_mtm_sat  = false;
+
     // Perform lift moves
-    if (fj_cpu.violated_constraints.empty()) { thrust::tie(move, score) = find_lift_move(fj_cpu); }
+    if (fj_cpu.violated_constraints.empty()) {
+      thrust::tie(move, score) = find_lift_move(fj_cpu);
+      if (score > fj_staged_score_t::zero()) is_lift = true;
+    }
     // Regular MTM
     if (!(score > fj_staged_score_t::zero())) {
       thrust::tie(move, score) = find_mtm_move_viol(fj_cpu, fj_cpu.mtm_viol_samples);
+      if (score > fj_staged_score_t::zero()) is_mtm_viol = true;
     }
     // try with MTM in satisfied constraints
     if (fj_cpu.feasible_found && !(score > fj_staged_score_t::zero())) {
       thrust::tie(move, score) = find_mtm_move_sat(fj_cpu, fj_cpu.mtm_sat_samples);
+      if (score > fj_staged_score_t::zero()) is_mtm_sat = true;
     }
     // if we're in the feasible region but haven't found improvements in the last n iterations,
     // perturb
@@ -987,13 +1479,17 @@ bool fj_t::cpu_solve(fj_cpu_climber_t& fj_cpu, f_t in_time_l
     if (score > fj_staged_score_t::zero() && !should_perturb) {
       apply_move(fj_cpu, move.var_idx, move.value, false);
+      // Track move types
+      if (is_lift) fj_cpu.n_lift_moves_window++;
+      if (is_mtm_viol) fj_cpu.n_mtm_viol_moves_window++;
+      if (is_mtm_sat) fj_cpu.n_mtm_sat_moves_window++;
     } else {
       // Local Min
       update_weights(fj_cpu);
       if (should_perturb) {
         perturb(fj_cpu);
-        for (auto& cached_move : fj_cpu.cached_mtm_moves)
-          cached_move.first = 0;
+        for (size_t i = 0; i < fj_cpu.cached_mtm_moves.size(); i++)
+          fj_cpu.cached_mtm_moves[i].first = 0;
       }
       thrust::tie(move, score) = find_mtm_move_viol(fj_cpu, 1, true);
       // pick a single random violated constraint
@@ -1001,6 +1497,7 @@ bool fj_t::cpu_solve(fj_cpu_climber_t& fj_cpu, f_t in_time_l
       f_t delta = move.var_idx >= 0 ? move.value : 0;
       apply_move(fj_cpu, var_idx, delta, true);
       ++local_mins;
+      ++fj_cpu.n_local_minima_window;
     }

     // number of violated constraints is usually small (<100). recomputing from all LHSs is cheap
@@ -1011,10 +1508,14 @@ bool fj_t::cpu_solve(fj_cpu_climber_t& fj_cpu, f_t in_time_l
     }
     if (fj_cpu.iterations % fj_cpu.log_interval == 0) {
       CUOPT_LOG_TRACE(
-        "%sCPUFJ iteration: %d, local mins: %d, best_objective: %g, viol: %zu, obj weight %g, maxw "
-        "%g\n",
+        "%sCPUFJ iteration: %d/%d, local mins: %d, best_objective: %g, viol: %zu, obj weight %g, "
+        "maxw "
+        "%g",
         fj_cpu.log_prefix.c_str(),
         fj_cpu.iterations,
+        fj_cpu.settings.iteration_limit != std::numeric_limits::max()
+          ? fj_cpu.settings.iteration_limit
+          : -1,
fj_cpu.settings.iteration_limit + : -1, local_mins, fj_cpu.pb_ptr->get_user_obj_from_solver_obj(fj_cpu.h_best_objective), fj_cpu.violated_constraints.size(), @@ -1034,20 +1535,36 @@ bool fj_t::cpu_solve(fj_cpu_climber_t& fj_cpu, f_t in_time_l print_timing_stats(fj_cpu); } #endif + + if (fj_cpu.iterations % 100 == 0 && fj_cpu.iterations > 0) { + // Collect memory statistics + auto [loads, stores] = fj_cpu.memory_aggregator.collect(); + + double biased_work = (loads + stores) * fj_cpu.work_unit_bias / 1e10; + fj_cpu.work_units_elapsed += biased_work; + + if (fj_cpu.producer_sync != nullptr) { fj_cpu.producer_sync->notify_progress(); } + + CUOPT_LOG_TRACE("CPUFJ work units: %f incumbent %g", + fj_cpu.work_units_elapsed.load(std::memory_order_relaxed), + fj_cpu.pb_ptr->get_user_obj_from_solver_obj(fj_cpu.h_best_objective)); + } + cuopt_func_call(sanity_checks(fj_cpu)); fj_cpu.iterations++; + fj_cpu.iterations_since_best++; } auto loop_end = std::chrono::high_resolution_clock::now(); double total_time = std::chrono::duration_cast>(loop_end - loop_start).count(); double avg_time_per_iter = total_time / fj_cpu.iterations; - CUOPT_LOG_TRACE("%sCPUFJ Average time per iteration: %.8fms\n", + CUOPT_LOG_TRACE("%sCPUFJ Average time per iteration: %.8fms", fj_cpu.log_prefix.c_str(), avg_time_per_iter * 1000.0); #if CPUFJ_TIMING_TRACE // Print final timing statistics - CUOPT_LOG_TRACE("\n=== Final Timing Statistics ===\n"); + CUOPT_LOG_TRACE("=== Final Timing Statistics ==="); print_timing_stats(fj_cpu); #endif diff --git a/cpp/src/mip/feasibility_jump/fj_cpu.cuh b/cpp/src/mip/feasibility_jump/fj_cpu.cuh index 4b9cfc0cc..187cd300d 100644 --- a/cpp/src/mip/feasibility_jump/fj_cpu.cuh +++ b/cpp/src/mip/feasibility_jump/fj_cpu.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
diff --git a/cpp/src/mip/feasibility_jump/fj_cpu.cuh b/cpp/src/mip/feasibility_jump/fj_cpu.cuh
index 4b9cfc0cc..187cd300d 100644
--- a/cpp/src/mip/feasibility_jump/fj_cpu.cuh
+++ b/cpp/src/mip/feasibility_jump/fj_cpu.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -18,6 +18,8 @@
 #include
 #include
+#include
+#include

 namespace cuopt::linear_programming::detail {
@@ -25,7 +27,41 @@ namespace cuopt::linear_programming::detail {
 // Maintaining a single source of truth for all members would be nice
 template
 struct fj_cpu_climber_t {
-  fj_cpu_climber_t(std::atomic& preemption_flag) : preemption_flag(preemption_flag) {}
+  fj_cpu_climber_t(std::atomic& preemption_flag) : preemption_flag(preemption_flag)
+  {
+#define ADD_INSTRUMENTED(var) \
+  std::make_pair(#var, std::ref(static_cast(var)))
+
+    // Initialize memory aggregator with all ins_vector members
+    memory_aggregator = instrumentation_aggregator_t{ADD_INSTRUMENTED(h_reverse_coefficients),
+                                                     ADD_INSTRUMENTED(h_reverse_constraints),
+                                                     ADD_INSTRUMENTED(h_reverse_offsets),
+                                                     ADD_INSTRUMENTED(h_coefficients),
+                                                     ADD_INSTRUMENTED(h_offsets),
+                                                     ADD_INSTRUMENTED(h_variables),
+                                                     ADD_INSTRUMENTED(h_obj_coeffs),
+                                                     ADD_INSTRUMENTED(h_var_bounds),
+                                                     ADD_INSTRUMENTED(h_cstr_lb),
+                                                     ADD_INSTRUMENTED(h_cstr_ub),
+                                                     ADD_INSTRUMENTED(h_var_types),
+                                                     ADD_INSTRUMENTED(h_is_binary_variable),
+                                                     ADD_INSTRUMENTED(h_objective_vars),
+                                                     ADD_INSTRUMENTED(h_binary_indices),
+                                                     ADD_INSTRUMENTED(h_tabu_nodec_until),
+                                                     ADD_INSTRUMENTED(h_tabu_noinc_until),
+                                                     ADD_INSTRUMENTED(h_tabu_lastdec),
+                                                     ADD_INSTRUMENTED(h_tabu_lastinc),
+                                                     ADD_INSTRUMENTED(h_lhs),
+                                                     ADD_INSTRUMENTED(h_lhs_sumcomp),
+                                                     ADD_INSTRUMENTED(h_cstr_left_weights),
+                                                     ADD_INSTRUMENTED(h_cstr_right_weights),
+                                                     ADD_INSTRUMENTED(h_assignment),
+                                                     ADD_INSTRUMENTED(h_best_assignment),
+                                                     ADD_INSTRUMENTED(cached_cstr_bounds),
+                                                     ADD_INSTRUMENTED(iter_mtm_vars)};
+
+#undef ADD_INSTRUMENTED
+  }

   fj_cpu_climber_t(const fj_cpu_climber_t& other) = delete;
   fj_cpu_climber_t& operator=(const fj_cpu_climber_t& other) = delete;
@@ -36,33 +72,33 @@ struct fj_cpu_climber_t {
   fj_settings_t settings;
   typename fj_t::climber_data_t::view_t view;
   // Host copies of device data as struct members
-  std::vector h_reverse_coefficients;
-  std::vector h_reverse_constraints;
-  std::vector h_reverse_offsets;
-  std::vector h_coefficients;
-  std::vector h_offsets;
-  std::vector h_variables;
-  std::vector h_obj_coeffs;
-  std::vector::type> h_var_bounds;
-  std::vector h_cstr_lb;
-  std::vector h_cstr_ub;
-  std::vector h_var_types;
-  std::vector h_is_binary_variable;
-  std::vector h_objective_vars;
-  std::vector h_binary_indices;
-
-  std::vector h_tabu_nodec_until;
-  std::vector h_tabu_noinc_until;
-  std::vector h_tabu_lastdec;
-  std::vector h_tabu_lastinc;
-
-  std::vector h_lhs;
-  std::vector h_lhs_sumcomp;
-  std::vector h_cstr_left_weights;
-  std::vector h_cstr_right_weights;
+  ins_vector h_reverse_coefficients;
+  ins_vector h_reverse_constraints;
+  ins_vector h_reverse_offsets;
+  ins_vector h_coefficients;
+  ins_vector h_offsets;
+  ins_vector h_variables;
+  ins_vector h_obj_coeffs;
+  ins_vector::type> h_var_bounds;
+  ins_vector h_cstr_lb;
+  ins_vector h_cstr_ub;
+  ins_vector h_var_types;
+  ins_vector h_is_binary_variable;
+  ins_vector h_objective_vars;
+  ins_vector h_binary_indices;
+
+  ins_vector h_tabu_nodec_until;
+  ins_vector h_tabu_noinc_until;
+  ins_vector h_tabu_lastdec;
+  ins_vector h_tabu_lastinc;
+
+  ins_vector h_lhs;
+  ins_vector h_lhs_sumcomp;
+  ins_vector h_cstr_left_weights;
+  ins_vector h_cstr_right_weights;
   f_t max_weight;
-  std::vector h_assignment;
-  std::vector h_best_assignment;
+  ins_vector h_assignment;
+  ins_vector h_best_assignment;
   f_t h_objective_weight;
   f_t h_incumbent_objective;
   f_t h_best_objective;
@@ -96,10 +132,10 @@ struct fj_cpu_climber_t {
   // CSC (transposed!) nnz-offset-indexed constraint bounds (lb, ub)
   // std::pair better compile down to 16 bytes!! GCC do your job!
-  std::vector> cached_cstr_bounds;
+  ins_vector> cached_cstr_bounds;

   std::vector var_bitmap;
-  std::vector iter_mtm_vars;
+  ins_vector iter_mtm_vars;

   i_t mtm_viol_samples{25};
   i_t mtm_sat_samples{15};
@@ -110,11 +146,49 @@ struct fj_cpu_climber_t {
   i_t diversity_callback_interval{3000};
   i_t timing_stats_interval{5000};

-  std::function&)> improvement_callback{nullptr};
+  // Callback with work unit timestamp for deterministic mode
+  // Parameters: objective, solution, work_units
+  std::function&, double)> improvement_callback{nullptr};
   std::function&)> diversity_callback{nullptr};
   std::string log_prefix{""};

+  // Work unit tracking for deterministic synchronization
+  std::atomic work_units_elapsed{0.0};
+  double work_unit_bias{1.5};  // Bias factor to keep CPUFJ ahead of B&B
+  producer_sync_t* producer_sync{nullptr};  // Optional sync utility for notifying progress
+  std::atomic halted{false};
+
+  // Feature tracking for regression model (last 1000 iterations)
+  i_t nnz_processed_window{0};
+  i_t n_lift_moves_window{0};
+  i_t n_mtm_viol_moves_window{0};
+  i_t n_mtm_sat_moves_window{0};
+  i_t n_variable_updates_window{0};
+  i_t n_local_minima_window{0};
+  std::chrono::high_resolution_clock::time_point last_feature_log_time;
+  f_t prev_best_objective{std::numeric_limits::infinity()};
+  i_t iterations_since_best{0};
+
+  // Cache and locality tracking
+  i_t hit_count_window_start{0};
+  i_t miss_count_window_start{0};
+  std::unordered_set unique_cstrs_accessed_window;
+  std::unordered_set unique_vars_accessed_window;
+
+  // Precomputed static problem features
+  i_t n_binary_vars{0};
+  i_t n_integer_vars{0};
+  i_t max_var_degree{0};
+  i_t max_cstr_degree{0};
+  double avg_var_degree{0.0};
+  double avg_cstr_degree{0.0};
+  double var_degree_cv{0.0};
+  double cstr_degree_cv{0.0};
+  double problem_density{0.0};
+
+  // Memory instrumentation aggregator
+  instrumentation_aggregator_t memory_aggregator;

   // TODO atomic ref? c++20
   std::atomic& preemption_flag;
 };
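In miniature, the wiring above works as follows: each ins_vector member inherits the byte counters of memory_instrumentation_base_t, the constructor registers all of them with memory_aggregator, and the solve loop periodically drains the totals. An illustrative sketch using only the aggregator API introduced later in this patch (ins_vector is assumed to wrap std::vector as the instrumentation header's own usage comment describes; the exact bytes recorded per access depend on the element type):

// Sketch only; ins_vector and instrumentation_aggregator_t are defined in
// memory_instrumentation.hpp further down in this diff.
void instrumentation_sketch()
{
  cuopt::instrumentation_aggregator_t agg;
  ins_vector<double> lhs;  // instrumented std::vector
  agg.add("lhs", lhs);     // aggregate this wrapper's load/store counters
  lhs.push_back(1.0);      // expected to record sizeof(double) stored bytes
  double v = lhs[0];       // proxy read, records sizeof(double) loaded bytes
  (void)v;
  auto [loads, stores] = agg.collect_and_flush();  // totals since last flush
}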
diff --git a/cpp/src/mip/local_search/local_search.cu b/cpp/src/mip/local_search/local_search.cu
index 4f56c52ee..228d8985a 100644
--- a/cpp/src/mip/local_search/local_search.cu
+++ b/cpp/src/mip/local_search/local_search.cu
@@ -10,6 +10,7 @@
 #include
+#include
 #include
 #include
 #include
@@ -81,20 +82,21 @@ void local_search_t::start_cpufj_scratch_threads(population_t 0);
-    cpu_fj.fj_cpu->log_prefix = "******* scratch " + std::to_string(counter) + ": ";
-    cpu_fj.fj_cpu->improvement_callback = [&population, problem_ptr = context.problem_ptr](
-                                            f_t obj, const std::vector& h_vec) {
-      population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
-      (void)problem_ptr;
-      if (obj < local_search_best_obj) {
-        CUOPT_LOG_TRACE("******* New local search best obj %g, best overall %g",
-                        problem_ptr->get_user_obj_from_solver_obj(obj),
-                        problem_ptr->get_user_obj_from_solver_obj(
-                          population.is_feasible() ? population.best_feasible().get_objective()
-                                                   : std::numeric_limits::max()));
-        local_search_best_obj = obj;
-      }
-    };
+    cpu_fj.fj_cpu->log_prefix = "******* scratch " + std::to_string(counter) + ": ";
+    cpu_fj.fj_cpu->improvement_callback =
+      [&population, problem_ptr = context.problem_ptr](
+        f_t obj, const std::vector& h_vec, double /*work_units*/) {
+        population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
+        (void)problem_ptr;
+        if (obj < local_search_best_obj) {
+          CUOPT_LOG_TRACE("******* New local search best obj %g, best overall %g",
+                          problem_ptr->get_user_obj_from_solver_obj(obj),
+                          problem_ptr->get_user_obj_from_solver_obj(
+                            population.is_feasible() ? population.best_feasible().get_objective()
+                                                     : std::numeric_limits::max()));
+          local_search_best_obj = obj;
+        }
+      };
     counter++;
   };
@@ -119,7 +121,7 @@ void local_search_t::start_cpufj_lptopt_scratch_threads(
     solution_lp, default_weights, default_weights, 0., context.preempt_heuristic_solver_);
   scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: ";
   scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback =
-    [this, &population](f_t obj, const std::vector& h_vec) {
+    [this, &population](f_t obj, const std::vector& h_vec, double /*work_units*/) {
       population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
       if (obj < local_search_best_obj) {
         CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g",
@@ -145,6 +147,59 @@ void local_search_t::stop_cpufj_scratch_threads()
   scratch_cpu_fj_on_lp_opt.request_termination();
 }

+template
+void local_search_t::start_cpufj_deterministic(
+  dual_simplex::branch_and_bound_t& bb)
+{
+  std::vector default_weights(context.problem_ptr->n_constraints, 1.);
+
+  solution_t solution(*context.problem_ptr);
+  thrust::fill(solution.handle_ptr->get_thrust_policy(),
+               solution.assignment.begin(),
+               solution.assignment.end(),
+               0.0);
+  solution.clamp_within_bounds();
+
+  deterministic_cpu_fj.fj_ptr = &fj;
+  deterministic_cpu_fj.fj_cpu = fj.create_cpu_climber(solution,
+                                                      default_weights,
+                                                      default_weights,
+                                                      0.,
+                                                      context.preempt_heuristic_solver_,
+                                                      fj_settings_t{},
+                                                      /*randomize=*/true);
+
+  deterministic_cpu_fj.fj_cpu->log_prefix = "******* deterministic CPUFJ: ";
+
+  // Register with producer_sync for B&B synchronization
+  producer_sync_t& producer_sync = bb.get_producer_sync();
+  deterministic_cpu_fj.fj_cpu->producer_sync = &producer_sync;
+  producer_sync.register_producer(&deterministic_cpu_fj.fj_cpu->work_units_elapsed);
+
+  // Set up callback to send solutions to B&B with work unit timestamps
+  deterministic_cpu_fj.fj_cpu->improvement_callback =
+    [&bb](f_t obj, const std::vector& h_vec, double work_units) {
+      bb.queue_external_solution_deterministic(h_vec, work_units);
+    };
+
+  deterministic_cpu_fj.start_cpu_solver();
+
+  // Signal that registration is complete - B&B can now wait on producers
+  producer_sync.registration_complete();
+}
+
+template
+void local_search_t::stop_cpufj_deterministic()
+{
+  if (deterministic_cpu_fj.fj_cpu) {
+    if (deterministic_cpu_fj.fj_cpu->producer_sync) {
+      deterministic_cpu_fj.fj_cpu->producer_sync->deregister_producer(
+        &deterministic_cpu_fj.fj_cpu->work_units_elapsed);
+    }
+    deterministic_cpu_fj.request_termination();
+  }
+}
+
 template
 bool local_search_t::do_fj_solve(solution_t& solution,
                                  fj_t& in_fj,
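The registration sequence above encodes a small producer/consumer contract between the CPU feasibility-jump thread and deterministic branch and bound. Schematically (method names are the ones called above; the surrounding code is an illustrative sketch, not part of the patch):

// std::atomic<double> work_units{0.0};
// auto& sync = bb.get_producer_sync();
// sync.register_producer(&work_units);    // 1. expose the producer's work-unit clock
// /* 2. start the worker; it advances work_units and calls sync.notify_progress() */
// sync.registration_complete();           // 3. B&B may now wait on all producers
// /* 4. improvements arrive timestamped:
//       bb.queue_external_solution_deterministic(solution, work_units.load()); */
// sync.deregister_producer(&work_units);  // 5. on shutdown, before joining the thread

Note the ordering: registration_complete() is only signalled after the solver thread has been started, so branch and bound never begins waiting before the producer set is final.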
diff --git a/cpp/src/mip/local_search/local_search.cuh b/cpp/src/mip/local_search/local_search.cuh
index 6fdf4ac72..d05c2a834 100644
--- a/cpp/src/mip/local_search/local_search.cuh
+++ b/cpp/src/mip/local_search/local_search.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -21,6 +21,11 @@
 #include
 #include

+namespace cuopt::linear_programming::dual_simplex {
+template
+class branch_and_bound_t;
+}
+
 namespace cuopt::linear_programming::detail {

 // make sure RANDOM is always the last
@@ -87,6 +92,10 @@ class local_search_t {
                  const std::string& source);

   i_t ls_threads() const { return ls_cpu_fj.size() + scratch_cpu_fj.size(); }
+
+  // Start CPUFJ thread for deterministic mode with B&B integration
+  void start_cpufj_deterministic(dual_simplex::branch_and_bound_t& bb);
+  void stop_cpufj_deterministic();
   void save_solution_and_add_cutting_plane(solution_t& solution,
                                            rmm::device_uvector& best_solution,
                                            f_t& best_objective);
@@ -120,6 +129,7 @@ class local_search_t {
   std::array, 8> ls_cpu_fj;
   std::array, 1> scratch_cpu_fj;
   cpu_fj_thread_t scratch_cpu_fj_on_lp_opt;
+  cpu_fj_thread_t deterministic_cpu_fj;
   problem_t problem_with_objective_cut;
   bool cutting_plane_added_for_active_run{false};
 };
diff --git a/cpp/src/mip/problem/problem.cu b/cpp/src/mip/problem/problem.cu
index 0a630628b..4fa6983df 100644
--- a/cpp/src/mip/problem/problem.cu
+++ b/cpp/src/mip/problem/problem.cu
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -92,6 +93,7 @@ void problem_t::op_problem_cstr_body(const optimization_problem_t(*this, combined_bounds);
@@ -100,11 +102,13 @@ void problem_t::op_problem_cstr_body(const optimization_problem_t
 problem_t::problem_t(
   const optimization_problem_t& problem_,
-  const typename mip_solver_settings_t::tolerances_t tolerances_)
+  const typename mip_solver_settings_t::tolerances_t tolerances_,
+  bool deterministic_)
   : original_problem_ptr(&problem_),
     handle_ptr(problem_.get_handle_ptr()),
     integer_fixed_variable_map(problem_.get_n_variables(), problem_.get_handle_ptr()->get_stream()),
     tolerances(tolerances_),
+    deterministic(deterministic_),
     n_variables(problem_.get_n_variables()),
     n_constraints(problem_.get_n_constraints()),
     n_binary_vars(0),
@@ -152,6 +156,7 @@ template
 problem_t::problem_t(const problem_t& problem_)
   : original_problem_ptr(problem_.original_problem_ptr),
     tolerances(problem_.tolerances),
+    deterministic(problem_.deterministic),
     handle_ptr(problem_.handle_ptr),
     integer_fixed_problem(problem_.integer_fixed_problem),
     integer_fixed_variable_map(problem_.integer_fixed_variable_map, handle_ptr->get_stream()),
@@ -207,6 +212,7 @@ problem_t::problem_t(const problem_t& problem_, const raft::handle_t* handle_ptr_)
   : original_problem_ptr(problem_.original_problem_ptr),
     tolerances(problem_.tolerances),
+    deterministic(problem_.deterministic),
     handle_ptr(handle_ptr_),
     integer_fixed_problem(problem_.integer_fixed_problem),
     integer_fixed_variable_map(problem_.integer_fixed_variable_map, handle_ptr->get_stream()),
@@ -261,6 +267,7 @@ template
 problem_t::problem_t(const problem_t& problem_, bool no_deep_copy)
   : original_problem_ptr(problem_.original_problem_ptr),
     tolerances(problem_.tolerances),
+    deterministic(problem_.deterministic),
     handle_ptr(problem_.handle_ptr),
     integer_fixed_problem(problem_.integer_fixed_problem),
     integer_fixed_variable_map(problem_.n_variables, handle_ptr->get_stream()),
@@ -465,6 +472,7 @@ template
 void problem_t::compute_transpose_of_problem()
 {
   raft::common::nvtx::range fun_scope("compute_transpose_of_problem");
+  csrsort_cusparse(coefficients, variables, offsets, n_constraints, n_variables, handle_ptr);
   RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode(
     handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
   RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode(
@@ -795,6 +803,55 @@ void problem_t::recompute_auxilliary_data(bool check_representation)
   if (check_representation) cuopt_func_call(check_problem_representation(true));
 }

+template
+void problem_t::compute_auxiliary_data()
+{
+  raft::common::nvtx::range fun_scope("compute_auxiliary_data");
+
+  // Compute sparsity: nnz / (n_rows * n_cols)
+  sparsity = (n_constraints > 0 && n_variables > 0)
+               ? static_cast(nnz) / (static_cast(n_constraints) * n_variables)
+               : 0.0;
+
+  // Compute stddev of non-zeros per row (on device)
+  nnz_stddev     = 0.0;
+  unbalancedness = 0.0;
+  if (offsets.size() == static_cast(n_constraints + 1) && n_constraints > 0) {
+    // First: compute nnz per row on device
+    rmm::device_uvector d_nnz_per_row(n_constraints, handle_ptr->get_stream());
+    thrust::transform(handle_ptr->get_thrust_policy(),
+                      offsets.begin() + 1,
+                      offsets.begin() + n_constraints + 1,
+                      offsets.begin(),
+                      d_nnz_per_row.begin(),
+                      thrust::minus());
+
+    // Compute mean
+    double sum = thrust::reduce(handle_ptr->get_thrust_policy(),
+                                d_nnz_per_row.begin(),
+                                d_nnz_per_row.end(),
+                                0.0,
+                                thrust::plus());
+    double mean = sum / n_constraints;
+
+    // Compute variance
+    double variance = thrust::transform_reduce(
+                        handle_ptr->get_thrust_policy(),
+                        d_nnz_per_row.begin(),
+                        d_nnz_per_row.end(),
+                        [mean] __device__(i_t x) -> double {
+                          double diff = static_cast(x) - mean;
+                          return diff * diff;
+                        },
+                        0.0,
+                        thrust::plus()) /
+                      n_constraints;
+
+    nnz_stddev     = std::sqrt(variance);
+    unbalancedness = nnz_stddev / mean;
+  }
+}
+
 template
 void problem_t::compute_n_integer_vars()
 {
@@ -883,6 +940,9 @@ void problem_t::compute_related_variables(double time_limit)

   handle_ptr->sync_stream();

+  // In deterministic mode, ignore the wall-clock budget here: a machine-speed
+  // dependent cutoff would break run-to-run reproducibility
+  if (deterministic) { time_limit = std::numeric_limits::infinity(); }
+
   // previously used constants were based on 40GB of memory.
   // Scale accordingly on smaller GPUs.
   // We can't rely on querying free memory or allocation try/catch
   // since this would break determinism guarantees (GPU may be shared by other processes)
@@ -1454,6 +1514,7 @@ problem_t problem_t::get_problem_after_fixing_vars(
   cuopt_assert(n_variables == assignment.size(), "Assignment size issue");
   problem_t problem(*this, true);
   CUOPT_LOG_DEBUG("Fixing %d variables", variables_to_fix.size());
+  CUOPT_LOG_DEBUG("Model fingerprint before fixing: 0x%x", get_fingerprint());
   // we will gather from this and scatter back to the original problem
   variable_map.resize(assignment.size() - variables_to_fix.size(), handle_ptr->get_stream());
   // compute variable map to recover the assignment later
@@ -1473,6 +1534,9 @@ problem_t problem_t::get_problem_after_fixing_vars(
   RAFT_CHECK_CUDA(handle_ptr->get_stream());
   cuopt_assert(result_end - variable_map.data() == variable_map.size(),
               "Size issue in set_difference");
+  CUOPT_LOG_DEBUG("Fixing assignment hash 0x%x, vars to fix: 0x%x",
+                  detail::compute_hash(assignment, handle_ptr->get_stream()),
+                  detail::compute_hash(variables_to_fix, handle_ptr->get_stream()));
   problem.fix_given_variables(*this, assignment, variables_to_fix, handle_ptr);
   RAFT_CHECK_CUDA(handle_ptr->get_stream());
   problem.remove_given_variables(*this, assignment, variable_map, handle_ptr);
@@ -1493,11 +1557,11 @@ problem_t problem_t::get_problem_after_fixing_vars(
   auto end_time = std::chrono::high_resolution_clock::now();
   double time_taken =
     std::chrono::duration_cast(end_time - start_time).count();
-  static double total_time_taken = 0.;
-  static int total_calls = 0;
+  [[maybe_unused]] static double total_time_taken = 0.;
+  [[maybe_unused]] static int total_calls = 0;
   total_time_taken += time_taken;
   total_calls++;
-  CUOPT_LOG_DEBUG(
+  CUOPT_LOG_TRACE(
     "Time taken to fix variables: %f milliseconds, average: %f milliseconds total time: %f",
     time_taken,
     total_time_taken / total_calls,
@@ -1505,7 +1569,7 @@ problem_t problem_t::get_problem_after_fixing_vars(
   // if the fixing is greater than 150, mark this as expensive.
   // this way we can avoid frequent fixings for this problem
   constexpr double expensive_time_threshold = 150;
-  if (time_taken > expensive_time_threshold) { expensive_to_fix_vars = true; }
+  if (time_taken > expensive_time_threshold && !deterministic) { expensive_to_fix_vars = true; }
   return problem;
 }
@@ -1577,6 +1641,7 @@ void problem_t::remove_given_variables(problem_t& original_p
   coefficients.resize(nnz, handle_ptr->get_stream());
   variables.resize(nnz, handle_ptr->get_stream());
   compute_transpose_of_problem();
+  compute_auxiliary_data();
   combine_constraint_bounds(*this, combined_bounds);
   handle_ptr->sync_stream();
   recompute_auxilliary_data();
@@ -1802,6 +1867,7 @@ void problem_t::preprocess_problem()
   standardize_bounds(variable_constraint_map, *this);
   compute_csr(variable_constraint_map, *this);
   compute_transpose_of_problem();
+  compute_auxiliary_data();
   cuopt_func_call(check_problem_representation(true, false));
   presolve_data.initialize_var_mapping(*this, handle_ptr);
   integer_indices.resize(n_variables, handle_ptr->get_stream());
@@ -1869,9 +1935,9 @@ void problem_t::get_host_user_problem(
   user_problem.objective = cuopt::host_copy(objective_coefficients, stream);

   dual_simplex::csr_matrix_t csr_A(m, n, nz);
-  csr_A.x = cuopt::host_copy(coefficients, stream);
-  csr_A.j = cuopt::host_copy(variables, stream);
-  csr_A.row_start = cuopt::host_copy(offsets, stream);
+  csr_A.x         = ins_vector(cuopt::host_copy(coefficients, stream));
+  csr_A.j         = ins_vector(cuopt::host_copy(variables, stream));
+  csr_A.row_start = ins_vector(cuopt::host_copy(offsets, stream));

   csr_A.to_compressed_col(user_problem.A);
@@ -1963,6 +2029,44 @@ f_t problem_t::get_user_obj_from_solver_obj(f_t solver_obj) const
   return presolve_data.objective_scaling_factor * (solver_obj + presolve_data.objective_offset);
 }

+template
+uint32_t problem_t::get_fingerprint() const
+{
+  // CSR representation should be unique and sorted at this point
+  auto stream = handle_ptr->get_stream();
+
+  uint32_t h_coeff      = detail::compute_hash(coefficients, stream);
+  uint32_t h_vars       = detail::compute_hash(variables, stream);
+  uint32_t h_offsets    = detail::compute_hash(offsets, stream);
+  uint32_t h_rev_coeff  = detail::compute_hash(reverse_coefficients, stream);
+  uint32_t h_rev_off    = detail::compute_hash(reverse_offsets, stream);
+  uint32_t h_rev_constr = detail::compute_hash(reverse_constraints, stream);
+  uint32_t h_obj        = detail::compute_hash(objective_coefficients, stream);
+  uint32_t h_varbounds  = detail::compute_hash(variable_bounds, stream);
+  uint32_t h_clb        = detail::compute_hash(constraint_lower_bounds, stream);
+  uint32_t h_cub        = detail::compute_hash(constraint_upper_bounds, stream);
+  uint32_t h_vartypes   = detail::compute_hash(variable_types, stream);
+  uint32_t h_obj_off    = detail::compute_hash(presolve_data.objective_offset);
+  uint32_t h_obj_scale  = detail::compute_hash(presolve_data.objective_scaling_factor);
+
+  std::vector hashes = {
+    h_coeff,
+    h_vars,
+    h_offsets,
+    h_rev_coeff,
+    h_rev_off,
+    h_rev_constr,
+    h_obj,
+    h_varbounds,
+    h_clb,
+    h_cub,
+    h_vartypes,
+    h_obj_off,
+    h_obj_scale,
+  };
+  return detail::compute_hash(hashes);
+}
+
 template
 void problem_t::compute_vars_with_objective_coeffs()
 {
@@ -1999,6 +2103,7 @@ void problem_t::add_cutting_plane_at_objective(f_t objective)
                   objective);
   insert_constraints(h_constraints);
   compute_transpose_of_problem();
+  compute_auxiliary_data();
   cuopt_func_call(check_problem_representation(true));
 }
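As a quick sanity check on compute_auxiliary_data() above: for a 2x4 matrix whose rows hold 2 and 4 nonzeros, sparsity is 6/8 = 0.75, the mean row count is 3, the population standard deviation is 1, and unbalancedness (the coefficient of variation) is 1/3. An illustrative host-side mirror of the device code (not part of the patch):

#include <cmath>
#include <vector>

struct aux_stats {
  double sparsity, nnz_stddev, unbalancedness;
};

inline aux_stats compute_aux(const std::vector<int>& row_offsets, int n_cols)
{
  const int m   = static_cast<int>(row_offsets.size()) - 1;
  const int nnz = row_offsets.back() - row_offsets.front();
  double mean = static_cast<double>(nnz) / m, var = 0.0;
  for (int i = 0; i < m; ++i) {
    double d = (row_offsets[i + 1] - row_offsets[i]) - mean;
    var += d * d;
  }
  var /= m;  // population variance, matching the thrust::transform_reduce above
  double sd = std::sqrt(var);
  return {static_cast<double>(nnz) / (static_cast<double>(m) * n_cols), sd, sd / mean};
}
// compute_aux({0, 2, 6}, 4) -> {0.75, 1.0, 1.0 / 3.0}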
diff --git a/cpp/src/mip/problem/problem.cuh b/cpp/src/mip/problem/problem.cuh
index 6cbd5e5a5..a93f28793 100644
--- a/cpp/src/mip/problem/problem.cuh
+++ b/cpp/src/mip/problem/problem.cuh
@@ -51,7 +51,8 @@ template
 class problem_t {
  public:
   problem_t(const optimization_problem_t& problem,
-            const typename mip_solver_settings_t::tolerances_t tolerances_ = {});
+            const typename mip_solver_settings_t::tolerances_t tolerances_ = {},
+            bool deterministic = false);
   problem_t() = delete;
   // copy constructor
   problem_t(const problem_t& problem);
@@ -75,6 +76,7 @@ class problem_t {
   void check_problem_representation(bool check_transposed = false,
                                     bool check_mip_related_data = true);
   void recompute_auxilliary_data(bool check_representation = true);
+  void compute_auxiliary_data();
   void compute_n_integer_vars();
   void compute_binary_var_table();
   void compute_related_variables(double time_limit);
@@ -119,6 +121,8 @@ class problem_t {
   void get_host_user_problem(
     cuopt::linear_programming::dual_simplex::user_problem_t& user_problem) const;

+  uint32_t get_fingerprint() const;
+
   void add_cutting_plane_at_objective(f_t objective);
   void compute_vars_with_objective_coeffs();
   void test_problem_fixing_time();
@@ -242,6 +246,12 @@ class problem_t {
   bool maximize{false};
   bool is_binary_pb{false};
   bool empty{false};
+  bool deterministic{false};
+
+  // Auxiliary problem statistics
+  double sparsity{0.0};
+  double nnz_stddev{0.0};
+  double unbalancedness{0.0};

   presolve_data_t presolve_data;
diff --git a/cpp/src/mip/problem/problem_helpers.cuh b/cpp/src/mip/problem/problem_helpers.cuh
index eadc7e309..fa8a5000d 100644
--- a/cpp/src/mip/problem/problem_helpers.cuh
+++ b/cpp/src/mip/problem/problem_helpers.cuh
@@ -356,6 +356,9 @@ static void csrsort_cusparse(rmm::device_uvector& values,
                              i_t cols,
                              const raft::handle_t* handle_ptr)
 {
+  // skip if the matrix is empty
+  if (values.size() == 0) { return; }
+
   auto stream = offsets.stream();
   cusparseHandle_t handle;
   cusparseCreate(&handle);
diff --git a/cpp/src/mip/solve.cu b/cpp/src/mip/solve.cu
index ee852fb29..8822082fc 100644
--- a/cpp/src/mip/solve.cu
+++ b/cpp/src/mip/solve.cu
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
@@ -54,7 +54,7 @@ static void init_handler(const raft::handle_t* handle_ptr)
 template
 mip_solution_t run_mip(detail::problem_t& problem,
                        mip_solver_settings_t const& settings,
-                       cuopt::timer_t& timer)
+                       timer_t& timer)
 {
   raft::common::nvtx::range fun_scope("run_mip");
   auto constexpr const running_mip = true;
@@ -121,6 +121,7 @@ mip_solution_t run_mip(detail::problem_t& problem,
   CUOPT_LOG_INFO("Objective offset %f scaling_factor %f",
                  problem.presolve_data.objective_offset,
                  problem.presolve_data.objective_scaling_factor);
+  CUOPT_LOG_INFO("Model fingerprint: 0x%x", problem.get_fingerprint());
   cuopt_assert(problem.original_problem_ptr->get_n_variables() == scaled_problem.n_variables,
               "Size mismatch");
   cuopt_assert(problem.original_problem_ptr->get_n_constraints() == scaled_problem.n_constraints,
@@ -166,7 +167,10 @@ mip_solution_t run_mip(detail::problem_t& problem,
   auto sol = scaled_sol.get_solution(
     is_feasible_before_scaling || is_feasible_after_unscaling, solver.get_solver_stats(), false);
-  detail::print_solution(scaled_problem.handle_ptr, sol.get_solution());
+
+  int hidesol =
+    std::getenv("CUOPT_MIP_HIDE_SOLUTION") ? atoi(std::getenv("CUOPT_MIP_HIDE_SOLUTION")) : 0;
+  if (!hidesol) { detail::print_solution(scaled_problem.handle_ptr, sol.get_solution()); }
   return sol;
 }
@@ -190,6 +194,9 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem,

   print_version_info();

+  // Initialize seed generator if a specific seed is requested
+  if (settings.seed >= 0) { cuopt::seed_generator::set_seed(settings.seed); }
+
   raft::common::nvtx::range fun_scope("Running solver");

   // This is required as user might forget to set some fields
@@ -211,11 +218,13 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem,
                         op_problem.get_handle_ptr()->get_stream());
   }

-  auto timer = cuopt::timer_t(time_limit);
+  auto timer           = timer_t(time_limit);
+  double presolve_time = 0.0;
   std::unique_ptr> presolver;
   std::optional> presolve_result;
-  detail::problem_t problem(op_problem, settings.get_tolerances());
+  detail::problem_t problem(
+    op_problem, settings.get_tolerances(), settings.determinism_mode == CUOPT_MODE_DETERMINISTIC);

   auto run_presolve = settings.presolve;
   run_presolve      = run_presolve && settings.initial_solutions.size() == 0;
@@ -239,7 +248,10 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem,
     detail::sort_csr(op_problem);
     // allocate not more than 10% of the time limit to presolve.
     // Note that this is not the presolve time, but the time limit for presolve.
-    const double presolve_time_limit = std::min(0.1 * time_limit, 60.0);
+    double presolve_time_limit = std::min(0.1 * time_limit, 60.0);
+    if (settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) {
+      presolve_time_limit = std::numeric_limits::infinity();
+    }
     presolver = std::make_unique>();
     auto result = presolver->apply(op_problem,
                                    cuopt::linear_programming::problem_category_t::MIP,
@@ -277,6 +289,7 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem,
   if (run_presolve) {
     auto status_to_skip =
       sol.get_termination_status() == mip_termination_status_t::TimeLimit ||
+      sol.get_termination_status() == mip_termination_status_t::WorkLimit ||
      sol.get_termination_status() == mip_termination_status_t::Infeasible;
     auto primal_solution =
       cuopt::device_copy(sol.get_solution(), op_problem.get_handle_ptr()->get_stream());
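From the caller's perspective, the deterministic path is opt-in through the settings fields this patch reads in solve_mip() and mip_solver_t::run_solver(). A configuration sketch (the settings type is templated in the real API with the same parameters elided throughout this diff; op_problem is assumed to be an already-built optimization problem, and the values are arbitrary):

// Illustrative configuration; field and constant names are from this patch.
cuopt::linear_programming::mip_solver_settings_t settings;
settings.determinism_mode = CUOPT_MODE_DETERMINISTIC;  // run-to-run reproducibility
settings.work_limit       = 1000.0;  // stop on deterministic work units, not wall clock
settings.seed             = 42;      // forwarded to cuopt::seed_generator above
auto solution = cuopt::linear_programming::solve_mip(op_problem, settings);

With CUOPT_MODE_DETERMINISTIC set, the code above also lifts the wall-clock cap on presolve, since a machine-speed-dependent cutoff would defeat reproducibility.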
diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu
index 8583a76f8..ee1d9c2e5 100644
--- a/cpp/src/mip/solver.cu
+++ b/cpp/src/mip/solver.cu
@@ -21,6 +21,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -108,7 +109,10 @@ solution_t mip_solver_t::run_solver()
   }
   dm.timer = timer_;
   const bool run_presolve = context.settings.presolve;
-  bool presolve_success   = run_presolve ? dm.run_presolve(timer_.remaining_time()) : true;
+  f_t time_limit = context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC
+                     ? std::numeric_limits::infinity()
+                     : timer_.remaining_time();
+  bool presolve_success = run_presolve ? dm.run_presolve(time_limit) : true;
   if (!presolve_success) {
     CUOPT_LOG_INFO("Problem proven infeasible in presolve");
     solution_t sol(*context.problem_ptr);
@@ -159,6 +163,7 @@ solution_t mip_solver_t::run_solver()
     context.problem_ptr->post_process_solution(sol);
     return sol;
   }
+  context.work_unit_scheduler_.register_context(context.gpu_heur_loop);

   namespace dual_simplex = cuopt::linear_programming::dual_simplex;
   std::future branch_and_bound_status_future;
@@ -168,7 +173,8 @@ solution_t mip_solver_t::run_solver()
   branch_and_bound_solution_helper_t solution_helper(&dm, branch_and_bound_settings);
   dual_simplex::mip_solution_t branch_and_bound_solution(1);
-  if (!context.settings.heuristics_only) {
+  bool run_bb = !context.settings.heuristics_only;
+  if (run_bb) {
     // Convert the presolved problem to dual_simplex::user_problem_t
     op_problem_.get_host_user_problem(branch_and_bound_problem);
     // Resize the solution now that we know the number of columns/variables
@@ -184,6 +190,14 @@ solution_t mip_solver_t::run_solver()
     branch_and_bound_settings.reliability_branching = solver_settings_.reliability_branching;
     branch_and_bound_settings.max_cut_passes        = context.settings.max_cut_passes;
     branch_and_bound_settings.mir_cuts              = context.settings.mir_cuts;
+    branch_and_bound_settings.deterministic =
+      context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC;
+
+    if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) {
+      branch_and_bound_settings.work_limit = context.settings.work_limit;
+    } else {
+      branch_and_bound_settings.work_limit = std::numeric_limits::infinity();
+    }
     branch_and_bound_settings.mixed_integer_gomory_cuts =
       context.settings.mixed_integer_gomory_cuts;
     branch_and_bound_settings.knapsack_cuts = context.settings.knapsack_cuts;
@@ -208,36 +222,52 @@ solution_t mip_solver_t::run_solver()
                 &solution_helper,
                 std::placeholders::_1,
                 std::placeholders::_2);
+    // heuristic_preemption_callback is needed in both modes to properly stop the heuristic thread
     branch_and_bound_settings.heuristic_preemption_callback = std::bind(
       &branch_and_bound_solution_helper_t::preempt_heuristic_solver, &solution_helper);
-
-    branch_and_bound_settings.set_simplex_solution_callback =
-      std::bind(&branch_and_bound_solution_helper_t::set_simplex_solution,
-                &solution_helper,
-                std::placeholders::_1,
-                std::placeholders::_2,
-                std::placeholders::_3);
-
-    branch_and_bound_settings.node_processed_callback =
-      std::bind(&branch_and_bound_solution_helper_t::node_processed_callback,
-                &solution_helper,
-                std::placeholders::_1,
-                std::placeholders::_2);
+    if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) {
+      branch_and_bound_settings.set_simplex_solution_callback =
+        std::bind(&branch_and_bound_solution_helper_t::set_simplex_solution,
+                  &solution_helper,
+                  std::placeholders::_1,
+                  std::placeholders::_2,
+                  std::placeholders::_3);
+
+      branch_and_bound_settings.node_processed_callback =
+        std::bind(&branch_and_bound_solution_helper_t::node_processed_callback,
+                  &solution_helper,
+                  std::placeholders::_1,
+                  std::placeholders::_2);
+    }

     // Create the branch and bound object
     branch_and_bound = std::make_unique>(
       branch_and_bound_problem, branch_and_bound_settings, timer_.get_tic_start());
     context.branch_and_bound_ptr = branch_and_bound.get();
-    branch_and_bound->set_concurrent_lp_root_solve(true);
-    auto* stats_ptr = &context.stats;
+    auto* stats_ptr              = &context.stats;
     branch_and_bound->set_user_bound_callback(
       [stats_ptr](f_t user_bound) { stats_ptr->set_solution_bound(user_bound); });
     // Set the primal heuristics -> branch and bound callback
-    context.problem_ptr->branch_and_bound_callback =
-      std::bind(&dual_simplex::branch_and_bound_t::set_new_solution,
-                branch_and_bound.get(),
-                std::placeholders::_1);
+    if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) {
+      branch_and_bound->set_concurrent_lp_root_solve(true);
+
+      context.problem_ptr->branch_and_bound_callback =
+        std::bind(&dual_simplex::branch_and_bound_t::set_new_solution,
+                  branch_and_bound.get(),
+                  std::placeholders::_1);
+    } else if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) {
+      branch_and_bound->set_concurrent_lp_root_solve(false);
+      // TODO once deterministic GPU heuristics are integrated
+      // context.problem_ptr->branch_and_bound_callback =
+      //   [bb = branch_and_bound.get()](const std::vector& solution) {
+      //     bb->queue_external_solution_deterministic(solution, 0.0);
+      //   };
+    }
+
+    context.work_unit_scheduler_.register_context(branch_and_bound->get_work_unit_context());
+    // context.work_unit_scheduler_.verbose = true;
+
     context.problem_ptr->set_root_relaxation_solution_callback =
       std::bind(&dual_simplex::branch_and_bound_t::set_root_relaxation_solution,
                 branch_and_bound.get(),
@@ -261,7 +291,7 @@ solution_t mip_solver_t::run_solver()
   // Start the primal heuristics
   context.diversity_manager_ptr = &dm;
   auto sol                      = dm.run_solver();
-  if (!context.settings.heuristics_only) {
+  if (run_bb) {
     // Wait for the branch and bound to finish
     auto bb_status = branch_and_bound_status_future.get();
     if (branch_and_bound_solution.lower_bound > -std::numeric_limits::infinity()) {
diff --git a/cpp/src/mip/solver_context.cuh b/cpp/src/mip/solver_context.cuh
index 293a36785..c6b6a6ec0 100644
--- a/cpp/src/mip/solver_context.cuh
+++ b/cpp/src/mip/solver_context.cuh
@@ -10,6 +10,8 @@
 #include
 #include
 #include
+#include
+#include

 #include
@@ -39,8 +41,12 @@ struct mip_solver_context_t {
     cuopt_assert(problem_ptr != nullptr, "problem_ptr is nullptr");
     stats.set_solution_bound(problem_ptr->maximize ? std::numeric_limits::infinity()
                                                    : -std::numeric_limits::infinity());
+    gpu_heur_loop.deterministic = settings.determinism_mode == CUOPT_MODE_DETERMINISTIC;
   }

+  mip_solver_context_t(const mip_solver_context_t&) = delete;
+  mip_solver_context_t& operator=(const mip_solver_context_t&) = delete;
+
   raft::handle_t const* const handle_ptr;
   problem_t* problem_ptr;
   dual_simplex::branch_and_bound_t* branch_and_bound_ptr{nullptr};
@@ -49,6 +55,12 @@ struct mip_solver_context_t {
   const mip_solver_settings_t settings;
   pdlp_initial_scaling_strategy_t& scaling;
   solver_stats_t stats;
+  // Work limit context for tracking work units in deterministic mode (shared across all timers in
+  // GPU heuristic loop)
+  work_limit_context_t gpu_heur_loop{"GPUHeur"};
+
+  // synchronization every 5 seconds for deterministic mode
+  work_unit_scheduler_t work_unit_scheduler_{5.0};
 };

 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip/solver_solution.cu b/cpp/src/mip/solver_solution.cu
index af3947d69..37e58aec9 100644
--- a/cpp/src/mip/solver_solution.cu
+++ b/cpp/src/mip/solver_solution.cu
@@ -136,6 +136,7 @@ std::string mip_solution_t::get_termination_status_string(
     case mip_termination_status_t::FeasibleFound: return "FeasibleFound";
     case mip_termination_status_t::Infeasible: return "Infeasible";
     case mip_termination_status_t::TimeLimit: return "TimeLimit";
+    case mip_termination_status_t::WorkLimit: return "WorkLimit";
     case mip_termination_status_t::Unbounded: return "Unbounded";
       // Do not implement default case to trigger compile time error if new enum is added
diff --git a/cpp/src/mip/utilities/cpu_worker_thread.cuh b/cpp/src/mip/utilities/cpu_worker_thread.cuh
index 60bd5685b..2b982e1f4 100644
--- a/cpp/src/mip/utilities/cpu_worker_thread.cuh
+++ b/cpp/src/mip/utilities/cpu_worker_thread.cuh
@@ -18,9 +18,11 @@
 #pragma once

 #include
+#include
 #include
 #include
 #include
+#include

 namespace cuopt::linear_programming::detail {
@@ -132,8 +134,12 @@ void cpu_worker_thread_base_t::start_cpu_solver()
 template
 bool cpu_worker_thread_base_t::wait_for_cpu_solver()
 {
+  auto wait_start = std::chrono::high_resolution_clock::now();
   std::unique_lock lock(cpu_mutex);
   cpu_cv.wait(lock, [this] { return cpu_thread_done || cpu_thread_terminate; });
+  auto wait_end    = std::chrono::high_resolution_clock::now();
+  double wait_time = std::chrono::duration(wait_end - wait_start).count();
+  if (wait_time > 1.0) { CUOPT_LOG_DEBUG("CPU thread wait time: %.2f seconds", wait_time); }
   return static_cast(this)->get_result();
 }
diff --git a/cpp/src/mip/utils.cuh b/cpp/src/mip/utils.cuh
index 69707ee1f..41ba8fa9e 100644
--- a/cpp/src/mip/utils.cuh
+++ b/cpp/src/mip/utils.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include

 #include
@@ -29,6 +30,22 @@ constexpr int default_int_upper = std::numeric_limits::max();
 constexpr int default_int_lower = std::numeric_limits::min();
 constexpr double zero_bound     = 0.;

+template
+inline uint32_t compute_hash(raft::device_span values, rmm::cuda_stream_view stream)
+{
+  auto h_contents = cuopt::host_copy(values, stream);
+  RAFT_CHECK_CUDA(stream);
+  return compute_hash(h_contents);
+}
+
+template
+inline uint32_t compute_hash(const rmm::device_uvector& values, rmm::cuda_stream_view stream)
+{
+  auto h_contents = cuopt::host_copy(values, stream);
+  RAFT_CHECK_CUDA(stream);
+  return compute_hash(h_contents);
+}
+
 template
 HDI f_t get_cstr_tolerance(f_t combined_bound, f_t abs_tol, f_t rel_tol)
 {
diff --git a/cpp/src/utilities/hashing.hpp b/cpp/src/utilities/hashing.hpp
new file mode 100644
index 000000000..9e8dc09e1
--- /dev/null
+++ b/cpp/src/utilities/hashing.hpp
@@ -0,0 +1,46 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace cuopt::linear_programming::detail {
+
+template
+inline uint32_t compute_hash(const std::vector& h_contents)
+{
+  // FNV-1a hash
+
+  uint32_t hash = 2166136261u;  // FNV-1a 32-bit offset basis
+  std::vector byte_contents(h_contents.size() * sizeof(i_t));
+  std::memcpy(byte_contents.data(), h_contents.data(), h_contents.size() * sizeof(i_t));
+  for (size_t i = 0; i < byte_contents.size(); ++i) {
+    hash ^= byte_contents[i];
+    hash *= 16777619u;
+  }
+  return hash;
+}
+
+template
+#if defined(__CUDACC__)
+__host__ __device__
+#endif
+  inline uint32_t
+  compute_hash(const i_t val)
+{
+  uint32_t hash = 2166136261u;
+  uint8_t byte_contents[sizeof(i_t)];
+  std::memcpy(byte_contents, &val, sizeof(i_t));
+  for (size_t i = 0; i < sizeof(i_t); ++i) {
+    hash ^= byte_contents[i];
+    hash *= 16777619u;
+  }
+  return hash;
+}
+
+}  // namespace cuopt::linear_programming::detail
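One behavioral note on the helper above: FNV-1a folds input bytes one at a time into a running 32-bit state, so hashing an empty vector yields the offset basis unchanged, and the result depends on byte order — which is what lets get_fingerprint() in problem.cu detect any reordering of the model arrays it combines. Illustrative check (not part of the patch):

#include <cassert>
#include <vector>

void fnv1a_sketch()
{
  using cuopt::linear_programming::detail::compute_hash;
  // No bytes folded in: the result is the FNV-1a 32-bit offset basis.
  assert(compute_hash(std::vector<int>{}) == 2166136261u);
  // The multiplier 16777619 is the 32-bit FNV prime; because XOR-then-multiply
  // is applied per byte, permuting the input bytes generally changes the hash.
}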
+ * + * Usage: + * - Define CUOPT_ENABLE_MEMORY_INSTRUMENTATION to enable tracking + * - When undefined, all instrumentation becomes zero-overhead passthrough + * (record_*() calls inline away, no counter storage overhead) + * + * Example: + * ins_vector vec; // Instrumented std::vector + * vec.push_back(42); + * auto val = vec[0]; + * // When enabled: tracking occurs, counters accumulate + * // When disabled: direct passthrough, compiler optimizes away all overhead + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#define CUOPT_ENABLE_MEMORY_INSTRUMENTATION 1 + +#ifdef __NVCC__ +#define HDI inline __host__ __device__ +#else +#define HDI inline +#endif + +namespace cuopt { + +// Define CUOPT_ENABLE_MEMORY_INSTRUMENTATION to 1 to enable memory tracking +// When 0, instrumentation becomes a zero-overhead passthrough + +// Base class for memory operation instrumentation +struct memory_instrumentation_base_t { +#if CUOPT_ENABLE_MEMORY_INSTRUMENTATION + HDI void reset_counters() const { byte_loads = byte_stores = 0; } + + template + HDI void record_load() const + { + byte_loads += sizeof(T); + } + + template + HDI void record_store() const + { + byte_stores += sizeof(T); + } + + template + HDI void record_rmw() const + { + byte_loads += sizeof(T); + byte_stores += sizeof(T); + } + + mutable size_t byte_loads{0}; + mutable size_t byte_stores{0}; +#else + // No-op methods when instrumentation is disabled - these inline away to zero overhead + HDI void reset_counters() const {} + template + HDI void record_load() const + { + } + template + HDI void record_store() const + { + } + template + HDI void record_rmw() const + { + } +#endif // CUOPT_ENABLE_MEMORY_INSTRUMENTATION +}; + +#if CUOPT_ENABLE_MEMORY_INSTRUMENTATION + +// aggregator class to collect statistics from multiple instrumented objects +class instrumentation_aggregator_t { + public: + instrumentation_aggregator_t() = default; + + // Construct with initializer list of (description, instrumented object) pairs + instrumentation_aggregator_t( + std::initializer_list< + std::pair>> + instrumented) + { + for (const auto& [name, instr] : instrumented) { + instrumented_.insert_or_assign(name, instr); + } + } + + // Add an instrumented object to track with a description + void add(const std::string& description, const memory_instrumentation_base_t& instrumented) + { + instrumented_.insert_or_assign(description, std::cref(instrumented)); + } + + // Collect total loads and stores across all instrumented objects + std::pair collect() + { + size_t total_loads = 0; + size_t total_stores = 0; + + for (auto& [name, instr] : instrumented_) { + total_loads += instr.get().byte_loads; + total_stores += instr.get().byte_stores; + } + + return {total_loads, total_stores}; + } + + // Collect per-wrapper statistics + std::vector> collect_per_wrapper() + { + std::vector> results; + results.reserve(instrumented_.size()); + + for (auto& [name, instr] : instrumented_) { + results.emplace_back(name, instr.get().byte_loads, instr.get().byte_stores); + } + + return results; + } + + // Collect total loads and stores, then flush counters + std::pair collect_and_flush() + { + auto result = collect(); + flush(); + return result; + } + + void flush() + { + for (auto& [name, instr] : instrumented_) { + instr.get().reset_counters(); + } + } + + private: + std::unordered_map> + instrumented_; +}; + +#else + +// No-op aggregator when instrumentation is disabled +class instrumentation_aggregator_t { + public: + 
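  // Mirrors the instrumented aggregator's interface one-for-one so call sites compile unchanged. +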
instrumentation_aggregator_t() = default; + instrumentation_aggregator_t( + std::initializer_list< + std::pair>>) + { + } + void add(const std::string&, const memory_instrumentation_base_t&) {} + std::pair collect() { return {0, 0}; } + std::vector> collect_per_wrapper() { return {}; } + std::pair collect_and_flush() { return {0, 0}; } + void flush() {} +}; + +#endif // CUOPT_ENABLE_MEMORY_INSTRUMENTATION + +// Helper traits to detect container capabilities +namespace type_traits_utils { + +template +struct has_reserve : std::false_type {}; + +template +struct has_reserve().reserve(size_t{}))>> : std::true_type { +}; + +template +struct has_capacity : std::false_type {}; + +template +struct has_capacity().capacity())>> : std::true_type {}; + +template +struct has_shrink_to_fit : std::false_type {}; + +template +struct has_shrink_to_fit().shrink_to_fit())>> + : std::true_type {}; + +template +struct has_push_back : std::false_type {}; + +template +struct has_push_back< + T, + std::void_t().push_back(std::declval()))>> + : std::true_type {}; + +template +struct has_emplace_back : std::false_type {}; + +template +struct has_emplace_back().emplace_back())>> + : std::true_type {}; + +template +struct has_pop_back : std::false_type {}; + +template +struct has_pop_back().pop_back())>> : std::true_type {}; + +template +struct has_data : std::false_type {}; + +template +struct has_data().data())>> : std::true_type {}; + +template +struct has_resize : std::false_type {}; + +template +struct has_resize().resize(size_t{}))>> : std::true_type {}; + +template +struct has_clear : std::false_type {}; + +template +struct has_clear().clear())>> : std::true_type {}; + +template +struct has_max_size : std::false_type {}; + +template +struct has_max_size().max_size())>> : std::true_type {}; + +template +struct has_front : std::false_type {}; + +template +struct has_front().front())>> : std::true_type {}; + +template +struct has_back : std::false_type {}; + +template +struct has_back().back())>> : std::true_type {}; + +} // namespace type_traits_utils + +#if CUOPT_ENABLE_MEMORY_INSTRUMENTATION + +// Memory operation instrumentation wrapper for container-like types +template +struct memop_instrumentation_wrapper_t : public memory_instrumentation_base_t { + // Standard container type traits + using value_type = std::remove_reference_t()[0])>; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + + static_assert(std::is_standard_layout_v, + "value_type must have standard layout for memory instrumentation"); + static constexpr size_t type_size = sizeof(value_type); + + // Proxy class to track reads and writes for a single element + class element_proxy_t { + public: + element_proxy_t(value_type& ref, memop_instrumentation_wrapper_t& wrapper) + : ref_(ref), wrapper_(wrapper) + { + } + + element_proxy_t& operator=(const value_type& value) + { + wrapper_.template record_store(); + ref_ = value; + return *this; + } + element_proxy_t& operator=(const element_proxy_t& other) + { + wrapper_.template record_store(); + other.wrapper_.template record_load(); + ref_ = other.ref_; + return *this; + } + + operator value_type() const + { + wrapper_.template record_load(); + return ref_; + } + + // // Allow implicit conversion to reference for functions expecting references + // operator value_type&() { return ref_; } + + // operator const value_type&() const 
{ return ref_; } + + // // Member access operator for structured types (e.g., type_2) + // value_type* operator->() { return &ref_; } + + // const value_type* operator->() const { return &ref_; } + + // Get underlying element reference (records a load) + value_type& get() + { + wrapper_.template record_load(); + return ref_; + } + + const value_type& get() const + { + wrapper_.template record_load(); + return ref_; + } + + element_proxy_t& operator+=(const value_type& value) + { + wrapper_.template record_rmw(); + ref_ += value; + return *this; + } + element_proxy_t& operator-=(const value_type& value) + { + wrapper_.template record_rmw(); + ref_ -= value; + return *this; + } + element_proxy_t& operator*=(const value_type& value) + { + wrapper_.template record_rmw(); + ref_ *= value; + return *this; + } + element_proxy_t& operator/=(const value_type& value) + { + wrapper_.template record_rmw(); + ref_ /= value; + return *this; + } + element_proxy_t& operator++() + { + wrapper_.template record_rmw(); + ++ref_; + return *this; + } + element_proxy_t& operator--() + { + wrapper_.template record_rmw(); + --ref_; + return *this; + } + + value_type operator++(int) + { + wrapper_.template record_rmw(); + return ref_++; + } + value_type operator--(int) + { + wrapper_.template record_rmw(); + return ref_--; + } + + value_type& ref_; + memop_instrumentation_wrapper_t& wrapper_; + }; + + // Instrumented iterator that tracks memory accesses + template + class instrumented_iterator_t { + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = memop_instrumentation_wrapper_t::value_type; + using difference_type = std::ptrdiff_t; + using pointer = std::conditional_t; + using reference = std::conditional_t; + using wrapper_ptr = std::conditional_t; + + instrumented_iterator_t(IterT iter, wrapper_ptr wrapper) : iter_(iter), wrapper_(wrapper) {} + + // Dereference - returns proxy for non-const, tracks load for const + auto operator*() const + { + if constexpr (IsConst) { +#ifdef CUOPT_ENABLE_MEMORY_INSTRUMENTATION + wrapper_->byte_loads += sizeof(value_type); +#endif + return *iter_; + } else { + return element_proxy_t(*iter_, *wrapper_); + } + } + + auto operator->() const { return &(*iter_); } + + instrumented_iterator_t& operator++() + { + ++iter_; + return *this; + } + + instrumented_iterator_t operator++(int) + { + auto tmp = *this; + ++iter_; + return tmp; + } + + instrumented_iterator_t& operator--() + { + --iter_; + return *this; + } + + instrumented_iterator_t operator--(int) + { + auto tmp = *this; + --iter_; + return tmp; + } + + instrumented_iterator_t& operator+=(difference_type n) + { + iter_ += n; + return *this; + } + + instrumented_iterator_t& operator-=(difference_type n) + { + iter_ -= n; + return *this; + } + + instrumented_iterator_t operator+(difference_type n) const + { + return instrumented_iterator_t(iter_ + n, wrapper_); + } + + instrumented_iterator_t operator-(difference_type n) const + { + return instrumented_iterator_t(iter_ - n, wrapper_); + } + + difference_type operator-(const instrumented_iterator_t& other) const + { + return iter_ - other.iter_; + } + + auto operator[](difference_type n) const { return *(*this + n); } + + bool operator==(const instrumented_iterator_t& other) const { return iter_ == other.iter_; } + bool operator!=(const instrumented_iterator_t& other) const { return iter_ != other.iter_; } + bool operator<(const instrumented_iterator_t& other) const { return iter_ < other.iter_; } + bool operator<=(const 
instrumented_iterator_t& other) const { return iter_ <= other.iter_; } + bool operator>(const instrumented_iterator_t& other) const { return iter_ > other.iter_; } + bool operator>=(const instrumented_iterator_t& other) const { return iter_ >= other.iter_; } + + IterT base() const { return iter_; } + + // Allow iterator_traits to access the underlying iterator + friend struct std::iterator_traits; + + private: + IterT iter_; + wrapper_ptr wrapper_; + }; + + // Iterator type definitions (must come after instrumented_iterator_t) + using iterator = instrumented_iterator_t().begin()), false>; + using const_iterator = instrumented_iterator_t().begin()), true>; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + + // Constructors + memop_instrumentation_wrapper_t() : array_() + { + if constexpr (type_traits_utils::has_data::value) { + data_ptr = array_.data(); + } else { + data_ptr = nullptr; + } + } + + // Copy/move from underlying type + memop_instrumentation_wrapper_t(const T& arr) : array_(arr) + { + if constexpr (type_traits_utils::has_data::value) { + data_ptr = const_cast(array_.data()); + } else { + data_ptr = nullptr; + } + } + memop_instrumentation_wrapper_t(T&& arr) : array_(std::move(arr)) + { + if constexpr (type_traits_utils::has_data::value) { + data_ptr = array_.data(); + } else { + data_ptr = nullptr; + } + } + + // Forwarding constructor for underlying container initialization + // Only enabled for types that aren't the wrapper itself or the underlying type + template , memop_instrumentation_wrapper_t> && + !std::is_same_v, T> && + (sizeof...(Args) > 0 || !std::is_convertible_v)>> + explicit memop_instrumentation_wrapper_t(Arg&& arg, Args&&... args) + : array_(std::forward(arg), std::forward(args)...) 
+ { + if constexpr (type_traits_utils::has_data::value) { + data_ptr = array_.data(); + } else { + data_ptr = nullptr; + } + } + + memop_instrumentation_wrapper_t(const memop_instrumentation_wrapper_t& other) + : memory_instrumentation_base_t(), array_(other.array_) + { + if constexpr (type_traits_utils::has_data::value) { + data_ptr = array_.data(); + } else { + data_ptr = nullptr; + } + } + + memop_instrumentation_wrapper_t(memop_instrumentation_wrapper_t&& other) noexcept + : memory_instrumentation_base_t(), array_(std::move(other.array_)) + { + if constexpr (type_traits_utils::has_data::value) { + data_ptr = array_.data(); + } else { + data_ptr = nullptr; + } + } + + memop_instrumentation_wrapper_t& operator=(const memop_instrumentation_wrapper_t& other) + { + if (this != &other) { + reset_counters(); + array_ = other.array_; + if constexpr (type_traits_utils::has_data::value) { + data_ptr = array_.data(); + } else { + data_ptr = nullptr; + } + } + return *this; + } + + memop_instrumentation_wrapper_t& operator=(memop_instrumentation_wrapper_t&& other) noexcept + { + if (this != &other) { + reset_counters(); + array_ = std::move(other.array_); + if constexpr (type_traits_utils::has_data::value) { + data_ptr = array_.data(); + } else { + data_ptr = nullptr; + } + } + return *this; + } + + element_proxy_t operator[](size_type index) + { + return element_proxy_t(underlying()[index], *this); + } + + HDI value_type operator[](size_type index) const + { + this->template record_load(); + // really ugly hack because otherwise nvcc complains about vector operator[] being __host__ only + if constexpr (type_traits_utils::has_data::value) { + return data_ptr[index]; + } else { + return underlying()[index]; + } + } + + template + std::enable_if_t::value, element_proxy_t> front() + { + return element_proxy_t(underlying().front(), *this); + } + + template + std::enable_if_t::value, value_type> front() const + { + this->template record_load(); + return underlying().front(); + } + + template + std::enable_if_t::value, element_proxy_t> back() + { + return element_proxy_t(underlying().back(), *this); + } + + template + std::enable_if_t::value, value_type> back() const + { + this->template record_load(); + return underlying().back(); + } + + // Iterators + iterator begin() noexcept { return iterator(std::begin(underlying()), this); } + const_iterator begin() const noexcept { return const_iterator(std::begin(underlying()), this); } + const_iterator cbegin() const noexcept { return const_iterator(std::begin(underlying()), this); } + + iterator end() noexcept { return iterator(std::end(underlying()), this); } + const_iterator end() const noexcept { return const_iterator(std::end(underlying()), this); } + const_iterator cend() const noexcept { return const_iterator(std::end(underlying()), this); } + + reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const noexcept + { + return const_reverse_iterator(std::end(underlying())); + } + const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); } + + reverse_iterator rend() noexcept { return reverse_iterator(begin()); } + const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } + const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); } + + // Capacity + bool empty() const noexcept { return std::begin(underlying()) == std::end(underlying()); } + size_type size() const noexcept + { + return 
std::distance(std::begin(underlying()), std::end(underlying())); + } + + // Conditional methods - only available if underlying type supports them + template + std::enable_if_t::value, size_type> max_size() const noexcept + { + return underlying().max_size(); + } + + template + std::enable_if_t::value, size_type> capacity() const noexcept + { + return underlying().capacity(); + } + + template + std::enable_if_t::value> reserve(size_type new_cap) + { + underlying().reserve(new_cap); + if constexpr (type_traits_utils::has_data::value) { data_ptr = underlying().data(); } + } + + template + std::enable_if_t::value> shrink_to_fit() + { + underlying().shrink_to_fit(); + if constexpr (type_traits_utils::has_data::value) { data_ptr = underlying().data(); } + } + + template + std::enable_if_t::value> clear() noexcept + { + underlying().clear(); + if constexpr (type_traits_utils::has_data::value) { data_ptr = underlying().data(); } + } + + template + std::enable_if_t::value> push_back(const value_type& value) + { + // we should probably take into account possible copies done by std::vector. oh well. + // hot loops shouldn't be doing such operations anyway + this->template record_store(); + underlying().push_back(value); + if constexpr (type_traits_utils::has_data::value) { data_ptr = underlying().data(); } + } + + template + std::enable_if_t::value> push_back(value_type&& value) + { + this->template record_store(); + underlying().push_back(std::move(value)); + if constexpr (type_traits_utils::has_data::value) { data_ptr = underlying().data(); } + } + + template + std::enable_if_t::value> emplace_back(Args&&... args) + { + this->template record_store(); + underlying().emplace_back(std::forward(args)...); + if constexpr (type_traits_utils::has_data::value) { data_ptr = underlying().data(); } + } + + template + std::enable_if_t::value> pop_back() + { + this->template record_load(); // Reading the element before removal + underlying().pop_back(); + if constexpr (type_traits_utils::has_data::value) { data_ptr = underlying().data(); } + } + + template + std::enable_if_t::value> resize(size_type count) + { + size_type old_size = underlying().size(); + underlying().resize(count); + if (count > old_size) { + this->byte_stores += (count - old_size) * type_size; // New elements initialized + } + if constexpr (type_traits_utils::has_data::value) { data_ptr = underlying().data(); } + } + + template + std::enable_if_t::value> resize(size_type count, + const value_type& value) + { + size_type old_size = underlying().size(); + underlying().resize(count, value); + if (count > old_size) { this->byte_stores += (count - old_size) * type_size; } + if constexpr (type_traits_utils::has_data::value) { data_ptr = underlying().data(); } + } + + template + std::enable_if_t::value, value_type*> data() noexcept + { + return underlying().data(); + } + + template + std::enable_if_t::value, const value_type*> data() const noexcept + { + return underlying().data(); + } + + // Access to underlying array + operator T&() { return underlying(); } + operator const T&() const { return underlying(); } + + T&& release_array() { return std::move(array_); } + + T& underlying() { return array_; } + const T& underlying() const { return array_; } + + private: + T array_; + value_type* data_ptr{nullptr}; +}; + +#else // !CUOPT_ENABLE_MEMORY_INSTRUMENTATION + +// Zero-overhead passthrough wrapper when instrumentation is disabled +// Provides the same interface as the instrumented version but just forwards to the underlying +// container +template 
+struct memop_instrumentation_wrapper_t : public memory_instrumentation_base_t { + using value_type = typename T::value_type; + using size_type = typename T::size_type; + using difference_type = typename T::difference_type; + using reference = typename T::reference; + using const_reference = typename T::const_reference; + using pointer = typename T::pointer; + using const_pointer = typename T::const_pointer; + using iterator = typename T::iterator; + using const_iterator = typename T::const_iterator; + using reverse_iterator = typename T::reverse_iterator; + using const_reverse_iterator = typename T::const_reverse_iterator; + + // Constructors - forward everything to the underlying container + memop_instrumentation_wrapper_t() = default; + memop_instrumentation_wrapper_t(const T& arr) : array_(arr) {} + memop_instrumentation_wrapper_t(T&& arr) : array_(std::move(arr)) {} + + template , memop_instrumentation_wrapper_t> && + !std::is_same_v, T> && + (sizeof...(Args) > 0 || !std::is_convertible_v)>> + explicit memop_instrumentation_wrapper_t(Arg&& arg, Args&&... args) + : array_(std::forward(arg), std::forward(args)...) + { + } + + memop_instrumentation_wrapper_t(const memop_instrumentation_wrapper_t& other) + : array_(other.array_) + { + } + + memop_instrumentation_wrapper_t(memop_instrumentation_wrapper_t&& other) noexcept + : array_(std::move(other.array_)) + { + } + + memop_instrumentation_wrapper_t& operator=(const memop_instrumentation_wrapper_t& other) + { + if (this != &other) { array_ = other.array_; } + return *this; + } + + memop_instrumentation_wrapper_t& operator=(memop_instrumentation_wrapper_t&& other) noexcept + { + if (this != &other) { array_ = std::move(other.array_); } + return *this; + } + + // Element access - direct passthrough + reference operator[](size_type index) { return underlying()[index]; } + const_reference operator[](size_type index) const { return underlying()[index]; } + + reference front() { return underlying().front(); } + const_reference front() const { return underlying().front(); } + + reference back() { return underlying().back(); } + const_reference back() const { return underlying().back(); } + + pointer data() noexcept { return underlying().data(); } + const_pointer data() const noexcept { return underlying().data(); } + + // Iterators - use underlying container's iterators directly + iterator begin() noexcept { return underlying().begin(); } + const_iterator begin() const noexcept { return underlying().begin(); } + const_iterator cbegin() const noexcept { return underlying().cbegin(); } + + iterator end() noexcept { return underlying().end(); } + const_iterator end() const noexcept { return underlying().end(); } + const_iterator cend() const noexcept { return underlying().cend(); } + + reverse_iterator rbegin() noexcept { return underlying().rbegin(); } + const_reverse_iterator rbegin() const noexcept { return underlying().rbegin(); } + const_reverse_iterator crbegin() const noexcept { return underlying().crbegin(); } + + reverse_iterator rend() noexcept { return underlying().rend(); } + const_reverse_iterator rend() const noexcept { return underlying().rend(); } + const_reverse_iterator crend() const noexcept { return underlying().crend(); } + + // Capacity + bool empty() const noexcept { return underlying().empty(); } + size_type size() const noexcept { return underlying().size(); } + size_type max_size() const noexcept { return underlying().max_size(); } + size_type capacity() const noexcept { return underlying().capacity(); } + + void 
reserve(size_type new_cap) { underlying().reserve(new_cap); } + void shrink_to_fit() { underlying().shrink_to_fit(); } + + // Modifiers + void clear() noexcept { underlying().clear(); } + void push_back(const value_type& value) { underlying().push_back(value); } + void push_back(value_type&& value) { underlying().push_back(std::move(value)); } + + template + void emplace_back(Args&&... args) + { + underlying().emplace_back(std::forward(args)...); + } + + void pop_back() { underlying().pop_back(); } + void resize(size_type count) { underlying().resize(count); } + void resize(size_type count, const value_type& value) { underlying().resize(count, value); } + + // Conversion operators + operator T&() { return underlying(); } + operator const T&() const { return underlying(); } + + T&& release_array() { return std::move(array_); } + + T& underlying() { return array_; } + const T& underlying() const { return array_; } + + private: + T array_; +}; + +#endif // CUOPT_ENABLE_MEMORY_INSTRUMENTATION + +// Convenience alias for instrumented std::vector +template +using ins_vector = memop_instrumentation_wrapper_t>; + +} // namespace cuopt diff --git a/cpp/src/utilities/producer_sync.hpp b/cpp/src/utilities/producer_sync.hpp new file mode 100644 index 000000000..dfc316c24 --- /dev/null +++ b/cpp/src/utilities/producer_sync.hpp @@ -0,0 +1,116 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace cuopt { + +/** + * One-way synchronization utility for producer threads. + * + * Producers (e.g., CPUFJ) register their work unit progress atomics and advance independently. + * The consumer (e.g., B&B coordinator) can wait until all producers have reached a + * target work unit threshold before proceeding. + * + * Key invariant: Producers must not fall behind the consumer's horizon. The consumer + * waits at sync points until all producers have caught up. The producers are biased + * to ensure they remain likely ahead of the consumer. + */ +class producer_sync_t { + public: + producer_sync_t() = default; + + void register_producer(std::atomic* progress_ptr) + { + std::lock_guard lock(mutex_); + producers_.push_back(progress_ptr); + cv_.notify_all(); + } + + void deregister_producer(std::atomic* progress_ptr) + { + std::lock_guard lock(mutex_); + auto it = std::find(producers_.begin(), producers_.end(), progress_ptr); + if (it != producers_.end()) { producers_.erase(it); } + cv_.notify_all(); + } + + /** + * Signal that all expected producers have been registered. + * Must be called before the consumer can proceed with wait_for_producers(). 
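+   *
+   * Call-order sketch (names are illustrative, not part of this class):
+   *   std::atomic<double> fj_progress{0.0};
+   *   sync.register_producer(&fj_progress);   // one atomic per producer
+   *   sync.registration_complete();           // no further producers expected
+   *   // producer: fj_progress.store(units, std::memory_order_release);
+   *   //           sync.notify_progress();
+   *   // consumer: sync.wait_for_producers(target_work_units);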
+ */ + void registration_complete() + { + std::lock_guard lock(mutex_); + registration_complete_ = true; + cv_.notify_all(); + } + + bool is_registration_complete() const + { + std::lock_guard lock(mutex_); + return registration_complete_; + } + + /** + * Wait until: + * 1. registration_complete() has been called, AND + * 2. All registered producers have work units >= target_work_units + * + * Returns immediately if no producers are registered (after registration_complete). + */ + void wait_for_producers(double target_work_units) + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [this, target_work_units] { + if (!registration_complete_) { return false; } + return all_producers_at_or_ahead(target_work_units); + }); + } + + /** + * Wake up any waiting consumer. Call this when a producer advances its work units. + */ + void notify_progress() { cv_.notify_all(); } + + size_t num_producers() const + { + std::lock_guard lock(mutex_); + return producers_.size(); + } + + private: + bool all_producers_at_or_ahead(double target) const + { + for (const auto* progress_ptr : producers_) { + if (progress_ptr->load(std::memory_order_acquire) < target) { return false; } + } + return true; + } + + mutable std::mutex mutex_; + std::condition_variable cv_; + std::vector*> producers_; + bool registration_complete_{false}; +}; + +} // namespace cuopt diff --git a/cpp/src/utilities/timing_utils.hpp b/cpp/src/utilities/timing_utils.hpp new file mode 100644 index 000000000..569630a86 --- /dev/null +++ b/cpp/src/utilities/timing_utils.hpp @@ -0,0 +1,71 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) +#define CUOPT_HAS_RDTSC 1 +#else +#define CUOPT_HAS_RDTSC 0 +#endif + +#if CUOPT_HAS_RDTSC + +#include +#include +#include +#include + +namespace cuopt { + +inline uint64_t rdtsc() +{ + uint32_t lo, hi; + __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +} + +} // namespace cuopt + +// clang-format off +#define CYCLE_TIMING_PROLOGUE(name) \ + static constexpr size_t timing_buffer_size_##name = 1024; \ + static thread_local std::array timing_buffer_##name; \ + static thread_local size_t timing_idx_##name = 0; \ + uint64_t t_start_##name = cuopt::rdtsc(); + +#define CYCLE_TIMING_EPILOGUE(name) \ + do { \ + uint64_t t_end_##name = cuopt::rdtsc(); \ + timing_buffer_##name[timing_idx_##name++] = t_end_##name - t_start_##name; \ + if (timing_idx_##name == timing_buffer_size_##name) { \ + uint64_t sum_##name = 0; \ + for (size_t i = 0; i < timing_buffer_size_##name; ++i) \ + sum_##name += timing_buffer_##name[i]; \ + uint64_t avg_##name = sum_##name / timing_buffer_size_##name; \ + std::array sorted_##name = timing_buffer_##name; \ + std::nth_element(sorted_##name.begin(), \ + sorted_##name.begin() + timing_buffer_size_##name / 2, \ + sorted_##name.end()); \ + uint64_t median_##name = sorted_##name[timing_buffer_size_##name / 2]; \ + printf(#name ": avg=%lu cycles, median=%lu cycles (n=%zu)\n", \ + avg_##name, median_##name, timing_buffer_size_##name); \ + timing_idx_##name = 0; \ + } \ + } while (0) +// clang-format on + +#else // !CUOPT_HAS_RDTSC + +#define CYCLE_TIMING_PROLOGUE(name) \ + do { \ + } while (0) +#define CYCLE_TIMING_EPILOGUE(name) \ + do { \ + } while (0) + +#endif // CUOPT_HAS_RDTSC diff --git 
a/cpp/src/utilities/work_limit_context.hpp b/cpp/src/utilities/work_limit_context.hpp new file mode 100644 index 000000000..fa06dec5a --- /dev/null +++ b/cpp/src/utilities/work_limit_context.hpp @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include "timer.hpp" +#include "work_unit_scheduler.hpp" + +namespace cuopt { + +struct work_limit_context_t { + double global_work_units_elapsed{0.0}; + double total_sync_time{0.0}; // Total time spent waiting at sync barriers (seconds) + bool deterministic{false}; + work_unit_scheduler_t* scheduler{nullptr}; + std::string name; + + work_limit_context_t(const std::string& name) : name(name) {} + + void record_work(double work) + { + if (!deterministic) return; + global_work_units_elapsed += work; + if (scheduler) { scheduler->on_work_recorded(*this, global_work_units_elapsed); } + } +}; + +} // namespace cuopt diff --git a/cpp/src/utilities/work_unit_scheduler.cpp b/cpp/src/utilities/work_unit_scheduler.cpp new file mode 100644 index 000000000..314219b8b --- /dev/null +++ b/cpp/src/utilities/work_unit_scheduler.cpp @@ -0,0 +1,136 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "work_unit_scheduler.hpp" + +#include "work_limit_context.hpp" + +#include +#include +#include + +#include + +#include + +namespace cuopt { + +work_unit_scheduler_t::work_unit_scheduler_t(double sync_interval) : sync_interval_(sync_interval) +{ +} + +void work_unit_scheduler_t::register_context(work_limit_context_t& ctx) +{ + contexts_.push_back(ctx); + ctx.scheduler = this; +} + +void work_unit_scheduler_t::deregister_context(work_limit_context_t& ctx) +{ + ctx.scheduler = nullptr; + contexts_.erase(std::remove_if(contexts_.begin(), + contexts_.end(), + [&ctx](const std::reference_wrapper& ref) { + return &ref.get() == &ctx; + }), + contexts_.end()); +} + +void work_unit_scheduler_t::set_sync_interval(double interval) { sync_interval_ = interval; } + +void work_unit_scheduler_t::on_work_recorded(work_limit_context_t& ctx, double total_work) +{ + if (is_shutdown()) return; + + if (verbose) { + CUOPT_LOG_DEBUG("[%s] Work recorded: %f, sync_target: %f (gen %zu)", + ctx.name.c_str(), + total_work, + current_sync_target(), + barrier_generation_); + } + + // Loop to handle large work increments that cross multiple sync points + while (total_work >= current_sync_target() && !is_shutdown()) { + wait_at_sync_point(ctx, current_sync_target()); + } +} + +void work_unit_scheduler_t::set_sync_callback(sync_callback_t callback) +{ + sync_callback_ = std::move(callback); +} + +void work_unit_scheduler_t::wait_for_next_sync(work_limit_context_t& ctx) +{ + if (is_shutdown()) return; + + double next_sync = current_sync_target(); + ctx.global_work_units_elapsed = next_sync; + wait_at_sync_point(ctx, next_sync); +} + +double work_unit_scheduler_t::current_sync_target() const +{ + if (sync_interval_ <= 0) return std::numeric_limits::infinity(); + return (barrier_generation_ + 1) * sync_interval_; +} + +void work_unit_scheduler_t::wait_at_sync_point(work_limit_context_t& ctx, double sync_target) +{ + auto wait_start = std::chrono::high_resolution_clock::now(); + + if (verbose) { + CUOPT_LOG_DEBUG("[%s] Waiting at sync point %.2f (gen %zu)", + ctx.name.c_str(), + sync_target, + barrier_generation_); + } + + // All threads wait at this barrier +#pragma omp barrier + + // One thread executes the sync callback +#pragma omp single + { + current_sync_target_ = sync_target; + barrier_generation_++; + + if (verbose) { + CUOPT_LOG_DEBUG("All contexts arrived at sync point %.2f, new generation %zu", + sync_target, + barrier_generation_); + } + + if (sync_callback_) { sync_callback_(sync_target); } + } + // Implicit barrier at end of single block ensures callback is complete + // before any thread proceeds + + auto wait_end = std::chrono::high_resolution_clock::now(); + double wait_secs = std::chrono::duration(wait_end - wait_start).count(); + ctx.total_sync_time += wait_secs; + + if (verbose) { + CUOPT_LOG_DEBUG("[%s] Sync complete at %.2f, waited %.2f ms", + ctx.name.c_str(), + sync_target, + wait_secs * 1000.0); + } +} + +} // namespace cuopt diff --git a/cpp/src/utilities/work_unit_scheduler.hpp b/cpp/src/utilities/work_unit_scheduler.hpp new file mode 100644 index 000000000..84e7b95fa --- /dev/null +++ b/cpp/src/utilities/work_unit_scheduler.hpp @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cuopt { + +struct work_limit_context_t; + +class work_unit_scheduler_t { + public: + explicit work_unit_scheduler_t(double sync_interval = 5.0); + + void set_sync_interval(double interval); + double get_sync_interval() const { return sync_interval_; } + + void register_context(work_limit_context_t& ctx); + void deregister_context(work_limit_context_t& ctx); + void on_work_recorded(work_limit_context_t& ctx, double total_work); + + // Sync callback - executed by one thread when all contexts reach sync point + using sync_callback_t = std::function; + void set_sync_callback(sync_callback_t callback); + + // Wait for next sync point (for idle workers with no work) + void wait_for_next_sync(work_limit_context_t& ctx); + + double current_sync_target() const; + + void signal_shutdown() { shutdown_.store(true, std::memory_order_release); } + bool is_shutdown() const { return shutdown_.load(std::memory_order_acquire); } + + public: + bool verbose{false}; + + private: + void wait_at_sync_point(work_limit_context_t& ctx, double sync_target); + + double sync_interval_; + std::vector> contexts_; + + size_t barrier_generation_{0}; + double current_sync_target_{0}; + + // Sync callback - executed when all contexts reach sync point + sync_callback_t sync_callback_; + + // Shutdown flag - prevents threads from entering barriers after termination is signaled + std::atomic shutdown_{false}; +}; + +// RAII helper for registering multiple contexts with automatic cleanup +class scoped_context_registrations_t { + public: + explicit scoped_context_registrations_t(work_unit_scheduler_t& scheduler) : scheduler_(scheduler) + { + } + + ~scoped_context_registrations_t() + { + for (auto* ctx : contexts_) { + scheduler_.deregister_context(*ctx); + } + } + + void add(work_limit_context_t& ctx) + { + scheduler_.register_context(ctx); + contexts_.push_back(&ctx); + } + + scoped_context_registrations_t(const scoped_context_registrations_t&) = delete; + scoped_context_registrations_t& operator=(const scoped_context_registrations_t&) = delete; + scoped_context_registrations_t(scoped_context_registrations_t&&) = delete; + scoped_context_registrations_t& operator=(scoped_context_registrations_t&&) = delete; + + private: + work_unit_scheduler_t& scheduler_; + std::vector contexts_; +}; + +} // namespace cuopt diff --git a/cpp/tests/linear_programming/c_api_tests/c_api_test.c b/cpp/tests/linear_programming/c_api_tests/c_api_test.c index 799a42914..923d379f3 100644 --- a/cpp/tests/linear_programming/c_api_tests/c_api_test.c +++ b/cpp/tests/linear_programming/c_api_tests/c_api_test.c @@ -1457,3 +1457,129 @@ cuopt_int_t test_write_problem(const char* input_filename, const char* output_fi cuOptDestroySolution(&solution); return status; } + +cuopt_int_t test_deterministic_bb(const char* filename, + cuopt_int_t num_runs, + cuopt_int_t num_threads, + cuopt_float_t time_limit, + cuopt_float_t work_limit) +{ + cuOptOptimizationProblem problem = NULL; + cuOptSolverSettings settings = NULL; + cuopt_float_t first_objective = 0.0; + cuopt_int_t first_status = -1; 
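+  /* Protocol: solve the same model num_runs times with one fixed seed; run 0's
+     termination status and objective are the reference that every later run must
+     match exactly. */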
+ cuopt_int_t status; + cuopt_int_t run; + + printf("Testing deterministic B&B: %s with %d threads, %d runs\n", filename, num_threads, num_runs); + + status = cuOptReadProblem(filename, &problem); + if (status != CUOPT_SUCCESS) { + printf("Error reading problem: %d\n", status); + goto DONE; + } + + status = cuOptCreateSolverSettings(&settings); + if (status != CUOPT_SUCCESS) { + printf("Error creating solver settings: %d\n", status); + goto DONE; + } + + status = cuOptSetIntegerParameter(settings, CUOPT_MIP_DETERMINISM_MODE, CUOPT_MODE_DETERMINISTIC); + if (status != CUOPT_SUCCESS) { + printf("Error setting determinism mode: %d\n", status); + goto DONE; + } + + status = cuOptSetIntegerParameter(settings, CUOPT_NUM_CPU_THREADS, num_threads); + if (status != CUOPT_SUCCESS) { + printf("Error setting num threads: %d\n", status); + goto DONE; + } + + status = cuOptSetFloatParameter(settings, CUOPT_TIME_LIMIT, time_limit); + if (status != CUOPT_SUCCESS) { + printf("Error setting time limit: %d\n", status); + goto DONE; + } + + status = cuOptSetFloatParameter(settings, CUOPT_WORK_LIMIT, work_limit); + if (status != CUOPT_SUCCESS) { + printf("Error setting work limit: %d\n", status); + goto DONE; + } + + int seed = rand(); + printf("Seed: %d\n", seed); + + for (run = 0; run < num_runs; run++) { + cuOptSolution solution = NULL; + cuopt_float_t objective; + cuopt_int_t termination_status; + + status = cuOptSetIntegerParameter(settings, CUOPT_MIP_SEED, seed); + if (status != CUOPT_SUCCESS) { + printf("Error setting seed: %d\n", status); + goto DONE; + } + + status = cuOptSolve(problem, settings, &solution); + if (status != CUOPT_SUCCESS) { + printf("Error solving problem on run %d: %d\n", run, status); + cuOptDestroySolution(&solution); + goto DONE; + } + + status = cuOptGetObjectiveValue(solution, &objective); + if (status != CUOPT_SUCCESS) { + printf("Error getting objective value on run %d: %d\n", run, status); + cuOptDestroySolution(&solution); + goto DONE; + } + + status = cuOptGetTerminationStatus(solution, &termination_status); + if (status != CUOPT_SUCCESS) { + printf("Error getting termination status on run %d: %d\n", run, status); + cuOptDestroySolution(&solution); + goto DONE; + } + + printf("Run %d: status=%s (%d), objective=%f\n", + run, + termination_status_to_string(termination_status), + termination_status, + objective); + + if (run == 0) { + first_objective = objective; + first_status = termination_status; + } else { + if (first_status != termination_status) { + printf("Determinism failure: run %d termination status %d differs from run 0 status %d\n", + run, + termination_status, + first_status); + status = CUOPT_VALIDATION_ERROR; + cuOptDestroySolution(&solution); + goto DONE; + } + if (first_objective != objective) { + printf("Determinism failure: run %d objective %f differs from run 0 objective %f\n", + run, + objective, + first_objective); + status = CUOPT_VALIDATION_ERROR; + cuOptDestroySolution(&solution); + goto DONE; + } + } + cuOptDestroySolution(&solution); + } + + printf("Deterministic B&B test PASSED: all %d runs produced identical results\n", num_runs); + +DONE: + cuOptDestroyProblem(&problem); + cuOptDestroySolverSettings(&settings); + return status; +} diff --git a/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp b/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp index 273924ec0..25476022d 100644 --- a/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp +++ b/cpp/tests/linear_programming/c_api_tests/c_api_tests.cpp @@ -225,3 +225,27 @@ 
INSTANTIATE_TEST_SUITE_P(c_api,
                          "/mip/enlight_hard.mps",
                          "/mip/enlight11.mps",
                          "/mip/supportcase22.mps"));
+
+class DeterministicBBTestFixture
+  : public ::testing::TestWithParam<std::tuple<std::string, int, double, double>> {};
+TEST_P(DeterministicBBTestFixture, deterministic_reproducibility)
+{
+  const std::string& rapidsDatasetRootDir = cuopt::test::get_rapids_dataset_root_dir();
+  std::string filename = rapidsDatasetRootDir + std::get<0>(GetParam());
+  int num_threads = std::get<1>(GetParam());
+  double time_limit = std::get<2>(GetParam());
+  double work_limit = std::get<3>(GetParam());
+
+  // Run 3 times and verify identical results
+  EXPECT_EQ(test_deterministic_bb(filename.c_str(), 3, num_threads, time_limit, work_limit),
+            CUOPT_SUCCESS);
+}
+INSTANTIATE_TEST_SUITE_P(c_api,
+                         DeterministicBBTestFixture,
+                         ::testing::Values(
+                           // Low thread count
+                           std::make_tuple("/mip/gen-ip054.mps", 4, 60.0, 2),
+                           // High thread count (high contention)
+                           std::make_tuple("/mip/gen-ip054.mps", 128, 60.0, 2),
+                           // Different instance
+                           std::make_tuple("/mip/bb_optimality.mps", 8, 60.0, 2)));
diff --git a/cpp/tests/linear_programming/c_api_tests/c_api_tests.h b/cpp/tests/linear_programming/c_api_tests/c_api_tests.h
index 179d7deea..00b825b99 100644
--- a/cpp/tests/linear_programming/c_api_tests/c_api_tests.h
+++ b/cpp/tests/linear_programming/c_api_tests/c_api_tests.h
@@ -37,6 +37,12 @@ cuopt_int_t test_quadratic_problem(cuopt_int_t* termination_status_ptr,
 cuopt_int_t test_quadratic_ranged_problem(cuopt_int_t* termination_status_ptr,
                                           cuopt_float_t* objective_ptr);
 cuopt_int_t test_write_problem(const char* input_filename, const char* output_filename);
+cuopt_int_t test_deterministic_bb(const char* filename,
+                                  cuopt_int_t num_runs,
+                                  cuopt_int_t num_threads,
+                                  cuopt_float_t time_limit,
+                                  cuopt_float_t work_limit);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/cpp/tests/mip/CMakeLists.txt b/cpp/tests/mip/CMakeLists.txt
index 43fc273db..2f2139890 100644
--- a/cpp/tests/mip/CMakeLists.txt
+++ b/cpp/tests/mip/CMakeLists.txt
@@ -46,3 +46,6 @@ ConfigureTest(PRESOLVE_TEST
 ConfigureTest(MIP_TERMINATION_STATUS_TEST
   ${CMAKE_CURRENT_SOURCE_DIR}/termination_test.cu
 )
+ConfigureTest(DETERMINISM_TEST
+  ${CMAKE_CURRENT_SOURCE_DIR}/determinism_test.cu
+)
diff --git a/cpp/tests/mip/determinism_test.cu b/cpp/tests/mip/determinism_test.cu
new file mode 100644
index 000000000..1e59fba64
--- /dev/null
+++ b/cpp/tests/mip/determinism_test.cu
@@ -0,0 +1,249 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include "../linear_programming/utilities/pdlp_test_utilities.cuh" +#include "mip_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuopt::linear_programming::test { + +namespace { + +void expect_solutions_bitwise_equal(const mip_solution_t& sol1, + const mip_solution_t& sol2, + raft::handle_t& handle, + const std::string& label = "") +{ + auto x1 = cuopt::host_copy(sol1.get_solution(), handle.get_stream()); + auto x2 = cuopt::host_copy(sol2.get_solution(), handle.get_stream()); + + ASSERT_EQ(x1.size(), x2.size()) << label << "Solution sizes differ"; + for (size_t i = 0; i < x1.size(); ++i) { + EXPECT_EQ(x1[i], x2[i]) << label << "Variable " << i << " differs"; + } +} + +} // namespace + +class DeterministicBBTest : public ::testing::Test { + protected: + raft::handle_t handle_; +}; + +// Test that multiple runs with deterministic mode produce identical objective values +TEST_F(DeterministicBBTest, reproducible_objective) +{ + auto path = make_path_absolute("/mip/gen-ip054.mps"); + auto problem = mps_parser::parse_mps(path, false); + handle_.sync_stream(); + + mip_solver_settings_t settings; + settings.time_limit = 60.0; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + settings.num_cpu_threads = 8; + settings.work_limit = 4; + + // Ensure seed is positive int32_t + auto seed = std::random_device{}() & 0x7fffffff; + std::cout << "Tested with seed " << seed << "\n"; + settings.seed = seed; + + auto solution1 = solve_mip(&handle_, problem, settings); + double obj1 = solution1.get_objective_value(); + auto status1 = solution1.get_termination_status(); + + for (int i = 2; i <= 10; ++i) { + auto solution = solve_mip(&handle_, problem, settings); + double obj = solution.get_objective_value(); + auto status = solution.get_termination_status(); + + EXPECT_EQ(status1, status) << "Termination status differs on run " << i; + ASSERT_EQ(obj1, obj) << "Objective value differs on run " << i; + expect_solutions_bitwise_equal(solution1, solution, handle_); + } +} + +TEST_F(DeterministicBBTest, reproducible_infeasibility) +{ + auto path = make_path_absolute("/mip/stein9inf.mps"); + auto problem = mps_parser::parse_mps(path, false); + handle_.sync_stream(); + + mip_solver_settings_t settings; + settings.time_limit = 60.0; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + settings.num_cpu_threads = 8; + settings.work_limit = 100; // High enough to fully explore + + auto seed = std::random_device{}() & 0x7fffffff; + std::cout << "Tested with seed " << seed << "\n"; + settings.seed = seed; + + auto solution1 = solve_mip(&handle_, problem, settings); + auto status1 = solution1.get_termination_status(); + EXPECT_EQ(status1, mip_termination_status_t::Infeasible) + << "First run should detect infeasibility"; + + for (int i = 2; i <= 5; ++i) { + auto solution = solve_mip(&handle_, problem, settings); + auto status = solution.get_termination_status(); + + EXPECT_EQ(status1, status) << "Termination status differs on run " << i; + EXPECT_EQ(status, mip_termination_status_t::Infeasible) + << "Run " << i << " should detect infeasibility"; + } +} + +// Test determinism under high thread contention +TEST_F(DeterministicBBTest, reproducible_high_contention) +{ + auto path = make_path_absolute("/mip/gen-ip054.mps"); + auto problem = mps_parser::parse_mps(path, false); + handle_.sync_stream(); + + 
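  // Oversubscribing threads (128 on a typically smaller core count) maximizes scheduling noise; the runs below must still match bit-for-bit. +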
mip_solver_settings_t settings; + settings.time_limit = 60.0; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + settings.num_cpu_threads = 128; // High thread count to stress contention + settings.work_limit = 1; + + auto seed = std::random_device{}() & 0x7fffffff; + + std::cout << "Tested with seed " << seed << "\n"; + settings.seed = seed; + + std::vector> solutions; + + constexpr int num_runs = 3; + for (int run = 0; run < num_runs; ++run) { + solutions.push_back(solve_mip(&handle_, problem, settings)); + } + + for (int i = 1; i < num_runs; ++i) { + EXPECT_EQ(solutions[0].get_termination_status(), solutions[i].get_termination_status()) + << "Run " << i << " termination status differs from run 0"; + EXPECT_DOUBLE_EQ(solutions[0].get_objective_value(), solutions[i].get_objective_value()) + << "Run " << i << " objective differs from run 0"; + expect_solutions_bitwise_equal( + solutions[0], solutions[i], handle_, "Run " + std::to_string(i) + " vs run 0: "); + } +} + +// Test that solution vectors are bitwise identical across runs +TEST_F(DeterministicBBTest, reproducible_solution_vector) +{ + auto path = make_path_absolute("/mip/swath1.mps"); + auto problem = mps_parser::parse_mps(path, false); + handle_.sync_stream(); + + mip_solver_settings_t settings; + settings.time_limit = 60.0; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + settings.num_cpu_threads = 8; + settings.work_limit = 2; + + auto seed = std::random_device{}() & 0x7fffffff; + + std::cout << "Tested with seed " << seed << "\n"; + settings.seed = seed; + + auto solution1 = solve_mip(&handle_, problem, settings); + auto solution2 = solve_mip(&handle_, problem, settings); + + EXPECT_EQ(solution1.get_termination_status(), solution2.get_termination_status()); + EXPECT_DOUBLE_EQ(solution1.get_objective_value(), solution2.get_objective_value()); + expect_solutions_bitwise_equal(solution1, solution2, handle_); +} + +// Parameterized test for different problem instances +class DeterministicBBInstanceTest + : public ::testing::TestWithParam> { + protected: + raft::handle_t handle_; +}; + +TEST_P(DeterministicBBInstanceTest, deterministic_across_runs) +{ + auto [instance_path, num_threads, time_limit, work_limit] = GetParam(); + auto path = make_path_absolute(instance_path); + auto problem = mps_parser::parse_mps(path, false); + handle_.sync_stream(); + + // Get a random seed for each run + auto seed = std::random_device{}() & 0x7fffffff; + + std::cout << "Tested with seed " << seed << "\n"; + + mip_solver_settings_t settings; + settings.time_limit = time_limit; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + settings.num_cpu_threads = num_threads; + settings.work_limit = work_limit; + settings.seed = seed; + + cuopt::seed_generator::set_seed(seed); + auto solution1 = solve_mip(&handle_, problem, settings); + cuopt::seed_generator::set_seed(seed); + auto solution2 = solve_mip(&handle_, problem, settings); + cuopt::seed_generator::set_seed(seed); + auto solution3 = solve_mip(&handle_, problem, settings); + + EXPECT_EQ(solution1.get_termination_status(), solution2.get_termination_status()); + EXPECT_EQ(solution1.get_termination_status(), solution3.get_termination_status()); + + EXPECT_DOUBLE_EQ(solution1.get_objective_value(), solution2.get_objective_value()); + EXPECT_DOUBLE_EQ(solution1.get_objective_value(), solution3.get_objective_value()); + + EXPECT_DOUBLE_EQ(solution1.get_solution_bound(), solution2.get_solution_bound()); + EXPECT_DOUBLE_EQ(solution1.get_solution_bound(), solution3.get_solution_bound()); 
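+
+  // Objective and bound equality can mask differing incumbents; also require the
+  // returned solution vectors to match exactly.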
+  expect_solutions_bitwise_equal(solution1, solution2, handle_, "Run 1 vs 2: ");
+  expect_solutions_bitwise_equal(solution1, solution3, handle_, "Run 1 vs 3: ");
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  DeterministicBB,
+  DeterministicBBInstanceTest,
+  ::testing::Values(
+    // Instance, threads, time_limit, work_limit
+    std::make_tuple("/mip/gen-ip054.mps", 4, 60.0, 4),
+    std::make_tuple("/mip/swath1.mps", 8, 60.0, 4),
+    std::make_tuple("/mip/gen-ip054.mps", 128, 120.0, 1),
+    std::make_tuple("/mip/bb_optimality.mps", 4, 60.0, 4),
+    std::make_tuple("/mip/neos5.mps", 16, 60.0, 1),
+    std::make_tuple("/mip/seymour1.mps", 16, 60.0, 1),
+    // too heavy for CI
+    // std::make_tuple("/mip/n2seq36q.mps", 16, 60.0, 4),
+    std::make_tuple("/mip/gmu-35-50.mps", 32, 60.0, 3)),
+  [](const ::testing::TestParamInfo<DeterministicBBInstanceTest::ParamType>& info) {
+    const auto& path = std::get<0>(info.param);
+    int threads = std::get<1>(info.param);
+    std::string name = path.substr(path.rfind('/') + 1);
+    name = name.substr(0, name.rfind('.'));
+    std::replace(name.begin(), name.end(), '-', '_');
+    return name + "_threads" + std::to_string(threads);
+  });
+
+}  // namespace cuopt::linear_programming::test
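Reviewer note: the pieces above (work_limit_context_t, work_unit_scheduler_t, scoped_context_registrations_t) only meet inside the solver, so here is a minimal standalone sketch of how they are wired together. Assumptions: header paths are abbreviated, the n_threads constant, worker names, and printf callback are illustrative and not part of this PR, and every thread keeps recording work so the omp barrier inside the scheduler is always fully populated (idle workers would call wait_for_next_sync() instead).

```cpp
#include "work_limit_context.hpp"
#include "work_unit_scheduler.hpp"

#include <cstdio>
#include <string>
#include <vector>

#include <omp.h>

int main()
{
  constexpr int n_threads = 4;

  // Rendezvous every 5 deterministic work units (not wall-clock seconds).
  cuopt::work_unit_scheduler_t scheduler(5.0);
  scheduler.set_sync_callback(
    [](double target) { std::printf("all workers aligned at %.1f work units\n", target); });

  // register_context() is not internally synchronized, so build and register
  // every context before the parallel region starts.
  std::vector<cuopt::work_limit_context_t> contexts;
  contexts.reserve(n_threads);
  for (int i = 0; i < n_threads; ++i) {
    contexts.emplace_back("worker" + std::to_string(i));
    contexts.back().deterministic = true;  // record_work() is a no-op otherwise
  }
  // RAII: the destructor deregisters every context from the scheduler.
  cuopt::scoped_context_registrations_t registrations(scheduler);
  for (auto& ctx : contexts) {
    registrations.add(ctx);
  }

#pragma omp parallel num_threads(n_threads)
  {
    auto& ctx = contexts[omp_get_thread_num()];
    for (int step = 0; step < 40; ++step) {
      // Crossing 5.0, 10.0, ... blocks in the scheduler's omp barrier until
      // every registered worker has arrived, then one thread runs the callback.
      ctx.record_work(0.5);
    }
  }

  scheduler.signal_shutdown();
  return 0;
}
```

The design point worth calling out: the rendezvous is keyed to accumulated work units rather than elapsed time, so the barrier schedule, and with it everything the sync callback does to the search state, replays identically regardless of machine load. The 5.0 in mip_solver_context_t's work_unit_scheduler_{5.0} is therefore a work-unit interval, not seconds.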