From d3b7b12d791abbd6c74be33abe30eb04b2ae528a Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 2 Mar 2026 10:55:23 -0800
Subject: [PATCH 01/30] Take advantage of hyper sparsity in dual push

---
 cpp/src/dual_simplex/crossover.cpp | 40 +++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 14 deletions(-)
diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp
index 988c9c50ad..16f503e893 100644
--- a/cpp/src/dual_simplex/crossover.cpp
+++ b/cpp/src/dual_simplex/crossover.cpp
@@ -331,6 +331,7 @@ void compute_dual_solution_from_basis(const lp_problem_t<i_t, f_t>& lp,
 
 template <typename i_t, typename f_t>
 i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
+              const csr_matrix_t<i_t, f_t>& Arow,
               const simplex_solver_settings_t<i_t, f_t>& settings,
               f_t start_time,
               lp_solution_t<i_t, f_t>& solution,
@@ -401,11 +402,9 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
     es_sparse.x[0] = -delta_zs;
 
     // B^T delta_y = -delta_zs*es
-    std::vector<f_t> delta_y(m);
     sparse_vector_t<i_t, f_t> delta_y_sparse(m, 1);
     sparse_vector_t<i_t, f_t> UTsol_sparse(m, 1);
     ft.b_transpose_solve(es_sparse, delta_y_sparse, UTsol_sparse);
-    delta_y_sparse.scatter(delta_y);
 
     // We solved B^T delta_y = -delta_zs*es, but for the update we need
     // U^T*etilde = es.
@@ -417,15 +416,23 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
 
     // delta_zN = -N^T delta_y
     std::vector<f_t> delta_zN(n - m);
-    for (i_t k = 0; k < n - m; ++k) {
-      const i_t j         = nonbasic_list[k];
-      const i_t col_start = lp.A.col_start[j];
-      const i_t col_end   = lp.A.col_start[j + 1];
-      f_t dot             = 0.0;
-      for (i_t p = col_start; p < col_end; ++p) {
-        dot += lp.A.x[p] * delta_y[lp.A.i[p]];
+    std::vector<f_t> delta_expanded(n, 0.);
+
+    // Form A^T * delta_y by walking only the stored nonzeros of delta_y (row-wise via Arow)
+    for (i_t nnz_idx = 0; nnz_idx < static_cast<i_t>(delta_y_sparse.i.size()); ++nnz_idx) {
+      const i_t row = delta_y_sparse.i[nnz_idx];
+      const f_t val = delta_y_sparse.x[nnz_idx];
+
+      // Scatter this row's entries: delta_expanded[col] += A(row, col) * delta_y[row]
+      const i_t row_start = Arow.row_start[row];
+      const i_t row_end   = Arow.row_start[row + 1];
+      for (i_t p = row_start; p < row_end; ++p) {
+        const i_t col = Arow.j[p];
+        delta_expanded[col] += Arow.x[p] * val;
       }
-      delta_zN[k] = -dot;
+    }
+    for (i_t k = 0; k < n - m; ++k) {
+      delta_zN[k] = -delta_expanded[nonbasic_list[k]];
     }
 
     i_t entering_index          = -1;
@@ -435,8 +442,10 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
     assert(step_length >= -1e-6);
 
     // y <- y + step_length * delta_y
-    for (i_t i = 0; i < m; ++i) {
-      y[i] += step_length * delta_y[i];
+    // Only the stored nonzeros of delta_y can change y, so skip the dense sweep over all m rows
+    for (i_t nnz_idx = 0; nnz_idx < static_cast<i_t>(delta_y_sparse.i.size()); ++nnz_idx) {
+      const i_t i = delta_y_sparse.i[nnz_idx];
+      y[i] += step_length * delta_y_sparse.x[nnz_idx];
     }
 
     // z <- z + step_length * delta z
@@ -725,7 +734,6 @@ i_t primal_push(const lp_problem_t<i_t, f_t>& lp,
 {
   const i_t m = lp.num_rows;
   const i_t n = lp.num_cols;
-
   settings.log.debug("Primal push: superbasic %ld\n", superbasic_list.size());
 
   std::vector<f_t>& x = solution.x;
@@ -1002,6 +1010,7 @@ i_t primal_push(const lp_problem_t<i_t, f_t>& lp,
   }
   solution.x = x_compare;
   solution.iterations += num_pushes;
+
   return 0;
 }
 
@@ -1190,6 +1199,9 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   f_t crossover_start = tic();
   f_t work_estimate   = 0;
 
+  csr_matrix_t<i_t, f_t> Arow(m, n, 1);
+  lp.A.to_compressed_row(Arow);
+
   settings.log.printf("\n");
   settings.log.printf("Starting crossover\n");
 
@@ -1332,7 +1344,7 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   verify_basis<i_t, f_t>(m, n, vstatus);
   compare_vstatus_with_lists<i_t, f_t>(m, n, basic_list, nonbasic_list, vstatus);
   i_t dual_push_status = dual_push(
-    lp, settings, start_time, solution, ft, basic_list, nonbasic_list, superbasic_list, vstatus);
+    lp, Arow, settings, start_time, solution, ft, basic_list, nonbasic_list, superbasic_list, vstatus);
   if (dual_push_status < 0) { return return_to_status(dual_push_status); }
   settings.log.debug("basic list size %ld m %d\n", basic_list.size(), m);
   settings.log.debug("nonbasic list size %ld n - m %d\n", nonbasic_list.size(), n - m);

From 6aae07d21fff1f45d4c71c67392a24adaa2ce0fe Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Thu, 5 Mar 2026 16:22:51 -0800
Subject: [PATCH 02/30] Initial stab at moving root solves to B&B

---
 cpp/src/branch_and_bound/branch_and_bound.cpp | 89 ++++++++++++++++---
 cpp/src/branch_and_bound/branch_and_bound.hpp | 15 +++-
 .../dual_simplex/simplex_solver_settings.hpp  |  3 +
 cpp/src/dual_simplex/types.hpp                | 13 +++
 cpp/src/mip_heuristics/CMakeLists.txt         |  3 +-
 .../diversity/diversity_manager.cu            | 70 ++++++++-------
 .../diversity/diversity_manager.cuh           | 15 ++++
 cpp/src/mip_heuristics/root_lp.cu             | 74 +++++++++++++++
 cpp/src/mip_heuristics/root_lp.cuh            | 31 +++++++
 cpp/src/mip_heuristics/solver.cu              | 18 +++-
 10 files changed, 281 insertions(+), 50 deletions(-)
 create mode 100644 cpp/src/mip_heuristics/root_lp.cu
 create mode 100644 cpp/src/mip_heuristics/root_lp.cuh

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 6ce9a4f4d0..1fe020667d 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -9,6 +9,8 @@
 #include <branch_and_bound/mip_node.hpp>
 #include <branch_and_bound/pseudo_costs.hpp>
 
+#include <mip_heuristics/root_lp.cuh>
+
 #include <cuts/cuts.hpp>
 
 #include <dual_simplex/basis_solves.hpp>
@@ -28,6 +30,7 @@
 #include <omp.h>
 
 #include <algorithm>
+#include <chrono>
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
@@ -241,7 +244,9 @@ template <typename i_t, typename f_t>
 branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   const user_problem_t<i_t, f_t>& user_problem,
   const simplex_solver_settings_t<i_t, f_t>& solver_settings,
-  f_t start_time)
+  f_t start_time,
+  cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr,
+  i_t num_gpus)
   : original_problem_(user_problem),
     settings_(solver_settings),
     original_lp_(user_problem.handle_ptr, 1, 1, 1),
@@ -250,7 +255,9 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
     root_relax_soln_(1, 1),
     root_crossover_soln_(1, 1),
     pc_(1),
-    solver_status_(mip_status_t::UNSET)
+    solver_status_(mip_status_t::UNSET),
+    mip_problem_ptr_(mip_problem_ptr),
+    pdlp_root_num_gpus_(num_gpus)
 {
   exploration_stats_.start_time = start_time;
 #ifdef PRINT_CONSTRAINT_MATRIX
@@ -1811,15 +1818,65 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
                                   std::ref(root_vstatus),
                                   std::ref(edge_norms),
                                   nullptr);
-  // Wait for the root relaxation solution to be sent by the diversity manager or dual simplex
-  // to finish
-  while (!root_crossover_solution_set_.load(std::memory_order_acquire) &&
-         *get_root_concurrent_halt() == 0) {
+
+  std::optional<std::future<root_relaxation_first_solution_t<i_t, f_t>>> pdlp_future_opt;
+  if (enable_concurrent_lp_root_solve_ && mip_problem_ptr_ != nullptr) {
+    root_crossover_solution_set_.store(false, std::memory_order_release);
+    pdlp_future_opt =
+      std::async(std::launch::async,
+                 &cuopt::linear_programming::detail::run_pdlp_barrier_for_root_lp<i_t, f_t>,
+                 mip_problem_ptr_,
+                 lp_settings.time_limit,
+                 get_root_concurrent_halt(),
+                 pdlp_root_num_gpus_);
+  }
+
+  // Wait for first completion: PDLP/Barrier future, dual simplex future, or legacy callback
+  while (*get_root_concurrent_halt() == 0) {
+    bool pdlp_ready =
+      pdlp_future_opt && pdlp_future_opt->valid() &&
+      pdlp_future_opt->wait_for(std::chrono::milliseconds(0)) == std::future_status::ready;
+    bool ds_ready =
+      root_status_future.wait_for(std::chrono::milliseconds(0)) == std::future_status::ready;
+    if (root_crossover_solution_set_.load(std::memory_order_acquire) || pdlp_ready || ds_ready) {
+      break;
+    }
     std::this_thread::sleep_for(std::chrono::milliseconds(1));
-    continue;
   }
 
-  if (root_crossover_solution_set_.load(std::memory_order_acquire)) {
+  bool use_pdlp_path = false;
+  if (pdlp_future_opt && pdlp_future_opt->valid() &&
+      pdlp_future_opt->wait_for(std::chrono::milliseconds(0)) == std::future_status::ready) {
+    auto result                         = pdlp_future_opt->get();
+    root_crossover_soln_.x              = result.primal;
+    root_crossover_soln_.y              = result.dual;
+    root_crossover_soln_.z              = result.reduced_costs;
+    root_crossover_soln_.objective      = result.objective;
+    root_crossover_soln_.user_objective = result.user_objective;
+    root_crossover_soln_.iterations     = result.iterations;
+    root_objective_                     = result.objective;
+    root_crossover_solution_set_.store(true, std::memory_order_release);
+    if (lp_settings.on_first_lp_solution_available) {
+      lp_settings.on_first_lp_solution_available(result);
+    }
+    use_pdlp_path = true;
+  }
+
+  if (!use_pdlp_path && root_crossover_solution_set_.load(std::memory_order_acquire)) {
+    // Legacy path: set_root_relaxation_solution was invoked
+    root_relaxation_first_solution_t<i_t, f_t> legacy_result;
+    legacy_result.primal         = root_crossover_soln_.x;
+    legacy_result.dual           = root_crossover_soln_.y;
+    legacy_result.reduced_costs  = root_crossover_soln_.z;
+    legacy_result.objective      = root_crossover_soln_.objective;
+    legacy_result.user_objective = root_crossover_soln_.user_objective;
+    legacy_result.iterations     = root_crossover_soln_.iterations;
+    if (lp_settings.on_first_lp_solution_available) {
+      lp_settings.on_first_lp_solution_available(legacy_result);
+    }
+  }
+
+  if (use_pdlp_path || root_crossover_solution_set_.load(std::memory_order_acquire)) {
     // Crush the root relaxation solution on converted user problem
     std::vector<f_t> crushed_root_x;
     crush_primal_solution(
@@ -1909,9 +1966,19 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
       solver_name    = "Dual Simplex";
     }
   } else {
-    root_status    = root_status_future.get();
-    user_objective = root_relax_soln_.user_objective;
-    iter           = root_relax_soln_.iterations;
+    root_status = root_status_future.get();
+    root_relaxation_first_solution_t<i_t, f_t> ds_result;
+    ds_result.primal         = root_relax_soln.x;
+    ds_result.dual           = root_relax_soln.y;
+    ds_result.reduced_costs  = root_relax_soln.z;
+    ds_result.objective      = root_relax_soln.objective;
+    ds_result.user_objective = root_relax_soln.user_objective;
+    ds_result.iterations     = root_relax_soln.iterations;
+    if (lp_settings.on_first_lp_solution_available) {
+      lp_settings.on_first_lp_solution_available(ds_result);
+    }
+    user_objective = root_relax_soln.user_objective;
+    iter           = root_relax_soln.iterations;
     solver_name    = "Dual Simplex";
   }
 
diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index a13d5cedcf..825d89049d 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -35,6 +35,10 @@
 #include <functional>
 #include <vector>
 
+namespace cuopt::linear_programming::detail {
+template <typename i_t, typename f_t>
+class problem_t;
+}
 namespace cuopt::linear_programming::dual_simplex {
 
 enum class mip_status_t {
@@ -66,9 +70,12 @@ struct deterministic_diving_policy_t;
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
-  branch_and_bound_t(const user_problem_t<i_t, f_t>& user_problem,
-                     const simplex_solver_settings_t<i_t, f_t>& solver_settings,
-                     f_t start_time);
+  branch_and_bound_t(
+    const user_problem_t<i_t, f_t>& user_problem,
+    const simplex_solver_settings_t<i_t, f_t>& solver_settings,
+    f_t start_time,
+    cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr = nullptr,
+    i_t num_gpus                                                            = 1);
 
   // Set an initial guess based on the user_problem. This should be called before solve.
   void set_initial_guess(const std::vector<f_t>& user_guess) { guess_ = user_guess; }
@@ -193,6 +200,8 @@ class branch_and_bound_t {
   bool enable_concurrent_lp_root_solve_{false};
   std::atomic<int> root_concurrent_halt_{0};
   bool is_root_solution_set{false};
+  cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr_{nullptr};
+  i_t pdlp_root_num_gpus_{1};
 
   // Pseudocosts
   pseudo_costs_t<i_t, f_t> pc_;
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index 815e229232..b014823cda 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -202,6 +202,9 @@ struct simplex_solver_settings_t {
   std::function<void(const std::vector<f_t>&, f_t)> node_processed_callback;
   std::function<void()> heuristic_preemption_callback;
   std::function<void(std::vector<f_t>&, std::vector<f_t>&, f_t)> set_simplex_solution_callback;
+  // Called by B&B when first LP solution is available (PDLP/Barrier or dual simplex).
+  std::function<void(root_relaxation_first_solution_t<i_t, f_t> const&)>
+    on_first_lp_solution_available;
   mutable logger_t log;
   std::atomic<int>* concurrent_halt;  // if nullptr ignored, if !nullptr, 0 if solver should
                                       // continue, 1 if solver should halt
diff --git a/cpp/src/dual_simplex/types.hpp b/cpp/src/dual_simplex/types.hpp
index ea46a1f67e..6660a86f0a 100644
--- a/cpp/src/dual_simplex/types.hpp
+++ b/cpp/src/dual_simplex/types.hpp
@@ -9,6 +9,7 @@
 
 #include <cstdint>
 #include <limits>
+#include <vector>
 
 namespace cuopt::linear_programming::dual_simplex {
 
@@ -19,6 +20,18 @@ using float64_t = double;
 
 constexpr float64_t inf = std::numeric_limits<float64_t>::infinity();
 
+// First LP solution from either PDLP/Barrier or dual simplex; used to notify diversity manager
+// without B&B depending on PDLP types.
+template <typename i_t, typename f_t>
+struct root_relaxation_first_solution_t {
+  std::vector<f_t> primal;         // primal values (stored by B&B as the solution's x)
+  std::vector<f_t> dual;           // dual values (stored by B&B as the solution's y)
+  std::vector<f_t> reduced_costs;  // reduced costs (stored by B&B as the solution's z)
+  f_t objective{0};                // objective value in solver space
+  f_t user_objective{0};           // objective value in user space
+  i_t iterations{0};               // iteration/step count reported by the producing solver
+};
+
 // We return this constant to signal that a concurrent halt has occurred
 #define CONCURRENT_HALT_RETURN -2
 // We return this constant to signal that a time limit has occurred
diff --git a/cpp/src/mip_heuristics/CMakeLists.txt b/cpp/src/mip_heuristics/CMakeLists.txt
index a200d4265b..202ed94bd5 100644
--- a/cpp/src/mip_heuristics/CMakeLists.txt
+++ b/cpp/src/mip_heuristics/CMakeLists.txt
@@ -41,7 +41,8 @@ set(MIP_NON_LP_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/presolve/conflict_graph/clique_table.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump_kernels.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/fj_cpu.cu)
+  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/fj_cpu.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/root_lp.cu)
 
 # Choose which files to include based on build mode
 if(BUILD_LP_ONLY)
diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
index ed165fe610..7e223bdd98 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
@@ -409,7 +409,16 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
   bool bb_thread_solution_exists = simplex_solution_exists.load();
   if (bb_thread_solution_exists) {
     ls.lp_optimal_exists = true;
+  } else if (branch_and_bound_ptr != nullptr &&
+             branch_and_bound_ptr->enable_concurrent_lp_root_solve()) {
+    // B&B drives root relaxation; wait for its first solution (PDLP/Barrier or dual simplex).
+    // Do not clear first_solution_ready_ here: B&B may have set it already (lost wakeup).
+    std::unique_lock<std::mutex> lock(first_solution_mutex_);
+    first_solution_cv_.wait(lock, [this]() { return first_solution_ready_.load(); });
+    lock.unlock();
+    clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr);
   } else if (!fj_only_run) {
+    // Heuristics-only or non-concurrent: diversity manager runs LP solve
     convert_greater_to_less(*problem_ptr);
 
     f_t tolerance_divisor =
@@ -481,38 +490,6 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
       // to bring variables within the bounds
     }
 
-    // Send PDLP relaxed solution to branch and bound
-    if (problem_ptr->set_root_relaxation_solution_callback != nullptr) {
-      auto& d_primal_solution = lp_result.get_primal_solution();
-      auto& d_dual_solution   = lp_result.get_dual_solution();
-      auto& d_reduced_costs   = lp_result.get_reduced_cost();
-
-      std::vector<f_t> host_primal(d_primal_solution.size());
-      std::vector<f_t> host_dual(d_dual_solution.size());
-      std::vector<f_t> host_reduced_costs(d_reduced_costs.size());
-      raft::copy(host_primal.data(),
-                 d_primal_solution.data(),
-                 d_primal_solution.size(),
-                 problem_ptr->handle_ptr->get_stream());
-      raft::copy(host_dual.data(),
-                 d_dual_solution.data(),
-                 d_dual_solution.size(),
-                 problem_ptr->handle_ptr->get_stream());
-      raft::copy(host_reduced_costs.data(),
-                 d_reduced_costs.data(),
-                 d_reduced_costs.size(),
-                 problem_ptr->handle_ptr->get_stream());
-      problem_ptr->handle_ptr->sync_stream();
-
-      // PDLP returns user-space objective (it applies objective_scaling_factor internally)
-      auto user_obj   = lp_result.get_objective_value();
-      auto solver_obj = problem_ptr->get_solver_obj_from_user_obj(user_obj);
-      auto iterations = lp_result.get_additional_termination_information().number_of_steps_taken;
-      // Set for the B&B (param4 expects solver space, param5 expects user space)
-      problem_ptr->set_root_relaxation_solution_callback(
-        host_primal, host_dual, host_reduced_costs, solver_obj, user_obj, iterations);
-    }
-
     // in case the pdlp returned var boudns that are out of bounds
     clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr);
   }
@@ -851,6 +828,35 @@ std::pair<solution_t<i_t, f_t>, bool> diversity_manager_t<i_t, f_t>::recombine(
   return std::make_pair(solution_t<i_t, f_t>(a), false);
 }
 
+template <typename i_t, typename f_t>
+void diversity_manager_t<i_t, f_t>::on_first_lp_solution(
+  cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t> const& result)
+{
+  // Stores the first root LP solution handed over by B&B, then wakes the thread that is
+  // blocked in run_solver(). Only the primal and dual vectors and the user objective are
+  // consumed; the reduced costs carried by `result` are not used here.
+  {
+    std::lock_guard<std::mutex> solution_guard(relaxed_solution_mutex);
+    cuopt_assert(result.primal.size() == lp_optimal_solution.size(),
+                 "First LP solution primal size mismatch");
+    cuopt_assert(result.dual.size() == lp_dual_optimal_solution.size(),
+                 "First LP solution dual size mismatch");
+    const auto stream = problem_ptr->handle_ptr->get_stream();
+    raft::copy(lp_optimal_solution.data(), result.primal.data(), result.primal.size(), stream);
+    raft::copy(lp_dual_optimal_solution.data(), result.dual.data(), result.dual.size(), stream);
+    // Synchronize so both copies are complete before the solution is advertised as available.
+    problem_ptr->handle_ptr->sync_stream();
+    ls.lp_optimal_exists = true;
+    set_new_user_bound(result.user_objective);
+  }
+  {
+    // Set the flag while holding the waiter's mutex, then notify.
+    std::lock_guard<std::mutex> ready_guard(first_solution_mutex_);
+    first_solution_ready_.store(true, std::memory_order_release);
+    first_solution_cv_.notify_all();
+  }
+}
+
 template <typename i_t, typename f_t>
 void diversity_manager_t<i_t, f_t>::set_simplex_solution(const std::vector<f_t>& solution,
                                                          const std::vector<f_t>& dual_solution,
diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
index d4e24bdeaf..6dd53be52b 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
@@ -21,12 +21,17 @@
 #include <cuopt/linear_programming/mip/solver_settings.hpp>
 #include <cuopt/linear_programming/mip/solver_stats.hpp>
 
+#include <dual_simplex/types.hpp>
+
 #include <mip_heuristics/diversity/lns/rins.cuh>
 #include <mip_heuristics/local_search/local_search.cuh>
 #include <mip_heuristics/solution/solution.cuh>
 #include <mip_heuristics/solver.cuh>
 #include <utilities/timer.hpp>
 
+#include <condition_variable>
+#include <mutex>
+
 namespace cuopt::linear_programming::detail {
 
 template <typename i_t, typename f_t>
@@ -70,6 +75,11 @@ class diversity_manager_t {
                             const std::vector<f_t>& dual_solution,
                             f_t objective);
 
+  // Called by B&B when first LP solution is available (PDLP/Barrier or dual simplex).
+  void on_first_lp_solution(
+    cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t> const&
+      result);
+
   mip_solver_context_t<i_t, f_t>& context;
   dual_simplex::branch_and_bound_t<i_t, f_t>* branch_and_bound_ptr;
   problem_t<i_t, f_t>* problem_ptr;
@@ -97,6 +107,11 @@ class diversity_manager_t {
   // atomic for signalling pdlp to stop
   std::atomic<int> global_concurrent_halt{0};
 
+  // run_solver() waits on these until B&B calls on_first_lp_solution (concurrent root solve only).
+  std::mutex first_solution_mutex_;
+  std::condition_variable first_solution_cv_;
+  std::atomic<bool> first_solution_ready_{false};
+
   rins_t<i_t, f_t> rins;
 
   bool run_only_ls_recombiner{false};
diff --git a/cpp/src/mip_heuristics/root_lp.cu b/cpp/src/mip_heuristics/root_lp.cu
new file mode 100644
index 0000000000..d11fbda957
--- /dev/null
+++ b/cpp/src/mip_heuristics/root_lp.cu
@@ -0,0 +1,74 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#include <mip_heuristics/problem/problem.cuh>
+#include <mip_heuristics/problem/problem_helpers.cuh>
+#include "root_lp.cuh"
+
+#include <pdlp/pdlp.cuh>
+#include <pdlp/solve.cuh>
+
+#include <dual_simplex/types.hpp>
+#include <raft/core/copy.hpp>
+#include <utilities/timer.hpp>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>
+run_pdlp_barrier_for_root_lp(problem_t<i_t, f_t>* problem,
+                             f_t time_limit,
+                             std::atomic<int>* concurrent_halt,
+                             i_t num_gpus)
+{
+  // NOTE(review): presumably rewrites the caller's *problem in place - confirm before sharing it.
+  convert_greater_to_less(*problem);
+  // Both PDLP relative tolerances are set to absolute_tolerance / tolerance_divisor.
+  const auto& tol = problem->tolerances;
+  const f_t tolerance_divisor =
+    tol.absolute_tolerance / (tol.relative_tolerance > 0 ? tol.relative_tolerance : 1);
+  const auto lp_tolerance = tol.absolute_tolerance / tolerance_divisor;
+
+  pdlp_solver_settings_t<i_t, f_t> pdlp_settings{};
+  pdlp_settings.tolerances.relative_primal_tolerance = lp_tolerance;
+  pdlp_settings.tolerances.relative_dual_tolerance   = lp_tolerance;
+  pdlp_settings.time_limit                           = time_limit;
+  pdlp_settings.first_primal_feasible                = false;
+  pdlp_settings.concurrent_halt                      = concurrent_halt;
+  pdlp_settings.method                               = method_t::Concurrent;
+  pdlp_settings.inside_mip                           = true;
+  pdlp_settings.pdlp_solver_mode                     = pdlp_solver_mode_t::Stable2;
+  pdlp_settings.num_gpus                             = num_gpus;
+  pdlp_settings.presolver                            = presolver_t::None;
+
+  timer_t lp_timer(time_limit);
+  auto lp_result = solve_lp_with_method<i_t, f_t>(*problem, pdlp_settings, lp_timer);
+
+  // Bring the primal, dual and reduced-cost vectors into host std::vectors on one stream.
+  cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t> result;
+  const auto stream  = problem->handle_ptr->get_stream();
+  const auto to_host = [stream](std::vector<f_t>& dst, const auto& src) {
+    dst.resize(src.size());
+    raft::copy(dst.data(), src.data(), dst.size(), stream);
+  };
+  to_host(result.primal, lp_result.get_primal_solution());
+  to_host(result.dual, lp_result.get_dual_solution());
+  to_host(result.reduced_costs, lp_result.get_reduced_cost());
+  problem->handle_ptr->sync_stream();
+  result.objective      = problem->get_solver_obj_from_user_obj(lp_result.get_objective_value());
+  result.user_objective = lp_result.get_objective_value();
+  result.iterations     = lp_result.get_additional_termination_information().number_of_steps_taken;
+  return result;
+}
+
+template cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, double>
+run_pdlp_barrier_for_root_lp<int, double>(problem_t<int, double>*, double, std::atomic<int>*, int);
+
+template cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, float>
+run_pdlp_barrier_for_root_lp<int, float>(problem_t<int, float>*, float, std::atomic<int>*, int);
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/root_lp.cuh b/cpp/src/mip_heuristics/root_lp.cuh
new file mode 100644
index 0000000000..8683ebe820
--- /dev/null
+++ b/cpp/src/mip_heuristics/root_lp.cuh
@@ -0,0 +1,31 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <dual_simplex/types.hpp>
+
+#include <atomic>
+#include <cstdint>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+class problem_t;
+
+/**
+ * Run PDLP/Barrier for root LP (used by branch-and-bound when concurrent root solve is enabled).
+ * Implemented in root_lp.cu so GPU code (convert_greater_to_less, solve_lp_with_method) can run.
+ */
+template <typename i_t, typename f_t>
+cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>
+run_pdlp_barrier_for_root_lp(problem_t<i_t, f_t>* problem,
+                             f_t time_limit,
+                             std::atomic<int>* concurrent_halt,
+                             i_t num_gpus);
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index e6f6d50b62..42f7995ed6 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -257,17 +257,29 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
                   &solution_helper,
                   std::placeholders::_1,
                   std::placeholders::_2);
+
+      branch_and_bound_settings.on_first_lp_solution_available =
+        [&dm](dual_simplex::root_relaxation_first_solution_t<i_t, f_t> const& result) {
+          dm.on_first_lp_solution(result);
+        };
     }
 
     // Create the branch and bound object
-    branch_and_bound = std::make_unique<dual_simplex::branch_and_bound_t<i_t, f_t>>(
-      branch_and_bound_problem, branch_and_bound_settings, timer_.get_tic_start());
+    auto* mip_problem_ptr = (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC)
+                              ? context.problem_ptr
+                              : nullptr;
+    i_t num_gpus          = context.settings.num_gpus;
+    branch_and_bound =
+      std::make_unique<dual_simplex::branch_and_bound_t<i_t, f_t>>(branch_and_bound_problem,
+                                                                   branch_and_bound_settings,
+                                                                   timer_.get_tic_start(),
+                                                                   mip_problem_ptr,
+                                                                   num_gpus);
     context.branch_and_bound_ptr = branch_and_bound.get();
     auto* stats_ptr              = &context.stats;
     branch_and_bound->set_user_bound_callback(
       [stats_ptr](f_t user_bound) { stats_ptr->set_solution_bound(user_bound); });
 
-    // Set the primal heuristics -> branch and bound callback
     if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) {
       branch_and_bound->set_concurrent_lp_root_solve(true);
 

From 1b6d98a9996ec7d186f76c195cb296f5a58acef9 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Thu, 5 Mar 2026 18:45:06 -0800
Subject: [PATCH 03/30] Move branch and bound problem to inside branch and
 bound

---
 cpp/src/branch_and_bound/CMakeLists.txt       |  1 +
 cpp/src/branch_and_bound/branch_and_bound.cpp |  8 +++-----
 cpp/src/branch_and_bound/branch_and_bound.hpp | 19 ++++++++++++-------
 cpp/src/mip_heuristics/diversity/lns/rins.cu  |  6 ++----
 .../diversity/recombiners/sub_mip.cuh         |  6 ++----
 cpp/src/mip_heuristics/solver.cu              | 19 ++++---------------
 6 files changed, 24 insertions(+), 35 deletions(-)

diff --git a/cpp/src/branch_and_bound/CMakeLists.txt b/cpp/src/branch_and_bound/CMakeLists.txt
index 5bb1017120..9b04014fb7 100644
--- a/cpp/src/branch_and_bound/CMakeLists.txt
+++ b/cpp/src/branch_and_bound/CMakeLists.txt
@@ -5,6 +5,7 @@
 
 set(BRANCH_AND_BOUND_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/branch_and_bound.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/branch_and_bound_from_mip.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/mip_node.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/pseudo_costs.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/diving_heuristics.cpp
diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 1fe020667d..16a76537aa 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -244,9 +244,7 @@ template <typename i_t, typename f_t>
 branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   const user_problem_t<i_t, f_t>& user_problem,
   const simplex_solver_settings_t<i_t, f_t>& solver_settings,
-  f_t start_time,
-  cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr,
-  i_t num_gpus)
+  f_t start_time)
   : original_problem_(user_problem),
     settings_(solver_settings),
     original_lp_(user_problem.handle_ptr, 1, 1, 1),
@@ -256,8 +254,8 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
     root_crossover_soln_(1, 1),
     pc_(1),
     solver_status_(mip_status_t::UNSET),
-    mip_problem_ptr_(mip_problem_ptr),
-    pdlp_root_num_gpus_(num_gpus)
+    mip_problem_ptr_(nullptr),
+    pdlp_root_num_gpus_(1)
 {
   exploration_stats_.start_time = start_time;
 #ifdef PRINT_CONSTRAINT_MATRIX
diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index 825d89049d..909c57e0c8 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -70,12 +70,16 @@ struct deterministic_diving_policy_t;
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
-  branch_and_bound_t(
-    const user_problem_t<i_t, f_t>& user_problem,
-    const simplex_solver_settings_t<i_t, f_t>& solver_settings,
-    f_t start_time,
-    cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr = nullptr,
-    i_t num_gpus                                                            = 1);
+  /** Build from MIP problem_t (used by mip_heuristics). Implemented in
+   * branch_and_bound_from_mip.cu. */
+  branch_and_bound_t(cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr,
+                     const simplex_solver_settings_t<i_t, f_t>& solver_settings,
+                     f_t start_time,
+                     i_t num_gpus = 1);
+  /** Build from user_problem_t (used by dual_simplex/solve.cpp). */
+  branch_and_bound_t(const user_problem_t<i_t, f_t>& user_problem,
+                     const simplex_solver_settings_t<i_t, f_t>& solver_settings,
+                     f_t start_time);
 
   // Set an initial guess based on the user_problem. This should be called before solve.
   void set_initial_guess(const std::vector<f_t>& user_guess) { guess_ = user_guess; }
@@ -122,6 +126,7 @@ class branch_and_bound_t {
                        std::vector<f_t>& repaired_solution) const;
 
   f_t get_lower_bound();
+  i_t get_num_cols() const { return original_problem_.num_cols; }  // column count of the stored user problem (original_problem_)
   bool enable_concurrent_lp_root_solve() const { return enable_concurrent_lp_root_solve_; }
   std::atomic<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
   void set_root_concurrent_halt(int value) { root_concurrent_halt_ = value; }
@@ -146,7 +151,7 @@ class branch_and_bound_t {
   producer_sync_t& get_producer_sync() { return producer_sync_; }
 
  private:
-  const user_problem_t<i_t, f_t>& original_problem_;
+  user_problem_t<i_t, f_t> original_problem_;
   const simplex_solver_settings_t<i_t, f_t> settings_;
 
   work_limit_context_t work_unit_context_{"B&B"};
diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index 7fd8533f82..1d009b8fb7 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -248,12 +248,9 @@ void rins_t<i_t, f_t>::run_rins()
 
   // run sub-mip
   namespace dual_simplex = cuopt::linear_programming::dual_simplex;
-  dual_simplex::user_problem_t<i_t, f_t> branch_and_bound_problem(&rins_handle);
   dual_simplex::simplex_solver_settings_t<i_t, f_t> branch_and_bound_settings;
   dual_simplex::mip_solution_t<i_t, f_t> branch_and_bound_solution(1);
   dual_simplex::mip_status_t branch_and_bound_status = dual_simplex::mip_status_t::UNSET;
-  fixed_problem.get_host_user_problem(branch_and_bound_problem);
-  branch_and_bound_solution.resize(branch_and_bound_problem.num_cols);
   // Fill in the settings for branch and bound
   branch_and_bound_settings.time_limit = time_limit;
   // branch_and_bound_settings.node_limit = 5000 + node_count / 100;  // try harder as time goes
@@ -274,7 +271,8 @@ void rins_t<i_t, f_t>::run_rins()
     rins_solution_queue.push_back(solution);
   };
   dual_simplex::branch_and_bound_t<i_t, f_t> branch_and_bound(
-    branch_and_bound_problem, branch_and_bound_settings, dual_simplex::tic());
+    &fixed_problem, branch_and_bound_settings, dual_simplex::tic(), 1);
+  branch_and_bound_solution.resize(branch_and_bound.get_num_cols());
   branch_and_bound.set_initial_guess(cuopt::host_copy(fixed_assignment, rins_handle.get_stream()));
   branch_and_bound_status = branch_and_bound.solve(branch_and_bound_solution);
 
diff --git a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
index b2f7f80066..5b9821cc3f 100644
--- a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
+++ b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
@@ -95,10 +95,7 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
     if (run_sub_mip) {
       // run sub-mip
       namespace dual_simplex = cuopt::linear_programming::dual_simplex;
-      dual_simplex::user_problem_t<i_t, f_t> branch_and_bound_problem(offspring.handle_ptr);
       dual_simplex::simplex_solver_settings_t<i_t, f_t> branch_and_bound_settings;
-      fixed_problem.get_host_user_problem(branch_and_bound_problem);
-      branch_and_bound_solution.resize(branch_and_bound_problem.num_cols);
       // Fill in the settings for branch and bound
       branch_and_bound_settings.time_limit = sub_mip_recombiner_config_t::sub_mip_time_limit;
       branch_and_bound_settings.print_presolve_stats = false;
@@ -117,7 +114,8 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
       // disable B&B logs, so that it is not interfering with the main B&B thread
       branch_and_bound_settings.log.log = false;
       dual_simplex::branch_and_bound_t<i_t, f_t> branch_and_bound(
-        branch_and_bound_problem, branch_and_bound_settings, dual_simplex::tic());
+        &fixed_problem, branch_and_bound_settings, dual_simplex::tic(), 1);
+      branch_and_bound_solution.resize(branch_and_bound.get_num_cols());
       branch_and_bound_status = branch_and_bound.solve(branch_and_bound_solution);
       if (solution_vector.size() > 0) {
         cuopt_assert(fixed_assignment.size() == branch_and_bound_solution.x.size(),
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index 42f7995ed6..8d5cd813bf 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -180,13 +180,11 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 
   namespace dual_simplex = cuopt::linear_programming::dual_simplex;
   std::future<dual_simplex::mip_status_t> branch_and_bound_status_future;
-  dual_simplex::user_problem_t<i_t, f_t> branch_and_bound_problem(context.problem_ptr->handle_ptr);
   context.problem_ptr->recompute_objective_integrality();
   if (context.problem_ptr->is_objective_integral()) {
     CUOPT_LOG_INFO("Objective function is integral, scale %g",
                    context.problem_ptr->presolve_data.objective_scaling_factor);
   }
-  branch_and_bound_problem.objective_is_integral = context.problem_ptr->is_objective_integral();
   dual_simplex::simplex_solver_settings_t<i_t, f_t> branch_and_bound_settings;
   std::unique_ptr<dual_simplex::branch_and_bound_t<i_t, f_t>> branch_and_bound;
   branch_and_bound_solution_helper_t solution_helper(&dm, branch_and_bound_settings);
@@ -194,11 +192,6 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 
   bool run_bb = !context.settings.heuristics_only;
   if (run_bb) {
-    // Convert the presolved problem to dual_simplex::user_problem_t
-    op_problem_.get_host_user_problem(branch_and_bound_problem);
-    // Resize the solution now that we know the number of columns/variables
-    branch_and_bound_solution.resize(branch_and_bound_problem.num_cols);
-
     // Fill in the settings for branch and bound
     branch_and_bound_settings.time_limit           = timer_.get_time_limit();
     branch_and_bound_settings.node_limit           = context.settings.node_limit;
@@ -264,17 +257,13 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
         };
     }
 
-    // Create the branch and bound object
-    auto* mip_problem_ptr = (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC)
-                              ? context.problem_ptr
-                              : nullptr;
-    i_t num_gpus          = context.settings.num_gpus;
+    // Create the branch and bound object (builds user_problem from context.problem_ptr)
     branch_and_bound =
-      std::make_unique<dual_simplex::branch_and_bound_t<i_t, f_t>>(branch_and_bound_problem,
+      std::make_unique<dual_simplex::branch_and_bound_t<i_t, f_t>>(context.problem_ptr,
                                                                    branch_and_bound_settings,
                                                                    timer_.get_tic_start(),
-                                                                   mip_problem_ptr,
-                                                                   num_gpus);
+                                                                   context.settings.num_gpus);
+    branch_and_bound_solution.resize(branch_and_bound->get_num_cols());
     context.branch_and_bound_ptr = branch_and_bound.get();
     auto* stats_ptr              = &context.stats;
     branch_and_bound->set_user_bound_callback(

From e311548c0419eaa4df26bc7b96b713431018328c Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Fri, 6 Mar 2026 15:32:21 -0800
Subject: [PATCH 04/30] Add branch_and_bound_from_mip.cu to fix link errors

---
 .../branch_and_bound_from_mip.cu              | 106 ++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 cpp/src/branch_and_bound/branch_and_bound_from_mip.cu

diff --git a/cpp/src/branch_and_bound/branch_and_bound_from_mip.cu b/cpp/src/branch_and_bound/branch_and_bound_from_mip.cu
new file mode 100644
index 0000000000..d4e713ebff
--- /dev/null
+++ b/cpp/src/branch_and_bound/branch_and_bound_from_mip.cu
@@ -0,0 +1,106 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#include <branch_and_bound/branch_and_bound.hpp>
+#include <branch_and_bound/mip_node.hpp>
+#include <branch_and_bound/pseudo_costs.hpp>
+
+#include <mip_heuristics/problem/problem.cuh>
+
+#include <cuts/cuts.hpp>
+#include <dual_simplex/initial_basis.hpp>
+#include <dual_simplex/presolve.hpp>
+#include <dual_simplex/user_problem.hpp>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+namespace {
+template <typename i_t, typename f_t>  // Builds var_types for original_lp: the user problem's types, padded with CONTINUOUS.
+void full_variable_types(const user_problem_t<i_t, f_t>& original_problem,  // in: source of var_types and of the user column count
+                         const lp_problem_t<i_t, f_t>& original_lp,  // in: only num_cols is read
+                         std::vector<variable_type_t>& var_types)  // out: previous contents are overwritten
+{
+  var_types = original_problem.var_types;  // start from a copy of the user problem's types
+  if (original_lp.num_cols > original_problem.num_cols) {  // the LP has more columns than the user problem
+    var_types.resize(original_lp.num_cols);
+    for (i_t k = original_problem.num_cols; k < original_lp.num_cols; k++) {
+      var_types[k] = variable_type_t::CONTINUOUS;  // every extra LP column is typed CONTINUOUS (presumably added slacks - confirm)
+    }
+  }
+}
+}  // anonymous namespace
+
+template <typename i_t, typename f_t>
+branch_and_bound_t<i_t, f_t>::branch_and_bound_t(  // Constructs branch and bound directly from a MIP problem_t.
+  cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr,  // dereferenced without a null check below; must be non-null
+  const simplex_solver_settings_t<i_t, f_t>& solver_settings,
+  f_t start_time,
+  i_t num_gpus)  // stored in pdlp_root_num_gpus_
+  : original_problem_(mip_problem_ptr->handle_ptr),  // built from the handle only; filled by get_host_user_problem() in the body
+    settings_(solver_settings),
+    original_lp_(mip_problem_ptr->handle_ptr, 1, 1, 1),  // NOTE(review): the 1-sized arguments here and below look like placeholders - confirm
+    Arow_(1, 1, 0),
+    incumbent_(1),
+    root_relax_soln_(1, 1),
+    root_crossover_soln_(1, 1),
+    pc_(1),
+    solver_status_(mip_status_t::UNSET),
+    mip_problem_ptr_(mip_problem_ptr),  // raw pointer stored as-is; caller presumably keeps the problem alive - confirm
+    pdlp_root_num_gpus_(num_gpus)
+{
+  exploration_stats_.start_time = start_time;
+  mip_problem_ptr->recompute_objective_integrality();  // refresh the flag before it is read on the next line
+  original_problem_.objective_is_integral = mip_problem_ptr->is_objective_integral();  // NOTE(review): set before get_host_user_problem(); assumes that call leaves this field untouched - confirm
+  mip_problem_ptr->get_host_user_problem(original_problem_);  // populates original_problem_ from the MIP problem
+
+#ifdef PRINT_CONSTRAINT_MATRIX  // debug-only dump of the user constraint matrix
+  settings_.log.printf("A");
+  original_problem_.A.print_matrix();
+#endif
+
+  dualize_info_t<i_t, f_t> dualize_info;
+  convert_user_problem(original_problem_, settings_, original_lp_, new_slacks_, dualize_info);  // outputs: original_lp_, new_slacks_ (used below as column indices into original_lp_.A), dualize_info
+  full_variable_types(original_problem_, original_lp_, var_types_);  // var_types_: user types, with any extra LP columns marked CONTINUOUS
+
+#ifdef CHECK_SLACKS  // debug-only: each slack column must hold exactly one nonzero with |coefficient| == 1
+  assert(new_slacks_.size() == original_lp_.num_rows);  // one slack per LP row
+  for (i_t slack : new_slacks_) {
+    const i_t col_start = original_lp_.A.col_start[slack];
+    const i_t col_end   = original_lp_.A.col_start[slack + 1];
+    const i_t col_len   = col_end - col_start;
+    if (col_len != 1) {
+      settings_.log.printf("Slack %d has %d nzs\n", slack, col_len);
+      assert(col_len == 1);
+    }
+    const i_t i = original_lp_.A.i[col_start];  // row index of the slack's single entry
+    const f_t x = original_lp_.A.x[col_start];  // value of the slack's single entry
+    if (std::abs(x) != 1.0) {
+      settings_.log.printf("Slack %d row %d has non-unit coefficient %e\n", slack, i, x);
+      assert(std::abs(x) == 1.0);
+    }
+  }
+#endif
+
+  upper_bound_    = inf;
+  root_objective_ = std::numeric_limits<f_t>::quiet_NaN();  // NaN presumably marks "root relaxation not solved yet" - confirm
+}
+
+template branch_and_bound_t<int, double>::branch_and_bound_t(  // explicit instantiation of the problem_t constructor for <int, double>
+  cuopt::linear_programming::detail::problem_t<int, double>*,
+  const simplex_solver_settings_t<int, double>&,
+  double,
+  int);
+
+#ifdef MIP_INSTANTIATION_FLOAT  // the <int, float> variant is emitted only when this macro is defined
+template branch_and_bound_t<int, float>::branch_and_bound_t(
+  cuopt::linear_programming::detail::problem_t<int, float>*,
+  const simplex_solver_settings_t<int, float>&,
+  float,
+  int);
+#endif
+
+}  // namespace cuopt::linear_programming::dual_simplex

From c522274671af93a24c93e1776cab9525f91770ae Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Fri, 6 Mar 2026 15:40:52 -0800
Subject: [PATCH 05/30] Fix link errors in float template instantiations

---
 cpp/src/dual_simplex/presolve.cpp | 60 +++++++++++++++++++++++++++++++
 cpp/src/mip_heuristics/root_lp.cu |  3 +-
 2 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp
index b9ee419517..e8af7ba514 100644
--- a/cpp/src/dual_simplex/presolve.cpp
+++ b/cpp/src/dual_simplex/presolve.cpp
@@ -7,6 +7,8 @@
 
 #include <dual_simplex/presolve.hpp>
 
+#include <cuopt/linear_programming/constants.h>
+
 #include <dual_simplex/bounds_strengthening.hpp>
 #include <dual_simplex/folding.hpp>
 #include <dual_simplex/right_looking_lu.hpp>
@@ -1571,4 +1573,62 @@ template void uncrush_solution<int, double>(const presolve_info_t<int, double>&
 
 #endif
 
+#if CUOPT_INSTANTIATE_FLOAT  // NOTE(review): branch_and_bound_from_mip.cu and root_lp.cu guard their float instantiations with MIP_INSTANTIATION_FLOAT instead - confirm both macros are always set together
+
+template void convert_user_problem<int, float>(  // explicit <int, float> instantiations of the presolve entry points follow
+  const user_problem_t<int, float>& user_problem,
+  const simplex_solver_settings_t<int, float>& settings,
+  lp_problem_t<int, float>& problem,
+  std::vector<int>& new_slacks,
+  dualize_info_t<int, float>& dualize_info);
+
+template void convert_user_lp_with_guess<int, float>(
+  const user_problem_t<int, float>& user_problem,
+  const lp_solution_t<int, float>& initial_solution,
+  const std::vector<float>& initial_slack,
+  lp_problem_t<int, float>& lp,
+  lp_solution_t<int, float>& converted_solution);
+
+template int presolve<int, float>(const lp_problem_t<int, float>& original,
+                                  const simplex_solver_settings_t<int, float>& settings,
+                                  lp_problem_t<int, float>& presolved,
+                                  presolve_info_t<int, float>& presolve_info);
+
+template void crush_primal_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                                const lp_problem_t<int, float>& problem,
+                                                const std::vector<float>& user_solution,
+                                                const std::vector<int>& new_slacks,
+                                                std::vector<float>& solution);
+
+template float crush_dual_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                               const lp_problem_t<int, float>& problem,
+                                               const std::vector<int>& new_slacks,
+                                               const std::vector<float>& user_y,
+                                               const std::vector<float>& user_z,
+                                               std::vector<float>& y,
+                                               std::vector<float>& z);
+
+template void uncrush_primal_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                                  const lp_problem_t<int, float>& problem,
+                                                  const std::vector<float>& solution,
+                                                  std::vector<float>& user_solution);
+
+template void uncrush_dual_solution<int, float>(const user_problem_t<int, float>& user_problem,
+                                                const lp_problem_t<int, float>& problem,
+                                                const std::vector<float>& y,
+                                                const std::vector<float>& z,
+                                                std::vector<float>& user_y,
+                                                std::vector<float>& user_z);
+
+template void uncrush_solution<int, float>(const presolve_info_t<int, float>& presolve_info,
+                                           const simplex_solver_settings_t<int, float>& settings,
+                                           const std::vector<float>& crushed_x,
+                                           const std::vector<float>& crushed_y,
+                                           const std::vector<float>& crushed_z,
+                                           std::vector<float>& uncrushed_x,
+                                           std::vector<float>& uncrushed_y,
+                                           std::vector<float>& uncrushed_z);
+
+#endif
+
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/mip_heuristics/root_lp.cu b/cpp/src/mip_heuristics/root_lp.cu
index d11fbda957..8e3346286b 100644
--- a/cpp/src/mip_heuristics/root_lp.cu
+++ b/cpp/src/mip_heuristics/root_lp.cu
@@ -68,7 +68,8 @@ run_pdlp_barrier_for_root_lp(problem_t<i_t, f_t>* problem,
 template cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, double>
 run_pdlp_barrier_for_root_lp<int, double>(problem_t<int, double>*, double, std::atomic<int>*, int);
 
+#ifdef MIP_INSTANTIATION_FLOAT
 template cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, float>
 run_pdlp_barrier_for_root_lp<int, float>(problem_t<int, float>*, float, std::atomic<int>*, int);
-
+#endif
 }  // namespace cuopt::linear_programming::detail

From 55b2afb2d54a10d1e90facab9702162e77e50307 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 9 Mar 2026 15:21:53 -0700
Subject: [PATCH 06/30] Fix sync issues

---
 cpp/src/branch_and_bound/branch_and_bound.cpp | 143 ++++++++----------
 1 file changed, 65 insertions(+), 78 deletions(-)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 16a76537aa..1b9f370cc5 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -1801,80 +1801,61 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
   i_t iter                = 0;
   std::string solver_name = "";
 
-  // Root node path
+  // Launch dual simplex on a background thread (it may be halted later if PDLP+crossover wins).
   lp_status_t root_status;
-  std::future<lp_status_t> root_status_future;
-  root_status_future = std::async(std::launch::async,
-                                  &solve_linear_program_with_advanced_basis<i_t, f_t>,
-                                  std::ref(original_lp_),
-                                  exploration_stats_.start_time,
-                                  std::ref(lp_settings),
-                                  std::ref(root_relax_soln),
-                                  std::ref(basis_update),
-                                  std::ref(basic_list),
-                                  std::ref(nonbasic_list),
-                                  std::ref(root_vstatus),
-                                  std::ref(edge_norms),
-                                  nullptr);
-
-  std::optional<std::future<root_relaxation_first_solution_t<i_t, f_t>>> pdlp_future_opt;
-  if (enable_concurrent_lp_root_solve_ && mip_problem_ptr_ != nullptr) {
-    root_crossover_solution_set_.store(false, std::memory_order_release);
-    pdlp_future_opt =
-      std::async(std::launch::async,
-                 &cuopt::linear_programming::detail::run_pdlp_barrier_for_root_lp<i_t, f_t>,
-                 mip_problem_ptr_,
-                 lp_settings.time_limit,
-                 get_root_concurrent_halt(),
-                 pdlp_root_num_gpus_);
-  }
-
-  // Wait for first completion: PDLP/Barrier future, dual simplex future, or legacy callback
-  while (*get_root_concurrent_halt() == 0) {
-    bool pdlp_ready =
-      pdlp_future_opt && pdlp_future_opt->valid() &&
-      pdlp_future_opt->wait_for(std::chrono::milliseconds(0)) == std::future_status::ready;
-    bool ds_ready =
-      root_status_future.wait_for(std::chrono::milliseconds(0)) == std::future_status::ready;
-    if (root_crossover_solution_set_.load(std::memory_order_acquire) || pdlp_ready || ds_ready) {
-      break;
-    }
-    std::this_thread::sleep_for(std::chrono::milliseconds(1));
-  }
+  std::future<lp_status_t> root_status_future =
+    std::async(std::launch::async,
+               &solve_linear_program_with_advanced_basis<i_t, f_t>,
+               std::ref(original_lp_),
+               exploration_stats_.start_time,
+               std::ref(lp_settings),
+               std::ref(root_relax_soln),
+               std::ref(basis_update),
+               std::ref(basic_list),
+               std::ref(nonbasic_list),
+               std::ref(root_vstatus),
+               std::ref(edge_norms),
+               nullptr);
+
+  const auto wait_timeout_s = static_cast<long long>(std::max(600.0, 2.0 * lp_settings.time_limit));
+  const auto wait_timeout   = std::chrono::seconds(wait_timeout_s);
 
   bool use_pdlp_path = false;
-  if (pdlp_future_opt && pdlp_future_opt->valid() &&
-      pdlp_future_opt->wait_for(std::chrono::milliseconds(0)) == std::future_status::ready) {
-    auto result                         = pdlp_future_opt->get();
-    root_crossover_soln_.x              = result.primal;
-    root_crossover_soln_.y              = result.dual;
-    root_crossover_soln_.z              = result.reduced_costs;
-    root_crossover_soln_.objective      = result.objective;
-    root_crossover_soln_.user_objective = result.user_objective;
-    root_crossover_soln_.iterations     = result.iterations;
-    root_objective_                     = result.objective;
-    root_crossover_solution_set_.store(true, std::memory_order_release);
-    if (lp_settings.on_first_lp_solution_available) {
-      lp_settings.on_first_lp_solution_available(result);
-    }
-    use_pdlp_path = true;
-  }
 
-  if (!use_pdlp_path && root_crossover_solution_set_.load(std::memory_order_acquire)) {
-    // Legacy path: set_root_relaxation_solution was invoked
-    root_relaxation_first_solution_t<i_t, f_t> legacy_result;
-    legacy_result.primal         = root_crossover_soln_.x;
-    legacy_result.dual           = root_crossover_soln_.y;
-    legacy_result.reduced_costs  = root_crossover_soln_.z;
-    legacy_result.objective      = root_crossover_soln_.objective;
-    legacy_result.user_objective = root_crossover_soln_.user_objective;
-    legacy_result.iterations     = root_crossover_soln_.iterations;
-    if (lp_settings.on_first_lp_solution_available) {
-      lp_settings.on_first_lp_solution_available(legacy_result);
+  if (enable_concurrent_lp_root_solve_ && mip_problem_ptr_ != nullptr) {
+    if (root_crossover_solution_set_.load(std::memory_order_acquire)) {
+      // Legacy path: set_root_relaxation_solution was already invoked (e.g. by diversity manager).
+      root_relaxation_first_solution_t<i_t, f_t> legacy_result;
+      legacy_result.primal         = root_crossover_soln_.x;
+      legacy_result.dual           = root_crossover_soln_.y;
+      legacy_result.reduced_costs  = root_crossover_soln_.z;
+      legacy_result.objective      = root_crossover_soln_.objective;
+      legacy_result.user_objective = root_crossover_soln_.user_objective;
+      legacy_result.iterations     = root_crossover_soln_.iterations;
+      if (lp_settings.on_first_lp_solution_available) {
+        lp_settings.on_first_lp_solution_available(legacy_result);
+      }
+      use_pdlp_path = true;
+    } else {
+      // Run PDLP/Barrier on the main thread, then crossover on the main thread.
+      auto result = cuopt::linear_programming::detail::run_pdlp_barrier_for_root_lp<i_t, f_t>(
+        mip_problem_ptr_, lp_settings.time_limit, get_root_concurrent_halt(), pdlp_root_num_gpus_);
+      root_crossover_soln_.x              = result.primal;
+      root_crossover_soln_.y              = result.dual;
+      root_crossover_soln_.z              = result.reduced_costs;
+      root_crossover_soln_.objective      = result.objective;
+      root_crossover_soln_.user_objective = result.user_objective;
+      root_crossover_soln_.iterations     = result.iterations;
+      root_objective_                     = result.objective;
+      root_crossover_solution_set_.store(true, std::memory_order_release);
+      if (lp_settings.on_first_lp_solution_available) {
+        lp_settings.on_first_lp_solution_available(result);
+      }
+      use_pdlp_path = true;
     }
   }
 
-  if (use_pdlp_path || root_crossover_solution_set_.load(std::memory_order_acquire)) {
+  if (use_pdlp_path) {
     // Crush the root relaxation solution on converted user problem
     std::vector<f_t> crushed_root_x;
     crush_primal_solution(
@@ -1907,9 +1888,13 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
 
     // Check if crossover was stopped by dual simplex
     if (crossover_status == crossover_status_t::OPTIMAL) {
-      set_root_concurrent_halt(1);             // Stop dual simplex
-      root_status = root_status_future.get();  // Wait for dual simplex to finish
-      set_root_concurrent_halt(0);             // Clear the concurrent halt flag
+      set_root_concurrent_halt(1);  // Stop dual simplex
+      if (root_status_future.wait_for(wait_timeout) == std::future_status::ready) {
+        root_status = root_status_future.get();
+      } else {
+        root_status = lp_status_t::OPTIMAL;
+      }
+      set_root_concurrent_halt(0);  // Clear the concurrent halt flag
       // Override the root relaxation solution with the crossover solution
       root_relax_soln = root_crossover_soln_;
       root_vstatus    = crossover_vstatus_;
@@ -1926,14 +1911,9 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
         }
       }
       if (basic_list.size() != original_lp_.num_rows) {
-        settings_.log.printf(
-          "basic_list size %d != m %d\n", basic_list.size(), original_lp_.num_rows);
         assert(basic_list.size() == original_lp_.num_rows);
       }
       if (nonbasic_list.size() != original_lp_.num_cols - original_lp_.num_rows) {
-        settings_.log.printf("nonbasic_list size %d != n - m %d\n",
-                             nonbasic_list.size(),
-                             original_lp_.num_cols - original_lp_.num_rows);
         assert(nonbasic_list.size() == original_lp_.num_cols - original_lp_.num_rows);
       }
       // Populate the basis_update from the crossover vstatus
@@ -1946,7 +1926,6 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
                                                         nonbasic_list,
                                                         crossover_vstatus_);
       if (refactor_status != 0) {
-        settings_.log.printf("Failed to refactor basis. %d deficient columns.\n", refactor_status);
         assert(refactor_status == 0);
         root_status = lp_status_t::NUMERICAL_ISSUES;
       }
@@ -1958,13 +1937,21 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
       iter           = root_crossover_soln_.iterations;
       solver_name    = "Barrier/PDLP and Crossover";
     } else {
-      root_status    = root_status_future.get();
+      if (root_status_future.wait_for(wait_timeout) == std::future_status::ready) {
+        root_status = root_status_future.get();
+      } else {
+        root_status = lp_status_t::TIME_LIMIT;
+      }
       user_objective = root_relax_soln_.user_objective;
       iter           = root_relax_soln_.iterations;
       solver_name    = "Dual Simplex";
     }
   } else {
-    root_status = root_status_future.get();
+    if (root_status_future.wait_for(wait_timeout) == std::future_status::ready) {
+      root_status = root_status_future.get();
+    } else {
+      root_status = lp_status_t::TIME_LIMIT;
+    }
     root_relaxation_first_solution_t<i_t, f_t> ds_result;
     ds_result.primal         = root_relax_soln.x;
     ds_result.dual           = root_relax_soln.y;

From 1d213bda8e2e4557aaa121b967ebed05c701fcdf Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Tue, 10 Mar 2026 06:32:20 -0700
Subject: [PATCH 07/30] remove stale code

---
 cpp/src/branch_and_bound/branch_and_bound.cpp | 43 ++++++-------------
 cpp/src/branch_and_bound/branch_and_bound.hpp | 22 ----------
 cpp/src/mip_heuristics/problem/problem.cu     |  5 +--
 cpp/src/mip_heuristics/problem/problem.cuh    |  3 --
 cpp/src/mip_heuristics/solver.cu              | 10 -----
 5 files changed, 14 insertions(+), 69 deletions(-)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 1b9f370cc5..ce480c8982 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -1823,36 +1823,20 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
   bool use_pdlp_path = false;
 
   if (enable_concurrent_lp_root_solve_ && mip_problem_ptr_ != nullptr) {
-    if (root_crossover_solution_set_.load(std::memory_order_acquire)) {
-      // Legacy path: set_root_relaxation_solution was already invoked (e.g. by diversity manager).
-      root_relaxation_first_solution_t<i_t, f_t> legacy_result;
-      legacy_result.primal         = root_crossover_soln_.x;
-      legacy_result.dual           = root_crossover_soln_.y;
-      legacy_result.reduced_costs  = root_crossover_soln_.z;
-      legacy_result.objective      = root_crossover_soln_.objective;
-      legacy_result.user_objective = root_crossover_soln_.user_objective;
-      legacy_result.iterations     = root_crossover_soln_.iterations;
-      if (lp_settings.on_first_lp_solution_available) {
-        lp_settings.on_first_lp_solution_available(legacy_result);
-      }
-      use_pdlp_path = true;
-    } else {
-      // Run PDLP/Barrier on the main thread, then crossover on the main thread.
-      auto result = cuopt::linear_programming::detail::run_pdlp_barrier_for_root_lp<i_t, f_t>(
-        mip_problem_ptr_, lp_settings.time_limit, get_root_concurrent_halt(), pdlp_root_num_gpus_);
-      root_crossover_soln_.x              = result.primal;
-      root_crossover_soln_.y              = result.dual;
-      root_crossover_soln_.z              = result.reduced_costs;
-      root_crossover_soln_.objective      = result.objective;
-      root_crossover_soln_.user_objective = result.user_objective;
-      root_crossover_soln_.iterations     = result.iterations;
-      root_objective_                     = result.objective;
-      root_crossover_solution_set_.store(true, std::memory_order_release);
-      if (lp_settings.on_first_lp_solution_available) {
-        lp_settings.on_first_lp_solution_available(result);
-      }
-      use_pdlp_path = true;
+    // Run PDLP/Barrier on the main thread, then crossover on the main thread.
+    auto result = cuopt::linear_programming::detail::run_pdlp_barrier_for_root_lp<i_t, f_t>(
+      mip_problem_ptr_, lp_settings.time_limit, get_root_concurrent_halt(), pdlp_root_num_gpus_);
+    root_crossover_soln_.x              = result.primal;
+    root_crossover_soln_.y              = result.dual;
+    root_crossover_soln_.z              = result.reduced_costs;
+    root_crossover_soln_.objective      = result.objective;
+    root_crossover_soln_.user_objective = result.user_objective;
+    root_crossover_soln_.iterations     = result.iterations;
+    root_objective_                     = result.objective;
+    if (lp_settings.on_first_lp_solution_available) {
+      lp_settings.on_first_lp_solution_available(result);
     }
+    use_pdlp_path = true;
   }
 
   if (use_pdlp_path) {
@@ -1980,7 +1964,6 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
   }
 
   settings_.log.printf("\n");
-  is_root_solution_set = true;
 
   return root_status;
 }
diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index 909c57e0c8..de7b8455e9 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -84,26 +84,6 @@ class branch_and_bound_t {
   // Set an initial guess based on the user_problem. This should be called before solve.
   void set_initial_guess(const std::vector<f_t>& user_guess) { guess_ = user_guess; }
 
-  // Set the root solution found by PDLP
-  void set_root_relaxation_solution(const std::vector<f_t>& primal,
-                                    const std::vector<f_t>& dual,
-                                    const std::vector<f_t>& reduced_costs,
-                                    f_t objective,
-                                    f_t user_objective,
-                                    i_t iterations)
-  {
-    if (!is_root_solution_set) {
-      root_crossover_soln_.x              = primal;
-      root_crossover_soln_.y              = dual;
-      root_crossover_soln_.z              = reduced_costs;
-      root_objective_                     = objective;
-      root_crossover_soln_.objective      = objective;
-      root_crossover_soln_.user_objective = user_objective;
-      root_crossover_soln_.iterations     = iterations;
-      root_crossover_solution_set_.store(true, std::memory_order_release);
-    }
-  }
-
   // Set a solution based on the user problem during the course of the solve
   void set_new_solution(const std::vector<f_t>& solution);
 
@@ -201,10 +181,8 @@ class branch_and_bound_t {
   lp_solution_t<i_t, f_t> root_relax_soln_;
   lp_solution_t<i_t, f_t> root_crossover_soln_;
   std::vector<f_t> edge_norms_;
-  std::atomic<bool> root_crossover_solution_set_{false};
   bool enable_concurrent_lp_root_solve_{false};
   std::atomic<int> root_concurrent_halt_{0};
-  bool is_root_solution_set{false};
   cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr_{nullptr};
   i_t pdlp_root_num_gpus_{1};
 
diff --git a/cpp/src/mip_heuristics/problem/problem.cu b/cpp/src/mip_heuristics/problem/problem.cu
index bc93a9d988..18d8e7ded0 100644
--- a/cpp/src/mip_heuristics/problem/problem.cu
+++ b/cpp/src/mip_heuristics/problem/problem.cu
@@ -148,8 +148,7 @@ problem_t<i_t, f_t>::problem_t(
     Q_values(problem_.get_quadratic_objective_values())
 {
   op_problem_cstr_body(problem_);
-  branch_and_bound_callback             = nullptr;
-  set_root_relaxation_solution_callback = nullptr;
+  branch_and_bound_callback = nullptr;
 }
 
 template <typename i_t, typename f_t>
@@ -161,7 +160,6 @@ problem_t<i_t, f_t>::problem_t(const problem_t<i_t, f_t>& problem_)
     integer_fixed_problem(problem_.integer_fixed_problem),
     integer_fixed_variable_map(problem_.integer_fixed_variable_map, handle_ptr->get_stream()),
     branch_and_bound_callback(nullptr),
-    set_root_relaxation_solution_callback(nullptr),
     n_variables(problem_.n_variables),
     n_constraints(problem_.n_constraints),
     n_binary_vars(problem_.n_binary_vars),
@@ -217,7 +215,6 @@ problem_t<i_t, f_t>::problem_t(const problem_t<i_t, f_t>& problem_,
     integer_fixed_problem(problem_.integer_fixed_problem),
     integer_fixed_variable_map(problem_.integer_fixed_variable_map, handle_ptr->get_stream()),
     branch_and_bound_callback(nullptr),
-    set_root_relaxation_solution_callback(nullptr),
     n_variables(problem_.n_variables),
     n_constraints(problem_.n_constraints),
     n_binary_vars(problem_.n_binary_vars),
diff --git a/cpp/src/mip_heuristics/problem/problem.cuh b/cpp/src/mip_heuristics/problem/problem.cuh
index b9ca420820..489ad424f4 100644
--- a/cpp/src/mip_heuristics/problem/problem.cuh
+++ b/cpp/src/mip_heuristics/problem/problem.cuh
@@ -236,9 +236,6 @@ class problem_t {
   rmm::device_uvector<i_t> integer_fixed_variable_map;
 
   std::function<void(const std::vector<f_t>&)> branch_and_bound_callback;
-  std::function<void(
-    const std::vector<f_t>&, const std::vector<f_t>&, const std::vector<f_t>&, f_t, f_t, i_t)>
-    set_root_relaxation_solution_callback;
 
   typename mip_solver_settings_t<i_t, f_t>::tolerances_t tolerances{};
   i_t n_variables{0};
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index 8d5cd813bf..1257aa6409 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -288,16 +288,6 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     context.work_unit_scheduler_.register_context(branch_and_bound->get_work_unit_context());
     // context.work_unit_scheduler_.verbose = true;
 
-    context.problem_ptr->set_root_relaxation_solution_callback =
-      std::bind(&dual_simplex::branch_and_bound_t<i_t, f_t>::set_root_relaxation_solution,
-                branch_and_bound.get(),
-                std::placeholders::_1,
-                std::placeholders::_2,
-                std::placeholders::_3,
-                std::placeholders::_4,
-                std::placeholders::_5,
-                std::placeholders::_6);
-
     if (timer_.check_time_limit()) {
       CUOPT_LOG_INFO("Time limit reached during B&B setup");
       solution_t<i_t, f_t> sol(*context.problem_ptr);

From 9937868924c872e3a4850e74f2eceb25ba55cf4c Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Thu, 19 Mar 2026 07:38:32 -0700
Subject: [PATCH 08/30] Launch 3 threads, one for dual simplex, one for
 PDLP+crossover, one for Barrier+crossover

---
 .../pdlp/solver_settings.hpp                  |   2 +
 .../pdlp/solver_solution.hpp                  |   1 +
 cpp/src/branch_and_bound/branch_and_bound.cpp | 333 ++++++++++++------
 cpp/src/branch_and_bound/branch_and_bound.hpp |  22 +-
 .../branch_and_bound_from_mip.cu              |   1 -
 cpp/src/dual_simplex/phase2.cpp               |   4 -
 cpp/src/mip_heuristics/root_lp.cu             | 194 ++++++++--
 cpp/src/mip_heuristics/root_lp.cuh            |  49 ++-
 cpp/src/pdlp/pdlp.cu                          |   6 +-
 cpp/src/pdlp/solve.cu                         |  19 +-
 cpp/src/pdlp/solver_solution.cu               |   6 +
 11 files changed, 486 insertions(+), 151 deletions(-)

diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index f6ad4c8619..5de5489576 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -248,6 +248,8 @@ class pdlp_solver_settings_t {
   bool inside_mip{false};
   // For concurrent termination
   std::atomic<int>* concurrent_halt{nullptr};
+  /** If true, solver does not set concurrent_halt; caller sets it after crossover. */
+  bool halt_set_by_caller{false};
   static constexpr f_t minimal_absolute_tolerance = 1.0e-12;
   pdlp_hyper_params::pdlp_hyper_params_t hyper_params;
   // Holds the information of new variable lower and upper bounds for each climber in the format:
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_solution.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_solution.hpp
index 45a47e7401..9bd5796a89 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_solution.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_solution.hpp
@@ -235,6 +235,7 @@ class optimization_problem_solution_t : public base_solution_t {
    * @return rmm::device_uvector<i_t> The device memory container for the reduced cost.
    */
   rmm::device_uvector<f_t>& get_reduced_cost();
+  const rmm::device_uvector<f_t>& get_reduced_cost() const;
 
   /**
    * @brief Get termination reason
diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index ce480c8982..3cf273545e 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -32,12 +32,16 @@
 #include <algorithm>
 #include <chrono>
 #include <cmath>
+#include <condition_variable>
 #include <cstdio>
 #include <cstdlib>
 #include <deque>
+#include <functional>
 #include <future>
 #include <limits>
 #include <map>
+#include <memory>
+#include <mutex>
 #include <optional>
 #include <string>
 #include <thread>
@@ -251,7 +255,6 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
     Arow_(1, 1, 0),
     incumbent_(1),
     root_relax_soln_(1, 1),
-    root_crossover_soln_(1, 1),
     pc_(1),
     solver_status_(mip_status_t::UNSET),
     mip_problem_ptr_(nullptr),
@@ -1786,6 +1789,84 @@ void branch_and_bound_t<i_t, f_t>::single_threaded_solve()
   }
 }
 
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::run_concurrent_pdlp_and_barrier_with_crossover(
+  const simplex_solver_settings_t<i_t, f_t>& lp_settings,  // time_limit and first-solution callback
+  crossover_status_t& crossover_status_out,
+  lp_solution_t<i_t, f_t>& winner_crossover_soln_out,
+  std::vector<variable_status_t>& winner_crossover_vstatus_out,
+  f_t& winner_root_objective_out,
+  std::string& winner_solver_name_out,
+  std::atomic<int>& winner,
+  std::mutex* first_solver_mutex,
+  bool* first_solver_callback_done,
+  std::thread& pdlp_thread_out,     // receives the started PDLP thread; not joined here
+  std::thread& barrier_thread_out)  // receives the started Barrier thread; not joined here
+{
+  // Starts the PDLP and Barrier root solves on two threads; each hands its result to
+  // do_crush_crossover. Threads are returned unjoined. winner: 0=none, 1=dual, 2=PDLP, 3=Barrier.
+  struct concurrent_shared_state_t {
+    std::mutex first_result_mutex;
+  };
+  auto shared = std::make_shared<concurrent_shared_state_t>();  // co-owned by both thread lambdas
+
+  auto do_crush_crossover = [this,  // by-reference captures below must outlive both threads
+                             &lp_settings,
+                             &crossover_status_out,
+                             &winner_crossover_soln_out,
+                             &winner_crossover_vstatus_out,
+                             &winner_root_objective_out,
+                             &winner_solver_name_out,
+                             &winner,
+                             first_solver_mutex,
+                             first_solver_callback_done,
+                             shared](const root_relaxation_first_solution_t<i_t, f_t>& result,
+                                     const char* solver_name,
+                                     int winner_id) {
+    return cuopt::linear_programming::detail::run_crush_crossover_and_maybe_win<i_t, f_t>(
+      result,
+      original_problem_,
+      original_lp_,
+      new_slacks_,
+      settings_,
+      exploration_stats_.start_time,
+      get_root_concurrent_halt(),
+      [this]() { set_root_concurrent_halt(1); },  // callable that sets the root halt flag to 1
+      lp_settings.on_first_lp_solution_available,
+      first_solver_mutex,
+      first_solver_callback_done,
+      &shared->first_result_mutex,  // same mutex object for the PDLP and Barrier threads
+      &winner,
+      winner_id,
+      &crossover_status_out,
+      &winner_crossover_soln_out,
+      &winner_crossover_vstatus_out,
+      &winner_root_objective_out,
+      solver_name,
+      &winner_solver_name_out);
+  };
+
+  pdlp_thread_out = std::thread([this, &lp_settings, do_crush_crossover]() {  // lambda copied in
+    auto result = cuopt::linear_programming::detail::run_solver_for_root_lp<i_t, f_t>(
+      mip_problem_ptr_,  // NOTE(review): same problem object is also passed by the Barrier thread
+      lp_settings.time_limit,
+      get_root_concurrent_halt(),
+      pdlp_root_num_gpus_,
+      cuopt::linear_programming::method_t::PDLP);
+    (void)do_crush_crossover(result, "PDLP", 2);  // winner id 2 = PDLP; return value ignored
+  });
+
+  barrier_thread_out = std::thread([this, &lp_settings, do_crush_crossover]() {  // lambda copied in
+    auto result = cuopt::linear_programming::detail::run_solver_for_root_lp<i_t, f_t>(
+      mip_problem_ptr_,  // NOTE(review): same problem object is also passed by the PDLP thread
+      lp_settings.time_limit,
+      get_root_concurrent_halt(),
+      pdlp_root_num_gpus_,
+      cuopt::linear_programming::method_t::Barrier);
+    (void)do_crush_crossover(result, "Barrier", 3);  // winner id 3 = Barrier; return value ignored
+  });
+}
+
 template <typename i_t, typename f_t>
 lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
   simplex_solver_settings_t<i_t, f_t> const& lp_settings,
@@ -1801,94 +1882,148 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
   i_t iter                = 0;
   std::string solver_name = "";
 
-  // Launch dual simplex on a background thread (it may be halted later if PDLP+crossover wins).
-  lp_status_t root_status;
-  std::future<lp_status_t> root_status_future =
-    std::async(std::launch::async,
-               &solve_linear_program_with_advanced_basis<i_t, f_t>,
-               std::ref(original_lp_),
-               exploration_stats_.start_time,
-               std::ref(lp_settings),
-               std::ref(root_relax_soln),
-               std::ref(basis_update),
-               std::ref(basic_list),
-               std::ref(nonbasic_list),
-               std::ref(root_vstatus),
-               std::ref(edge_norms),
-               nullptr);
-
-  const auto wait_timeout_s = static_cast<long long>(std::max(600.0, 2.0 * lp_settings.time_limit));
-  const auto wait_timeout   = std::chrono::seconds(wait_timeout_s);
-
-  bool use_pdlp_path = false;
+  // Dual simplex runs on its own thread when concurrent; otherwise it runs alone on main.
+  auto dual_simplex_settings = std::make_shared<simplex_solver_settings_t<i_t, f_t>>(lp_settings);
+  dual_simplex_settings->inside_mip = 1;
+
+  lp_status_t root_status = lp_status_t::UNSET;
+  lp_status_t root_result_status =
+    lp_status_t::UNSET;  // dual simplex result; set when dual returns, read in else branch
+
+  bool use_pdlp_path               = false;
+  bool dual_simplex_finished_first = false;
+
+  crossover_status_t crossover_status = crossover_status_t::NUMERICAL_ISSUES;
+  lp_solution_t<i_t, f_t> winner_crossover_soln(original_lp_.num_rows, original_lp_.num_cols);
+  std::vector<variable_status_t> winner_crossover_vstatus;
+  f_t winner_root_objective = 0;
+  std::string root_winner_solver_name;
+
+  std::thread pdlp_thread;
+  std::thread barrier_thread;
+  std::thread dual_simplex_thread;
+  std::atomic<int> winner{0};  // 0=none, 1=dual, 2=PDLP, 3=Barrier
 
   if (enable_concurrent_lp_root_solve_ && mip_problem_ptr_ != nullptr) {
-    // Run PDLP/Barrier on the main thread, then crossover on the main thread.
-    auto result = cuopt::linear_programming::detail::run_pdlp_barrier_for_root_lp<i_t, f_t>(
-      mip_problem_ptr_, lp_settings.time_limit, get_root_concurrent_halt(), pdlp_root_num_gpus_);
-    root_crossover_soln_.x              = result.primal;
-    root_crossover_soln_.y              = result.dual;
-    root_crossover_soln_.z              = result.reduced_costs;
-    root_crossover_soln_.objective      = result.objective;
-    root_crossover_soln_.user_objective = result.user_objective;
-    root_crossover_soln_.iterations     = result.iterations;
-    root_objective_                     = result.objective;
+    // All three run in threads; main only starts them and joins. First to finish with OPTIMAL sets
+    // winner and halt.
+    std::mutex first_solver_mutex;
+    bool first_solver_callback_done = false;
+    run_concurrent_pdlp_and_barrier_with_crossover(lp_settings,
+                                                   crossover_status,
+                                                   winner_crossover_soln,
+                                                   winner_crossover_vstatus,
+                                                   winner_root_objective,
+                                                   root_winner_solver_name,
+                                                   winner,
+                                                   &first_solver_mutex,
+                                                   &first_solver_callback_done,
+                                                   pdlp_thread,
+                                                   barrier_thread);
+
+    // Dual simplex does not call on_first_lp_solution: diversity manager prefers optimal first;
+    // only PDLP/Barrier feed first solution when they have one.
+    dual_simplex_thread = std::thread([this,
+                                       dual_simplex_settings,
+                                       &root_relax_soln,
+                                       &basis_update,
+                                       &basic_list,
+                                       &nonbasic_list,
+                                       &root_vstatus,
+                                       &edge_norms,
+                                       &root_result_status,
+                                       &winner]() {
+      lp_status_t status =
+        solve_linear_program_with_advanced_basis<i_t, f_t>(original_lp_,
+                                                           exploration_stats_.start_time,
+                                                           *dual_simplex_settings,
+                                                           root_relax_soln,
+                                                           basis_update,
+                                                           basic_list,
+                                                           nonbasic_list,
+                                                           root_vstatus,
+                                                           edge_norms,
+                                                           nullptr);
+      root_result_status = status;
+      int expected       = 0;
+      if (status == lp_status_t::OPTIMAL &&
+          winner.compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) {
+        set_root_concurrent_halt(1);
+      }
+    });
+
+    struct join_threads_guard_t {  // destructor joins every registered, still-joinable thread
+      std::thread* a = nullptr;    // nullptr means "nothing registered in this slot"
+      std::thread* b = nullptr;
+      std::thread* c = nullptr;
+      ~join_threads_guard_t()
+      {
+        if (a && a->joinable()) { a->join(); }  // skips null slots and already-joined threads
+        if (b && b->joinable()) { b->join(); }
+        if (c && c->joinable()) { c->join(); }
+      }
+    } join_guard;
+    join_guard.a = &pdlp_thread;
+    join_guard.b = &barrier_thread;
+    join_guard.c = &dual_simplex_thread;
+
+    pdlp_thread.join();
+    barrier_thread.join();
+    dual_simplex_thread.join();
+    join_guard.a = nullptr;
+    join_guard.b = nullptr;
+    join_guard.c = nullptr;
+
+    // Winner may have set concurrent_halt==1 to stop peer solvers. All threads are joined; reset
+    // the flag for the rest of B&B (subsequent LP solves, etc.).
+    set_root_concurrent_halt(0);
+
+    const int w   = winner.load(std::memory_order_acquire);
+    use_pdlp_path = (w == 2 || w == 3);
+    if (w == 1) { dual_simplex_finished_first = true; }
+  } else {
+    // Non-concurrent: run dual simplex on main only.
+    root_status        = solve_linear_program_with_advanced_basis<i_t, f_t>(original_lp_,
+                                                                     exploration_stats_.start_time,
+                                                                     *dual_simplex_settings,
+                                                                     root_relax_soln,
+                                                                     basis_update,
+                                                                     basic_list,
+                                                                     nonbasic_list,
+                                                                     root_vstatus,
+                                                                     edge_norms,
+                                                                     nullptr);
+    root_result_status = root_status;
     if (lp_settings.on_first_lp_solution_available) {
-      lp_settings.on_first_lp_solution_available(result);
+      root_relaxation_first_solution_t<i_t, f_t> ds_result;
+      ds_result.primal         = root_relax_soln.x;
+      ds_result.dual           = root_relax_soln.y;
+      ds_result.reduced_costs  = root_relax_soln.z;
+      ds_result.objective      = root_relax_soln.objective;
+      ds_result.user_objective = root_relax_soln.user_objective;
+      ds_result.iterations     = root_relax_soln.iterations;
+      lp_settings.on_first_lp_solution_available(ds_result);
     }
-    use_pdlp_path = true;
   }
 
   if (use_pdlp_path) {
-    // Crush the root relaxation solution on converted user problem
-    std::vector<f_t> crushed_root_x;
-    crush_primal_solution(
-      original_problem_, original_lp_, root_crossover_soln_.x, new_slacks_, crushed_root_x);
-    std::vector<f_t> crushed_root_y;
-    std::vector<f_t> crushed_root_z;
-
-    f_t dual_res_inf = crush_dual_solution(original_problem_,
-                                           original_lp_,
-                                           new_slacks_,
-                                           root_crossover_soln_.y,
-                                           root_crossover_soln_.z,
-                                           crushed_root_y,
-                                           crushed_root_z);
-
-    root_crossover_soln_.x = crushed_root_x;
-    root_crossover_soln_.y = crushed_root_y;
-    root_crossover_soln_.z = crushed_root_z;
-
-    // Call crossover on the crushed solution
-    auto root_crossover_settings            = settings_;
-    root_crossover_settings.log.log         = false;
-    root_crossover_settings.concurrent_halt = get_root_concurrent_halt();
-    crossover_status_t crossover_status     = crossover(original_lp_,
-                                                    root_crossover_settings,
-                                                    root_crossover_soln_,
-                                                    exploration_stats_.start_time,
-                                                    root_crossover_soln_,
-                                                    crossover_vstatus_);
-
-    // Check if crossover was stopped by dual simplex
+    root_objective_                 = winner_root_objective;
+    auto root_crossover_settings    = settings_;
+    root_crossover_settings.log.log = false;
+    // Single-threaded CPU post-processing (refactor_basis, edge norms); concurrent halt must not
+    // apply.
+    root_crossover_settings.concurrent_halt = nullptr;
     if (crossover_status == crossover_status_t::OPTIMAL) {
-      set_root_concurrent_halt(1);  // Stop dual simplex
-      if (root_status_future.wait_for(wait_timeout) == std::future_status::ready) {
-        root_status = root_status_future.get();
-      } else {
-        root_status = lp_status_t::OPTIMAL;
-      }
-      set_root_concurrent_halt(0);  // Clear the concurrent halt flag
-      // Override the root relaxation solution with the crossover solution
-      root_relax_soln = root_crossover_soln_;
-      root_vstatus    = crossover_vstatus_;
+      // Use winner's crossover solution; no wait.
+      root_relax_soln = winner_crossover_soln;
+      root_vstatus    = winner_crossover_vstatus;
       root_status     = lp_status_t::OPTIMAL;
       basic_list.clear();
       nonbasic_list.reserve(original_lp_.num_cols - original_lp_.num_rows);
       nonbasic_list.clear();
       // Get the basic list and nonbasic list from the vstatus
       for (i_t j = 0; j < original_lp_.num_cols; j++) {
-        if (crossover_vstatus_[j] == variable_status_t::BASIC) {
+        if (winner_crossover_vstatus[j] == variable_status_t::BASIC) {
           basic_list.push_back(j);
         } else {
           nonbasic_list.push_back(j);
@@ -1908,7 +2043,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
                                                         exploration_stats_.start_time,
                                                         basic_list,
                                                         nonbasic_list,
-                                                        crossover_vstatus_);
+                                                        winner_crossover_vstatus);
       if (refactor_status != 0) {
         assert(refactor_status == 0);
         root_status = lp_status_t::NUMERICAL_ISSUES;
@@ -1917,35 +2052,30 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
       // Set the edge norms to a default value
       edge_norms.resize(original_lp_.num_cols, -1.0);
       set_uninitialized_steepest_edge_norms<i_t, f_t>(original_lp_, basic_list, edge_norms);
-      user_objective = root_crossover_soln_.user_objective;
-      iter           = root_crossover_soln_.iterations;
-      solver_name    = "Barrier/PDLP and Crossover";
+      user_objective = winner_crossover_soln.user_objective;
+      iter           = winner_crossover_soln.iterations;
+      solver_name    = root_winner_solver_name + " and Crossover";
     } else {
-      if (root_status_future.wait_for(wait_timeout) == std::future_status::ready) {
-        root_status = root_status_future.get();
-      } else {
-        root_status = lp_status_t::TIME_LIMIT;
+      // Crossover winner path but crossover was not OPTIMAL. Map crossover outcome to lp_status_t.
+      switch (crossover_status) {
+        case crossover_status_t::TIME_LIMIT: root_status = lp_status_t::TIME_LIMIT; break;
+        case crossover_status_t::NUMERICAL_ISSUES:
+          root_status = lp_status_t::NUMERICAL_ISSUES;
+          break;
+        case crossover_status_t::CONCURRENT_LIMIT: root_status = lp_status_t::TIME_LIMIT; break;
+        case crossover_status_t::PRIMAL_FEASIBLE:
+        case crossover_status_t::DUAL_FEASIBLE: root_status = lp_status_t::NUMERICAL_ISSUES; break;
+        default: root_status = lp_status_t::NUMERICAL_ISSUES; break;
       }
-      user_objective = root_relax_soln_.user_objective;
-      iter           = root_relax_soln_.iterations;
-      solver_name    = "Dual Simplex";
+      user_objective = winner_crossover_soln.user_objective;
+      iter           = winner_crossover_soln.iterations;
+      solver_name    = root_winner_solver_name + " and Crossover";
     }
   } else {
-    if (root_status_future.wait_for(wait_timeout) == std::future_status::ready) {
-      root_status = root_status_future.get();
-    } else {
-      root_status = lp_status_t::TIME_LIMIT;
-    }
-    root_relaxation_first_solution_t<i_t, f_t> ds_result;
-    ds_result.primal         = root_relax_soln.x;
-    ds_result.dual           = root_relax_soln.y;
-    ds_result.reduced_costs  = root_relax_soln.z;
-    ds_result.objective      = root_relax_soln.objective;
-    ds_result.user_objective = root_relax_soln.user_objective;
-    ds_result.iterations     = root_relax_soln.iterations;
-    if (lp_settings.on_first_lp_solution_available) {
-      lp_settings.on_first_lp_solution_available(ds_result);
-    }
+    // Use dual simplex result (root_result_status was set when dual simplex returned).
+    root_status = root_result_status;
+    (void)dual_simplex_finished_first;  // used only to select path
+    // Diversity manager was notified above if at all; dual simplex notifies only when run alone.
     user_objective = root_relax_soln.user_objective;
     iter           = root_relax_soln.iterations;
     solver_name    = "Dual Simplex";
@@ -1965,6 +2095,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
 
   settings_.log.printf("\n");
 
+  set_root_concurrent_halt(0);
   return root_status;
 }
 
diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index de7b8455e9..eeccb75af3 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -17,6 +17,7 @@
 
 #include <cuts/cuts.hpp>
 
+#include <dual_simplex/crossover.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/phase2.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
@@ -32,6 +33,10 @@
 
 #include <omp.h>
 
+#include <atomic>
+#include <mutex>
+#include <thread>
+
 #include <functional>
 #include <vector>
 
@@ -118,6 +123,21 @@ class branch_and_bound_t {
                                     std::vector<i_t>& nonbasic_list,
                                     std::vector<f_t>& edge_norms);
 
+  /** Starts PDLP+crossover and Barrier+crossover in two threads. winner is 0=none, 1=dual, 2=PDLP,
+   * 3=Barrier; first OPTIMAL sets it. first_solver_* for diversity manager callback. */
+  void run_concurrent_pdlp_and_barrier_with_crossover(
+    const simplex_solver_settings_t<i_t, f_t>& lp_settings,
+    crossover_status_t& crossover_status_out,
+    lp_solution_t<i_t, f_t>& winner_crossover_soln_out,
+    std::vector<variable_status_t>& winner_crossover_vstatus_out,
+    f_t& winner_root_objective_out,
+    std::string& winner_solver_name_out,
+    std::atomic<int>& winner,
+    std::mutex* first_solver_mutex,
+    bool* first_solver_callback_done,
+    std::thread& pdlp_thread_out,
+    std::thread& barrier_thread_out);
+
   i_t find_reduced_cost_fixings(f_t upper_bound,
                                 std::vector<f_t>& lower_bounds,
                                 std::vector<f_t>& upper_bounds);
@@ -176,10 +196,8 @@ class branch_and_bound_t {
 
   // Variables for the root node in the search tree.
   std::vector<variable_status_t> root_vstatus_;
-  std::vector<variable_status_t> crossover_vstatus_;
   f_t root_objective_;
   lp_solution_t<i_t, f_t> root_relax_soln_;
-  lp_solution_t<i_t, f_t> root_crossover_soln_;
   std::vector<f_t> edge_norms_;
   bool enable_concurrent_lp_root_solve_{false};
   std::atomic<int> root_concurrent_halt_{0};
diff --git a/cpp/src/branch_and_bound/branch_and_bound_from_mip.cu b/cpp/src/branch_and_bound/branch_and_bound_from_mip.cu
index d4e713ebff..4e90956f68 100644
--- a/cpp/src/branch_and_bound/branch_and_bound_from_mip.cu
+++ b/cpp/src/branch_and_bound/branch_and_bound_from_mip.cu
@@ -46,7 +46,6 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
     Arow_(1, 1, 0),
     incumbent_(1),
     root_relax_soln_(1, 1),
-    root_crossover_soln_(1, 1),
     pc_(1),
     solver_status_(mip_status_t::UNSET),
     mip_problem_ptr_(mip_problem_ptr),
diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp
index 426d9a7535..7cfed77385 100644
--- a/cpp/src/dual_simplex/phase2.cpp
+++ b/cpp/src/dual_simplex/phase2.cpp
@@ -3551,10 +3551,6 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                           100.0 * dense_delta_z / (sparse_delta_z + dense_delta_z));
       ft.print_stats();
     }
-    if (settings.inside_mip && settings.concurrent_halt != nullptr) {
-      settings.log.debug("Setting concurrent halt in Dual Simplex Phase 2\n");
-      *settings.concurrent_halt = 1;
-    }
   }
   return status;
 }
diff --git a/cpp/src/mip_heuristics/root_lp.cu b/cpp/src/mip_heuristics/root_lp.cu
index 8e3346286b..b181db43cd 100644
--- a/cpp/src/mip_heuristics/root_lp.cu
+++ b/cpp/src/mip_heuristics/root_lp.cu
@@ -12,18 +12,48 @@
 #include <pdlp/pdlp.cuh>
 #include <pdlp/solve.cuh>
 
+#include <dual_simplex/crossover.hpp>
+#include <dual_simplex/presolve.hpp>
 #include <dual_simplex/types.hpp>
 #include <raft/core/copy.hpp>
 #include <utilities/timer.hpp>
 
 namespace cuopt::linear_programming::detail {
 
+namespace {
 template <typename i_t, typename f_t>
 cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>
-run_pdlp_barrier_for_root_lp(problem_t<i_t, f_t>* problem,
-                             f_t time_limit,
-                             std::atomic<int>* concurrent_halt,
-                             i_t num_gpus)
+copy_lp_result_to_root_solution(problem_t<i_t, f_t>* problem,
+                                const optimization_problem_solution_t<i_t, f_t>& lp_result)
+{
+  cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t> result;
+  auto stream = problem->handle_ptr->get_stream();
+  result.primal.resize(lp_result.get_primal_solution().size());
+  result.dual.resize(lp_result.get_dual_solution().size());
+  result.reduced_costs.resize(lp_result.get_reduced_cost().size());
+  raft::copy(
+    result.primal.data(), lp_result.get_primal_solution().data(), result.primal.size(), stream);
+  raft::copy(result.dual.data(), lp_result.get_dual_solution().data(), result.dual.size(), stream);
+  raft::copy(result.reduced_costs.data(),
+             lp_result.get_reduced_cost().data(),
+             result.reduced_costs.size(),
+             stream);
+  problem->handle_ptr->sync_stream();
+  result.objective      = problem->get_solver_obj_from_user_obj(lp_result.get_objective_value());
+  result.user_objective = lp_result.get_objective_value();
+  result.iterations     = lp_result.get_additional_termination_information().number_of_steps_taken;
+  return result;
+}
+
+}  // namespace
+
+template <typename i_t, typename f_t>
+cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>
+run_solver_for_root_lp(problem_t<i_t, f_t>* problem,
+                       f_t time_limit,
+                       std::atomic<int>* concurrent_halt,
+                       i_t num_gpus,
+                       method_t method)
 {
   convert_greater_to_less(*problem);
   f_t tolerance_divisor =
@@ -37,39 +67,153 @@ run_pdlp_barrier_for_root_lp(problem_t<i_t, f_t>* problem,
   pdlp_settings.time_limit            = time_limit;
   pdlp_settings.first_primal_feasible = false;
   pdlp_settings.concurrent_halt       = concurrent_halt;
-  pdlp_settings.method                = method_t::Concurrent;
+  pdlp_settings.halt_set_by_caller    = true;  // B&B sets halt only after crossover
+  pdlp_settings.method                = method;
   pdlp_settings.inside_mip            = true;
-  pdlp_settings.pdlp_solver_mode      = pdlp_solver_mode_t::Stable2;
   pdlp_settings.num_gpus              = num_gpus;
   pdlp_settings.presolver             = presolver_t::None;
+  pdlp_settings.crossover             = false;  // B&B does crush + crossover for both paths
+  if (method == method_t::PDLP) { pdlp_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable2; }
 
   timer_t lp_timer(time_limit);
   auto lp_result = solve_lp_with_method<i_t, f_t>(*problem, pdlp_settings, lp_timer);
+  return copy_lp_result_to_root_solution(problem, lp_result);
+}
 
-  cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t> result;
-  auto stream = problem->handle_ptr->get_stream();
-  result.primal.resize(lp_result.get_primal_solution().size());
-  result.dual.resize(lp_result.get_dual_solution().size());
-  result.reduced_costs.resize(lp_result.get_reduced_cost().size());
-  raft::copy(
-    result.primal.data(), lp_result.get_primal_solution().data(), result.primal.size(), stream);
-  raft::copy(result.dual.data(), lp_result.get_dual_solution().data(), result.dual.size(), stream);
-  raft::copy(result.reduced_costs.data(),
-             lp_result.get_reduced_cost().data(),
-             result.reduced_costs.size(),
-             stream);
-  problem->handle_ptr->sync_stream();
-  result.objective      = problem->get_solver_obj_from_user_obj(lp_result.get_objective_value());
-  result.user_objective = lp_result.get_objective_value();
-  result.iterations     = lp_result.get_additional_termination_information().number_of_steps_taken;
-  return result;
+template <typename i_t, typename f_t>
+cuopt::linear_programming::dual_simplex::crossover_status_t run_crush_crossover_and_maybe_win(
+  const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>& result,
+  const cuopt::linear_programming::dual_simplex::user_problem_t<i_t, f_t>& original_problem,
+  const cuopt::linear_programming::dual_simplex::lp_problem_t<i_t, f_t>& original_lp,
+  const std::vector<i_t>& new_slacks,
+  const cuopt::linear_programming::dual_simplex::simplex_solver_settings_t<i_t, f_t>&
+    crossover_settings,
+  f_t start_time,
+  std::atomic<int>* concurrent_halt,
+  std::function<void()> set_halter,
+  std::function<void(
+    const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>&)>
+    on_first_lp_solution,
+  std::mutex* first_solver_mutex,
+  bool* first_solver_callback_done,
+  std::mutex* first_result_mutex,
+  std::atomic<int>* winner,
+  int winner_id,
+  cuopt::linear_programming::dual_simplex::crossover_status_t* first_crossover_status_out,
+  cuopt::linear_programming::dual_simplex::lp_solution_t<i_t, f_t>* winner_crossover_soln,
+  std::vector<cuopt::linear_programming::dual_simplex::variable_status_t>* winner_crossover_vstatus,
+  f_t* winner_root_objective,
+  const char* this_solver_name,
+  std::string* winner_solver_name_out)
+{
+  using namespace cuopt::linear_programming::dual_simplex;
+  if (on_first_lp_solution) {
+    std::lock_guard<std::mutex> lock(*first_solver_mutex);
+    if (!*first_solver_callback_done) {
+      *first_solver_callback_done = true;
+      on_first_lp_solution(result);
+    }
+  }
+  lp_solution_t<i_t, f_t> soln(original_lp.num_rows, original_lp.num_cols);
+  soln.x              = result.primal;
+  soln.y              = result.dual;
+  soln.z              = result.reduced_costs;
+  soln.objective      = result.objective;
+  soln.user_objective = result.user_objective;
+  soln.iterations     = result.iterations;
+  std::vector<f_t> crushed_x;
+  crush_primal_solution(original_problem, original_lp, soln.x, new_slacks, crushed_x);
+  std::vector<f_t> crushed_y;
+  std::vector<f_t> crushed_z;
+  (void)crush_dual_solution(
+    original_problem, original_lp, new_slacks, soln.y, soln.z, crushed_y, crushed_z);
+  soln.x = std::move(crushed_x);
+  soln.y = std::move(crushed_y);
+  soln.z = std::move(crushed_z);
+  lp_solution_t<i_t, f_t> crossover_out(original_lp.num_rows, original_lp.num_cols);
+  std::vector<variable_status_t> vstatus_out(original_lp.num_cols);
+  auto root_crossover_settings = crossover_settings;
+  root_crossover_settings.inside_mip =
+    1;  // root LP crossover; dual_phase2 uses this to set concurrent_halt
+  root_crossover_settings.log.log         = false;
+  root_crossover_settings.concurrent_halt = concurrent_halt;
+  crossover_status_t status =
+    crossover(original_lp, root_crossover_settings, soln, start_time, crossover_out, vstatus_out);
+  {
+    std::lock_guard<std::mutex> lock(*first_result_mutex);
+    int expected = 0;
+    if (status == crossover_status_t::OPTIMAL &&
+        winner->compare_exchange_strong(expected, winner_id, std::memory_order_acq_rel)) {
+      *first_crossover_status_out = status;
+      if (winner_solver_name_out) { *winner_solver_name_out = this_solver_name; }
+      winner_crossover_soln->x              = std::move(crossover_out.x);
+      winner_crossover_soln->y              = std::move(crossover_out.y);
+      winner_crossover_soln->z              = std::move(crossover_out.z);
+      winner_crossover_soln->objective      = result.objective;
+      winner_crossover_soln->user_objective = result.user_objective;
+      winner_crossover_soln->iterations     = result.iterations;
+      *winner_root_objective                = result.objective;
+      *winner_crossover_vstatus             = std::move(vstatus_out);
+      set_halter();
+    } else {
+      if (winner->load(std::memory_order_acquire) != 0) { status = *first_crossover_status_out; }
+    }
+  }
+  return status;
 }
 
 template cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, double>
-run_pdlp_barrier_for_root_lp<int, double>(problem_t<int, double>*, double, std::atomic<int>*, int);
+run_solver_for_root_lp<int, double>(
+  problem_t<int, double>*, double, std::atomic<int>*, int, method_t);
+template cuopt::linear_programming::dual_simplex::crossover_status_t
+run_crush_crossover_and_maybe_win<int, double>(
+  const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, double>&,
+  const cuopt::linear_programming::dual_simplex::user_problem_t<int, double>&,
+  const cuopt::linear_programming::dual_simplex::lp_problem_t<int, double>&,
+  const std::vector<int>&,
+  const cuopt::linear_programming::dual_simplex::simplex_solver_settings_t<int, double>&,
+  double,
+  std::atomic<int>*,
+  std::function<void()>,
+  std::function<void(
+    const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, double>&)>,
+  std::mutex*,
+  bool*,
+  std::mutex*,
+  std::atomic<int>*,
+  int,
+  cuopt::linear_programming::dual_simplex::crossover_status_t*,
+  cuopt::linear_programming::dual_simplex::lp_solution_t<int, double>*,
+  std::vector<cuopt::linear_programming::dual_simplex::variable_status_t>*,
+  double*,
+  const char*,
+  std::string*);
 
 #ifdef MIP_INSTANTIATION_FLOAT
 template cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, float>
-run_pdlp_barrier_for_root_lp<int, float>(problem_t<int, float>*, float, std::atomic<int>*, int);
+run_solver_for_root_lp<int, float>(problem_t<int, float>*, float, std::atomic<int>*, int, method_t);
+template cuopt::linear_programming::dual_simplex::crossover_status_t
+run_crush_crossover_and_maybe_win<int, float>(
+  const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, float>&,
+  const cuopt::linear_programming::dual_simplex::user_problem_t<int, float>&,
+  const cuopt::linear_programming::dual_simplex::lp_problem_t<int, float>&,
+  const std::vector<int>&,
+  const cuopt::linear_programming::dual_simplex::simplex_solver_settings_t<int, float>&,
+  float,
+  std::atomic<int>*,
+  std::function<void()>,
+  std::function<void(
+    const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, float>&)>,
+  std::mutex*,
+  bool*,
+  std::mutex*,
+  std::atomic<int>*,
+  int,
+  cuopt::linear_programming::dual_simplex::crossover_status_t*,
+  cuopt::linear_programming::dual_simplex::lp_solution_t<int, float>*,
+  std::vector<cuopt::linear_programming::dual_simplex::variable_status_t>*,
+  float*,
+  const char*,
+  std::string*);
 #endif
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/root_lp.cuh b/cpp/src/mip_heuristics/root_lp.cuh
index 8683ebe820..2f87884fe9 100644
--- a/cpp/src/mip_heuristics/root_lp.cuh
+++ b/cpp/src/mip_heuristics/root_lp.cuh
@@ -7,25 +7,60 @@
 
 #pragma once
 
+#include <cuopt/linear_programming/pdlp/solver_settings.hpp>
+#include <dual_simplex/crossover.hpp>
 #include <dual_simplex/types.hpp>
 
 #include <atomic>
 #include <cstdint>
+#include <functional>
+#include <mutex>
+#include <string>
 
 namespace cuopt::linear_programming::detail {
 
 template <typename i_t, typename f_t>
 class problem_t;
 
+/** Run PDLP or Barrier for root LP. Uses concurrent_halt to stop; does not set it. Crossover done
+ * by caller. */
+template <typename i_t, typename f_t>
+cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>
+run_solver_for_root_lp(problem_t<i_t, f_t>* problem,
+                       f_t time_limit,
+                       std::atomic<int>* concurrent_halt,
+                       i_t num_gpus,
+                       method_t method);
+
 /**
- * Run PDLP/Barrier for root LP (used by branch-and-bound when concurrent root solve is enabled).
- * Implemented in root_lp.cu so GPU code (convert_greater_to_less, solve_lp_with_method) can run.
+ * Run crush + crossover on a root LP solution and optionally store as winner (first to finish).
+ * Used by B&B when running PDLP and Barrier concurrently; both paths call this after their solver
+ * returns.
  */
 template <typename i_t, typename f_t>
-cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>
-run_pdlp_barrier_for_root_lp(problem_t<i_t, f_t>* problem,
-                             f_t time_limit,
-                             std::atomic<int>* concurrent_halt,
-                             i_t num_gpus);
+cuopt::linear_programming::dual_simplex::crossover_status_t run_crush_crossover_and_maybe_win(
+  const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>& result,
+  const cuopt::linear_programming::dual_simplex::user_problem_t<i_t, f_t>& original_problem,
+  const cuopt::linear_programming::dual_simplex::lp_problem_t<i_t, f_t>& original_lp,
+  const std::vector<i_t>& new_slacks,
+  const cuopt::linear_programming::dual_simplex::simplex_solver_settings_t<i_t, f_t>&
+    crossover_settings,
+  f_t start_time,
+  std::atomic<int>* concurrent_halt,
+  std::function<void()> set_halter,
+  std::function<void(
+    const cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t>&)>
+    on_first_lp_solution,
+  std::mutex* first_solver_mutex,
+  bool* first_solver_callback_done,
+  std::mutex* first_result_mutex,
+  std::atomic<int>* winner,
+  int winner_id,
+  cuopt::linear_programming::dual_simplex::crossover_status_t* first_crossover_status_out,
+  cuopt::linear_programming::dual_simplex::lp_solution_t<i_t, f_t>* winner_crossover_soln,
+  std::vector<cuopt::linear_programming::dual_simplex::variable_status_t>* winner_crossover_vstatus,
+  f_t* winner_root_objective,
+  const char* this_solver_name,
+  std::string* winner_solver_name_out);
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index cda60cf5ff..e36ab732a7 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -445,9 +445,9 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
                                              pdlp_termination_status_t::IterationLimit));
   }
 
-  // Check for concurrent limit
-  if (settings_.method == method_t::Concurrent && settings_.concurrent_halt != nullptr &&
-      *settings_.concurrent_halt == 1) {
+  // Check for concurrent limit (whenever caller provides a halt flag, e.g. B&B racing PDLP vs
+  // Barrier)
+  if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
 #ifdef PDLP_VERBOSE_MODE
     RAFT_CUDA_TRY(cudaDeviceSynchronize());
     std::cout << "Concurrent Limit reached, returning current solution" << std::endl;
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 5e1e25bbee..39bbda6bd3 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -459,9 +459,10 @@ run_barrier(dual_simplex::user_problem_t<i_t, f_t>& user_problem,
   CUOPT_LOG_CONDITIONAL_INFO(
     !settings.inside_mip, "Barrier finished in %.2f seconds", timer.elapsed_time());
 
-  if (settings.concurrent_halt != nullptr && (status == dual_simplex::lp_status_t::OPTIMAL ||
-                                              status == dual_simplex::lp_status_t::UNBOUNDED ||
-                                              status == dual_simplex::lp_status_t::INFEASIBLE)) {
+  if (!settings.halt_set_by_caller && settings.concurrent_halt != nullptr &&
+      (status == dual_simplex::lp_status_t::OPTIMAL ||
+       status == dual_simplex::lp_status_t::UNBOUNDED ||
+       status == dual_simplex::lp_status_t::INFEASIBLE)) {
     // We finished. Tell PDLP to stop if it is still running.
     *settings.concurrent_halt = 1;
   }
@@ -531,9 +532,10 @@ run_dual_simplex(dual_simplex::user_problem_t<i_t, f_t>& user_problem,
   CUOPT_LOG_CONDITIONAL_INFO(
     !settings.inside_mip, "Dual simplex finished in %.2f seconds", timer.elapsed_time());
 
-  if (settings.concurrent_halt != nullptr && (status == dual_simplex::lp_status_t::OPTIMAL ||
-                                              status == dual_simplex::lp_status_t::UNBOUNDED ||
-                                              status == dual_simplex::lp_status_t::INFEASIBLE)) {
+  if (!settings.halt_set_by_caller && settings.concurrent_halt != nullptr &&
+      (status == dual_simplex::lp_status_t::OPTIMAL ||
+       status == dual_simplex::lp_status_t::UNBOUNDED ||
+       status == dual_simplex::lp_status_t::INFEASIBLE)) {
     // We finished. Tell PDLP to stop if it is still running.
     *settings.concurrent_halt = 1;
   }
@@ -677,8 +679,9 @@ optimization_problem_solution_t<i_t, f_t> run_pdlp(detail::problem_t<i_t, f_t>&
     CUOPT_LOG_CONDITIONAL_INFO(
       !settings.inside_mip, "Crossover status %s", sol.get_termination_status_string().c_str());
   }
-  if (settings.method == method_t::Concurrent && settings.concurrent_halt != nullptr &&
-      crossover_info == 0 && sol.get_termination_status() == pdlp_termination_status_t::Optimal) {
+  if (!settings.halt_set_by_caller && settings.method == method_t::Concurrent &&
+      settings.concurrent_halt != nullptr && crossover_info == 0 &&
+      sol.get_termination_status() == pdlp_termination_status_t::Optimal) {
     // We finished. Tell dual simplex to stop if it is still running.
     CUOPT_LOG_CONDITIONAL_INFO(!settings.inside_mip, "PDLP finished. Telling others to stop");
     *settings.concurrent_halt = 1;
diff --git a/cpp/src/pdlp/solver_solution.cu b/cpp/src/pdlp/solver_solution.cu
index a8001b91c1..29179a8c21 100644
--- a/cpp/src/pdlp/solver_solution.cu
+++ b/cpp/src/pdlp/solver_solution.cu
@@ -372,6 +372,12 @@ rmm::device_uvector<f_t>& optimization_problem_solution_t<i_t, f_t>::get_reduced
   return reduced_cost_;
 }
 
+template <typename i_t, typename f_t>
+const rmm::device_uvector<f_t>& optimization_problem_solution_t<i_t, f_t>::get_reduced_cost() const
+{
+  return reduced_cost_;
+}
+
 template <typename i_t, typename f_t>
 pdlp_termination_status_t optimization_problem_solution_t<i_t, f_t>::get_termination_status(
   i_t id) const

From 0f1a6f9f4be9eb23e0c63b387e45dbd927f343a6 Mon Sep 17 00:00:00 2001
From: Jake Awe <jawe@nvidia.com>
Date: Thu, 19 Mar 2026 11:02:52 -0500
Subject: [PATCH 09/30] Revert "Prepare release/26.04"

This reverts commit 4d5f5e530f0e506314562a5ba80530cbb90f056e.
---
 .github/workflows/build.yaml                  | 28 ++++++++---------
 .github/workflows/pr.yaml                     | 30 +++++++++----------
 .github/workflows/test.yaml                   | 10 +++----
 .../trigger-breaking-change-alert.yaml        |  2 +-
 RAPIDS_BRANCH                                 |  2 +-
 .../routing/routing-example.ipynb             |  2 +-
 docs/cuopt/source/faq.rst                     |  2 +-
 7 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 3eb1f1f066..593d48bd74 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -45,7 +45,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -55,7 +55,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -65,7 +65,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cuopt-mps-parser:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
   wheel-publish-cuopt-mps-parser:
     needs: wheel-build-cuopt-mps-parser
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -99,7 +99,7 @@ jobs:
   wheel-build-libcuopt:
     needs: wheel-build-cuopt-mps-parser
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -112,7 +112,7 @@ jobs:
   wheel-publish-libcuopt:
     needs: wheel-build-libcuopt
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -123,7 +123,7 @@ jobs:
   wheel-build-cuopt:
     needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -135,7 +135,7 @@ jobs:
   wheel-publish-cuopt:
     needs: wheel-build-cuopt
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -145,7 +145,7 @@ jobs:
       package-type: python
   wheel-build-cuopt-server:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -160,7 +160,7 @@ jobs:
   wheel-publish-cuopt-server:
     needs: wheel-build-cuopt-server
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -171,7 +171,7 @@ jobs:
   docs-build:
     needs: [python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       node_type: "gpu-l4-latest-1"
@@ -185,7 +185,7 @@ jobs:
       script: "ci/build_docs.sh"
   wheel-build-cuopt-sh-client:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -201,7 +201,7 @@ jobs:
   wheel-publish-cuopt-sh-client:
     needs: wheel-build-cuopt-sh-client
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 47a3bd9fca..95741c1fb5 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -34,7 +34,7 @@ jobs:
       - wheel-build-cuopt-sh-client
       - test-self-hosted-server
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.14
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
@@ -111,7 +111,7 @@ jobs:
 
   changed-files:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@python-3.14
     with:
       files_yaml: |
         build_docs:
@@ -279,20 +279,20 @@ jobs:
           - '!gemini-extension.json'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.14
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: [checks, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_cpp.sh
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_lean_filter }}
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.14
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
@@ -308,14 +308,14 @@ jobs:
   conda-python-build:
     needs: [conda-cpp-build, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_python.sh
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }}
   conda-python-tests:
     needs: [conda-python-build, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.14
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_conda
     with:
       run_codecov: false
@@ -332,7 +332,7 @@ jobs:
   docs-build:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.14
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).build_docs
     with:
       build_type: pull-request
@@ -345,7 +345,7 @@ jobs:
   wheel-build-cuopt-mps-parser:
     needs: compute-matrix-filters
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_mps_parser.sh
@@ -357,7 +357,7 @@ jobs:
   wheel-build-libcuopt:
     needs: [wheel-build-cuopt-mps-parser, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.libcuopt_filter }}
@@ -368,7 +368,7 @@ jobs:
   wheel-build-cuopt:
     needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt.sh
@@ -377,7 +377,7 @@ jobs:
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }}
   wheel-tests-cuopt:
     needs: [wheel-build-cuopt, wheel-build-cuopt-mps-parser, wheel-build-cuopt-sh-client, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.14
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels
     with:
       build_type: pull-request
@@ -393,7 +393,7 @@ jobs:
   wheel-build-cuopt-server:
     needs: [checks, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_server.sh
@@ -405,7 +405,7 @@ jobs:
   wheel-build-cuopt-sh-client:
     needs: compute-matrix-filters
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_sh_client.sh
@@ -417,7 +417,7 @@ jobs:
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.cuopt_sh_client_filter }}
   wheel-tests-cuopt-server:
     needs: [wheel-build-cuopt, wheel-build-cuopt-server, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.14
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels
     with:
       build_type: pull-request
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 9ad7609e8a..e88b7829f5 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -27,7 +27,7 @@ on:
 
 jobs:
   conda-cpp-tests:
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
@@ -42,7 +42,7 @@ jobs:
       script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   conda-python-tests:
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main
     with:
       run_codecov: false
       build_type: ${{ inputs.build_type }}
@@ -58,7 +58,7 @@ jobs:
       script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   wheel-tests-cuopt:
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   wheel-tests-cuopt-server:
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
@@ -89,7 +89,7 @@ jobs:
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
index d394b97db4..57b178740c 100644
--- a/.github/workflows/trigger-breaking-change-alert.yaml
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -15,7 +15,7 @@ jobs:
   trigger-notifier:
     if: contains(github.event.pull_request.labels.*.name, 'breaking')
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@main
     with:
       sender_login: ${{ github.event.sender.login }}
       sender_avatar: ${{ github.event.sender.avatar_url }}
diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH
index d5ea6ced53..ba2906d066 100644
--- a/RAPIDS_BRANCH
+++ b/RAPIDS_BRANCH
@@ -1 +1 @@
-release/26.04
+main
diff --git a/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb b/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb
index 9df5e2c0c7..9cfc05f9bb 100644
--- a/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb
+++ b/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb
@@ -147,7 +147,7 @@
    "metadata": {},
    "source": [
     "#### Compressed Sparse Row (CSR) representation of above weighted waypoint graph.\n",
-    "For details on the CSR encoding of the above graph see the [cost_matrix_and_waypoint_graph_creation.ipynb](https://github.com/NVIDIA/cuopt-examples/blob/release/26.04/intra-factory_transport/cost_matrix_and_waypoint_graph_creation.ipynb) notebook."
+    "For details on the CSR encoding of the above graph see the [cost_matrix_and_waypoint_graph_creation.ipynb](https://github.com/NVIDIA/cuopt-examples/blob/main/intra-factory_transport/cost_matrix_and_waypoint_graph_creation.ipynb) notebook."
    ]
   },
   {
diff --git a/docs/cuopt/source/faq.rst b/docs/cuopt/source/faq.rst
index 1985052531..0c3a0e219f 100644
--- a/docs/cuopt/source/faq.rst
+++ b/docs/cuopt/source/faq.rst
@@ -283,7 +283,7 @@ Routing FAQ
 
     So in either case, task locations are actually integer indices into another structure.
 
-    If you have (lat, long) values, then you can generate a cost matrix using a map API. cuOpt does not directly connect to a third-party map engine, but that can be done outside of cuOpt as shown `here <https://github.com/NVIDIA/cuOpt-Resources/blob/release/26.04/notebooks/routing/service/cost_matrix_creation.ipynb>`__.
+    If you have (lat, long) values, then you can generate a cost matrix using a map API. cuOpt does not directly connect to a third-party map engine, but that can be done outside of cuOpt as shown `here <https://github.com/NVIDIA/cuOpt-Resources/blob/main/notebooks/routing/service/cost_matrix_creation.ipynb>`__.
 
 .. dropdown:: Is it possible to define constraints such as refrigerated vehicles required for certain orders?
 

From e2ea6872842475d01f95e3ed8f8abaee0534fdc9 Mon Sep 17 00:00:00 2001
From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Thu, 19 Mar 2026 11:15:30 -0500
Subject: [PATCH 10/30] Update to 26.06 (#975)

This PR updates the repository to version 26.06.

This is part of the 26.04 release burndown process.
---
 .claude-plugin/marketplace.json               |  2 +-
 .cursor-plugin/plugin.json                    |  2 +-
 .github/workflows/build.yaml                  | 30 +++++------
 .../workflows/build_test_publish_images.yaml  |  2 +-
 .github/workflows/pr.yaml                     | 32 ++++++------
 .github/workflows/test.yaml                   |  2 +-
 README.md                                     | 12 ++---
 VERSION                                       |  2 +-
 .../all_cuda-129_arch-aarch64.yaml            | 10 ++--
 .../all_cuda-129_arch-x86_64.yaml             | 10 ++--
 .../all_cuda-131_arch-aarch64.yaml            | 10 ++--
 .../all_cuda-131_arch-x86_64.yaml             | 10 ++--
 dependencies.yaml                             | 50 +++++++++----------
 gemini-extension.json                         |  2 +-
 helmchart/cuopt-server/Chart.yaml             |  4 +-
 helmchart/cuopt-server/values.yaml            |  2 +-
 python/cuopt/pyproject.toml                   | 18 +++----
 python/cuopt_self_hosted/pyproject.toml       |  2 +-
 python/cuopt_server/pyproject.toml            |  2 +-
 python/libcuopt/pyproject.toml                |  8 +--
 skills/cuopt-developer/SKILL.md               |  2 +-
 skills/cuopt-installation-api-c/SKILL.md      |  2 +-
 skills/cuopt-installation-api-python/SKILL.md |  2 +-
 skills/cuopt-installation-common/SKILL.md     |  2 +-
 skills/cuopt-installation-developer/SKILL.md  |  2 +-
 skills/cuopt-lp-milp-api-c/SKILL.md           |  2 +-
 skills/cuopt-lp-milp-api-cli/SKILL.md         |  2 +-
 skills/cuopt-lp-milp-api-python/SKILL.md      |  2 +-
 skills/cuopt-qp-api-c/SKILL.md                |  2 +-
 skills/cuopt-qp-api-cli/SKILL.md              |  2 +-
 skills/cuopt-qp-api-python/SKILL.md           |  2 +-
 skills/cuopt-routing-api-python/SKILL.md      |  2 +-
 skills/cuopt-server-api-python/SKILL.md       |  2 +-
 skills/cuopt-server-common/SKILL.md           |  2 +-
 skills/cuopt-user-rules/SKILL.md              |  2 +-
 skills/lp-milp-formulation/SKILL.md           |  2 +-
 skills/qp-formulation/SKILL.md                |  2 +-
 skills/routing-formulation/SKILL.md           |  2 +-
 skills/skill-evolution/SKILL.md               |  4 +-
 39 files changed, 126 insertions(+), 126 deletions(-)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 4c5df380f6..6ddf2583c4 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -5,7 +5,7 @@
   },
   "metadata": {
     "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server.",
-    "version": "26.04.00"
+    "version": "26.06.00"
   },
   "plugins": [
     {
diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json
index 5f34873671..e740506140 100644
--- a/.cursor-plugin/plugin.json
+++ b/.cursor-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "nvidia-cuopt-skills",
   "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server. Use when building or solving optimization with cuOpt.",
-  "version": "26.04.00",
+  "version": "26.06.00",
   "author": {
     "name": "NVIDIA"
   },
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 593d48bd74..a945cde8ec 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -45,7 +45,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -55,7 +55,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -65,7 +65,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cuopt-mps-parser:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
   wheel-publish-cuopt-mps-parser:
     needs: wheel-build-cuopt-mps-parser
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -99,7 +99,7 @@ jobs:
   wheel-build-libcuopt:
     needs: wheel-build-cuopt-mps-parser
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -112,7 +112,7 @@ jobs:
   wheel-publish-libcuopt:
     needs: wheel-build-libcuopt
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -123,7 +123,7 @@ jobs:
   wheel-build-cuopt:
     needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -135,7 +135,7 @@ jobs:
   wheel-publish-cuopt:
     needs: wheel-build-cuopt
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -145,7 +145,7 @@ jobs:
       package-type: python
   wheel-build-cuopt-server:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -160,7 +160,7 @@ jobs:
   wheel-publish-cuopt-server:
     needs: wheel-build-cuopt-server
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -171,7 +171,7 @@ jobs:
   docs-build:
     needs: [python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       node_type: "gpu-l4-latest-1"
@@ -181,11 +181,11 @@ jobs:
       arch: "amd64"
       file_to_upload: "docs/cuopt/build/html/"
       artifact-name: "cuopt_docs"
-      container_image: "rapidsai/ci-conda:26.04-latest"
+      container_image: "rapidsai/ci-conda:26.06-latest"
       script: "ci/build_docs.sh"
   wheel-build-cuopt-sh-client:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -201,7 +201,7 @@ jobs:
   wheel-publish-cuopt-sh-client:
     needs: wheel-build-cuopt-sh-client
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/build_test_publish_images.yaml b/.github/workflows/build_test_publish_images.yaml
index f8f7366e13..17d4e9ab57 100644
--- a/.github/workflows/build_test_publish_images.yaml
+++ b/.github/workflows/build_test_publish_images.yaml
@@ -55,7 +55,7 @@ jobs:
   compute-matrix:
     runs-on: ubuntu-latest
     container:
-      image: rapidsai/ci-conda:26.04-latest
+      image: rapidsai/ci-conda:26.06-latest
     outputs:
       MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }}
       CUOPT_VER: ${{ steps.compute-cuopt-ver.outputs.CUOPT_VER }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 95741c1fb5..a652c23b9a 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -34,7 +34,7 @@ jobs:
       - wheel-build-cuopt-sh-client
       - test-self-hosted-server
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
@@ -111,7 +111,7 @@ jobs:
 
   changed-files:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main
     with:
       files_yaml: |
         build_docs:
@@ -279,20 +279,20 @@ jobs:
           - '!gemini-extension.json'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: [checks, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_cpp.sh
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_lean_filter }}
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
@@ -308,14 +308,14 @@ jobs:
   conda-python-build:
     needs: [conda-cpp-build, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_python.sh
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }}
   conda-python-tests:
     needs: [conda-python-build, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_conda
     with:
       run_codecov: false
@@ -332,7 +332,7 @@ jobs:
   docs-build:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).build_docs
     with:
       build_type: pull-request
@@ -340,12 +340,12 @@ jobs:
       arch: "amd64"
       file_to_upload: "docs/cuopt/build/html/"
       artifact-name: "cuopt_docs"
-      container_image: "rapidsai/ci-conda:26.04-latest"
+      container_image: "rapidsai/ci-conda:26.06-latest"
       script: "ci/build_docs.sh"
   wheel-build-cuopt-mps-parser:
     needs: compute-matrix-filters
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_mps_parser.sh
@@ -357,7 +357,7 @@ jobs:
   wheel-build-libcuopt:
     needs: [wheel-build-cuopt-mps-parser, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.libcuopt_filter }}
@@ -368,7 +368,7 @@ jobs:
   wheel-build-cuopt:
     needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt.sh
@@ -377,7 +377,7 @@ jobs:
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }}
   wheel-tests-cuopt:
     needs: [wheel-build-cuopt, wheel-build-cuopt-mps-parser, wheel-build-cuopt-sh-client, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels
     with:
       build_type: pull-request
@@ -393,7 +393,7 @@ jobs:
   wheel-build-cuopt-server:
     needs: [checks, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_server.sh
@@ -405,7 +405,7 @@ jobs:
   wheel-build-cuopt-sh-client:
     needs: compute-matrix-filters
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_sh_client.sh
@@ -417,7 +417,7 @@ jobs:
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.cuopt_sh_client_filter }}
   wheel-tests-cuopt-server:
     needs: [wheel-build-cuopt, wheel-build-cuopt-server, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels
     with:
       build_type: pull-request
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index e88b7829f5..a8cc5f2943 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -97,5 +97,5 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-l4-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:26.04-latest"
+      container_image: "rapidsai/ci-conda:26.06-latest"
       script: ci/test_notebooks.sh
diff --git a/README.md b/README.md
index 379a48c350..95c8598d77 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 # cuOpt - GPU-accelerated Optimization
 
 [![Build Status](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml/badge.svg)](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml)
-[![Version](https://img.shields.io/badge/version-26.04.00-blue)](https://github.com/NVIDIA/cuopt/releases)
+[![Version](https://img.shields.io/badge/version-26.06.00-blue)](https://github.com/NVIDIA/cuopt/releases)
 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen)](https://docs.nvidia.com/cuopt/user-guide/latest/introduction.html)
 [![Docker Hub](https://img.shields.io/badge/docker-nvidia%2Fcuopt-blue?logo=docker)](https://hub.docker.com/r/nvidia/cuopt)
 [![Examples](https://img.shields.io/badge/examples-cuopt--examples-orange)](https://github.com/NVIDIA/cuopt-examples)
@@ -83,7 +83,7 @@ For CUDA 12.x:
 pip install \
   --extra-index-url=https://pypi.nvidia.com \
   nvidia-cuda-runtime-cu12==12.9.* \
-  cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.*
+  cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.*
 ```
 
 Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages.
@@ -91,7 +91,7 @@ Development wheels are available as nightlies, please update `--extra-index-url`
 pip install --pre \
   --extra-index-url=https://pypi.nvidia.com \
   --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \
-  cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.*
+  cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.*
 ```
 
 For CUDA 13.x:
@@ -99,7 +99,7 @@ For CUDA 13.x:
 ```bash
 pip install \
   --extra-index-url=https://pypi.nvidia.com \
-  cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.*
+  cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.*
 ```
 
 Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages.
@@ -107,7 +107,7 @@ Development wheels are available as nightlies, please update `--extra-index-url`
 pip install --pre \
   --extra-index-url=https://pypi.nvidia.com \
   --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \
-  cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.*
+  cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.*
 ```
 
 
@@ -118,7 +118,7 @@ cuOpt can be installed with conda (via [miniforge](https://github.com/conda-forg
 All other dependencies are installed automatically when `cuopt-server` and `cuopt-sh-client` are installed.
 
 ```bash
-conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.04.* cuopt-sh-client=26.04.*
+conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.06.* cuopt-sh-client=26.06.*
 ```
 
 We also provide [nightly conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
diff --git a/VERSION b/VERSION
index 0bd0e8a95b..cdb610a24d 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-26.04.00
+26.06.00
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index ecef112dd5..104e7e70d1 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -18,7 +18,7 @@ dependencies:
 - cuda-python>=12.9.2,<13.0
 - cuda-sanitizer-api
 - cuda-version=12.9
-- cudf==26.4.*,>=0.0.0a0
+- cudf==26.6.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -32,8 +32,8 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libraft-headers==26.4.*,>=0.0.0a0
-- librmm==26.4.*,>=0.0.0a0
+- libraft-headers==26.6.*,>=0.0.0a0
+- librmm==26.6.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -50,7 +50,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.4.*,>=0.0.0a0
+- pylibraft==26.6.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -59,7 +59,7 @@ dependencies:
 - rapids-build-backend>=0.4.0,<0.5.0
 - rapids-logger==0.2.*,>=0.0.0a0
 - requests
-- rmm==26.4.*,>=0.0.0a0
+- rmm==26.6.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index 35c825280c..06aa6121f9 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -18,7 +18,7 @@ dependencies:
 - cuda-python>=12.9.2,<13.0
 - cuda-sanitizer-api
 - cuda-version=12.9
-- cudf==26.4.*,>=0.0.0a0
+- cudf==26.6.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -32,8 +32,8 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libraft-headers==26.4.*,>=0.0.0a0
-- librmm==26.4.*,>=0.0.0a0
+- libraft-headers==26.6.*,>=0.0.0a0
+- librmm==26.6.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -50,7 +50,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.4.*,>=0.0.0a0
+- pylibraft==26.6.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -59,7 +59,7 @@ dependencies:
 - rapids-build-backend>=0.4.0,<0.5.0
 - rapids-logger==0.2.*,>=0.0.0a0
 - requests
-- rmm==26.4.*,>=0.0.0a0
+- rmm==26.6.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml
index 2b717d4e98..a68ebf1285 100644
--- a/conda/environments/all_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-131_arch-aarch64.yaml
@@ -18,7 +18,7 @@ dependencies:
 - cuda-python>=13.0.1,<14.0
 - cuda-sanitizer-api
 - cuda-version=13.1
-- cudf==26.4.*,>=0.0.0a0
+- cudf==26.6.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -32,8 +32,8 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libraft-headers==26.4.*,>=0.0.0a0
-- librmm==26.4.*,>=0.0.0a0
+- libraft-headers==26.6.*,>=0.0.0a0
+- librmm==26.6.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -50,7 +50,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.4.*,>=0.0.0a0
+- pylibraft==26.6.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -59,7 +59,7 @@ dependencies:
 - rapids-build-backend>=0.4.0,<0.5.0
 - rapids-logger==0.2.*,>=0.0.0a0
 - requests
-- rmm==26.4.*,>=0.0.0a0
+- rmm==26.6.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml
index f605a83f3b..043d55e148 100644
--- a/conda/environments/all_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-131_arch-x86_64.yaml
@@ -18,7 +18,7 @@ dependencies:
 - cuda-python>=13.0.1,<14.0
 - cuda-sanitizer-api
 - cuda-version=13.1
-- cudf==26.4.*,>=0.0.0a0
+- cudf==26.6.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -32,8 +32,8 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libraft-headers==26.4.*,>=0.0.0a0
-- librmm==26.4.*,>=0.0.0a0
+- libraft-headers==26.6.*,>=0.0.0a0
+- librmm==26.6.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -50,7 +50,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.4.*,>=0.0.0a0
+- pylibraft==26.6.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -59,7 +59,7 @@ dependencies:
 - rapids-build-backend>=0.4.0,<0.5.0
 - rapids-logger==0.2.*,>=0.0.0a0
 - requests
-- rmm==26.4.*,>=0.0.0a0
+- rmm==26.6.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/dependencies.yaml b/dependencies.yaml
index 014889c7d5..db60f63569 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -311,7 +311,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
-          - libcuopt-tests==26.4.*,>=0.0.0a0
+          - libcuopt-tests==26.6.*,>=0.0.0a0
   build_wheels:
     common:
       - output_types: [requirements, pyproject]
@@ -413,7 +413,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &libcuopt_unsuffixed libcuopt==26.4.*,>=0.0.0a0
+          - &libcuopt_unsuffixed libcuopt==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -426,18 +426,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - libcuopt-cu12==26.4.*,>=0.0.0a0
+              - libcuopt-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - libcuopt-cu13==26.4.*,>=0.0.0a0
+              - libcuopt-cu13==26.6.*,>=0.0.0a0
           - {matrix: null, packages: [*libcuopt_unsuffixed]}
   depends_on_cuopt:
     common:
       - output_types: conda
         packages:
-          - &cuopt_unsuffixed cuopt==26.4.*,>=0.0.0a0
+          - &cuopt_unsuffixed cuopt==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -450,18 +450,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-cu12==26.4.*,>=0.0.0a0
+              - cuopt-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-cu13==26.4.*,>=0.0.0a0
+              - cuopt-cu13==26.6.*,>=0.0.0a0
           - {matrix: null, packages: [*cuopt_unsuffixed]}
   depends_on_cuopt_server:
     common:
       - output_types: conda
         packages:
-          - &cuopt_server_unsuffixed cuopt-server==26.4.*,>=0.0.0a0
+          - &cuopt_server_unsuffixed cuopt-server==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -474,18 +474,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-server-cu12==26.4.*,>=0.0.0a0
+              - cuopt-server-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-server-cu13==26.4.*,>=0.0.0a0
+              - cuopt-server-cu13==26.6.*,>=0.0.0a0
           - {matrix: null, packages: [*cuopt_server_unsuffixed]}
   depends_on_cuopt_sh_client:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - &cuopt_sh_client_unsuffixed cuopt-sh-client==26.4.*,>=0.0.0a0
+          - &cuopt_sh_client_unsuffixed cuopt-sh-client==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -495,7 +495,7 @@ dependencies:
     common:
       - output_types: [requirements, pyproject, conda]
         packages:
-          - cuopt-mps-parser==26.4.*,>=0.0.0a0
+          - cuopt-mps-parser==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -505,12 +505,12 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - libraft-headers==26.4.*,>=0.0.0a0
+          - libraft-headers==26.6.*,>=0.0.0a0
   depends_on_librmm:
     common:
       - output_types: conda
         packages:
-          - &librmm_unsuffixed librmm==26.4.*,>=0.0.0a0
+          - &librmm_unsuffixed librmm==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -522,12 +522,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - librmm-cu12==26.4.*,>=0.0.0a0
+              - librmm-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - librmm-cu13==26.4.*,>=0.0.0a0
+              - librmm-cu13==26.6.*,>=0.0.0a0
           - {matrix: null, packages: [*librmm_unsuffixed]}
   depends_on_cupy:
     common:
@@ -562,7 +562,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &rmm_unsuffixed rmm==26.4.*,>=0.0.0a0
+          - &rmm_unsuffixed rmm==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -574,12 +574,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu12==26.4.*,>=0.0.0a0
+              - rmm-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu13==26.4.*,>=0.0.0a0
+              - rmm-cu13==26.6.*,>=0.0.0a0
           - matrix:
             packages:
               - *rmm_unsuffixed
@@ -588,7 +588,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &cudf_unsuffixed cudf==26.4.*,>=0.0.0a0
+          - &cudf_unsuffixed cudf==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
@@ -599,12 +599,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - cudf-cu12==26.4.*,>=0.0.0a0
+              - cudf-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - cudf-cu13==26.4.*,>=0.0.0a0
+              - cudf-cu13==26.6.*,>=0.0.0a0
           - matrix:
             packages:
               - *cudf_unsuffixed
@@ -613,7 +613,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &pylibraft_unsuffixed pylibraft==26.4.*,>=0.0.0a0
+          - &pylibraft_unsuffixed pylibraft==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
@@ -624,12 +624,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - pylibraft-cu12==26.4.*,>=0.0.0a0
+              - pylibraft-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - pylibraft-cu13==26.4.*,>=0.0.0a0
+              - pylibraft-cu13==26.6.*,>=0.0.0a0
           - matrix:
             packages:
               - *pylibraft_unsuffixed
diff --git a/gemini-extension.json b/gemini-extension.json
index b4c6b764a4..c5ef9883f8 100644
--- a/gemini-extension.json
+++ b/gemini-extension.json
@@ -1,6 +1,6 @@
 {
   "name": "nvidia-cuopt-skills",
   "description": "Agent skills for NVIDIA cuOpt optimization engine: routing, LP/MILP/QP, installation, and server.",
-  "version": "26.04.00",
+  "version": "26.06.00",
   "contextFileName": "AGENTS.md"
 }
diff --git a/helmchart/cuopt-server/Chart.yaml b/helmchart/cuopt-server/Chart.yaml
index 074d94bec9..811ac067cb 100644
--- a/helmchart/cuopt-server/Chart.yaml
+++ b/helmchart/cuopt-server/Chart.yaml
@@ -1,5 +1,5 @@
 apiVersion: v2
-appVersion: 26.4.0
+appVersion: 26.6.0
 description: A Helm chart for NVIDIA cuOpt Server with GPU support
 home: https://docs.nvidia.com/cuopt/user-guide/latest/resources.html
 keywords:
@@ -14,4 +14,4 @@ name: cuopt-server
 sources:
 - https://docs.nvidia.com/cuopt/user-guide/latest/resources.html
 type: application
-version: 26.4.0
+version: 26.6.0
diff --git a/helmchart/cuopt-server/values.yaml b/helmchart/cuopt-server/values.yaml
index 5218596552..6adafea79e 100644
--- a/helmchart/cuopt-server/values.yaml
+++ b/helmchart/cuopt-server/values.yaml
@@ -7,7 +7,7 @@ replicaCount: 1
 image:
   repository: nvidia/cuopt
   pullPolicy: IfNotPresent
-  tag: "26.4.0-cuda12.9-py3.12"
+  tag: "26.6.0-cuda12.9-py3.12"
 
 imagePullSecrets: []
 nameOverride: ""
diff --git a/python/cuopt/pyproject.toml b/python/cuopt/pyproject.toml
index e86b5bdd73..eff7e01769 100644
--- a/python/cuopt/pyproject.toml
+++ b/python/cuopt/pyproject.toml
@@ -20,18 +20,18 @@ license = "Apache-2.0"
 requires-python = ">=3.11"
 dependencies = [
     "cuda-python>=13.0.1,<14.0",
-    "cudf==26.4.*,>=0.0.0a0",
-    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
+    "cudf==26.6.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
-    "libcuopt==26.4.*,>=0.0.0a0",
+    "libcuopt==26.6.*,>=0.0.0a0",
     "numba-cuda>=0.22.1",
     "numba>=0.60.0,<0.65.0",
     "numpy>=1.23.5,<3.0",
     "pandas>=2.0",
-    "pylibraft==26.4.*,>=0.0.0a0",
+    "pylibraft==26.6.*,>=0.0.0a0",
     "pyyaml>=6.0.0",
     "rapids-logger==0.2.*,>=0.0.0a0",
-    "rmm==26.4.*,>=0.0.0a0",
+    "rmm==26.6.*,>=0.0.0a0",
     "scipy>=1.14.1",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -101,12 +101,12 @@ dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 requires = [
     "cmake>=3.30.4",
-    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
     "cython>=3.0.3",
-    "libcuopt==26.4.*,>=0.0.0a0",
+    "libcuopt==26.6.*,>=0.0.0a0",
     "ninja",
-    "pylibraft==26.4.*,>=0.0.0a0",
+    "pylibraft==26.6.*,>=0.0.0a0",
     "rapids-logger==0.2.*,>=0.0.0a0",
-    "rmm==26.4.*,>=0.0.0a0",
+    "rmm==26.6.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cuopt_self_hosted/pyproject.toml b/python/cuopt_self_hosted/pyproject.toml
index 7645c99ed0..43aa80a5b3 100644
--- a/python/cuopt_self_hosted/pyproject.toml
+++ b/python/cuopt_self_hosted/pyproject.toml
@@ -20,7 +20,7 @@ license = "Apache-2.0"
 license-files = ["LICENSE"]
 requires-python = ">=3.11"
 dependencies = [
-    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
     "msgpack-numpy==0.4.8",
     "msgpack==1.1.2",
     "requests",
diff --git a/python/cuopt_server/pyproject.toml b/python/cuopt_server/pyproject.toml
index d24cfcbd77..ce96c884be 100644
--- a/python/cuopt_server/pyproject.toml
+++ b/python/cuopt_server/pyproject.toml
@@ -21,7 +21,7 @@ license = "Apache-2.0"
 license-files = ["LICENSE"]
 requires-python = ">=3.11"
 dependencies = [
-    "cuopt==26.4.*,>=0.0.0a0",
+    "cuopt==26.6.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
     "fastapi",
     "jsonref==1.1.0",
diff --git a/python/libcuopt/pyproject.toml b/python/libcuopt/pyproject.toml
index 2507971a0f..e5c0c58fab 100644
--- a/python/libcuopt/pyproject.toml
+++ b/python/libcuopt/pyproject.toml
@@ -30,8 +30,8 @@ classifiers = [
     "Programming Language :: Python :: 3.14",
 ]
 dependencies = [
-    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
-    "librmm==26.4.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
+    "librmm==26.6.*,>=0.0.0a0",
     "nvidia-cublas",
     "nvidia-cudart",
     "nvidia-cudss",
@@ -81,8 +81,8 @@ dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 requires = [
     "cmake>=3.30.4",
-    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
-    "librmm==26.4.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
+    "librmm==26.6.*,>=0.0.0a0",
     "ninja",
     "rapids-logger==0.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/skills/cuopt-developer/SKILL.md b/skills/cuopt-developer/SKILL.md
index 12419153ac..99743f9171 100644
--- a/skills/cuopt-developer/SKILL.md
+++ b/skills/cuopt-developer/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-developer
-version: "26.04.00"
+version: "26.06.00"
 description: Contribute to NVIDIA cuOpt codebase including C++/CUDA, Python, server, docs, and CI. Use when the user wants to modify solver internals, add features, submit PRs, or understand the codebase architecture.
 ---
 
diff --git a/skills/cuopt-installation-api-c/SKILL.md b/skills/cuopt-installation-api-c/SKILL.md
index 747382e3c7..bd4d60becc 100644
--- a/skills/cuopt-installation-api-c/SKILL.md
+++ b/skills/cuopt-installation-api-c/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-api-c
-version: "26.04.00"
+version: "26.06.00"
 description: Install cuOpt for C — conda, locate lib/headers, verification. Use when the user is installing or verifying the C API. Standalone; no common skill.
 ---
 
diff --git a/skills/cuopt-installation-api-python/SKILL.md b/skills/cuopt-installation-api-python/SKILL.md
index a3d7a5e5d2..771f5ec8b0 100644
--- a/skills/cuopt-installation-api-python/SKILL.md
+++ b/skills/cuopt-installation-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-api-python
-version: "26.04.00"
+version: "26.06.00"
 description: Install cuOpt for Python — pip, conda, Docker, verification. Use when the user is installing or verifying the Python API. Standalone; no common skill.
 ---
 
diff --git a/skills/cuopt-installation-common/SKILL.md b/skills/cuopt-installation-common/SKILL.md
index 6ceb9f9000..88534fb810 100644
--- a/skills/cuopt-installation-common/SKILL.md
+++ b/skills/cuopt-installation-common/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-common
-version: "26.04.00"
+version: "26.06.00"
 description: Install cuOpt — system and environment requirements only. Domain concepts; no install commands or interface guidance.
 ---
 
diff --git a/skills/cuopt-installation-developer/SKILL.md b/skills/cuopt-installation-developer/SKILL.md
index a002498853..1f3dff0d3f 100644
--- a/skills/cuopt-installation-developer/SKILL.md
+++ b/skills/cuopt-installation-developer/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-developer
-version: "26.04.00"
+version: "26.06.00"
 description: Developer installation — build cuOpt from source, run tests. Use when the user wants to set up a dev environment to contribute or modify cuOpt.
 ---
 
diff --git a/skills/cuopt-lp-milp-api-c/SKILL.md b/skills/cuopt-lp-milp-api-c/SKILL.md
index 53df3de63e..74b0d5dc92 100644
--- a/skills/cuopt-lp-milp-api-c/SKILL.md
+++ b/skills/cuopt-lp-milp-api-c/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-lp-milp-api-c
-version: "26.04.00"
+version: "26.06.00"
 description: LP and MILP with cuOpt — C API only. Use when the user is embedding LP/MILP in C/C++.
 ---
 
diff --git a/skills/cuopt-lp-milp-api-cli/SKILL.md b/skills/cuopt-lp-milp-api-cli/SKILL.md
index cbdc1e7778..1f8e8a157c 100644
--- a/skills/cuopt-lp-milp-api-cli/SKILL.md
+++ b/skills/cuopt-lp-milp-api-cli/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-lp-milp-api-cli
-version: "26.04.00"
+version: "26.06.00"
 description: LP and MILP with cuOpt — CLI only (MPS files, cuopt_cli). Use when the user is solving from MPS via command line.
 ---
 
diff --git a/skills/cuopt-lp-milp-api-python/SKILL.md b/skills/cuopt-lp-milp-api-python/SKILL.md
index a7cd9a59f2..e8435867db 100644
--- a/skills/cuopt-lp-milp-api-python/SKILL.md
+++ b/skills/cuopt-lp-milp-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-lp-milp-api-python
-version: "26.04.00"
+version: "26.06.00"
 description: Solve Linear Programming (LP) and Mixed-Integer Linear Programming (MILP) with the Python API. Use when the user asks about optimization with linear constraints, integer variables, scheduling, resource allocation, facility location, or production planning.
 ---
 
diff --git a/skills/cuopt-qp-api-c/SKILL.md b/skills/cuopt-qp-api-c/SKILL.md
index bc1efb63d3..85014b81fd 100644
--- a/skills/cuopt-qp-api-c/SKILL.md
+++ b/skills/cuopt-qp-api-c/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-qp-api-c
-version: "26.04.00"
+version: "26.06.00"
 description: Quadratic Programming (QP) with cuOpt — C API. Use when the user is embedding QP in C/C++.
 ---
 
diff --git a/skills/cuopt-qp-api-cli/SKILL.md b/skills/cuopt-qp-api-cli/SKILL.md
index 5f8a8e848a..7aec559126 100644
--- a/skills/cuopt-qp-api-cli/SKILL.md
+++ b/skills/cuopt-qp-api-cli/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-qp-api-cli
-version: "26.04.00"
+version: "26.06.00"
 description: QP with cuOpt — CLI (e.g. cuopt_cli with QP-capable input). Use when the user is solving QP from the command line.
 ---
 
diff --git a/skills/cuopt-qp-api-python/SKILL.md b/skills/cuopt-qp-api-python/SKILL.md
index b85b9e3db2..39533aaeca 100644
--- a/skills/cuopt-qp-api-python/SKILL.md
+++ b/skills/cuopt-qp-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-qp-api-python
-version: "26.04.00"
+version: "26.06.00"
 description: Quadratic Programming (QP) with cuOpt — Python API only (beta). Use when the user is building or solving QP in Python.
 ---
 
diff --git a/skills/cuopt-routing-api-python/SKILL.md b/skills/cuopt-routing-api-python/SKILL.md
index d8bf736f8f..c386107241 100644
--- a/skills/cuopt-routing-api-python/SKILL.md
+++ b/skills/cuopt-routing-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-routing-api-python
-version: "26.04.00"
+version: "26.06.00"
 description: Vehicle routing (VRP, TSP, PDP) with cuOpt — Python API only. Use when the user is building or solving routing in Python.
 ---
 
diff --git a/skills/cuopt-server-api-python/SKILL.md b/skills/cuopt-server-api-python/SKILL.md
index b340e9883f..7d6ed175dd 100644
--- a/skills/cuopt-server-api-python/SKILL.md
+++ b/skills/cuopt-server-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-server-api-python
-version: "26.04.00"
+version: "26.06.00"
 description: cuOpt REST server — start server, endpoints, Python/curl client examples. Use when the user is deploying or calling the REST API.
 ---
 
diff --git a/skills/cuopt-server-common/SKILL.md b/skills/cuopt-server-common/SKILL.md
index f23c9c4a5f..cc2a3728d5 100644
--- a/skills/cuopt-server-common/SKILL.md
+++ b/skills/cuopt-server-common/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-server-common
-version: "26.04.00"
+version: "26.06.00"
 description: cuOpt REST server — what it does and how requests flow. Domain concepts; no deploy or client code.
 ---
 
diff --git a/skills/cuopt-user-rules/SKILL.md b/skills/cuopt-user-rules/SKILL.md
index 0777b9af15..87734f72a2 100644
--- a/skills/cuopt-user-rules/SKILL.md
+++ b/skills/cuopt-user-rules/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-user-rules
-version: "26.04.00"
+version: "26.06.00"
 description: Base behavior rules for using NVIDIA cuOpt. Read this FIRST before any cuOpt user task (routing, LP/MILP, QP, installation, server). Covers handling incomplete questions, clarifying data requirements, verifying understanding, and running commands safely.
 ---
 
diff --git a/skills/lp-milp-formulation/SKILL.md b/skills/lp-milp-formulation/SKILL.md
index 64431a04c4..e429282033 100644
--- a/skills/lp-milp-formulation/SKILL.md
+++ b/skills/lp-milp-formulation/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: lp-milp-formulation
-version: "26.04.00"
+version: "26.06.00"
 description: LP/MILP concepts and going from problem text to formulation. What LP/MILP are, required formulation questions, typical modeling elements, and how to parse problem statements (parameters, constraints, decisions, objective).
 ---
 
diff --git a/skills/qp-formulation/SKILL.md b/skills/qp-formulation/SKILL.md
index c87b887fbc..60aed00ede 100644
--- a/skills/qp-formulation/SKILL.md
+++ b/skills/qp-formulation/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: qp-formulation
-version: "26.04.00"
+version: "26.06.00"
 description: Quadratic Programming (QP) — problem form and constraints. Domain concepts; no API or interface. QP is beta.
 ---
 
diff --git a/skills/routing-formulation/SKILL.md b/skills/routing-formulation/SKILL.md
index 4ab8d6419d..9cf8060cdf 100644
--- a/skills/routing-formulation/SKILL.md
+++ b/skills/routing-formulation/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: routing-formulation
-version: "26.04.00"
+version: "26.06.00"
 description: Vehicle routing (VRP, TSP, PDP) — problem types and data requirements. Domain concepts; no API or interface.
 ---
 
diff --git a/skills/skill-evolution/SKILL.md b/skills/skill-evolution/SKILL.md
index d77fba1a3f..f3605795b7 100644
--- a/skills/skill-evolution/SKILL.md
+++ b/skills/skill-evolution/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: skill-evolution
-version: "26.04.00"
+version: "26.06.00"
 description: After solving a non-trivial problem, detect generalizable learnings and propose skill updates so future interactions benefit automatically. Always active — applies to every interaction.
 ---
 
@@ -182,7 +182,7 @@ When skill evolution creates an entirely new skill directory, add `origin: skill
 ```yaml
 ---
 name: new-skill-name
-version: "26.04.00"
+version: "26.06.00"
 description: ...
 origin: skill-evolution
 ---

From d44661ded5958438c360dcf41a4a08e05a44d9d8 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Thu, 19 Mar 2026 11:57:04 -0700
Subject: [PATCH 11/30] Unify two constructors

---
 cpp/src/branch_and_bound/CMakeLists.txt       |   1 -
 cpp/src/branch_and_bound/branch_and_bound.cpp |  14 ++-
 cpp/src/branch_and_bound/branch_and_bound.hpp |  19 ++--
 .../branch_and_bound_from_mip.cu              | 105 ------------------
 cpp/src/dual_simplex/solve.cpp                |   4 +-
 cpp/src/mip_heuristics/diversity/lns/rins.cu  |   6 +-
 .../diversity/recombiners/sub_mip.cuh         |   6 +-
 cpp/src/mip_heuristics/solver.cu              |   8 +-
 8 files changed, 38 insertions(+), 125 deletions(-)
 delete mode 100644 cpp/src/branch_and_bound/branch_and_bound_from_mip.cu

diff --git a/cpp/src/branch_and_bound/CMakeLists.txt b/cpp/src/branch_and_bound/CMakeLists.txt
index 9b04014fb7..5bb1017120 100644
--- a/cpp/src/branch_and_bound/CMakeLists.txt
+++ b/cpp/src/branch_and_bound/CMakeLists.txt
@@ -5,7 +5,6 @@
 
 set(BRANCH_AND_BOUND_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/branch_and_bound.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/branch_and_bound_from_mip.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/mip_node.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/pseudo_costs.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/diving_heuristics.cpp
diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 3cf273545e..06ea002cac 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -248,7 +248,9 @@ template <typename i_t, typename f_t>
 branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   const user_problem_t<i_t, f_t>& user_problem,
   const simplex_solver_settings_t<i_t, f_t>& solver_settings,
-  f_t start_time)
+  f_t start_time,
+  cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr,
+  i_t pdlp_root_num_gpus)
   : original_problem_(user_problem),
     settings_(solver_settings),
     original_lp_(user_problem.handle_ptr, 1, 1, 1),
@@ -257,8 +259,8 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
     root_relax_soln_(1, 1),
     pc_(1),
     solver_status_(mip_status_t::UNSET),
-    mip_problem_ptr_(nullptr),
-    pdlp_root_num_gpus_(1)
+    mip_problem_ptr_(mip_problem_ptr),
+    pdlp_root_num_gpus_(pdlp_root_num_gpus)
 {
   exploration_stats_.start_time = start_time;
 #ifdef PRINT_CONSTRAINT_MATRIX
@@ -3870,4 +3872,10 @@ template class branch_and_bound_t<int, double>;
 
 #endif
 
+#ifdef MIP_INSTANTIATION_FLOAT
+
+template class branch_and_bound_t<int, float>;
+
+#endif
+
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index eeccb75af3..2cbe55a48c 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -75,16 +75,15 @@ struct deterministic_diving_policy_t;
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
-  /** Build from MIP problem_t (used by mip_heuristics). Implemented in
-   * branch_and_bound_from_mip.cu. */
-  branch_and_bound_t(cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr,
-                     const simplex_solver_settings_t<i_t, f_t>& solver_settings,
-                     f_t start_time,
-                     i_t num_gpus = 1);
-  /** Build from user_problem_t (used by dual_simplex/solve.cpp, RINS, sub_mip). */
-  branch_and_bound_t(const user_problem_t<i_t, f_t>& user_problem,
-                     const simplex_solver_settings_t<i_t, f_t>& solver_settings,
-                     f_t start_time);
+  /** Host \p user_problem must be fully populated by the caller. When \p mip_problem_ptr is
+   *  non-null (GPU MIP / concurrent root), the caller must sync from device first, e.g.
+   *  recompute_objective_integrality(), set objective_is_integral, get_host_user_problem(). */
+  branch_and_bound_t(
+    const user_problem_t<i_t, f_t>& user_problem,
+    const simplex_solver_settings_t<i_t, f_t>& solver_settings,
+    f_t start_time,
+    cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr = nullptr,
+    i_t pdlp_root_num_gpus                                                  = 1);
 
   // Set an initial guess based on the user_problem. This should be called before solve.
   void set_initial_guess(const std::vector<f_t>& user_guess) { guess_ = user_guess; }
diff --git a/cpp/src/branch_and_bound/branch_and_bound_from_mip.cu b/cpp/src/branch_and_bound/branch_and_bound_from_mip.cu
deleted file mode 100644
index 4e90956f68..0000000000
--- a/cpp/src/branch_and_bound/branch_and_bound_from_mip.cu
+++ /dev/null
@@ -1,105 +0,0 @@
-/* clang-format off */
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- */
-/* clang-format on */
-
-#include <branch_and_bound/branch_and_bound.hpp>
-#include <branch_and_bound/mip_node.hpp>
-#include <branch_and_bound/pseudo_costs.hpp>
-
-#include <mip_heuristics/problem/problem.cuh>
-
-#include <cuts/cuts.hpp>
-#include <dual_simplex/initial_basis.hpp>
-#include <dual_simplex/presolve.hpp>
-#include <dual_simplex/user_problem.hpp>
-
-namespace cuopt::linear_programming::dual_simplex {
-
-namespace {
-template <typename i_t, typename f_t>
-void full_variable_types(const user_problem_t<i_t, f_t>& original_problem,
-                         const lp_problem_t<i_t, f_t>& original_lp,
-                         std::vector<variable_type_t>& var_types)
-{
-  var_types = original_problem.var_types;
-  if (original_lp.num_cols > original_problem.num_cols) {
-    var_types.resize(original_lp.num_cols);
-    for (i_t k = original_problem.num_cols; k < original_lp.num_cols; k++) {
-      var_types[k] = variable_type_t::CONTINUOUS;
-    }
-  }
-}
-}  // anonymous namespace
-
-template <typename i_t, typename f_t>
-branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
-  cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr,
-  const simplex_solver_settings_t<i_t, f_t>& solver_settings,
-  f_t start_time,
-  i_t num_gpus)
-  : original_problem_(mip_problem_ptr->handle_ptr),
-    settings_(solver_settings),
-    original_lp_(mip_problem_ptr->handle_ptr, 1, 1, 1),
-    Arow_(1, 1, 0),
-    incumbent_(1),
-    root_relax_soln_(1, 1),
-    pc_(1),
-    solver_status_(mip_status_t::UNSET),
-    mip_problem_ptr_(mip_problem_ptr),
-    pdlp_root_num_gpus_(num_gpus)
-{
-  exploration_stats_.start_time = start_time;
-  mip_problem_ptr->recompute_objective_integrality();
-  original_problem_.objective_is_integral = mip_problem_ptr->is_objective_integral();
-  mip_problem_ptr->get_host_user_problem(original_problem_);
-
-#ifdef PRINT_CONSTRAINT_MATRIX
-  settings_.log.printf("A");
-  original_problem_.A.print_matrix();
-#endif
-
-  dualize_info_t<i_t, f_t> dualize_info;
-  convert_user_problem(original_problem_, settings_, original_lp_, new_slacks_, dualize_info);
-  full_variable_types(original_problem_, original_lp_, var_types_);
-
-#ifdef CHECK_SLACKS
-  assert(new_slacks_.size() == original_lp_.num_rows);
-  for (i_t slack : new_slacks_) {
-    const i_t col_start = original_lp_.A.col_start[slack];
-    const i_t col_end   = original_lp_.A.col_start[slack + 1];
-    const i_t col_len   = col_end - col_start;
-    if (col_len != 1) {
-      settings_.log.printf("Slack %d has %d nzs\n", slack, col_len);
-      assert(col_len == 1);
-    }
-    const i_t i = original_lp_.A.i[col_start];
-    const f_t x = original_lp_.A.x[col_start];
-    if (std::abs(x) != 1.0) {
-      settings_.log.printf("Slack %d row %d has non-unit coefficient %e\n", slack, i, x);
-      assert(std::abs(x) == 1.0);
-    }
-  }
-#endif
-
-  upper_bound_    = inf;
-  root_objective_ = std::numeric_limits<f_t>::quiet_NaN();
-}
-
-template branch_and_bound_t<int, double>::branch_and_bound_t(
-  cuopt::linear_programming::detail::problem_t<int, double>*,
-  const simplex_solver_settings_t<int, double>&,
-  double,
-  int);
-
-#ifdef MIP_INSTANTIATION_FLOAT
-template branch_and_bound_t<int, float>::branch_and_bound_t(
-  cuopt::linear_programming::detail::problem_t<int, float>*,
-  const simplex_solver_settings_t<int, float>&,
-  float,
-  int);
-#endif
-
-}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp
index d300d6011c..d5525891b6 100644
--- a/cpp/src/dual_simplex/solve.cpp
+++ b/cpp/src/dual_simplex/solve.cpp
@@ -706,7 +706,7 @@ i_t solve(const user_problem_t<i_t, f_t>& problem,
 {
   i_t status;
   if (is_mip(problem) && !settings.relaxation) {
-    branch_and_bound_t branch_and_bound(problem, settings, tic());
+    branch_and_bound_t<i_t, f_t> branch_and_bound(problem, settings, tic());
     mip_solution_t<i_t, f_t> mip_solution(problem.num_cols);
     mip_status_t mip_status = branch_and_bound.solve(mip_solution);
     if (mip_status == mip_status_t::OPTIMAL) {
@@ -745,7 +745,7 @@ i_t solve_mip_with_guess(const user_problem_t<i_t, f_t>& problem,
 {
   i_t status;
   if (is_mip(problem)) {
-    branch_and_bound_t branch_and_bound(problem, settings, tic());
+    branch_and_bound_t<i_t, f_t> branch_and_bound(problem, settings, tic());
     branch_and_bound.set_initial_guess(guess);
     mip_status_t mip_status = branch_and_bound.solve(solution);
     if (mip_status == mip_status_t::OPTIMAL) {
diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index 1d009b8fb7..31819c9722 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -270,8 +270,12 @@ void rins_t<i_t, f_t>::run_rins()
                                                                        f_t objective) {
     rins_solution_queue.push_back(solution);
   };
+  dual_simplex::user_problem_t<i_t, f_t> bb_user_problem(fixed_problem.handle_ptr);
+  fixed_problem.recompute_objective_integrality();
+  bb_user_problem.objective_is_integral = fixed_problem.is_objective_integral();
+  fixed_problem.get_host_user_problem(bb_user_problem);
   dual_simplex::branch_and_bound_t<i_t, f_t> branch_and_bound(
-    &fixed_problem, branch_and_bound_settings, dual_simplex::tic(), 1);
+    bb_user_problem, branch_and_bound_settings, dual_simplex::tic(), &fixed_problem, 1);
   branch_and_bound_solution.resize(branch_and_bound.get_num_cols());
   branch_and_bound.set_initial_guess(cuopt::host_copy(fixed_assignment, rins_handle.get_stream()));
   branch_and_bound_status = branch_and_bound.solve(branch_and_bound_solution);
diff --git a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
index 5b9821cc3f..4494f5d3c5 100644
--- a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
+++ b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
@@ -113,8 +113,12 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
 
       // disable B&B logs, so that it is not interfering with the main B&B thread
       branch_and_bound_settings.log.log = false;
+      dual_simplex::user_problem_t<i_t, f_t> bb_user_problem(fixed_problem.handle_ptr);
+      fixed_problem.recompute_objective_integrality();
+      bb_user_problem.objective_is_integral = fixed_problem.is_objective_integral();
+      fixed_problem.get_host_user_problem(bb_user_problem);
       dual_simplex::branch_and_bound_t<i_t, f_t> branch_and_bound(
-        &fixed_problem, branch_and_bound_settings, dual_simplex::tic(), 1);
+        bb_user_problem, branch_and_bound_settings, dual_simplex::tic(), &fixed_problem, 1);
       branch_and_bound_solution.resize(branch_and_bound.get_num_cols());
       branch_and_bound_status = branch_and_bound.solve(branch_and_bound_solution);
       if (solution_vector.size() > 0) {
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index 1257aa6409..dd0a084477 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -257,11 +257,15 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
         };
     }
 
-    // Create the branch and bound object (builds user_problem from context.problem_ptr)
+    dual_simplex::user_problem_t<i_t, f_t> bb_user_problem(context.problem_ptr->handle_ptr);
+    context.problem_ptr->recompute_objective_integrality();
+    bb_user_problem.objective_is_integral = context.problem_ptr->is_objective_integral();
+    context.problem_ptr->get_host_user_problem(bb_user_problem);
     branch_and_bound =
-      std::make_unique<dual_simplex::branch_and_bound_t<i_t, f_t>>(context.problem_ptr,
+      std::make_unique<dual_simplex::branch_and_bound_t<i_t, f_t>>(bb_user_problem,
                                                                    branch_and_bound_settings,
                                                                    timer_.get_tic_start(),
+                                                                   context.problem_ptr,
                                                                    context.settings.num_gpus);
     branch_and_bound_solution.resize(branch_and_bound->get_num_cols());
     context.branch_and_bound_ptr = branch_and_bound.get();

From c9e39d39e9b7d9cf8a821f9d1073f512b5ea26bd Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Thu, 19 Mar 2026 12:38:50 -0700
Subject: [PATCH 12/30] Fix compilation error

---
 cpp/src/branch_and_bound/branch_and_bound.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index feb4a561b1..b4c46ac8e9 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -48,7 +48,7 @@ class problem_t;
 
 template <typename i_t, typename f_t>
 struct clique_table_t;
-}
+}  // namespace cuopt::linear_programming::detail
 
 namespace cuopt::linear_programming::dual_simplex {
 
@@ -88,9 +88,9 @@ class branch_and_bound_t {
     const user_problem_t<i_t, f_t>& user_problem,
     const simplex_solver_settings_t<i_t, f_t>& solver_settings,
     f_t start_time,
-    cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr,
-    i_t pdlp_root_num_gpus,
-    std::shared_ptr<detail::clique_table_t<i_t, f_t>> clique_table = nullptr);
+    cuopt::linear_programming::detail::problem_t<i_t, f_t>* mip_problem_ptr = nullptr,
+    i_t pdlp_root_num_gpus                                                  = 1,
+    std::shared_ptr<detail::clique_table_t<i_t, f_t>> clique_table          = nullptr);
 
   // Set an initial guess based on the user_problem. This should be called before solve.
   void set_initial_guess(const std::vector<f_t>& user_guess) { guess_ = user_guess; }

From c99e1c3f1111701350333f5eb145a638d859c0f6 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Tue, 10 Mar 2026 11:42:26 -0700
Subject: [PATCH 13/30] Optimize right-looking LU factorization with O(1)
 degree-bucket ops

Replace linear degree-bucket search with O(1) swap-with-last removal
using col_pos/row_pos position arrays, and eliminate O(row_degree)
pre-traversal in schur_complement via a persistent last_in_row[] array.

---
 cpp/src/dual_simplex/right_looking_lu.cpp | 210 +++++++++++++---------
 1 file changed, 126 insertions(+), 84 deletions(-)

diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp
index 657ebc4762..53bfcf8ac5 100644
--- a/cpp/src/dual_simplex/right_looking_lu.cpp
+++ b/cpp/src/dual_simplex/right_looking_lu.cpp
@@ -30,7 +30,7 @@ struct element_t {
   f_t x;               // coefficient value
   i_t next_in_column;  // index of the next element in the column: kNone if there is no next element
   i_t next_in_row;     // index of the next element in the row: kNone if there is no next element
-};
+};  // 24 bytes
 constexpr int kNone = -1;
 
 template <typename i_t, typename f_t>
@@ -86,11 +86,11 @@ i_t load_elements(const csc_matrix_t<i_t, f_t>& A,
                   std::vector<element_t<i_t, f_t>>& elements,
                   std::vector<i_t>& first_in_row,
                   std::vector<i_t>& first_in_col,
+                  std::vector<i_t>& last_in_row,
                   f_t& work_estimate)
 {
   const i_t m = A.m;
   const i_t n = column_list.size();
-  std::vector<i_t> last_element_in_row(m, kNone);
   work_estimate += m;
 
   i_t nz = 0;
@@ -105,15 +105,9 @@ i_t load_elements(const csc_matrix_t<i_t, f_t>& A,
       elements[nz].x              = A.x[p];
       elements[nz].next_in_column = kNone;
       if (p > col_start) { elements[nz - 1].next_in_column = nz; }
-      elements[nz].next_in_row = kNone;  // set the current next in row to None (since we don't know
-                                         // if there will be more entries in this row)
-      if (last_element_in_row[i] != kNone) {
-        // If we have seen an entry in this row before, set the last entry we've seen in this row to
-        // point to the current entry
-        elements[last_element_in_row[i]].next_in_row = nz;
-      }
-      // The current entry becomes the last element seen in the row
-      last_element_in_row[i] = nz;
+      elements[nz].next_in_row = kNone;
+      if (last_in_row[i] != kNone) { elements[last_in_row[i]].next_in_row = nz; }
+      last_in_row[i] = nz;
       if (p == col_start) { first_in_col[k] = nz; }
       if (first_in_row[i] == kNone) { first_in_row[i] = nz; }
       nz++;
@@ -316,10 +310,11 @@ void update_Cdegree_and_col_count(i_t pivot_i,
                                   const std::vector<i_t>& first_in_row,
                                   std::vector<i_t>& Cdegree,
                                   std::vector<std::vector<i_t>>& col_count,
+                                  std::vector<i_t>& col_pos,
                                   std::vector<element_t<i_t, f_t>>& elements,
                                   f_t& work_estimate)
 {
-  // Update Cdegree and col_count
+  // Update Cdegree and col_count (O(1) removal using position array)
   i_t loop_count = 0;
   for (i_t p = first_in_row[pivot_i]; p != kNone; p = elements[p].next_in_row) {
     element_t<i_t, f_t>* entry = &elements[p];
@@ -327,20 +322,20 @@ void update_Cdegree_and_col_count(i_t pivot_i,
     assert(entry->i == pivot_i);
     i_t cdeg = Cdegree[j];
     assert(cdeg >= 0);
-    for (typename std::vector<i_t>::iterator it = col_count[cdeg].begin();
-         it != col_count[cdeg].end();
-         it++) {
-      if (*it == j) {
-        // Remove col j from col_count[cdeg]
-        std::swap(*it, col_count[cdeg].back());
-        col_count[cdeg].pop_back();
-        work_estimate += (it - col_count[cdeg].begin());
-        break;
-      }
+    // O(1) swap-with-last removal
+    {
+      i_t pos              = col_pos[j];
+      i_t other            = col_count[cdeg].back();
+      col_count[cdeg][pos] = other;
+      col_pos[other]       = pos;
+      col_count[cdeg].pop_back();
     }
     cdeg = --Cdegree[j];
     assert(cdeg >= 0);
-    if (j != pivot_j && cdeg >= 0) { col_count[cdeg].push_back(j); }
+    if (j != pivot_j && cdeg >= 0) {
+      col_pos[j] = col_count[cdeg].size();
+      col_count[cdeg].push_back(j);
+    }
     loop_count++;
   }
   work_estimate += 7 * loop_count;
@@ -353,30 +348,31 @@ void update_Rdegree_and_row_count(i_t pivot_i,
                                   const std::vector<i_t>& first_in_col,
                                   std::vector<i_t>& Rdegree,
                                   std::vector<std::vector<i_t>>& row_count,
+                                  std::vector<i_t>& row_pos,
                                   std::vector<element_t<i_t, f_t>>& elements,
                                   f_t& work_estimate)
 {
-  // Update Rdegree and row_count
+  // Update Rdegree and row_count (O(1) removal using position array)
   i_t loop_count = 0;
   for (i_t p = first_in_col[pivot_j]; p != kNone; p = elements[p].next_in_column) {
     element_t<i_t, f_t>* entry = &elements[p];
     const i_t i                = entry->i;
     i_t rdeg                   = Rdegree[i];
     assert(rdeg >= 0);
-    for (typename std::vector<i_t>::iterator it = row_count[rdeg].begin();
-         it != row_count[rdeg].end();
-         it++) {
-      if (*it == i) {
-        // Remove row i from row_count[rdeg]
-        std::swap(*it, row_count[rdeg].back());
-        row_count[rdeg].pop_back();
-        work_estimate += (it - row_count[rdeg].begin());
-        break;
-      }
+    // O(1) swap-with-last removal
+    {
+      i_t pos              = row_pos[i];
+      i_t other            = row_count[rdeg].back();
+      row_count[rdeg][pos] = other;
+      row_pos[other]       = pos;
+      row_count[rdeg].pop_back();
     }
     rdeg = --Rdegree[i];
     assert(rdeg >= 0);
-    if (i != pivot_i && rdeg >= 0) { row_count[rdeg].push_back(i); }
+    if (i != pivot_i && rdeg >= 0) {
+      row_pos[i] = row_count[rdeg].size();
+      row_count[rdeg].push_back(i);
+    }
     loop_count++;
   }
   work_estimate += 7 * loop_count;
@@ -400,18 +396,15 @@ void schur_complement(i_t pivot_i,
                       std::vector<i_t>& Cdegree,
                       std::vector<std::vector<i_t>>& row_count,
                       std::vector<std::vector<i_t>>& col_count,
+                      std::vector<i_t>& last_in_row,
+                      std::vector<i_t>& col_pos,
+                      std::vector<i_t>& row_pos,
                       std::vector<element_t<i_t, f_t>>& elements,
                       f_t& work_estimate)
 {
+  // Initialize row_last_workspace from last_in_row (O(1) per row, no full row traversal)
   for (i_t p1 = first_in_col[pivot_j]; p1 != kNone; p1 = elements[p1].next_in_column) {
-    element_t<i_t, f_t>* e = &elements[p1];
-    const i_t i            = e->i;
-    i_t row_last           = kNone;
-    for (i_t p3 = first_in_row[i]; p3 != kNone; p3 = elements[p3].next_in_row) {
-      row_last = p3;
-    }
-    work_estimate += 2 * Rdegree[i];
-    row_last_workspace[i] = row_last;
+    row_last_workspace[elements[p1].i] = last_in_row[elements[p1].i];
   }
   work_estimate += 4 * Cdegree[pivot_j];
 
@@ -478,35 +471,29 @@ void schur_complement(i_t pivot_i,
           first_in_row[i] = fill_p;
         }
         row_last_workspace[i] = fill_p;
-        i_t rdeg              = Rdegree[i];  // Rdgree must increase
-        for (typename std::vector<i_t>::iterator it = row_count[rdeg].begin();
-             it != row_count[rdeg].end();
-             it++) {
-          if (*it == i) {
-            // Remove row i from row_count[rdeg]
-            std::swap(*it, row_count[rdeg].back());
-            row_count[rdeg].pop_back();
-            work_estimate += 2 * (it - row_count[rdeg].begin());
-            break;
-          }
+        last_in_row[i]        = fill_p;  // maintain last_in_row persistent state
+        // Row degree update: O(1) removal using row_pos
+        {
+          i_t rdeg             = Rdegree[i];
+          i_t pos              = row_pos[i];
+          i_t other            = row_count[rdeg].back();
+          row_count[rdeg][pos] = other;
+          row_pos[other]       = pos;
+          row_count[rdeg].pop_back();
+          row_pos[i] = row_count[rdeg + 1].size();
+          row_count[++Rdegree[i]].push_back(i);
         }
-        rdeg = ++Rdegree[i];           // Increase rdeg
-        row_count[rdeg].push_back(i);  // Add row i to row_count[rdeg]
-
-        i_t cdeg = Cdegree[j];  // Cdegree must increase
-        for (typename std::vector<i_t>::iterator it = col_count[cdeg].begin();
-             it != col_count[cdeg].end();
-             it++) {
-          if (*it == j) {
-            // Remove col j from col_count[cdeg]
-            std::swap(*it, col_count[cdeg].back());
-            col_count[cdeg].pop_back();
-            work_estimate += 2 * (it - col_count[cdeg].begin());
-            break;
-          }
+        // Col degree update: O(1) removal using col_pos
+        {
+          i_t cdeg             = Cdegree[j];
+          i_t pos              = col_pos[j];
+          i_t other            = col_count[cdeg].back();
+          col_count[cdeg][pos] = other;
+          col_pos[other]       = pos;
+          col_count[cdeg].pop_back();
+          col_pos[j] = col_count[cdeg + 1].size();
+          col_count[++Cdegree[j]].push_back(j);
         }
-        cdeg = ++Cdegree[j];           // Increase Cdegree
-        col_count[cdeg].push_back(j);  // Add column j to col_count[cdeg]
       }
     }
     work_estimate += 10 * Cdegree[pivot_j];
@@ -532,7 +519,6 @@ void remove_pivot_row(i_t pivot_i,
                       f_t& work_estimate)
 {
   // Remove the pivot row
-
   i_t row_loop_count = 0;
   for (i_t p0 = first_in_row[pivot_i]; p0 != kNone; p0 = elements[p0].next_in_row) {
     element_t<i_t, f_t>* e = &elements[p0];
@@ -574,6 +560,7 @@ void remove_pivot_col(i_t pivot_i,
                       std::vector<i_t>& first_in_col,
                       std::vector<i_t>& first_in_row,
                       std::vector<f_t>& max_in_row,
+                      std::vector<i_t>& last_in_row,
                       std::vector<element_t<i_t, f_t>>& elements,
                       f_t& work_estimate)
 {
@@ -583,6 +570,7 @@ void remove_pivot_col(i_t pivot_i,
     element_t<i_t, f_t>* e = &elements[p1];
     const i_t i            = e->i;
     i_t last               = kNone;
+    i_t last_surviving     = kNone;
 #ifdef THRESHOLD_ROOK_PIVOTING
     f_t max_in_row_i = 0.0;
 #endif
@@ -598,16 +586,17 @@ void remove_pivot_col(i_t pivot_i,
         entry->i = -1;
         entry->j = -1;
         entry->x = std::numeric_limits<f_t>::quiet_NaN();
-      }
+      } else {
+        last_surviving = p;
 #ifdef THRESHOLD_ROOK_PIVOTING
-      else {
         const f_t abs_entryx = std::abs(entry->x);
         if (abs_entryx > max_in_row_i) { max_in_row_i = abs_entryx; }
-      }
 #endif
+      }
       last = p;
       row_loop_count++;
     }
+    last_in_row[i] = last_surviving;
     work_estimate += 3 * row_loop_count;
 #ifdef THRESHOLD_ROOK_PIVOTING
     max_in_row[i] = max_in_row_i;
@@ -656,11 +645,28 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
 
   const i_t Bnz =
     initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count, work_estimate);
+
+  // Position arrays for O(1) degree-bucket removal
+  std::vector<i_t> col_pos(n);
+  for (i_t d = 0; d <= n; ++d) {
+    for (i_t pos = 0; pos < static_cast<i_t>(col_count[d].size()); ++pos) {
+      col_pos[col_count[d][pos]] = pos;
+    }
+  }
+  std::vector<i_t> row_pos(n);
+  for (i_t d = 0; d <= n; ++d) {
+    for (i_t pos = 0; pos < static_cast<i_t>(row_count[d].size()); ++pos) {
+      row_pos[row_count[d][pos]] = pos;
+    }
+  }
+
   std::vector<element_t<i_t, f_t>> elements(Bnz);
   std::vector<i_t> first_in_row(n, kNone);
   std::vector<i_t> first_in_col(n, kNone);
+  std::vector<i_t> last_in_row(n, kNone);
   work_estimate += 2 * n + Bnz;
-  load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col, work_estimate);
+  load_elements(
+    A, column_list, Bnz, elements, first_in_row, first_in_col, last_in_row, work_estimate);
 
   std::vector<i_t> column_j_workspace(n, kNone);
   std::vector<i_t> row_last_workspace(n);
@@ -777,9 +783,9 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
 
     // Update Cdegree and col_count
     update_Cdegree_and_col_count(
-      pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements, work_estimate);
+      pivot_i, pivot_j, first_in_row, Cdegree, col_count, col_pos, elements, work_estimate);
     update_Rdegree_and_row_count(
-      pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements, work_estimate);
+      pivot_i, pivot_j, first_in_col, Rdegree, row_count, row_pos, elements, work_estimate);
 
     // A22 <- A22 - l u^T
     schur_complement(pivot_i,
@@ -798,14 +804,23 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
                      Cdegree,
                      row_count,
                      col_count,
+                     last_in_row,
+                     col_pos,
+                     row_pos,
                      elements,
                      work_estimate);
 
     // Remove the pivot row
     remove_pivot_row(
       pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements, work_estimate);
-    remove_pivot_col(
-      pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements, work_estimate);
+    remove_pivot_col(pivot_i,
+                     pivot_j,
+                     first_in_col,
+                     first_in_row,
+                     max_in_row,
+                     last_in_row,
+                     elements,
+                     work_estimate);
 
     // Set pivot entry to sentinel value
     pivot_entry->i = -1;
@@ -1030,10 +1045,28 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
 
   const i_t Bnz =
     initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count, work_estimate);
+
+  // Position arrays for O(1) degree-bucket removal
+  // col_count has m+1 buckets, row_count has n+1 buckets
+  std::vector<i_t> col_pos(n);
+  for (i_t d = 0; d <= m; ++d) {
+    for (i_t pos = 0; pos < static_cast<i_t>(col_count[d].size()); ++pos) {
+      col_pos[col_count[d][pos]] = pos;
+    }
+  }
+  std::vector<i_t> row_pos(m);
+  for (i_t d = 0; d <= n; ++d) {
+    for (i_t pos = 0; pos < static_cast<i_t>(row_count[d].size()); ++pos) {
+      row_pos[row_count[d][pos]] = pos;
+    }
+  }
+
   std::vector<element_t<i_t, f_t>> elements(Bnz);
   std::vector<i_t> first_in_row(m, kNone);
   std::vector<i_t> first_in_col(n, kNone);
-  load_elements(A, column_list, Bnz, elements, first_in_row, first_in_col, work_estimate);
+  std::vector<i_t> last_in_row(m, kNone);
+  load_elements(
+    A, column_list, Bnz, elements, first_in_row, first_in_col, last_in_row, work_estimate);
 
   std::vector<i_t> column_j_workspace(m, kNone);
   std::vector<i_t> row_last_workspace(m);
@@ -1100,9 +1133,9 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
 
     // Update Cdegree and col_count
     update_Cdegree_and_col_count<i_t, f_t>(
-      pivot_i, pivot_j, first_in_row, Cdegree, col_count, elements, work_estimate);
+      pivot_i, pivot_j, first_in_row, Cdegree, col_count, col_pos, elements, work_estimate);
     update_Rdegree_and_row_count<i_t, f_t>(
-      pivot_i, pivot_j, first_in_col, Rdegree, row_count, elements, work_estimate);
+      pivot_i, pivot_j, first_in_col, Rdegree, row_count, row_pos, elements, work_estimate);
 
     // A22 <- A22 - l u^T
     schur_complement<i_t, f_t>(pivot_i,
@@ -1121,14 +1154,23 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
                                Cdegree,
                                row_count,
                                col_count,
+                               last_in_row,
+                               col_pos,
+                               row_pos,
                                elements,
                                work_estimate);
 
     // Remove the pivot row
     remove_pivot_row<i_t, f_t>(
       pivot_i, pivot_j, first_in_col, first_in_row, max_in_column, elements, work_estimate);
-    remove_pivot_col<i_t, f_t>(
-      pivot_i, pivot_j, first_in_col, first_in_row, max_in_row, elements, work_estimate);
+    remove_pivot_col<i_t, f_t>(pivot_i,
+                               pivot_j,
+                               first_in_col,
+                               first_in_row,
+                               max_in_row,
+                               last_in_row,
+                               elements,
+                               work_estimate);
 
     // Set pivot entry to sentinel value
     pivot_entry->i = -1;

From 07cb595a7ddf30638755010a734a460525e99cd8 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 16 Mar 2026 07:40:15 -0700
Subject: [PATCH 14/30] crossover: hoist delta_zN and delta_expanded out of
 dual push loop

Allocate buffers once before the superbasic loop and reset with std::fill
each iteration to avoid repeated O(n) allocations (PR #948 review).

Made-with: Cursor
---
 cpp/src/dual_simplex/crossover.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp
index 16f503e893..832f6891f6 100644
--- a/cpp/src/dual_simplex/crossover.cpp
+++ b/cpp/src/dual_simplex/crossover.cpp
@@ -388,6 +388,8 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
   std::vector<f_t>& y       = solution.y;
   const std::vector<f_t>& x = solution.x;
   i_t num_pushes            = 0;
+  std::vector<f_t> delta_zN(n - m);
+  std::vector<f_t> delta_expanded(n);
   while (superbasic_list.size() > 0) {
     const i_t s                   = superbasic_list.back();
     const i_t basic_leaving_index = superbasic_list_index.back();
@@ -415,9 +417,9 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
     }
 
     // delta_zN = -N^T delta_y
-    std::vector<f_t> delta_zN(n - m);
-    std::vector<f_t> delta_expanded(n, 0.);
-    
+    std::fill(delta_expanded.begin(), delta_expanded.end(), 0.);
+    std::fill(delta_zN.begin(), delta_zN.end(), 0.);
+
     // Iterate directly over sparse delta_y instead of checking zeros
     for (i_t nnz_idx = 0; nnz_idx < delta_y_sparse.i.size(); ++nnz_idx) {
       const i_t row = delta_y_sparse.i[nnz_idx];

From 953b83eb52237791df6973678da5cfea9ac06af6 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 16 Mar 2026 09:24:59 -0700
Subject: [PATCH 15/30] Added review comments

---
 cpp/src/dual_simplex/right_looking_lu.cpp | 33 ++++++++++++++---------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp
index 53bfcf8ac5..4800d644c1 100644
--- a/cpp/src/dual_simplex/right_looking_lu.cpp
+++ b/cpp/src/dual_simplex/right_looking_lu.cpp
@@ -105,8 +105,14 @@ i_t load_elements(const csc_matrix_t<i_t, f_t>& A,
       elements[nz].x              = A.x[p];
       elements[nz].next_in_column = kNone;
       if (p > col_start) { elements[nz - 1].next_in_column = nz; }
-      elements[nz].next_in_row = kNone;
-      if (last_in_row[i] != kNone) { elements[last_in_row[i]].next_in_row = nz; }
+      elements[nz].next_in_row = kNone;  // set the current next in row to None (since we don't know
+      // if there will be more entries in this row yet)
+      if (last_in_row[i] != kNone) {
+        // If we have seen an entry in this row before, set the last entry we've seen in this row to
+        // point to the current entry
+        elements[last_in_row[i]].next_in_row = nz;
+      }
+      // The current entry becomes the last element seen in the row
       last_in_row[i] = nz;
       if (p == col_start) { first_in_col[k] = nz; }
       if (first_in_row[i] == kNone) { first_in_row[i] = nz; }
@@ -402,9 +408,11 @@ void schur_complement(i_t pivot_i,
                       std::vector<element_t<i_t, f_t>>& elements,
                       f_t& work_estimate)
 {
-  // Initialize row_last_workspace from last_in_row (O(1) per row, no full row traversal)
+  // row_last_workspace: temp copy of last_in_row for this pivot step, updated when adding fill
+  // last_in_row: persistent tail pointer per row
   for (i_t p1 = first_in_col[pivot_j]; p1 != kNone; p1 = elements[p1].next_in_column) {
-    row_last_workspace[elements[p1].i] = last_in_row[elements[p1].i];
+    const i_t i           = elements[p1].i;
+    row_last_workspace[i] = last_in_row[i];
   }
   work_estimate += 4 * Cdegree[pivot_j];
 
@@ -569,8 +577,10 @@ void remove_pivot_col(i_t pivot_i,
   for (i_t p1 = first_in_col[pivot_j]; p1 != kNone; p1 = elements[p1].next_in_column) {
     element_t<i_t, f_t>* e = &elements[p1];
     const i_t i            = e->i;
-    i_t last               = kNone;
-    i_t last_surviving     = kNone;
+    // Need both: last = previous-in-row (for link update when removing); last_surviving = new row
+    // tail (for last_in_row[i]). They differ when the pivot is the last element in the row.
+    i_t last           = kNone;
+    i_t last_surviving = kNone;
 #ifdef THRESHOLD_ROOK_PIVOTING
     f_t max_in_row_i = 0.0;
 #endif
@@ -647,13 +657,13 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
     initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count, work_estimate);
 
   // Position arrays for O(1) degree-bucket removal
-  std::vector<i_t> col_pos(n);
+  std::vector<i_t> col_pos(n);  // if Cdegree[j] = nz, then j is in col_count[nz][col_pos[j]]
   for (i_t d = 0; d <= n; ++d) {
     for (i_t pos = 0; pos < static_cast<i_t>(col_count[d].size()); ++pos) {
       col_pos[col_count[d][pos]] = pos;
     }
   }
-  std::vector<i_t> row_pos(n);
+  std::vector<i_t> row_pos(n);  // if Rdegree[i] = nz, then i is in row_count[nz][row_pos[i]]
   for (i_t d = 0; d <= n; ++d) {
     for (i_t pos = 0; pos < static_cast<i_t>(row_count[d].size()); ++pos) {
       row_pos[row_count[d][pos]] = pos;
@@ -1046,15 +1056,14 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
   const i_t Bnz =
     initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count, work_estimate);
 
-  // Position arrays for O(1) degree-bucket removal
-  // col_count has m+1 buckets, row_count has n+1 buckets
-  std::vector<i_t> col_pos(n);
+  // Position arrays for O(1) degree-bucket removal (col_count has m+1 buckets, row_count n+1)
+  std::vector<i_t> col_pos(n);  // if Cdegree[j] = nz, then j is in col_count[nz][col_pos[j]]
   for (i_t d = 0; d <= m; ++d) {
     for (i_t pos = 0; pos < static_cast<i_t>(col_count[d].size()); ++pos) {
       col_pos[col_count[d][pos]] = pos;
     }
   }
-  std::vector<i_t> row_pos(m);
+  std::vector<i_t> row_pos(m);  // if Rdegree[i] = nz, then i is in row_count[nz][row_pos[i]]
   for (i_t d = 0; d <= n; ++d) {
     for (i_t pos = 0; pos < static_cast<i_t>(row_count[d].size()); ++pos) {
       row_pos[row_count[d][pos]] = pos;

From 787fadf31cdc174700a201918d876e78b410e1da Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 16 Mar 2026 09:32:17 -0700
Subject: [PATCH 16/30] Remove code duplication

---
 cpp/src/dual_simplex/right_looking_lu.cpp | 47 +++++++++++++----------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp
index 4800d644c1..37202000f8 100644
--- a/cpp/src/dual_simplex/right_looking_lu.cpp
+++ b/cpp/src/dual_simplex/right_looking_lu.cpp
@@ -79,6 +79,29 @@ i_t initialize_degree_data(const csc_matrix_t<i_t, f_t>& A,
   return Bnz;
 }
 
+// Fill col_pos and row_pos so that column j has col_pos[j] = its index in col_count[Cdegree[j]],
+// and row i has row_pos[i] = its index in row_count[Rdegree[i]]. Enables O(1) degree-bucket
+// removal.
+template <typename i_t>
+void initialize_bucket_positions(const std::vector<std::vector<i_t>>& col_count,
+                                 const std::vector<std::vector<i_t>>& row_count,
+                                 i_t col_max_degree,
+                                 i_t row_max_degree,
+                                 std::vector<i_t>& col_pos,
+                                 std::vector<i_t>& row_pos)
+{
+  for (i_t d = 0; d <= col_max_degree; ++d) {
+    for (i_t pos = 0; pos < static_cast<i_t>(col_count[d].size()); ++pos) {
+      col_pos[col_count[d][pos]] = pos;
+    }
+  }
+  for (i_t d = 0; d <= row_max_degree; ++d) {
+    for (i_t pos = 0; pos < static_cast<i_t>(row_count[d].size()); ++pos) {
+      row_pos[row_count[d][pos]] = pos;
+    }
+  }
+}
+
 template <typename i_t, typename f_t>
 i_t load_elements(const csc_matrix_t<i_t, f_t>& A,
                   const std::vector<i_t>& column_list,
@@ -656,19 +679,10 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
   const i_t Bnz =
     initialize_degree_data(A, column_list, Cdegree, Rdegree, col_count, row_count, work_estimate);
 
-  // Position arrays for O(1) degree-bucket removal
+  // Position arrays for O(1) degree-bucket removal (col_count and row_count each have n+1 buckets)
   std::vector<i_t> col_pos(n);  // if Cdegree[j] = nz, then j is in col_count[nz][col_pos[j]]
-  for (i_t d = 0; d <= n; ++d) {
-    for (i_t pos = 0; pos < static_cast<i_t>(col_count[d].size()); ++pos) {
-      col_pos[col_count[d][pos]] = pos;
-    }
-  }
   std::vector<i_t> row_pos(n);  // if Rdegree[i] = nz, then i is in row_count[nz][row_pos[i]]
-  for (i_t d = 0; d <= n; ++d) {
-    for (i_t pos = 0; pos < static_cast<i_t>(row_count[d].size()); ++pos) {
-      row_pos[row_count[d][pos]] = pos;
-    }
-  }
+  initialize_bucket_positions(col_count, row_count, n, n, col_pos, row_pos);
 
   std::vector<element_t<i_t, f_t>> elements(Bnz);
   std::vector<i_t> first_in_row(n, kNone);
@@ -1058,17 +1072,8 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
 
   // Position arrays for O(1) degree-bucket removal (col_count has m+1 buckets, row_count n+1)
   std::vector<i_t> col_pos(n);  // if Cdegree[j] = nz, then j is in col_count[nz][col_pos[j]]
-  for (i_t d = 0; d <= m; ++d) {
-    for (i_t pos = 0; pos < static_cast<i_t>(col_count[d].size()); ++pos) {
-      col_pos[col_count[d][pos]] = pos;
-    }
-  }
   std::vector<i_t> row_pos(m);  // if Rdegree[i] = nz, then i is in row_count[nz][row_pos[i]]
-  for (i_t d = 0; d <= n; ++d) {
-    for (i_t pos = 0; pos < static_cast<i_t>(row_count[d].size()); ++pos) {
-      row_pos[row_count[d][pos]] = pos;
-    }
-  }
+  initialize_bucket_positions(col_count, row_count, m, n, col_pos, row_pos);
 
   std::vector<element_t<i_t, f_t>> elements(Bnz);
   std::vector<i_t> first_in_row(m, kNone);

From 197bf8d54d6cd1de62ce403876e7a2c2e0b7aa15 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 16 Mar 2026 10:37:23 -0700
Subject: [PATCH 17/30] keep the dense vector path alive

---
 cpp/src/dual_simplex/crossover.cpp | 61 ++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 19 deletions(-)

diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp
index 832f6891f6..14624a4f4c 100644
--- a/cpp/src/dual_simplex/crossover.cpp
+++ b/cpp/src/dual_simplex/crossover.cpp
@@ -389,7 +389,8 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
   const std::vector<f_t>& x = solution.x;
   i_t num_pushes            = 0;
   std::vector<f_t> delta_zN(n - m);
-  std::vector<f_t> delta_expanded(n);
+  std::vector<f_t> delta_expanded;  // workspace for sparse path (delta_y is sparse enough)
+  std::vector<f_t> delta_y_dense;   // workspace for dense path (delta_y is not sparse enough)
   while (superbasic_list.size() > 0) {
     const i_t s                   = superbasic_list.back();
     const i_t basic_leaving_index = superbasic_list_index.back();
@@ -417,24 +418,38 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
     }
 
     // delta_zN = -N^T delta_y
-    std::fill(delta_expanded.begin(), delta_expanded.end(), 0.);
+    // Choose sparse vs dense method by delta_y sparsity (match dual simplex: sparse if <= 30% nnz)
     std::fill(delta_zN.begin(), delta_zN.end(), 0.);
-
-    // Iterate directly over sparse delta_y instead of checking zeros
-    for (i_t nnz_idx = 0; nnz_idx < delta_y_sparse.i.size(); ++nnz_idx) {
-      const i_t row = delta_y_sparse.i[nnz_idx];
-      const f_t val = delta_y_sparse.x[nnz_idx];
-      
-      // Accumulate contributions from this row to all columns
-      const i_t row_start = Arow.row_start[row];
-      const i_t row_end   = Arow.row_start[row + 1];
-      for (i_t p = row_start; p < row_end; ++p) {
-        const i_t col = Arow.j[p];
-        delta_expanded[col] += Arow.x[p] * val;
+    const bool use_sparse = (delta_y_sparse.i.size() * 1.0 / m) <= 0.3;
+
+    if (use_sparse) {
+      delta_expanded.resize(n);
+      std::fill(delta_expanded.begin(), delta_expanded.end(), 0.);
+      for (i_t nnz_idx = 0; nnz_idx < static_cast<i_t>(delta_y_sparse.i.size()); ++nnz_idx) {
+        const i_t row       = delta_y_sparse.i[nnz_idx];
+        const f_t val       = delta_y_sparse.x[nnz_idx];
+        const i_t row_start = Arow.row_start[row];
+        const i_t row_end   = Arow.row_start[row + 1];
+        for (i_t p = row_start; p < row_end; ++p) {
+          const i_t col = Arow.j[p];
+          delta_expanded[col] += Arow.x[p] * val;
+        }
+      }
+      for (i_t k = 0; k < n - m; ++k) {
+        delta_zN[k] = -delta_expanded[nonbasic_list[k]];
+      }
+    } else {
+      delta_y_sparse.to_dense(delta_y_dense);
+      for (i_t k = 0; k < n - m; ++k) {
+        const i_t j       = nonbasic_list[k];
+        f_t dot           = 0.0;
+        const i_t c_start = lp.A.col_start[j];
+        const i_t c_end   = lp.A.col_start[j + 1];
+        for (i_t p = c_start; p < c_end; ++p) {
+          dot += lp.A.x[p] * delta_y_dense[lp.A.i[p]];
+        }
+        delta_zN[k] = -dot;
       }
-    }
-    for (i_t k = 0; k < n - m; ++k) {
-      delta_zN[k] = -delta_expanded[nonbasic_list[k]];
     }
 
     i_t entering_index          = -1;
@@ -1345,8 +1360,16 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   basis_update_mpf_t ft(L, U, p, settings.refactor_frequency);
   verify_basis<i_t, f_t>(m, n, vstatus);
   compare_vstatus_with_lists<i_t, f_t>(m, n, basic_list, nonbasic_list, vstatus);
-  i_t dual_push_status = dual_push(
-    lp, Arow, settings, start_time, solution, ft, basic_list, nonbasic_list, superbasic_list, vstatus);
+  i_t dual_push_status = dual_push(lp,
+                                   Arow,
+                                   settings,
+                                   start_time,
+                                   solution,
+                                   ft,
+                                   basic_list,
+                                   nonbasic_list,
+                                   superbasic_list,
+                                   vstatus);
   if (dual_push_status < 0) { return return_to_status(dual_push_status); }
   settings.log.debug("basic list size %ld m %d\n", basic_list.size(), m);
   settings.log.debug("nonbasic list size %ld n - m %d\n", nonbasic_list.size(), n - m);

From 990bcd0c05b0562ae184d7a1794bd4907f960fb8 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Fri, 20 Mar 2026 08:21:54 -0700
Subject: [PATCH 18/30] Revert "Update to 26.06 (#975)"

This reverts commit e2ea6872842475d01f95e3ed8f8abaee0534fdc9.
---
 .claude-plugin/marketplace.json               |  2 +-
 .cursor-plugin/plugin.json                    |  2 +-
 .github/workflows/build.yaml                  | 30 +++++------
 .../workflows/build_test_publish_images.yaml  |  2 +-
 .github/workflows/pr.yaml                     | 32 ++++++------
 .github/workflows/test.yaml                   |  2 +-
 README.md                                     | 12 ++---
 VERSION                                       |  2 +-
 .../all_cuda-129_arch-aarch64.yaml            | 10 ++--
 .../all_cuda-129_arch-x86_64.yaml             | 10 ++--
 .../all_cuda-131_arch-aarch64.yaml            | 10 ++--
 .../all_cuda-131_arch-x86_64.yaml             | 10 ++--
 dependencies.yaml                             | 50 +++++++++----------
 gemini-extension.json                         |  2 +-
 helmchart/cuopt-server/Chart.yaml             |  4 +-
 helmchart/cuopt-server/values.yaml            |  2 +-
 python/cuopt/pyproject.toml                   | 18 +++----
 python/cuopt_self_hosted/pyproject.toml       |  2 +-
 python/cuopt_server/pyproject.toml            |  2 +-
 python/libcuopt/pyproject.toml                |  8 +--
 skills/cuopt-developer/SKILL.md               |  2 +-
 skills/cuopt-installation-api-c/SKILL.md      |  2 +-
 skills/cuopt-installation-api-python/SKILL.md |  2 +-
 skills/cuopt-installation-common/SKILL.md     |  2 +-
 skills/cuopt-installation-developer/SKILL.md  |  2 +-
 skills/cuopt-lp-milp-api-c/SKILL.md           |  2 +-
 skills/cuopt-lp-milp-api-cli/SKILL.md         |  2 +-
 skills/cuopt-lp-milp-api-python/SKILL.md      |  2 +-
 skills/cuopt-qp-api-c/SKILL.md                |  2 +-
 skills/cuopt-qp-api-cli/SKILL.md              |  2 +-
 skills/cuopt-qp-api-python/SKILL.md           |  2 +-
 skills/cuopt-routing-api-python/SKILL.md      |  2 +-
 skills/cuopt-server-api-python/SKILL.md       |  2 +-
 skills/cuopt-server-common/SKILL.md           |  2 +-
 skills/cuopt-user-rules/SKILL.md              |  2 +-
 skills/lp-milp-formulation/SKILL.md           |  2 +-
 skills/qp-formulation/SKILL.md                |  2 +-
 skills/routing-formulation/SKILL.md           |  2 +-
 skills/skill-evolution/SKILL.md               |  4 +-
 39 files changed, 126 insertions(+), 126 deletions(-)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 6ddf2583c4..4c5df380f6 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -5,7 +5,7 @@
   },
   "metadata": {
     "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server.",
-    "version": "26.06.00"
+    "version": "26.04.00"
   },
   "plugins": [
     {
diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json
index e740506140..5f34873671 100644
--- a/.cursor-plugin/plugin.json
+++ b/.cursor-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "nvidia-cuopt-skills",
   "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server. Use when building or solving optimization with cuOpt.",
-  "version": "26.06.00",
+  "version": "26.04.00",
   "author": {
     "name": "NVIDIA"
   },
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index a945cde8ec..593d48bd74 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -45,7 +45,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -55,7 +55,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -65,7 +65,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cuopt-mps-parser:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
   wheel-publish-cuopt-mps-parser:
     needs: wheel-build-cuopt-mps-parser
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -99,7 +99,7 @@ jobs:
   wheel-build-libcuopt:
     needs: wheel-build-cuopt-mps-parser
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -112,7 +112,7 @@ jobs:
   wheel-publish-libcuopt:
     needs: wheel-build-libcuopt
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -123,7 +123,7 @@ jobs:
   wheel-build-cuopt:
     needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -135,7 +135,7 @@ jobs:
   wheel-publish-cuopt:
     needs: wheel-build-cuopt
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -145,7 +145,7 @@ jobs:
       package-type: python
   wheel-build-cuopt-server:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -160,7 +160,7 @@ jobs:
   wheel-publish-cuopt-server:
     needs: wheel-build-cuopt-server
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -171,7 +171,7 @@ jobs:
   docs-build:
     needs: [python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       node_type: "gpu-l4-latest-1"
@@ -181,11 +181,11 @@ jobs:
       arch: "amd64"
       file_to_upload: "docs/cuopt/build/html/"
       artifact-name: "cuopt_docs"
-      container_image: "rapidsai/ci-conda:26.06-latest"
+      container_image: "rapidsai/ci-conda:26.04-latest"
       script: "ci/build_docs.sh"
   wheel-build-cuopt-sh-client:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -201,7 +201,7 @@ jobs:
   wheel-publish-cuopt-sh-client:
     needs: wheel-build-cuopt-sh-client
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/build_test_publish_images.yaml b/.github/workflows/build_test_publish_images.yaml
index 17d4e9ab57..f8f7366e13 100644
--- a/.github/workflows/build_test_publish_images.yaml
+++ b/.github/workflows/build_test_publish_images.yaml
@@ -55,7 +55,7 @@ jobs:
   compute-matrix:
     runs-on: ubuntu-latest
     container:
-      image: rapidsai/ci-conda:26.06-latest
+      image: rapidsai/ci-conda:26.04-latest
     outputs:
       MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }}
       CUOPT_VER: ${{ steps.compute-cuopt-ver.outputs.CUOPT_VER }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index a652c23b9a..95741c1fb5 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -34,7 +34,7 @@ jobs:
       - wheel-build-cuopt-sh-client
       - test-self-hosted-server
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.14
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
@@ -111,7 +111,7 @@ jobs:
 
   changed-files:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@python-3.14
     with:
       files_yaml: |
         build_docs:
@@ -279,20 +279,20 @@ jobs:
           - '!gemini-extension.json'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.14
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: [checks, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_cpp.sh
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_lean_filter }}
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.14
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
@@ -308,14 +308,14 @@ jobs:
   conda-python-build:
     needs: [conda-cpp-build, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_python.sh
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }}
   conda-python-tests:
     needs: [conda-python-build, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.14
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_conda
     with:
       run_codecov: false
@@ -332,7 +332,7 @@ jobs:
   docs-build:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.14
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).build_docs
     with:
       build_type: pull-request
@@ -340,12 +340,12 @@ jobs:
       arch: "amd64"
       file_to_upload: "docs/cuopt/build/html/"
       artifact-name: "cuopt_docs"
-      container_image: "rapidsai/ci-conda:26.06-latest"
+      container_image: "rapidsai/ci-conda:26.04-latest"
       script: "ci/build_docs.sh"
   wheel-build-cuopt-mps-parser:
     needs: compute-matrix-filters
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_mps_parser.sh
@@ -357,7 +357,7 @@ jobs:
   wheel-build-libcuopt:
     needs: [wheel-build-cuopt-mps-parser, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.libcuopt_filter }}
@@ -368,7 +368,7 @@ jobs:
   wheel-build-cuopt:
     needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt.sh
@@ -377,7 +377,7 @@ jobs:
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }}
   wheel-tests-cuopt:
     needs: [wheel-build-cuopt, wheel-build-cuopt-mps-parser, wheel-build-cuopt-sh-client, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.14
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels
     with:
       build_type: pull-request
@@ -393,7 +393,7 @@ jobs:
   wheel-build-cuopt-server:
     needs: [checks, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_server.sh
@@ -405,7 +405,7 @@ jobs:
   wheel-build-cuopt-sh-client:
     needs: compute-matrix-filters
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_sh_client.sh
@@ -417,7 +417,7 @@ jobs:
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.cuopt_sh_client_filter }}
   wheel-tests-cuopt-server:
     needs: [wheel-build-cuopt, wheel-build-cuopt-server, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.14
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels
     with:
       build_type: pull-request
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index a8cc5f2943..e88b7829f5 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -97,5 +97,5 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-l4-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:26.06-latest"
+      container_image: "rapidsai/ci-conda:26.04-latest"
       script: ci/test_notebooks.sh
diff --git a/README.md b/README.md
index 95c8598d77..379a48c350 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 # cuOpt - GPU-accelerated Optimization
 
 [![Build Status](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml/badge.svg)](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml)
-[![Version](https://img.shields.io/badge/version-26.06.00-blue)](https://github.com/NVIDIA/cuopt/releases)
+[![Version](https://img.shields.io/badge/version-26.04.00-blue)](https://github.com/NVIDIA/cuopt/releases)
 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen)](https://docs.nvidia.com/cuopt/user-guide/latest/introduction.html)
 [![Docker Hub](https://img.shields.io/badge/docker-nvidia%2Fcuopt-blue?logo=docker)](https://hub.docker.com/r/nvidia/cuopt)
 [![Examples](https://img.shields.io/badge/examples-cuopt--examples-orange)](https://github.com/NVIDIA/cuopt-examples)
@@ -83,7 +83,7 @@ For CUDA 12.x:
 pip install \
   --extra-index-url=https://pypi.nvidia.com \
   nvidia-cuda-runtime-cu12==12.9.* \
-  cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.*
+  cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.*
 ```
 
 Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages.
@@ -91,7 +91,7 @@ Development wheels are available as nightlies, please update `--extra-index-url`
 pip install --pre \
   --extra-index-url=https://pypi.nvidia.com \
   --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \
-  cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.*
+  cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.*
 ```
 
 For CUDA 13.x:
@@ -99,7 +99,7 @@ For CUDA 13.x:
 ```bash
 pip install \
   --extra-index-url=https://pypi.nvidia.com \
-  cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.*
+  cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.*
 ```
 
 Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages.
@@ -107,7 +107,7 @@ Development wheels are available as nightlies, please update `--extra-index-url`
 pip install --pre \
   --extra-index-url=https://pypi.nvidia.com \
   --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \
-  cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.*
+  cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.*
 ```
 
 
@@ -118,7 +118,7 @@ cuOpt can be installed with conda (via [miniforge](https://github.com/conda-forg
 All other dependencies are installed automatically when `cuopt-server` and `cuopt-sh-client` are installed.
 
 ```bash
-conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.06.* cuopt-sh-client=26.06.*
+conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.04.* cuopt-sh-client=26.04.*
 ```
 
 We also provide [nightly conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
diff --git a/VERSION b/VERSION
index cdb610a24d..0bd0e8a95b 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-26.06.00
+26.04.00
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index 104e7e70d1..ecef112dd5 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -18,7 +18,7 @@ dependencies:
 - cuda-python>=12.9.2,<13.0
 - cuda-sanitizer-api
 - cuda-version=12.9
-- cudf==26.6.*,>=0.0.0a0
+- cudf==26.4.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -32,8 +32,8 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libraft-headers==26.6.*,>=0.0.0a0
-- librmm==26.6.*,>=0.0.0a0
+- libraft-headers==26.4.*,>=0.0.0a0
+- librmm==26.4.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -50,7 +50,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.6.*,>=0.0.0a0
+- pylibraft==26.4.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -59,7 +59,7 @@ dependencies:
 - rapids-build-backend>=0.4.0,<0.5.0
 - rapids-logger==0.2.*,>=0.0.0a0
 - requests
-- rmm==26.6.*,>=0.0.0a0
+- rmm==26.4.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index 06aa6121f9..35c825280c 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -18,7 +18,7 @@ dependencies:
 - cuda-python>=12.9.2,<13.0
 - cuda-sanitizer-api
 - cuda-version=12.9
-- cudf==26.6.*,>=0.0.0a0
+- cudf==26.4.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -32,8 +32,8 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libraft-headers==26.6.*,>=0.0.0a0
-- librmm==26.6.*,>=0.0.0a0
+- libraft-headers==26.4.*,>=0.0.0a0
+- librmm==26.4.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -50,7 +50,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.6.*,>=0.0.0a0
+- pylibraft==26.4.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -59,7 +59,7 @@ dependencies:
 - rapids-build-backend>=0.4.0,<0.5.0
 - rapids-logger==0.2.*,>=0.0.0a0
 - requests
-- rmm==26.6.*,>=0.0.0a0
+- rmm==26.4.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml
index a68ebf1285..2b717d4e98 100644
--- a/conda/environments/all_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-131_arch-aarch64.yaml
@@ -18,7 +18,7 @@ dependencies:
 - cuda-python>=13.0.1,<14.0
 - cuda-sanitizer-api
 - cuda-version=13.1
-- cudf==26.6.*,>=0.0.0a0
+- cudf==26.4.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -32,8 +32,8 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libraft-headers==26.6.*,>=0.0.0a0
-- librmm==26.6.*,>=0.0.0a0
+- libraft-headers==26.4.*,>=0.0.0a0
+- librmm==26.4.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -50,7 +50,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.6.*,>=0.0.0a0
+- pylibraft==26.4.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -59,7 +59,7 @@ dependencies:
 - rapids-build-backend>=0.4.0,<0.5.0
 - rapids-logger==0.2.*,>=0.0.0a0
 - requests
-- rmm==26.6.*,>=0.0.0a0
+- rmm==26.4.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml
index 043d55e148..f605a83f3b 100644
--- a/conda/environments/all_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-131_arch-x86_64.yaml
@@ -18,7 +18,7 @@ dependencies:
 - cuda-python>=13.0.1,<14.0
 - cuda-sanitizer-api
 - cuda-version=13.1
-- cudf==26.6.*,>=0.0.0a0
+- cudf==26.4.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -32,8 +32,8 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libraft-headers==26.6.*,>=0.0.0a0
-- librmm==26.6.*,>=0.0.0a0
+- libraft-headers==26.4.*,>=0.0.0a0
+- librmm==26.4.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -50,7 +50,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.6.*,>=0.0.0a0
+- pylibraft==26.4.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -59,7 +59,7 @@ dependencies:
 - rapids-build-backend>=0.4.0,<0.5.0
 - rapids-logger==0.2.*,>=0.0.0a0
 - requests
-- rmm==26.6.*,>=0.0.0a0
+- rmm==26.4.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/dependencies.yaml b/dependencies.yaml
index db60f63569..014889c7d5 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -311,7 +311,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
-          - libcuopt-tests==26.6.*,>=0.0.0a0
+          - libcuopt-tests==26.4.*,>=0.0.0a0
   build_wheels:
     common:
       - output_types: [requirements, pyproject]
@@ -413,7 +413,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &libcuopt_unsuffixed libcuopt==26.6.*,>=0.0.0a0
+          - &libcuopt_unsuffixed libcuopt==26.4.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -426,18 +426,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - libcuopt-cu12==26.6.*,>=0.0.0a0
+              - libcuopt-cu12==26.4.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - libcuopt-cu13==26.6.*,>=0.0.0a0
+              - libcuopt-cu13==26.4.*,>=0.0.0a0
           - {matrix: null, packages: [*libcuopt_unsuffixed]}
   depends_on_cuopt:
     common:
       - output_types: conda
         packages:
-          - &cuopt_unsuffixed cuopt==26.6.*,>=0.0.0a0
+          - &cuopt_unsuffixed cuopt==26.4.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -450,18 +450,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-cu12==26.6.*,>=0.0.0a0
+              - cuopt-cu12==26.4.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-cu13==26.6.*,>=0.0.0a0
+              - cuopt-cu13==26.4.*,>=0.0.0a0
           - {matrix: null, packages: [*cuopt_unsuffixed]}
   depends_on_cuopt_server:
     common:
       - output_types: conda
         packages:
-          - &cuopt_server_unsuffixed cuopt-server==26.6.*,>=0.0.0a0
+          - &cuopt_server_unsuffixed cuopt-server==26.4.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -474,18 +474,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-server-cu12==26.6.*,>=0.0.0a0
+              - cuopt-server-cu12==26.4.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-server-cu13==26.6.*,>=0.0.0a0
+              - cuopt-server-cu13==26.4.*,>=0.0.0a0
           - {matrix: null, packages: [*cuopt_server_unsuffixed]}
   depends_on_cuopt_sh_client:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - &cuopt_sh_client_unsuffixed cuopt-sh-client==26.6.*,>=0.0.0a0
+          - &cuopt_sh_client_unsuffixed cuopt-sh-client==26.4.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -495,7 +495,7 @@ dependencies:
     common:
       - output_types: [requirements, pyproject, conda]
         packages:
-          - cuopt-mps-parser==26.6.*,>=0.0.0a0
+          - cuopt-mps-parser==26.4.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -505,12 +505,12 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - libraft-headers==26.6.*,>=0.0.0a0
+          - libraft-headers==26.4.*,>=0.0.0a0
   depends_on_librmm:
     common:
       - output_types: conda
         packages:
-          - &librmm_unsuffixed librmm==26.6.*,>=0.0.0a0
+          - &librmm_unsuffixed librmm==26.4.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -522,12 +522,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - librmm-cu12==26.6.*,>=0.0.0a0
+              - librmm-cu12==26.4.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - librmm-cu13==26.6.*,>=0.0.0a0
+              - librmm-cu13==26.4.*,>=0.0.0a0
           - {matrix: null, packages: [*librmm_unsuffixed]}
   depends_on_cupy:
     common:
@@ -562,7 +562,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &rmm_unsuffixed rmm==26.6.*,>=0.0.0a0
+          - &rmm_unsuffixed rmm==26.4.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -574,12 +574,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu12==26.6.*,>=0.0.0a0
+              - rmm-cu12==26.4.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu13==26.6.*,>=0.0.0a0
+              - rmm-cu13==26.4.*,>=0.0.0a0
           - matrix:
             packages:
               - *rmm_unsuffixed
@@ -588,7 +588,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &cudf_unsuffixed cudf==26.6.*,>=0.0.0a0
+          - &cudf_unsuffixed cudf==26.4.*,>=0.0.0a0
       - output_types: requirements
         packages:
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
@@ -599,12 +599,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - cudf-cu12==26.6.*,>=0.0.0a0
+              - cudf-cu12==26.4.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - cudf-cu13==26.6.*,>=0.0.0a0
+              - cudf-cu13==26.4.*,>=0.0.0a0
           - matrix:
             packages:
               - *cudf_unsuffixed
@@ -613,7 +613,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &pylibraft_unsuffixed pylibraft==26.6.*,>=0.0.0a0
+          - &pylibraft_unsuffixed pylibraft==26.4.*,>=0.0.0a0
       - output_types: requirements
         packages:
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
@@ -624,12 +624,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - pylibraft-cu12==26.6.*,>=0.0.0a0
+              - pylibraft-cu12==26.4.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - pylibraft-cu13==26.6.*,>=0.0.0a0
+              - pylibraft-cu13==26.4.*,>=0.0.0a0
           - matrix:
             packages:
               - *pylibraft_unsuffixed
diff --git a/gemini-extension.json b/gemini-extension.json
index c5ef9883f8..b4c6b764a4 100644
--- a/gemini-extension.json
+++ b/gemini-extension.json
@@ -1,6 +1,6 @@
 {
   "name": "nvidia-cuopt-skills",
   "description": "Agent skills for NVIDIA cuOpt optimization engine: routing, LP/MILP/QP, installation, and server.",
-  "version": "26.06.00",
+  "version": "26.04.00",
   "contextFileName": "AGENTS.md"
 }
diff --git a/helmchart/cuopt-server/Chart.yaml b/helmchart/cuopt-server/Chart.yaml
index 811ac067cb..074d94bec9 100644
--- a/helmchart/cuopt-server/Chart.yaml
+++ b/helmchart/cuopt-server/Chart.yaml
@@ -1,5 +1,5 @@
 apiVersion: v2
-appVersion: 26.6.0
+appVersion: 26.4.0
 description: A Helm chart for NVIDIA cuOpt Server with GPU support
 home: https://docs.nvidia.com/cuopt/user-guide/latest/resources.html
 keywords:
@@ -14,4 +14,4 @@ name: cuopt-server
 sources:
 - https://docs.nvidia.com/cuopt/user-guide/latest/resources.html
 type: application
-version: 26.6.0
+version: 26.4.0
diff --git a/helmchart/cuopt-server/values.yaml b/helmchart/cuopt-server/values.yaml
index 6adafea79e..5218596552 100644
--- a/helmchart/cuopt-server/values.yaml
+++ b/helmchart/cuopt-server/values.yaml
@@ -7,7 +7,7 @@ replicaCount: 1
 image:
   repository: nvidia/cuopt
   pullPolicy: IfNotPresent
-  tag: "26.6.0-cuda12.9-py3.12"
+  tag: "26.4.0-cuda12.9-py3.12"
 
 imagePullSecrets: []
 nameOverride: ""
diff --git a/python/cuopt/pyproject.toml b/python/cuopt/pyproject.toml
index eff7e01769..e86b5bdd73 100644
--- a/python/cuopt/pyproject.toml
+++ b/python/cuopt/pyproject.toml
@@ -20,18 +20,18 @@ license = "Apache-2.0"
 requires-python = ">=3.11"
 dependencies = [
     "cuda-python>=13.0.1,<14.0",
-    "cudf==26.6.*,>=0.0.0a0",
-    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
+    "cudf==26.4.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
-    "libcuopt==26.6.*,>=0.0.0a0",
+    "libcuopt==26.4.*,>=0.0.0a0",
     "numba-cuda>=0.22.1",
     "numba>=0.60.0,<0.65.0",
     "numpy>=1.23.5,<3.0",
     "pandas>=2.0",
-    "pylibraft==26.6.*,>=0.0.0a0",
+    "pylibraft==26.4.*,>=0.0.0a0",
     "pyyaml>=6.0.0",
     "rapids-logger==0.2.*,>=0.0.0a0",
-    "rmm==26.6.*,>=0.0.0a0",
+    "rmm==26.4.*,>=0.0.0a0",
     "scipy>=1.14.1",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -101,12 +101,12 @@ dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 requires = [
     "cmake>=3.30.4",
-    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
     "cython>=3.0.3",
-    "libcuopt==26.6.*,>=0.0.0a0",
+    "libcuopt==26.4.*,>=0.0.0a0",
     "ninja",
-    "pylibraft==26.6.*,>=0.0.0a0",
+    "pylibraft==26.4.*,>=0.0.0a0",
     "rapids-logger==0.2.*,>=0.0.0a0",
-    "rmm==26.6.*,>=0.0.0a0",
+    "rmm==26.4.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cuopt_self_hosted/pyproject.toml b/python/cuopt_self_hosted/pyproject.toml
index 43aa80a5b3..7645c99ed0 100644
--- a/python/cuopt_self_hosted/pyproject.toml
+++ b/python/cuopt_self_hosted/pyproject.toml
@@ -20,7 +20,7 @@ license = "Apache-2.0"
 license-files = ["LICENSE"]
 requires-python = ">=3.11"
 dependencies = [
-    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
     "msgpack-numpy==0.4.8",
     "msgpack==1.1.2",
     "requests",
diff --git a/python/cuopt_server/pyproject.toml b/python/cuopt_server/pyproject.toml
index ce96c884be..d24cfcbd77 100644
--- a/python/cuopt_server/pyproject.toml
+++ b/python/cuopt_server/pyproject.toml
@@ -21,7 +21,7 @@ license = "Apache-2.0"
 license-files = ["LICENSE"]
 requires-python = ">=3.11"
 dependencies = [
-    "cuopt==26.6.*,>=0.0.0a0",
+    "cuopt==26.4.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
     "fastapi",
     "jsonref==1.1.0",
diff --git a/python/libcuopt/pyproject.toml b/python/libcuopt/pyproject.toml
index e5c0c58fab..2507971a0f 100644
--- a/python/libcuopt/pyproject.toml
+++ b/python/libcuopt/pyproject.toml
@@ -30,8 +30,8 @@ classifiers = [
     "Programming Language :: Python :: 3.14",
 ]
 dependencies = [
-    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
-    "librmm==26.6.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
+    "librmm==26.4.*,>=0.0.0a0",
     "nvidia-cublas",
     "nvidia-cudart",
     "nvidia-cudss",
@@ -81,8 +81,8 @@ dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 requires = [
     "cmake>=3.30.4",
-    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
-    "librmm==26.6.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
+    "librmm==26.4.*,>=0.0.0a0",
     "ninja",
     "rapids-logger==0.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/skills/cuopt-developer/SKILL.md b/skills/cuopt-developer/SKILL.md
index 99743f9171..12419153ac 100644
--- a/skills/cuopt-developer/SKILL.md
+++ b/skills/cuopt-developer/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-developer
-version: "26.06.00"
+version: "26.04.00"
 description: Contribute to NVIDIA cuOpt codebase including C++/CUDA, Python, server, docs, and CI. Use when the user wants to modify solver internals, add features, submit PRs, or understand the codebase architecture.
 ---
 
diff --git a/skills/cuopt-installation-api-c/SKILL.md b/skills/cuopt-installation-api-c/SKILL.md
index bd4d60becc..747382e3c7 100644
--- a/skills/cuopt-installation-api-c/SKILL.md
+++ b/skills/cuopt-installation-api-c/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-api-c
-version: "26.06.00"
+version: "26.04.00"
 description: Install cuOpt for C — conda, locate lib/headers, verification. Use when the user is installing or verifying the C API. Standalone; no common skill.
 ---
 
diff --git a/skills/cuopt-installation-api-python/SKILL.md b/skills/cuopt-installation-api-python/SKILL.md
index 771f5ec8b0..a3d7a5e5d2 100644
--- a/skills/cuopt-installation-api-python/SKILL.md
+++ b/skills/cuopt-installation-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-api-python
-version: "26.06.00"
+version: "26.04.00"
 description: Install cuOpt for Python — pip, conda, Docker, verification. Use when the user is installing or verifying the Python API. Standalone; no common skill.
 ---
 
diff --git a/skills/cuopt-installation-common/SKILL.md b/skills/cuopt-installation-common/SKILL.md
index 88534fb810..6ceb9f9000 100644
--- a/skills/cuopt-installation-common/SKILL.md
+++ b/skills/cuopt-installation-common/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-common
-version: "26.06.00"
+version: "26.04.00"
 description: Install cuOpt — system and environment requirements only. Domain concepts; no install commands or interface guidance.
 ---
 
diff --git a/skills/cuopt-installation-developer/SKILL.md b/skills/cuopt-installation-developer/SKILL.md
index 1f3dff0d3f..a002498853 100644
--- a/skills/cuopt-installation-developer/SKILL.md
+++ b/skills/cuopt-installation-developer/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-developer
-version: "26.06.00"
+version: "26.04.00"
 description: Developer installation — build cuOpt from source, run tests. Use when the user wants to set up a dev environment to contribute or modify cuOpt.
 ---
 
diff --git a/skills/cuopt-lp-milp-api-c/SKILL.md b/skills/cuopt-lp-milp-api-c/SKILL.md
index 74b0d5dc92..53df3de63e 100644
--- a/skills/cuopt-lp-milp-api-c/SKILL.md
+++ b/skills/cuopt-lp-milp-api-c/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-lp-milp-api-c
-version: "26.06.00"
+version: "26.04.00"
 description: LP and MILP with cuOpt — C API only. Use when the user is embedding LP/MILP in C/C++.
 ---
 
diff --git a/skills/cuopt-lp-milp-api-cli/SKILL.md b/skills/cuopt-lp-milp-api-cli/SKILL.md
index 1f8e8a157c..cbdc1e7778 100644
--- a/skills/cuopt-lp-milp-api-cli/SKILL.md
+++ b/skills/cuopt-lp-milp-api-cli/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-lp-milp-api-cli
-version: "26.06.00"
+version: "26.04.00"
 description: LP and MILP with cuOpt — CLI only (MPS files, cuopt_cli). Use when the user is solving from MPS via command line.
 ---
 
diff --git a/skills/cuopt-lp-milp-api-python/SKILL.md b/skills/cuopt-lp-milp-api-python/SKILL.md
index e8435867db..a7cd9a59f2 100644
--- a/skills/cuopt-lp-milp-api-python/SKILL.md
+++ b/skills/cuopt-lp-milp-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-lp-milp-api-python
-version: "26.06.00"
+version: "26.04.00"
 description: Solve Linear Programming (LP) and Mixed-Integer Linear Programming (MILP) with the Python API. Use when the user asks about optimization with linear constraints, integer variables, scheduling, resource allocation, facility location, or production planning.
 ---
 
diff --git a/skills/cuopt-qp-api-c/SKILL.md b/skills/cuopt-qp-api-c/SKILL.md
index 85014b81fd..bc1efb63d3 100644
--- a/skills/cuopt-qp-api-c/SKILL.md
+++ b/skills/cuopt-qp-api-c/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-qp-api-c
-version: "26.06.00"
+version: "26.04.00"
 description: Quadratic Programming (QP) with cuOpt — C API. Use when the user is embedding QP in C/C++.
 ---
 
diff --git a/skills/cuopt-qp-api-cli/SKILL.md b/skills/cuopt-qp-api-cli/SKILL.md
index 7aec559126..5f8a8e848a 100644
--- a/skills/cuopt-qp-api-cli/SKILL.md
+++ b/skills/cuopt-qp-api-cli/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-qp-api-cli
-version: "26.06.00"
+version: "26.04.00"
 description: QP with cuOpt — CLI (e.g. cuopt_cli with QP-capable input). Use when the user is solving QP from the command line.
 ---
 
diff --git a/skills/cuopt-qp-api-python/SKILL.md b/skills/cuopt-qp-api-python/SKILL.md
index 39533aaeca..b85b9e3db2 100644
--- a/skills/cuopt-qp-api-python/SKILL.md
+++ b/skills/cuopt-qp-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-qp-api-python
-version: "26.06.00"
+version: "26.04.00"
 description: Quadratic Programming (QP) with cuOpt — Python API only (beta). Use when the user is building or solving QP in Python.
 ---
 
diff --git a/skills/cuopt-routing-api-python/SKILL.md b/skills/cuopt-routing-api-python/SKILL.md
index c386107241..d8bf736f8f 100644
--- a/skills/cuopt-routing-api-python/SKILL.md
+++ b/skills/cuopt-routing-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-routing-api-python
-version: "26.06.00"
+version: "26.04.00"
 description: Vehicle routing (VRP, TSP, PDP) with cuOpt — Python API only. Use when the user is building or solving routing in Python.
 ---
 
diff --git a/skills/cuopt-server-api-python/SKILL.md b/skills/cuopt-server-api-python/SKILL.md
index 7d6ed175dd..b340e9883f 100644
--- a/skills/cuopt-server-api-python/SKILL.md
+++ b/skills/cuopt-server-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-server-api-python
-version: "26.06.00"
+version: "26.04.00"
 description: cuOpt REST server — start server, endpoints, Python/curl client examples. Use when the user is deploying or calling the REST API.
 ---
 
diff --git a/skills/cuopt-server-common/SKILL.md b/skills/cuopt-server-common/SKILL.md
index cc2a3728d5..f23c9c4a5f 100644
--- a/skills/cuopt-server-common/SKILL.md
+++ b/skills/cuopt-server-common/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-server-common
-version: "26.06.00"
+version: "26.04.00"
 description: cuOpt REST server — what it does and how requests flow. Domain concepts; no deploy or client code.
 ---
 
diff --git a/skills/cuopt-user-rules/SKILL.md b/skills/cuopt-user-rules/SKILL.md
index 87734f72a2..0777b9af15 100644
--- a/skills/cuopt-user-rules/SKILL.md
+++ b/skills/cuopt-user-rules/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-user-rules
-version: "26.06.00"
+version: "26.04.00"
 description: Base behavior rules for using NVIDIA cuOpt. Read this FIRST before any cuOpt user task (routing, LP/MILP, QP, installation, server). Covers handling incomplete questions, clarifying data requirements, verifying understanding, and running commands safely.
 ---
 
diff --git a/skills/lp-milp-formulation/SKILL.md b/skills/lp-milp-formulation/SKILL.md
index e429282033..64431a04c4 100644
--- a/skills/lp-milp-formulation/SKILL.md
+++ b/skills/lp-milp-formulation/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: lp-milp-formulation
-version: "26.06.00"
+version: "26.04.00"
 description: LP/MILP concepts and going from problem text to formulation. What LP/MILP are, required formulation questions, typical modeling elements, and how to parse problem statements (parameters, constraints, decisions, objective).
 ---
 
diff --git a/skills/qp-formulation/SKILL.md b/skills/qp-formulation/SKILL.md
index 60aed00ede..c87b887fbc 100644
--- a/skills/qp-formulation/SKILL.md
+++ b/skills/qp-formulation/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: qp-formulation
-version: "26.06.00"
+version: "26.04.00"
 description: Quadratic Programming (QP) — problem form and constraints. Domain concepts; no API or interface. QP is beta.
 ---
 
diff --git a/skills/routing-formulation/SKILL.md b/skills/routing-formulation/SKILL.md
index 9cf8060cdf..4ab8d6419d 100644
--- a/skills/routing-formulation/SKILL.md
+++ b/skills/routing-formulation/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: routing-formulation
-version: "26.06.00"
+version: "26.04.00"
 description: Vehicle routing (VRP, TSP, PDP) — problem types and data requirements. Domain concepts; no API or interface.
 ---
 
diff --git a/skills/skill-evolution/SKILL.md b/skills/skill-evolution/SKILL.md
index f3605795b7..d77fba1a3f 100644
--- a/skills/skill-evolution/SKILL.md
+++ b/skills/skill-evolution/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: skill-evolution
-version: "26.06.00"
+version: "26.04.00"
 description: After solving a non-trivial problem, detect generalizable learnings and propose skill updates so future interactions benefit automatically. Always active — applies to every interaction.
 ---
 
@@ -182,7 +182,7 @@ When skill evolution creates an entirely new skill directory, add `origin: skill
 ```yaml
 ---
 name: new-skill-name
-version: "26.06.00"
+version: "26.04.00"
 description: ...
 origin: skill-evolution
 ---

From 40b9e49fb32bc31d59e965ddb1fa479e7a5f126c Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Fri, 20 Mar 2026 08:28:55 -0700
Subject: [PATCH 19/30] Cleanup unnecessary changes

---
 .github/workflows/build.yaml                  | 28 ++++++++---------
 .github/workflows/pr.yaml                     | 30 +++++++++----------
 .github/workflows/test.yaml                   | 10 +++----
 .../trigger-breaking-change-alert.yaml        |  2 +-
 4 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 593d48bd74..3eb1f1f066 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -45,7 +45,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -55,7 +55,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -65,7 +65,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cuopt-mps-parser:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
   wheel-publish-cuopt-mps-parser:
     needs: wheel-build-cuopt-mps-parser
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -99,7 +99,7 @@ jobs:
   wheel-build-libcuopt:
     needs: wheel-build-cuopt-mps-parser
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -112,7 +112,7 @@ jobs:
   wheel-publish-libcuopt:
     needs: wheel-build-libcuopt
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -123,7 +123,7 @@ jobs:
   wheel-build-cuopt:
     needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -135,7 +135,7 @@ jobs:
   wheel-publish-cuopt:
     needs: wheel-build-cuopt
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -145,7 +145,7 @@ jobs:
       package-type: python
   wheel-build-cuopt-server:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -160,7 +160,7 @@ jobs:
   wheel-publish-cuopt-server:
     needs: wheel-build-cuopt-server
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -171,7 +171,7 @@ jobs:
   docs-build:
     needs: [python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       node_type: "gpu-l4-latest-1"
@@ -185,7 +185,7 @@ jobs:
       script: "ci/build_docs.sh"
   wheel-build-cuopt-sh-client:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -201,7 +201,7 @@ jobs:
   wheel-publish-cuopt-sh-client:
     needs: wheel-build-cuopt-sh-client
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 95741c1fb5..47a3bd9fca 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -34,7 +34,7 @@ jobs:
       - wheel-build-cuopt-sh-client
       - test-self-hosted-server
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@release/26.04
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
@@ -111,7 +111,7 @@ jobs:
 
   changed-files:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@release/26.04
     with:
       files_yaml: |
         build_docs:
@@ -279,20 +279,20 @@ jobs:
           - '!gemini-extension.json'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@release/26.04
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: [checks, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04
     with:
       build_type: pull-request
       script: ci/build_cpp.sh
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_lean_filter }}
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
@@ -308,14 +308,14 @@ jobs:
   conda-python-build:
     needs: [conda-cpp-build, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04
     with:
       build_type: pull-request
       script: ci/build_python.sh
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }}
   conda-python-tests:
     needs: [conda-python-build, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_conda
     with:
       run_codecov: false
@@ -332,7 +332,7 @@ jobs:
   docs-build:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).build_docs
     with:
       build_type: pull-request
@@ -345,7 +345,7 @@ jobs:
   wheel-build-cuopt-mps-parser:
     needs: compute-matrix-filters
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_mps_parser.sh
@@ -357,7 +357,7 @@ jobs:
   wheel-build-libcuopt:
     needs: [wheel-build-cuopt-mps-parser, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.libcuopt_filter }}
@@ -368,7 +368,7 @@ jobs:
   wheel-build-cuopt:
     needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt.sh
@@ -377,7 +377,7 @@ jobs:
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }}
   wheel-tests-cuopt:
     needs: [wheel-build-cuopt, wheel-build-cuopt-mps-parser, wheel-build-cuopt-sh-client, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels
     with:
       build_type: pull-request
@@ -393,7 +393,7 @@ jobs:
   wheel-build-cuopt-server:
     needs: [checks, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_server.sh
@@ -405,7 +405,7 @@ jobs:
   wheel-build-cuopt-sh-client:
     needs: compute-matrix-filters
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_sh_client.sh
@@ -417,7 +417,7 @@ jobs:
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.cuopt_sh_client_filter }}
   wheel-tests-cuopt-server:
     needs: [wheel-build-cuopt, wheel-build-cuopt-server, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.14
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels
     with:
       build_type: pull-request
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index e88b7829f5..9ad7609e8a 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -27,7 +27,7 @@ on:
 
 jobs:
   conda-cpp-tests:
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
@@ -42,7 +42,7 @@ jobs:
       script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   conda-python-tests:
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04
     with:
       run_codecov: false
       build_type: ${{ inputs.build_type }}
@@ -58,7 +58,7 @@ jobs:
       script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   wheel-tests-cuopt:
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   wheel-tests-cuopt-server:
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
@@ -89,7 +89,7 @@ jobs:
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
index 57b178740c..d394b97db4 100644
--- a/.github/workflows/trigger-breaking-change-alert.yaml
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -15,7 +15,7 @@ jobs:
   trigger-notifier:
     if: contains(github.event.pull_request.labels.*.name, 'breaking')
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@main
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@release/26.04
     with:
       sender_login: ${{ github.event.sender.login }}
       sender_avatar: ${{ github.event.sender.avatar_url }}

From c0d1514db232731c4720e7fe4b7ac37afa226bb8 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 23 Mar 2026 13:05:44 -0700
Subject: [PATCH 20/30] Remove unused variable

---
 cpp/src/mip_heuristics/diversity/diversity_manager.cu  | 5 ++---
 cpp/src/mip_heuristics/diversity/diversity_manager.cuh | 1 -
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
index c659e9788b..18346209f9 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
@@ -40,7 +40,6 @@ std::vector<recombiner_enum_t> recombiner_t<i_t, f_t>::enabled_recombiners;
 template <typename i_t, typename f_t>
 diversity_manager_t<i_t, f_t>::diversity_manager_t(mip_solver_context_t<i_t, f_t>& context_)
   : context(context_),
-    branch_and_bound_ptr(nullptr),
     problem_ptr(context.problem_ptr),
     diversity_config(),
     population("population",
@@ -417,8 +416,8 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
   bool bb_thread_solution_exists = simplex_solution_exists.load();
   if (bb_thread_solution_exists) {
     ls.lp_optimal_exists = true;
-  } else if (branch_and_bound_ptr != nullptr &&
-             branch_and_bound_ptr->enable_concurrent_lp_root_solve()) {
+  } else if (context.branch_and_bound_ptr != nullptr &&
+             context.branch_and_bound_ptr->enable_concurrent_lp_root_solve()) {
     // B&B drives root relaxation; wait for first solution (PDLP/Barrier or dual simplex)
     first_solution_ready_.store(false, std::memory_order_release);
     std::unique_lock<std::mutex> lock(first_solution_mutex_);
diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
index fed937a88b..a9517484c9 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
@@ -82,7 +82,6 @@ class diversity_manager_t {
       result);
 
   mip_solver_context_t<i_t, f_t>& context;
-  dual_simplex::branch_and_bound_t<i_t, f_t>* branch_and_bound_ptr;
   problem_t<i_t, f_t>* problem_ptr;
   diversity_config_t diversity_config;
   population_t<i_t, f_t> population;

From 31641083824c318aaf28ddbf459a5dedb49c9ae4 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 30 Mar 2026 07:22:06 -0700
Subject: [PATCH 21/30] Disable green context

---
 cpp/src/barrier/sparse_cholesky.cuh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cpp/src/barrier/sparse_cholesky.cuh b/cpp/src/barrier/sparse_cholesky.cuh
index f7938fb989..3bdba68401 100644
--- a/cpp/src/barrier/sparse_cholesky.cuh
+++ b/cpp/src/barrier/sparse_cholesky.cuh
@@ -131,6 +131,8 @@ std::size_t compute_hash(const f_t* arr, size_t size)
   return seed;
 }
 
+//#define USE_BARRIER_GREEN_CONTEXT
+
 template <typename i_t, typename f_t>
 class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
  public:
@@ -155,6 +157,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
     cuda_error = cudaSuccess;
     status     = CUDSS_STATUS_SUCCESS;
 
+#ifdef USE_BARRIER_GREEN_CONTEXT
     if (CUDART_VERSION >= 13000 && settings_.concurrent_halt != nullptr &&
         settings_.num_gpus == 1) {
       cuGetErrorString_func = cuopt::detail::get_driver_entry_point("cuGetErrorString");
@@ -238,6 +241,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
                  &stream, barrier_green_ctx, CU_STREAM_NON_BLOCKING, stream_priority),
                reinterpret_cast<decltype(::cuGetErrorString)*>(cuGetErrorString_func));
     }
+#endif
 
     auto cudss_device_idx   = handle_ptr_->get_device();
     auto cudss_device_count = 1;
@@ -363,6 +367,8 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
     CUDSS_CALL_AND_CHECK_EXIT(cudssConfigDestroy(solverConfig), status, "cudssConfigDestroy");
     CUDSS_CALL_AND_CHECK_EXIT(cudssDestroy(handle), status, "cudssDestroy");
     CUDA_CALL_AND_CHECK_EXIT(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
+ 
+#ifdef USE_BARRIER_GREEN_CONTEXT
 #if CUDART_VERSION >= 13000
     if (settings_.concurrent_halt != nullptr && settings_.num_gpus == 1) {
       auto cuStreamDestroy_func = cuopt::detail::get_driver_entry_point("cuStreamDestroy");
@@ -374,6 +380,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
         reinterpret_cast<decltype(::cuGetErrorString)*>(cuGetErrorString_func));
       handle_ptr_->get_stream().synchronize();
     }
+#endif
 #endif
   }
 

From b920f9a1fac9d48bed8a509cf5cf72a9333bf4bb Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 30 Mar 2026 07:23:29 -0700
Subject: [PATCH 22/30] Move to appropriate file

---
 cpp/src/mip_heuristics/problem/problem.cu     | 27 +++++++++++++++++++
 cpp/src/mip_heuristics/problem/problem.cuh    |  3 +++
 .../problem/problem_helpers.cuh               | 23 ----------------
 3 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/cpp/src/mip_heuristics/problem/problem.cu b/cpp/src/mip_heuristics/problem/problem.cu
index 32aeef695a..c4f82843f4 100644
--- a/cpp/src/mip_heuristics/problem/problem.cu
+++ b/cpp/src/mip_heuristics/problem/problem.cu
@@ -2381,12 +2381,39 @@ void problem_t<i_t, f_t>::update_variable_bounds(const std::vector<i_t>& var_ind
   RAFT_CHECK_CUDA(handle_ptr->get_stream());
 }
 
+template <typename i_t, typename f_t>
+void convert_greater_to_less(detail::problem_t<i_t, f_t>& problem)
+{
+  raft::common::nvtx::range scope("convert_greater_to_less");
+  // Defined as the primary template: a `<i_t, f_t>` list after the name is ill-formed C++.
+  auto* handle_ptr = problem.handle_ptr;
+  // Launch geometry: one block per constraint, TPB threads per block, on the handle's stream.
+  constexpr i_t TPB = 256;
+  kernel_convert_greater_to_less<i_t, f_t>
+    <<<problem.n_constraints, TPB, 0, handle_ptr->get_stream()>>>(
+      raft::device_span<f_t>(problem.coefficients.data(), problem.coefficients.size()),
+      raft::device_span<const i_t>(problem.offsets.data(), problem.offsets.size()),
+      raft::device_span<f_t>(problem.constraint_lower_bounds.data(),
+                             problem.constraint_lower_bounds.size()),
+      raft::device_span<f_t>(problem.constraint_upper_bounds.data(),
+                             problem.constraint_upper_bounds.size()));
+  RAFT_CHECK_CUDA(handle_ptr->get_stream());
+  // Rebuild the transpose after the kernel ran on the mutable coefficient/bound spans.
+  problem.compute_transpose_of_problem();
+  // Synchronize the handle's stream before returning to the caller.
+  handle_ptr->sync_stream();
+}
+
 #if MIP_INSTANTIATE_FLOAT || PDLP_INSTANTIATE_FLOAT
 template class problem_t<int, float>;
+
+template void convert_greater_to_less<int, float>(detail::problem_t<int, float>&);
 #endif
 
 #if MIP_INSTANTIATE_DOUBLE
 template class problem_t<int, double>;
+
+template void convert_greater_to_less<int, double>(detail::problem_t<int, double>&);
 #endif
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/problem/problem.cuh b/cpp/src/mip_heuristics/problem/problem.cuh
index 130c97526e..7a837d4ceb 100644
--- a/cpp/src/mip_heuristics/problem/problem.cuh
+++ b/cpp/src/mip_heuristics/problem/problem.cuh
@@ -323,5 +323,8 @@ class problem_t {
   std::vector<f_t> Q_values;
 };
 
+template <typename i_t, typename f_t>
+void convert_greater_to_less(detail::problem_t<i_t, f_t>& problem);
+
 }  // namespace linear_programming::detail
 }  // namespace cuopt
diff --git a/cpp/src/mip_heuristics/problem/problem_helpers.cuh b/cpp/src/mip_heuristics/problem/problem_helpers.cuh
index ebc8a488ea..deca71bf3d 100644
--- a/cpp/src/mip_heuristics/problem/problem_helpers.cuh
+++ b/cpp/src/mip_heuristics/problem/problem_helpers.cuh
@@ -398,27 +398,4 @@ static void csrsort_cusparse(rmm::device_uvector<f_t>& values,
   check_csr_representation(values, offsets, indices, handle_ptr, cols, rows);
 }
 
-template <typename i_t, typename f_t>
-static void convert_greater_to_less(detail::problem_t<i_t, f_t>& problem)
-{
-  raft::common::nvtx::range scope("convert_greater_to_less");
-
-  auto* handle_ptr = problem.handle_ptr;
-
-  constexpr i_t TPB = 256;
-  kernel_convert_greater_to_less<i_t, f_t>
-    <<<problem.n_constraints, TPB, 0, handle_ptr->get_stream()>>>(
-      raft::device_span<f_t>(problem.coefficients.data(), problem.coefficients.size()),
-      raft::device_span<const i_t>(problem.offsets.data(), problem.offsets.size()),
-      raft::device_span<f_t>(problem.constraint_lower_bounds.data(),
-                             problem.constraint_lower_bounds.size()),
-      raft::device_span<f_t>(problem.constraint_upper_bounds.data(),
-                             problem.constraint_upper_bounds.size()));
-  RAFT_CHECK_CUDA(handle_ptr->get_stream());
-
-  problem.compute_transpose_of_problem();
-
-  handle_ptr->sync_stream();
-}
-
 }  // namespace cuopt::linear_programming::detail

From f4d0fa566f9e1cb51f8178a1b061c46a1793f31c Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 30 Mar 2026 07:42:58 -0700
Subject: [PATCH 23/30] Cleanup

---
 cpp/src/branch_and_bound/branch_and_bound.cpp |  12 +-
 cpp/src/branch_and_bound/branch_and_bound.hpp |   5 +-
 .../diversity/diversity_manager.cu            | 150 +++++-------------
 .../diversity/diversity_manager.cuh           |   9 +-
 .../mip_heuristics/relaxed_lp/relaxed_lp.cu   |   2 +-
 cpp/src/mip_heuristics/root_lp.cu             |  11 ++
 cpp/src/mip_heuristics/root_lp.cuh            |   4 +
 7 files changed, 80 insertions(+), 113 deletions(-)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index a5c3948ec9..15db509d14 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -1876,7 +1876,10 @@ void branch_and_bound_t<i_t, f_t>::run_concurrent_pdlp_and_barrier_with_crossove
       get_root_concurrent_halt(),
       pdlp_root_num_gpus_,
       cuopt::linear_programming::method_t::PDLP);
-    (void)do_crush_crossover(result, "PDLP", 2);
+    // Only call crossover if the result status is OPTIMAL
+    if (result.is_optimal) {
+      (void)do_crush_crossover(result, "PDLP", 2);
+    }
   });
 
   barrier_thread_out = std::thread([this, &lp_settings, do_crush_crossover]() {
@@ -1886,7 +1889,11 @@ void branch_and_bound_t<i_t, f_t>::run_concurrent_pdlp_and_barrier_with_crossove
       get_root_concurrent_halt(),
       pdlp_root_num_gpus_,
       cuopt::linear_programming::method_t::Barrier);
-    (void)do_crush_crossover(result, "Barrier", 3);
+
+    // Only call crossover if the result status is OPTIMAL
+    if (result.is_optimal) {
+      (void)do_crush_crossover(result, "Barrier", 3);
+    }
   });
 }
 
@@ -1928,6 +1935,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
   std::atomic<int> winner{0};  // 0=none, 1=dual, 2=PDLP, 3=Barrier
 
   if (enable_concurrent_lp_root_solve_ && mip_problem_ptr_ != nullptr) {
+    convert_greater_to_less_2(*mip_problem_ptr_);
     // All three run in threads; main only starts them and joins. First to finish with OPTIMAL sets
     // winner and halt.
     std::mutex first_solver_mutex;
diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index b4c46ac8e9..e60805939e 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -118,7 +118,10 @@ class branch_and_bound_t {
   i_t get_num_cols() const { return original_problem_.num_cols; }
   bool enable_concurrent_lp_root_solve() const { return enable_concurrent_lp_root_solve_; }
   std::atomic<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
-  void set_root_concurrent_halt(int value) { root_concurrent_halt_ = value; }
+  void set_root_concurrent_halt(int value)
+  {  // NOTE(review): relaxed store (was a default assignment); confirm readers need only the flag
+    root_concurrent_halt_.store(value, std::memory_order_relaxed);
+  }
   lp_status_t solve_root_relaxation(simplex_solver_settings_t<i_t, f_t> const& lp_settings,
                                     lp_solution_t<i_t, f_t>& root_relax_soln,
                                     std::vector<variable_status_t>& root_vstatus,
diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
index 18346209f9..2c7a5d08e2 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
@@ -15,8 +15,6 @@
 #include <mip_heuristics/presolve/trivial_presolve.cuh>
 #include <mip_heuristics/problem/problem_helpers.cuh>
 
-#include <pdlp/solve.cuh>
-
 #include <utilities/scope_guard.hpp>
 
 #include <memory>
@@ -192,6 +190,7 @@ bool diversity_manager_t<i_t, f_t>::run_presolve(f_t time_limit, timer_t global_
     ls.constraint_prop.bounds_update.set_updated_bounds(*problem_ptr);
   }
   bool run_probing_cache = !fj_only_run;
+  run_probing_cache = false;
   // Don't run probing cache in deterministic mode yet as neither B&B nor CPUFJ need it
   // and it doesn't make use of work units yet
   if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { run_probing_cache = false; }
@@ -333,6 +332,20 @@ struct ls_cpufj_raii_guard_t {
   local_search_t<i_t, f_t>& ls;
 };
 
+template <typename i_t, typename f_t>
+void diversity_manager_t<i_t, f_t>::wait_for_branch_and_bound_first_root_relaxation()
+{
+  // True once either root-LP flag has been set; both flags are read with acquire loads.
+  const auto root_lp_published = [this]() {
+    return simplex_solution_exists.load(std::memory_order_acquire) ||
+           first_solution_ready_.load(std::memory_order_acquire);
+  };
+  // Fast path without taking the mutex; otherwise wait on the condition variable.
+  if (root_lp_published()) { return; }
+  std::unique_lock<std::mutex> lock(first_solution_mutex_);
+  first_solution_cv_.wait(lock, root_lp_published);
+}
+
 // returns the best feasible solution
 template <typename i_t, typename f_t>
 solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
@@ -381,10 +394,7 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
     return population.best_feasible();
   }
 
-  population.timer     = timer;
-  const f_t time_limit = timer.remaining_time();
-  const f_t lp_time_limit =
-    std::min(diversity_config.max_time_on_lp, time_limit * diversity_config.time_ratio_on_init_lp);
+  population.timer = timer;
   // after every change to the problem, we should resize all the relevant vars
   // we need to encapsulate that to prevent repetitions
   recombine_stats.reset();
@@ -413,93 +423,13 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
   lp_state_t<i_t, f_t>& lp_state = problem_ptr->lp_state;
   // resize because some constructor might be called before the presolve
   lp_state.resize(*problem_ptr, problem_ptr->handle_ptr->get_stream());
-  bool bb_thread_solution_exists = simplex_solution_exists.load();
-  if (bb_thread_solution_exists) {
-    ls.lp_optimal_exists = true;
-  } else if (context.branch_and_bound_ptr != nullptr &&
-             context.branch_and_bound_ptr->enable_concurrent_lp_root_solve()) {
-    // B&B drives root relaxation; wait for first solution (PDLP/Barrier or dual simplex)
-    first_solution_ready_.store(false, std::memory_order_release);
-    std::unique_lock<std::mutex> lock(first_solution_mutex_);
-    first_solution_cv_.wait(lock, [this]() { return first_solution_ready_.load(); });
-    lock.unlock();
-    clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr);
-  } else if (!fj_only_run) {
-    // Heuristics-only or non-concurrent: diversity manager runs LP solve
-    convert_greater_to_less(*problem_ptr);
-
-    f_t tolerance_divisor =
-      problem_ptr->tolerances.absolute_tolerance / problem_ptr->tolerances.relative_tolerance;
-    if (tolerance_divisor == 0) { tolerance_divisor = 1; }
-    f_t absolute_tolerance = context.settings.tolerances.absolute_tolerance;
-
-    pdlp_solver_settings_t<i_t, f_t> pdlp_settings{};
-    pdlp_settings.tolerances.relative_primal_tolerance = absolute_tolerance / tolerance_divisor;
-    pdlp_settings.tolerances.relative_dual_tolerance   = absolute_tolerance / tolerance_divisor;
-    pdlp_settings.time_limit                           = lp_time_limit;
-    pdlp_settings.first_primal_feasible                = false;
-    pdlp_settings.concurrent_halt                      = &global_concurrent_halt;
-    pdlp_settings.method                               = method_t::Concurrent;
-    pdlp_settings.inside_mip                           = true;
-    pdlp_settings.pdlp_solver_mode                     = pdlp_solver_mode_t::Stable2;
-    pdlp_settings.num_gpus                             = context.settings.num_gpus;
-    pdlp_settings.presolver                            = presolver_t::None;
-
-    timer_t lp_timer(lp_time_limit);
-    auto lp_result = solve_lp_with_method<i_t, f_t>(*problem_ptr, pdlp_settings, lp_timer);
-
-    {
-      std::lock_guard<std::mutex> guard(relaxed_solution_mutex);
-      if (!simplex_solution_exists.load()) {
-        cuopt_assert(lp_result.get_primal_solution().size() == lp_optimal_solution.size(),
-                     "LP optimal solution size mismatch");
-        cuopt_assert(lp_result.get_dual_solution().size() == lp_dual_optimal_solution.size(),
-                     "LP dual optimal solution size mismatch");
-        raft::copy(lp_optimal_solution.data(),
-                   lp_result.get_primal_solution().data(),
-                   lp_optimal_solution.size(),
-                   problem_ptr->handle_ptr->get_stream());
-        raft::copy(lp_dual_optimal_solution.data(),
-                   lp_result.get_dual_solution().data(),
-                   lp_dual_optimal_solution.size(),
-                   problem_ptr->handle_ptr->get_stream());
-      } else {
-        // copy the lp state
-        raft::copy(lp_state.prev_primal.data(),
-                   lp_optimal_solution.data(),
-                   lp_optimal_solution.size(),
-                   problem_ptr->handle_ptr->get_stream());
-        raft::copy(lp_state.prev_dual.data(),
-                   lp_dual_optimal_solution.data(),
-                   lp_dual_optimal_solution.size(),
-                   problem_ptr->handle_ptr->get_stream());
-      }
-      problem_ptr->handle_ptr->sync_stream();
-    }
-    cuopt_assert(thrust::all_of(problem_ptr->handle_ptr->get_thrust_policy(),
-                                lp_optimal_solution.begin(),
-                                lp_optimal_solution.end(),
-                                [] __host__ __device__(f_t val) { return std::isfinite(val); }),
-                 "LP optimal solution contains non-finite values");
-    ls.lp_optimal_exists = true;
-    if (lp_result.get_termination_status() == pdlp_termination_status_t::Optimal) {
-      set_new_user_bound(lp_result.get_objective_value());
-    } else if (lp_result.get_termination_status() == pdlp_termination_status_t::PrimalInfeasible) {
-      CUOPT_LOG_ERROR("Problem is primal infeasible, continuing anyway!");
-      ls.lp_optimal_exists = false;
-    } else if (lp_result.get_termination_status() == pdlp_termination_status_t::DualInfeasible) {
-      CUOPT_LOG_ERROR("PDLP detected dual infeasibility, continuing anyway!");
-      ls.lp_optimal_exists = false;
-    } else if (lp_result.get_termination_status() == pdlp_termination_status_t::TimeLimit) {
-      CUOPT_LOG_DEBUG(
-        "Initial LP run exceeded time limit, continuing solver with partial LP result!");
-      // note to developer, in debug mode the LP run might be too slow and it might cause PDLP not
-      // to bring variables within the bounds
-    }
 
-    // in case the pdlp returned var boudns that are out of bounds
+  const bool bb_drives_root  = context.branch_and_bound_ptr != nullptr;
+  if (bb_drives_root) {
+    wait_for_branch_and_bound_first_root_relaxation(); 
+
     clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr);
-  }
+  } 
 
   if (ls.lp_optimal_exists) {
     solution_t<i_t, f_t> lp_rounded_sol(*problem_ptr);
@@ -854,7 +784,7 @@ void diversity_manager_t<i_t, f_t>::on_first_lp_solution(
                result.dual.size(),
                problem_ptr->handle_ptr->get_stream());
     problem_ptr->handle_ptr->sync_stream();
-    ls.lp_optimal_exists = true;
+    ls.lp_optimal_exists = result.has_optimal_basis_relaxation;
     set_new_user_bound(result.user_objective);
   }
   {
@@ -879,21 +809,27 @@ void diversity_manager_t<i_t, f_t>::set_simplex_solution(const std::vector<f_t>&
   cuopt_func_call(new_sol.copy_new_assignment(solution));
   cuopt_func_call(new_sol.compute_feasibility());
   cuopt_assert(integer_equal(new_sol.get_user_objective(), objective, 1e-3), "Objective mismatch");
-  std::lock_guard<std::mutex> lock(relaxed_solution_mutex);
-  simplex_solution_exists.store(true, std::memory_order_release);
-  global_concurrent_halt = 1;
-  CUOPT_LOG_DEBUG("Setting concurrent halt for PDLP inside diversity manager");
-  // global_concurrent_halt.store(1, std::memory_order_release);
-  // it is safe to use lp_optimal_solution while executing the copy operation
-  // the operations are ordered as long as they are on the same stream
-  raft::copy(
-    lp_optimal_solution.data(), solution.data(), solution.size(), context.handle_ptr->get_stream());
-  raft::copy(lp_dual_optimal_solution.data(),
-             dual_solution.data(),
-             dual_solution.size(),
-             context.handle_ptr->get_stream());
-  set_new_user_bound(objective);
-  context.handle_ptr->sync_stream();
+  {
+    std::lock_guard<std::mutex> lock(relaxed_solution_mutex);
+    simplex_solution_exists.store(true, std::memory_order_release);
+    global_concurrent_halt = 1;
+    CUOPT_LOG_DEBUG("Setting concurrent halt for PDLP inside diversity manager");
+    // it is safe to use lp_optimal_solution while executing the copy operation
+    // the operations are ordered as long as they are on the same stream
+    raft::copy(
+      lp_optimal_solution.data(), solution.data(), solution.size(), context.handle_ptr->get_stream());
+    raft::copy(lp_dual_optimal_solution.data(),
+               dual_solution.data(),
+               dual_solution.size(),
+               context.handle_ptr->get_stream());
+    set_new_user_bound(objective);
+    context.handle_ptr->sync_stream();
+  }
+  ls.lp_optimal_exists = true;
+  {
+    std::lock_guard<std::mutex> notify_lock(first_solution_mutex_);
+    first_solution_cv_.notify_all();
+  }
 }
 
 #if MIP_INSTANTIATE_FLOAT
diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
index a9517484c9..4d4154d557 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
@@ -76,7 +76,8 @@ class diversity_manager_t {
                             const std::vector<f_t>& dual_solution,
                             f_t objective);
 
-  // Called by B&B when first LP solution is available (PDLP/Barrier or dual simplex).
+  // Called when the first root LP vectors are available (PDLP/Barrier pre-crossover or dual-simplex
+  // root). has_optimal_basis_relaxation distinguishes basis-optimal roots from interior iterates.
   void on_first_lp_solution(
     cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<i_t, f_t> const&
       result);
@@ -107,7 +108,8 @@ class diversity_manager_t {
   // atomic for signalling pdlp to stop
   std::atomic<int> global_concurrent_halt{0};
 
-  // First solution from B&B: wait for B&B to call on_first_lp_solution when run_bb and concurrent
+  // Sync with B&B root relaxation: on_first_lp_solution (PDLP/Barrier inner, or dual on main
+  // thread) or set_simplex_solution fills lp_*; run_solver waits on first_solution_cv_.
   std::mutex first_solution_mutex_;
   std::condition_variable first_solution_cv_;
   std::atomic<bool> first_solution_ready_{false};
@@ -118,6 +120,9 @@ class diversity_manager_t {
   bool run_only_bp_recombiner{false};
   bool run_only_fp_recombiner{false};
   bool run_only_sub_mip_recombiner{false};
+
+ private:
+  void wait_for_branch_and_bound_first_root_relaxation();
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu b/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu
index e2bbc8feb1..d26a4020b8 100644
--- a/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu
+++ b/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu
@@ -49,7 +49,7 @@ optimization_problem_solution_t<i_t, f_t> get_relaxed_lp_solution(
   pdlp_settings.tolerances.relative_primal_tolerance = settings.tolerance / tolerance_divisor;
   pdlp_settings.tolerances.relative_dual_tolerance   = settings.tolerance / tolerance_divisor;
   pdlp_settings.time_limit                           = settings.time_limit;
-  pdlp_settings.concurrent_halt                      = settings.concurrent_halt;
+  pdlp_settings.concurrent_halt                      = nullptr; //settings.concurrent_halt;
   pdlp_settings.per_constraint_residual              = settings.per_constraint_residual;
   pdlp_settings.first_primal_feasible                = settings.return_first_feasible;
   pdlp_settings.pdlp_solver_mode                     = pdlp_solver_mode_t::Stable2;
diff --git a/cpp/src/mip_heuristics/root_lp.cu b/cpp/src/mip_heuristics/root_lp.cu
index b181db43cd..d4b4ee3cda 100644
--- a/cpp/src/mip_heuristics/root_lp.cu
+++ b/cpp/src/mip_heuristics/root_lp.cu
@@ -42,6 +42,8 @@ copy_lp_result_to_root_solution(problem_t<i_t, f_t>* problem,
   result.objective      = problem->get_solver_obj_from_user_obj(lp_result.get_objective_value());
   result.user_objective = lp_result.get_objective_value();
   result.iterations     = lp_result.get_additional_termination_information().number_of_steps_taken;
+  result.is_optimal     = lp_result.get_termination_status() == pdlp_termination_status_t::Optimal;
+  result.has_optimal_basis_relaxation = false;  // crush/crossover not done yet
   return result;
 }
 
@@ -162,6 +164,12 @@ cuopt::linear_programming::dual_simplex::crossover_status_t run_crush_crossover_
   return status;
 }
 
+template <typename i_t, typename f_t>
+void convert_greater_to_less_2(problem_t<i_t, f_t>& problem)
+{
+  convert_greater_to_less<i_t, f_t>(problem);  // plain forwarder; nothing else happens here
+}
+
 template cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, double>
 run_solver_for_root_lp<int, double>(
   problem_t<int, double>*, double, std::atomic<int>*, int, method_t);
@@ -188,6 +196,7 @@ run_crush_crossover_and_maybe_win<int, double>(
   double*,
   const char*,
   std::string*);
+template void convert_greater_to_less_2<int, double>(problem_t<int, double>&);
 
 #ifdef MIP_INSTANTIATION_FLOAT
 template cuopt::linear_programming::dual_simplex::root_relaxation_first_solution_t<int, float>
@@ -215,5 +224,7 @@ run_crush_crossover_and_maybe_win<int, float>(
   float*,
   const char*,
   std::string*);
+
+template void convert_greater_to_less_2<int, float>(problem_t<int, float>&);
 #endif
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/root_lp.cuh b/cpp/src/mip_heuristics/root_lp.cuh
index 2f87884fe9..20b930be6d 100644
--- a/cpp/src/mip_heuristics/root_lp.cuh
+++ b/cpp/src/mip_heuristics/root_lp.cuh
@@ -63,4 +63,8 @@ cuopt::linear_programming::dual_simplex::crossover_status_t run_crush_crossover_
   const char* this_solver_name,
   std::string* winner_solver_name_out);
 
+
+template <typename i_t, typename f_t>
+void convert_greater_to_less_2(detail::problem_t<i_t, f_t>& problem);
+
 }  // namespace cuopt::linear_programming::detail

From 95090005072b962d68b36a262212659109aaa5db Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 30 Mar 2026 08:05:24 -0700
Subject: [PATCH 24/30] Fix missing entries

---
 cpp/src/dual_simplex/types.hpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cpp/src/dual_simplex/types.hpp b/cpp/src/dual_simplex/types.hpp
index 6660a86f0a..776c766e9a 100644
--- a/cpp/src/dual_simplex/types.hpp
+++ b/cpp/src/dual_simplex/types.hpp
@@ -24,6 +24,11 @@ constexpr float64_t inf = std::numeric_limits<float64_t>::infinity();
 // without B&B depending on PDLP types.
 template <typename i_t, typename f_t>
 struct root_relaxation_first_solution_t {
+  /// Inner PDLP/Barrier termination reported optimal (may still be pre-crossover).
+  bool is_optimal{false};
+  /// True only when vectors are an optimal root relaxation on a basis (dual simplex optimal
+  /// root, or equivalently post-crossover). False for PDLP/Barrier inner iterates before crossover.
+  bool has_optimal_basis_relaxation{false};
   std::vector<f_t> primal;
   std::vector<f_t> dual;
   std::vector<f_t> reduced_costs;

From 2a1ff142a878885dc0b36849e8fe51399127bb17 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 30 Mar 2026 12:58:22 -0700
Subject: [PATCH 25/30] Cleanup concurrent halt handling

---
 cpp/src/barrier/barrier.cu                    | 43 +++++++++----------
 cpp/src/barrier/sparse_cholesky.cuh           | 39 ++++++-----------
 cpp/src/branch_and_bound/branch_and_bound.cpp | 18 +++-----
 cpp/src/branch_and_bound/branch_and_bound.hpp |  9 ++--
 cpp/src/dual_simplex/basis_solves.cpp         | 13 ++----
 .../bound_flipping_ratio_test.cpp             |  3 +-
 cpp/src/dual_simplex/crossover.cpp            | 37 +++++++++++++---
 cpp/src/dual_simplex/phase2.cpp               | 10 +++--
 cpp/src/dual_simplex/right_looking_lu.cpp     |  7 ++-
 .../diversity/diversity_manager.cu            | 17 +++++---
 cpp/src/pdlp/pdlp.cu                          |  4 +-
 cpp/src/pdlp/solve.cu                         | 14 +++---
 12 files changed, 111 insertions(+), 103 deletions(-)

diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu
index 075323744d..8e9f51d21e 100644
--- a/cpp/src/barrier/barrier.cu
+++ b/cpp/src/barrier/barrier.cu
@@ -16,6 +16,7 @@
 #include <barrier/iterative_refinement.hpp>
 #include <barrier/sparse_cholesky.cuh>
 #include <barrier/sparse_matrix_kernels.cuh>
+#include <dual_simplex/concurrent_halt.hpp>
 
 #include <dual_simplex/presolve.hpp>
 #include <dual_simplex/solve.hpp>
@@ -289,7 +290,7 @@ class iteration_data_t {
     // Ignore Q matrix for now
     find_dense_columns(
       lp.A, settings, dense_columns_unordered, n_dense_rows, max_row_nz, estimated_nz_AAT);
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
+    if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
 #ifdef PRINT_INFO
     for (i_t j : dense_columns_unordered) {
       settings.log.printf("Dense column %6d\n", j);
@@ -350,7 +351,7 @@ class iteration_data_t {
     inv_sqrt_diag.set_scalar(1.0);
     if (n_upper_bounds > 0 || (has_Q && !use_augmented)) { inv_diag.sqrt(inv_sqrt_diag); }
 
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
+    if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
 
     // Copy A into AD
     AD = lp.A;
@@ -396,22 +397,22 @@ class iteration_data_t {
     device_A.copy(host_A_CSR, lp.handle_ptr->get_stream());
     RAFT_CHECK_CUDA(handle_ptr->get_stream());
 
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
+    if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
     i_t factorization_size = use_augmented ? lp.num_rows + lp.num_cols : lp.num_rows;
     chol =
       std::make_unique<sparse_cholesky_cudss_t<i_t, f_t>>(handle_ptr, settings, factorization_size);
     chol->set_positive_definite(false);
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
+    if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
     // Perform symbolic analysis
     symbolic_status = 0;
     if (use_augmented) {
       // Build the sparsity pattern of the augmented system
       form_augmented(true);
-      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
+      if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
       symbolic_status = chol->analyze(device_augmented);
     } else {
       form_adat(true);
-      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
+      if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
       symbolic_status = chol->analyze(device_ADAT);
     }
   }
@@ -581,7 +582,7 @@ class iteration_data_t {
                          span_x[i] *= span_scale[span_col_ind[i]];
                        });
     RAFT_CHECK_CUDA(stream_view_);
-    if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { return; }
+    if (concurrent_halt_is_set(settings_.concurrent_halt)) { return; }
     if (first_call) {
       try {
         initialize_cusparse_data<i_t, f_t>(
@@ -591,7 +592,7 @@ class iteration_data_t {
         return;
       }
     }
-    if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { return; }
+    if (concurrent_halt_is_set(settings_.concurrent_halt)) { return; }
 
     multiply_kernels<i_t, f_t>(handle_ptr, device_A, device_AD, device_ADAT, cusparse_info);
     handle_ptr->sync_stream();
@@ -682,9 +683,7 @@ class iteration_data_t {
           dense_vector_t<i_t, f_t> M_col(AD.m);
           solve_status = chol->solve(U_col, M_col);
           if (solve_status != 0) { return solve_status; }
-          if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
-            return CONCURRENT_HALT_RETURN;
-          }
+          if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
           M.set_column(k, M_col);
 
           if (debug) {
@@ -701,9 +700,7 @@ class iteration_data_t {
         for (i_t k = 0; k < n_dense_columns; k++) {
           AD_dense.transpose_multiply(
             1.0, M.values.data() + k * M.m, 0.0, H.values.data() + k * H.m);
-          if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
-            return CONCURRENT_HALT_RETURN;
-          }
+          if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
         }
 
         dense_vector_t<i_t, f_t> e(n_dense_columns);
@@ -1193,7 +1190,7 @@ class iteration_data_t {
         delta_nz[j] +=
           fill;  // Capture contributions from A(:, j). j will be encountered multiple times
       }
-      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
+      if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
     }
 
     int64_t sparse_nz_C = 0;
@@ -1233,7 +1230,7 @@ class iteration_data_t {
           delta_nz[j] + static_cast<int64_t>(
                           fill_estimate));  // Capture the estimated fill associated with column j
       }
-      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
+      if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
     }
 
     int64_t estimated_nz_C = 0;
@@ -3429,7 +3426,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
     if (lp.Q.n > 0) { create_Q(lp, Q); }
 
     iteration_data_t<i_t, f_t> data(lp, num_upper_bounds, Q, settings);
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    if (concurrent_halt_is_set(settings.concurrent_halt)) {
       settings.log.printf("Barrier solver halted\n");
       return lp_status_t::CONCURRENT_LIMIT;
     }
@@ -3458,7 +3455,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
       settings.log.printf("Barrier time limit exceeded\n");
       return lp_status_t::TIME_LIMIT;
     }
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    if (concurrent_halt_is_set(settings.concurrent_halt)) {
       settings.log.printf("Barrier solver halted\n");
       return lp_status_t::CONCURRENT_LIMIT;
     }
@@ -3557,7 +3554,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
         settings.log.printf("Barrier time limit exceeded\n");
         return lp_status_t::TIME_LIMIT;
       }
-      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      if (concurrent_halt_is_set(settings.concurrent_halt)) {
         settings.log.printf("Barrier solver halted\n");
         return lp_status_t::CONCURRENT_LIMIT;
       }
@@ -3568,7 +3565,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
 
       i_t status = gpu_compute_search_direction(
         data, data.dw_aff, data.dx_aff, data.dy_aff, data.dv_aff, data.dz_aff, max_affine_residual);
-      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      if (concurrent_halt_is_set(settings.concurrent_halt)) {
         settings.log.printf("Barrier solver halted\n");
         return lp_status_t::CONCURRENT_LIMIT;
       }
@@ -3593,7 +3590,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
         settings.log.printf("Barrier time limit exceeded\n");
         return lp_status_t::TIME_LIMIT;
       }
-      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      if (concurrent_halt_is_set(settings.concurrent_halt)) {
         settings.log.printf("Barrier solver halted\n");
         return lp_status_t::CONCURRENT_LIMIT;
       }
@@ -3607,7 +3604,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
 
       status = gpu_compute_search_direction(
         data, data.dw, data.dx, data.dy, data.dv, data.dz, max_corrector_residual);
-      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      if (concurrent_halt_is_set(settings.concurrent_halt)) {
         settings.log.printf("Barrier solver halted\n");
         return lp_status_t::CONCURRENT_LIMIT;
       }
@@ -3633,7 +3630,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
         settings.log.printf("Barrier time limit exceeded\n");
         return lp_status_t::TIME_LIMIT;
       }
-      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+      if (concurrent_halt_is_set(settings.concurrent_halt)) {
         settings.log.printf("Barrier solver halted\n");
         return lp_status_t::CONCURRENT_LIMIT;
       }
diff --git a/cpp/src/barrier/sparse_cholesky.cuh b/cpp/src/barrier/sparse_cholesky.cuh
index 3bdba68401..49f80f9393 100644
--- a/cpp/src/barrier/sparse_cholesky.cuh
+++ b/cpp/src/barrier/sparse_cholesky.cuh
@@ -9,6 +9,7 @@
 #include <barrier/dense_vector.hpp>
 #include <barrier/device_sparse_matrix.cuh>
 
+#include <dual_simplex/concurrent_halt.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
 #include <dual_simplex/sparse_matrix.hpp>
 #include <dual_simplex/tic_toc.hpp>
@@ -131,7 +132,7 @@ std::size_t compute_hash(const f_t* arr, size_t size)
   return seed;
 }
 
-//#define USE_BARRIER_GREEN_CONTEXT
+// #define USE_BARRIER_GREEN_CONTEXT
 
 template <typename i_t, typename f_t>
 class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
@@ -367,7 +368,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
     CUDSS_CALL_AND_CHECK_EXIT(cudssConfigDestroy(solverConfig), status, "cudssConfigDestroy");
     CUDSS_CALL_AND_CHECK_EXIT(cudssDestroy(handle), status, "cudssDestroy");
     CUDA_CALL_AND_CHECK_EXIT(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
- 
+
 #ifdef USE_BARRIER_GREEN_CONTEXT
 #if CUDART_VERSION >= 13000
     if (settings_.concurrent_halt != nullptr && settings_.num_gpus == 1) {
@@ -452,9 +453,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
       raft::common::nvtx::range fun_scope("Barrier: cuDSS Analyze : CUDSS_PHASE_ANALYSIS");
       status =
         cudssExecute(handle, CUDSS_PHASE_REORDERING, solverConfig, solverData, A, cudss_x, cudss_b);
-      if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
-        return CONCURRENT_HALT_RETURN;
-      }
+      if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
       if (status != CUDSS_STATUS_SUCCESS) {
         settings_.log.printf(
           "FAILED: CUDSS call ended unsuccessfully with status = %d, details: cuDSSExecute for "
@@ -468,9 +467,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
 
       status = cudssExecute(
         handle, CUDSS_PHASE_SYMBOLIC_FACTORIZATION, solverConfig, solverData, A, cudss_x, cudss_b);
-      if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
-        return CONCURRENT_HALT_RETURN;
-      }
+      if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
       if (status != CUDSS_STATUS_SUCCESS) {
         settings_.log.printf(
           "FAILED: CUDSS call ended unsuccessfully with status = %d, details: cuDSSExecute for "
@@ -526,9 +523,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
     f_t start_numeric = tic();
     status            = cudssExecute(
       handle, CUDSS_PHASE_FACTORIZATION, solverConfig, solverData, A, cudss_x, cudss_b);
-    if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
-      return CONCURRENT_HALT_RETURN;
-    }
+    if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
     if (status != CUDSS_STATUS_SUCCESS) {
       settings_.log.printf(
         "FAILED: CUDSS call ended unsuccessfully with status = %d, details: cuDSSExecute for "
@@ -542,9 +537,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
 #endif
 
     f_t numeric_time = toc(start_numeric);
-    if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
-      return CONCURRENT_HALT_RETURN;
-    }
+    if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
 
     int info;
     size_t sizeWritten = 0;
@@ -642,9 +635,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
     A_created = true;
 
     // Perform symbolic analysis
-    if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
-      return CONCURRENT_HALT_RETURN;
-    }
+    if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
     f_t start_analysis = tic();
     CUDSS_CALL_AND_CHECK(
       cudssExecute(handle, CUDSS_PHASE_REORDERING, solverConfig, solverData, A, cudss_x, cudss_b),
@@ -652,9 +643,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
       "cudssExecute for reordering");
 
     f_t reorder_time = toc(start_analysis);
-    if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
-      return CONCURRENT_HALT_RETURN;
-    }
+    if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
 
     f_t start_symbolic = tic();
 
@@ -667,7 +656,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
     f_t symbolic_time = toc(start_symbolic);
     f_t analysis_time = toc(start_analysis);
     settings_.log.printf("Symbolic factorization time : %.2fs\n", symbolic_time);
-    if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
+    if (concurrent_halt_is_set(settings_.concurrent_halt)) {
       RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
       handle_ptr_->get_stream().synchronize();
       return CONCURRENT_HALT_RETURN;
@@ -718,9 +707,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
       "cudssExecute for factorization");
 
     f_t numeric_time = toc(start_numeric);
-    if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
-      return CONCURRENT_HALT_RETURN;
-    }
+    if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
 
     int info;
     size_t sizeWritten = 0;
@@ -783,9 +770,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
       cudssMatrixSetValues(cudss_x, x.data()), status, "cudssMatrixSetValues for x");
 
     status = cudssExecute(handle, CUDSS_PHASE_SOLVE, solverConfig, solverData, A, cudss_x, cudss_b);
-    if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
-      return CONCURRENT_HALT_RETURN;
-    }
+    if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
     if (status != CUDSS_STATUS_SUCCESS) {
       settings_.log.printf(
         "FAILED: CUDSS call ended unsuccessfully with status = %d, details: cuDSSExecute for "
diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 15db509d14..87a7b867b3 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -1854,7 +1854,7 @@ void branch_and_bound_t<i_t, f_t>::run_concurrent_pdlp_and_barrier_with_crossove
       settings_,
       exploration_stats_.start_time,
       get_root_concurrent_halt(),
-      [this]() { set_root_concurrent_halt(1); },
+      [this]() { signal_root_concurrent_halt(); },
       lp_settings.on_first_lp_solution_available,
       first_solver_mutex,
       first_solver_callback_done,
@@ -1877,9 +1877,7 @@ void branch_and_bound_t<i_t, f_t>::run_concurrent_pdlp_and_barrier_with_crossove
       pdlp_root_num_gpus_,
       cuopt::linear_programming::method_t::PDLP);
     // Only call crossover if the result status is OPTIMAL
-    if (result.is_optimal) {
-      (void)do_crush_crossover(result, "PDLP", 2);
-    }
+    if (result.is_optimal) { (void)do_crush_crossover(result, "PDLP", 2); }
   });
 
   barrier_thread_out = std::thread([this, &lp_settings, do_crush_crossover]() {
@@ -1891,9 +1889,7 @@ void branch_and_bound_t<i_t, f_t>::run_concurrent_pdlp_and_barrier_with_crossove
       cuopt::linear_programming::method_t::Barrier);
 
     // Only call crossover if the result status is OPTIMAL
-    if (result.is_optimal) {
-      (void)do_crush_crossover(result, "Barrier", 3);
-    }
+    if (result.is_optimal) { (void)do_crush_crossover(result, "Barrier", 3); }
   });
 }
 
@@ -1979,7 +1975,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
       int expected       = 0;
       if (status == lp_status_t::OPTIMAL &&
           winner.compare_exchange_strong(expected, 1, std::memory_order_acq_rel)) {
-        set_root_concurrent_halt(1);
+        signal_root_concurrent_halt();
       }
     });
 
@@ -2005,9 +2001,9 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
     join_guard.b = nullptr;
     join_guard.c = nullptr;
 
-    // Winner may have set concurrent_halt==1 to stop peer solvers. All threads are joined; reset
+    // Winner may have signaled concurrent halt to stop peer solvers. All threads are joined; reset
     // the flag for the rest of B&B (subsequent LP solves, etc.).
-    set_root_concurrent_halt(0);
+    reset_root_concurrent_halt();
 
     const int w   = winner.load(std::memory_order_acquire);
     use_pdlp_path = (w == 2 || w == 3);
@@ -2126,7 +2122,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
 
   settings_.log.printf("\n");
 
-  set_root_concurrent_halt(0);
+  reset_root_concurrent_halt();
   return root_status;
 }
 
diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index e60805939e..98ec74f477 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -118,10 +118,11 @@ class branch_and_bound_t {
   i_t get_num_cols() const { return original_problem_.num_cols; }
   bool enable_concurrent_lp_root_solve() const { return enable_concurrent_lp_root_solve_; }
   std::atomic<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
-  void set_root_concurrent_halt(int value)
-  {
-    root_concurrent_halt_.store(value, std::memory_order_relaxed);
-  }
+  /** Tell concurrent root solvers to stop; pairs with acquire loads on the shared halt pointer. */
+  void signal_root_concurrent_halt() { concurrent_halt_signal(&root_concurrent_halt_); }
+  /** Clear halt after concurrent root threads have joined; no peers are reading the flag. */
+  void reset_root_concurrent_halt() { concurrent_halt_reset(&root_concurrent_halt_); }
+
   lp_status_t solve_root_relaxation(simplex_solver_settings_t<i_t, f_t> const& lp_settings,
                                     lp_solution_t<i_t, f_t>& root_relax_soln,
                                     std::vector<variable_status_t>& root_vstatus,
diff --git a/cpp/src/dual_simplex/basis_solves.cpp b/cpp/src/dual_simplex/basis_solves.cpp
index c5fee4e108..b425c4a886 100644
--- a/cpp/src/dual_simplex/basis_solves.cpp
+++ b/cpp/src/dual_simplex/basis_solves.cpp
@@ -6,6 +6,7 @@
 /* clang-format on */
 
 #include <dual_simplex/basis_solves.hpp>
+#include <dual_simplex/concurrent_halt.hpp>
 
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/right_looking_lu.hpp>
@@ -390,9 +391,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
                                  SU,
                                  S_perm_inv,
                                  work_estimate);
-        if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
-          return CONCURRENT_HALT_RETURN;
-        }
+        if (concurrent_halt_is_set(settings.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
         if (Srank < 0) { return Srank; }
         if (Srank != Sdim) {
           // Get the rank deficient columns
@@ -623,9 +622,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
   rank =
     right_looking_lu(A, settings, medium_tol, basic_list, start_time, q, L, U, pinv, work_estimate);
   if (rank < 0) {
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
-      return CONCURRENT_HALT_RETURN;
-    }
+    if (concurrent_halt_is_set(settings.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
     return rank;
   }
   inverse_permutation(pinv, p);
@@ -646,9 +643,7 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
     }
     work_estimate += 3 * (m - rank);
   }
-  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
-    return CONCURRENT_HALT_RETURN;
-  }
+  if (concurrent_halt_is_set(settings.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
   if (verbose) {
     printf("Right Lnz+Unz %d t %.3f\n", L.col_start[m] + U.col_start[m], toc(fact_start));
   }
diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp
index e30b067398..e776676eca 100644
--- a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp
+++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp
@@ -6,6 +6,7 @@
 /* clang-format on */
 
 #include <dual_simplex/bound_flipping_ratio_test.hpp>
+#include <dual_simplex/concurrent_halt.hpp>
 
 #include <dual_simplex/tic_toc.hpp>
 
@@ -269,7 +270,7 @@ void bound_flipping_ratio_test_t<i_t, f_t>::heap_passes(const std::vector<i_t>&
       entering_index = RATIO_TEST_TIME_LIMIT;
       return;
     }
-    if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
+    if (concurrent_halt_is_set(settings_.concurrent_halt)) {
       entering_index = CONCURRENT_HALT_RETURN;
       return;
     }
diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp
index 14624a4f4c..6e177570d8 100644
--- a/cpp/src/dual_simplex/crossover.cpp
+++ b/cpp/src/dual_simplex/crossover.cpp
@@ -5,6 +5,7 @@
  */
 /* clang-format on */
 
+#include <dual_simplex/concurrent_halt.hpp>
 #include <dual_simplex/crossover.hpp>
 
 #include <dual_simplex/basis_solves.hpp>
@@ -611,7 +612,7 @@ i_t dual_push(const lp_problem_t<i_t, f_t>& lp,
       settings.log.printf("Crossover time exceeded\n");
       return TIME_LIMIT_RETURN;
     }
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    if (concurrent_halt_is_set(settings.concurrent_halt)) {
       settings.log.printf("Concurrent halt\n");
       return CONCURRENT_HALT_RETURN;
     }
@@ -988,7 +989,7 @@ i_t primal_push(const lp_problem_t<i_t, f_t>& lp,
       settings.log.printf("Crossover time limit exceeded\n");
       return TIME_LIMIT_RETURN;
     }
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    if (concurrent_halt_is_set(settings.concurrent_halt)) {
       settings.log.printf("Concurrent halt\n");
       return CONCURRENT_HALT_RETURN;
     }
@@ -1239,6 +1240,10 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
     settings.log.printf("Aborting: initial basis selection\n");
     return return_to_status(rank);
   }
+  if (concurrent_halt_is_set(settings.concurrent_halt)) {
+    settings.log.printf("Concurrent halt (after initial basis selection)\n");
+    return crossover_status_t::CONCURRENT_LIMIT;
+  }
 
   i_t num_basic = 0;
   if (rank < m) {
@@ -1247,6 +1252,10 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   }
 
   for (i_t k = 0; k < candidate_columns.size(); k++) {
+    if ((k & 31) == 0 && concurrent_halt_is_set(settings.concurrent_halt)) {
+      settings.log.printf("Concurrent halt (candidate column loop)\n");
+      return crossover_status_t::CONCURRENT_LIMIT;
+    }
     const i_t j = candidate_columns[k];
     vstatus[j]  = vstatus_for_candidates[k];
     if (vstatus[j] == variable_status_t::BASIC) { num_basic++; }
@@ -1312,6 +1321,10 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
                          slacks_needed,
                          work_estimate);
   if (rank < 0) { return return_to_status(rank); }
+  if (concurrent_halt_is_set(settings.concurrent_halt)) {
+    settings.log.printf("Concurrent halt (after initial basis factorization)\n");
+    return crossover_status_t::CONCURRENT_LIMIT;
+  }
   if (rank != m) {
     settings.log.debug("Failed to factorize basis. rank %d m %d\n", rank, m);
     basis_repair(lp.A,
@@ -1352,7 +1365,7 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
     settings.log.printf("Time limit exceeded\n");
     return crossover_status_t::TIME_LIMIT;
   }
-  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+  if (concurrent_halt_is_set(settings.concurrent_halt)) {
     settings.log.printf("Concurrent halt\n");
     return crossover_status_t::CONCURRENT_LIMIT;
   }
@@ -1408,13 +1421,17 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   } else if (dual_feasible && !primal_feasible) {
     i_t dual_iter = 0;
     std::vector<f_t> edge_norms;
+    if (concurrent_halt_is_set(settings.concurrent_halt)) {
+      settings.log.printf("Concurrent halt (before crossover dual phase2 cleanup)\n");
+      return crossover_status_t::CONCURRENT_LIMIT;
+    }
     dual::status_t status =
       dual_phase2(2, 0, start_time, lp, settings, vstatus, solution, dual_iter, edge_norms);
     if (toc(start_time) > settings.time_limit) {
       settings.log.printf("Time limit exceeded\n");
       return crossover_status_t::TIME_LIMIT;
     }
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    if (concurrent_halt_is_set(settings.concurrent_halt)) {
       settings.log.printf("Concurrent halt\n");
       return crossover_status_t::CONCURRENT_LIMIT;
     }
@@ -1454,6 +1471,10 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
     i_t iter = 0;
     lp_solution_t<i_t, f_t> phase1_solution(phase1_problem.num_rows, phase1_problem.num_cols);
     std::vector<f_t> junk;
+    if (concurrent_halt_is_set(settings.concurrent_halt)) {
+      settings.log.printf("Concurrent halt (before crossover dual phase1)\n");
+      return crossover_status_t::CONCURRENT_LIMIT;
+    }
     dual::status_t phase1_status = dual_phase2(
       1, 1, start_time, phase1_problem, settings, phase1_vstatus, phase1_solution, iter, junk);
     if (phase1_status == dual::status_t::NUMERICAL ||
@@ -1570,13 +1591,17 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
       dual::status_t status = dual::status_t::NUMERICAL;
       if (dual_infeas <= settings.dual_tol) {
         std::vector<f_t> edge_norms;
+        if (concurrent_halt_is_set(settings.concurrent_halt)) {
+          settings.log.printf("Concurrent halt (before crossover dual phase2 after phase1)\n");
+          return crossover_status_t::CONCURRENT_LIMIT;
+        }
         status = dual_phase2(
           2, iter == 0 ? 1 : 0, start_time, lp, settings, vstatus, solution, iter, edge_norms);
         if (toc(start_time) > settings.time_limit) {
           settings.log.printf("Time limit exceeded\n");
           return crossover_status_t::TIME_LIMIT;
         }
-        if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+        if (concurrent_halt_is_set(settings.concurrent_halt)) {
           settings.log.printf("Concurrent halt\n");
           return crossover_status_t::CONCURRENT_LIMIT;
         }
@@ -1604,7 +1629,7 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   if (primal_feasible) { status = crossover_status_t::PRIMAL_FEASIBLE; }
   if (primal_feasible && dual_feasible) {
     status = crossover_status_t::OPTIMAL;
-    if (settings.concurrent_halt != nullptr) { *settings.concurrent_halt = 1; }
+    concurrent_halt_signal(settings.concurrent_halt);
   }
   return status;
 }
diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp
index 9434f4661a..f678b5f4b1 100644
--- a/cpp/src/dual_simplex/phase2.cpp
+++ b/cpp/src/dual_simplex/phase2.cpp
@@ -8,6 +8,7 @@
 #include <dual_simplex/basis_solves.hpp>
 #include <dual_simplex/basis_updates.hpp>
 #include <dual_simplex/bound_flipping_ratio_test.hpp>
+#include <dual_simplex/concurrent_halt.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/phase1.hpp>
 #include <dual_simplex/phase2.hpp>
@@ -1377,9 +1378,7 @@ i_t initialize_steepest_edge_norms(const lp_problem_t<i_t, f_t>& lp,
       settings.log.printf("Initialized %d of %d steepest edge norms in %.2fs\n", k, m, now);
     }
     if (toc(start_time) > settings.time_limit) { return -1; }
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
-      return CONCURRENT_HALT_RETURN;
-    }
+    if (concurrent_halt_is_set(settings.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
   }
   work_estimate += 7 * m;
   return 0;
@@ -2784,6 +2783,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 
   while (iter < iter_limit) {
     PHASE2_NVTX_RANGE("DualSimplex::phase2_main_loop");
+    if (concurrent_halt_is_set(settings.concurrent_halt)) {
+      return dual::status_t::CONCURRENT_LIMIT;
+    }
     // Pricing
     i_t direction           = 0;
     i_t basic_leaving_index = -1;
@@ -3579,7 +3581,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 
     if (now > settings.time_limit) { return dual::status_t::TIME_LIMIT; }
 
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    if (concurrent_halt_is_set(settings.concurrent_halt)) {
       return dual::status_t::CONCURRENT_LIMIT;
     }
   }
diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp
index 37202000f8..1400924a4e 100644
--- a/cpp/src/dual_simplex/right_looking_lu.cpp
+++ b/cpp/src/dual_simplex/right_looking_lu.cpp
@@ -5,6 +5,7 @@
  */
 /* clang-format on */
 
+#include <dual_simplex/concurrent_halt.hpp>
 #include <dual_simplex/right_looking_lu.hpp>
 #include <dual_simplex/tic_toc.hpp>
 #include <utilities/memory_instrumentation.hpp>
@@ -724,9 +725,7 @@ i_t right_looking_lu(const csc_matrix_t<i_t, f_t>& A,
 
   i_t pivots = 0;
   for (i_t k = 0; k < n; ++k) {
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
-      return CONCURRENT_HALT_RETURN;
-    }
+    if (concurrent_halt_is_set(settings.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
     if (toc(start_time) > settings.time_limit) { return TIME_LIMIT_RETURN; }
     // Find pivot that satisfies
     // abs(pivot) >= abstol,
@@ -1257,7 +1256,7 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t<i_t, f_t>& A,
       last_print = tic();
     }
     if (toc(start_time) > settings.time_limit) { return TIME_LIMIT_RETURN; }
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+    if (concurrent_halt_is_set(settings.concurrent_halt)) {
       settings.log.printf("Concurrent halt\n");
       return CONCURRENT_HALT_RETURN;
     }
diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
index 2c7a5d08e2..12e6ee51f8 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
@@ -17,6 +17,7 @@
 
 #include <utilities/scope_guard.hpp>
 
+#include <atomic>
 #include <memory>
 
 constexpr bool fj_only_run = false;
@@ -190,7 +191,7 @@ bool diversity_manager_t<i_t, f_t>::run_presolve(f_t time_limit, timer_t global_
     ls.constraint_prop.bounds_update.set_updated_bounds(*problem_ptr);
   }
   bool run_probing_cache = !fj_only_run;
-  run_probing_cache = false;
+  run_probing_cache      = false;
   // Don't run probing cache in deterministic mode yet as neither B&B nor CPUFJ need it
   // and it doesn't make use of work units yet
   if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { run_probing_cache = false; }
@@ -424,12 +425,12 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
   // resize because some constructor might be called before the presolve
   lp_state.resize(*problem_ptr, problem_ptr->handle_ptr->get_stream());
 
-  const bool bb_drives_root  = context.branch_and_bound_ptr != nullptr;
+  const bool bb_drives_root = context.branch_and_bound_ptr != nullptr;
   if (bb_drives_root) {
-    wait_for_branch_and_bound_first_root_relaxation(); 
+    wait_for_branch_and_bound_first_root_relaxation();
 
     clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr);
-  } 
+  }
 
   if (ls.lp_optimal_exists) {
     solution_t<i_t, f_t> lp_rounded_sol(*problem_ptr);
@@ -812,12 +813,14 @@ void diversity_manager_t<i_t, f_t>::set_simplex_solution(const std::vector<f_t>&
   {
     std::lock_guard<std::mutex> lock(relaxed_solution_mutex);
     simplex_solution_exists.store(true, std::memory_order_release);
-    global_concurrent_halt = 1;
+    global_concurrent_halt.store(1, std::memory_order_release);
     CUOPT_LOG_DEBUG("Setting concurrent halt for PDLP inside diversity manager");
     // it is safe to use lp_optimal_solution while executing the copy operation
     // the operations are ordered as long as they are on the same stream
-    raft::copy(
-      lp_optimal_solution.data(), solution.data(), solution.size(), context.handle_ptr->get_stream());
+    raft::copy(lp_optimal_solution.data(),
+               solution.data(),
+               solution.size(),
+               context.handle_ptr->get_stream());
     raft::copy(lp_dual_optimal_solution.data(),
                dual_solution.data(),
                dual_solution.size(),
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 9424240a08..18e548b0bf 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -37,6 +37,8 @@
 #include <thrust/extrema.h>
 #include <thrust/logical.h>
 
+#include <dual_simplex/concurrent_halt.hpp>
+
 #include <cmath>
 #include <optional>
 #include <unordered_set>
@@ -501,7 +503,7 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
 
   // Check for concurrent limit (whenever caller provides a halt flag, e.g. B&B racing PDLP vs
   // Barrier)
-  if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
+  if (cuopt::linear_programming::dual_simplex::concurrent_halt_is_set(settings_.concurrent_halt)) {
 #ifdef PDLP_VERBOSE_MODE
     RAFT_CUDA_TRY(cudaDeviceSynchronize());
     std::cout << "Concurrent Limit reached, returning current solution" << std::endl;
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 3c3ce1e0eb..fc7831bbe3 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -40,6 +40,7 @@
 
 #include <barrier/sparse_cholesky.cuh>
 
+#include <dual_simplex/concurrent_halt.hpp>
 #include <dual_simplex/crossover.hpp>
 #include <dual_simplex/solve.hpp>
 #include <dual_simplex/tic_toc.hpp>
@@ -468,7 +469,7 @@ run_barrier(dual_simplex::user_problem_t<i_t, f_t>& user_problem,
        status == dual_simplex::lp_status_t::UNBOUNDED ||
        status == dual_simplex::lp_status_t::INFEASIBLE)) {
     // We finished. Tell PDLP to stop if it is still running.
-    *settings.concurrent_halt = 1;
+    dual_simplex::concurrent_halt_signal(settings.concurrent_halt);
   }
 
   return {std::move(solution), status, timer.elapsed_time(), norm_user_objective, norm_rhs};
@@ -541,7 +542,7 @@ run_dual_simplex(dual_simplex::user_problem_t<i_t, f_t>& user_problem,
        status == dual_simplex::lp_status_t::UNBOUNDED ||
        status == dual_simplex::lp_status_t::INFEASIBLE)) {
     // We finished. Tell PDLP to stop if it is still running.
-    *settings.concurrent_halt = 1;
+    dual_simplex::concurrent_halt_signal(settings.concurrent_halt);
   }
 
   return {std::move(solution), status, timer.elapsed_time(), norm_user_objective, norm_rhs};
@@ -830,11 +831,12 @@ optimization_problem_solution_t<i_t, f_t> run_pdlp(detail::problem_t<i_t, f_t>&
       CUOPT_LOG_CONDITIONAL_INFO(
         !settings.inside_mip, "Crossover status %s", sol.get_termination_status_string().c_str());
     }
-    if (!settings.halt_set_by_caller && settings.method == method_t::Concurrent && settings.concurrent_halt != nullptr &&
-        crossover_info == 0 && sol.get_termination_status() == pdlp_termination_status_t::Optimal) {
+    if (!settings.halt_set_by_caller && settings.method == method_t::Concurrent &&
+        settings.concurrent_halt != nullptr && crossover_info == 0 &&
+        sol.get_termination_status() == pdlp_termination_status_t::Optimal) {
       // We finished. Tell dual simplex to stop if it is still running.
       CUOPT_LOG_CONDITIONAL_INFO(!settings.inside_mip, "PDLP finished. Telling others to stop");
-      *settings.concurrent_halt = 1;
+      dual_simplex::concurrent_halt_signal(settings.concurrent_halt);
     }
   }
   return sol;
@@ -1109,7 +1111,7 @@ optimization_problem_solution_t<i_t, f_t> run_concurrent(
   pdlp_solver_settings_t<i_t, f_t> settings_pdlp(settings);
 
   // Set the concurrent halt pointer
-  global_concurrent_halt        = 0;
+  global_concurrent_halt.store(0, std::memory_order_relaxed);
   settings_pdlp.concurrent_halt = &global_concurrent_halt;
 
   // Make sure allocations are done on the original stream

From bb074ac29258e55826c74ac6b277160a56306309 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 30 Mar 2026 12:58:51 -0700
Subject: [PATCH 26/30] Add missing concurrent_halt.hpp header

---
 cpp/src/dual_simplex/concurrent_halt.hpp | 31 ++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 cpp/src/dual_simplex/concurrent_halt.hpp

diff --git a/cpp/src/dual_simplex/concurrent_halt.hpp b/cpp/src/dual_simplex/concurrent_halt.hpp
new file mode 100644
index 0000000000..c7752424a4
--- /dev/null
+++ b/cpp/src/dual_simplex/concurrent_halt.hpp
@@ -0,0 +1,31 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+#pragma once
+
+#include <atomic>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+/** True if a stop was requested (halt non-null and non-zero). Acquire pairs with release stores. */
+inline bool concurrent_halt_is_set(std::atomic<int> const* halt)
+{
+  return halt != nullptr && std::atomic_load_explicit(halt, std::memory_order_acquire) != 0;
+}
+
+/** Signal peer solvers to stop. No-op if halt is null. Uses release for pairing with acquire loads. */
+inline void concurrent_halt_signal(std::atomic<int>* halt)
+{
+  if (halt != nullptr) { std::atomic_store_explicit(halt, 1, std::memory_order_release); }
+}
+
+/** Clear halt after concurrent threads have joined (no peer reads it). No-op if halt is null. */
+inline void concurrent_halt_reset(std::atomic<int>* halt)
+{
+  if (halt != nullptr) { std::atomic_store_explicit(halt, 0, std::memory_order_relaxed); }
+}
+
+}  // namespace cuopt::linear_programming::dual_simplex

From 47daeae0b4e0537154a29995f4cd795605926877 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Mon, 30 Mar 2026 13:24:05 -0700
Subject: [PATCH 27/30] Add missing include

---
 cpp/src/branch_and_bound/branch_and_bound.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index 98ec74f477..f088888f0f 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -14,6 +14,7 @@
 #include <branch_and_bound/mip_node.hpp>
 #include <branch_and_bound/node_queue.hpp>
 #include <branch_and_bound/pseudo_costs.hpp>
+#include <dual_simplex/concurrent_halt.hpp>
 
 #include <cuts/cuts.hpp>
 

From 70ab58f424c60f3166b212e01792d3db9b027ad0 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Tue, 31 Mar 2026 08:23:00 -0700
Subject: [PATCH 28/30] Handle failures after cut generation cleanly

---
 cpp/src/branch_and_bound/branch_and_bound.cpp | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 87a7b867b3..729b6e4212 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -2520,13 +2520,21 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
       }
 
       if (cut_status != dual::status_t::OPTIMAL) {
-        settings_.log.printf("Numerical issue at root node. Resolving from scratch\n");
+        // Root relaxation was already optimal; this is reoptimization after adding cuts / bound
+        // changes (warm-started dual phase2), which can fail for numerical reasons unrelated to
+        // the initial Barrier/PDLP root solve.
+        settings_.log.printf(
+          "Dual phase2 after cuts did not reach optimal (status=%s, cut pass %d). "
+          "Resolving root LP from scratch.\n",
+          dual::status_to_string(cut_status).c_str(),
+          static_cast<int>(cut_pass));
+        basis_update_mpf_t<i_t, f_t> scratch_basis(original_lp_.num_rows, settings_.refactor_frequency);
         lp_status_t scratch_status =
           solve_linear_program_with_advanced_basis(original_lp_,
                                                    exploration_stats_.start_time,
                                                    lp_settings,
                                                    root_relax_soln_,
-                                                   basis_update,
+                                                   scratch_basis,
                                                    basic_list,
                                                    nonbasic_list,
                                                    root_vstatus_,
@@ -2536,8 +2544,11 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
           cut_status = convert_lp_status_to_dual_status(scratch_status);
           exploration_stats_.total_lp_iters += root_relax_soln_.iterations;
           root_objective_ = compute_objective(original_lp_, root_relax_soln_.x);
+          basis_update    = std::move(scratch_basis);
         } else {
-          settings_.log.printf("Cut status %s\n", dual::status_to_string(cut_status).c_str());
+          settings_.log.printf("Scratch resolve status %s; dual phase2 after cuts was %s\n",
+                               lp_status_to_string(scratch_status).c_str(),
+                               dual::status_to_string(cut_status).c_str());
 #ifdef WRITE_CUT_INFEASIBLE_MPS
           original_lp_.write_mps("cut_infeasible.mps");
 #endif

From 14dff02fd0bdf06e2c52b6b2bf816b320bc9782f Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Wed, 1 Apr 2026 15:54:15 -0700
Subject: [PATCH 29/30] Fix compilation error

---
 cpp/src/pdlp/termination_strategy/infeasibility_information.cu | 2 ++
 cpp/tests/routing/unit_tests/breaks.cu                         | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/pdlp/termination_strategy/infeasibility_information.cu b/cpp/src/pdlp/termination_strategy/infeasibility_information.cu
index dbb35b732d..0e001b802f 100644
--- a/cpp/src/pdlp/termination_strategy/infeasibility_information.cu
+++ b/cpp/src/pdlp/termination_strategy/infeasibility_information.cu
@@ -24,6 +24,8 @@
 #include <raft/linalg/unary_op.cuh>
 #include <raft/util/cuda_utils.cuh>
 
+#include <thrust/iterator/transform_output_iterator.h>
+
 namespace cuopt::linear_programming::detail {
 template <typename i_t, typename f_t>
 infeasibility_information_t<i_t, f_t>::infeasibility_information_t(
diff --git a/cpp/tests/routing/unit_tests/breaks.cu b/cpp/tests/routing/unit_tests/breaks.cu
index a2abc0ac8c..0d8a578b6e 100644
--- a/cpp/tests/routing/unit_tests/breaks.cu
+++ b/cpp/tests/routing/unit_tests/breaks.cu
@@ -354,7 +354,6 @@ TEST(vehicle_breaks, non_uniform_breaks)
     order_service[i]   = route.service_time_h[i + 1];
   }
   int num_v_type_1 = vehicle_num / 2;
-  int num_v_type_2 = vehicle_num - num_v_type_1;
   int num_breaks   = 3;
 
   // Type 1: [40,50]/5, [100,120]/20, [170,180]/10

From 9a020db31aacb3c4d58870d047f311932ea2a817 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Thu, 2 Apr 2026 11:26:23 -0700
Subject: [PATCH 30/30] Move the problem conversion to upstream

---
 cpp/src/branch_and_bound/branch_and_bound.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 729b6e4212..6d8ed3361b 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -271,6 +271,8 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   original_problem_.A.print_matrix();
 #endif
 
+  // mip_problem_ptr_ can be null (solve_root_relaxation also checks it); convert only when set.
+  if (mip_problem_ptr_ != nullptr) { convert_greater_to_less_2(*mip_problem_ptr_); }
   dualize_info_t<i_t, f_t> dualize_info;
   convert_user_problem(original_problem_, settings_, original_lp_, new_slacks_, dualize_info);
   full_variable_types(original_problem_, original_lp_, var_types_);
@@ -1931,9 +1933,8 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
   std::atomic<int> winner{0};  // 0=none, 1=dual, 2=PDLP, 3=Barrier
 
   if (enable_concurrent_lp_root_solve_ && mip_problem_ptr_ != nullptr) {
-    convert_greater_to_less_2(*mip_problem_ptr_);
     // All three run in threads; main only starts them and joins. First to finish with OPTIMAL sets
     // winner and halt.
     std::mutex first_solver_mutex;
     bool first_solver_callback_done = false;
     run_concurrent_pdlp_and_barrier_with_crossover(lp_settings,
@@ -2528,7 +2529,8 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
           "Resolving root LP from scratch.\n",
           dual::status_to_string(cut_status).c_str(),
           static_cast<int>(cut_pass));
-        basis_update_mpf_t<i_t, f_t> scratch_basis(original_lp_.num_rows, settings_.refactor_frequency);
+        basis_update_mpf_t<i_t, f_t> scratch_basis(original_lp_.num_rows,
+                                                   settings_.refactor_frequency);
         lp_status_t scratch_status =
           solve_linear_program_with_advanced_basis(original_lp_,
                                                    exploration_stats_.start_time,