Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
d3b7b12
Take advantage of hyper sparsity in dual push
rg20 Mar 2, 2026
6aae07d
Initial stab at moving root solves to B&B
rg20 Mar 6, 2026
1b6d98a
Move branch and bound problem to inside branch and bound
rg20 Mar 6, 2026
e311548
fix link errors
rg20 Mar 6, 2026
c522274
Fix link errors
rg20 Mar 6, 2026
55b2afb
Fix sync issues
rg20 Mar 9, 2026
1d213bd
remove stale code
rg20 Mar 10, 2026
9937868
Launch 3 threads, one for dual simplex, one for PDLP+crossover, one f…
rg20 Mar 19, 2026
0f1a6f9
Revert "Prepare release/26.04"
AyodeAwe Mar 19, 2026
e2ea687
Update to 26.06 (#975)
AyodeAwe Mar 19, 2026
d44661d
Unify two constructors
rg20 Mar 19, 2026
5309ca4
Merge remote-tracking branch 'upstream/main' into root_relaxation
rg20 Mar 19, 2026
c9e39d3
Fix compilation error
rg20 Mar 19, 2026
c99e1c3
Optimize right-looking LU factorization with O(1) degree-bucket ops …
rg20 Mar 10, 2026
07cb595
crossover: hoist delta_zN and delta_expanded out of dual push loop
rg20 Mar 16, 2026
953b83e
Added review comments
rg20 Mar 16, 2026
787fadf
Remove code duplication
rg20 Mar 16, 2026
197bf8d
keep the dense vector path alive
rg20 Mar 16, 2026
990bcd0
Revert "Update to 26.06 (#975)"
rg20 Mar 20, 2026
40b9e49
Cleanup unnecessary changes
rg20 Mar 20, 2026
c0d1514
Remove unused variable
rg20 Mar 23, 2026
3164108
Disable green context
rg20 Mar 30, 2026
b920f9a
Move to appropriate file
rg20 Mar 30, 2026
f4d0fa5
Cleanup
rg20 Mar 30, 2026
9509000
Fix missing entries
rg20 Mar 30, 2026
2a1ff14
Cleanup concurrent halt handling
rg20 Mar 30, 2026
bb074ac
add missing include
rg20 Mar 30, 2026
47daeae
Add missing include
rg20 Mar 30, 2026
70ab58f
Handle failures after cut generation cleanly
rg20 Mar 31, 2026
14dff02
Fix compilation error
rg20 Apr 1, 2026
9a020db
Move the problem conversion to upstream
rg20 Apr 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion RAPIDS_BRANCH
Original file line number Diff line number Diff line change
@@ -1 +1 @@
release/26.04
main
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Avoid pointing release flow to main for this branch marker.
Using main here can pull moving-head workflow/config changes into a release line and make builds non-reproducible. Prefer pinning to the release branch/tag used for this train (e.g., release/26.04 or an explicit immutable ref).

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@RAPIDS_BRANCH` at line 1, the RAPIDS_BRANCH file currently points at the
mutable "main" branch, which risks pulling moving-head changes into the release
flow. Update the branch marker to a stable release branch or tag (for example,
"release/26.04" or an explicit immutable ref) by replacing the "main" entry in
RAPIDS_BRANCH with the chosen release branch or tag so that builds remain
reproducible.

Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@ class pdlp_solver_settings_t {
bool inside_mip{false};
// For concurrent termination
std::atomic<int>* concurrent_halt{nullptr};
/** If true, solver does not set concurrent_halt; caller sets it after crossover. */
bool halt_set_by_caller{false};
static constexpr f_t minimal_absolute_tolerance = 1.0e-12;
pdlp_hyper_params::pdlp_hyper_params_t hyper_params;
// Holds the information of new variable lower and upper bounds for each climber in the format:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ class optimization_problem_solution_t : public base_solution_t {
* @return rmm::device_uvector<i_t> The device memory container for the reduced cost.
*/
rmm::device_uvector<f_t>& get_reduced_cost();
const rmm::device_uvector<f_t>& get_reduced_cost() const;

/**
* @brief Get termination reason
Expand Down
43 changes: 20 additions & 23 deletions cpp/src/barrier/barrier.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <barrier/iterative_refinement.hpp>
#include <barrier/sparse_cholesky.cuh>
#include <barrier/sparse_matrix_kernels.cuh>
#include <dual_simplex/concurrent_halt.hpp>

#include <dual_simplex/presolve.hpp>
#include <dual_simplex/solve.hpp>
Expand Down Expand Up @@ -289,7 +290,7 @@ class iteration_data_t {
// Ignore Q matrix for now
find_dense_columns(
lp.A, settings, dense_columns_unordered, n_dense_rows, max_row_nz, estimated_nz_AAT);
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
#ifdef PRINT_INFO
for (i_t j : dense_columns_unordered) {
settings.log.printf("Dense column %6d\n", j);
Expand Down Expand Up @@ -350,7 +351,7 @@ class iteration_data_t {
inv_sqrt_diag.set_scalar(1.0);
if (n_upper_bounds > 0 || (has_Q && !use_augmented)) { inv_diag.sqrt(inv_sqrt_diag); }

if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }

// Copy A into AD
AD = lp.A;
Expand Down Expand Up @@ -396,22 +397,22 @@ class iteration_data_t {
device_A.copy(host_A_CSR, lp.handle_ptr->get_stream());
RAFT_CHECK_CUDA(handle_ptr->get_stream());

if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
i_t factorization_size = use_augmented ? lp.num_rows + lp.num_cols : lp.num_rows;
chol =
std::make_unique<sparse_cholesky_cudss_t<i_t, f_t>>(handle_ptr, settings, factorization_size);
chol->set_positive_definite(false);
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
// Perform symbolic analysis
symbolic_status = 0;
if (use_augmented) {
// Build the sparsity pattern of the augmented system
form_augmented(true);
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
symbolic_status = chol->analyze(device_augmented);
} else {
form_adat(true);
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
symbolic_status = chol->analyze(device_ADAT);
}
}
Expand Down Expand Up @@ -581,7 +582,7 @@ class iteration_data_t {
span_x[i] *= span_scale[span_col_ind[i]];
});
RAFT_CHECK_CUDA(stream_view_);
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { return; }
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return; }
if (first_call) {
try {
initialize_cusparse_data<i_t, f_t>(
Expand All @@ -591,7 +592,7 @@ class iteration_data_t {
return;
}
}
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { return; }
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return; }

multiply_kernels<i_t, f_t>(handle_ptr, device_A, device_AD, device_ADAT, cusparse_info);
handle_ptr->sync_stream();
Expand Down Expand Up @@ -682,9 +683,7 @@ class iteration_data_t {
dense_vector_t<i_t, f_t> M_col(AD.m);
solve_status = chol->solve(U_col, M_col);
if (solve_status != 0) { return solve_status; }
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
return CONCURRENT_HALT_RETURN;
}
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
M.set_column(k, M_col);

if (debug) {
Expand All @@ -701,9 +700,7 @@ class iteration_data_t {
for (i_t k = 0; k < n_dense_columns; k++) {
AD_dense.transpose_multiply(
1.0, M.values.data() + k * M.m, 0.0, H.values.data() + k * H.m);
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
return CONCURRENT_HALT_RETURN;
}
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
}

dense_vector_t<i_t, f_t> e(n_dense_columns);
Expand Down Expand Up @@ -1193,7 +1190,7 @@ class iteration_data_t {
delta_nz[j] +=
fill; // Capture contributions from A(:, j). j will be encountered multiple times
}
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
}

int64_t sparse_nz_C = 0;
Expand Down Expand Up @@ -1233,7 +1230,7 @@ class iteration_data_t {
delta_nz[j] + static_cast<int64_t>(
fill_estimate)); // Capture the estimated fill associated with column j
}
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; }
if (concurrent_halt_is_set(settings.concurrent_halt)) { return; }
}

int64_t estimated_nz_C = 0;
Expand Down Expand Up @@ -3429,7 +3426,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
if (lp.Q.n > 0) { create_Q(lp, Q); }

iteration_data_t<i_t, f_t> data(lp, num_upper_bounds, Q, settings);
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
if (concurrent_halt_is_set(settings.concurrent_halt)) {
settings.log.printf("Barrier solver halted\n");
return lp_status_t::CONCURRENT_LIMIT;
}
Expand Down Expand Up @@ -3458,7 +3455,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
settings.log.printf("Barrier time limit exceeded\n");
return lp_status_t::TIME_LIMIT;
}
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
if (concurrent_halt_is_set(settings.concurrent_halt)) {
settings.log.printf("Barrier solver halted\n");
return lp_status_t::CONCURRENT_LIMIT;
}
Expand Down Expand Up @@ -3557,7 +3554,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
settings.log.printf("Barrier time limit exceeded\n");
return lp_status_t::TIME_LIMIT;
}
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
if (concurrent_halt_is_set(settings.concurrent_halt)) {
settings.log.printf("Barrier solver halted\n");
return lp_status_t::CONCURRENT_LIMIT;
}
Expand All @@ -3568,7 +3565,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,

i_t status = gpu_compute_search_direction(
data, data.dw_aff, data.dx_aff, data.dy_aff, data.dv_aff, data.dz_aff, max_affine_residual);
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
if (concurrent_halt_is_set(settings.concurrent_halt)) {
settings.log.printf("Barrier solver halted\n");
return lp_status_t::CONCURRENT_LIMIT;
}
Expand All @@ -3593,7 +3590,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
settings.log.printf("Barrier time limit exceeded\n");
return lp_status_t::TIME_LIMIT;
}
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
if (concurrent_halt_is_set(settings.concurrent_halt)) {
settings.log.printf("Barrier solver halted\n");
return lp_status_t::CONCURRENT_LIMIT;
}
Expand All @@ -3607,7 +3604,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,

status = gpu_compute_search_direction(
data, data.dw, data.dx, data.dy, data.dv, data.dz, max_corrector_residual);
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
if (concurrent_halt_is_set(settings.concurrent_halt)) {
settings.log.printf("Barrier solver halted\n");
return lp_status_t::CONCURRENT_LIMIT;
}
Expand All @@ -3633,7 +3630,7 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
settings.log.printf("Barrier time limit exceeded\n");
return lp_status_t::TIME_LIMIT;
}
if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
if (concurrent_halt_is_set(settings.concurrent_halt)) {
settings.log.printf("Barrier solver halted\n");
return lp_status_t::CONCURRENT_LIMIT;
}
Expand Down
42 changes: 17 additions & 25 deletions cpp/src/barrier/sparse_cholesky.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <barrier/dense_vector.hpp>
#include <barrier/device_sparse_matrix.cuh>

#include <dual_simplex/concurrent_halt.hpp>
#include <dual_simplex/simplex_solver_settings.hpp>
#include <dual_simplex/sparse_matrix.hpp>
#include <dual_simplex/tic_toc.hpp>
Expand Down Expand Up @@ -131,6 +132,8 @@ std::size_t compute_hash(const f_t* arr, size_t size)
return seed;
}

// #define USE_BARRIER_GREEN_CONTEXT

template <typename i_t, typename f_t>
class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
public:
Expand All @@ -155,6 +158,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
cuda_error = cudaSuccess;
status = CUDSS_STATUS_SUCCESS;

#ifdef USE_BARRIER_GREEN_CONTEXT
if (CUDART_VERSION >= 13000 && settings_.concurrent_halt != nullptr &&
settings_.num_gpus == 1) {
cuGetErrorString_func = cuopt::detail::get_driver_entry_point("cuGetErrorString");
Expand Down Expand Up @@ -238,6 +242,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
&stream, barrier_green_ctx, CU_STREAM_NON_BLOCKING, stream_priority),
reinterpret_cast<decltype(::cuGetErrorString)*>(cuGetErrorString_func));
}
#endif

auto cudss_device_idx = handle_ptr_->get_device();
auto cudss_device_count = 1;
Expand Down Expand Up @@ -363,6 +368,8 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
CUDSS_CALL_AND_CHECK_EXIT(cudssConfigDestroy(solverConfig), status, "cudssConfigDestroy");
CUDSS_CALL_AND_CHECK_EXIT(cudssDestroy(handle), status, "cudssDestroy");
CUDA_CALL_AND_CHECK_EXIT(cudaStreamSynchronize(stream), "cudaStreamSynchronize");

#ifdef USE_BARRIER_GREEN_CONTEXT
#if CUDART_VERSION >= 13000
if (settings_.concurrent_halt != nullptr && settings_.num_gpus == 1) {
auto cuStreamDestroy_func = cuopt::detail::get_driver_entry_point("cuStreamDestroy");
Expand All @@ -374,6 +381,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
reinterpret_cast<decltype(::cuGetErrorString)*>(cuGetErrorString_func));
handle_ptr_->get_stream().synchronize();
}
#endif
#endif
}

Expand Down Expand Up @@ -445,9 +453,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
raft::common::nvtx::range fun_scope("Barrier: cuDSS Analyze : CUDSS_PHASE_ANALYSIS");
status =
cudssExecute(handle, CUDSS_PHASE_REORDERING, solverConfig, solverData, A, cudss_x, cudss_b);
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
return CONCURRENT_HALT_RETURN;
}
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
if (status != CUDSS_STATUS_SUCCESS) {
settings_.log.printf(
"FAILED: CUDSS call ended unsuccessfully with status = %d, details: cuDSSExecute for "
Expand All @@ -461,9 +467,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {

status = cudssExecute(
handle, CUDSS_PHASE_SYMBOLIC_FACTORIZATION, solverConfig, solverData, A, cudss_x, cudss_b);
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
return CONCURRENT_HALT_RETURN;
}
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
if (status != CUDSS_STATUS_SUCCESS) {
settings_.log.printf(
"FAILED: CUDSS call ended unsuccessfully with status = %d, details: cuDSSExecute for "
Expand Down Expand Up @@ -519,9 +523,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
f_t start_numeric = tic();
status = cudssExecute(
handle, CUDSS_PHASE_FACTORIZATION, solverConfig, solverData, A, cudss_x, cudss_b);
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
return CONCURRENT_HALT_RETURN;
}
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
if (status != CUDSS_STATUS_SUCCESS) {
settings_.log.printf(
"FAILED: CUDSS call ended unsuccessfully with status = %d, details: cuDSSExecute for "
Expand All @@ -535,9 +537,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
#endif

f_t numeric_time = toc(start_numeric);
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
return CONCURRENT_HALT_RETURN;
}
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }

int info;
size_t sizeWritten = 0;
Expand Down Expand Up @@ -635,19 +635,15 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
A_created = true;

// Perform symbolic analysis
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
return CONCURRENT_HALT_RETURN;
}
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
f_t start_analysis = tic();
CUDSS_CALL_AND_CHECK(
cudssExecute(handle, CUDSS_PHASE_REORDERING, solverConfig, solverData, A, cudss_x, cudss_b),
status,
"cudssExecute for reordering");

f_t reorder_time = toc(start_analysis);
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
return CONCURRENT_HALT_RETURN;
}
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }

f_t start_symbolic = tic();

Expand All @@ -660,7 +656,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
f_t symbolic_time = toc(start_symbolic);
f_t analysis_time = toc(start_analysis);
settings_.log.printf("Symbolic factorization time : %.2fs\n", symbolic_time);
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
if (concurrent_halt_is_set(settings_.concurrent_halt)) {
RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
handle_ptr_->get_stream().synchronize();
return CONCURRENT_HALT_RETURN;
Expand Down Expand Up @@ -711,9 +707,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
"cudssExecute for factorization");

f_t numeric_time = toc(start_numeric);
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
return CONCURRENT_HALT_RETURN;
}
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }

int info;
size_t sizeWritten = 0;
Expand Down Expand Up @@ -776,9 +770,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
cudssMatrixSetValues(cudss_x, x.data()), status, "cudssMatrixSetValues for x");

status = cudssExecute(handle, CUDSS_PHASE_SOLVE, solverConfig, solverData, A, cudss_x, cudss_b);
if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
return CONCURRENT_HALT_RETURN;
}
if (concurrent_halt_is_set(settings_.concurrent_halt)) { return CONCURRENT_HALT_RETURN; }
if (status != CUDSS_STATUS_SUCCESS) {
settings_.log.printf(
"FAILED: CUDSS call ended unsuccessfully with status = %d, details: cuDSSExecute for "
Expand Down
Loading