diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 4c5df380f6..6ddf2583c4 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -5,7 +5,7 @@
   },
   "metadata": {
     "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server.",
-    "version": "26.04.00"
+    "version": "26.06.00"
   },
   "plugins": [
     {
diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json
index 5f34873671..e740506140 100644
--- a/.cursor-plugin/plugin.json
+++ b/.cursor-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "nvidia-cuopt-skills",
   "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server. Use when building or solving optimization with cuOpt.",
-  "version": "26.04.00",
+  "version": "26.06.00",
   "author": {
     "name": "NVIDIA"
   },
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 7958eac440..cdbf4df577 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,3 +1,6 @@
+# Default owner for paths with no later, more specific match
+*                  @nvidia/cuopt-infra-codeowners
+
 #cpp code owners
 cpp/               @nvidia/cuopt-engine-codeowners
 
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 3eb1f1f066..a945cde8ec 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -45,7 +45,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -55,7 +55,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -65,7 +65,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cuopt-mps-parser:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
   wheel-publish-cuopt-mps-parser:
     needs: wheel-build-cuopt-mps-parser
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -99,7 +99,7 @@ jobs:
   wheel-build-libcuopt:
     needs: wheel-build-cuopt-mps-parser
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -112,7 +112,7 @@ jobs:
   wheel-publish-libcuopt:
     needs: wheel-build-libcuopt
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -123,7 +123,7 @@ jobs:
   wheel-build-cuopt:
     needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -135,7 +135,7 @@ jobs:
   wheel-publish-cuopt:
     needs: wheel-build-cuopt
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -145,7 +145,7 @@ jobs:
       package-type: python
   wheel-build-cuopt-server:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -160,7 +160,7 @@ jobs:
   wheel-publish-cuopt-server:
     needs: wheel-build-cuopt-server
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -171,7 +171,7 @@ jobs:
   docs-build:
     needs: [python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       node_type: "gpu-l4-latest-1"
@@ -181,11 +181,11 @@ jobs:
       arch: "amd64"
       file_to_upload: "docs/cuopt/build/html/"
       artifact-name: "cuopt_docs"
-      container_image: "rapidsai/ci-conda:26.04-latest"
+      container_image: "rapidsai/ci-conda:26.06-latest"
       script: "ci/build_docs.sh"
   wheel-build-cuopt-sh-client:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -201,7 +201,7 @@ jobs:
   wheel-publish-cuopt-sh-client:
     needs: wheel-build-cuopt-sh-client
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/build_test_publish_images.yaml b/.github/workflows/build_test_publish_images.yaml
index f8f7366e13..17d4e9ab57 100644
--- a/.github/workflows/build_test_publish_images.yaml
+++ b/.github/workflows/build_test_publish_images.yaml
@@ -55,7 +55,7 @@ jobs:
   compute-matrix:
     runs-on: ubuntu-latest
     container:
-      image: rapidsai/ci-conda:26.04-latest
+      image: rapidsai/ci-conda:26.06-latest
     outputs:
       MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }}
       CUOPT_VER: ${{ steps.compute-cuopt-ver.outputs.CUOPT_VER }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 47a3bd9fca..a652c23b9a 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -34,7 +34,7 @@ jobs:
       - wheel-build-cuopt-sh-client
       - test-self-hosted-server
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
@@ -111,7 +111,7 @@ jobs:
 
   changed-files:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main
     with:
       files_yaml: |
         build_docs:
@@ -279,20 +279,20 @@ jobs:
           - '!gemini-extension.json'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: [checks, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_cpp.sh
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_lean_filter }}
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
@@ -308,14 +308,14 @@ jobs:
   conda-python-build:
     needs: [conda-cpp-build, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_python.sh
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }}
   conda-python-tests:
     needs: [conda-python-build, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_conda
     with:
       run_codecov: false
@@ -332,7 +332,7 @@ jobs:
   docs-build:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).build_docs
     with:
       build_type: pull-request
@@ -340,12 +340,12 @@ jobs:
       arch: "amd64"
       file_to_upload: "docs/cuopt/build/html/"
       artifact-name: "cuopt_docs"
-      container_image: "rapidsai/ci-conda:26.04-latest"
+      container_image: "rapidsai/ci-conda:26.06-latest"
       script: "ci/build_docs.sh"
   wheel-build-cuopt-mps-parser:
     needs: compute-matrix-filters
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_mps_parser.sh
@@ -357,7 +357,7 @@ jobs:
   wheel-build-libcuopt:
     needs: [wheel-build-cuopt-mps-parser, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.libcuopt_filter }}
@@ -368,7 +368,7 @@ jobs:
   wheel-build-cuopt:
     needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt.sh
@@ -377,7 +377,7 @@ jobs:
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }}
   wheel-tests-cuopt:
     needs: [wheel-build-cuopt, wheel-build-cuopt-mps-parser, wheel-build-cuopt-sh-client, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels
     with:
       build_type: pull-request
@@ -393,7 +393,7 @@ jobs:
   wheel-build-cuopt-server:
     needs: [checks, compute-matrix-filters]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_server.sh
@@ -405,7 +405,7 @@ jobs:
   wheel-build-cuopt-sh-client:
     needs: compute-matrix-filters
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main
     with:
       build_type: pull-request
       script: ci/build_wheel_cuopt_sh_client.sh
@@ -417,7 +417,7 @@ jobs:
       matrix_filter: ${{ needs.compute-matrix-filters.outputs.cuopt_sh_client_filter }}
   wheel-tests-cuopt-server:
     needs: [wheel-build-cuopt, wheel-build-cuopt-server, changed-files, compute-matrix-filters]
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels
     with:
       build_type: pull-request
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 9ad7609e8a..a8cc5f2943 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -27,7 +27,7 @@ on:
 
 jobs:
   conda-cpp-tests:
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
@@ -42,7 +42,7 @@ jobs:
       script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   conda-python-tests:
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main
     with:
       run_codecov: false
       build_type: ${{ inputs.build_type }}
@@ -58,7 +58,7 @@ jobs:
       script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   wheel-tests-cuopt:
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   wheel-tests-cuopt-server:
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
@@ -89,7 +89,7 @@ jobs:
       script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }}
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main
     with:
       build_type: ${{ inputs.build_type }}
       branch: ${{ inputs.branch }}
@@ -97,5 +97,5 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-l4-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:26.04-latest"
+      container_image: "rapidsai/ci-conda:26.06-latest"
       script: ci/test_notebooks.sh
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
index d394b97db4..57b178740c 100644
--- a/.github/workflows/trigger-breaking-change-alert.yaml
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -15,7 +15,7 @@ jobs:
   trigger-notifier:
     if: contains(github.event.pull_request.labels.*.name, 'breaking')
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@release/26.04
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@main
     with:
       sender_login: ${{ github.event.sender.login }}
       sender_avatar: ${{ github.event.sender.avatar_url }}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8d03641fde..a935201f21 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -117,7 +117,7 @@ Architecture:
 - Clone the repository:
 
 ```bash
-CUOPT_HOME=$(pwd)/cuopt
+export CUOPT_HOME=$(pwd)/cuopt
 git clone https://github.com/NVIDIA/cuopt.git $CUOPT_HOME
 cd $CUOPT_HOME
 ```
@@ -193,19 +193,20 @@ To build all libraries and tests, simply run
 To run the C++ tests, run
 
 ```bash
-cd $CUOPT_HOME/datasets && get_test_data.sh
+cd $CUOPT_HOME/datasets && ./get_test_data.sh
 cd $CUOPT_HOME && datasets/linear_programming/download_pdlp_test_dataset.sh
 datasets/mip/download_miplib_test_dataset.sh
 export RAPIDS_DATASET_ROOT_DIR=$CUOPT_HOME/datasets/
-ctest --test-dir ${CUOPT_HOME}/cpp/build  # libcuopt
+ctest --test-dir ${CUOPT_HOME}/cpp/build -E L1TEST  # libcuopt
 ```
+`L1TEST`s are excluded because they are expensive and not run as part of the typical development process.
 
 To run python tests, run
 
 - To run `cuopt` tests:
 ```bash
 
-cd $CUOPT_HOME/datasets && get_test_data.sh
+cd $CUOPT_HOME/datasets && ./get_test_data.sh
 cd $CUOPT_HOME && datasets/linear_programming/download_pdlp_test_dataset.sh
 datasets/mip/download_miplib_test_dataset.sh
 export RAPIDS_DATASET_ROOT_DIR=$CUOPT_HOME/datasets/
diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH
index d5ea6ced53..ba2906d066 100644
--- a/RAPIDS_BRANCH
+++ b/RAPIDS_BRANCH
@@ -1 +1 @@
-release/26.04
+main
diff --git a/README.md b/README.md
index 379a48c350..95c8598d77 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 # cuOpt - GPU-accelerated Optimization
 
 [![Build Status](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml/badge.svg)](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml)
-[![Version](https://img.shields.io/badge/version-26.04.00-blue)](https://github.com/NVIDIA/cuopt/releases)
+[![Version](https://img.shields.io/badge/version-26.06.00-blue)](https://github.com/NVIDIA/cuopt/releases)
 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen)](https://docs.nvidia.com/cuopt/user-guide/latest/introduction.html)
 [![Docker Hub](https://img.shields.io/badge/docker-nvidia%2Fcuopt-blue?logo=docker)](https://hub.docker.com/r/nvidia/cuopt)
 [![Examples](https://img.shields.io/badge/examples-cuopt--examples-orange)](https://github.com/NVIDIA/cuopt-examples)
@@ -83,7 +83,7 @@ For CUDA 12.x:
 pip install \
   --extra-index-url=https://pypi.nvidia.com \
   nvidia-cuda-runtime-cu12==12.9.* \
-  cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.*
+  cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.*
 ```
 
 Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages.
@@ -91,7 +91,7 @@ Development wheels are available as nightlies, please update `--extra-index-url`
 pip install --pre \
   --extra-index-url=https://pypi.nvidia.com \
   --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \
-  cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.*
+  cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.*
 ```
 
 For CUDA 13.x:
@@ -99,7 +99,7 @@ For CUDA 13.x:
 ```bash
 pip install \
   --extra-index-url=https://pypi.nvidia.com \
-  cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.*
+  cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.*
 ```
 
 Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages.
@@ -107,7 +107,7 @@ Development wheels are available as nightlies, please update `--extra-index-url`
 pip install --pre \
   --extra-index-url=https://pypi.nvidia.com \
   --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \
-  cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.*
+  cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.*
 ```
 
 
@@ -118,7 +118,7 @@ cuOpt can be installed with conda (via [miniforge](https://github.com/conda-forg
 All other dependencies are installed automatically when `cuopt-server` and `cuopt-sh-client` are installed.
 
 ```bash
-conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.04.* cuopt-sh-client=26.04.*
+conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.06.* cuopt-sh-client=26.06.*
 ```
 
 We also provide [nightly conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
diff --git a/VERSION b/VERSION
index 0bd0e8a95b..cdb610a24d 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-26.04.00
+26.06.00
diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp
index e01e533a65..40f7c73eac 100644
--- a/benchmarks/linear_programming/cuopt/run_mip.cpp
+++ b/benchmarks/linear_programming/cuopt/run_mip.cpp
@@ -12,6 +12,7 @@
 #include <cuopt/linear_programming/mip/solver_solution.hpp>
 #include <cuopt/linear_programming/optimization_problem_interface.hpp>
 #include <cuopt/linear_programming/solve.hpp>
+#include <cuopt/linear_programming/utilities/internals.hpp>
 #include <mps_parser/parser.hpp>
 #include <utilities/logger.hpp>
 
@@ -137,6 +138,58 @@ std::vector<std::vector<double>> read_solution_from_dir(const std::string file_p
   return initial_solutions;
 }
 
+struct incumbent_record_t {                        // one incumbent seen by the MIP callback
+  double objective;                                // value read from the callback's `cost`
+  double work_timestamp;                           // info->work_timestamp, or -1.0 if info is null
+  double wall_time;                                // seconds since the tracker's start_time_
+  cuopt::internals::mip_solution_origin_t origin;  // info->origin, or UNKNOWN if info is null
+};
+
+class incumbent_tracker_t : public cuopt::internals::get_solution_callback_ext_t {
+ public:  // MIP solution callback that logs every reported incumbent for a CSV trace
+  explicit incumbent_tracker_t(std::chrono::high_resolution_clock::time_point start_time)
+    : start_time_(start_time)  // origin instant for each record's wall_time
+  {
+  }
+
+  void get_solution(void* data,
+                    void* cost,  // dereferenced as a double without a null check
+                    void* solution_bound,
+                    const cuopt::internals::mip_solution_callback_info_t* info,
+                    void* user_data) override  // appends one record per call
+  {
+    double obj    = *static_cast<double*>(cost);
+    double wt     = (info != nullptr) ? info->work_timestamp : -1.0;  // -1.0: no callback info
+    auto origin   = (info != nullptr) ? (cuopt::internals::mip_solution_origin_t)info->origin
+                                      : cuopt::internals::mip_solution_origin_t::UNKNOWN;
+    auto now      = std::chrono::high_resolution_clock::now();
+    double wall_s = std::chrono::duration<double>(now - start_time_).count();
+    records_.push_back({obj, wt, wall_s, origin});  // NOTE(review): unlocked; confirm serial calls
+  }
+
+  void write_csv(const std::string& path) const  // one CSV row per recorded incumbent
+  {
+    std::ofstream f(path);
+    if (!f.is_open()) {  // best effort: report on stderr and write nothing
+      fprintf(stderr, "Failed to open incumbent CSV: %s\n", path.c_str());
+      return;
+    }
+    f << "index,objective,work_timestamp,wall_time_s,origin\n";
+    for (size_t i = 0; i < records_.size(); ++i) {
+      const auto& r = records_[i];
+      f << i << "," << std::setprecision(15) << r.objective << "," << r.work_timestamp << ","
+        << std::setprecision(6) << r.wall_time << ","  // precision is sticky: 15 above, 6 here
+        << cuopt::internals::mip_solution_origin_to_string(r.origin) << "\n";
+    }
+  }
+
+  size_t size() const { return records_.size(); }  // number of incumbents recorded so far
+
+ private:
+  std::chrono::high_resolution_clock::time_point start_time_;  // set once in the constructor
+  std::vector<incumbent_record_t> records_;                    // appended in callback order
+};
+
 int run_single_file(std::string file_path,
                     int device,
                     int batch_id,
@@ -203,21 +256,40 @@ int run_single_file(std::string file_path,
       }
     }
   }
-  settings.time_limit       = time_limit;
-  settings.work_limit       = work_limit;
-  settings.heuristics_only  = heuristics_only;
-  settings.num_cpu_threads  = num_cpu_threads;
-  settings.log_to_console   = log_to_console;
-  settings.determinism_mode = deterministic ? CUOPT_MODE_DETERMINISTIC : CUOPT_MODE_OPPORTUNISTIC;
+  settings.time_limit      = time_limit;
+  settings.work_limit      = work_limit;
+  settings.heuristics_only = heuristics_only;
+  settings.num_cpu_threads = num_cpu_threads;
+  settings.log_to_console  = log_to_console;
+  if (deterministic) {
+    settings.determinism_mode =
+      heuristics_only ? CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS : CUOPT_MODE_DETERMINISTIC;
+  } else {
+    settings.determinism_mode = CUOPT_MODE_OPPORTUNISTIC;
+  }
+  CUOPT_LOG_INFO(
+    "run_mip settings: heuristics_only=%d deterministic=%d determinism_mode=%d "
+    "time_limit=%.6f work_limit=%.6f",
+    (int)heuristics_only,
+    (int)deterministic,
+    settings.determinism_mode,
+    settings.time_limit,
+    settings.work_limit);
   settings.tolerances.relative_tolerance = 1e-12;
   settings.tolerances.absolute_tolerance = 1e-6;
   settings.presolver                     = cuopt::linear_programming::presolver_t::Default;
   settings.reliability_branching         = reliability_branching;
   settings.clique_cuts                   = -1;
   settings.seed                          = 42;
+  settings.bb_work_unit_scale            = 1.0;
+  settings.gpu_heur_work_unit_scale      = 1.0;
+  settings.mip_scaling                   = false;
+  settings.gpu_heur_wait_for_exploration = false;
   cuopt::linear_programming::benchmark_info_t benchmark_info;
   settings.benchmark_info_ptr = &benchmark_info;
   auto start_run_solver       = std::chrono::high_resolution_clock::now();
+  incumbent_tracker_t incumbent_tracker(start_run_solver);
+  settings.set_mip_callback(&incumbent_tracker);
   auto solution = cuopt::linear_programming::solve_mip(&handle_, mps_data_model, settings);
   CUOPT_LOG_INFO(
     "first obj: %f last improvement of best feasible: %f last improvement after recombination: %f",
@@ -253,7 +325,13 @@ int run_single_file(std::string file_path,
      << benchmark_info.last_improvement_after_recombination << "," << mip_gap << "," << is_optimal
      << "\n";
   write_to_output_file(out_dir, base_filename, device, n_gpus, batch_id, ss.str());
-  CUOPT_LOG_INFO("Results written to the file %s", base_filename.c_str());
+  if (!out_dir.empty()) {
+    std::string mps_stem = base_filename.substr(0, base_filename.find(".mps"));
+    std::string csv_path = out_dir + "/" + mps_stem + "_incumbents.csv";
+    incumbent_tracker.write_csv(csv_path);
+    CUOPT_LOG_INFO(
+      "Incumbent trace (%zu entries) written to %s", incumbent_tracker.size(), csv_path.c_str());
+  }
   return sol_found;
 }
 
diff --git a/ci/compute-sanitizer-suppressions.xml b/ci/compute-sanitizer-suppressions.xml
new file mode 100644
index 0000000000..624b3aa0bd
--- /dev/null
+++ b/ci/compute-sanitizer-suppressions.xml
@@ -0,0 +1,249 @@
+<?xml version="1.0" encoding="utf-8"?>
+<ComputeSanitizerOutput>
+  <record>
+    <kind>Initcheck</kind>
+    <what>
+      <text>Uninitialized __global__ memory read of size 4 bytes</text>
+      <size>4</size>
+    </what>
+    <where>
+      <func>.*</func>
+    </where>
+    <hostStack>
+      <frame>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <func>cusparseCsr2cscEx2</func>
+        <module>.*libcusparse.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+  <record>
+    <kind>Initcheck</kind>
+    <what>
+      <text>Uninitialized __global__ memory read of size 4 bytes</text>
+      <size>4</size>
+    </what>
+    <where>
+      <func>ThreadLoad</func>
+    </where>
+    <hostStack>
+      <frame>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>libcudart.*</module>
+      </frame>
+      <frame>
+        <func>cudaLaunchKernel</func>
+      </frame>
+      <frame>
+        <func>.*cub::.*::Device(Segmented)?(Reduce|Scan)(SingleTile)?Kernel.*</func>
+      </frame>
+    </hostStack>
+  </record>
+  <record>
+    <kind>Initcheck</kind>
+    <what>
+      <text>Uninitialized __global__ memory read of size 2 bytes</text>
+      <size>2</size>
+    </what>
+    <where>
+      <func>ThreadLoad</func>
+    </where>
+    <hostStack>
+      <frame>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>libcudart.*</module>
+      </frame>
+      <frame>
+        <func>cudaLaunchKernel</func>
+      </frame>
+      <frame>
+        <func>.*cub::.*::Device(Segmented)?(Reduce|Scan)(SingleTile)?Kernel.*</func>
+      </frame>
+    </hostStack>
+  </record>
+  <record>
+    <kind>Initcheck</kind>
+    <what>
+      <text>Uninitialized __global__ memory read of size 8 bytes</text>
+      <size>8</size>
+    </what>
+    <where>
+      <func>DeviceSegmentedReduceKernel</func>
+    </where>
+  </record>
+  <record>
+    <kind>Initcheck</kind>
+    <what>
+      <text>Uninitialized __global__ memory read of size 4 bytes</text>
+      <size>4</size>
+    </what>
+    <where>
+      <func>ThreadLoad</func>
+    </where>
+    <hostStack>
+      <frame>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>libcudart.*</module>
+      </frame>
+      <frame>
+        <module>libcudart.*</module>
+      </frame>
+      <frame>
+        <module>.*libcuopt.*</module>
+      </frame>
+      <frame>
+        <func>.*Device(Reduce|Scan)Kernel.*</func>
+      </frame>
+    </hostStack>
+  </record>
+  <!-- Rule matching CCCL's pattern of copying tuples back to host after reduce_by_key, which contain uninitialized padding -->
+  <!-- Because of aggressive inlining, Thrust calls are elided from the host stack, which prevents a more fine-grained rule. In practice this is good enough. -->
+  <record>
+    <kind>InitcheckApiError</kind>
+    <level>Error</level>
+    <what>
+      <text>Host API uninitialized memory access</text>
+      <accessSize>16</accessSize>
+    </what>
+    <hostStack>
+      <frame>
+        <func>cuMemcpyDtoHAsync.*</func>
+        <module>.*libcuda.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+  <!-- Suppress uninit copies on rmm::device_vector copy constructor - often vector members are allocated but not filled -->
+  <record>
+    <kind>InitcheckApiError</kind>
+    <level>Error</level>
+    <what>
+      <text>Host API uninitialized memory access</text>
+    </what>
+    <hostStack>
+      <frame>
+        <func>cuMemcpyAsync</func>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*librmm.so.*</module>
+      </frame>
+      <frame>
+        <func>rmm::device_buffer::device_buffer</func>
+        <module>.*librmm.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+  <!-- Suppress likely harmless Thrust/CUB tuple-buffer initcheck reads during sort_by_key
+       in trivial_presolve's COO->CSC reorder. Source and destination arrays are validated
+       immediately before the sort; the warning appears to come from internal tuple staging. -->
+  <record>
+    <kind>Initcheck</kind>
+    <what>
+      <text>Uninitialized __global__ memory read</text>
+    </what>
+    <where>
+      <func>transform_kernel</func>
+    </where>
+    <hostStack>
+      <frame>
+        <func>cuLaunchKernel_ptsz</func>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <func>cudaLaunchKernel_ptsz</func>
+      </frame>
+    </hostStack>
+  </record>
+  <record>
+    <kind>InitcheckApiError</kind>
+    <level>Error</level>
+    <what>
+      <text>Host API uninitialized memory access</text>
+    </what>
+    <hostStack>
+      <frame>
+        <func>cuMemcpyAsync</func>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*librmm.so.*</module>
+      </frame>
+      <frame>
+        <module>.*librmm.so.*</module>
+      </frame>
+      <frame>
+        <func>rmm::device_uvector.*::device_uvector</func>
+        <module>.*libcuopt.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+  <!-- Uninitialized device-to-device copies are usually harmless - if actually bogus, errors may be caught later on -->
+  <record>
+    <kind>InitcheckApiError</kind>
+    <level>Error</level>
+    <what>
+      <text>Host API uninitialized memory access</text>
+    </what>
+    <hostStack>
+      <frame>
+        <func>cuMemcpyDtoDAsync.*</func>
+        <module>.*libcuda.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+  <record>
+    <kind>InitcheckApiError</kind>
+    <level>Error</level>
+    <what>
+      <text>Host API uninitialized memory access</text>
+    </what>
+    <hostStack>
+      <frame>
+        <func>cuMemcpyAsync</func>
+        <module>.*libcuda.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <module>.*libcudart.so.*</module>
+      </frame>
+      <frame>
+        <func>cudaMemcpyAsync</func>
+      </frame>
+      <frame>
+        <func>rmm::device_buffer::resize</func>
+        <module>.*librmm.so.*</module>
+      </frame>
+    </hostStack>
+  </record>
+</ComputeSanitizerOutput>
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 3d6c356b3d..9a67bb65a5 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -152,3 +152,6 @@ elif [[ "${RUN_CONTEXT}" == "release" ]]; then
   sed_runner "s|\\bmain\\b|release/${NEXT_SHORT_TAG}|g" docs/cuopt/source/faq.rst
   sed_runner "s|\\bmain\\b|release/${NEXT_SHORT_TAG}|g" docs/cuopt/source/cuopt-python/routing/routing-example.ipynb
 fi
+
+# Update docs version switcher to include the new version
+python ci/utils/update_doc_versions.py
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index cf3563d476..04dc6bb83c 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -19,7 +19,7 @@ dependencies:
 - cuda-python>=12.9.2,<13.0
 - cuda-sanitizer-api
 - cuda-version=12.9
-- cudf==26.4.*,>=0.0.0a0
+- cudf==26.6.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -36,8 +36,8 @@ dependencies:
 - libcusparse-dev
 - libgrpc >=1.78.0,<1.80.0a0
 - libprotobuf
-- libraft-headers==26.4.*,>=0.0.0a0
-- librmm==26.4.*,>=0.0.0a0
+- libraft-headers==26.6.*,>=0.0.0a0
+- librmm==26.6.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -55,7 +55,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.4.*,>=0.0.0a0
+- pylibraft==26.6.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -65,7 +65,7 @@ dependencies:
 - rapids-logger==0.2.*,>=0.0.0a0
 - re2
 - requests
-- rmm==26.4.*,>=0.0.0a0
+- rmm==26.6.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index a8a589e48b..21891cc9f2 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -19,7 +19,7 @@ dependencies:
 - cuda-python>=12.9.2,<13.0
 - cuda-sanitizer-api
 - cuda-version=12.9
-- cudf==26.4.*,>=0.0.0a0
+- cudf==26.6.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -36,8 +36,8 @@ dependencies:
 - libcusparse-dev
 - libgrpc >=1.78.0,<1.80.0a0
 - libprotobuf
-- libraft-headers==26.4.*,>=0.0.0a0
-- librmm==26.4.*,>=0.0.0a0
+- libraft-headers==26.6.*,>=0.0.0a0
+- librmm==26.6.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -55,7 +55,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.4.*,>=0.0.0a0
+- pylibraft==26.6.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -65,7 +65,7 @@ dependencies:
 - rapids-logger==0.2.*,>=0.0.0a0
 - re2
 - requests
-- rmm==26.4.*,>=0.0.0a0
+- rmm==26.6.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml
index 477c708918..89147b18a7 100644
--- a/conda/environments/all_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-131_arch-aarch64.yaml
@@ -19,7 +19,7 @@ dependencies:
 - cuda-python>=13.0.1,<14.0
 - cuda-sanitizer-api
 - cuda-version=13.1
-- cudf==26.4.*,>=0.0.0a0
+- cudf==26.6.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -36,8 +36,8 @@ dependencies:
 - libcusparse-dev
 - libgrpc >=1.78.0,<1.80.0a0
 - libprotobuf
-- libraft-headers==26.4.*,>=0.0.0a0
-- librmm==26.4.*,>=0.0.0a0
+- libraft-headers==26.6.*,>=0.0.0a0
+- librmm==26.6.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -55,7 +55,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.4.*,>=0.0.0a0
+- pylibraft==26.6.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -65,7 +65,7 @@ dependencies:
 - rapids-logger==0.2.*,>=0.0.0a0
 - re2
 - requests
-- rmm==26.4.*,>=0.0.0a0
+- rmm==26.6.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml
index d5fcba0b73..8df6f28bf7 100644
--- a/conda/environments/all_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-131_arch-x86_64.yaml
@@ -19,7 +19,7 @@ dependencies:
 - cuda-python>=13.0.1,<14.0
 - cuda-sanitizer-api
 - cuda-version=13.1
-- cudf==26.4.*,>=0.0.0a0
+- cudf==26.6.*,>=0.0.0a0
 - cupy>=13.6.0
 - cxx-compiler
 - cython>=3.0.3
@@ -36,8 +36,8 @@ dependencies:
 - libcusparse-dev
 - libgrpc >=1.78.0,<1.80.0a0
 - libprotobuf
-- libraft-headers==26.4.*,>=0.0.0a0
-- librmm==26.4.*,>=0.0.0a0
+- libraft-headers==26.6.*,>=0.0.0a0
+- librmm==26.6.*,>=0.0.0a0
 - make
 - msgpack-numpy==0.4.8
 - msgpack-python==1.1.2
@@ -55,7 +55,7 @@ dependencies:
 - pip
 - pre-commit
 - psutil>=6.0.0
-- pylibraft==26.4.*,>=0.0.0a0
+- pylibraft==26.6.*,>=0.0.0a0
 - pyrsistent
 - pytest-cov
 - pytest<9.0
@@ -65,7 +65,7 @@ dependencies:
 - rapids-logger==0.2.*,>=0.0.0a0
 - re2
 - requests
-- rmm==26.4.*,>=0.0.0a0
+- rmm==26.6.*,>=0.0.0a0
 - scikit-build-core>=0.11.0
 - scipy>=1.14.1
 - sphinx
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9249b53171..c95224ea9c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -543,6 +543,17 @@ target_link_libraries(cuopt
   gRPC::grpc++
   )
 
+# find_path(PAPI_INCLUDE_DIR papi.h)
+# find_library(PAPI_LIBRARY papi)
+
+# if (PAPI_INCLUDE_DIR AND PAPI_LIBRARY)
+#     message(STATUS "Found PAPI in ${PAPI_INCLUDE_DIR}")
+#     target_include_directories(cuopt PRIVATE ${PAPI_INCLUDE_DIR})
+#     target_link_libraries(cuopt PRIVATE ${PAPI_LIBRARY})
+# else()
+#     message(FATAL_ERROR "Could not find PAPI")
+# endif()
+
 
 # ##################################################################################################
 # - generate tests --------------------------------------------------------------------------------
@@ -652,11 +663,14 @@ rapids_cpm_find(
 if(NOT BUILD_LP_ONLY)
 add_executable(cuopt_cli cuopt_cli.cpp)
 
+# PIE executable: auditwheel/patchelf expands .dynstr/RPATH when repairing wheels; non-PIE
+# (ET_EXEC) binaries are prone to corrupt segment layout. PIE (ET_DYN) survives RPATH edits.
 set_target_properties(cuopt_cli
   PROPERTIES
   CXX_STANDARD 20
   CXX_STANDARD_REQUIRED ON
   CXX_SCAN_FOR_MODULES OFF
+  POSITION_INDEPENDENT_CODE ON
 )
 
 target_compile_options(cuopt_cli
@@ -664,6 +678,8 @@ target_compile_options(cuopt_cli
   "$<$<COMPILE_LANGUAGE:CUDA>:${CUOPT_CUDA_FLAGS}>"
 )
 
+target_link_options(cuopt_cli PRIVATE -pie)
+
 target_include_directories(cuopt_cli
   PRIVATE
   "${CMAKE_CURRENT_SOURCE_DIR}/src"
diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index 06eacb3408..24eb02aab8 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -104,10 +104,40 @@
 #define CUOPT_MIP_HYPER_HEURISTIC_RELAXED_LP_TIME_LIMIT "mip_hyper_heuristic_relaxed_lp_time_limit"
 #define CUOPT_MIP_HYPER_HEURISTIC_RELATED_VARS_TIME_LIMIT \
   "mip_hyper_heuristic_related_vars_time_limit"
-
-/* @brief MIP determinism mode constants */
-#define CUOPT_MODE_OPPORTUNISTIC 0
-#define CUOPT_MODE_DETERMINISTIC 1
+#define CUOPT_MIP_HYPER_HEURISTIC_CPUFJ_WORK_UNIT_SCALE "mip_hyper_heuristic_cpufj_work_unit_scale"
+#define CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WORK_UNIT_SCALE \
+  "mip_hyper_heuristic_gpu_heur_work_unit_scale"
+#define CUOPT_MIP_HYPER_HEURISTIC_BB_WORK_UNIT_SCALE "mip_hyper_heuristic_bb_work_unit_scale"
+#define CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WAIT_FOR_EXPLORATION \
+  "mip_hyper_heuristic_gpu_heur_wait_for_exploration"
+
+/* @brief MIP determinism mode flags (bitset) */
+#define CUOPT_DETERMINISM_NONE 0x0
+// matches the previous value of '1' which was for B&B-only determinism in the previous release
+#define CUOPT_DETERMINISM_BB             0x1
+#define CUOPT_DETERMINISM_GPU_HEURISTICS 0x2
+#define CUOPT_DETERMINISM_FULL           (CUOPT_DETERMINISM_BB | CUOPT_DETERMINISM_GPU_HEURISTICS)
+
+#define CUOPT_MODE_OPPORTUNISTIC                CUOPT_DETERMINISM_NONE
+#define CUOPT_MODE_DETERMINISTIC                CUOPT_DETERMINISM_FULL
+#define CUOPT_MODE_DETERMINISTIC_BB             CUOPT_DETERMINISM_BB
+#define CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS CUOPT_DETERMINISM_GPU_HEURISTICS
+
+/* @brief MIP solution origin constants */
+#define CUOPT_MIP_SOLUTION_ORIGIN_UNKNOWN                 0
+#define CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND        1
+#define CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND_DIVING 2
+#define CUOPT_MIP_SOLUTION_ORIGIN_FEASIBILITY_JUMP        3
+#define CUOPT_MIP_SOLUTION_ORIGIN_CPU_FEASIBILITY_JUMP    4
+#define CUOPT_MIP_SOLUTION_ORIGIN_LOCAL_SEARCH            5
+#define CUOPT_MIP_SOLUTION_ORIGIN_QUICK_FEASIBLE          6
+#define CUOPT_MIP_SOLUTION_ORIGIN_LP_ROUNDING             7
+#define CUOPT_MIP_SOLUTION_ORIGIN_RECOMBINATION           8
+#define CUOPT_MIP_SOLUTION_ORIGIN_SUB_MIP                 9
+#define CUOPT_MIP_SOLUTION_ORIGIN_USER_INITIAL            10
+#define CUOPT_MIP_SOLUTION_ORIGIN_USER_INJECTED           11
+#define CUOPT_MIP_SOLUTION_ORIGIN_RINS                    12
+#define CUOPT_MIP_SOLUTION_ORIGIN_PRESOLVE                13
 
 /* @brief LP/MIP termination status constants */
 #define CUOPT_TERMINATION_STATUS_NO_TERMINATION          0
diff --git a/cpp/include/cuopt/linear_programming/cuopt_c.h b/cpp/include/cuopt/linear_programming/cuopt_c.h
index 4c4d44c764..f72a00e932 100644
--- a/cpp/include/cuopt/linear_programming/cuopt_c.h
+++ b/cpp/include/cuopt/linear_programming/cuopt_c.h
@@ -71,6 +71,23 @@ typedef int32_t cuopt_int_t;
 typedef int64_t cuopt_int_t;
 #endif
 
+/**
+ * @brief Extended callback information passed to cuOptMIPGetSolutionCallbackExt.
+ *
+ * Provides metadata about each incumbent solution reported during a MIP solve.
+ *
+ * Fields are append-only. Existing fields will never be reordered, removed,
+ * or change type across releases.
+ */
+typedef struct {
+  /** Which solver component found this solution (CUOPT_MIP_SOLUTION_ORIGIN_*). */
+  uint32_t origin;
+  /** Deterministic work-unit timestamp at which the solution was found.
+   *  Monotonically increasing across successive callbacks within a single solve.
+   *  In non-deterministic mode this value is informational only. */
+  double work_timestamp;
+} cuOptMIPSolutionCallbackInfo;
+
 /**
  * @brief Get the size of the float type.
  *
@@ -713,6 +730,24 @@ typedef void (*cuOptMIPGetSolutionCallback)(const cuopt_float_t* solution,
                                             const cuopt_float_t* solution_bound,
                                             void* user_data);
 
+/**
+ * @brief Type of callback for receiving incumbent MIP solutions with extended metadata.
+ *
+ * @param[in] solution - Pointer to incumbent solution values.
+ * @param[in] objective_value - Pointer to incumbent objective value.
+ * @param[in] solution_bound - Pointer to current solution (dual/user) bound.
+ * @param[in] callback_info - Pointer to callback metadata.
+ * @param[in] user_data - Pointer to user data.
+ * @note All pointer arguments refer to host memory and are only valid during the callback
+ * invocation. Do not pass device/GPU pointers. Copy any data you need to keep after the callback
+ * returns.
+ */
+typedef void (*cuOptMIPGetSolutionCallbackExt)(const cuopt_float_t* solution,
+                                               const cuopt_float_t* objective_value,
+                                               const cuopt_float_t* solution_bound,
+                                               const cuOptMIPSolutionCallbackInfo* callback_info,
+                                               void* user_data);
+
 /**
  * @brief Type of callback for injecting MIP solutions with user context.
  *
@@ -748,6 +783,19 @@ cuopt_int_t cuOptSetMIPGetSolutionCallback(cuOptSolverSettings settings,
                                            cuOptMIPGetSolutionCallback callback,
                                            void* user_data);
 
+/**
+ * @brief Register an extended callback to receive incumbent MIP solutions with extended metadata.
+ *
+ * @param[in] settings - The solver settings object.
+ * @param[in] callback - Callback function to receive incumbent solutions and callback metadata.
+ * @param[in] user_data - User-defined pointer passed through to the callback.
+ *
+ * @return A status code indicating success or failure.
+ */
+cuopt_int_t cuOptSetMIPGetSolutionCallbackExt(cuOptSolverSettings settings,
+                                              cuOptMIPGetSolutionCallbackExt callback,
+                                              void* user_data);
+
 /**
  * @brief Register a callback to inject MIP solutions.
  *
diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
index 14c4d227bc..77425276c3 100644
--- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
@@ -107,6 +107,13 @@ class mip_solver_settings_t {
   i_t strong_branching_simplex_iteration_limit = -1;
   i_t num_gpus                                 = 1;
   bool log_to_console                          = true;
+  // User-facing multipliers on top of internal baseline work-unit scales.
+  // 1.0 = use internally calibrated default. Values > 1 make that component appear to do more work.
+  f_t cpufj_work_unit_scale    = 1.0;
+  f_t gpu_heur_work_unit_scale = 1.0;
+  f_t bb_work_unit_scale       = 1.0;
+  // When true, GPU heuristics wait for B&B to finish root solve before starting.
+  bool gpu_heur_wait_for_exploration = false;
 
   std::string log_file;
   std::string sol_file;
@@ -118,15 +125,15 @@ class mip_solver_settings_t {
   int mip_scaling = CUOPT_MIP_SCALING_NO_OBJECTIVE;
   presolver_t presolver{presolver_t::Default};
   /**
-   * @brief Determinism mode for MIP solver.
+   * @brief Determinism mode for MIP solver (bitset).
    *
-   * Controls the determinism behavior of the MIP solver:
-   * - CUOPT_MODE_OPPORTUNISTIC (0): Default mode, allows non-deterministic
-   *   parallelism for better performance
-   * - CUOPT_MODE_DETERMINISTIC (1): Ensures deterministic results across runs
-   *   at potential cost of performance
+   * Bitwise OR of CUOPT_DETERMINISM_* flags:
+   * - CUOPT_DETERMINISM_NONE (0x0): Opportunistic, non-deterministic.
+   * - CUOPT_DETERMINISM_BB (0x1): Deterministic B&B tree exploration.
+   * - CUOPT_DETERMINISM_GPU_HEURISTICS (0x2): Deterministic GPU heuristic pipeline.
+   * - CUOPT_DETERMINISM_FULL (0x3): Both B&B and GPU heuristics deterministic.
    */
-  int determinism_mode = CUOPT_MODE_OPPORTUNISTIC;
+  int determinism_mode = CUOPT_DETERMINISM_NONE;
   /**
    * @brief Random seed for the MIP solver.
    *
diff --git a/cpp/include/cuopt/linear_programming/utilities/internals.hpp b/cpp/include/cuopt/linear_programming/utilities/internals.hpp
index bdfbb969d2..509e5c4100 100644
--- a/cpp/include/cuopt/linear_programming/utilities/internals.hpp
+++ b/cpp/include/cuopt/linear_programming/utilities/internals.hpp
@@ -13,6 +13,8 @@
 #include <type_traits>
 
 #include <cuopt/linear_programming/constants.h>
+#include <cuopt/linear_programming/cuopt_c.h>
+
 namespace cuopt {
 namespace internals {
 
@@ -21,7 +23,51 @@ class Callback {
   virtual ~Callback() {}
 };
 
-enum class base_solution_callback_type { GET_SOLUTION, SET_SOLUTION };
+enum class mip_solution_origin_t : uint32_t {
+  UNKNOWN                 = CUOPT_MIP_SOLUTION_ORIGIN_UNKNOWN,
+  BRANCH_AND_BOUND_NODE   = CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND,
+  BRANCH_AND_BOUND_DIVING = CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND_DIVING,
+  FEASIBILITY_JUMP        = CUOPT_MIP_SOLUTION_ORIGIN_FEASIBILITY_JUMP,
+  CPU_FEASIBILITY_JUMP    = CUOPT_MIP_SOLUTION_ORIGIN_CPU_FEASIBILITY_JUMP,
+  LOCAL_SEARCH            = CUOPT_MIP_SOLUTION_ORIGIN_LOCAL_SEARCH,
+  QUICK_FEASIBLE          = CUOPT_MIP_SOLUTION_ORIGIN_QUICK_FEASIBLE,
+  LP_ROUNDING             = CUOPT_MIP_SOLUTION_ORIGIN_LP_ROUNDING,
+  RECOMBINATION           = CUOPT_MIP_SOLUTION_ORIGIN_RECOMBINATION,
+  SUB_MIP                 = CUOPT_MIP_SOLUTION_ORIGIN_SUB_MIP,
+  USER_INITIAL            = CUOPT_MIP_SOLUTION_ORIGIN_USER_INITIAL,
+  USER_INJECTED           = CUOPT_MIP_SOLUTION_ORIGIN_USER_INJECTED,
+  RINS                    = CUOPT_MIP_SOLUTION_ORIGIN_RINS,
+  PRESOLVE                = CUOPT_MIP_SOLUTION_ORIGIN_PRESOLVE,
+};
+
+constexpr const char* mip_solution_origin_to_string(mip_solution_origin_t origin)
+{
+  switch (origin) {
+    case mip_solution_origin_t::UNKNOWN: return "unknown";
+    case mip_solution_origin_t::BRANCH_AND_BOUND_NODE: return "branch_and_bound_node";
+    case mip_solution_origin_t::BRANCH_AND_BOUND_DIVING: return "branch_and_bound_diving";
+    case mip_solution_origin_t::FEASIBILITY_JUMP: return "feasibility_jump";
+    case mip_solution_origin_t::CPU_FEASIBILITY_JUMP: return "cpu_feasibility_jump";
+    case mip_solution_origin_t::LOCAL_SEARCH: return "local_search";
+    case mip_solution_origin_t::QUICK_FEASIBLE: return "quick_feasible";
+    case mip_solution_origin_t::LP_ROUNDING: return "lp_rounding";
+    case mip_solution_origin_t::RECOMBINATION: return "recombination";
+    case mip_solution_origin_t::SUB_MIP: return "sub_mip";
+    case mip_solution_origin_t::USER_INITIAL: return "user_initial";
+    case mip_solution_origin_t::USER_INJECTED: return "user_injected";
+    case mip_solution_origin_t::RINS: return "rins";
+    case mip_solution_origin_t::PRESOLVE:
+      return "presolve";
+      // deliberately no default: -Werror then flags any enumerator missing a case
+  }
+  return "unknown";
+}
+
+using mip_solution_callback_info_t = cuOptMIPSolutionCallbackInfo;
+
+// get_solution_ext was added to support passing additional information to the get_solution callback
+// without inducing a breaking ABI change
+enum class base_solution_callback_type { GET_SOLUTION, GET_SOLUTION_EXT, SET_SOLUTION };
 
 class base_solution_callback_t : public Callback {
  public:
@@ -55,6 +101,19 @@ class get_solution_callback_t : public base_solution_callback_t {
   }
 };
 
+class get_solution_callback_ext_t : public base_solution_callback_t {
+ public:
+  virtual void get_solution(void* data,
+                            void* objective_value,
+                            void* solution_bound,
+                            const mip_solution_callback_info_t* callback_info,
+                            void* user_data) = 0;
+  base_solution_callback_type get_type() const override
+  {
+    return base_solution_callback_type::GET_SOLUTION_EXT;
+  }
+};
+
 class set_solution_callback_t : public base_solution_callback_t {
  public:
   virtual void set_solution(void* data,
diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu
index 4da66abe77..902e691e64 100644
--- a/cpp/src/barrier/barrier.cu
+++ b/cpp/src/barrier/barrier.cu
@@ -40,7 +40,9 @@
 #include <raft/linalg/dot.cuh>
 
 #include <thrust/iterator/permutation_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
 
 namespace cuopt::linear_programming::dual_simplex {
 
diff --git a/cpp/src/barrier/iterative_refinement.hpp b/cpp/src/barrier/iterative_refinement.hpp
index d37760cd07..69e72d66bc 100644
--- a/cpp/src/barrier/iterative_refinement.hpp
+++ b/cpp/src/barrier/iterative_refinement.hpp
@@ -13,6 +13,7 @@
 #include <dual_simplex/vector_math.hpp>
 
 #include <thrust/execution_policy.h>
+#include <thrust/extrema.h>
 #include <thrust/fill.h>
 #include <thrust/inner_product.h>
 #include <thrust/reduce.h>
diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 33a2d983c9..631edcbc84 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -5,9 +5,12 @@
  */
 /* clang-format on */
 
+#include <utilities/determinism_log.hpp>
+
 #include <branch_and_bound/branch_and_bound.hpp>
 #include <branch_and_bound/mip_node.hpp>
 #include <branch_and_bound/pseudo_costs.hpp>
+#include <mip_heuristics/mip_constants.hpp>
 
 #include <cuts/cuts.hpp>
 #include <mip_heuristics/presolve/conflict_graph/clique_table.cuh>
@@ -25,6 +28,7 @@
 
 #include <raft/core/nvtx.hpp>
 #include <utilities/hashing.hpp>
+#include <utilities/scope_guard.hpp>
 
 #include <omp.h>
 
@@ -35,13 +39,20 @@
 #include <deque>
 #include <future>
 #include <limits>
-#include <map>
 #include <optional>
 #include <string>
 #include <thread>
-#include <unordered_map>
 #include <vector>
 
+// enable to activate detailed determinism logs
+#if 0
+#undef CUOPT_DETERMINISM_LOG
+#define CUOPT_DETERMINISM_LOG(logger, ...) \
+  do {                                     \
+    logger.printf(__VA_ARGS__);            \
+  } while (0)
+#endif
+
 namespace cuopt::linear_programming::dual_simplex {
 
 namespace {
@@ -270,6 +281,22 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   dualize_info_t<i_t, f_t> dualize_info;
   convert_user_problem(original_problem_, settings_, original_lp_, new_slacks_, dualize_info);
   full_variable_types(original_problem_, original_lp_, var_types_);
+  CUOPT_DETERMINISM_LOG(
+    settings_.log,
+    "Deterministic LP init state: rows=%d cols=%d nnz=%zu slacks=%zu slack_hash=0x%x "
+    "rhs_hash=0x%x lower_hash=0x%x upper_hash=0x%x Acol_hash=0x%x Arow_hash=0x%x "
+    "Aval_hash=0x%x\n",
+    original_lp_.num_rows,
+    original_lp_.num_cols,
+    original_lp_.A.x.size(),
+    new_slacks_.size(),
+    detail::compute_hash(new_slacks_),
+    detail::compute_hash(original_lp_.rhs),
+    detail::compute_hash(original_lp_.lower),
+    detail::compute_hash(original_lp_.upper),
+    detail::compute_hash(original_lp_.A.col_start),
+    detail::compute_hash(original_lp_.A.i),
+    detail::compute_hash(original_lp_.A.x));
 
   // Check slack
 #ifdef CHECK_SLACKS
@@ -320,19 +347,30 @@ void branch_and_bound_t<i_t, f_t>::set_initial_upper_bound(f_t bound)
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::report_heuristic(f_t obj)
+void branch_and_bound_t<i_t, f_t>::report_heuristic(f_t obj, double work_time)
 {
   if (is_running_) {
     f_t user_obj         = compute_user_objective(original_lp_, obj);
     f_t user_lower       = compute_user_objective(original_lp_, get_lower_bound());
     std::string user_gap = user_mip_gap<i_t, f_t>(original_lp_, obj, get_lower_bound());
-
-    settings_.log.printf(
-      "H                            %+13.6e    %+10.6e                               %s %9.2f\n",
-      user_obj,
-      user_lower,
-      user_gap.c_str(),
-      toc(exploration_stats_.start_time));
+    if (settings_.deterministic) {
+      const double reported_work = work_time >= 0.0 ? work_time : work_unit_context_.current_work();
+      settings_.log.printf(
+        "H                            %+13.6e    %+10.6e                               %s "
+        "%9.2f %9.2f\n",
+        user_obj,
+        user_lower,
+        user_gap.c_str(),
+        reported_work,
+        toc(exploration_stats_.start_time));
+    } else {
+      settings_.log.printf(
+        "H                            %+13.6e    %+10.6e                               %s %9.2f\n",
+        user_obj,
+        user_lower,
+        user_gap.c_str(),
+        toc(exploration_stats_.start_time));
+    }
   } else {
     if (solving_root_relaxation_.load()) {
       f_t user_obj = compute_user_objective(original_lp_, obj);
@@ -461,8 +499,11 @@ void branch_and_bound_t<i_t, f_t>::update_user_bound(f_t lower_bound)
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::set_new_solution(const std::vector<f_t>& solution)
+void branch_and_bound_t<i_t, f_t>::set_new_solution(const std::vector<f_t>& solution,
+                                                    cuopt::internals::mip_solution_origin_t origin)
 {
+  cuopt_assert(!settings_.deterministic, "set_new_solution is for opportunistic B&B only");
+
   mutex_original_lp_.lock();
   if (solution.size() != original_problem_.num_cols) {
     settings_.log.printf(
@@ -513,51 +554,91 @@ void branch_and_bound_t<i_t, f_t>::set_new_solution(const std::vector<f_t>& solu
   if (is_feasible) { report_heuristic(obj); }
   if (attempt_repair) {
     mutex_repair_.lock();
-    repair_queue_.push_back(solution);
+    repair_queue_.push_back({solution, origin});
     mutex_repair_.unlock();
   }
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::queue_external_solution_deterministic(
-  const std::vector<f_t>& solution, double work_unit_ts)
+void branch_and_bound_t<i_t, f_t>::emit_solution_callback(
+  std::vector<f_t>& original_x,
+  f_t objective,
+  cuopt::internals::mip_solution_origin_t origin,
+  double work_timestamp)
+{
+  cuopt_assert(!settings_.deterministic || work_timestamp >= 0.0,
+               "work_timestamp must not be negative in deterministic mode");
+  // Nothing to publish when no incumbent callback is registered.
+  if (settings_.new_incumbent_callback == nullptr) { return; }
+  settings_.log.debug("Publishing incumbent: obj=%g wut=%.6f origin=%s\n",
+                      compute_user_objective(original_lp_, objective),
+                      work_timestamp,
+                      cuopt::internals::mip_solution_origin_to_string(origin));
+  cuopt::internals::mip_solution_callback_info_t info{};
+  info.origin         = static_cast<uint32_t>(origin);
+  info.work_timestamp = work_timestamp;
+  settings_.new_incumbent_callback(original_x, objective, info, work_timestamp);
+}
+
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::emit_solution_callback_from_crushed(
+  const std::vector<f_t>& crushed_solution,
+  f_t objective,
+  cuopt::internals::mip_solution_origin_t origin,
+  double work_timestamp)
 {
-  // In deterministic mode, queue the solution to be processed at the correct work unit timestamp
-  // This ensures deterministic ordering of solution events
+  if (settings_.new_incumbent_callback == nullptr) { return; }
+  std::vector<f_t> original_x;
+  uncrush_primal_solution(original_problem_, original_lp_, crushed_solution, original_x);
+  emit_solution_callback(original_x, objective, origin, work_timestamp);
+}
 
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::queue_external_solution_deterministic(
+  const std::vector<f_t>& solution,
+  f_t user_objective,
+  double work_unit_ts,
+  cuopt::internals::mip_solution_origin_t origin)
+{
   if (solution.size() != original_problem_.num_cols) {
     settings_.log.printf(
       "Solution size mismatch %ld %d\n", solution.size(), original_problem_.num_cols);
     return;
   }
+  settings_.log.printf(
+    "Queueing deterministic external incumbent: obj=%g heur_wut=%.3f bnb_wut=%.3f origin=%s "
+    "hash=0x%x\n",
+    user_objective,
+    work_unit_ts,
+    work_unit_context_.current_work(),
+    cuopt::internals::mip_solution_origin_to_string(origin),
+    detail::compute_hash(solution));
 
   mutex_original_lp_.lock();
-  std::vector<f_t> crushed_solution;
-  crush_primal_solution<i_t, f_t>(
-    original_problem_, original_lp_, solution, new_slacks_, crushed_solution);
-  f_t obj = compute_objective(original_lp_, crushed_solution);
-
-  // Validate solution before queueing
-  f_t primal_err;
-  f_t bound_err;
-  i_t num_fractional;
-  bool is_feasible = check_guess(
-    original_lp_, settings_, var_types_, crushed_solution, primal_err, bound_err, num_fractional);
+  CUOPT_DETERMINISM_LOG(
+    settings_.log,
+    "Deterministic external crush ctx: wut=%.6f lp_rows=%d lp_cols=%d lp_nnz=%zu "
+    "active_cut_rows=%d "
+    "slacks=%zu slack_hash=0x%x rhs_hash=0x%x lower_hash=0x%x upper_hash=0x%x "
+    "Acol_hash=0x%x Arow_hash=0x%x Aval_hash=0x%x\n",
+    work_unit_ts,
+    original_lp_.num_rows,
+    original_lp_.num_cols,
+    original_lp_.A.x.size(),
+    std::max((i_t)0, original_lp_.num_rows - original_problem_.num_rows),
+    new_slacks_.size(),
+    detail::compute_hash(new_slacks_),
+    detail::compute_hash(original_lp_.rhs),
+    detail::compute_hash(original_lp_.lower),
+    detail::compute_hash(original_lp_.upper),
+    detail::compute_hash(original_lp_.A.col_start),
+    detail::compute_hash(original_lp_.A.i),
+    detail::compute_hash(original_lp_.A.x));
   mutex_original_lp_.unlock();
 
-  if (!is_feasible) {
-    // Queue the uncrushed solution for repair; it will be crushed at
-    // consumption time so that the crush reflects the current LP state
-    // (which may have gained slack columns from cuts added after this point).
-    mutex_repair_.lock();
-    repair_queue_.push_back(solution);
-    mutex_repair_.unlock();
-    return;
-  }
-
-  // Queue the solution with its work unit timestamp
   mutex_heuristic_queue_.lock();
-  heuristic_solution_queue_.push_back({obj, std::move(crushed_solution), 0, -1, 0, work_unit_ts});
+  // Queue the uncrushed solution with its origin and heuristic work-unit timestamp
+  heuristic_solution_queue_.push_back({solution, user_objective, work_unit_ts, origin});
   mutex_heuristic_queue_.unlock();
 }
 
@@ -620,6 +701,14 @@ bool branch_and_bound_t<i_t, f_t>::repair_solution(const std::vector<f_t>& edge_
         num_fractional,
         repaired_obj);
     }
+  } else {
+    settings_.log.printf(
+      "Repair LP failed: status=%s iters=%d time=%.3fs time_limit=%.3f cut_off=%e\n",
+      dual::status_to_string(lp_status).c_str(),
+      iter,
+      toc(lp_start_time),
+      lp_settings.time_limit,
+      lp_settings.cut_off);
   }
 
   return feasible;
@@ -630,7 +719,7 @@ void branch_and_bound_t<i_t, f_t>::repair_heuristic_solutions()
 {
   raft::common::nvtx::range scope("BB::repair_heuristics");
   // Check if there are any solutions to repair
-  std::vector<std::vector<f_t>> to_repair;
+  std::vector<queued_repair_solution_t> to_repair;
   mutex_repair_.lock();
   if (repair_queue_.size() > 0) {
     to_repair = repair_queue_;
@@ -640,7 +729,8 @@ void branch_and_bound_t<i_t, f_t>::repair_heuristic_solutions()
 
   if (to_repair.size() > 0) {
     settings_.log.debug("Attempting to repair %ld injected solutions\n", to_repair.size());
-    for (const std::vector<f_t>& uncrushed_solution : to_repair) {
+    for (const auto& queued_solution : to_repair) {
+      const std::vector<f_t>& uncrushed_solution = queued_solution.solution;
       std::vector<f_t> crushed_solution;
       crush_primal_solution<i_t, f_t>(
         original_problem_, original_lp_, uncrushed_solution, new_slacks_, crushed_solution);
@@ -652,15 +742,23 @@ void branch_and_bound_t<i_t, f_t>::repair_heuristic_solutions()
         mutex_upper_.lock();
 
         if (improves_incumbent(repaired_obj)) {
-          upper_bound_ = std::min(upper_bound_.load(), repaired_obj);
+          const f_t previous_upper = upper_bound_;
+          upper_bound_             = std::min(upper_bound_.load(), repaired_obj);
           incumbent_.set_incumbent_solution(repaired_obj, repaired_solution);
-          report_heuristic(repaired_obj);
-
-          if (settings_.solution_callback != nullptr) {
-            std::vector<f_t> original_x;
-            uncrush_primal_solution(original_problem_, original_lp_, repaired_solution, original_x);
-            settings_.solution_callback(original_x, repaired_obj);
-          }
+          CUOPT_DETERMINISM_LOG(
+            settings_.log,
+            "Deterministic B&B incumbent update: source=repair_queue prev_upper=%.16e "
+            "new_upper=%.16e obj=%.16e hash=0x%x\n",
+            previous_upper,
+            upper_bound_.load(),
+            repaired_obj,
+            detail::compute_hash(repaired_solution));
+          report_heuristic(repaired_obj, queued_solution.work_timestamp);
+
+          emit_solution_callback_from_crushed(repaired_solution,
+                                              repaired_obj,
+                                              queued_solution.origin,
+                                              queued_solution.work_timestamp);
         }
 
         mutex_upper_.unlock();
@@ -690,14 +788,47 @@ void branch_and_bound_t<i_t, f_t>::set_solution_at_root(mip_solution_t<i_t, f_t>
                        compute_user_objective(original_lp_, root_objective_),
                        toc(exploration_stats_.start_time));
 
-  if (settings_.solution_callback != nullptr) {
-    settings_.solution_callback(solution.x, solution.objective);
-  }
+  emit_solution_callback(solution.x,
+                         solution.objective,
+                         cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE,
+                         work_unit_context_.current_work());
   if (settings_.heuristic_preemption_callback != nullptr) {
     settings_.heuristic_preemption_callback();
   }
 }
 
+// Crushes a queued external solution into solver space and runs check_guess on it; when the
+// check fails, repair_solution is tried. Returns {accepted, objective, solver-space vector}.
+template <typename i_t, typename f_t>
+std::tuple<bool, f_t, std::vector<f_t>> branch_and_bound_t<i_t, f_t>::retire_queued_solution(
+  const queued_external_solution_t& queued_solution)
+{
+  std::vector<f_t> solver_space_x;
+  f_t primal_residual;
+  f_t bound_residual;
+  i_t fractional_count;
+  // The crush, the objective evaluation and the check all happen under mutex_original_lp_.
+  mutex_original_lp_.lock();
+  crush_primal_solution<i_t, f_t>(
+    original_problem_, original_lp_, queued_solution.solution, new_slacks_, solver_space_x);
+  f_t direct_obj = compute_objective(original_lp_, solver_space_x);
+  bool accepted  = check_guess(original_lp_, settings_, var_types_, solver_space_x,
+                               primal_residual, bound_residual, fractional_count);
+  mutex_original_lp_.unlock();
+  if (accepted) { return {true, direct_obj, std::move(solver_space_x)}; }
+  // Not accepted as given: repair right away (deterministic mode has no separate repair queue).
+  f_t fixed_obj;
+  std::vector<f_t> fixed_x;
+  if (repair_solution(edge_norms_, solver_space_x, fixed_obj, fixed_x)) {
+    return {true, fixed_obj, std::move(fixed_x)};
+  }
+  CUOPT_DETERMINISM_LOG(settings_.log,
+                        "Deterministic repair FAILED: wut=%.3f origin=%s\n",
+                        queued_solution.work_timestamp,
+                        cuopt::internals::mip_solution_origin_to_string(queued_solution.origin));
+  return {false, {}, {}};
+}
+
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t, f_t>& solution,
                                                       f_t lower_bound)
@@ -767,6 +898,53 @@ void branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t, f_t>&
     }
   }
 
+  // Drain any pending heuristic solutions that B&B never got to retire during exploration
+  // (e.g., root solve consumed the entire budget).
+  if (settings_.deterministic) {
+    const double current_work = work_unit_context_.current_work();  // cutoff used in the loop
+    mutex_heuristic_queue_.lock();
+    std::vector<queued_external_solution_t> pending;
+    pending.swap(heuristic_solution_queue_);  // take the whole queue; lock covers only the swap
+    mutex_heuristic_queue_.unlock();
+    // Sort key: work_timestamp, then user_objective, then origin, then the solution vector.
+    std::sort(pending.begin(),
+              pending.end(),
+              [](const queued_external_solution_t& a, const queued_external_solution_t& b) {
+                if (a.work_timestamp != b.work_timestamp) {
+                  return a.work_timestamp < b.work_timestamp;
+                }
+                if (a.user_objective != b.user_objective) {
+                  return a.user_objective < b.user_objective;
+                }
+                if (a.origin != b.origin) { return a.origin < b.origin; }
+                return a.solution < b.solution;
+              });
+    // NOTE(review): the updates below run without mutex_upper_, unlike add_feasible_solution.
+    for (const auto& queued_solution : pending) {
+      // Entries stamped after current_work are dropped: the queue was already emptied above.
+      if (queued_solution.work_timestamp > current_work) { continue; }
+      auto [feasible, obj, crushed] = retire_queued_solution(queued_solution);
+      if (feasible && improves_incumbent(obj)) {
+        upper_bound_ = std::min(upper_bound_.load(), obj);
+        incumbent_.set_incumbent_solution(obj, crushed);
+        settings_.log.debug(
+          "Late-retired heuristic incumbent: obj=%.6e wut=%.3f origin=%s\n",
+          compute_user_objective(original_lp_, obj),
+          queued_solution.work_timestamp,
+          cuopt::internals::mip_solution_origin_to_string(queued_solution.origin));
+        emit_solution_callback_from_crushed(
+          crushed, obj, queued_solution.origin, queued_solution.work_timestamp);
+      }
+    }
+    size_t n_drained = pending.size();  // every dequeued entry, including skipped/rejected ones
+    CUOPT_DETERMINISM_LOG(
+      settings_.log,
+      "Post-drain: user_upper=%.16e has_incumbent=%d drained=%zu user_lower_arg=%.16e\n",
+      compute_user_objective(original_lp_, upper_bound_.load()),
+      (int)incumbent_.has_incumbent,
+      n_drained,
+      compute_user_objective(original_lp_, lower_bound));
+  }
+
   if (has_solver_space_incumbent()) {
     uncrush_primal_solution(original_problem_, original_lp_, incumbent_.x, solution.x);
     solution.objective = incumbent_.objective;
@@ -790,16 +968,29 @@ void branch_and_bound_t<i_t, f_t>::add_feasible_solution(f_t leaf_objective,
 
   mutex_upper_.lock();
   if (improves_incumbent(leaf_objective)) {
+    const f_t previous_upper = upper_bound_;
     incumbent_.set_incumbent_solution(leaf_objective, leaf_solution);
     upper_bound_ = std::min(upper_bound_.load(), leaf_objective);
+    CUOPT_DETERMINISM_LOG(
+      settings_.log,
+      "Deterministic B&B incumbent update: source=leaf prev_upper=%.16e new_upper=%.16e "
+      "obj=%.16e hash=0x%x depth=%d worker_type=%d\n",
+      previous_upper,
+      upper_bound_.load(),
+      leaf_objective,
+      detail::compute_hash(leaf_solution),
+      leaf_depth,
+      (int)thread_type);
     report(feasible_solution_symbol(thread_type), leaf_objective, get_lower_bound(), leaf_depth, 0);
     send_solution = true;
   }
 
-  if (send_solution && settings_.solution_callback != nullptr) {
-    std::vector<f_t> original_x;
-    uncrush_primal_solution(original_problem_, original_lp_, incumbent_.x, original_x);
-    settings_.solution_callback(original_x, leaf_objective);
+  if (send_solution) {
+    emit_solution_callback_from_crushed(
+      incumbent_.x,
+      leaf_objective,
+      cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE,
+      work_unit_context_.current_work());
   }
   mutex_upper_.unlock();
 }
@@ -936,6 +1127,23 @@ struct nondeterministic_policy_t : tree_update_policy_t<i_t, f_t> {
                                f_t obj,
                                const std::vector<f_t>& x) override
   {
+    f_t primal_err;
+    f_t bound_err;
+    i_t num_fractional;
+    bool cg = check_guess(
+      bnb.original_lp_, bnb.settings_, bnb.var_types_, x, primal_err, bound_err, num_fractional);
+    if (!cg) {
+      bnb.settings_.log.printf(
+        "Rejecting infeasible integer solution: node=%d depth=%d "
+        "obj=%.6e primal_err=%.6e bound_err=%.6e fractional=%d\n",
+        node->node_id,
+        node->depth,
+        obj,
+        primal_err,
+        bound_err,
+        num_fractional);
+      return;
+    }
     bnb.add_feasible_solution(obj, x, node->depth, worker->search_strategy);
   }
 
@@ -1008,8 +1216,11 @@ struct deterministic_policy_base_t : tree_update_policy_t<i_t, f_t> {
                    ? node->fractional_val - std::floor(node->fractional_val)
                    : std::ceil(node->fractional_val) - node->fractional_val;
     if (frac > 1e-10) {
-      worker.pc_snapshot.queue_update(
-        node->branch_var, node->branch_dir, change / frac, worker.clock, worker.worker_id);
+      worker.pc_snapshot.queue_update(node->branch_var,
+                                      node->branch_dir,
+                                      change / frac,
+                                      worker.work_context.current_work(),
+                                      worker.worker_id);
     }
   }
 
@@ -1029,17 +1240,94 @@ struct deterministic_bfs_policy_t
                                const std::vector<f_t>& x) override
   {
     if (obj < this->worker.local_upper_bound) {
+      f_t primal_err;      // passed to check_guess; printed only when the candidate is rejected
+      f_t bound_err;       // passed to check_guess; printed only when the candidate is rejected
+      i_t num_fractional;  // passed to check_guess; printed only when the candidate is rejected
+      bool cg = check_guess(this->bnb.original_lp_,
+                            this->bnb.settings_,
+                            this->bnb.var_types_,
+                            x,
+                            primal_err,
+                            bound_err,
+                            num_fractional);
+      if (!cg) {
+        this->bnb.settings_.log.printf(
+          "Rejecting infeasible integer solution: worker=%d node=%d depth=%d "
+          "obj=%.6e primal_err=%.6e bound_err=%.6e fractional=%d\n",
+          this->worker.worker_id,
+          node->creation_seq,
+          node->depth,
+          obj,
+          primal_err,
+          bound_err,
+          num_fractional);
+        return;  // rejected: local_upper_bound stays as it was and nothing is queued
+      }
       this->worker.local_upper_bound = obj;
+      CUOPT_DETERMINISM_LOG(
+        this->bnb.settings_.log,  // qualified with this-> like every other bnb access here
+        "BFS integer solution queued: worker=%d clock=%.6f ctx_work=%.6f obj=%.6e depth=%d\n",
+        this->worker.worker_id,
+        this->worker.work_context.current_work(),
+        this->worker.work_context.global_work_units_elapsed,
+        obj,
+        node->depth);
       this->worker.integer_solutions.push_back(
-        {obj, x, node->depth, this->worker.worker_id, this->worker.next_solution_seq++});
+        {obj,
+         x,
+         node->depth,
+         this->worker.worker_id,
+         this->worker.next_solution_seq++,
+         this->worker.work_context.current_work(),
+         cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE});
     }
   }
 
-  branch_variable_t<i_t> select_branch_variable(mip_node_t<i_t, f_t>*,
+  branch_variable_t<i_t> select_branch_variable(mip_node_t<i_t, f_t>* node,
                                                 const std::vector<i_t>& fractional,
                                                 const std::vector<f_t>& x) override
   {
-    i_t var  = this->worker.pc_snapshot.variable_selection(fractional, x);
+    i_t var;  // branching variable index; assigned by exactly one of the two branches below
+    if (this->bnb.settings_.reliability_branching != 0 &&
+        this->worker.nodes_explored_snapshot > 0) {
+      auto& snap = this->worker.pc_snapshot;  // shorthand for this worker's pc_snapshot
+
+      sb_update_callback_t<i_t, f_t> on_sb_update = [&](
+                                                      i_t j, rounding_direction_t dir, f_t delta) {
+        snap.record_update(
+          j, dir, delta, this->worker.work_context.current_work(), this->worker.worker_id);
+      };  // forwards (j, dir, delta) to snap with this worker's current work value and id
+
+      var = reliable_variable_selection_core(node,
+                                             fractional,
+                                             x,
+                                             this->bnb.settings_,
+                                             this->bnb.var_types_,
+                                             this->worker.leaf_problem,
+                                             this->worker.leaf_edge_norms,
+                                             this->worker.basis_factors,
+                                             this->worker.basic_list,
+                                             this->worker.nonbasic_list,
+                                             snap.sum_down_.data(),  // snapshot arrays by pointer
+                                             snap.sum_up_.data(),
+                                             snap.num_down_.data(),
+                                             snap.num_up_.data(),
+                                             snap.n_vars(),
+                                             snap.strong_branching_lp_iter_,
+                                             this->worker.local_upper_bound,
+                                             (int64_t)this->worker.total_lp_iters_snapshot,
+                                             (int64_t)this->worker.nodes_explored_snapshot,
+                                             this->bnb.exploration_stats_.start_time,
+                                             this->bnb.pc_.reliability_branching_settings,
+                                             1,        // NOTE(review): meaning not visible here
+                                             nullptr,  // NOTE(review): confirm against the callee
+                                             nullptr,  // NOTE(review): confirm against the callee
+                                             &this->worker.rng,
+                                             &this->worker.work_context,
+                                             on_sb_update);
+    } else {
+      var = this->worker.pc_snapshot.variable_selection(fractional, x);  // snapshot-only choice
+    }
     auto dir = martin_criteria(x[var], this->bnb.root_relax_soln_.x[var]);
     return {var, dir};
   }
@@ -1072,9 +1360,12 @@ struct deterministic_bfs_policy_t
         this->worker.enqueue_children_for_plunge(node->get_down_child(), node->get_up_child(), dir);
         break;
       case node_status_t::NUMERICAL: this->worker.record_numerical(node); break;
+      case node_status_t::PENDING: this->worker.plunge_stack.push_back(node); break;
       default: break;
     }
-    if (status != node_status_t::HAS_CHILDREN) { this->worker.recompute_bounds_and_basis = true; }
+    if (status != node_status_t::HAS_CHILDREN && status != node_status_t::PENDING) {
+      this->worker.recompute_bounds_and_basis = true;
+    }
   }
 
   void on_numerical_issue(mip_node_t<i_t, f_t>* node) override
@@ -1105,6 +1396,31 @@ struct deterministic_diving_policy_t
                                const std::vector<f_t>& x) override
   {
     if (obj < this->worker.local_upper_bound) {
+      f_t primal_err;      // passed to check_guess; printed only when the candidate is rejected
+      f_t bound_err;       // passed to check_guess; printed only when the candidate is rejected
+      i_t num_fractional;  // passed to check_guess; printed only when the candidate is rejected
+      bool cg = check_guess(this->bnb.original_lp_,
+                            this->bnb.settings_,
+                            this->bnb.var_types_,
+                            x,
+                            primal_err,
+                            bound_err,
+                            num_fractional);
+      if (!cg) {
+        this->bnb.settings_.log.printf(
+          "Rejecting infeasible diving integer solution: worker=%d node=%d depth=%d "
+          "obj=%.6e primal_err=%.6e bound_err=%.6e fractional=%d\n",
+          this->worker.worker_id,
+          node->creation_seq,
+          node->depth,
+          obj,
+          primal_err,
+          bound_err,
+          num_fractional);
+        return;  // rejected: local_upper_bound stays as it was and nothing is queued
+      }
+      // Accepted: lower the worker-local bound first, then queue the solution. The enclosing
+      // `obj < local_upper_bound` test therefore sees the new bound on the next call.
       this->worker.local_upper_bound = obj;
       this->worker.queue_integer_solution(obj, x, node->depth);
     }
@@ -2017,6 +2333,18 @@ template <typename i_t, typename f_t>
 mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solution)
 {
   raft::common::nvtx::range scope("BB::solve");
+  auto exploration_signal_guard   = cuopt::scope_guard([this]() {
+    if (!exploration_started_.load()) {
+      std::lock_guard<std::mutex> lock(exploration_started_mutex_);
+      exploration_started_ = true;
+      exploration_started_cv_.notify_all();
+    }
+  });
+  auto heuristic_preemption_guard = cuopt::scope_guard([this]() {
+    if (settings_.heuristic_preemption_callback != nullptr) {
+      settings_.heuristic_preemption_callback();
+    }
+  });
 
   logger_t log;
   log.log                             = false;
@@ -2028,6 +2356,25 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   exploration_stats_.nodes_explored   = 0;
   original_lp_.A.to_compressed_row(Arow_);
 
+  work_unit_scheduler_t* saved_scheduler = work_unit_context_.scheduler;
+  if (settings_.deterministic) {
+    work_unit_context_.deterministic = true;
+    cuopt_assert(settings_.bb_work_unit_scale > 0.0, "B&B work-unit scale must be positive");
+    if (settings_.gpu_heur_wait_for_exploration) {
+      // Scale=0 during pre-exploration: root LP/cuts/SB don't advance the deterministic timeline.
+      // GPU heuristics start after exploration, so both timelines begin at 0 together.
+      work_unit_context_.work_unit_scale = 0.0;
+    } else {
+      // GPU heuristics race with B&B pre-exploration, so B&B work must advance normally.
+      work_unit_context_.work_unit_scale = BB_BASE_WORK_SCALE * settings_.bb_work_unit_scale;
+    }
+
+    // Detach the scheduler during the serial root/cuts/SB phase.
+    // record_work_sync_on_horizon still accumulates global_work_units_elapsed,
+    // but avoids scheduler->on_work_recorded
+    work_unit_context_.scheduler = nullptr;
+  }
+
   settings_.log.printf("Reduced cost strengthening enabled: %d\n",
                        settings_.reduced_cost_strengthening);
 
@@ -2047,14 +2394,15 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
       const f_t computed_obj = compute_objective(original_lp_, crushed_guess);
       mutex_upper_.lock();
       incumbent_.set_incumbent_solution(computed_obj, crushed_guess);
-      upper_bound_ = computed_obj;
+      upper_bound_ = std::min(upper_bound_.load(), computed_obj);
       mutex_upper_.unlock();
     }
   }
 
   root_relax_soln_.resize(original_lp_.num_rows, original_lp_.num_cols);
 
-  if (settings_.clique_cuts != 0 && clique_table_ == nullptr) {
+  // TODO: ensure clique tables work well w/ determinism
+  if (settings_.clique_cuts != 0 && clique_table_ == nullptr && !settings_.deterministic) {
     signal_extend_cliques_.store(false, std::memory_order_release);
     typename ::cuopt::linear_programming::mip_solver_settings_t<i_t, f_t>::tolerances_t
       tolerances_for_clique{};
@@ -2104,7 +2452,8 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                                                            basic_list,
                                                            nonbasic_list,
                                                            root_vstatus_,
-                                                           edge_norms_);
+                                                           edge_norms_,
+                                                           &work_unit_context_);
   } else {
     settings_.log.printf("\nSolving LP root relaxation in concurrent mode\n");
     root_status = solve_root_relaxation(lp_settings,
@@ -2118,6 +2467,10 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   solving_root_relaxation_               = false;
   exploration_stats_.total_lp_iters      = root_relax_soln_.iterations;
   exploration_stats_.total_lp_solve_time = toc(exploration_stats_.start_time);
+  CUOPT_DETERMINISM_LOG(settings_.log,
+                        "Post-root-LP work: %.16e iters=%d\n",
+                        work_unit_context_.current_work(),
+                        root_relax_soln_.iterations);
 
   auto finish_clique_thread = [this]() {
     if (clique_table_future_.valid()) {
@@ -2163,7 +2516,18 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   assert(root_vstatus_.size() == original_lp_.num_cols);
   set_uninitialized_steepest_edge_norms<i_t, f_t>(original_lp_, basic_list, edge_norms_);
 
-  root_objective_ = compute_objective(original_lp_, root_relax_soln_.x);
+  {
+    const f_t previous_root_objective = root_objective_;
+    root_objective_                   = compute_objective(original_lp_, root_relax_soln_.x);
+    CUOPT_DETERMINISM_LOG(
+      settings_.log,
+      "Deterministic root objective assign: source=post_root_solve old=%.16e new=%.16e "
+      "x_hash=0x%x obj_hash=0x%x\n",
+      previous_root_objective,
+      root_objective_,
+      detail::compute_hash(root_relax_soln_.x),
+      detail::compute_hash(original_lp_.objective));
+  }
 
   if (settings_.set_simplex_solution_callback != nullptr) {
     std::vector<f_t> original_x;
@@ -2395,7 +2759,8 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                                                                   nonbasic_list,
                                                                   root_relax_soln_,
                                                                   iter,
-                                                                  edge_norms_);
+                                                                  edge_norms_,
+                                                                  &work_unit_context_);
       exploration_stats_.total_lp_iters += iter;
       f_t dual_phase2_time = toc(dual_phase2_start_time);
       if (dual_phase2_time > 1.0) {
@@ -2406,6 +2771,11 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
         set_final_solution(solution, root_objective_);
         return solver_status_;
       }
+      if (cut_status == dual::status_t::WORK_LIMIT) {
+        solver_status_ = mip_status_t::WORK_LIMIT;
+        set_final_solution(solution, root_objective_);
+        return solver_status_;
+      }
 
       if (cut_status != dual::status_t::OPTIMAL) {
         settings_.log.printf("Numerical issue at root node. Resolving from scratch\n");
@@ -2418,12 +2788,25 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                                                    basic_list,
                                                    nonbasic_list,
                                                    root_vstatus_,
-                                                   edge_norms_);
+                                                   edge_norms_,
+                                                   &work_unit_context_);
         if (scratch_status == lp_status_t::OPTIMAL) {
           // We recovered
           cut_status = convert_lp_status_to_dual_status(scratch_status);
           exploration_stats_.total_lp_iters += root_relax_soln_.iterations;
-          root_objective_ = compute_objective(original_lp_, root_relax_soln_.x);
+          {
+            const f_t previous_root_objective = root_objective_;
+            root_objective_                   = compute_objective(original_lp_, root_relax_soln_.x);
+            CUOPT_DETERMINISM_LOG(
+              settings_.log,
+              "Deterministic root objective assign: source=cut_lp_scratch old=%.16e new=%.16e "
+              "pass=%d x_hash=0x%x obj_hash=0x%x\n",
+              previous_root_objective,
+              root_objective_,
+              cut_pass,
+              detail::compute_hash(root_relax_soln_.x),
+              detail::compute_hash(original_lp_.objective));
+          }
         } else {
           settings_.log.printf("Cut status %s\n", dual::status_to_string(cut_status).c_str());
 #ifdef WRITE_CUT_INFEASIBLE_MPS
@@ -2461,9 +2844,18 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
       num_fractional = fractional_variables(settings_, root_relax_soln_.x, var_types_, fractional);
 
       if (num_fractional == 0) {
-        upper_bound_ = root_objective_;
         mutex_upper_.lock();
+        const f_t previous_upper = upper_bound_;  // read under mutex_upper_, as in other updates
+        upper_bound_             = std::min(upper_bound_.load(), root_objective_);  // never raise
         incumbent_.set_incumbent_solution(root_objective_, root_relax_soln_.x);
+        CUOPT_DETERMINISM_LOG(
+          settings_.log,
+          "Deterministic B&B incumbent update: source=root_integral_pass prev_upper=%.16e "
+          "new_upper=%.16e obj=%.16e hash=0x%x\n",
+          previous_upper,
+          upper_bound_.load(),
+          root_objective_,
+          detail::compute_hash(root_relax_soln_.x));
         mutex_upper_.unlock();
       }
       f_t obj = upper_bound_.load();
@@ -2523,7 +2915,8 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                                basic_list,
                                nonbasic_list,
                                basis_update,
-                               pc_);
+                               pc_,
+                               &work_unit_context_);
   }
 
   if (toc(exploration_stats_.start_time) > settings_.time_limit) {
@@ -2605,6 +2998,15 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
   }
   if (settings_.deterministic) {
+    pre_exploration_work_ = work_unit_context_.current_work();
+    CUOPT_DETERMINISM_LOG(
+      settings_.log,
+      "Pre-exploration work breakdown: total=%.16e scale=%.6f deterministic=%d\n",
+      pre_exploration_work_,
+      work_unit_context_.work_unit_scale,
+      (int)work_unit_context_.deterministic);
+    work_unit_context_.scheduler       = saved_scheduler;
+    work_unit_context_.work_unit_scale = BB_BASE_WORK_SCALE * settings_.bb_work_unit_scale;
     settings_.log.printf(
       " | Explored | Unexplored |    Objective    |     Bound     | IntInf | Depth | Iter/Node "
       "|   Gap    |  Work |  Time  |\n");
@@ -2614,11 +3016,25 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
       "|   Gap    |  Time  |\n");
   }
 
+  // Signal to producers (like GPU heuristics) that pre-exploration work is finished
+  {
+    std::lock_guard<std::mutex> lock(exploration_started_mutex_);
+    exploration_started_ = true;
+  }
+  exploration_started_cv_.notify_all();
+
+  int bb_device_id = 0;
+  RAFT_CUDA_TRY(cudaGetDevice(&bb_device_id));
+
   if (settings_.deterministic) {
     run_deterministic_coordinator(Arow_);
   } else if (settings_.num_threads > 1) {
 #pragma omp parallel num_threads(settings_.num_threads)
     {
+      // Any OMP thread may end up holding the lock during horizon syncs, and thus
+      // handle publication of solutions to the callback. Uncrush to the original problem requires
+      // GPU ops so ensure all threads call cudaSetDevice at init
+      RAFT_CUDA_TRY(cudaSetDevice(bb_device_id));
 #pragma omp master
       run_scheduler();
     }
@@ -2633,6 +3049,13 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   if (deterministic_mode_enabled_) {
     lower_bound    = deterministic_compute_lower_bound();
     solver_status_ = deterministic_global_termination_status_;
+    CUOPT_DETERMINISM_LOG(
+      settings_.log,
+      "Final lower bound: user_lb=%.16e user_ub=%.16e status=%d has_incumbent=%d\n",
+      compute_user_objective(original_lp_, lower_bound),
+      compute_user_objective(original_lp_, upper_bound_.load()),
+      (int)deterministic_global_termination_status_,
+      (int)incumbent_.has_incumbent);
   } else {
     if (node_queue_.best_first_queue_size() > 0) {
       // We need to clear the queue and use the info in the search tree for the lower bound
@@ -2786,8 +3209,7 @@ void branch_and_bound_t<i_t, f_t>::run_deterministic_coordinator(const csr_matri
 
   deterministic_horizon_step_ = 0.50;
 
-  // Compute worker counts using the same formula as reliability-branching scheduler
-  const i_t num_workers = 2 * settings_.num_threads;
+  const i_t num_workers = settings_.num_threads;
   std::vector<search_strategy_t> search_strategies =
     get_search_strategies(settings_.diving_settings);
   std::array<i_t, num_search_strategies> max_num_workers =
@@ -2800,7 +3222,7 @@ void branch_and_bound_t<i_t, f_t>::run_deterministic_coordinator(const csr_matri
   }
 
   deterministic_mode_enabled_              = true;
-  deterministic_current_horizon_           = deterministic_horizon_step_;
+  deterministic_current_horizon_           = pre_exploration_work_ + deterministic_horizon_step_;
   deterministic_horizon_number_            = 0;
   deterministic_global_termination_status_ = mip_status_t::UNSET;
 
@@ -2828,14 +3250,17 @@ void branch_and_bound_t<i_t, f_t>::run_deterministic_coordinator(const csr_matri
     }
   }
 
-  deterministic_scheduler_ = std::make_unique<work_unit_scheduler_t>(deterministic_horizon_step_);
+  deterministic_scheduler_ =
+    std::make_unique<work_unit_scheduler_t>(deterministic_horizon_step_, pre_exploration_work_);
 
   scoped_context_registrations_t context_registrations(*deterministic_scheduler_);
   for (auto& worker : *deterministic_workers_) {
+    worker.work_context.set_current_work(pre_exploration_work_, false);
     context_registrations.add(worker.work_context);
   }
   if (deterministic_diving_workers_) {
     for (auto& worker : *deterministic_diving_workers_) {
+      worker.work_context.set_current_work(pre_exploration_work_, false);
       context_registrations.add(worker.work_context);
     }
   }
@@ -2843,8 +3268,9 @@ void branch_and_bound_t<i_t, f_t>::run_deterministic_coordinator(const csr_matri
   int actual_diving_workers =
     deterministic_diving_workers_ ? (int)deterministic_diving_workers_->size() : 0;
   settings_.log.printf(
-    "Deterministic Mode: %d BFS workers + %d diving workers, horizon step = %.2f work "
-    "units\n",
+    "Deterministic Mode: %d total threads split as %d BFS workers + %d diving workers, "
+    "horizon step = %.2f work units\n",
+    num_workers,
     num_bfs_workers,
     actual_diving_workers,
     deterministic_horizon_step_);
@@ -2868,9 +3294,12 @@ void branch_and_bound_t<i_t, f_t>::run_deterministic_coordinator(const csr_matri
   }
 
   const int total_thread_count = num_bfs_workers + num_diving_workers;
+  int coordinator_device_id    = 0;
+  RAFT_CUDA_TRY(cudaGetDevice(&coordinator_device_id));
 
 #pragma omp parallel num_threads(total_thread_count)
   {
+    RAFT_CUDA_TRY(cudaSetDevice(coordinator_device_id));
     int thread_id = omp_get_thread_num();
     if (thread_id < num_bfs_workers) {
       auto& worker          = (*deterministic_workers_)[thread_id];
@@ -2976,11 +3405,17 @@ void branch_and_bound_t<i_t, f_t>::run_deterministic_bfs_loop(
       bool is_child                     = (node->parent == worker.last_solved_node);
       worker.recompute_bounds_and_basis = !is_child;
 
-      node_status_t status    = solve_node_deterministic(worker, node, search_tree);
-      worker.last_solved_node = node;
+      node_status_t status = solve_node_deterministic(worker, node, search_tree);
+      worker.current_node  = nullptr;
 
-      worker.current_node = nullptr;
-      continue;
+      if (status == node_status_t::PENDING) {
+        // Global termination limits were hit (TIME_LIMIT/WORK_LIMIT). Node was re-enqueued by
+        // on_node_completed. Fall through to sync barrier and let the sync callback handle
+        // termination.
+      } else {
+        worker.last_solved_node = node;
+        continue;
+      }
     }
 
     // No work - advance to sync point to participate in barrier
@@ -3005,24 +3440,40 @@ void branch_and_bound_t<i_t, f_t>::deterministic_sync_callback()
   max_producer_wait_time_ = std::max(max_producer_wait_time_, wait_time);
   ++producer_wait_count_;
 
-  work_unit_context_.global_work_units_elapsed = horizon_end;
+  work_unit_context_.set_current_work(horizon_end, false);
 
-  bb_event_batch_t<i_t, f_t> all_events = deterministic_workers_->collect_and_sort_events();
+  {
+    std::string worker_clocks_str;
+    for (const auto& w : *deterministic_workers_) {
+      worker_clocks_str += std::to_string(w.worker_id) + ":" +
+                           std::to_string(w.work_context.current_work()) + "/" +
+                           std::to_string(w.integer_solutions.size()) + " ";
+    }
+    CUOPT_DETERMINISM_LOG(
+      settings_.log,
+      "Deterministic sync #%d: horizon=%.6f pre_expl=%.6f heur_q=%zu workers=[%s]\n",
+      deterministic_horizon_number_,
+      deterministic_current_horizon_,
+      pre_exploration_work_,
+      heuristic_solution_queue_.size(),
+      worker_clocks_str.c_str());
+  }
 
-  deterministic_sort_replay_events(all_events);
+  bb_event_batch_t<i_t, f_t> all_events = deterministic_workers_->collect_and_sort_events();
 
-  // deterministic_prune_worker_nodes_vs_incumbent();
+  std::vector<typename branch_and_bound_t<i_t, f_t>::deterministic_replay_solution_t>
+    replay_solutions;
+  deterministic_collect_worker_solutions(
+    *deterministic_workers_,
+    [](const deterministic_bfs_worker_pool_t<i_t, f_t>&, int) {
+      return search_strategy_t::BEST_FIRST;
+    },
+    replay_solutions);
+  deterministic_collect_diving_solutions_and_update_pseudocosts(replay_solutions);
 
-  deterministic_collect_diving_solutions_and_update_pseudocosts();
+  deterministic_sort_replay_events(all_events, replay_solutions);
 
-  for (auto& worker : *deterministic_workers_) {
-    worker.integer_solutions.clear();
-  }
-  if (deterministic_diving_workers_) {
-    for (auto& worker : *deterministic_diving_workers_) {
-      worker.integer_solutions.clear();
-    }
-  }
+  // deterministic_prune_worker_nodes_vs_incumbent();
 
   deterministic_populate_diving_heap();
 
@@ -3079,6 +3530,19 @@ void branch_and_bound_t<i_t, f_t>::deterministic_sync_callback()
   f_t abs_gap     = compute_user_abs_gap(original_lp_, upper_bound, lower_bound);
   f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
 
+  CUOPT_DETERMINISM_LOG(  // logs the bounds, gaps and remaining-work flags ahead of the gap test
+    settings_.log,
+    "Sync termination check: horizon=%.6f user_lower=%.16e user_upper=%.16e abs_gap=%.6e "
+    "rel_gap=%.6e bfs_has_work=%d diving_has_work=%d status=%d\n",
+    deterministic_current_horizon_,
+    compute_user_objective(original_lp_, lower_bound),  // bounds are passed through compute_user_objective
+    compute_user_objective(original_lp_, upper_bound),
+    abs_gap,
+    rel_gap,
+    (int)deterministic_workers_->any_has_work(),
+    deterministic_diving_workers_ ? (int)deterministic_diving_workers_->any_has_work() : -1,  // -1: no diving pool
+    (int)deterministic_global_termination_status_);
+
   if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) {
     deterministic_global_termination_status_ = mip_status_t::OPTIMAL;
   }
@@ -3167,7 +3631,12 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node_deterministic(
   simplex_solver_settings_t<i_t, f_t> lp_settings = settings_;
   lp_settings.set_log(false);
 
-  lp_settings.cut_off       = worker.local_upper_bound + settings_.dual_tol;
+  // Base is the worker's bound, rounded up (after subtracting integer_tol) for integral objectives.
+  f_t cut_off_base = worker.local_upper_bound;
+  if (original_lp_.objective_is_integral) {
+    cut_off_base = std::ceil(cut_off_base - settings_.integer_tol);
+  }
+  lp_settings.cut_off       = cut_off_base + settings_.dual_tol;
   lp_settings.inside_mip    = 2;
   lp_settings.time_limit    = remaining_time;
   lp_settings.scale_columns = false;
@@ -3199,7 +3668,7 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node_deterministic(
   std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
   i_t node_iter                                = 0;
   f_t lp_start_time                            = tic();
-  std::vector<f_t> leaf_edge_norms             = edge_norms_;
+  worker.leaf_edge_norms                       = edge_norms_;
 
   dual::status_t lp_status = dual_phase2_with_advanced_basis(2,
                                                              0,
@@ -3213,7 +3682,7 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node_deterministic(
                                                              worker.nonbasic_list,
                                                              worker.leaf_solution,
                                                              node_iter,
-                                                             leaf_edge_norms,
+                                                             worker.leaf_edge_norms,
                                                              &worker.work_context);
 
   if (lp_status == dual::status_t::NUMERICAL) {
@@ -3226,18 +3695,20 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node_deterministic(
                                                                          worker.basic_list,
                                                                          worker.nonbasic_list,
                                                                          leaf_vstatus,
-                                                                         leaf_edge_norms,
+                                                                         worker.leaf_edge_norms,
                                                                          &worker.work_context);
     lp_status                 = convert_lp_status_to_dual_status(second_status);
   }
 
-  double work_performed = worker.work_context.global_work_units_elapsed - work_units_at_start;
-  worker.clock += work_performed;
-
   exploration_stats_.total_lp_solve_time += toc(lp_start_time);
   exploration_stats_.total_lp_iters += node_iter;
-  ++exploration_stats_.nodes_explored;
-  --exploration_stats_.nodes_unexplored;
+
+  bool lp_conclusive =  // false when the LP stopped on a time or work limit
+    (lp_status != dual::status_t::TIME_LIMIT && lp_status != dual::status_t::WORK_LIMIT);
+  if (lp_conclusive) {  // only conclusive solves move the node from unexplored to explored
+    ++exploration_stats_.nodes_explored;
+    --exploration_stats_.nodes_unexplored;
+  }
 
   deterministic_bfs_policy_t<i_t, f_t> policy{*this, worker};
   auto [status, round_dir] = update_tree_impl(node_ptr, search_tree, &worker, lp_status, policy);
@@ -3247,58 +3718,17 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node_deterministic(
 
 template <typename i_t, typename f_t>
 template <typename PoolT, typename WorkerTypeGetter>
-void branch_and_bound_t<i_t, f_t>::deterministic_process_worker_solutions(
-  PoolT& pool, WorkerTypeGetter get_worker_type)
+void branch_and_bound_t<i_t, f_t>::deterministic_collect_worker_solutions(
+  PoolT& pool,  // worker pool whose integer_solutions lists are drained
+  WorkerTypeGetter get_worker_type,  // called as get_worker_type(pool, worker_id) -> search_strategy_t
+  std::vector<typename branch_and_bound_t<i_t, f_t>::deterministic_replay_solution_t>&
+    replay_solutions)  // output: one {solution, strategy} entry is appended per worker solution
 {
-  std::vector<queued_integer_solution_t<i_t, f_t>*> all_solutions;
   for (auto& worker : pool) {
     for (auto& sol : worker.integer_solutions) {
-      all_solutions.push_back(&sol);
+      const search_strategy_t strategy = get_worker_type(pool, sol.worker_id);  // looked up by the solution's own worker_id
+      replay_solutions.push_back({std::move(sol), strategy});  // sol is moved out of the worker's list
     }
-  }
-
-  // relies on queued_integer_solution_t's operator<
-  // sorts based on objective first, then the <worker_id, seq_id> tuple
-  std::sort(all_solutions.begin(),
-            all_solutions.end(),
-            [](const queued_integer_solution_t<i_t, f_t>* a,
-               const queued_integer_solution_t<i_t, f_t>* b) { return *a < *b; });
-
-  f_t deterministic_lower = deterministic_compute_lower_bound();
-  f_t current_upper       = upper_bound_.load();
-
-  for (const auto* sol : all_solutions) {
-    if (sol->objective < current_upper) {
-      f_t user_obj         = compute_user_objective(original_lp_, sol->objective);
-      f_t user_lower       = compute_user_objective(original_lp_, deterministic_lower);
-      i_t nodes_explored   = exploration_stats_.nodes_explored.load();
-      i_t nodes_unexplored = exploration_stats_.nodes_unexplored.load();
-
-      search_strategy_t worker_type = get_worker_type(pool, sol->worker_id);
-      report(feasible_solution_symbol(worker_type),
-             sol->objective,
-             deterministic_lower,
-             sol->depth,
-             0,
-             deterministic_current_horizon_);
-
-      bool improved = false;
-      if (improves_incumbent(sol->objective)) {
-        upper_bound_ = std::min(upper_bound_.load(), sol->objective);
-        incumbent_.set_incumbent_solution(sol->objective, sol->solution);
-        current_upper = sol->objective;
-        improved      = true;
-      }
-
-      if (improved && settings_.solution_callback != nullptr) {
-        std::vector<f_t> original_x;
-        uncrush_primal_solution(original_problem_, original_lp_, sol->solution, original_x);
-        settings_.solution_callback(original_x, sol->objective);
-      }
-    }
-  }
-
-  for (auto& worker : pool) {
     worker.integer_solutions.clear();
   }
 }
@@ -3308,12 +3738,17 @@ template <typename PoolT>
 void branch_and_bound_t<i_t, f_t>::deterministic_merge_pseudo_cost_updates(PoolT& pool)
 {
   std::vector<pseudo_cost_update_t<i_t, f_t>> all_pc_updates;
+  int64_t sb_iter_delta = 0;  // sum over workers of (snapshot counter - base_sb)
+  int64_t base_sb       = pc_.strong_branching_lp_iter.load();  // global counter value before the merge
   for (auto& worker : pool) {
     auto updates = worker.pc_snapshot.take_updates();
     all_pc_updates.insert(all_pc_updates.end(), updates.begin(), updates.end());
+    int64_t snapshot_sb = worker.pc_snapshot.strong_branching_lp_iter_;
+    sb_iter_delta += snapshot_sb - base_sb;  // NOTE(review): assumes each snapshot counter started from base_sb — confirm
   }
   std::sort(all_pc_updates.begin(), all_pc_updates.end());
   pc_.merge_updates(all_pc_updates);
+  pc_.strong_branching_lp_iter += sb_iter_delta;  // fold the workers' combined delta into the global counter
 }
 
 template <typename i_t, typename f_t>
@@ -3324,6 +3759,7 @@ void branch_and_bound_t<i_t, f_t>::deterministic_broadcast_snapshots(
   deterministic_snapshot_t<i_t, f_t> snap;
   snap.upper_bound    = upper_bound_.load();
   snap.total_lp_iters = exploration_stats_.total_lp_iters.load();
+  snap.nodes_explored = exploration_stats_.nodes_explored.load();
   snap.incumbent      = incumbent_snapshot;
   snap.pc_snapshot    = pc_.create_snapshot();
 
@@ -3334,91 +3770,158 @@ void branch_and_bound_t<i_t, f_t>::deterministic_broadcast_snapshots(
 
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::deterministic_sort_replay_events(
-  const bb_event_batch_t<i_t, f_t>& events)
+  const bb_event_batch_t<i_t, f_t>& events,
+  std::vector<typename branch_and_bound_t<i_t, f_t>::deterministic_replay_solution_t>&
+    replay_solutions)
 {
-  // Infeasible solutions from GPU heuristics are queued for repair; process them now
+  // Retire external solutions that have reached the current horizon. Feasibility
+  // classification and repair happen only here in deterministic mode.
   {
-    std::vector<std::vector<f_t>> to_repair;
-    // TODO: support repair queue in deterministic mode
-    // mutex_repair_.lock();
-    // if (repair_queue_.size() > 0) {
-    //   to_repair = repair_queue_;
-    //   repair_queue_.clear();
-    // }
-    // mutex_repair_.unlock();
-
-    std::sort(to_repair.begin(),
-              to_repair.end(),
-              [](const std::vector<f_t>& a, const std::vector<f_t>& b) { return a < b; });
-
-    if (to_repair.size() > 0) {
-      settings_.log.debug("Deterministic sync: Attempting to repair %ld injected solutions\n",
-                          to_repair.size());
-      for (const std::vector<f_t>& uncrushed_solution : to_repair) {
-        std::vector<f_t> crushed_solution;
-        crush_primal_solution<i_t, f_t>(
-          original_problem_, original_lp_, uncrushed_solution, new_slacks_, crushed_solution);
-        std::vector<f_t> repaired_solution;
-        f_t repaired_obj;
-        bool success =
-          repair_solution(edge_norms_, crushed_solution, repaired_obj, repaired_solution);
-        if (success) {
-          // Queue repaired solution with work unit timestamp (...workstamp?)
-          mutex_heuristic_queue_.lock();
-          heuristic_solution_queue_.push_back(
-            {repaired_obj, std::move(repaired_solution), 0, -1, 0, deterministic_current_horizon_});
-          mutex_heuristic_queue_.unlock();
+    std::vector<queued_external_solution_t> due_solutions;
+    mutex_heuristic_queue_.lock();
+    {
+      std::vector<queued_external_solution_t> future_solutions;
+      for (auto& sol : heuristic_solution_queue_) {
+        if (sol.work_timestamp < deterministic_current_horizon_) {
+          due_solutions.push_back(std::move(sol));
+        } else {
+          future_solutions.push_back(std::move(sol));
         }
       }
+      heuristic_solution_queue_ = std::move(future_solutions);
     }
-  }
-
-  // Extract heuristic solutions, keeping future solutions for next horizon
-  // Use deterministic_current_horizon_ as the upper bound (horizon_end)
-  std::vector<queued_integer_solution_t<i_t, f_t>> heuristic_solutions;
-  mutex_heuristic_queue_.lock();
-  {
-    std::vector<queued_integer_solution_t<i_t, f_t>> future_solutions;
-    for (auto& sol : heuristic_solution_queue_) {
-      if (sol.work_timestamp < deterministic_current_horizon_) {
-        heuristic_solutions.push_back(std::move(sol));
-      } else {
-        future_solutions.push_back(std::move(sol));
+    mutex_heuristic_queue_.unlock();
+
+    std::sort(due_solutions.begin(),
+              due_solutions.end(),
+              [](const queued_external_solution_t& a, const queued_external_solution_t& b) {
+                if (a.work_timestamp != b.work_timestamp) {
+                  return a.work_timestamp < b.work_timestamp;
+                }
+                if (a.user_objective != b.user_objective) {
+                  return a.user_objective < b.user_objective;
+                }
+                if (a.origin != b.origin) { return a.origin < b.origin; }
+                return a.solution < b.solution;
+              });
+
+    if (!due_solutions.empty() || !heuristic_solution_queue_.empty()) {  // NOTE(review): queue is read after mutex_heuristic_queue_ was released — confirm no concurrent push
+      CUOPT_DETERMINISM_LOG(
+        settings_.log,
+        "Deterministic sync retire: horizon=%.6f due=%zu future=%zu pre_expl=%.6f\n",
+        deterministic_current_horizon_,
+        due_solutions.size(),
+        heuristic_solution_queue_.size(),  // entries kept in the queue for a later horizon
+        pre_exploration_work_);
+      for (size_t i = 0; i < due_solutions.size(); ++i) {  // one log line per due solution, in sorted order
+        CUOPT_DETERMINISM_LOG(
+          settings_.log,
+          "  due[%zu]: wut=%.6f obj=%g origin=%s\n",
+          i,
+          due_solutions[i].work_timestamp,
+          due_solutions[i].user_objective,
+          cuopt::internals::mip_solution_origin_to_string(due_solutions[i].origin));
+      }
+    }
+    if (!due_solutions.empty()) {  // retire each due solution; only feasible ones join the replay stream
+      CUOPT_DETERMINISM_LOG(settings_.log,
+                            "Deterministic sync: retiring %zu external solutions\n",
+                            due_solutions.size());  // size_t argument, hence %zu
+      for (const auto& queued_solution : due_solutions) {
+        auto [feasible, obj, crushed] = retire_queued_solution(queued_solution);
+        if (feasible) {
+          replay_solutions.push_back({{obj,
+                                       std::move(crushed),
+                                       0,
+                                       -1,
+                                       0,
+                                       queued_solution.work_timestamp,  // keeps the timestamp it was queued with
+                                       queued_solution.origin},
+                                      search_strategy_t::BEST_FIRST});  // external solutions are tagged BEST_FIRST
+        }
       }
     }
-    heuristic_solution_queue_ = std::move(future_solutions);
   }
-  mutex_heuristic_queue_.unlock();
+  if (!replay_solutions.empty() || !heuristic_solution_queue_.empty()) {
+    CUOPT_DETERMINISM_LOG(
+      settings_.log,
+      "Deterministic replay extract: horizon=%.6f now=%zu future=%zu upper=%.16e\n",
+      deterministic_current_horizon_,
+      replay_solutions.size(),
+      heuristic_solution_queue_.size(),
+      upper_bound_.load());
+  }
 
-  // sort by work unit timestamp, with objective and solution values as tie-breakers
-  std::sort(
-    heuristic_solutions.begin(),
-    heuristic_solutions.end(),
-    [](const queued_integer_solution_t<i_t, f_t>& a, const queued_integer_solution_t<i_t, f_t>& b) {
-      if (a.work_timestamp != b.work_timestamp) { return a.work_timestamp < b.work_timestamp; }
-      if (a.objective != b.objective) { return a.objective < b.objective; }
-      return a.solution < b.solution;  // edge-case - lexicographical comparison
-    });
+  // Sort key: work_timestamp, objective, origin, worker_id, sequence_id, then the solution values.
+  std::sort(replay_solutions.begin(), replay_solutions.end(), [](const auto& a, const auto& b) {
+    if (a.solution.work_timestamp != b.solution.work_timestamp) {
+      return a.solution.work_timestamp < b.solution.work_timestamp;
+    }
+    if (a.solution.objective != b.solution.objective) {
+      return a.solution.objective < b.solution.objective;
+    }
+    if (a.solution.origin != b.solution.origin) { return a.solution.origin < b.solution.origin; }
+    if (a.solution.worker_id != b.solution.worker_id) {
+      return a.solution.worker_id < b.solution.worker_id;
+    }
+    if (a.solution.sequence_id != b.solution.sequence_id) {
+      return a.solution.sequence_id < b.solution.sequence_id;
+    }
+    return a.solution.solution < b.solution.solution;  // final tie-breaker: operator< on the solution members
+  });
 
-  // Merge B&B events and heuristic solutions for unified timeline replay
-  size_t event_idx     = 0;
-  size_t heuristic_idx = 0;
+  f_t deterministic_lower = deterministic_compute_lower_bound();
+  f_t current_upper       = upper_bound_.load();
+  CUOPT_DETERMINISM_LOG(
+    settings_.log,
+    "Sync replay begin: horizon=%.6f n_events=%zu n_solutions=%zu user_upper_before=%.16e\n",
+    deterministic_current_horizon_,
+    events.events.size(),
+    replay_solutions.size(),
+    compute_user_objective(original_lp_, current_upper));
+  if (deterministic_current_horizon_ <= deterministic_horizon_step_) {
+    CUOPT_DETERMINISM_LOG(
+      settings_.log,
+      "Deterministic solution replay: candidates=%zu lower=%.16e upper_before=%.16e\n",
+      replay_solutions.size(),
+      deterministic_lower,
+      current_upper);
+    for (size_t i = 0; i < replay_solutions.size(); ++i) {
+      const auto& replay = replay_solutions[i];
+      const auto& sol    = replay.solution;
+      CUOPT_DETERMINISM_LOG(
+        settings_.log,
+        "Deterministic replay solution[%zu]: wut=%.6f obj=%.16e origin=%s worker=%d seq=%d "
+        "depth=%d sol_hash=0x%x\n",
+        i,
+        sol.work_timestamp,
+        sol.objective,
+        cuopt::internals::mip_solution_origin_to_string(sol.origin),
+        sol.worker_id,
+        sol.sequence_id,
+        sol.depth,
+        detail::compute_hash(sol.solution));
+    }
+  }
 
-  while (event_idx < events.events.size() || heuristic_idx < heuristic_solutions.size()) {
-    bool process_event     = false;
-    bool process_heuristic = false;
+  // Merge B&B events and all incumbent-producing solutions for unified timeline replay.
+  size_t event_idx    = 0;
+  size_t solution_idx = 0;
+
+  while (event_idx < events.events.size() || solution_idx < replay_solutions.size()) {
+    bool process_event    = false;
+    bool process_solution = false;
 
     if (event_idx >= events.events.size()) {
-      process_heuristic = true;
-    } else if (heuristic_idx >= heuristic_solutions.size()) {
+      process_solution = true;
+    } else if (solution_idx >= replay_solutions.size()) {
       process_event = true;
     } else {
-      // Both have items - pick the one with smaller WUT
       if (events.events[event_idx].work_timestamp <=
-          heuristic_solutions[heuristic_idx].work_timestamp) {
+          replay_solutions[solution_idx].solution.work_timestamp) {
         process_event = true;
       } else {
-        process_heuristic = true;
+        process_solution = true;
       }
     }
 
@@ -3433,42 +3936,80 @@ void branch_and_bound_t<i_t, f_t>::deterministic_sort_replay_events(
       }
     }
 
-    if (process_heuristic) {
-      const auto& hsol = heuristic_solutions[heuristic_idx++];
-
-      CUOPT_LOG_TRACE(
-        "Deterministic sync: Heuristic solution received at WUT %f with objective %g, current "
-        "horizon %f",
-        hsol.work_timestamp,
-        hsol.objective,
-        deterministic_current_horizon_);
-
-      // Process heuristic solution at its correct work unit timestamp position
-      f_t new_upper = std::numeric_limits<f_t>::infinity();
+    if (process_solution) {
+      const auto& replay = replay_solutions[solution_idx++];
+      const auto& sol    = replay.solution;
+      bool improved      = false;
 
-      if (improves_incumbent(hsol.objective)) {
-        upper_bound_ = std::min(upper_bound_.load(), hsol.objective);
-        incumbent_.set_incumbent_solution(hsol.objective, hsol.solution);
-        new_upper = hsol.objective;
+      if (improves_incumbent(sol.objective)) {
+        const f_t previous_upper = upper_bound_;
+        upper_bound_             = std::min(upper_bound_.load(), sol.objective);
+        incumbent_.set_incumbent_solution(sol.objective, sol.solution);
+        current_upper = sol.objective;
+        improved      = true;
+        CUOPT_DETERMINISM_LOG(
+          settings_.log,
+          "Deterministic B&B incumbent update: source=det_replay prev_upper=%.16e "
+          "new_upper=%.16e obj=%.16e hash=0x%x worker=%d seq=%d wut=%.6f horizon=%.6f\n",
+          previous_upper,
+          upper_bound_.load(),
+          sol.objective,
+          detail::compute_hash(sol.solution),
+          sol.worker_id,
+          sol.sequence_id,
+          sol.work_timestamp,
+          deterministic_current_horizon_);
       }
-
-      if (new_upper < std::numeric_limits<f_t>::infinity()) {
-        report_heuristic(new_upper);
-
-        if (settings_.solution_callback != nullptr) {
-          std::vector<f_t> original_x;
-          uncrush_primal_solution(original_problem_, original_lp_, hsol.solution, original_x);
-          settings_.solution_callback(original_x, hsol.objective);
+      CUOPT_DETERMINISM_LOG(
+        settings_.log,
+        "Deterministic replay: horizon=%.6f wut=%.6f obj=%.16e origin=%s accepted=%d "
+        "upper_now=%.16e worker=%d seq=%d sol_hash=0x%x\n",
+        deterministic_current_horizon_,
+        sol.work_timestamp,
+        sol.objective,
+        cuopt::internals::mip_solution_origin_to_string(sol.origin),
+        (int)improved,
+        current_upper,
+        sol.worker_id,
+        sol.sequence_id,
+        detail::compute_hash(sol.solution));
+
+      if (improved) {  // only incumbent-improving solutions are reported and handed to emit_solution_callback_from_crushed
+        CUOPT_DETERMINISM_LOG(
+          settings_.log,
+          "Deterministic replay PUBLISH: horizon=%.6f wut=%.6f obj=%g origin=%s worker=%d "
+          "upper_after=%.16e\n",
+          deterministic_current_horizon_,
+          sol.work_timestamp,
+          compute_user_objective(original_lp_, sol.objective),
+          cuopt::internals::mip_solution_origin_to_string(sol.origin),
+          sol.worker_id,
+          current_upper);
+        if (sol.origin == cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE ||
+            sol.origin == cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_DIVING) {
+          report(feasible_solution_symbol(replay.strategy),  // B&B origins: symbol chosen from the replay entry's strategy
+                 sol.objective,
+                 deterministic_lower,
+                 sol.depth,
+                 0,
+                 deterministic_current_horizon_);
+        } else {  // every other origin is reported through report_heuristic
+          report_heuristic(sol.objective, sol.work_timestamp);
         }
+        emit_solution_callback_from_crushed(
+          sol.solution, sol.objective, sol.origin, sol.work_timestamp);
       }
     }
   }
 
-  // Merge integer solutions from BFS workers and update global incumbent
-  deterministic_process_worker_solutions(*deterministic_workers_,
-                                         [](const deterministic_bfs_worker_pool_t<i_t, f_t>&, int) {
-                                           return search_strategy_t::BEST_FIRST;
-                                         });
+  CUOPT_DETERMINISM_LOG(
+    settings_.log,
+    "Sync replay done: horizon=%.6f user_upper_after=%.16e events_processed=%zu "
+    "solutions_processed=%zu\n",
+    deterministic_current_horizon_,
+    compute_user_objective(original_lp_, upper_bound_.load()),
+    event_idx,
+    solution_idx);
 
   // Merge and apply pseudo-cost updates from BFS workers
   deterministic_merge_pseudo_cost_updates(*deterministic_workers_);
@@ -3525,52 +4066,44 @@ void branch_and_bound_t<i_t, f_t>::deterministic_balance_worker_loads()
 
   constexpr bool force_rebalance_every_sync = false;
 
-  // Count work for each worker: current_node (if any) + plunge_stack + backlog
-  std::vector<size_t> work_counts(num_workers);
-  size_t total_work = 0;
-  size_t max_work   = 0;
-  size_t min_work   = std::numeric_limits<size_t>::max();
+  std::vector<size_t> backlog_counts(num_workers);
+  size_t total_backlog = 0;
+  size_t max_backlog   = 0;
+  size_t min_backlog   = std::numeric_limits<size_t>::max();
 
   for (size_t w = 0; w < num_workers; ++w) {
-    auto& worker   = (*deterministic_workers_)[w];
-    work_counts[w] = worker.queue_size();
-    total_work += work_counts[w];
-    max_work = std::max(max_work, work_counts[w]);
-    min_work = std::min(min_work, work_counts[w]);
+    auto& worker      = (*deterministic_workers_)[w];
+    backlog_counts[w] = worker.backlog.size();
+    total_backlog += backlog_counts[w];
+    max_backlog = std::max(max_backlog, backlog_counts[w]);
+    min_backlog = std::min(min_backlog, backlog_counts[w]);
   }
-  if (total_work == 0) return;
+  if (total_backlog == 0) return;
 
   bool needs_balance;
   if (force_rebalance_every_sync) {
-    needs_balance = (total_work > 1);
+    needs_balance = (total_backlog > 1);
   } else {
-    needs_balance = (min_work == 0 && max_work >= 2) || (min_work > 0 && max_work > 4 * min_work);
+    needs_balance =  // an empty backlog next to one holding >= 2 nodes, or max backlog above 4x the min
+      (min_backlog == 0 && max_backlog >= 2) || (min_backlog > 0 && max_backlog > 4 * min_backlog);
   }
 
   if (!needs_balance) return;
 
-  std::vector<mip_node_t<i_t, f_t>*> all_nodes;
+  std::vector<mip_node_t<i_t, f_t>*> all_backlog_nodes;
   for (auto& worker : *deterministic_workers_) {
     for (auto* node : worker.backlog.data()) {
-      all_nodes.push_back(node);
+      all_backlog_nodes.push_back(node);
     }
     worker.backlog.clear();
   }
 
-  if (all_nodes.empty()) return;
-
-  auto deterministic_less = [](const mip_node_t<i_t, f_t>* a, const mip_node_t<i_t, f_t>* b) {
-    if (a->origin_worker_id != b->origin_worker_id) {
-      return a->origin_worker_id < b->origin_worker_id;
-    }
-    return a->creation_seq < b->creation_seq;
-  };
-  std::sort(all_nodes.begin(), all_nodes.end(), deterministic_less);
+  if (all_backlog_nodes.empty()) return;
 
-  // Distribute nodes
-  for (size_t i = 0; i < all_nodes.size(); ++i) {
+  // Round-robin distribute into backlogs; priority queue handles ordering internally
+  for (size_t i = 0; i < all_backlog_nodes.size(); ++i) {
     size_t worker_idx = i % num_workers;
-    (*deterministic_workers_)[worker_idx].enqueue_node(all_nodes[i]);
+    (*deterministic_workers_)[worker_idx].backlog.push(all_backlog_nodes[i]);
   }
 }
 
@@ -3598,11 +4131,33 @@ f_t branch_and_bound_t<i_t, f_t>::deterministic_compute_lower_bound()
     }
   }
 
+  f_t min_from_workers = lower_bound;  // value before the adjustments that follow; only used in the log call
+
   // Tree is exhausted
   if (lower_bound == std::numeric_limits<f_t>::infinity() && incumbent_.has_incumbent) {
     lower_bound = upper_bound_.load();
   }
 
+  lower_bound = std::min(lower_bound, upper_bound_.load());  // clamp: never exceed the current upper bound
+
+  CUOPT_DETERMINISM_LOG(
+    settings_.log,
+    "compute_lower_bound: user_min_bfs=%.16e user_upper=%.16e user_result=%.16e "
+    "has_incumbent=%d n_bfs_nodes=%d\n",
+    compute_user_objective(original_lp_, min_from_workers),
+    compute_user_objective(original_lp_, upper_bound_.load()),
+    compute_user_objective(original_lp_, lower_bound),
+    (int)incumbent_.has_incumbent,
+    [&]() {  // node count = current_node (if set) + plunge_stack + backlog, summed over workers
+      int count = 0;
+      for (const auto& w : *deterministic_workers_) {
+        count += (w.current_node != nullptr ? 1 : 0);
+        count += (int)w.plunge_stack.size();
+        count += (int)w.backlog.size();
+      }
+      return count;
+    }());
+
   return lower_bound;
 }
 
@@ -3690,19 +4245,27 @@ void branch_and_bound_t<i_t, f_t>::deterministic_assign_diving_nodes()
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::deterministic_collect_diving_solutions_and_update_pseudocosts()
+void branch_and_bound_t<i_t, f_t>::deterministic_collect_diving_solutions_and_update_pseudocosts(
+  std::vector<typename branch_and_bound_t<i_t, f_t>::deterministic_replay_solution_t>&
+    replay_solutions)
 {
   if (!deterministic_diving_workers_) return;
 
-  // Collect integer solutions from diving workers and update global incumbent
-  deterministic_process_worker_solutions(
+  deterministic_collect_worker_solutions(
     *deterministic_diving_workers_,
     [](const deterministic_diving_worker_pool_t<i_t, f_t>& pool, int worker_id) {
       return pool[worker_id].diving_type;
-    });
+    },
+    replay_solutions);
 
   // Merge pseudo-cost updates from diving workers
   deterministic_merge_pseudo_cost_updates(*deterministic_diving_workers_);
+
+  for (auto& worker : *deterministic_diving_workers_) {  // add each diving worker's newly explored nodes to the global count
+    i_t delta                       = worker.total_nodes_explored - worker.nodes_explored_last_sync;
+    worker.nodes_explored_last_sync = worker.total_nodes_explored;  // so the next sync only counts new nodes
+    exploration_stats_.nodes_explored += delta;
+  }
 }
 
 template <typename i_t, typename f_t>
@@ -3777,7 +4340,12 @@ void branch_and_bound_t<i_t, f_t>::deterministic_dive(
     // Setup LP settings
     simplex_solver_settings_t<i_t, f_t> lp_settings = settings_;
     lp_settings.set_log(false);
-    lp_settings.cut_off       = worker.local_upper_bound + settings_.dual_tol;
+    // Base is the worker's bound, rounded up (after subtracting integer_tol) for integral objectives.
+    f_t cut_off_base = worker.local_upper_bound;
+    if (original_lp_.objective_is_integral) {
+      cut_off_base = std::ceil(cut_off_base - settings_.integer_tol);
+    }
+    lp_settings.cut_off       = cut_off_base + settings_.dual_tol;
     lp_settings.inside_mip    = 2;
     lp_settings.time_limit    = remaining_time;
     lp_settings.scale_columns = false;
@@ -3787,7 +4355,6 @@ void branch_and_bound_t<i_t, f_t>::deterministic_dive(
       lp_settings, worker.bounds_changed, worker.leaf_problem.lower, worker.leaf_problem.upper);
 
     if (settings_.deterministic) {
-      // TEMP APPROXIMATION;
       worker.work_context.record_work_sync_on_horizon(worker.node_presolver.last_nnz_processed /
                                                       1e8);
     }
@@ -3841,17 +4408,16 @@ void branch_and_bound_t<i_t, f_t>::deterministic_dive(
       lp_status                 = convert_lp_status_to_dual_status(second_status);
     }
 
-    ++nodes_this_dive;
-    ++worker.total_nodes_explored;
     worker.lp_iters_this_dive += node_iter;
 
-    worker.clock = worker.work_context.global_work_units_elapsed;
-
     if (lp_status == dual::status_t::TIME_LIMIT || lp_status == dual::status_t::WORK_LIMIT ||
         lp_status == dual::status_t::ITERATION_LIMIT) {
       break;
     }
 
+    ++nodes_this_dive;  // reached only when the LP did not stop on a time, work, or iteration limit
+    ++worker.total_nodes_explored;
+
     deterministic_diving_policy_t<i_t, f_t> policy{*this, worker, stack, max_backtrack_depth};
     update_tree_impl(node_ptr, dive_tree, &worker, lp_status, policy);
   }
diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index f2917ba930..7dec38b640 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -24,6 +24,7 @@
 #include <dual_simplex/solve.hpp>
 #include <dual_simplex/types.hpp>
 
+#include <utilities/determinism_log.hpp>
 #include <utilities/macros.cuh>
 #include <utilities/omp_helpers.hpp>
 #include <utilities/producer_sync.hpp>
@@ -35,9 +36,12 @@
 #include <omp.h>
 
 #include <atomic>
+#include <condition_variable>
 #include <functional>
 #include <future>
 #include <memory>
+#include <mutex>
+#include <tuple>
 #include <vector>
 
 namespace cuopt::linear_programming::detail {
@@ -108,10 +112,15 @@ class branch_and_bound_t {
   }
 
   // Set a solution based on the user problem during the course of the solve
-  void set_new_solution(const std::vector<f_t>& solution);
+  void set_new_solution(const std::vector<f_t>& solution,
+                        cuopt::internals::mip_solution_origin_t origin =
+                          cuopt::internals::mip_solution_origin_t::UNKNOWN);
 
   // This queues the solution to be processed at the correct work unit timestamp
-  void queue_external_solution_deterministic(const std::vector<f_t>& solution, double work_unit_ts);
+  void queue_external_solution_deterministic(const std::vector<f_t>& solution,
+                                             f_t user_objective,
+                                             double work_unit_ts,
+                                             cuopt::internals::mip_solution_origin_t origin);
 
   void set_user_bound_callback(std::function<void(f_t)> callback)
   {
@@ -157,6 +166,12 @@ class branch_and_bound_t {
   // Get producer sync for external heuristics (e.g., CPUFJ) to register
   producer_sync_t& get_producer_sync() { return producer_sync_; }
 
+  void wait_for_exploration_start()  // blocks the caller until exploration_started_ reads true
+  {
+    std::unique_lock<std::mutex> guard(exploration_started_mutex_);
+    while (!exploration_started_.load()) { exploration_started_cv_.wait(guard); }
+  }
+
  private:
   const user_problem_t<i_t, f_t>& original_problem_;
   const simplex_solver_settings_t<i_t, f_t> settings_;
@@ -166,6 +181,10 @@ class branch_and_bound_t {
   std::atomic<bool> signal_extend_cliques_{false};
 
   work_limit_context_t work_unit_context_{"B&B"};
+  double pre_exploration_work_{0.0};
+  std::atomic<bool> exploration_started_{false};
+  std::mutex exploration_started_mutex_;
+  std::condition_variable exploration_started_cv_;
 
   // Initial guess.
   std::vector<f_t> guess_;
@@ -214,7 +233,16 @@ class branch_and_bound_t {
 
   // Mutex for repair
   omp_mutex_t mutex_repair_;
-  std::vector<std::vector<f_t>> repair_queue_;
+  // Entry of repair_queue_: a solution vector plus its origin tag and a work timestamp.
+  // The origin defaults to UNKNOWN.
+  // NOTE(review): work_timestamp defaults to -1.0, presumably meaning "no timestamp" -- confirm.
+  struct queued_repair_solution_t {
+    std::vector<f_t> solution;
+    cuopt::internals::mip_solution_origin_t origin{
+      cuopt::internals::mip_solution_origin_t::UNKNOWN};
+    double work_timestamp{-1.0};
+  };
+  std::vector<queued_repair_solution_t> repair_queue_;
 
   // Variables for the root node in the search tree.
   std::vector<variable_status_t> root_vstatus_;
@@ -262,13 +287,21 @@ class branch_and_bound_t {
   omp_atomic_t<f_t> lower_bound_ceiling_;
   std::function<void(f_t)> user_bound_callback_;
 
-  void report_heuristic(f_t obj);
+  void report_heuristic(f_t obj, double work_time = -1.0);
   void report(char symbol,
               f_t obj,
               f_t lower_bound,
               i_t node_depth,
               i_t node_int_infeas,
               double work_time = -1);
+  void emit_solution_callback(std::vector<f_t>& original_x,
+                              f_t objective,
+                              cuopt::internals::mip_solution_origin_t origin,
+                              double work_timestamp);
+  void emit_solution_callback_from_crushed(const std::vector<f_t>& crushed_solution,
+                                           f_t objective,
+                                           cuopt::internals::mip_solution_origin_t origin,
+                                           double work_timestamp);
 
   // Set the solution when found at the root node
   void set_solution_at_root(mip_solution_t<i_t, f_t>& solution,
@@ -341,7 +374,16 @@ class branch_and_bound_t {
   void run_deterministic_coordinator(const csr_matrix_t<i_t, f_t>& Arow);
 
-  // Gather all events generated, sort by WU timestamp, apply
-  void deterministic_sort_replay_events(const bb_event_batch_t<i_t, f_t>& events);
+  // A queued integer solution paired with a search strategy (BEST_FIRST by default).
+  // Element type of the replay_solutions vectors taken by the deterministic_* methods.
+  struct deterministic_replay_solution_t {
+    queued_integer_solution_t<i_t, f_t> solution;
+    search_strategy_t strategy{search_strategy_t::BEST_FIRST};
+  };
+
+  // Gather all events generated, sort by WU timestamp, apply
+  void deterministic_sort_replay_events(
+    const bb_event_batch_t<i_t, f_t>& events,
+    std::vector<deterministic_replay_solution_t>& replay_solutions);
 
   // Prune nodes held by workers based on new incumbent
   void deterministic_prune_worker_nodes_vs_incumbent();
@@ -374,10 +414,14 @@ class branch_and_bound_t {
   void deterministic_assign_diving_nodes();
 
   // Collect and merge diving solutions at sync
-  void deterministic_collect_diving_solutions_and_update_pseudocosts();
+  void deterministic_collect_diving_solutions_and_update_pseudocosts(
+    std::vector<deterministic_replay_solution_t>& replay_solutions);
 
   template <typename PoolT, typename WorkerTypeGetter>
-  void deterministic_process_worker_solutions(PoolT& pool, WorkerTypeGetter get_worker_type);
+  void deterministic_collect_worker_solutions(
+    PoolT& pool,
+    WorkerTypeGetter get_worker_type,
+    std::vector<deterministic_replay_solution_t>& replay_solutions);
 
   template <typename PoolT>
   void deterministic_merge_pseudo_cost_updates(PoolT& pool);
@@ -408,10 +452,27 @@ class branch_and_bound_t {
   double max_producer_wait_time_{0.0};
   i_t producer_wait_count_{0};
 
-  // Determinism heuristic solution queue - solutions received from GPU heuristics
-  // Stored with work unit timestamp for deterministic ordering
+  // Entry of heuristic_solution_queue_: an external solution with its user_objective value,
+  // work timestamp and origin tag. user_objective defaults to +infinity, origin to UNKNOWN.
+  struct queued_external_solution_t {
+    std::vector<f_t> solution;
+    f_t user_objective{std::numeric_limits<f_t>::infinity()};
+    double work_timestamp{0.0};
+    cuopt::internals::mip_solution_origin_t origin{
+      cuopt::internals::mip_solution_origin_t::UNKNOWN};
+  };
+
+  // Retire one queued external solution.
+  // NOTE(review): the returned tuple is presumably (accepted, objective, crushed solution);
+  // confirm against the definition, which is not visible in this header.
+  std::tuple<bool, f_t, std::vector<f_t>> retire_queued_solution(
+    const queued_external_solution_t& queued_solution);
+
+  // Deterministic pending external solution queue.
+  // External solutions stay raw until their retirement horizon, where they are
+  // crushed, checked, and repaired immediately if needed.
   omp_mutex_t mutex_heuristic_queue_;
-  std::vector<queued_integer_solution_t<i_t, f_t>> heuristic_solution_queue_;
+  std::vector<queued_external_solution_t> heuristic_solution_queue_;
 
   // ============================================================================
   // Determinism Diving state
diff --git a/cpp/src/branch_and_bound/deterministic_workers.hpp b/cpp/src/branch_and_bound/deterministic_workers.hpp
index 7a074051c6..b90706285b 100644
--- a/cpp/src/branch_and_bound/deterministic_workers.hpp
+++ b/cpp/src/branch_and_bound/deterministic_workers.hpp
@@ -11,6 +11,7 @@
 #include <branch_and_bound/branch_and_bound_worker.hpp>
 #include <branch_and_bound/diving_heuristics.hpp>
 #include <branch_and_bound/node_queue.hpp>
+#include <cuopt/linear_programming/utilities/internals.hpp>
 
 #include <utilities/work_limit_context.hpp>
 
@@ -44,6 +45,8 @@ struct queued_integer_solution_t {
   int worker_id{-1};
   int sequence_id{0};
   double work_timestamp{0.0};
+  cuopt::internals::mip_solution_origin_t origin{
+    cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE};
 
   bool operator<(const queued_integer_solution_t& other) const
   {
@@ -59,6 +62,7 @@ struct deterministic_snapshot_t {
   pseudo_cost_snapshot_t<i_t, f_t> pc_snapshot;
   std::vector<f_t> incumbent;
   i_t total_lp_iters;
+  i_t nodes_explored;
 };
 
 template <typename i_t, typename f_t, typename Derived>
@@ -66,7 +70,6 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t<i_t, f_t> {
   using base_t = branch_and_bound_worker_t<i_t, f_t>;
 
  public:
-  double clock{0.0};
   work_limit_context_t work_context;
 
   pseudo_cost_snapshot_t<i_t, f_t> pc_snapshot;
@@ -75,6 +78,7 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t<i_t, f_t> {
   // Diving-specific snapshots (ignored by BFS workers)
   std::vector<f_t> incumbent_snapshot;
   i_t total_lp_iters_snapshot{0};
+  i_t nodes_explored_snapshot{0};
 
   std::vector<queued_integer_solution_t<i_t, f_t>> integer_solutions;
   int next_solution_seq{0};
@@ -101,6 +105,7 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t<i_t, f_t> {
     pc_snapshot             = snap.pc_snapshot;
     incumbent_snapshot      = snap.incumbent;
     total_lp_iters_snapshot = snap.total_lp_iters;
+    nodes_explored_snapshot = snap.nodes_explored;
   }
 
   bool has_work() const { return static_cast<const Derived*>(this)->has_work_impl(); }
@@ -158,11 +163,6 @@ class deterministic_bfs_worker_t
                                                     mip_node_t<i_t, f_t>* up_child,
                                                     rounding_direction_t preferred_direction)
   {
-    if (!plunge_stack.empty()) {
-      backlog.push(plunge_stack.back());
-      plunge_stack.pop_back();
-    }
-
     down_child->origin_worker_id = this->worker_id;
     down_child->creation_seq     = next_creation_seq++;
     up_child->origin_worker_id   = this->worker_id;
@@ -170,11 +170,11 @@ class deterministic_bfs_worker_t
 
     mip_node_t<i_t, f_t>* first_child;
     if (preferred_direction == rounding_direction_t::UP) {
-      plunge_stack.push_front(down_child);
+      backlog.push(down_child);
       plunge_stack.push_front(up_child);
       first_child = up_child;
     } else {
-      plunge_stack.push_front(up_child);
+      backlog.push(up_child);
       plunge_stack.push_front(down_child);
       first_child = down_child;
     }
@@ -211,7 +211,7 @@ class deterministic_bfs_worker_t
   void record_branched(
     mip_node_t<i_t, f_t>* node, i_t down_child_id, i_t up_child_id, i_t branch_var, f_t branch_val)
   {
-    record_event(bb_event_t<i_t, f_t>::make_branched(this->clock,
+    record_event(bb_event_t<i_t, f_t>::make_branched(this->work_context.current_work(),
                                                      this->worker_id,
                                                      node->creation_seq,
                                                      down_child_id,
@@ -227,7 +227,7 @@ class deterministic_bfs_worker_t
   void record_integer_solution(mip_node_t<i_t, f_t>* node, f_t objective)
   {
     record_event(bb_event_t<i_t, f_t>::make_integer_solution(
-      this->clock, this->worker_id, node->creation_seq, objective));
+      this->work_context.current_work(), this->worker_id, node->creation_seq, objective));
     ++nodes_processed_this_horizon;
     ++this->total_nodes_processed;
     ++this->total_integer_solutions;
@@ -236,7 +236,7 @@ class deterministic_bfs_worker_t
   void record_fathomed(mip_node_t<i_t, f_t>* node, f_t lower_bound)
   {
     record_event(bb_event_t<i_t, f_t>::make_fathomed(
-      this->clock, this->worker_id, node->creation_seq, lower_bound));
+      this->work_context.current_work(), this->worker_id, node->creation_seq, lower_bound));
     ++nodes_processed_this_horizon;
     ++this->total_nodes_processed;
     ++total_nodes_pruned;
@@ -244,8 +244,8 @@ class deterministic_bfs_worker_t
 
   void record_infeasible(mip_node_t<i_t, f_t>* node)
   {
-    record_event(
-      bb_event_t<i_t, f_t>::make_infeasible(this->clock, this->worker_id, node->creation_seq));
+    record_event(bb_event_t<i_t, f_t>::make_infeasible(
+      this->work_context.current_work(), this->worker_id, node->creation_seq));
     ++nodes_processed_this_horizon;
     ++this->total_nodes_processed;
     ++total_nodes_infeasible;
@@ -253,8 +253,8 @@ class deterministic_bfs_worker_t
 
   void record_numerical(mip_node_t<i_t, f_t>* node)
   {
-    record_event(
-      bb_event_t<i_t, f_t>::make_numerical(this->clock, this->worker_id, node->creation_seq));
+    record_event(bb_event_t<i_t, f_t>::make_numerical(
+      this->work_context.current_work(), this->worker_id, node->creation_seq));
     ++nodes_processed_this_horizon;
     ++this->total_nodes_processed;
   }
@@ -288,6 +288,7 @@ class deterministic_diving_worker_t
 
   // Diving statistics
   i_t total_nodes_explored{0};
+  i_t nodes_explored_last_sync{0};
   i_t total_dives{0};
   i_t lp_iters_this_dive{0};
 
@@ -339,7 +340,15 @@ class deterministic_diving_worker_t
   void queue_integer_solution(f_t objective, const std::vector<f_t>& solution, i_t depth)
   {
+    // Positional brace initialization: the values must stay in the member order of
+    // queued_integer_solution_t (..., worker_id, sequence_id, work_timestamp, origin).
     this->integer_solutions.push_back(
-      {objective, solution, depth, this->worker_id, this->next_solution_seq++});
+      {objective,
+       solution,
+       depth,
+       this->worker_id,
+       this->next_solution_seq++,
+       this->work_context.current_work(),
+       cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_DIVING});
     ++this->total_integer_solutions;
   }
 
diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp
index c38e98e27d..bb2fd3a6da 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.cpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.cpp
@@ -79,11 +79,9 @@ objective_change_estimate_t<f_t> single_pivot_objective_change_estimate(
   std::vector<f_t>& delta_z,
   f_t& work_estimate)
 {
-  // Compute the objective estimate for the down and up branches of variable j
   assert(variable_j >= 0);
   assert(basic_j >= 0);
 
-  // Down branch
   i_t direction = -1;
   sparse_vector_t<i_t, f_t> e_k(lp.num_rows, 0);
   e_k.i.push_back(basic_j);
@@ -92,7 +90,6 @@ objective_change_estimate_t<f_t> single_pivot_objective_change_estimate(
   sparse_vector_t<i_t, f_t> delta_y(lp.num_rows, 0);
   basis_factors.b_transpose_solve(e_k, delta_y);
 
-  // Compute delta_z_N = -N^T * delta_y
   i_t delta_y_nz0      = 0;
   const i_t nz_delta_y = delta_y.i.size();
   for (i_t k = 0; k < nz_delta_y; k++) {
@@ -102,7 +99,6 @@ objective_change_estimate_t<f_t> single_pivot_objective_change_estimate(
   const f_t delta_y_nz_percentage = delta_y_nz0 / static_cast<f_t>(lp.num_rows) * 100.0;
   const bool use_transpose        = delta_y_nz_percentage <= 30.0;
   std::vector<i_t> delta_z_indices;
-  // delta_z starts out all zero
   if (use_transpose) {
     compute_delta_z(A_transpose,
                     delta_y,
@@ -128,84 +124,31 @@ objective_change_estimate_t<f_t> single_pivot_objective_change_estimate(
                                 work_estimate);
   }
 
-  // Verify dual feasibility
-#ifdef CHECK_DUAL_FEASIBILITY
-  {
-    std::vector<f_t> dual_residual = lp_solution.z;
-    for (i_t j = 0; j < lp.num_cols; j++) {
-      dual_residual[j] -= lp.objective[j];
-    }
-    matrix_transpose_vector_multiply(lp.A, 1.0, lp_solution.y, 1.0, dual_residual);
-    f_t dual_residual_norm = vector_norm_inf<i_t, f_t>(dual_residual);
-    settings.log.printf("Dual residual norm: %e\n", dual_residual_norm);
-  }
-#endif
-
-  // Compute the step-length
   f_t step_length = compute_step_length(settings, vstatus, lp_solution.z, delta_z, delta_z_indices);
-
-  // Handle the leaving variable case
-
   f_t delta_obj_down =
     step_length * (lp_solution.x[variable_j] - std::floor(lp_solution.x[variable_j]));
-#ifdef CHECK_DELTA_OBJ
-  f_t delta_obj_check = 0.0;
-  for (i_t k = 0; k < delta_y.i.size(); k++) {
-    delta_obj_check += lp.rhs[delta_y.i[k]] * delta_y.x[k];
-  }
-  for (i_t h = 0; h < delta_z_indices.size(); h++) {
-    const i_t j = delta_z_indices[h];
-    if (vstatus[j] == variable_status_t::NONBASIC_LOWER) {
-      delta_obj_check += lp.lower[j] * delta_z[j];
-    } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER) {
-      delta_obj_check += lp.upper[j] * delta_z[j];
-    }
-  }
-  delta_obj_check += std::floor(lp_solution.x[variable_j]) * delta_z[variable_j];
-  delta_obj_check *= step_length;
-  if (std::abs(delta_obj_check - delta_obj) > 1e-6) {
-    settings.log.printf("Delta obj check %e. Delta obj %e. Step length %e.\n",
-                        delta_obj_check,
-                        delta_obj,
-                        step_length);
-  }
-#endif
 
   settings.log.debug(
-    "Down branch %d. Step length: %e. Delta obj: %e. \n", variable_j, step_length, delta_obj_down);
+    "Down branch %d. Step length: %e. Delta obj: %e.\n", variable_j, step_length, delta_obj_down);
 
-  // Up branch
   direction = 1;
-  // Negate delta_z
   for (i_t j : delta_z_indices) {
     delta_z[j] *= -1.0;
   }
 
-  // Compute the step-length
   step_length = compute_step_length(settings, vstatus, lp_solution.z, delta_z, delta_z_indices);
-
   f_t delta_obj_up =
     step_length * (std::ceil(lp_solution.x[variable_j]) - lp_solution.x[variable_j]);
+
   settings.log.debug(
     "Up branch %d. Step length: %e. Delta obj: %e.\n", variable_j, step_length, delta_obj_up);
 
   delta_z_indices.push_back(variable_j);
-
-  // Clear delta_z
   for (i_t j : delta_z_indices) {
     delta_z[j]   = 0.0;
     workspace[j] = 0;
   }
 
-#ifdef CHECK_DELTA_Z
-  for (i_t j = 0; j < lp.num_cols; j++) {
-    if (delta_z[j] != 0.0) { settings.log.printf("Delta z %d: %e\n", j, delta_z[j]); }
-  }
-  for (i_t j = 0; j < lp.num_cols; j++) {
-    if (workspace[j] != 0) { settings.log.printf("Workspace %d: %d\n", j, workspace[j]); }
-  }
-#endif
-
   return {.down_obj_change = std::max<f_t>(delta_obj_down, 0),
           .up_obj_change   = std::max<f_t>(delta_obj_up, 0)};
 }
@@ -226,7 +169,6 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t<i_t, f_t>& lp,
 
   std::vector<f_t> delta_z(n, 0);
   std::vector<i_t> workspace(n, 0);
-
   f_t work_estimate = 0;
 
   std::vector<i_t> basic_map(n, -1);
@@ -241,8 +183,6 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t<i_t, f_t>& lp,
 
   for (i_t k = 0; k < fractional.size(); k++) {
     const i_t j = fractional[k];
-    assert(j >= 0);
-
     objective_change_estimate_t<f_t> estimate =
       single_pivot_objective_change_estimate(lp,
                                              settings,
@@ -266,21 +206,19 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t<i_t, f_t>& lp,
 template <typename i_t, typename f_t>
 f_t objective_upper_bound(const lp_problem_t<i_t, f_t>& lp, f_t upper_bound, f_t dual_tol)
 {
-  f_t cut_off = 0;
+  // A finite upper_bound is used directly, relaxed by dual_tol.
+  if (std::isfinite(upper_bound)) { return upper_bound + dual_tol; }
 
-  if (std::isfinite(upper_bound)) {
-    cut_off = upper_bound + dual_tol;
-  } else {
-    cut_off = 0;
-    for (i_t j = 0; j < lp.num_cols; ++j) {
-      if (lp.objective[j] > 0) {
-        cut_off += lp.objective[j] * lp.upper[j];
-      } else if (lp.objective[j] < 0) {
-        cut_off += lp.objective[j] * lp.lower[j];
-      }
+  // Otherwise bound the objective from above over the variable box: positive coefficients
+  // contribute at their upper bound, negative ones at their lower bound.
+  f_t cut_off = 0;
+  for (i_t j = 0; j < lp.num_cols; ++j) {
+    if (lp.objective[j] > 0) {
+      cut_off += lp.objective[j] * lp.upper[j];
+    } else if (lp.objective[j] < 0) {
+      cut_off += lp.objective[j] * lp.lower[j];
     }
   }
-
   return cut_off;
 }
 
@@ -303,33 +238,23 @@ void strong_branch_helper(i_t start,
                           std::vector<f_t>& dual_simplex_obj_up,
                           std::vector<dual::status_t>& dual_simplex_status_down,
                           std::vector<dual::status_t>& dual_simplex_status_up,
-                          shared_strong_branching_context_view_t<i_t, f_t>& sb_view)
+                          shared_strong_branching_context_view_t<i_t, f_t>& sb_view,
+                          cuopt::work_limit_context_t* work_unit_context = nullptr)
 {
   raft::common::nvtx::range scope("BB::strong_branch_helper");
+  (void)var_types;
+
   lp_problem_t child_problem = original_lp;
+  constexpr bool verbose     = false;
+  f_t last_log               = tic();
+  i_t thread_id              = omp_get_thread_num();
 
-  constexpr bool verbose = false;
-  f_t last_log           = tic();
-  i_t thread_id          = omp_get_thread_num();
   for (i_t k = start; k < end; ++k) {
     const i_t j = fractional[k];
 
     for (i_t branch = 0; branch < 2; branch++) {
-      // Do the down branch
       const i_t shared_idx = (branch == 0) ? k : k + static_cast<i_t>(fractional.size());
-      // Batch PDLP has already solved this subproblem, skip it
-      if (sb_view.is_valid() && sb_view.is_solved(shared_idx)) {
-        if (verbose) {
-          settings.log.printf(
-            "[COOP SB] DS thread %d skipping variable %d branch %s (shared_idx %d): already solved "
-            "by PDLP\n",
-            thread_id,
-            j,
-            branch == 0 ? "down" : "up",
-            shared_idx);
-        }
-        continue;
-      }
+      if (sb_view.is_valid() && sb_view.is_solved(shared_idx)) { continue; }
 
       if (branch == 0) {
         child_problem.lower[j] = original_lp.lower[j];
@@ -341,10 +266,9 @@ void strong_branch_helper(i_t start,
 
       simplex_solver_settings_t<i_t, f_t> child_settings = settings;
       child_settings.set_log(false);
-      f_t lp_start_time = tic();
-      f_t elapsed_time  = toc(start_time);
+      const f_t elapsed_time = toc(start_time);
       if (elapsed_time > settings.time_limit) { break; }
-      child_settings.time_limit      = std::max(0.0, settings.time_limit - elapsed_time);
+      child_settings.time_limit      = std::max<f_t>(0.0, settings.time_limit - elapsed_time);
       child_settings.iteration_limit = iter_limit;
       child_settings.cut_off =
         objective_upper_bound(child_problem, upper_bound, child_settings.dual_tol);
@@ -355,17 +279,17 @@ void strong_branch_helper(i_t start,
       std::vector<f_t> child_edge_norms      = edge_norms;
       dual::status_t status                  = dual_phase2(2,
                                           0,
-                                          lp_start_time,
+                                          tic(),
                                           child_problem,
                                           child_settings,
                                           vstatus,
                                           solution,
                                           iter,
-                                          child_edge_norms);
+                                          child_edge_norms,
+                                          work_unit_context);
 
       f_t obj = std::numeric_limits<f_t>::quiet_NaN();
       if (status == dual::status_t::DUAL_UNBOUNDED) {
-        // LP was infeasible
         obj = std::numeric_limits<f_t>::infinity();
       } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT ||
                  status == dual::status_t::CUTOFF) {
@@ -379,60 +303,36 @@ void strong_branch_helper(i_t start,
                            status);
       }
 
+      const f_t delta_obj = std::max(obj - root_obj, f_t(0.0));
       if (branch == 0) {
-        pc.strong_branch_down[k]    = std::max(obj - root_obj, 0.0);
-        dual_simplex_obj_down[k]    = std::max(obj - root_obj, 0.0);
+        pc.strong_branch_down[k]    = delta_obj;
+        dual_simplex_obj_down[k]    = delta_obj;
         dual_simplex_status_down[k] = status;
-        if (verbose) {
-          settings.log.printf("Thread id %2d remaining %d variable %d branch %d obj %e time %.2f\n",
-                              thread_id,
-                              end - 1 - k,
-                              j,
-                              branch,
-                              obj,
-                              toc(start_time));
-        }
       } else {
-        pc.strong_branch_up[k]    = std::max(obj - root_obj, 0.0);
-        dual_simplex_obj_up[k]    = std::max(obj - root_obj, 0.0);
+        pc.strong_branch_up[k]    = delta_obj;
+        dual_simplex_obj_up[k]    = delta_obj;
         dual_simplex_status_up[k] = status;
-        if (verbose) {
-          settings.log.printf(
-            "Thread id %2d remaining %d variable %d branch %d obj %e change down %e change up %e "
-            "time %.2f\n",
-            thread_id,
-            end - 1 - k,
-            j,
-            branch,
-            obj,
-            dual_simplex_obj_down[k],
-            dual_simplex_obj_up[k],
-            toc(start_time));
-        }
       }
-      // Mark the subproblem as solved so that batch PDLP removes it from the batch
+
       if (sb_view.is_valid()) {
-        // We could not mark as solved nodes hitting iteration limit in DS
-        if ((branch == 0 && is_dual_simplex_done(dual_simplex_status_down[k])) ||
-            (branch == 1 && is_dual_simplex_done(dual_simplex_status_up[k]))) {
-          sb_view.mark_solved(shared_idx);
-          if (verbose) {
-            settings.log.printf(
-              "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in "
-              "shared context\n",
-              thread_id,
-              j,
-              branch == 0 ? "down" : "up",
-              shared_idx);
-          }
-        }
+        const dual::status_t branch_status =
+          branch == 0 ? dual_simplex_status_down[k] : dual_simplex_status_up[k];
+        if (is_dual_simplex_done(branch_status)) { sb_view.mark_solved(shared_idx); }
+      }
+
+      if (verbose) {
+        settings.log.printf("Thread %d variable %d branch %d obj %e time %.2f\n",
+                            thread_id,
+                            j,
+                            branch,
+                            obj,
+                            toc(start_time));
       }
       if (toc(start_time) > settings.time_limit) { break; }
     }
-    if (toc(start_time) > settings.time_limit) { break; }
 
+    if (toc(start_time) > settings.time_limit) { break; }
     const i_t completed = pc.num_strong_branches_completed++;
-
     if (thread_id == 0 && toc(last_log) > 10) {
       last_log = tic();
       settings.log.printf("%d of %ld strong branches completed in %.1fs\n",
@@ -443,8 +343,6 @@ void strong_branch_helper(i_t start,
 
     child_problem.lower[j] = original_lp.lower[j];
     child_problem.upper[j] = original_lp.upper[j];
-
-    if (toc(start_time) > settings.time_limit) { break; }
   }
 }
 
@@ -463,8 +361,11 @@ std::pair<f_t, dual::status_t> trial_branching(const lp_problem_t<i_t, f_t>& ori
                                                f_t upper_bound,
                                                f_t start_time,
                                                i_t iter_limit,
-                                               omp_atomic_t<int64_t>& total_lp_iter)
+                                               omp_atomic_t<int64_t>& total_lp_iter,
+                                               cuopt::work_limit_context_t* work_ctx = nullptr)
 {
+  (void)var_types;
+
   lp_problem_t child_problem      = original_lp;
   child_problem.lower[branch_var] = branch_var_lower;
   child_problem.upper[branch_var] = branch_var_upper;
@@ -485,8 +386,6 @@ std::pair<f_t, dual::status_t> trial_branching(const lp_problem_t<i_t, f_t>& ori
   std::vector<i_t> child_basic_list                = basic_list;
   std::vector<i_t> child_nonbasic_list             = nonbasic_list;
   basis_update_mpf_t<i_t, f_t> child_basis_factors = basis_factors;
-
-  // Only refactor the basis if we encounter numerical issues.
   child_basis_factors.set_refactor_frequency(iter_limit);
 
   dual::status_t status = dual_phase2_with_advanced_basis(2,
@@ -501,7 +400,8 @@ std::pair<f_t, dual::status_t> trial_branching(const lp_problem_t<i_t, f_t>& ori
                                                           child_nonbasic_list,
                                                           solution,
                                                           iter,
-                                                          child_edge_norms);
+                                                          child_edge_norms,
+                                                          work_ctx);
   total_lp_iter += iter;
   settings.log.debug("Trial branching on variable %d. Lo: %e Up: %e. Iter %d. Status %s. Obj %e\n",
                      branch_var,
@@ -512,7 +412,6 @@ std::pair<f_t, dual::status_t> trial_branching(const lp_problem_t<i_t, f_t>& ori
                      compute_objective(child_problem, solution.x));
 
   if (status == dual::status_t::DUAL_UNBOUNDED) {
-    // LP was infeasible
     return {std::numeric_limits<f_t>::infinity(), dual::status_t::DUAL_UNBOUNDED};
   } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT ||
              status == dual::status_t::CUTOFF) {
@@ -522,6 +421,99 @@ std::pair<f_t, dual::status_t> trial_branching(const lp_problem_t<i_t, f_t>& ori
   }
 }
 
+// Trial-branch on branch_var with the bounds [branch_var_lower, branch_var_upper] and return
+// the resulting objective value only; the solve status from trial_branching is discarded.
+// The LP iteration limit is bnb_lp_iter_per_node clamped to
+// [lower_max_lp_iter, upper_max_lp_iter]. Iterations spent are added to total_lp_iter by
+// trial_branching. work_ctx is forwarded unchanged and may be nullptr.
+template <typename i_t, typename f_t>
+f_t trial_branching_generic(const lp_problem_t<i_t, f_t>& original_lp,
+                            const simplex_solver_settings_t<i_t, f_t>& settings,
+                            const std::vector<variable_type_t>& var_types,
+                            const std::vector<variable_status_t>& vstatus,
+                            const std::vector<f_t>& edge_norms,
+                            const basis_update_mpf_t<i_t, f_t>& basis_factors,
+                            const std::vector<i_t>& basic_list,
+                            const std::vector<i_t>& nonbasic_list,
+                            i_t branch_var,
+                            f_t branch_var_lower,
+                            f_t branch_var_upper,
+                            f_t upper_bound,
+                            i_t bnb_lp_iter_per_node,
+                            f_t start_time,
+                            i_t upper_max_lp_iter,
+                            i_t lower_max_lp_iter,
+                            omp_atomic_t<int64_t>& total_lp_iter,
+                            cuopt::work_limit_context_t* work_ctx = nullptr)
+{
+  // std::clamp has undefined behavior when the low bound exceeds the high bound.
+  assert(lower_max_lp_iter <= upper_max_lp_iter);
+  const i_t iter_limit = std::clamp(bnb_lp_iter_per_node, lower_max_lp_iter, upper_max_lp_iter);
+  return trial_branching(original_lp,
+                         settings,
+                         var_types,
+                         vstatus,
+                         edge_norms,
+                         basis_factors,
+                         basic_list,
+                         nonbasic_list,
+                         branch_var,
+                         branch_var_lower,
+                         branch_var_upper,
+                         upper_bound,
+                         start_time,
+                         iter_limit,
+                         total_lp_iter,
+                         work_ctx)
+    .first;
+}
+
+// Overload for callers that keep their LP iteration count in a plain int64_t.
+// Delegates to the omp_atomic_t overload through a local counter so the clamping logic
+// lives in one place, then adds the iterations spent to total_lp_iter.
+template <typename i_t, typename f_t>
+f_t trial_branching_generic(const lp_problem_t<i_t, f_t>& original_lp,
+                            const simplex_solver_settings_t<i_t, f_t>& settings,
+                            const std::vector<variable_type_t>& var_types,
+                            const std::vector<variable_status_t>& vstatus,
+                            const std::vector<f_t>& edge_norms,
+                            const basis_update_mpf_t<i_t, f_t>& basis_factors,
+                            const std::vector<i_t>& basic_list,
+                            const std::vector<i_t>& nonbasic_list,
+                            i_t branch_var,
+                            f_t branch_var_lower,
+                            f_t branch_var_upper,
+                            f_t upper_bound,
+                            i_t bnb_lp_iter_per_node,
+                            f_t start_time,
+                            i_t upper_max_lp_iter,
+                            i_t lower_max_lp_iter,
+                            int64_t& total_lp_iter,
+                            cuopt::work_limit_context_t* work_ctx = nullptr)
+{
+  omp_atomic_t<int64_t> atomic_iter{0};
+  const f_t objective = trial_branching_generic(original_lp,
+                                                settings,
+                                                var_types,
+                                                vstatus,
+                                                edge_norms,
+                                                basis_factors,
+                                                basic_list,
+                                                nonbasic_list,
+                                                branch_var,
+                                                branch_var_lower,
+                                                branch_var_upper,
+                                                upper_bound,
+                                                bnb_lp_iter_per_node,
+                                                start_time,
+                                                upper_max_lp_iter,
+                                                lower_max_lp_iter,
+                                                atomic_iter,
+                                                work_ctx);
+  total_lp_iter += atomic_iter.load();
+  return objective;
+}
+
 }  // namespace
 
 template <typename i_t, typename f_t>
@@ -531,25 +512,11 @@ static cuopt::mps_parser::mps_data_model_t<i_t, f_t> simplex_problem_to_mps_data
   const std::vector<f_t>& root_soln,
   std::vector<f_t>& original_root_soln_x)
 {
-  // Branch and bound has a problem of the form:
-  // minimize c^T x
-  // subject to A*x + Es = b
-  //            l <= x <= u
-  //            E_{jj} = sigma_j, where sigma_j is +1 or -1
-
-  // We need to convert this into a problem that is better for PDLP
-  // to solve. PDLP perfers inequality constraints. Thus, we want
-  // to convert the above into the problem:
-  // minimize c^T x
-  // subject to  lb <= A*x <= ub
-  //             l <= x <= u
-
   cuopt::mps_parser::mps_data_model_t<i_t, f_t> mps_model;
   int m = lp.num_rows;
   int n = lp.num_cols - new_slacks.size();
   original_root_soln_x.resize(n);
 
-  // Remove slacks from A
   dual_simplex::csc_matrix_t<i_t, f_t> A_no_slacks = lp.A;
   std::vector<i_t> cols_to_remove(lp.A.n, 0);
   for (i_t j : new_slacks) {
@@ -561,33 +528,22 @@ static cuopt::mps_parser::mps_data_model_t<i_t, f_t> simplex_problem_to_mps_data
     original_root_soln_x[j] = root_soln[j];
   }
 
-  // Convert CSC to CSR using built-in method
   dual_simplex::csr_matrix_t<i_t, f_t> csr_A(m, n, 0);
   A_no_slacks.to_compressed_row(csr_A);
-
   int nz = csr_A.row_start[m];
 
-  // Set CSR constraint matrix
   mps_model.set_csr_constraint_matrix(
     csr_A.x.data(), nz, csr_A.j.data(), nz, csr_A.row_start.data(), m + 1);
-
-  // Set objective coefficients
   mps_model.set_objective_coefficients(lp.objective.data(), n);
-
-  // The LP is already in minimization form (objective negated for max problems).
-  // Pass identity scaling so PDLP returns the raw DS-space objective directly.
   mps_model.set_objective_scaling_factor(f_t(1.0));
   mps_model.set_objective_offset(f_t(0.0));
-
-  // Set variable bounds
   mps_model.set_variable_lower_bounds(lp.lower.data(), n);
   mps_model.set_variable_upper_bounds(lp.upper.data(), n);
 
-  // Convert row sense and RHS to constraint bounds
   std::vector<f_t> constraint_lower(m);
   std::vector<f_t> constraint_upper(m);
-
   std::vector<i_t> slack_map(m, -1);
+
   for (i_t j : new_slacks) {
     const i_t col_start = lp.A.col_start[j];
     const i_t i         = lp.A.i[col_start];
@@ -595,22 +551,6 @@ static cuopt::mps_parser::mps_data_model_t<i_t, f_t> simplex_problem_to_mps_data
   }
 
   for (i_t i = 0; i < m; ++i) {
-    // Each row is of the form a_i^T x + sigma * s_i = b_i
-    // with sigma = +1 or -1
-    // and l_i <= s_i <= u_i
-    // We have that a_i^T x - b_i = -sigma * s_i
-    // If sigma = -1, then we have
-    //    a_i^T x - b_i = s_i
-    //  l_i <= a_i^T x - b_i <= u_i
-    //  l_i + b_i <= a_i^T x <= u_i + b_i
-    //
-    // If sigma = +1, then we have
-    //    a_i^T x - b_i = -s_i
-    //   -a_i^T x + b_i = s_i
-    //  l_i <= -a_i^T x + b_i <= u_i
-    //  l_i - b_i <= -a_i^T x <= u_i - b_i
-    //  -u_i + b_i <= a_i^T x <= -l_i + b_i
-
     const i_t slack = slack_map[i];
     assert(slack != -1);
     const i_t col_start   = lp.A.col_start[slack];
@@ -621,64 +561,43 @@ static cuopt::mps_parser::mps_data_model_t<i_t, f_t> simplex_problem_to_mps_data
     if (sigma == -1) {
       constraint_lower[i] = slack_lower + lp.rhs[i];
       constraint_upper[i] = slack_upper + lp.rhs[i];
-    } else if (sigma == 1) {
+    } else {
       constraint_lower[i] = -slack_upper + lp.rhs[i];
       constraint_upper[i] = -slack_lower + lp.rhs[i];
-    } else {
-      assert(sigma == 1.0 || sigma == -1.0);
     }
   }
 
   mps_model.set_constraint_lower_bounds(constraint_lower.data(), m);
   mps_model.set_constraint_upper_bounds(constraint_upper.data(), m);
   mps_model.set_maximize(false);
-
   return mps_model;
 }
 
 enum class sb_source_t { DUAL_SIMPLEX, PDLP, NONE };
 
-// Merge a single strong branching result from Dual Simplex and PDLP.
-// Rules:
-//   1. If both found optimal   -> keep DS (higher quality vertex solution)
-//   2. Else if Dual Simplex found infeasible -> declare infeasible
-//   3. Else if one is optimal -> keep the optimal one
-//   4. Else if Dual Simplex hit iteration limit -> keep DS
-//   5. Else if none converged -> NaN (original objective)
 template <typename i_t, typename f_t>
 static std::pair<f_t, sb_source_t> merge_sb_result(f_t dual_simplex_val,
                                                    dual::status_t dual_simplex_status,
                                                    f_t pdlp_dual_obj,
                                                    bool pdlp_optimal)
 {
-  // Dual simplex always maintains dual feasibility, so OPTIMAL and ITERATION_LIMIT both qualify
-
-  // Rule 1: Both optimal -> keep DS
   if (dual_simplex_status == dual::status_t::OPTIMAL && pdlp_optimal) {
     return {dual_simplex_val, sb_source_t::DUAL_SIMPLEX};
   }
-
-  // Rule 2: Dual Simplex found infeasible -> declare infeasible
   if (dual_simplex_status == dual::status_t::DUAL_UNBOUNDED) {
     return {std::numeric_limits<f_t>::infinity(), sb_source_t::DUAL_SIMPLEX};
   }
-
-  // Rule 3: Only one converged -> keep that
   if (dual_simplex_status == dual::status_t::OPTIMAL && !pdlp_optimal) {
     return {dual_simplex_val, sb_source_t::DUAL_SIMPLEX};
   }
   if (pdlp_optimal && dual_simplex_status != dual::status_t::OPTIMAL) {
     return {pdlp_dual_obj, sb_source_t::PDLP};
   }
-
-  // Rule 4: Dual Simplex hit iteration limit or work limit or cutoff -> keep DS
   if (dual_simplex_status == dual::status_t::ITERATION_LIMIT ||
       dual_simplex_status == dual::status_t::WORK_LIMIT ||
       dual_simplex_status == dual::status_t::CUTOFF) {
     return {dual_simplex_val, sb_source_t::DUAL_SIMPLEX};
   }
-
-  // Rule 5: None converged -> NaN
   return {std::numeric_limits<f_t>::quiet_NaN(), sb_source_t::NONE};
 }
 
@@ -706,23 +625,14 @@ static void batch_pdlp_strong_branching_task(
 
   f_t start_batch = tic();
   std::vector<f_t> original_root_soln_x;
-
   if (concurrent_halt.load() == 1) { return; }
 
   const auto mps_model =
     simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x);
 
   std::vector<f_t> fraction_values;
-
-  std::vector<f_t> original_root_soln_y, original_root_soln_z;
-  // TODO put back later once Chris has this part
-  /*uncrush_dual_solution(
-    original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y,
-    original_root_soln_z);*/
-
   for (i_t k = 0; k < fractional.size(); k++) {
-    const i_t j = fractional[k];
-    fraction_values.push_back(original_root_soln_x[j]);
+    fraction_values.push_back(original_root_soln_x[fractional[k]]);
   }
 
   if (concurrent_halt.load() == 1) { return; }
@@ -732,19 +642,14 @@ static void batch_pdlp_strong_branching_task(
     std::max(static_cast<f_t>(0.0), settings.time_limit - batch_elapsed_time);
   if (warm_start_remaining_time <= 0.0) { return; }
 
-  assert(!pc.pdlp_warm_cache.populated && "PDLP warm cache should not be populated at this point");
-
+  assert(!pc.pdlp_warm_cache.populated);
   if (!pc.pdlp_warm_cache.populated) {
     pdlp_solver_settings_t<i_t, f_t> ws_settings;
-    ws_settings.method               = method_t::PDLP;
-    ws_settings.presolver            = presolver_t::None;
-    ws_settings.pdlp_solver_mode     = pdlp_solver_mode_t::Stable3;
-    ws_settings.detect_infeasibility = false;
-    // Since the warm start will be used over and over again we want to maximize the chance of
-    // convergeance Batch PDLP is very compute intensive so we want to minimize the number of
-    // iterations
-    constexpr int warm_start_iteration_limit         = 500000;
-    ws_settings.iteration_limit                      = warm_start_iteration_limit;
+    ws_settings.method                               = method_t::PDLP;
+    ws_settings.presolver                            = presolver_t::None;
+    ws_settings.pdlp_solver_mode                     = pdlp_solver_mode_t::Stable3;
+    ws_settings.detect_infeasibility                 = false;
+    ws_settings.iteration_limit                      = 500000;
     ws_settings.time_limit                           = warm_start_remaining_time;
     constexpr f_t pdlp_tolerance                     = 1e-5;
     ws_settings.tolerances.relative_dual_tolerance   = pdlp_tolerance;
@@ -756,51 +661,18 @@ static void batch_pdlp_strong_branching_task(
     ws_settings.inside_mip                           = true;
     if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; }
 
-    auto start_time = std::chrono::high_resolution_clock::now();
-
     auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings);
-
-    if (verbose) {
-      auto end_time = std::chrono::high_resolution_clock::now();
-      auto duration =
-        std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
-      settings.log.printf(
-        "Original problem solved in %d milliseconds"
-        " and iterations: %d\n",
-        duration,
-        ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_);
-    }
-
-    if (ws_solution.get_termination_status() == pdlp_termination_status_t::Optimal) {
-      auto& cache           = pc.pdlp_warm_cache;
-      const auto& ws_primal = ws_solution.get_primal_solution();
-      const auto& ws_dual   = ws_solution.get_dual_solution();
-      // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm
-      // start
-      cache.initial_primal = rmm::device_uvector<f_t>(ws_primal, ws_primal.stream());
-      cache.initial_dual   = rmm::device_uvector<f_t>(ws_dual, ws_dual.stream());
-      cache.step_size      = ws_solution.get_pdlp_warm_start_data().initial_step_size_;
-      cache.primal_weight  = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_;
-      cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_;
-      cache.populated      = true;
-
-      if (verbose) {
-        settings.log.printf(
-          "Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n",
-          cache.initial_primal.size(),
-          cache.initial_dual.size(),
-          cache.step_size,
-          cache.primal_weight,
-          cache.pdlp_iteration);
-      }
-    } else {
-      if (verbose) {
-        settings.log.printf(
-          "PDLP warm start solve did not reach optimality (%s), skipping cache and batch PDLP\n",
-          ws_solution.get_termination_status_string().c_str());
-      }
-      return;
-    }
+    if (ws_solution.get_termination_status() != pdlp_termination_status_t::Optimal) { return; }
+
+    auto& cache           = pc.pdlp_warm_cache;
+    const auto& ws_primal = ws_solution.get_primal_solution();
+    const auto& ws_dual   = ws_solution.get_dual_solution();
+    cache.initial_primal  = rmm::device_uvector<f_t>(ws_primal, ws_primal.stream());
+    cache.initial_dual    = rmm::device_uvector<f_t>(ws_dual, ws_dual.stream());
+    cache.step_size       = ws_solution.get_pdlp_warm_start_data().initial_step_size_;
+    cache.primal_weight   = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_;
+    cache.pdlp_iteration  = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_;
+    cache.populated       = true;
   }
 
   if (concurrent_halt.load() == 1) { return; }
@@ -817,49 +689,37 @@ static void batch_pdlp_strong_branching_task(
   if (batch_remaining_time <= 0.0) { return; }
   pdlp_settings.time_limit = batch_remaining_time;
 
-  if (pc.pdlp_warm_cache.populated) {
-    auto& cache = pc.pdlp_warm_cache;
-    pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(),
-                                              cache.initial_primal.size(),
-                                              cache.batch_pdlp_handle.get_stream());
-    pdlp_settings.set_initial_dual_solution(
-      cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream());
-    pdlp_settings.set_initial_step_size(cache.step_size);
-    pdlp_settings.set_initial_primal_weight(cache.primal_weight);
-    pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration);
-  }
+  auto& cache = pc.pdlp_warm_cache;
+  pdlp_settings.set_initial_primal_solution(
+    cache.initial_primal.data(), cache.initial_primal.size(), cache.batch_pdlp_handle.get_stream());
+  pdlp_settings.set_initial_dual_solution(
+    cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream());
+  pdlp_settings.set_initial_step_size(cache.step_size);
+  pdlp_settings.set_initial_primal_weight(cache.primal_weight);
+  pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration);
 
   if (concurrent_halt.load() == 1) { return; }
 
   const auto solutions = batch_pdlp_solve(
     &pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings);
-  f_t batch_pdlp_strong_branching_time = toc(start_batch);
+  const f_t batch_pdlp_time = toc(start_batch);
 
-  // Fail safe in case the batch PDLP failed and produced no solutions
   if (solutions.get_additional_termination_informations().size() != fractional.size() * 2) {
     if (verbose) { settings.log.printf("Batch PDLP failed and produced no solutions\n"); }
     return;
   }
 
-  // Find max iteration on how many are done accross the batch
-  i_t max_iterations = 0;
-  i_t amount_done    = 0;
-  for (i_t k = 0; k < solutions.get_additional_termination_informations().size(); k++) {
-    max_iterations = std::max(
-      max_iterations, solutions.get_additional_termination_information(k).number_of_steps_taken);
-    // TODO batch mode infeasible: should also count as done if infeasible
-    if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) {
-      amount_done++;
-    }
-  }
-
   if (verbose) {
-    settings.log.printf(
-      "Batch PDLP strong branching completed in %.2fs. Solved %d/%d with max %d iterations\n",
-      batch_pdlp_strong_branching_time,
-      amount_done,
-      fractional.size() * 2,
-      max_iterations);
+    i_t amount_done = 0;  // batch subproblems that PDLP reported as Optimal
+    for (i_t k = 0; k < solutions.get_additional_termination_informations().size(); k++) {
+      if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) {
+        amount_done++;
+      }
+    }
+    settings.log.printf("Batch PDLP strong branching completed in %.2fs. Solved %d/%d\n",
+                        batch_pdlp_time,
+                        amount_done,
+                        static_cast<int>(fractional.size() * 2));  // %d takes int, not size_t
   }
 
   for (i_t k = 0; k < fractional.size(); k++) {
@@ -897,9 +757,7 @@ static void batch_pdlp_reliability_branching_task(
              num_candidates);
 
   f_t start_batch = tic();
-
   std::vector<f_t> original_soln_x;
-
   if (concurrent_halt.load() == 1) { return; }
 
   auto mps_model =
@@ -925,9 +783,7 @@ static void batch_pdlp_reliability_branching_task(
     std::max(static_cast<f_t>(0.0), settings.time_limit - batch_elapsed_time);
   if (batch_remaining_time <= 0.0) { return; }
 
-  // One handle per batch PDLP since there can be concurrent calls
   const raft::handle_t batch_pdlp_handle;
-
   pdlp_solver_settings_t<i_t, f_t> pdlp_settings;
   if (rb_mode == 1) {
     pdlp_settings.concurrent_halt  = &concurrent_halt;
@@ -950,8 +806,7 @@ static void batch_pdlp_reliability_branching_task(
 
   const auto solutions =
     batch_pdlp_solve(&batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings);
-
-  f_t batch_pdlp_time = toc(start_batch);
+  const f_t batch_pdlp_time = toc(start_batch);
 
   if (solutions.get_additional_termination_informations().size() !=
       static_cast<size_t>(num_candidates) * 2) {
@@ -997,7 +852,8 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                       const std::vector<i_t>& basic_list,
                       const std::vector<i_t>& nonbasic_list,
                       basis_update_mpf_t<i_t, f_t>& basis_factors,
-                      pseudo_costs_t<i_t, f_t>& pc)
+                      pseudo_costs_t<i_t, f_t>& pc,
+                      cuopt::work_limit_context_t* work_unit_context)
 {
   constexpr bool verbose = false;
 
@@ -1006,17 +862,17 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
   pc.strong_branch_up.assign(fractional.size(), 0);
   pc.num_strong_branches_completed = 0;
 
-  const f_t elapsed_time = toc(start_time);
-  if (elapsed_time > settings.time_limit) { return; }
+  if (fractional.empty()) { return; }
+  if (toc(start_time) > settings.time_limit) { return; }
 
-  // 0: no batch PDLP, 1: cooperative batch PDLP and DS, 2: batch PDLP only
+  const bool deterministic_work_accounting =
+    work_unit_context != nullptr && work_unit_context->deterministic;
+  const bool disable_batch_pdlp =
+    settings.sub_mip || settings.deterministic || deterministic_work_accounting;
   const i_t effective_batch_pdlp =
-    (settings.sub_mip || (settings.deterministic && settings.mip_batch_pdlp_strong_branching == 1))
-      ? 0
-      : settings.mip_batch_pdlp_strong_branching;
+    disable_batch_pdlp ? 0 : settings.mip_batch_pdlp_strong_branching;
 
-  if (settings.mip_batch_pdlp_strong_branching != 0 &&
-      (settings.sub_mip || settings.deterministic)) {
+  if (settings.mip_batch_pdlp_strong_branching != 0 && disable_batch_pdlp) {
     settings.log.printf(
       "Batch PDLP strong branching is disabled because sub-MIP or deterministic mode is enabled\n");
   }
@@ -1025,21 +881,15 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                       settings.num_threads,
                       fractional.size());
 
-  // Cooperative DS + PDLP: shared context tracks which subproblems are solved
-  shared_strong_branching_context_t<i_t, f_t> shared_ctx(2 * fractional.size());
-  shared_strong_branching_context_view_t<i_t, f_t> sb_view(shared_ctx.solved);
-
-  std::atomic<int> concurrent_halt{0};
-
   std::vector<f_t> pdlp_obj_down(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
   std::vector<f_t> pdlp_obj_up(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
-
   std::vector<dual::status_t> dual_simplex_status_down(fractional.size(), dual::status_t::UNSET);
   std::vector<dual::status_t> dual_simplex_status_up(fractional.size(), dual::status_t::UNSET);
   std::vector<f_t> dual_simplex_obj_down(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
   std::vector<f_t> dual_simplex_obj_up(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
-  f_t strong_branching_start_time = tic();
-  i_t simplex_iteration_limit     = settings.strong_branching_simplex_iteration_limit;
+
+  const i_t simplex_iteration_limit     = settings.strong_branching_simplex_iteration_limit;
+  const f_t strong_branching_start_time = tic();
 
   if (simplex_iteration_limit < 1) {
     initialize_pseudo_costs_with_estimate(original_lp,
@@ -1051,7 +901,63 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                                           fractional,
                                           basis_factors,
                                           pc);
+  } else if (effective_batch_pdlp == 0) {
+    const i_t n_tasks =
+      std::max<i_t>(1, std::min<i_t>(4 * settings.num_threads, fractional.size()));
+    std::vector<cuopt::work_limit_context_t> task_work_contexts;
+    if (deterministic_work_accounting) {
+      task_work_contexts.reserve(n_tasks);
+      for (i_t k = 0; k < n_tasks; ++k) {
+        task_work_contexts.emplace_back("sb_task_" + std::to_string(k));
+        task_work_contexts.back().deterministic = true;
+      }
+    }
+
+    shared_strong_branching_context_view_t<i_t, f_t> empty_sb_view;
+
+#pragma omp parallel num_threads(settings.num_threads)
+    {
+#pragma omp for schedule(dynamic, 1)
+      for (i_t k = 0; k < n_tasks; k++) {
+        const i_t start = static_cast<i_t>(k * fractional.size() / n_tasks);  // integer division
+        const i_t end   = static_cast<i_t>((k + 1) * fractional.size() / n_tasks);
+        cuopt::work_limit_context_t* task_ctx =
+          deterministic_work_accounting ? &task_work_contexts[k] : nullptr;
+        strong_branch_helper(start,
+                             end,
+                             start_time,
+                             original_lp,
+                             settings,
+                             var_types,
+                             fractional,
+                             root_solution.x,
+                             root_vstatus,
+                             edge_norms,
+                             root_obj,
+                             upper_bound,
+                             simplex_iteration_limit,
+                             pc,
+                             dual_simplex_obj_down,
+                             dual_simplex_obj_up,
+                             dual_simplex_status_down,
+                             dual_simplex_status_up,
+                             empty_sb_view,
+                             task_ctx);
+      }
+    }
+
+    if (deterministic_work_accounting) {
+      double max_work = 0.0;
+      for (auto& ctx : task_work_contexts) {
+        max_work = std::max(max_work, ctx.current_work());
+      }
+      work_unit_context->record_work_sync_on_horizon(max_work);
+    }
   } else {
+    shared_strong_branching_context_t<i_t, f_t> shared_ctx(2 * fractional.size());
+    shared_strong_branching_context_view_t<i_t, f_t> sb_view(shared_ctx.solved);
+    std::atomic<int> concurrent_halt{0};
+
 #pragma omp parallel num_threads(settings.num_threads)
     {
 #pragma omp single nowait
@@ -1074,24 +980,12 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
         }
 
         if (effective_batch_pdlp != 2) {
-          i_t n = std::min<i_t>(4 * settings.num_threads, fractional.size());
-// Here we are creating more tasks than the number of threads
-// such that they can be scheduled dynamically to the threads.
-#pragma omp taskloop num_tasks(n)
-          for (i_t k = 0; k < n; k++) {
-            i_t start = std::floor(k * fractional.size() / n);
-            i_t end   = std::floor((k + 1) * fractional.size() / n);
-
-            constexpr bool verbose = false;
-            if (verbose) {
-              settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n",
-                                  omp_get_thread_num(),
-                                  k,
-                                  start,
-                                  end,
-                                  end - start);
-            }
-
+          const i_t n_tasks =
+            std::max<i_t>(1, std::min<i_t>(4 * settings.num_threads, fractional.size()));
+#pragma omp taskloop num_tasks(n_tasks)
+          for (i_t k = 0; k < n_tasks; k++) {
+            const i_t start = static_cast<i_t>(k * fractional.size() / n_tasks);
+            const i_t end   = static_cast<i_t>((k + 1) * fractional.size() / n_tasks);
             strong_branch_helper(start,
                                  end,
                                  start_time,
@@ -1112,118 +1006,42 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                                  dual_simplex_status_up,
                                  sb_view);
           }
-          // DS done: signal PDLP to stop (time-limit or all work done) and wait
           if (effective_batch_pdlp == 1) { concurrent_halt.store(1); }
         }
-      }
-    }
-  }
 
-  settings.log.printf("Strong branching completed in %.2fs\n", toc(strong_branching_start_time));
-
-  if (verbose) {
-    // Collect Dual Simplex statistics
-    i_t dual_simplex_optimal = 0, dual_simplex_infeasible = 0, dual_simplex_iter_limit = 0;
-    i_t dual_simplex_numerical = 0, dual_simplex_cutoff = 0, dual_simplex_time_limit = 0;
-    i_t dual_simplex_concurrent = 0, dual_simplex_work_limit = 0, dual_simplex_unset = 0;
-    const i_t total_subproblems = fractional.size() * 2;
-    for (i_t k = 0; k < fractional.size(); k++) {
-      for (auto st : {dual_simplex_status_down[k], dual_simplex_status_up[k]}) {
-        switch (st) {
-          case dual::status_t::OPTIMAL: dual_simplex_optimal++; break;
-          case dual::status_t::DUAL_UNBOUNDED: dual_simplex_infeasible++; break;
-          case dual::status_t::ITERATION_LIMIT: dual_simplex_iter_limit++; break;
-          case dual::status_t::NUMERICAL: dual_simplex_numerical++; break;
-          case dual::status_t::CUTOFF: dual_simplex_cutoff++; break;
-          case dual::status_t::TIME_LIMIT: dual_simplex_time_limit++; break;
-          case dual::status_t::CONCURRENT_LIMIT: dual_simplex_concurrent++; break;
-          case dual::status_t::WORK_LIMIT: dual_simplex_work_limit++; break;
-          case dual::status_t::UNSET: dual_simplex_unset++; break;
-        }
+#pragma omp taskwait
       }
     }
-
-    settings.log.printf("Dual Simplex: %d/%d optimal, %d infeasible, %d iter-limit",
-                        dual_simplex_optimal,
-                        total_subproblems,
-                        dual_simplex_infeasible,
-                        dual_simplex_iter_limit);
-    if (dual_simplex_cutoff) settings.log.printf(", %d cutoff", dual_simplex_cutoff);
-    if (dual_simplex_time_limit) settings.log.printf(", %d time-limit", dual_simplex_time_limit);
-    if (dual_simplex_numerical) settings.log.printf(", %d numerical", dual_simplex_numerical);
-    if (dual_simplex_concurrent)
-      settings.log.printf(", %d concurrent-halt", dual_simplex_concurrent);
-    if (dual_simplex_work_limit) settings.log.printf(", %d work-limit", dual_simplex_work_limit);
-    if (dual_simplex_unset) settings.log.printf(", %d unset/skipped", dual_simplex_unset);
-    settings.log.printf("\n");
   }
 
-  if (effective_batch_pdlp != 0 && verbose) {
-    i_t pdlp_optimal_count = 0;
-    for (i_t k = 0; k < fractional.size(); k++) {
-      if (!std::isnan(pdlp_obj_down[k])) pdlp_optimal_count++;
-      if (!std::isnan(pdlp_obj_up[k])) pdlp_optimal_count++;
-    }
-
-    settings.log.printf("Batch PDLP found %d/%d optimal solutions\n",
-                        pdlp_optimal_count,
-                        static_cast<int>(fractional.size() * 2));
-  }
+  settings.log.printf("Strong branching completed in %.2fs\n", toc(strong_branching_start_time));
 
   if (effective_batch_pdlp != 0) {
-    i_t merged_from_ds   = 0;
     i_t merged_from_pdlp = 0;
-    i_t merged_nan       = 0;
-    i_t solved_by_both   = 0;
     for (i_t k = 0; k < fractional.size(); k++) {
       for (i_t branch = 0; branch < 2; branch++) {
-        const bool is_down = (branch == 0);
+        const bool is_down = branch == 0;
         f_t& sb_dest       = is_down ? pc.strong_branch_down[k] : pc.strong_branch_up[k];
-        f_t ds_obj         = is_down ? dual_simplex_obj_down[k] : dual_simplex_obj_up[k];
-        dual::status_t ds_status =
+        const f_t ds_obj   = is_down ? dual_simplex_obj_down[k] : dual_simplex_obj_up[k];
+        const dual::status_t ds_status =
           is_down ? dual_simplex_status_down[k] : dual_simplex_status_up[k];
-        f_t pdlp_obj  = is_down ? pdlp_obj_down[k] : pdlp_obj_up[k];
-        bool pdlp_has = !std::isnan(pdlp_obj);
-        bool ds_has   = ds_status != dual::status_t::UNSET;
+        const f_t pdlp_obj  = is_down ? pdlp_obj_down[k] : pdlp_obj_up[k];
+        const bool pdlp_has = !std::isnan(pdlp_obj);
 
         const auto [value, source] =
           merge_sb_result<i_t, f_t>(ds_obj, ds_status, pdlp_obj, pdlp_has);
-
         if (source == sb_source_t::PDLP || effective_batch_pdlp == 2) { sb_dest = value; }
-
-        if (source == sb_source_t::DUAL_SIMPLEX)
-          merged_from_ds++;
-        else if (source == sb_source_t::PDLP)
-          merged_from_pdlp++;
-        else
-          merged_nan++;
-
-        if (ds_has && pdlp_has && verbose) {
-          solved_by_both++;
-          settings.log.printf(
-            "[COOP SB] Merge: variable %d %s solved by BOTH (DS=%e PDLP=%e) -> kept %s\n",
-            fractional[k],
-            is_down ? "DOWN" : "UP",
-            ds_obj,
-            pdlp_obj,
-            source == sb_source_t::DUAL_SIMPLEX ? "DS" : "PDLP");
-        }
+        if (source == sb_source_t::PDLP) { merged_from_pdlp++; }
       }
     }
 
     pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root =
       (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0;
-    if (verbose) {
-      settings.log.printf(
-        "Batch PDLP for strong branching. Percent solved by batch PDLP at root: %f\n",
-        pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root);
-      settings.log.printf(
-        "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d solved by both\n",
-        merged_from_ds,
-        merged_from_pdlp,
-        merged_nan,
-        solved_by_both);
-    }
+  }
+
+  if (verbose && effective_batch_pdlp != 0) {  // the percentage is only computed with batch PDLP
+    settings.log.printf("Batch PDLP solved %.2f%% of root strong-branching subproblems\n",
+                        pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root);
   }
 
   pc.update_pseudo_costs_from_strong_branching(fractional, root_solution.x);
@@ -1235,13 +1053,13 @@ f_t pseudo_costs_t<i_t, f_t>::calculate_pseudocost_score(i_t j,
                                                          f_t pseudo_cost_up_avg,
                                                          f_t pseudo_cost_down_avg) const
 {
-  constexpr f_t eps = 1e-6;
-  i_t num_up        = pseudo_cost_num_up[j];
-  i_t num_down      = pseudo_cost_num_down[j];
-  f_t pc_up         = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg;
-  f_t pc_down       = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg;
-  f_t f_down        = solution[j] - std::floor(solution[j]);
-  f_t f_up          = std::ceil(solution[j]) - solution[j];
+  constexpr f_t eps  = 1e-6;
+  const i_t num_up   = pseudo_cost_num_up[j];
+  const i_t num_down = pseudo_cost_num_down[j];
+  const f_t pc_up    = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg;
+  const f_t pc_down  = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg;
+  const f_t f_down   = solution[j] - std::floor(solution[j]);
+  const f_t f_up     = std::ceil(solution[j]) - solution[j];
   return std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps);
 }
 
@@ -1269,6 +1087,13 @@ void pseudo_costs_t<i_t, f_t>::initialized(i_t& num_initialized_down,
                                            f_t& pseudo_cost_down_avg,
                                            f_t& pseudo_cost_up_avg) const
 {
+  num_initialized_down = 0;
+  num_initialized_up   = 0;
+  for (size_t j = 0; j < pseudo_cost_sum_down.size(); ++j) {
+    if (pseudo_cost_num_down[j] > 0) { num_initialized_down++; }
+    if (pseudo_cost_num_up[j] > 0) { num_initialized_up++; }
+  }
+
   auto avgs            = compute_pseudo_cost_averages(pseudo_cost_sum_down.data(),
                                            pseudo_cost_sum_up.data(),
                                            pseudo_cost_num_down.data(),
@@ -1299,8 +1124,8 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
              pseudo_cost_up_avg);
 
   for (i_t j : fractional) {
-    f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg);
-
+    const f_t score =
+      calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg);
     if (score > max_score) {
       max_score  = score;
       branch_var = j;
@@ -1311,7 +1136,6 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
             branch_var,
             solution[branch_var],
             max_score);
-
   return branch_var;
 }
 
@@ -1330,7 +1154,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   const lp_problem_t<i_t, f_t>& original_lp)
 {
   constexpr f_t eps                      = 1e-6;
-  f_t start_time                         = bnb_stats.start_time;
+  const f_t start_time                   = bnb_stats.start_time;
   i_t branch_var                         = fractional[0];
   f_t max_score                          = -1;
   f_t pseudo_cost_down_avg               = -1;
@@ -1346,11 +1170,9 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
 
   i_t reliable_threshold = settings.reliability_branching;
   if (reliable_threshold < 0) {
-    const i_t max_threshold            = reliability_branching_settings.max_reliable_threshold;
-    const i_t min_threshold            = reliability_branching_settings.min_reliable_threshold;
-    const f_t iter_factor              = reliability_branching_settings.bnb_lp_factor;
-    const i_t iter_offset              = reliability_branching_settings.bnb_lp_offset;
-    const int64_t alpha                = iter_factor * branch_and_bound_lp_iters;
+    const i_t max_threshold = reliability_branching_settings.max_reliable_threshold;
+    const i_t min_threshold = reliability_branching_settings.min_reliable_threshold;
+    const int64_t alpha = reliability_branching_settings.bnb_lp_factor * branch_and_bound_lp_iters;
     const int64_t max_reliability_iter = alpha + reliability_branching_settings.bnb_lp_offset;
 
     f_t iter_fraction =
@@ -1362,10 +1184,6 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
     reliable_threshold = strong_branching_lp_iter < max_reliability_iter ? reliable_threshold : 0;
   }
 
-  // If `reliable_threshold == 0`, then we set the uninitialized pseudocosts to the average.
-  // Otherwise, the best ones are initialized via strong branching, while the other are ignored.  //
-  // In the latter, we are not using the average pseudocost (which calculated in the `initialized`
-  // method).
   if (reliable_threshold == 0) {
     i_t num_initialized_up;
     i_t num_initialized_down;
@@ -1386,9 +1204,8 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
       unreliable_list.push_back(std::make_pair(-1, j));
       continue;
     }
-    f_t score =
+    const f_t score =
       calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg);
-
     if (score > max_score) {
       max_score  = score;
       branch_var = j;
@@ -1400,144 +1217,84 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
                branch_var,
                leaf_solution.x[branch_var],
                max_score);
-
     return branch_var;
   }
 
-  // 0: no batch PDLP, 1: cooperative batch PDLP and DS, 2: batch PDLP only
-  const i_t rb_mode = settings.mip_batch_pdlp_reliability_branching;
-  // We don't use batch PDLP in reliability branching if the PDLP warm start data was not filled
-  // This indicates that PDLP alone (not batched) couldn't even run at the root node
-  // So it will most likely perform poorly compared to DS
-  // It is also off if the number of candidate is very small
-  // If warm start could run but almost none of the BPDLP results were used, we also want to avoid
-  // using batch PDLP
-  constexpr i_t min_num_candidates_for_pdlp                       = 5;
+  const i_t rb_mode                         = settings.mip_batch_pdlp_reliability_branching;
+  constexpr i_t min_num_candidates_for_pdlp = 5;
   constexpr f_t min_percent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0;
-  // Batch PDLP is either forced or we use the heuristic to decide if it should be used
   const bool use_pdlp = (rb_mode == 2) || (rb_mode != 0 && !settings.sub_mip &&
                                            !settings.deterministic && pdlp_warm_cache.populated &&
                                            unreliable_list.size() > min_num_candidates_for_pdlp &&
                                            pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root >
                                              min_percent_solved_by_batch_pdlp_at_root_for_pdlp);
 
-  if (rb_mode != 0 && !pdlp_warm_cache.populated) {
-    log.printf("PDLP warm start data not populated, using DS only\n");
-  } else if (rb_mode != 0 && settings.sub_mip) {
-    log.printf("Batch PDLP reliability branching is disabled because sub-MIP is enabled\n");
-  } else if (rb_mode != 0 && settings.deterministic) {
-    log.printf(
-      "Batch PDLP reliability branching is disabled because deterministic mode is enabled\n");
-  } else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) {
-    log.printf("Not enough candidates to use batch PDLP, using DS only\n");
-  } else if (rb_mode != 0 && pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root < 5.0) {
-    log.printf("Percent solved by batch PDLP at root is too low, using DS only\n");
-  } else if (use_pdlp) {
-    log.printf(
-      "Using batch PDLP because populated, unreliable list size is %d (> %d), and percent solved "
-      "by batch PDLP at root is %f%% (> %f%%)\n",
-      static_cast<i_t>(unreliable_list.size()),
-      min_num_candidates_for_pdlp,
-      pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root,
-      min_percent_solved_by_batch_pdlp_at_root_for_pdlp);
-  }
-
-  const int num_tasks     = std::max(max_num_tasks, 10);
-  const int task_priority = reliability_branching_settings.task_priority;
-  // If both batch PDLP and DS are used we double the max number of candidates
+  const int num_tasks          = std::max(max_num_tasks, 10);
+  const int task_priority      = reliability_branching_settings.task_priority;
   const i_t max_num_candidates = use_pdlp ? 2 * reliability_branching_settings.max_num_candidates
                                           : reliability_branching_settings.max_num_candidates;
   const i_t num_candidates     = std::min<size_t>(unreliable_list.size(), max_num_candidates);
 
-  assert(task_priority > 0);
-  assert(max_num_candidates > 0);
-  assert(num_candidates > 0);
-  assert(num_tasks > 0);
-
-  log.printf(
-    "RB iters = %d, B&B iters = %d, unreliable = %d, num_tasks = %d, reliable_threshold = %d\n",
-    strong_branching_lp_iter.load(),
-    branch_and_bound_lp_iters,
-    unreliable_list.size(),
-    num_tasks,
-    reliable_threshold);
-
   if (unreliable_list.size() > max_num_candidates) {
     if (reliability_branching_settings.rank_candidates_with_dual_pivot) {
-      i_t m             = worker->leaf_problem.num_rows;
-      i_t n             = worker->leaf_problem.num_cols;
+      const i_t m       = worker->leaf_problem.num_rows;
+      const i_t n       = worker->leaf_problem.num_cols;
       f_t work_estimate = 0;
-
       std::vector<f_t> delta_z(n, 0);
       std::vector<i_t> workspace(n, 0);
-
       std::vector<i_t> basic_map(n, -1);
+      std::vector<i_t> nonbasic_mark(n, -1);
       for (i_t i = 0; i < m; i++) {
         basic_map[worker->basic_list[i]] = i;
       }
-
-      std::vector<i_t> nonbasic_mark(n, -1);
       for (i_t i = 0; i < n - m; i++) {
         nonbasic_mark[worker->nonbasic_list[i]] = i;
       }
-
       for (auto& [score, j] : unreliable_list) {
         if (pseudo_cost_num_down[j] == 0 || pseudo_cost_num_up[j] == 0) {
-          // Estimate the objective change by performing a single pivot of dual simplex.
-          objective_change_estimate_t<f_t> estimate =
-            single_pivot_objective_change_estimate(worker->leaf_problem,
-                                                   settings,
-                                                   AT,
-                                                   node_ptr->vstatus,
-                                                   j,
-                                                   basic_map[j],
-                                                   leaf_solution,
-                                                   worker->basic_list,
-                                                   worker->nonbasic_list,
-                                                   nonbasic_mark,
-                                                   worker->basis_factors,
-                                                   workspace,
-                                                   delta_z,
-                                                   work_estimate);
-
+          auto estimate = single_pivot_objective_change_estimate(worker->leaf_problem,
+                                                                 settings,
+                                                                 AT,
+                                                                 node_ptr->vstatus,
+                                                                 j,
+                                                                 basic_map[j],
+                                                                 leaf_solution,
+                                                                 worker->basic_list,
+                                                                 worker->nonbasic_list,
+                                                                 nonbasic_mark,
+                                                                 worker->basis_factors,
+                                                                 workspace,
+                                                                 delta_z,
+                                                                 work_estimate);
           score = std::max(estimate.up_obj_change, eps) * std::max(estimate.down_obj_change, eps);
         } else {
-          // Use the previous score, even if it is unreliable
           score = calculate_pseudocost_score(
             j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg);
         }
       }
     } else {
-      f_t high = max_score > 0 ? max_score : 1;
-      f_t low  = 0;
-
+      const f_t high = max_score > 0 ? max_score : 1;
       for (auto& [score, j] : unreliable_list) {
-        if (score == -1) { score = worker->rng.uniform(low, high); }
+        (void)j;
+        if (score == -1) { score = worker->rng.uniform(f_t{0}, high); }
       }
     }
 
-    // We only need to get the top-k elements in the list, where
-    // k = num_candidates
     std::partial_sort(unreliable_list.begin(),
                       unreliable_list.begin() + num_candidates,
                       unreliable_list.end(),
                       [](auto el1, auto el2) { return el1.first > el2.first; });
   }
 
-  // Both DS and PDLP work on the same candidate set
   std::vector<i_t> candidate_vars(num_candidates);
   for (i_t i = 0; i < num_candidates; i++) {
     candidate_vars[i] = unreliable_list[i].second;
   }
 
-  // Shared context for cooperative work-stealing (mode 1)
-  // [0..num_candidates) = down, [num_candidates..2*num_candidates) = up
   shared_strong_branching_context_t<i_t, f_t> shared_ctx(2 * num_candidates);
   shared_strong_branching_context_view_t<i_t, f_t> sb_view(shared_ctx.solved);
-
   std::vector<f_t> pdlp_obj_down(num_candidates, std::numeric_limits<f_t>::quiet_NaN());
   std::vector<f_t> pdlp_obj_up(num_candidates, std::numeric_limits<f_t>::quiet_NaN());
-
   std::atomic<int> concurrent_halt{0};
 
   if (use_pdlp) {
@@ -1560,7 +1317,6 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   }
 
   if (toc(start_time) > settings.time_limit) {
-    log.printf("Time limit reached\n");
     if (use_pdlp) {
       concurrent_halt.store(1);
 #pragma omp taskwait
@@ -1573,8 +1329,6 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   std::vector<dual::status_t> dual_simplex_status_down(num_candidates, dual::status_t::UNSET);
   std::vector<dual::status_t> dual_simplex_status_up(num_candidates, dual::status_t::UNSET);
 
-  f_t dual_simplex_start_time = tic();
-
   if (rb_mode != 2) {
 #pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \
   shared(score_mutex,                                                                \
@@ -1589,14 +1343,10 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
 
       if (toc(start_time) > settings.time_limit) { continue; }
 
-      if (rb_mode == 1 && sb_view.is_solved(i)) {
-        log.printf(
-          "DS skipping variable %d branch down (shared_idx %d): already solved by PDLP\n", j, i);
-      } else {
+      if (!(rb_mode == 1 && sb_view.is_solved(i))) {
         pseudo_cost_mutex_down[j].lock();
         if (pseudo_cost_num_down[j] < reliable_threshold) {
-          // Do trial branching on the down branch
-          const auto [obj, status] = trial_branching(worker->leaf_problem,
+          const auto [obj, status]    = trial_branching(worker->leaf_problem,
                                                      settings,
                                                      var_types,
                                                      node_ptr->vstatus,
@@ -1611,20 +1361,17 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
                                                      start_time,
                                                      iter_limit_per_trial,
                                                      strong_branching_lp_iter);
-
           dual_simplex_obj_down[i]    = obj;
           dual_simplex_status_down[i] = status;
           if (!std::isnan(obj)) {
-            f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps);
-            f_t change_in_x   = leaf_solution.x[j] - std::floor(leaf_solution.x[j]);
+            const f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps);
+            const f_t change_in_x   = leaf_solution.x[j] - std::floor(leaf_solution.x[j]);
             pseudo_cost_sum_down[j] += change_in_obj / change_in_x;
             pseudo_cost_num_down[j]++;
-            // Should be valid if were are already here
             if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(i); }
           }
-        } else {
-          // Variable became reliable, make it as solved so that batch PDLP does not solve it again
-          if (rb_mode == 1) sb_view.mark_solved(i);
+        } else if (rb_mode == 1) {
+          sb_view.mark_solved(i);
         }
         pseudo_cost_mutex_down[j].unlock();
       }
@@ -1632,14 +1379,10 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
       if (toc(start_time) > settings.time_limit) { continue; }
 
       const i_t shared_idx = i + num_candidates;
-      if (rb_mode == 1 && sb_view.is_solved(shared_idx)) {
-        log.printf("DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n",
-                   j,
-                   shared_idx);
-      } else {
+      if (!(rb_mode == 1 && sb_view.is_solved(shared_idx))) {
         pseudo_cost_mutex_up[j].lock();
         if (pseudo_cost_num_up[j] < reliable_threshold) {
-          const auto [obj, status] = trial_branching(worker->leaf_problem,
+          const auto [obj, status]  = trial_branching(worker->leaf_problem,
                                                      settings,
                                                      var_types,
                                                      node_ptr->vstatus,
@@ -1654,20 +1397,17 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
                                                      start_time,
                                                      iter_limit_per_trial,
                                                      strong_branching_lp_iter);
-
           dual_simplex_obj_up[i]    = obj;
           dual_simplex_status_up[i] = status;
           if (!std::isnan(obj)) {
-            f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps);
-            f_t change_in_x   = std::ceil(leaf_solution.x[j]) - leaf_solution.x[j];
+            const f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps);
+            const f_t change_in_x   = std::ceil(leaf_solution.x[j]) - leaf_solution.x[j];
             pseudo_cost_sum_up[j] += change_in_obj / change_in_x;
             pseudo_cost_num_up[j]++;
-            // Should be valid if were are already here
             if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(shared_idx); }
           }
-        } else {
-          // Variable became reliable, make it as solved so that batch PDLP does not solve it again
-          if (rb_mode == 1) sb_view.mark_solved(shared_idx);
+        } else if (rb_mode == 1) {
+          sb_view.mark_solved(shared_idx);
         }
         pseudo_cost_mutex_up[j].unlock();
       }
@@ -1676,7 +1416,6 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
 
       score =
         calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg);
-
       score_mutex.lock();
       if (score > max_score) {
         max_score  = score;
@@ -1688,92 +1427,232 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
     concurrent_halt.store(1);
   }
 
-  f_t dual_simplex_elapsed = toc(dual_simplex_start_time);
-
-  // TODO put back
-  // if (rb_mode != 2) {
-  //  if (rb_mode == 1) {
-  //    log.printf(
-  //      "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped
-  //      (PDLP) in %.2fs\n", num_candidates, dual_simplex_optimal.load(), num_candidates * 2,
-  //      dual_simplex_infeasible.load(), num_candidates * 2,
-  //      dual_simplex_failed.load(), num_candidates * 2,
-  //      dual_simplex_skipped.load(), dual_simplex_elapsed);
-  //  } else {
-  //    log.printf(
-  //      "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in
-  //      %.2fs\n", num_candidates, dual_simplex_optimal.load(), num_candidates * 2,
-  //      dual_simplex_infeasible.load(), num_candidates * 2, dual_simplex_failed.load(),
-  //      num_candidates * 2, dual_simplex_elapsed);
-  //  }
-  //}
-
   if (use_pdlp) {
 #pragma omp taskwait
-
-    i_t pdlp_applied = 0;
-    i_t pdlp_optimal = 0;
     for (i_t i = 0; i < num_candidates; i++) {
       const i_t j = candidate_vars[i];
 
-      // Down: check if PDLP should override DS
       if (!std::isnan(pdlp_obj_down[i])) {
-        pdlp_optimal++;
         const auto [merged_obj, source] = merge_sb_result<i_t, f_t>(
           dual_simplex_obj_down[i], dual_simplex_status_down[i], pdlp_obj_down[i], true);
-        // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent
-        // calls may have made it reliable)
         if (source == sb_source_t::PDLP) {
           pseudo_cost_mutex_down[j].lock();
           if (pseudo_cost_num_down[j] < reliable_threshold) {
-            f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps);
-            f_t change_in_x   = leaf_solution.x[j] - std::floor(leaf_solution.x[j]);
+            const f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps);
+            const f_t change_in_x   = leaf_solution.x[j] - std::floor(leaf_solution.x[j]);
             pseudo_cost_sum_down[j] += change_in_obj / change_in_x;
             pseudo_cost_num_down[j]++;
-            pdlp_applied++;
           }
           pseudo_cost_mutex_down[j].unlock();
         }
       }
 
-      // Up: check if PDLP should override DS
       if (!std::isnan(pdlp_obj_up[i])) {
-        pdlp_optimal++;
         const auto [merged_obj, source] = merge_sb_result<i_t, f_t>(
           dual_simplex_obj_up[i], dual_simplex_status_up[i], pdlp_obj_up[i], true);
-        // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent
-        // calls may have made it reliable)
         if (source == sb_source_t::PDLP) {
           pseudo_cost_mutex_up[j].lock();
           if (pseudo_cost_num_up[j] < reliable_threshold) {
-            f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps);
-            f_t change_in_x   = std::ceil(leaf_solution.x[j]) - leaf_solution.x[j];
+            const f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps);
+            const f_t change_in_x   = std::ceil(leaf_solution.x[j]) - leaf_solution.x[j];
             pseudo_cost_sum_up[j] += change_in_obj / change_in_x;
             pseudo_cost_num_up[j]++;
-            pdlp_applied++;
           }
           pseudo_cost_mutex_up[j].unlock();
         }
       }
 
-      f_t score =
+      const f_t score =
         calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg);
       if (score > max_score) {
         max_score  = score;
         branch_var = j;
       }
     }
-
-    log.printf("RB batch PDLP: %d candidates, %d/%d optimal, %d applied to pseudo-costs\n",
-               num_candidates,
-               pdlp_optimal,
-               num_candidates * 2,
-               pdlp_applied);
   }
 
   log.printf(
     "pc branching on %d. Value %e. Score %e\n", branch_var, leaf_solution.x[branch_var], max_score);
+  return branch_var;
+}
+
+// Reliability branching on raw pseudocost arrays. Reliable fractional variables are scored from
+// their pseudocosts; up to rb_settings.max_num_candidates unreliable ones are first refreshed by
+// trial-branching solves and then scored. Returns the index of the best-scoring variable.
+template <typename i_t, typename f_t, typename SumT, typename CountT, typename SBIterT>
+i_t reliable_variable_selection_core(mip_node_t<i_t, f_t>* node_ptr,
+                                     const std::vector<i_t>& fractional,
+                                     const std::vector<f_t>& solution,
+                                     const simplex_solver_settings_t<i_t, f_t>& settings,
+                                     const std::vector<variable_type_t>& var_types,
+                                     const lp_problem_t<i_t, f_t>& leaf_problem,
+                                     const std::vector<f_t>& edge_norms,
+                                     const basis_update_mpf_t<i_t, f_t>& basis_factors,
+                                     const std::vector<i_t>& basic_list,
+                                     const std::vector<i_t>& nonbasic_list,
+                                     SumT* sum_down,
+                                     SumT* sum_up,
+                                     CountT* num_down,
+                                     CountT* num_up,
+                                     i_t n_vars,
+                                     SBIterT& strong_branching_lp_iter,
+                                     f_t upper_bound,
+                                     int64_t bnb_lp_iters,
+                                     int64_t bnb_nodes_explored,
+                                     f_t start_time,
+                                     const reliability_branching_settings_t<i_t, f_t>& rb_settings,
+                                     int num_tasks,
+                                     omp_mutex_t* var_mutex_down,  // may be null: no locking
+                                     omp_mutex_t* var_mutex_up,    // may be null: no locking
+                                     pcgenerator_t* rng,           // must be non-null
+                                     cuopt::work_limit_context_t* work_ctx,
+                                     const sb_update_callback_t<i_t, f_t>& on_sb_update)
+{
+  constexpr f_t eps = 1e-6;  // floor applied to objective deltas and score factors
+  i_t branch_var    = fractional[0];
+  f_t max_score     = -1;
+  // The averages stand in for variables that have no recorded branching in a direction.
+  auto avgs = compute_pseudo_cost_averages(sum_down, sum_up, num_down, num_up, (size_t)n_vars);
+  const f_t pseudo_cost_down_avg = avgs.down_avg;
+  const f_t pseudo_cost_up_avg   = avgs.up_avg;
+
+  const i_t bnb_lp_iter_per_node =
+    bnb_nodes_explored > 0 ? (i_t)(bnb_lp_iters / bnb_nodes_explored) : 0;
+  // A negative setting requests an adaptive threshold based on strong-branching iteration use.
+  i_t reliable_threshold = settings.reliability_branching;
+  if (reliable_threshold < 0) {
+    const int64_t alpha                = (int64_t)(rb_settings.bnb_lp_factor * bnb_lp_iters);
+    const int64_t max_reliability_iter = alpha + rb_settings.bnb_lp_offset;
+
+    f_t iter_fraction =
+      (max_reliability_iter - strong_branching_lp_iter) / (strong_branching_lp_iter + 1.0);
+    iter_fraction = std::min(1.0, iter_fraction);
+    iter_fraction = std::max((alpha - strong_branching_lp_iter) / (strong_branching_lp_iter + 1.0),
+                             iter_fraction);
+    reliable_threshold = (int)((1 - iter_fraction) * rb_settings.min_reliable_threshold +
+                               iter_fraction * rb_settings.max_reliable_threshold);
+    reliable_threshold = strong_branching_lp_iter < max_reliability_iter ? reliable_threshold : 0;
+  }
 
+  std::vector<i_t> unreliable_list;
+  for (i_t j : fractional) {
+    if (num_down[j] < reliable_threshold || num_up[j] < reliable_threshold) {
+      unreliable_list.push_back(j);
+      continue;
+    }
+    const f_t pc_down = num_down[j] > 0 ? sum_down[j] / num_down[j] : pseudo_cost_down_avg;
+    const f_t pc_up   = num_up[j] > 0 ? sum_up[j] / num_up[j] : pseudo_cost_up_avg;
+    const f_t f_down  = solution[j] - std::floor(solution[j]);
+    const f_t f_up    = std::ceil(solution[j]) - solution[j];
+    const f_t score   = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps);
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+    }
+  }
+  // Nothing needs strong branching: the best reliable variable is the answer.
+  if (unreliable_list.empty()) {
+    settings.log.debug(
+      "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score);
+    return branch_var;
+  }
+
+  const i_t max_num_candidates = rb_settings.max_num_candidates;
+  const int task_priority      = rb_settings.task_priority;
+  const i_t num_candidates     = std::min<size_t>(unreliable_list.size(), max_num_candidates);
+
+  cuopt_assert(rng != nullptr, "rng must be provided for candidate shuffling");
+  if (unreliable_list.size() > (size_t)max_num_candidates) { rng->shuffle(unreliable_list); }
+  if (toc(start_time) > settings.time_limit) { return branch_var; }
+  omp_mutex_t score_mutex;
+  // max_score/branch_var must be shared; as locals they would default to firstprivate copies.
+#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \
+  shared(score_mutex, strong_branching_lp_iter, max_score, branch_var)
+  for (i_t i = 0; i < num_candidates; ++i) {
+    const i_t j = unreliable_list[i];
+    if (toc(start_time) > settings.time_limit) { continue; }
+    // Down direction: trial-branch only while j is still below the reliability threshold.
+    if (var_mutex_down) { var_mutex_down[j].lock(); }
+    if (num_down[j] < reliable_threshold) {
+      const f_t obj = trial_branching_generic(leaf_problem,
+                                              settings,
+                                              var_types,
+                                              node_ptr->vstatus,
+                                              edge_norms,
+                                              basis_factors,
+                                              basic_list,
+                                              nonbasic_list,
+                                              j,
+                                              leaf_problem.lower[j],
+                                              std::floor(solution[j]),
+                                              upper_bound,
+                                              bnb_lp_iter_per_node,
+                                              start_time,
+                                              rb_settings.upper_max_lp_iter,
+                                              rb_settings.lower_max_lp_iter,
+                                              strong_branching_lp_iter,
+                                              work_ctx);
+      if (!std::isnan(obj)) {
+        const f_t delta =
+          std::max(obj - node_ptr->lower_bound, eps) / (solution[j] - std::floor(solution[j]));
+        sum_down[j] += delta;
+        num_down[j]++;
+        if (on_sb_update) { on_sb_update(j, rounding_direction_t::DOWN, delta); }
+      }
+    }
+    if (var_mutex_down) { var_mutex_down[j].unlock(); }
+    if (toc(start_time) > settings.time_limit) { continue; }
+    // Up direction, mirroring the down case.
+    if (var_mutex_up) { var_mutex_up[j].lock(); }
+    if (num_up[j] < reliable_threshold) {
+      const f_t obj = trial_branching_generic(leaf_problem,
+                                              settings,
+                                              var_types,
+                                              node_ptr->vstatus,
+                                              edge_norms,
+                                              basis_factors,
+                                              basic_list,
+                                              nonbasic_list,
+                                              j,
+                                              std::ceil(solution[j]),
+                                              leaf_problem.upper[j],
+                                              upper_bound,
+                                              bnb_lp_iter_per_node,
+                                              start_time,
+                                              rb_settings.upper_max_lp_iter,
+                                              rb_settings.lower_max_lp_iter,
+                                              strong_branching_lp_iter,
+                                              work_ctx);
+      if (!std::isnan(obj)) {
+        const f_t delta =
+          std::max(obj - node_ptr->lower_bound, eps) / (std::ceil(solution[j]) - solution[j]);
+        sum_up[j] += delta;
+        num_up[j]++;
+        if (on_sb_update) { on_sb_update(j, rounding_direction_t::UP, delta); }
+      }
+    }
+    if (var_mutex_up) { var_mutex_up[j].unlock(); }
+    if (toc(start_time) > settings.time_limit) { continue; }
+    // Score j from its refreshed pseudocosts, falling back to the averages when a count is 0.
+    const f_t pc_down = num_down[j] > 0 ? sum_down[j] / num_down[j] : pseudo_cost_down_avg;
+    const f_t pc_up   = num_up[j] > 0 ? sum_up[j] / num_up[j] : pseudo_cost_up_avg;
+    const f_t f_down  = solution[j] - std::floor(solution[j]);
+    const f_t f_up    = std::ceil(solution[j]) - solution[j];
+    const f_t score   = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps);
+
+    score_mutex.lock();
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+    }
+    score_mutex.unlock();
+  }
+
+  settings.log.debug("Reliability branching result: node=%d branch_var=%d value=%e score=%e\n",
+                     node_ptr->node_id,
+                     branch_var,
+                     solution[branch_var],
+                     max_score);
   return branch_var;
 }
 
@@ -1783,24 +1662,20 @@ f_t pseudo_costs_t<i_t, f_t>::obj_estimate(const std::vector<i_t>& fractional,
                                            f_t lower_bound,
                                            logger_t& log)
 {
-  const i_t num_fractional = fractional.size();
-  f_t estimate             = lower_bound;
-
+  f_t estimate = lower_bound;
   i_t num_initialized_down;
   i_t num_initialized_up;
   f_t pseudo_cost_down_avg;
   f_t pseudo_cost_up_avg;
-
   initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
 
   for (i_t j : fractional) {
-    constexpr f_t eps = 1e-6;
-    i_t num_up        = pseudo_cost_num_up[j];
-    i_t num_down      = pseudo_cost_num_down[j];
-    f_t pc_up         = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg;
-    f_t pc_down       = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg;
-    f_t f_down        = solution[j] - std::floor(solution[j]);
-    f_t f_up          = std::ceil(solution[j]) - solution[j];
+    const i_t num_up   = pseudo_cost_num_up[j];
+    const i_t num_down = pseudo_cost_num_down[j];
+    const f_t pc_up    = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg;
+    const f_t pc_down  = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg;
+    const f_t f_down   = solution[j] - std::floor(solution[j]);
+    const f_t f_up     = std::ceil(solution[j]) - solution[j];
     estimate += std::min(pc_down * f_down, pc_up * f_up);
   }
 
@@ -1814,20 +1689,15 @@ void pseudo_costs_t<i_t, f_t>::update_pseudo_costs_from_strong_branching(
 {
   for (i_t k = 0; k < fractional.size(); k++) {
     const i_t j = fractional[k];
-    for (i_t branch = 0; branch < 2; branch++) {
-      if (branch == 0) {
-        f_t change_in_obj = strong_branch_down[k];
-        if (std::isnan(change_in_obj)) { continue; }
-        f_t frac = root_soln[j] - std::floor(root_soln[j]);
-        pseudo_cost_sum_down[j] += change_in_obj / frac;
-        pseudo_cost_num_down[j]++;
-      } else {
-        f_t change_in_obj = strong_branch_up[k];
-        if (std::isnan(change_in_obj)) { continue; }
-        f_t frac = std::ceil(root_soln[j]) - root_soln[j];
-        pseudo_cost_sum_up[j] += change_in_obj / frac;
-        pseudo_cost_num_up[j]++;
-      }
+    if (!std::isnan(strong_branch_down[k])) {
+      const f_t frac = root_soln[j] - std::floor(root_soln[j]);
+      pseudo_cost_sum_down[j] += strong_branch_down[k] / frac;
+      pseudo_cost_num_down[j]++;
+    }
+    if (!std::isnan(strong_branch_up[k])) {
+      const f_t frac = std::ceil(root_soln[j]) - root_soln[j];
+      pseudo_cost_sum_up[j] += strong_branch_up[k] / frac;
+      pseudo_cost_num_up[j]++;
     }
   }
 }
@@ -1836,6 +1706,68 @@ void pseudo_costs_t<i_t, f_t>::update_pseudo_costs_from_strong_branching(
 
 template class pseudo_costs_t<int, double>;
 
+template int reliable_variable_selection_core<int,
+                                              double,
+                                              omp_atomic_t<double>,
+                                              omp_atomic_t<int>,
+                                              omp_atomic_t<int64_t>>(
+  mip_node_t<int, double>*,
+  const std::vector<int>&,
+  const std::vector<double>&,
+  const simplex_solver_settings_t<int, double>&,
+  const std::vector<variable_type_t>&,
+  const lp_problem_t<int, double>&,
+  const std::vector<double>&,
+  const basis_update_mpf_t<int, double>&,
+  const std::vector<int>&,
+  const std::vector<int>&,
+  omp_atomic_t<double>*,
+  omp_atomic_t<double>*,
+  omp_atomic_t<int>*,
+  omp_atomic_t<int>*,
+  int,
+  omp_atomic_t<int64_t>&,
+  double,
+  int64_t,
+  int64_t,
+  double,
+  const reliability_branching_settings_t<int, double>&,
+  int,
+  omp_mutex_t*,
+  omp_mutex_t*,
+  pcgenerator_t*,
+  cuopt::work_limit_context_t*,
+  const sb_update_callback_t<int, double>&);
+
+template int reliable_variable_selection_core<int, double, double, int, int64_t>(
+  mip_node_t<int, double>*,
+  const std::vector<int>&,
+  const std::vector<double>&,
+  const simplex_solver_settings_t<int, double>&,
+  const std::vector<variable_type_t>&,
+  const lp_problem_t<int, double>&,
+  const std::vector<double>&,
+  const basis_update_mpf_t<int, double>&,
+  const std::vector<int>&,
+  const std::vector<int>&,
+  double*,
+  double*,
+  int*,
+  int*,
+  int,
+  int64_t&,
+  double,
+  int64_t,
+  int64_t,
+  double,
+  const reliability_branching_settings_t<int, double>&,
+  int,
+  omp_mutex_t*,
+  omp_mutex_t*,
+  pcgenerator_t*,
+  cuopt::work_limit_context_t*,
+  const sb_update_callback_t<int, double>&);
+
 template void strong_branching<int, double>(const lp_problem_t<int, double>& original_lp,
                                             const simplex_solver_settings_t<int, double>& settings,
                                             double start_time,
@@ -1850,7 +1782,8 @@ template void strong_branching<int, double>(const lp_problem_t<int, double>& ori
                                             const std::vector<int>& basic_list,
                                             const std::vector<int>& nonbasic_list,
                                             basis_update_mpf_t<int, double>& basis_factors,
-                                            pseudo_costs_t<int, double>& pc);
+                                            pseudo_costs_t<int, double>& pc,
+                                            cuopt::work_limit_context_t* work_unit_context);
 
 #endif
 
diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp
index 009bd8b81a..6393a8cd41 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.hpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.hpp
@@ -17,12 +17,14 @@
 
 #include <utilities/omp_helpers.hpp>
 #include <utilities/pcgenerator.hpp>
+#include <utilities/work_limit_context.hpp>
 
 #include <omp.h>
 #include <cmath>
 #include <rmm/device_uvector.hpp>
 
 #include <cstdint>
+#include <functional>
 #include <limits>
 
 namespace cuopt::linear_programming::dual_simplex {
@@ -357,6 +359,13 @@ class pseudo_cost_snapshot_t {
     }
   }
 
+  // Record an update already applied to the arrays (e.g. by strong branching); only queues it.
+  void record_update(
+    i_t variable, rounding_direction_t direction, f_t delta, double clock, int worker_id)
+  {
+    updates_.push_back({variable, direction, delta, clock, worker_id});
+  }
+
   std::vector<pseudo_cost_update_t<i_t, f_t>> take_updates()
   {
     std::vector<pseudo_cost_update_t<i_t, f_t>> result;
@@ -370,6 +379,7 @@ class pseudo_cost_snapshot_t {
   std::vector<f_t> sum_up_;
   std::vector<i_t> num_down_;
   std::vector<i_t> num_up_;
+  int64_t strong_branching_lp_iter_{0};
 
  private:
   std::vector<pseudo_cost_update_t<i_t, f_t>> updates_;
@@ -452,8 +462,10 @@ class pseudo_costs_t {
       nd[j] = pseudo_cost_num_down[j];
       nu[j] = pseudo_cost_num_up[j];
     }
-    return pseudo_cost_snapshot_t<i_t, f_t>(
-      std::move(sd), std::move(su), std::move(nd), std::move(nu));
+    auto snap =
+      pseudo_cost_snapshot_t<i_t, f_t>(std::move(sd), std::move(su), std::move(nd), std::move(nu));
+    snap.strong_branching_lp_iter_ = strong_branching_lp_iter.load();
+    return snap;
   }
 
   void merge_updates(const std::vector<pseudo_cost_update_t<i_t, f_t>>& updates)
@@ -541,6 +553,44 @@ class pseudo_costs_t {
   batch_pdlp_warm_cache_t<i_t, f_t> pdlp_warm_cache;
 };
 
+// Invoked after each strong-branching pseudocost discovery with (variable, direction, delta).
+template <typename i_t, typename f_t>
+using sb_update_callback_t =
+  std::function<void(i_t variable, rounding_direction_t direction, f_t delta)>;
+
+// Core reliability branching loop usable by both opportunistic and deterministic paths.
+// When num_tasks == 1, runs serially with no locking (deterministic).
+// When num_tasks > 1 with mutexes/rng, uses OMP taskloop (opportunistic).
+// SumT/CountT/SBIterT: plain f_t/i_t/int64_t (deterministic snapshot) or omp_atomic_t<> of them.
+template <typename i_t, typename f_t, typename SumT, typename CountT, typename SBIterT>
+i_t reliable_variable_selection_core(mip_node_t<i_t, f_t>* node_ptr,
+                                     const std::vector<i_t>& fractional,
+                                     const std::vector<f_t>& solution,
+                                     const simplex_solver_settings_t<i_t, f_t>& settings,
+                                     const std::vector<variable_type_t>& var_types,
+                                     const lp_problem_t<i_t, f_t>& leaf_problem,
+                                     const std::vector<f_t>& edge_norms,
+                                     const basis_update_mpf_t<i_t, f_t>& basis_factors,
+                                     const std::vector<i_t>& basic_list,
+                                     const std::vector<i_t>& nonbasic_list,
+                                     SumT* sum_down,
+                                     SumT* sum_up,
+                                     CountT* num_down,
+                                     CountT* num_up,
+                                     i_t n_vars,
+                                     SBIterT& strong_branching_lp_iter,
+                                     f_t upper_bound,
+                                     int64_t bnb_lp_iters,
+                                     int64_t bnb_nodes_explored,
+                                     f_t start_time,
+                                     const reliability_branching_settings_t<i_t, f_t>& rb_settings,
+                                     int num_tasks,
+                                     omp_mutex_t* var_mutex_down,
+                                     omp_mutex_t* var_mutex_up,
+                                     pcgenerator_t* rng,
+                                     cuopt::work_limit_context_t* work_ctx              = nullptr,
+                                     const sb_update_callback_t<i_t, f_t>& on_sb_update = {});
+
 template <typename i_t, typename f_t>
 void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                       const simplex_solver_settings_t<i_t, f_t>& settings,
@@ -556,6 +606,7 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                       const std::vector<i_t>& basic_list,
                       const std::vector<i_t>& nonbasic_list,
                       basis_update_mpf_t<i_t, f_t>& basis_factors,
-                      pseudo_costs_t<i_t, f_t>& pc);
+                      pseudo_costs_t<i_t, f_t>& pc,
+                      cuopt::work_limit_context_t* work_unit_context = nullptr);
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp
index 9c56ada50e..69ac7e43df 100644
--- a/cpp/src/dual_simplex/basis_updates.cpp
+++ b/cpp/src/dual_simplex/basis_updates.cpp
@@ -2202,7 +2202,7 @@ i_t basis_update_mpf_t<i_t, f_t>::update(const sparse_vector_t<i_t, f_t>& utilde
 
   // Ensure the workspace is sorted. Otherwise, the sparse dot will be incorrect.
   std::sort(xi_workspace_.begin() + m, xi_workspace_.begin() + m + nz, std::less<i_t>());
-  work_estimate_ += (m + nz) * std::log2(m + nz);
+  if (nz > 1) { work_estimate_ += (nz)*std::log2((f_t)(nz)); }
 
   // Gather the workspace into a column of S
   i_t S_start;
@@ -2214,7 +2214,7 @@ i_t basis_update_mpf_t<i_t, f_t>::update(const sparse_vector_t<i_t, f_t>& utilde
 
   // Gather etilde into a column of S
   etilde.sort();  // Needs to be sorted for the sparse dot. TODO(CMM): Is etilde sorted on input?
-  work_estimate_ += etilde.i.size() * std::log2(etilde.i.size());
+  if (etilde.i.size() > 1) { work_estimate_ += etilde.i.size() * std::log2((f_t)etilde.i.size()); }
   S_.append_column(etilde);
   work_estimate_ += 4 * etilde.i.size();
 
diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp
index e30b067398..d9abc26fe1 100644
--- a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp
+++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp
@@ -235,7 +235,7 @@ void bound_flipping_ratio_test_t<i_t, f_t>::heap_passes(const std::vector<i_t>&
     // Remove minimum ratio from the heap and rebalance
     i_t heap_index = bare_idx.front();
     std::pop_heap(bare_idx.begin(), bare_idx.end(), compare);
-    work_estimate_ += 2 * std::log2(bare_idx.size());
+    if (bare_idx.size() > 1) { work_estimate_ += 2 * std::log2((f_t)bare_idx.size()); }
     bare_idx.pop_back();
 
     nonbasic_entering = current_indicies[heap_index];
diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp
index 244ff334df..4b62c66771 100644
--- a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp
+++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp
@@ -100,7 +100,7 @@ class bound_flipping_ratio_test_t {
   i_t n_;
   i_t m_;
 
-  f_t work_estimate_;
+  f_t work_estimate_{0.0};
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp
index 5b1130796e..0e841fe22f 100644
--- a/cpp/src/dual_simplex/phase2.cpp
+++ b/cpp/src/dual_simplex/phase2.cpp
@@ -3551,7 +3551,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 
       phase2_work_estimate += ft.work_estimate();
       ft.clear_work_estimate();
-      work_unit_context->record_work_sync_on_horizon(phase2_work_estimate / 1e8);
+      if (work_unit_context) {
+        work_unit_context->record_work_sync_on_horizon(phase2_work_estimate / 1e8);
+      }
       phase2_work_estimate = 0.0;
 
       last_feature_log_iter = iter;
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index cfc120e477..9aea2f1648 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -7,6 +7,7 @@
 
 #pragma once
 
+#include <cuopt/linear_programming/utilities/internals.hpp>
 #include <dual_simplex/logger.hpp>
 #include <dual_simplex/types.hpp>
 
@@ -113,7 +114,7 @@ struct simplex_solver_settings_t {
       reliability_branching(-1),
       inside_mip(0),
       sub_mip(0),
-      solution_callback(nullptr),
+      new_incumbent_callback(nullptr),
       heuristic_preemption_callback(nullptr),
       dual_simplex_objective_callback(nullptr),
       concurrent_halt(nullptr)
@@ -202,6 +203,8 @@ struct simplex_solver_settings_t {
   // 0, 1 - Estimate the objective change using a single pivot of dual simplex
   // >1 - Set as the iteration limit in dual simplex
   i_t strong_branching_simplex_iteration_limit;
+  f_t bb_work_unit_scale{1.0};
+  bool gpu_heur_wait_for_exploration{true};
 
   diving_heuristics_settings_t<i_t, f_t> diving_settings;  // Settings for the diving heuristics
 
@@ -214,7 +217,9 @@ struct simplex_solver_settings_t {
   i_t inside_mip;  // 0 if outside MIP, 1 if inside MIP at root node, 2 if inside MIP at leaf node
   i_t sub_mip;     // 0 if in regular MIP solve, 1 if in sub-MIP solve
 
-  std::function<void(std::vector<f_t>&, f_t)> solution_callback;
+  std::function<void(
+    std::vector<f_t>&, f_t, const cuopt::internals::mip_solution_callback_info_t&, double)>
+    new_incumbent_callback;
   std::function<void(const std::vector<f_t>&, f_t)> node_processed_callback;
   std::function<void()> heuristic_preemption_callback;
   std::function<void(std::vector<f_t>&, std::vector<f_t>&, f_t)> set_simplex_solution_callback;
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index c23b1d27ca..10026eb05e 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -113,6 +113,9 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_MIP_HYPER_HEURISTIC_INITIAL_INFEASIBILITY_WEIGHT, &mip_settings.heuristic_params.initial_infeasibility_weight, f_t(1e-9), std::numeric_limits<f_t>::infinity(), f_t(1000.0), "constraint violation penalty seed"},
     {CUOPT_MIP_HYPER_HEURISTIC_RELAXED_LP_TIME_LIMIT, &mip_settings.heuristic_params.relaxed_lp_time_limit, f_t(1e-9), std::numeric_limits<f_t>::infinity(), f_t(1.0), "base relaxed LP time cap in heuristics"},
     {CUOPT_MIP_HYPER_HEURISTIC_RELATED_VARS_TIME_LIMIT, &mip_settings.heuristic_params.related_vars_time_limit, f_t(1e-9), std::numeric_limits<f_t>::infinity(), f_t(30.0), "time for related-variable structure build"},
+    {CUOPT_MIP_HYPER_HEURISTIC_CPUFJ_WORK_UNIT_SCALE, &mip_settings.cpufj_work_unit_scale, f_t(0.0), std::numeric_limits<f_t>::infinity(), f_t(1.0), "user multiplier on CPUFJ work-unit rate"},
+    {CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WORK_UNIT_SCALE, &mip_settings.gpu_heur_work_unit_scale, f_t(0.0), std::numeric_limits<f_t>::infinity(), f_t(1.0), "user multiplier on GPU heuristics work-unit rate"},
+    {CUOPT_MIP_HYPER_HEURISTIC_BB_WORK_UNIT_SCALE, &mip_settings.bb_work_unit_scale, f_t(0.0), std::numeric_limits<f_t>::infinity(), f_t(1.0), "user multiplier on B&B work-unit rate"},
    };
 
   // Int parameters
@@ -142,7 +145,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT, &mip_settings.strong_branching_simplex_iteration_limit, -1,std::numeric_limits<i_t>::max(), -1},
     {CUOPT_PRESOLVE, reinterpret_cast<int*>(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT},
     {CUOPT_PRESOLVE, reinterpret_cast<int*>(&mip_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT},
-    {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC},
+    {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_DETERMINISM_NONE, CUOPT_DETERMINISM_FULL, CUOPT_DETERMINISM_NONE},
     {CUOPT_RANDOM_SEED, &mip_settings.seed, -1, std::numeric_limits<i_t>::max(), -1},
     {CUOPT_MIP_RELIABILITY_BRANCHING, &mip_settings.reliability_branching, -1, std::numeric_limits<i_t>::max(), -1},
     {CUOPT_PDLP_PRECISION, reinterpret_cast<int*>(&pdlp_settings.pdlp_precision), CUOPT_PDLP_DEFAULT_PRECISION, CUOPT_PDLP_MIXED_PRECISION, CUOPT_PDLP_DEFAULT_PRECISION},
@@ -171,6 +174,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_ELIMINATE_DENSE_COLUMNS, &pdlp_settings.eliminate_dense_columns, true},
     {CUOPT_CUDSS_DETERMINISTIC, &pdlp_settings.cudss_deterministic, false},
     {CUOPT_DUAL_POSTSOLVE, &pdlp_settings.dual_postsolve, true},
+    {CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WAIT_FOR_EXPLORATION, &mip_settings.gpu_heur_wait_for_exploration, false, "GPU heuristics wait for B&B root solve before starting"},
   };
   // String parameters
   string_parameters = {
diff --git a/cpp/src/mip_heuristics/diversity/diversity_config.hpp b/cpp/src/mip_heuristics/diversity/diversity_config.hpp
index dacf7773de..c27f857ba0 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_config.hpp
+++ b/cpp/src/mip_heuristics/diversity/diversity_config.hpp
@@ -26,6 +26,10 @@ struct diversity_config_t {
   double lp_run_time_if_feasible     = 2.;
   double lp_run_time_if_infeasible   = 1.;
   bool halve_population              = false;
+  bool fj_only_run                   = false;
+  bool dry_run                       = false;
+  bool initial_solution_only         = false;
+  int n_fp_iterations                = 1000000;
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
index b8dc3d33bf..b84043d773 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
@@ -5,7 +5,6 @@
  */
 /* clang-format on */
 
-#include "cuda_profiler_api.h"
 #include "diversity_manager.cuh"
 
 #include <mip_heuristics/mip_constants.hpp>
@@ -14,12 +13,21 @@
 #include <mip_heuristics/presolve/probing_cache.cuh>
 #include <mip_heuristics/presolve/trivial_presolve.cuh>
 #include <mip_heuristics/problem/problem_helpers.cuh>
+#include <mip_heuristics/relaxed_lp/relaxed_lp.cuh>
 
 #include <pdlp/solve.cuh>
 
+#include <utilities/determinism_log.hpp>
 #include <utilities/scope_guard.hpp>
 
-#include <memory>
+// Change `#if 0` to `#if 1` to activate detailed determinism logs.
+#if 0
+#undef CUOPT_DETERMINISM_LOG
+#define CUOPT_DETERMINISM_LOG(...) \
+  do {                             \
+    CUOPT_LOG_INFO(__VA_ARGS__);   \
+  } while (0)
+#endif
 
 constexpr bool fj_only_run = false;
 
@@ -55,7 +63,7 @@ diversity_manager_t<i_t, f_t>::diversity_manager_t(mip_solver_context_t<i_t, f_t
                              context.problem_ptr->handle_ptr->get_stream()),
     ls(context, lp_optimal_solution),
     rins(context, *this),
-    timer(diversity_config.default_time_limit),
+    timer(0.0, cuopt::termination_checker_t::root_tag_t{}),
     bound_prop_recombiner(context,
                           context.problem_ptr->n_variables,
                           ls.constraint_prop,
@@ -79,6 +87,30 @@ diversity_manager_t<i_t, f_t>::diversity_manager_t(mip_solver_context_t<i_t, f_t
     mab_ls(mab_ls_config_t<i_t, f_t>::n_of_arms, cuopt::seed_generator::get_seed(), ls_alpha, "ls"),
     ls_hash_map(*context.problem_ptr)
 {
+  fp_recombiner_config_t::max_n_of_vars_from_other =
+    fp_recombiner_config_t::initial_n_of_vars_from_other;
+  ls_recombiner_config_t::max_n_of_vars_from_other =
+    ls_recombiner_config_t::initial_n_of_vars_from_other;
+  bp_recombiner_config_t::max_n_of_vars_from_other =
+    bp_recombiner_config_t::initial_n_of_vars_from_other;
+  sub_mip_recombiner_config_t::max_n_of_vars_from_other =
+    sub_mip_recombiner_config_t::initial_n_of_vars_from_other;
+  mab_ls_config_t<i_t, f_t>::last_lm_config     = 0;
+  mab_ls_config_t<i_t, f_t>::last_ls_mab_option = 0;
+
+  CUOPT_DETERMINISM_LOG(
+    "Deterministic solve start diversity state: seed_state=%lld fp_max=%zu "
+    "ls_max=%zu bp_max=%zu sub_mip_max=%zu last_lm=%d last_ls=%d "
+    "enabled_recombiners=%zu",
+    (long long)cuopt::seed_generator::peek_seed(),
+    fp_recombiner_config_t::max_n_of_vars_from_other,
+    ls_recombiner_config_t::max_n_of_vars_from_other,
+    bp_recombiner_config_t::max_n_of_vars_from_other,
+    sub_mip_recombiner_config_t::max_n_of_vars_from_other,
+    (int)mab_ls_config_t<i_t, f_t>::last_lm_config,
+    (int)mab_ls_config_t<i_t, f_t>::last_ls_mab_option,
+    recombiner_t<i_t, f_t>::enabled_recombiners.size());
+
   int max_config             = -1;
   int env_config_id          = -1;
   const char* env_max_config = std::getenv("CUOPT_MAX_CONFIG");
@@ -106,6 +138,9 @@ diversity_manager_t<i_t, f_t>::diversity_manager_t(mip_solver_context_t<i_t, f_t
       "CUOPT_CONFIG_ID=%d is outside [0, %d). Ignoring cut override.", env_config_id, max_config);
     return;
   }
+
+  context.gpu_heur_loop.deterministic =
+    (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS);
 }
 
 template <typename i_t, typename f_t>
@@ -153,7 +188,7 @@ void diversity_manager_t<i_t, f_t>::consume_staged_simplex_solution(lp_state_t<i
 template <typename i_t, typename f_t>
 bool diversity_manager_t<i_t, f_t>::run_local_search(solution_t<i_t, f_t>& solution,
                                                      const weight_t<i_t, f_t>& weights,
-                                                     timer_t& timer,
+                                                     work_limit_timer_t& timer,
                                                      ls_config_t<i_t, f_t>& ls_config)
 {
   raft::common::nvtx::range fun_scope("run_local_search");
@@ -174,7 +209,7 @@ void diversity_manager_t<i_t, f_t>::generate_solution(f_t time_limit, bool rando
   sol.compute_feasibility();
   // if a feasible is found, it is added to the population
   ls.generate_solution(sol, random_start, &population, time_limit);
-  population.add_solution(std::move(sol));
+  population.add_solution(std::move(sol), internals::mip_solution_origin_t::LOCAL_SEARCH);
 }
 
 template <typename i_t, typename f_t>
@@ -187,7 +222,12 @@ void diversity_manager_t<i_t, f_t>::add_user_given_solutions(
     rmm::device_uvector<f_t> init_sol_assignment(*init_sol, sol.handle_ptr->get_stream());
     if (problem_ptr->pre_process_assignment(init_sol_assignment)) {
       relaxed_lp_settings_t lp_settings;
-      lp_settings.time_limit            = std::min(60., timer.remaining_time() / 2);
+      lp_settings.time_limit = std::min(60., timer.remaining_time() / 2);
+      if (timer.deterministic) {
+        lp_settings.work_limit   = lp_settings.time_limit;
+        lp_settings.work_context = timer.work_context;
+        cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context");
+      }
       lp_settings.tolerance             = problem_ptr->tolerances.absolute_tolerance;
       lp_settings.save_state            = false;
       lp_settings.return_first_feasible = true;
@@ -206,7 +246,9 @@ void diversity_manager_t<i_t, f_t>::add_user_given_solutions(
                      is_feasible,
                      sol.get_user_objective(),
                      sol.get_total_excess());
-      population.run_solution_callbacks(sol);
+      if (is_feasible) {
+        population.run_solution_callbacks(sol, internals::mip_solution_origin_t::USER_INITIAL);
+      }
       initial_sol_vector.emplace_back(std::move(sol));
     } else {
       CUOPT_LOG_ERROR(
@@ -220,11 +262,13 @@ void diversity_manager_t<i_t, f_t>::add_user_given_solutions(
 }
 
 template <typename i_t, typename f_t>
-bool diversity_manager_t<i_t, f_t>::run_presolve(f_t time_limit, timer_t global_timer)
+bool diversity_manager_t<i_t, f_t>::run_presolve(f_t time_limit,
+                                                 cuopt::termination_checker_t& global_timer)
 {
   raft::common::nvtx::range fun_scope("run_presolve");
   CUOPT_LOG_INFO("Running presolve!");
-  timer_t presolve_timer(time_limit);
+  CUOPT_LOG_INFO("Problem fingerprint before DM presolve: 0x%x", problem_ptr->get_fingerprint());
+  work_limit_timer_t presolve_timer(context.gpu_heur_loop, time_limit, *context.termination);
 
   auto term_crit = ls.constraint_prop.bounds_update.solve(*problem_ptr);
   if (ls.constraint_prop.bounds_update.infeas_constraints_count > 0) {
@@ -234,15 +278,17 @@ bool diversity_manager_t<i_t, f_t>::run_presolve(f_t time_limit, timer_t global_
   if (termination_criterion_t::NO_UPDATE != term_crit) {
     ls.constraint_prop.bounds_update.set_updated_bounds(*problem_ptr);
   }
+
   bool run_probing_cache = !fj_only_run;
-  // Don't run probing cache in deterministic mode yet as neither B&B nor CPUFJ need it
-  // and it doesn't make use of work units yet
-  if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { run_probing_cache = false; }
   if (run_probing_cache) {
     // Run probing cache before trivial presolve to discover variable implications
-    const f_t max_time_on_probing = diversity_config.max_time_on_probing;
-    f_t time_for_probing_cache    = std::min(max_time_on_probing, time_limit);
-    timer_t probing_timer{time_for_probing_cache};
+    const f_t max_time_on_probing =
+      (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)
+        ? std::numeric_limits<f_t>::infinity()
+        : diversity_config.max_time_on_probing;
+    f_t time_for_probing_cache = std::min(max_time_on_probing, time_limit);
+    work_limit_timer_t probing_timer(
+      context.gpu_heur_loop, time_for_probing_cache, *context.termination);
     // this function computes probing cache, finds singletons, substitutions and changes the problem
     bool problem_is_infeasible =
       compute_probing_cache(ls.constraint_prop.bounds_update, *problem_ptr, probing_timer);
@@ -252,8 +298,10 @@ bool diversity_manager_t<i_t, f_t>::run_presolve(f_t time_limit, timer_t global_
   problem_ptr->related_vars_time_limit = context.settings.heuristic_params.related_vars_time_limit;
   if (!global_timer.check_time_limit()) { trivial_presolve(*problem_ptr, remap_cache_ids); }
   if (!problem_ptr->empty && !check_bounds_sanity(*problem_ptr)) { return false; }
-  // if (!presolve_timer.check_time_limit() && !context.settings.heuristics_only &&
-  //     !problem_ptr->empty) {
+  const bool run_clique_table =
+    !presolve_timer.check_time_limit() && !context.settings.heuristics_only &&
+    !problem_ptr->empty && !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS);
+  // if (run_clique_table) {
   //   f_t time_limit_for_clique_table = std::min(3., presolve_timer.remaining_time() / 5);
   //   timer_t clique_timer(time_limit_for_clique_table);
   //   dual_simplex::user_problem_t<i_t, f_t> host_problem(problem_ptr->handle_ptr);
@@ -292,6 +340,10 @@ bool diversity_manager_t<i_t, f_t>::run_presolve(f_t time_limit, timer_t global_
   }
   stats.presolve_time = presolve_timer.elapsed_time();
   lp_optimal_solution.resize(problem_ptr->n_variables, problem_ptr->handle_ptr->get_stream());
+  thrust::fill(problem_ptr->handle_ptr->get_thrust_policy(),
+               lp_optimal_solution.begin(),
+               lp_optimal_solution.end(),
+               f_t(0));
   lp_dual_optimal_solution.resize(problem_ptr->n_constraints,
                                   problem_ptr->handle_ptr->get_stream());
   problem_ptr->handle_ptr->sync_stream();
@@ -299,7 +351,9 @@ bool diversity_manager_t<i_t, f_t>::run_presolve(f_t time_limit, timer_t global_
                  problem_ptr->n_constraints,
                  problem_ptr->n_variables,
                  problem_ptr->presolve_data.objective_offset);
-  CUOPT_LOG_INFO("cuOpt presolve time: %.2f", stats.presolve_time);
+  CUOPT_LOG_INFO("cuOpt presolve time: %.2f, fingerprint: 0x%x",
+                 stats.presolve_time,
+                 problem_ptr->get_fingerprint());
   return true;
 }
 
@@ -311,24 +365,25 @@ void diversity_manager_t<i_t, f_t>::generate_quick_feasible_solution()
   // min 1 second, max 10 seconds
   const f_t generate_fast_solution_time =
     std::min(diversity_config.max_fast_sol_time, std::max(1., timer.remaining_time() / 20.));
-  timer_t sol_timer(generate_fast_solution_time);
+  work_limit_timer_t sol_timer(
+    context.gpu_heur_loop, generate_fast_solution_time, *context.termination);
   // do very short LP run to get somewhere close to the optimal point
   ls.generate_fast_solution(solution, sol_timer);
   if (solution.get_feasible()) {
-    population.run_solution_callbacks(solution);
     initial_sol_vector.emplace_back(std::move(solution));
     problem_ptr->handle_ptr->sync_stream();
     solution_t<i_t, f_t> searched_sol(initial_sol_vector.back());
     ls_config_t<i_t, f_t> ls_config;
     run_local_search(searched_sol, population.weights, sol_timer, ls_config);
-    population.run_solution_callbacks(searched_sol);
     initial_sol_vector.emplace_back(std::move(searched_sol));
     auto& feas_sol = initial_sol_vector.back().get_feasible()
                        ? initial_sol_vector.back()
                        : initial_sol_vector[initial_sol_vector.size() - 2];
-    CUOPT_LOG_INFO("Generated fast solution in %f seconds with objective %f",
+    population.run_solution_callbacks(feas_sol, internals::mip_solution_origin_t::LOCAL_SEARCH);
+    CUOPT_LOG_INFO("Generated fast solution in %f seconds with objective %f, hash 0x%x",
                    timer.elapsed_time(),
-                   feas_sol.get_user_objective());
+                   feas_sol.get_user_objective(),
+                   feas_sol.get_hash());
   }
   problem_ptr->handle_ptr->sync_stream();
 }
@@ -366,8 +421,29 @@ void diversity_manager_t<i_t, f_t>::run_fp_alone()
 {
   CUOPT_LOG_DEBUG("Running FP alone!");
   solution_t<i_t, f_t> sol(population.best_feasible());
-  ls.run_fp(sol, timer, &population);
-  CUOPT_LOG_DEBUG("FP alone finished!");
+  CUOPT_DETERMINISM_LOG(
+    "Deterministic FP alone input: hash=0x%x feasible=%d obj=%.16e excess=%.16e",
+    sol.get_hash(),
+    (int)sol.get_feasible(),
+    sol.get_user_objective(),
+    sol.get_total_excess());
+  ls.run_fp(sol, timer, &population, diversity_config.n_fp_iterations);
+  CUOPT_DETERMINISM_LOG(
+    "Deterministic FP alone output: hash=0x%x feasible=%d obj=%.16e excess=%.16e",
+    sol.get_hash(),
+    (int)sol.get_feasible(),
+    sol.get_user_objective(),
+    sol.get_total_excess());
+  if (sol.get_feasible()) {
+    population.add_solution(std::move(sol), internals::mip_solution_origin_t::LOCAL_SEARCH);
+  }
+  auto& best_sol = population.best_feasible();
+  CUOPT_DETERMINISM_LOG(
+    "Deterministic FP alone population best after: hash=0x%x feasible=%d obj=%.16e excess=%.16e",
+    best_sol.get_hash(),
+    (int)best_sol.get_feasible(),
+    best_sol.get_user_objective(),
+    best_sol.get_total_excess());
 }
 
 template <typename i_t, typename f_t>
@@ -384,17 +460,38 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
   raft::common::nvtx::range fun_scope("run_solver");
 
   CUOPT_LOG_DEBUG("Determinism mode: %s",
-                  context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC ? "deterministic"
-                                                                                : "opportunistic");
+                  (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)
+                    ? "deterministic"
+                    : "opportunistic");
 
   // to automatically compute the solving time on scope exit
   auto timer_raii_guard =
     cuopt::scope_guard([&]() { stats.total_solve_time = timer.elapsed_time(); });
+  auto log_return_solution = [&](const char* reason, solution_t<i_t, f_t>& sol) {
+    CUOPT_DETERMINISM_LOG(
+      "Deterministic run_solver return: reason=%s hash=0x%x feasible=%d "
+      "obj=%.16e excess=%.16e",
+      reason,
+      sol.get_hash(),
+      (int)sol.get_feasible(),
+      sol.get_user_objective(),
+      sol.get_total_excess());
+  };
 
-  // Debug: Allow disabling GPU heuristics to test B&B tree determinism in isolation
+  const bool deterministic_bb_without_deterministic_heuristics =
+    (context.settings.determinism_mode & CUOPT_DETERMINISM_BB) &&
+    !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS);
   const char* disable_heuristics_env = std::getenv("CUOPT_DISABLE_GPU_HEURISTICS");
-  if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) {
-    CUOPT_LOG_INFO("Running deterministic mode with CPUFJ heuristic");
+  if (deterministic_bb_without_deterministic_heuristics ||
+      (disable_heuristics_env != nullptr && std::string(disable_heuristics_env) == "1")) {
+    CUOPT_LOG_INFO("GPU heuristics disabled (det_bb_only=%d env=%s)",
+                   (int)deterministic_bb_without_deterministic_heuristics,
+                   disable_heuristics_env ? disable_heuristics_env : "unset");
+    if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB) &&
+        context.branch_and_bound_ptr != nullptr) {
+      auto& producer_sync = context.branch_and_bound_ptr->get_producer_sync();
+      producer_sync.registration_complete();
+    }
     population.initialize_population();
     population.allocate_solutions();
 
@@ -412,21 +509,38 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
     ls.stop_cpufj_deterministic();
 
     population.add_external_solutions_to_population();
-    return population.best_feasible();
+    auto& best_sol = population.best_feasible();
+    log_return_solution("heuristics_disabled", best_sol);
+    return best_sol;
   }
-  if (disable_heuristics_env != nullptr && std::string(disable_heuristics_env) == "1") {
-    CUOPT_LOG_INFO("GPU heuristics disabled via CUOPT_DISABLE_GPU_HEURISTICS=1");
-    population.initialize_population();
-    population.allocate_solutions();
 
-    while (!check_b_b_preemption()) {
-      std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  bool gpu_heuristic_producer_registered = false;  // set true only after register_producer()
+  auto gpu_heuristic_producer_guard      = cuopt::scope_guard([&]() {
+    if (!gpu_heuristic_producer_registered || context.branch_and_bound_ptr == nullptr) { return; }
+    auto& producer_sync = context.branch_and_bound_ptr->get_producer_sync();
+    producer_sync.deregister_producer(context.gpu_heur_loop.producer_progress_ptr());
+    context.gpu_heur_loop.detach_producer_sync();  // undo the attach done at registration
+  });
+  if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB) &&
+      context.branch_and_bound_ptr != nullptr) {
+    if (context.settings.gpu_heur_wait_for_exploration) {
+      CUOPT_LOG_INFO("GPU heuristics waiting for B&B tree exploration to start...");
+      auto wait_start = std::chrono::high_resolution_clock::now();
+      context.branch_and_bound_ptr->wait_for_exploration_start();
+      double wait_elapsed =
+        std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - wait_start)
+          .count();
+      CUOPT_LOG_INFO("GPU heuristics resumed after %.2fs (B&B exploration started)", wait_elapsed);
     }
-    return population.best_feasible();
+    auto& producer_sync = context.branch_and_bound_ptr->get_producer_sync();
+    context.gpu_heur_loop.attach_producer_sync(&producer_sync);
+    producer_sync.register_producer(context.gpu_heur_loop.producer_progress_ptr());
+    producer_sync.registration_complete();
+    gpu_heuristic_producer_registered = true;
   }
 
   population.timer        = timer;
-  const f_t time_limit    = timer.remaining_time();
+  const f_t time_limit    = timer.deterministic ? timer.get_time_limit() : timer.remaining_time();
   const auto& hp          = context.settings.heuristic_params;
   const f_t lp_time_limit = std::min(hp.root_lp_max_time, time_limit * hp.root_lp_time_ratio);
   // after every change to the problem, we should resize all the relevant vars
@@ -438,7 +552,7 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
   // have the structure ready for reusing later
   problem_ptr->compute_integer_fixed_problem();
   recombiner_t<i_t, f_t>::init_enabled_recombiners(
-    *problem_ptr, context.settings.heuristic_params.enabled_recombiners);
+    context, *problem_ptr, context.settings.heuristic_params.enabled_recombiners);
   mab_recombiner.resize_mab_arm_stats(recombiner_t<i_t, f_t>::enabled_recombiners.size());
   // test problem is not ii
   cuopt_func_call(
@@ -448,13 +562,27 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
     "The problem must not be ii");
   population.initialize_population();
   population.allocate_solutions();
-  if (check_b_b_preemption()) { return population.best_feasible(); }
+  if (check_b_b_preemption()) {
+    auto& best_sol = population.best_feasible();
+    log_return_solution("preempted_after_population_init", best_sol);
+    return best_sol;
+  }
   add_user_given_solutions(initial_sol_vector);
+  CUOPT_DETERMINISM_LOG("DM bootstrap: initial_sol_vector size after user solutions = %zu",
+                        initial_sol_vector.size());
   // Run CPUFJ early to find quick initial solutions
   ls_cpufj_raii_guard_t ls_cpufj_raii_guard(ls);  // RAII to stop cpufj threads on solve stop
-  ls.start_cpufj_scratch_threads(population);
 
-  if (check_b_b_preemption()) { return population.best_feasible(); }
+  if (!diversity_config.dry_run &&
+      !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) {
+    ls.start_cpufj_scratch_threads(population);
+  }
+
+  if (check_b_b_preemption()) {
+    auto& best_sol = population.best_feasible();
+    log_return_solution("preempted_before_lp", best_sol);
+    return best_sol;
+  }
   lp_state_t<i_t, f_t>& lp_state = problem_ptr->lp_state;
   // resize because some constructor might be called before the presolve
   lp_state.resize(*problem_ptr, problem_ptr->handle_ptr->get_stream());
@@ -462,30 +590,59 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
   if (bb_thread_solution_exists) {
     consume_staged_simplex_solution(lp_state);
     ls.lp_optimal_exists = true;
-  } else if (!fj_only_run) {
+  } else if (!diversity_config.fj_only_run) {
     convert_greater_to_less(*problem_ptr);
 
     f_t absolute_tolerance = context.settings.tolerances.absolute_tolerance;
+    f_t tolerance_divisor =
+      problem_ptr->tolerances.absolute_tolerance / problem_ptr->tolerances.relative_tolerance;
+    if (tolerance_divisor == 0) { tolerance_divisor = 1; }
 
-    pdlp_solver_settings_t<i_t, f_t> pdlp_settings{};
-    pdlp_settings.tolerances.absolute_dual_tolerance = absolute_tolerance;
-    pdlp_settings.tolerances.relative_dual_tolerance =
-      context.settings.tolerances.relative_tolerance;
-    pdlp_settings.tolerances.absolute_primal_tolerance = absolute_tolerance;
-    pdlp_settings.tolerances.relative_primal_tolerance =
-      context.settings.tolerances.relative_tolerance;
-    pdlp_settings.time_limit              = lp_time_limit;
-    pdlp_settings.first_primal_feasible   = false;
-    pdlp_settings.concurrent_halt         = &global_concurrent_halt;
-    pdlp_settings.method                  = method_t::Concurrent;
-    pdlp_settings.inside_mip              = true;
-    pdlp_settings.pdlp_solver_mode        = pdlp_solver_mode_t::Stable2;
-    pdlp_settings.num_gpus                = context.settings.num_gpus;
-    pdlp_settings.presolver               = presolver_t::None;
-    pdlp_settings.per_constraint_residual = true;
-    set_pdlp_solver_mode(pdlp_settings);
-    timer_t lp_timer(lp_time_limit);
-    auto lp_result = solve_lp_with_method<i_t, f_t>(*problem_ptr, pdlp_settings, lp_timer);
+    auto lp_result = [&]() {
+      // Deterministic timer: skip the concurrent root solve and reuse the work-accounted
+      // relaxed-LP path, which runs against the gpu_heur_loop work context.
+      if (timer.deterministic) {
+        relaxed_lp_settings_t lp_settings{};
+        lp_settings.time_limit              = lp_time_limit;
+        lp_settings.work_limit              = lp_time_limit;  // same value as time_limit
+        lp_settings.tolerance               = absolute_tolerance;
+        lp_settings.check_infeasibility     = true;
+        lp_settings.return_first_feasible   = false;
+        lp_settings.save_state              = true;
+        lp_settings.per_constraint_residual = true;
+        lp_settings.has_initial_primal      = false;
+        lp_settings.concurrent_halt         = &global_concurrent_halt;
+        lp_settings.work_context            = &context.gpu_heur_loop;
+        cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context");
+        CUOPT_DETERMINISM_LOG(
+          "DM root LP config: dry_run=%d deterministic=%d work_limit=%.6f time_limit=%.6f",
+          (int)diversity_config.dry_run,
+          (int)timer.deterministic,
+          lp_settings.work_limit,
+          lp_settings.time_limit);
+        return get_relaxed_lp_solution<i_t, f_t>(
+          *problem_ptr, lp_optimal_solution, lp_state, lp_settings);
+      }
+      pdlp_solver_settings_t<i_t, f_t> pdlp_settings{};  // not deterministic: concurrent solve
+      pdlp_settings.tolerances.relative_primal_tolerance = absolute_tolerance / tolerance_divisor;
+      pdlp_settings.tolerances.relative_dual_tolerance   = absolute_tolerance / tolerance_divisor;
+      pdlp_settings.time_limit                           = lp_time_limit;
+      pdlp_settings.first_primal_feasible                = false;
+      pdlp_settings.concurrent_halt                      = &global_concurrent_halt;
+      pdlp_settings.method                               = method_t::Concurrent;
+      pdlp_settings.inside_mip                           = true;
+      pdlp_settings.pdlp_solver_mode                     = pdlp_solver_mode_t::Stable2;
+      pdlp_settings.num_gpus                             = context.settings.num_gpus;
+      pdlp_settings.presolver                            = presolver_t::None;
+      timer_t lp_timer(lp_time_limit);
+      return solve_lp_with_method<i_t, f_t>(*problem_ptr, pdlp_settings, lp_timer);
+    }();
+    CUOPT_DETERMINISM_LOG(
+      "DM root LP result: status=%d iters=%d user_obj=%.12f primal_hash=0x%x",
+      (int)lp_result.get_termination_status(),
+      lp_result.get_additional_termination_information().number_of_steps_taken,
+      lp_result.get_objective_value(),
+      detail::compute_hash(lp_result.get_primal_solution(), problem_ptr->handle_ptr->get_stream()));
 
     bool use_staged_simplex_solution = false;
     {
@@ -527,9 +684,11 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
       } else if (lp_result.get_termination_status() == pdlp_termination_status_t::DualInfeasible) {
         CUOPT_LOG_ERROR("PDLP detected dual infeasibility, continuing anyway!");
         ls.lp_optimal_exists = false;
-      } else if (lp_result.get_termination_status() == pdlp_termination_status_t::TimeLimit) {
+      } else if (lp_result.get_termination_status() == pdlp_termination_status_t::TimeLimit ||
+                 lp_result.get_termination_status() == pdlp_termination_status_t::IterationLimit) {
         CUOPT_LOG_DEBUG(
-          "Initial LP run exceeded time limit, continuing solver with partial LP result!");
+          "Initial LP run exceeded time/iteration limit, continuing solver with partial LP "
+          "result!");
         // note to developer, in debug mode the LP run might be too slow and it might cause PDLP
         // not to bring variables within the bounds
       }
@@ -573,50 +732,106 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
     if (!use_staged_simplex_solution) {
       // in case the pdlp returned var boudns that are out of bounds
       clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr);
+      CUOPT_DETERMINISM_LOG(
+        "DM root LP post-clamp: lp_optimal_solution hash=0x%x",
+        detail::compute_hash(lp_optimal_solution, problem_ptr->handle_ptr->get_stream()));
     }
   }
 
   if (ls.lp_optimal_exists) {
     solution_t<i_t, f_t> lp_rounded_sol(*problem_ptr);
     lp_rounded_sol.copy_new_assignment(lp_optimal_solution);
+    CUOPT_DETERMINISM_LOG("DM bootstrap candidate (LP raw): hash=0x%x feas=%d obj=%.12f",
+                          lp_rounded_sol.get_hash(),
+                          (int)lp_rounded_sol.get_feasible(),
+                          lp_rounded_sol.get_user_objective());
     lp_rounded_sol.round_nearest();
     lp_rounded_sol.compute_feasibility();
-    population.add_solution(std::move(lp_rounded_sol));
-    ls.start_cpufj_lptopt_scratch_threads(population);
+    CUOPT_DETERMINISM_LOG("DM bootstrap candidate (LP rounded): hash=0x%x feas=%d obj=%.12f",
+                          lp_rounded_sol.get_hash(),
+                          (int)lp_rounded_sol.get_feasible(),
+                          lp_rounded_sol.get_user_objective());
+    population.add_solution(std::move(lp_rounded_sol),
+                            internals::mip_solution_origin_t::LP_ROUNDING);
+    if (!diversity_config.dry_run &&
+        !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) {
+      ls.start_cpufj_lptopt_scratch_threads(population);
+    }
   }
 
-  population.add_solutions_from_vec(std::move(initial_sol_vector));
+  for (size_t i = 0; i < initial_sol_vector.size(); ++i) {
+    CUOPT_DETERMINISM_LOG(
+      "DM bootstrap candidate (initial_sol_vector[%zu]): hash=0x%x feas=%d obj=%.12f",
+      i,
+      initial_sol_vector[i].get_hash(),
+      (int)initial_sol_vector[i].get_feasible(),
+      initial_sol_vector[i].get_user_objective());
+  }
+  population.add_solutions_from_vec(std::move(initial_sol_vector),
+                                    internals::mip_solution_origin_t::USER_INITIAL);
 
-  if (check_b_b_preemption()) { return population.best_feasible(); }
+  if (check_b_b_preemption()) {
+    auto& best_sol = population.best_feasible();
+    log_return_solution("preempted_after_initial_population", best_sol);
+    return best_sol;
+  }
 
   if (context.settings.benchmark_info_ptr != nullptr) {
     context.settings.benchmark_info_ptr->objective_of_initial_population =
       population.best_feasible().get_user_objective();
   }
 
-  if (fj_only_run) {
+  if (diversity_config.dry_run) {
+    auto& best_sol = population.best_feasible();
+    log_return_solution("dry_run", best_sol);
+    return best_sol;
+  }
+  if (diversity_config.fj_only_run) {
     solution_t<i_t, f_t> sol(*problem_ptr);
     run_fj_alone(sol);
+    log_return_solution("fj_only_run", sol);
     return sol;
   }
-  rins.enable();
+  // RINS not supported in deterministic mode yet
+  if (!(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { rins.enable(); }
 
   generate_solution(timer.remaining_time(), false);
+  if (diversity_config.initial_solution_only) {
+    auto& best_sol = population.best_feasible();
+    log_return_solution("initial_solution_only", best_sol);
+    return best_sol;
+  }
   if (timer.check_time_limit()) {
     rins.stop_rins();
     population.add_external_solutions_to_population();
-    return population.best_feasible();
+    auto& best_sol = population.best_feasible();
+    log_return_solution("work_limit_reached", best_sol);
+    return best_sol;
   }
   if (check_b_b_preemption()) {
     rins.stop_rins();
     population.add_external_solutions_to_population();
-    return population.best_feasible();
+    auto& best_sol = population.best_feasible();
+    log_return_solution("preempted_before_fp", best_sol);
+    return best_sol;
   }
 
+  CUOPT_LOG_DEBUG("pre-run_fp_alone: gpu_work=%g gpu_prod=%g",
+                  context.gpu_heur_loop.current_work(),
+                  context.gpu_heur_loop.current_producer_work());
   run_fp_alone();
+  CUOPT_LOG_DEBUG("post-run_fp_alone: gpu_work=%g gpu_prod=%g",
+                  context.gpu_heur_loop.current_work(),
+                  context.gpu_heur_loop.current_producer_work());
   rins.stop_rins();
   population.add_external_solutions_to_population();
-  return population.best_feasible();
+  auto& best_sol = population.best_feasible();
+  CUOPT_LOG_DEBUG("post-fp handoff: feas=%d obj=%g hash=0x%x",
+                  (int)best_sol.get_feasible(),
+                  best_sol.get_user_objective(),
+                  best_sol.get_hash());
+  log_return_solution("post_fp_alone", best_sol);
+  return best_sol;
 };
 
 template <typename i_t, typename f_t>
@@ -641,8 +856,10 @@ void diversity_manager_t<i_t, f_t>::diversity_step(i_t max_iterations_without_im
       auto [sol1, sol2]         = population.get_two_random(tournament);
       cuopt_assert(population.test_invariant(), "");
       auto [lp_offspring, offspring]        = recombine_and_local_search(sol1, sol2);
-      auto [inserted_pos_1, best_updated_1] = population.add_solution(std::move(lp_offspring));
-      auto [inserted_pos_2, best_updated_2] = population.add_solution(std::move(offspring));
+      auto [inserted_pos_1, best_updated_1] = population.add_solution(
+        std::move(lp_offspring), internals::mip_solution_origin_t::RECOMBINATION);
+      auto [inserted_pos_2, best_updated_2] = population.add_solution(
+        std::move(offspring), internals::mip_solution_origin_t::RECOMBINATION);
       if (best_updated_1 || best_updated_2) { recombine_stats.add_best_updated(); }
       cuopt_assert(population.test_invariant(), "");
       if ((inserted_pos_1 != -1 && inserted_pos_1 <= 2) ||
@@ -684,10 +901,12 @@ void diversity_manager_t<i_t, f_t>::recombine_and_ls_with_all(solution_t<i_t, f_
         auto [offspring, lp_offspring] =
           recombine_and_local_search(curr_sol, solution, recombiner_type);
         if (!add_only_feasible || lp_offspring.get_feasible()) {
-          population.add_solution(std::move(lp_offspring));
+          population.add_solution(std::move(lp_offspring),
+                                  internals::mip_solution_origin_t::RECOMBINATION);
         }
         if (!add_only_feasible || offspring.get_feasible()) {
-          population.add_solution(std::move(offspring));
+          population.add_solution(std::move(offspring),
+                                  internals::mip_solution_origin_t::RECOMBINATION);
         }
         if (timer.check_time_limit()) { return; }
       }
@@ -697,17 +916,20 @@ void diversity_manager_t<i_t, f_t>::recombine_and_ls_with_all(solution_t<i_t, f_
 
 template <typename i_t, typename f_t>
 void diversity_manager_t<i_t, f_t>::recombine_and_ls_with_all(
-  std::vector<solution_t<i_t, f_t>>& solutions, bool add_only_feasible)
+  std::vector<typename population_t<i_t, f_t>::drained_external_solution_t>& solutions,
+  bool add_only_feasible)
 {
   raft::common::nvtx::range fun_scope("recombine_and_ls_with_all");
   if (solutions.size() > 0) {
     CUOPT_LOG_DEBUG("Running recombiners on B&B solutions with size %lu", solutions.size());
     // add all solutions because time limit might have been consumed and we might have exited before
-    for (auto& sol : solutions) {
+    for (auto& drained_sol : solutions) {
+      auto& sol = drained_sol.solution;
       cuopt_func_call(sol.test_feasibility(true));
-      population.add_solution(std::move(solution_t<i_t, f_t>(sol)));
+      population.add_solution(std::move(solution_t<i_t, f_t>(sol)), drained_sol.origin);
     }
-    for (auto& sol : solutions) {
+    for (auto& drained_sol : solutions) {
+      auto& sol = drained_sol.solution;
       if (timer.check_time_limit()) { return; }
       solution_t<i_t, f_t> ls_solution(sol);
       ls_config_t<i_t, f_t> ls_config;
@@ -759,6 +981,7 @@ diversity_manager_t<i_t, f_t>::recombine_and_local_search(solution_t<i_t, f_t>&
                   sol1.get_feasible(),
                   sol2.get_quality(population.weights),
                   sol2.get_feasible());
+  bool deterministic = (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS);
   double best_objective_of_parents  = std::min(sol1.get_objective(), sol2.get_objective());
   bool at_least_one_parent_feasible = sol1.get_feasible() || sol2.get_feasible();
   // randomly choose among 3 recombiners
@@ -769,7 +992,7 @@ diversity_manager_t<i_t, f_t>::recombine_and_local_search(solution_t<i_t, f_t>&
                                   std::numeric_limits<double>::lowest(),
                                   std::numeric_limits<double>::lowest(),
                                   std::numeric_limits<double>::max(),
-                                  recombiner_work_normalized_reward_t(0.0));
+                                  recombiner_work_normalized_reward_t(deterministic, 0.0));
     return std::make_pair(solution_t<i_t, f_t>(sol1), solution_t<i_t, f_t>(sol2));
   }
   cuopt_assert(population.test_invariant(), "");
@@ -789,7 +1012,7 @@ diversity_manager_t<i_t, f_t>::recombine_and_local_search(solution_t<i_t, f_t>&
                                   std::numeric_limits<double>::lowest(),
                                   std::numeric_limits<double>::lowest(),
                                   std::numeric_limits<double>::max(),
-                                  recombiner_work_normalized_reward_t(0.0));
+                                  recombiner_work_normalized_reward_t(deterministic, 0.0));
     return std::make_pair(solution_t<i_t, f_t>(sol1), solution_t<i_t, f_t>(sol2));
   }
   cuopt_assert(offspring.test_number_all_integer(), "All must be integers after LS");
@@ -807,7 +1030,12 @@ diversity_manager_t<i_t, f_t>::recombine_and_local_search(solution_t<i_t, f_t>&
                                              : diversity_config.lp_run_time_if_infeasible;
   lp_run_time     = std::min(lp_run_time, timer.remaining_time());
   relaxed_lp_settings_t lp_settings;
-  lp_settings.time_limit              = lp_run_time;
+  lp_settings.time_limit = lp_run_time;
+  if (timer.deterministic) {
+    lp_settings.work_limit   = lp_settings.time_limit;
+    lp_settings.work_context = timer.work_context;
+    cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context");
+  }
   lp_settings.tolerance               = context.settings.tolerances.absolute_tolerance;
   lp_settings.return_first_feasible   = false;
   lp_settings.save_state              = true;
@@ -828,12 +1056,15 @@ diversity_manager_t<i_t, f_t>::recombine_and_local_search(solution_t<i_t, f_t>&
     offspring_qual, sol1.get_quality(population.weights), sol2.get_quality(population.weights));
   f_t best_quality_of_parents =
     std::min(sol1.get_quality(population.weights), sol2.get_quality(population.weights));
-  mab_recombiner.add_mab_reward(
-    mab_recombiner.last_chosen_option,
-    best_quality_of_parents,
-    population.best().get_quality(population.weights),
-    offspring_qual,
-    recombiner_work_normalized_reward_t(recombine_stats.get_last_recombiner_time()));
+  mab_recombiner.add_mab_reward(mab_recombiner.last_chosen_option,
+                                best_quality_of_parents,
+                                population.best().get_quality(population.weights),
+                                offspring_qual,
+                                !deterministic
+                                  ? recombiner_work_normalized_reward_t(
+                                      deterministic, recombine_stats.get_last_recombiner_time())
+                                  : recombiner_work_normalized_reward_t(
+                                      deterministic, recombine_stats.get_last_recombiner_work()));
   mab_ls.add_mab_reward(mab_ls_config_t<i_t, f_t>::last_ls_mab_option,
                         best_quality_of_parents,
                         population.best_feasible().get_quality(population.weights),
@@ -878,31 +1109,50 @@ std::pair<solution_t<i_t, f_t>, bool> diversity_manager_t<i_t, f_t>::recombine(
       }
     }
   }
+  CUOPT_DETERMINISM_LOG(
+    "Deterministic recombiner selection: requested=%s selected_index=%d chosen=%s "
+    "enabled_size=%zu last_choice_before=%d current_seed=%u",
+    recombiner_t<i_t, f_t>::recombiner_name(recombiner_type),
+    (int)selected_index,
+    recombiner_t<i_t, f_t>::recombiner_name(recombiner),
+    recombiner_t<i_t, f_t>::enabled_recombiners.size(),
+    mab_recombiner.last_chosen_option,
+    (unsigned int)cuopt::seed_generator::get_seed());
   mab_recombiner.set_last_chosen_option(selected_index);
   recombine_stats.add_attempt((recombiner_enum_t)recombiner);
   recombine_stats.start_recombiner_time();
+  CUOPT_DETERMINISM_LOG("Recombining sol %x and %x with recombiner %d, weights %x",
+                        a.get_hash(),
+                        b.get_hash(),
+                        recombiner,
+                        population.weights.get_hash());
+
   // Refactored code using a switch statement
   switch (recombiner) {
     case recombiner_enum_t::BOUND_PROP: {
-      auto [sol, success] = bound_prop_recombiner.recombine(a, b, population.weights);
+      auto [sol, success, work] = bound_prop_recombiner.recombine(a, b, population.weights);
+      recombine_stats.set_recombiner_work(work);
       recombine_stats.stop_recombiner_time();
       if (success) { recombine_stats.add_success(); }
       return std::make_pair(sol, success);
     }
     case recombiner_enum_t::FP: {
-      auto [sol, success] = fp_recombiner.recombine(a, b, population.weights);
+      auto [sol, success, work] = fp_recombiner.recombine(a, b, population.weights);
+      recombine_stats.set_recombiner_work(work);
       recombine_stats.stop_recombiner_time();
       if (success) { recombine_stats.add_success(); }
       return std::make_pair(sol, success);
     }
     case recombiner_enum_t::LINE_SEGMENT: {
-      auto [sol, success] = line_segment_recombiner.recombine(a, b, population.weights);
+      auto [sol, success, work] = line_segment_recombiner.recombine(a, b, population.weights);
+      recombine_stats.set_recombiner_work(work);
       recombine_stats.stop_recombiner_time();
       if (success) { recombine_stats.add_success(); }
       return std::make_pair(sol, success);
     }
     case recombiner_enum_t::SUB_MIP: {
-      auto [sol, success] = sub_mip_recombiner.recombine(a, b, population.weights);
+      auto [sol, success, work] = sub_mip_recombiner.recombine(a, b, population.weights);
+      recombine_stats.set_recombiner_work(work);
       recombine_stats.stop_recombiner_time();
       if (success) { recombine_stats.add_success(); }
       return std::make_pair(sol, success);
diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
index 863933de48..e1c50562d7 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh
@@ -26,6 +26,7 @@
 #include <mip_heuristics/solution/solution.cuh>
 #include <mip_heuristics/solver.cuh>
 #include <utilities/timer.hpp>
+#include <utilities/work_limit_timer.hpp>
 
 #include <cstdint>
 
@@ -35,7 +36,7 @@ template <typename i_t, typename f_t>
 class diversity_manager_t {
  public:
   diversity_manager_t(mip_solver_context_t<i_t, f_t>& context);
-  bool run_presolve(f_t time_limit, timer_t global_timer);
+  bool run_presolve(f_t time_limit, cuopt::termination_checker_t& global_timer);
   solution_t<i_t, f_t> run_solver();
   void generate_solution(f_t time_limit, bool random_start = true);
   void run_fj_alone(solution_t<i_t, f_t>& solution);
@@ -50,8 +51,9 @@ class diversity_manager_t {
   void diversity_step(i_t max_iterations_without_improvement);
   void add_user_given_solutions(std::vector<solution_t<i_t, f_t>>& initial_sol_vector);
   population_t<i_t, f_t>* get_population_pointer() { return &population; }
-  void recombine_and_ls_with_all(std::vector<solution_t<i_t, f_t>>& solutions,
-                                 bool add_only_feasible = false);
+  void recombine_and_ls_with_all(
+    std::vector<typename population_t<i_t, f_t>::drained_external_solution_t>& solutions,
+    bool add_only_feasible = false);
   void recombine_and_ls_with_all(solution_t<i_t, f_t>& solution, bool add_only_feasible = false);
   std::pair<solution_t<i_t, f_t>, solution_t<i_t, f_t>> recombine_and_local_search(
     solution_t<i_t, f_t>& a,
@@ -65,7 +67,7 @@ class diversity_manager_t {
                               solution_t<i_t, f_t>& sol2);
   bool run_local_search(solution_t<i_t, f_t>& solution,
                         const weight_t<i_t, f_t>& weights,
-                        timer_t& timer,
+                        work_limit_timer_t& timer,
                         ls_config_t<i_t, f_t>& ls_config);
 
   void consume_staged_simplex_solution(lp_state_t<i_t, f_t>& lp_state);
@@ -84,7 +86,7 @@ class diversity_manager_t {
   std::vector<f_t> staged_simplex_dual_solution;
   f_t staged_simplex_objective{std::numeric_limits<f_t>::infinity()};
   local_search_t<i_t, f_t> ls;
-  cuopt::timer_t timer;
+  cuopt::work_limit_timer_t timer;
   bound_prop_recombiner_t<i_t, f_t> bound_prop_recombiner;
   fp_recombiner_t<i_t, f_t> fp_recombiner;
   line_segment_recombiner_t<i_t, f_t> line_segment_recombiner;
diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index c4331343de..2247dfcef4 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -271,10 +271,11 @@ void rins_t<i_t, f_t>::run_rins()
   branch_and_bound_settings.strong_branching_simplex_iteration_limit = 200;
   branch_and_bound_settings.log.log                                  = false;
   branch_and_bound_settings.log.log_prefix                           = "[RINS] ";
-  branch_and_bound_settings.solution_callback = [&rins_solution_queue](std::vector<f_t>& solution,
-                                                                       f_t objective) {
-    rins_solution_queue.push_back(solution);
-  };
+  branch_and_bound_settings.new_incumbent_callback =
+    [&rins_solution_queue](std::vector<f_t>& solution,
+                           f_t objective,
+                           const cuopt::internals::mip_solution_callback_info_t&,
+                           double) { rins_solution_queue.push_back(solution); };
   dual_simplex::probing_implied_bound_t<i_t, f_t> empty_probing(branch_and_bound_problem.num_cols);
   dual_simplex::branch_and_bound_t<i_t, f_t> branch_and_bound(
     branch_and_bound_problem, branch_and_bound_settings, dual_simplex::tic(), empty_probing);
@@ -347,8 +348,9 @@ void rins_t<i_t, f_t>::run_rins()
       cuopt_assert(best_sol.assignment.size() == sol_size_before_rins, "Assignment size mismatch");
       cuopt_assert(best_sol.assignment.size() == problem_copy->n_variables,
                    "Assignment size mismatch");
-      dm.population.add_external_solution(
-        best_sol.get_host_assignment(), best_sol.get_objective(), solution_origin_t::RINS);
+      dm.population.add_external_solution(best_sol.get_host_assignment(),
+                                          best_sol.get_objective(),
+                                          internals::mip_solution_origin_t::RINS);
     }
   }
 
diff --git a/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh b/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh
index 4571d0d57f..b9219b8dcb 100644
--- a/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh
+++ b/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh
@@ -45,16 +45,22 @@ struct ls_work_normalized_reward_t {
 };
 
 struct recombiner_work_normalized_reward_t {
-  double time_in_miliseconds;
-  recombiner_work_normalized_reward_t(double time_in_miliseconds)
-    : time_in_miliseconds(time_in_miliseconds)
+  bool deterministic;  // false: `work` is elapsed milliseconds; true: `work` is a work amount
+  double work;         // recombiner cost, interpreted by operator() according to `deterministic`
+  recombiner_work_normalized_reward_t(bool deterministic, double work)
+    : deterministic(deterministic), work(work)
   {
   }
 
   double operator()(double factor) const
   {
     // normal recombiners take 2000 ms
-    return factor * (std::max(0.1, 4.0 - (time_in_miliseconds / 2000)));
+    if (!deterministic) {
+      double time_in_miliseconds = work;
+      return factor * (std::max(0.1, 4.0 - (time_in_miliseconds / 2000)));
+    } else {  // deterministic: `work` is normalized by 200 rather than 2000
+      return factor * (std::max(0.1, 4.0 - (work / 200)));
+    }
   }
 };
 
diff --git a/cpp/src/mip_heuristics/diversity/population.cu b/cpp/src/mip_heuristics/diversity/population.cu
index bb0fdd6d11..cbdcf4fdab 100644
--- a/cpp/src/mip_heuristics/diversity/population.cu
+++ b/cpp/src/mip_heuristics/diversity/population.cu
@@ -8,15 +8,27 @@
 #include "diversity_manager.cuh"
 #include "population.cuh"
 
+#include <branch_and_bound/branch_and_bound.hpp>
+
 #include <thrust/for_each.h>
 #include <mip_heuristics/mip_constants.hpp>
 #include <mip_heuristics/utils.cuh>
 #include <pdlp/utils.cuh>
 #include <utilities/copy_helpers.hpp>
+#include <utilities/determinism_log.hpp>
 #include <utilities/seed_generator.cuh>
 
 #include <mutex>
 
+// Change `#if 0` to `#if 1` to forward CUOPT_DETERMINISM_LOG to CUOPT_LOG_INFO in this file.
+#if 0
+#undef CUOPT_DETERMINISM_LOG
+#define CUOPT_DETERMINISM_LOG(...) \
+  do {                             \
+    CUOPT_LOG_INFO(__VA_ARGS__);   \
+  } while (0)
+#endif
+
 namespace cuopt::linear_programming::detail {
 
 constexpr double weight_increase_ratio       = 2.;
@@ -44,7 +56,7 @@ population_t<i_t, f_t>::population_t(std::string const& name_,
     rng(cuopt::seed_generator::get_seed()),
     early_exit_primal_generation(false),
     population_hash_map(*problem_ptr),
-    timer(0)
+    timer(0.0, cuopt::termination_checker_t::root_tag_t{})
 {
   best_feasible_objective = std::numeric_limits<f_t>::max();
 }
@@ -125,11 +137,12 @@ std::pair<solution_t<i_t, f_t>, solution_t<i_t, f_t>> population_t<i_t, f_t>::ge
 }
 
 template <typename i_t, typename f_t>
-void population_t<i_t, f_t>::add_solutions_from_vec(std::vector<solution_t<i_t, f_t>>&& solutions)
+void population_t<i_t, f_t>::add_solutions_from_vec(
+  std::vector<solution_t<i_t, f_t>>&& solutions, internals::mip_solution_origin_t callback_origin)
 {
   raft::common::nvtx::range fun_scope("add_solution_from_vec");
   for (auto&& sol : solutions) {
-    add_solution(std::move(sol));
+    add_solution(std::move(sol), callback_origin);  // one origin tag applied to the whole batch
   }
 }
 
@@ -143,11 +156,11 @@ size_t population_t<i_t, f_t>::get_external_solution_size()
 template <typename i_t, typename f_t>
 void population_t<i_t, f_t>::add_external_solution(const std::vector<f_t>& solution,
                                                    f_t objective,
-                                                   solution_origin_t origin)
+                                                   internals::mip_solution_origin_t origin)
 {
   std::lock_guard<std::mutex> lock(solution_mutex);
 
-  if (origin == solution_origin_t::CPUFJ) {
+  if (origin == internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP) {
     external_solution_queue_cpufj.emplace_back(solution, objective, origin);
   } else {
     external_solution_queue.emplace_back(solution, objective, origin);
@@ -165,7 +178,7 @@ void population_t<i_t, f_t>::add_external_solution(const std::vector<f_t>& solut
   }
 
   CUOPT_LOG_DEBUG("%s added a solution to population, solution queue size %lu with objective %g",
-                  solution_origin_to_string(origin),
+                  internals::mip_solution_origin_to_string(origin),
                   external_solution_queue.size(),
                   problem_ptr->get_user_obj_from_solver_obj(objective));
   if (objective < best_feasible_objective) {
@@ -179,9 +192,13 @@ void population_t<i_t, f_t>::add_external_solution(const std::vector<f_t>& solut
 template <typename i_t, typename f_t>
 void population_t<i_t, f_t>::add_external_solutions_to_population()
 {
+  // Deterministic GPU heuristics are producer-only: skip draining queued external solutions
+  if ((context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { return; }
   // don't do early exit checks here. mutex needs to be acquired to prevent race conditions
   auto new_sol_vector = get_external_solutions();
-  add_solutions_from_vec(std::move(new_sol_vector));
+  for (auto& drained_sol : new_sol_vector) {  // each entry is added with its own origin tag
+    add_solution(std::move(drained_sol.solution), drained_sol.origin);
+  }
 }
 
 // normally we would need a lock here but these are boolean types and race conditions are not
@@ -194,10 +211,11 @@ void population_t<i_t, f_t>::preempt_heuristic_solver()
 }
 
 template <typename i_t, typename f_t>
-std::vector<solution_t<i_t, f_t>> population_t<i_t, f_t>::get_external_solutions()
+std::vector<typename population_t<i_t, f_t>::drained_external_solution_t>
+population_t<i_t, f_t>::get_external_solutions()
 {
   std::lock_guard<std::mutex> lock(solution_mutex);
-  std::vector<solution_t<i_t, f_t>> return_vector;
+  std::vector<drained_external_solution_t> return_vector;
   i_t counter                     = 0;
   f_t new_best_feasible_objective = best_feasible_objective;
   f_t longest_wait_time           = 0;
@@ -205,10 +223,10 @@ std::vector<solution_t<i_t, f_t>> population_t<i_t, f_t>::get_external_solutions
     for (auto& h_entry : queue) {
       // ignore CPUFJ solutions if they're not better than the best feasible.
       // It seems they worsen results on some instances despite the potential for improved diversity
-      if (h_entry.origin == solution_origin_t::CPUFJ &&
+      if (h_entry.origin == internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP &&
           h_entry.objective > new_best_feasible_objective) {
         continue;
-      } else if (h_entry.origin != solution_origin_t::CPUFJ &&
+      } else if (h_entry.origin != internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP &&
                  h_entry.objective > new_best_feasible_objective) {
         new_best_feasible_objective = h_entry.objective;
       }
@@ -233,7 +251,7 @@ std::vector<solution_t<i_t, f_t>> population_t<i_t, f_t>::get_external_solutions
           problem_ptr->n_integer_vars);
       }
       sol.handle_ptr->sync_stream();
-      return_vector.emplace_back(std::move(sol));
+      return_vector.emplace_back(std::move(sol), h_entry.origin);
       counter++;
     }
   }
@@ -258,114 +276,53 @@ bool population_t<i_t, f_t>::is_better_than_best_feasible(solution_t<i_t, f_t>&
 }
 
 template <typename i_t, typename f_t>
-void population_t<i_t, f_t>::invoke_get_solution_callback(
-  solution_t<i_t, f_t>& sol, internals::get_solution_callback_t* callback)
+void population_t<i_t, f_t>::run_solution_callbacks(
+  solution_t<i_t, f_t>& sol, internals::mip_solution_origin_t callback_origin)
 {
-  f_t user_objective = sol.get_user_objective();
-  f_t user_bound     = context.stats.get_solution_bound();
-  solution_t<i_t, f_t> temp_sol(sol);
-  problem_ptr->post_process_assignment(temp_sol.assignment);
-  if (problem_ptr->has_papilo_presolve_data()) {
-    problem_ptr->papilo_uncrush_assignment(temp_sol.assignment);
-  }
+  if (is_better_than_best_feasible(sol)) {  // publish only when sol beats the best feasible
+    const bool deterministic_bb = (context.settings.determinism_mode & CUOPT_DETERMINISM_BB) &&
+                                  context.branch_and_bound_ptr != nullptr;
 
-  std::vector<f_t> user_objective_vec(1);
-  std::vector<f_t> user_bound_vec(1);
-  std::vector<f_t> user_assignment_vec(temp_sol.assignment.size());
-  user_objective_vec[0] = user_objective;
-  user_bound_vec[0]     = user_bound;
-  raft::copy(user_assignment_vec.data(),
-             temp_sol.assignment.data(),
-             temp_sol.assignment.size(),
-             temp_sol.handle_ptr->get_stream());
-  temp_sol.handle_ptr->sync_stream();
-  callback->get_solution(user_assignment_vec.data(),
-                         user_objective_vec.data(),
-                         user_bound_vec.data(),
-                         callback->get_user_data());
-}
-
-template <typename i_t, typename f_t>
-void population_t<i_t, f_t>::run_solution_callbacks(solution_t<i_t, f_t>& sol)
-{
-  bool better_solution_found = is_better_than_best_feasible(sol);
-  auto user_callbacks        = context.settings.get_mip_callbacks();
-  if (better_solution_found) {
-    if (context.settings.benchmark_info_ptr != nullptr) {
-      context.settings.benchmark_info_ptr->last_improvement_of_best_feasible = timer.elapsed_time();
-    }
-    CUOPT_LOG_DEBUG("Population: Found new best solution %g", sol.get_user_objective());
-    if (problem_ptr->branch_and_bound_callback != nullptr) {
-      problem_ptr->branch_and_bound_callback(sol.get_host_assignment());
-    }
-    for (auto callback : user_callbacks) {
-      if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) {
-        auto get_sol_callback = static_cast<internals::get_solution_callback_t*>(callback);
-        invoke_get_solution_callback(sol, get_sol_callback);
+    if (deterministic_bb) {  // hand off to B&B with a work timestamp; no direct publish
+      const double work_timestamp = context.gpu_heur_loop.current_producer_work();
+      cuopt_assert(std::isfinite(work_timestamp),
+                   "Deterministic heuristic work timestamp must be finite");
+      context.branch_and_bound_ptr->queue_external_solution_deterministic(
+        sol.get_host_assignment(), sol.get_user_objective(), work_timestamp, callback_origin);
+    } else {  // notify B&B through the problem callback, then publish the payload
+      if (context.branch_and_bound_ptr != nullptr &&
+          context.problem_ptr->branch_and_bound_callback != nullptr) {
+        context.problem_ptr->branch_and_bound_callback(sol.get_host_assignment(), callback_origin);
       }
+
+      const double work_timestamp = context.gpu_heur_loop.current_work();
+      const auto payload          = context.solution_publication.build_callback_payload(
+        context.problem_ptr, sol, callback_origin, work_timestamp);
+      context.solution_publication.publish_new_best_feasible(payload, timer.elapsed_time());
     }
     // Save the best objective here even if callback handling later exits early.
     // This prevents older solutions from being reported as "new best" in subsequent callbacks.
     best_feasible_objective = sol.get_objective();
   }
 
-  for (auto callback : user_callbacks) {
-    if (callback->get_type() == internals::base_solution_callback_type::SET_SOLUTION) {
-      auto set_sol_callback       = static_cast<internals::set_solution_callback_t*>(callback);
-      f_t user_bound              = context.stats.get_solution_bound();
-      auto callback_num_variables = problem_ptr->original_problem_ptr->get_n_variables();
-      rmm::device_uvector<f_t> incumbent_assignment(callback_num_variables,
-                                                    sol.handle_ptr->get_stream());
-      solution_t<i_t, f_t> outside_sol(sol);
-      rmm::device_scalar<f_t> d_outside_sol_objective(sol.handle_ptr->get_stream());
-      auto inf = std::numeric_limits<f_t>::infinity();
-      d_outside_sol_objective.set_value_async(inf, sol.handle_ptr->get_stream());
-      sol.handle_ptr->sync_stream();
-      std::vector<f_t> h_incumbent_assignment(incumbent_assignment.size());
-      std::vector<f_t> h_outside_sol_objective(1, inf);
-      std::vector<f_t> h_user_bound(1, user_bound);
-      set_sol_callback->set_solution(h_incumbent_assignment.data(),
-                                     h_outside_sol_objective.data(),
-                                     h_user_bound.data(),
-                                     set_sol_callback->get_user_data());
-      f_t outside_sol_objective = h_outside_sol_objective[0];
-      // The callback might be called without setting any valid solution or objective which triggers
-      // asserts
-      if (outside_sol_objective == inf) { return; }
-      d_outside_sol_objective.set_value_async(outside_sol_objective, sol.handle_ptr->get_stream());
-      raft::copy(incumbent_assignment.data(),
-                 h_incumbent_assignment.data(),
-                 incumbent_assignment.size(),
-                 sol.handle_ptr->get_stream());
-
-      bool is_valid = problem_ptr->pre_process_assignment(incumbent_assignment);
-      if (!is_valid) { return; }
-      cuopt_assert(outside_sol.assignment.size() == incumbent_assignment.size(),
-                   "Incumbent assignment size mismatch");
-      raft::copy(outside_sol.assignment.data(),
-                 incumbent_assignment.data(),
-                 incumbent_assignment.size(),
-                 sol.handle_ptr->get_stream());
-      outside_sol.compute_feasibility();
-
-      CUOPT_LOG_DEBUG("Injected solution feasibility =  %d objective = %g excess = %g",
-                      outside_sol.get_feasible(),
-                      outside_sol.get_user_objective(),
-                      outside_sol.get_total_excess());
-      if (std::abs(outside_sol.get_user_objective() - outside_sol_objective) > 1e-6) {
-        cuopt_func_call(
-          CUOPT_LOG_DEBUG("External solution objective mismatch: outside_sol.get_user_objective() "
-                          "= %g, outside_sol_objective = %g",
-                          outside_sol.get_user_objective(),
-                          outside_sol_objective));
+  context.solution_injection.invoke_set_solution_callbacks(  // not gated on improvement
+    problem_ptr,
+    sol,
+    [this](
+      const std::vector<f_t>& assignment, f_t objective, internals::mip_solution_origin_t origin) {
+      const bool deterministic_bb = (context.settings.determinism_mode & CUOPT_DETERMINISM_BB) &&
+                                    context.branch_and_bound_ptr != nullptr;
+      if (deterministic_bb) {  // NOTE(review): no isfinite assert here, unlike above
+        const double work_timestamp = context.gpu_heur_loop.current_producer_work();
+        context.branch_and_bound_ptr->queue_external_solution_deterministic(
+          assignment,
+          context.problem_ptr->get_user_obj_from_solver_obj(objective),
+          work_timestamp,
+          origin);
+      } else {
+        add_external_solution(assignment, objective, origin);  // queued, drained later
       }
-      cuopt_assert(std::abs(outside_sol.get_user_objective() - outside_sol_objective) <= 1e-6,
-                   "External solution objective mismatch");
-      auto h_outside_sol = outside_sol.get_host_assignment();
-      add_external_solution(
-        h_outside_sol, outside_sol.get_objective(), solution_origin_t::EXTERNAL);
-    }
-  }
+    });
 }
 
 template <typename i_t, typename f_t>
@@ -401,7 +358,8 @@ void population_t<i_t, f_t>::adjust_weights_according_to_best_feasible()
 }
 
 template <typename i_t, typename f_t>
-std::pair<i_t, bool> population_t<i_t, f_t>::add_solution(solution_t<i_t, f_t>&& sol)
+std::pair<i_t, bool> population_t<i_t, f_t>::add_solution(
+  solution_t<i_t, f_t>&& sol, internals::mip_solution_origin_t callback_origin)
 {
   std::lock_guard<std::recursive_mutex> lock(write_mutex);
   raft::common::nvtx::range fun_scope("add_solution");
@@ -411,16 +369,18 @@ std::pair<i_t, bool> population_t<i_t, f_t>::add_solution(solution_t<i_t, f_t>&&
   // for hash computation, quality calculation, and similarity comparisons.
   sol.handle_ptr->sync_stream();
   population_hash_map.insert(sol);
-  double sol_cost   = sol.get_quality(weights);
-  bool best_updated = false;
-  CUOPT_LOG_DEBUG("Adding solution with quality %f and objective %f n_integers %d!",
+  double sol_cost               = sol.get_quality(weights);
+  bool best_updated             = false;
+  const uint32_t candidate_hash = sol.get_hash();
+  CUOPT_LOG_DEBUG("Adding solution with quality %f and objective %f n_integers %d, hash %x!",
                   sol_cost,
                   sol.get_user_objective(),
-                  sol.n_assigned_integers);
+                  sol.n_assigned_integers,
+                  candidate_hash);
   // We store the best feasible found so far at index 0.
   if (sol.get_feasible() &&
       (solutions[0].first == false || sol_cost + OBJECTIVE_EPSILON < indices[0].second)) {
-    run_solution_callbacks(sol);
+    run_solution_callbacks(sol, callback_origin);
     solutions[0].first = true;
     // we only have move assignment operator
     solution_t<i_t, f_t> temp_sol(sol);
@@ -706,7 +666,7 @@ void population_t<i_t, f_t>::halve_the_population()
     clear_except_best_feasible();
     var_threshold = std::max(var_threshold * 0.97, 0.5 * problem_ptr->n_integer_vars);
     for (auto& sol : sol_vec) {
-      add_solution(solution_t<i_t, f_t>(sol));
+      add_solution(solution_t<i_t, f_t>(sol), internals::mip_solution_origin_t::LOCAL_SEARCH);
     }
     if (counter++ > max_adjustments) break;
   }
@@ -718,7 +678,7 @@ void population_t<i_t, f_t>::halve_the_population()
       max_var_threshold,
       std::min((size_t)(var_threshold * 1.02), (size_t)(0.995 * problem_ptr->n_integer_vars)));
     for (auto& sol : sol_vec) {
-      add_solution(solution_t<i_t, f_t>(sol));
+      add_solution(solution_t<i_t, f_t>(sol), internals::mip_solution_origin_t::LOCAL_SEARCH);
     }
     if (counter++ > max_adjustments) break;
   }
@@ -744,7 +704,7 @@ void population_t<i_t, f_t>::start_threshold_adjustment()
 }
 
 template <typename i_t, typename f_t>
-void population_t<i_t, f_t>::adjust_threshold(cuopt::timer_t timer)
+void population_t<i_t, f_t>::adjust_threshold(cuopt::work_limit_timer_t& timer)
 {
   double time_ratio = (timer.elapsed_time() - population_start_time) /
                       (timer.get_time_limit() - population_start_time);
@@ -833,23 +793,29 @@ bool population_t<i_t, f_t>::test_invariant()
 template <typename i_t, typename f_t>
 void population_t<i_t, f_t>::print()
 {
+  std::vector<uint32_t> hashes;
+  for (auto& index : indices)
+    hashes.push_back(solutions[index.first].second.get_hash());
+  uint32_t final_hash = compute_hash(hashes);
   CUOPT_LOG_DEBUG(" -------------- ");
-  CUOPT_LOG_DEBUG("%s infeas weight %f threshold %d/%d:",
+  CUOPT_LOG_DEBUG("%s infeas weight %f threshold %d/%d (hash %x):",
                   name.c_str(),
                   infeasibility_importance,
                   var_threshold,
-                  problem_ptr->n_integer_vars);
+                  problem_ptr->n_integer_vars,
+                  final_hash);
   i_t i = 0;
   for (auto& index : indices) {
     if (index.first == 0 && solutions[0].first) {
       CUOPT_LOG_DEBUG(" Best feasible: %f", solutions[index.first].second.get_user_objective());
     }
-    CUOPT_LOG_DEBUG("%d :  %f\t%f\t%f\t%d",
+    CUOPT_LOG_DEBUG("%d :  %f\t%f\t%f\t%d (hash %x)",
                     i,
                     index.second,
                     solutions[index.first].second.get_total_excess(),
                     solutions[index.first].second.get_user_objective(),
-                    solutions[index.first].second.get_feasible());
+                    solutions[index.first].second.get_feasible(),
+                    solutions[index.first].second.get_hash());
     i++;
   }
   CUOPT_LOG_DEBUG(" -------------- ");
@@ -858,8 +824,8 @@ void population_t<i_t, f_t>::print()
 template <typename i_t, typename f_t>
 void population_t<i_t, f_t>::run_all_recombiners(solution_t<i_t, f_t>& sol)
 {
-  std::vector<solution_t<i_t, f_t>> sol_vec;
-  sol_vec.emplace_back(std::move(solution_t<i_t, f_t>(sol)));
+  std::vector<typename population_t<i_t, f_t>::drained_external_solution_t> sol_vec;  // copy of sol
+  sol_vec.emplace_back(solution_t<i_t, f_t>(sol), internals::mip_solution_origin_t::LOCAL_SEARCH);
   dm.recombine_and_ls_with_all(sol_vec, true);
 }
 
diff --git a/cpp/src/mip_heuristics/diversity/population.cuh b/cpp/src/mip_heuristics/diversity/population.cuh
index c83a4bfb83..9250b7cdcb 100644
--- a/cpp/src/mip_heuristics/diversity/population.cuh
+++ b/cpp/src/mip_heuristics/diversity/population.cuh
@@ -25,22 +25,20 @@ namespace cuopt::linear_programming::detail {
 template <typename i_t, typename f_t>
 class diversity_manager_t;
 
-enum class solution_origin_t { BRANCH_AND_BOUND, CPUFJ, RINS, EXTERNAL };
-
-constexpr const char* solution_origin_to_string(solution_origin_t origin)
-{
-  switch (origin) {
-    case solution_origin_t::BRANCH_AND_BOUND: return "B&B";
-    case solution_origin_t::CPUFJ: return "CPUFJ";
-    case solution_origin_t::RINS: return "RINS";
-    case solution_origin_t::EXTERNAL: return "injected";
-    default: return "unknown";
-  }
-}
-
 template <typename i_t, typename f_t>
 class population_t {
  public:
+  struct drained_external_solution_t {  // a solution paired with its origin tag
+    drained_external_solution_t(solution_t<i_t, f_t>&& solution_,
+                                internals::mip_solution_origin_t origin_)
+      : solution(std::move(solution_)), origin(origin_)
+    {
+    }
+
+    solution_t<i_t, f_t> solution;            // moved in by the constructor
+    internals::mip_solution_origin_t origin;  // tag of the component that produced `solution`
+  };
+
   population_t(std::string const& name,
                mip_solver_context_t<i_t, f_t>& context,
                diversity_manager_t<i_t, f_t>& dm,
@@ -83,6 +81,7 @@ class population_t {
       a.first = false;
     indices[0].second = std::numeric_limits<f_t>::max();
     indices.erase(indices.begin() + 1, indices.end());
+    best_feasible_objective = std::numeric_limits<f_t>::max();
   }
 
   void clear_except_best_feasible()
@@ -92,6 +91,7 @@ class population_t {
     }
     solutions[indices[0].first].first = true;
     indices.erase(indices.begin() + 1, indices.end());
+    best_feasible_objective = solutions[indices[0].first].second.get_objective();
   }
 
   // -------------------
@@ -103,16 +103,18 @@ class population_t {
   /*! \brief { Add a solution to population. Similar solutions may be ejected from the pool. }
    *  \return { -1 = not inserted , others = inserted index}
    */
-  std::pair<i_t, bool> add_solution(solution_t<i_t, f_t>&& sol);
+  std::pair<i_t, bool> add_solution(solution_t<i_t, f_t>&& sol,
+                                    internals::mip_solution_origin_t callback_origin);
   void add_external_solution(const std::vector<f_t>& solution,
                              f_t objective,
-                             solution_origin_t origin);
-  std::vector<solution_t<i_t, f_t>> get_external_solutions();
+                             internals::mip_solution_origin_t origin);
+  std::vector<drained_external_solution_t> get_external_solutions();
   void add_external_solutions_to_population();
   size_t get_external_solution_size();
   void preempt_heuristic_solver();
 
-  void add_solutions_from_vec(std::vector<solution_t<i_t, f_t>>&& solutions);
+  void add_solutions_from_vec(std::vector<solution_t<i_t, f_t>>&& solutions,
+                              internals::mip_solution_origin_t callback_origin);
 
   // Updates the cstr weights according to the best solutions feasibility
   void compute_new_weights();
@@ -122,7 +124,7 @@ class population_t {
   // updates qualities of each solution
   void update_qualities();
   // adjusts the threshold of the population
-  void adjust_threshold(cuopt::timer_t timer);
+  void adjust_threshold(cuopt::work_limit_timer_t& timer);
   /*! \param sol { Input solution }
    *  \return { Index of the best solution similar to sol. If no similar is found we return
    * max_solutions. }*/
@@ -153,7 +155,8 @@ class population_t {
   std::vector<solution_t<i_t, f_t>> population_to_vector();
   void halve_the_population();
 
-  void run_solution_callbacks(solution_t<i_t, f_t>& sol);
+  void run_solution_callbacks(solution_t<i_t, f_t>& sol,
+                              internals::mip_solution_origin_t callback_origin);
 
   void adjust_weights_according_to_best_feasible();
 
@@ -161,9 +164,6 @@ class population_t {
 
   void diversity_step(i_t max_iterations_without_improvement);
 
-  void invoke_get_solution_callback(solution_t<i_t, f_t>& sol,
-                                    internals::get_solution_callback_t* callback);
-
   // does some consistency tests
   bool test_invariant();
 
@@ -186,7 +186,9 @@ class population_t {
 
   struct external_solution_t {
     external_solution_t() = default;
-    external_solution_t(const std::vector<f_t>& solution, f_t objective, solution_origin_t origin)
+    external_solution_t(const std::vector<f_t>& solution,
+                        f_t objective,
+                        internals::mip_solution_origin_t origin)
       : solution(solution),
         objective(objective),
         origin(origin),
@@ -195,7 +197,7 @@ class population_t {
     }
     std::vector<f_t> solution;
     f_t objective;
-    solution_origin_t origin;
+    internals::mip_solution_origin_t origin;
     timer_t timer;  // debug timer to track how long a solution has lingered in the queue
   };
 
@@ -211,7 +213,7 @@ class population_t {
   // be seeded from an early-FJ incumbent objective before a matching population solution exists.
   f_t best_feasible_objective = std::numeric_limits<f_t>::max();
   assignment_hash_map_t<i_t, f_t> population_hash_map;
-  cuopt::timer_t timer;
+  cuopt::work_limit_timer_t timer;
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh
index 9d6bb3902c..687eb3ae54 100644
--- a/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh
+++ b/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh
@@ -29,6 +29,7 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
       rng(cuopt::seed_generator::get_seed()),
       vars_to_fix(n_vars, handle_ptr->get_stream())
   {
+    thrust::fill(handle_ptr->get_thrust_policy(), vars_to_fix.begin(), vars_to_fix.end(), -1);
   }
 
   void get_probing_values_for_infeasible(
@@ -131,9 +132,9 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
       });
   }
 
-  std::pair<solution_t<i_t, f_t>, bool> recombine(solution_t<i_t, f_t>& a,
-                                                  solution_t<i_t, f_t>& b,
-                                                  const weight_t<i_t, f_t>& weights)
+  std::tuple<solution_t<i_t, f_t>, bool, double> recombine(solution_t<i_t, f_t>& a,
+                                                           solution_t<i_t, f_t>& b,
+                                                           const weight_t<i_t, f_t>& weights)
   {
     raft::common::nvtx::range fun_scope("bound_prop_recombiner");
     auto& guiding_solution = a.get_feasible() ? a : b;
@@ -148,10 +149,11 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
     i_t n_vars_from_other  = n_different_vars;
     i_t fixed_from_guiding = 0;
     i_t fixed_from_other   = 0;
+    i_t seed               = cuopt::seed_generator::get_seed();
     if (n_different_vars > (i_t)bp_recombiner_config_t::max_n_of_vars_from_other) {
       fixed_from_guiding = n_vars_from_other - bp_recombiner_config_t::max_n_of_vars_from_other;
       n_vars_from_other  = bp_recombiner_config_t::max_n_of_vars_from_other;
-      thrust::default_random_engine g{(unsigned int)cuopt::seed_generator::get_seed()};
+      thrust::default_random_engine g{(unsigned int)seed};
       thrust::shuffle(a.handle_ptr->get_thrust_policy(),
                       this->remaining_indices.data(),
                       this->remaining_indices.data() + n_different_vars,
@@ -160,12 +162,36 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
     i_t n_vars_from_guiding = a.problem_ptr->n_integer_vars - n_vars_from_other;
     CUOPT_LOG_DEBUG(
       "n_vars_from_guiding %d n_vars_from_other %d", n_vars_from_guiding, n_vars_from_other);
+
+    // DETERMINISM DEBUG: Log everything that could affect divergence
+    CUOPT_DETERMINISM_LOG("BP_DET: sol_a_hash=0x%x sol_b_hash=0x%x offspring_hash=0x%x, seed %x",
+                          a.get_hash(),
+                          b.get_hash(),
+                          offspring.get_hash(),
+                          seed);
+    CUOPT_DETERMINISM_LOG("BP_DET: n_different_vars=%d n_vars_from_other=%d n_vars_from_guiding=%d",
+                          n_different_vars,
+                          n_vars_from_other,
+                          n_vars_from_guiding);
+    CUOPT_DETERMINISM_LOG(
+      "BP_DET: remaining_indices_hash=0x%x (first %d elements)",
+      detail::compute_hash(make_span(this->remaining_indices), a.handle_ptr->get_stream()),
+      std::min((i_t)10, n_vars_from_other));
+    CUOPT_DETERMINISM_LOG("BP_DET: guiding_feasible=%d other_feasible=%d expensive_to_fix=%d",
+                          guiding_solution.get_feasible(),
+                          other_solution.get_feasible(),
+                          a.problem_ptr->expensive_to_fix_vars);
+    CUOPT_DETERMINISM_LOG(
+      "BP_DET: fixed_from_guiding=%d fixed_from_other=%d", fixed_from_guiding, fixed_from_other);
+
     // if either all integers are from A(meaning all are common) or all integers are from B(meaning
     // all are different), return
     if (n_vars_from_guiding == 0 || n_vars_from_other == 0) {
       CUOPT_LOG_DEBUG("Returning false because all vars are common or different");
-      return std::make_pair(offspring, false);
+      return std::make_tuple(offspring, false, 0.0);
     }
+    // TODO: replace this ad-hoc work estimate (n_vars_from_other / 1e8)
+    double work = static_cast<double>(n_vars_from_other) / 1e8;
 
     cuopt_assert(a.problem_ptr == b.problem_ptr,
                  "The two solutions should not refer to different problems");
@@ -175,9 +201,16 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
                                                                a.handle_ptr->get_stream());
     probing_config_t<i_t, f_t> probing_config(a.problem_ptr->n_variables, a.handle_ptr);
     if (guiding_solution.get_feasible() && !a.problem_ptr->expensive_to_fix_vars) {
+      CUOPT_DETERMINISM_LOG("BP_DET: Taking FEASIBLE path (with variable fixing)");
       this->compute_vars_to_fix(offspring, vars_to_fix, n_vars_from_other, n_vars_from_guiding);
+      CUOPT_DETERMINISM_LOG("BP_DET: vars_to_fix_size=%lu", vars_to_fix.size());
       auto [fixed_problem, fixed_assignment, variable_map] = offspring.fix_variables(vars_to_fix);
-      timer_t timer(bp_recombiner_config_t::bounds_prop_time_limit);
+      CUOPT_DETERMINISM_LOG("BP_DET: fixed_problem_fingerprint=0x%x variable_map_size=%lu",
+                            fixed_problem.get_fingerprint(),
+                            variable_map.size());
+      work_limit_timer_t timer(this->context.gpu_heur_loop,
+                               bp_recombiner_config_t::bounds_prop_time_limit,
+                               *this->context.termination);
       rmm::device_uvector<f_t> old_assignment(offspring.assignment,
                                               offspring.handle_ptr->get_stream());
       offspring.handle_ptr->sync_stream();
@@ -197,26 +230,44 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
       constraint_prop.single_rounding_only  = true;
       constraint_prop.apply_round(offspring, lp_run_time_after_feasible, timer, probing_config);
       constraint_prop.single_rounding_only = false;
-      cuopt_func_call(bool feasible_after_bounds_prop = offspring.get_feasible());
+      offspring.compute_feasibility();
+      bool feasible_after_bounds_prop = offspring.get_feasible();
       offspring.handle_ptr->sync_stream();
       offspring.problem_ptr = a.problem_ptr;
       fixed_assignment      = std::move(offspring.assignment);
       offspring.assignment  = std::move(old_assignment);
       offspring.handle_ptr->sync_stream();
       offspring.unfix_variables(fixed_assignment, variable_map);
-      cuopt_func_call(bool feasible_after_unfix = offspring.get_feasible());
-      // May be triggered due to numerical issues
-      // TODO: investigate further
-      // cuopt_assert(feasible_after_unfix == feasible_after_bounds_prop,
-      //              "Feasible after unfix should be same as feasible after bounds prop!");
+      offspring.compute_feasibility();
+      bool feasible_after_unfix = offspring.get_feasible();
+      cuopt_func_call(f_t excess_after_unfix = offspring.get_total_excess());
+      if (feasible_after_unfix != feasible_after_bounds_prop) {
+        CUOPT_LOG_WARN("Numerical issue in bounds prop, infeasibility after unfix");
+        // might become infeasible after unfixing due to numerical issues. Check that the excess
+        // remains consistent
+        // CUOPT_LOG_ERROR("Excess: %g, %g, %g, %g, feas %d", offspring.get_total_excess(),
+        // offspring.compute_max_constraint_violation(), offspring.compute_max_int_violation(),
+        // offspring.compute_max_variable_violation(), feasible_after_unfix);
+        // cuopt_assert(fabs(excess_after_unfix - excess_before) < 1e-6,
+        //              "Excess after unfix should be same as before unfix!");
+      }
       a.handle_ptr->sync_stream();
     } else {
-      timer_t timer(bp_recombiner_config_t::bounds_prop_time_limit);
+      CUOPT_DETERMINISM_LOG("BP_DET: Taking INFEASIBLE path (no variable fixing)");
+      work_limit_timer_t timer(this->context.gpu_heur_loop,
+                               bp_recombiner_config_t::bounds_prop_time_limit,
+                               *this->context.termination);
       get_probing_values_for_infeasible(
         guiding_solution, other_solution, offspring, probing_values, n_vars_from_other);
       probing_config.probing_values = host_copy(probing_values, offspring.handle_ptr->get_stream());
+      CUOPT_DETERMINISM_LOG(
+        "BP_DET: probing_values_hash=0x%x",
+        detail::compute_hash(make_span(probing_values), a.handle_ptr->get_stream()));
       constraint_prop.apply_round(offspring, lp_run_time_after_feasible, timer, probing_config);
     }
+    CUOPT_DETERMINISM_LOG("BP_DET: After apply_round: offspring_hash=0x%x feasible=%d",
+                          offspring.get_hash(),
+                          offspring.get_feasible());
     constraint_prop.max_n_failed_repair_iterations = 1;
     cuopt_func_call(offspring.test_number_all_integer());
     bool better_cost_than_parents =
@@ -236,11 +287,17 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
         bp_recombiner_config_t::decrease_max_n_of_vars_from_other();
       }
     }
+    CUOPT_DETERMINISM_LOG(
+      "BP_DET: Final offspring_hash=0x%x same_as_parents=%d better_cost=%d better_feas=%d",
+      offspring.get_hash(),
+      same_as_parents,
+      better_cost_than_parents,
+      better_feasibility_than_parents);
     if (better_cost_than_parents || better_feasibility_than_parents) {
       CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents");
-      return std::make_pair(offspring, true);
+      return std::make_tuple(offspring, true, work);
     }
-    return std::make_pair(offspring, !same_as_parents);
+    return std::make_tuple(offspring, !same_as_parents, work);
   }
 
   rmm::device_uvector<i_t> vars_to_fix;
diff --git a/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh
index 1cca1ba371..0fe73c9e60 100644
--- a/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh
+++ b/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh
@@ -35,9 +35,9 @@ class fp_recombiner_t : public recombiner_t<i_t, f_t> {
   {
   }
 
-  std::pair<solution_t<i_t, f_t>, bool> recombine(solution_t<i_t, f_t>& a,
-                                                  solution_t<i_t, f_t>& b,
-                                                  const weight_t<i_t, f_t>& weights)
+  std::tuple<solution_t<i_t, f_t>, bool, double> recombine(solution_t<i_t, f_t>& a,
+                                                           solution_t<i_t, f_t>& b,
+                                                           const weight_t<i_t, f_t>& weights)
   {
     raft::common::nvtx::range fun_scope("FP recombiner");
     auto& guiding_solution = a.get_feasible() ? a : b;
@@ -50,6 +50,7 @@ class fp_recombiner_t : public recombiner_t<i_t, f_t> {
     CUOPT_LOG_DEBUG("FP rec: Number of different variables %d MAX_VARS %d",
                     n_different_vars,
                     fp_recombiner_config_t::max_n_of_vars_from_other);
+    CUOPT_DETERMINISM_LOG("FP rec: offspring hash 0x%x", offspring.get_hash());
     i_t n_vars_from_other = n_different_vars;
     if (n_vars_from_other > (i_t)fp_recombiner_config_t::max_n_of_vars_from_other) {
       n_vars_from_other = fp_recombiner_config_t::max_n_of_vars_from_other;
@@ -62,17 +63,34 @@ class fp_recombiner_t : public recombiner_t<i_t, f_t> {
     i_t n_vars_from_guiding = a.problem_ptr->n_integer_vars - n_vars_from_other;
     if (n_vars_from_other == 0 || n_vars_from_guiding == 0) {
       CUOPT_LOG_DEBUG("Returning false because all vars are common or different");
-      return std::make_pair(offspring, false);
+      return std::make_tuple(offspring, false, 0.0);
     }
+    // TODO: placeholder work estimate; change once a proper work model exists
+    double work = static_cast<double>(n_vars_from_other) / 1e8;
     CUOPT_LOG_DEBUG(
       "n_vars_from_guiding %d n_vars_from_other %d", n_vars_from_guiding, n_vars_from_other);
     this->compute_vars_to_fix(offspring, vars_to_fix, n_vars_from_other, n_vars_from_guiding);
+    CUOPT_DETERMINISM_LOG(
+      "FP rec post computevarstofix: offspring hash 0x%x, vars to fix 0x%x",
+      offspring.get_hash(),
+      detail::compute_hash(make_span(vars_to_fix), offspring.handle_ptr->get_stream()));
     auto [fixed_problem, fixed_assignment, variable_map] = offspring.fix_variables(vars_to_fix);
+    CUOPT_DETERMINISM_LOG(
+      "FP rec: fixed_problem hash 0x%x assigned hash 0x%x",
+      fixed_problem.get_fingerprint(),
+      detail::compute_hash(make_span(fixed_assignment), offspring.handle_ptr->get_stream()));
     fixed_problem.check_problem_representation(true);
     if (!guiding_solution.get_feasible() && !other_solution.get_feasible()) {
+      CUOPT_DETERMINISM_LOG("FP rec: running LP with infeasibility detection");
       relaxed_lp_settings_t lp_settings;
       lp_settings.time_limit = fp_recombiner_config_t::infeasibility_detection_time_limit;
-      lp_settings.tolerance  = fixed_problem.tolerances.absolute_tolerance;
+      if (this->context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) {
+        lp_settings.time_limit   = std::numeric_limits<double>::max();
+        lp_settings.work_limit   = fp_recombiner_config_t::infeasibility_detection_time_limit;
+        lp_settings.work_context = &this->context.gpu_heur_loop;
+        cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context");
+      }
+      lp_settings.tolerance             = fixed_problem.tolerances.absolute_tolerance;
       lp_settings.return_first_feasible = true;
       lp_settings.save_state            = true;
       lp_settings.check_infeasibility   = true;
@@ -83,7 +101,7 @@ class fp_recombiner_t : public recombiner_t<i_t, f_t> {
           lp_response.get_termination_status() == pdlp_termination_status_t::DualInfeasible ||
           lp_response.get_termination_status() == pdlp_termination_status_t::TimeLimit) {
         CUOPT_LOG_DEBUG("FP recombiner failed because LP found infeasible!");
-        return std::make_pair(offspring, false);
+        return std::make_tuple(offspring, false, 0.0);
       }
     }
     // brute force rounding threshold is 8
@@ -96,7 +114,16 @@ class fp_recombiner_t : public recombiner_t<i_t, f_t> {
       offspring.handle_ptr->sync_stream();
       offspring.assignment = std::move(fixed_assignment);
       cuopt_func_call(offspring.test_variable_bounds(false));
-      timer_t timer(fp_recombiner_config_t::fp_time_limit);
+      CUOPT_DETERMINISM_LOG(
+        "FP rec pre-descent: offspring_hash=0x%x fixed_assignment_hash=0x%x "
+        "problem_fingerprint=0x%x fixed_n_integer_vars=%d",
+        offspring.get_hash(),
+        detail::compute_hash(offspring.assignment, offspring.handle_ptr->get_stream()),
+        fixed_problem.get_fingerprint(),
+        fixed_problem.n_integer_vars);
+      work_limit_timer_t timer(this->context.gpu_heur_loop,
+                               fp_recombiner_config_t::fp_time_limit,
+                               *this->context.termination);
       fp.timer = timer;
       fp.cycle_queue.reset(offspring);
       fp.reset();
@@ -134,9 +161,9 @@ class fp_recombiner_t : public recombiner_t<i_t, f_t> {
                                            !guiding_solution.get_feasible();
     if (better_cost_than_parents || better_feasibility_than_parents) {
       CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents");
-      return std::make_pair(offspring, true);
+      return std::make_tuple(offspring, true, work);
     }
-    return std::make_pair(offspring, !same_as_parents);
+    return std::make_tuple(offspring, !same_as_parents, work);
   }
   rmm::device_uvector<i_t> vars_to_fix;
   // keep a copy of FP to prevent interference with generation FP
diff --git a/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh
index d413af86cd..80e6bc9dcd 100644
--- a/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh
+++ b/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh
@@ -66,22 +66,26 @@ class line_segment_recombiner_t : public recombiner_t<i_t, f_t> {
     return delta_vector;
   }
 
-  std::pair<solution_t<i_t, f_t>, bool> recombine(solution_t<i_t, f_t>& a,
-                                                  solution_t<i_t, f_t>& b,
-                                                  const weight_t<i_t, f_t>& weights)
+  std::tuple<solution_t<i_t, f_t>, bool, double> recombine(solution_t<i_t, f_t>& a,
+                                                           solution_t<i_t, f_t>& b,
+                                                           const weight_t<i_t, f_t>& weights)
   {
     raft::common::nvtx::range fun_scope("line_segment_recombiner");
+    CUOPT_DETERMINISM_LOG("LS rec: a %d b %d", a.get_hash(), b.get_hash());
     auto& guiding_solution = a.get_feasible() ? a : b;
     auto& other_solution   = a.get_feasible() ? b : a;
     // copy the solution from A
     solution_t<i_t, f_t> offspring(guiding_solution);
-    timer_t line_segment_timer{ls_recombiner_config_t::time_limit};
+    work_limit_timer_t line_segment_timer{
+      this->context.gpu_heur_loop, ls_recombiner_config_t::time_limit, *this->context.termination};
     // TODO after we have the conic combination, detect the lambda change
     // (i.e. the integral variables flip on line segment)
     i_t n_points_to_search        = ls_recombiner_config_t::n_points_to_search;
     const bool is_feasibility_run = false;
     i_t n_different_vars =
       this->assign_same_integer_values(guiding_solution, other_solution, offspring);
+    // TODO: placeholder work estimate; change once a proper work model exists
+    double work                           = static_cast<double>(n_different_vars) / 1e8;
     rmm::device_uvector<f_t> delta_vector = generate_delta_vector(
       guiding_solution, other_solution, offspring, n_points_to_search, n_different_vars);
     line_segment_search.fj.copy_weights(weights, offspring.handle_ptr);
@@ -117,9 +121,9 @@ class line_segment_recombiner_t : public recombiner_t<i_t, f_t> {
     }
     if (better_cost_than_parents || better_feasibility_than_parents) {
       CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents");
-      return std::make_pair(offspring, true);
+      return std::make_tuple(offspring, true, work);
     }
-    return std::make_pair(offspring, !same_as_parents);
+    return std::make_tuple(offspring, !same_as_parents, work);
   }
 
   line_segment_search_t<i_t, f_t>& line_segment_search;
diff --git a/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh
index 4782e9612b..452374796b 100644
--- a/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh
+++ b/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh
@@ -14,6 +14,7 @@
 #include <mip_heuristics/solver.cuh>
 #include <mip_heuristics/utils.cuh>
 #include <utilities/copy_helpers.hpp>
+#include <utilities/determinism_log.hpp>
 #include <utilities/device_utils.cuh>
 #include <utilities/seed_generator.cuh>
 
@@ -63,6 +64,18 @@ __global__ void assign_same_variables_kernel(typename solution_t<i_t, f_t>::view
 template <typename i_t, typename f_t>
 class recombiner_t {
  public:
+  static const char* recombiner_name(recombiner_enum_t recombiner)  // enum -> label for logs
+  {
+    switch (recombiner) {
+      case recombiner_enum_t::BOUND_PROP: return "BOUND_PROP";
+      case recombiner_enum_t::FP: return "FP";
+      case recombiner_enum_t::LINE_SEGMENT: return "LINE_SEGMENT";
+      case recombiner_enum_t::SUB_MIP: return "SUB_MIP";
+      case recombiner_enum_t::SIZE: return "SIZE";
+    }
+    return "UNKNOWN";  // reached only when no case above matches
+  }
+
   recombiner_t(mip_solver_context_t<i_t, f_t>& context_,
                i_t n_integer_vars,
                const raft::handle_t* handle_ptr)
@@ -92,6 +105,15 @@ class recombiner_t {
                                                          cuopt::make_span(remaining_indices),
                                                          n_remaining.data());
     i_t remaining_variables = this->n_remaining.value(a.handle_ptr->get_stream());
+    // Sort the indices to resolve nondeterministic order due to atomicAdd
+    thrust::sort(a.handle_ptr->get_thrust_policy(),
+                 this->remaining_indices.data(),
+                 this->remaining_indices.data() + remaining_variables);
+
+    CUOPT_DETERMINISM_LOG(
+      "remaining indices hash 0x%x, size %d",
+      detail::compute_hash(make_span(this->remaining_indices), a.handle_ptr->get_stream()),
+      remaining_variables);
 
     auto vec_remaining_indices =
       host_copy(this->remaining_indices.data(), remaining_variables, a.handle_ptr->get_stream());
@@ -173,6 +195,12 @@ class recombiner_t {
                            i_t n_vars_from_guiding)
   {
     vars_to_fix.resize(n_vars_from_guiding, offspring.handle_ptr->get_stream());
+    CUOPT_DETERMINISM_LOG(
+      "remaining indices hash 0x%x",
+      detail::compute_hash(make_span(this->remaining_indices), offspring.handle_ptr->get_stream()));
+    CUOPT_DETERMINISM_LOG("integer_indices hash 0x%x",
+                          detail::compute_hash(make_span(offspring.problem_ptr->integer_indices),
+                                               offspring.handle_ptr->get_stream()));
     // set difference needs two sorted arrays
     thrust::sort(offspring.handle_ptr->get_thrust_policy(),
                  this->remaining_indices.data(),
@@ -195,27 +223,54 @@ class recombiner_t {
                  "vars_to_fix should be sorted!");
   }
 
-  static void init_enabled_recombiners(const problem_t<i_t, f_t>& problem,
+  static void init_enabled_recombiners(mip_solver_context_t<i_t, f_t>& context,
+                                       const problem_t<i_t, f_t>& problem,
                                        int user_enabled_mask = -1)
   {
     std::unordered_set<recombiner_enum_t> enabled_recombiners;
+    const bool disable_fp_and_submip_for_expensive_fix = problem.expensive_to_fix_vars;
+    const i_t n_continuous_vars = problem.n_variables - problem.n_integer_vars;
+    const bool disable_submip_for_continuous_limit =
+      n_continuous_vars > (i_t)sub_mip_recombiner_config_t::max_continuous_vars;
+    const bool disable_submip_for_determinism =
+      (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) != 0;
     for (auto recombiner : recombiner_types) {
       if (user_enabled_mask >= 0 && !(user_enabled_mask & (1 << (uint32_t)recombiner))) {
         continue;
       }
       enabled_recombiners.insert(recombiner);
     }
-    if (problem.expensive_to_fix_vars) {
+    if (disable_fp_and_submip_for_expensive_fix) {
       enabled_recombiners.erase(recombiner_enum_t::FP);
       enabled_recombiners.erase(recombiner_enum_t::SUB_MIP);
     }
     // check the size of the continous vars
-    if (problem.n_variables - problem.n_integer_vars >
-        (i_t)sub_mip_recombiner_config_t::max_continuous_vars) {
+    if (disable_submip_for_continuous_limit) {
       enabled_recombiners.erase(recombiner_enum_t::SUB_MIP);
     }
+    // submip not supported in deterministic mode yet
+    if (disable_submip_for_determinism) { enabled_recombiners.erase(recombiner_enum_t::SUB_MIP); }
     recombiner_t::enabled_recombiners =
       std::vector<recombiner_enum_t>(enabled_recombiners.begin(), enabled_recombiners.end());
+    cuopt_assert(!recombiner_t::enabled_recombiners.empty(), "No recombiners enabled after init");
+    std::string order_str;
+    for (size_t i = 0; i < recombiner_t::enabled_recombiners.size(); ++i) {
+      if (i > 0) { order_str += ','; }
+      order_str += recombiner_name(recombiner_t::enabled_recombiners[i]);
+    }
+    CUOPT_DETERMINISM_LOG(
+      "Deterministic recombiner init: expensive_to_fix=%d n_continuous=%d "
+      "max_continuous=%zu disable_fp_submip_expensive=%d "
+      "disable_submip_continuous=%d disable_submip_deterministic=%d size=%zu "
+      "order=[%s]",
+      (int)problem.expensive_to_fix_vars,
+      (int)n_continuous_vars,
+      sub_mip_recombiner_config_t::max_continuous_vars,
+      (int)disable_fp_and_submip_for_expensive_fix,
+      (int)disable_submip_for_continuous_limit,
+      (int)disable_submip_for_determinism,
+      recombiner_t::enabled_recombiners.size(),
+      order_str.c_str());
   }
 
   mip_solver_context_t<i_t, f_t>& context;
diff --git a/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp b/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp
index 044e313284..6cd2767f81 100644
--- a/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp
+++ b/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp
@@ -75,8 +75,13 @@ struct all_recombine_stats {
 
   // enum of the last attempted recombiner
   std::optional<recombiner_enum_t> last_attempt;
-  double last_recombiner_time;
+  double last_recombiner_time{0.0};
   std::chrono::high_resolution_clock::time_point last_recombiner_start_time;
+  double last_recombiner_work{0.0};
+
+  void set_recombiner_work(double work) { last_recombiner_work = work; }
+
+  double get_last_recombiner_work() const { return last_recombiner_work; }  // last stored value
 
   void start_recombiner_time()
   {
diff --git a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
index 5a637aae8e..052aa515b1 100644
--- a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
+++ b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh
@@ -38,9 +38,9 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
     solution_vector.push_back(solution);
   }
 
-  std::pair<solution_t<i_t, f_t>, bool> recombine(solution_t<i_t, f_t>& a,
-                                                  solution_t<i_t, f_t>& b,
-                                                  const weight_t<i_t, f_t>& weights)
+  std::tuple<solution_t<i_t, f_t>, bool, double> recombine(solution_t<i_t, f_t>& a,
+                                                           solution_t<i_t, f_t>& b,
+                                                           const weight_t<i_t, f_t>& weights)
   {
     raft::common::nvtx::range fun_scope("Sub-MIP recombiner");
     solution_vector.clear();
@@ -66,8 +66,10 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
     i_t n_vars_from_guiding = a.problem_ptr->n_integer_vars - n_vars_from_other;
     if (n_vars_from_other == 0 || n_vars_from_guiding == 0) {
       CUOPT_LOG_DEBUG("Returning false because all vars are common or different");
-      return std::make_pair(offspring, false);
+      return std::make_tuple(offspring, false, 0.0);
     }
+    // TODO: placeholder work estimate; change once a proper work model exists
+    double work = static_cast<double>(n_vars_from_other) / 1e8;
     CUOPT_LOG_DEBUG(
       "n_vars_from_guiding %d n_vars_from_other %d", n_vars_from_guiding, n_vars_from_other);
     this->compute_vars_to_fix(offspring, vars_to_fix, n_vars_from_other, n_vars_from_guiding);
@@ -112,10 +114,11 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
       branch_and_bound_settings.clique_cuts                              = 0;
       branch_and_bound_settings.sub_mip                                  = 1;
       branch_and_bound_settings.strong_branching_simplex_iteration_limit = 200;
-      branch_and_bound_settings.solution_callback = [this](std::vector<f_t>& solution,
-                                                           f_t objective) {
-        this->solution_callback(solution, objective);
-      };
+      branch_and_bound_settings.new_incumbent_callback =
+        [this](std::vector<f_t>& solution,
+               f_t objective,
+               const cuopt::internals::mip_solution_callback_info_t&,
+               double) { this->solution_callback(solution, objective); };
 
       // disable B&B logs, so that it is not interfering with the main B&B thread
       branch_and_bound_settings.log.log = false;
@@ -185,7 +188,7 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
       sol.clamp_within_bounds();  // Scaling might bring some very slight variable bound violations
       sol.compute_feasibility();
       cuopt_func_call(sol.test_variable_bounds());
-      population.add_solution(std::move(sol));
+      population.add_solution(std::move(sol), internals::mip_solution_origin_t::SUB_MIP);
     }
     bool better_cost_than_parents =
       offspring.get_quality(weights) <
@@ -195,9 +198,9 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
                                            !guiding_solution.get_feasible();
     if (better_cost_than_parents || better_feasibility_than_parents) {
       CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents");
-      return std::make_pair(offspring, true);
+      return std::make_tuple(offspring, true, work);
     }
-    return std::make_pair(offspring, !std::isnan(branch_and_bound_solution.objective));
+    return std::make_tuple(offspring, !std::isnan(branch_and_bound_solution.objective), work);
   }
   rmm::device_uvector<i_t> vars_to_fix;
   mip_solver_context_t<i_t, f_t>& context;
diff --git a/cpp/src/mip_heuristics/diversity/weights.cuh b/cpp/src/mip_heuristics/diversity/weights.cuh
index 7502ae9210..fbe72aba8e 100644
--- a/cpp/src/mip_heuristics/diversity/weights.cuh
+++ b/cpp/src/mip_heuristics/diversity/weights.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -12,6 +12,8 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <mip_heuristics/utils.cuh>
+
 namespace cuopt::linear_programming::detail {
 
 template <typename i_t, typename f_t>
@@ -25,6 +27,11 @@ struct weight_t {
     objective_weight.set_value_async(one, handle_ptr->get_stream());
   }
 
+  uint32_t get_hash(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const
+  {
+    return compute_hash(cstr_weights, stream) ^ compute_hash(objective_weight.value(stream));
+  }
+
   rmm::device_uvector<f_t> cstr_weights;
   rmm::device_scalar<f_t> objective_weight;
 };
diff --git a/cpp/src/mip_heuristics/early_heuristic.cuh b/cpp/src/mip_heuristics/early_heuristic.cuh
index 090cfd4901..ddab090a5b 100644
--- a/cpp/src/mip_heuristics/early_heuristic.cuh
+++ b/cpp/src/mip_heuristics/early_heuristic.cuh
@@ -24,8 +24,10 @@
 namespace cuopt::linear_programming::detail {
 
 template <typename f_t>
-using early_incumbent_callback_t = std::function<void(
-  f_t solver_obj, f_t user_obj, const std::vector<f_t>& assignment, const char* heuristic_name)>;
+using early_incumbent_callback_t = std::function<void(f_t solver_obj,
+                                                      f_t user_obj,
+                                                      const std::vector<f_t>& assignment,
+                                                      internals::mip_solution_origin_t origin)>;
 
 // CRTP base for early heuristics that run on the original (or papilo-presolved) problem
 // during presolve to find incumbents as early as possible.
@@ -89,10 +91,14 @@ class early_heuristic_t {
     best_assignment_ = user_assignment;
     solution_found_  = true;
     f_t user_obj     = problem_ptr_->get_user_obj_from_solver_obj(solver_obj);
-    // Log and callback are deferred to the shared incumbent_callback_ which enforces
-    // global monotonicity across all early heuristic instances.
+    double elapsed =
+      std::chrono::duration<double>(std::chrono::steady_clock::now() - start_time_).count();
+    CUOPT_LOG_INFO("Early heuristics (%s) lowered the primal bound. Objective %g. Time %.2f",
+                   Derived::name(),
+                   user_obj,
+                   elapsed);
     if (incumbent_callback_) {
-      incumbent_callback_(solver_obj, user_obj, user_assignment, Derived::name());
+      incumbent_callback_(solver_obj, user_obj, user_assignment, Derived::origin());
     }
   }
 
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh
index 911e846551..89bdff1092 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh
@@ -27,6 +27,10 @@ class early_cpufj_t : public early_heuristic_t<i_t, f_t, early_cpufj_t<i_t, f_t>
   ~early_cpufj_t();
 
   static constexpr const char* name() { return "CPUFJ"; }
+  static constexpr internals::mip_solution_origin_t origin()
+  {
+    return internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP;
+  }
 
   void start();
   void stop();
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
index 3f77427d87..59ad7ed0fd 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
@@ -52,10 +52,10 @@ void early_gpufj_t<i_t, f_t>::start()
 
   fj_ptr_ = std::make_unique<fj_t<i_t, f_t>>(*context_ptr_, fj_settings);
 
-  fj_ptr_->improvement_callback = [this](f_t user_obj, const std::vector<f_t>& h_assignment) {
+  fj_ptr_->set_improvement_callback([this](f_t user_obj, const std::vector<f_t>& h_assignment) {
     f_t solver_obj = this->problem_ptr_->get_solver_obj_from_user_obj(user_obj);
     this->try_update_best(solver_obj, h_assignment);
-  };
+  });
 
   worker_thread_ = std::make_unique<std::thread>(&early_gpufj_t::run_worker, this);
 }
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh
index 4a7769143e..f09fc011d5 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh
@@ -30,6 +30,10 @@ class early_gpufj_t : public early_heuristic_t<i_t, f_t, early_gpufj_t<i_t, f_t>
   ~early_gpufj_t();
 
   static constexpr const char* name() { return "GPUFJ"; }
+  static constexpr internals::mip_solution_origin_t origin()
+  {
+    return internals::mip_solution_origin_t::FEASIBILITY_JUMP;
+  }
 
   void start();
   void stop();
diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu
index 748dd41dfb..ed41402621 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu
@@ -14,6 +14,7 @@
 #include <mip_heuristics/diversity/population.cuh>
 #include <mip_heuristics/mip_constants.hpp>
 #include <mip_heuristics/utils.cuh>
+#include <utilities/determinism_log.hpp>
 #include <utilities/seed_generator.cuh>
 #include <utilities/timer.hpp>
 
@@ -23,6 +24,7 @@
 
 #include <thrust/copy.h>
 #include <thrust/count.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/logical.h>
 #include <thrust/sort.h>
 #include <cub/cub.cuh>
@@ -63,7 +65,8 @@ fj_t<i_t, f_t>::fj_t(mip_solver_context_t<i_t, f_t>& context_, fj_settings_t in_
     work_id_to_nonbin_var_idx(pb_ptr->coefficients.size(), pb_ptr->handle_ptr->get_stream()),
     row_size_bin_prefix_sum(pb_ptr->binary_indices.size(), pb_ptr->handle_ptr->get_stream()),
     row_size_nonbin_prefix_sum(pb_ptr->nonbinary_indices.size(), pb_ptr->handle_ptr->get_stream()),
-    work_ids_for_related_vars(pb_ptr->n_variables, pb_ptr->handle_ptr->get_stream())
+    work_ids_for_related_vars(pb_ptr->n_variables, pb_ptr->handle_ptr->get_stream()),
+    deterministic_frontier_work_by_var_d_(0, pb_ptr->handle_ptr->get_stream())
 {
   setval_launch_dims = get_launch_dims_max_occupancy(
     (void*)update_assignment_kernel<i_t, f_t>, TPB_setval, pb_ptr->handle_ptr);
@@ -111,6 +114,158 @@ void fj_t<i_t, f_t>::reset_cuda_graph()
   graph_created = false;
 }
 
+template <typename i_t, typename f_t>
+bool fj_t<i_t, f_t>::use_load_balancing_codepath() const
+{
+  // Rounding mode never takes the load-balancing codepath, whatever the configured mode is.
+  if (settings.mode == fj_mode_t::ROUNDING) { return false; }
+  switch (settings.load_balancing_mode) {
+    case fj_load_balancing_mode_t::ALWAYS_OFF: return false;
+    case fj_load_balancing_mode_t::ALWAYS_ON: return true;
+    case fj_load_balancing_mode_t::AUTO:
+      // AUTO: enable only above the configured variable-count threshold
+      return pb_ptr->n_variables > settings.parameters.load_balancing_codepath_min_varcount;
+    default: break;
+  }
+  return false;
+}
+
+// Precompute a per-variable estimate of the work (nnz touched) done when that variable is
+// selected, from the related_variables table when it is populated and from an
+// A^T * (A * degree) proxy otherwise. Will be replaced with a model estimator in the future.
+template <typename i_t, typename f_t>
+void fj_t<i_t, f_t>::initialize_deterministic_work_estimator()
+{
+  const i_t num_vars     = pb_ptr->n_variables;
+  const i_t num_cstrs    = pb_ptr->n_constraints;
+  const double total_nnz = static_cast<double>(pb_ptr->coefficients.size());
+
+  deterministic_refresh_work_          = total_nnz;
+  deterministic_average_frontier_work_ = total_nnz;
+  if (num_vars == 0) { return; }  // nothing to estimate; keep the nnz-based defaults
+
+  auto stream = handle_ptr->get_stream();
+  auto policy = handle_ptr->get_thrust_policy();
+
+  // degree[v] = number of constraints variable v appears in
+  rmm::device_uvector<double> degree(num_vars, stream);
+  auto rev_offsets = make_span(pb_ptr->reverse_offsets);
+  thrust::tabulate(policy, degree.begin(), degree.end(), [rev_offsets] __device__(i_t v) -> double {
+    return (double)(rev_offsets[v + 1] - rev_offsets[v]);
+  });
+
+  deterministic_frontier_work_by_var_d_.resize(num_vars, stream);
+
+  if (pb_ptr->related_variables_offsets.size() > 0 && pb_ptr->related_variables.size() > 0) {
+    // Exact path: segmented reduce over the precomputed related_variables table
+    auto degree_ptr        = degree.data();
+    auto related_offsets   = pb_ptr->related_variables_offsets.data();
+    auto degree_of_related = thrust::make_transform_iterator(
+      pb_ptr->related_variables.begin(), [degree_ptr, num_vars] __device__(i_t rv) -> double {
+        return (rv >= 0 && rv < num_vars) ? degree_ptr[rv] : 0.0;
+      });
+
+    size_t temp_bytes = 0;
+    cub::DeviceSegmentedReduce::Sum(nullptr,
+                                    temp_bytes,
+                                    degree_of_related,
+                                    deterministic_frontier_work_by_var_d_.data(),
+                                    num_vars,
+                                    related_offsets,
+                                    related_offsets + 1,
+                                    stream);
+    rmm::device_uvector<char> temp(temp_bytes, stream);
+    cub::DeviceSegmentedReduce::Sum(temp.data(),
+                                    temp_bytes,
+                                    degree_of_related,
+                                    deterministic_frontier_work_by_var_d_.data(),
+                                    num_vars,
+                                    related_offsets,
+                                    related_offsets + 1,
+                                    stream);
+
+  } else {
+    // SpMV path: frontier_work ≈ A^T * (A * degree)
+    // Overestimates by double-counting shared neighbors, but deterministic and
+    // load-balanced. Acceptable for a work-unit proxy.
+
+    // Step 1: y[c] = sum of degree[v] for v in constraint c
+    rmm::device_uvector<double> y(num_cstrs, stream);
+    auto degree_ptr    = degree.data();
+    auto offsets_ptr   = pb_ptr->offsets.data();
+    auto degree_of_var = thrust::make_transform_iterator(
+      pb_ptr->variables.begin(),
+      [degree_ptr] __device__(i_t v) -> double { return degree_ptr[v]; });
+
+    size_t temp_bytes = 0;
+    cub::DeviceSegmentedReduce::Sum(nullptr,
+                                    temp_bytes,
+                                    degree_of_var,
+                                    y.data(),
+                                    num_cstrs,
+                                    offsets_ptr,
+                                    offsets_ptr + 1,
+                                    stream);
+    rmm::device_uvector<char> temp(temp_bytes, stream);
+    cub::DeviceSegmentedReduce::Sum(temp.data(),
+                                    temp_bytes,
+                                    degree_of_var,
+                                    y.data(),
+                                    num_cstrs,
+                                    offsets_ptr,
+                                    offsets_ptr + 1,
+                                    stream);
+
+    // Step 2: frontier_work[v] = sum of y[c] for c in constraints_of(v)
+    auto rev_offs_ptr = pb_ptr->reverse_offsets.data();
+    auto y_ptr        = y.data();
+    auto y_of_constraint =
+      thrust::make_transform_iterator(pb_ptr->reverse_constraints.begin(),
+                                      [y_ptr] __device__(i_t c) -> double { return y_ptr[c]; });
+
+    temp_bytes = 0;
+    cub::DeviceSegmentedReduce::Sum(nullptr,
+                                    temp_bytes,
+                                    y_of_constraint,
+                                    deterministic_frontier_work_by_var_d_.data(),
+                                    num_vars,
+                                    rev_offs_ptr,
+                                    rev_offs_ptr + 1,
+                                    stream);
+    temp.resize(temp_bytes, stream);
+    cub::DeviceSegmentedReduce::Sum(temp.data(),
+                                    temp_bytes,
+                                    y_of_constraint,
+                                    deterministic_frontier_work_by_var_d_.data(),
+                                    num_vars,
+                                    rev_offs_ptr,
+                                    rev_offs_ptr + 1,
+                                    stream);
+  }
+
+  deterministic_average_frontier_work_ =
+    thrust::reduce(policy,
+                   deterministic_frontier_work_by_var_d_.begin(),
+                   deterministic_frontier_work_by_var_d_.end(),
+                   0.0,
+                   thrust::plus<double>()) /
+    (double)num_vars;
+  deterministic_frontier_work_by_var_.resize(num_vars);
+  raft::copy(deterministic_frontier_work_by_var_.data(),
+             deterministic_frontier_work_by_var_d_.data(),
+             num_vars,
+             stream);
+
+  CUOPT_LOG_DEBUG(
+    "FJ determ: initialized frontier work estimator avg_frontier_nnz=%.6f refresh_nnz=%.6f "
+    "vars=%zu nnz=%zu load_balancing=%d",
+    deterministic_average_frontier_work_,
+    deterministic_refresh_work_,
+    static_cast<size_t>(num_vars),
+    pb_ptr->coefficients.size(),
+    (int)use_load_balancing_codepath());
+}
+
 template <typename i_t, typename f_t>
 fj_t<i_t, f_t>::~fj_t()
 {
@@ -189,38 +344,43 @@ fj_t<i_t, f_t>::climber_data_t::view_t fj_t<i_t, f_t>::climber_data_t::view()
   v.jump_candidates      = make_span(jump_candidates);
   v.jump_candidate_count = make_span(jump_candidate_count);
   v.jump_locks           = make_span(jump_locks);
-  v.candidate_arrived_workids         = make_span(candidate_arrived_workids);
-  v.grid_score_buf                    = make_span(grid_score_buf);
-  v.grid_delta_buf                    = make_span(grid_delta_buf);
-  v.grid_var_buf                      = make_span(grid_var_buf);
-  v.row_size_bin_prefix_sum           = make_span(fj.row_size_bin_prefix_sum);
-  v.row_size_nonbin_prefix_sum        = make_span(fj.row_size_nonbin_prefix_sum);
-  v.work_id_to_bin_var_idx            = make_span(fj.work_id_to_bin_var_idx);
-  v.work_id_to_nonbin_var_idx         = make_span(fj.work_id_to_nonbin_var_idx);
-  v.work_ids_for_related_vars         = make_span(fj.work_ids_for_related_vars);
-  v.fractional_variables              = fractional_variables.view();
-  v.saved_best_fractional_count       = saved_best_fractional_count.data();
-  v.handle_fractionals_only           = handle_fractionals_only.data();
-  v.selected_var                      = selected_var.data();
-  v.violation_score                   = violation_score.data();
-  v.weighted_violation_score          = weighted_violation_score.data();
-  v.constraints_changed_count         = constraints_changed_count.data();
-  v.local_minimums_reached            = local_minimums_reached.data();
-  v.iterations                        = iterations.data();
-  v.best_excess                       = best_excess.data();
-  v.best_objective                    = best_objective.data();
-  v.saved_solution_objective          = saved_solution_objective.data();
-  v.incumbent_quality                 = incumbent_quality.data();
-  v.incumbent_objective               = incumbent_objective.data();
-  v.weight_update_increment           = fj.weight_update_increment;
-  v.objective_weight                  = fj.objective_weight.data();
-  v.last_minimum_iteration            = last_minimum_iteration.data();
-  v.last_improving_minimum            = last_improving_minimum.data();
-  v.last_iter_candidates              = last_iter_candidates.data();
-  v.relvar_count_last_update          = relvar_count_last_update.data();
-  v.load_balancing_skip               = load_balancing_skip.data();
-  v.break_condition                   = break_condition.data();
-  v.temp_break_condition              = temp_break_condition.data();
+  v.candidate_arrived_workids          = make_span(candidate_arrived_workids);
+  v.grid_score_buf                     = make_span(grid_score_buf);
+  v.grid_delta_buf                     = make_span(grid_delta_buf);
+  v.grid_var_buf                       = make_span(grid_var_buf);
+  v.row_size_bin_prefix_sum            = make_span(fj.row_size_bin_prefix_sum);
+  v.row_size_nonbin_prefix_sum         = make_span(fj.row_size_nonbin_prefix_sum);
+  v.work_id_to_bin_var_idx             = make_span(fj.work_id_to_bin_var_idx);
+  v.work_id_to_nonbin_var_idx          = make_span(fj.work_id_to_nonbin_var_idx);
+  v.work_ids_for_related_vars          = make_span(fj.work_ids_for_related_vars);
+  v.deterministic_frontier_work_by_var = make_span(fj.deterministic_frontier_work_by_var_d_);
+  v.fractional_variables               = fractional_variables.view();
+  v.saved_best_fractional_count        = saved_best_fractional_count.data();
+  v.handle_fractionals_only            = handle_fractionals_only.data();
+  v.selected_var                       = selected_var.data();
+  v.violation_score                    = violation_score.data();
+  v.weighted_violation_score           = weighted_violation_score.data();
+  v.constraints_changed_count          = constraints_changed_count.data();
+  v.local_minimums_reached             = local_minimums_reached.data();
+  v.iterations                         = iterations.data();
+  v.best_excess                        = best_excess.data();
+  v.best_objective                     = best_objective.data();
+  v.saved_solution_objective           = saved_solution_objective.data();
+  v.incumbent_quality                  = incumbent_quality.data();
+  v.incumbent_objective                = incumbent_objective.data();
+  v.weight_update_increment            = fj.weight_update_increment;
+  v.objective_weight                   = fj.objective_weight.data();
+  v.last_minimum_iteration             = last_minimum_iteration.data();
+  v.last_improving_minimum             = last_improving_minimum.data();
+  v.last_iter_candidates               = last_iter_candidates.data();
+  v.relvar_count_last_update           = relvar_count_last_update.data();
+  v.load_balancing_skip                = load_balancing_skip.data();
+  v.break_condition                    = break_condition.data();
+  v.temp_break_condition               = temp_break_condition.data();
+  v.deterministic_batch_work           = deterministic_batch_work.data();
+  v.deterministic_refresh_work         = fj.deterministic_refresh_work_;
+  v.deterministic_work_accounting =
+    (fj.context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS);
   v.best_jump_idx                     = best_jump_idx.data();
   v.small_move_tabu                   = small_move_tabu.data();
   v.stop_threshold                    = fj.stop_threshold;
@@ -432,9 +592,7 @@ void fj_t<i_t, f_t>::climber_init(i_t climber_idx, const rmm::cuda_stream_view&
   f_t inf = std::numeric_limits<f_t>::infinity();
   climber->best_objective.set_value_async(inf, climber_stream);
   climber->saved_solution_objective.set_value_async(inf, climber_stream);
-  climber->violation_score.set_value_to_zero_async(climber_stream);
-  climber->weighted_violation_score.set_value_to_zero_async(climber_stream);
-  init_lhs_and_violation<i_t, f_t><<<256, 256, 0, climber_stream.value()>>>(view);
+  refresh_lhs_and_violation(climber_stream);
 
   // initialize the best_objective values according to the initial assignment
   f_t best_obj = compute_objective_from_vec<i_t, f_t>(
@@ -458,6 +616,7 @@ void fj_t<i_t, f_t>::climber_init(i_t climber_idx, const rmm::cuda_stream_view&
   climber->last_iter_candidates.set_value_to_zero_async(climber_stream);
   climber->relvar_count_last_update.set_value_to_zero_async(climber_stream);
   climber->load_balancing_skip.set_value_to_zero_async(climber_stream);
+  climber->deterministic_batch_work.set_value_to_zero_async(climber_stream);
   climber->constraints_changed_count.set_value_to_zero_async(climber_stream);
   climber->iterations.set_value_to_zero_async(climber_stream);
   climber->full_refresh_iteration.set_value_to_zero_async(climber_stream);
@@ -650,10 +809,10 @@ void fj_t<i_t, f_t>::run_step_device(const rmm::cuda_stream_view& climber_stream
   auto [grid_setval, blocks_setval] = setval_launch_dims;
   auto [grid_update_changed_constraints, blocks_update_changed_constraints] =
     update_changed_constraints_launch_dims;
-  auto [grid_resetmoves, blocks_resetmoves]         = resetmoves_launch_dims;
-  auto [grid_resetmoves_bin, blocks_resetmoves_bin] = resetmoves_bin_launch_dims;
-  auto [grid_update_weights, blocks_update_weights] = update_weights_launch_dims;
-  auto [grid_lift_move, blocks_lift_move]           = lift_move_launch_dims;
+  auto [grid_resetmoves, blocks_resetmoves]                          = resetmoves_launch_dims;
+  auto [grid_resetmoves_bin, blocks_resetmoves_bin]                  = resetmoves_bin_launch_dims;
+  [[maybe_unused]] auto [grid_update_weights, blocks_update_weights] = update_weights_launch_dims;
+  [[maybe_unused]] auto [grid_lift_move, blocks_lift_move]           = lift_move_launch_dims;
 
   auto& data    = *climbers[climber_idx];
   auto v        = data.view();
@@ -669,17 +828,10 @@ void fj_t<i_t, f_t>::run_step_device(const rmm::cuda_stream_view& climber_stream
   // as it breaks assumptions in the binary_pb codepath
   if (settings.mode == fj_mode_t::ROUNDING) { is_binary_pb = false; }
 
-  bool use_load_balancing = false;
-  if (settings.load_balancing_mode == fj_load_balancing_mode_t::ALWAYS_OFF) {
-    use_load_balancing = false;
-  } else if (settings.load_balancing_mode == fj_load_balancing_mode_t::ALWAYS_ON) {
-    use_load_balancing = true;
-  } else if (settings.load_balancing_mode == fj_load_balancing_mode_t::AUTO) {
-    use_load_balancing =
-      pb_ptr->n_variables > settings.parameters.load_balancing_codepath_min_varcount;
+  bool use_load_balancing = use_load_balancing_codepath();
+  if (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) {
+    data.deterministic_batch_work.set_value_to_zero_async(climber_stream);
   }
-  // Load-balanced codepath not updated yet to handle rounding mode
-  if (settings.mode == fj_mode_t::ROUNDING) { use_load_balancing = false; }
 
   cudaGraph_t graph;
   void* kernel_args[]            = {&v};
@@ -841,9 +993,40 @@ void fj_t<i_t, f_t>::refresh_lhs_and_violation(const rmm::cuda_stream_view& stre
   auto v     = data.view();
 
   data.violated_constraints.clear(stream);
-  data.violation_score.set_value_to_zero_async(stream);
-  data.weighted_violation_score.set_value_to_zero_async(stream);
-  init_lhs_and_violation<i_t, f_t><<<4096, 256, 0, stream>>>(v);
+  init_lhs_and_violated_constraints<i_t, f_t><<<4096, 256, 0, stream>>>(v);
+  // the two transform_reduce passes below could be fused, but this is hardly a bottleneck
+  auto violation =
+    thrust::transform_reduce(rmm::exec_policy(stream),
+                             thrust::make_counting_iterator<i_t>(0),
+                             thrust::make_counting_iterator<i_t>(pb_ptr->n_constraints),
+                             cuda::proclaim_return_type<f_t>([v] __device__(i_t cstr_idx) {
+                               return v.excess_score(cstr_idx, v.incumbent_lhs[cstr_idx]);
+                             }),
+                             (f_t)0,
+                             thrust::plus<f_t>());
+  auto weighted_violation = thrust::transform_reduce(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<i_t>(0),
+    thrust::make_counting_iterator<i_t>(pb_ptr->n_constraints),
+    cuda::proclaim_return_type<f_t>([v] __device__(i_t cstr_idx) {
+      return v.excess_score(cstr_idx, v.incumbent_lhs[cstr_idx]) * v.cstr_weights[cstr_idx];
+    }),
+    (f_t)0,
+    thrust::plus<f_t>());
+  data.violation_score.set_value_async(violation, stream);
+  data.weighted_violation_score.set_value_async(weighted_violation, stream);
+  if ((context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) {
+    data.violated_constraints.sort(stream);
+  }
+#if FJ_SINGLE_STEP
+  CUOPT_LOG_DEBUG("hash assignment %x, hash lhs %x, hash lhscomp %x",
+                  detail::compute_hash(data.incumbent_assignment, stream),
+                  detail::compute_hash(data.incumbent_lhs, stream),
+                  detail::compute_hash(data.incumbent_lhs_sumcomp, stream));
+  CUOPT_LOG_DEBUG("Violated constraints hash post sort: %x, index map %x",
+                  detail::compute_hash(data.violated_constraints.contents, stream),
+                  detail::compute_hash(data.violated_constraints.index_map, stream));
+#endif
 }
 
 template <typename i_t, typename f_t>
@@ -851,6 +1034,10 @@ i_t fj_t<i_t, f_t>::host_loop(solution_t<i_t, f_t>& solution, i_t climber_idx)
 {
   auto& data = *climbers[climber_idx];
   auto v     = data.view();  // == climber_views[climber_idx]
+  const bool deterministic_work_estimate =
+    (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS);
+  const bool use_graph           = true;
+  const i_t iterations_per_batch = use_graph ? iterations_per_graph : 1;
 
   auto climber_stream = data.stream.view();
   if (climber_idx == 0) climber_stream = handle_ptr->get_stream();
@@ -865,12 +1052,13 @@ i_t fj_t<i_t, f_t>::host_loop(solution_t<i_t, f_t>& solution, i_t climber_idx)
 
   data.incumbent_quality.set_value_async(obj, handle_ptr->get_stream());
 
-  timer_t timer(settings.time_limit);
+  work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination);
   i_t steps;
   bool limit_reached = false;
-  for (steps = 0; steps < std::numeric_limits<i_t>::max(); steps += iterations_per_graph) {
+  for (steps = 0; steps < std::numeric_limits<i_t>::max(); steps += iterations_per_batch) {
     // to actualize time limit
     handle_ptr->sync_stream();
+    const bool lhs_refreshed = (steps % settings.parameters.lhs_refresh_period == 0);
     if (timer.check_time_limit() || steps >= settings.iteration_limit ||
         context.preempt_heuristic_solver_.load()) {
       limit_reached = true;
@@ -879,9 +1067,11 @@ i_t fj_t<i_t, f_t>::host_loop(solution_t<i_t, f_t>& solution, i_t climber_idx)
     // every now and then, ensure external solutions are added to the population
     // this is done here because FJ is called within FP and also after recombiners
     // so FJ is one of the most inner and most frequent functions to be called
-    if (steps % 10000 == 0 && context.diversity_manager_ptr != nullptr) {
-      context.diversity_manager_ptr->get_population_pointer()
-        ->add_external_solutions_to_population();
+    if (steps % 10000 == 0 && context.diversity_manager_ptr != nullptr &&
+        context.diversity_manager_ptr != nullptr) {
+      auto* population_ptr = context.diversity_manager_ptr->get_population_pointer();
+      cuopt_assert(population_ptr != nullptr, "");
+      population_ptr->add_external_solutions_to_population();
     }
 
 #if !FJ_SINGLE_STEP
@@ -891,7 +1081,7 @@ i_t fj_t<i_t, f_t>::host_loop(solution_t<i_t, f_t>& solution, i_t climber_idx)
       CUOPT_LOG_TRACE(
         "FJ "
         "step %d viol %.2g [%d], obj %.8g, best %.8g, mins %d, maxw %g, "
-        "objw %g",
+        "objw %g, sol %x, delta %x, inc %x, lhs %x, lhscomp %x, viol %x, weights %x",
         steps,
         data.violation_score.value(climber_stream),
         data.violated_constraints.set_size.value(climber_stream),
@@ -899,15 +1089,26 @@ i_t fj_t<i_t, f_t>::host_loop(solution_t<i_t, f_t>& solution, i_t climber_idx)
         data.best_objective.value(climber_stream),
         data.local_minimums_reached.value(climber_stream),
         max_cstr_weight.value(climber_stream),
-        objective_weight.value(climber_stream));
+        objective_weight.value(climber_stream),
+        solution.get_hash(),
+        detail::compute_hash(data.jump_move_delta, climber_stream),
+        detail::compute_hash(data.incumbent_assignment, climber_stream),
+        detail::compute_hash(data.incumbent_lhs, climber_stream),
+        detail::compute_hash(data.incumbent_lhs_sumcomp, climber_stream),
+        detail::compute_hash(data.violated_constraints.contents, climber_stream),
+        detail::compute_hash(cstr_left_weights, climber_stream));
     }
 
-    if (!limit_reached) { run_step_device(climber_stream, climber_idx); }
+    if (!limit_reached) { run_step_device(climber_stream, climber_idx, use_graph); }
 
     // periodically recompute the LHS and violation scores
     // to correct any accumulated numerical errors
-    if (steps % settings.parameters.lhs_refresh_period == 0) {
-      refresh_lhs_and_violation(climber_stream, climber_idx);
+    if (lhs_refreshed) { refresh_lhs_and_violation(climber_stream, climber_idx); }
+    if (deterministic_work_estimate && !limit_reached) {
+      // TODO: replace with work predictor model
+      double batch_work = data.deterministic_batch_work.value(climber_stream) / 1e8;
+      timer.record_work(batch_work);
+      if (timer.check_time_limit()) { limit_reached = true; }
     }
 
     // periodically synchronize and check the latest solution
@@ -985,6 +1186,9 @@ i_t fj_t<i_t, f_t>::host_loop(solution_t<i_t, f_t>& solution, i_t climber_idx)
                   solution.get_feasible(),
                   data.local_minimums_reached.value(climber_stream));
 
+  // compute total time spent
+  double elapsed_time = timer.elapsed_time();
+
   CUOPT_LOG_TRACE("best fractional count %d",
                   data.saved_best_fractional_count.value(climber_stream));
 
@@ -1074,7 +1278,11 @@ template <typename i_t, typename f_t>
 i_t fj_t<i_t, f_t>::solve(solution_t<i_t, f_t>& solution)
 {
   raft::common::nvtx::range scope("fj_solve");
-  timer_t timer(settings.time_limit);
+  bool deterministic = (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS);
+  if (deterministic) {
+    settings.time_limit = std::max((f_t)0.0, settings.time_limit);
+    settings.work_limit = settings.time_limit;
+  }
   handle_ptr               = const_cast<raft::handle_t*>(solution.handle_ptr);
   pb_ptr                   = solution.problem_ptr;
   last_reported_objective_ = std::numeric_limits<f_t>::infinity();
@@ -1082,9 +1290,26 @@ i_t fj_t<i_t, f_t>::solve(solution_t<i_t, f_t>& solution)
     cuopt_func_call(solution.test_variable_bounds(true));
     cuopt_assert(solution.test_number_all_integer(), "All integers must be rounded");
   }
+  if (deterministic && settings.work_limit == 0.0) {
+    CUOPT_LOG_DEBUG("FJ: skipping solve due to exhausted deterministic work budget");
+    return solution.compute_feasibility();
+  }
+  auto total_work_start = context.gpu_heur_loop.current_work();
+  auto total_time_start = std::chrono::high_resolution_clock::now();
   pb_ptr->check_problem_representation(true);
   resize_vectors(solution.handle_ptr);
 
+  CUOPT_LOG_DEBUG(
+    "FJ: work_limit %f time_limit %f sol hash %x pb hash %x",
+    settings.work_limit < std::numeric_limits<double>::max() ? settings.work_limit : -1.0,
+    settings.time_limit < std::numeric_limits<double>::max() ? settings.time_limit : -1.0,
+    solution.get_hash(),
+    pb_ptr->get_fingerprint());
+  CUOPT_LOG_DEBUG("FJ: weights hash %x, left weights hash %x, right weights hash %x",
+                  detail::compute_hash(cstr_weights, handle_ptr->get_stream()),
+                  detail::compute_hash(cstr_left_weights, handle_ptr->get_stream()),
+                  detail::compute_hash(cstr_right_weights, handle_ptr->get_stream()));
+
   bool is_initial_feasible = solution.compute_feasibility();
   auto initial_solution    = solution;
   // if we're in rounding mode, split the time/iteration limit between the first and second stage
@@ -1119,11 +1344,16 @@ i_t fj_t<i_t, f_t>::solve(solution_t<i_t, f_t>& solution)
   RAFT_CHECK_CUDA(handle_ptr->get_stream());
   handle_ptr->sync_stream();
 
+  if (deterministic) { initialize_deterministic_work_estimator(); }
+
   i_t iterations = host_loop(solution);
   RAFT_CHECK_CUDA(handle_ptr->get_stream());
   handle_ptr->sync_stream();
 
-  f_t effort_rate = (f_t)iterations / timer.elapsed_time();
+  f_t elapsed_time = std::chrono::duration_cast<std::chrono::duration<double>>(
+                       std::chrono::high_resolution_clock::now() - total_time_start)
+                       .count();
+  f_t effort_rate = (f_t)iterations / elapsed_time;
 
   // If we're in rounding mode and some fractionals remain: round them all
   // limit = total_limit * second_stage_split
@@ -1141,7 +1371,7 @@ i_t fj_t<i_t, f_t>::solve(solution_t<i_t, f_t>& solution)
     }
   }
 
-  CUOPT_LOG_TRACE("GPU solver took %g", timer.elapsed_time());
+  CUOPT_LOG_TRACE("GPU solver took %g", elapsed_time);
   CUOPT_LOG_TRACE("limit reached, effort rate %g steps/secm %d steps", effort_rate, iterations);
   reset_cuda_graph();
   i_t n_integer_vars = thrust::count_if(
@@ -1166,6 +1396,18 @@ i_t fj_t<i_t, f_t>::solve(solution_t<i_t, f_t>& solution)
     cuopt_assert(solution.compute_feasibility(), "Reverted solution should be feasible");
   }
 
+  cuopt_func_call(solution.test_variable_bounds());
+
+  if (deterministic) {
+    auto total_work_end = context.gpu_heur_loop.current_work();
+    CUOPT_LOG_DEBUG("FJ: worked %fwu for %d iterations, %g seconds",
+                    total_work_end - total_work_start,
+                    iterations,
+                    elapsed_time);
+  }
+
+  CUOPT_LOG_DEBUG("FJ sol hash %x", solution.get_hash());
+
   return is_new_feasible;
 }
 
diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh
index 50b451a86e..a68ba1c467 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh
@@ -19,6 +19,9 @@
 
 #include <utilities/event_handler.cuh>
 
+#include <map>
+#include <string>
+
 #include <functional>
 
 #define FJ_DEBUG_LOAD_BALANCING 0
@@ -105,6 +108,7 @@ struct fj_settings_t {
   fj_mode_t mode{fj_mode_t::FIRST_FEASIBLE};
   fj_candidate_selection_t candidate_selection{fj_candidate_selection_t::WEIGHTED_SCORE};
   double time_limit{60.0};
+  double work_limit{std::numeric_limits<double>::infinity()};
   int iteration_limit{std::numeric_limits<int>::max()};
   fj_hyper_parameters_t parameters{};
   int n_of_minimums_for_exit  = 7000;
@@ -131,12 +135,17 @@ struct fj_move_t {
   bool operator!=(const fj_move_t& rhs) const { return !(*this == rhs); }
 };
 
-// TODO: use 32bit integers instead,
-// as we dont need them to be floating point per the FJ2 scoring scheme
 // sizeof(fj_staged_score_t) <= 8 is needed to allow for atomic loads
 struct fj_staged_score_t {
-  float base{-std::numeric_limits<float>::infinity()};
-  float bonus{-std::numeric_limits<float>::infinity()};
+  int32_t base{std::numeric_limits<int32_t>::lowest()};
+  int32_t bonus{std::numeric_limits<int32_t>::lowest()};
+
+  fj_staged_score_t() = default;
+  HDI fj_staged_score_t(int32_t base_, int32_t bonus_) : base(base_), bonus(bonus_) {}
+  fj_staged_score_t(const fj_staged_score_t&)            = default;
+  fj_staged_score_t(fj_staged_score_t&&)                 = default;
+  fj_staged_score_t& operator=(const fj_staged_score_t&) = default;
+  fj_staged_score_t& operator=(fj_staged_score_t&&)      = default;
 
   HDI bool operator<(fj_staged_score_t other) const noexcept
   {
@@ -154,7 +163,7 @@ struct fj_staged_score_t {
 
   HDI static fj_staged_score_t invalid()
   {
-    return {-std::numeric_limits<float>::infinity(), -std::numeric_limits<float>::infinity()};
+    return {std::numeric_limits<int32_t>::lowest(), std::numeric_limits<int32_t>::lowest()};
   }
   HDI static fj_staged_score_t zero() { return {0, 0}; }
 
@@ -268,6 +277,7 @@ class fj_t {
   rmm::device_uvector<fj_load_balancing_workid_mapping_t> work_id_to_bin_var_idx;
   rmm::device_uvector<fj_load_balancing_workid_mapping_t> work_id_to_nonbin_var_idx;
   rmm::device_uvector<i_t> work_ids_for_related_vars;
+  rmm::device_uvector<double> deterministic_frontier_work_by_var_d_;
 
   cudaGraphExec_t graph_instance;
   bool graph_created = false;
@@ -326,6 +336,7 @@ class fj_t {
     rmm::device_scalar<i_t> full_refresh_iteration;
     rmm::device_scalar<i_t> relvar_count_last_update;
     rmm::device_scalar<i_t> load_balancing_skip;
+    rmm::device_scalar<double> deterministic_batch_work;
 
     contiguous_set_t<i_t, f_t> violated_constraints;
     contiguous_set_t<i_t, f_t> candidate_variables;
@@ -420,6 +431,7 @@ class fj_t {
         last_iter_candidates(0, fj.handle_ptr->get_stream()),
         relvar_count_last_update(0, fj.handle_ptr->get_stream()),
         load_balancing_skip(0, fj.handle_ptr->get_stream()),
+        deterministic_batch_work(0.0, fj.handle_ptr->get_stream()),
         break_condition(0, fj.handle_ptr->get_stream()),
         temp_break_condition(0, fj.handle_ptr->get_stream()),
         cub_storage_bytes(0, fj.handle_ptr->get_stream()),
@@ -490,6 +502,7 @@ class fj_t {
       raft::device_span<i_t> row_size_nonbin_prefix_sum;
       raft::device_span<fj_load_balancing_workid_mapping_t> work_id_to_bin_var_idx;
       raft::device_span<fj_load_balancing_workid_mapping_t> work_id_to_nonbin_var_idx;
+      raft::device_span<double> deterministic_frontier_work_by_var;
 
       i_t* selected_var;
       i_t* constraints_changed_count;
@@ -518,6 +531,9 @@ class fj_t {
       i_t* relvar_count_last_update;
       i_t* load_balancing_skip;
       f_t* max_cstr_weight;
+      double* deterministic_batch_work;
+      double deterministic_refresh_work;
+      bool deterministic_work_accounting;
 
       fj_settings_t* settings;
 
@@ -634,6 +650,19 @@ class fj_t {
   std::vector<std::unique_ptr<climber_data_t>> climbers;
   rmm::device_uvector<typename climber_data_t::view_t> climber_views;
   fj_settings_t settings;
+  std::vector<double> deterministic_frontier_work_by_var_;
+  double deterministic_average_frontier_work_{0.0};
+  double deterministic_refresh_work_{0.0};
+
+ public:
+  void initialize_deterministic_work_estimator();
+  void set_improvement_callback(fj_improvement_callback_t<f_t> callback)
+  {
+    improvement_callback = std::move(callback);
+  }
+
+ private:
+  bool use_load_balancing_codepath() const;
 
   fj_improvement_callback_t<f_t> improvement_callback;
   f_t last_reported_objective_{std::numeric_limits<f_t>::infinity()};
diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh
index e57f0ec9e2..ec9b592550 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh
@@ -103,7 +103,9 @@ HDI std::pair<f_t, f_t> feas_score_constraint(
   f_t cstr_coeff,
   f_t c_lb,
   f_t c_ub,
-  f_t current_lhs)
+  f_t current_lhs,
+  f_t cstr_left_weight,
+  f_t cstr_right_weight)
 {
   cuopt_assert(isfinite(delta), "invalid delta");
   cuopt_assert(cstr_coeff != 0 && isfinite(cstr_coeff), "invalid coefficient");
@@ -123,14 +125,13 @@ HDI std::pair<f_t, f_t> feas_score_constraint(
     // TODO: broadcast left/right weights to a csr_offset-indexed table? local minimums
     // usually occur on a rarer basis (around 50 iteratiosn to 1 local minimum)
     // likely unreasonable and overkill however
-    f_t cstr_weight =
-      bound_idx == 0 ? fj.cstr_left_weights[cstr_idx] : fj.cstr_right_weights[cstr_idx];
-    f_t sign      = bound_idx == 0 ? -1 : 1;
-    f_t rhs       = bounds[bound_idx] * sign;
-    f_t old_lhs   = current_lhs * sign;
-    f_t new_lhs   = (current_lhs + cstr_coeff * delta) * sign;
-    f_t old_slack = rhs - old_lhs;
-    f_t new_slack = rhs - new_lhs;
+    f_t cstr_weight = bound_idx == 0 ? cstr_left_weight : cstr_right_weight;
+    f_t sign        = bound_idx == 0 ? -1 : 1;
+    f_t rhs         = bounds[bound_idx] * sign;
+    f_t old_lhs     = current_lhs * sign;
+    f_t new_lhs     = (current_lhs + cstr_coeff * delta) * sign;
+    f_t old_slack   = rhs - old_lhs;
+    f_t new_slack   = rhs - new_lhs;
 
     cuopt_assert(isfinite(cstr_weight), "invalid weight");
     cuopt_assert(cstr_weight >= 0, "invalid weight");
diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu
index ebbb761277..90f26ac4a5 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu
@@ -14,6 +14,11 @@
 
 #include <raft/random/rng.cuh>
 
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+#include <thrust/iterator/transform_iterator.h>
+
 #include <cooperative_groups.h>
 
 #include "feasibility_jump_impl_common.cuh"
@@ -25,6 +30,39 @@ namespace cg = cooperative_groups;
 
 namespace cuopt::linear_programming::detail {
 
+// Adds one iteration's work charge to *fj.deterministic_batch_work; every thread except the
+// one passing FIRST_THREAD returns early, as does every thread when accounting is disabled.
+template <typename i_t, typename f_t>
+DI void charge_deterministic_iteration_work(typename fj_t<i_t, f_t>::climber_data_t::view_t fj,
+                                            bool full_score_refresh)
+{
+  if (!fj.deterministic_work_accounting) { return; }
+  if (!FIRST_THREAD) { return; }
+
+  const i_t var_idx = *fj.selected_var;
+  const i_t n_costs = static_cast<i_t>(fj.deterministic_frontier_work_by_var.size());
+  // per-variable cost only for an in-range selection outside a full refresh; else refresh cost
+  const bool per_var = !full_score_refresh && var_idx >= 0 && var_idx < n_costs;
+  *fj.deterministic_batch_work +=
+    per_var ? fj.deterministic_frontier_work_by_var[var_idx] : fj.deterministic_refresh_work;
+}
+
+// Binary selector over (score, index) pairs. Despite the "comparator" name, the call operator
+// returns one of its two arguments (by value, via `auto`) instead of a bool.
+// NOTE(review): presumably intended as a reduction operator; confirm at the call sites.
+template <typename move_score_t, typename i_t>
+struct score_with_tiebreaker_comparator {
+  DI auto operator()(const thrust::pair<move_score_t, i_t>& a,
+                     const thrust::pair<move_score_t, i_t>& b) const
+  {
+    // `a` is kept on a strictly greater score, or on an equal score with a greater index;
+    // in every other case, including two fully equal pairs, the result is `b`
+    const bool keep_a = (a.first > b.first) || (a.first == b.first && a.second > b.second);
+
+    return keep_a ? a : b;
+  }
+};
+
 template <typename i_t, typename f_t>
 DI thrust::pair<f_t, f_t> move_objective_score(
   const typename fj_t<i_t, f_t>::climber_data_t::view_t& fj, i_t var_idx, f_t delta)
@@ -139,7 +177,8 @@ DI void update_weights(typename fj_t<i_t, f_t>::climber_data_t::view_t& fj)
 }
 
 template <typename i_t, typename f_t>
-__global__ void init_lhs_and_violation(typename fj_t<i_t, f_t>::climber_data_t::view_t fj)
+__global__ void init_lhs_and_violated_constraints(
+  typename fj_t<i_t, f_t>::climber_data_t::view_t fj)
 {
   for (i_t cstr_idx = TH_ID_X; cstr_idx < fj.pb.n_constraints; cstr_idx += GRID_STRIDE) {
     auto [offset_begin, offset_end] = fj.pb.range_for_constraint(cstr_idx);
@@ -152,10 +191,7 @@ __global__ void init_lhs_and_violation(typename fj_t<i_t, f_t>::climber_data_t::
       fj_kahan_babushka_neumaier_sum<i_t, f_t>(delta_it + offset_begin, delta_it + offset_end);
     fj.incumbent_lhs_sumcomp[cstr_idx] = 0;
 
-    f_t th_violation       = fj.excess_score(cstr_idx, fj.incumbent_lhs[cstr_idx]);
-    f_t weighted_violation = th_violation * fj.cstr_weights[cstr_idx];
-    atomicAdd(fj.violation_score, th_violation);
-    atomicAdd(fj.weighted_violation_score, weighted_violation);
+    f_t th_violation   = fj.excess_score(cstr_idx, fj.incumbent_lhs[cstr_idx]);
     f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx);
     if (th_violation < -cstr_tolerance) { fj.violated_constraints.insert(cstr_idx); }
   }
@@ -191,8 +227,17 @@ DI typename fj_t<i_t, f_t>::move_score_info_t compute_new_score(
     f_t c_lb = fj.pb.constraint_lower_bounds[cstr_idx];
     f_t c_ub = fj.pb.constraint_upper_bounds[cstr_idx];
 
-    auto [cstr_base_feas, cstr_bonus_robust] = feas_score_constraint<i_t, f_t>(
-      fj, var_idx, delta, cstr_idx, cstr_coeff, c_lb, c_ub, fj.incumbent_lhs[cstr_idx]);
+    auto [cstr_base_feas, cstr_bonus_robust] =
+      feas_score_constraint<i_t, f_t>(fj,
+                                      var_idx,
+                                      delta,
+                                      cstr_idx,
+                                      cstr_coeff,
+                                      c_lb,
+                                      c_ub,
+                                      fj.incumbent_lhs[cstr_idx],
+                                      fj.cstr_left_weights[cstr_idx],
+                                      fj.cstr_right_weights[cstr_idx]);
 
     base_feas += cstr_base_feas;
     bonus_robust += cstr_bonus_robust;
@@ -349,7 +394,7 @@ DI std::pair<f_t, typename fj_t<i_t, f_t>::move_score_info_t> compute_best_mtm(
   return std::make_pair(best_val, best_score_info);
 }
 
-template <typename i_t, typename f_t, MTMMoveType move_type, bool is_binary_pb = false>
+template <typename i_t, typename f_t, MTMMoveType move_type, i_t TPB, bool is_binary_pb = false>
 DI void update_jump_value(typename fj_t<i_t, f_t>::climber_data_t::view_t fj, i_t var_idx)
 {
   cuopt_assert(var_idx >= 0 && var_idx < fj.pb.n_variables, "invalid variable index");
@@ -376,12 +421,11 @@ DI void update_jump_value(typename fj_t<i_t, f_t>::climber_data_t::view_t fj, i_
           fj.pb.check_variable_within_bounds(var_idx, fj.incumbent_assignment[var_idx] + delta),
           "Var not within bounds!");
       }
-      best_score_info = compute_new_score<i_t, f_t, TPB_resetmoves>(fj, var_idx, delta);
+      best_score_info = compute_new_score<i_t, f_t, TPB>(fj, var_idx, delta);
     } else {
-      auto [best_val, score_info] =
-        compute_best_mtm<i_t, f_t, TPB_resetmoves, move_type>(fj, var_idx);
-      delta           = best_val - fj.incumbent_assignment[var_idx];
-      best_score_info = score_info;
+      auto [best_val, score_info] = compute_best_mtm<i_t, f_t, TPB, move_type>(fj, var_idx);
+      delta                       = best_val - fj.incumbent_assignment[var_idx];
+      best_score_info             = score_info;
     }
   } else {
     delta = round(1.0 - 2 * fj.incumbent_assignment[var_idx]);
@@ -577,14 +621,16 @@ __global__ void update_assignment_kernel(typename fj_t<i_t, f_t>::climber_data_t
 
     __syncthreads();
 
-    cuopt_assert(isfinite(fj.jump_move_delta[var_idx]), "delta should be finite");
-    // Kahan compensated summation
-    // fj.incumbent_lhs[cstr_idx] = old_lhs + cstr_coeff * fj.jump_move_delta[var_idx];
-    f_t y = cstr_coeff * fj.jump_move_delta[var_idx] - fj.incumbent_lhs_sumcomp[cstr_idx];
-    f_t t = old_lhs + y;
-    fj.incumbent_lhs_sumcomp[cstr_idx] = (t - old_lhs) - y;
-    fj.incumbent_lhs[cstr_idx]         = t;
-    cuopt_assert(isfinite(fj.incumbent_lhs[cstr_idx]), "assignment should be finite");
+    if (threadIdx.x == 0) {
+      cuopt_assert(isfinite(fj.jump_move_delta[var_idx]), "delta should be finite");
+      // Kahan compensated summation
+      // fj.incumbent_lhs[cstr_idx] = old_lhs + cstr_coeff * fj.jump_move_delta[var_idx];
+      f_t y = cstr_coeff * fj.jump_move_delta[var_idx] - fj.incumbent_lhs_sumcomp[cstr_idx];
+      f_t t = old_lhs + y;
+      fj.incumbent_lhs_sumcomp[cstr_idx] = (t - old_lhs) - y;
+      fj.incumbent_lhs[cstr_idx]         = t;
+      cuopt_assert(isfinite(fj.incumbent_lhs[cstr_idx]), "assignment should be finite");
+    }
   }
 
   // update the assignment and objective proper
@@ -626,8 +672,8 @@ __global__ void update_assignment_kernel(typename fj_t<i_t, f_t>::climber_data_t
 
 #if FJ_SINGLE_STEP
     DEVICE_LOG_DEBUG(
-      "=---- FJ[%d]: updated %d [%g/%g] :%.4g+{%.4g}=%.4g score {%g,%g}, d_obj %.2g+%.2g=%.2g, "
-      "err_range %.2g%%, infeas %.2g, total viol %d\n",
+      "=---- FJ[%d]: updated %d [%g/%g] :%.4g+{%.4g}=%.4g score {%d,%d}, d_obj %.2g+%.2g=%.2g, "
+      "err_range %.2g%%, infeas %.2g, total viol %d, obj %x, delta %x, coef %x\n",
       *fj.iterations,
       var_idx,
       get_lower(fj.pb.variable_bounds[var_idx]),
@@ -642,7 +688,10 @@ __global__ void update_assignment_kernel(typename fj_t<i_t, f_t>::climber_data_t
       *fj.incumbent_objective + fj.jump_move_delta[var_idx] * fj.pb.objective_coefficients[var_idx],
       delta_rel_err,
       fj.jump_move_infeasibility[var_idx],
-      fj.violated_constraints.size());
+      fj.violated_constraints.size(),
+      detail::compute_hash(*fj.incumbent_objective),
+      detail::compute_hash(fj.jump_move_delta[var_idx]),
+      detail::compute_hash(fj.pb.objective_coefficients[var_idx]));
 #endif
     // reset the score
     fj.jump_move_scores[var_idx]        = fj_t<i_t, f_t>::move_score_t::invalid();
@@ -862,6 +911,16 @@ DI void update_changed_constraints(typename fj_t<i_t, f_t>::climber_data_t::view
 
   if (blockIdx.x == 0) {
     if (threadIdx.x == 0) {
+      // sort changed constraints to guarantee determinism
+      // TODO: usually few constraints change, but that's still rather dreadful...
+      // block-parallelize at least? but not trivial for arbitrary sizes w/ CUB
+      // TODO: replace once focus shifts to tuning deterministic GPU heuristics
+      if (fj.deterministic_work_accounting) {
+        thrust::sort(thrust::seq,
+                     fj.constraints_changed.begin(),
+                     fj.constraints_changed.begin() + *fj.constraints_changed_count);
+      }
+
       for (i_t i = 0; i < *fj.constraints_changed_count; ++i) {
         i_t idx = fj.constraints_changed[i];
         if ((idx & 1) == CONSTRAINT_FLAG_INSERT) {
@@ -953,7 +1012,7 @@ __global__ void compute_iteration_related_variables_kernel(
   compute_iteration_related_variables<i_t, f_t>(fj);
 }
 
-template <typename i_t, typename f_t, MTMMoveType move_type, bool is_binary_pb>
+template <typename i_t, typename f_t, MTMMoveType move_type, bool is_binary_pb, i_t TPB>
 __device__ void compute_mtm_moves(typename fj_t<i_t, f_t>::climber_data_t::view_t fj,
                                   bool ForceRefresh)
 {
@@ -965,11 +1024,14 @@ __device__ void compute_mtm_moves(typename fj_t<i_t, f_t>::climber_data_t::view_
   if (*fj.selected_var == std::numeric_limits<i_t>::max()) full_refresh = true;
 
   // always do a full sweep when looking for satisfied mtm moves
-  if constexpr (move_type == MTMMoveType::FJ_MTM_SATISFIED) full_refresh = true;
-
-  // only update related variables
   i_t split_begin, split_end;
-  if (full_refresh) {
+  if constexpr (move_type == MTMMoveType::FJ_MTM_SATISFIED) {
+    full_refresh = true;
+    split_begin  = 0;
+    split_end    = fj.objective_vars.size();
+  }
+  // only update related variables
+  else if (full_refresh) {
     split_begin = 0;
     split_end   = fj.pb.n_variables;
   }
@@ -989,12 +1051,20 @@ __device__ void compute_mtm_moves(typename fj_t<i_t, f_t>::climber_data_t::view_
     split_end   = range.second;
   }
 
+  charge_deterministic_iteration_work<i_t, f_t>(fj, full_refresh);
+
   if (FIRST_THREAD) *fj.relvar_count_last_update = split_end - split_begin;
 
   for (i_t i = blockIdx.x + split_begin; i < split_end; i += gridDim.x) {
-    i_t var_idx = full_refresh                          ? i
-                  : fj.pb.related_variables.size() == 0 ? i
-                                                        : fj.pb.related_variables[i];
+    // if sat MTM mode, go over objective variables only
+    i_t var_idx;
+    if constexpr (move_type == MTMMoveType::FJ_MTM_SATISFIED) {
+      var_idx = fj.objective_vars[i];
+    } else {
+      var_idx = full_refresh                          ? i
+                : fj.pb.related_variables.size() == 0 ? i
+                                                      : fj.pb.related_variables[i];
+    }
 
     // skip if we couldnt precompute a related var table and
     // this variable isnt in the dynamic related variable table
@@ -1017,7 +1087,7 @@ __device__ void compute_mtm_moves(typename fj_t<i_t, f_t>::climber_data_t::view_
     }
 
     cuopt_assert(var_idx >= 0 && var_idx < fj.pb.n_variables, "");
-    update_jump_value<i_t, f_t, move_type, is_binary_pb>(fj, var_idx);
+    update_jump_value<i_t, f_t, move_type, TPB, is_binary_pb>(fj, var_idx);
   }
 }
 
@@ -1025,7 +1095,7 @@ template <typename i_t, typename f_t, MTMMoveType move_type, bool is_binary_pb>
 __global__ void compute_mtm_moves_kernel(typename fj_t<i_t, f_t>::climber_data_t::view_t fj,
                                          bool ForceRefresh)
 {
-  compute_mtm_moves<i_t, f_t, move_type, is_binary_pb>(fj, ForceRefresh);
+  compute_mtm_moves<i_t, f_t, move_type, is_binary_pb, TPB_resetmoves>(fj, ForceRefresh);
 }
 
 template <typename i_t, typename f_t>
@@ -1037,8 +1107,9 @@ __global__ void select_variable_kernel(typename fj_t<i_t, f_t>::climber_data_t::
     fj.settings->seed, *fj.iterations * fj.settings->parameters.max_sampled_moves, 0);
 
   using move_score_t = typename fj_t<i_t, f_t>::move_score_t;
-  __shared__ alignas(move_score_t) char shmem_storage[2 * raft::WarpSize * sizeof(move_score_t)];
-  auto* const shmem = (move_score_t*)shmem_storage;
+  __shared__ alignas(thrust::pair<move_score_t, i_t>) char
+    shmem_storage[raft::WarpSize * sizeof(thrust::pair<move_score_t, i_t>)];
+  auto* const shmem = (thrust::pair<move_score_t, i_t>*)shmem_storage;
 
   auto th_best_score  = fj_t<i_t, f_t>::move_score_t::invalid();
   i_t th_selected_var = std::numeric_limits<i_t>::max();
@@ -1075,8 +1146,11 @@ __global__ void select_variable_kernel(typename fj_t<i_t, f_t>::climber_data_t::
       }
     }
     // Block level reduction to get the best variable from the sample
+    // Use deterministic tie-breaking comparator based on var_idx
     auto [best_score, reduced_selected_var] =
-      raft::blockRankedReduce(th_best_score, shmem, th_selected_var, raft::max_op{});
+      raft::blockReduce(thrust::make_pair(th_best_score, th_selected_var),
+                        (char*)shmem,
+                        score_with_tiebreaker_comparator<move_score_t, i_t>{});
     if (FIRST_THREAD) {
       // assign it to print the value outside
       th_best_score = best_score;
@@ -1111,9 +1185,9 @@ __global__ void select_variable_kernel(typename fj_t<i_t, f_t>::climber_data_t::
       i_t var_range        = get_upper(bounds) - get_lower(bounds);
       double delta_rel_err = fabs(fj.jump_move_delta[selected_var]) / var_range * 100;
       DEVICE_LOG_INFO(
-        "=---- FJ: selected %d [%g/%g] %c :%.4g+{%.4g}=%.4g score {%g,%g}, d_obj %.2g+%.2g->%.2g, "
+        "=---- FJ: selected %d [%g/%g] %c :%.4g+{%.4g}=%.4g score {%d,%d}, d_obj %.2g+%.2g->%.2g, "
         "delta_rel_err %.2g%%, "
-        "infeas %.2g, total viol %d, out of %d\n",
+        "infeas %.2g, total viol %d, out of %d, obj %x\n",
         selected_var,
         get_lower(bounds),
         get_upper(bounds),
@@ -1130,9 +1204,18 @@ __global__ void select_variable_kernel(typename fj_t<i_t, f_t>::climber_data_t::
         delta_rel_err,
         fj.jump_move_infeasibility[selected_var],
         fj.violated_constraints.size(),
-        good_var_count);
+        good_var_count,
+        detail::compute_hash(*fj.incumbent_objective));
 #endif
       cuopt_assert(fj.jump_move_scores[selected_var].valid(), "");
+    } else {
+#if FJ_SINGLE_STEP
+      DEVICE_LOG_INFO("=[%d]---- FJ: no var selected, obj is %g, viol %d, out of %d\n",
+                      *fj.iterations,
+                      *fj.incumbent_objective,
+                      fj.violated_constraints.size(),
+                      good_var_count);
+#endif
     }
   }
 }
@@ -1202,27 +1285,32 @@ DI thrust::tuple<i_t, f_t, typename fj_t<i_t, f_t>::move_score_t> gridwide_reduc
 
   if (blockIdx.x == 0) {
     using move_score_t = typename fj_t<i_t, f_t>::move_score_t;
-    __shared__ alignas(move_score_t) char shmem_storage[2 * raft::WarpSize * sizeof(move_score_t)];
-    auto* const shmem = (move_score_t*)shmem_storage;
+    __shared__ alignas(thrust::pair<move_score_t, i_t>) char
+      shmem_storage[2 * raft::WarpSize * sizeof(thrust::pair<move_score_t, i_t>)];
+    auto* const shmem = (thrust::pair<move_score_t, i_t>*)shmem_storage;
 
     auto th_best_score = fj_t<i_t, f_t>::move_score_t::invalid();
     i_t th_best_block  = 0;
+    i_t th_best_var    = -1;
     for (i_t i = threadIdx.x; i < gridDim.x; i += blockDim.x) {
       auto var_idx    = fj.grid_var_buf[i];
       auto move_score = fj.grid_score_buf[i];
 
-      if (move_score > th_best_score ||
-          (move_score == th_best_score && var_idx > fj.grid_var_buf[th_best_block])) {
+      if (move_score > th_best_score || (move_score == th_best_score && var_idx > th_best_var)) {
         th_best_score = move_score;
         th_best_block = i;
+        th_best_var   = var_idx;
       }
     }
     // Block level reduction to get the best variable from all blocks
-    auto [reduced_best_score, reduced_best_block] =
-      raft::blockRankedReduce(th_best_score, shmem, th_best_block, raft::max_op{});
-
-    if (reduced_best_score.valid() && threadIdx.x == 0) {
-      cuopt_assert(th_best_block < gridDim.x, "");
+    auto [reduced_best_score_pair, reduced_best_block] =
+      raft::blockRankedReduce(thrust::make_pair(th_best_score, th_best_var),
+                              shmem,
+                              th_best_block,
+                              score_with_tiebreaker_comparator<move_score_t, i_t>{});
+
+    if (reduced_best_score_pair.first.valid() && threadIdx.x == 0) {
+      cuopt_assert(reduced_best_block < gridDim.x, "");
       best_var   = fj.grid_var_buf[reduced_best_block];
       best_delta = fj.grid_delta_buf[reduced_best_block];
       best_score = fj.grid_score_buf[reduced_best_block];
@@ -1244,6 +1332,9 @@ DI thrust::tuple<i_t, f_t, typename fj_t<i_t, f_t>::move_score_t> best_random_mt
   raft::random::PCGenerator rng(fj.settings->seed + *fj.iterations, 0, 0);
 
   i_t cstr_idx = fj.violated_constraints.contents[rng.next_u32() % fj.violated_constraints.size()];
+  cuopt_assert(fj.excess_score(cstr_idx, fj.incumbent_lhs[cstr_idx]) < 0,
+               "constraint isn't violated");
+
   auto [offset_begin, offset_end] = fj.pb.range_for_constraint(cstr_idx);
 
   return gridwide_reduce_best_move<i_t, f_t, TPB, /*WeakTabu=*/true, /*recompute_score=*/true>(
@@ -1258,7 +1349,9 @@ DI thrust::tuple<i_t, f_t, typename fj_t<i_t, f_t>::move_score_t> best_sat_cstr_
   typename fj_t<i_t, f_t>::climber_data_t::view_t fj)
 {
   // compute all MTM moves within satisfied constraints
-  compute_mtm_moves<i_t, f_t, MTMMoveType::FJ_MTM_SATISFIED, false>(fj, true);
+  compute_mtm_moves<i_t, f_t, MTMMoveType::FJ_MTM_SATISFIED, false, TPB>(fj, true);
+  // NOTE: grid sync not required since each block only reduces over variables that it updated in
+  // compute_mtm_moves
   return gridwide_reduce_best_move<i_t, f_t, TPB, /*WeakTabu=*/false, /*recompute_score=*/false>(
     fj, fj.objective_vars.begin(), fj.objective_vars.end(), [fj] __device__(i_t var_idx) {
       return fj.jump_move_delta[var_idx];
@@ -1413,9 +1506,10 @@ __global__ void handle_local_minimum_kernel(typename fj_t<i_t, f_t>::climber_dat
 
     if (sat_best_score.base > 0 && sat_best_score > best_score) {
       if (FIRST_THREAD) {
-        best_score = sat_best_score;
-        best_var   = sat_best_var;
-        best_delta = sat_best_delta;
+        best_score    = sat_best_score;
+        best_var      = sat_best_var;
+        best_delta    = sat_best_delta;
+        best_movetype = 'S';
       }
     }
   }
@@ -1427,6 +1521,15 @@ __global__ void handle_local_minimum_kernel(typename fj_t<i_t, f_t>::climber_dat
                      best_var, fj.incumbent_assignment[best_var] + best_delta),
                    "assignment not within bounds");
       fj.jump_move_delta[best_var] = best_delta;
+#if FJ_SINGLE_STEP
+      DEVICE_LOG_DEBUG("FJ[%d] selected_var: %d, delta %g, score {%d %d}, type %c\n",
+                       *fj.iterations,
+                       best_var,
+                       best_delta,
+                       best_score.base,
+                       best_score.bonus,
+                       best_movetype);
+#endif
     }
   }
 }
@@ -1458,7 +1561,7 @@ __global__ void handle_local_minimum_kernel(typename fj_t<i_t, f_t>::climber_dat
     const __grid_constant__ typename fj_t<int, F_TYPE>::climber_data_t::view_t fj);   \
   template __global__ void load_balancing_sanity_checks<int, F_TYPE>(                 \
     const __grid_constant__ typename fj_t<int, F_TYPE>::climber_data_t::view_t fj);   \
-  template __global__ void init_lhs_and_violation<int, F_TYPE>(                       \
+  template __global__ void init_lhs_and_violated_constraints<int, F_TYPE>(            \
     typename fj_t<int, F_TYPE>::climber_data_t::view_t fj);                           \
   template __global__ void update_lift_moves_kernel<int, F_TYPE>(                     \
     typename fj_t<int, F_TYPE>::climber_data_t::view_t fj);                           \
diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh
index 55fd4e61f1..9b99cdeb21 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh
@@ -52,7 +52,8 @@ __global__ void load_balancing_mtm_compute_scores(
   const __grid_constant__ typename fj_t<i_t, f_t>::climber_data_t::view_t fj);
 
 template <typename i_t, typename f_t>
-__global__ void init_lhs_and_violation(typename fj_t<i_t, f_t>::climber_data_t::view_t fj);
+__global__ void init_lhs_and_violated_constraints(
+  typename fj_t<i_t, f_t>::climber_data_t::view_t fj);
 
 // Update the jump move tables after the best jump value has been computed for a "heavy" variable
 template <typename i_t, typename f_t>
diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
index b16f299bf1..34634959c8 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
@@ -11,16 +11,20 @@
 #include "feasibility_jump_impl_common.cuh"
 #include "fj_cpu.cuh"
 
+#include <utilities/determinism_log.hpp>
 #include <utilities/seed_generator.cuh>
 
 #include <raft/core/nvtx.hpp>
 
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/tuple.h>
+
+#include <cerrno>
 #include <chrono>
-#include <iomanip>
-#include <mutex>
+#include <cmath>
+#include <cstdlib>
 #include <random>
 #include <sstream>
-#include <thread>
 #include <unordered_set>
 #include <vector>
 
@@ -38,6 +42,24 @@
 
 namespace cuopt::linear_programming::detail {
 
+namespace {
+
+// Reads env_name as a finite, strictly positive scale; returns 1.0 if unset, empty, or invalid.
+double read_positive_work_unit_scale(const char* env_name)
+{
+  const char* env_value = std::getenv(env_name);
+  if (env_value == nullptr || env_value[0] == '\0') { return 1.0; }
+  errno                     = 0;
+  char* end_ptr             = nullptr;
+  const double parsed_value = std::strtod(env_value, &end_ptr);
+  const bool valid_value    = errno == 0 && end_ptr != env_value && *end_ptr == '\0' &&
+                           std::isfinite(parsed_value) && parsed_value > 0.0;
+  cuopt_assert(valid_value, "Invalid CPUFJ work-unit scale env var");
+  return valid_value ? parsed_value : 1.0;  // don't return a bad scale if the assert is a no-op
+}
+
+}  // namespace
+
 template <typename i_t, typename f_t, typename ArrayType>
 thrust::tuple<f_t, f_t> get_mtm_for_bound(const typename fj_t<i_t, f_t>::climber_data_t::view_t& fj,
                                           i_t var_idx,
@@ -107,99 +129,6 @@ thrust::tuple<f_t, f_t, f_t, f_t> get_mtm_for_constraint(
   return {delta_ij, sign, slack, cstr_tolerance};
 }
 
-template <typename i_t, typename f_t>
-std::pair<f_t, f_t> feas_score_constraint(const typename fj_t<i_t, f_t>::climber_data_t::view_t& fj,
-                                          i_t var_idx,
-                                          f_t delta,
-                                          i_t cstr_idx,
-                                          f_t cstr_coeff,
-                                          f_t c_lb,
-                                          f_t c_ub,
-                                          f_t current_lhs,
-                                          f_t left_weight,
-                                          f_t right_weight)
-{
-  cuopt_assert(isfinite(delta), "invalid delta");
-  cuopt_assert(cstr_coeff != 0 && isfinite(cstr_coeff), "invalid coefficient");
-
-  f_t base_feas    = 0;
-  f_t bonus_robust = 0;
-
-  f_t bounds[2] = {c_lb, c_ub};
-  cuopt_assert(isfinite(c_lb) || isfinite(c_ub), "no range");
-  for (i_t bound_idx = 0; bound_idx < 2; ++bound_idx) {
-    if (!isfinite(bounds[bound_idx])) continue;
-
-    // factor to correct the lhs/rhs to turn a lb <= lhs <= ub constraint into
-    // two virtual leq constraints "lhs <= ub" and "-lhs <= -lb" in order to match
-    // the convention of the paper
-
-    // TODO: broadcast left/right weights to a csr_offset-indexed table? local minimums
-    // usually occur on a rarer basis (around 50 iteratiosn to 1 local minimum)
-    // likely unreasonable and overkill however
-    f_t cstr_weight = bound_idx == 0 ? left_weight : right_weight;
-    f_t sign        = bound_idx == 0 ? -1 : 1;
-    f_t rhs         = bounds[bound_idx] * sign;
-    f_t old_lhs     = current_lhs * sign;
-    f_t new_lhs     = (current_lhs + cstr_coeff * delta) * sign;
-    f_t old_slack   = rhs - old_lhs;
-    f_t new_slack   = rhs - new_lhs;
-
-    cuopt_assert(isfinite(cstr_weight), "invalid weight");
-    cuopt_assert(cstr_weight >= 0, "invalid weight");
-    cuopt_assert(isfinite(old_lhs), "");
-    cuopt_assert(isfinite(new_lhs), "");
-    cuopt_assert(isfinite(old_slack) && isfinite(new_slack), "");
-
-    f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx, c_lb, c_ub);
-
-    bool old_viol = fj.excess_score(cstr_idx, current_lhs, c_lb, c_ub) < -cstr_tolerance;
-    bool new_viol =
-      fj.excess_score(cstr_idx, current_lhs + cstr_coeff * delta, c_lb, c_ub) < -cstr_tolerance;
-
-    bool old_sat = old_lhs < rhs + cstr_tolerance;
-    bool new_sat = new_lhs < rhs + cstr_tolerance;
-
-    // equality
-    if (fj.pb.integer_equal(c_lb, c_ub)) {
-      if (!old_viol) cuopt_assert(old_sat == !old_viol, "");
-      if (!new_viol) cuopt_assert(new_sat == !new_viol, "");
-    }
-
-    // if it would feasibilize this constraint
-    if (!old_sat && new_sat) {
-      cuopt_assert(old_viol, "");
-      base_feas += cstr_weight;
-    }
-    // would cause this constraint to be violated
-    else if (old_sat && !new_sat) {
-      cuopt_assert(new_viol, "");
-      base_feas -= cstr_weight;
-    }
-    // simple improvement
-    else if (!old_sat && !new_sat && old_lhs > new_lhs) {
-      cuopt_assert(old_viol && new_viol, "");
-      base_feas += (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight);
-    }
-    // simple worsening
-    else if (!old_sat && !new_sat && old_lhs <= new_lhs) {
-      cuopt_assert(old_viol && new_viol, "");
-      base_feas -= (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight);
-    }
-
-    // robustness score bonus if this would leave some strick slack
-    bool old_stable = old_lhs < rhs - cstr_tolerance;
-    bool new_stable = new_lhs < rhs - cstr_tolerance;
-    if (!old_stable && new_stable) {
-      bonus_robust += cstr_weight;
-    } else if (old_stable && !new_stable) {
-      bonus_robust -= cstr_weight;
-    }
-  }
-
-  return {base_feas, bonus_robust};
-}
-
 static constexpr double BIGVAL_THRESHOLD = 1e20;
 
 template <typename i_t, typename f_t>
@@ -1401,6 +1330,15 @@ std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_t<i_t, f_t>::create_cpu_climber(
 
   // Initialize fj_cpu with all the data
   init_fj_cpu(*fj_cpu, solution, left_weights, right_weights, objective_weight);
+  const double cpu_work_unit_scale =
+    context.settings.cpufj_work_unit_scale != 1.0
+      ? context.settings.cpufj_work_unit_scale
+      : read_positive_work_unit_scale("CUOPT_CPUFJ_WORK_UNIT_SCALE");
+  fj_cpu->work_unit_bias *= cpu_work_unit_scale;
+  if (cpu_work_unit_scale != 1.0) {
+    CUOPT_DETERMINISM_LOG(
+      "CPUFJ using work-unit scale %f (bias=%f)", cpu_work_unit_scale, fj_cpu->work_unit_bias);
+  }
   fj_cpu->settings = settings;
   if (randomize_params) {
     auto rng                 = std::mt19937(cuopt::seed_generator::get_seed());
@@ -1550,6 +1488,10 @@ static bool cpufj_solve_loop(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_lim
       fj_cpu.work_units_elapsed += biased_work;
 
       if (fj_cpu.producer_sync != nullptr) { fj_cpu.producer_sync->notify_progress(); }
+
+      if (fj_cpu.work_units_elapsed.load(std::memory_order_relaxed) >= fj_cpu.work_budget) {
+        break;
+      }
     }
 
     cuopt_func_call(sanity_checks(fj_cpu));
diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh
index 3263609a2b..4124bd079a 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh
@@ -154,7 +154,8 @@ struct fj_cpu_climber_t {
 
   // Work unit tracking for deterministic synchronization
   std::atomic<double> work_units_elapsed{0.0};
-  double work_unit_bias{1.5};               // Bias factor to keep CPUFJ ahead of B&B
+  double work_unit_bias{1.5};  // Bias factor to keep CPUFJ ahead of B&B
+  double work_budget{std::numeric_limits<double>::infinity()};
   producer_sync_t* producer_sync{nullptr};  // Optional sync utility for notifying progress
 
   std::atomic<bool> halted{false};
diff --git a/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh b/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh
index dfc9b3c885..8b77367ac4 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -120,16 +120,19 @@ __global__ void load_balancing_prepare_iteration(const __grid_constant__
                                                  typename fj_t<i_t, f_t>::climber_data_t::view_t fj)
 {
   bool full_refresh = needs_full_refresh<i_t, f_t>(fj);
+  charge_deterministic_iteration_work<i_t, f_t>(fj, full_refresh);
 
   // alternate codepath in the case of a small related_var/total_var ratio
   if (!full_refresh && fj.pb.related_variables.size() > 0 &&
       fj.pb.n_variables / fj.work_ids_for_related_vars[*fj.selected_var] >=
-        fj.settings->parameters.old_codepath_total_var_to_relvar_ratio_threshold) {
+        fj.settings->parameters.old_codepath_total_var_to_relvar_ratio_threshold &&
+      fj.settings->load_balancing_mode != fj_load_balancing_mode_t::ALWAYS_ON) {
     auto range = fj.pb.range_for_related_vars(*fj.selected_var);
 
     for (i_t i = blockIdx.x + range.first; i < range.second; i += gridDim.x) {
       i_t var_idx = fj.pb.related_variables[i];
-      update_jump_value<i_t, f_t, MTMMoveType::FJ_MTM_VIOLATED, false>(fj, var_idx);
+      update_jump_value<i_t, f_t, MTMMoveType::FJ_MTM_VIOLATED, TPB_loadbalance, false>(fj,
+                                                                                        var_idx);
     }
 
     if (FIRST_THREAD) *fj.load_balancing_skip = true;
@@ -334,8 +337,17 @@ __global__ void load_balancing_compute_scores_binary(
       auto c_lb = fj.constraint_lower_bounds_csr[csr_offset];
       auto c_ub = fj.constraint_upper_bounds_csr[csr_offset];
 
-      auto [cstr_base_feas, cstr_bonus_robust] = feas_score_constraint<i_t, f_t>(
-        fj, var_idx, delta, cstr_idx, cstr_coeff, c_lb, c_ub, fj.incumbent_lhs[cstr_idx]);
+      auto [cstr_base_feas, cstr_bonus_robust] =
+        feas_score_constraint<i_t, f_t>(fj,
+                                        var_idx,
+                                        delta,
+                                        cstr_idx,
+                                        cstr_coeff,
+                                        c_lb,
+                                        c_ub,
+                                        fj.incumbent_lhs[cstr_idx],
+                                        fj.cstr_left_weights[cstr_idx],
+                                        fj.cstr_right_weights[cstr_idx]);
 
       base_feas += cstr_base_feas;
       bonus_robust += cstr_bonus_robust;
@@ -526,8 +538,8 @@ __launch_bounds__(TPB_loadbalance, 16) __global__
 
       auto& score_info = candidate.score;
 
-      f_t base_feas    = 0;
-      f_t bonus_robust = 0;
+      int32_t base_feas    = 0;
+      int32_t bonus_robust = 0;
 
       // same as for the binary var kernel, compute each score compoenent per thread
       // and merge then via a wapr reduce
@@ -535,8 +547,17 @@ __launch_bounds__(TPB_loadbalance, 16) __global__
         cuopt_assert(c_lb == fj.pb.constraint_lower_bounds[cstr_idx], "bound sanity check failed");
         cuopt_assert(c_ub == fj.pb.constraint_upper_bounds[cstr_idx], "bound sanity check failed");
 
-        auto [cstr_base_feas, cstr_bonus_robust] = feas_score_constraint<i_t, f_t>(
-          fj, var_idx, delta, cstr_idx, cstr_coeff, c_lb, c_ub, fj.incumbent_lhs[cstr_idx]);
+        auto [cstr_base_feas, cstr_bonus_robust] =
+          feas_score_constraint<i_t, f_t>(fj,
+                                          var_idx,
+                                          delta,
+                                          cstr_idx,
+                                          cstr_coeff,
+                                          c_lb,
+                                          c_ub,
+                                          fj.incumbent_lhs[cstr_idx],
+                                          fj.cstr_left_weights[cstr_idx],
+                                          fj.cstr_right_weights[cstr_idx]);
 
         base_feas += cstr_base_feas;
         bonus_robust += cstr_bonus_robust;
@@ -565,24 +586,29 @@ __launch_bounds__(TPB_loadbalance, 16) __global__
             best_score_ref{fj.jump_move_scores[var_idx]};
           auto best_score = best_score_ref.load(cuda::memory_order_relaxed);
 
+          cuda::atomic_ref<f_t, cuda::thread_scope_device> best_delta_ref{
+            fj.jump_move_delta[var_idx]};
+          auto best_delta = best_delta_ref.load(cuda::memory_order_relaxed);
+
           if (best_score < candidate.score ||
-              (best_score == candidate.score && candidate.delta < fj.jump_move_delta[var_idx])) {
+              (best_score == candidate.score && candidate.delta < best_delta)) {
             // update the best move delta
             acquire_lock(&fj.jump_locks[var_idx]);
 
             // reject this move if it would increase the target variable to a numerically unstable
             // value
-            if (!fj.move_numerically_stable(fj.incumbent_assignment[var_idx],
-                                            fj.incumbent_assignment[var_idx] + delta,
-                                            base_feas,
-                                            *fj.violation_score)) {
-              fj.jump_move_scores[var_idx] = fj_t<i_t, f_t>::move_score_t::invalid();
-            } else if (fj.jump_move_scores[var_idx] < candidate.score
-                       // determinism for ease of debugging
-                       || (fj.jump_move_scores[var_idx] == candidate.score &&
-                           candidate.delta < fj.jump_move_delta[var_idx])) {
-              fj.jump_move_delta[var_idx]  = candidate.delta;
-              fj.jump_move_scores[var_idx] = candidate.score;
+            // only skip updating, don't invalidate existing valid moves
+            if (fj.move_numerically_stable(fj.incumbent_assignment[var_idx],
+                                           fj.incumbent_assignment[var_idx] + delta,
+                                           base_feas,
+                                           *fj.violation_score)) {
+              if (fj.jump_move_scores[var_idx] < candidate.score
+                  // determinism for ease of debugging
+                  || (fj.jump_move_scores[var_idx] == candidate.score &&
+                      candidate.delta < fj.jump_move_delta[var_idx])) {
+                fj.jump_move_delta[var_idx]  = candidate.delta;
+                fj.jump_move_scores[var_idx] = candidate.score;
+              }
             }
             release_lock(&fj.jump_locks[var_idx]);
           }
@@ -644,7 +670,7 @@ __global__ void load_balancing_sanity_checks(const __grid_constant__
     if (!(score_1 == score_1.invalid() && score_2 == score_2.invalid()) &&
         !(v.pb.integer_equal(score_1.base, score_2.base) &&
           v.pb.integer_equal(score_1.bonus, score_2.bonus))) {
-      printf("(iter %d) [%d, int:%d]: delta %g/%g was %f/%f, is %f/%f\n",
+      printf("(iter %d) [%d, int:%d]: delta %g/%g was %d/%d, is %d/%d\n",
              *v.iterations,
              var_idx,
              v.pb.is_integer_var(var_idx),
diff --git a/cpp/src/mip_heuristics/feasibility_jump/utils.cuh b/cpp/src/mip_heuristics/feasibility_jump/utils.cuh
index d98686bcc6..a16567b092 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/utils.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/utils.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -10,6 +10,7 @@
 #include "feasibility_jump.cuh"
 
 #include <thrust/pair.h>
+#include <thrust/sort.h>
 #include <cuda/atomic>
 #include <raft/core/device_span.hpp>
 #include <rmm/device_scalar.hpp>
@@ -133,6 +134,23 @@ struct contiguous_set_t {
     validity_bitmap.resize(size, stream);
   }
 
+  // Sorts the first set_size entries of contents and rebuilds index_map (-1 = not in the set).
+  void sort(const rmm::cuda_stream_view& stream)
+  {
+    // value() is assumed to be a synchronizing device-to-host read (TODO confirm): fetch it once
+    const auto n = set_size.value(stream);
+    thrust::sort(rmm::exec_policy(stream), contents.begin(), contents.begin() + n);
+    thrust::fill(rmm::exec_policy(stream), index_map.begin(), index_map.end(), -1);
+    thrust::for_each(rmm::exec_policy(stream),
+                     thrust::make_counting_iterator<i_t>(0),
+                     thrust::make_counting_iterator<i_t>(n),
+                     [v = view()] __device__(i_t idx) { v.index_map[v.contents[idx]] = idx; });
+    // only useful for debugging and ensuring the same hashes are printed
+#if FJ_SINGLE_STEP
+    thrust::fill(rmm::exec_policy(stream), contents.begin() + n, contents.end(), 0);
+#endif
+  }
+
   struct view_t {
     i_t* set_size;
     i_t* lock;
diff --git a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu
index 0a17e3ebfd..34034956af 100644
--- a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu
+++ b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu
@@ -29,6 +29,14 @@
 #include <thrust/gather.h>
 #include <thrust/tabulate.h>
 
+// Debug switch: change to `#if 1` to route CUOPT_DETERMINISM_LOG to CUOPT_LOG_INFO in this file
+#if 0
+#undef CUOPT_DETERMINISM_LOG
+#define CUOPT_DETERMINISM_LOG(...) \
+  do {                             \
+    CUOPT_LOG_INFO(__VA_ARGS__);   \
+  } while (0)
+#endif
 namespace cuopt::linear_programming::detail {
 
 template <typename i_t, typename f_t>
@@ -52,7 +60,7 @@ feasibility_pump_t<i_t, f_t>::feasibility_pump_t(
                         context.problem_ptr->handle_ptr->get_stream()),
     lp_optimal_solution(lp_optimal_solution_),
     rng(cuopt::seed_generator::get_seed()),
-    timer(20.)
+    timer(20., *context.termination)
 {
 }
 
@@ -147,18 +155,36 @@ bool feasibility_pump_t<i_t, f_t>::linear_project_onto_polytope(solution_t<i_t,
   problem_t<i_t, f_t> temp_p(*solution.problem_ptr);
   auto h_integer_indices =
     cuopt::host_copy(solution.problem_ptr->integer_indices, solution.handle_ptr->get_stream());
+  cuopt_assert(h_assignment.size() == solution.problem_ptr->n_variables, "Size mismatch");
+  cuopt_assert(h_last_projection.size() == solution.problem_ptr->n_variables, "Size mismatch");
+  cuopt_assert(h_variable_bounds.size() == solution.problem_ptr->n_variables, "Size mismatch");
+  CUOPT_DETERMINISM_LOG(
+    "FP proj inputs: assign_hash=0x%x last_proj_hash=0x%x integer_idx_hash=0x%x n_vars=%d n_int=%d",
+    detail::compute_hash(h_assignment),
+    detail::compute_hash(h_last_projection),
+    detail::compute_hash(h_integer_indices),
+    solution.problem_ptr->n_variables,
+    solution.problem_ptr->n_integer_vars);
   f_t obj_offset = 0;
+  i_t n_at_upper = 0;
+  i_t n_at_lower = 0;
+  i_t n_interior = 0;
+  std::vector<i_t> interior_integer_indices;
+  interior_integer_indices.reserve(h_integer_indices.size());
   // for each integer add the variable and the distance constraints
   for (auto i : h_integer_indices) {
+    cuopt_assert(i >= 0 && i < solution.problem_ptr->n_variables, "Index out of bounds");
     auto h_var_bounds = h_variable_bounds[i];
     if (solution.problem_ptr->integer_equal(h_assignment[i], get_upper(h_var_bounds))) {
       obj_offset += get_upper(h_var_bounds);
       // set the objective weight to -1,  u - x
       obj_coefficients[i] = -1;
+      n_at_upper++;
     } else if (solution.problem_ptr->integer_equal(h_assignment[i], get_lower(h_var_bounds))) {
       obj_offset -= get_lower(h_var_bounds);
       // set the objective weight to +1,  x - l
       obj_coefficients[i] = 1;
+      n_at_lower++;
     } else {
       // objective weight is 1
       const f_t obj_weight = 1.;
@@ -183,9 +209,30 @@ bool feasibility_pump_t<i_t, f_t>::linear_project_onto_polytope(solution_t<i_t,
       std::vector<f_t> constr_coeffs_2{1, 1};
       h_constraints.add_constraint(
         constr_indices, constr_coeffs_2, h_assignment[i], (f_t)default_cont_upper);
+      n_interior++;
+      interior_integer_indices.push_back(i);
     }
   }
+  CUOPT_DETERMINISM_LOG(
+    "FP proj build: at_lower=%d at_upper=%d interior=%d interior_idx_hash=0x%x obj_hash=0x%x "
+    "assign_aug_hash=0x%x vars_added=%d cstr_added=%d cstr_var_hash=0x%x cstr_coeff_hash=0x%x "
+    "cstr_offset_hash=0x%x cstr_lb_hash=0x%x cstr_ub_hash=0x%x",
+    n_at_lower,
+    n_at_upper,
+    n_interior,
+    detail::compute_hash(interior_integer_indices),
+    detail::compute_hash(obj_coefficients),
+    detail::compute_hash(h_assignment),
+    h_variables.size(),
+    h_constraints.n_constraints(),
+    detail::compute_hash(h_constraints.constraint_variables),
+    detail::compute_hash(h_constraints.constraint_coefficients),
+    detail::compute_hash(h_constraints.constraint_offsets),
+    detail::compute_hash(h_constraints.constraint_lower_bounds),
+    detail::compute_hash(h_constraints.constraint_upper_bounds));
   adjust_objective_with_original(solution, obj_coefficients, longer_lp_run);
+  CUOPT_DETERMINISM_LOG("FP proj adjusted objective hash=0x%x",
+                        detail::compute_hash(obj_coefficients));
   // commit all the changes that were done by the host
   if (h_variables.size() > 0) { temp_p.insert_variables(h_variables); }
   if (h_constraints.n_constraints() > 0) { temp_p.insert_constraints(h_constraints); }
@@ -196,6 +243,12 @@ bool feasibility_pump_t<i_t, f_t>::linear_project_onto_polytope(solution_t<i_t,
   cuopt_assert(temp_p.objective_coefficients.size() == temp_p.n_variables, "Var count mismatch!");
   solution.copy_new_assignment(h_assignment);
   cuopt_assert(solution.assignment.size() == temp_p.n_variables, "Var count mismatch!");
+  CUOPT_DETERMINISM_LOG(
+    "FP proj pre-LP: temp_fingerprint=0x%x assignment_hash=0x%x n_vars=%d n_cstr=%d",
+    temp_p.get_fingerprint(),
+    detail::compute_hash(solution.assignment, solution.handle_ptr->get_stream()),
+    temp_p.n_variables,
+    temp_p.n_constraints);
   // copy new objective coefficients
   raft::copy(temp_p.objective_coefficients.data(),
              obj_coefficients.data(),
@@ -210,13 +263,19 @@ bool feasibility_pump_t<i_t, f_t>::linear_project_onto_polytope(solution_t<i_t,
   temp_p.check_problem_representation(true);
   const f_t rlp_base = context.settings.heuristic_params.relaxed_lp_time_limit;
   f_t time_limit     = longer_lp_run ? 5. * rlp_base : rlp_base;
-  time_limit         = std::max(0.05, std::min(time_limit, timer.remaining_time() / 10.));
+  if (timer.deterministic) {
+    time_limit = std::max((f_t)0.0, std::min(time_limit, timer.remaining_time() / 10.));
+  } else {
+    time_limit = std::max((f_t)0.05, std::min(time_limit, timer.remaining_time() / 10.));
+  }
   static f_t lp_time = 0;
   static i_t n_calls = 0;
   f_t old_remaining  = timer.remaining_time();
   cuopt_func_call(solution.test_variable_bounds(false));
   relaxed_lp_settings_t lp_settings;
-  lp_settings.time_limit          = time_limit;
+  lp_settings.time_limit = time_limit;
+  if (timer.deterministic) { lp_settings.work_limit = lp_settings.time_limit; }
+  lp_settings.work_context        = timer.work_context;
   lp_settings.tolerance           = lp_tolerance;
   lp_settings.check_infeasibility = false;
   auto solver_response            = get_relaxed_lp_solution(temp_p, solution, lp_settings);
@@ -248,7 +307,21 @@ bool feasibility_pump_t<i_t, f_t>::round(solution_t<i_t, f_t>& solution)
 {
   bool result;
   CUOPT_LOG_DEBUG("Rounding the point");
-  timer_t bounds_prop_timer(std::max(0.05, std::min(0.5, timer.remaining_time() / 10.)));
+  const int64_t seed_before  = cuopt::seed_generator::peek_seed();
+  const uint32_t hash_before = solution.get_hash();
+  CUOPT_DETERMINISM_LOG("FP round entry: hash=0x%x seed=%lld rem=%.6f",
+                        hash_before,
+                        (long long)seed_before,
+                        timer.remaining_time());
+
+  f_t bounds_prop_time_limit = std::min((f_t)0.5, timer.remaining_time() / 10.);
+  if (timer.deterministic) {
+    bounds_prop_time_limit = std::max((f_t)0.0, bounds_prop_time_limit);
+  } else {
+    bounds_prop_time_limit = std::max((f_t)0.05, bounds_prop_time_limit);
+  }
+  work_limit_timer_t bounds_prop_timer(
+    context.gpu_heur_loop, bounds_prop_time_limit, *context.termination);
   const f_t lp_run_time_after_feasible     = 0.;
   bool old_var                             = constraint_prop.round_all_vars;
   f_t old_time                             = constraint_prop.max_time_for_bounds_prop;
@@ -257,13 +330,20 @@ bool feasibility_pump_t<i_t, f_t>::round(solution_t<i_t, f_t>& solution)
   result = constraint_prop.apply_round(solution, lp_run_time_after_feasible, bounds_prop_timer);
   constraint_prop.round_all_vars           = old_var;
   constraint_prop.max_time_for_bounds_prop = old_time;
-  // result = solution.round_nearest();
   cuopt_func_call(solution.test_variable_bounds(true));
-  // copy the last rounding
   raft::copy(last_rounding.data(),
              solution.assignment.data(),
              solution.assignment.size(),
              solution.handle_ptr->get_stream());
+
+  const int64_t seed_after = cuopt::seed_generator::peek_seed();
+  CUOPT_DETERMINISM_LOG("FP round exit: hash=0x%x seed=%lld seed_delta=%lld feasible=%d rem=%.6f",
+                        solution.get_hash(),
+                        (long long)seed_after,
+                        (long long)(seed_after - seed_before),
+                        (int)result,
+                        timer.remaining_time());
+
   if (result) {
     CUOPT_LOG_DEBUG("New feasible solution with objective %g", solution.get_user_objective());
   }
@@ -308,6 +388,13 @@ bool feasibility_pump_t<i_t, f_t>::test_fj_feasible(solution_t<i_t, f_t>& soluti
   fj.settings.feasibility_run        = true;
   fj.settings.n_of_minimums_for_exit = 5000;
   fj.settings.time_limit             = std::min(time_limit, timer.remaining_time());
+  if (timer.deterministic) {
+    fj.settings.time_limit = std::max((f_t)0.0, fj.settings.time_limit);
+    if (fj.settings.time_limit == 0.0) {
+      CUOPT_LOG_DEBUG("Skipping 20%% FJ run due to exhausted deterministic work budget");
+      return false;
+    }
+  }
   cuopt_func_call(solution.test_variable_bounds(true));
   is_feasible = fj.solve(solution);
   cuopt_func_call(solution.test_variable_bounds(true));
@@ -472,14 +559,39 @@ template <typename i_t, typename f_t>
 bool feasibility_pump_t<i_t, f_t>::run_single_fp_descent(solution_t<i_t, f_t>& solution)
 {
   raft::common::nvtx::range fun_scope("run_single_fp_descent");
+  i_t fp_iter = 0;
+  CUOPT_DETERMINISM_LOG("FP descent start: hash=0x%x feas=%d obj=%.12f timer_det=%d rem=%.6f",
+                        solution.get_hash(),
+                        (int)solution.get_feasible(),
+                        solution.get_user_objective(),
+                        (int)timer.deterministic,
+                        timer.remaining_time());
   // start by doing nearest rounding
   solution.round_nearest();
+  CUOPT_DETERMINISM_LOG("FP descent after initial round: hash=0x%x feas=%d obj=%.12f",
+                        solution.get_hash(),
+                        (int)solution.get_feasible(),
+                        solution.get_user_objective());
+  cuopt_assert(last_projection.size() == solution.assignment.size(), "Size mismatch");
+  // First projection in a descent has no previous projection history: initialize explicitly
+  raft::copy(last_projection.data(),
+             solution.assignment.data(),
+             solution.assignment.size(),
+             solution.handle_ptr->get_stream());
   raft::copy(last_rounding.data(),
              solution.assignment.data(),
              solution.assignment.size(),
              solution.handle_ptr->get_stream());
   while (true) {
-    if (context.diversity_manager_ptr->check_b_b_preemption() || timer.check_time_limit()) {
+    CUOPT_DETERMINISM_LOG("FP iter %d pre-projection: hash=0x%x feas=%d obj=%.12f rem=%.6f",
+                          fp_iter,
+                          solution.get_hash(),
+                          (int)solution.get_feasible(),
+                          solution.get_user_objective(),
+                          timer.remaining_time());
+    bool preempt = context.diversity_manager_ptr != nullptr &&
+                   context.diversity_manager_ptr->check_b_b_preemption();
+    if (preempt || timer.check_time_limit()) {
       CUOPT_LOG_DEBUG("FP time limit reached!");
       round(solution);
       return false;
@@ -489,10 +601,25 @@ bool feasibility_pump_t<i_t, f_t>::run_single_fp_descent(solution_t<i_t, f_t>& s
     f_t ratio_of_assigned_integers =
       f_t(solution.n_assigned_integers) / solution.problem_ptr->n_integer_vars;
     bool is_feasible = linear_project_onto_polytope(solution, ratio_of_assigned_integers);
-    i_t n_integers   = solution.compute_number_of_integers();
+    const f_t remaining_after_projection = timer.remaining_time();
+    i_t n_integers                       = solution.compute_number_of_integers();
     CUOPT_LOG_DEBUG("after fp projection n_integers %d total n_integes %d",
                     n_integers,
                     solution.problem_ptr->n_integer_vars);
+    CUOPT_DETERMINISM_LOG(
+      "FP iter %d post-projection: hash=0x%x feasible_after_lp=%d obj=%.12f rem=%.6f lp_stage=%.6f",
+      fp_iter,
+      solution.get_hash(),
+      (int)is_feasible,
+      solution.get_user_objective(),
+      remaining_after_projection,
+      proj_begin - remaining_after_projection);
+    CUOPT_DETERMINISM_LOG("FP iter %d pre-round: hash=0x%x feas=%d obj=%.12f rem=%.6f",
+                          fp_iter,
+                          solution.get_hash(),
+                          (int)is_feasible,
+                          solution.get_user_objective(),
+                          remaining_after_projection);
     bool is_cycle = true;
     // temp comment for presolve run
     if (config.check_distance_cycle) {
@@ -524,30 +651,71 @@ bool feasibility_pump_t<i_t, f_t>::run_single_fp_descent(solution_t<i_t, f_t>& s
         // run the LP with full precision to check if it actually is feasible
         const f_t lp_verify_time_limit = 5.;
         relaxed_lp_settings_t lp_settings;
-        lp_settings.time_limit            = lp_verify_time_limit;
+        lp_settings.time_limit = lp_verify_time_limit;
+        bool run_verify_lp     = true;
+        if (timer.deterministic) {
+          const f_t remaining_work_limit = std::max((f_t)0.0, timer.remaining_time());
+          lp_settings.work_limit         = std::min(lp_verify_time_limit, remaining_work_limit);
+          lp_settings.time_limit         = lp_settings.work_limit;
+          if (lp_settings.work_limit == 0.0) {
+            CUOPT_LOG_DEBUG(
+              "Skipping FP verification LP due to exhausted deterministic work budget");
+            run_verify_lp = false;
+          }
+        }
+        lp_settings.work_context          = timer.work_context;
         lp_settings.tolerance             = solution.problem_ptr->tolerances.absolute_tolerance;
         lp_settings.return_first_feasible = true;
         lp_settings.save_state            = true;
-        run_lp_with_vars_fixed(*solution.problem_ptr,
-                               solution,
-                               solution.problem_ptr->integer_indices,
-                               lp_settings,
-                               &constraint_prop.bounds_update);
-        is_feasible = solution.get_feasible();
-        n_integers  = solution.compute_number_of_integers();
-        if (is_feasible && n_integers == solution.problem_ptr->n_integer_vars) {
-          CUOPT_LOG_DEBUG("Feasible solution verified with LP!");
-          return true;
+        if (run_verify_lp) {
+          run_lp_with_vars_fixed(*solution.problem_ptr,
+                                 solution,
+                                 solution.problem_ptr->integer_indices,
+                                 lp_settings,
+                                 &constraint_prop.bounds_update);
+          is_feasible = solution.get_feasible();
+          n_integers  = solution.compute_number_of_integers();
+          if (is_feasible && n_integers == solution.problem_ptr->n_integer_vars) {
+            CUOPT_LOG_TRACE("Feasible solution verified with LP!");
+            return true;
+          }
         }
       }
     }
     cuopt_func_call(solution.test_variable_bounds(false));
     is_feasible = round(solution);
     cuopt_func_call(solution.test_variable_bounds(true));
-    proj_and_round_time = proj_begin - timer.remaining_time();
+    const f_t remaining_after_round = timer.remaining_time();
+    proj_and_round_time             = proj_begin - remaining_after_round;
+    CUOPT_DETERMINISM_LOG(
+      "FP iter %d post-round: hash=0x%x feasible_after_round=%d obj=%.12f rem=%.6f "
+      "round_stage=%.6f proj_round_total=%.6f",
+      fp_iter,
+      solution.get_hash(),
+      (int)is_feasible,
+      solution.get_user_objective(),
+      remaining_after_round,
+      remaining_after_projection - remaining_after_round,
+      proj_and_round_time);
     if (!is_feasible) {
       const f_t time_ratio = 0.2;
-      is_feasible          = test_fj_feasible(solution, time_ratio * proj_and_round_time);
+      const f_t fj_budget  = time_ratio * proj_and_round_time;
+      CUOPT_DETERMINISM_LOG("FP iter %d pre-fj-fallback: hash=0x%x rem=%.6f fj_budget=%.6f",
+                            fp_iter,
+                            solution.get_hash(),
+                            remaining_after_round,
+                            fj_budget);
+      is_feasible                  = test_fj_feasible(solution, fj_budget);
+      const f_t remaining_after_fj = timer.remaining_time();
+      CUOPT_DETERMINISM_LOG(
+        "FP iter %d post-fj-fallback: hash=0x%x feasible_after_fj=%d obj=%.12f rem=%.6f "
+        "fj_stage=%.6f",
+        fp_iter,
+        solution.get_hash(),
+        (int)is_feasible,
+        solution.get_user_objective(),
+        remaining_after_fj,
+        remaining_after_round - remaining_after_fj);
     }
     if (timer.check_time_limit()) {
       CUOPT_LOG_DEBUG("FP time limit reached!");
@@ -576,6 +744,7 @@ bool feasibility_pump_t<i_t, f_t>::run_single_fp_descent(solution_t<i_t, f_t>& s
       return false;
     }
     cycle_queue.n_iterations_without_cycle++;
+    fp_iter++;
   }
   // unreachable
   return false;
diff --git a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh
index df3ad405e6..d89933bd17 100644
--- a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh
+++ b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh
@@ -106,7 +106,6 @@ class feasibility_pump_t {
   feasibility_pump_t() = delete;
   feasibility_pump_t(mip_solver_context_t<i_t, f_t>& context,
                      fj_t<i_t, f_t>& fj,
-                     //                     fj_tree_t<i_t, f_t>& fj_tree_,
                      constraint_prop_t<i_t, f_t>& constraint_prop_,
                      line_segment_search_t<i_t, f_t>& line_segment_search_,
                      rmm::device_uvector<f_t>& lp_optimal_solution_);
@@ -128,7 +127,7 @@ class feasibility_pump_t {
   bool check_distance_cycle(solution_t<i_t, f_t>& solution);
   void reset();
   void resize_vectors(problem_t<i_t, f_t>& problem, const raft::handle_t* handle_ptr);
-  bool random_round_with_fj(solution_t<i_t, f_t>& solution, timer_t& round_timer);
+  bool random_round_with_fj(solution_t<i_t, f_t>& solution, work_limit_timer_t& round_timer);
   bool round_multiple_points(solution_t<i_t, f_t>& solution);
   void relax_general_integers(solution_t<i_t, f_t>& solution);
   void revert_relaxation(solution_t<i_t, f_t>& solution);
@@ -137,7 +136,6 @@ class feasibility_pump_t {
   mip_solver_context_t<i_t, f_t>& context;
   // keep a reference from upstream local search
   fj_t<i_t, f_t>& fj;
-  // fj_tree_t<i_t, f_t>& fj_tree;
   line_segment_search_t<i_t, f_t>& line_segment_search;
   cycle_queue_t<i_t, f_t> cycle_queue;
   constraint_prop_t<i_t, f_t>& constraint_prop;
@@ -156,7 +154,7 @@ class feasibility_pump_t {
   f_t proj_begin;
   i_t n_fj_single_descents;
   i_t max_n_of_integers = 0;
-  cuopt::timer_t timer;
+  cuopt::work_limit_timer_t timer;
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu
index ce70aec745..094a45cd17 100644
--- a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu
+++ b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu
@@ -17,8 +17,10 @@ namespace cuopt::linear_programming::detail {
 
 template <typename i_t, typename f_t>
 line_segment_search_t<i_t, f_t>::line_segment_search_t(
-  fj_t<i_t, f_t>& fj_, constraint_prop_t<i_t, f_t>& constraint_prop_)
-  : fj(fj_), constraint_prop(constraint_prop_)
+  mip_solver_context_t<i_t, f_t>& context_,
+  fj_t<i_t, f_t>& fj_,
+  constraint_prop_t<i_t, f_t>& constraint_prop_)
+  : context(context_), fj(fj_), constraint_prop(constraint_prop_)
 {
 }
 
@@ -128,7 +130,7 @@ bool line_segment_search_t<i_t, f_t>::search_line_segment(
   const rmm::device_uvector<f_t>& point_2,
   const rmm::device_uvector<f_t>& delta_vector,
   bool is_feasibility_run,
-  cuopt::timer_t& timer)
+  cuopt::work_limit_timer_t& timer)
 {
   CUOPT_LOG_DEBUG("Running line segment search with a given delta vector");
   cuopt_assert(point_1.size() == point_2.size(), "size mismatch");
@@ -263,7 +265,7 @@ bool line_segment_search_t<i_t, f_t>::search_line_segment(solution_t<i_t, f_t>&
                                                           const rmm::device_uvector<f_t>& point_1,
                                                           const rmm::device_uvector<f_t>& point_2,
                                                           bool is_feasibility_run,
-                                                          cuopt::timer_t& timer)
+                                                          cuopt::work_limit_timer_t& timer)
 {
   CUOPT_LOG_DEBUG("Running line segment search");
   cuopt_assert(point_1.size() == point_2.size(), "size mismatch");
diff --git a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh
index 30e169e9d9..7a040ddbd2 100644
--- a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh
+++ b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh
@@ -9,7 +9,7 @@
 
 #include <mip_heuristics/feasibility_jump/feasibility_jump.cuh>
 #include <mip_heuristics/local_search/rounding/constraint_prop.cuh>
-#include <utilities/timer.hpp>
+#include <utilities/work_limit_timer.hpp>
 
 namespace cuopt::linear_programming::detail {
 
@@ -26,19 +26,21 @@ template <typename i_t, typename f_t>
 class line_segment_search_t {
  public:
   line_segment_search_t() = delete;
-  line_segment_search_t(fj_t<i_t, f_t>& fj, constraint_prop_t<i_t, f_t>& constraint_prop);
+  line_segment_search_t(mip_solver_context_t<i_t, f_t>& context,
+                        fj_t<i_t, f_t>& fj,
+                        constraint_prop_t<i_t, f_t>& constraint_prop);
   bool search_line_segment(solution_t<i_t, f_t>& solution,
                            const rmm::device_uvector<f_t>& point_1,
                            const rmm::device_uvector<f_t>& point_2,
                            bool is_feasibility_run,
-                           cuopt::timer_t& timer);
+                           cuopt::work_limit_timer_t& timer);
 
   bool search_line_segment(solution_t<i_t, f_t>& solution,
                            const rmm::device_uvector<f_t>& point_1,
                            const rmm::device_uvector<f_t>& point_2,
                            const rmm::device_uvector<f_t>& delta_vector,
                            bool is_feasibility_run,
-                           cuopt::timer_t& timer);
+                           cuopt::work_limit_timer_t& timer);
 
   void save_solution_if_better(solution_t<i_t, f_t>& solution,
                                const rmm::device_uvector<f_t>& point_1,
@@ -49,6 +51,7 @@ class line_segment_search_t {
                                f_t& best_feasible_cost,
                                f_t curr_cost);
 
+  mip_solver_context_t<i_t, f_t>& context;
   fj_t<i_t, f_t>& fj;
   constraint_prop_t<i_t, f_t>& constraint_prop;
   line_segment_settings_t settings;
diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu
index da29511d70..cb3955fb83 100644
--- a/cpp/src/mip_heuristics/local_search/local_search.cu
+++ b/cpp/src/mip_heuristics/local_search/local_search.cu
@@ -15,8 +15,9 @@
 #include <mip_heuristics/mip_constants.hpp>
 #include <mip_heuristics/relaxed_lp/relaxed_lp.cuh>
 #include <mip_heuristics/utils.cuh>
+#include <utilities/determinism_log.hpp>
 #include <utilities/seed_generator.cuh>
-#include <utilities/timer.hpp>
+#include <utilities/work_limit_timer.hpp>
 
 #include <mip_heuristics/feasibility_jump/fj_cpu.cuh>
 
@@ -24,6 +25,15 @@
 
 #include <future>
 
+// Debug switch: change to `#if 1` to route CUOPT_DETERMINISM_LOG to CUOPT_LOG_INFO in this file
+#if 0
+#undef CUOPT_DETERMINISM_LOG
+#define CUOPT_DETERMINISM_LOG(...) \
+  do {                             \
+    CUOPT_LOG_INFO(__VA_ARGS__);   \
+  } while (0)
+#endif
+
 namespace cuopt::linear_programming::detail {
 
 template <typename i_t, typename f_t>
@@ -36,7 +46,7 @@ local_search_t<i_t, f_t>::local_search_t(mip_solver_context_t<i_t, f_t>& context
     fj(context),
     // fj_tree(fj),
     constraint_prop(context),
-    line_segment_search(fj, constraint_prop),
+    line_segment_search(context, fj, constraint_prop),
     fp(context,
        fj,
        // fj_tree,
@@ -54,18 +64,17 @@ local_search_t<i_t, f_t>::local_search_t(mip_solver_context_t<i_t, f_t>& context
   scratch_cpu_fj.push_back(std::make_unique<cpu_fj_thread_t<i_t, f_t>>());
   scratch_cpu_fj.back()->fj_ptr   = &fj;
   scratch_cpu_fj_on_lp_opt.fj_ptr = &fj;
+  CUOPT_DETERMINISM_LOG("Deterministic solve start local_search state: seed_state=%lld",
+                        (long long)cuopt::seed_generator::peek_seed());
 
   fj.settings.n_of_minimums_for_exit = context.settings.heuristic_params.n_of_minimums_for_exit;
 }
 
-static double local_search_best_obj       = std::numeric_limits<double>::max();
-static population_t<int, double>* pop_ptr = nullptr;
-
 template <typename i_t, typename f_t>
 void local_search_t<i_t, f_t>::start_cpufj_scratch_threads(population_t<i_t, f_t>& population)
 {
-  pop_ptr = &population;
-
+  cuopt_assert(!(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS),
+               "Scratch CPUFJ must remain opportunistic-only");
   std::vector<f_t> default_weights(context.problem_ptr->n_constraints, 1.);
 
   solution_t<i_t, f_t> solution(*context.problem_ptr);
@@ -88,18 +97,9 @@ void local_search_t<i_t, f_t>::start_cpufj_scratch_threads(population_t<i_t, f_t
 
     cpu_fj.fj_cpu->log_prefix = "******* scratch " + std::to_string(counter) + ": ";
     cpu_fj.fj_cpu->improvement_callback =
-      [&population, problem_ptr = context.problem_ptr](
-        f_t obj, const std::vector<f_t>& h_vec, double /*work_units*/) {
-        population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
-        (void)problem_ptr;
-        if (obj < local_search_best_obj) {
-          CUOPT_LOG_TRACE("******* New local search best obj %g, best overall %g",
-                          problem_ptr->get_user_obj_from_solver_obj(obj),
-                          problem_ptr->get_user_obj_from_solver_obj(
-                            population.is_feasible() ? population.best_feasible().get_objective()
-                                                     : std::numeric_limits<f_t>::max()));
-          local_search_best_obj = obj;
-        }
+      [&population](f_t obj, const std::vector<f_t>& h_vec, double /*work_units*/) {
+        population.add_external_solution(
+          h_vec, obj, internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP);
       };
     counter++;
   };
@@ -113,7 +113,8 @@ template <typename i_t, typename f_t>
 void local_search_t<i_t, f_t>::start_cpufj_lptopt_scratch_threads(
   population_t<i_t, f_t>& population)
 {
-  pop_ptr = &population;
+  cuopt_assert(!(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS),
+               "LP-opt CPUFJ scratch must remain opportunistic-only");
 
   std::vector<f_t> default_weights(context.problem_ptr->n_constraints, 1.);
 
@@ -125,16 +126,9 @@ void local_search_t<i_t, f_t>::start_cpufj_lptopt_scratch_threads(
     solution_lp, default_weights, default_weights, 0., context.preempt_heuristic_solver_);
   scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: ";
   scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback =
-    [this, &population](f_t obj, const std::vector<f_t>& h_vec, double /*work_units*/) {
-      population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
-      if (obj < local_search_best_obj) {
-        CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g",
-                        context.problem_ptr->get_user_obj_from_solver_obj(obj),
-                        context.problem_ptr->get_user_obj_from_solver_obj(
-                          population.is_feasible() ? population.best_feasible().get_objective()
-                                                   : std::numeric_limits<f_t>::max()));
-        local_search_best_obj = obj;
-      }
+    [&population](f_t obj, const std::vector<f_t>& h_vec, double /*work_units*/) {
+      population.add_external_solution(
+        h_vec, obj, internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP);
     };
 
   // default weights
@@ -182,8 +176,11 @@ void local_search_t<i_t, f_t>::start_cpufj_deterministic(
 
   // Set up callback to send solutions to B&B with work unit timestamps
   deterministic_cpu_fj.fj_cpu->improvement_callback =
-    [&bb](f_t obj, const std::vector<f_t>& h_vec, double work_units) {
-      bb.queue_external_solution_deterministic(h_vec, work_units);
+    [&bb, problem_ptr = context.problem_ptr](
+      f_t obj, const std::vector<f_t>& h_vec, double work_units) {
+      f_t user_obj = problem_ptr->get_user_obj_from_solver_obj(obj);
+      bb.queue_external_solution_deterministic(
+        h_vec, user_obj, work_units, cuopt::internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP);
     };
 
   deterministic_cpu_fj.start_cpu_solver();
@@ -211,8 +208,9 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
                                            const std::string& source)
 {
   if (time_limit == 0.) return solution.get_feasible();
+  const bool deterministic = (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS);
 
-  timer_t timer(time_limit);
+  work_limit_timer_t timer(context.gpu_heur_loop, time_limit, *context.termination);
   const auto old_n_cstr_weights      = in_fj.cstr_weights.size();
   const auto expected_n_cstr_weights = static_cast<size_t>(solution.problem_ptr->n_constraints);
   // in case this is the first time run, resize
@@ -231,17 +229,24 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
                                  1.);
     }
   }
-  auto h_weights          = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream());
-  auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream());
-  for (auto& cpu_fj_ptr : ls_cpu_fj) {
-    auto& cpu_fj  = *cpu_fj_ptr;
-    cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution,
-                                                      h_weights,
-                                                      h_weights,
-                                                      h_objective_weight,
-                                                      context.preempt_heuristic_solver_,
-                                                      fj_settings_t{},
-                                                      true);
+
+  {
+    auto h_weights = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream());
+    auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream());
+    for (auto& cpu_fj_ptr : ls_cpu_fj) {
+      auto& cpu_fj  = *cpu_fj_ptr;
+      cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution,
+                                                        h_weights,
+                                                        h_weights,
+                                                        h_objective_weight,
+                                                        context.preempt_heuristic_solver_,
+                                                        fj_settings_t{},
+                                                        true);
+      if (deterministic) {
+        cpu_fj.fj_cpu->work_units_elapsed = 0.0;
+        cpu_fj.fj_cpu->work_budget        = time_limit;
+      }
+    }
   }
 
   auto solution_copy = solution;
@@ -256,9 +261,10 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
   in_fj.settings.time_limit = timer.remaining_time();
   in_fj.solve(solution);
 
-  // Stop CPU solver
-  for (auto& cpu_fj_ptr : ls_cpu_fj) {
-    cpu_fj_ptr->stop_cpu_solver();
+  if (!deterministic) {
+    for (auto& cpu_fj_ptr : ls_cpu_fj) {
+      cpu_fj_ptr->stop_cpu_solver();
+    }
   }
 
   auto gpu_fj_end        = std::chrono::high_resolution_clock::now();
@@ -267,7 +273,6 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
   solution_t<i_t, f_t> solution_cpu(*solution.problem_ptr);
 
   f_t best_cpu_obj = std::numeric_limits<f_t>::max();
-  // // Wait for CPU solver to finish
   for (auto& cpu_fj_ptr : ls_cpu_fj) {
     bool cpu_sol_found = cpu_fj_ptr->wait_for_cpu_solver();
     if (cpu_sol_found) {
@@ -313,8 +318,10 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
 }
 
 template <typename i_t, typename f_t>
-void local_search_t<i_t, f_t>::generate_fast_solution(solution_t<i_t, f_t>& solution, timer_t timer)
+void local_search_t<i_t, f_t>::generate_fast_solution(solution_t<i_t, f_t>& solution,
+                                                      work_limit_timer_t& timer)
 {
+  CUOPT_LOG_DEBUG("Running FJ fast sol");
   thrust::fill(solution.handle_ptr->get_thrust_policy(),
                solution.assignment.begin(),
                solution.assignment.end(),
@@ -325,8 +332,11 @@ void local_search_t<i_t, f_t>::generate_fast_solution(solution_t<i_t, f_t>& solu
   fj.settings.update_weights         = true;
   fj.settings.feasibility_run        = true;
   fj.settings.time_limit             = std::min(30., timer.remaining_time());
-  while (!context.diversity_manager_ptr->check_b_b_preemption() && !timer.check_time_limit()) {
-    timer_t constr_prop_timer = timer_t(std::min(timer.remaining_time(), 2.));
+  while ((context.diversity_manager_ptr == nullptr ||
+          !context.diversity_manager_ptr->check_b_b_preemption()) &&
+         !timer.check_time_limit()) {
+    work_limit_timer_t constr_prop_timer = work_limit_timer_t(
+      context.gpu_heur_loop, std::min(timer.remaining_time(), 2.), *context.termination);
     // do constraint prop on lp optimal solution
     constraint_prop.apply_round(solution, 1., constr_prop_timer);
     if (solution.compute_feasibility()) { return; }
@@ -343,7 +353,7 @@ void local_search_t<i_t, f_t>::generate_fast_solution(solution_t<i_t, f_t>& solu
 template <typename i_t, typename f_t>
 bool local_search_t<i_t, f_t>::run_local_search(solution_t<i_t, f_t>& solution,
                                                 const weight_t<i_t, f_t>& weights,
-                                                timer_t timer,
+                                                work_limit_timer_t& timer,
                                                 const ls_config_t<i_t, f_t>& ls_config)
 {
   raft::common::nvtx::range fun_scope("local search");
@@ -353,11 +363,10 @@ bool local_search_t<i_t, f_t>::run_local_search(solution_t<i_t, f_t>& solution,
   if (!solution.get_feasible()) {
     if (ls_config.at_least_one_parent_feasible) {
       fj_settings.time_limit = 0.5;
-      timer                  = timer_t(fj_settings.time_limit);
     } else {
       fj_settings.time_limit = 0.25;
-      timer                  = timer_t(fj_settings.time_limit);
     }
+    timer = work_limit_timer_t(context.gpu_heur_loop, fj_settings.time_limit, *context.termination);
   } else {
     fj_settings.time_limit = std::min(1., timer.remaining_time());
   }
@@ -387,8 +396,9 @@ bool local_search_t<i_t, f_t>::run_local_search(solution_t<i_t, f_t>& solution,
 template <typename i_t, typename f_t>
 bool local_search_t<i_t, f_t>::run_fj_until_timer(solution_t<i_t, f_t>& solution,
                                                   const weight_t<i_t, f_t>& weights,
-                                                  timer_t timer)
+                                                  work_limit_timer_t& timer)
 {
+  CUOPT_LOG_DEBUG("Running FJ until timer");
   bool is_feasible;
   fj.settings.n_of_minimums_for_exit = 1e6;
   fj.settings.mode                   = fj_mode_t::EXIT_NON_IMPROVING;
@@ -405,7 +415,7 @@ bool local_search_t<i_t, f_t>::run_fj_until_timer(solution_t<i_t, f_t>& solution
 
 template <typename i_t, typename f_t>
 bool local_search_t<i_t, f_t>::run_fj_annealing(solution_t<i_t, f_t>& solution,
-                                                timer_t timer,
+                                                work_limit_timer_t& timer,
                                                 const ls_config_t<i_t, f_t>& ls_config)
 {
   raft::common::nvtx::range fun_scope("run_fj_annealing");
@@ -435,7 +445,7 @@ bool local_search_t<i_t, f_t>::run_fj_annealing(solution_t<i_t, f_t>& solution,
 
 template <typename i_t, typename f_t>
 bool local_search_t<i_t, f_t>::run_fj_line_segment(solution_t<i_t, f_t>& solution,
-                                                   timer_t timer,
+                                                   work_limit_timer_t& timer,
                                                    const ls_config_t<i_t, f_t>& ls_config)
 {
   raft::common::nvtx::range fun_scope("run_fj_line_segment");
@@ -458,7 +468,7 @@ bool local_search_t<i_t, f_t>::run_fj_line_segment(solution_t<i_t, f_t>& solutio
 template <typename i_t, typename f_t>
 bool local_search_t<i_t, f_t>::check_fj_on_lp_optimal(solution_t<i_t, f_t>& solution,
                                                       bool perturb,
-                                                      timer_t timer)
+                                                      work_limit_timer_t& timer)
 {
   raft::common::nvtx::range fun_scope("check_fj_on_lp_optimal");
   if (lp_optimal_exists) {
@@ -474,15 +484,21 @@ bool local_search_t<i_t, f_t>::check_fj_on_lp_optimal(solution_t<i_t, f_t>& solu
     solution.assign_random_within_bounds(perturbation_ratio);
   }
   cuopt_func_call(solution.test_variable_bounds(false));
-  f_t lp_run_time_after_feasible = std::min(1., timer.remaining_time());
-  timer_t bounds_prop_timer      = timer_t(std::min(timer.remaining_time(), 10.));
+  f_t lp_run_time_after_feasible       = std::min(1., timer.remaining_time());
+  work_limit_timer_t bounds_prop_timer = work_limit_timer_t(
+    context.gpu_heur_loop, std::min(timer.remaining_time(), 10.), *context.termination);
   bool is_feasible =
     constraint_prop.apply_round(solution, lp_run_time_after_feasible, bounds_prop_timer);
   if (!is_feasible) {
     const f_t lp_run_time = 2.;
     relaxed_lp_settings_t lp_settings;
     lp_settings.time_limit = std::min(lp_run_time, timer.remaining_time());
-    lp_settings.tolerance  = solution.problem_ptr->tolerances.absolute_tolerance;
+    if (timer.deterministic) {
+      lp_settings.work_limit   = lp_settings.time_limit;
+      lp_settings.work_context = timer.work_context;
+      cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context");
+    }
+    lp_settings.tolerance = solution.problem_ptr->tolerances.absolute_tolerance;
     run_lp_with_vars_fixed(
       *solution.problem_ptr, solution, solution.problem_ptr->integer_indices, lp_settings);
   } else {
@@ -499,7 +515,8 @@ bool local_search_t<i_t, f_t>::check_fj_on_lp_optimal(solution_t<i_t, f_t>& solu
 }
 
 template <typename i_t, typename f_t>
-bool local_search_t<i_t, f_t>::run_fj_on_zero(solution_t<i_t, f_t>& solution, timer_t timer)
+bool local_search_t<i_t, f_t>::run_fj_on_zero(solution_t<i_t, f_t>& solution,
+                                              work_limit_timer_t& timer)
 {
   raft::common::nvtx::range fun_scope("run_fj_on_zero");
   thrust::fill(solution.handle_ptr->get_thrust_policy(),
@@ -518,7 +535,7 @@ bool local_search_t<i_t, f_t>::run_fj_on_zero(solution_t<i_t, f_t>& solution, ti
 
 template <typename i_t, typename f_t>
 bool local_search_t<i_t, f_t>::run_staged_fp(solution_t<i_t, f_t>& solution,
-                                             timer_t timer,
+                                             work_limit_timer_t& timer,
                                              population_t<i_t, f_t>* population_ptr)
 {
   raft::common::nvtx::range fun_scope("run_staged_fp");
@@ -546,7 +563,8 @@ bool local_search_t<i_t, f_t>::run_staged_fp(solution_t<i_t, f_t>& solution,
       }
       CUOPT_LOG_DEBUG("Running staged FP from beginning it %d", i);
       fp.relax_general_integers(solution);
-      timer_t binary_timer(timer.remaining_time() / 3);
+      work_limit_timer_t binary_timer(
+        context.gpu_heur_loop, timer.remaining_time() / 3, *context.termination);
       i_t binary_it_counter = 0;
       for (; binary_it_counter < 100; ++binary_it_counter) {
         population_ptr->add_external_solutions_to_population();
@@ -626,6 +644,9 @@ void local_search_t<i_t, f_t>::save_solution_and_add_cutting_plane(
 template <typename i_t, typename f_t>
 void local_search_t<i_t, f_t>::resize_to_new_problem()
 {
+  CUOPT_LOG_DEBUG("resize_to_new_problem: nv=%d nc=%d",
+                  problem_with_objective_cut.n_variables,
+                  problem_with_objective_cut.n_constraints);
   resize_vectors(problem_with_objective_cut, problem_with_objective_cut.handle_ptr);
   // hint for next PR in case load balanced is reintroduced
   // lb_constraint_prop.temp_problem.setup(problem_with_objective_cut);
@@ -636,6 +657,9 @@ void local_search_t<i_t, f_t>::resize_to_new_problem()
 template <typename i_t, typename f_t>
 void local_search_t<i_t, f_t>::resize_to_old_problem(problem_t<i_t, f_t>* old_problem_ptr)
 {
+  CUOPT_LOG_DEBUG("resize_to_old_problem: nv=%d nc=%d",
+                  old_problem_ptr->n_variables,
+                  old_problem_ptr->n_constraints);
   resize_vectors(*old_problem_ptr, old_problem_ptr->handle_ptr);
   // hint for next PR in case load balanced is reintroduced
   // lb_constraint_prop.temp_problem.setup(*old_problem_ptr);
@@ -658,7 +682,8 @@ void local_search_t<i_t, f_t>::reset_alpha_and_save_solution(
   solution_t<i_t, f_t> solution_copy(solution);
   solution_copy.problem_ptr = old_problem_ptr;
   solution_copy.resize_to_problem();
-  population_ptr->add_solution(std::move(solution_copy));
+  population_ptr->add_solution(std::move(solution_copy),
+                               internals::mip_solution_origin_t::LOCAL_SEARCH);
   population_ptr->add_external_solutions_to_population();
   if (!cutting_plane_added_for_active_run) {
     solution.problem_ptr = &problem_with_objective_cut;
@@ -712,34 +737,53 @@ void local_search_t<i_t, f_t>::reset_alpha_and_run_recombiners(
 
 template <typename i_t, typename f_t>
 bool local_search_t<i_t, f_t>::run_fp(solution_t<i_t, f_t>& solution,
-                                      timer_t timer,
-                                      population_t<i_t, f_t>* population_ptr)
+                                      work_limit_timer_t& timer,
+                                      population_t<i_t, f_t>* population_ptr,
+                                      i_t n_fp_iterations)
 {
   raft::common::nvtx::range fun_scope("run_fp");
   cuopt_assert(population_ptr != nullptr, "Population pointer must not be null");
-  const i_t n_fp_iterations          = 1000000;
   bool is_feasible                   = solution.compute_feasibility();
   cutting_plane_added_for_active_run = is_feasible;
   double best_objective =
     is_feasible ? solution.get_objective() : std::numeric_limits<double>::max();
   rmm::device_uvector<f_t> best_solution(solution.assignment, solution.handle_ptr->get_stream());
   problem_t<i_t, f_t>* old_problem_ptr = solution.problem_ptr;
-  fp.timer                             = timer_t(timer.remaining_time());
+  fp.timer =
+    work_limit_timer_t(context.gpu_heur_loop, timer.remaining_time(), *context.termination);
   // if it has not been initialized yet, create a new problem and move it to the cut problem
   if (!problem_with_objective_cut.cutting_plane_added) {
     problem_with_objective_cut = std::move(problem_t<i_t, f_t>(*old_problem_ptr));
+    CUOPT_LOG_DEBUG("FP cut-problem clone: old_nv=%d old_nc=%d cut_nv=%d cut_nc=%d",
+                    old_problem_ptr->n_variables,
+                    old_problem_ptr->n_constraints,
+                    problem_with_objective_cut.n_variables,
+                    problem_with_objective_cut.n_constraints);
   }
   if (is_feasible) {
     CUOPT_LOG_DEBUG("FP initial solution is feasible, adding cutting plane at obj");
     f_t objective_cut =
       best_objective - std::max(std::abs(0.001 * best_objective), OBJECTIVE_EPSILON);
+    CUOPT_LOG_DEBUG("FP cut-problem add: cut_obj=%g cut_nv=%d cut_nc=%d cut_added=%d fj_w=%zu",
+                    objective_cut,
+                    problem_with_objective_cut.n_variables,
+                    problem_with_objective_cut.n_constraints,
+                    (int)problem_with_objective_cut.cutting_plane_added,
+                    fj.cstr_weights.size());
     problem_with_objective_cut.add_cutting_plane_at_objective(objective_cut);
+    CUOPT_LOG_DEBUG("FP cut-problem post-add: cut_nv=%d cut_nc=%d",
+                    problem_with_objective_cut.n_variables,
+                    problem_with_objective_cut.n_constraints);
     // Do the copy here for proper handling of the added constraints weight
     fj.copy_weights(
       population_ptr->weights, solution.handle_ptr, problem_with_objective_cut.n_constraints);
     solution.problem_ptr = &problem_with_objective_cut;
     solution.resize_to_problem();
     resize_to_new_problem();
+    CUOPT_LOG_DEBUG("FP cut-problem resize done: sol_assign=%zu sol_nv=%d sol_nc=%d",
+                    solution.assignment.size(),
+                    solution.problem_ptr->n_variables,
+                    solution.problem_ptr->n_constraints);
   }
   i_t last_improved_iteration = 0;
   for (i_t i = 0; i < n_fp_iterations && !timer.check_time_limit(); ++i) {
@@ -806,14 +850,45 @@ bool local_search_t<i_t, f_t>::run_fp(solution_t<i_t, f_t>& solution,
       }
     }
   }
+  CUOPT_LOG_DEBUG(
+    "FP teardown start: assign=%zu best=%zu curr_pb=%p old_pb=%p curr_nv=%d curr_nc=%d "
+    "old_nv=%d old_nc=%d prevp=%zu prevd=%zu fp_rem=%g parent_rem=%g gpu_work=%g "
+    "gpu_prod=%g cut_added=%d",
+    solution.assignment.size(),
+    best_solution.size(),
+    (void*)solution.problem_ptr,
+    (void*)old_problem_ptr,
+    solution.problem_ptr->n_variables,
+    solution.problem_ptr->n_constraints,
+    old_problem_ptr->n_variables,
+    old_problem_ptr->n_constraints,
+    solution.lp_state.prev_primal.size(),
+    solution.lp_state.prev_dual.size(),
+    fp.timer.remaining_time(),
+    timer.remaining_time(),
+    context.gpu_heur_loop.current_work(),
+    context.gpu_heur_loop.current_producer_work(),
+    (int)problem_with_objective_cut.cutting_plane_added);
   raft::copy(solution.assignment.data(),
              best_solution.data(),
              solution.assignment.size(),
              solution.handle_ptr->get_stream());
+  CUOPT_LOG_DEBUG("FP teardown post-copy: assign=%zu", solution.assignment.size());
   solution.problem_ptr = old_problem_ptr;
+  CUOPT_LOG_DEBUG("FP teardown post-ptr: pb=%p nv=%d nc=%d",
+                  (void*)solution.problem_ptr,
+                  solution.problem_ptr->n_variables,
+                  solution.problem_ptr->n_constraints);
   solution.resize_to_problem();
+  CUOPT_LOG_DEBUG("FP teardown post-resize: assign=%zu prevp=%zu prevd=%zu",
+                  solution.assignment.size(),
+                  solution.lp_state.prev_primal.size(),
+                  solution.lp_state.prev_dual.size());
   resize_to_old_problem(old_problem_ptr);
+  CUOPT_LOG_DEBUG("FP teardown pre-sync");
   solution.handle_ptr->sync_stream();
+  CUOPT_LOG_DEBUG(
+    "FP teardown post-sync: hash=0x%x feas=%d", solution.get_hash(), (int)solution.get_feasible());
   return is_feasible;
 }
 
@@ -825,7 +900,7 @@ bool local_search_t<i_t, f_t>::generate_solution(solution_t<i_t, f_t>& solution,
 {
   raft::common::nvtx::range fun_scope("generate_solution");
   cuopt_assert(population_ptr != nullptr, "Population pointer must not be null");
-  timer_t timer(time_limit);
+  work_limit_timer_t timer(context.gpu_heur_loop, time_limit, *context.termination);
   auto n_vars         = solution.problem_ptr->n_variables;
   auto n_binary_vars  = solution.problem_ptr->get_n_binary_variables();
   auto n_integer_vars = solution.problem_ptr->n_integer_vars;
diff --git a/cpp/src/mip_heuristics/local_search/local_search.cuh b/cpp/src/mip_heuristics/local_search/local_search.cuh
index 94493ebcb3..04b30b8ccc 100644
--- a/cpp/src/mip_heuristics/local_search/local_search.cuh
+++ b/cpp/src/mip_heuristics/local_search/local_search.cuh
@@ -13,13 +13,7 @@
 #include <mip_heuristics/local_search/line_segment_search/line_segment_search.cuh>
 #include <mip_heuristics/solution/solution.cuh>
 #include <mip_heuristics/solver.cuh>
-#include <utilities/timer.hpp>
-
-#include <atomic>
-#include <chrono>
-#include <condition_variable>
-#include <mutex>
-#include <thread>
+#include <utilities/work_limit_timer.hpp>
 
 namespace cuopt::linear_programming::dual_simplex {
 template <typename i_t, typename f_t>
@@ -58,32 +52,35 @@ class local_search_t {
   void start_cpufj_scratch_threads(population_t<i_t, f_t>& population);
   void start_cpufj_lptopt_scratch_threads(population_t<i_t, f_t>& population);
   void stop_cpufj_scratch_threads();
-  void generate_fast_solution(solution_t<i_t, f_t>& solution, timer_t timer);
+  void generate_fast_solution(solution_t<i_t, f_t>& solution, work_limit_timer_t& timer);
   bool generate_solution(solution_t<i_t, f_t>& solution,
                          bool perturb,
                          population_t<i_t, f_t>* population_ptr,
                          f_t time_limit = 300.);
   bool run_fj_until_timer(solution_t<i_t, f_t>& solution,
                           const weight_t<i_t, f_t>& weights,
-                          timer_t timer);
+                          work_limit_timer_t& timer);
   bool run_local_search(solution_t<i_t, f_t>& solution,
                         const weight_t<i_t, f_t>& weights,
-                        timer_t timer,
+                        work_limit_timer_t& timer,
                         const ls_config_t<i_t, f_t>& ls_config);
   bool run_fj_annealing(solution_t<i_t, f_t>& solution,
-                        timer_t timer,
+                        work_limit_timer_t& timer,
                         const ls_config_t<i_t, f_t>& ls_config);
   bool run_fj_line_segment(solution_t<i_t, f_t>& solution,
-                           timer_t timer,
+                           work_limit_timer_t& timer,
                            const ls_config_t<i_t, f_t>& ls_config);
-  bool run_fj_on_zero(solution_t<i_t, f_t>& solution, timer_t timer);
-  bool check_fj_on_lp_optimal(solution_t<i_t, f_t>& solution, bool perturb, timer_t timer);
+  bool run_fj_on_zero(solution_t<i_t, f_t>& solution, work_limit_timer_t& timer);
+  bool check_fj_on_lp_optimal(solution_t<i_t, f_t>& solution,
+                              bool perturb,
+                              work_limit_timer_t& timer);
   bool run_staged_fp(solution_t<i_t, f_t>& solution,
-                     timer_t timer,
+                     work_limit_timer_t& timer,
                      population_t<i_t, f_t>* population_ptr);
   bool run_fp(solution_t<i_t, f_t>& solution,
-              timer_t timer,
-              population_t<i_t, f_t>* population_ptr = nullptr);
+              work_limit_timer_t& timer,
+              population_t<i_t, f_t>* population_ptr = nullptr,
+              i_t n_fp_iterations                    = std::numeric_limits<i_t>::max());
   void resize_vectors(problem_t<i_t, f_t>& problem, const raft::handle_t* handle_ptr);
 
   bool do_fj_solve(solution_t<i_t, f_t>& solution,
diff --git a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu
index f3233cc8f4..ebea04495c 100644
--- a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu
+++ b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu
@@ -8,16 +8,134 @@
 #include "bounds_repair.cuh"
 
 #include <thrust/copy.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/partition.h>
 #include <thrust/sort.h>
+#include <thrust/tuple.h>
 #include <cuda/std/functional>
 #include <mip_heuristics/logger.cuh>
 #include <mip_heuristics/mip_constants.hpp>
 #include <utilities/copy_helpers.hpp>
 #include <utilities/seed_generator.cuh>
 
+#include <cmath>
+#include <limits>
+
+// Flip `#if 0` to `#if 1` to make CUOPT_DETERMINISM_LOG forward to CUOPT_LOG_INFO.
+#if 0
+#undef CUOPT_DETERMINISM_LOG
+#define CUOPT_DETERMINISM_LOG(...) \
+  do {                             \
+    CUOPT_LOG_INFO(__VA_ARGS__);   \
+  } while (0)
+#endif
+
 namespace cuopt::linear_programming::detail {
 
+namespace {
+
+// Coefficients combined by the work estimators below.
+constexpr double bounds_repair_setup_base_work            = 5e-4;
+constexpr double bounds_repair_violation_base_work        = 4e-4;
+constexpr double bounds_repair_violation_nnz_work         = 2e-6;
+constexpr double bounds_repair_violation_constraint_work  = 3e-6;
+constexpr double bounds_repair_best_bounds_variable_work  = 2e-6;
+constexpr double bounds_repair_shift_base_work            = 3e-4;
+constexpr double bounds_repair_shift_row_entry_work       = 3e-6;
+constexpr double bounds_repair_shift_candidate_work       = 8e-6;
+constexpr double bounds_repair_shift_neighbor_entry_work  = 3e-6;
+constexpr double bounds_repair_shift_sort_work            = 5e-6;
+constexpr double bounds_repair_damage_base_work           = 3e-4;
+constexpr double bounds_repair_damage_neighbor_entry_work = 8e-6;
+constexpr double bounds_repair_damage_sort_work           = 5e-6;
+constexpr double bounds_repair_move_base_work             = 5e-5;
+constexpr double bounds_repair_no_candidate_base_work     = 4e-4;
+constexpr double bounds_repair_cycle_penalty_work         = 3e-4;
+
+// Mean nonzeros per variable (nnz / n_variables); 0 when the problem has no variables.
+template <typename i_t, typename f_t>
+double estimate_avg_reverse_degree(const problem_t<i_t, f_t>& problem)
+{
+  return problem.n_variables > 0 ? ((double)problem.nnz / (double)problem.n_variables) : 0.0;
+}
+
+// n * log2(n) term used as the sort cost; 0 when there are fewer than two candidates.
+template <typename i_t>
+double estimate_candidate_sort_work(i_t n_candidates)
+{
+  return n_candidates > 1 ? (double)n_candidates * std::log2((double)n_candidates) : 0.0;
+}
+
+// Work estimate for a violation refresh: a base cost plus terms linear in problem.nnz and
+// problem.n_constraints, plus a term linear in problem.n_variables when update_best_bounds is set.
+template <typename i_t, typename f_t>
+double estimate_bounds_repair_violation_refresh_work(const problem_t<i_t, f_t>& problem,
+                                                     bool update_best_bounds)
+{
+  double estimate = bounds_repair_violation_base_work +
+                    bounds_repair_violation_nnz_work * (double)problem.nnz +
+                    bounds_repair_violation_constraint_work * (double)problem.n_constraints;
+  if (update_best_bounds) {
+    estimate += bounds_repair_best_bounds_variable_work * (double)problem.n_variables;
+  }
+  return estimate;
+}
+
+// Work estimate for setup: a fixed base plus one violation refresh with the best-bounds term.
+template <typename i_t, typename f_t>
+double estimate_bounds_repair_setup_work(const problem_t<i_t, f_t>& problem)
+{
+  return bounds_repair_setup_base_work +
+         estimate_bounds_repair_violation_refresh_work(problem, true);
+}
+
+// Work estimate for one shift step on constraint curr_cstr. The row length is read from
+// problem.offsets on the problem's stream; an extra base applies when there are no candidates.
+template <typename i_t, typename f_t>
+double estimate_bounds_repair_shift_work(const problem_t<i_t, f_t>& problem,
+                                         i_t curr_cstr,
+                                         i_t n_candidates,
+                                         bool is_cycle)
+{
+  const auto stream    = problem.handle_ptr->get_stream();
+  const i_t cstr_begin = problem.offsets.element(curr_cstr, stream);
+  const i_t cstr_end   = problem.offsets.element(curr_cstr + 1, stream);
+  const double row_nnz = cstr_end - cstr_begin;
+  double estimate = bounds_repair_shift_base_work + bounds_repair_shift_row_entry_work * row_nnz;
+  if (n_candidates == 0) { estimate = bounds_repair_no_candidate_base_work + estimate; }
+  estimate += bounds_repair_shift_candidate_work * (double)n_candidates;
+  estimate += bounds_repair_shift_neighbor_entry_work * (double)n_candidates *
+              estimate_avg_reverse_degree(problem);
+  estimate += bounds_repair_shift_sort_work * estimate_candidate_sort_work(n_candidates);
+  if (is_cycle) { estimate += bounds_repair_cycle_penalty_work; }
+  return estimate;
+}
+
+// Work estimate for the damage computation over n_candidates candidates: zero without
+// candidates, otherwise a base cost plus per-neighbor-entry and sort terms.
+template <typename i_t, typename f_t>
+double estimate_bounds_repair_damage_work(const problem_t<i_t, f_t>& problem, i_t n_candidates)
+{
+  if (n_candidates == 0) { return 0.0; }
+  return bounds_repair_damage_base_work +
+         bounds_repair_damage_neighbor_entry_work * (double)n_candidates *
+           estimate_avg_reverse_degree(problem) +
+         bounds_repair_damage_sort_work * estimate_candidate_sort_work(n_candidates);
+}
+
+// Checks that `work` is finite and non-negative, charges it to `timer` via record_work(), and
+// adds it to the caller's running total.
+template <typename work_timer_t>
+void record_estimated_work(work_timer_t& timer, double* total_estimated_work, double work)
+{
+  cuopt_assert(std::isfinite(work) && work >= 0.0,
+               "Bounds repair work estimate must be finite and non-negative");
+  timer.record_work(work);
+  *total_estimated_work += work;
+}
+
+}  // namespace
+
 template <typename i_t, typename f_t>
 bounds_repair_t<i_t, f_t>::bounds_repair_t(const problem_t<i_t, f_t>& pb,
                                            bound_presolve_t<i_t, f_t>& bound_presolve_)
@@ -30,7 +128,8 @@ bounds_repair_t<i_t, f_t>::bounds_repair_t(const problem_t<i_t, f_t>& pb,
     violated_cstr_map(0, pb.handle_ptr->get_stream()),
     total_vio(pb.handle_ptr->get_stream()),
     gen(cuopt::seed_generator::get_seed()),
-    cycle_vector(MAX_CYCLE_SEQUENCE, -1)
+    cycle_vector(MAX_CYCLE_SEQUENCE, -1),
+    timer(0.0, cuopt::termination_checker_t::root_tag_t{})
 {
 }
 
@@ -68,8 +167,7 @@ f_t bounds_repair_t<i_t, f_t>::get_ii_violation(problem_t<i_t, f_t>& problem)
      min_act              = bound_presolve.upd.min_activity.data(),
      max_act              = bound_presolve.upd.max_activity.data(),
      cstr_violations_up   = cstr_violations_up.data(),
-     cstr_violations_down = cstr_violations_down.data(),
-     total_vio            = total_vio.data()] __device__(i_t cstr_idx) {
+     cstr_violations_down = cstr_violations_down.data()] __device__(i_t cstr_idx) {
       f_t cnst_lb = pb_v.constraint_lower_bounds[cstr_idx];
       f_t cnst_ub = pb_v.constraint_upper_bounds[cstr_idx];
       f_t eps     = get_cstr_tolerance<i_t, f_t>(
@@ -79,21 +177,31 @@ f_t bounds_repair_t<i_t, f_t>::get_ii_violation(problem_t<i_t, f_t>& problem)
       f_t violation                = max(curr_cstr_violation_up, curr_cstr_violation_down);
       if (violation >= ROUNDOFF_TOLERANCE) {
         violated_cstr_map[cstr_idx] = 1;
-        atomicAdd(total_vio, violation);
       } else {
         violated_cstr_map[cstr_idx] = 0;
       }
       cstr_violations_up[cstr_idx]   = curr_cstr_violation_up;
       cstr_violations_down[cstr_idx] = curr_cstr_violation_down;
     });
-  auto iter           = thrust::copy_if(handle_ptr->get_thrust_policy(),
+  auto iter         = thrust::copy_if(handle_ptr->get_thrust_policy(),
                               thrust::make_counting_iterator(0),
                               thrust::make_counting_iterator(0) + problem.n_constraints,
                               violated_cstr_map.data(),
                               violated_constraints.data(),
                               cuda::std::identity{});
-  h_n_violated_cstr   = iter - violated_constraints.data();
-  f_t total_violation = total_vio.value(handle_ptr->get_stream());
+  h_n_violated_cstr = iter - violated_constraints.data();
+  // Use deterministic reduction instead of non-deterministic atomicAdd
+  f_t total_violation = thrust::transform_reduce(
+    handle_ptr->get_thrust_policy(),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(0) + problem.n_constraints,
+    [cstr_violations_up   = cstr_violations_up.data(),
+     cstr_violations_down = cstr_violations_down.data()] __device__(i_t cstr_idx) -> f_t {
+      auto violation = max(cstr_violations_up[cstr_idx], cstr_violations_down[cstr_idx]);
+      return violation >= ROUNDOFF_TOLERANCE ? violation : 0.;
+    },
+    (f_t)0,
+    thrust::plus<f_t>());
   CUOPT_LOG_TRACE(
     "Repair: n_violated_cstr %d total_violation %f", h_n_violated_cstr, total_violation);
   return total_violation;
@@ -103,10 +211,13 @@ template <typename i_t, typename f_t>
 i_t bounds_repair_t<i_t, f_t>::get_random_cstr()
 {
   std::uniform_int_distribution<> dist(0, h_n_violated_cstr - 1);
-  // Generate random number
-  i_t random_number = dist(gen);
-  i_t cstr_idx      = violated_constraints.element(random_number, handle_ptr->get_stream());
+  i_t random_index = dist(gen);
+  i_t cstr_idx     = violated_constraints.element(random_index, handle_ptr->get_stream());
   CUOPT_LOG_TRACE("Repair: selected random cstr %d", cstr_idx);
+  CUOPT_DETERMINISM_LOG("Repair cstr select: random_index=%d cstr=%d n_violated=%d",
+                        random_index,
+                        cstr_idx,
+                        h_n_violated_cstr);
   return cstr_idx;
 }
 
@@ -190,7 +301,14 @@ i_t bounds_repair_t<i_t, f_t>::compute_best_shift(problem_t<i_t, f_t>& problem,
       }
     });
   handle_ptr->sync_stream();
-  return candidates.n_candidates.value(handle_ptr->get_stream());
+  i_t n_candidates = candidates.n_candidates.value(handle_ptr->get_stream());
+
+  // Sort by (variable_index, bound_shift) to ensure fully deterministic ordering
+  auto key_iter = thrust::make_zip_iterator(
+    thrust::make_tuple(candidates.variable_index.begin(), candidates.bound_shift.begin()));
+  thrust::sort(handle_ptr->get_thrust_policy(), key_iter, key_iter + n_candidates);
+
+  return n_candidates;
 }
 
 template <typename i_t, typename f_t>
@@ -377,36 +495,100 @@ void bounds_repair_t<i_t, f_t>::apply_move(problem_t<i_t, f_t>& problem,
 template <typename i_t, typename f_t>
 bool bounds_repair_t<i_t, f_t>::repair_problem(problem_t<i_t, f_t>& problem,
                                                problem_t<i_t, f_t>& original_problem,
-                                               timer_t timer_,
+                                               work_limit_timer_t& timer_,
                                                const raft::handle_t* handle_ptr_)
 {
   CUOPT_LOG_DEBUG("Running bounds repair");
   handle_ptr = handle_ptr_;
   timer      = timer_;
+  cuopt_assert(timer.deterministic == problem.deterministic,
+               "Bounds repair timer/problem determinism mismatch");
   resize(problem);
   reset();
   best_violation = get_ii_violation(problem);
   curr_violation = best_violation;
   best_bounds.update_from(problem, handle_ptr);
-  i_t no_candidate_in_a_row = 0;
-  while (h_n_violated_cstr > 0) {
+  double total_estimated_work = 0.0;
+  i_t repair_iterations       = 0;
+  if (timer.deterministic) {
+    const double setup_work = estimate_bounds_repair_setup_work(problem);
+    record_estimated_work(timer, &total_estimated_work, setup_work);
+    CUOPT_DETERMINISM_LOG(
+      "Repair entry: pb_hash=0x%x bounds_hash=0x%x violated_hash=0x%x n_violated=%d "
+      "best_violation=%.6f timer_rem=%.6f total_work=%.6f setup_work=%.6f",
+      problem.get_fingerprint(),
+      detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()),
+      detail::compute_hash(make_span(violated_constraints, 0, h_n_violated_cstr),
+                           handle_ptr->get_stream()),
+      h_n_violated_cstr,
+      best_violation,
+      timer.remaining_time(),
+      total_estimated_work,
+      setup_work);
+  }
+  i_t no_candidate_in_a_row                = 0;
+  [[maybe_unused]] const char* exit_reason = "FEASIBLE";
+  // TODO: do this better; the deterministic-mode iteration cap below is hard-coded
+  i_t iter_limit = std::numeric_limits<i_t>::max();
+  if (timer.deterministic) { iter_limit = 20; }
+  while (h_n_violated_cstr > 0 && iter_limit-- > 0) {
+    repair_iterations++;
     CUOPT_LOG_TRACE("Bounds repair loop: n_violated %d best_violation %f curr_violation %f",
                     h_n_violated_cstr,
                     best_violation,
                     curr_violation);
-    if (timer.check_time_limit()) { break; }
+    if (timer.deterministic) {
+      CUOPT_DETERMINISM_LOG(
+        "Repair iter entry: iter=%d pb_hash=0x%x bounds_hash=0x%x violated_hash=0x%x "
+        "n_violated=%d best_violation=%.6f curr_violation=%.6f timer_rem=%.6f total_work=%.6f",
+        repair_iterations,
+        problem.get_fingerprint(),
+        detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()),
+        detail::compute_hash(make_span(violated_constraints, 0, h_n_violated_cstr),
+                             handle_ptr->get_stream()),
+        h_n_violated_cstr,
+        best_violation,
+        curr_violation,
+        timer.remaining_time(),
+        total_estimated_work);
+    }
+    if (timer.check_time_limit()) {
+      exit_reason = "TIME_LIMIT";
+      break;
+    }
     i_t curr_cstr = get_random_cstr();
     // best way would be to check a variable cycle, but this is easier and more performant
     bool is_cycle = detect_cycle(curr_cstr);
     if (is_cycle) { CUOPT_LOG_DEBUG("Repair: cycle detected at cstr %d", curr_cstr); }
     // in parallel compute the best shift and best respective damage
-    i_t n_candidates = compute_best_shift(problem, original_problem, curr_cstr);
+    i_t n_candidates  = compute_best_shift(problem, original_problem, curr_cstr);
+    double shift_work = 0.0;
+    if (timer.deterministic) {
+      shift_work = estimate_bounds_repair_shift_work(problem, curr_cstr, n_candidates, is_cycle);
+      record_estimated_work(timer, &total_estimated_work, shift_work);
+      CUOPT_DETERMINISM_LOG(
+        "Repair iter shift: iter=%d curr_cstr=%d cycle=%d n_candidates=%d cand_var_hash=0x%x "
+        "cand_shift_hash=0x%x singleton_moved=%d shift_work=%.6f timer_rem=%.6f total_work=%.6f",
+        repair_iterations,
+        curr_cstr,
+        (int)is_cycle,
+        n_candidates,
+        detail::compute_hash(make_span(candidates.variable_index, 0, n_candidates),
+                             handle_ptr->get_stream()),
+        detail::compute_hash(make_span(candidates.bound_shift, 0, n_candidates),
+                             handle_ptr->get_stream()),
+        (int)candidates.at_least_one_singleton_moved.value(handle_ptr->get_stream()),
+        shift_work,
+        timer.remaining_time(),
+        total_estimated_work);
+    }
     // if no candidate is there continue with another constraint
     if (n_candidates == 0) {
       CUOPT_LOG_DEBUG("Repair: no candidate var found for cstr %d", curr_cstr);
       if (no_candidate_in_a_row++ == 10 || h_n_violated_cstr == 1) {
         CUOPT_LOG_DEBUG("Repair: no candidate var found on last violated constraint %d. Exiting...",
                         curr_cstr);
+        exit_reason = "NO_CANDIDATE";
         break;
       }
       continue;
@@ -418,17 +600,36 @@ bool bounds_repair_t<i_t, f_t>::repair_problem(problem_t<i_t, f_t>& problem,
     // get the best damage
     i_t best_cstr_delta = candidates.cstr_delta.front_element(handle_ptr->get_stream());
     f_t best_damage     = candidates.damage.front_element(handle_ptr->get_stream());
+    double damage_work  = 0.0;
+    if (timer.deterministic) {
+      damage_work = estimate_bounds_repair_damage_work(problem, n_candidates);
+      record_estimated_work(timer, &total_estimated_work, damage_work);
+      CUOPT_DETERMINISM_LOG(
+        "Repair iter damage: iter=%d curr_cstr=%d cand_cdelta_hash=0x%x cand_damage_hash=0x%x "
+        "best_cstr_delta=%d best_damage=%.6f damage_work=%.6f timer_rem=%.6f total_work=%.6f",
+        repair_iterations,
+        curr_cstr,
+        detail::compute_hash(make_span(candidates.cstr_delta, 0, n_candidates),
+                             handle_ptr->get_stream()),
+        detail::compute_hash(make_span(candidates.damage, 0, n_candidates),
+                             handle_ptr->get_stream()),
+        best_cstr_delta,
+        best_damage,
+        damage_work,
+        timer.remaining_time(),
+        total_estimated_work);
+    }
     CUOPT_LOG_TRACE(
       "Repair: best_cstr_delta value %d best_damage %f", best_cstr_delta, best_damage);
     i_t best_move_idx;
-    // if the best damage is positive and we are within the prop (paper uses 0.75)
-    if ((best_cstr_delta > 0 && rand_double(0, 1, gen) < p) || is_cycle) {
-      // pick a random move from the candidate list
+    i_t n_of_eligible_candidates = -1;
+
+    const double rand_u01         = rand_double(0, 1, gen);
+    const bool took_random_branch = (best_cstr_delta > 0 && rand_u01 < p) || is_cycle;
+    if (took_random_branch) {
       best_move_idx = get_random_idx(n_candidates);
     } else {
-      // filter the moves with best_damage(it can be zero or not) and then pick a candidate among
-      // them
-      i_t n_of_eligible_candidates =
+      n_of_eligible_candidates =
         find_cutoff_index(candidates, best_cstr_delta, best_damage, n_candidates);
       cuopt_assert(n_of_eligible_candidates > 0, "");
       CUOPT_LOG_TRACE("n_of_eligible_candidates %d", n_of_eligible_candidates);
@@ -440,22 +641,79 @@ bool bounds_repair_t<i_t, f_t>::repair_problem(problem_t<i_t, f_t>& problem,
                     candidates.bound_shift.element(best_move_idx, handle_ptr->get_stream()),
                     candidates.cstr_delta.element(best_move_idx, handle_ptr->get_stream()),
                     candidates.damage.element(best_move_idx, handle_ptr->get_stream()));
+    if (timer.deterministic) {
+      CUOPT_DETERMINISM_LOG(
+        "Repair iter select: iter=%d cycle=%d rand_u01=%.12f took_random=%d "
+        "cutoff_idx=%d n_eligible=%d chosen_idx=%d chosen_var=%d chosen_shift=%.6f "
+        "chosen_cdelta=%d chosen_damage=%.6f",
+        repair_iterations,
+        (int)is_cycle,
+        rand_u01,
+        (int)took_random_branch,
+        (int)(took_random_branch ? -1 : n_of_eligible_candidates),
+        (int)(took_random_branch ? n_candidates : n_of_eligible_candidates),
+        best_move_idx,
+        candidates.variable_index.element(best_move_idx, handle_ptr->get_stream()),
+        candidates.bound_shift.element(best_move_idx, handle_ptr->get_stream()),
+        candidates.cstr_delta.element(best_move_idx, handle_ptr->get_stream()),
+        candidates.damage.element(best_move_idx, handle_ptr->get_stream()));
+    }
     apply_move(problem, original_problem, best_move_idx);
     reset();
     // TODO we might optimize this to only calculate the changed constraints
-    curr_violation = get_ii_violation(problem);
+    curr_violation                = get_ii_violation(problem);
+    const bool improved_violation = curr_violation < best_violation;
+    double refresh_work           = 0.0;
+    if (timer.deterministic) {
+      refresh_work = bounds_repair_move_base_work +
+                     estimate_bounds_repair_violation_refresh_work(problem, improved_violation);
+      record_estimated_work(timer, &total_estimated_work, refresh_work);
+      CUOPT_DETERMINISM_LOG(
+        "Repair iter post: iter=%d pb_hash=0x%x bounds_hash=0x%x violated_hash=0x%x "
+        "n_violated=%d curr_violation=%.6f improved=%d refresh_work=%.6f total_work=%.6f "
+        "timer_rem=%.6f",
+        repair_iterations,
+        problem.get_fingerprint(),
+        detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()),
+        detail::compute_hash(make_span(violated_constraints, 0, h_n_violated_cstr),
+                             handle_ptr->get_stream()),
+        h_n_violated_cstr,
+        curr_violation,
+        (int)improved_violation,
+        refresh_work,
+        total_estimated_work,
+        timer.remaining_time());
+      CUOPT_DETERMINISM_LOG(
+        "Repair iter work: cstr=%d candidates=%d cycle=%d improved=%d total=%.6f",
+        curr_cstr,
+        n_candidates,
+        (int)is_cycle,
+        (int)improved_violation,
+        total_estimated_work);
+    }
 
-    if (curr_violation < best_violation) {
+    if (improved_violation) {
       best_violation = curr_violation;
       // update best bounds
       best_bounds.update_from(problem, handle_ptr);
     }
   }
-  // fill the problem with the best bounds
+  if (h_n_violated_cstr > 0 && iter_limit < 0) { exit_reason = "ITER_LIMIT"; }
   bool feasible = h_n_violated_cstr == 0;
-  // copy best bounds into problem
   best_bounds.update_to(problem, handle_ptr);
   CUOPT_LOG_DEBUG("Repair: returning with feas: %d vio %f", feasible, best_violation);
+  if (timer.deterministic) {
+    CUOPT_DETERMINISM_LOG(
+      "Repair exit: reason=%s iters=%d feasible=%d n_violated=%d best_violation=%.6f "
+      "total_work=%.6f timer_rem=%.6f",
+      exit_reason,
+      repair_iterations,
+      (int)feasible,
+      h_n_violated_cstr,
+      best_violation,
+      total_estimated_work,
+      timer.remaining_time());
+  }
   return feasible;
 }
 
diff --git a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh
index 29161c5d25..8fd9d601a5 100644
--- a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh
+++ b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh
@@ -13,6 +13,9 @@
 #include <utilities/copy_helpers.hpp>
 #include <utilities/timer.hpp>
 
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
+
 namespace cuopt::linear_programming::detail {
 
 // from the paper, probability of choosing random candidate= noise parameter
@@ -120,7 +123,7 @@ class bounds_repair_t {
   void compute_damages(problem_t<i_t, f_t>& problem, i_t n_candidates);
   bool repair_problem(problem_t<i_t, f_t>& problem,
                       problem_t<i_t, f_t>& original_problem,
-                      timer_t timer_,
+                      work_limit_timer_t& timer_,
                       const raft::handle_t* handle_ptr_);
   void apply_move(problem_t<i_t, f_t>& problem,
                   problem_t<i_t, f_t>& original_problem,
@@ -144,7 +147,7 @@ class bounds_repair_t {
   i_t h_n_violated_cstr;
   const raft::handle_t* handle_ptr;
   std::mt19937 gen;
-  timer_t timer{0.};
+  work_limit_timer_t timer;
   std::vector<i_t> cycle_vector;
   i_t cycle_write_pos = 0;
 };
diff --git a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu
index 8db4d7ae85..41ab0f3e91 100644
--- a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu
+++ b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu
@@ -5,6 +5,7 @@
  */
 /* clang-format on */
 
+#include <mip_heuristics/logger.cuh>
 #include <mip_heuristics/mip_constants.hpp>
 #include <mip_heuristics/relaxed_lp/relaxed_lp.cuh>
 #include <mip_heuristics/utils.cuh>
@@ -16,8 +17,12 @@
 #include <thrust/copy.h>
 #include <thrust/gather.h>
 #include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/partition.h>
 #include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+#include <chrono>
 
 namespace cuopt::linear_programming::detail {
 
@@ -39,7 +44,8 @@ constraint_prop_t<i_t, f_t>::constraint_prop_t(mip_solver_context_t<i_t, f_t>& c
     ub_restore(context.problem_ptr->n_variables, context.problem_ptr->handle_ptr->get_stream()),
     assignment_restore(context.problem_ptr->n_variables,
                        context.problem_ptr->handle_ptr->get_stream()),
-    rng(cuopt::seed_generator::get_seed(), 0, 0)
+    rng(cuopt::seed_generator::get_seed(), 0, 0),
+    max_timer(0.0, cuopt::termination_checker_t::root_tag_t{})
 {
 }
 
@@ -725,6 +731,10 @@ void constraint_prop_t<i_t, f_t>::update_host_assignment(const solution_t<i_t, f
              sol.assignment.data(),
              sol.problem_ptr->n_variables,
              sol.handle_ptr->get_stream());
+  sol.handle_ptr->sync_stream();
+  CUOPT_DETERMINISM_LOG(
+    "update_host_assignment: device_hash=0x%x",
+    detail::compute_hash(make_span(sol.assignment), sol.handle_ptr->get_stream()));
 }
 
 template <typename i_t, typename f_t>
@@ -755,7 +765,7 @@ void constraint_prop_t<i_t, f_t>::restore_original_bounds_on_unfixed(
 template <typename i_t, typename f_t>
 bool constraint_prop_t<i_t, f_t>::run_repair_procedure(problem_t<i_t, f_t>& problem,
                                                        problem_t<i_t, f_t>& original_problem,
-                                                       timer_t& timer,
+                                                       work_limit_timer_t& timer,
                                                        const raft::handle_t* handle_ptr)
 {
   // select the first probing value
@@ -765,9 +775,14 @@ bool constraint_prop_t<i_t, f_t>::run_repair_procedure(problem_t<i_t, f_t>& prob
   repair_stats.repair_attempts++;
   f_t repair_start_time                = timer.remaining_time();
   i_t n_of_repairs_needed_for_feasible = 0;
+  // TODO: do this better; the repair-loop cap for deterministic GPU heuristics is hard-coded
+  i_t iter_limit = std::numeric_limits<i_t>::max();
+  if ((this->context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) {
+    iter_limit = 100;
+  }
   do {
     n_of_repairs_needed_for_feasible++;
-    if (timer.check_time_limit()) {
+    if (timer.check_time_limit() || iter_limit-- <= 0) {
       CUOPT_LOG_DEBUG("Time limit is reached in repair loop!");
       f_t repair_end_time = timer.remaining_time();
       repair_stats.total_time_spent_on_repair += repair_start_time - repair_end_time;
@@ -775,8 +790,24 @@ bool constraint_prop_t<i_t, f_t>::run_repair_procedure(problem_t<i_t, f_t>& prob
     }
     repair_stats.total_repair_loops++;
     collapse_crossing_bounds(problem, original_problem, handle_ptr);
+    if (timer.deterministic) {
+      CUOPT_DETERMINISM_LOG(
+        "run_repair_procedure pre-repair: loop=%d bounds_hash=0x%x infeas_count=%d timer_rem=%.6f",
+        n_of_repairs_needed_for_feasible,
+        detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()),
+        bounds_update.infeas_constraints_count,
+        timer.remaining_time());
+    }
     bool bounds_repaired =
       bounds_repair.repair_problem(problem, original_problem, timer, handle_ptr);
+    if (timer.deterministic) {
+      CUOPT_DETERMINISM_LOG(
+        "run_repair_procedure post-repair: loop=%d repaired=%d bounds_hash=0x%x timer_rem=%.6f",
+        n_of_repairs_needed_for_feasible,
+        (int)bounds_repaired,
+        detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()),
+        timer.remaining_time());
+    }
     if (bounds_repaired) {
       repair_stats.intermediate_repair_success++;
       CUOPT_LOG_DEBUG("Bounds repair success, running bounds prop to verify feasibility!");
@@ -841,11 +872,15 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
   solution_t<i_t, f_t>& sol,
   solution_t<i_t, f_t>& orig_sol,
   f_t lp_run_time_after_feasible,
-  timer_t& timer,
+  work_limit_timer_t& timer,
   std::optional<std::reference_wrapper<probing_config_t<i_t, f_t>>> probing_config)
 {
   using crit_t             = termination_criterion_t;
   auto& unset_integer_vars = unset_vars;
+  CUOPT_DETERMINISM_LOG("find_integer entry: seed=%lld hash=0x%x rem=%.6f",
+                        (long long)cuopt::seed_generator::peek_seed(),
+                        sol.get_hash(),
+                        timer.remaining_time());
   std::mt19937 rng(cuopt::seed_generator::get_seed());
   lb_restore.resize(sol.problem_ptr->n_variables, sol.handle_ptr->get_stream());
   ub_restore.resize(sol.problem_ptr->n_variables, sol.handle_ptr->get_stream());
@@ -871,6 +906,7 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
                sol.problem_ptr->integer_indices.data(),
                sol.problem_ptr->n_integer_vars,
                sol.handle_ptr->get_stream());
+    CUOPT_DETERMINISM_LOG("sol hash 0x%x", sol.get_hash());
   } else {
     find_unset_integer_vars(sol, unset_integer_vars);
     sort_by_frac(sol, make_span(unset_integer_vars));
@@ -895,16 +931,17 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
     set_bounds_on_fixed_vars(sol);
   }
 
-  CUOPT_LOG_DEBUG("Bounds propagation rounding: unset vars %lu", unset_integer_vars.size());
+  CUOPT_DETERMINISM_LOG("Bounds propagation rounding: unset vars %lu", unset_integer_vars.size());
   if (unset_integer_vars.size() == 0) {
-    CUOPT_LOG_DEBUG("No integer variables provided in the bounds prop rounding");
+    CUOPT_DETERMINISM_LOG("No integer variables provided in the bounds prop rounding");
     expand_device_copy(orig_sol.assignment, sol.assignment, sol.handle_ptr->get_stream());
     cuopt_func_call(orig_sol.test_variable_bounds());
     return orig_sol.compute_feasibility();
   }
   // this is needed for the sort inside of the loop
   bool problem_ii = is_problem_ii(*sol.problem_ptr);
+  CUOPT_DETERMINISM_LOG("is problem ii %d", problem_ii);
   // if the problem is ii, run the bounds prop in the beginning
   if (problem_ii) {
     bool bounds_repaired =
       bounds_repair.repair_problem(*sol.problem_ptr, *orig_sol.problem_ptr, timer, sol.handle_ptr);
@@ -925,11 +962,16 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
     sort_by_interval_and_frac(sol, make_span(unset_integer_vars), rng);
   }
   set_host_bounds(sol);
+  CUOPT_DETERMINISM_LOG("find_integer pre-loop: seed=%lld hash=0x%x",
+                        (long long)cuopt::seed_generator::peek_seed(),
+                        sol.get_hash());
   size_t set_count               = 0;
   bool timeout_happened          = false;
   i_t n_failed_repair_iterations = 0;
   while (set_count < unset_integer_vars.size()) {
-    CUOPT_LOG_TRACE("n_set_vars %d vars to set %lu", set_count, unset_integer_vars.size());
+    CUOPT_DETERMINISM_LOG("n_set_vars %zu vars to set %lu", set_count, unset_integer_vars.size());
+    CUOPT_DETERMINISM_LOG("unset_integer_vars size %lu", unset_integer_vars.size());
+    const size_t set_count_before = set_count;
     update_host_assignment(sol);
     if (max_timer.check_time_limit()) {
       CUOPT_LOG_DEBUG("Second time limit is reached returning nearest rounding!");
@@ -954,7 +996,8 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
         bounds_prop_interval = 1;
       }
     }
-    i_t n_vars_to_set = recovery_mode ? 1 : bounds_prop_interval;
+    i_t n_vars_to_set   = recovery_mode ? 1 : bounds_prop_interval;
+    const bool did_sort = n_vars_to_set != 1;
     // if we are not at the last stage or if we are in recovery mode, don't sort
     if (n_vars_to_set != 1) {
       sort_by_implied_slack_consumption(
@@ -965,17 +1008,63 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
                unset_integer_vars.data() + set_count,
                n_vars_to_set,
                sol.handle_ptr->get_stream());
+    sol.handle_ptr->sync_stream();
     auto var_probe_vals =
       generate_bulk_rounding_vector(sol, orig_sol, host_vars_to_set, probing_config);
+    if (timer.deterministic) {
+      const auto& vids = std::get<0>(var_probe_vals);
+      const auto& fp   = std::get<1>(var_probe_vals);
+      const auto& sp   = std::get<2>(var_probe_vals);
+      std::string probe_str;
+      for (size_t k = 0; k < std::min(vids.size(), (size_t)8); ++k) {
+        char buf[128];
+        snprintf(buf, sizeof(buf), " (%d,%.4f,%.4f)", vids[k], fp[k], sp[k]);
+        probe_str += buf;
+      }
+      CUOPT_DETERMINISM_LOG(
+        "find_integer loop: set_count=%zu n_vars_to_set=%d seed=%lld probes=[%s]",
+        set_count,
+        n_vars_to_set,
+        (long long)cuopt::seed_generator::peek_seed(),
+        probe_str.c_str());
+    }
     probe(
       sol, orig_sol.problem_ptr, var_probe_vals, &set_count, unset_integer_vars, probing_config);
+    CUOPT_DETERMINISM_LOG("find_integer post-probe: seed=%lld set_count=%zu hash=0x%x",
+                          (long long)cuopt::seed_generator::peek_seed(),
+                          set_count,
+                          sol.get_hash());
+    [[maybe_unused]] bool repair_attempted = false;
+    bool bounds_repaired                   = false;
+    i_t n_fixed_vars                       = 0;
     if (!(n_failed_repair_iterations >= max_n_failed_repair_iterations) && rounding_ii &&
         !timeout_happened) {
-      timer_t repair_timer{std::min(timer.remaining_time() / 5, timer.elapsed_time() / 3)};
+      // timer_t repair_timer{std::min(timer.remaining_time() / 5, timer.elapsed_time() / 3)};
+      work_limit_timer_t repair_timer(
+        context.gpu_heur_loop, timer.remaining_time() / 5, *context.termination);
       save_bounds(sol);
-      // update bounds and run repair procedure
+      if (timer.deterministic) {
+        CUOPT_DETERMINISM_LOG(
+          "find_integer pre-repair: bounds_hash=0x%x assignment_hash=0x%x infeas_count=%d "
+          "timer_rem=%.6f",
+          detail::compute_hash(make_span(sol.problem_ptr->variable_bounds),
+                               sol.handle_ptr->get_stream()),
+          detail::compute_hash(make_span(sol.assignment), sol.handle_ptr->get_stream()),
+          bounds_update.infeas_constraints_count,
+          timer.remaining_time());
+      }
       bool bounds_repaired =
         run_repair_procedure(*sol.problem_ptr, *orig_sol.problem_ptr, repair_timer, sol.handle_ptr);
+      if (timer.deterministic) {
+        CUOPT_DETERMINISM_LOG(
+          "find_integer post-repair: repaired=%d bounds_hash=0x%x assignment_hash=0x%x "
+          "timer_rem=%.6f",
+          (int)bounds_repaired,
+          detail::compute_hash(make_span(sol.problem_ptr->variable_bounds),
+                               sol.handle_ptr->get_stream()),
+          detail::compute_hash(make_span(sol.assignment), sol.handle_ptr->get_stream()),
+          timer.remaining_time());
+      }
       if (!bounds_repaired) {
         restore_bounds(sol);
         n_failed_repair_iterations++;
@@ -998,7 +1087,7 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
                                      make_span(sol.problem_ptr->variable_bounds),
                                      make_span(orig_sol.problem_ptr->variable_bounds),
                                      make_span(sol.assignment)});
-        i_t n_fixed_vars = (iter - (unset_vars.begin() + set_count));
+        n_fixed_vars = (iter - (unset_vars.begin() + set_count));
         CUOPT_LOG_TRACE("After repair procedure, number of additional fixed vars %d", n_fixed_vars);
         set_count += n_fixed_vars;
       }
@@ -1026,7 +1115,7 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
     // which is the unchanged problem bounds
     multi_probe.update_host_bounds(sol.handle_ptr, make_span(sol.problem_ptr->variable_bounds));
   }
-  CUOPT_LOG_DEBUG(
+  CUOPT_DETERMINISM_LOG(
     "Bounds propagation rounding end: ii constraint count first buffer %d, second buffer %d",
     multi_probe.infeas_constraints_count_0,
     multi_probe.infeas_constraints_count_1);
@@ -1038,7 +1127,12 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
        multi_probe.infeas_constraints_count_1 == 0) &&
       !timeout_happened && lp_run_time_after_feasible > 0) {
     relaxed_lp_settings_t lp_settings;
-    lp_settings.time_limit            = lp_run_time_after_feasible;
+    lp_settings.time_limit = lp_run_time_after_feasible;
+    if (timer.deterministic) {
+      lp_settings.work_limit   = lp_settings.time_limit;
+      lp_settings.work_context = timer.work_context;
+      cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context");
+    }
     lp_settings.tolerance             = orig_sol.problem_ptr->tolerances.absolute_tolerance;
     lp_settings.save_state            = false;
     lp_settings.return_first_feasible = true;
@@ -1050,6 +1144,10 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
   }
   bool res_feasible = orig_sol.compute_feasibility();
   orig_sol.handle_ptr->sync_stream();
+  CUOPT_DETERMINISM_LOG("find_integer exit: seed=%lld feasible=%d hash=0x%x",
+                        (long long)cuopt::seed_generator::peek_seed(),
+                        (int)res_feasible,
+                        orig_sol.get_hash());
   return res_feasible;
 }
 
@@ -1057,11 +1155,13 @@ template <typename i_t, typename f_t>
 bool constraint_prop_t<i_t, f_t>::apply_round(
   solution_t<i_t, f_t>& sol,
   f_t lp_run_time_after_feasible,
-  timer_t& timer,
+  work_limit_timer_t& timer,
   std::optional<std::reference_wrapper<probing_config_t<i_t, f_t>>> probing_config)
 {
   raft::common::nvtx::range fun_scope("constraint prop round");
-  max_timer = timer_t{max_time_for_bounds_prop};
+
+  sol.compute_feasibility();
+  max_timer = work_limit_timer_t{context.gpu_heur_loop, max_time_for_bounds_prop, timer};
   if (check_brute_force_rounding(sol)) { return true; }
   recovery_mode      = false;
   rounding_ii        = false;
@@ -1076,9 +1176,9 @@ bool constraint_prop_t<i_t, f_t>::apply_round(
   f_t bounds_prop_end_time = max_timer.remaining_time();
   repair_stats.total_time_spent_on_bounds_prop += bounds_prop_start_time - bounds_prop_end_time;
 
-  CUOPT_LOG_DEBUG(
-    "repair_success %lu repair_attempts %lu intermediate_repair_success %lu total_repair_loops %lu "
-    "total_time_spent_on_repair %f total_time_spent_bounds_prop_after_repair %f "
+  CUOPT_DETERMINISM_LOG(
+    "repair_success %lu repair_attempts %lu intermediate_repair_success %lu total_repair_loops "
+    "%lu total_time_spent_on_repair %f total_time_spent_bounds_prop_after_repair %f "
     "total_time_spent_on_bounds_prop %f",
     repair_stats.repair_success,
     repair_stats.repair_attempts,
@@ -1229,6 +1329,13 @@ bool constraint_prop_t<i_t, f_t>::probe(
   }
   selected_update = 0;
   if (first_bounds_update_ii) { selected_update = 1; }
+  CUOPT_DETERMINISM_LOG(
+    "probe result: infeas_0=%d infeas_1=%d selected_update=%d recovery=%d rounding_ii=%d",
+    multi_probe.infeas_constraints_count_0,
+    multi_probe.infeas_constraints_count_1,
+    selected_update,
+    (int)recovery_mode,
+    (int)rounding_ii);
   // if we are doing single rounding
   if (probing_config.has_value() && probing_config.value().get().use_balanced_probing) {
     cuopt_assert(std::get<0>(var_probe_vals).size() == 1,
diff --git a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh
index 2c609228e8..7ad4253cc4 100644
--- a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh
+++ b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh
@@ -43,7 +43,7 @@ struct constraint_prop_t {
   constraint_prop_t(mip_solver_context_t<i_t, f_t>& context);
   bool apply_round(solution_t<i_t, f_t>& sol,
                    f_t lp_run_time_after_feasible,
-                   timer_t& timer,
+                   work_limit_timer_t& timer,
                    std::optional<std::reference_wrapper<probing_config_t<i_t, f_t>>>
                      probing_config = std::nullopt);
   void sort_by_implied_slack_consumption(solution_t<i_t, f_t>& sol,
@@ -56,7 +56,7 @@ struct constraint_prop_t {
   bool find_integer(solution_t<i_t, f_t>& sol,
                     solution_t<i_t, f_t>& orig_sol,
                     f_t lp_run_time_after_feasible,
-                    timer_t& timer,
+                    work_limit_timer_t& timer,
                     std::optional<std::reference_wrapper<probing_config_t<i_t, f_t>>>
                       probing_config = std::nullopt);
   void find_set_integer_vars(solution_t<i_t, f_t>& sol, rmm::device_uvector<i_t>& set_vars);
@@ -121,7 +121,7 @@ struct constraint_prop_t {
                                           const raft::handle_t* handle_ptr);
   bool run_repair_procedure(problem_t<i_t, f_t>& problem,
                             problem_t<i_t, f_t>& original_problem,
-                            timer_t& timer,
+                            work_limit_timer_t& timer,
                             const raft::handle_t* handle_ptr);
   bool handle_fixed_vars(
     solution_t<i_t, f_t>& sol,
@@ -149,7 +149,7 @@ struct constraint_prop_t {
   i_t bounds_prop_interval           = 1;
   i_t n_iter_in_recovery             = 0;
   i_t max_n_failed_repair_iterations = 1;
-  timer_t max_timer{0.};
+  work_limit_timer_t max_timer;
   bool use_probing_cache = true;
   static repair_stats_t repair_stats;
   bool single_rounding_only = false;
diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu
index 7d074aea5e..612ed8160b 100644
--- a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu
+++ b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu
@@ -8,8 +8,10 @@
 #include "lb_bounds_repair.cuh"
 
 #include <thrust/copy.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/partition.h>
 #include <thrust/sort.h>
+#include <thrust/tuple.h>
 #include <mip_heuristics/logger.cuh>
 #include <mip_heuristics/mip_constants.hpp>
 #include <utilities/seed_generator.cuh>
@@ -26,7 +28,8 @@ lb_bounds_repair_t<i_t, f_t>::lb_bounds_repair_t(const raft::handle_t* handle_pt
     violated_cstr_map(0, handle_ptr->get_stream()),
     total_vio(handle_ptr->get_stream()),
     gen(cuopt::seed_generator::get_seed()),
-    cycle_vector(MAX_CYCLE_SEQUENCE, -1)
+    cycle_vector(MAX_CYCLE_SEQUENCE, -1),
+    timer(0.0, cuopt::termination_checker_t::root_tag_t{})
 {
 }
 
@@ -68,8 +71,7 @@ std::tuple<f_t, i_t> lb_bounds_repair_t<i_t, f_t>::get_ii_violation(
      constraint_upper_bounds = problem.constraint_upper_bounds,
      cnst_slack              = make_span_2(lb_bound_presolve.cnst_slack),
      cstr_violations_up      = cstr_violations_up.data(),
-     cstr_violations_down    = cstr_violations_down.data(),
-     total_vio               = total_vio.data()] __device__(i_t cstr_idx) {
+     cstr_violations_down    = cstr_violations_down.data()] __device__(i_t cstr_idx) {
       f_t cnst_lb = constraint_lower_bounds[cstr_idx];
       f_t cnst_ub = constraint_upper_bounds[cstr_idx];
       f_t2 slack  = cnst_slack[cstr_idx];
@@ -80,7 +82,6 @@ std::tuple<f_t, i_t> lb_bounds_repair_t<i_t, f_t>::get_ii_violation(
       f_t violation                = max(curr_cstr_violation_up, curr_cstr_violation_down);
       if (violation >= ROUNDOFF_TOLERANCE) {
         violated_cstr_map[cstr_idx] = 1;
-        atomicAdd(total_vio, violation);
       } else {
         violated_cstr_map[cstr_idx] = 0;
       }
@@ -94,7 +95,18 @@ std::tuple<f_t, i_t> lb_bounds_repair_t<i_t, f_t>::get_ii_violation(
                               violated_constraints.data(),
                               cuda::std::identity{});
   i_t n_violated_cstr = iter - violated_constraints.data();
-  f_t total_violation = total_vio.value(handle_ptr->get_stream());
+  // Use deterministic reduction instead of non-deterministic atomicAdd
+  f_t total_violation = thrust::transform_reduce(
+    handle_ptr->get_thrust_policy(),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(0) + problem.n_constraints,
+    [cstr_violations_up   = cstr_violations_up.data(),
+     cstr_violations_down = cstr_violations_down.data()] __device__(i_t cstr_idx) -> f_t {
+      auto violation = max(cstr_violations_up[cstr_idx], cstr_violations_down[cstr_idx]);
+      return violation >= ROUNDOFF_TOLERANCE ? violation : 0.;
+    },
+    (f_t)0,
+    thrust::plus<f_t>());
   CUOPT_LOG_TRACE(
     "Repair: n_violated_cstr %d total_violation %f", n_violated_cstr, total_violation);
   return std::make_tuple(total_violation, n_violated_cstr);
@@ -397,10 +409,11 @@ bool lb_bounds_repair_t<i_t, f_t>::repair_problem(
   load_balanced_problem_t<i_t, f_t>* problem,
   load_balanced_bounds_presolve_t<i_t, f_t>& lb_bound_presolve,
   problem_t<i_t, f_t>& original_problem,
-  timer_t timer_,
+  work_limit_timer_t& timer_,
   const raft::handle_t* handle_ptr_)
 {
-  CUOPT_LOG_DEBUG("Running bounds repair");
+  nvtx::range fun_scope("LB repair_problem");
+  CUOPT_LOG_DEBUG("LB Running bounds repair");
   handle_ptr = handle_ptr_;
   timer      = timer_;
   resize(*problem);
diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh
index 0b549c684d..068c0d57bf 100644
--- a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh
+++ b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh
@@ -58,7 +58,7 @@ class lb_bounds_repair_t {
   bool repair_problem(load_balanced_problem_t<i_t, f_t>* problem,
                       load_balanced_bounds_presolve_t<i_t, f_t>& lb_bound_presolve,
                       problem_t<i_t, f_t>& original_problem,
-                      timer_t timer_,
+                      work_limit_timer_t& timer_,
                       const raft::handle_t* handle_ptr_);
   void apply_move(load_balanced_problem_t<i_t, f_t>* problem,
                   problem_t<i_t, f_t>& original_problem,
@@ -82,7 +82,7 @@ class lb_bounds_repair_t {
   i_t h_n_violated_cstr;
   const raft::handle_t* handle_ptr;
   std::mt19937 gen;
-  timer_t timer{0.};
+  work_limit_timer_t timer;
   std::vector<i_t> cycle_vector;
   i_t cycle_write_pos = 0;
 };
diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu
index bb72834ab4..d8e3bcc040 100644
--- a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu
+++ b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu
@@ -33,7 +33,8 @@ lb_constraint_prop_t<i_t, f_t>::lb_constraint_prop_t(mip_solver_context_t<i_t, f
                    context.problem_ptr->handle_ptr->get_stream()),
     assignment_restore(context.problem_ptr->n_variables,
                        context.problem_ptr->handle_ptr->get_stream()),
-    rng(cuopt::seed_generator::get_seed(), 0, 0)
+    rng(cuopt::seed_generator::get_seed(), 0, 0),
+    max_timer(0.0, cuopt::termination_checker_t::root_tag_t{})
 {
 }
 
@@ -700,14 +701,15 @@ template <typename i_t, typename f_t>
 bool lb_constraint_prop_t<i_t, f_t>::apply_round(
   solution_t<i_t, f_t>& sol,
   f_t lp_run_time_after_feasible,
-  timer_t& timer,
+  work_limit_timer_t& timer,
   std::optional<std::vector<thrust::pair<f_t, f_t>>> probing_candidates)
 {
   raft::common::nvtx::range fun_scope("constraint prop round");
 
   // this is second timer that can continue but without recovery mode
   const f_t max_time_for_bounds_prop = 5.;
-  max_timer                          = timer_t{max_time_for_bounds_prop};
+  max_timer =
+    work_limit_timer_t{context.gpu_heur_loop, max_time_for_bounds_prop, *context.termination};
   if (check_brute_force_rounding(sol)) { return true; }
   recovery_mode      = false;
   rounding_ii        = false;
diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh
index 20e28e7cb9..6fb88467ab 100644
--- a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh
+++ b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh
@@ -23,7 +23,7 @@ struct lb_constraint_prop_t {
   bool apply_round(
     solution_t<i_t, f_t>& sol,
     f_t lp_run_time_after_feasible,
-    timer_t& timer,
+    work_limit_timer_t& timer,
     std::optional<std::vector<thrust::pair<f_t, f_t>>> probing_candidates = std::nullopt);
   void sort_by_implied_slack_consumption(
     problem_t<i_t, f_t>& original_problem,
@@ -40,7 +40,7 @@ struct lb_constraint_prop_t {
                     load_balanced_bounds_presolve_t<i_t, f_t>& lb_bounds_update,
                     solution_t<i_t, f_t>& orig_sol,
                     f_t lp_run_time_after_feasible,
-                    timer_t& timer,
+                    work_limit_timer_t& timer,
                     std::optional<std::vector<thrust::pair<f_t, f_t>>> probing_candidates);
   std::tuple<f_t, f_t, f_t> probing_values(
     load_balanced_bounds_presolve_t<i_t, f_t>& lb_bounds_update,
@@ -83,7 +83,7 @@ struct lb_constraint_prop_t {
   bool run_repair_procedure(load_balanced_problem_t<i_t, f_t>* problem,
                             load_balanced_bounds_presolve_t<i_t, f_t>& lb_bounds_update,
                             problem_t<i_t, f_t>& original_problem,
-                            timer_t& timer,
+                            work_limit_timer_t& timer,
                             const raft::handle_t* handle_ptr);
 
   mip_solver_context_t<i_t, f_t>& context;
@@ -100,7 +100,7 @@ struct lb_constraint_prop_t {
   bool rounding_ii         = false;
   i_t bounds_prop_interval = 1;
   i_t n_iter_in_recovery   = 0;
-  timer_t max_timer{0.};
+  work_limit_timer_t max_timer;
   bool use_probing_cache = true;
 
   size_t repair_attempts                           = 0;
diff --git a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu
index 4f3a015a6c..9a2bf317b7 100644
--- a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu
+++ b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu
@@ -8,8 +8,10 @@
 #include "simple_rounding.cuh"
 #include "simple_rounding_kernels.cuh"
 
+#include <mip_heuristics/logger.cuh>
 #include <mip_heuristics/mip_constants.hpp>
 #include <utilities/copy_helpers.hpp>
+#include <utilities/determinism_log.hpp>
 #include <utilities/seed_generator.cuh>
 
 #include <thrust/copy.h>
@@ -35,6 +37,8 @@ bool check_brute_force_rounding(solution_t<i_t, f_t>& solution)
   if (n_integers_to_round == 0) { return solution.compute_feasibility(); }
   constexpr i_t brute_force_rounding_threshold = 8;
   if (n_integers_to_round <= brute_force_rounding_threshold) {
+    CUOPT_DETERMINISM_LOG(
+      "Brute-force rounding: n_to_round=%d hash=0x%x", n_integers_to_round, solution.get_hash());
     solution.compute_constraints();
     i_t n_configs = pow(2, n_integers_to_round);
     i_t n_blocks  = (n_configs + TPB - 1) / TPB;
@@ -42,7 +46,8 @@ bool check_brute_force_rounding(solution_t<i_t, f_t>& solution)
     rmm::device_uvector<i_t> var_map(n_integers_to_round, solution.handle_ptr->get_stream());
     rmm::device_uvector<f_t> constraint_buf(n_configs * solution.problem_ptr->n_constraints,
                                             solution.handle_ptr->get_stream());
-    rmm::device_scalar<i_t> best_config(-1, solution.handle_ptr->get_stream());
+    rmm::device_scalar<i_t> best_config(std::numeric_limits<i_t>::max(),
+                                        solution.handle_ptr->get_stream());
     thrust::copy_if(
       solution.handle_ptr->get_thrust_policy(),
       solution.problem_ptr->integer_indices.begin(),
@@ -58,7 +63,13 @@ bool check_brute_force_rounding(solution_t<i_t, f_t>& solution)
                                                                 cuopt::make_span(var_map),
                                                                 cuopt::make_span(constraint_buf),
                                                                 best_config.data());
-    if (best_config.value(solution.handle_ptr->get_stream()) != -1) {
+    i_t best_config_val = best_config.value(solution.handle_ptr->get_stream());
+    CUOPT_DETERMINISM_LOG(
+      "Brute-force rounding: best_config=%d (max=%d) var_map_hash=0x%x",
+      best_config_val,
+      (int)std::numeric_limits<i_t>::max(),
+      detail::compute_hash(make_span(var_map), solution.handle_ptr->get_stream()));
+    if (best_config_val != std::numeric_limits<i_t>::max()) {
       CUOPT_LOG_DEBUG("Feasible found during brute force rounding!");
       // apply the feasible rounding
       apply_feasible_rounding_kernel<i_t, f_t><<<1, TPB, 0, solution.handle_ptr->get_stream()>>>(
diff --git a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh
index 2edca8fb08..a0b8468ea7 100644
--- a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh
+++ b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh
@@ -131,7 +131,7 @@ __global__ void brute_force_check_kernel(typename solution_t<i_t, f_t>::view_t s
   __shared__ i_t shbuf[raft::WarpSize];
   i_t total_feasible = raft::blockReduce(th_feasible_count, (char*)shbuf);
   if (threadIdx.x == 0) {
-    if (total_feasible == solution.problem.n_constraints) { atomicExch(best_config, config); }
+    if (total_feasible == solution.problem.n_constraints) { atomicMin(best_config, config); }
   }
 }
 
diff --git a/cpp/src/mip_heuristics/mip_constants.hpp b/cpp/src/mip_heuristics/mip_constants.hpp
index 47d3d22de4..94b511da60 100644
--- a/cpp/src/mip_heuristics/mip_constants.hpp
+++ b/cpp/src/mip_heuristics/mip_constants.hpp
@@ -13,3 +13,7 @@
 #define MIP_INSTANTIATE_DOUBLE CUOPT_INSTANTIATE_DOUBLE
 
 #define PDLP_INSTANTIATE_FLOAT 1
+
+#define BB_BASE_WORK_SCALE       1.0
+#define GPU_HEUR_BASE_WORK_SCALE 0.4
+#define CPUFJ_BASE_WORK_SCALE    1.0
diff --git a/cpp/src/mip_heuristics/presolve/bounds_presolve.cu b/cpp/src/mip_heuristics/presolve/bounds_presolve.cu
index d78f8beb16..6cc57cf153 100644
--- a/cpp/src/mip_heuristics/presolve/bounds_presolve.cu
+++ b/cpp/src/mip_heuristics/presolve/bounds_presolve.cu
@@ -11,6 +11,7 @@
 #include <thrust/count.h>
 #include <thrust/extrema.h>
 #include <thrust/functional.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
@@ -167,10 +168,14 @@ void bound_presolve_t<i_t, f_t>::set_bounds(
 
 template <typename i_t, typename f_t>
 termination_criterion_t bound_presolve_t<i_t, f_t>::bound_update_loop(problem_t<i_t, f_t>& pb,
-                                                                      timer_t timer)
+                                                                      work_limit_timer_t& timer)
 {
   termination_criterion_t criteria = termination_criterion_t::ITERATION_LIMIT;
 
+  if ((context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) {
+    settings.iteration_limit = std::min(settings.iteration_limit, 50);
+  }
+
   i_t iter;
   upd.init_changed_constraints(pb.handle_ptr);
   for (iter = 0; iter < settings.iteration_limit; ++iter) {
@@ -229,7 +234,7 @@ termination_criterion_t bound_presolve_t<i_t, f_t>::solve(problem_t<i_t, f_t>& p
                                                           i_t var_idx)
 {
   auto& handle_ptr = pb.handle_ptr;
-  timer_t timer(settings.time_limit);
+  work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination);
   copy_input_bounds(pb);
   upd.lb.set_element_async(var_idx, var_lb, handle_ptr->get_stream());
   upd.ub.set_element_async(var_idx, var_ub, handle_ptr->get_stream());
@@ -242,7 +247,7 @@ termination_criterion_t bound_presolve_t<i_t, f_t>::solve(
   const std::vector<thrust::pair<i_t, f_t>>& var_probe_val_pairs,
   bool use_host_bounds)
 {
-  timer_t timer(settings.time_limit);
+  work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination);
   auto& handle_ptr = pb.handle_ptr;
   if (use_host_bounds) {
     update_device_bounds(handle_ptr);
@@ -257,7 +262,7 @@ termination_criterion_t bound_presolve_t<i_t, f_t>::solve(
 template <typename i_t, typename f_t>
 termination_criterion_t bound_presolve_t<i_t, f_t>::solve(problem_t<i_t, f_t>& pb)
 {
-  timer_t timer(settings.time_limit);
+  work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination);
   auto& handle_ptr = pb.handle_ptr;
   copy_input_bounds(pb);
   return bound_update_loop(pb, timer);
diff --git a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh
index 8b57cc7019..dee642ba36 100644
--- a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh
+++ b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh
@@ -15,6 +15,7 @@
 #include <mip_heuristics/utils.cuh>
 
 #include <utilities/timer.hpp>
+#include <utilities/work_limit_timer.hpp>
 
 #include <thrust/pair.h>
 
@@ -60,7 +61,7 @@ class bound_presolve_t {
   void set_updated_bounds(const raft::handle_t* handle_ptr,
                           raft::device_span<f_t> output_lb,
                           raft::device_span<f_t> output_ub);
-  termination_criterion_t bound_update_loop(problem_t<i_t, f_t>& pb, timer_t timer);
+  termination_criterion_t bound_update_loop(problem_t<i_t, f_t>& pb, work_limit_timer_t& timer);
   void set_bounds(raft::device_span<f_t> var_lb,
                   raft::device_span<f_t> var_ub,
                   const std::vector<thrust::pair<i_t, f_t>>& var_probe_vals,
diff --git a/cpp/src/mip_heuristics/presolve/bounds_update_data.cu b/cpp/src/mip_heuristics/presolve/bounds_update_data.cu
index 487549aa4a..b83f474791 100644
--- a/cpp/src/mip_heuristics/presolve/bounds_update_data.cu
+++ b/cpp/src/mip_heuristics/presolve/bounds_update_data.cu
@@ -28,6 +28,17 @@ bounds_update_data_t<i_t, f_t>::bounds_update_data_t(problem_t<i_t, f_t>& proble
 template <typename i_t, typename f_t>
 void bounds_update_data_t<i_t, f_t>::resize(problem_t<i_t, f_t>& problem)
 {
+  CUOPT_LOG_DEBUG(
+    "bounds_update_data resize: nv=%d nc=%d min_act=%zu max_act=%zu lb=%zu ub=%zu "
+    "chg_c=%zu chg_v=%zu",
+    problem.n_variables,
+    problem.n_constraints,
+    min_activity.size(),
+    max_activity.size(),
+    lb.size(),
+    ub.size(),
+    changed_constraints.size(),
+    changed_variables.size());
   min_activity.resize(problem.n_constraints, problem.handle_ptr->get_stream());
   max_activity.resize(problem.n_constraints, problem.handle_ptr->get_stream());
   lb.resize(problem.n_variables, problem.handle_ptr->get_stream());
@@ -35,6 +46,35 @@ void bounds_update_data_t<i_t, f_t>::resize(problem_t<i_t, f_t>& problem)
   changed_constraints.resize(problem.n_constraints, problem.handle_ptr->get_stream());
   next_changed_constraints.resize(problem.n_constraints, problem.handle_ptr->get_stream());
   changed_variables.resize(problem.n_variables, problem.handle_ptr->get_stream());
+
+  thrust::fill(problem.handle_ptr->get_thrust_policy(),
+               min_activity.begin(),
+               min_activity.end(),
+               std::numeric_limits<f_t>::signaling_NaN());
+  thrust::fill(problem.handle_ptr->get_thrust_policy(),
+               max_activity.begin(),
+               max_activity.end(),
+               std::numeric_limits<f_t>::signaling_NaN());
+  thrust::fill(problem.handle_ptr->get_thrust_policy(),
+               lb.begin(),
+               lb.end(),
+               std::numeric_limits<f_t>::signaling_NaN());
+  thrust::fill(problem.handle_ptr->get_thrust_policy(),
+               ub.begin(),
+               ub.end(),
+               std::numeric_limits<f_t>::signaling_NaN());
+  thrust::fill(problem.handle_ptr->get_thrust_policy(),
+               changed_constraints.begin(),
+               changed_constraints.end(),
+               -1);
+  thrust::fill(problem.handle_ptr->get_thrust_policy(),
+               next_changed_constraints.begin(),
+               next_changed_constraints.end(),
+               -1);
+  thrust::fill(problem.handle_ptr->get_thrust_policy(),
+               changed_variables.begin(),
+               changed_variables.end(),
+               -1);
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
index 13412614b8..24cac7129f 100644
--- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
+++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
@@ -17,6 +17,12 @@
 #include "cusparse.h"
 
 #include <cub/cub.cuh>
+
+#include <thrust/extrema.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/tuple.h>
+
 #include "conditional_bound_strengthening.cuh"
 
 #include <unordered_set>
diff --git a/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu b/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu
index 3a6d1bce21..308230527a 100644
--- a/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu
+++ b/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu
@@ -10,7 +10,9 @@
 #include <mip_heuristics/mip_constants.hpp>
 #include <mip_heuristics/utils.cuh>
 
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/sort.h>
+#include <thrust/tuple.h>
 #include <utilities/copy_helpers.hpp>
 #include <utilities/timer.hpp>
 
@@ -309,7 +311,7 @@ inline std::vector<i_t> compute_prioritized_integer_indices(
 template <typename i_t, typename f_t>
 void compute_probing_cache(load_balanced_bounds_presolve_t<i_t, f_t>& bound_presolve,
                            load_balanced_problem_t<i_t, f_t>& problem,
-                           timer_t timer)
+                           work_limit_timer_t& timer)
 {
   // we dont want to compute the probing cache for all variables for time and computation resources
   auto priority_indices = compute_prioritized_integer_indices(bound_presolve, problem);
@@ -400,7 +402,7 @@ void compute_probing_cache(load_balanced_bounds_presolve_t<i_t, f_t>& bound_pres
   template void compute_probing_cache<int, F_TYPE>(                \
     load_balanced_bounds_presolve_t<int, F_TYPE> & bound_presolve, \
     load_balanced_problem_t<int, F_TYPE> & problem,                \
-    timer_t timer);                                                \
+    work_limit_timer_t & timer);                                   \
   template class lb_probing_cache_t<int, F_TYPE>;
 
 #if MIP_INSTANTIATE_FLOAT
diff --git a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu
index 0d16c26cae..0bf12390ca 100644
--- a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu
+++ b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu
@@ -9,6 +9,7 @@
 #include <thrust/count.h>
 #include <thrust/extrema.h>
 #include <thrust/iterator/transform_input_output_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/partition.h>
 #include <thrust/reduce.h>
@@ -526,7 +527,7 @@ bool load_balanced_bounds_presolve_t<i_t, f_t>::update_bounds_from_slack(
 
 template <typename i_t, typename f_t>
 termination_criterion_t load_balanced_bounds_presolve_t<i_t, f_t>::bound_update_loop(
-  const raft::handle_t* handle_ptr, timer_t timer)
+  const raft::handle_t* handle_ptr, work_limit_timer_t& timer)
 {
   termination_criterion_t criteria = termination_criterion_t::ITERATION_LIMIT;
 
@@ -626,7 +627,7 @@ termination_criterion_t load_balanced_bounds_presolve_t<i_t, f_t>::solve(f_t var
                                                                          f_t var_ub,
                                                                          i_t var_idx)
 {
-  timer_t timer(settings.time_limit);
+  work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination);
   auto& handle_ptr = pb->handle_ptr;
   copy_input_bounds(*pb);
   vars_bnd.set_element_async(2 * var_idx, var_lb, handle_ptr->get_stream());
@@ -638,7 +639,7 @@ template <typename i_t, typename f_t>
 termination_criterion_t load_balanced_bounds_presolve_t<i_t, f_t>::solve(
   raft::device_span<f_t> input_bounds)
 {
-  timer_t timer(settings.time_limit);
+  work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination);
   auto& handle_ptr = pb->handle_ptr;
   if (input_bounds.size() != 0) {
     raft::copy(vars_bnd.data(), input_bounds.data(), input_bounds.size(), handle_ptr->get_stream());
@@ -667,7 +668,7 @@ template <typename i_t, typename f_t>
 termination_criterion_t load_balanced_bounds_presolve_t<i_t, f_t>::solve(
   const std::vector<thrust::pair<i_t, f_t>>& var_probe_val_pairs, bool use_host_bounds)
 {
-  timer_t timer(settings.time_limit);
+  work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination);
   auto& handle_ptr = pb->handle_ptr;
   if (use_host_bounds) {
     update_device_bounds(handle_ptr);
diff --git a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh
index ff085ca962..2b9d31061e 100644
--- a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh
+++ b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh
@@ -16,6 +16,7 @@
 #include <mip_heuristics/utils.cuh>
 
 #include <utilities/timer.hpp>
+#include <utilities/work_limit_timer.hpp>
 
 #include "load_balanced_partition_helpers.cuh"
 #include "utils.cuh"
@@ -159,7 +160,8 @@ class load_balanced_bounds_presolve_t {
   void calculate_constraint_slack_iter(const raft::handle_t* handle_ptr);
   bool update_bounds_from_slack(const raft::handle_t* handle_ptr);
 
-  termination_criterion_t bound_update_loop(const raft::handle_t* handle_ptr, timer_t timer);
+  termination_criterion_t bound_update_loop(const raft::handle_t* handle_ptr,
+                                            work_limit_timer_t& timer);
   bool calculate_infeasible_redundant_constraints(const raft::handle_t* handle_ptr);
 
   // void calculate_constraint_slack_on_problem_bounds();
diff --git a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh
index cbcd91a7d7..f276840bdf 100644
--- a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh
+++ b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -10,6 +10,7 @@
 #include "load_balanced_bounds_presolve_kernels.cuh"
 #include "load_balanced_partition_helpers.cuh"
 
+#include <thrust/extrema.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/src/mip_heuristics/presolve/multi_probe.cu b/cpp/src/mip_heuristics/presolve/multi_probe.cu
index 7789b3281b..6a2e88a1b2 100644
--- a/cpp/src/mip_heuristics/presolve/multi_probe.cu
+++ b/cpp/src/mip_heuristics/presolve/multi_probe.cu
@@ -5,10 +5,13 @@
  */
 /* clang-format on */
 
+#include <mip_heuristics/logger.cuh>
 #include <mip_heuristics/mip_constants.hpp>
+#include <utilities/determinism_log.hpp>
 
 #include <thrust/count.h>
 #include <thrust/extrema.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/tuple.h>
 #include <utilities/copy_helpers.hpp>
@@ -19,6 +22,15 @@
 #include "bounds_update_helpers.cuh"
 #include "multi_probe.cuh"
 
+// Set to 1 to route CUOPT_DETERMINISM_LOG through CUOPT_LOG_INFO in this file
+#if 0
+#undef CUOPT_DETERMINISM_LOG
+#define CUOPT_DETERMINISM_LOG(...) \
+  do {                             \
+    CUOPT_LOG_INFO(__VA_ARGS__);   \
+  } while (0)
+#endif
+
 namespace cuopt::linear_programming::detail {
 
 // Tobias Achterberg, Robert E. Bixby, Zonghao Gu, Edward Rothberg, Dieter Weninger (2019) Presolve
@@ -263,7 +275,7 @@ void multi_probe_t<i_t, f_t>::set_bounds(
 template <typename i_t, typename f_t>
 termination_criterion_t multi_probe_t<i_t, f_t>::bound_update_loop(problem_t<i_t, f_t>& pb,
                                                                    const raft::handle_t* handle_ptr,
-                                                                   timer_t timer)
+                                                                   work_limit_timer_t& timer)
 {
   termination_criterion_t criteria = termination_criterion_t::ITERATION_LIMIT;
   skip_0                           = false;
@@ -279,12 +291,17 @@ termination_criterion_t multi_probe_t<i_t, f_t>::bound_update_loop(problem_t<i_t
     // reset for the next calls on the same object
     init_changed_constraints = true;
   }
+  CUOPT_DETERMINISM_LOG("multi_probe entry: iter_limit=%d timer_rem=%.6f timer_det=%d",
+                        settings.iteration_limit,
+                        timer.remaining_time(),
+                        (int)timer.deterministic);
+  [[maybe_unused]] i_t final_iter = 0;
   for (i_t iter = 0; iter < settings.iteration_limit; ++iter) {
+    final_iter = iter;
     if (timer.check_time_limit()) {
       criteria = termination_criterion_t::TIME_LIMIT;
       break;
     }
-    // calculate activity for both probes
     calculate_activity(pb, handle_ptr);
     if (!calculate_bounds_update(pb, handle_ptr)) {
       if (iter == 0) {
@@ -294,8 +311,6 @@ termination_criterion_t multi_probe_t<i_t, f_t>::bound_update_loop(problem_t<i_t
       }
       break;
     }
-    // next_changed are updated, fill current changed with zero and swap
-    // swap next and current changed constraints
     if (!skip_0) { upd_0.prepare_for_next_iteration(handle_ptr); }
     if (!skip_1) { upd_1.prepare_for_next_iteration(handle_ptr); }
     iter_0 += !skip_0;
@@ -309,6 +324,19 @@ termination_criterion_t multi_probe_t<i_t, f_t>::bound_update_loop(problem_t<i_t
     constraint_stats(pb, handle_ptr);
   }
 
+  CUOPT_DETERMINISM_LOG(
+    "multi_probe exit: iters=%d iter_0=%d iter_1=%d criterion=%d "
+    "lb0_hash=0x%x ub0_hash=0x%x lb1_hash=0x%x ub1_hash=0x%x timer_rem=%.6f",
+    final_iter,
+    iter_0,
+    iter_1,
+    (int)criteria,
+    detail::compute_hash(make_span(upd_0.lb), handle_ptr->get_stream()),
+    detail::compute_hash(make_span(upd_0.ub), handle_ptr->get_stream()),
+    detail::compute_hash(make_span(upd_1.lb), handle_ptr->get_stream()),
+    detail::compute_hash(make_span(upd_1.ub), handle_ptr->get_stream()),
+    timer.remaining_time());
+
   return criteria;
 }
 
@@ -343,6 +371,10 @@ void multi_probe_t<i_t, f_t>::update_host_bounds(
     [] __device__(auto i) { return thrust::make_tuple(get_lower(i), get_upper(i)); });
   raft::copy(host_lb.data(), var_lb.data(), var_lb.size(), handle_ptr->get_stream());
   raft::copy(host_ub.data(), var_ub.data(), var_ub.size(), handle_ptr->get_stream());
+  handle_ptr->sync_stream();
+  CUOPT_DETERMINISM_LOG("update_host_bounds: lb_hash=0x%x ub_hash=0x%x",
+                        detail::compute_hash(make_span(var_lb), handle_ptr->get_stream()),
+                        detail::compute_hash(make_span(var_ub), handle_ptr->get_stream()));
 }
 
 template <typename i_t, typename f_t>
@@ -375,7 +407,7 @@ termination_criterion_t multi_probe_t<i_t, f_t>::solve_for_interval(
   const std::tuple<i_t, std::pair<f_t, f_t>, std::pair<f_t, f_t>>& var_interval_vals,
   const raft::handle_t* handle_ptr)
 {
-  timer_t timer(settings.time_limit);
+  work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination);
 
   copy_problem_into_probing_buffers(pb, handle_ptr);
   set_interval_bounds(var_interval_vals, pb, handle_ptr);
@@ -389,7 +421,7 @@ termination_criterion_t multi_probe_t<i_t, f_t>::solve(
   const std::tuple<std::vector<i_t>, std::vector<f_t>, std::vector<f_t>>& var_probe_vals,
   bool use_host_bounds)
 {
-  timer_t timer(settings.time_limit);
+  work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination);
   auto& handle_ptr = pb.handle_ptr;
   if (use_host_bounds) {
     update_device_bounds(handle_ptr);
diff --git a/cpp/src/mip_heuristics/presolve/multi_probe.cuh b/cpp/src/mip_heuristics/presolve/multi_probe.cuh
index a043770789..747713a53d 100644
--- a/cpp/src/mip_heuristics/presolve/multi_probe.cuh
+++ b/cpp/src/mip_heuristics/presolve/multi_probe.cuh
@@ -13,6 +13,7 @@
 #include <mip_heuristics/utils.cuh>
 
 #include <utilities/timer.hpp>
+#include <utilities/work_limit_timer.hpp>
 
 #include "bounds_update_data.cuh"
 #include "utils.cuh"
@@ -54,7 +55,7 @@ class multi_probe_t {
                           i_t select_update);
   termination_criterion_t bound_update_loop(problem_t<i_t, f_t>& pb,
                                             const raft::handle_t* handle_ptr,
-                                            timer_t timer);
+                                            work_limit_timer_t& timer);
   void set_interval_bounds(
     const std::tuple<i_t, std::pair<f_t, f_t>, std::pair<f_t, f_t>>& var_interval_vals,
     problem_t<i_t, f_t>& pb,
diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu
index 4f5e16ddb9..9f9e781a70 100644
--- a/cpp/src/mip_heuristics/presolve/probing_cache.cu
+++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu
@@ -14,7 +14,10 @@
 
 #include <omp.h>
 #include <thrust/binary_search.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/sort.h>
+#include <thrust/tuple.h>
 #include <utilities/copy_helpers.hpp>
 #include <utilities/timer.hpp>
 
@@ -367,7 +370,7 @@ void compute_cache_for_var(i_t var_idx,
                            std::atomic<bool>& problem_is_infeasible,
                            std::vector<std::tuple<f_t, i_t, f_t, f_t>>& modification_vector,
                            std::vector<substitution_t<i_t, f_t>>& substitution_vector,
-                           timer_t timer,
+                           const work_limit_timer_t& timer,
                            i_t device_id)
 {
   RAFT_CUDA_TRY(cudaSetDevice(device_id));
@@ -704,8 +707,11 @@ void apply_substitution_queue_to_problem(
     host_copy(problem.presolve_data.variable_mapping, problem.handle_ptr->get_stream());
   problem.handle_ptr->sync_stream();
 
+  // remove duplicate substitution proposals to avoid races later
+  std::unordered_set<i_t> seen_substituted;
   for (const auto& [substituting_var, substitutions] : all_substitutions) {
     for (const auto& [substituted_var, substitution] : substitutions) {
+      if (!seen_substituted.insert(substitution.substituted_var).second) { continue; }
       CUOPT_LOG_TRACE("Applying substitution: %d -> %d",
                       substitution.substituting_var,
                       substitution.substituted_var);
@@ -843,7 +849,7 @@ std::vector<i_t> compute_priority_indices_by_implied_integers(problem_t<i_t, f_t
 template <typename i_t, typename f_t>
 bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
                            problem_t<i_t, f_t>& problem,
-                           timer_t timer)
+                           work_limit_timer_t& timer)
 {
   raft::common::nvtx::range fun_scope("compute_probing_cache");
   // we dont want to compute the probing cache for all variables for time and computation resources
@@ -857,6 +863,12 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
   bound_presolve.settings.iteration_limit = 50;
   bound_presolve.settings.time_limit      = timer.remaining_time();
 
+  // TODO: proper work unit accounting in deterministic mode for the probing cache
+  if ((bound_presolve.context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) {
+    bound_presolve.settings.iteration_limit = 1;
+    priority_indices.resize(std::min<size_t>(priority_indices.size(), 2048));
+  }
+
   size_t num_threads = bound_presolve.settings.num_threads < 0
                          ? 0.2 * omp_get_max_threads()
                          : bound_presolve.settings.num_threads;
@@ -949,7 +961,7 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
 #define INSTANTIATE(F_TYPE)                                                                        \
   template bool compute_probing_cache<int, F_TYPE>(bound_presolve_t<int, F_TYPE> & bound_presolve, \
                                                    problem_t<int, F_TYPE> & problem,               \
-                                                   timer_t timer);                                 \
+                                                   work_limit_timer_t & timer);                    \
   template class probing_cache_t<int, F_TYPE>;
 
 #if MIP_INSTANTIATE_FLOAT
diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cuh b/cpp/src/mip_heuristics/presolve/probing_cache.cuh
index 91da6a15c8..8e1db6d5d7 100644
--- a/cpp/src/mip_heuristics/presolve/probing_cache.cuh
+++ b/cpp/src/mip_heuristics/presolve/probing_cache.cuh
@@ -12,6 +12,7 @@
 #include <mip_heuristics/utils.cuh>
 
 #include <utilities/timer.hpp>
+#include <utilities/work_limit_timer.hpp>
 
 namespace cuopt::linear_programming::detail {
 
@@ -119,6 +120,6 @@ class lb_probing_cache_t {
 template <typename i_t, typename f_t>
 bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
                            problem_t<i_t, f_t>& problem,
-                           timer_t timer);
+                           work_limit_timer_t& timer);
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp
index d94cf5aa67..7025ce2a96 100644
--- a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp
+++ b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp
@@ -544,7 +544,8 @@ void check_postsolve_status(const papilo::PostsolveStatus& status)
 template <typename f_t>
 void set_presolve_methods(papilo::Presolve<f_t>& presolver,
                           problem_category_t category,
-                          bool dual_postsolve)
+                          bool dual_postsolve,
+                          bool deterministic)
 {
   using uptr = std::unique_ptr<papilo::PresolveMethod<f_t>>;
 
@@ -571,7 +572,9 @@ void set_presolve_methods(papilo::Presolve<f_t>& presolver,
   // exhaustive presolvers
   presolver.addPresolveMethod(uptr(new papilo::ImplIntDetection<f_t>()));
   presolver.addPresolveMethod(uptr(new papilo::DominatedCols<f_t>()));
-  presolver.addPresolveMethod(uptr(new papilo::Probing<f_t>()));
+  // Papilo's Probing presolver is nondeterministic, so it is skipped in deterministic mode.
+  // TODO: push an upstream PR (presumably to make Probing deterministic in Papilo)
+  if (!deterministic) { presolver.addPresolveMethod(uptr(new papilo::Probing<f_t>())); }
 
   if (!dual_postsolve) {
     presolver.addPresolveMethod(uptr(new papilo::DualInfer<f_t>()));
@@ -605,17 +608,20 @@ template <typename f_t>
 void set_presolve_parameters(papilo::Presolve<f_t>& presolver,
                              problem_category_t category,
                              int nrows,
-                             int ncols)
+                             int ncols,
+                             bool deterministic = false)
 {
   // It looks like a copy. But this copy has the pointers to relevant variables in papilo
   auto params = presolver.getParameters();
   if (category == problem_category_t::MIP) {
-    // Papilo has work unit measurements for probing. Because of this when the first batch fails to
-    // produce any reductions, the algorithm stops. To avoid stopping the algorithm, we set a
-    // minimum badge size to a huge value. The time limit makes sure that we exit if it takes too
-    // long
-    int min_badgesize = std::max(ncols / 2, 32);
-    params.setParameter("probing.minbadgesize", min_badgesize);
+    if (!deterministic) {
+      // Papilo has work unit measurements for probing. Because of this when the first batch fails
+      // to produce any reductions, the algorithm stops. To avoid stopping the algorithm, we set a
+      // minimum badge size to a huge value. The time limit makes sure that we exit if it takes too
+      // long
+      int min_badgesize = std::max(ncols / 2, 32);
+      params.setParameter("probing.minbadgesize", min_badgesize);
+    }
     params.setParameter("cliquemerging.enabled", true);
     params.setParameter("cliquemerging.maxcalls", 50);
   }
@@ -690,7 +696,7 @@ third_party_presolve_result_t<i_t, f_t> third_party_presolve_t<i_t, f_t>::apply(
   CUOPT_LOG_INFO("Calling Papilo presolver (git hash %s)", PAPILO_GITHASH);
   if (category == problem_category_t::MIP) { dual_postsolve = false; }
   papilo::Presolve<f_t> papilo_presolver;
-  set_presolve_methods(papilo_presolver, category, dual_postsolve);
+  set_presolve_methods(papilo_presolver, category, dual_postsolve, deterministic_);
   set_presolve_options<i_t, f_t>(papilo_presolver,
                                  category,
                                  absolute_tolerance,
@@ -698,8 +704,11 @@ third_party_presolve_result_t<i_t, f_t> third_party_presolve_t<i_t, f_t>::apply(
                                  time_limit,
                                  dual_postsolve,
                                  num_cpu_threads);
-  set_presolve_parameters(
-    papilo_presolver, category, op_problem.get_n_constraints(), op_problem.get_n_variables());
+  set_presolve_parameters(papilo_presolver,
+                          category,
+                          op_problem.get_n_constraints(),
+                          op_problem.get_n_variables(),
+                          deterministic_);
 
   // Disable papilo logs
   papilo_presolver.setVerbosityLevel(papilo::VerbosityLevel::kQuiet);
diff --git a/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp b/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp
index a067f604e7..9a7db11e18 100644
--- a/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp
+++ b/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp
@@ -79,6 +79,7 @@ class third_party_presolve_t {
                                std::vector<f_t>& full_primal) const;
   const std::vector<i_t>& get_reduced_to_original_map() const { return reduced_to_original_map_; }
   const std::vector<i_t>& get_original_to_reduced_map() const { return original_to_reduced_map_; }
+  void set_deterministic(bool d) { deterministic_ = d; }
 
   ~third_party_presolve_t();
 
@@ -91,6 +92,7 @@ class third_party_presolve_t {
                  rmm::device_uvector<f_t>& reduced_costs,
                  rmm::cuda_stream_view stream_view);
 
+  bool deterministic_                               = false;
   bool maximize_                                    = false;
   cuopt::linear_programming::presolver_t presolver_ = cuopt::linear_programming::presolver_t::PSLP;
   // PSLP settings
diff --git a/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh b/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh
index 568719dfd8..28162d7482 100644
--- a/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh
+++ b/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh
@@ -14,9 +14,11 @@
 #include <utilities/copy_helpers.hpp>
 
 #include <thrust/count.h>
+#include <thrust/extrema.h>
 #include <thrust/for_each.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/transform_output_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/partition.h>
diff --git a/cpp/src/mip_heuristics/presolve/utils.cuh b/cpp/src/mip_heuristics/presolve/utils.cuh
index 4870b3180c..803c00a022 100644
--- a/cpp/src/mip_heuristics/presolve/utils.cuh
+++ b/cpp/src/mip_heuristics/presolve/utils.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -14,7 +14,8 @@ enum class termination_criterion_t {
   ITERATION_LIMIT,
   CONVERGENCE,
   INFEASIBLE,
-  NO_UPDATE
+  NO_UPDATE,
+  WORK_LIMIT
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/problem/presolve_data.cu b/cpp/src/mip_heuristics/problem/presolve_data.cu
index 884b5f9069..9326ab82ef 100644
--- a/cpp/src/mip_heuristics/problem/presolve_data.cu
+++ b/cpp/src/mip_heuristics/problem/presolve_data.cu
@@ -112,8 +112,10 @@ void presolve_data_t<i_t, f_t>::post_process_assignment(
 {
   raft::common::nvtx::range fun_scope("post_process_assignment");
   cuopt_assert(current_assignment.size() == variable_mapping.size(), "size mismatch");
+  rmm::device_uvector<f_t> local_fixed(fixed_var_assignment.size(), stream);
+  raft::copy(local_fixed.data(), fixed_var_assignment.data(), fixed_var_assignment.size(), stream);
   auto assgn       = make_span(current_assignment);
-  auto fixed_assgn = make_span(fixed_var_assignment);
+  auto fixed_assgn = make_span(local_fixed);
   auto var_map     = make_span(variable_mapping);
   if (current_assignment.size() > 0) {
     thrust::for_each(rmm::exec_policy(stream),
@@ -123,7 +125,7 @@ void presolve_data_t<i_t, f_t>::post_process_assignment(
                        fixed_assgn[var_map[idx]] = assgn[idx];
                      });
   }
-  expand_device_copy(current_assignment, fixed_var_assignment, stream);
+  expand_device_copy(current_assignment, local_fixed, stream);
   auto h_assignment = cuopt::host_copy(current_assignment, stream);
   cuopt_assert(additional_var_id_per_var.size() == h_assignment.size(), "Size mismatch");
   cuopt_assert(additional_var_used.size() == h_assignment.size(), "Size mismatch");
@@ -134,8 +136,6 @@ void presolve_data_t<i_t, f_t>::post_process_assignment(
     }
   }
 
-  // Apply variable substitutions from probing: x_substituted = offset + coefficient *
-  // x_substituting
   for (const auto& sub : variable_substitutions) {
     cuopt_assert(sub.substituted_var < (i_t)h_assignment.size(), "substituted_var out of bounds");
     cuopt_assert(sub.substituting_var < (i_t)h_assignment.size(), "substituting_var out of bounds");
@@ -223,23 +223,23 @@ void presolve_data_t<i_t, f_t>::set_papilo_presolve_data(
 
 template <typename i_t, typename f_t>
 void presolve_data_t<i_t, f_t>::papilo_uncrush_assignment(
-  problem_t<i_t, f_t>& problem, rmm::device_uvector<f_t>& assignment) const
+  problem_t<i_t, f_t>& problem,
+  rmm::device_uvector<f_t>& assignment,
+  const raft::handle_t* handle_override) const
 {
   if (papilo_presolve_ptr == nullptr) {
     CUOPT_LOG_INFO("Papilo presolve data not set, skipping uncrushing assignment");
     return;
   }
+  const auto* h = handle_override ? handle_override : problem.handle_ptr;
   cuopt_assert(assignment.size() == papilo_reduced_to_original_map.size(),
                "Papilo uncrush assignment size mismatch");
-  auto h_assignment = cuopt::host_copy(assignment, problem.handle_ptr->get_stream());
+  auto h_assignment = cuopt::host_copy(assignment, h->get_stream());
   std::vector<f_t> full_assignment;
   papilo_presolve_ptr->uncrush_primal_solution(h_assignment, full_assignment);
-  assignment.resize(full_assignment.size(), problem.handle_ptr->get_stream());
-  raft::copy(assignment.data(),
-             full_assignment.data(),
-             full_assignment.size(),
-             problem.handle_ptr->get_stream());
-  problem.handle_ptr->sync_stream();
+  assignment.resize(full_assignment.size(), h->get_stream());
+  raft::copy(assignment.data(), full_assignment.data(), full_assignment.size(), h->get_stream());
+  h->sync_stream();
 }
 
 #if MIP_INSTANTIATE_FLOAT || PDLP_INSTANTIATE_FLOAT
diff --git a/cpp/src/mip_heuristics/problem/presolve_data.cuh b/cpp/src/mip_heuristics/problem/presolve_data.cuh
index cac3e71650..e62c16a16b 100644
--- a/cpp/src/mip_heuristics/problem/presolve_data.cuh
+++ b/cpp/src/mip_heuristics/problem/presolve_data.cuh
@@ -93,10 +93,12 @@ class presolve_data_t {
                                rmm::cuda_stream_view stream);
   void post_process_assignment(problem_t<i_t, f_t>& problem,
                                rmm::device_uvector<f_t>& current_assignment,
-                               bool resize_to_original_problem = true)
+                               bool resize_to_original_problem       = true,
+                               const raft::handle_t* handle_override = nullptr)
   {
-    post_process_assignment(
-      problem, current_assignment, resize_to_original_problem, problem.handle_ptr->get_stream());
+    auto stream =
+      handle_override ? handle_override->get_stream() : problem.handle_ptr->get_stream();
+    post_process_assignment(problem, current_assignment, resize_to_original_problem, stream);
   }
   void post_process_solution(problem_t<i_t, f_t>& problem, solution_t<i_t, f_t>& solution);
 
@@ -107,7 +109,8 @@ class presolve_data_t {
   bool has_papilo_presolve_data() const { return papilo_presolve_ptr != nullptr; }
   i_t get_papilo_original_num_variables() const { return papilo_original_num_variables; }
   void papilo_uncrush_assignment(problem_t<i_t, f_t>& problem,
-                                 rmm::device_uvector<f_t>& assignment) const;
+                                 rmm::device_uvector<f_t>& assignment,
+                                 const raft::handle_t* handle_override = nullptr) const;
 
   presolve_data_t(presolve_data_t&&)                 = default;
   presolve_data_t& operator=(presolve_data_t&&)      = default;
diff --git a/cpp/src/mip_heuristics/problem/problem.cu b/cpp/src/mip_heuristics/problem/problem.cu
index 5d5fbc445a..0a86a0d009 100644
--- a/cpp/src/mip_heuristics/problem/problem.cu
+++ b/cpp/src/mip_heuristics/problem/problem.cu
@@ -27,9 +27,13 @@
 #include <thrust/count.h>
 #include <thrust/gather.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/reduce.h>
 #include <thrust/set_operations.h>
 #include <thrust/sort.h>
 #include <thrust/tabulate.h>
+#include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
 #include <cuda/std/functional>
 
@@ -64,6 +68,10 @@ void problem_t<i_t, f_t>::op_problem_cstr_body(const optimization_problem_t<i_t,
   set_bounds_if_not_set(*this);
 
   set_variable_bounds(*this);
+  thrust::fill(handle_ptr->get_thrust_policy(),
+               integer_fixed_variable_map.begin(),
+               integer_fixed_variable_map.end(),
+               -1);
 
   const bool is_mip = original_problem_ptr->get_problem_category() != problem_category_t::LP;
   if (is_mip) {
@@ -136,7 +144,7 @@ problem_t<i_t, f_t>::problem_t(
     nonbinary_indices(0, problem_.get_handle_ptr()->get_stream()),
     is_binary_variable(0, problem_.get_handle_ptr()->get_stream()),
     related_variables(0, problem_.get_handle_ptr()->get_stream()),
-    related_variables_offsets(n_variables, problem_.get_handle_ptr()->get_stream()),
+    related_variables_offsets(0, problem_.get_handle_ptr()->get_stream()),
     var_names(problem_.get_variable_names()),
     row_names(problem_.get_row_names()),
     objective_name(problem_.get_objective_name()),
@@ -946,8 +954,12 @@ void problem_t<i_t, f_t>::compute_related_variables(double time_limit)
 
   handle_ptr->sync_stream();
 
-  // CHANGE
-  if (deterministic) { time_limit = std::numeric_limits<f_t>::infinity(); }
+  if (deterministic) {
+    // TODO: Re-enable deterministic related-variable construction once we have a work estimator.
+    related_variables.resize(0, handle_ptr->get_stream());
+    related_variables_offsets.resize(0, handle_ptr->get_stream());
+    return;
+  }
 
   // previously used constants were based on 40GB of memory. Scale accordingly on smaller GPUs
   // We can't rely on querying free memory or allocation try/catch
@@ -1418,6 +1430,12 @@ void problem_t<i_t, f_t>::substitute_variables(const std::vector<i_t>& var_indic
   raft::common::nvtx::range fun_scope("substitute_variables");
   cuopt_assert((are_exclusive<i_t, f_t>(var_indices, var_to_substitute_indices)),
                "variables and var_to_substitute_indices are not exclusive");
+  {
+    std::vector<i_t> sorted_vi(var_indices);
+    std::sort(sorted_vi.begin(), sorted_vi.end());
+    cuopt_assert(std::adjacent_find(sorted_vi.begin(), sorted_vi.end()) == sorted_vi.end(),
+                 "var_indices must not contain duplicates");
+  }
   const i_t dummy_substituted_variable = var_indices[0];
   cuopt_assert(var_indices.size() == var_to_substitute_indices.size(), "size mismatch");
   cuopt_assert(var_indices.size() == offset_values.size(), "size mismatch");
@@ -1446,10 +1464,16 @@ void problem_t<i_t, f_t>::substitute_variables(const std::vector<i_t>& var_indic
                objective_offset_delta_per_variable.begin(),
                objective_offset_delta_per_variable.end(),
                zero_value);
+  const i_t n_substitutions = d_var_indices.size();
+  rmm::device_uvector<i_t> obj_coeff_keys(n_substitutions, handle_ptr->get_stream());
+  rmm::device_uvector<f_t> obj_coeff_deltas(n_substitutions, handle_ptr->get_stream());
+
+  CUOPT_LOG_INFO("Substituting %d variables", n_substitutions);
+
   thrust::for_each(
     handle_ptr->get_thrust_policy(),
     thrust::make_counting_iterator(0),
-    thrust::make_counting_iterator(0) + d_var_indices.size(),
+    thrust::make_counting_iterator(0) + n_substitutions,
     [variable_fix_mask                   = make_span(fixing_helpers.variable_fix_mask),
      var_indices                         = make_span(d_var_indices),
      n_variables                         = n_variables,
@@ -1458,20 +1482,40 @@ void problem_t<i_t, f_t>::substitute_variables(const std::vector<i_t>& var_indic
      var_to_substitute_indices           = make_span(d_var_to_substitute_indices),
      objective_coefficients              = make_span(objective_coefficients),
      objective_offset_delta_per_variable = make_span(objective_offset_delta_per_variable),
-     objective_offset                    = objective_offset.data(),
+     obj_keys                            = make_span(obj_coeff_keys),
+     obj_deltas                          = make_span(obj_coeff_deltas),
      var_flags                           = make_span(presolve_data.var_flags)] __device__(i_t idx) {
-      i_t var_idx                     = var_indices[idx];
-      i_t substituting_var_idx        = var_to_substitute_indices[idx];
-      variable_fix_mask[var_idx]      = idx;
-      f_t objective_offset_difference = objective_coefficients[var_idx] * substitute_offset[idx];
-      objective_offset_delta_per_variable[idx] += objective_offset_difference;
-      //  atomicAdd(objective_offset, objective_offset_difference);
-      atomicAdd(&objective_coefficients[substituting_var_idx],
-                objective_coefficients[var_idx] * substitute_coefficient[idx]);
-      // Substitution changes the constraint coefficients on x_B, invalidating
-      // any implied-integrality proof that relied on the original structure.
+      i_t var_idx                = var_indices[idx];
+      i_t substituting_var_idx   = var_to_substitute_indices[idx];
+      variable_fix_mask[var_idx] = idx;
+      objective_offset_delta_per_variable[idx] +=
+        objective_coefficients[var_idx] * substitute_offset[idx];
+      obj_keys[idx]   = substituting_var_idx;
+      obj_deltas[idx] = objective_coefficients[var_idx] * substitute_coefficient[idx];
       var_flags[substituting_var_idx] &= ~(i_t)VAR_IMPLIED_INTEGER;
     });
+
+  // Deterministic reduction of objective coefficient deltas per substituting variable
+  thrust::sort_by_key(handle_ptr->get_thrust_policy(),
+                      obj_coeff_keys.begin(),
+                      obj_coeff_keys.end(),
+                      obj_coeff_deltas.begin());
+  rmm::device_uvector<i_t> unique_keys(n_substitutions, handle_ptr->get_stream());
+  rmm::device_uvector<f_t> summed_deltas(n_substitutions, handle_ptr->get_stream());
+  auto [keys_end, vals_end] = thrust::reduce_by_key(handle_ptr->get_thrust_policy(),
+                                                    obj_coeff_keys.begin(),
+                                                    obj_coeff_keys.end(),
+                                                    obj_coeff_deltas.begin(),
+                                                    unique_keys.begin(),
+                                                    summed_deltas.begin());
+  i_t n_unique              = keys_end - unique_keys.begin();
+  thrust::for_each(
+    handle_ptr->get_thrust_policy(),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(n_unique),
+    [obj_coeffs = make_span(objective_coefficients),
+     keys       = unique_keys.data(),
+     deltas     = summed_deltas.data()] __device__(i_t i) { obj_coeffs[keys[i]] += deltas[i]; });
   presolve_data.objective_offset += thrust::reduce(handle_ptr->get_thrust_policy(),
                                                    objective_offset_delta_per_variable.begin(),
                                                    objective_offset_delta_per_variable.end(),
@@ -2167,9 +2211,11 @@ void problem_t<i_t, f_t>::set_papilo_presolve_data(
 }
 
 template <typename i_t, typename f_t>
-void problem_t<i_t, f_t>::papilo_uncrush_assignment(rmm::device_uvector<f_t>& assignment) const
+void problem_t<i_t, f_t>::papilo_uncrush_assignment(rmm::device_uvector<f_t>& assignment,
+                                                    const raft::handle_t* handle_override) const
 {
-  presolve_data.papilo_uncrush_assignment(const_cast<problem_t&>(*this), assignment);
+  presolve_data.papilo_uncrush_assignment(
+    const_cast<problem_t&>(*this), assignment, handle_override);
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/mip_heuristics/problem/problem.cuh b/cpp/src/mip_heuristics/problem/problem.cuh
index a801cc4067..a16dae3b53 100644
--- a/cpp/src/mip_heuristics/problem/problem.cuh
+++ b/cpp/src/mip_heuristics/problem/problem.cuh
@@ -102,10 +102,11 @@ class problem_t {
                                bool resize_to_original_problem,
                                rmm::cuda_stream_view stream);
   void post_process_assignment(rmm::device_uvector<f_t>& current_assignment,
-                               bool resize_to_original_problem = true)
+                               bool resize_to_original_problem       = true,
+                               const raft::handle_t* handle_override = nullptr)
   {
-    post_process_assignment(
-      current_assignment, resize_to_original_problem, handle_ptr->get_stream());
+    auto stream = handle_override ? handle_override->get_stream() : handle_ptr->get_stream();
+    post_process_assignment(current_assignment, resize_to_original_problem, stream);
   }
   void post_process_solution(solution_t<i_t, f_t>& solution);
   void set_papilo_presolve_data(const third_party_presolve_t<i_t, f_t>* presolver_ptr,
@@ -117,7 +118,8 @@ class problem_t {
   {
     return presolve_data.get_papilo_original_num_variables();
   }
-  void papilo_uncrush_assignment(rmm::device_uvector<f_t>& assignment) const;
+  void papilo_uncrush_assignment(rmm::device_uvector<f_t>& assignment,
+                                 const raft::handle_t* handle_override = nullptr) const;
   void compute_transpose_of_problem();
   f_t get_user_obj_from_solver_obj(f_t solver_obj) const;
   f_t get_solver_obj_from_user_obj(f_t user_obj) const;
@@ -249,7 +251,8 @@ class problem_t {
   std::shared_ptr<problem_t<i_t, f_t>> integer_fixed_problem = nullptr;
   rmm::device_uvector<i_t> integer_fixed_variable_map;
 
-  std::function<void(const std::vector<f_t>&)> branch_and_bound_callback;
+  std::function<void(const std::vector<f_t>&, cuopt::internals::mip_solution_origin_t)>
+    branch_and_bound_callback;
   std::function<void(const std::vector<f_t>&,
                      const std::vector<f_t>&,
                      const std::vector<f_t>&,
diff --git a/cpp/src/mip_heuristics/problem/problem_fixing.cuh b/cpp/src/mip_heuristics/problem/problem_fixing.cuh
index 820b74e329..c462838d96 100644
--- a/cpp/src/mip_heuristics/problem/problem_fixing.cuh
+++ b/cpp/src/mip_heuristics/problem/problem_fixing.cuh
@@ -1,12 +1,13 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
 
 #pragma once
 
+#include <thrust/fill.h>
 #include <rmm/device_uvector.hpp>
 
 namespace cuopt {
@@ -18,6 +19,10 @@ struct problem_fixing_helpers_t {
     : reduction_in_rhs(n_constraints, handle_ptr->get_stream()),
       variable_fix_mask(n_variables, handle_ptr->get_stream())
   {
+    thrust::fill(
+      handle_ptr->get_thrust_policy(), reduction_in_rhs.begin(), reduction_in_rhs.end(), f_t(0));
+    thrust::fill(
+      handle_ptr->get_thrust_policy(), variable_fix_mask.begin(), variable_fix_mask.end(), i_t(0));
   }
 
   problem_fixing_helpers_t(const problem_fixing_helpers_t& other, const raft::handle_t* handle_ptr)
diff --git a/cpp/src/mip_heuristics/problem/problem_helpers.cuh b/cpp/src/mip_heuristics/problem/problem_helpers.cuh
index ebc8a488ea..939702e97d 100644
--- a/cpp/src/mip_heuristics/problem/problem_helpers.cuh
+++ b/cpp/src/mip_heuristics/problem/problem_helpers.cuh
@@ -19,8 +19,10 @@
 #include <thrust/count.h>
 #include <thrust/functional.h>
 #include <thrust/gather.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/logical.h>
 #include <thrust/sort.h>
+#include <thrust/tuple.h>
 
 namespace cuopt::linear_programming::detail {
 template <typename f_t>
diff --git a/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu b/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu
index 84415f5372..04366cf37b 100644
--- a/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu
+++ b/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu
@@ -20,6 +20,17 @@
 
 #include <thrust/tabulate.h>
 
+#include <atomic>
+
+// enable to activate detailed determinism logs
+#if 0
+#undef CUOPT_DETERMINISM_LOG
+#define CUOPT_DETERMINISM_LOG(...) \
+  do {                             \
+    CUOPT_LOG_INFO(__VA_ARGS__);   \
+  } while (0)
+#endif
+
 namespace cuopt::linear_programming::detail {
 
 template <typename i_t, typename f_t>
@@ -39,6 +50,9 @@ optimization_problem_solution_t<i_t, f_t> get_relaxed_lp_solution(
   const relaxed_lp_settings_t& settings)
 {
   raft::common::nvtx::range fun_scope("get_relaxed_lp_solution");
+  static std::atomic<uint64_t> lp_call_counter{0};
+  const uint64_t lp_call_id = lp_call_counter.fetch_add(1, std::memory_order_relaxed);
+
   pdlp_solver_settings_t<i_t, f_t> pdlp_settings{};
   pdlp_settings.detect_infeasibility = settings.check_infeasibility;
   pdlp_settings.set_optimality_tolerance(settings.tolerance);
@@ -48,17 +62,59 @@ optimization_problem_solution_t<i_t, f_t> get_relaxed_lp_solution(
   pdlp_settings.tolerances.relative_primal_tolerance = settings.tolerance / tolerance_divisor;
   pdlp_settings.tolerances.relative_dual_tolerance   = settings.tolerance / tolerance_divisor;
   pdlp_settings.time_limit                           = settings.time_limit;
-  pdlp_settings.concurrent_halt                      = settings.concurrent_halt;
-  pdlp_settings.per_constraint_residual              = settings.per_constraint_residual;
-  pdlp_settings.first_primal_feasible                = settings.return_first_feasible;
-  pdlp_settings.pdlp_solver_mode                     = pdlp_solver_mode_t::Stable2;
-  pdlp_settings.presolver                            = presolver_t::None;
+  pdlp_settings.iteration_limit                      = settings.iteration_limit;
+
+  const f_t work_limit                  = settings.work_limit;
+  const bool determinism_mode           = std::isfinite(work_limit);
+  pdlp_settings.concurrent_halt         = settings.concurrent_halt;
+  pdlp_settings.per_constraint_residual = settings.per_constraint_residual;
+  pdlp_settings.first_primal_feasible   = settings.return_first_feasible;
+  pdlp_settings.pdlp_solver_mode        = pdlp_solver_mode_t::Stable2;
+  int estim_iters                       = pdlp_settings.iteration_limit;
+  if (determinism_mode) {
+    // Derive an iteration budget from the work limit; the wall-clock limit is disabled below.
+    // TODO: replace this linear cost estimate with an actual predictor model; it is a stopgap
+    // that avoids touching the PDLP code for this initial PR.
+    // All terms are evaluated in double: the integer form mixed int with size_t (a negative
+    // partial sum wrapped to a huge unsigned value) and could overflow int on large inputs.
+    const double fixed_ms   = 313.0 + 200.0 * static_cast<double>(op_problem.n_variables) -
+                              400.0 * static_cast<double>(op_problem.n_constraints) +
+                              600.0 * static_cast<double>(op_problem.coefficients.size());
+    const double budget_ms  = static_cast<double>(work_limit) * 1000.0;
+    constexpr int iter_step = 100;
+    estim_iters             = iter_step;
+    // work_limit is finite here, so the loop ends; the cap keeps estim_iters from overflowing
+    while (std::max(0.0, fixed_ms + 7100.0 * estim_iters) <= budget_ms &&
+           estim_iters <= std::numeric_limits<int>::max() - iter_step) {
+      estim_iters += iter_step;
+    }
+    CUOPT_DETERMINISM_LOG(
+      "estimated iterations %d for work limit %f", estim_iters, settings.work_limit);
+    pdlp_settings.iteration_limit = estim_iters;
+    pdlp_settings.time_limit      = std::numeric_limits<double>::infinity();
+  }
+  // Presolve stays disabled for every relaxed-LP call, not only the deterministic ones.
+  pdlp_settings.presolver = presolver_t::None;
+  CUOPT_DETERMINISM_LOG(
+    "LP call %lu config: det=%d work_limit=%.6f time_limit=%.6f iter_limit=%d method=%d mode=%d "
+    "presolver=%d save_state=%d has_initial=%d assignment_hash=0x%x",
+    lp_call_id,
+    (int)determinism_mode,
+    settings.work_limit,
+    pdlp_settings.time_limit,
+    pdlp_settings.iteration_limit,
+    (int)pdlp_settings.method,
+    (int)pdlp_settings.pdlp_solver_mode,
+    (int)pdlp_settings.presolver,
+    (int)settings.save_state,
+    (int)settings.has_initial_primal,
+    detail::compute_hash(assignment, op_problem.handle_ptr->get_stream()));
   set_pdlp_solver_mode(pdlp_settings);
   // TODO: set Stable3 here?
   pdlp_solver_t<i_t, f_t> lp_solver(op_problem, pdlp_settings);
   if (settings.has_initial_primal) {
     i_t prev_size = lp_state.prev_dual.size();
-    CUOPT_LOG_DEBUG(
+    CUOPT_LOG_TRACE(
       "setting initial primal solution of size %d dual size %d problem vars %d cstrs %d",
       assignment.size(),
       lp_state.prev_dual.size(),
@@ -72,25 +128,68 @@ optimization_problem_solution_t<i_t, f_t> get_relaxed_lp_solution(
                      lp_state.prev_dual.data(),
                      lp_state.prev_dual.data() + op_problem.n_constraints,
                      [prev_size, dual = make_span(lp_state.prev_dual)] __device__(i_t i) {
+                       // early exit to avoid a false positive in compute-sanitizer initcheck
+                       if (i >= prev_size) { return 0.0; }
                        f_t x = dual[i];
-                       if (!isfinite(x) || i >= prev_size) { return 0.0; }
+                       if (!isfinite(x)) { return 0.0; }
                        return x;
                      });
     lp_solver.set_initial_primal_solution(assignment);
     lp_solver.set_initial_dual_solution(lp_state.prev_dual);
   }
-  CUOPT_LOG_DEBUG(
+  CUOPT_LOG_TRACE(
     "running LP with n_vars %d n_cstr %d", op_problem.n_variables, op_problem.n_constraints);
   // before LP flush the logs as it takes quite some time
   cuopt::default_logger().flush();
   // temporarily add timer
   auto start_time = timer_t(pdlp_settings.time_limit);
   lp_solver.set_inside_mip(true);
+  CUOPT_DETERMINISM_LOG(
+    "prev solution sizes primal=%lu dual=%lu", assignment.size(), lp_state.prev_dual.size());
+  if (determinism_mode) {
+    auto init_primal_hash =
+      detail::compute_hash(make_span(assignment), op_problem.handle_ptr->get_stream());
+    auto init_dual_hash =
+      settings.has_initial_primal
+        ? detail::compute_hash(make_span(lp_state.prev_dual), op_problem.handle_ptr->get_stream())
+        : 0u;
+    CUOPT_DETERMINISM_LOG("LP call %lu pre-solve state: init_primal_hash=0x%x init_dual_hash=0x%x",
+                          lp_call_id,
+                          init_primal_hash,
+                          init_dual_hash);
+  }
   auto solver_response = lp_solver.run_solver(start_time);
+  CUOPT_DETERMINISM_LOG("post LP primal size %lu", solver_response.get_primal_solution().size());
+  const int actual_iters =
+    solver_response.get_additional_termination_information().number_of_steps_taken;
+  CUOPT_DETERMINISM_LOG("LP call %lu result: status=%d iters=%d primal_hash=0x%x",
+                        lp_call_id,
+                        (int)solver_response.get_termination_status(),
+                        actual_iters,
+                        solver_response.get_primal_solution().size() != 0
+                          ? detail::compute_hash(solver_response.get_primal_solution(),
+                                                 op_problem.handle_ptr->get_stream())
+                          : 0u);
+
+  if (determinism_mode && settings.work_context != nullptr) {
+    double work_to_record = settings.work_limit;
+    if (estim_iters > 0) {
+      work_to_record =
+        settings.work_limit * std::clamp((double)actual_iters / (double)estim_iters, 0.0, 1.0);
+    }
+    CUOPT_DETERMINISM_LOG(
+      "LP call %lu recording %.6fwu (actual_iters=%d estim_iters=%d requested=%.6f)",
+      lp_call_id,
+      work_to_record,
+      actual_iters,
+      estim_iters,
+      settings.work_limit);
+    settings.work_context->record_work_sync_on_horizon(work_to_record);
+  }
 
   if (solver_response.get_primal_solution().size() != 0 &&
       solver_response.get_dual_solution().size() != 0 && settings.save_state) {
-    CUOPT_LOG_DEBUG("saving initial primal solution of size %d", lp_state.prev_primal.size());
+    CUOPT_LOG_TRACE("saving initial primal solution of size %d", lp_state.prev_primal.size());
     lp_state.set_state(solver_response.get_primal_solution(), solver_response.get_dual_solution());
   }
   if (solver_response.get_primal_solution().size() != 0) {
@@ -100,11 +199,17 @@ optimization_problem_solution_t<i_t, f_t> get_relaxed_lp_solution(
                solver_response.get_primal_solution().size(),
                op_problem.handle_ptr->get_stream());
   }
+  CUOPT_DETERMINISM_LOG("LP call %lu assignment_after_copy hash=0x%x",
+                        lp_call_id,
+                        detail::compute_hash(assignment, op_problem.handle_ptr->get_stream()));
   if (solver_response.get_termination_status() == pdlp_termination_status_t::Optimal) {
-    CUOPT_LOG_DEBUG("feasible solution found with LP objective %f",
+    CUOPT_LOG_TRACE("feasible solution found with LP objective %f",
                     solver_response.get_objective_value());
   } else {
-    CUOPT_LOG_DEBUG("LP returned with reason %d", solver_response.get_termination_status());
+    CUOPT_DETERMINISM_LOG(
+      "LP returned with reason %d, %d iterations",
+      solver_response.get_termination_status(),
+      solver_response.get_additional_termination_information().number_of_steps_taken);
   }
 
   return solver_response;
diff --git a/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cuh b/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cuh
index 9fe5fb9071..06698d79ae 100644
--- a/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cuh
+++ b/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cuh
@@ -12,19 +12,23 @@
 #include <mip_heuristics/presolve/bounds_presolve.cuh>
 #include <mip_heuristics/problem/problem.cuh>
 #include <mip_heuristics/solution/solution.cuh>
+#include <utilities/work_limit_context.hpp>
 #include "lp_state.cuh"
 
 namespace cuopt::linear_programming::detail {
 
 struct relaxed_lp_settings_t {
-  double tolerance                  = 1e-4;
-  double time_limit                 = 1.0;
-  bool check_infeasibility          = true;
-  bool return_first_feasible        = false;
-  bool save_state                   = true;
-  bool per_constraint_residual      = true;
-  bool has_initial_primal           = true;
-  std::atomic<int>* concurrent_halt = nullptr;
+  double tolerance                          = 1e-4;
+  double time_limit                         = 1.0;
+  int iteration_limit                       = std::numeric_limits<int>::max();
+  double work_limit                         = std::numeric_limits<double>::infinity();
+  bool check_infeasibility                  = true;
+  bool return_first_feasible                = false;
+  bool save_state                           = true;  // store the LP primal/dual in lp_state
+  bool per_constraint_residual              = true;
+  bool has_initial_primal                   = true;  // warm-start the LP from the assignment
+  std::atomic<int>* concurrent_halt         = nullptr;
+  cuopt::work_limit_context_t* work_context = nullptr;  // optional sink for recorded work
 };
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/mip_heuristics/solution/solution.cu b/cpp/src/mip_heuristics/solution/solution.cu
index e4192c0195..db3bd7fedc 100644
--- a/cpp/src/mip_heuristics/solution/solution.cu
+++ b/cpp/src/mip_heuristics/solution/solution.cu
@@ -19,6 +19,8 @@
 #include <raft/sparse/detail/cusparse_wrappers.h>
 
 #include <thrust/count.h>
+#include <thrust/extrema.h>
+#include <thrust/fill.h>
 #include <thrust/transform_reduce.h>
 #include <cuda/functional>
 #include <raft/linalg/binary_op.cuh>
@@ -46,8 +48,6 @@ solution_t<i_t, f_t>::solution_t(problem_t<i_t, f_t>& problem_)
     assignment(std::move(get_lower_bounds<f_t>(problem_.variable_bounds, handle_ptr))),
     lower_excess(problem_.n_constraints, handle_ptr->get_stream()),
     upper_excess(problem_.n_constraints, handle_ptr->get_stream()),
-    lower_slack(problem_.n_constraints, handle_ptr->get_stream()),
-    upper_slack(problem_.n_constraints, handle_ptr->get_stream()),
     constraint_value(problem_.n_constraints, handle_ptr->get_stream()),
     obj_val(handle_ptr->get_stream()),
     n_feasible_constraints(handle_ptr->get_stream()),
@@ -56,6 +56,22 @@ solution_t<i_t, f_t>::solution_t(problem_t<i_t, f_t>& problem_)
   clamp_within_var_bounds(assignment, problem_ptr, handle_ptr);
 }
 
+template <typename i_t, typename f_t>
+solution_t<i_t, f_t>::solution_t(problem_t<i_t, f_t>& problem_,
+                                 const raft::handle_t* handle_override)  // must be non-null
+  : problem_ptr(&problem_),
+    handle_ptr(handle_override),  // dereferenced by every initializer below
+    assignment(std::move(get_lower_bounds<f_t>(problem_.variable_bounds, handle_ptr))),
+    lower_excess(problem_.n_constraints, handle_ptr->get_stream()),
+    upper_excess(problem_.n_constraints, handle_ptr->get_stream()),
+    constraint_value(problem_.n_constraints, handle_ptr->get_stream()),
+    obj_val(handle_ptr->get_stream()),
+    n_feasible_constraints(handle_ptr->get_stream()),
+    lp_state(problem_, handle_ptr->get_stream())
+{
+  clamp_within_var_bounds(assignment, problem_ptr, handle_ptr);  // runs on the override handle
+}
+
 template <typename i_t, typename f_t>
 solution_t<i_t, f_t>::solution_t(const solution_t<i_t, f_t>& other)
   : problem_ptr(other.problem_ptr),
@@ -63,8 +79,6 @@ solution_t<i_t, f_t>::solution_t(const solution_t<i_t, f_t>& other)
     assignment(other.assignment, handle_ptr->get_stream()),
     lower_excess(other.lower_excess, handle_ptr->get_stream()),
     upper_excess(other.upper_excess, handle_ptr->get_stream()),
-    lower_slack(other.lower_slack, handle_ptr->get_stream()),
-    upper_slack(other.upper_slack, handle_ptr->get_stream()),
     constraint_value(other.constraint_value, handle_ptr->get_stream()),
     obj_val(other.obj_val, handle_ptr->get_stream()),
     n_feasible_constraints(other.n_feasible_constraints, handle_ptr->get_stream()),
@@ -91,10 +105,18 @@ void solution_t<i_t, f_t>::copy_from(const solution_t<i_t, f_t>& other_sol)
   h_user_obj           = other_sol.h_user_obj;
   h_infeasibility_cost = other_sol.h_infeasibility_cost;
   expand_device_copy(assignment, other_sol.assignment, handle_ptr->get_stream());
+
+  // excess and constraint value may be uninitialized (and computed later). Mark them as
+  // initialized before copying (presumably to avoid initcheck false positives - confirm)
+  cuopt::mark_span_as_initialized(make_span(other_sol.lower_excess), handle_ptr->get_stream());
+  cuopt::mark_span_as_initialized(make_span(other_sol.upper_excess), handle_ptr->get_stream());
+  cuopt::mark_span_as_initialized(make_span(other_sol.constraint_value), handle_ptr->get_stream());
+  cuopt::mark_span_as_initialized(make_span(other_sol.obj_val), handle_ptr->get_stream());
+  cuopt::mark_span_as_initialized(make_span(other_sol.n_feasible_constraints),
+                                  handle_ptr->get_stream());
+
   expand_device_copy(lower_excess, other_sol.lower_excess, handle_ptr->get_stream());
   expand_device_copy(upper_excess, other_sol.upper_excess, handle_ptr->get_stream());
-  expand_device_copy(lower_slack, other_sol.lower_slack, handle_ptr->get_stream());
-  expand_device_copy(upper_slack, other_sol.upper_slack, handle_ptr->get_stream());
   expand_device_copy(constraint_value, other_sol.constraint_value, handle_ptr->get_stream());
   raft::copy(obj_val.data(), other_sol.obj_val.data(), 1, handle_ptr->get_stream());
   raft::copy(n_feasible_constraints.data(),
@@ -113,14 +135,26 @@ void solution_t<i_t, f_t>::copy_from(const solution_t<i_t, f_t>& other_sol)
 template <typename i_t, typename f_t>
 void solution_t<i_t, f_t>::resize_to_problem()
 {
+  const i_t prev_primal_size = lp_state.prev_primal.size();  // sizes before the resize below
+  const i_t prev_dual_size   = lp_state.prev_dual.size();
   assignment.resize(problem_ptr->n_variables, handle_ptr->get_stream());
   lower_excess.resize(problem_ptr->n_constraints, handle_ptr->get_stream());
   upper_excess.resize(problem_ptr->n_constraints, handle_ptr->get_stream());
-  lower_slack.resize(problem_ptr->n_constraints, handle_ptr->get_stream());
-  upper_slack.resize(problem_ptr->n_constraints, handle_ptr->get_stream());
   constraint_value.resize(problem_ptr->n_constraints, handle_ptr->get_stream());
   lp_state.prev_primal.resize(problem_ptr->n_variables, handle_ptr->get_stream());
   lp_state.prev_dual.resize(problem_ptr->n_constraints, handle_ptr->get_stream());
+  if (prev_primal_size < problem_ptr->n_variables) {  // grew: zero only the appended entries
+    thrust::fill_n(handle_ptr->get_thrust_policy(),
+                   lp_state.prev_primal.data() + prev_primal_size,
+                   problem_ptr->n_variables - prev_primal_size,
+                   f_t(0));
+  }
+  if (prev_dual_size < problem_ptr->n_constraints) {  // same for the saved dual state
+    thrust::fill_n(handle_ptr->get_thrust_policy(),
+                   lp_state.prev_dual.data() + prev_dual_size,
+                   problem_ptr->n_constraints - prev_dual_size,
+                   f_t(0));
+  }
 }
 
 template <typename i_t, typename f_t>
@@ -131,10 +165,6 @@ void solution_t<i_t, f_t>::resize_to_original_problem()
                       handle_ptr->get_stream());
   upper_excess.resize(problem_ptr->original_problem_ptr->get_n_constraints(),
                       handle_ptr->get_stream());
-  lower_slack.resize(problem_ptr->original_problem_ptr->get_n_constraints(),
-                     handle_ptr->get_stream());
-  upper_slack.resize(problem_ptr->original_problem_ptr->get_n_constraints(),
-                     handle_ptr->get_stream());
   constraint_value.resize(problem_ptr->original_problem_ptr->get_n_constraints(),
                           handle_ptr->get_stream());
   lp_state.prev_primal.resize(problem_ptr->original_problem_ptr->get_n_variables(),
@@ -149,8 +179,6 @@ void solution_t<i_t, f_t>::resize_copy(const solution_t<i_t, f_t>& other_sol)
   assignment.resize(other_sol.assignment.size(), handle_ptr->get_stream());
   lower_excess.resize(other_sol.lower_excess.size(), handle_ptr->get_stream());
   upper_excess.resize(other_sol.upper_excess.size(), handle_ptr->get_stream());
-  lower_slack.resize(other_sol.lower_slack.size(), handle_ptr->get_stream());
-  upper_slack.resize(other_sol.upper_slack.size(), handle_ptr->get_stream());
   constraint_value.resize(other_sol.constraint_value.size(), handle_ptr->get_stream());
   lp_state.prev_primal.resize(other_sol.lp_state.prev_primal.size(), handle_ptr->get_stream());
   lp_state.prev_dual.resize(other_sol.lp_state.prev_dual.size(), handle_ptr->get_stream());
@@ -165,8 +193,6 @@ typename solution_t<i_t, f_t>::view_t solution_t<i_t, f_t>::view()
   v.assignment       = raft::device_span<f_t>{assignment.data(), assignment.size()};
   v.lower_excess     = raft::device_span<f_t>{lower_excess.data(), lower_excess.size()};
   v.upper_excess     = raft::device_span<f_t>{upper_excess.data(), upper_excess.size()};
-  v.lower_slack      = raft::device_span<f_t>{lower_slack.data(), lower_slack.size()};
-  v.upper_slack      = raft::device_span<f_t>{upper_slack.data(), upper_slack.size()};
   v.constraint_value = raft::device_span<f_t>{constraint_value.data(), constraint_value.size()};
   v.obj_val          = obj_val.data();
   v.n_feasible_constraints = n_feasible_constraints.data();
@@ -235,7 +261,7 @@ void solution_t<i_t, f_t>::assign_random_within_bounds(f_t ratio_of_vars_to_rand
 
   auto variable_bounds = cuopt::host_copy(problem_ptr->variable_bounds, stream);
   auto variable_types  = cuopt::host_copy(problem_ptr->variable_types, stream);
-  problem_ptr->handle_ptr->sync_stream();
+  handle_ptr->sync_stream();
   for (size_t i = 0; i < problem_ptr->variable_bounds.size(); ++i) {
     if (only_integers && variable_types[i] != var_t::INTEGER) { continue; }
     bool skip = unif_prob(rng) > ratio_of_vars_to_random_assign;
@@ -642,6 +668,14 @@ mip_solution_t<i_t, f_t> solution_t<i_t, f_t>::get_solution(bool output_feasible
   }
 }
 
+template <typename i_t, typename f_t>
+uint32_t solution_t<i_t, f_t>::get_hash() const
+{
+  // Hashes a host copy of the first problem_ptr->n_variables entries of the device assignment.
+  auto values = host_copy(assignment.data(), problem_ptr->n_variables, handle_ptr->get_stream());
+  return compute_hash(values);
+}
+
 #if MIP_INSTANTIATE_FLOAT || PDLP_INSTANTIATE_FLOAT
 template class solution_t<int, float>;
 #endif
diff --git a/cpp/src/mip_heuristics/solution/solution.cuh b/cpp/src/mip_heuristics/solution/solution.cuh
index 9de10ed980..c0f3c539e7 100644
--- a/cpp/src/mip_heuristics/solution/solution.cuh
+++ b/cpp/src/mip_heuristics/solution/solution.cuh
@@ -25,6 +25,7 @@ template <typename i_t, typename f_t>
 class solution_t {
  public:
   solution_t(problem_t<i_t, f_t>& problem);
+  solution_t(problem_t<i_t, f_t>& problem, const raft::handle_t* handle_override);
   solution_t(const solution_t<i_t, f_t>& other);
   solution_t& operator=(solution_t<i_t, f_t>&& other) noexcept = default;
   solution_t(solution_t<i_t, f_t>&& other)                     = default;
@@ -99,6 +100,7 @@ class solution_t {
   f_t compute_max_constraint_violation();
   f_t compute_max_int_violation();
   f_t compute_max_variable_violation();
+  uint32_t get_hash() const;
 
   struct view_t {
     // let's not bloat the class for every simple getter and setters
@@ -112,8 +114,6 @@ class solution_t {
     raft::device_span<f_t> assignment;
     raft::device_span<f_t> lower_excess;
     raft::device_span<f_t> upper_excess;
-    raft::device_span<f_t> lower_slack;
-    raft::device_span<f_t> upper_slack;
     raft::device_span<f_t> constraint_value;
     f_t* obj_val;
     i_t* n_feasible_constraints;
@@ -128,8 +128,6 @@ class solution_t {
   rmm::device_uvector<f_t> assignment;
   rmm::device_uvector<f_t> lower_excess;
   rmm::device_uvector<f_t> upper_excess;
-  rmm::device_uvector<f_t> lower_slack;
-  rmm::device_uvector<f_t> upper_slack;
   rmm::device_uvector<f_t> constraint_value;
   rmm::device_scalar<f_t> obj_val;
   rmm::device_scalar<i_t> n_feasible_constraints;
diff --git a/cpp/src/mip_heuristics/solution_callbacks.cuh b/cpp/src/mip_heuristics/solution_callbacks.cuh
new file mode 100644
index 0000000000..b6f3ded8d6
--- /dev/null
+++ b/cpp/src/mip_heuristics/solution_callbacks.cuh
@@ -0,0 +1,223 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <cuopt/linear_programming/mip/solver_stats.hpp>
+
+#include <mip_heuristics/problem/problem.cuh>
+#include <mip_heuristics/solution/solution.cuh>
+
+#include <limits>
+#include <mutex>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+struct solution_callback_payload_t {
+  std::vector<f_t> assignment{};  // host-side solution values handed to the user callbacks
+  f_t user_objective{};           // sol.get_user_objective(); the value reported to callbacks
+  f_t solver_objective{};         // sol.get_objective(); compared against the best published
+  internals::mip_solution_callback_info_t callback_info{};  // origin and work timestamp
+};
+
+// Hands the incumbent to every GET_SOLUTION / GET_SOLUTION_EXT callback in `user_callbacks`.
+// Callbacks of any other type, and null entries, are skipped. Each callback receives its own
+// copies of the assignment, objective and bound, so whatever it writes through those pointers
+// is not seen by the caller or by later callbacks. Only the EXT variant receives
+// `callback_info`.
+template <typename f_t>
+void dispatch_get_solution_callbacks(
+  const std::vector<internals::base_solution_callback_t*>& user_callbacks,
+  const std::vector<f_t>& assignment,
+  f_t user_objective,
+  f_t solution_bound,
+  const internals::mip_solution_callback_info_t& callback_info)
+{
+  for (auto callback : user_callbacks) {
+    if (callback == nullptr) { continue; }
+    const auto type   = callback->get_type();
+    const bool is_ext = type == internals::base_solution_callback_type::GET_SOLUTION_EXT;
+    if (!is_ext && type != internals::base_solution_callback_type::GET_SOLUTION) { continue; }
+
+    // The callbacks are given raw pointers into these buffers, hence one fresh set per callback.
+    std::vector<f_t> sol_copy(assignment);
+    std::vector<f_t> obj_copy(1, user_objective);
+    std::vector<f_t> bound_copy(1, solution_bound);
+    if (is_ext) {
+      auto ext = static_cast<internals::get_solution_callback_ext_t*>(callback);
+      ext->get_solution(
+        sol_copy.data(), obj_copy.data(), bound_copy.data(), &callback_info, ext->get_user_data());
+    } else {
+      auto cb = static_cast<internals::get_solution_callback_t*>(callback);
+      cb->get_solution(sol_copy.data(), obj_copy.data(), bound_copy.data(), cb->get_user_data());
+    }
+  }
+}
+
+template <typename i_t, typename f_t>
+class solution_publication_t {  // forwards improving feasible incumbents to user callbacks
+ public:
+  solution_publication_t(const mip_solver_settings_t<i_t, f_t>& settings_,
+                         solver_stats_t<i_t, f_t>& stats_)
+    : settings(settings_), stats(stats_)
+  {
+  }
+  // Sets the objective the next incumbent must beat. Note: written without taking the mutex.
+  void reset_published_best(f_t objective = std::numeric_limits<f_t>::max())
+  {
+    best_callback_feasible_objective_ = objective;
+  }
+  // Builds the host-side payload for `sol`; the assignment transforms run on a copy of `sol`.
+  solution_callback_payload_t<i_t, f_t> build_callback_payload(
+    problem_t<i_t, f_t>* problem_ptr,
+    solution_t<i_t, f_t>& sol,
+    internals::mip_solution_origin_t origin,
+    double work_timestamp)
+  {
+    cuopt_assert(problem_ptr != nullptr, "Callback payload problem pointer must not be null");
+    cuopt_assert(work_timestamp >= 0.0, "work_timestamp must not be negative");
+    solution_callback_payload_t<i_t, f_t> payload{};
+    payload.user_objective               = sol.get_user_objective();
+    payload.solver_objective             = sol.get_objective();
+    payload.callback_info.origin         = (uint32_t)origin;
+    payload.callback_info.work_timestamp = work_timestamp;
+    solution_t<i_t, f_t> temp_sol(sol);  // scratch copy; the transforms below act on it
+    CUOPT_LOG_DEBUG("build_callback_payload: pre_postprocess size=%zu handle=%p problem_handle=%p",
+                    temp_sol.assignment.size(),
+                    (void*)sol.handle_ptr,
+                    (void*)problem_ptr->handle_ptr);
+    problem_ptr->post_process_assignment(temp_sol.assignment, true, sol.handle_ptr);
+    CUOPT_LOG_DEBUG("build_callback_payload: post_postprocess size=%zu",
+                    temp_sol.assignment.size());
+    if (problem_ptr->has_papilo_presolve_data()) {
+      // NOTE(review): the value logged as "papilo_reduced_size" is the *original* count - confirm
+      CUOPT_LOG_DEBUG("build_callback_payload: pre_papilo size=%zu papilo_reduced_size=%zu",
+                      temp_sol.assignment.size(),
+                      problem_ptr->get_papilo_original_num_variables());
+      problem_ptr->papilo_uncrush_assignment(temp_sol.assignment, sol.handle_ptr);
+      CUOPT_LOG_DEBUG("build_callback_payload: post_papilo size=%zu", temp_sol.assignment.size());
+    }
+    payload.assignment = cuopt::host_copy(temp_sol.assignment, temp_sol.handle_ptr->get_stream());
+    CUOPT_LOG_DEBUG("build_callback_payload: final payload size=%zu obj=%.6g origin=%s",
+                    payload.assignment.size(),
+                    payload.user_objective,
+                    internals::mip_solution_origin_to_string(origin));
+    return payload;
+  }
+  // Publishes `payload` only if its solver objective is strictly below the best published one.
+  bool publish_new_best_feasible(const solution_callback_payload_t<i_t, f_t>& payload,
+                                 double elapsed_time = -1.0)  // negative: time is not recorded
+  {
+    std::lock_guard<std::mutex> lock(solution_callback_mutex_);  // held while callbacks run
+    cuopt_assert(std::isfinite(payload.solver_objective),
+                 "Feasible incumbent objective must be finite");
+    if (!(payload.solver_objective < best_callback_feasible_objective_)) { return false; }
+
+    best_callback_feasible_objective_ = payload.solver_objective;
+    if (settings.benchmark_info_ptr != nullptr && elapsed_time >= 0.0) {
+      settings.benchmark_info_ptr->last_improvement_of_best_feasible = elapsed_time;
+    }
+    invoke_get_solution_callbacks(payload);
+    return true;
+  }
+
+ private:
+  void invoke_get_solution_callbacks(const solution_callback_payload_t<i_t, f_t>& payload)
+  {
+    auto user_callbacks = settings.get_mip_callbacks();
+    CUOPT_LOG_DEBUG("Publishing incumbent: obj=%g wut=%.6f origin=%s callbacks=%zu",
+                    payload.user_objective,
+                    payload.callback_info.work_timestamp,
+                    internals::mip_solution_origin_to_string(
+                      (internals::mip_solution_origin_t)payload.callback_info.origin),
+                    user_callbacks.size());
+    dispatch_get_solution_callbacks(user_callbacks,
+                                    payload.assignment,
+                                    payload.user_objective,
+                                    stats.get_solution_bound(),
+                                    payload.callback_info);
+  }
+  // settings and stats are held by reference: both must outlive this object.
+  const mip_solver_settings_t<i_t, f_t>& settings;
+  solver_stats_t<i_t, f_t>& stats;
+  std::mutex solution_callback_mutex_;  // taken by publish_new_best_feasible for its whole body
+  f_t best_callback_feasible_objective_{std::numeric_limits<f_t>::max()};
+};
+
+// Processes SET_SOLUTION user callbacks: invokes the callback, validates/preprocesses the
+// assignment it provides, and passes the result to `on_injected` for the caller to reinject.
+template <typename i_t, typename f_t>
+class solution_injection_t {
+ public:
+  solution_injection_t(const mip_solver_settings_t<i_t, f_t>& settings_,
+                       solver_stats_t<i_t, f_t>& stats_)
+    : settings(settings_), stats(stats_)
+  {
+  }
+  // Calls on_injected(host assignment, solver objective, USER_INJECTED) per accepted solution.
+  template <typename OnInjectedFn>
+  void invoke_set_solution_callbacks(problem_t<i_t, f_t>* problem_ptr,
+                                     solution_t<i_t, f_t>& current_incumbent,
+                                     OnInjectedFn&& on_injected)
+  {
+    auto user_callbacks = settings.get_mip_callbacks();
+    for (auto callback : user_callbacks) {
+      if (callback->get_type() != internals::base_solution_callback_type::SET_SOLUTION) {
+        continue;
+      }
+      auto set_sol_callback       = static_cast<internals::set_solution_callback_t*>(callback);
+      f_t user_bound              = stats.get_solution_bound();
+      auto callback_num_variables = problem_ptr->original_problem_ptr->get_n_variables();
+      rmm::device_uvector<f_t> incumbent_assignment(callback_num_variables,
+                                                    current_incumbent.handle_ptr->get_stream());
+      auto inf = std::numeric_limits<f_t>::infinity();
+      current_incumbent.handle_ptr->sync_stream();
+      std::vector<f_t> h_incumbent_assignment(incumbent_assignment.size());  // starts zeroed
+      std::vector<f_t> h_outside_sol_objective(1, inf);  // still inf afterwards => skipped below
+      std::vector<f_t> h_user_bound(1, user_bound);
+      set_sol_callback->set_solution(h_incumbent_assignment.data(),
+                                     h_outside_sol_objective.data(),
+                                     h_user_bound.data(),
+                                     set_sol_callback->get_user_data());
+      f_t outside_sol_objective = h_outside_sol_objective[0];
+      if (outside_sol_objective == inf) { continue; }
+      // Upload the host values, then let the problem preprocess them; rejected input is dropped.
+      raft::copy(incumbent_assignment.data(),
+                 h_incumbent_assignment.data(),
+                 incumbent_assignment.size(),
+                 current_incumbent.handle_ptr->get_stream());
+      bool is_valid = problem_ptr->pre_process_assignment(incumbent_assignment);
+      if (!is_valid) { continue; }
+      // Evaluate the candidate on a copy so current_incumbent's assignment is not overwritten.
+      solution_t<i_t, f_t> outside_sol(current_incumbent);
+      cuopt_assert(outside_sol.assignment.size() == incumbent_assignment.size(),
+                   "Incumbent assignment size mismatch");
+      raft::copy(outside_sol.assignment.data(),
+                 incumbent_assignment.data(),
+                 incumbent_assignment.size(),
+                 current_incumbent.handle_ptr->get_stream());
+      outside_sol.compute_feasibility();
+
+      CUOPT_LOG_DEBUG("Injected solution feasibility = %d objective = %g excess = %g",
+                      outside_sol.get_feasible(),
+                      outside_sol.get_user_objective(),
+                      outside_sol.get_total_excess());
+      cuopt_assert(std::abs(outside_sol.get_user_objective() - outside_sol_objective) <= 1e-6,
+                   "External solution objective mismatch");
+      on_injected(outside_sol.get_host_assignment(),
+                  outside_sol.get_objective(),
+                  internals::mip_solution_origin_t::USER_INJECTED);
+    }
+  }
+
+ private:
+  const mip_solver_settings_t<i_t, f_t>& settings;
+  solver_stats_t<i_t, f_t>& stats;
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu
index be01516657..521f0e16b7 100644
--- a/cpp/src/mip_heuristics/solve.cu
+++ b/cpp/src/mip_heuristics/solve.cu
@@ -17,6 +17,7 @@
 #include <mip_heuristics/solver.cuh>
 #include <mip_heuristics/utilities/sort_csr.cuh>
 #include <mip_heuristics/utils.cuh>
+#include <utilities/determinism_log.hpp>
 
 #include <pdlp/pdlp.cuh>
 #include <pdlp/restart_strategy/pdlp_restart_strategy.cuh>
@@ -26,6 +27,7 @@
 #include <utilities/logger.hpp>
 #include <utilities/seed_generator.cuh>
 #include <utilities/version_info.hpp>
+#include <utilities/work_limit_timer.hpp>
 
 #include <cuopt/linear_programming/backend_selection.hpp>
 #include <cuopt/linear_programming/cpu_optimization_problem.hpp>
@@ -60,34 +62,21 @@ static void init_handler(const raft::handle_t* handle_ptr)
     handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
 }
 
-template <typename f_t>
-static void invoke_solution_callbacks(
-  const std::vector<internals::base_solution_callback_t*>& mip_callbacks,
-  f_t objective,
-  std::vector<f_t>& assignment,
-  f_t bound)
-{
-  std::vector<f_t> obj_vec   = {objective};
-  std::vector<f_t> bound_vec = {bound};
-  for (auto callback : mip_callbacks) {
-    if (callback != nullptr &&
-        callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) {
-      auto get_sol_callback = static_cast<internals::get_solution_callback_t*>(callback);
-      get_sol_callback->get_solution(
-        assignment.data(), obj_vec.data(), bound_vec.data(), get_sol_callback->get_user_data());
-    }
-  }
-}
-
 template <typename i_t, typename f_t>
 mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
                                  mip_solver_settings_t<i_t, f_t> const& settings,
-                                 timer_t& timer,
+                                 cuopt::termination_checker_t& timer,
                                  f_t& initial_upper_bound,
                                  std::vector<f_t>& initial_incumbent_assignment)
 {
+  raft::common::nvtx::range fun_scope("run_mip");
   try {
-    raft::common::nvtx::range fun_scope("run_mip");
+    auto constexpr const running_mip = true;
+
+    // TODO ask Akif and Alice how was this passed down?
+    [[maybe_unused]] auto hyper_params                    = settings.hyper_params;
+    hyper_params.update_primal_weight_on_initial_solution = false;
+    hyper_params.update_step_size_on_initial_solution     = true;
     if (settings.get_mip_callbacks().size() > 0) {
       auto callback_num_variables = problem.original_problem_ptr->get_n_variables();
       if (problem.has_papilo_presolve_data()) {
@@ -115,34 +104,26 @@ mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
       stats.set_solution_bound(solution.get_user_objective());
       // log the objective for scripts which need it
       CUOPT_LOG_INFO("Best feasible: %f", solution.get_user_objective());
-      for (auto callback : settings.get_mip_callbacks()) {
-        if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) {
-          auto temp_sol(solution);
-          auto get_sol_callback = static_cast<internals::get_solution_callback_t*>(callback);
-          std::vector<f_t> user_objective_vec(1);
-          std::vector<f_t> user_bound_vec(1);
-          user_objective_vec[0] = solution.get_user_objective();
-          user_bound_vec[0]     = stats.get_solution_bound();
-          if (problem.has_papilo_presolve_data()) {
-            problem.papilo_uncrush_assignment(temp_sol.assignment);
-          }
-          std::vector<f_t> user_assignment_vec(temp_sol.assignment.size());
-          raft::copy(user_assignment_vec.data(),
-                     temp_sol.assignment.data(),
-                     temp_sol.assignment.size(),
-                     temp_sol.handle_ptr->get_stream());
-          solution.handle_ptr->sync_stream();
-          get_sol_callback->get_solution(user_assignment_vec.data(),
-                                         user_objective_vec.data(),
-                                         user_bound_vec.data(),
-                                         get_sol_callback->get_user_data());
+      {
+        detail::solution_callback_payload_t<i_t, f_t> payload{};
+        payload.user_objective               = solution.get_user_objective();
+        payload.solver_objective             = solution.get_objective();
+        payload.callback_info.origin         = (uint32_t)internals::mip_solution_origin_t::PRESOLVE;
+        payload.callback_info.work_timestamp = 0.0;
+        detail::solution_t<i_t, f_t> temp_sol(solution);
+        if (problem.has_papilo_presolve_data()) {
+          problem.papilo_uncrush_assignment(temp_sol.assignment);
         }
+        payload.assignment = temp_sol.get_host_assignment();
+        detail::solution_publication_t<i_t, f_t> pub(settings, stats);
+        pub.publish_new_best_feasible(payload);
       }
       return solution.get_solution(true, stats, false);
     }
+
     // problem contains unpreprocessed data
     detail::problem_t<i_t, f_t> scaled_problem(problem);
-    cuopt_func_call(auto saved_problem = scaled_problem);
+
     CUOPT_LOG_INFO("Objective offset %f scaling_factor %f",
                    problem.presolve_data.objective_offset,
                    problem.presolve_data.objective_scaling_factor);
@@ -151,6 +132,7 @@ mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
                  "Size mismatch");
     cuopt_assert(problem.original_problem_ptr->get_n_constraints() == scaled_problem.n_constraints,
                  "Size mismatch");
+
     // only call preprocess on scaled problem, so we can compute feasibility on the original problem
     scaled_problem.preprocess_problem();
     scaled_problem.related_vars_time_limit = settings.heuristic_params.related_vars_time_limit;
@@ -178,33 +160,37 @@ mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
     // via problem.get_solver_obj_from_user_obj.
     std::unique_ptr<detail::early_cpufj_t<i_t, f_t>> early_cpufj;
     bool run_early_cpufj = problem.has_papilo_presolve_data() &&
-                           settings.determinism_mode != CUOPT_MODE_DETERMINISTIC &&
+                           settings.determinism_mode == CUOPT_DETERMINISM_NONE &&
                            problem.original_problem_ptr->get_n_integers() > 0;
     if (run_early_cpufj) {
       auto early_fj_start = std::chrono::steady_clock::now();
       auto* presolver_ptr = problem.presolve_data.papilo_presolve_ptr;
       auto mip_callbacks  = settings.get_mip_callbacks();
       f_t no_bound = problem.presolve_data.objective_scaling_factor >= 0 ? (f_t)-1e20 : (f_t)1e20;
-      auto incumbent_callback = [presolver_ptr,
-                                 mip_callbacks,
-                                 no_bound,
-                                 ctx_ptr = &solver.context,
-                                 early_fj_start](f_t solver_obj,
-                                                 f_t user_obj,
-                                                 const std::vector<f_t>& assignment,
-                                                 const char* heuristic_name) {
-        std::vector<f_t> user_assignment;
-        presolver_ptr->uncrush_primal_solution(assignment, user_assignment);
-        ctx_ptr->initial_incumbent_assignment = user_assignment;
-        ctx_ptr->initial_upper_bound          = user_obj;
-        double elapsed =
-          std::chrono::duration<double>(std::chrono::steady_clock::now() - early_fj_start).count();
-        CUOPT_LOG_INFO("New solution from early primal heuristics (%s). Objective %+.6e. Time %.2f",
-                       heuristic_name,
-                       user_obj,
-                       elapsed);
-        invoke_solution_callbacks(mip_callbacks, user_obj, user_assignment, no_bound);
-      };
+      detail::early_incumbent_callback_t<f_t> incumbent_callback =
+        [presolver_ptr, mip_callbacks, no_bound, ctx_ptr = &solver.context, early_fj_start](
+          f_t solver_obj,
+          f_t user_obj,
+          const std::vector<f_t>& assignment,
+          internals::mip_solution_origin_t origin) {
+          std::vector<f_t> user_assignment;
+          presolver_ptr->uncrush_primal_solution(assignment, user_assignment);
+          ctx_ptr->initial_incumbent_assignment = user_assignment;
+          ctx_ptr->initial_upper_bound          = user_obj;
+          double elapsed =
+            std::chrono::duration<double>(std::chrono::steady_clock::now() - early_fj_start)
+              .count();
+          CUOPT_LOG_INFO(
+            "New solution from early primal heuristics (%s). Objective %+.6e. Time %.2f",
+            internals::mip_solution_origin_to_string(origin),
+            user_obj,
+            elapsed);
+          internals::mip_solution_callback_info_t callback_info{};
+          callback_info.origin         = (uint32_t)origin;
+          callback_info.work_timestamp = 0.0;
+          detail::dispatch_get_solution_callbacks(
+            mip_callbacks, user_assignment, user_obj, no_bound, callback_info);
+        };
       early_cpufj = std::make_unique<detail::early_cpufj_t<i_t, f_t>>(
         *problem.original_problem_ptr, settings.get_tolerances(), incumbent_callback);
       // Convert initial_upper_bound from user-space to the CPUFJ's solver-space (papilo-presolved).
@@ -216,7 +202,6 @@ mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
       solver.context.early_cpufj_ptr = early_cpufj.get();
       CUOPT_LOG_DEBUG("Started early CPUFJ on papilo-presolved problem during cuOpt presolve");
     }
-
     auto presolved_sol            = solver.run_solver();
     bool is_feasible_on_presolved = presolved_sol.get_feasible();
     presolved_sol.problem_ptr     = &problem;
@@ -277,6 +262,15 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
 
     // Initialize seed generator if a specific seed is requested
     if (settings.seed >= 0) { cuopt::seed_generator::set_seed(settings.seed); }
+    CUOPT_DETERMINISM_LOG(
+      "Deterministic solve start settings: seed=%lld seed_state=%lld det_mode=%d "
+      "work_limit=%.6f max_cut_passes=%d num_cpu_threads=%d",
+      (long long)settings.seed,
+      (long long)cuopt::seed_generator::peek_seed(),
+      (int)settings.determinism_mode,
+      (double)settings.work_limit,
+      settings.max_cut_passes,
+      settings.num_cpu_threads);
 
     raft::common::nvtx::range fun_scope("Running solver");
 
@@ -303,7 +297,9 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
       callback->template setup<f_t>(op_problem.get_n_variables());
     }
 
-    auto timer = timer_t(time_limit);
+    auto timer =
+      cuopt::termination_checker_t(time_limit, cuopt::termination_checker_t::root_tag_t{});
+    const bool deterministic_run = (settings.determinism_mode != CUOPT_DETERMINISM_NONE);
     if (settings.mip_scaling != CUOPT_MIP_SCALING_OFF) {
       detail::mip_scaling_strategy_t<i_t, f_t> scaling(op_problem);
       scaling.scale_problem(settings.mip_scaling != CUOPT_MIP_SCALING_NO_OBJECTIVE);
@@ -311,8 +307,7 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
     double presolve_time = 0.0;
     std::unique_ptr<detail::third_party_presolve_t<i_t, f_t>> presolver;
     std::optional<detail::third_party_presolve_result_t<i_t, f_t>> presolve_result_opt;
-    detail::problem_t<i_t, f_t> problem(
-      op_problem, settings.get_tolerances(), settings.determinism_mode == CUOPT_MODE_DETERMINISTIC);
+    detail::problem_t<i_t, f_t> problem(op_problem, settings.get_tolerances(), deterministic_run);
 
     auto run_presolve              = settings.presolver != presolver_t::None;
     run_presolve                   = run_presolve && settings.initial_solutions.size() == 0;
@@ -347,35 +342,41 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
     std::vector<f_t> early_best_user_assignment;
     std::mutex early_callback_mutex;
 
-    bool run_early_fj = run_presolve && settings.determinism_mode != CUOPT_MODE_DETERMINISTIC &&
+    bool run_early_fj = run_presolve && settings.determinism_mode == CUOPT_DETERMINISM_NONE &&
                         op_problem.get_n_integers() > 0 && op_problem.get_n_constraints() > 0;
     f_t no_bound = problem.presolve_data.objective_scaling_factor >= 0 ? (f_t)-1e20 : (f_t)1e20;
     if (run_early_fj) {
-      auto early_fj_start    = std::chrono::steady_clock::now();
-      auto early_fj_callback = [&early_best_objective,
-                                &early_best_user_obj,
-                                &early_best_user_assignment,
-                                &early_callback_mutex,
-                                &early_fj_start,
-                                mip_callbacks = settings.get_mip_callbacks(),
-                                no_bound](f_t solver_obj,
-                                          f_t user_obj,
-                                          const std::vector<f_t>& assignment,
-                                          const char* heuristic_name) {
-        std::lock_guard<std::mutex> lock(early_callback_mutex);
-        if (solver_obj >= early_best_objective.load()) { return; }
-        early_best_objective.store(solver_obj);
-        early_best_user_obj        = user_obj;
-        early_best_user_assignment = assignment;
-        double elapsed =
-          std::chrono::duration<double>(std::chrono::steady_clock::now() - early_fj_start).count();
-        CUOPT_LOG_INFO("New solution from early primal heuristics (%s). Objective %+.6e. Time %.2f",
-                       heuristic_name,
-                       user_obj,
-                       elapsed);
-        auto user_assignment = assignment;
-        invoke_solution_callbacks(mip_callbacks, user_obj, user_assignment, no_bound);
-      };
+      auto early_fj_start = std::chrono::steady_clock::now();
+      detail::early_incumbent_callback_t<f_t> early_fj_callback =
+        [&early_best_objective,
+         &early_best_user_obj,
+         &early_best_user_assignment,
+         &early_callback_mutex,
+         &early_fj_start,
+         mip_callbacks = settings.get_mip_callbacks(),
+         no_bound](f_t solver_obj,
+                   f_t user_obj,
+                   const std::vector<f_t>& assignment,
+                   internals::mip_solution_origin_t origin) {
+          std::lock_guard<std::mutex> lock(early_callback_mutex);
+          if (solver_obj >= early_best_objective.load()) { return; }
+          early_best_objective.store(solver_obj);
+          early_best_user_obj        = user_obj;
+          early_best_user_assignment = assignment;
+          internals::mip_solution_callback_info_t callback_info{};
+          callback_info.origin         = (uint32_t)origin;
+          callback_info.work_timestamp = 0.0;
+          double elapsed =
+            std::chrono::duration<double>(std::chrono::steady_clock::now() - early_fj_start)
+              .count();
+          CUOPT_LOG_INFO(
+            "New solution from early primal heuristics (%s). Objective %+.6e. Time %.2f",
+            internals::mip_solution_origin_to_string(origin),
+            user_obj,
+            elapsed);
+          detail::dispatch_get_solution_callbacks(
+            mip_callbacks, assignment, user_obj, no_bound, callback_info);
+        };
 
       // Start early CPUFJ on original problem (will restart on presolved problem after Papilo)
       early_cpufj = std::make_unique<detail::early_cpufj_t<i_t, f_t>>(
@@ -398,10 +399,9 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
       const auto& hp = settings.heuristic_params;
       double presolve_time_limit =
         std::min(hp.presolve_time_ratio * time_limit, hp.presolve_max_time);
-      if (settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) {
-        presolve_time_limit = std::numeric_limits<double>::infinity();
-      }
-      presolver   = std::make_unique<detail::third_party_presolve_t<i_t, f_t>>();
+      if (deterministic_run) { presolve_time_limit = timer.remaining_time(); }
+      presolver = std::make_unique<detail::third_party_presolve_t<i_t, f_t>>();
+      presolver->set_deterministic(deterministic_run);
       auto result = presolver->apply(op_problem,
                                      cuopt::linear_programming::problem_category_t::MIP,
                                      settings.presolver,
@@ -428,7 +428,8 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
       }
       presolve_result_opt.emplace(std::move(result));
 
-      problem = detail::problem_t<i_t, f_t>(presolve_result_opt->reduced_problem);
+      problem = detail::problem_t<i_t, f_t>(
+        presolve_result_opt->reduced_problem, settings.get_tolerances(), deterministic_run);
       problem.set_papilo_presolve_data(presolver.get(),
                                        presolve_result_opt->reduced_to_original_map,
                                        presolve_result_opt->original_to_reduced_map,
@@ -499,7 +500,8 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
                      reduced_costs.data(),
                      reduced_costs.data() + reduced_costs.size(),
                      std::numeric_limits<f_t>::signaling_NaN());
-        detail::problem_t<i_t, f_t> full_problem(op_problem);
+        detail::problem_t<i_t, f_t> full_problem(
+          op_problem, settings.get_tolerances(), deterministic_run);
         detail::solution_t<i_t, f_t> full_sol(full_problem);
         full_sol.copy_new_assignment(
           cuopt::host_copy(primal_solution, op_problem.get_handle_ptr()->get_stream()));
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index ce6b602fba..35bce62acf 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -18,6 +18,7 @@
 #include <dual_simplex/simplex_solver_settings.hpp>
 #include <dual_simplex/solve.hpp>
 #include <mip_heuristics/feasibility_jump/early_cpufj.cuh>
+#include <utilities/determinism_log.hpp>
 
 #include <raft/sparse/detail/cusparse_wrappers.h>
 #include <raft/core/cusparse_macros.hpp>
@@ -25,7 +26,12 @@
 #include <cmath>
 #include <future>
 #include <memory>
-#include <thread>
+
+// Flip to `#if 1` to route CUOPT_DETERMINISM_LOG to CUOPT_LOG_INFO (verbose determinism logs)
+#if 0
+#undef CUOPT_DETERMINISM_LOG
+#define CUOPT_DETERMINISM_LOG(...) CUOPT_LOG_INFO(__VA_ARGS__)
+#endif
 
 namespace cuopt::linear_programming::detail {
 
@@ -42,25 +48,46 @@ static void init_handler(const raft::handle_t* handle_ptr)
 template <typename i_t, typename f_t>
 mip_solver_t<i_t, f_t>::mip_solver_t(const problem_t<i_t, f_t>& op_problem,
                                      const mip_solver_settings_t<i_t, f_t>& solver_settings,
-                                     timer_t timer)
+                                     cuopt::termination_checker_t& timer)
   : op_problem_(op_problem),
     solver_settings_(solver_settings),
     context(op_problem.handle_ptr, const_cast<problem_t<i_t, f_t>*>(&op_problem), solver_settings),
     timer_(timer)
 {
+  context.termination = &timer_;  // address of the caller-owned checker (timer_ is a reference)
   init_handler(op_problem.handle_ptr);
 }
 
 template <typename i_t, typename f_t>
-struct branch_and_bound_solution_helper_t {
-  branch_and_bound_solution_helper_t(diversity_manager_t<i_t, f_t>* dm,
-                                     dual_simplex::simplex_solver_settings_t<i_t, f_t>& settings)
-    : dm(dm), settings_(settings) {};
-
-  void solution_callback(std::vector<f_t>& solution, f_t objective)
+struct bb_callback_adapter_t {
+  bb_callback_adapter_t(mip_solver_context_t<i_t, f_t>* context, diversity_manager_t<i_t, f_t>* dm)
+    : context(context), dm(dm) {};
+
+  void new_incumbent_callback(std::vector<f_t>& solution,
+                              f_t objective,
+                              const internals::mip_solution_callback_info_t& info,
+                              double work_timestamp)
   {
-    dm->population.add_external_solution(solution, objective, solution_origin_t::BRANCH_AND_BOUND);
-    dm->rins.new_best_incumbent_callback(solution);
+    if (context->settings.determinism_mode & CUOPT_DETERMINISM_BB) {
+      // B&B calls this from its own thread. Use a dedicated per-thread stream
+      // to avoid racing on the heuristic thread's stream.
+      raft::handle_t callback_handle(rmm::cuda_stream_per_thread);
+      solution_t<i_t, f_t> temp_sol(*context->problem_ptr, &callback_handle);
+      temp_sol.copy_new_assignment(solution);
+      temp_sol.compute_feasibility();
+      const auto payload = context->solution_publication.build_callback_payload(
+        context->problem_ptr,
+        temp_sol,
+        (internals::mip_solution_origin_t)info.origin,
+        work_timestamp);
+      context->solution_publication.publish_new_best_feasible(payload, work_timestamp);
+    }
+    if (context->diversity_manager_ptr != nullptr &&  // NOTE(review): not `dm`; same object?
+        !(context->settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) {
+      context->diversity_manager_ptr->population.add_external_solution(
+        solution, objective, (internals::mip_solution_origin_t)info.origin);
+      context->diversity_manager_ptr->rins.new_best_incumbent_callback(solution);
+    }
   }
 
   void set_simplex_solution(std::vector<f_t>& solution,
@@ -76,8 +103,8 @@ struct branch_and_bound_solution_helper_t {
   }
 
   void preempt_heuristic_solver() { dm->population.preempt_heuristic_solver(); }
+  mip_solver_context_t<i_t, f_t>* context;
   diversity_manager_t<i_t, f_t>* dm;
-  dual_simplex::simplex_solver_settings_t<i_t, f_t>& settings_;
 };
 
 // Extract probing cache into CPU-only CSR struct for implied bounds cuts
@@ -183,6 +210,7 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 {
   //  we need to keep original problem const
   cuopt_assert(context.problem_ptr != nullptr, "invalid problem pointer");
+  cuopt_assert(context.termination != nullptr, "termination checker must be set before run_solver");
   context.problem_ptr->tolerances = context.settings.get_tolerances();
   cuopt_expects(context.problem_ptr->preprocess_called,
                 error_type_t::RuntimeError,
@@ -193,25 +221,28 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     CUOPT_LOG_INFO("Problem fully reduced in presolve");
     solution_t<i_t, f_t> sol(*context.problem_ptr);
     sol.set_problem_fully_reduced();
-    for (auto callback : context.settings.get_mip_callbacks()) {
-      if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) {
-        auto get_sol_callback = static_cast<internals::get_solution_callback_t*>(callback);
-        dm.population.invoke_get_solution_callback(sol, get_sol_callback);
-      }
-    }
+    const auto payload = context.solution_publication.build_callback_payload(
+      context.problem_ptr, sol, internals::mip_solution_origin_t::PRESOLVE, 0.0);
+    context.solution_publication.publish_new_best_feasible(payload);
     context.problem_ptr->post_process_solution(sol);
     return sol;
   }
-  dm.timer                   = timer_;
-  const bool run_presolve    = context.settings.presolver != presolver_t::None;
-  f_t time_limit             = context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC
-                                 ? std::numeric_limits<f_t>::infinity()
-                                 : timer_.remaining_time();
-  const auto& hp             = context.settings.heuristic_params;
-  double presolve_time_limit = std::min(hp.presolve_time_ratio * time_limit, hp.presolve_max_time);
-  presolve_time_limit        = context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC
-                                 ? std::numeric_limits<f_t>::infinity()
-                                 : presolve_time_limit;
+  const bool deterministic_run =
+    (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS);
+  const f_t gpu_heur_work_limit =
+    deterministic_run ? context.settings.work_limit : timer_.get_time_limit();
+  if (deterministic_run)
+    cuopt_assert(gpu_heur_work_limit >= 0.0,
+                 "Deterministic GPU heuristic work limit must be non-negative");
+  dm.timer = cuopt::termination_checker_t(context.gpu_heur_loop, gpu_heur_work_limit, timer_);
+  const bool run_presolve = context.settings.presolver != presolver_t::None;
+  f_t time_limit =
+    deterministic_run ? std::numeric_limits<f_t>::infinity() : timer_.remaining_time();
+  const auto& hp = context.settings.heuristic_params;
+  double presolve_time_limit =
+    deterministic_run ? timer_.remaining_time()
+                      : std::min(hp.presolve_time_ratio * time_limit, hp.presolve_max_time);
+
   if (std::isfinite(presolve_time_limit))
     CUOPT_LOG_DEBUG("Presolve time limit: %g", presolve_time_limit);
   bool presolve_success = run_presolve ? dm.run_presolve(presolve_time_limit, timer_) : true;
@@ -236,12 +267,9 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     CUOPT_LOG_INFO("Problem full reduced in presolve");
     solution_t<i_t, f_t> sol(*context.problem_ptr);
     sol.set_problem_fully_reduced();
-    for (auto callback : context.settings.get_mip_callbacks()) {
-      if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) {
-        auto get_sol_callback = static_cast<internals::get_solution_callback_t*>(callback);
-        dm.population.invoke_get_solution_callback(sol, get_sol_callback);
-      }
-    }
+    const auto payload = context.solution_publication.build_callback_payload(
+      context.problem_ptr, sol, internals::mip_solution_origin_t::PRESOLVE, 0.0);
+    context.solution_publication.publish_new_best_feasible(payload);
     context.problem_ptr->post_process_solution(sol);
     return sol;
   }
@@ -274,12 +302,9 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
       sol.set_problem_fully_reduced();
     }
     if (opt_sol.get_termination_status() == pdlp_termination_status_t::Optimal) {
-      for (auto callback : context.settings.get_mip_callbacks()) {
-        if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) {
-          auto get_sol_callback = static_cast<internals::get_solution_callback_t*>(callback);
-          dm.population.invoke_get_solution_callback(sol, get_sol_callback);
-        }
-      }
+      const auto payload = context.solution_publication.build_callback_payload(
+        context.problem_ptr, sol, internals::mip_solution_origin_t::PRESOLVE, 0.0);
+      context.solution_publication.publish_new_best_feasible(payload);
     }
     context.problem_ptr->post_process_solution(sol);
     return sol;
@@ -297,7 +322,7 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
   branch_and_bound_problem.objective_is_integral = context.problem_ptr->is_objective_integral();
   dual_simplex::simplex_solver_settings_t<i_t, f_t> branch_and_bound_settings;
   std::unique_ptr<dual_simplex::branch_and_bound_t<i_t, f_t>> branch_and_bound;
-  branch_and_bound_solution_helper_t solution_helper(&dm, branch_and_bound_settings);
+  bb_callback_adapter_t solution_helper(&context, &dm);
   dual_simplex::mip_solution_t<i_t, f_t> branch_and_bound_solution(1);
 
   dual_simplex::probing_implied_bound_t<i_t, f_t> probing_implied_bound;
@@ -325,9 +350,9 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     branch_and_bound_settings.max_cut_passes        = context.settings.max_cut_passes;
     branch_and_bound_settings.mir_cuts              = context.settings.mir_cuts;
     branch_and_bound_settings.deterministic =
-      context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC;
+      (context.settings.determinism_mode & CUOPT_DETERMINISM_BB);
 
-    if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) {
+    if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) {
       branch_and_bound_settings.work_limit = context.settings.work_limit;
     } else {
       branch_and_bound_settings.work_limit = std::numeric_limits<f_t>::infinity();
@@ -355,32 +380,36 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
       context.settings.reduced_cost_strengthening == -1
         ? 2
         : context.settings.reduced_cost_strengthening;
+    branch_and_bound_settings.bb_work_unit_scale = solver_settings_.bb_work_unit_scale;
+    branch_and_bound_settings.gpu_heur_wait_for_exploration =
+      solver_settings_.gpu_heur_wait_for_exploration;
 
     if (context.settings.num_cpu_threads < 0) {
       branch_and_bound_settings.num_threads = std::max(1, omp_get_max_threads() - 1);
     } else {
       branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads);
     }
+    CUOPT_LOG_INFO("Using %d CPU threads for B&B", branch_and_bound_settings.num_threads);
 
-    // Set the branch and bound -> primal heuristics callback
-    branch_and_bound_settings.solution_callback =
-      std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::solution_callback,
+    branch_and_bound_settings.new_incumbent_callback =
+      std::bind(&bb_callback_adapter_t<i_t, f_t>::new_incumbent_callback,
                 &solution_helper,
                 std::placeholders::_1,
-                std::placeholders::_2);
-    // heuristic_preemption_callback is needed in both modes to properly stop the heuristic thread
-    branch_and_bound_settings.heuristic_preemption_callback = std::bind(
-      &branch_and_bound_solution_helper_t<i_t, f_t>::preempt_heuristic_solver, &solution_helper);
-    if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) {
+                std::placeholders::_2,
+                std::placeholders::_3,
+                std::placeholders::_4);
+    branch_and_bound_settings.heuristic_preemption_callback =
+      std::bind(&bb_callback_adapter_t<i_t, f_t>::preempt_heuristic_solver, &solution_helper);
+    if (!(context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) {
       branch_and_bound_settings.set_simplex_solution_callback =
-        std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::set_simplex_solution,
+        std::bind(&bb_callback_adapter_t<i_t, f_t>::set_simplex_solution,
                   &solution_helper,
                   std::placeholders::_1,
                   std::placeholders::_2,
                   std::placeholders::_3);
 
       branch_and_bound_settings.node_processed_callback =
-        std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::node_processed_callback,
+        std::bind(&bb_callback_adapter_t<i_t, f_t>::node_processed_callback,
                   &solution_helper,
                   std::placeholders::_1,
                   std::placeholders::_2);
@@ -412,14 +441,15 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
       [stats_ptr](f_t user_bound) { stats_ptr->set_solution_bound(user_bound); });
 
     // Set the primal heuristics -> branch and bound callback
-    if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) {
+    if (!(context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) {
       branch_and_bound->set_concurrent_lp_root_solve(true);
 
       context.problem_ptr->branch_and_bound_callback =
         std::bind(&dual_simplex::branch_and_bound_t<i_t, f_t>::set_new_solution,
                   branch_and_bound.get(),
-                  std::placeholders::_1);
-    } else if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) {
+                  std::placeholders::_1,
+                  std::placeholders::_2);
+    } else if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) {
       branch_and_bound->set_concurrent_lp_root_solve(false);
       // TODO once deterministic GPU heuristics are integrated
       // context.problem_ptr->branch_and_bound_callback =
@@ -429,18 +459,21 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     }
 
     context.work_unit_scheduler_.register_context(branch_and_bound->get_work_unit_context());
-    // context.work_unit_scheduler_.verbose = true;
 
-    context.problem_ptr->set_root_relaxation_solution_callback =
-      std::bind(&dual_simplex::branch_and_bound_t<i_t, f_t>::set_root_relaxation_solution,
-                branch_and_bound.get(),
-                std::placeholders::_1,
-                std::placeholders::_2,
-                std::placeholders::_3,
-                std::placeholders::_4,
-                std::placeholders::_5,
-                std::placeholders::_6,
-                std::placeholders::_7);
+    if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) {
+      context.problem_ptr->set_root_relaxation_solution_callback = nullptr;
+    } else {
+      context.problem_ptr->set_root_relaxation_solution_callback =
+        std::bind(&dual_simplex::branch_and_bound_t<i_t, f_t>::set_root_relaxation_solution,
+                  branch_and_bound.get(),
+                  std::placeholders::_1,
+                  std::placeholders::_2,
+                  std::placeholders::_3,
+                  std::placeholders::_4,
+                  std::placeholders::_5,
+                  std::placeholders::_6,
+                  std::placeholders::_7);
+    }
 
     if (timer_.check_time_limit()) {
       CUOPT_LOG_INFO("Time limit reached during B&B setup");
@@ -454,10 +487,12 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     // std::async and std::future allow us to get the return value of bb::solve()
     // without having to manually manage the thread
     // std::future.get() performs a join() operation to wait until the return status is available
-    branch_and_bound_status_future = std::async(std::launch::async,
-                                                &dual_simplex::branch_and_bound_t<i_t, f_t>::solve,
-                                                branch_and_bound.get(),
-                                                std::ref(branch_and_bound_solution));
+    int bb_device_id = context.handle_ptr->get_device();
+    branch_and_bound_status_future =
+      std::async(std::launch::async, [&branch_and_bound, &branch_and_bound_solution, bb_device_id] {
+        RAFT_CUDA_TRY(cudaSetDevice(bb_device_id));
+        return branch_and_bound->solve(branch_and_bound_solution);
+      });
   }
 
   // Start the primal heuristics
@@ -470,9 +505,46 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
       context.stats.set_solution_bound(
         context.problem_ptr->get_user_obj_from_solver_obj(branch_and_bound_solution.lower_bound));
     }
+    CUOPT_LOG_DEBUG(
+      "B&B solution reconstruction: det_bb=%d obj_finite=%d obj=%.16e bb_status=%d "
+      "has_incumbent=%d sol_size=%zu",
+      (int)(context.settings.determinism_mode & CUOPT_DETERMINISM_BB),
+      (int)std::isfinite(branch_and_bound_solution.objective),
+      branch_and_bound_solution.objective,
+      (int)bb_status,
+      (int)branch_and_bound_solution.has_incumbent,
+      branch_and_bound_solution.x.size());
+    if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB) &&
+        std::isfinite(branch_and_bound_solution.objective)) {
+      solution_t<i_t, f_t> bb_sol(*context.problem_ptr);
+      bb_sol.copy_new_assignment(branch_and_bound_solution.x);
+      bb_sol.compute_feasibility();
+      sol = std::move(bb_sol);
+    } else if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) {
+      // In deterministic mode, only solutions formally retired by B&B are valid output.
+      // Discard the GPU heuristic incumbent that B&B never processed.
+      sol = solution_t<i_t, f_t>(*context.problem_ptr);
+    }
     if (bb_status == dual_simplex::mip_status_t::INFEASIBLE) { sol.set_problem_fully_reduced(); }
     context.stats.num_nodes              = branch_and_bound_solution.nodes_explored;
     context.stats.num_simplex_iterations = branch_and_bound_solution.simplex_iterations;
+
+    if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) {
+      double bnb_work  = branch_and_bound->get_work_unit_context().current_work();
+      double gpu_work  = context.gpu_heur_loop.current_work();
+      double bnb_scale = BB_BASE_WORK_SCALE * solver_settings_.bb_work_unit_scale;
+      double gpu_scale = GPU_HEUR_BASE_WORK_SCALE * solver_settings_.gpu_heur_work_unit_scale;
+      CUOPT_LOG_INFO(
+        "Work unit summary: B&B=%.2f (scale=%.3f, raw=%.2f) GPU_heur=%.2f (scale=%.3f, raw=%.2f) "
+        "ratio=%.2fx",
+        bnb_work,
+        bnb_scale,
+        bnb_scale > 0 ? bnb_work / bnb_scale : 0.0,
+        gpu_work,
+        gpu_scale,
+        gpu_scale > 0 ? gpu_work / gpu_scale : 0.0,
+        gpu_work > 0 ? bnb_work / gpu_work : 0.0);
+    }
   }
   sol.compute_feasibility();
 
diff --git a/cpp/src/mip_heuristics/solver.cuh b/cpp/src/mip_heuristics/solver.cuh
index 9b9024a1dc..1c18a62c08 100644
--- a/cpp/src/mip_heuristics/solver.cuh
+++ b/cpp/src/mip_heuristics/solver.cuh
@@ -10,7 +10,7 @@
 #include <cuopt/linear_programming/pdlp/solver_solution.hpp>
 #include <mip_heuristics/problem/problem.cuh>
 #include <mip_heuristics/solver_context.cuh>
-#include <utilities/timer.hpp>
+#include <utilities/termination_checker.hpp>
 #pragma once
 
 namespace cuopt::linear_programming::detail {
@@ -20,7 +20,7 @@ class mip_solver_t {
  public:
   explicit mip_solver_t(const problem_t<i_t, f_t>& op_problem,
                         const mip_solver_settings_t<i_t, f_t>& solver_settings,
-                        timer_t timer);
+                        cuopt::termination_checker_t& timer);
 
   solution_t<i_t, f_t> run_solver();
   solver_stats_t<i_t, f_t>& get_solver_stats() { return context.stats; }
@@ -29,7 +29,7 @@ class mip_solver_t {
   // reference to the original problem
   const problem_t<i_t, f_t>& op_problem_;
   const mip_solver_settings_t<i_t, f_t>& solver_settings_;
-  timer_t timer_;
+  cuopt::termination_checker_t& timer_;
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/solver_context.cuh b/cpp/src/mip_heuristics/solver_context.cuh
index b1bf3fbd70..2f36f52db3 100644
--- a/cpp/src/mip_heuristics/solver_context.cuh
+++ b/cpp/src/mip_heuristics/solver_context.cuh
@@ -5,16 +5,16 @@
  */
 /* clang-format on */
 
+#pragma once
+
 #include <cuopt/linear_programming/mip/solver_stats.hpp>
 
-#include <mip_heuristics/problem/problem.cuh>
-#include <mip_heuristics/relaxed_lp/lp_state.cuh>
+#include <mip_heuristics/mip_constants.hpp>
+#include <mip_heuristics/solution_callbacks.cuh>
 #include <utilities/work_limit_context.hpp>
 #include <utilities/work_unit_scheduler.hpp>
 
-#include <limits>
-
-#pragma once
+#include <utilities/work_limit_timer.hpp>
 
 // Forward declare
 namespace cuopt::linear_programming::dual_simplex {
@@ -37,12 +37,20 @@ struct mip_solver_context_t {
   explicit mip_solver_context_t(raft::handle_t const* handle_ptr_,
                                 problem_t<i_t, f_t>* problem_ptr_,
                                 mip_solver_settings_t<i_t, f_t> settings_)
-    : handle_ptr(handle_ptr_), problem_ptr(problem_ptr_), settings(settings_)
+    : handle_ptr(handle_ptr_),
+      problem_ptr(problem_ptr_),
+      settings(settings_),
+      solution_publication(settings, stats),
+      solution_injection(settings, stats)
   {
     cuopt_assert(problem_ptr != nullptr, "problem_ptr is nullptr");
     stats.set_solution_bound(problem_ptr->maximize ? std::numeric_limits<f_t>::infinity()
                                                    : -std::numeric_limits<f_t>::infinity());
-    gpu_heur_loop.deterministic = settings.determinism_mode == CUOPT_MODE_DETERMINISTIC;
+    gpu_heur_loop.deterministic = (settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS);
+    cuopt_assert(settings.cpufj_work_unit_scale > 0.0, "CPUFJ work-unit scale must be positive");
+    cuopt_assert(settings.gpu_heur_work_unit_scale > 0.0,
+                 "GPU heuristic work-unit scale must be positive");
+    gpu_heur_loop.work_unit_scale = GPU_HEUR_BASE_WORK_SCALE * settings.gpu_heur_work_unit_scale;
   }
 
   mip_solver_context_t(const mip_solver_context_t&)            = delete;
@@ -58,8 +66,13 @@ struct mip_solver_context_t {
   // Work limit context for tracking work units in deterministic mode (shared across all timers in
   // GPU heuristic loop)
   work_limit_context_t gpu_heur_loop{"GPUHeur"};
+  solution_publication_t<i_t, f_t> solution_publication;
+  solution_injection_t<i_t, f_t> solution_injection;
+
+  // Root termination checker — set by mip_solver_t after construction.
+  // All sub-timers should use this as parent for wall-clock safety.
+  cuopt::termination_checker_t* termination{nullptr};
 
-  // synchronization every 5 seconds for deterministic mode
   work_unit_scheduler_t work_unit_scheduler_{5.0};
 
   early_cpufj_t<i_t, f_t>* early_cpufj_ptr{nullptr};
diff --git a/cpp/src/mip_heuristics/solver_solution.cu b/cpp/src/mip_heuristics/solver_solution.cu
index 8f6f8de05f..8d179eafe6 100644
--- a/cpp/src/mip_heuristics/solver_solution.cu
+++ b/cpp/src/mip_heuristics/solver_solution.cu
@@ -7,6 +7,8 @@
 
 #include <cuopt/linear_programming/mip/solver_solution.hpp>
 #include <mip_heuristics/mip_constants.hpp>
+#include <utilities/copy_helpers.hpp>
+#include <utilities/hashing.hpp>
 #include <utilities/logger.hpp>
 
 #include <limits>
@@ -238,11 +240,25 @@ void mip_solution_t<i_t, f_t>::log_summary() const
 template <typename i_t, typename f_t>
 void mip_solution_t<i_t, f_t>::log_detailed_summary() const
 {
+  uint32_t sol_hash = 0;
+  if (solution_.size() > 0) {
+    auto host_sol = cuopt::host_copy(solution_, rmm::cuda_stream_default);
+    sol_hash      = detail::compute_hash(host_sol);
+  }
+
+  uint32_t pool_hash = 0;
+  for (const auto& pool_sol : solution_pool_) {
+    if (pool_sol.size() > 0) {
+      auto host_pool_sol = cuopt::host_copy(pool_sol, rmm::cuda_stream_default);
+      pool_hash ^= detail::compute_hash(host_pool_sol);
+    }
+  }
+
   CUOPT_LOG_INFO(
     "Solution objective: %f , relative_mip_gap %f solution_bound %f presolve_time %f "
     "total_solve_time %f "
     "max constraint violation %f max int violation %f max var bounds violation %f "
-    "nodes %d simplex_iterations %d",
+    "nodes %d simplex_iterations %d solution_hash %08x pool_hash %08x pool_size %d",
     objective_,
     mip_gap_,
     stats_.get_solution_bound(),
@@ -252,7 +268,10 @@ void mip_solution_t<i_t, f_t>::log_detailed_summary() const
     max_int_violation_,
     max_variable_bound_violation_,
     stats_.num_nodes,
-    stats_.num_simplex_iterations);
+    stats_.num_simplex_iterations,
+    sol_hash,
+    pool_hash,
+    (int)solution_pool_.size());
 }
 
 #if MIP_INSTANTIATE_FLOAT || PDLP_INSTANTIATE_FLOAT
diff --git a/cpp/src/pdlp/cuopt_c.cpp b/cpp/src/pdlp/cuopt_c.cpp
index ed2eab02f2..0af7f183d2 100644
--- a/cpp/src/pdlp/cuopt_c.cpp
+++ b/cpp/src/pdlp/cuopt_c.cpp
@@ -49,6 +49,39 @@ class c_get_solution_callback_t : public cuopt::internals::get_solution_callback
   cuOptMIPGetSolutionCallback callback_;
 };
 
+// Adapter exposing a C cuOptMIPGetSolutionCallbackExt pointer as a C++ extended callback.
+class c_get_solution_callback_ext_t : public cuopt::internals::get_solution_callback_ext_t {
+ public:
+  explicit c_get_solution_callback_ext_t(cuOptMIPGetSolutionCallbackExt callback)
+    : callback_(callback)
+  {
+  }
+
+  // Forwards to the C callback; a null callback_info becomes UNKNOWN origin, timestamp -1.
+  void get_solution(void* data,
+                    void* objective_value,
+                    void* solution_bound,
+                    const cuopt::internals::mip_solution_callback_info_t* callback_info,
+                    void* user_data) override
+  {
+    if (!callback_) { return; }
+    cuOptMIPSolutionCallbackInfo info_for_c{};
+    info_for_c.origin         = CUOPT_MIP_SOLUTION_ORIGIN_UNKNOWN;
+    info_for_c.work_timestamp = -1.0;
+    if (callback_info) {
+      info_for_c.origin         = static_cast<uint32_t>(callback_info->origin);
+      info_for_c.work_timestamp = callback_info->work_timestamp;
+    }
+    const auto* values    = static_cast<const cuopt_float_t*>(data);
+    const auto* objective = static_cast<const cuopt_float_t*>(objective_value);
+    const auto* bound     = static_cast<const cuopt_float_t*>(solution_bound);
+    callback_(values, objective, bound, &info_for_c, user_data);
+  }
+
+ private:
+  cuOptMIPGetSolutionCallbackExt callback_;
+};
+
 class c_set_solution_callback_t : public cuopt::internals::set_solution_callback_t {
  public:
   explicit c_set_solution_callback_t(cuOptMIPSetSolutionCallback callback) : callback_(callback) {}
@@ -69,6 +102,11 @@ class c_set_solution_callback_t : public cuopt::internals::set_solution_callback
   cuOptMIPSetSolutionCallback callback_;
 };
 
+// ABI guards: compilation fails if the offset of an existing cuOptMIPSolutionCallbackInfo
+// field changes.
+static_assert(offsetof(cuOptMIPSolutionCallbackInfo, origin) == 0, "ABI break");
+static_assert(offsetof(cuOptMIPSolutionCallbackInfo, work_timestamp) == 8, "ABI break");
+
 // Owns solver settings and C callback wrappers for C API lifetime.
 struct solver_settings_handle_t {
   solver_settings_handle_t() : settings(new solver_settings_t<cuopt_int_t, cuopt_float_t>()) {}
@@ -767,6 +805,19 @@ cuopt_int_t cuOptSetMIPGetSolutionCallback(cuOptSolverSettings settings,
   return CUOPT_SUCCESS;
 }
 
+cuopt_int_t cuOptSetMIPGetSolutionCallbackExt(cuOptSolverSettings settings,
+                                              cuOptMIPGetSolutionCallbackExt callback,
+                                              void* user_data)
+{
+  // Wrap the C callback, register it, then move the wrapper into the settings handle.
+  if (settings == nullptr || callback == nullptr) { return CUOPT_INVALID_ARGUMENT; }
+  solver_settings_handle_t* handle = get_settings_handle(settings);
+  auto wrapper = std::make_unique<c_get_solution_callback_ext_t>(callback);
+  handle->settings->set_mip_callback(wrapper.get(), user_data);
+  handle->callbacks.push_back(std::move(wrapper));
+  return CUOPT_SUCCESS;
+}
+
 cuopt_int_t cuOptSetMIPSetSolutionCallback(cuOptSolverSettings settings,
                                            cuOptMIPSetSolutionCallback callback,
                                            void* user_data)
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
index b618550f6e..bdc7aff1a0 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
@@ -22,6 +22,8 @@
 #include <raft/util/cudart_utils.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/scatter.h>
 
 namespace cuopt::linear_programming::detail {
@@ -684,9 +686,12 @@ template <typename i_t, typename f_t>
 void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
   rmm::device_uvector<f_t>& primal_solution,
   rmm::device_uvector<f_t>& dual_solution,
-  rmm::device_uvector<f_t>& dual_slack) const
+  rmm::device_uvector<f_t>& dual_slack,
+  cudaStream_t stream_override) const
 {
   raft::common::nvtx::range fun_scope("unscale_solutions");
+  const rmm::cuda_stream_view stream =
+    stream_override ? rmm::cuda_stream_view{stream_override} : stream_view_;
 
   if (primal_solution.size()) {
     cuopt_expects(primal_solution.size() % static_cast<size_t>(primal_size_h_) == 0,
@@ -703,7 +708,7 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
       primal_solution.data(),
       primal_solution.size(),
       cuda::std::multiplies<>{},
-      stream_view_);
+      stream);
 
     if (hyper_params_.bound_objective_rescaling && !running_mip_) {
       cuopt_assert(h_bound_rescaling != f_t(0),
@@ -712,7 +717,7 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
                                       primal_solution.data(),
                                       primal_solution.size(),
                                       a_times_scalar<f_t>(f_t(1.0) / h_bound_rescaling),
-                                      stream_view_);
+                                      stream);
     }
   }
 
@@ -731,7 +736,7 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
       dual_solution.data(),
       dual_solution.size(),
       cuda::std::multiplies<>{},
-      stream_view_);
+      stream);
     if (hyper_params_.bound_objective_rescaling && !running_mip_) {
       cuopt_assert(h_bound_rescaling != f_t(0),
                    "Numerical error: bound_rescaling_ should never equal 0");
@@ -739,7 +744,7 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
                                       dual_solution.data(),
                                       dual_solution.size(),
                                       a_times_scalar<f_t>(f_t(1.0) / h_objective_rescaling),
-                                      stream_view_);
+                                      stream);
     }
   }
 
@@ -756,7 +761,7 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
       dual_slack.data(),
       dual_slack.size(),
       batch_safe_div<f_t>(),
-      stream_view_);
+      stream);
     if (hyper_params_.bound_objective_rescaling && !running_mip_) {
       cuopt_assert(h_bound_rescaling != f_t(0),
                    "Numerical error: bound_rescaling_ should never equal 0");
@@ -764,7 +769,7 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
                                       dual_slack.data(),
                                       dual_slack.size(),
                                       a_times_scalar<f_t>{f_t(1.0) / h_objective_rescaling},
-                                      stream_view_);
+                                      stream);
     }
   }
 }
@@ -781,10 +786,12 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
 
 template <typename i_t, typename f_t>
 void pdlp_initial_scaling_strategy_t<i_t, f_t>::unscale_solutions(
-  rmm::device_uvector<f_t>& solution, rmm::device_uvector<f_t>& s) const
+  rmm::device_uvector<f_t>& solution,
+  rmm::device_uvector<f_t>& s,
+  cudaStream_t stream_override) const
 {
   rmm::device_uvector<f_t> dummy(0, solution.stream());
-  unscale_solutions(solution, s, dummy);
+  unscale_solutions(solution, s, dummy, stream_override);
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
index 5a3dcfaca2..c537825724 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
@@ -64,10 +64,12 @@ class pdlp_initial_scaling_strategy_t {
   void scale_primal(rmm::device_uvector<f_t>& primal_solution) const;
   void scale_dual(rmm::device_uvector<f_t>& dual_solution) const;
   void unscale_solutions(rmm::device_uvector<f_t>& primal_solution,
-                         rmm::device_uvector<f_t>& dual_solution) const;
+                         rmm::device_uvector<f_t>& dual_solution,
+                         cudaStream_t stream_override = nullptr) const;
   void unscale_solutions(rmm::device_uvector<f_t>& primal_solution,
                          rmm::device_uvector<f_t>& dual_solution,
-                         rmm::device_uvector<f_t>& dual_slack) const;
+                         rmm::device_uvector<f_t>& dual_slack,
+                         cudaStream_t stream_override = nullptr) const;
   void unscale_solutions(solution_t<i_t, f_t>& solution) const;
   const rmm::device_uvector<f_t>& get_constraint_matrix_scaling_vector() const;
   const rmm::device_uvector<f_t>& get_variable_scaling_vector() const;
diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu
index 74df7fee01..cb16c9d662 100644
--- a/cpp/src/pdlp/pdhg.cu
+++ b/cpp/src/pdlp/pdhg.cu
@@ -30,6 +30,8 @@
 
 #include <cub/cub.cuh>
 
+#include <thrust/iterator/zip_iterator.h>
+
 #include <cusparse_v2.h>
 
 namespace cuopt::linear_programming::detail {
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 8e6e80e322..a759887fc5 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -35,6 +35,7 @@
 
 #include <thrust/count.h>
 #include <thrust/extrema.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/logical.h>
 
 #include <cmath>
diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
index 2b10310260..821238fe84 100644
--- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
+++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
@@ -29,6 +29,7 @@
 #include <raft/linalg/unary_op.cuh>
 #include <raft/util/cuda_utils.cuh>
 
+#include <thrust/device_ptr.h>
 #include <thrust/device_vector.h>
 #include <thrust/extrema.h>
 #include <thrust/for_each.h>
@@ -39,6 +40,7 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/logical.h>
 #include <thrust/sort.h>
+#include <thrust/tuple.h>
 
 #include <cub/cub.cuh>
 
diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
index d17a88dd29..c95ed67ca6 100644
--- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
+++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
@@ -28,6 +28,9 @@
 
 #include <cub/cub.cuh>
 
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+
 #include <limits>
 
 namespace cuopt::linear_programming::detail {
diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu
index ab0c921cc7..b4da4ffbde 100644
--- a/cpp/src/pdlp/termination_strategy/convergence_information.cu
+++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu
@@ -25,6 +25,7 @@
 #include <raft/util/cuda_utils.cuh>
 
 #include <thrust/device_ptr.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 
 #include <cub/cub.cuh>
diff --git a/cpp/src/pdlp/termination_strategy/infeasibility_information.cu b/cpp/src/pdlp/termination_strategy/infeasibility_information.cu
index dbb35b732d..37972ba442 100644
--- a/cpp/src/pdlp/termination_strategy/infeasibility_information.cu
+++ b/cpp/src/pdlp/termination_strategy/infeasibility_information.cu
@@ -24,6 +24,14 @@
 #include <raft/linalg/unary_op.cuh>
 #include <raft/util/cuda_utils.cuh>
 
+#include <thrust/device_ptr.h>
+#include <thrust/extrema.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/tuple.h>
+
 namespace cuopt::linear_programming::detail {
 template <typename i_t, typename f_t>
 infeasibility_information_t<i_t, f_t>::infeasibility_information_t(
diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh
index 138c9c2ab9..77bc6b18ce 100644
--- a/cpp/src/pdlp/utils.cuh
+++ b/cpp/src/pdlp/utils.cuh
@@ -24,6 +24,8 @@
 
 #include <thrust/execution_policy.h>
 #include <thrust/functional.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
 
diff --git a/cpp/src/routing/local_search/compute_compatible.cu b/cpp/src/routing/local_search/compute_compatible.cu
index 8386cb087b..457e970632 100644
--- a/cpp/src/routing/local_search/compute_compatible.cu
+++ b/cpp/src/routing/local_search/compute_compatible.cu
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -9,6 +9,8 @@
 #include "compute_compatible.cuh"
 #include "local_search.cuh"
 
+#include <thrust/extrema.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/reduce.h>
 
 #include <raft/util/cudart_utils.hpp>
diff --git a/cpp/src/routing/route/break_route.cuh b/cpp/src/routing/route/break_route.cuh
index 68ab015646..1d5b3472f9 100644
--- a/cpp/src/routing/route/break_route.cuh
+++ b/cpp/src/routing/route/break_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -17,6 +17,8 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/capacity_route.cuh b/cpp/src/routing/route/capacity_route.cuh
index a39ef46a93..388e573c1c 100644
--- a/cpp/src/routing/route/capacity_route.cuh
+++ b/cpp/src/routing/route/capacity_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -17,6 +17,9 @@
 #include <raft/core/nvtx.hpp>
 
 #include <rmm/device_uvector.hpp>
+
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/dimensions_route.cuh b/cpp/src/routing/route/dimensions_route.cuh
index d1131ea550..bc08ba9819 100644
--- a/cpp/src/routing/route/dimensions_route.cuh
+++ b/cpp/src/routing/route/dimensions_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -30,6 +30,8 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/distance_route.cuh b/cpp/src/routing/route/distance_route.cuh
index e01c552080..a5f98c13ce 100644
--- a/cpp/src/routing/route/distance_route.cuh
+++ b/cpp/src/routing/route/distance_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -17,6 +17,8 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/mismatch_route.cuh b/cpp/src/routing/route/mismatch_route.cuh
index d72f01735a..78975750e0 100644
--- a/cpp/src/routing/route/mismatch_route.cuh
+++ b/cpp/src/routing/route/mismatch_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -15,6 +15,8 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/pdp_route.cuh b/cpp/src/routing/route/pdp_route.cuh
index dc9b8ad699..dd20e2fec3 100644
--- a/cpp/src/routing/route/pdp_route.cuh
+++ b/cpp/src/routing/route/pdp_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -17,6 +17,8 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/prize_route.cuh b/cpp/src/routing/route/prize_route.cuh
index 0330d14590..80b27061b5 100644
--- a/cpp/src/routing/route/prize_route.cuh
+++ b/cpp/src/routing/route/prize_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -17,6 +17,8 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/route.cuh b/cpp/src/routing/route/route.cuh
index e6367a4836..b624acb903 100644
--- a/cpp/src/routing/route/route.cuh
+++ b/cpp/src/routing/route/route.cuh
@@ -11,6 +11,8 @@
 
 #include <routing/fleet_info.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/service_time_route.cuh b/cpp/src/routing/route/service_time_route.cuh
index b35e53c2d8..03c48b2e42 100644
--- a/cpp/src/routing/route/service_time_route.cuh
+++ b/cpp/src/routing/route/service_time_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -15,6 +15,8 @@
 #include <raft/core/handle.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/tasks_route.cuh b/cpp/src/routing/route/tasks_route.cuh
index 6da9e4372a..3624d647e7 100644
--- a/cpp/src/routing/route/tasks_route.cuh
+++ b/cpp/src/routing/route/tasks_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -15,6 +15,8 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/time_route.cuh b/cpp/src/routing/route/time_route.cuh
index bb5ec653e1..21448c4273 100644
--- a/cpp/src/routing/route/time_route.cuh
+++ b/cpp/src/routing/route/time_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -17,6 +17,8 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/tsp_route.cuh b/cpp/src/routing/route/tsp_route.cuh
index ee1ba5370c..9b7eeeee56 100644
--- a/cpp/src/routing/route/tsp_route.cuh
+++ b/cpp/src/routing/route/tsp_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -16,6 +16,8 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/route/vehicle_fixed_cost_route.cuh b/cpp/src/routing/route/vehicle_fixed_cost_route.cuh
index 83ea5db481..1e246fbb6e 100644
--- a/cpp/src/routing/route/vehicle_fixed_cost_route.cuh
+++ b/cpp/src/routing/route/vehicle_fixed_cost_route.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -14,6 +14,8 @@
 
 #include <raft/core/handle.hpp>
 
+#include <thrust/tuple.h>
+
 namespace cuopt {
 namespace routing {
 namespace detail {
diff --git a/cpp/src/routing/solution/route_node_map.cuh b/cpp/src/routing/solution/route_node_map.cuh
index 25a6c4919b..a4a1b171aa 100644
--- a/cpp/src/routing/solution/route_node_map.cuh
+++ b/cpp/src/routing/solution/route_node_map.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -8,6 +8,7 @@
 #pragma once
 
 #include <thrust/pair.h>
+#include <thrust/tuple.h>
 #include <raft/core/device_span.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/src/routing/utilities/check_input.cu b/cpp/src/routing/utilities/check_input.cu
index e902f2d460..eccc3179bb 100644
--- a/cpp/src/routing/utilities/check_input.cu
+++ b/cpp/src/routing/utilities/check_input.cu
@@ -15,6 +15,7 @@
 #include <thrust/equal.h>
 #include <thrust/extrema.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/logical.h>
 #include <thrust/pair.h>
 #include <thrust/tuple.h>
diff --git a/cpp/src/utilities/copy_helpers.hpp b/cpp/src/utilities/copy_helpers.hpp
index 36a4659059..fc07e3b829 100644
--- a/cpp/src/utilities/copy_helpers.hpp
+++ b/cpp/src/utilities/copy_helpers.hpp
@@ -11,9 +11,11 @@
 #include <raft/core/handle.hpp>
 #include <raft/util/cudart_utils.hpp>
 
+#include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 #include <thrust/universal_vector.h>
@@ -335,6 +337,17 @@ raft::device_span<const T> make_span(rmm::device_uvector<T> const& container)
   return raft::device_span<const T>(container.data(), container.size());
 }
 
+template <typename T>  // mutable overload for rmm::device_scalar
+raft::device_span<T> make_span(rmm::device_scalar<T>& scalar)
+{
+  return raft::device_span<T>(scalar.data(), 1);  // span of length 1 at scalar.data()
+}
+
+template <typename T>  // read-only overload for rmm::device_scalar
+raft::device_span<const T> make_span(rmm::device_scalar<T> const& scalar)
+{
+  return raft::device_span<const T>(scalar.data(), 1);  // span of length 1 at scalar.data()
+}
 // resizes the device vector if it the std vector is larger
 template <typename T>
 inline void expand_device_copy(rmm::device_uvector<T>& device_vec,
diff --git a/cpp/src/utilities/cuda_helpers.cuh b/cpp/src/utilities/cuda_helpers.cuh
index 946099648d..7c591624d2 100644
--- a/cpp/src/utilities/cuda_helpers.cuh
+++ b/cpp/src/utilities/cuda_helpers.cuh
@@ -13,6 +13,7 @@
 #include <thrust/tuple.h>
 #include <mutex>
 #include <raft/core/device_span.hpp>
+#include <raft/core/nvtx.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
@@ -20,6 +21,17 @@
 #include <rmm/mr/limiting_resource_adaptor.hpp>
 #include <unordered_map>
 
+#if CUDART_VERSION >= 12080
+// TODO: investigate why this is necessary? dependency conflict? file NVBUG if necessary
+#include <nvtx3/nvtx3.hpp>
+#ifndef NVTX_NULLPTR
+#define NVTX_NULLPTR nullptr
+#endif
+#ifndef NVTX_REINTERPRET_CAST
+#define NVTX_REINTERPRET_CAST(type, value) (reinterpret_cast<type>(value))
+#endif
+#include <nvtx3/nvToolsExtMemCudaRt.h>
+#endif
 namespace cuopt {
 
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)
@@ -237,4 +249,48 @@ inline size_t get_device_memory_size()
   }
 }
 
+// NOTE: this marks a range of virtual memory as initialized. This is not tied to any object's
+// lifetime. As such, when using a pool for allocations, false negatives could occur, e.g. a range
+// previously marked as initialized is now occupied by a new uninitialized object. Unlikely
+// to cause issues in practice, but worth noting (RAII? It is unclear whether the API even allows
+// un-marking a range as initialized).
+static inline void mark_memory_as_initialized(const void* ptr, size_t size, cudaStream_t stream = 0)
+{
+#if CUDART_VERSION >= 12080
+
+  if (size == 0 || ptr == nullptr) return;
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+  constexpr auto PerThreadDefaultStream = true;
+#else
+  constexpr auto PerThreadDefaultStream = false;
+#endif
+
+  nvtxMemVirtualRangeDesc_t nvtxRangeDesc = {};
+  nvtxRangeDesc.size                      = size;
+  nvtxRangeDesc.ptr                       = ptr;
+
+  nvtxMemMarkInitializedBatch_t nvtxRegionsDesc = {};
+  nvtxRegionsDesc.extCompatID                   = NVTX_EXT_COMPATID_MEM;
+  nvtxRegionsDesc.structSize                    = sizeof(nvtxRegionsDesc);
+  nvtxRegionsDesc.regionType                    = NVTX_MEM_TYPE_VIRTUAL_ADDRESS;
+  nvtxRegionsDesc.regionDescCount               = 1;
+  nvtxRegionsDesc.regionDescElementSize         = sizeof(nvtxRangeDesc);
+  nvtxRegionsDesc.regionDescElements            = &nvtxRangeDesc;
+
+  nvtxMemCudaMarkInitialized(
+    raft::common::nvtx::detail::domain_store<raft::common::nvtx::domain::app>::value(),
+    stream,
+    PerThreadDefaultStream,
+    &nvtxRegionsDesc);
+#endif
+}
+
+template <typename T>
+static inline void mark_span_as_initialized(const raft::device_span<T> span,
+                                            rmm::cuda_stream_view stream)
+{
+  mark_memory_as_initialized(span.data(), span.size() * sizeof(T), stream.value());
+}
+
 }  // namespace cuopt
diff --git a/cpp/src/utilities/determinism_log.hpp b/cpp/src/utilities/determinism_log.hpp
new file mode 100644
index 0000000000..71517d7d27
--- /dev/null
+++ b/cpp/src/utilities/determinism_log.hpp
@@ -0,0 +1,23 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#ifndef CUOPT_DETERMINISM_LOG
+#define CUOPT_DETERMINISM_LOG(...) \
+  do {                             \
+  } while (0)
+#endif
diff --git a/cpp/src/utilities/seed_generator.cu b/cpp/src/utilities/seed_generator.cu
index 1da6662bc1..612093a7a8 100644
--- a/cpp/src/utilities/seed_generator.cu
+++ b/cpp/src/utilities/seed_generator.cu
@@ -1,10 +1,11 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
 
 #include <utilities/seed_generator.cuh>
 
-int64_t cuopt::seed_generator::seed_ = 0;
+int64_t cuopt::seed_generator::base_seed_ = 0;
+std::atomic<int64_t> cuopt::seed_generator::epoch_{0};
diff --git a/cpp/src/utilities/seed_generator.cuh b/cpp/src/utilities/seed_generator.cuh
index dd5e79d847..5415e9e80b 100644
--- a/cpp/src/utilities/seed_generator.cuh
+++ b/cpp/src/utilities/seed_generator.cuh
@@ -1,29 +1,50 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
 
 #pragma once
+#include <atomic>
 #include <raft/random/rng_device.cuh>
 #include <utilities/cuda_helpers.cuh>
 
 namespace cuopt {
 
-// TODO: should be thread local?
 class seed_generator {
-  static int64_t seed_;
+  static int64_t base_seed_;
+  // Monotonically increasing epoch; incremented on every set_seed() call.
+  // Thread-local state compares against this to detect resets, even when
+  // the same seed value is set again (e.g., repeated solve_mip() calls).
+  static std::atomic<int64_t> epoch_;
+
+  struct thread_state_t {
+    int64_t counter{0};
+    int64_t last_epoch{-1};
+  };
+
+  static thread_state_t& local_state()
+  {
+    thread_local thread_state_t state;
+    int64_t current_epoch = epoch_.load(std::memory_order_acquire);
+    if (state.last_epoch != current_epoch) {
+      state.counter    = base_seed_;
+      state.last_epoch = current_epoch;
+    }
+    return state;
+  }
 
  public:
   template <typename seed_t>
   static void set_seed(seed_t seed)
   {
 #ifdef BENCHMARK
-    seed_ = std::random_device{}();
+    base_seed_ = std::random_device{}();
 #else
-    seed_ = static_cast<int64_t>(seed);
+    base_seed_ = static_cast<int64_t>(seed);
 #endif
+    epoch_.fetch_add(1, std::memory_order_release);
   }
   template <typename arg0, typename arg1, typename... args>
   static void set_seed(arg0 seed0, arg1 seed1, args... seeds)
@@ -31,7 +52,19 @@ class seed_generator {
     set_seed(seed1 + ((seed0 + seed1) * (seed0 + seed1 + 1) / 2), seeds...);
   }
 
-  static int64_t get_seed() { return seed_++; }
+#if SEED_GENERATOR_DEBUG
+  static int64_t get_seed(const char* caller = __builtin_FUNCTION(),
+                          const char* file   = __builtin_FILE(),
+                          int line           = __builtin_LINE())
+  {
+    printf("SEED CALLED BY %s:%d: %s() ***\n", file, line, caller);
+    return local_state().counter++;
+  }
+#else
+  static int64_t get_seed() { return local_state().counter++; }
+#endif
+
+  static int64_t peek_seed() { return local_state().counter; }
 
  public:
   seed_generator(seed_generator const&) = delete;
diff --git a/cpp/src/utilities/termination_checker.hpp b/cpp/src/utilities/termination_checker.hpp
new file mode 100644
index 0000000000..d2ecd41141
--- /dev/null
+++ b/cpp/src/utilities/termination_checker.hpp
@@ -0,0 +1,239 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+
+#include <mip_heuristics/logger.hpp>
+
+#include "timer.hpp"
+#include "work_limit_context.hpp"
+
+namespace cuopt {
+
+/**
+ * Unified termination checker that subsumes timer_t and work_limit_timer_t.
+ *
+ * In non-deterministic mode: checks wall-clock time.
+ * In deterministic mode: checks work units via work_limit_context_t.
+ * In BOTH modes: checks parent chain (inheriting root wall-clock limit) and user callbacks.
+ *
+ * This is the single timer type used throughout the solver. It replaces work_limit_timer_t.
+ */
+class termination_checker_t {
+ public:
+  struct root_tag_t {};
+
+  // Root constructor (top-level solver, wall-clock only); initializers follow declaration order
+  explicit termination_checker_t(double time_limit, root_tag_t)
+    : timer(time_limit),
+      work_limit(time_limit),
+      deterministic(false),
+      work_context(nullptr),
+      work_units_at_start(0),
+      parent_(nullptr)
+  {
+  }
+
+  // Non-deterministic constructor with parent; keeps a non-owning pointer to `parent`
+  termination_checker_t(double time_limit_, const termination_checker_t& parent)
+    : timer(time_limit_),
+      work_limit(time_limit_),
+      deterministic(false),
+      work_context(nullptr),
+      work_units_at_start(0),
+      parent_(&parent)
+  {
+  }
+
+  // Work-context constructor with parent (inherits its termination); mode = context.deterministic
+  termination_checker_t(work_limit_context_t& context,
+                        double work_limit_,
+                        const termination_checker_t& parent)
+    : timer(work_limit_),
+      work_limit(work_limit_),
+      deterministic(context.deterministic),
+      work_context(&context),
+      work_units_at_start(context.deterministic ? context.current_work() : 0),
+      parent_(&parent)
+  {
+  }
+
+  void set_parent(const termination_checker_t* parent) { parent_ = parent; }
+  const termination_checker_t* get_parent() const { return parent_; }
+
+  void set_termination_callback(bool (*cb)(void*), void* data)
+  {
+    termination_callback_      = cb;
+    termination_callback_data_ = data;
+  }
+
+  bool check(const char* caller = __builtin_FUNCTION(),
+             const char* file   = __builtin_FILE(),
+             int line           = __builtin_LINE()) const noexcept
+  {
+    if (termination_callback_ != nullptr && termination_callback_(termination_callback_data_)) {
+      return true;
+    }
+    // Forward the call site so a parent's trace log names the real caller, not this function.
+    if (parent_ != nullptr && parent_->check(caller, file, line)) { return true; }
+
+    if (deterministic) {
+      if (!work_context) { return false; }
+      double elapsed_since_start = work_context->current_work() - work_units_at_start;
+      bool finished_now          = elapsed_since_start >= work_limit;
+      if (finished_now && !finished) {
+        finished                   = true;
+        double actual_elapsed_time = timer.elapsed_time();
+
+        if (work_limit > 0 && std::abs(actual_elapsed_time - work_limit) / work_limit > 0.10) {
+          CUOPT_LOG_TRACE(
+            "%s:%d: %s(): Work limit timer finished with a large discrepancy: %fs for %fwu "
+            "(global: %g, start: %g)",
+            file,
+            line,
+            caller,
+            actual_elapsed_time,
+            work_limit,
+            work_context->current_work(),
+            work_units_at_start);
+        }
+      }
+      return finished;
+    } else {
+      return timer.check_time_limit();
+    }
+  }
+
+  // Aliases for compatibility with work_limit_timer_t and timer_t interfaces
+  bool check_time_limit(const char* caller = __builtin_FUNCTION(),
+                        const char* file   = __builtin_FILE(),
+                        int line           = __builtin_LINE()) const noexcept
+  {
+    return check(caller, file, line);
+  }
+
+  bool check_limit(const char* caller = __builtin_FUNCTION(),
+                   const char* file   = __builtin_FILE(),
+                   int line           = __builtin_LINE()) const noexcept
+  {
+    return check(caller, file, line);
+  }
+
+  void record_work(double work_units,
+                   const char* caller = __builtin_FUNCTION(),
+                   const char* file   = __builtin_FILE(),
+                   int line           = __builtin_LINE())
+  {
+    if (deterministic && work_context) {
+      // debugging info
+      double parent_elapsed_time = parent_ != nullptr ? parent_->timer.elapsed_time() : 0.0;
+      double parent_time_limit   = parent_ != nullptr ? parent_->timer.get_time_limit() : 0.0;
+
+      CUOPT_LOG_TRACE("%s:%d: %s(): Recorded %f work units in %fs, total %f (parent time: %g/%g)",
+                      file,
+                      line,
+                      caller,
+                      work_units,
+                      timer.elapsed_time(),
+                      work_context->current_work(),
+                      parent_elapsed_time,
+                      parent_time_limit);
+      work_context->record_work_sync_on_horizon(work_units);
+    }
+  }
+
+  double remaining_units() const noexcept
+  {
+    double local_remaining;
+    if (deterministic) {
+      if (!work_context) {
+        local_remaining = work_limit;
+      } else {
+        double elapsed_since_start = work_context->current_work() - work_units_at_start;
+        local_remaining            = std::max(0.0, work_limit - elapsed_since_start);
+      }
+    } else {
+      local_remaining = timer.remaining_time();
+    }
+    // don't let the root's global time limit contaminate work limits further down
+    if (parent_ != nullptr && !(deterministic && !parent_->deterministic)) {
+      local_remaining = std::min(local_remaining, parent_->remaining_units());
+    }
+    if (!std::isfinite(local_remaining)) {
+      CUOPT_LOG_WARN(
+        "remaining_units non-finite: %g det=%d work_limit=%g start=%g "
+        "ctx_work=%g has_parent=%d",
+        local_remaining,
+        (int)deterministic,
+        work_limit,
+        work_units_at_start,
+        work_context ? work_context->current_work() : -1.0,
+        parent_ != nullptr);
+    }
+    return local_remaining;
+  }
+
+  double remaining_time() const noexcept { return remaining_units(); }
+
+  double elapsed_time() const noexcept
+  {
+    if (deterministic) {
+      if (!work_context) { return 0.0; }
+      return work_context->current_work() - work_units_at_start;
+    } else {
+      return timer.elapsed_time();
+    }
+  }
+
+  bool check_half_time() const noexcept
+  {
+    if (deterministic) {
+      if (!work_context) { return false; }
+      double elapsed_since_start = work_context->current_work() - work_units_at_start;
+      return elapsed_since_start >= work_limit / 2;
+    } else {
+      return timer.check_half_time();
+    }
+  }
+
+  double clamp_remaining_time(double desired_time) const noexcept
+  {
+    return std::min<double>(desired_time, remaining_time());
+  }
+
+  double get_time_limit() const noexcept
+  {
+    if (deterministic) {
+      return work_limit;
+    } else {
+      return timer.get_time_limit();
+    }
+  }
+
+  double get_tic_start() const noexcept { return timer.get_tic_start(); }
+
+  timer_t timer;
+  double work_limit{};
+  mutable bool finished{false};
+  bool deterministic{false};
+  work_limit_context_t* work_context{nullptr};
+  double work_units_at_start{0};
+
+ private:
+  const termination_checker_t* parent_{nullptr};
+  bool (*termination_callback_)(void*) = nullptr;
+  void* termination_callback_data_     = nullptr;
+};
+
+// Backward compatibility
+using work_limit_timer_t = termination_checker_t;
+
+}  // namespace cuopt
diff --git a/cpp/src/utilities/timer.hpp b/cpp/src/utilities/timer.hpp
index b7ab6a63bd..ccfab4c57f 100644
--- a/cpp/src/utilities/timer.hpp
+++ b/cpp/src/utilities/timer.hpp
@@ -34,7 +34,21 @@ class timer_t {
            elapsed_time());
   }
 
-  bool check_time_limit() const noexcept { return elapsed_time() >= time_limit; }
+  bool check_time_limit(const char* caller = __builtin_FUNCTION(),
+                        const char* file   = __builtin_FILE(),
+                        int line           = __builtin_LINE()) const noexcept
+  {
+    bool elapsed = elapsed_time() >= time_limit;
+    // if (elapsed) {
+    //   printf("************ TIME LIMIT (%.2gs) REACHED BY %s:%d: %s() ***\n",
+    //          time_limit,
+    //          file,
+    //          line,
+    //          caller);
+    //   //__builtin_trap();
+    // }
+    return elapsed;
+  }
 
   bool check_half_time() const noexcept { return elapsed_time() >= time_limit / 2; }
 
diff --git a/cpp/src/utilities/work_limit_context.hpp b/cpp/src/utilities/work_limit_context.hpp
index c75a37b818..55edee85b5 100644
--- a/cpp/src/utilities/work_limit_context.hpp
+++ b/cpp/src/utilities/work_limit_context.hpp
@@ -17,30 +17,117 @@
 #pragma once
 
 #include <algorithm>
+#include <atomic>
+#include <cerrno>
+#include <cmath>
+#include <cstdlib>
+#include <memory>
 #include <string>
 
 #include <mip_heuristics/logger.hpp>
+#include <utilities/determinism_log.hpp>
+#include <utilities/macros.cuh>
 
+#include "producer_sync.hpp"
 #include "timer.hpp"
 #include "work_unit_scheduler.hpp"
 
 namespace cuopt {
 
+// Returns the positive finite scale in `env_name`, or `default_value` if unset, empty or invalid.
+inline double read_work_unit_scale_env_or_default(const char* env_name, double default_value)
+{
+  const char* env_value = std::getenv(env_name);
+  if (env_value == nullptr || env_value[0] == '\0') { return default_value; }
+  errno                     = 0;
+  char* end_ptr             = nullptr;
+  const double parsed_value = std::strtod(env_value, &end_ptr);
+  const bool valid_value    = errno == 0 && end_ptr != env_value && *end_ptr == '\0' &&
+                           std::isfinite(parsed_value) && parsed_value > 0.0;
+  cuopt_assert(valid_value, "Invalid work-unit scale env var");
+  return valid_value ? parsed_value : default_value;  // never hand back a rejected value
+}
+
 struct work_limit_context_t {
   double global_work_units_elapsed{0.0};
   double total_sync_time{0.0};  // Total time spent waiting at sync barriers (seconds)
   bool deterministic{false};
   work_unit_scheduler_t* scheduler{nullptr};
+  producer_sync_t* producer_sync{nullptr};
   std::string name;
+  std::unique_ptr<std::atomic<double>> producer_work_units_elapsed{
+    std::make_unique<std::atomic<double>>(0.0)};
+  double producer_progress_scale{
+    read_work_unit_scale_env_or_default("CUOPT_GPU_HEUR_WORK_UNIT_SCALE", 1.0)};
+  double work_unit_scale{1.0};
 
   work_limit_context_t(const std::string& name) : name(name) {}
 
+  work_limit_context_t(const work_limit_context_t&)            = delete;
+  work_limit_context_t& operator=(const work_limit_context_t&) = delete;
+  work_limit_context_t(work_limit_context_t&&)                 = default;
+  work_limit_context_t& operator=(work_limit_context_t&&)      = default;
+
+  double current_work() const noexcept { return global_work_units_elapsed; }
+
+  double current_producer_work() const noexcept
+  {
+    double result = current_work() * producer_progress_scale;
+    if (!std::isfinite(result)) {
+      CUOPT_LOG_WARN("current_producer_work non-finite: %g (work=%g scale=%g) ctx=%s",
+                     result,
+                     current_work(),
+                     producer_progress_scale,
+                     name.c_str());
+    }
+    return result;
+  }
+
+  std::atomic<double>* producer_progress_ptr() noexcept
+  {
+    return producer_work_units_elapsed.get();
+  }
+
+  void attach_producer_sync(producer_sync_t* producer_sync_)
+  {
+    producer_sync = producer_sync_;
+    producer_work_units_elapsed->store(current_producer_work(), std::memory_order_release);
+    if (work_unit_scale != 1.0) {
+      CUOPT_DETERMINISM_LOG("[%s] Using work-unit scale %f", name.c_str(), work_unit_scale);
+    }
+  }
+
+  void detach_producer_sync() noexcept { producer_sync = nullptr; }
+
+  void set_current_work(double total_work, bool notify_producer = true)
+  {
+    if (!deterministic) return;
+    if (!std::isfinite(total_work)) {
+      CUOPT_LOG_WARN("set_current_work non-finite: %g (prev=%g) ctx=%s",
+                     total_work,
+                     global_work_units_elapsed,
+                     name.c_str());
+    }
+    cuopt_assert(total_work + 1e-12 >= global_work_units_elapsed,
+                 "Deterministic work progress must be monotonic");
+    global_work_units_elapsed = total_work;
+    producer_work_units_elapsed->store(current_producer_work(), std::memory_order_release);
+    if (notify_producer && producer_sync != nullptr) { producer_sync->notify_progress(); }
+  }
+
   void record_work_sync_on_horizon(double work)
   {
     if (!deterministic) return;
-    global_work_units_elapsed += work;
-    if (scheduler) { scheduler->on_work_recorded(*this, global_work_units_elapsed); }
+    cuopt_assert(std::isfinite(work), "Recorded work must be finite");
+    cuopt_assert(work >= 0.0, "Recorded work must be non-negative");
+    const double scaled_work = work * work_unit_scale;
+    const double total_work  = global_work_units_elapsed + scaled_work;
+    set_current_work(total_work, false);
+    if (scheduler) { scheduler->on_work_recorded(*this, total_work); }
+    if (producer_sync != nullptr) { producer_sync->notify_progress(); }
   }
+
+  void record_work(double work) { record_work_sync_on_horizon(work); }
 };
 
 }  // namespace cuopt
diff --git a/cpp/src/utilities/work_limit_timer.hpp b/cpp/src/utilities/work_limit_timer.hpp
new file mode 100644
index 0000000000..801a3e5ee9
--- /dev/null
+++ b/cpp/src/utilities/work_limit_timer.hpp
@@ -0,0 +1,11 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+#pragma once
+
+// work_limit_timer_t is now an alias for termination_checker_t.
+// This header exists for backward compatibility.
+#include "termination_checker.hpp"
diff --git a/cpp/src/utilities/work_unit_scheduler.cpp b/cpp/src/utilities/work_unit_scheduler.cpp
index b0e5c5f12f..5dc798ddb3 100644
--- a/cpp/src/utilities/work_unit_scheduler.cpp
+++ b/cpp/src/utilities/work_unit_scheduler.cpp
@@ -29,7 +29,8 @@
 
 namespace cuopt {
 
-work_unit_scheduler_t::work_unit_scheduler_t(double sync_interval) : sync_interval_(sync_interval)
+work_unit_scheduler_t::work_unit_scheduler_t(double sync_interval, double base)
+  : sync_interval_(sync_interval), base_(base)
 {
 }
 
@@ -79,15 +80,15 @@ void work_unit_scheduler_t::wait_for_next_sync(work_limit_context_t& ctx)
 {
   if (is_shutdown()) return;
 
-  double next_sync              = current_sync_target();
-  ctx.global_work_units_elapsed = next_sync;
+  double next_sync = current_sync_target();
+  ctx.set_current_work(next_sync, false);
   wait_at_sync_point(ctx, next_sync);
 }
 
 double work_unit_scheduler_t::current_sync_target() const
 {
   if (sync_interval_ <= 0) return std::numeric_limits<double>::infinity();
-  return (barrier_generation_ + 1) * sync_interval_;
+  return base_ + (barrier_generation_ + 1) * sync_interval_;
 }
 
 void work_unit_scheduler_t::wait_at_sync_point(work_limit_context_t& ctx, double sync_target)
diff --git a/cpp/src/utilities/work_unit_scheduler.hpp b/cpp/src/utilities/work_unit_scheduler.hpp
index 84e7b95fab..286fe74686 100644
--- a/cpp/src/utilities/work_unit_scheduler.hpp
+++ b/cpp/src/utilities/work_unit_scheduler.hpp
@@ -26,7 +26,7 @@ struct work_limit_context_t;
 
 class work_unit_scheduler_t {
  public:
-  explicit work_unit_scheduler_t(double sync_interval = 5.0);
+  explicit work_unit_scheduler_t(double sync_interval = 5.0, double base = 0.0);
 
   void set_sync_interval(double interval);
   double get_sync_interval() const { return sync_interval_; }
@@ -54,6 +54,7 @@ class work_unit_scheduler_t {
   void wait_at_sync_point(work_limit_context_t& ctx, double sync_target);
 
   double sync_interval_;
+  double base_;
   std::vector<std::reference_wrapper<work_limit_context_t>> contexts_;
 
   size_t barrier_generation_{0};
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index a73a3361ce..fe9dd4fde9 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -1,4 +1,4 @@
-﻿# cmake-format: off
+# cmake-format: off
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 # cmake-format: on
@@ -33,6 +33,40 @@ endif()
 
 set(CUOPT_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
+if (EXISTS "${CUDAToolkit_LIBRARY_ROOT}/extras/CUPTI/lib64")
+  # NVIDIA installer layout:
+  set(cuopt_cupti_root "${CUDAToolkit_LIBRARY_ROOT}/extras/CUPTI")
+else()
+  # Ubuntu package layout:
+  set(cuopt_cupti_root "${CUDAToolkit_LIBRARY_ROOT}")
+endif()
+message(STATUS "cuopt_cupti_root = ${cuopt_cupti_root}")
+
+# The CUPTI targets in FindCUDAToolkit are broken:
+# - The dll locations are not specified
+# - Dependent libraries nvperf_* are not linked.
+# So we create our own targets:
+function(cuopt_add_cupti_dep dep_name)
+  string(TOLOWER ${dep_name} dep_name_lower)
+  string(TOUPPER ${dep_name} dep_name_upper)
+
+  add_library(cuopt::${dep_name_lower} SHARED IMPORTED)
+
+  find_library(CUOPT_${dep_name_upper}_LIBRARY ${dep_name_lower} REQUIRED
+    DOC "The full path to lib${dep_name_lower}.so from the CUDA Toolkit."
+    HINTS "${cuopt_cupti_root}/lib64" "${cuopt_cupti_root}/lib"
+  )
+  mark_as_advanced(CUOPT_${dep_name_upper}_LIBRARY)
+
+  set_target_properties(cuopt::${dep_name_lower} PROPERTIES
+    IMPORTED_LOCATION "${CUOPT_${dep_name_upper}_LIBRARY}"
+  )
+endfunction()
+
+#cuopt_add_cupti_dep(nvperf_target)
+#cuopt_add_cupti_dep(nvperf_host)
+#cuopt_add_cupti_dep(cupti)
+
 # ################################################################ ------------------------------------------------------------------
 function(ConfigureTest CMAKE_TEST_NAME)
     add_executable(${CMAKE_TEST_NAME} ${ARGN})
diff --git a/cpp/tests/mip/CMakeLists.txt b/cpp/tests/mip/CMakeLists.txt
index f2cf53ff6c..584bbc243b 100644
--- a/cpp/tests/mip/CMakeLists.txt
+++ b/cpp/tests/mip/CMakeLists.txt
@@ -40,15 +40,21 @@ ConfigureTest(PRESOLVE_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/presolve_test.cu
 )
 # Disable for now
-# ConfigureTest(FEASIBILITY_JUMP_TEST
-#    ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump_tests.cu
-# )
+ConfigureTest(FEASIBILITY_JUMP_TEST
+   ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump_tests.cu
+)
 ConfigureTest(MIP_TERMINATION_STATUS_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/termination_test.cu
 )
 ConfigureTest(DETERMINISM_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/determinism_test.cu
 )
+ConfigureTest(LOCAL_SEARCH_TEST
+    ${CMAKE_CURRENT_SOURCE_DIR}/local_search_test.cu
+)
+ConfigureTest(DIVERSITY_TEST
+    ${CMAKE_CURRENT_SOURCE_DIR}/diversity_test.cu
+)
 ConfigureTest(HEURISTICS_HYPER_PARAMS_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/heuristics_hyper_params_test.cu
 )
diff --git a/cpp/tests/mip/determinism_test.cu b/cpp/tests/mip/determinism_test.cu
index dcd6f7749d..53b1066fa8 100644
--- a/cpp/tests/mip/determinism_test.cu
+++ b/cpp/tests/mip/determinism_test.cu
@@ -11,6 +11,7 @@
 #include <cuopt/linear_programming/constants.h>
 #include <cuopt/linear_programming/mip/solver_settings.hpp>
 #include <cuopt/linear_programming/solve.hpp>
+#include <cuopt/linear_programming/utilities/internals.hpp>
 #include <mps_parser/parser.hpp>
 #include <utilities/common_utils.hpp>
 #include <utilities/copy_helpers.hpp>
@@ -24,6 +25,7 @@
 
 #include <algorithm>
 #include <cmath>
+#include <cstdlib>
 #include <string>
 #include <vector>
 
@@ -45,6 +47,117 @@ void expect_solutions_bitwise_equal(const mip_solution_t<int, double>& sol1,
   }
 }
 
+struct callback_solution_t {
+  std::vector<double> assignment;
+  double objective{};
+  double solution_bound{};
+  internals::mip_solution_origin_t origin{internals::mip_solution_origin_t::UNKNOWN};
+};
+
+class first_n_get_solution_callback_t : public cuopt::internals::get_solution_callback_ext_t {
+ public:
+  first_n_get_solution_callback_t(std::vector<callback_solution_t>& solutions_in,
+                                  int n_variables_,
+                                  size_t max_solutions_,
+                                  void* expected_user_data_)
+    : solutions(solutions_in),
+      expected_user_data(expected_user_data_),
+      n_variables(n_variables_),
+      max_solutions(max_solutions_)
+  {
+  }
+
+  void get_solution(void* data,
+                    void* cost,
+                    void* solution_bound,
+                    const internals::mip_solution_callback_info_t* callback_info,
+                    void* user_data) override
+  {
+    EXPECT_EQ(user_data, expected_user_data);
+    ASSERT_NE(callback_info, nullptr);
+    n_calls++;
+
+    auto assignment_ptr     = static_cast<double*>(data);
+    auto objective_ptr      = static_cast<double*>(cost);
+    auto solution_bound_ptr = static_cast<double*>(solution_bound);
+    EXPECT_FALSE(std::isnan(objective_ptr[0]));
+    EXPECT_FALSE(std::isnan(solution_bound_ptr[0]));
+
+    if (solutions.size() >= max_solutions) { return; }
+
+    callback_solution_t callback_solution;
+    callback_solution.assignment.assign(assignment_ptr, assignment_ptr + n_variables);
+    callback_solution.objective      = objective_ptr[0];
+    callback_solution.solution_bound = solution_bound_ptr[0];
+    callback_solution.origin         = (internals::mip_solution_origin_t)callback_info->origin;
+    solutions.push_back(std::move(callback_solution));
+  }
+
+  std::vector<callback_solution_t>& solutions;
+  void* expected_user_data;
+  int n_calls{0};
+  int n_variables;
+  size_t max_solutions;
+};
+
+bool is_gpu_callback_origin(internals::mip_solution_origin_t origin)
+{
+  switch (origin) {
+    case internals::mip_solution_origin_t::FEASIBILITY_JUMP:
+    case internals::mip_solution_origin_t::LOCAL_SEARCH:
+    case internals::mip_solution_origin_t::QUICK_FEASIBLE:
+    case internals::mip_solution_origin_t::LP_ROUNDING:
+    case internals::mip_solution_origin_t::RECOMBINATION:
+    case internals::mip_solution_origin_t::SUB_MIP: return true;
+    default: return false;
+  }
+}
+
+size_t count_callbacks_with_origin(const std::vector<callback_solution_t>& callbacks,
+                                   internals::mip_solution_origin_t origin)
+{
+  return std::count_if(callbacks.begin(),
+                       callbacks.end(),
+                       [origin](const callback_solution_t& sol) { return sol.origin == origin; });
+}
+
+size_t count_gpu_callbacks(const std::vector<callback_solution_t>& callbacks)
+{
+  return std::count_if(callbacks.begin(), callbacks.end(), [](const callback_solution_t& sol) {
+    return is_gpu_callback_origin(sol.origin);
+  });
+}
+
+size_t count_branch_and_bound_callbacks(const std::vector<callback_solution_t>& callbacks)
+{
+  return std::count_if(callbacks.begin(), callbacks.end(), [](const callback_solution_t& sol) {
+    return sol.origin == internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE ||
+           sol.origin == internals::mip_solution_origin_t::BRANCH_AND_BOUND_DIVING;
+  });
+}
+
+void expect_callback_prefixes_bitwise_equal(const std::vector<callback_solution_t>& lhs,
+                                            const std::vector<callback_solution_t>& rhs,
+                                            size_t prefix_size,
+                                            const std::string& label)
+{
+  ASSERT_GE(lhs.size(), prefix_size) << label << "Left callback prefix missing entries";
+  ASSERT_GE(rhs.size(), prefix_size) << label << "Right callback prefix missing entries";
+  for (size_t i = 0; i < prefix_size; ++i) {
+    EXPECT_EQ(lhs[i].objective, rhs[i].objective)
+      << label << "Callback objective differs at index " << i;
+    EXPECT_EQ(lhs[i].solution_bound, rhs[i].solution_bound)
+      << label << "Callback bound differs at index " << i;
+    EXPECT_EQ(lhs[i].origin, rhs[i].origin) << label << "Callback origin differs at index " << i;
+    ASSERT_EQ(lhs[i].assignment.size(), rhs[i].assignment.size())
+      << label << "Callback assignment size differs at index " << i;
+    for (size_t j = 0; j < lhs[i].assignment.size(); ++j) {
+      EXPECT_EQ(lhs[i].assignment[j], rhs[i].assignment[j])
+        << label << "Callback assignment differs at callback " << i << " variable " << j;
+    }
+  }
+}
+
 }  // namespace
 
 class DeterministicBBTest : public ::testing::Test {
@@ -61,9 +174,9 @@ TEST_F(DeterministicBBTest, reproducible_objective)
 
   mip_solver_settings_t<int, double> settings;
   settings.time_limit       = 60.0;
-  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC;
+  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_BB;
   settings.num_cpu_threads  = 8;
-  settings.work_limit       = 4;
+  settings.work_limit       = 2;
 
   // Ensure seed is positive int32_t
   auto seed = std::random_device{}() & 0x7fffffff;
@@ -93,7 +206,7 @@ TEST_F(DeterministicBBTest, reproducible_infeasibility)
 
   mip_solver_settings_t<int, double> settings;
   settings.time_limit       = 60.0;
-  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC;
+  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_BB;
   settings.num_cpu_threads  = 8;
   settings.work_limit       = 100;  // High enough to fully explore
 
@@ -125,7 +238,7 @@ TEST_F(DeterministicBBTest, reproducible_high_contention)
 
   mip_solver_settings_t<int, double> settings;
   settings.time_limit       = 60.0;
-  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC;
+  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_BB;
   settings.num_cpu_threads  = 128;  // High thread count to stress contention
   settings.work_limit       = 1;
 
@@ -160,7 +273,7 @@ TEST_F(DeterministicBBTest, reproducible_solution_vector)
 
   mip_solver_settings_t<int, double> settings;
   settings.time_limit       = 60.0;
-  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC;
+  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_BB;
   settings.num_cpu_threads  = 8;
   settings.work_limit       = 2;
 
@@ -177,6 +290,117 @@ TEST_F(DeterministicBBTest, reproducible_solution_vector)
   expect_solutions_bitwise_equal(solution1, solution2, handle_);
 }
 
+TEST_F(DeterministicBBTest, deterministic_callback_sequence_reproducible_with_gpu_pipeline)
+{
+  constexpr size_t callback_compare_count = 5;   // number of leading callbacks compared
+  constexpr size_t callback_capture_limit = 32;  // handed to the recording callbacks
+  constexpr size_t min_gpu_callback_count = 3;   // minimum expected count_gpu_callbacks() result
+
+  auto path    = make_path_absolute("/mip/50v-10.mps");
+  auto problem = mps_parser::parse_mps<int, double>(path, false);
+  handle_.sync_stream();
+
+  mip_solver_settings_t<int, double> settings;
+  settings.time_limit               = 360.0;
+  settings.determinism_mode         = CUOPT_MODE_DETERMINISTIC;
+  settings.num_cpu_threads          = 2;
+  settings.work_limit               = 4;
+  settings.bb_work_unit_scale       = 2.0;
+  settings.gpu_heur_work_unit_scale = 1.0;
+  settings.cpufj_work_unit_scale    = 1.0;
+
+  auto seed = std::random_device{}() & 0x7fffffff;
+  std::cout << "Tested with seed " << seed << "\n";
+  settings.seed = seed;
+
+  const int n_variables = problem.get_variable_lower_bounds().size();
+  int user_data         = 7;
+  // Run 1: solve with a callback object that records into callbacks_run1.
+  std::vector<callback_solution_t> callbacks_run1;
+  first_n_get_solution_callback_t callback_run1(
+    callbacks_run1, n_variables, callback_capture_limit, &user_data);
+  auto settings_run1 = settings;
+  settings_run1.set_mip_callback(&callback_run1, &user_data);
+  cuopt::seed_generator::set_seed(seed);
+  auto solution1 = solve_mip(&handle_, problem, settings_run1);
+  // Run 2: same settings and seed, recording into callbacks_run2.
+  std::vector<callback_solution_t> callbacks_run2;
+  first_n_get_solution_callback_t callback_run2(
+    callbacks_run2, n_variables, callback_capture_limit, &user_data);
+  auto settings_run2 = settings;
+  settings_run2.set_mip_callback(&callback_run2, &user_data);
+  cuopt::seed_generator::set_seed(seed);
+  auto solution2 = solve_mip(&handle_, problem, settings_run2);
+  // Both runs must end with the same status and have produced enough callbacks to compare.
+  EXPECT_EQ(solution1.get_termination_status(), solution2.get_termination_status());
+  EXPECT_GE(callback_run1.n_calls, (int)callback_compare_count);
+  EXPECT_GE(callback_run2.n_calls, (int)callback_compare_count);
+  ASSERT_GE(callbacks_run1.size(), callback_compare_count);
+  ASSERT_GE(callbacks_run2.size(), callback_compare_count);
+
+  EXPECT_GE(count_gpu_callbacks(callbacks_run1), min_gpu_callback_count);
+  EXPECT_GE(count_gpu_callbacks(callbacks_run2), min_gpu_callback_count);
+  // The first callback_compare_count callbacks of both runs must match exactly.
+  expect_callback_prefixes_bitwise_equal(
+    callbacks_run1, callbacks_run2, callback_compare_count, "Deterministic callback run 1 vs 2: ");
+}
+
+class DeterministicGpuHeuristicsInstanceTest : public ::testing::TestWithParam<std::string> {
+ protected:
+  raft::handle_t handle_;  // handle passed to solve_mip by the tests using this fixture
+};
+
+TEST_P(DeterministicGpuHeuristicsInstanceTest, reproducible_with_gpu_heuristics)
+{
+  auto path    = make_path_absolute(GetParam());
+  auto problem = mps_parser::parse_mps<int, double>(path, false);
+  handle_.sync_stream();
+
+  mip_solver_settings_t<int, double> settings;
+  settings.time_limit       = 60.0;
+  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC;
+  settings.num_cpu_threads  = 8;
+  settings.work_limit       = 5;
+
+  auto seed = std::random_device{}() & 0x7fffffff;
+  std::cout << "Tested with seed " << seed << "\n";
+  settings.seed = seed;
+  // Solve the same problem three times, re-seeding the generator before every run.
+  cuopt::seed_generator::set_seed(seed);
+  auto solution1 = solve_mip(&handle_, problem, settings);
+  cuopt::seed_generator::set_seed(seed);
+  auto solution2 = solve_mip(&handle_, problem, settings);
+  cuopt::seed_generator::set_seed(seed);
+  auto solution3 = solve_mip(&handle_, problem, settings);
+  // All three runs must agree on status, objective, bound and the full solution.
+  EXPECT_EQ(solution1.get_termination_status(), solution2.get_termination_status());
+  EXPECT_EQ(solution1.get_termination_status(), solution3.get_termination_status());
+
+  EXPECT_DOUBLE_EQ(solution1.get_objective_value(), solution2.get_objective_value());
+  EXPECT_DOUBLE_EQ(solution1.get_objective_value(), solution3.get_objective_value());
+
+  EXPECT_DOUBLE_EQ(solution1.get_solution_bound(), solution2.get_solution_bound());
+  EXPECT_DOUBLE_EQ(solution1.get_solution_bound(), solution3.get_solution_bound());
+
+  expect_solutions_bitwise_equal(solution1, solution2, handle_, "GPU heur run 1 vs 2: ");
+  expect_solutions_bitwise_equal(solution1, solution3, handle_, "GPU heur run 1 vs 3: ");
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  DeterministicGpuHeuristics,
+  DeterministicGpuHeuristicsInstanceTest,
+  ::testing::Values(std::string("/mip/gen-ip054.mps"),
+                    std::string("/mip/pk1.mps"),
+                    // std::string("/mip/sct2.mps"),
+                    // std::string("/mip/thor50dday.mps"),
+                    std::string("/mip/neos5.mps")),
+  [](const ::testing::TestParamInfo<DeterministicGpuHeuristicsInstanceTest::ParamType>& info) {
+    std::string name = info.param.substr(info.param.rfind('/') + 1);  // strip the directory part
+    name             = name.substr(0, name.rfind('.'));               // strip the extension
+    std::replace(name.begin(), name.end(), '-', '_');  // '-' is not valid in a gtest test name
+    return name;
+  });
+
 // Parameterized test for different problem instances
 class DeterministicBBInstanceTest
   : public ::testing::TestWithParam<std::tuple<std::string, int, double, int>> {
@@ -227,9 +451,10 @@ INSTANTIATE_TEST_SUITE_P(
   DeterministicBB,
   DeterministicBBInstanceTest,
   ::testing::Values(
-    // Instance, threads, time_limit
+    // Instance, threads, time_limit, work_limit
     std::make_tuple("/mip/gen-ip054.mps", 4, 60.0, 4),
     std::make_tuple("/mip/swath1.mps", 8, 60.0, 4),
+    std::make_tuple("/mip/50v-10.mps", 8, 60.0, 4),
     std::make_tuple("/mip/gen-ip054.mps", 128, 120.0, 1),
     std::make_tuple("/mip/bb_optimality.mps", 4, 60.0, 4),
     std::make_tuple("/mip/neos5.mps", 16, 60.0, 1),
diff --git a/cpp/tests/mip/determinism_utils.cuh b/cpp/tests/mip/determinism_utils.cuh
new file mode 100644
index 0000000000..b4e0d4e01e
--- /dev/null
+++ b/cpp/tests/mip/determinism_utils.cuh
@@ -0,0 +1,77 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <thread>
+
+#include <cuda/atomic>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_scalar.hpp>
+
+#include <utilities/copy_helpers.hpp>
+
+namespace cuopt::linear_programming::test {
+
+static __global__ void spin_kernel(int* flag, unsigned long long timeout_clocks = 10000000)
+{
+  // Every thread polls *flag through an atomic reference and leaves as soon as it reads a
+  // non-zero value, or once more than timeout_clocks clock64() ticks have passed since entry.
+  cuda::atomic_ref<int> stop_flag(*flag);
+  const long long int entry_clock = clock64();
+
+  for (;;) {
+    if (stop_flag.load() != 0) { return; }
+    const long long int elapsed = clock64() - entry_clock;
+    if (elapsed > timeout_clocks) { return; }
+  }
+}
+
+static void launch_spin_kernel_stream_thread(rmm::cuda_stream_view stream_view, int* flag)
+{
+  for (;;) {
+    const int grid_size  = rand() % 64 + 1;    // 1..64 blocks
+    const int block_size = rand() % 1024 + 1;  // 1..1024 threads per block
+    spin_kernel<<<grid_size, block_size, 0, stream_view>>>(flag);
+    cudaStreamSynchronize(stream_view);
+    if (host_copy(flag, 1, stream_view)[0] != 0) { return; }  // copied flag is non-zero: stop
+    std::this_thread::sleep_for(std::chrono::milliseconds(rand() % 1000 + 1));  // 1..1000 ms
+  }
+}
+
+class spin_stream_raii_t {  // repeatedly launches spin kernels on its own stream while alive
+ public:
+  spin_stream_raii_t()
+    : flag(0, stream), spin_thread(launch_spin_kernel_stream_thread, stream.view(), flag.data())
+  {
+  }
+  // Sets the device flag to 1 and waits for the launcher thread to finish.
+  ~spin_stream_raii_t()
+  {
+    int one = 1;
+    flag.set_value_async(one, stream);
+    spin_thread.join();
+  }
+
+ private:
+  rmm::cuda_stream stream;       // stream used for the flag and passed to the launcher thread
+  rmm::device_scalar<int> flag;  // device-side flag, constructed with value 0
+  std::thread spin_thread;       // runs launch_spin_kernel_stream_thread
+};
+
+}  // namespace cuopt::linear_programming::test
diff --git a/cpp/tests/mip/diversity_test.cu b/cpp/tests/mip/diversity_test.cu
new file mode 100644
index 0000000000..c5e1c0842d
--- /dev/null
+++ b/cpp/tests/mip/diversity_test.cu
@@ -0,0 +1,395 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../linear_programming/utilities/pdlp_test_utilities.cuh"
+#include "determinism_utils.cuh"
+#include "mip_utils.cuh"
+
+#include <cuopt/error.hpp>
+#include <cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh>
+#include <cuopt/linear_programming/solve.hpp>
+#include <cuopt/linear_programming/utilities/internals.hpp>
+#include <mip_heuristics/diversity/diversity_manager.cuh>
+#include <mip_heuristics/feasibility_jump/feasibility_jump.cuh>
+#include <mip_heuristics/local_search/local_search.cuh>
+#include <mip_heuristics/relaxed_lp/relaxed_lp.cuh>
+#include <mip_heuristics/solution/solution.cuh>
+#include <mip_heuristics/solver_context.cuh>
+#include <mps_parser/parser.hpp>
+#include <pdlp/initial_scaling_strategy/initial_scaling.cuh>
+#include <pdlp/pdlp.cuh>
+#include <pdlp/restart_strategy/pdlp_restart_strategy.cuh>
+#include <pdlp/step_size_strategy/adaptive_step_size_strategy.hpp>
+#include <pdlp/utilities/problem_checking.cuh>
+#include <utilities/common_utils.hpp>
+#include <utilities/logger.hpp>
+#include <utilities/seed_generator.cuh>
+
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <gtest/gtest.h>
+
+#include <thrust/count.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sequence.h>
+
+#include <cstdint>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace cuopt::linear_programming::test {
+
+void init_handler(const raft::handle_t* handle_ptr)
+{
+  // Init the cuBLAS / cuSPARSE contexts now (in device pointer mode), not during the solve
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode(
+    handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
+  RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode(
+    handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
+}
+
+static void setup_device_symbols(rmm::cuda_stream_view stream_view) { (void)stream_view; }  // no-op
+
+static uint32_t test_full_run_determinism(std::string path,
+                                          unsigned long seed = std::random_device{}(),
+                                          float work_limit   = 10.0f)
+{
+  const raft::handle_t handle_{};
+  // NOTE(review): `seed` is not read in this function; the caller seeds the generator itself.
+  cuopt::mps_parser::mps_data_model_t<int, double> mps_problem =
+    cuopt::mps_parser::parse_mps<int, double>(path, false);
+  handle_.sync_stream();
+  auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem);
+  problem_checking_t<int, double>::check_problem_representation(op_problem);
+
+  init_handler(op_problem.get_handle_ptr());
+  // run the problem constructor of MIP, so that we do bounds standardization
+  detail::problem_t<int, double> problem(op_problem);
+  problem.deterministic = true;
+  problem.preprocess_problem();
+
+  setup_device_symbols(op_problem.get_handle_ptr()->get_stream());
+
+  auto settings             = mip_solver_settings_t<int, double>{};
+  settings.time_limit       = 3000.;
+  settings.work_limit       = work_limit;
+  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS;
+  settings.heuristics_only  = true;
+  auto timer = cuopt::termination_checker_t(3000.0, cuopt::termination_checker_t::root_tag_t{});
+  detail::mip_solver_t<int, double> solver(problem, settings, timer);
+  problem.tolerances = settings.get_tolerances();
+  // Run the diversity manager with a work-limit timer built on solver.context.gpu_heur_loop.
+  detail::diversity_manager_t<int, double> diversity_manager(solver.context);
+  solver.context.gpu_heur_loop.deterministic = true;
+  diversity_manager.timer =
+    work_limit_timer_t(solver.context.gpu_heur_loop, settings.work_limit, timer);
+  diversity_manager.run_solver();
+  // Hash every solution of the resulting population, then hash the list of hashes.
+  std::vector<uint32_t> hashes;
+  auto pop = diversity_manager.get_population_pointer();
+  for (const auto& sol : pop->population_to_vector()) {
+    hashes.push_back(sol.get_hash());
+  }
+
+  uint32_t final_hash = detail::compute_hash(hashes);
+  printf("%s: final hash: 0x%x, pop size %d\n",
+         path.c_str(),
+         final_hash,
+         (int)pop->population_to_vector().size());
+  return final_hash;
+}
+
+static uint32_t test_initial_solution_determinism(std::string path,
+                                                  unsigned long seed = std::random_device{}())
+{
+  const raft::handle_t handle_{};
+  // NOTE(review): `seed` is not read in this function; the caller seeds the generator itself.
+  cuopt::mps_parser::mps_data_model_t<int, double> mps_problem =
+    cuopt::mps_parser::parse_mps<int, double>(path, false);
+  handle_.sync_stream();
+  auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem);
+  problem_checking_t<int, double>::check_problem_representation(op_problem);
+
+  init_handler(op_problem.get_handle_ptr());
+  // run the problem constructor of MIP, so that we do bounds standardization
+  detail::problem_t<int, double> problem(op_problem);
+  problem.deterministic = true;
+  problem.preprocess_problem();
+
+  setup_device_symbols(op_problem.get_handle_ptr()->get_stream());
+
+  auto settings             = mip_solver_settings_t<int, double>{};
+  settings.time_limit       = 3000.;
+  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS;
+  settings.heuristics_only  = true;
+  auto timer = cuopt::termination_checker_t(3000.0, cuopt::termination_checker_t::root_tag_t{});
+  detail::mip_solver_t<int, double> solver(problem, settings, timer);
+  problem.tolerances = settings.get_tolerances();
+  // Run the diversity manager with diversity_config.initial_solution_only switched on.
+  detail::diversity_manager_t<int, double> diversity_manager(solver.context);
+  solver.context.diversity_manager_ptr = &diversity_manager;
+  work_limit_context_t work_limit_context("DiversityManager");
+  work_limit_context.deterministic = true;
+  diversity_manager.timer          = work_limit_timer_t(work_limit_context, 60000, timer);
+  diversity_manager.diversity_config.initial_solution_only = true;
+  diversity_manager.run_solver();
+  // Hash every solution of the resulting population, then hash the list of hashes.
+  std::vector<uint32_t> hashes;
+  auto pop = diversity_manager.get_population_pointer();
+  for (const auto& sol : pop->population_to_vector()) {
+    hashes.push_back(sol.get_hash());
+  }
+
+  uint32_t final_hash = detail::compute_hash(hashes);
+  printf("%s: final hash: 0x%x, pop size %d\n",
+         path.c_str(),
+         final_hash,
+         (int)pop->population_to_vector().size());
+  return final_hash;
+}
+
+// Builds a small population for `path` and recombines pairs of its solutions with every
+// recombiner under test, returning one hash computed over all offspring hashes.
+//
+// Offspring hashes are also stored in a function-local static map keyed by
+// (path, i, j, recombiner): a later call on the same instance raises a test failure for each
+// pair whose offspring hash differs from the stored one.
+//
+// `seed` offsets the FJ seeds of the solutions generated to top up the population.
+static uint32_t test_recombiners_determinism(std::string path,
+                                             unsigned long seed = std::random_device{}())
+{
+  const raft::handle_t handle_{};
+
+  cuopt::mps_parser::mps_data_model_t<int, double> mps_problem =
+    cuopt::mps_parser::parse_mps<int, double>(path, false);
+  handle_.sync_stream();
+  auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem);
+  problem_checking_t<int, double>::check_problem_representation(op_problem);
+
+  init_handler(op_problem.get_handle_ptr());
+  // run the problem constructor of MIP, so that we do bounds standardization
+  detail::problem_t<int, double> problem(op_problem);
+  problem.deterministic = true;
+  problem.preprocess_problem();
+
+  setup_device_symbols(op_problem.get_handle_ptr()->get_stream());
+
+  auto settings             = mip_solver_settings_t<int, double>{};
+  settings.time_limit       = 3000.;
+  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS;
+  settings.heuristics_only  = true;
+  auto timer = cuopt::termination_checker_t(3000.0, cuopt::termination_checker_t::root_tag_t{});
+  detail::mip_solver_t<int, double> solver(problem, settings, timer);
+  problem.tolerances = settings.get_tolerances();
+
+  // Run the diversity manager in dry-run mode under a deterministic work-limit context.
+  detail::diversity_manager_t<int, double> diversity_manager(solver.context);
+  solver.context.diversity_manager_ptr = &diversity_manager;
+  work_limit_context_t work_limit_context("DiversityManager");
+  work_limit_context.deterministic           = true;
+  diversity_manager.timer                    = work_limit_timer_t(work_limit_context, 60000, timer);
+  diversity_manager.diversity_config.dry_run = true;
+  diversity_manager.run_solver();
+
+  // Generate a population by running FJ on random starting points
+  // recombine a few solutions, observe the output
+  for (int i = diversity_manager.population.current_size(); i < 3; ++i) {
+    detail::solution_t<int, double> random_initial_solution(problem);
+    random_initial_solution.assign_random_within_bounds();
+    detail::fj_settings_t fj_settings;
+    fj_settings.feasibility_run = false;
+    fj_settings.iteration_limit = 1000 + i * 100;
+    fj_settings.seed            = seed + i;
+    auto solution               = run_fj(problem,
+                           fj_settings,
+                           fj_tweaks_t{},
+                           random_initial_solution.get_host_assignment(),
+                           CUOPT_MODE_DETERMINISTIC)
+                      .solution;
+    printf("population %d hash: 0x%x\n", i, solution.get_hash());
+    diversity_manager.population.add_solution(std::move(solution),
+                                              internals::mip_solution_origin_t::FEASIBILITY_JUMP);
+  }
+
+  auto pop_vector = diversity_manager.get_population_pointer()->population_to_vector();
+  int pop_size    = std::min(6, (int)pop_vector.size());
+
+  std::vector<uint32_t> hashes;
+
+  // Persists across calls so that repeated runs can be compared pair by pair.
+  static std::map<std::tuple<std::string, int, int, detail::recombiner_enum_t>, uint32_t> hash_map;
+
+  // NOTE(review): parent indices start at 1, so pop_vector[0] never takes part - confirm intent.
+  for (auto recombiner : {detail::recombiner_enum_t::LINE_SEGMENT,
+                          detail::recombiner_enum_t::BOUND_PROP,
+                          detail::recombiner_enum_t::FP}) {
+    for (int i = 1; i < pop_size; i++) {
+      for (int j = i + 1; j < pop_size; j++) {
+        printf("recombining %d and %d w/ recombiner %s\n",
+               i,
+               j,
+               detail::all_recombine_stats::recombiner_labels[(int)recombiner]);
+        auto [offspring, success] =
+          diversity_manager.recombine(pop_vector[i], pop_vector[j], recombiner);
+        auto offspring_hash = offspring.get_hash();
+        printf("for %d,%d: offspring hash: 0x%x, parent 1 hash: 0x%x, parent 2 hash: 0x%x\n",
+               i,
+               j,
+               offspring_hash,
+               pop_vector[i].get_hash(),
+               pop_vector[j].get_hash());
+        // The first sighting of a (path, i, j, recombiner) key stores the hash; any later
+        // sighting must reproduce the stored value.
+        const auto key             = std::make_tuple(path, i, j, recombiner);
+        const auto [entry, is_new] = hash_map.try_emplace(key, offspring_hash);
+        if (!is_new && entry->second != offspring_hash) {
+          printf("%s: hash mismatch for %d,%d: 0x%x != 0x%x\n",
+                 path.c_str(),
+                 i,
+                 j,
+                 entry->second,
+                 offspring_hash);
+          ADD_FAILURE() << "hash mismatch";
+        }
+        hashes.push_back(offspring_hash);
+      }
+    }
+  }
+  // Only the offspring hashes feed the returned value.
+  return detail::compute_hash(hashes);
+}
+
+class DiversityTestParams : public testing::TestWithParam<std::tuple<std::string, float>> {};
+
+TEST_P(DiversityTestParams, recombiners_deterministic)
+{
+  // cuopt::init_logger_t log("", true);
+  cuopt::default_logger().set_pattern("[%n] [%-6l] %v");
+  cuopt::default_logger().set_level(rapids_logger::level_enum::debug);
+  cuopt::default_logger().flush_on(rapids_logger::level_enum::debug);
+  // Two helper objects launch spin kernels on their own streams for the duration of the test.
+  spin_stream_raii_t spin_stream_1;
+  spin_stream_raii_t spin_stream_2;
+  // CUOPT_SEED / CUOPT_INSTANCE, when set, override the seed and the instance that is solved.
+  auto test_instance = std::get<0>(GetParam());
+  std::cout << "Running: " << test_instance << std::endl;
+  int seed =
+    std::getenv("CUOPT_SEED") ? std::stoi(std::getenv("CUOPT_SEED")) : std::random_device{}();
+  std::cerr << "Tested with seed " << seed << "\n";
+  test_instance = std::getenv("CUOPT_INSTANCE") ? std::getenv("CUOPT_INSTANCE") : test_instance;
+  auto path     = make_path_absolute(test_instance);
+  uint32_t gold_hash = 0;
+  for (int i = 0; i < 2; ++i) {  // run 0 records the reference hash, run 1 must match it
+    cuopt::seed_generator::set_seed(seed);
+    std::cout << "Running " << test_instance << " " << i << std::endl;
+    std::cout << "-------------------------------------------------------------\n";
+    auto hash = test_recombiners_determinism(path, seed);
+    if (i == 0) {
+      gold_hash = hash;
+      std::cout << "Gold hash: " << gold_hash << std::endl;
+    } else {
+      ASSERT_EQ(hash, gold_hash);
+    }
+  }
+}
+
+TEST_P(DiversityTestParams, initial_solution_deterministic)
+{
+  cuopt::default_logger().set_pattern("[%n] [%-6l] %v");
+  // Two helper objects launch spin kernels on their own streams for the duration of the test.
+  spin_stream_raii_t spin_stream_1;
+  spin_stream_raii_t spin_stream_2;
+  // CUOPT_SEED / CUOPT_INSTANCE, when set, override the seed and the instance that is solved.
+  auto test_instance = std::get<0>(GetParam());
+  std::cout << "Running: " << test_instance << std::endl;
+  int seed =
+    std::getenv("CUOPT_SEED") ? std::stoi(std::getenv("CUOPT_SEED")) : std::random_device{}();
+  std::cerr << "Tested with seed " << seed << "\n";
+  test_instance = std::getenv("CUOPT_INSTANCE") ? std::getenv("CUOPT_INSTANCE") : test_instance;
+  auto path     = make_path_absolute(test_instance);
+  uint32_t gold_hash = 0;
+  for (int i = 0; i < 2; ++i) {  // run 0 records the reference hash, run 1 must match it
+    cuopt::seed_generator::set_seed(seed);
+    std::cout << "Running " << test_instance << " " << i << std::endl;
+    std::cout << "-------------------------------------------------------------\n";
+    auto hash = test_initial_solution_determinism(path, seed);
+    if (i == 0) {
+      gold_hash = hash;
+      std::cout << "Gold hash: " << gold_hash << std::endl;
+    } else {
+      ASSERT_EQ(hash, gold_hash);
+    }
+  }
+}
+
+// Disabled as it takes too long to run in CI and overlaps with other determinism full run tests.
+TEST_P(DiversityTestParams, DISABLED_full_run_deterministic)
+{
+  cuopt::init_logger_t log("", true);
+  // cuopt::default_logger().set_pattern("[%n] [%-6l] %v");
+  cuopt::default_logger().set_level(rapids_logger::level_enum::debug);
+  cuopt::default_logger().flush_on(rapids_logger::level_enum::debug);
+  // Two helper objects launch spin kernels on their own streams for the duration of the test.
+  spin_stream_raii_t spin_stream_1;
+  spin_stream_raii_t spin_stream_2;
+  // CUOPT_SEED / CUOPT_INSTANCE, when set, override the seed and the instance that is solved.
+  auto test_instance     = std::get<0>(GetParam());
+  const float work_limit = std::get<1>(GetParam());
+  std::cout << "Running: " << test_instance << std::endl;
+  int seed =
+    std::getenv("CUOPT_SEED") ? std::stoi(std::getenv("CUOPT_SEED")) : std::random_device{}();
+  std::cerr << "Tested with seed " << seed << "\n";
+  auto path = make_path_absolute(test_instance);
+  if (std::getenv("CUOPT_INSTANCE")) {
+    test_instance = std::getenv("CUOPT_INSTANCE");
+    path          = make_path_absolute(test_instance);
+  }
+  uint32_t gold_hash = 0;
+  for (int i = 0; i < 4; ++i) {  // run 0 records the reference hash, runs 1..3 must match it
+    cuopt::seed_generator::set_seed(seed);
+    std::cout << "Running " << test_instance << " " << i << std::endl;
+    std::cout << "-------------------------------------------------------------\n";
+    auto hash = test_full_run_determinism(path, seed, work_limit);
+    if (i == 0) {
+      gold_hash = hash;
+      std::cout << "Gold hash: " << gold_hash << std::endl;
+    } else {
+      ASSERT_EQ(hash, gold_hash);
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(DiversityTest,
+                         DiversityTestParams,
+                         testing::Values(
+                           // Each tuple is (instance path, work limit); only the full-run test
+                           // reads the work limit.
+                           std::make_tuple("mip/neos5.mps", 5.0f),
+                           std::make_tuple("mip/gen-ip054.mps", 5.0f),
+                           std::make_tuple("mip/pk1.mps", 5.0f),
+                           //  std::make_tuple("uccase9.mps"),
+                           // std::make_tuple("mip/neos5.mps", 5.0f),
+                           std::make_tuple("mip/50v-10.mps", 5.0f)
+                           // std::make_tuple("mip/rmatr200-p5.mps", 5.0f)
+                           ));
+
+}  // namespace cuopt::linear_programming::test
diff --git a/cpp/tests/mip/feasibility_jump_tests.cu b/cpp/tests/mip/feasibility_jump_tests.cu
index 4e8a518522..05b2c03fec 100644
--- a/cpp/tests/mip/feasibility_jump_tests.cu
+++ b/cpp/tests/mip/feasibility_jump_tests.cu
@@ -18,6 +18,7 @@
 #include <mps_parser/parser.hpp>
 #include <pdlp/utilities/problem_checking.cuh>
 #include <utilities/common_utils.hpp>
+#include <utilities/seed_generator.cuh>
 
 #include <raft/sparse/detail/cusparse_wrappers.h>
 #include <raft/core/handle.hpp>
@@ -46,28 +47,23 @@ void init_handler(const raft::handle_t* handle_ptr)
     handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
 }
 
-struct fj_tweaks_t {
-  double objective_weight = 0;
-};
-
-struct fj_state_t {
-  detail::solution_t<int, double> solution;
-  std::vector<double> solution_vector;
-  int minimums;
-  double incumbent_objective;
-  double incumbent_violation;
-};
-
 // Helper function to setup MIP solver and run FJ with given settings and initial solution
-static fj_state_t run_fj(std::string test_instance,
-                         const detail::fj_settings_t& fj_settings,
-                         fj_tweaks_t tweaks                   = {},
-                         std::vector<double> initial_solution = {})
+static fj_state_t run_fj_instance(std::string test_instance,
+                                  const detail::fj_settings_t& fj_settings,
+                                  fj_tweaks_t tweaks                   = {},
+                                  std::vector<double> initial_solution = {},
+                                  int determinism_mode                 = CUOPT_MODE_DETERMINISTIC)
 {
   const raft::handle_t handle_{};
   std::cout << "Running: " << test_instance << std::endl;
 
   auto path = cuopt::test::get_rapids_dataset_root_dir() + ("/mip/" + test_instance);
+
+  if (std::getenv("CUOPT_INSTANCE")) {
+    path = make_path_absolute(std::getenv("CUOPT_INSTANCE"));
+    std::cout << "Using instance from CUOPT_INSTANCE: " << path << std::endl;
+  }
+
   cuopt::mps_parser::mps_data_model_t<int, double> mps_problem =
     cuopt::mps_parser::parse_mps<int, double>(path, false);
   handle_.sync_stream();
@@ -78,37 +74,8 @@ static fj_state_t run_fj(std::string test_instance,
   // run the problem constructor of MIP, so that we do bounds standardization
   detail::problem_t<int, double> problem(op_problem);
   problem.preprocess_problem();
-  detail::mip_scaling_strategy_t<int, double> scaling(problem);
-
-  auto settings       = mip_solver_settings_t<int, double>{};
-  settings.time_limit = 30.;
-  auto timer          = cuopt::timer_t(30);
-  detail::mip_solver_t<int, double> solver(problem, settings, scaling, timer);
-
-  detail::solution_t<int, double> solution(*solver.context.problem_ptr);
-  if (initial_solution.size() > 0) {
-    expand_device_copy(solution.assignment, initial_solution, solution.handle_ptr->get_stream());
-  } else {
-    thrust::fill(solution.handle_ptr->get_thrust_policy(),
-                 solution.assignment.begin(),
-                 solution.assignment.end(),
-                 0.0);
-  }
-  solution.clamp_within_bounds();
-
-  detail::fj_t<int, double> fj(solver.context, fj_settings);
-  fj.reset_weights(solution.handle_ptr->get_stream(), 1.);
-  fj.objective_weight.set_value_async(tweaks.objective_weight, solution.handle_ptr->get_stream());
-  solution.handle_ptr->sync_stream();
 
-  fj.solve(solution);
-  auto solution_vector = host_copy(solution.assignment, solution.handle_ptr->get_stream());
-
-  return {solution,
-          solution_vector,
-          fj.climbers[0]->local_minimums_reached.value(solution.handle_ptr->get_stream()),
-          fj.climbers[0]->incumbent_objective.value(solution.handle_ptr->get_stream()),
-          fj.climbers[0]->violation_score.value(solution.handle_ptr->get_stream())};
+  return run_fj(problem, fj_settings, tweaks, initial_solution, determinism_mode);
 }
 
 // FJ had a bug causing objective/violation values to explode in magnitude in certain scenarios.
@@ -118,12 +85,12 @@ static bool run_fj_check_no_obj_runoff(std::string test_instance)
   detail::fj_settings_t fj_settings;
   fj_settings.time_limit             = 30.;
   fj_settings.mode                   = detail::fj_mode_t::EXIT_NON_IMPROVING;
-  fj_settings.n_of_minimums_for_exit = 20000 * 1000;
+  fj_settings.n_of_minimums_for_exit = 5000;
   fj_settings.update_weights         = true;
   fj_settings.feasibility_run        = false;
   fj_settings.iteration_limit        = 20000;
 
-  auto state = run_fj(test_instance, fj_settings);
+  auto state = run_fj_instance(test_instance, fj_settings);
 
   // ensure that the objective and the violation in the FJ state are not too large (<1e60)
   EXPECT_LE(state.incumbent_violation, 1e60) << "FJ violation too large";
@@ -140,12 +107,13 @@ static bool run_fj_check_objective(std::string test_instance, int iter_limit, do
   detail::fj_settings_t fj_settings;
   fj_settings.time_limit             = 30.;
   fj_settings.mode                   = detail::fj_mode_t::EXIT_NON_IMPROVING;
-  fj_settings.n_of_minimums_for_exit = 20000 * 1000;
+  fj_settings.n_of_minimums_for_exit = 5000;
   fj_settings.update_weights         = true;
   fj_settings.feasibility_run        = obj_target == +std::numeric_limits<double>::infinity();
   fj_settings.iteration_limit        = iter_limit;
 
-  auto state     = run_fj(test_instance, fj_settings);
+  auto state =
+    run_fj_instance(test_instance, fj_settings, fj_tweaks_t{}, {}, CUOPT_MODE_DETERMINISTIC);
   auto& solution = state.solution;
 
   CUOPT_LOG_DEBUG("%s: Solution generated with FJ: is_feasible %d, objective %g (raw %g)",
@@ -167,12 +135,12 @@ static bool run_fj_check_feasible(std::string test_instance)
   detail::fj_settings_t fj_settings;
   fj_settings.time_limit             = 30.;
   fj_settings.mode                   = detail::fj_mode_t::EXIT_NON_IMPROVING;
-  fj_settings.n_of_minimums_for_exit = 20000 * 1000;
+  fj_settings.n_of_minimums_for_exit = 5000;
   fj_settings.update_weights         = true;
   fj_settings.feasibility_run        = false;
   fj_settings.iteration_limit        = 25000;
 
-  auto state     = run_fj(test_instance, fj_settings);
+  auto state     = run_fj_instance(test_instance, fj_settings);
   auto& solution = state.solution;
 
   bool previous_feasible = solution.get_feasible();
@@ -183,8 +151,8 @@ static bool run_fj_check_feasible(std::string test_instance)
   // again but with very large obj weight to force FJ into the infeasible region
   fj_tweaks_t tweaks;
   tweaks.objective_weight = 1e6;
-  auto new_state          = run_fj(test_instance, fj_settings, tweaks, state.solution_vector);
-  auto& new_solution      = new_state.solution;
+  auto new_state     = run_fj_instance(test_instance, fj_settings, tweaks, state.solution_vector);
+  auto& new_solution = new_state.solution;
 
   CUOPT_LOG_DEBUG("%s: Solution generated with FJ: is_feasible %d, objective %g (raw %g)",
                   test_instance.c_str(),
@@ -199,63 +167,57 @@ static bool run_fj_check_feasible(std::string test_instance)
   return true;
 }
 
-class MIPSolveParametricTest : public testing::TestWithParam<std::tuple<std::string, double, int>> {
-};
-
-TEST_P(MIPSolveParametricTest, feasibility_jump_obj_test)
+static bool run_fj_check_determinism(std::string test_instance, int iter_limit)
 {
-  auto [instance, obj_target, iter_limit] = GetParam();
-  EXPECT_TRUE(run_fj_check_objective(instance, iter_limit, obj_target));
-}
+  detail::fj_settings_t fj_settings;
+  fj_settings.time_limit             = std::numeric_limits<double>::max();
+  fj_settings.mode                   = detail::fj_mode_t::EXIT_NON_IMPROVING;
+  fj_settings.n_of_minimums_for_exit = 5000 * 1000;
+  // fj_settings.work_limit             = 0.5;  // run for 0.5wu (~0.5s)
+  fj_settings.update_weights      = true;
+  fj_settings.feasibility_run     = false;
+  fj_settings.iteration_limit     = iter_limit;
+  fj_settings.load_balancing_mode = detail::fj_load_balancing_mode_t::ALWAYS_ON;
+  fj_settings.seed                = cuopt::seed_generator::get_seed();
+
+  auto state     = run_fj_instance(test_instance, fj_settings);
+  auto& solution = state.solution;
 
-INSTANTIATE_TEST_SUITE_P(
-  MIPSolveTest,
-  MIPSolveParametricTest,
-  testing::Values(
-    // Bug: https://github.com/NVIDIA/cuopt/issues/214
-    // std::make_tuple("50v-10.mps", 7800, 100000),
-    // std::make_tuple("fiball.mps", 140, 25000),
-    // std::make_tuple("rmatr200-p5.mps", 7000, 10000),
-    std::make_tuple("gen-ip054.mps", 7500, 20000),
-    std::make_tuple("sct2.mps", 100, 50000),
-    std::make_tuple("uccase9.mps", 4000000, 50000),
-    // unstable, prone to failure on slight weight changes
-    // std::make_tuple("drayage-25-23.mps", 300000, 50000),
-    std::make_tuple("tr12-30.mps", 300000, 50000),
-    std::make_tuple("neos-3004026-krka.mps",
-                    +std::numeric_limits<double>::infinity(),
-                    35000),  // feasibility
-    // std::make_tuple("nursesched-medium-hint03.mps", 12000, 50000), // too large
-    std::make_tuple("ns1208400.mps", 2, 60000),
-    std::make_tuple("gmu-35-50.mps", -2300000, 25000),
-    std::make_tuple("n2seq36q.mps", 158800, 25000),
-    std::make_tuple("seymour1.mps", 440, 50000),
-    std::make_tuple("cvs16r128-89.mps", -50, 10000)
-// TEMPORARY: occasional cusparse transpose issues on ARM in CI
-#ifndef __aarch64__
-      ,
-    std::make_tuple("thor50dday.mps", 250000, 1000)
-#endif
-      ));
-
-TEST(mip_solve, feasibility_jump_feas_test)
-{
-  for (const auto& instance : {"tr12-30.mps",
-                               "sct2.mps"
-#ifndef __aarch64__
-                               ,
-                               "thor50dday.mps"
-#endif
-       }) {
-    run_fj_check_feasible(instance);
+  printf("%s[seed=%x]: Solution generated with FJ: is_feasible %d, objective %g (raw %g)\n",
+         test_instance.c_str(),
+         fj_settings.seed,
+         solution.get_feasible(),
+         solution.get_user_objective(),
+         solution.get_objective());
+  // Function-local static: the first objective seen for an instance is kept as the reference.
+  static std::unordered_map<std::string, double> first_val_map;
+  if (first_val_map.count(test_instance) == 0) {
+    first_val_map[test_instance] = solution.get_user_objective();
   }
+  EXPECT_NEAR(solution.get_user_objective(), first_val_map[test_instance], 1.0)
+    << test_instance << " determinism objective mismatch";
+
+  return true;
 }
 
-TEST(mip_solve, feasibility_jump_obj_runoff_test)
+TEST(mip_solve, feasibility_jump_determinism)
 {
-  for (const auto& instance : {"minrep_inf.mps", "sct2.mps", "uccase9.mps",
-                               /*"buildingenergy.mps"*/}) {
-    run_fj_check_no_obj_runoff(instance);
+  cuopt::init_logger_t log("", true);
+  // CUOPT_SEED pins the seed for a replay; without it a random seed is drawn once per test.
+  int seed =
+    std::getenv("CUOPT_SEED") ? std::stoi(std::getenv("CUOPT_SEED")) : std::random_device{}();
+  printf("Tested with seed %d\n", seed);
+  for (const auto& [instance, iter_limit] : {std::make_pair("thor50dday.mps", 1000),
+                                             std::make_pair("gen-ip054.mps", 1000),
+                                             std::make_pair("50v-10.mps", 1000),
+                                             std::make_pair("seymour1.mps", 1000),
+                                             std::make_pair("rmatr200-p5.mps", 1000),
+                                             std::make_pair("tr12-30.mps", 1000),
+                                             std::make_pair("sct2.mps", 1000)}) {
+    for (int i = 0; i < 10; i++) {
+      cuopt::seed_generator::set_seed(seed);
+      run_fj_check_determinism(instance, iter_limit);
+    }
   }
 }
 
diff --git a/cpp/tests/mip/load_balancing_test.cu b/cpp/tests/mip/load_balancing_test.cu
index 1f825a26f7..9fc15d0325 100644
--- a/cpp/tests/mip/load_balancing_test.cu
+++ b/cpp/tests/mip/load_balancing_test.cu
@@ -9,6 +9,7 @@
 #include "mip_utils.cuh"
 
 #include <raft/sparse/detail/cusparse_wrappers.h>
+#include <cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh>
 #include <mip_heuristics/mip_scaling_strategy.cuh>
 #include <mip_heuristics/presolve/bounds_presolve.cuh>
 #include <mip_heuristics/presolve/load_balanced_bounds_presolve.cuh>
@@ -128,8 +129,8 @@ void test_multi_probe(std::string path)
   problem_checking_t<int, double>::check_problem_representation(op_problem);
   detail::problem_t<int, double> problem(op_problem);
   mip_solver_settings_t<int, double> default_settings{};
-  detail::mip_scaling_strategy_t<int, double> scaling(problem);
-  detail::mip_solver_t<int, double> solver(problem, default_settings, scaling, cuopt::timer_t(0));
+  auto timer = cuopt::termination_checker_t(0.0, cuopt::termination_checker_t::root_tag_t{});
+  detail::mip_solver_t<int, double> solver(problem, default_settings, timer);
   detail::load_balanced_problem_t<int, double> lb_problem(problem);
   detail::load_balanced_bounds_presolve_t<int, double> lb_prs(lb_problem, solver.context);
 
diff --git a/cpp/tests/mip/local_search_test.cu b/cpp/tests/mip/local_search_test.cu
new file mode 100644
index 0000000000..fc9334d98d
--- /dev/null
+++ b/cpp/tests/mip/local_search_test.cu
@@ -0,0 +1,238 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../linear_programming/utilities/pdlp_test_utilities.cuh"
+#include "determinism_utils.cuh"
+#include "mip_utils.cuh"
+
+#include <cuopt/error.hpp>
+#include <cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh>
+#include <cuopt/linear_programming/solve.hpp>
+#include <cuopt/linear_programming/utilities/internals.hpp>
+#include <mip_heuristics/diversity/diversity_manager.cuh>
+#include <mip_heuristics/feasibility_jump/feasibility_jump.cuh>
+#include <mip_heuristics/local_search/local_search.cuh>
+#include <mip_heuristics/relaxed_lp/relaxed_lp.cuh>
+#include <mip_heuristics/solution/solution.cuh>
+#include <mip_heuristics/solver_context.cuh>
+#include <mps_parser/parser.hpp>
+#include <pdlp/initial_scaling_strategy/initial_scaling.cuh>
+#include <pdlp/pdlp.cuh>
+#include <pdlp/restart_strategy/pdlp_restart_strategy.cuh>
+#include <pdlp/step_size_strategy/adaptive_step_size_strategy.hpp>
+#include <pdlp/utilities/problem_checking.cuh>
+#include <utilities/common_utils.hpp>
+#include <utilities/seed_generator.cuh>
+
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <gtest/gtest.h>
+
+#include <thrust/count.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sequence.h>
+
+#include <cstdint>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace cuopt::linear_programming::test {
+
+void init_handler(const raft::handle_t* handle_ptr)
+{
+  // Put cuBLAS and cuSPARSE into device pointer mode up front, keeping it out of solve time
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode(
+    handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
+  RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode(
+    handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
+}
+// No-op: the stream argument is only cast to void so that it does not count as unused.
+static void setup_device_symbols(rmm::cuda_stream_view stream_view) { (void)stream_view; }
+
+enum local_search_mode_t {
+  FP = 0,           // run_fp loops run_single_fp_descent / restart_fp
+  STAGED_FP,        // no branch in run_fp handles this value
+  FJ_LINE_SEGMENT,  // run_fp calls local_search.run_fj_line_segment
+  FJ_ON_ZERO,       // run_fp calls local_search.run_fj_on_zero
+  FJ_ANNEALING,     // run_fp calls local_search.run_fj_annealing
+};
+
+// Runs one local-search mode on the given instance and returns a hash of the final assignment
+static uint32_t run_fp(std::string test_instance, local_search_mode_t mode, double work_limit = 4.0)
+{
+  const raft::handle_t handle_{};
+  std::cout << "Running: " << test_instance << std::endl;
+
+  auto path = cuopt::test::get_rapids_dataset_root_dir() + ("/mip/" + test_instance);
+  cuopt::mps_parser::mps_data_model_t<int, double> mps_problem =
+    cuopt::mps_parser::parse_mps<int, double>(path, false);
+  handle_.sync_stream();
+  auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem);
+  problem_checking_t<int, double>::check_problem_representation(op_problem);
+
+  init_handler(op_problem.get_handle_ptr());
+  // run the problem constructor of MIP, so that we do bounds standardization
+  auto settings             = mip_solver_settings_t<int, double>{};
+  settings.time_limit       = 120.;
+  settings.determinism_mode = CUOPT_MODE_DETERMINISTIC;
+
+  detail::problem_t<int, double> problem(op_problem, settings.get_tolerances(), true);
+  problem.preprocess_problem();
+
+  setup_device_symbols(op_problem.get_handle_ptr()->get_stream());
+  auto timer =
+    cuopt::termination_checker_t(settings.time_limit, cuopt::termination_checker_t::root_tag_t{});
+  detail::mip_solver_t<int, double> solver(problem, settings, timer);
+  problem.tolerances = settings.get_tolerances();
+  // solve the LP relaxation into a zero-filled buffer; the clamped result goes to local_search_t
+  rmm::device_uvector<double> lp_optimal_solution(problem.n_variables,
+                                                  problem.handle_ptr->get_stream());
+  thrust::fill(problem.handle_ptr->get_thrust_policy(),
+               lp_optimal_solution.begin(),
+               lp_optimal_solution.end(),
+               0.0);
+  detail::lp_state_t<int, double>& lp_state = problem.lp_state;
+  // resize because some constructor might be called before the presolve
+  lp_state.resize(problem, problem.handle_ptr->get_stream());
+  detail::relaxed_lp_settings_t lp_settings{};
+  lp_settings.time_limit            = std::numeric_limits<double>::max();
+  lp_settings.tolerance             = 1e-6;
+  lp_settings.return_first_feasible = false;
+  lp_settings.save_state            = false;
+  // lp_settings.iteration_limit       = 5;
+  auto lp_result =
+    detail::get_relaxed_lp_solution(problem, lp_optimal_solution, lp_state, lp_settings);
+  EXPECT_EQ(lp_result.get_termination_status(), pdlp_termination_status_t::Optimal);
+  clamp_within_var_bounds(lp_optimal_solution, &problem, problem.handle_ptr);
+
+  // return detail::compute_hash(lp_optimal_solution);
+
+  detail::local_search_t<int, double> local_search(solver.context, lp_optimal_solution);
+  // the search starts from an assignment produced by assign_random_within_bounds()
+  detail::solution_t<int, double> solution(problem);
+  solution.assign_random_within_bounds();
+  solution.compute_feasibility();
+
+  printf("Model fingerprint: 0x%x\n", problem.get_fingerprint());
+  printf("LP optimal hash: 0x%x\n",
+         detail::compute_hash(make_span(lp_optimal_solution), problem.handle_ptr->get_stream()));
+  printf("running mode: %d\n", mode);
+  // every mode runs under a work-limit timer whose context is flagged deterministic
+  work_limit_context_t work_limit_context("LocalSearch");
+  work_limit_context.deterministic = true;
+  local_search.fp.timer            = work_limit_timer_t(work_limit_context, work_limit, timer);
+
+  detail::ls_config_t<int, double> ls_config{};
+
+  if (mode == local_search_mode_t::FP) {
+    bool is_feasible = false;
+    int iterations   = 0;
+    while (!local_search.fp.timer.check_time_limit()) {
+      is_feasible = local_search.fp.run_single_fp_descent(solution);
+      printf("fp_loop it %d, is_feasible %d\n", iterations, is_feasible);
+      if (is_feasible) {
+        break;
+      } else {
+        is_feasible = local_search.fp.restart_fp(solution);
+        if (is_feasible) { break; }
+      }
+      iterations++;
+    }
+  } else if (mode == local_search_mode_t::FJ_LINE_SEGMENT) {
+    work_limit_timer_t wlt(work_limit_context, work_limit, timer);
+    local_search.run_fj_line_segment(solution, wlt, ls_config);
+  } else if (mode == local_search_mode_t::FJ_ON_ZERO) {
+    work_limit_timer_t wlt(work_limit_context, work_limit, timer);
+    local_search.run_fj_on_zero(solution, wlt);
+  } else if (mode == local_search_mode_t::FJ_ANNEALING) {
+    work_limit_timer_t wlt(work_limit_context, work_limit, timer);
+    local_search.run_fj_annealing(solution, wlt, ls_config);
+  }
+  // STAGED_FP has no branch above: for that mode no search runs before hashing
+  std::vector<uint32_t> hashes;
+  hashes.push_back(detail::compute_hash(solution.get_host_assignment()));
+  printf("hashes: 0x%x, hash of the hash: 0x%x\n", hashes[0], detail::compute_hash(hashes));
+
+  return detail::compute_hash(hashes);
+}
+
+static uint32_t run_fp_check_determinism(std::string test_instance,
+                                         local_search_mode_t mode,
+                                         unsigned long seed,
+                                         double work_limit = 4.0)
+{
+  cuopt::seed_generator::set_seed(seed);
+  // seed first, then run: the returned hash is what the test compares across repetitions
+  return run_fp(test_instance, mode, work_limit);
+}
+
+class LocalSearchTestParams : public testing::TestWithParam<std::tuple<local_search_mode_t>> {};
+// For each instance: 5 runs of one mode with one seed; every run must match the first hash.
+TEST_P(LocalSearchTestParams, local_search_operator_determinism)
+{
+  cuopt::init_logger_t log("", true);
+  cuopt::default_logger().set_pattern("[%n] [%-6l] %v");
+  cuopt::default_logger().set_level(rapids_logger::level_enum::debug);
+  cuopt::default_logger().flush_on(rapids_logger::level_enum::debug);
+
+  spin_stream_raii_t spin_stream_1;
+  spin_stream_raii_t spin_stream_2;
+
+  auto mode = std::get<0>(GetParam());
+
+  struct instance_config_t {
+    const char* name;
+    double work_limit;
+  };
+  for (const auto& cfg : {
+         instance_config_t{"gen-ip054.mps", 4.0},
+         instance_config_t{"50v-10.mps", 2.0},
+         // instance_config_t{"n2seq36q.mps", 4.0},
+         instance_config_t{"neos5.mps", 2.0},
+         // instance_config_t{"neos8.mps", 2.0},
+       }) {
+    // stoul, not stoi: a random_device seed above INT_MAX must stay replayable via CUOPT_SEED
+    const char* seed_env = std::getenv("CUOPT_SEED");
+    unsigned long seed   = seed_env ? std::stoul(seed_env) : (unsigned long)std::random_device{}();
+    std::cerr << "Tested with seed " << seed << " instance " << cfg.name << " work_limit "
+              << cfg.work_limit << "\n";
+    uint32_t gold_hash = 0;
+    for (int i = 0; i < 5; ++i) {
+      uint32_t hash = run_fp_check_determinism(cfg.name, mode, seed, cfg.work_limit);
+      if (i == 0) {
+        gold_hash = hash;
+        printf("Gold hash: 0x%x\n", gold_hash);
+      } else {
+        ASSERT_EQ(hash, gold_hash);
+        printf("Hash: 0x%x\n", hash);
+      }
+    }
+  }
+}
+// Instantiated modes: FP, FJ_LINE_SEGMENT and FJ_ANNEALING; FJ_ON_ZERO is commented out below.
+INSTANTIATE_TEST_SUITE_P(LocalSearchTests,
+                         LocalSearchTestParams,
+                         testing::Values(std::make_tuple(local_search_mode_t::FP),
+                                         std::make_tuple(local_search_mode_t::FJ_LINE_SEGMENT),
+                                         // std::make_tuple(local_search_mode_t::FJ_ON_ZERO),
+                                         std::make_tuple(local_search_mode_t::FJ_ANNEALING)));
+
+}  // namespace cuopt::linear_programming::test
diff --git a/cpp/tests/mip/mip_utils.cuh b/cpp/tests/mip/mip_utils.cuh
index 5c2b39d290..4595939e1f 100644
--- a/cpp/tests/mip/mip_utils.cuh
+++ b/cpp/tests/mip/mip_utils.cuh
@@ -8,9 +8,14 @@
 #include <algorithm>
 #include <cuopt/linear_programming/mip/solver_settings.hpp>
 #include <cuopt/linear_programming/solve.hpp>
+#include <mip_heuristics/feasibility_jump/feasibility_jump.cuh>
 #include <mip_heuristics/problem/problem.cuh>
+#include <mip_heuristics/solution/solution.cuh>
+#include <mip_heuristics/solver.cuh>
 #include <mps_parser/parser.hpp>
+#include <pdlp/initial_scaling_strategy/initial_scaling.cuh>
 #include <utilities/copy_helpers.hpp>
+#include <utilities/timer.hpp>
 
 namespace cuopt::linear_programming::test {
 
@@ -180,4 +185,54 @@ static std::tuple<mip_termination_status_t, double, double> test_mps_file(
                          solution.get_solution_bound());
 }
 
+struct fj_tweaks_t {
+  double objective_weight = 0;  // run_fj writes this into fj.objective_weight before solving
+};
+
+struct fj_state_t {
+  detail::solution_t<int, double> solution;  // solution object after fj.solve
+  std::vector<double> solution_vector;       // host copy of solution.assignment
+  int minimums;                              // climbers[0]->local_minimums_reached
+  double incumbent_objective;                // climbers[0]->incumbent_objective
+  double incumbent_violation;                // climbers[0]->violation_score
+};
+
+static fj_state_t run_fj(detail::problem_t<int, double>& problem,
+                         const detail::fj_settings_t& fj_settings,
+                         fj_tweaks_t tweaks                   = {},
+                         std::vector<double> initial_solution = {},
+                         int determinism_mode                 = CUOPT_MODE_OPPORTUNISTIC)
+{
+  auto settings             = mip_solver_settings_t<int, double>{};
+  settings.time_limit       = 30.;
+  settings.determinism_mode = determinism_mode;
+  auto timer = cuopt::termination_checker_t(30.0, cuopt::termination_checker_t::root_tag_t{});
+  detail::mip_solver_t<int, double> solver(problem, settings, timer);
+  // start from the caller's assignment when one is given, otherwise from all zeros
+  detail::solution_t<int, double> solution(*solver.context.problem_ptr);
+  if (initial_solution.size() > 0) {
+    expand_device_copy(solution.assignment, initial_solution, solution.handle_ptr->get_stream());
+  } else {
+    thrust::fill(solution.handle_ptr->get_thrust_policy(),
+                 solution.assignment.begin(),
+                 solution.assignment.end(),
+                 0.0);
+  }
+  solution.clamp_within_bounds();
+  // reset the FJ weights to 1, then overwrite the objective weight with the caller's tweak
+  detail::fj_t<int, double> fj(solver.context, fj_settings);
+  fj.reset_weights(solution.handle_ptr->get_stream(), 1.);
+  fj.objective_weight.set_value_async(tweaks.objective_weight, solution.handle_ptr->get_stream());
+  solution.handle_ptr->sync_stream();
+  // the stream sync above comes after the async weight write and before the solve
+  fj.solve(solution);
+  auto solution_vector = host_copy(solution.assignment, solution.handle_ptr->get_stream());
+  // the three scalar statistics are read from the first climber only
+  return {solution,
+          solution_vector,
+          fj.climbers[0]->local_minimums_reached.value(solution.handle_ptr->get_stream()),
+          fj.climbers[0]->incumbent_objective.value(solution.handle_ptr->get_stream()),
+          fj.climbers[0]->violation_score.value(solution.handle_ptr->get_stream())};
+}
+
 }  // namespace cuopt::linear_programming::test
diff --git a/cpp/tests/mip/multi_probe_test.cu b/cpp/tests/mip/multi_probe_test.cu
index 003220de9b..ee0753cb32 100644
--- a/cpp/tests/mip/multi_probe_test.cu
+++ b/cpp/tests/mip/multi_probe_test.cu
@@ -6,6 +6,7 @@
 /* clang-format on */
 
 #include "../linear_programming/utilities/pdlp_test_utilities.cuh"
+#include "determinism_utils.cuh"
 #include "mip_utils.cuh"
 
 #include <raft/sparse/detail/cusparse_wrappers.h>
@@ -43,9 +44,10 @@ void init_handler(const raft::handle_t* handle_ptr)
 }
 
 std::tuple<std::vector<int>, std::vector<double>, std::vector<double>> select_k_random(
-  detail::problem_t<int, double>& problem, int sample_size)
+  detail::problem_t<int, double>& problem,
+  int sample_size,
+  unsigned long seed = std::random_device{}())
 {
-  auto seed = std::random_device{}();
   std::cerr << "Tested with seed " << seed << "\n";
   problem.compute_n_integer_vars();
   auto [v_lb, v_ub] = extract_host_bounds<double>(problem.variable_bounds, problem.handle_ptr);
@@ -138,10 +140,8 @@ multi_probe_results(
     std::move(h_lb_0), std::move(h_ub_0), std::move(h_lb_1), std::move(h_ub_1));
 }
 
-void test_multi_probe(std::string path)
+uint32_t test_multi_probe(std::string path, unsigned long seed = std::random_device{}())
 {
-  auto memory_resource = make_async();
-  rmm::mr::set_current_device_resource(memory_resource.get());
   const raft::handle_t handle_{};
   cuopt::mps_parser::mps_data_model_t<int, double> mps_problem =
     cuopt::mps_parser::parse_mps<int, double>(path, false);
@@ -150,12 +150,13 @@ void test_multi_probe(std::string path)
   problem_checking_t<int, double>::check_problem_representation(op_problem);
   detail::problem_t<int, double> problem(op_problem);
   mip_solver_settings_t<int, double> default_settings{};
-  detail::mip_solver_t<int, double> solver(problem, default_settings, cuopt::timer_t(0));
+  auto timer = cuopt::termination_checker_t(0.0, cuopt::termination_checker_t::root_tag_t{});
+  detail::mip_solver_t<int, double> solver(problem, default_settings, timer);
   detail::bound_presolve_t<int, double> bnd_prb_0(solver.context);
   detail::bound_presolve_t<int, double> bnd_prb_1(solver.context);
   detail::multi_probe_t<int, double> multi_probe_prs(solver.context);
 
-  auto probe_tuple       = select_k_random(problem, 100);
+  auto probe_tuple       = select_k_random(problem, 100, seed);
   auto bounds_probe_vals = convert_probe_tuple(probe_tuple);
 
   auto [bnd_lb_0, bnd_ub_0, bnd_lb_1, bnd_ub_1] =
@@ -174,6 +175,16 @@ void test_multi_probe(std::string path)
   auto mlp_min_act_1 = host_copy(multi_probe_prs.upd_1.min_activity, stream);
   auto mlp_max_act_1 = host_copy(multi_probe_prs.upd_1.max_activity, stream);
 
+  std::vector<uint32_t> hashes;
+  hashes.push_back(detail::compute_hash(bnd_min_act_0));
+  hashes.push_back(detail::compute_hash(bnd_min_act_1));
+  hashes.push_back(detail::compute_hash(bnd_max_act_0));
+  hashes.push_back(detail::compute_hash(bnd_max_act_1));
+  hashes.push_back(detail::compute_hash(bnd_lb_0));
+  hashes.push_back(detail::compute_hash(bnd_ub_0));
+  hashes.push_back(detail::compute_hash(bnd_lb_1));
+  hashes.push_back(detail::compute_hash(bnd_ub_1));
+
   for (int i = 0; i < (int)bnd_min_act_0.size(); ++i) {
     EXPECT_DOUBLE_EQ(bnd_min_act_0[i], mlp_min_act_0[i]);
     EXPECT_DOUBLE_EQ(bnd_max_act_0[i], mlp_max_act_0[i]);
@@ -187,6 +198,9 @@ void test_multi_probe(std::string path)
     EXPECT_DOUBLE_EQ(bnd_lb_1[i], m_lb_1[i]);
     EXPECT_DOUBLE_EQ(bnd_ub_1[i], m_ub_1[i]);
   }
+
+  // return a composite hash of all the hashes to check for determinism
+  return detail::compute_hash(hashes);
 }
 
 TEST(presolve, multi_probe)
@@ -200,4 +214,29 @@ TEST(presolve, multi_probe)
   }
 }
 
+TEST(presolve, multi_probe_deterministic)
+{
+  spin_stream_raii_t spin_stream_1;
+  // Each instance is probed 10 times with one seed; all runs must yield the first run's hash.
+  const std::vector<std::string> test_instances = {
+    "mip/50v-10-free-bound.mps",
+    "mip/neos5-free-bound.mps",
+    "mip/neos5.mps",
+    "mip/50v-10.mps",
+  };
+  for (const auto& test_instance : test_instances) {
+    std::cout << "Running: " << test_instance << std::endl;
+    const unsigned long seed = std::random_device{}();
+    const auto path          = make_path_absolute(test_instance);
+    uint32_t gold_hash       = 0;
+    for (int run = 0; run < 10; ++run) {
+      const auto hash = test_multi_probe(path, seed);
+      if (run > 0) {
+        EXPECT_EQ(hash, gold_hash);
+      } else {
+        gold_hash = hash;
+      }
+    }
+  }
+}
 }  // namespace cuopt::linear_programming::test
diff --git a/cpp/tests/mip/presolve_test.cu b/cpp/tests/mip/presolve_test.cu
index cf2532d0f2..a11d1c7288 100644
--- a/cpp/tests/mip/presolve_test.cu
+++ b/cpp/tests/mip/presolve_test.cu
@@ -6,12 +6,22 @@
 /* clang-format on */
 
 #include "../linear_programming/utilities/pdlp_test_utilities.cuh"
+#include "determinism_utils.cuh"
+#include "mip_utils.cuh"
 
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh>
 #include <cuopt/linear_programming/solve.hpp>
+#include <mip_heuristics/presolve/bounds_presolve.cuh>
+#include <mip_heuristics/presolve/multi_probe.cuh>
 #include <mip_heuristics/presolve/third_party_presolve.hpp>
+#include <mip_heuristics/presolve/trivial_presolve.cuh>
 #include <mip_heuristics/problem/problem.cuh>
+#include <mip_heuristics/utils.cuh>
 #include <mps_parser/mps_data_model.hpp>
 #include <mps_parser/parser.hpp>
+#include <pdlp/initial_scaling_strategy/initial_scaling.cuh>
+#include <pdlp/utilities/problem_checking.cuh>
 #include <pdlp/utils.cuh>
 #include <utilities/common_utils.hpp>
 #include <utilities/copy_helpers.hpp>
@@ -29,6 +39,171 @@
 
 namespace cuopt::linear_programming::test {
 
+void init_handler(const raft::handle_t* handle_ptr)
+{
+  // Put cuBLAS and cuSPARSE into device pointer mode up front, keeping it out of solve time
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode(
+    handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
+  RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode(
+    handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
+}
+
+std::tuple<std::vector<int>, std::vector<double>, std::vector<double>> select_k_random(
+  detail::problem_t<int, double>& problem,
+  int sample_size,
+  unsigned long seed = std::random_device{}())
+{
+  std::cerr << "Tested with seed " << seed << "\n";
+  problem.compute_n_integer_vars();
+  auto [v_lb, v_ub] = extract_host_bounds<double>(problem.variable_bounds, problem.handle_ptr);
+  auto int_var_id   = host_copy(problem.integer_indices, problem.handle_ptr->get_stream());
+  int_var_id.erase(
+    std::remove_if(int_var_id.begin(),
+                   int_var_id.end(),
+                   [v_lb_sp = v_lb, v_ub_sp = v_ub](auto id) {
+                     return !(std::isfinite(v_lb_sp[id]) && std::isfinite(v_ub_sp[id]));
+                   }),
+    int_var_id.end());  // keeps only the integer variables whose two bounds are both finite
+  sample_size = std::min(sample_size, static_cast<int>(int_var_id.size()));
+  std::vector<int> random_int_vars;
+  std::mt19937 m{seed};  // seeded generator: the same seed reproduces the same sample
+  std::sample(
+    int_var_id.begin(), int_var_id.end(), std::back_inserter(random_int_vars), sample_size, m);
+  std::vector<double> probe_0(sample_size);
+  std::vector<double> probe_1(sample_size);
+  for (int i = 0; i < static_cast<int>(random_int_vars.size()); ++i) {
+    if (i % 2) {  // odd positions: probe_0 takes the lower bound, probe_1 the upper bound
+      probe_0[i] = v_lb[random_int_vars[i]];
+      probe_1[i] = v_ub[random_int_vars[i]];
+    } else {  // even positions: the two bounds are assigned the other way round
+      probe_1[i] = v_lb[random_int_vars[i]];
+      probe_0[i] = v_ub[random_int_vars[i]];
+    }
+  }
+  return std::make_tuple(std::move(random_int_vars), std::move(probe_0), std::move(probe_1));
+}
+
+std::pair<std::vector<thrust::pair<int, double>>, std::vector<thrust::pair<int, double>>>
+convert_probe_tuple(std::tuple<std::vector<int>, std::vector<double>, std::vector<double>>& probe)
+{
+  const auto& [var_ids, first_vals, second_vals] = probe;  // ids, 1st values, 2nd values
+  std::vector<thrust::pair<int, double>> probe_first, probe_second;
+  for (size_t i = 0; i < var_ids.size(); ++i) {
+    probe_first.emplace_back(var_ids[i], first_vals[i]);
+    probe_second.emplace_back(var_ids[i], second_vals[i]);
+  }
+  return std::make_pair(std::move(probe_first), std::move(probe_second));
+}
+
+uint32_t test_probing_cache_determinism(std::string path,
+                                        unsigned long seed = std::random_device{}())
+{
+  const raft::handle_t handle_{};
+  cuopt::mps_parser::mps_data_model_t<int, double> mps_problem =
+    cuopt::mps_parser::parse_mps<int, double>(path, false);
+  handle_.sync_stream();
+  auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem);
+  problem_checking_t<int, double>::check_problem_representation(op_problem);
+  detail::problem_t<int, double> problem(op_problem);
+  mip_solver_settings_t<int, double> default_settings{};
+  default_settings.mip_scaling = false;  // we're not checking scaling determinism here
+  auto timer = cuopt::termination_checker_t(0.0, cuopt::termination_checker_t::root_tag_t{});
+  detail::mip_solver_t<int, double> solver(problem, default_settings, timer);
+  detail::bound_presolve_t<int, double> bnd_prb(solver.context);
+  // NOTE(review): `seed` is never read in this function.
+  work_limit_context_t work_limit_context("ProbingCache");
+  // rely on the iteration limit
+  work_limit_timer_t probing_timer(work_limit_context, std::numeric_limits<double>::max(), timer);
+  compute_probing_cache(bnd_prb, problem, probing_timer);
+  std::vector<std::pair<int, std::array<detail::cache_entry_t<int, double>, 2>>> cached_values(
+    bnd_prb.probing_cache.probing_cache.begin(), bnd_prb.probing_cache.probing_cache.end());
+  std::sort(cached_values.begin(), cached_values.end(), [](const auto& a, const auto& b) {
+    return a.first < b.first;
+  });
+
+  std::vector<int> probed_indices;
+  std::vector<double> intervals;
+  std::vector<int> interval_types;
+  std::vector<int> var_to_cached_bound_keys;
+  std::vector<double> var_to_cached_bound_lb;
+  std::vector<double> var_to_cached_bound_ub;
+  for (const auto& a : cached_values) {
+    probed_indices.push_back(a.first);
+    // a.second holds two cache entries; both feed, in order, every vector that is hashed
+    for (const auto& entry : a.second) {
+      intervals.push_back(entry.val_interval.val);
+      interval_types.push_back(entry.val_interval.interval_type);
+      // copy into std::map so the cached bounds are visited in ascending variable id
+      auto sorted_map = std::map<int, detail::cached_bound_t<double>>(
+        entry.var_to_cached_bound_map.begin(), entry.var_to_cached_bound_map.end());
+      for (const auto& [var_id, cached_bound] : sorted_map) {
+        var_to_cached_bound_keys.push_back(var_id);
+        var_to_cached_bound_lb.push_back(cached_bound.lb);
+        var_to_cached_bound_ub.push_back(cached_bound.ub);
+      }
+    }
+  }
+
+  std::vector<uint32_t> hashes;
+  hashes.push_back(detail::compute_hash(probed_indices));
+  hashes.push_back(detail::compute_hash(intervals));
+  hashes.push_back(detail::compute_hash(interval_types));
+  hashes.push_back(detail::compute_hash(var_to_cached_bound_keys));
+  hashes.push_back(detail::compute_hash(var_to_cached_bound_lb));
+  hashes.push_back(detail::compute_hash(var_to_cached_bound_ub));
+
+  // return a composite hash of all the hashes to check for determinism
+  return detail::compute_hash(hashes);
+}
+
+uint32_t test_scaling_determinism(std::string path, unsigned long seed = std::random_device{}())
+{
+  const raft::handle_t handle_{};
+  cuopt::mps_parser::mps_data_model_t<int, double> mps_problem =
+    cuopt::mps_parser::parse_mps<int, double>(path, false);
+  handle_.sync_stream();
+  auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem);
+  problem_checking_t<int, double>::check_problem_representation(op_problem);
+  detail::problem_t<int, double> problem(op_problem);
+
+  pdlp_hyper_params::pdlp_hyper_params_t hyper_params{};
+  hyper_params.update_primal_weight_on_initial_solution = false;
+  hyper_params.update_step_size_on_initial_solution     = true;
+  // problem contains unpreprocessed data
+  detail::problem_t<int, double> scaled_problem(problem);
+
+  detail::pdlp_initial_scaling_strategy_t<int, double> scaling(
+    scaled_problem.handle_ptr,
+    scaled_problem,
+    hyper_params.default_l_inf_ruiz_iterations,
+    (double)hyper_params.default_alpha_pock_chambolle_rescaling,
+    scaled_problem.reverse_coefficients,
+    scaled_problem.reverse_offsets,
+    scaled_problem.reverse_constraints,
+    nullptr,
+    hyper_params,
+    true);
+
+  scaling.scale_problem();
+
+  // generate a random initial solution in order to ensure scaling of solution vectors is
+  // deterministic as well as the initial step size
+  std::vector<double> initial_solution(scaled_problem.n_variables);
+  std::mt19937 m{seed};
+  std::generate(initial_solution.begin(), initial_solution.end(), [&m]() { return m(); });
+  auto d_initial_solution = device_copy(initial_solution, handle_.get_stream());
+  scaling.scale_primal(d_initial_solution);
+  // preprocessing and trivial presolve run on the scaled copy before it is fingerprinted
+  scaled_problem.preprocess_problem();
+
+  detail::trivial_presolve(scaled_problem);
+  // combine the scaled initial solution hash with the fingerprint of the processed problem
+  std::vector<uint32_t> hashes;
+  hashes.push_back(detail::compute_hash(d_initial_solution, handle_.get_stream()));
+  hashes.push_back(scaled_problem.get_fingerprint());
+  return detail::compute_hash(hashes);
+}
+
 TEST(problem, find_implied_integers)
 {
   const raft::handle_t handle_{};
@@ -63,4 +238,63 @@ TEST(problem, find_implied_integers)
             ((int)detail::problem_t<int, double>::var_flags_t::VAR_IMPLIED_INTEGER));
 }
 
+TEST(presolve, probing_cache_deterministic)
+{  // For every instance: 10 runs with one shared seed must all return the first run's hash.
+  spin_stream_raii_t spin_stream_1;  // NOTE(review): type defined outside this hunk; confirm role
+
+  std::vector<std::string> test_instances = {"mip/50v-10-free-bound.mps",
+                                             "mip/neos5-free-bound.mps",
+                                             "mip/neos5.mps",
+                                             "mip/50v-10.mps",
+                                             "mip/gen-ip054.mps",
+                                             "mip/rmatr200-p5.mps"};
+  for (const auto& test_instance : test_instances) {
+    std::cout << "Running: " << test_instance << std::endl;
+    unsigned long seed = std::random_device{}();  // drawn once per instance, reused by all runs
+    std::cerr << "Tested with seed " << seed << "\n";
+    auto path          = make_path_absolute(test_instance);
+    uint32_t gold_hash = 0;  // reference hash, assigned by the first run (i == 0)
+    for (int i = 0; i < 10; ++i) {
+      auto hash = test_probing_cache_determinism(path, seed);
+      if (i == 0) {
+        gold_hash = hash;
+        std::cout << "Gold hash: " << gold_hash << std::endl;
+      } else {
+        EXPECT_EQ(hash, gold_hash);  // non-fatal check: later runs still execute on a mismatch
+      }
+    }
+  }
+}
+
+TEST(presolve, mip_scaling_deterministic)
+{  // For every instance: 10 runs with one shared seed must all return the first run's hash.
+  spin_stream_raii_t spin_stream_1;  // NOTE(review): type defined outside this hunk; confirm role
+  spin_stream_raii_t spin_stream_2;
+
+  std::vector<std::string> test_instances = {"mip/sct2.mps",
+                                             "mip/thor50dday.mps",
+                                             "mip/uccase9.mps",
+                                             "mip/neos5-free-bound.mps",
+                                             "mip/neos5.mps",
+                                             "mip/50v-10.mps",
+                                             "mip/gen-ip054.mps",
+                                             "mip/rmatr200-p5.mps"};
+  for (const auto& test_instance : test_instances) {
+    std::cout << "Running: " << test_instance << std::endl;
+    unsigned long seed = std::random_device{}();  // drawn once per instance, reused by all runs
+    std::cerr << "Tested with seed " << seed << "\n";
+    auto path          = make_path_absolute(test_instance);
+    uint32_t gold_hash = 0;  // reference hash, assigned by the first run (i == 0)
+    for (int i = 0; i < 10; ++i) {
+      auto hash = test_scaling_determinism(path, seed);
+      if (i == 0) {
+        gold_hash = hash;
+        std::cout << "Gold hash: " << gold_hash << std::endl;
+      } else {
+        EXPECT_EQ(hash, gold_hash);  // non-fatal check: later runs still execute on a mismatch
+      }
+    }
+  }
+}
+
 }  // namespace cuopt::linear_programming::test
diff --git a/datasets/mip/download_miplib_test_dataset.sh b/datasets/mip/download_miplib_test_dataset.sh
index d9cefbc32d..28a2b5b6fc 100755
--- a/datasets/mip/download_miplib_test_dataset.sh
+++ b/datasets/mip/download_miplib_test_dataset.sh
@@ -25,6 +25,7 @@ INSTANCES=(
     "enlight_hard"
     "enlight11"
     "supportcase22"
+    "supportcase42"
     "pk1"
 )
 
diff --git a/dependencies.yaml b/dependencies.yaml
index ecd9deb6b4..057fc2a318 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -317,7 +317,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
-          - libcuopt-tests==26.4.*,>=0.0.0a0
+          - libcuopt-tests==26.6.*,>=0.0.0a0
   build_wheels:
     common:
       - output_types: [requirements, pyproject]
@@ -419,7 +419,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &libcuopt_unsuffixed libcuopt==26.4.*,>=0.0.0a0
+          - &libcuopt_unsuffixed libcuopt==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -432,18 +432,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - libcuopt-cu12==26.4.*,>=0.0.0a0
+              - libcuopt-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - libcuopt-cu13==26.4.*,>=0.0.0a0
+              - libcuopt-cu13==26.6.*,>=0.0.0a0
           - {matrix: null, packages: [*libcuopt_unsuffixed]}
   depends_on_cuopt:
     common:
       - output_types: conda
         packages:
-          - &cuopt_unsuffixed cuopt==26.4.*,>=0.0.0a0
+          - &cuopt_unsuffixed cuopt==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -456,18 +456,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-cu12==26.4.*,>=0.0.0a0
+              - cuopt-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-cu13==26.4.*,>=0.0.0a0
+              - cuopt-cu13==26.6.*,>=0.0.0a0
           - {matrix: null, packages: [*cuopt_unsuffixed]}
   depends_on_cuopt_server:
     common:
       - output_types: conda
         packages:
-          - &cuopt_server_unsuffixed cuopt-server==26.4.*,>=0.0.0a0
+          - &cuopt_server_unsuffixed cuopt-server==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -480,18 +480,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-server-cu12==26.4.*,>=0.0.0a0
+              - cuopt-server-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - cuopt-server-cu13==26.4.*,>=0.0.0a0
+              - cuopt-server-cu13==26.6.*,>=0.0.0a0
           - {matrix: null, packages: [*cuopt_server_unsuffixed]}
   depends_on_cuopt_sh_client:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - &cuopt_sh_client_unsuffixed cuopt-sh-client==26.4.*,>=0.0.0a0
+          - &cuopt_sh_client_unsuffixed cuopt-sh-client==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -501,7 +501,7 @@ dependencies:
     common:
       - output_types: [requirements, pyproject, conda]
         packages:
-          - cuopt-mps-parser==26.4.*,>=0.0.0a0
+          - cuopt-mps-parser==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -511,12 +511,12 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - libraft-headers==26.4.*,>=0.0.0a0
+          - libraft-headers==26.6.*,>=0.0.0a0
   depends_on_librmm:
     common:
       - output_types: conda
         packages:
-          - &librmm_unsuffixed librmm==26.4.*,>=0.0.0a0
+          - &librmm_unsuffixed librmm==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -528,12 +528,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - librmm-cu12==26.4.*,>=0.0.0a0
+              - librmm-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - librmm-cu13==26.4.*,>=0.0.0a0
+              - librmm-cu13==26.6.*,>=0.0.0a0
           - {matrix: null, packages: [*librmm_unsuffixed]}
   depends_on_cupy:
     common:
@@ -568,7 +568,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &rmm_unsuffixed rmm==26.4.*,>=0.0.0a0
+          - &rmm_unsuffixed rmm==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -580,12 +580,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu12==26.4.*,>=0.0.0a0
+              - rmm-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu13==26.4.*,>=0.0.0a0
+              - rmm-cu13==26.6.*,>=0.0.0a0
           - matrix:
             packages:
               - *rmm_unsuffixed
@@ -594,7 +594,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &cudf_unsuffixed cudf==26.4.*,>=0.0.0a0
+          - &cudf_unsuffixed cudf==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
@@ -605,12 +605,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - cudf-cu12==26.4.*,>=0.0.0a0
+              - cudf-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - cudf-cu13==26.4.*,>=0.0.0a0
+              - cudf-cu13==26.6.*,>=0.0.0a0
           - matrix:
             packages:
               - *cudf_unsuffixed
@@ -619,7 +619,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &pylibraft_unsuffixed pylibraft==26.4.*,>=0.0.0a0
+          - &pylibraft_unsuffixed pylibraft==26.6.*,>=0.0.0a0
       - output_types: requirements
         packages:
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
@@ -630,12 +630,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - pylibraft-cu12==26.4.*,>=0.0.0a0
+              - pylibraft-cu12==26.6.*,>=0.0.0a0
           - matrix:
               cuda: "13.*"
               cuda_suffixed: "true"
             packages:
-              - pylibraft-cu13==26.4.*,>=0.0.0a0
+              - pylibraft-cu13==26.6.*,>=0.0.0a0
           - matrix:
             packages:
               - *pylibraft_unsuffixed
diff --git a/docs/cuopt/source/versions1.json b/docs/cuopt/source/versions1.json
index 3e986996a4..507dfe57a4 100644
--- a/docs/cuopt/source/versions1.json
+++ b/docs/cuopt/source/versions1.json
@@ -1,10 +1,14 @@
 [
   {
-    "version": "26.04.00",
-    "url": "https://docs.nvidia.com/cuopt/user-guide/26.04.00/",
+    "version": "26.06.00",
+    "url": "https://docs.nvidia.com/cuopt/user-guide/26.06.00/",
     "name": "latest",
     "preferred": true
   },
+  {
+    "version": "26.04.00",
+    "url": "https://docs.nvidia.com/cuopt/user-guide/26.04.00/"
+  },
   {
     "version": "26.02.00",
     "url": "https://docs.nvidia.com/cuopt/user-guide/26.02.00/"
diff --git a/gemini-extension.json b/gemini-extension.json
index b4c6b764a4..c5ef9883f8 100644
--- a/gemini-extension.json
+++ b/gemini-extension.json
@@ -1,6 +1,6 @@
 {
   "name": "nvidia-cuopt-skills",
   "description": "Agent skills for NVIDIA cuOpt optimization engine: routing, LP/MILP/QP, installation, and server.",
-  "version": "26.04.00",
+  "version": "26.06.00",
   "contextFileName": "AGENTS.md"
 }
diff --git a/helmchart/cuopt-server/Chart.yaml b/helmchart/cuopt-server/Chart.yaml
index 074d94bec9..811ac067cb 100644
--- a/helmchart/cuopt-server/Chart.yaml
+++ b/helmchart/cuopt-server/Chart.yaml
@@ -1,5 +1,5 @@
 apiVersion: v2
-appVersion: 26.4.0
+appVersion: 26.6.0
 description: A Helm chart for NVIDIA cuOpt Server with GPU support
 home: https://docs.nvidia.com/cuopt/user-guide/latest/resources.html
 keywords:
@@ -14,4 +14,4 @@ name: cuopt-server
 sources:
 - https://docs.nvidia.com/cuopt/user-guide/latest/resources.html
 type: application
-version: 26.4.0
+version: 26.6.0
diff --git a/helmchart/cuopt-server/values.yaml b/helmchart/cuopt-server/values.yaml
index 5218596552..6adafea79e 100644
--- a/helmchart/cuopt-server/values.yaml
+++ b/helmchart/cuopt-server/values.yaml
@@ -7,7 +7,7 @@ replicaCount: 1
 image:
   repository: nvidia/cuopt
   pullPolicy: IfNotPresent
-  tag: "26.4.0-cuda12.9-py3.12"
+  tag: "26.6.0-cuda12.9-py3.12"
 
 imagePullSecrets: []
 nameOverride: ""
diff --git a/merge_review_findings_release_26_04.md b/merge_review_findings_release_26_04.md
new file mode 100644
index 0000000000..9491c63c79
--- /dev/null
+++ b/merge_review_findings_release_26_04.md
@@ -0,0 +1,50 @@
+# Merge Review Findings vs `release/26.04`
+
+Scope:
+- Current merge state reviewed statically against `release/26.04`
+- Excluding `cpp/src/branch_and_bound/pseudo_costs.cpp`
+- Notes are incremental and may grow as the review continues
+
+## Resolved High Confidence Findings
+
+1. `cpp/src/mip_heuristics/solver_context.cuh`
+   - Restored the `release/26.04` scaling ownership model for MIP.
+   - Removed the extra scaling constructor parameter from `mip_solver_context_t`; callers now match the context definition again.
+
+2. `cpp/src/mip_heuristics/solution_callbacks.cuh`
+   - Removed the incorrect `pdlp_initial_scaling_strategy_t` dependency from MIP callback plumbing.
+   - `solution_publication_t` and `solution_injection_t` no longer try to own or apply scaling; they now operate on the release-side MIP flow and dispatch both `GET_SOLUTION` and `GET_SOLUTION_EXT`.
+
+3. `cpp/src/mip_heuristics/solver.cu`
+   - Removed `context.scaling` uses from incumbent publication and injection paths.
+   - Removed the stale `bb_callback_adapter_t::settings_` reference member, which was left uninitialized by the merge.
+
+4. `cpp/src/mip_heuristics/diversity/population.cu`
+   - Removed `context.scaling` from the callback/publication and injection calls so the file matches the release-side scaling model.
+
+5. `cpp/src/mip_heuristics/solve.cu`
+   - Deleted the stale local `invoke_solution_callbacks(...)` helper instead of extending it.
+   - Rewired the early incumbent publication paths to the determinism-side callback dispatch (`GET_SOLUTION_EXT` compatible, with origin and work timestamp metadata).
+   - Removed the stray `scaling.scale_problem()` / `scale_primal(...)` block from `run_mip()`, which had no scaling object in scope.
+   - Restored the `try` / `catch` structure in `run_mip()` after the merge splice dropped the opening `try`.
+   - Updated the early-heuristic gates to the bitset model by allowing them only when `determinism_mode == CUOPT_DETERMINISM_NONE`.
+
+6. `cpp/src/mip_heuristics/problem/problem.cuh`, `cpp/src/mip_heuristics/problem/problem.cu`, `cpp/src/mip_heuristics/problem/presolve_data.cuh`
+   - Repaired the half-merged `post_process_assignment(...)` overloads.
+   - The handle-override wrappers now forward the override stream correctly, and the stream-based implementation no longer references the nonexistent `handle_override` variable.
+
+7. `cpp/src/mip_heuristics/diversity/diversity_manager.cu`
+   - Restored the missing `tolerance_divisor` local used to derive PDLP relative tolerances in the non-deterministic root LP path.
+
+8. `cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh`, `cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu`
+   - Fixed the early GPU FJ merge splice where `early_gpufj_t` reached into the now-private `fj_t::improvement_callback`.
+   - Added a proper setter and updated the caller to use it.
+
+9. `cpp/src/mip_heuristics/solve.cu`
+   - Removed merge-leftover unused locals (`running_mip`, `hyper_params`) that were tripping `-Werror`.
+
+## Lower Confidence Risks
+
+1. `cpp/src/mip_heuristics/diversity/population.cu`
+   - In deterministic B&B mode, `run_solution_callbacks()` updates `best_feasible_objective` immediately after queueing a heuristic solution to B&B, before B&B validates or repairs it.
+   - If the queued solution is later rejected after crushing/validation, later heuristic candidates can be suppressed against an incumbent objective that never actually became valid.
diff --git a/python/cuopt/pyproject.toml b/python/cuopt/pyproject.toml
index e86b5bdd73..eff7e01769 100644
--- a/python/cuopt/pyproject.toml
+++ b/python/cuopt/pyproject.toml
@@ -20,18 +20,18 @@ license = "Apache-2.0"
 requires-python = ">=3.11"
 dependencies = [
     "cuda-python>=13.0.1,<14.0",
-    "cudf==26.4.*,>=0.0.0a0",
-    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
+    "cudf==26.6.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
-    "libcuopt==26.4.*,>=0.0.0a0",
+    "libcuopt==26.6.*,>=0.0.0a0",
     "numba-cuda>=0.22.1",
     "numba>=0.60.0,<0.65.0",
     "numpy>=1.23.5,<3.0",
     "pandas>=2.0",
-    "pylibraft==26.4.*,>=0.0.0a0",
+    "pylibraft==26.6.*,>=0.0.0a0",
     "pyyaml>=6.0.0",
     "rapids-logger==0.2.*,>=0.0.0a0",
-    "rmm==26.4.*,>=0.0.0a0",
+    "rmm==26.6.*,>=0.0.0a0",
     "scipy>=1.14.1",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -101,12 +101,12 @@ dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 requires = [
     "cmake>=3.30.4",
-    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
     "cython>=3.0.3",
-    "libcuopt==26.4.*,>=0.0.0a0",
+    "libcuopt==26.6.*,>=0.0.0a0",
     "ninja",
-    "pylibraft==26.4.*,>=0.0.0a0",
+    "pylibraft==26.6.*,>=0.0.0a0",
     "rapids-logger==0.2.*,>=0.0.0a0",
-    "rmm==26.4.*,>=0.0.0a0",
+    "rmm==26.6.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cuopt_self_hosted/pyproject.toml b/python/cuopt_self_hosted/pyproject.toml
index 7645c99ed0..43aa80a5b3 100644
--- a/python/cuopt_self_hosted/pyproject.toml
+++ b/python/cuopt_self_hosted/pyproject.toml
@@ -20,7 +20,7 @@ license = "Apache-2.0"
 license-files = ["LICENSE"]
 requires-python = ">=3.11"
 dependencies = [
-    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
     "msgpack-numpy==0.4.8",
     "msgpack==1.1.2",
     "requests",
diff --git a/python/cuopt_server/pyproject.toml b/python/cuopt_server/pyproject.toml
index d24cfcbd77..ce96c884be 100644
--- a/python/cuopt_server/pyproject.toml
+++ b/python/cuopt_server/pyproject.toml
@@ -21,7 +21,7 @@ license = "Apache-2.0"
 license-files = ["LICENSE"]
 requires-python = ">=3.11"
 dependencies = [
-    "cuopt==26.4.*,>=0.0.0a0",
+    "cuopt==26.6.*,>=0.0.0a0",
     "cupy-cuda13x>=13.6.0",
     "fastapi",
     "jsonref==1.1.0",
diff --git a/python/libcuopt/pyproject.toml b/python/libcuopt/pyproject.toml
index de9680aefe..6ba41c60dd 100644
--- a/python/libcuopt/pyproject.toml
+++ b/python/libcuopt/pyproject.toml
@@ -31,8 +31,8 @@ classifiers = [
 ]
 dependencies = [
     "cuda-toolkit[cublas,cudart,curand,cusolver,cusparse,nvtx]==13.*",
-    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
-    "librmm==26.4.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
+    "librmm==26.6.*,>=0.0.0a0",
     "nvidia-cudss-cu13",
     "nvidia-nvjitlink>=13.0,<14",
     "rapids-logger==0.2.*,>=0.0.0a0",
@@ -76,8 +76,8 @@ dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 requires = [
     "cmake>=3.30.4",
-    "cuopt-mps-parser==26.4.*,>=0.0.0a0",
-    "librmm==26.4.*,>=0.0.0a0",
+    "cuopt-mps-parser==26.6.*,>=0.0.0a0",
+    "librmm==26.6.*,>=0.0.0a0",
     "ninja",
     "rapids-logger==0.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/skills/cuopt-developer/SKILL.md b/skills/cuopt-developer/SKILL.md
index 12419153ac..99743f9171 100644
--- a/skills/cuopt-developer/SKILL.md
+++ b/skills/cuopt-developer/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-developer
-version: "26.04.00"
+version: "26.06.00"
 description: Contribute to NVIDIA cuOpt codebase including C++/CUDA, Python, server, docs, and CI. Use when the user wants to modify solver internals, add features, submit PRs, or understand the codebase architecture.
 ---
 
diff --git a/skills/cuopt-installation-api-c/SKILL.md b/skills/cuopt-installation-api-c/SKILL.md
index 747382e3c7..bd4d60becc 100644
--- a/skills/cuopt-installation-api-c/SKILL.md
+++ b/skills/cuopt-installation-api-c/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-api-c
-version: "26.04.00"
+version: "26.06.00"
 description: Install cuOpt for C — conda, locate lib/headers, verification. Use when the user is installing or verifying the C API. Standalone; no common skill.
 ---
 
diff --git a/skills/cuopt-installation-api-python/SKILL.md b/skills/cuopt-installation-api-python/SKILL.md
index a3d7a5e5d2..771f5ec8b0 100644
--- a/skills/cuopt-installation-api-python/SKILL.md
+++ b/skills/cuopt-installation-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-api-python
-version: "26.04.00"
+version: "26.06.00"
 description: Install cuOpt for Python — pip, conda, Docker, verification. Use when the user is installing or verifying the Python API. Standalone; no common skill.
 ---
 
diff --git a/skills/cuopt-installation-common/SKILL.md b/skills/cuopt-installation-common/SKILL.md
index 6ceb9f9000..88534fb810 100644
--- a/skills/cuopt-installation-common/SKILL.md
+++ b/skills/cuopt-installation-common/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-common
-version: "26.04.00"
+version: "26.06.00"
 description: Install cuOpt — system and environment requirements only. Domain concepts; no install commands or interface guidance.
 ---
 
diff --git a/skills/cuopt-installation-developer/SKILL.md b/skills/cuopt-installation-developer/SKILL.md
index a002498853..1f3dff0d3f 100644
--- a/skills/cuopt-installation-developer/SKILL.md
+++ b/skills/cuopt-installation-developer/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-installation-developer
-version: "26.04.00"
+version: "26.06.00"
 description: Developer installation — build cuOpt from source, run tests. Use when the user wants to set up a dev environment to contribute or modify cuOpt.
 ---
 
diff --git a/skills/cuopt-lp-milp-api-c/SKILL.md b/skills/cuopt-lp-milp-api-c/SKILL.md
index 53df3de63e..74b0d5dc92 100644
--- a/skills/cuopt-lp-milp-api-c/SKILL.md
+++ b/skills/cuopt-lp-milp-api-c/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-lp-milp-api-c
-version: "26.04.00"
+version: "26.06.00"
 description: LP and MILP with cuOpt — C API only. Use when the user is embedding LP/MILP in C/C++.
 ---
 
diff --git a/skills/cuopt-lp-milp-api-cli/SKILL.md b/skills/cuopt-lp-milp-api-cli/SKILL.md
index cbdc1e7778..1f8e8a157c 100644
--- a/skills/cuopt-lp-milp-api-cli/SKILL.md
+++ b/skills/cuopt-lp-milp-api-cli/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-lp-milp-api-cli
-version: "26.04.00"
+version: "26.06.00"
 description: LP and MILP with cuOpt — CLI only (MPS files, cuopt_cli). Use when the user is solving from MPS via command line.
 ---
 
diff --git a/skills/cuopt-lp-milp-api-python/SKILL.md b/skills/cuopt-lp-milp-api-python/SKILL.md
index a7cd9a59f2..e8435867db 100644
--- a/skills/cuopt-lp-milp-api-python/SKILL.md
+++ b/skills/cuopt-lp-milp-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-lp-milp-api-python
-version: "26.04.00"
+version: "26.06.00"
 description: Solve Linear Programming (LP) and Mixed-Integer Linear Programming (MILP) with the Python API. Use when the user asks about optimization with linear constraints, integer variables, scheduling, resource allocation, facility location, or production planning.
 ---
 
diff --git a/skills/cuopt-qp-api-c/SKILL.md b/skills/cuopt-qp-api-c/SKILL.md
index bc1efb63d3..85014b81fd 100644
--- a/skills/cuopt-qp-api-c/SKILL.md
+++ b/skills/cuopt-qp-api-c/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-qp-api-c
-version: "26.04.00"
+version: "26.06.00"
 description: Quadratic Programming (QP) with cuOpt — C API. Use when the user is embedding QP in C/C++.
 ---
 
diff --git a/skills/cuopt-qp-api-cli/SKILL.md b/skills/cuopt-qp-api-cli/SKILL.md
index 5f8a8e848a..7aec559126 100644
--- a/skills/cuopt-qp-api-cli/SKILL.md
+++ b/skills/cuopt-qp-api-cli/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-qp-api-cli
-version: "26.04.00"
+version: "26.06.00"
 description: QP with cuOpt — CLI (e.g. cuopt_cli with QP-capable input). Use when the user is solving QP from the command line.
 ---
 
diff --git a/skills/cuopt-qp-api-python/SKILL.md b/skills/cuopt-qp-api-python/SKILL.md
index b85b9e3db2..39533aaeca 100644
--- a/skills/cuopt-qp-api-python/SKILL.md
+++ b/skills/cuopt-qp-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-qp-api-python
-version: "26.04.00"
+version: "26.06.00"
 description: Quadratic Programming (QP) with cuOpt — Python API only (beta). Use when the user is building or solving QP in Python.
 ---
 
diff --git a/skills/cuopt-routing-api-python/SKILL.md b/skills/cuopt-routing-api-python/SKILL.md
index d8bf736f8f..c386107241 100644
--- a/skills/cuopt-routing-api-python/SKILL.md
+++ b/skills/cuopt-routing-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-routing-api-python
-version: "26.04.00"
+version: "26.06.00"
 description: Vehicle routing (VRP, TSP, PDP) with cuOpt — Python API only. Use when the user is building or solving routing in Python.
 ---
 
diff --git a/skills/cuopt-server-api-python/SKILL.md b/skills/cuopt-server-api-python/SKILL.md
index b340e9883f..7d6ed175dd 100644
--- a/skills/cuopt-server-api-python/SKILL.md
+++ b/skills/cuopt-server-api-python/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-server-api-python
-version: "26.04.00"
+version: "26.06.00"
 description: cuOpt REST server — start server, endpoints, Python/curl client examples. Use when the user is deploying or calling the REST API.
 ---
 
diff --git a/skills/cuopt-server-common/SKILL.md b/skills/cuopt-server-common/SKILL.md
index f23c9c4a5f..cc2a3728d5 100644
--- a/skills/cuopt-server-common/SKILL.md
+++ b/skills/cuopt-server-common/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-server-common
-version: "26.04.00"
+version: "26.06.00"
 description: cuOpt REST server — what it does and how requests flow. Domain concepts; no deploy or client code.
 ---
 
diff --git a/skills/cuopt-user-rules/SKILL.md b/skills/cuopt-user-rules/SKILL.md
index 0777b9af15..87734f72a2 100644
--- a/skills/cuopt-user-rules/SKILL.md
+++ b/skills/cuopt-user-rules/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: cuopt-user-rules
-version: "26.04.00"
+version: "26.06.00"
 description: Base behavior rules for using NVIDIA cuOpt. Read this FIRST before any cuOpt user task (routing, LP/MILP, QP, installation, server). Covers handling incomplete questions, clarifying data requirements, verifying understanding, and running commands safely.
 ---
 
diff --git a/skills/lp-milp-formulation/SKILL.md b/skills/lp-milp-formulation/SKILL.md
index 64431a04c4..e429282033 100644
--- a/skills/lp-milp-formulation/SKILL.md
+++ b/skills/lp-milp-formulation/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: lp-milp-formulation
-version: "26.04.00"
+version: "26.06.00"
 description: LP/MILP concepts and going from problem text to formulation. What LP/MILP are, required formulation questions, typical modeling elements, and how to parse problem statements (parameters, constraints, decisions, objective).
 ---
 
diff --git a/skills/qp-formulation/SKILL.md b/skills/qp-formulation/SKILL.md
index c87b887fbc..60aed00ede 100644
--- a/skills/qp-formulation/SKILL.md
+++ b/skills/qp-formulation/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: qp-formulation
-version: "26.04.00"
+version: "26.06.00"
 description: Quadratic Programming (QP) — problem form and constraints. Domain concepts; no API or interface. QP is beta.
 ---
 
diff --git a/skills/routing-formulation/SKILL.md b/skills/routing-formulation/SKILL.md
index 4ab8d6419d..9cf8060cdf 100644
--- a/skills/routing-formulation/SKILL.md
+++ b/skills/routing-formulation/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: routing-formulation
-version: "26.04.00"
+version: "26.06.00"
 description: Vehicle routing (VRP, TSP, PDP) — problem types and data requirements. Domain concepts; no API or interface.
 ---
 
diff --git a/skills/skill-evolution/SKILL.md b/skills/skill-evolution/SKILL.md
index d77fba1a3f..f3605795b7 100644
--- a/skills/skill-evolution/SKILL.md
+++ b/skills/skill-evolution/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: skill-evolution
-version: "26.04.00"
+version: "26.06.00"
 description: After solving a non-trivial problem, detect generalizable learnings and propose skill updates so future interactions benefit automatically. Always active — applies to every interaction.
 ---
 
@@ -182,7 +182,7 @@ When skill evolution creates an entirely new skill directory, add `origin: skill
 ```yaml
 ---
 name: new-skill-name
-version: "26.04.00"
+version: "26.06.00"
 description: ...
 origin: skill-evolution
 ---
diff --git a/sonar-project.properties b/sonar-project.properties
index ae8d6bd25c..7dafbc9969 100644
--- a/sonar-project.properties
+++ b/sonar-project.properties
@@ -5,6 +5,6 @@
 sonar.projectKey=GPUSW_cuOpt_Nvidia-cuOpt_cuopt
 sonar.projectName=NVIDIA cuOpt
 sonar.projectVersion=1.0
-
+sonar.host.url=https://sonar.nvidia.com
 # Source code location
 sonar.sources=.
diff --git a/sonarqube/sonar-branches.txt b/sonarqube/sonar-branches.txt
index a75ecac679..14fe38226d 100644
--- a/sonarqube/sonar-branches.txt
+++ b/sonarqube/sonar-branches.txt
@@ -5,7 +5,7 @@
 
 # Main development branches
 main
-release/26.02
+release/26.04
 
 # Add release branches as needed
 # release/v1.0