diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 4c5df380f6..6ddf2583c4 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -5,7 +5,7 @@ }, "metadata": { "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server.", - "version": "26.04.00" + "version": "26.06.00" }, "plugins": [ { diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json index 5f34873671..e740506140 100644 --- a/.cursor-plugin/plugin.json +++ b/.cursor-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "nvidia-cuopt-skills", "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server. Use when building or solving optimization with cuOpt.", - "version": "26.04.00", + "version": "26.06.00", "author": { "name": "NVIDIA" }, diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7958eac440..cdbf4df577 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,3 +1,6 @@ +# Default owner for paths with no later, more specific match +* @nvidia/cuopt-infra-codeowners + #cpp code owners cpp/ @nvidia/cuopt-engine-codeowners diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3eb1f1f066..a945cde8ec 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -45,7 +45,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -65,7 +65,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cuopt-mps-parser: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: wheel-publish-cuopt-mps-parser: needs: wheel-build-cuopt-mps-parser secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -99,7 +99,7 @@ jobs: wheel-build-libcuopt: needs: wheel-build-cuopt-mps-parser secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-libcuopt: needs: wheel-build-libcuopt secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-build-cuopt: needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -135,7 +135,7 @@ jobs: wheel-publish-cuopt: needs: wheel-build-cuopt secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -145,7 +145,7 @@ jobs: package-type: python wheel-build-cuopt-server: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -160,7 +160,7 @@ jobs: wheel-publish-cuopt-server: needs: wheel-build-cuopt-server secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -171,7 +171,7 @@ jobs: docs-build: needs: [python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} node_type: "gpu-l4-latest-1" @@ -181,11 +181,11 @@ jobs: arch: "amd64" file_to_upload: "docs/cuopt/build/html/" artifact-name: "cuopt_docs" - container_image: "rapidsai/ci-conda:26.04-latest" + container_image: "rapidsai/ci-conda:26.06-latest" script: "ci/build_docs.sh" wheel-build-cuopt-sh-client: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -201,7 +201,7 @@ jobs: wheel-publish-cuopt-sh-client: needs: wheel-build-cuopt-sh-client secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/build_test_publish_images.yaml b/.github/workflows/build_test_publish_images.yaml index f8f7366e13..17d4e9ab57 100644 --- a/.github/workflows/build_test_publish_images.yaml +++ b/.github/workflows/build_test_publish_images.yaml @@ -55,7 +55,7 @@ jobs: compute-matrix: runs-on: ubuntu-latest container: - image: rapidsai/ci-conda:26.04-latest + image: rapidsai/ci-conda:26.06-latest outputs: MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} CUOPT_VER: ${{ steps.compute-cuopt-ver.outputs.CUOPT_VER }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 47a3bd9fca..a652c23b9a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -34,7 +34,7 @@ jobs: - wheel-build-cuopt-sh-client - test-self-hosted-server secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main if: always() with: needs: ${{ toJSON(needs) }} @@ -111,7 +111,7 @@ jobs: changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main with: files_yaml: | build_docs: @@ -279,20 +279,20 @@ jobs: - '!gemini-extension.json' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main with: enable_check_generated_files: false conda-cpp-build: needs: [checks, compute-matrix-filters] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: pull-request script: ci/build_cpp.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_lean_filter }} conda-cpp-tests: needs: [conda-cpp-build, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -308,14 +308,14 @@ jobs: conda-python-build: needs: [conda-cpp-build, compute-matrix-filters] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: pull-request script: ci/build_python.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }} conda-python-tests: needs: [conda-python-build, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_conda with: run_codecov: false @@ -332,7 +332,7 @@ jobs: docs-build: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).build_docs with: build_type: pull-request @@ -340,12 +340,12 @@ jobs: arch: "amd64" file_to_upload: "docs/cuopt/build/html/" artifact-name: "cuopt_docs" - container_image: "rapidsai/ci-conda:26.04-latest" + container_image: "rapidsai/ci-conda:26.06-latest" script: "ci/build_docs.sh" wheel-build-cuopt-mps-parser: needs: compute-matrix-filters secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request script: ci/build_wheel_cuopt_mps_parser.sh @@ -357,7 +357,7 @@ jobs: wheel-build-libcuopt: needs: [wheel-build-cuopt-mps-parser, compute-matrix-filters] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: ${{ needs.compute-matrix-filters.outputs.libcuopt_filter }} @@ -368,7 +368,7 @@ jobs: wheel-build-cuopt: needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt, compute-matrix-filters] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request script: ci/build_wheel_cuopt.sh @@ -377,7 +377,7 @@ jobs: matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }} wheel-tests-cuopt: needs: [wheel-build-cuopt, wheel-build-cuopt-mps-parser, wheel-build-cuopt-sh-client, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels with: build_type: pull-request @@ -393,7 +393,7 @@ jobs: wheel-build-cuopt-server: needs: [checks, compute-matrix-filters] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request script: ci/build_wheel_cuopt_server.sh @@ -405,7 +405,7 @@ jobs: wheel-build-cuopt-sh-client: needs: compute-matrix-filters secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request script: ci/build_wheel_cuopt_sh_client.sh @@ -417,7 +417,7 @@ jobs: matrix_filter: ${{ needs.compute-matrix-filters.outputs.cuopt_sh_client_filter }} wheel-tests-cuopt-server: needs: [wheel-build-cuopt, wheel-build-cuopt-server, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9ad7609e8a..a8cc5f2943 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -27,7 +27,7 @@ on: jobs: conda-cpp-tests: - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} conda-python-tests: - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main with: run_codecov: false build_type: ${{ inputs.build_type }} @@ -58,7 +58,7 @@ jobs: script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} wheel-tests-cuopt: - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} wheel-tests-cuopt-server: - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -97,5 +97,5 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:26.04-latest" + container_image: "rapidsai/ci-conda:26.06-latest" script: ci/test_notebooks.sh diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index d394b97db4..57b178740c 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -15,7 +15,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@main with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8d03641fde..a935201f21 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -117,7 +117,7 @@ Architecture: - Clone the repository: ```bash -CUOPT_HOME=$(pwd)/cuopt +export CUOPT_HOME=$(pwd)/cuopt git clone https://github.com/NVIDIA/cuopt.git $CUOPT_HOME cd $CUOPT_HOME ``` @@ -193,19 +193,20 @@ To build all libraries and tests, simply run To run the C++ tests, run ```bash -cd $CUOPT_HOME/datasets && get_test_data.sh +cd $CUOPT_HOME/datasets && ./get_test_data.sh cd $CUOPT_HOME && datasets/linear_programming/download_pdlp_test_dataset.sh datasets/mip/download_miplib_test_dataset.sh export RAPIDS_DATASET_ROOT_DIR=$CUOPT_HOME/datasets/ -ctest --test-dir ${CUOPT_HOME}/cpp/build # libcuopt +ctest --test-dir ${CUOPT_HOME}/cpp/build -E L1TEST # libcuopt ``` +`L1TEST`s are excluded because they are expensive and not run as part of the typical development process. To run python tests, run - To run `cuopt` tests: ```bash -cd $CUOPT_HOME/datasets && get_test_data.sh +cd $CUOPT_HOME/datasets && ./get_test_data.sh cd $CUOPT_HOME && datasets/linear_programming/download_pdlp_test_dataset.sh datasets/mip/download_miplib_test_dataset.sh export RAPIDS_DATASET_ROOT_DIR=$CUOPT_HOME/datasets/ diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH index d5ea6ced53..ba2906d066 100644 --- a/RAPIDS_BRANCH +++ b/RAPIDS_BRANCH @@ -1 +1 @@ -release/26.04 +main diff --git a/README.md b/README.md index 379a48c350..95c8598d77 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # cuOpt - GPU-accelerated Optimization [![Build Status](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml/badge.svg)](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml) -[![Version](https://img.shields.io/badge/version-26.04.00-blue)](https://github.com/NVIDIA/cuopt/releases) +[![Version](https://img.shields.io/badge/version-26.06.00-blue)](https://github.com/NVIDIA/cuopt/releases) [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen)](https://docs.nvidia.com/cuopt/user-guide/latest/introduction.html) [![Docker Hub](https://img.shields.io/badge/docker-nvidia%2Fcuopt-blue?logo=docker)](https://hub.docker.com/r/nvidia/cuopt) [![Examples](https://img.shields.io/badge/examples-cuopt--examples-orange)](https://github.com/NVIDIA/cuopt-examples) @@ -83,7 +83,7 @@ For CUDA 12.x: pip install \ --extra-index-url=https://pypi.nvidia.com \ nvidia-cuda-runtime-cu12==12.9.* \ - cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.* ``` Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages. @@ -91,7 +91,7 @@ Development wheels are available as nightlies, please update `--extra-index-url` pip install --pre \ --extra-index-url=https://pypi.nvidia.com \ --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \ - cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.* ``` For CUDA 13.x: @@ -99,7 +99,7 @@ For CUDA 13.x: ```bash pip install \ --extra-index-url=https://pypi.nvidia.com \ - cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.* ``` Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages. @@ -107,7 +107,7 @@ Development wheels are available as nightlies, please update `--extra-index-url` pip install --pre \ --extra-index-url=https://pypi.nvidia.com \ --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \ - cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.* ``` @@ -118,7 +118,7 @@ cuOpt can be installed with conda (via [miniforge](https://github.com/conda-forg All other dependencies are installed automatically when `cuopt-server` and `cuopt-sh-client` are installed. ```bash -conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.04.* cuopt-sh-client=26.04.* +conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.06.* cuopt-sh-client=26.06.* ``` We also provide [nightly conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index 0bd0e8a95b..cdb610a24d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -26.04.00 +26.06.00 diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index e01e533a65..40f7c73eac 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -137,6 +138,58 @@ std::vector> read_solution_from_dir(const std::string file_p return initial_solutions; } +struct incumbent_record_t { + double objective; + double work_timestamp; + double wall_time; + cuopt::internals::mip_solution_origin_t origin; +}; + +class incumbent_tracker_t : public cuopt::internals::get_solution_callback_ext_t { + public: + incumbent_tracker_t(std::chrono::high_resolution_clock::time_point start_time) + : start_time_(start_time) + { + } + + void get_solution(void* data, + void* cost, + void* solution_bound, + const cuopt::internals::mip_solution_callback_info_t* info, + void* user_data) override + { + double obj = *static_cast(cost); + double wt = (info != nullptr) ? info->work_timestamp : -1.0; + auto origin = (info != nullptr) ? (cuopt::internals::mip_solution_origin_t)info->origin + : cuopt::internals::mip_solution_origin_t::UNKNOWN; + auto now = std::chrono::high_resolution_clock::now(); + double wall_s = std::chrono::duration(now - start_time_).count(); + records_.push_back({obj, wt, wall_s, (cuopt::internals::mip_solution_origin_t)origin}); + } + + void write_csv(const std::string& path) const + { + std::ofstream f(path); + if (!f.is_open()) { + fprintf(stderr, "Failed to open incumbent CSV: %s\n", path.c_str()); + return; + } + f << "index,objective,work_timestamp,wall_time_s,origin\n"; + for (size_t i = 0; i < records_.size(); ++i) { + auto& r = records_[i]; + f << i << "," << std::setprecision(15) << r.objective << "," << r.work_timestamp << "," + << std::setprecision(6) << r.wall_time << "," + << cuopt::internals::mip_solution_origin_to_string(r.origin) << "\n"; + } + } + + size_t size() const { return records_.size(); } + + private: + std::chrono::high_resolution_clock::time_point start_time_; + std::vector records_; +}; + int run_single_file(std::string file_path, int device, int batch_id, @@ -203,21 +256,40 @@ int run_single_file(std::string file_path, } } } - settings.time_limit = time_limit; - settings.work_limit = work_limit; - settings.heuristics_only = heuristics_only; - settings.num_cpu_threads = num_cpu_threads; - settings.log_to_console = log_to_console; - settings.determinism_mode = deterministic ? CUOPT_MODE_DETERMINISTIC : CUOPT_MODE_OPPORTUNISTIC; + settings.time_limit = time_limit; + settings.work_limit = work_limit; + settings.heuristics_only = heuristics_only; + settings.num_cpu_threads = num_cpu_threads; + settings.log_to_console = log_to_console; + if (deterministic) { + settings.determinism_mode = + heuristics_only ? CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS : CUOPT_MODE_DETERMINISTIC; + } else { + settings.determinism_mode = CUOPT_MODE_OPPORTUNISTIC; + } + CUOPT_LOG_INFO( + "run_mip settings: heuristics_only=%d deterministic=%d determinism_mode=%d " + "time_limit=%.6f work_limit=%.6f", + (int)heuristics_only, + (int)deterministic, + settings.determinism_mode, + settings.time_limit, + settings.work_limit); settings.tolerances.relative_tolerance = 1e-12; settings.tolerances.absolute_tolerance = 1e-6; settings.presolver = cuopt::linear_programming::presolver_t::Default; settings.reliability_branching = reliability_branching; settings.clique_cuts = -1; settings.seed = 42; + settings.bb_work_unit_scale = 1.0; + settings.gpu_heur_work_unit_scale = 1.0; + settings.mip_scaling = false; + settings.gpu_heur_wait_for_exploration = false; cuopt::linear_programming::benchmark_info_t benchmark_info; settings.benchmark_info_ptr = &benchmark_info; auto start_run_solver = std::chrono::high_resolution_clock::now(); + incumbent_tracker_t incumbent_tracker(start_run_solver); + settings.set_mip_callback(&incumbent_tracker); auto solution = cuopt::linear_programming::solve_mip(&handle_, mps_data_model, settings); CUOPT_LOG_INFO( "first obj: %f last improvement of best feasible: %f last improvement after recombination: %f", @@ -253,7 +325,13 @@ int run_single_file(std::string file_path, << benchmark_info.last_improvement_after_recombination << "," << mip_gap << "," << is_optimal << "\n"; write_to_output_file(out_dir, base_filename, device, n_gpus, batch_id, ss.str()); - CUOPT_LOG_INFO("Results written to the file %s", base_filename.c_str()); + if (!out_dir.empty()) { + std::string mps_stem = base_filename.substr(0, base_filename.find(".mps")); + std::string csv_path = out_dir + "/" + mps_stem + "_incumbents.csv"; + incumbent_tracker.write_csv(csv_path); + CUOPT_LOG_INFO( + "Incumbent trace (%zu entries) written to %s", incumbent_tracker.size(), csv_path.c_str()); + } return sol_found; } diff --git a/ci/compute-sanitizer-suppressions.xml b/ci/compute-sanitizer-suppressions.xml new file mode 100644 index 0000000000..624b3aa0bd --- /dev/null +++ b/ci/compute-sanitizer-suppressions.xml @@ -0,0 +1,249 @@ + + + + Initcheck + + Uninitialized __global__ memory read of size 4 bytes + 4 + + + .* + + + + .*libcuda.so.* + + + cusparseCsr2cscEx2 + .*libcusparse.so.* + + + + + Initcheck + + Uninitialized __global__ memory read of size 4 bytes + 4 + + + ThreadLoad + + + + .*libcuda.so.* + + + libcudart.* + + + cudaLaunchKernel + + + .*cub::.*::Device(Segmented)?(Reduce|Scan)(SingleTile)?Kernel.* + + + + + Initcheck + + Uninitialized __global__ memory read of size 2 bytes + 2 + + + ThreadLoad + + + + .*libcuda.so.* + + + libcudart.* + + + cudaLaunchKernel + + + .*cub::.*::Device(Segmented)?(Reduce|Scan)(SingleTile)?Kernel.* + + + + + Initcheck + + Uninitialized __global__ memory read of size 8 bytes + 8 + + + DeviceSegmentedReduceKernel + + + + Initcheck + + Uninitialized __global__ memory read of size 4 bytes + 4 + + + ThreadLoad + + + + .*libcuda.so.* + + + libcudart.* + + + libcudart.* + + + .*libcuopt.* + + + .*Device(Reduce|Scan)Kernel.* + + + + + + + InitcheckApiError + Error + + Host API uninitialized memory access + 16 + + + + cuMemcpyDtoHAsync.* + .*libcuda.so.* + + + + + + InitcheckApiError + Error + + Host API uninitialized memory access + + + + cuMemcpyAsync + .*libcuda.so.* + + + .*libcudart.so.* + + + .*libcudart.so.* + + + .*libcudart.so.* + + + .*librmm.so.* + + + rmm::device_buffer::device_buffer + .*librmm.so.* + + + + + + Initcheck + + Uninitialized __global__ memory read + + + transform_kernel + + + + cuLaunchKernel_ptsz + .*libcuda.so.* + + + .*libcudart.so.* + + + cudaLaunchKernel_ptsz + + + + + InitcheckApiError + Error + + Host API uninitialized memory access + + + + cuMemcpyAsync + .*libcuda.so.* + + + .*libcudart.so.* + + + .*libcudart.so.* + + + .*libcudart.so.* + + + .*librmm.so.* + + + .*librmm.so.* + + + rmm::device_uvector.*::device_uvector + .*libcuopt.so.* + + + + + + InitcheckApiError + Error + + Host API uninitialized memory access + + + + cuMemcpyDtoDAsync.* + .*libcuda.so.* + + + + + InitcheckApiError + Error + + Host API uninitialized memory access + + + + cuMemcpyAsync + .*libcuda.so.* + + + .*libcudart.so.* + + + .*libcudart.so.* + + + cudaMemcpyAsync + + + rmm::device_buffer::resize + .*librmm.so.* + + + + diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 3d6c356b3d..9a67bb65a5 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -152,3 +152,6 @@ elif [[ "${RUN_CONTEXT}" == "release" ]]; then sed_runner "s|\\bmain\\b|release/${NEXT_SHORT_TAG}|g" docs/cuopt/source/faq.rst sed_runner "s|\\bmain\\b|release/${NEXT_SHORT_TAG}|g" docs/cuopt/source/cuopt-python/routing/routing-example.ipynb fi + +# Update docs version switcher to include the new version +python ci/utils/update_doc_versions.py diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index cf3563d476..04dc6bb83c 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -19,7 +19,7 @@ dependencies: - cuda-python>=12.9.2,<13.0 - cuda-sanitizer-api - cuda-version=12.9 -- cudf==26.4.*,>=0.0.0a0 +- cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,8 +36,8 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 @@ -55,7 +55,7 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov - pytest<9.0 @@ -65,7 +65,7 @@ dependencies: - rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index a8a589e48b..21891cc9f2 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cuda-python>=12.9.2,<13.0 - cuda-sanitizer-api - cuda-version=12.9 -- cudf==26.4.*,>=0.0.0a0 +- cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,8 +36,8 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 @@ -55,7 +55,7 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov - pytest<9.0 @@ -65,7 +65,7 @@ dependencies: - rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index 477c708918..89147b18a7 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -19,7 +19,7 @@ dependencies: - cuda-python>=13.0.1,<14.0 - cuda-sanitizer-api - cuda-version=13.1 -- cudf==26.4.*,>=0.0.0a0 +- cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,8 +36,8 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 @@ -55,7 +55,7 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov - pytest<9.0 @@ -65,7 +65,7 @@ dependencies: - rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index d5fcba0b73..8df6f28bf7 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cuda-python>=13.0.1,<14.0 - cuda-sanitizer-api - cuda-version=13.1 -- cudf==26.4.*,>=0.0.0a0 +- cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,8 +36,8 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 @@ -55,7 +55,7 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov - pytest<9.0 @@ -65,7 +65,7 @@ dependencies: - rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9249b53171..c95224ea9c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -543,6 +543,17 @@ target_link_libraries(cuopt gRPC::grpc++ ) +# find_path(PAPI_INCLUDE_DIR papi.h) +# find_library(PAPI_LIBRARY papi) + +# if (PAPI_INCLUDE_DIR AND PAPI_LIBRARY) +# message(STATUS "Found PAPI in ${PAPI_INCLUDE_DIR}") +# target_include_directories(cuopt PRIVATE ${PAPI_INCLUDE_DIR}) +# target_link_libraries(cuopt PRIVATE ${PAPI_LIBRARY}) +# else() +# message(FATAL_ERROR "Could not find PAPI") +# endif() + # ################################################################################################## # - generate tests -------------------------------------------------------------------------------- @@ -652,11 +663,14 @@ rapids_cpm_find( if(NOT BUILD_LP_ONLY) add_executable(cuopt_cli cuopt_cli.cpp) +# PIE executable: auditwheel/patchelf expands .dynstr/RPATH when repairing wheels; non-PIE +# (ET_EXEC) binaries are prone to corrupt segment layout. PIE (ET_DYN) survives RPATH edits. set_target_properties(cuopt_cli PROPERTIES CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON CXX_SCAN_FOR_MODULES OFF + POSITION_INDEPENDENT_CODE ON ) target_compile_options(cuopt_cli @@ -664,6 +678,8 @@ target_compile_options(cuopt_cli "$<$:${CUOPT_CUDA_FLAGS}>" ) +target_link_options(cuopt_cli PRIVATE -pie) + target_include_directories(cuopt_cli PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src" diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 06eacb3408..24eb02aab8 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -104,10 +104,40 @@ #define CUOPT_MIP_HYPER_HEURISTIC_RELAXED_LP_TIME_LIMIT "mip_hyper_heuristic_relaxed_lp_time_limit" #define CUOPT_MIP_HYPER_HEURISTIC_RELATED_VARS_TIME_LIMIT \ "mip_hyper_heuristic_related_vars_time_limit" - -/* @brief MIP determinism mode constants */ -#define CUOPT_MODE_OPPORTUNISTIC 0 -#define CUOPT_MODE_DETERMINISTIC 1 +#define CUOPT_MIP_HYPER_HEURISTIC_CPUFJ_WORK_UNIT_SCALE "mip_hyper_heuristic_cpufj_work_unit_scale" +#define CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WORK_UNIT_SCALE \ + "mip_hyper_heuristic_gpu_heur_work_unit_scale" +#define CUOPT_MIP_HYPER_HEURISTIC_BB_WORK_UNIT_SCALE "mip_hyper_heuristic_bb_work_unit_scale" +#define CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WAIT_FOR_EXPLORATION \ + "mip_hyper_heuristic_gpu_heur_wait_for_exploration" + +/* @brief MIP determinism mode flags (bitset) */ +#define CUOPT_DETERMINISM_NONE 0x0 +// matches the previous value of '1' which was for B&B-only determinism in the previous release +#define CUOPT_DETERMINISM_BB 0x1 +#define CUOPT_DETERMINISM_GPU_HEURISTICS 0x2 +#define CUOPT_DETERMINISM_FULL (CUOPT_DETERMINISM_BB | CUOPT_DETERMINISM_GPU_HEURISTICS) + +#define CUOPT_MODE_OPPORTUNISTIC CUOPT_DETERMINISM_NONE +#define CUOPT_MODE_DETERMINISTIC CUOPT_DETERMINISM_FULL +#define CUOPT_MODE_DETERMINISTIC_BB CUOPT_DETERMINISM_BB +#define CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS CUOPT_DETERMINISM_GPU_HEURISTICS + +/* @brief MIP solution origin constants */ +#define CUOPT_MIP_SOLUTION_ORIGIN_UNKNOWN 0 +#define CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND 1 +#define CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND_DIVING 2 +#define CUOPT_MIP_SOLUTION_ORIGIN_FEASIBILITY_JUMP 3 +#define CUOPT_MIP_SOLUTION_ORIGIN_CPU_FEASIBILITY_JUMP 4 +#define CUOPT_MIP_SOLUTION_ORIGIN_LOCAL_SEARCH 5 +#define CUOPT_MIP_SOLUTION_ORIGIN_QUICK_FEASIBLE 6 +#define CUOPT_MIP_SOLUTION_ORIGIN_LP_ROUNDING 7 +#define CUOPT_MIP_SOLUTION_ORIGIN_RECOMBINATION 8 +#define CUOPT_MIP_SOLUTION_ORIGIN_SUB_MIP 9 +#define CUOPT_MIP_SOLUTION_ORIGIN_USER_INITIAL 10 +#define CUOPT_MIP_SOLUTION_ORIGIN_USER_INJECTED 11 +#define CUOPT_MIP_SOLUTION_ORIGIN_RINS 12 +#define CUOPT_MIP_SOLUTION_ORIGIN_PRESOLVE 13 /* @brief LP/MIP termination status constants */ #define CUOPT_TERMINATION_STATUS_NO_TERMINATION 0 diff --git a/cpp/include/cuopt/linear_programming/cuopt_c.h b/cpp/include/cuopt/linear_programming/cuopt_c.h index 4c4d44c764..f72a00e932 100644 --- a/cpp/include/cuopt/linear_programming/cuopt_c.h +++ b/cpp/include/cuopt/linear_programming/cuopt_c.h @@ -71,6 +71,23 @@ typedef int32_t cuopt_int_t; typedef int64_t cuopt_int_t; #endif +/** + * @brief Extended callback information passed to cuOptMIPGetSolutionCallbackExt. + * + * Provides metadata about each incumbent solution reported during a MIP solve. + * + * Fields are append-only. Existing fields will never be reordered, removed, + * or change type across releases. + */ +typedef struct { + /** Which solver component found this solution (CUOPT_MIP_SOLUTION_ORIGIN_*). */ + uint32_t origin; + /** Deterministic work-unit timestamp at which the solution was found. + * Monotonically increasing across successive callbacks within a single solve. + * In non-deterministic mode this value is informational only. */ + double work_timestamp; +} cuOptMIPSolutionCallbackInfo; + /** * @brief Get the size of the float type. * @@ -713,6 +730,24 @@ typedef void (*cuOptMIPGetSolutionCallback)(const cuopt_float_t* solution, const cuopt_float_t* solution_bound, void* user_data); +/** + * @brief Type of callback for receiving incumbent MIP solutions with extended metadata. + * + * @param[in] solution - Pointer to incumbent solution values. + * @param[in] objective_value - Pointer to incumbent objective value. + * @param[in] solution_bound - Pointer to current solution (dual/user) bound. + * @param[in] callback_info - Pointer to callback metadata. + * @param[in] user_data - Pointer to user data. + * @note All pointer arguments refer to host memory and are only valid during the callback + * invocation. Do not pass device/GPU pointers. Copy any data you need to keep after the callback + * returns. + */ +typedef void (*cuOptMIPGetSolutionCallbackExt)(const cuopt_float_t* solution, + const cuopt_float_t* objective_value, + const cuopt_float_t* solution_bound, + const cuOptMIPSolutionCallbackInfo* callback_info, + void* user_data); + /** * @brief Type of callback for injecting MIP solutions with user context. * @@ -748,6 +783,19 @@ cuopt_int_t cuOptSetMIPGetSolutionCallback(cuOptSolverSettings settings, cuOptMIPGetSolutionCallback callback, void* user_data); +/** + * @brief Register an extended callback to receive incumbent MIP solutions with extended metadata. + * + * @param[in] settings - The solver settings object. + * @param[in] callback - Callback function to receive incumbent solutions and callback metadata. + * @param[in] user_data - User-defined pointer passed through to the callback. + * + * @return A status code indicating success or failure. + */ +cuopt_int_t cuOptSetMIPGetSolutionCallbackExt(cuOptSolverSettings settings, + cuOptMIPGetSolutionCallbackExt callback, + void* user_data); + /** * @brief Register a callback to inject MIP solutions. * diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 14c4d227bc..77425276c3 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -107,6 +107,13 @@ class mip_solver_settings_t { i_t strong_branching_simplex_iteration_limit = -1; i_t num_gpus = 1; bool log_to_console = true; + // User-facing multipliers on top of internal baseline work-unit scales. + // 1.0 = use internally calibrated default. Values > 1 make that component appear to do more work. + f_t cpufj_work_unit_scale = 1.0; + f_t gpu_heur_work_unit_scale = 1.0; + f_t bb_work_unit_scale = 1.0; + // When true, GPU heuristics wait for B&B to finish root solve before starting. + bool gpu_heur_wait_for_exploration = false; std::string log_file; std::string sol_file; @@ -118,15 +125,15 @@ class mip_solver_settings_t { int mip_scaling = CUOPT_MIP_SCALING_NO_OBJECTIVE; presolver_t presolver{presolver_t::Default}; /** - * @brief Determinism mode for MIP solver. + * @brief Determinism mode for MIP solver (bitset). * - * Controls the determinism behavior of the MIP solver: - * - CUOPT_MODE_OPPORTUNISTIC (0): Default mode, allows non-deterministic - * parallelism for better performance - * - CUOPT_MODE_DETERMINISTIC (1): Ensures deterministic results across runs - * at potential cost of performance + * Bitwise OR of CUOPT_DETERMINISM_* flags: + * - CUOPT_DETERMINISM_NONE (0x0): Opportunistic, non-deterministic. + * - CUOPT_DETERMINISM_BB (0x1): Deterministic B&B tree exploration. + * - CUOPT_DETERMINISM_GPU_HEURISTICS (0x2): Deterministic GPU heuristic pipeline. + * - CUOPT_DETERMINISM_FULL (0x3): Both B&B and GPU heuristics deterministic. */ - int determinism_mode = CUOPT_MODE_OPPORTUNISTIC; + int determinism_mode = CUOPT_DETERMINISM_NONE; /** * @brief Random seed for the MIP solver. * diff --git a/cpp/include/cuopt/linear_programming/utilities/internals.hpp b/cpp/include/cuopt/linear_programming/utilities/internals.hpp index bdfbb969d2..509e5c4100 100644 --- a/cpp/include/cuopt/linear_programming/utilities/internals.hpp +++ b/cpp/include/cuopt/linear_programming/utilities/internals.hpp @@ -13,6 +13,8 @@ #include #include +#include + namespace cuopt { namespace internals { @@ -21,7 +23,51 @@ class Callback { virtual ~Callback() {} }; -enum class base_solution_callback_type { GET_SOLUTION, SET_SOLUTION }; +enum class mip_solution_origin_t : uint32_t { + UNKNOWN = CUOPT_MIP_SOLUTION_ORIGIN_UNKNOWN, + BRANCH_AND_BOUND_NODE = CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND, + BRANCH_AND_BOUND_DIVING = CUOPT_MIP_SOLUTION_ORIGIN_BRANCH_AND_BOUND_DIVING, + FEASIBILITY_JUMP = CUOPT_MIP_SOLUTION_ORIGIN_FEASIBILITY_JUMP, + CPU_FEASIBILITY_JUMP = CUOPT_MIP_SOLUTION_ORIGIN_CPU_FEASIBILITY_JUMP, + LOCAL_SEARCH = CUOPT_MIP_SOLUTION_ORIGIN_LOCAL_SEARCH, + QUICK_FEASIBLE = CUOPT_MIP_SOLUTION_ORIGIN_QUICK_FEASIBLE, + LP_ROUNDING = CUOPT_MIP_SOLUTION_ORIGIN_LP_ROUNDING, + RECOMBINATION = CUOPT_MIP_SOLUTION_ORIGIN_RECOMBINATION, + SUB_MIP = CUOPT_MIP_SOLUTION_ORIGIN_SUB_MIP, + USER_INITIAL = CUOPT_MIP_SOLUTION_ORIGIN_USER_INITIAL, + USER_INJECTED = CUOPT_MIP_SOLUTION_ORIGIN_USER_INJECTED, + RINS = CUOPT_MIP_SOLUTION_ORIGIN_RINS, + PRESOLVE = CUOPT_MIP_SOLUTION_ORIGIN_PRESOLVE, +}; + +constexpr const char* mip_solution_origin_to_string(mip_solution_origin_t origin) +{ + switch (origin) { + case mip_solution_origin_t::UNKNOWN: return "unknown"; + case mip_solution_origin_t::BRANCH_AND_BOUND_NODE: return "branch_and_bound_node"; + case mip_solution_origin_t::BRANCH_AND_BOUND_DIVING: return "branch_and_bound_diving"; + case mip_solution_origin_t::FEASIBILITY_JUMP: return "feasibility_jump"; + case mip_solution_origin_t::CPU_FEASIBILITY_JUMP: return "cpu_feasibility_jump"; + case mip_solution_origin_t::LOCAL_SEARCH: return "local_search"; + case mip_solution_origin_t::QUICK_FEASIBLE: return "quick_feasible"; + case mip_solution_origin_t::LP_ROUNDING: return "lp_rounding"; + case mip_solution_origin_t::RECOMBINATION: return "recombination"; + case mip_solution_origin_t::SUB_MIP: return "sub_mip"; + case mip_solution_origin_t::USER_INITIAL: return "user_initial"; + case mip_solution_origin_t::USER_INJECTED: return "user_injected"; + case mip_solution_origin_t::RINS: return "rins"; + case mip_solution_origin_t::PRESOLVE: + return "presolve"; + // no default to trigger compiler -Werror + } + return "unknown"; +} + +using mip_solution_callback_info_t = cuOptMIPSolutionCallbackInfo; + +// get_solution_ext was added to support passing additional information to the get_solution callback +// without inducing a breaking ABI change +enum class base_solution_callback_type { GET_SOLUTION, GET_SOLUTION_EXT, SET_SOLUTION }; class base_solution_callback_t : public Callback { public: @@ -55,6 +101,19 @@ class get_solution_callback_t : public base_solution_callback_t { } }; +class get_solution_callback_ext_t : public base_solution_callback_t { + public: + virtual void get_solution(void* data, + void* objective_value, + void* solution_bound, + const mip_solution_callback_info_t* callback_info, + void* user_data) = 0; + base_solution_callback_type get_type() const override + { + return base_solution_callback_type::GET_SOLUTION_EXT; + } +}; + class set_solution_callback_t : public base_solution_callback_t { public: virtual void set_solution(void* data, diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 4da66abe77..902e691e64 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -40,7 +40,9 @@ #include #include +#include #include +#include namespace cuopt::linear_programming::dual_simplex { diff --git a/cpp/src/barrier/iterative_refinement.hpp b/cpp/src/barrier/iterative_refinement.hpp index d37760cd07..69e72d66bc 100644 --- a/cpp/src/barrier/iterative_refinement.hpp +++ b/cpp/src/barrier/iterative_refinement.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 33a2d983c9..631edcbc84 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -5,9 +5,12 @@ */ /* clang-format on */ +#include + #include #include #include +#include #include #include @@ -25,6 +28,7 @@ #include #include +#include #include @@ -35,13 +39,20 @@ #include #include #include -#include #include #include #include -#include #include +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(logger, ...) \ + do { \ + logger.printf(__VA_ARGS__); \ + } while (0) +#endif + namespace cuopt::linear_programming::dual_simplex { namespace { @@ -270,6 +281,22 @@ branch_and_bound_t::branch_and_bound_t( dualize_info_t dualize_info; convert_user_problem(original_problem_, settings_, original_lp_, new_slacks_, dualize_info); full_variable_types(original_problem_, original_lp_, var_types_); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic LP init state: rows=%d cols=%d nnz=%zu slacks=%zu slack_hash=0x%x " + "rhs_hash=0x%x lower_hash=0x%x upper_hash=0x%x Acol_hash=0x%x Arow_hash=0x%x " + "Aval_hash=0x%x\n", + original_lp_.num_rows, + original_lp_.num_cols, + original_lp_.A.x.size(), + new_slacks_.size(), + detail::compute_hash(new_slacks_), + detail::compute_hash(original_lp_.rhs), + detail::compute_hash(original_lp_.lower), + detail::compute_hash(original_lp_.upper), + detail::compute_hash(original_lp_.A.col_start), + detail::compute_hash(original_lp_.A.i), + detail::compute_hash(original_lp_.A.x)); // Check slack #ifdef CHECK_SLACKS @@ -320,19 +347,30 @@ void branch_and_bound_t::set_initial_upper_bound(f_t bound) } template -void branch_and_bound_t::report_heuristic(f_t obj) +void branch_and_bound_t::report_heuristic(f_t obj, double work_time) { if (is_running_) { f_t user_obj = compute_user_objective(original_lp_, obj); f_t user_lower = compute_user_objective(original_lp_, get_lower_bound()); std::string user_gap = user_mip_gap(original_lp_, obj, get_lower_bound()); - - settings_.log.printf( - "H %+13.6e %+10.6e %s %9.2f\n", - user_obj, - user_lower, - user_gap.c_str(), - toc(exploration_stats_.start_time)); + if (settings_.deterministic) { + const double reported_work = work_time >= 0.0 ? work_time : work_unit_context_.current_work(); + settings_.log.printf( + "H %+13.6e %+10.6e %s " + "%9.2f %9.2f\n", + user_obj, + user_lower, + user_gap.c_str(), + reported_work, + toc(exploration_stats_.start_time)); + } else { + settings_.log.printf( + "H %+13.6e %+10.6e %s %9.2f\n", + user_obj, + user_lower, + user_gap.c_str(), + toc(exploration_stats_.start_time)); + } } else { if (solving_root_relaxation_.load()) { f_t user_obj = compute_user_objective(original_lp_, obj); @@ -461,8 +499,11 @@ void branch_and_bound_t::update_user_bound(f_t lower_bound) } template -void branch_and_bound_t::set_new_solution(const std::vector& solution) +void branch_and_bound_t::set_new_solution(const std::vector& solution, + cuopt::internals::mip_solution_origin_t origin) { + cuopt_assert(!settings_.deterministic, "set_new_solution is for opportunistic B&B only"); + mutex_original_lp_.lock(); if (solution.size() != original_problem_.num_cols) { settings_.log.printf( @@ -513,51 +554,91 @@ void branch_and_bound_t::set_new_solution(const std::vector& solu if (is_feasible) { report_heuristic(obj); } if (attempt_repair) { mutex_repair_.lock(); - repair_queue_.push_back(solution); + repair_queue_.push_back({solution, origin}); mutex_repair_.unlock(); } } template -void branch_and_bound_t::queue_external_solution_deterministic( - const std::vector& solution, double work_unit_ts) +void branch_and_bound_t::emit_solution_callback( + std::vector& original_x, + f_t objective, + cuopt::internals::mip_solution_origin_t origin, + double work_timestamp) +{ + cuopt_assert(!settings_.deterministic || work_timestamp >= 0.0, + "work_timestamp must not be negative in deterministic mode"); + if (settings_.new_incumbent_callback != nullptr) { + settings_.log.debug("Publishing incumbent: obj=%g wut=%.6f origin=%s\n", + compute_user_objective(original_lp_, objective), + work_timestamp, + cuopt::internals::mip_solution_origin_to_string(origin)); + cuopt::internals::mip_solution_callback_info_t callback_info{}; + callback_info.origin = (uint32_t)origin; + callback_info.work_timestamp = work_timestamp; + settings_.new_incumbent_callback(original_x, objective, callback_info, work_timestamp); + } +} + +template +void branch_and_bound_t::emit_solution_callback_from_crushed( + const std::vector& crushed_solution, + f_t objective, + cuopt::internals::mip_solution_origin_t origin, + double work_timestamp) { - // In deterministic mode, queue the solution to be processed at the correct work unit timestamp - // This ensures deterministic ordering of solution events + if (settings_.new_incumbent_callback == nullptr) { return; } + std::vector original_x; + uncrush_primal_solution(original_problem_, original_lp_, crushed_solution, original_x); + emit_solution_callback(original_x, objective, origin, work_timestamp); +} +template +void branch_and_bound_t::queue_external_solution_deterministic( + const std::vector& solution, + f_t user_objective, + double work_unit_ts, + cuopt::internals::mip_solution_origin_t origin) +{ if (solution.size() != original_problem_.num_cols) { settings_.log.printf( "Solution size mismatch %ld %d\n", solution.size(), original_problem_.num_cols); return; } + settings_.log.printf( + "Queueing deterministic external incumbent: obj=%g heur_wut=%.3f bnb_wut=%.3f origin=%s " + "hash=0x%x\n", + user_objective, + work_unit_ts, + work_unit_context_.current_work(), + cuopt::internals::mip_solution_origin_to_string(origin), + detail::compute_hash(solution)); mutex_original_lp_.lock(); - std::vector crushed_solution; - crush_primal_solution( - original_problem_, original_lp_, solution, new_slacks_, crushed_solution); - f_t obj = compute_objective(original_lp_, crushed_solution); - - // Validate solution before queueing - f_t primal_err; - f_t bound_err; - i_t num_fractional; - bool is_feasible = check_guess( - original_lp_, settings_, var_types_, crushed_solution, primal_err, bound_err, num_fractional); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic external crush ctx: wut=%.6f lp_rows=%d lp_cols=%d lp_nnz=%zu " + "active_cut_rows=%d " + "slacks=%zu slack_hash=0x%x rhs_hash=0x%x lower_hash=0x%x upper_hash=0x%x " + "Acol_hash=0x%x Arow_hash=0x%x Aval_hash=0x%x\n", + work_unit_ts, + original_lp_.num_rows, + original_lp_.num_cols, + original_lp_.A.x.size(), + std::max((i_t)0, original_lp_.num_rows - original_problem_.num_rows), + new_slacks_.size(), + detail::compute_hash(new_slacks_), + detail::compute_hash(original_lp_.rhs), + detail::compute_hash(original_lp_.lower), + detail::compute_hash(original_lp_.upper), + detail::compute_hash(original_lp_.A.col_start), + detail::compute_hash(original_lp_.A.i), + detail::compute_hash(original_lp_.A.x)); mutex_original_lp_.unlock(); - if (!is_feasible) { - // Queue the uncrushed solution for repair; it will be crushed at - // consumption time so that the crush reflects the current LP state - // (which may have gained slack columns from cuts added after this point). - mutex_repair_.lock(); - repair_queue_.push_back(solution); - mutex_repair_.unlock(); - return; - } - - // Queue the solution with its work unit timestamp mutex_heuristic_queue_.lock(); - heuristic_solution_queue_.push_back({obj, std::move(crushed_solution), 0, -1, 0, work_unit_ts}); + heuristic_solution_queue_.push_back({solution, user_objective, work_unit_ts, origin}); + const size_t heuristic_queue_size = heuristic_solution_queue_.size(); mutex_heuristic_queue_.unlock(); } @@ -620,6 +701,14 @@ bool branch_and_bound_t::repair_solution(const std::vector& edge_ num_fractional, repaired_obj); } + } else { + settings_.log.printf( + "Repair LP failed: status=%s iters=%d time=%.3fs time_limit=%.3f cut_off=%e\n", + dual::status_to_string(lp_status).c_str(), + iter, + toc(lp_start_time), + lp_settings.time_limit, + lp_settings.cut_off); } return feasible; @@ -630,7 +719,7 @@ void branch_and_bound_t::repair_heuristic_solutions() { raft::common::nvtx::range scope("BB::repair_heuristics"); // Check if there are any solutions to repair - std::vector> to_repair; + std::vector to_repair; mutex_repair_.lock(); if (repair_queue_.size() > 0) { to_repair = repair_queue_; @@ -640,7 +729,8 @@ void branch_and_bound_t::repair_heuristic_solutions() if (to_repair.size() > 0) { settings_.log.debug("Attempting to repair %ld injected solutions\n", to_repair.size()); - for (const std::vector& uncrushed_solution : to_repair) { + for (const auto& queued_solution : to_repair) { + const std::vector& uncrushed_solution = queued_solution.solution; std::vector crushed_solution; crush_primal_solution( original_problem_, original_lp_, uncrushed_solution, new_slacks_, crushed_solution); @@ -652,15 +742,23 @@ void branch_and_bound_t::repair_heuristic_solutions() mutex_upper_.lock(); if (improves_incumbent(repaired_obj)) { - upper_bound_ = std::min(upper_bound_.load(), repaired_obj); + const f_t previous_upper = upper_bound_; + upper_bound_ = std::min(upper_bound_.load(), repaired_obj); incumbent_.set_incumbent_solution(repaired_obj, repaired_solution); - report_heuristic(repaired_obj); - - if (settings_.solution_callback != nullptr) { - std::vector original_x; - uncrush_primal_solution(original_problem_, original_lp_, repaired_solution, original_x); - settings_.solution_callback(original_x, repaired_obj); - } + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic B&B incumbent update: source=repair_queue prev_upper=%.16e " + "new_upper=%.16e obj=%.16e hash=0x%x\n", + previous_upper, + upper_bound_.load(), + repaired_obj, + detail::compute_hash(repaired_solution)); + report_heuristic(repaired_obj, queued_solution.work_timestamp); + + emit_solution_callback_from_crushed(repaired_solution, + repaired_obj, + queued_solution.origin, + queued_solution.work_timestamp); } mutex_upper_.unlock(); @@ -690,14 +788,47 @@ void branch_and_bound_t::set_solution_at_root(mip_solution_t compute_user_objective(original_lp_, root_objective_), toc(exploration_stats_.start_time)); - if (settings_.solution_callback != nullptr) { - settings_.solution_callback(solution.x, solution.objective); - } + emit_solution_callback(solution.x, + solution.objective, + cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE, + work_unit_context_.current_work()); if (settings_.heuristic_preemption_callback != nullptr) { settings_.heuristic_preemption_callback(); } } +template +std::tuple> branch_and_bound_t::retire_queued_solution( + const queued_external_solution_t& queued_solution) +{ + f_t primal_err; + f_t bound_err; + i_t num_fractional; + std::vector crushed; + + mutex_original_lp_.lock(); + crush_primal_solution( + original_problem_, original_lp_, queued_solution.solution, new_slacks_, crushed); + f_t obj = compute_objective(original_lp_, crushed); + bool is_feasible = check_guess( + original_lp_, settings_, var_types_, crushed, primal_err, bound_err, num_fractional); + mutex_original_lp_.unlock(); + + if (is_feasible) { return {true, obj, std::move(crushed)}; } + + // Attempt repair immediately, no separate repair queue in deterministic mode + std::vector repaired_solution; + f_t repaired_obj; + bool repaired = repair_solution(edge_norms_, crushed, repaired_obj, repaired_solution); + if (repaired) { return {true, repaired_obj, std::move(repaired_solution)}; } + + CUOPT_DETERMINISM_LOG(settings_.log, + "Deterministic repair FAILED: wut=%.3f origin=%s\n", + queued_solution.work_timestamp, + cuopt::internals::mip_solution_origin_to_string(queued_solution.origin)); + return {false, {}, {}}; +} + template void branch_and_bound_t::set_final_solution(mip_solution_t& solution, f_t lower_bound) @@ -767,6 +898,53 @@ void branch_and_bound_t::set_final_solution(mip_solution_t& } } + // Drain any pending heuristic solutions that B&B never got to retire during exploration + // (e.g., root solve consumed the entire budget). + if (settings_.deterministic) { + const double current_work = work_unit_context_.current_work(); + mutex_heuristic_queue_.lock(); + std::vector pending; + pending.swap(heuristic_solution_queue_); + mutex_heuristic_queue_.unlock(); + + std::sort(pending.begin(), + pending.end(), + [](const queued_external_solution_t& a, const queued_external_solution_t& b) { + if (a.work_timestamp != b.work_timestamp) { + return a.work_timestamp < b.work_timestamp; + } + if (a.user_objective != b.user_objective) { + return a.user_objective < b.user_objective; + } + if (a.origin != b.origin) { return a.origin < b.origin; } + return a.solution < b.solution; + }); + + for (const auto& queued_solution : pending) { + if (queued_solution.work_timestamp > current_work) { continue; } + auto [feasible, obj, crushed] = retire_queued_solution(queued_solution); + if (feasible && improves_incumbent(obj)) { + upper_bound_ = std::min(upper_bound_.load(), obj); + incumbent_.set_incumbent_solution(obj, crushed); + settings_.log.debug( + "Late-retired heuristic incumbent: obj=%.6e wut=%.3f origin=%s\n", + compute_user_objective(original_lp_, obj), + queued_solution.work_timestamp, + cuopt::internals::mip_solution_origin_to_string(queued_solution.origin)); + emit_solution_callback_from_crushed( + crushed, obj, queued_solution.origin, queued_solution.work_timestamp); + } + } + size_t n_drained = pending.size(); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Post-drain: user_upper=%.16e has_incumbent=%d drained=%zu user_lower_arg=%.16e\n", + compute_user_objective(original_lp_, upper_bound_.load()), + (int)incumbent_.has_incumbent, + n_drained, + compute_user_objective(original_lp_, lower_bound)); + } + if (has_solver_space_incumbent()) { uncrush_primal_solution(original_problem_, original_lp_, incumbent_.x, solution.x); solution.objective = incumbent_.objective; @@ -790,16 +968,29 @@ void branch_and_bound_t::add_feasible_solution(f_t leaf_objective, mutex_upper_.lock(); if (improves_incumbent(leaf_objective)) { + const f_t previous_upper = upper_bound_; incumbent_.set_incumbent_solution(leaf_objective, leaf_solution); upper_bound_ = std::min(upper_bound_.load(), leaf_objective); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic B&B incumbent update: source=leaf prev_upper=%.16e new_upper=%.16e " + "obj=%.16e hash=0x%x depth=%d worker_type=%d\n", + previous_upper, + upper_bound_.load(), + leaf_objective, + detail::compute_hash(leaf_solution), + leaf_depth, + (int)thread_type); report(feasible_solution_symbol(thread_type), leaf_objective, get_lower_bound(), leaf_depth, 0); send_solution = true; } - if (send_solution && settings_.solution_callback != nullptr) { - std::vector original_x; - uncrush_primal_solution(original_problem_, original_lp_, incumbent_.x, original_x); - settings_.solution_callback(original_x, leaf_objective); + if (send_solution) { + emit_solution_callback_from_crushed( + incumbent_.x, + leaf_objective, + cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE, + work_unit_context_.current_work()); } mutex_upper_.unlock(); } @@ -936,6 +1127,23 @@ struct nondeterministic_policy_t : tree_update_policy_t { f_t obj, const std::vector& x) override { + f_t primal_err; + f_t bound_err; + i_t num_fractional; + bool cg = check_guess( + bnb.original_lp_, bnb.settings_, bnb.var_types_, x, primal_err, bound_err, num_fractional); + if (!cg) { + bnb.settings_.log.printf( + "Rejecting infeasible integer solution: node=%d depth=%d " + "obj=%.6e primal_err=%.6e bound_err=%.6e fractional=%d\n", + node->node_id, + node->depth, + obj, + primal_err, + bound_err, + num_fractional); + return; + } bnb.add_feasible_solution(obj, x, node->depth, worker->search_strategy); } @@ -1008,8 +1216,11 @@ struct deterministic_policy_base_t : tree_update_policy_t { ? node->fractional_val - std::floor(node->fractional_val) : std::ceil(node->fractional_val) - node->fractional_val; if (frac > 1e-10) { - worker.pc_snapshot.queue_update( - node->branch_var, node->branch_dir, change / frac, worker.clock, worker.worker_id); + worker.pc_snapshot.queue_update(node->branch_var, + node->branch_dir, + change / frac, + worker.work_context.current_work(), + worker.worker_id); } } @@ -1029,17 +1240,94 @@ struct deterministic_bfs_policy_t const std::vector& x) override { if (obj < this->worker.local_upper_bound) { + f_t primal_err; + f_t bound_err; + i_t num_fractional; + bool cg = check_guess(this->bnb.original_lp_, + this->bnb.settings_, + this->bnb.var_types_, + x, + primal_err, + bound_err, + num_fractional); + if (!cg) { + this->bnb.settings_.log.printf( + "Rejecting infeasible integer solution: worker=%d node=%d depth=%d " + "obj=%.6e primal_err=%.6e bound_err=%.6e fractional=%d\n", + this->worker.worker_id, + node->creation_seq, + node->depth, + obj, + primal_err, + bound_err, + num_fractional); + return; + } this->worker.local_upper_bound = obj; + CUOPT_DETERMINISM_LOG( + bnb.settings_.log, + "BFS integer solution queued: worker=%d clock=%.6f ctx_work=%.6f obj=%.6e depth=%d\n", + this->worker.worker_id, + this->worker.work_context.current_work(), + this->worker.work_context.global_work_units_elapsed, + obj, + node->depth); this->worker.integer_solutions.push_back( - {obj, x, node->depth, this->worker.worker_id, this->worker.next_solution_seq++}); + {obj, + x, + node->depth, + this->worker.worker_id, + this->worker.next_solution_seq++, + this->worker.work_context.current_work(), + cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE}); } } - branch_variable_t select_branch_variable(mip_node_t*, + branch_variable_t select_branch_variable(mip_node_t* node, const std::vector& fractional, const std::vector& x) override { - i_t var = this->worker.pc_snapshot.variable_selection(fractional, x); + i_t var; + if (this->bnb.settings_.reliability_branching != 0 && + this->worker.nodes_explored_snapshot > 0) { + auto& snap = this->worker.pc_snapshot; + + sb_update_callback_t on_sb_update = [&]( + i_t j, rounding_direction_t dir, f_t delta) { + snap.record_update( + j, dir, delta, this->worker.work_context.current_work(), this->worker.worker_id); + }; + + var = reliable_variable_selection_core(node, + fractional, + x, + this->bnb.settings_, + this->bnb.var_types_, + this->worker.leaf_problem, + this->worker.leaf_edge_norms, + this->worker.basis_factors, + this->worker.basic_list, + this->worker.nonbasic_list, + snap.sum_down_.data(), + snap.sum_up_.data(), + snap.num_down_.data(), + snap.num_up_.data(), + snap.n_vars(), + snap.strong_branching_lp_iter_, + this->worker.local_upper_bound, + (int64_t)this->worker.total_lp_iters_snapshot, + (int64_t)this->worker.nodes_explored_snapshot, + this->bnb.exploration_stats_.start_time, + this->bnb.pc_.reliability_branching_settings, + 1, + nullptr, + nullptr, + &this->worker.rng, + &this->worker.work_context, + on_sb_update); + } else { + var = this->worker.pc_snapshot.variable_selection(fractional, x); + } auto dir = martin_criteria(x[var], this->bnb.root_relax_soln_.x[var]); return {var, dir}; } @@ -1072,9 +1360,12 @@ struct deterministic_bfs_policy_t this->worker.enqueue_children_for_plunge(node->get_down_child(), node->get_up_child(), dir); break; case node_status_t::NUMERICAL: this->worker.record_numerical(node); break; + case node_status_t::PENDING: this->worker.plunge_stack.push_back(node); break; default: break; } - if (status != node_status_t::HAS_CHILDREN) { this->worker.recompute_bounds_and_basis = true; } + if (status != node_status_t::HAS_CHILDREN && status != node_status_t::PENDING) { + this->worker.recompute_bounds_and_basis = true; + } } void on_numerical_issue(mip_node_t* node) override @@ -1105,6 +1396,31 @@ struct deterministic_diving_policy_t const std::vector& x) override { if (obj < this->worker.local_upper_bound) { + f_t primal_err; + f_t bound_err; + i_t num_fractional; + bool cg = check_guess(this->bnb.original_lp_, + this->bnb.settings_, + this->bnb.var_types_, + x, + primal_err, + bound_err, + num_fractional); + if (!cg) { + this->bnb.settings_.log.printf( + "Rejecting infeasible diving integer solution: worker=%d node=%d depth=%d " + "obj=%.6e primal_err=%.6e bound_err=%.6e fractional=%d\n", + this->worker.worker_id, + node->creation_seq, + node->depth, + obj, + primal_err, + bound_err, + num_fractional); + return; + } + const f_t previous_local_upper = this->worker.local_upper_bound; + const int previous_seq = this->worker.next_solution_seq; this->worker.local_upper_bound = obj; this->worker.queue_integer_solution(obj, x, node->depth); } @@ -2017,6 +2333,18 @@ template mip_status_t branch_and_bound_t::solve(mip_solution_t& solution) { raft::common::nvtx::range scope("BB::solve"); + auto exploration_signal_guard = cuopt::scope_guard([this]() { + if (!exploration_started_.load()) { + std::lock_guard lock(exploration_started_mutex_); + exploration_started_ = true; + exploration_started_cv_.notify_all(); + } + }); + auto heuristic_preemption_guard = cuopt::scope_guard([this]() { + if (settings_.heuristic_preemption_callback != nullptr) { + settings_.heuristic_preemption_callback(); + } + }); logger_t log; log.log = false; @@ -2028,6 +2356,25 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut exploration_stats_.nodes_explored = 0; original_lp_.A.to_compressed_row(Arow_); + work_unit_scheduler_t* saved_scheduler = work_unit_context_.scheduler; + if (settings_.deterministic) { + work_unit_context_.deterministic = true; + cuopt_assert(settings_.bb_work_unit_scale > 0.0, "B&B work-unit scale must be positive"); + if (settings_.gpu_heur_wait_for_exploration) { + // Scale=0 during pre-exploration: root LP/cuts/SB don't advance the deterministic timeline. + // GPU heuristics start after exploration, so both timelines begin at 0 together. + work_unit_context_.work_unit_scale = 0.0; + } else { + // GPU heuristics race with B&B pre-exploration, so B&B work must advance normally. + work_unit_context_.work_unit_scale = BB_BASE_WORK_SCALE * settings_.bb_work_unit_scale; + } + + // Detach the scheduler during the serial root/cuts/SB phase. + // record_work_sync_on_horizon still accumulates global_work_units_elapsed, + // but avoids scheduler->on_work_recorded + work_unit_context_.scheduler = nullptr; + } + settings_.log.printf("Reduced cost strengthening enabled: %d\n", settings_.reduced_cost_strengthening); @@ -2047,14 +2394,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut const f_t computed_obj = compute_objective(original_lp_, crushed_guess); mutex_upper_.lock(); incumbent_.set_incumbent_solution(computed_obj, crushed_guess); - upper_bound_ = computed_obj; + upper_bound_ = std::min(upper_bound_.load(), computed_obj); mutex_upper_.unlock(); } } root_relax_soln_.resize(original_lp_.num_rows, original_lp_.num_cols); - if (settings_.clique_cuts != 0 && clique_table_ == nullptr) { + // TODO: ensure clique tables work well w/ determinism + if (settings_.clique_cuts != 0 && clique_table_ == nullptr && !settings_.deterministic) { signal_extend_cliques_.store(false, std::memory_order_release); typename ::cuopt::linear_programming::mip_solver_settings_t::tolerances_t tolerances_for_clique{}; @@ -2104,7 +2452,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut basic_list, nonbasic_list, root_vstatus_, - edge_norms_); + edge_norms_, + &work_unit_context_); } else { settings_.log.printf("\nSolving LP root relaxation in concurrent mode\n"); root_status = solve_root_relaxation(lp_settings, @@ -2118,6 +2467,10 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut solving_root_relaxation_ = false; exploration_stats_.total_lp_iters = root_relax_soln_.iterations; exploration_stats_.total_lp_solve_time = toc(exploration_stats_.start_time); + CUOPT_DETERMINISM_LOG(settings_.log, + "Post-root-LP work: %.16e iters=%d\n", + work_unit_context_.current_work(), + root_relax_soln_.iterations); auto finish_clique_thread = [this]() { if (clique_table_future_.valid()) { @@ -2163,7 +2516,18 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut assert(root_vstatus_.size() == original_lp_.num_cols); set_uninitialized_steepest_edge_norms(original_lp_, basic_list, edge_norms_); - root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + { + const f_t previous_root_objective = root_objective_; + root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic root objective assign: source=post_root_solve old=%.16e new=%.16e " + "x_hash=0x%x obj_hash=0x%x\n", + previous_root_objective, + root_objective_, + detail::compute_hash(root_relax_soln_.x), + detail::compute_hash(original_lp_.objective)); + } if (settings_.set_simplex_solution_callback != nullptr) { std::vector original_x; @@ -2395,7 +2759,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut nonbasic_list, root_relax_soln_, iter, - edge_norms_); + edge_norms_, + &work_unit_context_); exploration_stats_.total_lp_iters += iter; f_t dual_phase2_time = toc(dual_phase2_start_time); if (dual_phase2_time > 1.0) { @@ -2406,6 +2771,11 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut set_final_solution(solution, root_objective_); return solver_status_; } + if (cut_status == dual::status_t::WORK_LIMIT) { + solver_status_ = mip_status_t::WORK_LIMIT; + set_final_solution(solution, root_objective_); + return solver_status_; + } if (cut_status != dual::status_t::OPTIMAL) { settings_.log.printf("Numerical issue at root node. Resolving from scratch\n"); @@ -2418,12 +2788,25 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut basic_list, nonbasic_list, root_vstatus_, - edge_norms_); + edge_norms_, + &work_unit_context_); if (scratch_status == lp_status_t::OPTIMAL) { // We recovered cut_status = convert_lp_status_to_dual_status(scratch_status); exploration_stats_.total_lp_iters += root_relax_soln_.iterations; - root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + { + const f_t previous_root_objective = root_objective_; + root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic root objective assign: source=cut_lp_scratch old=%.16e new=%.16e " + "pass=%d x_hash=0x%x obj_hash=0x%x\n", + previous_root_objective, + root_objective_, + cut_pass, + detail::compute_hash(root_relax_soln_.x), + detail::compute_hash(original_lp_.objective)); + } } else { settings_.log.printf("Cut status %s\n", dual::status_to_string(cut_status).c_str()); #ifdef WRITE_CUT_INFEASIBLE_MPS @@ -2461,9 +2844,18 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut num_fractional = fractional_variables(settings_, root_relax_soln_.x, var_types_, fractional); if (num_fractional == 0) { - upper_bound_ = root_objective_; + const f_t previous_upper = upper_bound_; + upper_bound_ = std::min(upper_bound_.load(), root_objective_); mutex_upper_.lock(); incumbent_.set_incumbent_solution(root_objective_, root_relax_soln_.x); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic B&B incumbent update: source=root_integral_pass prev_upper=%.16e " + "new_upper=%.16e obj=%.16e hash=0x%x\n", + previous_upper, + upper_bound_.load(), + root_objective_, + detail::compute_hash(root_relax_soln_.x)); mutex_upper_.unlock(); } f_t obj = upper_bound_.load(); @@ -2523,7 +2915,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut basic_list, nonbasic_list, basis_update, - pc_); + pc_, + &work_unit_context_); } if (toc(exploration_stats_.start_time) > settings_.time_limit) { @@ -2605,6 +2998,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_); } if (settings_.deterministic) { + pre_exploration_work_ = work_unit_context_.current_work(); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Pre-exploration work breakdown: total=%.16e scale=%.6f deterministic=%d\n", + pre_exploration_work_, + work_unit_context_.work_unit_scale, + (int)work_unit_context_.deterministic); + work_unit_context_.scheduler = saved_scheduler; + work_unit_context_.work_unit_scale = BB_BASE_WORK_SCALE * settings_.bb_work_unit_scale; settings_.log.printf( " | Explored | Unexplored | Objective | Bound | IntInf | Depth | Iter/Node " "| Gap | Work | Time |\n"); @@ -2614,11 +3016,25 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut "| Gap | Time |\n"); } + // Signal to producers (like GPU heuristics) that pre-exploration work is finished + { + std::lock_guard lock(exploration_started_mutex_); + exploration_started_ = true; + } + exploration_started_cv_.notify_all(); + + int bb_device_id = 0; + RAFT_CUDA_TRY(cudaGetDevice(&bb_device_id)); + if (settings_.deterministic) { run_deterministic_coordinator(Arow_); } else if (settings_.num_threads > 1) { #pragma omp parallel num_threads(settings_.num_threads) { + // Any OMP thread may end up holding the lock during horizon syncs, and thus + // handle publication of solutions to the callback. Uncrush to the original problem requires + // GPU ops so ensure all threads call cudaSetDevice at init + RAFT_CUDA_TRY(cudaSetDevice(bb_device_id)); #pragma omp master run_scheduler(); } @@ -2633,6 +3049,13 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut if (deterministic_mode_enabled_) { lower_bound = deterministic_compute_lower_bound(); solver_status_ = deterministic_global_termination_status_; + CUOPT_DETERMINISM_LOG( + settings_.log, + "Final lower bound: user_lb=%.16e user_ub=%.16e status=%d has_incumbent=%d\n", + compute_user_objective(original_lp_, lower_bound), + compute_user_objective(original_lp_, upper_bound_.load()), + (int)deterministic_global_termination_status_, + (int)incumbent_.has_incumbent); } else { if (node_queue_.best_first_queue_size() > 0) { // We need to clear the queue and use the info in the search tree for the lower bound @@ -2786,8 +3209,7 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri deterministic_horizon_step_ = 0.50; - // Compute worker counts using the same formula as reliability-branching scheduler - const i_t num_workers = 2 * settings_.num_threads; + const i_t num_workers = settings_.num_threads; std::vector search_strategies = get_search_strategies(settings_.diving_settings); std::array max_num_workers = @@ -2800,7 +3222,7 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri } deterministic_mode_enabled_ = true; - deterministic_current_horizon_ = deterministic_horizon_step_; + deterministic_current_horizon_ = pre_exploration_work_ + deterministic_horizon_step_; deterministic_horizon_number_ = 0; deterministic_global_termination_status_ = mip_status_t::UNSET; @@ -2828,14 +3250,17 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri } } - deterministic_scheduler_ = std::make_unique(deterministic_horizon_step_); + deterministic_scheduler_ = + std::make_unique(deterministic_horizon_step_, pre_exploration_work_); scoped_context_registrations_t context_registrations(*deterministic_scheduler_); for (auto& worker : *deterministic_workers_) { + worker.work_context.set_current_work(pre_exploration_work_, false); context_registrations.add(worker.work_context); } if (deterministic_diving_workers_) { for (auto& worker : *deterministic_diving_workers_) { + worker.work_context.set_current_work(pre_exploration_work_, false); context_registrations.add(worker.work_context); } } @@ -2843,8 +3268,9 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri int actual_diving_workers = deterministic_diving_workers_ ? (int)deterministic_diving_workers_->size() : 0; settings_.log.printf( - "Deterministic Mode: %d BFS workers + %d diving workers, horizon step = %.2f work " - "units\n", + "Deterministic Mode: %d total threads split as %d BFS workers + %d diving workers, " + "horizon step = %.2f work units\n", + num_workers, num_bfs_workers, actual_diving_workers, deterministic_horizon_step_); @@ -2868,9 +3294,12 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri } const int total_thread_count = num_bfs_workers + num_diving_workers; + int coordinator_device_id = 0; + RAFT_CUDA_TRY(cudaGetDevice(&coordinator_device_id)); #pragma omp parallel num_threads(total_thread_count) { + RAFT_CUDA_TRY(cudaSetDevice(coordinator_device_id)); int thread_id = omp_get_thread_num(); if (thread_id < num_bfs_workers) { auto& worker = (*deterministic_workers_)[thread_id]; @@ -2976,11 +3405,17 @@ void branch_and_bound_t::run_deterministic_bfs_loop( bool is_child = (node->parent == worker.last_solved_node); worker.recompute_bounds_and_basis = !is_child; - node_status_t status = solve_node_deterministic(worker, node, search_tree); - worker.last_solved_node = node; + node_status_t status = solve_node_deterministic(worker, node, search_tree); + worker.current_node = nullptr; - worker.current_node = nullptr; - continue; + if (status == node_status_t::PENDING) { + // Global termination limits were hit (TIME_LIMIT/WORK_LIMIT). Node was re-enqueued by + // on_node_completed. Fall through to sync barrier and let the sync callback handle + // termination. + } else { + worker.last_solved_node = node; + continue; + } } // No work - advance to sync point to participate in barrier @@ -3005,24 +3440,40 @@ void branch_and_bound_t::deterministic_sync_callback() max_producer_wait_time_ = std::max(max_producer_wait_time_, wait_time); ++producer_wait_count_; - work_unit_context_.global_work_units_elapsed = horizon_end; + work_unit_context_.set_current_work(horizon_end, false); - bb_event_batch_t all_events = deterministic_workers_->collect_and_sort_events(); + { + std::string worker_clocks_str; + for (const auto& w : *deterministic_workers_) { + worker_clocks_str += std::to_string(w.worker_id) + ":" + + std::to_string(w.work_context.current_work()) + "/" + + std::to_string(w.integer_solutions.size()) + " "; + } + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic sync #%d: horizon=%.6f pre_expl=%.6f heur_q=%zu workers=[%s]\n", + deterministic_horizon_number_, + deterministic_current_horizon_, + pre_exploration_work_, + heuristic_solution_queue_.size(), + worker_clocks_str.c_str()); + } - deterministic_sort_replay_events(all_events); + bb_event_batch_t all_events = deterministic_workers_->collect_and_sort_events(); - // deterministic_prune_worker_nodes_vs_incumbent(); + std::vector::deterministic_replay_solution_t> + replay_solutions; + deterministic_collect_worker_solutions( + *deterministic_workers_, + [](const deterministic_bfs_worker_pool_t&, int) { + return search_strategy_t::BEST_FIRST; + }, + replay_solutions); + deterministic_collect_diving_solutions_and_update_pseudocosts(replay_solutions); - deterministic_collect_diving_solutions_and_update_pseudocosts(); + deterministic_sort_replay_events(all_events, replay_solutions); - for (auto& worker : *deterministic_workers_) { - worker.integer_solutions.clear(); - } - if (deterministic_diving_workers_) { - for (auto& worker : *deterministic_diving_workers_) { - worker.integer_solutions.clear(); - } - } + // deterministic_prune_worker_nodes_vs_incumbent(); deterministic_populate_diving_heap(); @@ -3079,6 +3530,19 @@ void branch_and_bound_t::deterministic_sync_callback() f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound, lower_bound); f_t rel_gap = user_relative_gap(original_lp_, upper_bound, lower_bound); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Sync termination check: horizon=%.6f user_lower=%.16e user_upper=%.16e abs_gap=%.6e " + "rel_gap=%.6e bfs_has_work=%d diving_has_work=%d status=%d\n", + deterministic_current_horizon_, + compute_user_objective(original_lp_, lower_bound), + compute_user_objective(original_lp_, upper_bound), + abs_gap, + rel_gap, + (int)deterministic_workers_->any_has_work(), + deterministic_diving_workers_ ? (int)deterministic_diving_workers_->any_has_work() : -1, + (int)deterministic_global_termination_status_); + if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { deterministic_global_termination_status_ = mip_status_t::OPTIMAL; } @@ -3167,7 +3631,12 @@ node_status_t branch_and_bound_t::solve_node_deterministic( simplex_solver_settings_t lp_settings = settings_; lp_settings.set_log(false); - lp_settings.cut_off = worker.local_upper_bound + settings_.dual_tol; + if (original_lp_.objective_is_integral) { + lp_settings.cut_off = + std::ceil(worker.local_upper_bound - settings_.integer_tol) + settings_.dual_tol; + } else { + lp_settings.cut_off = worker.local_upper_bound + settings_.dual_tol; + } lp_settings.inside_mip = 2; lp_settings.time_limit = remaining_time; lp_settings.scale_columns = false; @@ -3199,7 +3668,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( std::vector& leaf_vstatus = node_ptr->vstatus; i_t node_iter = 0; f_t lp_start_time = tic(); - std::vector leaf_edge_norms = edge_norms_; + worker.leaf_edge_norms = edge_norms_; dual::status_t lp_status = dual_phase2_with_advanced_basis(2, 0, @@ -3213,7 +3682,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( worker.nonbasic_list, worker.leaf_solution, node_iter, - leaf_edge_norms, + worker.leaf_edge_norms, &worker.work_context); if (lp_status == dual::status_t::NUMERICAL) { @@ -3226,18 +3695,20 @@ node_status_t branch_and_bound_t::solve_node_deterministic( worker.basic_list, worker.nonbasic_list, leaf_vstatus, - leaf_edge_norms, + worker.leaf_edge_norms, &worker.work_context); lp_status = convert_lp_status_to_dual_status(second_status); } - double work_performed = worker.work_context.global_work_units_elapsed - work_units_at_start; - worker.clock += work_performed; - exploration_stats_.total_lp_solve_time += toc(lp_start_time); exploration_stats_.total_lp_iters += node_iter; - ++exploration_stats_.nodes_explored; - --exploration_stats_.nodes_unexplored; + + bool lp_conclusive = + (lp_status != dual::status_t::TIME_LIMIT && lp_status != dual::status_t::WORK_LIMIT); + if (lp_conclusive) { + ++exploration_stats_.nodes_explored; + --exploration_stats_.nodes_unexplored; + } deterministic_bfs_policy_t policy{*this, worker}; auto [status, round_dir] = update_tree_impl(node_ptr, search_tree, &worker, lp_status, policy); @@ -3247,58 +3718,17 @@ node_status_t branch_and_bound_t::solve_node_deterministic( template template -void branch_and_bound_t::deterministic_process_worker_solutions( - PoolT& pool, WorkerTypeGetter get_worker_type) +void branch_and_bound_t::deterministic_collect_worker_solutions( + PoolT& pool, + WorkerTypeGetter get_worker_type, + std::vector::deterministic_replay_solution_t>& + replay_solutions) { - std::vector*> all_solutions; for (auto& worker : pool) { for (auto& sol : worker.integer_solutions) { - all_solutions.push_back(&sol); + const search_strategy_t strategy = get_worker_type(pool, sol.worker_id); + replay_solutions.push_back({std::move(sol), strategy}); } - } - - // relies on queued_integer_solution_t's operator< - // sorts based on objective first, then the tuple - std::sort(all_solutions.begin(), - all_solutions.end(), - [](const queued_integer_solution_t* a, - const queued_integer_solution_t* b) { return *a < *b; }); - - f_t deterministic_lower = deterministic_compute_lower_bound(); - f_t current_upper = upper_bound_.load(); - - for (const auto* sol : all_solutions) { - if (sol->objective < current_upper) { - f_t user_obj = compute_user_objective(original_lp_, sol->objective); - f_t user_lower = compute_user_objective(original_lp_, deterministic_lower); - i_t nodes_explored = exploration_stats_.nodes_explored.load(); - i_t nodes_unexplored = exploration_stats_.nodes_unexplored.load(); - - search_strategy_t worker_type = get_worker_type(pool, sol->worker_id); - report(feasible_solution_symbol(worker_type), - sol->objective, - deterministic_lower, - sol->depth, - 0, - deterministic_current_horizon_); - - bool improved = false; - if (improves_incumbent(sol->objective)) { - upper_bound_ = std::min(upper_bound_.load(), sol->objective); - incumbent_.set_incumbent_solution(sol->objective, sol->solution); - current_upper = sol->objective; - improved = true; - } - - if (improved && settings_.solution_callback != nullptr) { - std::vector original_x; - uncrush_primal_solution(original_problem_, original_lp_, sol->solution, original_x); - settings_.solution_callback(original_x, sol->objective); - } - } - } - - for (auto& worker : pool) { worker.integer_solutions.clear(); } } @@ -3308,12 +3738,17 @@ template void branch_and_bound_t::deterministic_merge_pseudo_cost_updates(PoolT& pool) { std::vector> all_pc_updates; + int64_t sb_iter_delta = 0; + int64_t base_sb = pc_.strong_branching_lp_iter.load(); for (auto& worker : pool) { auto updates = worker.pc_snapshot.take_updates(); all_pc_updates.insert(all_pc_updates.end(), updates.begin(), updates.end()); + int64_t snapshot_sb = worker.pc_snapshot.strong_branching_lp_iter_; + sb_iter_delta += snapshot_sb - base_sb; } std::sort(all_pc_updates.begin(), all_pc_updates.end()); pc_.merge_updates(all_pc_updates); + pc_.strong_branching_lp_iter += sb_iter_delta; } template @@ -3324,6 +3759,7 @@ void branch_and_bound_t::deterministic_broadcast_snapshots( deterministic_snapshot_t snap; snap.upper_bound = upper_bound_.load(); snap.total_lp_iters = exploration_stats_.total_lp_iters.load(); + snap.nodes_explored = exploration_stats_.nodes_explored.load(); snap.incumbent = incumbent_snapshot; snap.pc_snapshot = pc_.create_snapshot(); @@ -3334,91 +3770,158 @@ void branch_and_bound_t::deterministic_broadcast_snapshots( template void branch_and_bound_t::deterministic_sort_replay_events( - const bb_event_batch_t& events) + const bb_event_batch_t& events, + std::vector::deterministic_replay_solution_t>& + replay_solutions) { - // Infeasible solutions from GPU heuristics are queued for repair; process them now + // Retire external solutions that have reached the current horizon. Feasibility + // classification and repair happen only here in deterministic mode. { - std::vector> to_repair; - // TODO: support repair queue in deterministic mode - // mutex_repair_.lock(); - // if (repair_queue_.size() > 0) { - // to_repair = repair_queue_; - // repair_queue_.clear(); - // } - // mutex_repair_.unlock(); - - std::sort(to_repair.begin(), - to_repair.end(), - [](const std::vector& a, const std::vector& b) { return a < b; }); - - if (to_repair.size() > 0) { - settings_.log.debug("Deterministic sync: Attempting to repair %ld injected solutions\n", - to_repair.size()); - for (const std::vector& uncrushed_solution : to_repair) { - std::vector crushed_solution; - crush_primal_solution( - original_problem_, original_lp_, uncrushed_solution, new_slacks_, crushed_solution); - std::vector repaired_solution; - f_t repaired_obj; - bool success = - repair_solution(edge_norms_, crushed_solution, repaired_obj, repaired_solution); - if (success) { - // Queue repaired solution with work unit timestamp (...workstamp?) - mutex_heuristic_queue_.lock(); - heuristic_solution_queue_.push_back( - {repaired_obj, std::move(repaired_solution), 0, -1, 0, deterministic_current_horizon_}); - mutex_heuristic_queue_.unlock(); + std::vector due_solutions; + mutex_heuristic_queue_.lock(); + { + std::vector future_solutions; + for (auto& sol : heuristic_solution_queue_) { + if (sol.work_timestamp < deterministic_current_horizon_) { + due_solutions.push_back(std::move(sol)); + } else { + future_solutions.push_back(std::move(sol)); } } + heuristic_solution_queue_ = std::move(future_solutions); } - } - - // Extract heuristic solutions, keeping future solutions for next horizon - // Use deterministic_current_horizon_ as the upper bound (horizon_end) - std::vector> heuristic_solutions; - mutex_heuristic_queue_.lock(); - { - std::vector> future_solutions; - for (auto& sol : heuristic_solution_queue_) { - if (sol.work_timestamp < deterministic_current_horizon_) { - heuristic_solutions.push_back(std::move(sol)); - } else { - future_solutions.push_back(std::move(sol)); + mutex_heuristic_queue_.unlock(); + + std::sort(due_solutions.begin(), + due_solutions.end(), + [](const queued_external_solution_t& a, const queued_external_solution_t& b) { + if (a.work_timestamp != b.work_timestamp) { + return a.work_timestamp < b.work_timestamp; + } + if (a.user_objective != b.user_objective) { + return a.user_objective < b.user_objective; + } + if (a.origin != b.origin) { return a.origin < b.origin; } + return a.solution < b.solution; + }); + + if (!due_solutions.empty() || !heuristic_solution_queue_.empty()) { + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic sync retire: horizon=%.6f due=%zu future=%zu pre_expl=%.6f\n", + deterministic_current_horizon_, + due_solutions.size(), + heuristic_solution_queue_.size(), + pre_exploration_work_); + for (size_t i = 0; i < due_solutions.size(); ++i) { + CUOPT_DETERMINISM_LOG( + settings_.log, + " due[%zu]: wut=%.6f obj=%g origin=%s\n", + i, + due_solutions[i].work_timestamp, + due_solutions[i].user_objective, + cuopt::internals::mip_solution_origin_to_string(due_solutions[i].origin)); + } + } + if (!due_solutions.empty()) { + CUOPT_DETERMINISM_LOG(settings_.log, + "Deterministic sync: retiring %ld external solutions\n", + due_solutions.size()); + for (const auto& queued_solution : due_solutions) { + auto [feasible, obj, crushed] = retire_queued_solution(queued_solution); + if (feasible) { + replay_solutions.push_back({{obj, + std::move(crushed), + 0, + -1, + 0, + queued_solution.work_timestamp, + queued_solution.origin}, + search_strategy_t::BEST_FIRST}); + } } } - heuristic_solution_queue_ = std::move(future_solutions); } - mutex_heuristic_queue_.unlock(); + if (!replay_solutions.empty() || !heuristic_solution_queue_.empty()) { + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic replay extract: horizon=%.6f now=%zu future=%zu upper=%.16e\n", + deterministic_current_horizon_, + replay_solutions.size(), + heuristic_solution_queue_.size(), + upper_bound_.load()); + } - // sort by work unit timestamp, with objective and solution values as tie-breakers - std::sort( - heuristic_solutions.begin(), - heuristic_solutions.end(), - [](const queued_integer_solution_t& a, const queued_integer_solution_t& b) { - if (a.work_timestamp != b.work_timestamp) { return a.work_timestamp < b.work_timestamp; } - if (a.objective != b.objective) { return a.objective < b.objective; } - return a.solution < b.solution; // edge-case - lexicographical comparison - }); + // Sort the full replay stream by work unit timestamp, with stable deterministic tie-breakers. + std::sort(replay_solutions.begin(), replay_solutions.end(), [](const auto& a, const auto& b) { + if (a.solution.work_timestamp != b.solution.work_timestamp) { + return a.solution.work_timestamp < b.solution.work_timestamp; + } + if (a.solution.objective != b.solution.objective) { + return a.solution.objective < b.solution.objective; + } + if (a.solution.origin != b.solution.origin) { return a.solution.origin < b.solution.origin; } + if (a.solution.worker_id != b.solution.worker_id) { + return a.solution.worker_id < b.solution.worker_id; + } + if (a.solution.sequence_id != b.solution.sequence_id) { + return a.solution.sequence_id < b.solution.sequence_id; + } + return a.solution.solution < b.solution.solution; + }); - // Merge B&B events and heuristic solutions for unified timeline replay - size_t event_idx = 0; - size_t heuristic_idx = 0; + f_t deterministic_lower = deterministic_compute_lower_bound(); + f_t current_upper = upper_bound_.load(); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Sync replay begin: horizon=%.6f n_events=%zu n_solutions=%zu user_upper_before=%.16e\n", + deterministic_current_horizon_, + events.events.size(), + replay_solutions.size(), + compute_user_objective(original_lp_, current_upper)); + if (deterministic_current_horizon_ <= deterministic_horizon_step_) { + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic solution replay: candidates=%zu lower=%.16e upper_before=%.16e\n", + replay_solutions.size(), + deterministic_lower, + current_upper); + for (size_t i = 0; i < replay_solutions.size(); ++i) { + const auto& replay = replay_solutions[i]; + const auto& sol = replay.solution; + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic replay solution[%zu]: wut=%.6f obj=%.16e origin=%s worker=%d seq=%d " + "depth=%d sol_hash=0x%x\n", + i, + sol.work_timestamp, + sol.objective, + cuopt::internals::mip_solution_origin_to_string(sol.origin), + sol.worker_id, + sol.sequence_id, + sol.depth, + detail::compute_hash(sol.solution)); + } + } - while (event_idx < events.events.size() || heuristic_idx < heuristic_solutions.size()) { - bool process_event = false; - bool process_heuristic = false; + // Merge B&B events and all incumbent-producing solutions for unified timeline replay. + size_t event_idx = 0; + size_t solution_idx = 0; + + while (event_idx < events.events.size() || solution_idx < replay_solutions.size()) { + bool process_event = false; + bool process_solution = false; if (event_idx >= events.events.size()) { - process_heuristic = true; - } else if (heuristic_idx >= heuristic_solutions.size()) { + process_solution = true; + } else if (solution_idx >= replay_solutions.size()) { process_event = true; } else { - // Both have items - pick the one with smaller WUT if (events.events[event_idx].work_timestamp <= - heuristic_solutions[heuristic_idx].work_timestamp) { + replay_solutions[solution_idx].solution.work_timestamp) { process_event = true; } else { - process_heuristic = true; + process_solution = true; } } @@ -3433,42 +3936,80 @@ void branch_and_bound_t::deterministic_sort_replay_events( } } - if (process_heuristic) { - const auto& hsol = heuristic_solutions[heuristic_idx++]; - - CUOPT_LOG_TRACE( - "Deterministic sync: Heuristic solution received at WUT %f with objective %g, current " - "horizon %f", - hsol.work_timestamp, - hsol.objective, - deterministic_current_horizon_); - - // Process heuristic solution at its correct work unit timestamp position - f_t new_upper = std::numeric_limits::infinity(); + if (process_solution) { + const auto& replay = replay_solutions[solution_idx++]; + const auto& sol = replay.solution; + bool improved = false; - if (improves_incumbent(hsol.objective)) { - upper_bound_ = std::min(upper_bound_.load(), hsol.objective); - incumbent_.set_incumbent_solution(hsol.objective, hsol.solution); - new_upper = hsol.objective; + if (improves_incumbent(sol.objective)) { + const f_t previous_upper = upper_bound_; + upper_bound_ = std::min(upper_bound_.load(), sol.objective); + incumbent_.set_incumbent_solution(sol.objective, sol.solution); + current_upper = sol.objective; + improved = true; + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic B&B incumbent update: source=det_replay prev_upper=%.16e " + "new_upper=%.16e obj=%.16e hash=0x%x worker=%d seq=%d wut=%.6f horizon=%.6f\n", + previous_upper, + upper_bound_.load(), + sol.objective, + detail::compute_hash(sol.solution), + sol.worker_id, + sol.sequence_id, + sol.work_timestamp, + deterministic_current_horizon_); } - - if (new_upper < std::numeric_limits::infinity()) { - report_heuristic(new_upper); - - if (settings_.solution_callback != nullptr) { - std::vector original_x; - uncrush_primal_solution(original_problem_, original_lp_, hsol.solution, original_x); - settings_.solution_callback(original_x, hsol.objective); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic replay: horizon=%.6f wut=%.6f obj=%.16e origin=%s accepted=%d " + "upper_now=%.16e worker=%d seq=%d sol_hash=0x%x\n", + deterministic_current_horizon_, + sol.work_timestamp, + sol.objective, + cuopt::internals::mip_solution_origin_to_string(sol.origin), + (int)improved, + current_upper, + sol.worker_id, + sol.sequence_id, + detail::compute_hash(sol.solution)); + + if (improved) { + CUOPT_DETERMINISM_LOG( + settings_.log, + "Deterministic replay PUBLISH: horizon=%.6f wut=%.6f obj=%g origin=%s worker=%d " + "upper_after=%.16e\n", + deterministic_current_horizon_, + sol.work_timestamp, + compute_user_objective(original_lp_, sol.objective), + cuopt::internals::mip_solution_origin_to_string(sol.origin), + sol.worker_id, + current_upper); + if (sol.origin == cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE || + sol.origin == cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_DIVING) { + report(feasible_solution_symbol(replay.strategy), + sol.objective, + deterministic_lower, + sol.depth, + 0, + deterministic_current_horizon_); + } else { + report_heuristic(sol.objective, sol.work_timestamp); } + emit_solution_callback_from_crushed( + sol.solution, sol.objective, sol.origin, sol.work_timestamp); } } } - // Merge integer solutions from BFS workers and update global incumbent - deterministic_process_worker_solutions(*deterministic_workers_, - [](const deterministic_bfs_worker_pool_t&, int) { - return search_strategy_t::BEST_FIRST; - }); + CUOPT_DETERMINISM_LOG( + settings_.log, + "Sync replay done: horizon=%.6f user_upper_after=%.16e events_processed=%zu " + "solutions_processed=%zu\n", + deterministic_current_horizon_, + compute_user_objective(original_lp_, upper_bound_.load()), + event_idx, + solution_idx); // Merge and apply pseudo-cost updates from BFS workers deterministic_merge_pseudo_cost_updates(*deterministic_workers_); @@ -3525,52 +4066,44 @@ void branch_and_bound_t::deterministic_balance_worker_loads() constexpr bool force_rebalance_every_sync = false; - // Count work for each worker: current_node (if any) + plunge_stack + backlog - std::vector work_counts(num_workers); - size_t total_work = 0; - size_t max_work = 0; - size_t min_work = std::numeric_limits::max(); + std::vector backlog_counts(num_workers); + size_t total_backlog = 0; + size_t max_backlog = 0; + size_t min_backlog = std::numeric_limits::max(); for (size_t w = 0; w < num_workers; ++w) { - auto& worker = (*deterministic_workers_)[w]; - work_counts[w] = worker.queue_size(); - total_work += work_counts[w]; - max_work = std::max(max_work, work_counts[w]); - min_work = std::min(min_work, work_counts[w]); + auto& worker = (*deterministic_workers_)[w]; + backlog_counts[w] = worker.backlog.size(); + total_backlog += backlog_counts[w]; + max_backlog = std::max(max_backlog, backlog_counts[w]); + min_backlog = std::min(min_backlog, backlog_counts[w]); } - if (total_work == 0) return; + if (total_backlog == 0) return; bool needs_balance; if (force_rebalance_every_sync) { - needs_balance = (total_work > 1); + needs_balance = (total_backlog > 1); } else { - needs_balance = (min_work == 0 && max_work >= 2) || (min_work > 0 && max_work > 4 * min_work); + needs_balance = + (min_backlog == 0 && max_backlog >= 2) || (min_backlog > 0 && max_backlog > 4 * min_backlog); } if (!needs_balance) return; - std::vector*> all_nodes; + std::vector*> all_backlog_nodes; for (auto& worker : *deterministic_workers_) { for (auto* node : worker.backlog.data()) { - all_nodes.push_back(node); + all_backlog_nodes.push_back(node); } worker.backlog.clear(); } - if (all_nodes.empty()) return; - - auto deterministic_less = [](const mip_node_t* a, const mip_node_t* b) { - if (a->origin_worker_id != b->origin_worker_id) { - return a->origin_worker_id < b->origin_worker_id; - } - return a->creation_seq < b->creation_seq; - }; - std::sort(all_nodes.begin(), all_nodes.end(), deterministic_less); + if (all_backlog_nodes.empty()) return; - // Distribute nodes - for (size_t i = 0; i < all_nodes.size(); ++i) { + // Round-robin distribute into backlogs; priority queue handles ordering internally + for (size_t i = 0; i < all_backlog_nodes.size(); ++i) { size_t worker_idx = i % num_workers; - (*deterministic_workers_)[worker_idx].enqueue_node(all_nodes[i]); + (*deterministic_workers_)[worker_idx].backlog.push(all_backlog_nodes[i]); } } @@ -3598,11 +4131,33 @@ f_t branch_and_bound_t::deterministic_compute_lower_bound() } } + f_t min_from_workers = lower_bound; + // Tree is exhausted if (lower_bound == std::numeric_limits::infinity() && incumbent_.has_incumbent) { lower_bound = upper_bound_.load(); } + lower_bound = std::min(lower_bound, upper_bound_.load()); + + CUOPT_DETERMINISM_LOG( + settings_.log, + "compute_lower_bound: user_min_bfs=%.16e user_upper=%.16e user_result=%.16e " + "has_incumbent=%d n_bfs_nodes=%d\n", + compute_user_objective(original_lp_, min_from_workers), + compute_user_objective(original_lp_, upper_bound_.load()), + compute_user_objective(original_lp_, lower_bound), + (int)incumbent_.has_incumbent, + [&]() { + int count = 0; + for (const auto& w : *deterministic_workers_) { + count += (w.current_node != nullptr ? 1 : 0); + count += (int)w.plunge_stack.size(); + count += (int)w.backlog.size(); + } + return count; + }()); + return lower_bound; } @@ -3690,19 +4245,27 @@ void branch_and_bound_t::deterministic_assign_diving_nodes() } template -void branch_and_bound_t::deterministic_collect_diving_solutions_and_update_pseudocosts() +void branch_and_bound_t::deterministic_collect_diving_solutions_and_update_pseudocosts( + std::vector::deterministic_replay_solution_t>& + replay_solutions) { if (!deterministic_diving_workers_) return; - // Collect integer solutions from diving workers and update global incumbent - deterministic_process_worker_solutions( + deterministic_collect_worker_solutions( *deterministic_diving_workers_, [](const deterministic_diving_worker_pool_t& pool, int worker_id) { return pool[worker_id].diving_type; - }); + }, + replay_solutions); // Merge pseudo-cost updates from diving workers deterministic_merge_pseudo_cost_updates(*deterministic_diving_workers_); + + for (auto& worker : *deterministic_diving_workers_) { + i_t delta = worker.total_nodes_explored - worker.nodes_explored_last_sync; + worker.nodes_explored_last_sync = worker.total_nodes_explored; + exploration_stats_.nodes_explored += delta; + } } template @@ -3777,7 +4340,12 @@ void branch_and_bound_t::deterministic_dive( // Setup LP settings simplex_solver_settings_t lp_settings = settings_; lp_settings.set_log(false); - lp_settings.cut_off = worker.local_upper_bound + settings_.dual_tol; + if (original_lp_.objective_is_integral) { + lp_settings.cut_off = + std::ceil(worker.local_upper_bound - settings_.integer_tol) + settings_.dual_tol; + } else { + lp_settings.cut_off = worker.local_upper_bound + settings_.dual_tol; + } lp_settings.inside_mip = 2; lp_settings.time_limit = remaining_time; lp_settings.scale_columns = false; @@ -3787,7 +4355,6 @@ void branch_and_bound_t::deterministic_dive( lp_settings, worker.bounds_changed, worker.leaf_problem.lower, worker.leaf_problem.upper); if (settings_.deterministic) { - // TEMP APPROXIMATION; worker.work_context.record_work_sync_on_horizon(worker.node_presolver.last_nnz_processed / 1e8); } @@ -3841,17 +4408,16 @@ void branch_and_bound_t::deterministic_dive( lp_status = convert_lp_status_to_dual_status(second_status); } - ++nodes_this_dive; - ++worker.total_nodes_explored; worker.lp_iters_this_dive += node_iter; - worker.clock = worker.work_context.global_work_units_elapsed; - if (lp_status == dual::status_t::TIME_LIMIT || lp_status == dual::status_t::WORK_LIMIT || lp_status == dual::status_t::ITERATION_LIMIT) { break; } + ++nodes_this_dive; + ++worker.total_nodes_explored; + deterministic_diving_policy_t policy{*this, worker, stack, max_backtrack_depth}; update_tree_impl(node_ptr, dive_tree, &worker, lp_status, policy); } diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp index f2917ba930..7dec38b640 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.hpp +++ b/cpp/src/branch_and_bound/branch_and_bound.hpp @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -35,9 +36,12 @@ #include #include +#include #include #include #include +#include +#include #include namespace cuopt::linear_programming::detail { @@ -108,10 +112,15 @@ class branch_and_bound_t { } // Set a solution based on the user problem during the course of the solve - void set_new_solution(const std::vector& solution); + void set_new_solution(const std::vector& solution, + cuopt::internals::mip_solution_origin_t origin = + cuopt::internals::mip_solution_origin_t::UNKNOWN); // This queues the solution to be processed at the correct work unit timestamp - void queue_external_solution_deterministic(const std::vector& solution, double work_unit_ts); + void queue_external_solution_deterministic(const std::vector& solution, + f_t user_objective, + double work_unit_ts, + cuopt::internals::mip_solution_origin_t origin); void set_user_bound_callback(std::function callback) { @@ -157,6 +166,12 @@ class branch_and_bound_t { // Get producer sync for external heuristics (e.g., CPUFJ) to register producer_sync_t& get_producer_sync() { return producer_sync_; } + void wait_for_exploration_start() + { + std::unique_lock lock(exploration_started_mutex_); + exploration_started_cv_.wait(lock, [this] { return exploration_started_.load(); }); + } + private: const user_problem_t& original_problem_; const simplex_solver_settings_t settings_; @@ -166,6 +181,10 @@ class branch_and_bound_t { std::atomic signal_extend_cliques_{false}; work_limit_context_t work_unit_context_{"B&B"}; + double pre_exploration_work_{0.0}; + std::atomic exploration_started_{false}; + std::mutex exploration_started_mutex_; + std::condition_variable exploration_started_cv_; // Initial guess. std::vector guess_; @@ -214,7 +233,13 @@ class branch_and_bound_t { // Mutex for repair omp_mutex_t mutex_repair_; - std::vector> repair_queue_; + struct queued_repair_solution_t { + std::vector solution; + cuopt::internals::mip_solution_origin_t origin{ + cuopt::internals::mip_solution_origin_t::UNKNOWN}; + double work_timestamp{-1.0}; + }; + std::vector repair_queue_; // Variables for the root node in the search tree. std::vector root_vstatus_; @@ -262,13 +287,21 @@ class branch_and_bound_t { omp_atomic_t lower_bound_ceiling_; std::function user_bound_callback_; - void report_heuristic(f_t obj); + void report_heuristic(f_t obj, double work_time = -1.0); void report(char symbol, f_t obj, f_t lower_bound, i_t node_depth, i_t node_int_infeas, double work_time = -1); + void emit_solution_callback(std::vector& original_x, + f_t objective, + cuopt::internals::mip_solution_origin_t origin, + double work_timestamp); + void emit_solution_callback_from_crushed(const std::vector& crushed_solution, + f_t objective, + cuopt::internals::mip_solution_origin_t origin, + double work_timestamp); // Set the solution when found at the root node void set_solution_at_root(mip_solution_t& solution, @@ -341,7 +374,14 @@ class branch_and_bound_t { void run_deterministic_coordinator(const csr_matrix_t& Arow); // Gather all events generated, sort by WU timestamp, apply - void deterministic_sort_replay_events(const bb_event_batch_t& events); + struct deterministic_replay_solution_t { + queued_integer_solution_t solution; + search_strategy_t strategy{search_strategy_t::BEST_FIRST}; + }; + + void deterministic_sort_replay_events( + const bb_event_batch_t& events, + std::vector& replay_solutions); // Prune nodes held by workers based on new incumbent void deterministic_prune_worker_nodes_vs_incumbent(); @@ -374,10 +414,14 @@ class branch_and_bound_t { void deterministic_assign_diving_nodes(); // Collect and merge diving solutions at sync - void deterministic_collect_diving_solutions_and_update_pseudocosts(); + void deterministic_collect_diving_solutions_and_update_pseudocosts( + std::vector& replay_solutions); template - void deterministic_process_worker_solutions(PoolT& pool, WorkerTypeGetter get_worker_type); + void deterministic_collect_worker_solutions( + PoolT& pool, + WorkerTypeGetter get_worker_type, + std::vector& replay_solutions); template void deterministic_merge_pseudo_cost_updates(PoolT& pool); @@ -408,10 +452,22 @@ class branch_and_bound_t { double max_producer_wait_time_{0.0}; i_t producer_wait_count_{0}; - // Determinism heuristic solution queue - solutions received from GPU heuristics - // Stored with work unit timestamp for deterministic ordering + struct queued_external_solution_t { + std::vector solution; + f_t user_objective{std::numeric_limits::infinity()}; + double work_timestamp{0.0}; + cuopt::internals::mip_solution_origin_t origin{ + cuopt::internals::mip_solution_origin_t::UNKNOWN}; + }; + + std::tuple> retire_queued_solution( + const queued_external_solution_t& queued_solution); + + // Deterministic pending external solution queue. + // External solutions stay raw until their retirement horizon, where they are + // crushed, checked, and repaired immediately if needed. omp_mutex_t mutex_heuristic_queue_; - std::vector> heuristic_solution_queue_; + std::vector heuristic_solution_queue_; // ============================================================================ // Determinism Diving state diff --git a/cpp/src/branch_and_bound/deterministic_workers.hpp b/cpp/src/branch_and_bound/deterministic_workers.hpp index 7a074051c6..b90706285b 100644 --- a/cpp/src/branch_and_bound/deterministic_workers.hpp +++ b/cpp/src/branch_and_bound/deterministic_workers.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -44,6 +45,8 @@ struct queued_integer_solution_t { int worker_id{-1}; int sequence_id{0}; double work_timestamp{0.0}; + cuopt::internals::mip_solution_origin_t origin{ + cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE}; bool operator<(const queued_integer_solution_t& other) const { @@ -59,6 +62,7 @@ struct deterministic_snapshot_t { pseudo_cost_snapshot_t pc_snapshot; std::vector incumbent; i_t total_lp_iters; + i_t nodes_explored; }; template @@ -66,7 +70,6 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t { using base_t = branch_and_bound_worker_t; public: - double clock{0.0}; work_limit_context_t work_context; pseudo_cost_snapshot_t pc_snapshot; @@ -75,6 +78,7 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t { // Diving-specific snapshots (ignored by BFS workers) std::vector incumbent_snapshot; i_t total_lp_iters_snapshot{0}; + i_t nodes_explored_snapshot{0}; std::vector> integer_solutions; int next_solution_seq{0}; @@ -101,6 +105,7 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t { pc_snapshot = snap.pc_snapshot; incumbent_snapshot = snap.incumbent; total_lp_iters_snapshot = snap.total_lp_iters; + nodes_explored_snapshot = snap.nodes_explored; } bool has_work() const { return static_cast(this)->has_work_impl(); } @@ -158,11 +163,6 @@ class deterministic_bfs_worker_t mip_node_t* up_child, rounding_direction_t preferred_direction) { - if (!plunge_stack.empty()) { - backlog.push(plunge_stack.back()); - plunge_stack.pop_back(); - } - down_child->origin_worker_id = this->worker_id; down_child->creation_seq = next_creation_seq++; up_child->origin_worker_id = this->worker_id; @@ -170,11 +170,11 @@ class deterministic_bfs_worker_t mip_node_t* first_child; if (preferred_direction == rounding_direction_t::UP) { - plunge_stack.push_front(down_child); + backlog.push(down_child); plunge_stack.push_front(up_child); first_child = up_child; } else { - plunge_stack.push_front(up_child); + backlog.push(up_child); plunge_stack.push_front(down_child); first_child = down_child; } @@ -211,7 +211,7 @@ class deterministic_bfs_worker_t void record_branched( mip_node_t* node, i_t down_child_id, i_t up_child_id, i_t branch_var, f_t branch_val) { - record_event(bb_event_t::make_branched(this->clock, + record_event(bb_event_t::make_branched(this->work_context.current_work(), this->worker_id, node->creation_seq, down_child_id, @@ -227,7 +227,7 @@ class deterministic_bfs_worker_t void record_integer_solution(mip_node_t* node, f_t objective) { record_event(bb_event_t::make_integer_solution( - this->clock, this->worker_id, node->creation_seq, objective)); + this->work_context.current_work(), this->worker_id, node->creation_seq, objective)); ++nodes_processed_this_horizon; ++this->total_nodes_processed; ++this->total_integer_solutions; @@ -236,7 +236,7 @@ class deterministic_bfs_worker_t void record_fathomed(mip_node_t* node, f_t lower_bound) { record_event(bb_event_t::make_fathomed( - this->clock, this->worker_id, node->creation_seq, lower_bound)); + this->work_context.current_work(), this->worker_id, node->creation_seq, lower_bound)); ++nodes_processed_this_horizon; ++this->total_nodes_processed; ++total_nodes_pruned; @@ -244,8 +244,8 @@ class deterministic_bfs_worker_t void record_infeasible(mip_node_t* node) { - record_event( - bb_event_t::make_infeasible(this->clock, this->worker_id, node->creation_seq)); + record_event(bb_event_t::make_infeasible( + this->work_context.current_work(), this->worker_id, node->creation_seq)); ++nodes_processed_this_horizon; ++this->total_nodes_processed; ++total_nodes_infeasible; @@ -253,8 +253,8 @@ class deterministic_bfs_worker_t void record_numerical(mip_node_t* node) { - record_event( - bb_event_t::make_numerical(this->clock, this->worker_id, node->creation_seq)); + record_event(bb_event_t::make_numerical( + this->work_context.current_work(), this->worker_id, node->creation_seq)); ++nodes_processed_this_horizon; ++this->total_nodes_processed; } @@ -288,6 +288,7 @@ class deterministic_diving_worker_t // Diving statistics i_t total_nodes_explored{0}; + i_t nodes_explored_last_sync{0}; i_t total_dives{0}; i_t lp_iters_this_dive{0}; @@ -339,7 +340,13 @@ class deterministic_diving_worker_t void queue_integer_solution(f_t objective, const std::vector& solution, i_t depth) { this->integer_solutions.push_back( - {objective, solution, depth, this->worker_id, this->next_solution_seq++}); + {objective, + solution, + depth, + this->worker_id, + this->next_solution_seq++, + this->work_context.current_work(), + cuopt::internals::mip_solution_origin_t::BRANCH_AND_BOUND_DIVING}); ++this->total_integer_solutions; } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index c38e98e27d..bb2fd3a6da 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -79,11 +79,9 @@ objective_change_estimate_t single_pivot_objective_change_estimate( std::vector& delta_z, f_t& work_estimate) { - // Compute the objective estimate for the down and up branches of variable j assert(variable_j >= 0); assert(basic_j >= 0); - // Down branch i_t direction = -1; sparse_vector_t e_k(lp.num_rows, 0); e_k.i.push_back(basic_j); @@ -92,7 +90,6 @@ objective_change_estimate_t single_pivot_objective_change_estimate( sparse_vector_t delta_y(lp.num_rows, 0); basis_factors.b_transpose_solve(e_k, delta_y); - // Compute delta_z_N = -N^T * delta_y i_t delta_y_nz0 = 0; const i_t nz_delta_y = delta_y.i.size(); for (i_t k = 0; k < nz_delta_y; k++) { @@ -102,7 +99,6 @@ objective_change_estimate_t single_pivot_objective_change_estimate( const f_t delta_y_nz_percentage = delta_y_nz0 / static_cast(lp.num_rows) * 100.0; const bool use_transpose = delta_y_nz_percentage <= 30.0; std::vector delta_z_indices; - // delta_z starts out all zero if (use_transpose) { compute_delta_z(A_transpose, delta_y, @@ -128,84 +124,31 @@ objective_change_estimate_t single_pivot_objective_change_estimate( work_estimate); } - // Verify dual feasibility -#ifdef CHECK_DUAL_FEASIBILITY - { - std::vector dual_residual = lp_solution.z; - for (i_t j = 0; j < lp.num_cols; j++) { - dual_residual[j] -= lp.objective[j]; - } - matrix_transpose_vector_multiply(lp.A, 1.0, lp_solution.y, 1.0, dual_residual); - f_t dual_residual_norm = vector_norm_inf(dual_residual); - settings.log.printf("Dual residual norm: %e\n", dual_residual_norm); - } -#endif - - // Compute the step-length f_t step_length = compute_step_length(settings, vstatus, lp_solution.z, delta_z, delta_z_indices); - - // Handle the leaving variable case - f_t delta_obj_down = step_length * (lp_solution.x[variable_j] - std::floor(lp_solution.x[variable_j])); -#ifdef CHECK_DELTA_OBJ - f_t delta_obj_check = 0.0; - for (i_t k = 0; k < delta_y.i.size(); k++) { - delta_obj_check += lp.rhs[delta_y.i[k]] * delta_y.x[k]; - } - for (i_t h = 0; h < delta_z_indices.size(); h++) { - const i_t j = delta_z_indices[h]; - if (vstatus[j] == variable_status_t::NONBASIC_LOWER) { - delta_obj_check += lp.lower[j] * delta_z[j]; - } else if (vstatus[j] == variable_status_t::NONBASIC_UPPER) { - delta_obj_check += lp.upper[j] * delta_z[j]; - } - } - delta_obj_check += std::floor(lp_solution.x[variable_j]) * delta_z[variable_j]; - delta_obj_check *= step_length; - if (std::abs(delta_obj_check - delta_obj) > 1e-6) { - settings.log.printf("Delta obj check %e. Delta obj %e. Step length %e.\n", - delta_obj_check, - delta_obj, - step_length); - } -#endif settings.log.debug( - "Down branch %d. Step length: %e. Delta obj: %e. \n", variable_j, step_length, delta_obj_down); + "Down branch %d. Step length: %e. Delta obj: %e.\n", variable_j, step_length, delta_obj_down); - // Up branch direction = 1; - // Negate delta_z for (i_t j : delta_z_indices) { delta_z[j] *= -1.0; } - // Compute the step-length step_length = compute_step_length(settings, vstatus, lp_solution.z, delta_z, delta_z_indices); - f_t delta_obj_up = step_length * (std::ceil(lp_solution.x[variable_j]) - lp_solution.x[variable_j]); + settings.log.debug( "Up branch %d. Step length: %e. Delta obj: %e.\n", variable_j, step_length, delta_obj_up); delta_z_indices.push_back(variable_j); - - // Clear delta_z for (i_t j : delta_z_indices) { delta_z[j] = 0.0; workspace[j] = 0; } -#ifdef CHECK_DELTA_Z - for (i_t j = 0; j < lp.num_cols; j++) { - if (delta_z[j] != 0.0) { settings.log.printf("Delta z %d: %e\n", j, delta_z[j]); } - } - for (i_t j = 0; j < lp.num_cols; j++) { - if (workspace[j] != 0) { settings.log.printf("Workspace %d: %d\n", j, workspace[j]); } - } -#endif - return {.down_obj_change = std::max(delta_obj_down, 0), .up_obj_change = std::max(delta_obj_up, 0)}; } @@ -226,7 +169,6 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t& lp, std::vector delta_z(n, 0); std::vector workspace(n, 0); - f_t work_estimate = 0; std::vector basic_map(n, -1); @@ -241,8 +183,6 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t& lp, for (i_t k = 0; k < fractional.size(); k++) { const i_t j = fractional[k]; - assert(j >= 0); - objective_change_estimate_t estimate = single_pivot_objective_change_estimate(lp, settings, @@ -266,21 +206,16 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t& lp, template f_t objective_upper_bound(const lp_problem_t& lp, f_t upper_bound, f_t dual_tol) { - f_t cut_off = 0; + if (std::isfinite(upper_bound)) { return upper_bound + dual_tol; } - if (std::isfinite(upper_bound)) { - cut_off = upper_bound + dual_tol; - } else { - cut_off = 0; - for (i_t j = 0; j < lp.num_cols; ++j) { - if (lp.objective[j] > 0) { - cut_off += lp.objective[j] * lp.upper[j]; - } else if (lp.objective[j] < 0) { - cut_off += lp.objective[j] * lp.lower[j]; - } + f_t cut_off = 0; + for (i_t j = 0; j < lp.num_cols; ++j) { + if (lp.objective[j] > 0) { + cut_off += lp.objective[j] * lp.upper[j]; + } else if (lp.objective[j] < 0) { + cut_off += lp.objective[j] * lp.lower[j]; } } - return cut_off; } @@ -303,33 +238,23 @@ void strong_branch_helper(i_t start, std::vector& dual_simplex_obj_up, std::vector& dual_simplex_status_down, std::vector& dual_simplex_status_up, - shared_strong_branching_context_view_t& sb_view) + shared_strong_branching_context_view_t& sb_view, + cuopt::work_limit_context_t* work_unit_context = nullptr) { raft::common::nvtx::range scope("BB::strong_branch_helper"); + (void)var_types; + lp_problem_t child_problem = original_lp; + constexpr bool verbose = false; + f_t last_log = tic(); + i_t thread_id = omp_get_thread_num(); - constexpr bool verbose = false; - f_t last_log = tic(); - i_t thread_id = omp_get_thread_num(); for (i_t k = start; k < end; ++k) { const i_t j = fractional[k]; for (i_t branch = 0; branch < 2; branch++) { - // Do the down branch const i_t shared_idx = (branch == 0) ? k : k + static_cast(fractional.size()); - // Batch PDLP has already solved this subproblem, skip it - if (sb_view.is_valid() && sb_view.is_solved(shared_idx)) { - if (verbose) { - settings.log.printf( - "[COOP SB] DS thread %d skipping variable %d branch %s (shared_idx %d): already solved " - "by PDLP\n", - thread_id, - j, - branch == 0 ? "down" : "up", - shared_idx); - } - continue; - } + if (sb_view.is_valid() && sb_view.is_solved(shared_idx)) { continue; } if (branch == 0) { child_problem.lower[j] = original_lp.lower[j]; @@ -341,10 +266,9 @@ void strong_branch_helper(i_t start, simplex_solver_settings_t child_settings = settings; child_settings.set_log(false); - f_t lp_start_time = tic(); - f_t elapsed_time = toc(start_time); + const f_t elapsed_time = toc(start_time); if (elapsed_time > settings.time_limit) { break; } - child_settings.time_limit = std::max(0.0, settings.time_limit - elapsed_time); + child_settings.time_limit = std::max(0.0, settings.time_limit - elapsed_time); child_settings.iteration_limit = iter_limit; child_settings.cut_off = objective_upper_bound(child_problem, upper_bound, child_settings.dual_tol); @@ -355,17 +279,17 @@ void strong_branch_helper(i_t start, std::vector child_edge_norms = edge_norms; dual::status_t status = dual_phase2(2, 0, - lp_start_time, + tic(), child_problem, child_settings, vstatus, solution, iter, - child_edge_norms); + child_edge_norms, + work_unit_context); f_t obj = std::numeric_limits::quiet_NaN(); if (status == dual::status_t::DUAL_UNBOUNDED) { - // LP was infeasible obj = std::numeric_limits::infinity(); } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF) { @@ -379,60 +303,36 @@ void strong_branch_helper(i_t start, status); } + const f_t delta_obj = std::max(obj - root_obj, f_t(0.0)); if (branch == 0) { - pc.strong_branch_down[k] = std::max(obj - root_obj, 0.0); - dual_simplex_obj_down[k] = std::max(obj - root_obj, 0.0); + pc.strong_branch_down[k] = delta_obj; + dual_simplex_obj_down[k] = delta_obj; dual_simplex_status_down[k] = status; - if (verbose) { - settings.log.printf("Thread id %2d remaining %d variable %d branch %d obj %e time %.2f\n", - thread_id, - end - 1 - k, - j, - branch, - obj, - toc(start_time)); - } } else { - pc.strong_branch_up[k] = std::max(obj - root_obj, 0.0); - dual_simplex_obj_up[k] = std::max(obj - root_obj, 0.0); + pc.strong_branch_up[k] = delta_obj; + dual_simplex_obj_up[k] = delta_obj; dual_simplex_status_up[k] = status; - if (verbose) { - settings.log.printf( - "Thread id %2d remaining %d variable %d branch %d obj %e change down %e change up %e " - "time %.2f\n", - thread_id, - end - 1 - k, - j, - branch, - obj, - dual_simplex_obj_down[k], - dual_simplex_obj_up[k], - toc(start_time)); - } } - // Mark the subproblem as solved so that batch PDLP removes it from the batch + if (sb_view.is_valid()) { - // We could not mark as solved nodes hitting iteration limit in DS - if ((branch == 0 && is_dual_simplex_done(dual_simplex_status_down[k])) || - (branch == 1 && is_dual_simplex_done(dual_simplex_status_up[k]))) { - sb_view.mark_solved(shared_idx); - if (verbose) { - settings.log.printf( - "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in " - "shared context\n", - thread_id, - j, - branch == 0 ? "down" : "up", - shared_idx); - } - } + const dual::status_t branch_status = + branch == 0 ? dual_simplex_status_down[k] : dual_simplex_status_up[k]; + if (is_dual_simplex_done(branch_status)) { sb_view.mark_solved(shared_idx); } + } + + if (verbose) { + settings.log.printf("Thread %d variable %d branch %d obj %e time %.2f\n", + thread_id, + j, + branch, + obj, + toc(start_time)); } if (toc(start_time) > settings.time_limit) { break; } } - if (toc(start_time) > settings.time_limit) { break; } + if (toc(start_time) > settings.time_limit) { break; } const i_t completed = pc.num_strong_branches_completed++; - if (thread_id == 0 && toc(last_log) > 10) { last_log = tic(); settings.log.printf("%d of %ld strong branches completed in %.1fs\n", @@ -443,8 +343,6 @@ void strong_branch_helper(i_t start, child_problem.lower[j] = original_lp.lower[j]; child_problem.upper[j] = original_lp.upper[j]; - - if (toc(start_time) > settings.time_limit) { break; } } } @@ -463,8 +361,11 @@ std::pair trial_branching(const lp_problem_t& ori f_t upper_bound, f_t start_time, i_t iter_limit, - omp_atomic_t& total_lp_iter) + omp_atomic_t& total_lp_iter, + cuopt::work_limit_context_t* work_ctx = nullptr) { + (void)var_types; + lp_problem_t child_problem = original_lp; child_problem.lower[branch_var] = branch_var_lower; child_problem.upper[branch_var] = branch_var_upper; @@ -485,8 +386,6 @@ std::pair trial_branching(const lp_problem_t& ori std::vector child_basic_list = basic_list; std::vector child_nonbasic_list = nonbasic_list; basis_update_mpf_t child_basis_factors = basis_factors; - - // Only refactor the basis if we encounter numerical issues. child_basis_factors.set_refactor_frequency(iter_limit); dual::status_t status = dual_phase2_with_advanced_basis(2, @@ -501,7 +400,8 @@ std::pair trial_branching(const lp_problem_t& ori child_nonbasic_list, solution, iter, - child_edge_norms); + child_edge_norms, + work_ctx); total_lp_iter += iter; settings.log.debug("Trial branching on variable %d. Lo: %e Up: %e. Iter %d. Status %s. Obj %e\n", branch_var, @@ -512,7 +412,6 @@ std::pair trial_branching(const lp_problem_t& ori compute_objective(child_problem, solution.x)); if (status == dual::status_t::DUAL_UNBOUNDED) { - // LP was infeasible return {std::numeric_limits::infinity(), dual::status_t::DUAL_UNBOUNDED}; } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF) { @@ -522,6 +421,88 @@ std::pair trial_branching(const lp_problem_t& ori } } +template +f_t trial_branching_generic(const lp_problem_t& original_lp, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const std::vector& vstatus, + const std::vector& edge_norms, + const basis_update_mpf_t& basis_factors, + const std::vector& basic_list, + const std::vector& nonbasic_list, + i_t branch_var, + f_t branch_var_lower, + f_t branch_var_upper, + f_t upper_bound, + i_t bnb_lp_iter_per_node, + f_t start_time, + i_t upper_max_lp_iter, + i_t lower_max_lp_iter, + omp_atomic_t& total_lp_iter, + cuopt::work_limit_context_t* work_ctx = nullptr) +{ + const i_t iter_limit = std::clamp(bnb_lp_iter_per_node, lower_max_lp_iter, upper_max_lp_iter); + return trial_branching(original_lp, + settings, + var_types, + vstatus, + edge_norms, + basis_factors, + basic_list, + nonbasic_list, + branch_var, + branch_var_lower, + branch_var_upper, + upper_bound, + start_time, + iter_limit, + total_lp_iter, + work_ctx) + .first; +} + +template +f_t trial_branching_generic(const lp_problem_t& original_lp, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const std::vector& vstatus, + const std::vector& edge_norms, + const basis_update_mpf_t& basis_factors, + const std::vector& basic_list, + const std::vector& nonbasic_list, + i_t branch_var, + f_t branch_var_lower, + f_t branch_var_upper, + f_t upper_bound, + i_t bnb_lp_iter_per_node, + f_t start_time, + i_t upper_max_lp_iter, + i_t lower_max_lp_iter, + int64_t& total_lp_iter, + cuopt::work_limit_context_t* work_ctx = nullptr) +{ + omp_atomic_t atomic_iter{0}; + auto result = + trial_branching(original_lp, + settings, + var_types, + vstatus, + edge_norms, + basis_factors, + basic_list, + nonbasic_list, + branch_var, + branch_var_lower, + branch_var_upper, + upper_bound, + start_time, + std::clamp(bnb_lp_iter_per_node, lower_max_lp_iter, upper_max_lp_iter), + atomic_iter, + work_ctx); + total_lp_iter += atomic_iter.load(); + return result.first; +} + } // namespace template @@ -531,25 +512,11 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data const std::vector& root_soln, std::vector& original_root_soln_x) { - // Branch and bound has a problem of the form: - // minimize c^T x - // subject to A*x + Es = b - // l <= x <= u - // E_{jj} = sigma_j, where sigma_j is +1 or -1 - - // We need to convert this into a problem that is better for PDLP - // to solve. PDLP perfers inequality constraints. Thus, we want - // to convert the above into the problem: - // minimize c^T x - // subject to lb <= A*x <= ub - // l <= x <= u - cuopt::mps_parser::mps_data_model_t mps_model; int m = lp.num_rows; int n = lp.num_cols - new_slacks.size(); original_root_soln_x.resize(n); - // Remove slacks from A dual_simplex::csc_matrix_t A_no_slacks = lp.A; std::vector cols_to_remove(lp.A.n, 0); for (i_t j : new_slacks) { @@ -561,33 +528,22 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data original_root_soln_x[j] = root_soln[j]; } - // Convert CSC to CSR using built-in method dual_simplex::csr_matrix_t csr_A(m, n, 0); A_no_slacks.to_compressed_row(csr_A); - int nz = csr_A.row_start[m]; - // Set CSR constraint matrix mps_model.set_csr_constraint_matrix( csr_A.x.data(), nz, csr_A.j.data(), nz, csr_A.row_start.data(), m + 1); - - // Set objective coefficients mps_model.set_objective_coefficients(lp.objective.data(), n); - - // The LP is already in minimization form (objective negated for max problems). - // Pass identity scaling so PDLP returns the raw DS-space objective directly. mps_model.set_objective_scaling_factor(f_t(1.0)); mps_model.set_objective_offset(f_t(0.0)); - - // Set variable bounds mps_model.set_variable_lower_bounds(lp.lower.data(), n); mps_model.set_variable_upper_bounds(lp.upper.data(), n); - // Convert row sense and RHS to constraint bounds std::vector constraint_lower(m); std::vector constraint_upper(m); - std::vector slack_map(m, -1); + for (i_t j : new_slacks) { const i_t col_start = lp.A.col_start[j]; const i_t i = lp.A.i[col_start]; @@ -595,22 +551,6 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data } for (i_t i = 0; i < m; ++i) { - // Each row is of the form a_i^T x + sigma * s_i = b_i - // with sigma = +1 or -1 - // and l_i <= s_i <= u_i - // We have that a_i^T x - b_i = -sigma * s_i - // If sigma = -1, then we have - // a_i^T x - b_i = s_i - // l_i <= a_i^T x - b_i <= u_i - // l_i + b_i <= a_i^T x <= u_i + b_i - // - // If sigma = +1, then we have - // a_i^T x - b_i = -s_i - // -a_i^T x + b_i = s_i - // l_i <= -a_i^T x + b_i <= u_i - // l_i - b_i <= -a_i^T x <= u_i - b_i - // -u_i + b_i <= a_i^T x <= -l_i + b_i - const i_t slack = slack_map[i]; assert(slack != -1); const i_t col_start = lp.A.col_start[slack]; @@ -621,64 +561,43 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data if (sigma == -1) { constraint_lower[i] = slack_lower + lp.rhs[i]; constraint_upper[i] = slack_upper + lp.rhs[i]; - } else if (sigma == 1) { + } else { constraint_lower[i] = -slack_upper + lp.rhs[i]; constraint_upper[i] = -slack_lower + lp.rhs[i]; - } else { - assert(sigma == 1.0 || sigma == -1.0); } } mps_model.set_constraint_lower_bounds(constraint_lower.data(), m); mps_model.set_constraint_upper_bounds(constraint_upper.data(), m); mps_model.set_maximize(false); - return mps_model; } enum class sb_source_t { DUAL_SIMPLEX, PDLP, NONE }; -// Merge a single strong branching result from Dual Simplex and PDLP. -// Rules: -// 1. If both found optimal -> keep DS (higher quality vertex solution) -// 2. Else if Dual Simplex found infeasible -> declare infeasible -// 3. Else if one is optimal -> keep the optimal one -// 4. Else if Dual Simplex hit iteration limit -> keep DS -// 5. Else if none converged -> NaN (original objective) template static std::pair merge_sb_result(f_t dual_simplex_val, dual::status_t dual_simplex_status, f_t pdlp_dual_obj, bool pdlp_optimal) { - // Dual simplex always maintains dual feasibility, so OPTIMAL and ITERATION_LIMIT both qualify - - // Rule 1: Both optimal -> keep DS if (dual_simplex_status == dual::status_t::OPTIMAL && pdlp_optimal) { return {dual_simplex_val, sb_source_t::DUAL_SIMPLEX}; } - - // Rule 2: Dual Simplex found infeasible -> declare infeasible if (dual_simplex_status == dual::status_t::DUAL_UNBOUNDED) { return {std::numeric_limits::infinity(), sb_source_t::DUAL_SIMPLEX}; } - - // Rule 3: Only one converged -> keep that if (dual_simplex_status == dual::status_t::OPTIMAL && !pdlp_optimal) { return {dual_simplex_val, sb_source_t::DUAL_SIMPLEX}; } if (pdlp_optimal && dual_simplex_status != dual::status_t::OPTIMAL) { return {pdlp_dual_obj, sb_source_t::PDLP}; } - - // Rule 4: Dual Simplex hit iteration limit or work limit or cutoff -> keep DS if (dual_simplex_status == dual::status_t::ITERATION_LIMIT || dual_simplex_status == dual::status_t::WORK_LIMIT || dual_simplex_status == dual::status_t::CUTOFF) { return {dual_simplex_val, sb_source_t::DUAL_SIMPLEX}; } - - // Rule 5: None converged -> NaN return {std::numeric_limits::quiet_NaN(), sb_source_t::NONE}; } @@ -706,23 +625,14 @@ static void batch_pdlp_strong_branching_task( f_t start_batch = tic(); std::vector original_root_soln_x; - if (concurrent_halt.load() == 1) { return; } const auto mps_model = simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); std::vector fraction_values; - - std::vector original_root_soln_y, original_root_soln_z; - // TODO put back later once Chris has this part - /*uncrush_dual_solution( - original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y, - original_root_soln_z);*/ - for (i_t k = 0; k < fractional.size(); k++) { - const i_t j = fractional[k]; - fraction_values.push_back(original_root_soln_x[j]); + fraction_values.push_back(original_root_soln_x[fractional[k]]); } if (concurrent_halt.load() == 1) { return; } @@ -732,19 +642,14 @@ static void batch_pdlp_strong_branching_task( std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (warm_start_remaining_time <= 0.0) { return; } - assert(!pc.pdlp_warm_cache.populated && "PDLP warm cache should not be populated at this point"); - + assert(!pc.pdlp_warm_cache.populated); if (!pc.pdlp_warm_cache.populated) { pdlp_solver_settings_t ws_settings; - ws_settings.method = method_t::PDLP; - ws_settings.presolver = presolver_t::None; - ws_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - ws_settings.detect_infeasibility = false; - // Since the warm start will be used over and over again we want to maximize the chance of - // convergeance Batch PDLP is very compute intensive so we want to minimize the number of - // iterations - constexpr int warm_start_iteration_limit = 500000; - ws_settings.iteration_limit = warm_start_iteration_limit; + ws_settings.method = method_t::PDLP; + ws_settings.presolver = presolver_t::None; + ws_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + ws_settings.detect_infeasibility = false; + ws_settings.iteration_limit = 500000; ws_settings.time_limit = warm_start_remaining_time; constexpr f_t pdlp_tolerance = 1e-5; ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; @@ -756,51 +661,18 @@ static void batch_pdlp_strong_branching_task( ws_settings.inside_mip = true; if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } - auto start_time = std::chrono::high_resolution_clock::now(); - auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); - - if (verbose) { - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = - std::chrono::duration_cast(end_time - start_time).count(); - settings.log.printf( - "Original problem solved in %d milliseconds" - " and iterations: %d\n", - duration, - ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_); - } - - if (ws_solution.get_termination_status() == pdlp_termination_status_t::Optimal) { - auto& cache = pc.pdlp_warm_cache; - const auto& ws_primal = ws_solution.get_primal_solution(); - const auto& ws_dual = ws_solution.get_dual_solution(); - // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm - // start - cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); - cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); - cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; - cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; - cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; - cache.populated = true; - - if (verbose) { - settings.log.printf( - "Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", - cache.initial_primal.size(), - cache.initial_dual.size(), - cache.step_size, - cache.primal_weight, - cache.pdlp_iteration); - } - } else { - if (verbose) { - settings.log.printf( - "PDLP warm start solve did not reach optimality (%s), skipping cache and batch PDLP\n", - ws_solution.get_termination_status_string().c_str()); - } - return; - } + if (ws_solution.get_termination_status() != pdlp_termination_status_t::Optimal) { return; } + + auto& cache = pc.pdlp_warm_cache; + const auto& ws_primal = ws_solution.get_primal_solution(); + const auto& ws_dual = ws_solution.get_dual_solution(); + cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); + cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); + cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; + cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; + cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + cache.populated = true; } if (concurrent_halt.load() == 1) { return; } @@ -817,49 +689,37 @@ static void batch_pdlp_strong_branching_task( if (batch_remaining_time <= 0.0) { return; } pdlp_settings.time_limit = batch_remaining_time; - if (pc.pdlp_warm_cache.populated) { - auto& cache = pc.pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), - cache.initial_primal.size(), - cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_dual_solution( - cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_step_size(cache.step_size); - pdlp_settings.set_initial_primal_weight(cache.primal_weight); - pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); - } + auto& cache = pc.pdlp_warm_cache; + pdlp_settings.set_initial_primal_solution( + cache.initial_primal.data(), cache.initial_primal.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution( + cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(cache.step_size); + pdlp_settings.set_initial_primal_weight(cache.primal_weight); + pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); if (concurrent_halt.load() == 1) { return; } const auto solutions = batch_pdlp_solve( &pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); - f_t batch_pdlp_strong_branching_time = toc(start_batch); + const f_t batch_pdlp_time = toc(start_batch); - // Fail safe in case the batch PDLP failed and produced no solutions if (solutions.get_additional_termination_informations().size() != fractional.size() * 2) { if (verbose) { settings.log.printf("Batch PDLP failed and produced no solutions\n"); } return; } - // Find max iteration on how many are done accross the batch - i_t max_iterations = 0; - i_t amount_done = 0; - for (i_t k = 0; k < solutions.get_additional_termination_informations().size(); k++) { - max_iterations = std::max( - max_iterations, solutions.get_additional_termination_information(k).number_of_steps_taken); - // TODO batch mode infeasible: should also count as done if infeasible - if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { - amount_done++; - } - } - if (verbose) { - settings.log.printf( - "Batch PDLP strong branching completed in %.2fs. Solved %d/%d with max %d iterations\n", - batch_pdlp_strong_branching_time, - amount_done, - fractional.size() * 2, - max_iterations); + i_t amount_done = 0; + for (i_t k = 0; k < solutions.get_additional_termination_informations().size(); k++) { + if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { + amount_done++; + } + } + settings.log.printf("Batch PDLP strong branching completed in %.2fs. Solved %d/%d\n", + batch_pdlp_time, + amount_done, + fractional.size() * 2); } for (i_t k = 0; k < fractional.size(); k++) { @@ -897,9 +757,7 @@ static void batch_pdlp_reliability_branching_task( num_candidates); f_t start_batch = tic(); - std::vector original_soln_x; - if (concurrent_halt.load() == 1) { return; } auto mps_model = @@ -925,9 +783,7 @@ static void batch_pdlp_reliability_branching_task( std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (batch_remaining_time <= 0.0) { return; } - // One handle per batch PDLP since there can be concurrent calls const raft::handle_t batch_pdlp_handle; - pdlp_solver_settings_t pdlp_settings; if (rb_mode == 1) { pdlp_settings.concurrent_halt = &concurrent_halt; @@ -950,8 +806,7 @@ static void batch_pdlp_reliability_branching_task( const auto solutions = batch_pdlp_solve(&batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); - - f_t batch_pdlp_time = toc(start_batch); + const f_t batch_pdlp_time = toc(start_batch); if (solutions.get_additional_termination_informations().size() != static_cast(num_candidates) * 2) { @@ -997,7 +852,8 @@ void strong_branching(const lp_problem_t& original_lp, const std::vector& basic_list, const std::vector& nonbasic_list, basis_update_mpf_t& basis_factors, - pseudo_costs_t& pc) + pseudo_costs_t& pc, + cuopt::work_limit_context_t* work_unit_context) { constexpr bool verbose = false; @@ -1006,17 +862,17 @@ void strong_branching(const lp_problem_t& original_lp, pc.strong_branch_up.assign(fractional.size(), 0); pc.num_strong_branches_completed = 0; - const f_t elapsed_time = toc(start_time); - if (elapsed_time > settings.time_limit) { return; } + if (fractional.empty()) { return; } + if (toc(start_time) > settings.time_limit) { return; } - // 0: no batch PDLP, 1: cooperative batch PDLP and DS, 2: batch PDLP only + const bool deterministic_work_accounting = + work_unit_context != nullptr && work_unit_context->deterministic; + const bool disable_batch_pdlp = + settings.sub_mip || settings.deterministic || deterministic_work_accounting; const i_t effective_batch_pdlp = - (settings.sub_mip || (settings.deterministic && settings.mip_batch_pdlp_strong_branching == 1)) - ? 0 - : settings.mip_batch_pdlp_strong_branching; + disable_batch_pdlp ? 0 : settings.mip_batch_pdlp_strong_branching; - if (settings.mip_batch_pdlp_strong_branching != 0 && - (settings.sub_mip || settings.deterministic)) { + if (settings.mip_batch_pdlp_strong_branching != 0 && disable_batch_pdlp) { settings.log.printf( "Batch PDLP strong branching is disabled because sub-MIP or deterministic mode is enabled\n"); } @@ -1025,21 +881,15 @@ void strong_branching(const lp_problem_t& original_lp, settings.num_threads, fractional.size()); - // Cooperative DS + PDLP: shared context tracks which subproblems are solved - shared_strong_branching_context_t shared_ctx(2 * fractional.size()); - shared_strong_branching_context_view_t sb_view(shared_ctx.solved); - - std::atomic concurrent_halt{0}; - std::vector pdlp_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); std::vector pdlp_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); - std::vector dual_simplex_status_down(fractional.size(), dual::status_t::UNSET); std::vector dual_simplex_status_up(fractional.size(), dual::status_t::UNSET); std::vector dual_simplex_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); std::vector dual_simplex_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); - f_t strong_branching_start_time = tic(); - i_t simplex_iteration_limit = settings.strong_branching_simplex_iteration_limit; + + const i_t simplex_iteration_limit = settings.strong_branching_simplex_iteration_limit; + const f_t strong_branching_start_time = tic(); if (simplex_iteration_limit < 1) { initialize_pseudo_costs_with_estimate(original_lp, @@ -1051,7 +901,63 @@ void strong_branching(const lp_problem_t& original_lp, fractional, basis_factors, pc); + } else if (effective_batch_pdlp == 0) { + const i_t n_tasks = + std::max(1, std::min(4 * settings.num_threads, fractional.size())); + std::vector task_work_contexts; + if (deterministic_work_accounting) { + task_work_contexts.reserve(n_tasks); + for (i_t k = 0; k < n_tasks; ++k) { + task_work_contexts.emplace_back("sb_task_" + std::to_string(k)); + task_work_contexts.back().deterministic = true; + } + } + + shared_strong_branching_context_view_t empty_sb_view; + +#pragma omp parallel num_threads(settings.num_threads) + { +#pragma omp for schedule(dynamic, 1) + for (i_t k = 0; k < n_tasks; k++) { + const i_t start = std::floor(k * fractional.size() / n_tasks); + const i_t end = std::floor((k + 1) * fractional.size() / n_tasks); + cuopt::work_limit_context_t* task_ctx = + deterministic_work_accounting ? &task_work_contexts[k] : nullptr; + strong_branch_helper(start, + end, + start_time, + original_lp, + settings, + var_types, + fractional, + root_solution.x, + root_vstatus, + edge_norms, + root_obj, + upper_bound, + simplex_iteration_limit, + pc, + dual_simplex_obj_down, + dual_simplex_obj_up, + dual_simplex_status_down, + dual_simplex_status_up, + empty_sb_view, + task_ctx); + } + } + + if (deterministic_work_accounting) { + double max_work = 0.0; + for (auto& ctx : task_work_contexts) { + max_work = std::max(max_work, ctx.current_work()); + } + work_unit_context->record_work_sync_on_horizon(max_work); + } } else { + shared_strong_branching_context_t shared_ctx(2 * fractional.size()); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); + std::atomic concurrent_halt{0}; + #pragma omp parallel num_threads(settings.num_threads) { #pragma omp single nowait @@ -1074,24 +980,12 @@ void strong_branching(const lp_problem_t& original_lp, } if (effective_batch_pdlp != 2) { - i_t n = std::min(4 * settings.num_threads, fractional.size()); -// Here we are creating more tasks than the number of threads -// such that they can be scheduled dynamically to the threads. -#pragma omp taskloop num_tasks(n) - for (i_t k = 0; k < n; k++) { - i_t start = std::floor(k * fractional.size() / n); - i_t end = std::floor((k + 1) * fractional.size() / n); - - constexpr bool verbose = false; - if (verbose) { - settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n", - omp_get_thread_num(), - k, - start, - end, - end - start); - } - + const i_t n_tasks = + std::max(1, std::min(4 * settings.num_threads, fractional.size())); +#pragma omp taskloop num_tasks(n_tasks) + for (i_t k = 0; k < n_tasks; k++) { + const i_t start = std::floor(k * fractional.size() / n_tasks); + const i_t end = std::floor((k + 1) * fractional.size() / n_tasks); strong_branch_helper(start, end, start_time, @@ -1112,118 +1006,42 @@ void strong_branching(const lp_problem_t& original_lp, dual_simplex_status_up, sb_view); } - // DS done: signal PDLP to stop (time-limit or all work done) and wait if (effective_batch_pdlp == 1) { concurrent_halt.store(1); } } - } - } - } - settings.log.printf("Strong branching completed in %.2fs\n", toc(strong_branching_start_time)); - - if (verbose) { - // Collect Dual Simplex statistics - i_t dual_simplex_optimal = 0, dual_simplex_infeasible = 0, dual_simplex_iter_limit = 0; - i_t dual_simplex_numerical = 0, dual_simplex_cutoff = 0, dual_simplex_time_limit = 0; - i_t dual_simplex_concurrent = 0, dual_simplex_work_limit = 0, dual_simplex_unset = 0; - const i_t total_subproblems = fractional.size() * 2; - for (i_t k = 0; k < fractional.size(); k++) { - for (auto st : {dual_simplex_status_down[k], dual_simplex_status_up[k]}) { - switch (st) { - case dual::status_t::OPTIMAL: dual_simplex_optimal++; break; - case dual::status_t::DUAL_UNBOUNDED: dual_simplex_infeasible++; break; - case dual::status_t::ITERATION_LIMIT: dual_simplex_iter_limit++; break; - case dual::status_t::NUMERICAL: dual_simplex_numerical++; break; - case dual::status_t::CUTOFF: dual_simplex_cutoff++; break; - case dual::status_t::TIME_LIMIT: dual_simplex_time_limit++; break; - case dual::status_t::CONCURRENT_LIMIT: dual_simplex_concurrent++; break; - case dual::status_t::WORK_LIMIT: dual_simplex_work_limit++; break; - case dual::status_t::UNSET: dual_simplex_unset++; break; - } +#pragma omp taskwait } } - - settings.log.printf("Dual Simplex: %d/%d optimal, %d infeasible, %d iter-limit", - dual_simplex_optimal, - total_subproblems, - dual_simplex_infeasible, - dual_simplex_iter_limit); - if (dual_simplex_cutoff) settings.log.printf(", %d cutoff", dual_simplex_cutoff); - if (dual_simplex_time_limit) settings.log.printf(", %d time-limit", dual_simplex_time_limit); - if (dual_simplex_numerical) settings.log.printf(", %d numerical", dual_simplex_numerical); - if (dual_simplex_concurrent) - settings.log.printf(", %d concurrent-halt", dual_simplex_concurrent); - if (dual_simplex_work_limit) settings.log.printf(", %d work-limit", dual_simplex_work_limit); - if (dual_simplex_unset) settings.log.printf(", %d unset/skipped", dual_simplex_unset); - settings.log.printf("\n"); } - if (effective_batch_pdlp != 0 && verbose) { - i_t pdlp_optimal_count = 0; - for (i_t k = 0; k < fractional.size(); k++) { - if (!std::isnan(pdlp_obj_down[k])) pdlp_optimal_count++; - if (!std::isnan(pdlp_obj_up[k])) pdlp_optimal_count++; - } - - settings.log.printf("Batch PDLP found %d/%d optimal solutions\n", - pdlp_optimal_count, - static_cast(fractional.size() * 2)); - } + settings.log.printf("Strong branching completed in %.2fs\n", toc(strong_branching_start_time)); if (effective_batch_pdlp != 0) { - i_t merged_from_ds = 0; i_t merged_from_pdlp = 0; - i_t merged_nan = 0; - i_t solved_by_both = 0; for (i_t k = 0; k < fractional.size(); k++) { for (i_t branch = 0; branch < 2; branch++) { - const bool is_down = (branch == 0); + const bool is_down = branch == 0; f_t& sb_dest = is_down ? pc.strong_branch_down[k] : pc.strong_branch_up[k]; - f_t ds_obj = is_down ? dual_simplex_obj_down[k] : dual_simplex_obj_up[k]; - dual::status_t ds_status = + const f_t ds_obj = is_down ? dual_simplex_obj_down[k] : dual_simplex_obj_up[k]; + const dual::status_t ds_status = is_down ? dual_simplex_status_down[k] : dual_simplex_status_up[k]; - f_t pdlp_obj = is_down ? pdlp_obj_down[k] : pdlp_obj_up[k]; - bool pdlp_has = !std::isnan(pdlp_obj); - bool ds_has = ds_status != dual::status_t::UNSET; + const f_t pdlp_obj = is_down ? pdlp_obj_down[k] : pdlp_obj_up[k]; + const bool pdlp_has = !std::isnan(pdlp_obj); const auto [value, source] = merge_sb_result(ds_obj, ds_status, pdlp_obj, pdlp_has); - if (source == sb_source_t::PDLP || effective_batch_pdlp == 2) { sb_dest = value; } - - if (source == sb_source_t::DUAL_SIMPLEX) - merged_from_ds++; - else if (source == sb_source_t::PDLP) - merged_from_pdlp++; - else - merged_nan++; - - if (ds_has && pdlp_has && verbose) { - solved_by_both++; - settings.log.printf( - "[COOP SB] Merge: variable %d %s solved by BOTH (DS=%e PDLP=%e) -> kept %s\n", - fractional[k], - is_down ? "DOWN" : "UP", - ds_obj, - pdlp_obj, - source == sb_source_t::DUAL_SIMPLEX ? "DS" : "PDLP"); - } + if (source == sb_source_t::PDLP) { merged_from_pdlp++; } } } pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root = (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0; - if (verbose) { - settings.log.printf( - "Batch PDLP for strong branching. Percent solved by batch PDLP at root: %f\n", - pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root); - settings.log.printf( - "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d solved by both\n", - merged_from_ds, - merged_from_pdlp, - merged_nan, - solved_by_both); - } + } + + if (verbose) { + settings.log.printf("Batch PDLP solved %.2f%% of root strong-branching subproblems\n", + pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root); } pc.update_pseudo_costs_from_strong_branching(fractional, root_solution.x); @@ -1235,13 +1053,13 @@ f_t pseudo_costs_t::calculate_pseudocost_score(i_t j, f_t pseudo_cost_up_avg, f_t pseudo_cost_down_avg) const { - constexpr f_t eps = 1e-6; - i_t num_up = pseudo_cost_num_up[j]; - i_t num_down = pseudo_cost_num_down[j]; - f_t pc_up = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg; - f_t pc_down = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg; - f_t f_down = solution[j] - std::floor(solution[j]); - f_t f_up = std::ceil(solution[j]) - solution[j]; + constexpr f_t eps = 1e-6; + const i_t num_up = pseudo_cost_num_up[j]; + const i_t num_down = pseudo_cost_num_down[j]; + const f_t pc_up = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg; + const f_t pc_down = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg; + const f_t f_down = solution[j] - std::floor(solution[j]); + const f_t f_up = std::ceil(solution[j]) - solution[j]; return std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); } @@ -1269,6 +1087,13 @@ void pseudo_costs_t::initialized(i_t& num_initialized_down, f_t& pseudo_cost_down_avg, f_t& pseudo_cost_up_avg) const { + num_initialized_down = 0; + num_initialized_up = 0; + for (size_t j = 0; j < pseudo_cost_sum_down.size(); ++j) { + if (pseudo_cost_num_down[j] > 0) { num_initialized_down++; } + if (pseudo_cost_num_up[j] > 0) { num_initialized_up++; } + } + auto avgs = compute_pseudo_cost_averages(pseudo_cost_sum_down.data(), pseudo_cost_sum_up.data(), pseudo_cost_num_down.data(), @@ -1299,8 +1124,8 @@ i_t pseudo_costs_t::variable_selection(const std::vector& fractio pseudo_cost_up_avg); for (i_t j : fractional) { - f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); - + const f_t score = + calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); if (score > max_score) { max_score = score; branch_var = j; @@ -1311,7 +1136,6 @@ i_t pseudo_costs_t::variable_selection(const std::vector& fractio branch_var, solution[branch_var], max_score); - return branch_var; } @@ -1330,7 +1154,7 @@ i_t pseudo_costs_t::reliable_variable_selection( const lp_problem_t& original_lp) { constexpr f_t eps = 1e-6; - f_t start_time = bnb_stats.start_time; + const f_t start_time = bnb_stats.start_time; i_t branch_var = fractional[0]; f_t max_score = -1; f_t pseudo_cost_down_avg = -1; @@ -1346,11 +1170,9 @@ i_t pseudo_costs_t::reliable_variable_selection( i_t reliable_threshold = settings.reliability_branching; if (reliable_threshold < 0) { - const i_t max_threshold = reliability_branching_settings.max_reliable_threshold; - const i_t min_threshold = reliability_branching_settings.min_reliable_threshold; - const f_t iter_factor = reliability_branching_settings.bnb_lp_factor; - const i_t iter_offset = reliability_branching_settings.bnb_lp_offset; - const int64_t alpha = iter_factor * branch_and_bound_lp_iters; + const i_t max_threshold = reliability_branching_settings.max_reliable_threshold; + const i_t min_threshold = reliability_branching_settings.min_reliable_threshold; + const int64_t alpha = reliability_branching_settings.bnb_lp_factor * branch_and_bound_lp_iters; const int64_t max_reliability_iter = alpha + reliability_branching_settings.bnb_lp_offset; f_t iter_fraction = @@ -1362,10 +1184,6 @@ i_t pseudo_costs_t::reliable_variable_selection( reliable_threshold = strong_branching_lp_iter < max_reliability_iter ? reliable_threshold : 0; } - // If `reliable_threshold == 0`, then we set the uninitialized pseudocosts to the average. - // Otherwise, the best ones are initialized via strong branching, while the other are ignored. // - // In the latter, we are not using the average pseudocost (which calculated in the `initialized` - // method). if (reliable_threshold == 0) { i_t num_initialized_up; i_t num_initialized_down; @@ -1386,9 +1204,8 @@ i_t pseudo_costs_t::reliable_variable_selection( unreliable_list.push_back(std::make_pair(-1, j)); continue; } - f_t score = + const f_t score = calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); - if (score > max_score) { max_score = score; branch_var = j; @@ -1400,144 +1217,84 @@ i_t pseudo_costs_t::reliable_variable_selection( branch_var, leaf_solution.x[branch_var], max_score); - return branch_var; } - // 0: no batch PDLP, 1: cooperative batch PDLP and DS, 2: batch PDLP only - const i_t rb_mode = settings.mip_batch_pdlp_reliability_branching; - // We don't use batch PDLP in reliability branching if the PDLP warm start data was not filled - // This indicates that PDLP alone (not batched) couldn't even run at the root node - // So it will most likely perform poorly compared to DS - // It is also off if the number of candidate is very small - // If warm start could run but almost none of the BPDLP results were used, we also want to avoid - // using batch PDLP - constexpr i_t min_num_candidates_for_pdlp = 5; + const i_t rb_mode = settings.mip_batch_pdlp_reliability_branching; + constexpr i_t min_num_candidates_for_pdlp = 5; constexpr f_t min_percent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; - // Batch PDLP is either forced or we use the heuristic to decide if it should be used const bool use_pdlp = (rb_mode == 2) || (rb_mode != 0 && !settings.sub_mip && !settings.deterministic && pdlp_warm_cache.populated && unreliable_list.size() > min_num_candidates_for_pdlp && pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root > min_percent_solved_by_batch_pdlp_at_root_for_pdlp); - if (rb_mode != 0 && !pdlp_warm_cache.populated) { - log.printf("PDLP warm start data not populated, using DS only\n"); - } else if (rb_mode != 0 && settings.sub_mip) { - log.printf("Batch PDLP reliability branching is disabled because sub-MIP is enabled\n"); - } else if (rb_mode != 0 && settings.deterministic) { - log.printf( - "Batch PDLP reliability branching is disabled because deterministic mode is enabled\n"); - } else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { - log.printf("Not enough candidates to use batch PDLP, using DS only\n"); - } else if (rb_mode != 0 && pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root < 5.0) { - log.printf("Percent solved by batch PDLP at root is too low, using DS only\n"); - } else if (use_pdlp) { - log.printf( - "Using batch PDLP because populated, unreliable list size is %d (> %d), and percent solved " - "by batch PDLP at root is %f%% (> %f%%)\n", - static_cast(unreliable_list.size()), - min_num_candidates_for_pdlp, - pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root, - min_percent_solved_by_batch_pdlp_at_root_for_pdlp); - } - - const int num_tasks = std::max(max_num_tasks, 10); - const int task_priority = reliability_branching_settings.task_priority; - // If both batch PDLP and DS are used we double the max number of candidates + const int num_tasks = std::max(max_num_tasks, 10); + const int task_priority = reliability_branching_settings.task_priority; const i_t max_num_candidates = use_pdlp ? 2 * reliability_branching_settings.max_num_candidates : reliability_branching_settings.max_num_candidates; const i_t num_candidates = std::min(unreliable_list.size(), max_num_candidates); - assert(task_priority > 0); - assert(max_num_candidates > 0); - assert(num_candidates > 0); - assert(num_tasks > 0); - - log.printf( - "RB iters = %d, B&B iters = %d, unreliable = %d, num_tasks = %d, reliable_threshold = %d\n", - strong_branching_lp_iter.load(), - branch_and_bound_lp_iters, - unreliable_list.size(), - num_tasks, - reliable_threshold); - if (unreliable_list.size() > max_num_candidates) { if (reliability_branching_settings.rank_candidates_with_dual_pivot) { - i_t m = worker->leaf_problem.num_rows; - i_t n = worker->leaf_problem.num_cols; + const i_t m = worker->leaf_problem.num_rows; + const i_t n = worker->leaf_problem.num_cols; f_t work_estimate = 0; - std::vector delta_z(n, 0); std::vector workspace(n, 0); - std::vector basic_map(n, -1); + std::vector nonbasic_mark(n, -1); for (i_t i = 0; i < m; i++) { basic_map[worker->basic_list[i]] = i; } - - std::vector nonbasic_mark(n, -1); for (i_t i = 0; i < n - m; i++) { nonbasic_mark[worker->nonbasic_list[i]] = i; } - for (auto& [score, j] : unreliable_list) { if (pseudo_cost_num_down[j] == 0 || pseudo_cost_num_up[j] == 0) { - // Estimate the objective change by performing a single pivot of dual simplex. - objective_change_estimate_t estimate = - single_pivot_objective_change_estimate(worker->leaf_problem, - settings, - AT, - node_ptr->vstatus, - j, - basic_map[j], - leaf_solution, - worker->basic_list, - worker->nonbasic_list, - nonbasic_mark, - worker->basis_factors, - workspace, - delta_z, - work_estimate); - + auto estimate = single_pivot_objective_change_estimate(worker->leaf_problem, + settings, + AT, + node_ptr->vstatus, + j, + basic_map[j], + leaf_solution, + worker->basic_list, + worker->nonbasic_list, + nonbasic_mark, + worker->basis_factors, + workspace, + delta_z, + work_estimate); score = std::max(estimate.up_obj_change, eps) * std::max(estimate.down_obj_change, eps); } else { - // Use the previous score, even if it is unreliable score = calculate_pseudocost_score( j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); } } } else { - f_t high = max_score > 0 ? max_score : 1; - f_t low = 0; - + const f_t high = max_score > 0 ? max_score : 1; for (auto& [score, j] : unreliable_list) { - if (score == -1) { score = worker->rng.uniform(low, high); } + (void)j; + if (score == -1) { score = worker->rng.uniform(f_t{0}, high); } } } - // We only need to get the top-k elements in the list, where - // k = num_candidates std::partial_sort(unreliable_list.begin(), unreliable_list.begin() + num_candidates, unreliable_list.end(), [](auto el1, auto el2) { return el1.first > el2.first; }); } - // Both DS and PDLP work on the same candidate set std::vector candidate_vars(num_candidates); for (i_t i = 0; i < num_candidates; i++) { candidate_vars[i] = unreliable_list[i].second; } - // Shared context for cooperative work-stealing (mode 1) - // [0..num_candidates) = down, [num_candidates..2*num_candidates) = up shared_strong_branching_context_t shared_ctx(2 * num_candidates); shared_strong_branching_context_view_t sb_view(shared_ctx.solved); - std::vector pdlp_obj_down(num_candidates, std::numeric_limits::quiet_NaN()); std::vector pdlp_obj_up(num_candidates, std::numeric_limits::quiet_NaN()); - std::atomic concurrent_halt{0}; if (use_pdlp) { @@ -1560,7 +1317,6 @@ i_t pseudo_costs_t::reliable_variable_selection( } if (toc(start_time) > settings.time_limit) { - log.printf("Time limit reached\n"); if (use_pdlp) { concurrent_halt.store(1); #pragma omp taskwait @@ -1573,8 +1329,6 @@ i_t pseudo_costs_t::reliable_variable_selection( std::vector dual_simplex_status_down(num_candidates, dual::status_t::UNSET); std::vector dual_simplex_status_up(num_candidates, dual::status_t::UNSET); - f_t dual_simplex_start_time = tic(); - if (rb_mode != 2) { #pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \ shared(score_mutex, \ @@ -1589,14 +1343,10 @@ i_t pseudo_costs_t::reliable_variable_selection( if (toc(start_time) > settings.time_limit) { continue; } - if (rb_mode == 1 && sb_view.is_solved(i)) { - log.printf( - "DS skipping variable %d branch down (shared_idx %d): already solved by PDLP\n", j, i); - } else { + if (!(rb_mode == 1 && sb_view.is_solved(i))) { pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { - // Do trial branching on the down branch - const auto [obj, status] = trial_branching(worker->leaf_problem, + const auto [obj, status] = trial_branching(worker->leaf_problem, settings, var_types, node_ptr->vstatus, @@ -1611,20 +1361,17 @@ i_t pseudo_costs_t::reliable_variable_selection( start_time, iter_limit_per_trial, strong_branching_lp_iter); - dual_simplex_obj_down[i] = obj; dual_simplex_status_down[i] = status; if (!std::isnan(obj)) { - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = leaf_solution.x[j] - std::floor(leaf_solution.x[j]); + const f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + const f_t change_in_x = leaf_solution.x[j] - std::floor(leaf_solution.x[j]); pseudo_cost_sum_down[j] += change_in_obj / change_in_x; pseudo_cost_num_down[j]++; - // Should be valid if were are already here if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(i); } } - } else { - // Variable became reliable, make it as solved so that batch PDLP does not solve it again - if (rb_mode == 1) sb_view.mark_solved(i); + } else if (rb_mode == 1) { + sb_view.mark_solved(i); } pseudo_cost_mutex_down[j].unlock(); } @@ -1632,14 +1379,10 @@ i_t pseudo_costs_t::reliable_variable_selection( if (toc(start_time) > settings.time_limit) { continue; } const i_t shared_idx = i + num_candidates; - if (rb_mode == 1 && sb_view.is_solved(shared_idx)) { - log.printf("DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n", - j, - shared_idx); - } else { + if (!(rb_mode == 1 && sb_view.is_solved(shared_idx))) { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { - const auto [obj, status] = trial_branching(worker->leaf_problem, + const auto [obj, status] = trial_branching(worker->leaf_problem, settings, var_types, node_ptr->vstatus, @@ -1654,20 +1397,17 @@ i_t pseudo_costs_t::reliable_variable_selection( start_time, iter_limit_per_trial, strong_branching_lp_iter); - dual_simplex_obj_up[i] = obj; dual_simplex_status_up[i] = status; if (!std::isnan(obj)) { - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = std::ceil(leaf_solution.x[j]) - leaf_solution.x[j]; + const f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + const f_t change_in_x = std::ceil(leaf_solution.x[j]) - leaf_solution.x[j]; pseudo_cost_sum_up[j] += change_in_obj / change_in_x; pseudo_cost_num_up[j]++; - // Should be valid if were are already here if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(shared_idx); } } - } else { - // Variable became reliable, make it as solved so that batch PDLP does not solve it again - if (rb_mode == 1) sb_view.mark_solved(shared_idx); + } else if (rb_mode == 1) { + sb_view.mark_solved(shared_idx); } pseudo_cost_mutex_up[j].unlock(); } @@ -1676,7 +1416,6 @@ i_t pseudo_costs_t::reliable_variable_selection( score = calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); - score_mutex.lock(); if (score > max_score) { max_score = score; @@ -1688,92 +1427,232 @@ i_t pseudo_costs_t::reliable_variable_selection( concurrent_halt.store(1); } - f_t dual_simplex_elapsed = toc(dual_simplex_start_time); - - // TODO put back - // if (rb_mode != 2) { - // if (rb_mode == 1) { - // log.printf( - // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped - // (PDLP) in %.2fs\n", num_candidates, dual_simplex_optimal.load(), num_candidates * 2, - // dual_simplex_infeasible.load(), num_candidates * 2, - // dual_simplex_failed.load(), num_candidates * 2, - // dual_simplex_skipped.load(), dual_simplex_elapsed); - // } else { - // log.printf( - // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in - // %.2fs\n", num_candidates, dual_simplex_optimal.load(), num_candidates * 2, - // dual_simplex_infeasible.load(), num_candidates * 2, dual_simplex_failed.load(), - // num_candidates * 2, dual_simplex_elapsed); - // } - //} - if (use_pdlp) { #pragma omp taskwait - - i_t pdlp_applied = 0; - i_t pdlp_optimal = 0; for (i_t i = 0; i < num_candidates; i++) { const i_t j = candidate_vars[i]; - // Down: check if PDLP should override DS if (!std::isnan(pdlp_obj_down[i])) { - pdlp_optimal++; const auto [merged_obj, source] = merge_sb_result( dual_simplex_obj_down[i], dual_simplex_status_down[i], pdlp_obj_down[i], true); - // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent - // calls may have made it reliable) if (source == sb_source_t::PDLP) { pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { - f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); - f_t change_in_x = leaf_solution.x[j] - std::floor(leaf_solution.x[j]); + const f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); + const f_t change_in_x = leaf_solution.x[j] - std::floor(leaf_solution.x[j]); pseudo_cost_sum_down[j] += change_in_obj / change_in_x; pseudo_cost_num_down[j]++; - pdlp_applied++; } pseudo_cost_mutex_down[j].unlock(); } } - // Up: check if PDLP should override DS if (!std::isnan(pdlp_obj_up[i])) { - pdlp_optimal++; const auto [merged_obj, source] = merge_sb_result( dual_simplex_obj_up[i], dual_simplex_status_up[i], pdlp_obj_up[i], true); - // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent - // calls may have made it reliable) if (source == sb_source_t::PDLP) { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { - f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); - f_t change_in_x = std::ceil(leaf_solution.x[j]) - leaf_solution.x[j]; + const f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); + const f_t change_in_x = std::ceil(leaf_solution.x[j]) - leaf_solution.x[j]; pseudo_cost_sum_up[j] += change_in_obj / change_in_x; pseudo_cost_num_up[j]++; - pdlp_applied++; } pseudo_cost_mutex_up[j].unlock(); } } - f_t score = + const f_t score = calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); if (score > max_score) { max_score = score; branch_var = j; } } - - log.printf("RB batch PDLP: %d candidates, %d/%d optimal, %d applied to pseudo-costs\n", - num_candidates, - pdlp_optimal, - num_candidates * 2, - pdlp_applied); } log.printf( "pc branching on %d. Value %e. Score %e\n", branch_var, leaf_solution.x[branch_var], max_score); + return branch_var; +} + +template +i_t reliable_variable_selection_core(mip_node_t* node_ptr, + const std::vector& fractional, + const std::vector& solution, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const lp_problem_t& leaf_problem, + const std::vector& edge_norms, + const basis_update_mpf_t& basis_factors, + const std::vector& basic_list, + const std::vector& nonbasic_list, + SumT* sum_down, + SumT* sum_up, + CountT* num_down, + CountT* num_up, + i_t n_vars, + SBIterT& strong_branching_lp_iter, + f_t upper_bound, + int64_t bnb_lp_iters, + int64_t bnb_nodes_explored, + f_t start_time, + const reliability_branching_settings_t& rb_settings, + int num_tasks, + omp_mutex_t* var_mutex_down, + omp_mutex_t* var_mutex_up, + pcgenerator_t* rng, + cuopt::work_limit_context_t* work_ctx, + const sb_update_callback_t& on_sb_update) +{ + constexpr f_t eps = 1e-6; + i_t branch_var = fractional[0]; + f_t max_score = -1; + + auto avgs = compute_pseudo_cost_averages(sum_down, sum_up, num_down, num_up, (size_t)n_vars); + const f_t pseudo_cost_down_avg = avgs.down_avg; + const f_t pseudo_cost_up_avg = avgs.up_avg; + + const i_t bnb_lp_iter_per_node = + bnb_nodes_explored > 0 ? (i_t)(bnb_lp_iters / bnb_nodes_explored) : 0; + + i_t reliable_threshold = settings.reliability_branching; + if (reliable_threshold < 0) { + const int64_t alpha = (int64_t)(rb_settings.bnb_lp_factor * bnb_lp_iters); + const int64_t max_reliability_iter = alpha + rb_settings.bnb_lp_offset; + + f_t iter_fraction = + (max_reliability_iter - strong_branching_lp_iter) / (strong_branching_lp_iter + 1.0); + iter_fraction = std::min(1.0, iter_fraction); + iter_fraction = std::max((alpha - strong_branching_lp_iter) / (strong_branching_lp_iter + 1.0), + iter_fraction); + reliable_threshold = (int)((1 - iter_fraction) * rb_settings.min_reliable_threshold + + iter_fraction * rb_settings.max_reliable_threshold); + reliable_threshold = strong_branching_lp_iter < max_reliability_iter ? reliable_threshold : 0; + } + std::vector unreliable_list; + for (i_t j : fractional) { + if (num_down[j] < reliable_threshold || num_up[j] < reliable_threshold) { + unreliable_list.push_back(j); + continue; + } + const f_t pc_down = num_down[j] > 0 ? sum_down[j] / num_down[j] : pseudo_cost_down_avg; + const f_t pc_up = num_up[j] > 0 ? sum_up[j] / num_up[j] : pseudo_cost_up_avg; + const f_t f_down = solution[j] - std::floor(solution[j]); + const f_t f_up = std::ceil(solution[j]) - solution[j]; + const f_t score = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); + if (score > max_score) { + max_score = score; + branch_var = j; + } + } + + if (unreliable_list.empty()) { + settings.log.debug( + "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score); + return branch_var; + } + + const i_t max_num_candidates = rb_settings.max_num_candidates; + const int task_priority = rb_settings.task_priority; + const i_t num_candidates = std::min(unreliable_list.size(), max_num_candidates); + + cuopt_assert(rng != nullptr, "rng must be provided for candidate shuffling"); + if (unreliable_list.size() > (size_t)max_num_candidates) { rng->shuffle(unreliable_list); } + if (toc(start_time) > settings.time_limit) { return branch_var; } + + omp_mutex_t score_mutex; + +#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \ + shared(score_mutex, strong_branching_lp_iter) + for (i_t i = 0; i < num_candidates; ++i) { + const i_t j = unreliable_list[i]; + if (toc(start_time) > settings.time_limit) { continue; } + + if (var_mutex_down) { var_mutex_down[j].lock(); } + if (num_down[j] < reliable_threshold) { + const f_t obj = trial_branching_generic(leaf_problem, + settings, + var_types, + node_ptr->vstatus, + edge_norms, + basis_factors, + basic_list, + nonbasic_list, + j, + leaf_problem.lower[j], + std::floor(solution[j]), + upper_bound, + bnb_lp_iter_per_node, + start_time, + rb_settings.upper_max_lp_iter, + rb_settings.lower_max_lp_iter, + strong_branching_lp_iter, + work_ctx); + if (!std::isnan(obj)) { + const f_t delta = + std::max(obj - node_ptr->lower_bound, eps) / (solution[j] - std::floor(solution[j])); + sum_down[j] += delta; + num_down[j]++; + if (on_sb_update) { on_sb_update(j, rounding_direction_t::DOWN, delta); } + } + } + if (var_mutex_down) { var_mutex_down[j].unlock(); } + + if (toc(start_time) > settings.time_limit) { continue; } + + if (var_mutex_up) { var_mutex_up[j].lock(); } + if (num_up[j] < reliable_threshold) { + const f_t obj = trial_branching_generic(leaf_problem, + settings, + var_types, + node_ptr->vstatus, + edge_norms, + basis_factors, + basic_list, + nonbasic_list, + j, + std::ceil(solution[j]), + leaf_problem.upper[j], + upper_bound, + bnb_lp_iter_per_node, + start_time, + rb_settings.upper_max_lp_iter, + rb_settings.lower_max_lp_iter, + strong_branching_lp_iter, + work_ctx); + if (!std::isnan(obj)) { + const f_t delta = + std::max(obj - node_ptr->lower_bound, eps) / (std::ceil(solution[j]) - solution[j]); + sum_up[j] += delta; + num_up[j]++; + if (on_sb_update) { on_sb_update(j, rounding_direction_t::UP, delta); } + } + } + if (var_mutex_up) { var_mutex_up[j].unlock(); } + + if (toc(start_time) > settings.time_limit) { continue; } + + const f_t pc_down = num_down[j] > 0 ? sum_down[j] / num_down[j] : pseudo_cost_down_avg; + const f_t pc_up = num_up[j] > 0 ? sum_up[j] / num_up[j] : pseudo_cost_up_avg; + const f_t f_down = solution[j] - std::floor(solution[j]); + const f_t f_up = std::ceil(solution[j]) - solution[j]; + const f_t score = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); + + score_mutex.lock(); + if (score > max_score) { + max_score = score; + branch_var = j; + } + score_mutex.unlock(); + } + + settings.log.debug("Reliability branching result: node=%d branch_var=%d value=%e score=%e\n", + node_ptr->node_id, + branch_var, + solution[branch_var], + max_score); return branch_var; } @@ -1783,24 +1662,20 @@ f_t pseudo_costs_t::obj_estimate(const std::vector& fractional, f_t lower_bound, logger_t& log) { - const i_t num_fractional = fractional.size(); - f_t estimate = lower_bound; - + f_t estimate = lower_bound; i_t num_initialized_down; i_t num_initialized_up; f_t pseudo_cost_down_avg; f_t pseudo_cost_up_avg; - initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg); for (i_t j : fractional) { - constexpr f_t eps = 1e-6; - i_t num_up = pseudo_cost_num_up[j]; - i_t num_down = pseudo_cost_num_down[j]; - f_t pc_up = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg; - f_t pc_down = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg; - f_t f_down = solution[j] - std::floor(solution[j]); - f_t f_up = std::ceil(solution[j]) - solution[j]; + const i_t num_up = pseudo_cost_num_up[j]; + const i_t num_down = pseudo_cost_num_down[j]; + const f_t pc_up = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg; + const f_t pc_down = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg; + const f_t f_down = solution[j] - std::floor(solution[j]); + const f_t f_up = std::ceil(solution[j]) - solution[j]; estimate += std::min(pc_down * f_down, pc_up * f_up); } @@ -1814,20 +1689,15 @@ void pseudo_costs_t::update_pseudo_costs_from_strong_branching( { for (i_t k = 0; k < fractional.size(); k++) { const i_t j = fractional[k]; - for (i_t branch = 0; branch < 2; branch++) { - if (branch == 0) { - f_t change_in_obj = strong_branch_down[k]; - if (std::isnan(change_in_obj)) { continue; } - f_t frac = root_soln[j] - std::floor(root_soln[j]); - pseudo_cost_sum_down[j] += change_in_obj / frac; - pseudo_cost_num_down[j]++; - } else { - f_t change_in_obj = strong_branch_up[k]; - if (std::isnan(change_in_obj)) { continue; } - f_t frac = std::ceil(root_soln[j]) - root_soln[j]; - pseudo_cost_sum_up[j] += change_in_obj / frac; - pseudo_cost_num_up[j]++; - } + if (!std::isnan(strong_branch_down[k])) { + const f_t frac = root_soln[j] - std::floor(root_soln[j]); + pseudo_cost_sum_down[j] += strong_branch_down[k] / frac; + pseudo_cost_num_down[j]++; + } + if (!std::isnan(strong_branch_up[k])) { + const f_t frac = std::ceil(root_soln[j]) - root_soln[j]; + pseudo_cost_sum_up[j] += strong_branch_up[k] / frac; + pseudo_cost_num_up[j]++; } } } @@ -1836,6 +1706,68 @@ void pseudo_costs_t::update_pseudo_costs_from_strong_branching( template class pseudo_costs_t; +template int reliable_variable_selection_core, + omp_atomic_t, + omp_atomic_t>( + mip_node_t*, + const std::vector&, + const std::vector&, + const simplex_solver_settings_t&, + const std::vector&, + const lp_problem_t&, + const std::vector&, + const basis_update_mpf_t&, + const std::vector&, + const std::vector&, + omp_atomic_t*, + omp_atomic_t*, + omp_atomic_t*, + omp_atomic_t*, + int, + omp_atomic_t&, + double, + int64_t, + int64_t, + double, + const reliability_branching_settings_t&, + int, + omp_mutex_t*, + omp_mutex_t*, + pcgenerator_t*, + cuopt::work_limit_context_t*, + const sb_update_callback_t&); + +template int reliable_variable_selection_core( + mip_node_t*, + const std::vector&, + const std::vector&, + const simplex_solver_settings_t&, + const std::vector&, + const lp_problem_t&, + const std::vector&, + const basis_update_mpf_t&, + const std::vector&, + const std::vector&, + double*, + double*, + int*, + int*, + int, + int64_t&, + double, + int64_t, + int64_t, + double, + const reliability_branching_settings_t&, + int, + omp_mutex_t*, + omp_mutex_t*, + pcgenerator_t*, + cuopt::work_limit_context_t*, + const sb_update_callback_t&); + template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, double start_time, @@ -1850,7 +1782,8 @@ template void strong_branching(const lp_problem_t& ori const std::vector& basic_list, const std::vector& nonbasic_list, basis_update_mpf_t& basis_factors, - pseudo_costs_t& pc); + pseudo_costs_t& pc, + cuopt::work_limit_context_t* work_unit_context); #endif diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 009bd8b81a..6393a8cd41 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -17,12 +17,14 @@ #include #include +#include #include #include #include #include +#include #include namespace cuopt::linear_programming::dual_simplex { @@ -357,6 +359,13 @@ class pseudo_cost_snapshot_t { } } + // Record an update that was already applied to the arrays (e.g. by strong branching). + void record_update( + i_t variable, rounding_direction_t direction, f_t delta, double clock, int worker_id) + { + updates_.push_back({variable, direction, delta, clock, worker_id}); + } + std::vector> take_updates() { std::vector> result; @@ -370,6 +379,7 @@ class pseudo_cost_snapshot_t { std::vector sum_up_; std::vector num_down_; std::vector num_up_; + int64_t strong_branching_lp_iter_{0}; private: std::vector> updates_; @@ -452,8 +462,10 @@ class pseudo_costs_t { nd[j] = pseudo_cost_num_down[j]; nu[j] = pseudo_cost_num_up[j]; } - return pseudo_cost_snapshot_t( - std::move(sd), std::move(su), std::move(nd), std::move(nu)); + auto snap = + pseudo_cost_snapshot_t(std::move(sd), std::move(su), std::move(nd), std::move(nu)); + snap.strong_branching_lp_iter_ = strong_branching_lp_iter.load(); + return snap; } void merge_updates(const std::vector>& updates) @@ -541,6 +553,44 @@ class pseudo_costs_t { batch_pdlp_warm_cache_t pdlp_warm_cache; }; +// Callback invoked after each strong-branching pseudocost discovery. +template +using sb_update_callback_t = + std::function; + +// Core reliability branching loop usable by both opportunistic and deterministic paths. +// When num_tasks == 1, runs serially with no locking (deterministic). +// When num_tasks > 1 with mutexes/rng, uses OMP taskloop (opportunistic). +// SumT/CountT can be f_t/i_t (deterministic snapshot) or omp_atomic_t/omp_atomic_t. +template +i_t reliable_variable_selection_core(mip_node_t* node_ptr, + const std::vector& fractional, + const std::vector& solution, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const lp_problem_t& leaf_problem, + const std::vector& edge_norms, + const basis_update_mpf_t& basis_factors, + const std::vector& basic_list, + const std::vector& nonbasic_list, + SumT* sum_down, + SumT* sum_up, + CountT* num_down, + CountT* num_up, + i_t n_vars, + SBIterT& strong_branching_lp_iter, + f_t upper_bound, + int64_t bnb_lp_iters, + int64_t bnb_nodes_explored, + f_t start_time, + const reliability_branching_settings_t& rb_settings, + int num_tasks, + omp_mutex_t* var_mutex_down, + omp_mutex_t* var_mutex_up, + pcgenerator_t* rng, + cuopt::work_limit_context_t* work_ctx = nullptr, + const sb_update_callback_t& on_sb_update = {}); + template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, @@ -556,6 +606,7 @@ void strong_branching(const lp_problem_t& original_lp, const std::vector& basic_list, const std::vector& nonbasic_list, basis_update_mpf_t& basis_factors, - pseudo_costs_t& pc); + pseudo_costs_t& pc, + cuopt::work_limit_context_t* work_unit_context = nullptr); } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp index 9c56ada50e..69ac7e43df 100644 --- a/cpp/src/dual_simplex/basis_updates.cpp +++ b/cpp/src/dual_simplex/basis_updates.cpp @@ -2202,7 +2202,7 @@ i_t basis_update_mpf_t::update(const sparse_vector_t& utilde // Ensure the workspace is sorted. Otherwise, the sparse dot will be incorrect. std::sort(xi_workspace_.begin() + m, xi_workspace_.begin() + m + nz, std::less()); - work_estimate_ += (m + nz) * std::log2(m + nz); + if (nz > 1) { work_estimate_ += (nz)*std::log2((f_t)(nz)); } // Gather the workspace into a column of S i_t S_start; @@ -2214,7 +2214,7 @@ i_t basis_update_mpf_t::update(const sparse_vector_t& utilde // Gather etilde into a column of S etilde.sort(); // Needs to be sorted for the sparse dot. TODO(CMM): Is etilde sorted on input? - work_estimate_ += etilde.i.size() * std::log2(etilde.i.size()); + if (etilde.i.size() > 1) { work_estimate_ += etilde.i.size() * std::log2((f_t)etilde.i.size()); } S_.append_column(etilde); work_estimate_ += 4 * etilde.i.size(); diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp index e30b067398..d9abc26fe1 100644 --- a/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp +++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.cpp @@ -235,7 +235,7 @@ void bound_flipping_ratio_test_t::heap_passes(const std::vector& // Remove minimum ratio from the heap and rebalance i_t heap_index = bare_idx.front(); std::pop_heap(bare_idx.begin(), bare_idx.end(), compare); - work_estimate_ += 2 * std::log2(bare_idx.size()); + if (bare_idx.size() > 1) { work_estimate_ += 2 * std::log2((f_t)bare_idx.size()); } bare_idx.pop_back(); nonbasic_entering = current_indicies[heap_index]; diff --git a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp index 244ff334df..4b62c66771 100644 --- a/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp +++ b/cpp/src/dual_simplex/bound_flipping_ratio_test.hpp @@ -100,7 +100,7 @@ class bound_flipping_ratio_test_t { i_t n_; i_t m_; - f_t work_estimate_; + f_t work_estimate_{0.0}; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index 5b1130796e..0e841fe22f 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -3551,7 +3551,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, phase2_work_estimate += ft.work_estimate(); ft.clear_work_estimate(); - work_unit_context->record_work_sync_on_horizon(phase2_work_estimate / 1e8); + if (work_unit_context) { + work_unit_context->record_work_sync_on_horizon(phase2_work_estimate / 1e8); + } phase2_work_estimate = 0.0; last_feature_log_iter = iter; diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index cfc120e477..9aea2f1648 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -7,6 +7,7 @@ #pragma once +#include #include #include @@ -113,7 +114,7 @@ struct simplex_solver_settings_t { reliability_branching(-1), inside_mip(0), sub_mip(0), - solution_callback(nullptr), + new_incumbent_callback(nullptr), heuristic_preemption_callback(nullptr), dual_simplex_objective_callback(nullptr), concurrent_halt(nullptr) @@ -202,6 +203,8 @@ struct simplex_solver_settings_t { // 0, 1 - Estimate the objective change using a single pivot of dual simplex // >1 - Set as the iteration limit in dual simplex i_t strong_branching_simplex_iteration_limit; + f_t bb_work_unit_scale{1.0}; + bool gpu_heur_wait_for_exploration{true}; diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics @@ -214,7 +217,9 @@ struct simplex_solver_settings_t { i_t inside_mip; // 0 if outside MIP, 1 if inside MIP at root node, 2 if inside MIP at leaf node i_t sub_mip; // 0 if in regular MIP solve, 1 if in sub-MIP solve - std::function&, f_t)> solution_callback; + std::function&, f_t, const cuopt::internals::mip_solution_callback_info_t&, double)> + new_incumbent_callback; std::function&, f_t)> node_processed_callback; std::function heuristic_preemption_callback; std::function&, std::vector&, f_t)> set_simplex_solution_callback; diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index c23b1d27ca..10026eb05e 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -113,6 +113,9 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_HYPER_HEURISTIC_INITIAL_INFEASIBILITY_WEIGHT, &mip_settings.heuristic_params.initial_infeasibility_weight, f_t(1e-9), std::numeric_limits::infinity(), f_t(1000.0), "constraint violation penalty seed"}, {CUOPT_MIP_HYPER_HEURISTIC_RELAXED_LP_TIME_LIMIT, &mip_settings.heuristic_params.relaxed_lp_time_limit, f_t(1e-9), std::numeric_limits::infinity(), f_t(1.0), "base relaxed LP time cap in heuristics"}, {CUOPT_MIP_HYPER_HEURISTIC_RELATED_VARS_TIME_LIMIT, &mip_settings.heuristic_params.related_vars_time_limit, f_t(1e-9), std::numeric_limits::infinity(), f_t(30.0), "time for related-variable structure build"}, + {CUOPT_MIP_HYPER_HEURISTIC_CPUFJ_WORK_UNIT_SCALE, &mip_settings.cpufj_work_unit_scale, f_t(0.0), std::numeric_limits::infinity(), f_t(1.0), "user multiplier on CPUFJ work-unit rate"}, + {CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WORK_UNIT_SCALE, &mip_settings.gpu_heur_work_unit_scale, f_t(0.0), std::numeric_limits::infinity(), f_t(1.0), "user multiplier on GPU heuristics work-unit rate"}, + {CUOPT_MIP_HYPER_HEURISTIC_BB_WORK_UNIT_SCALE, &mip_settings.bb_work_unit_scale, f_t(0.0), std::numeric_limits::infinity(), f_t(1.0), "user multiplier on B&B work-unit rate"}, }; // Int parameters @@ -142,7 +145,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT, &mip_settings.strong_branching_simplex_iteration_limit, -1,std::numeric_limits::max(), -1}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, - {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC}, + {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_DETERMINISM_NONE, CUOPT_DETERMINISM_FULL, CUOPT_DETERMINISM_NONE}, {CUOPT_RANDOM_SEED, &mip_settings.seed, -1, std::numeric_limits::max(), -1}, {CUOPT_MIP_RELIABILITY_BRANCHING, &mip_settings.reliability_branching, -1, std::numeric_limits::max(), -1}, {CUOPT_PDLP_PRECISION, reinterpret_cast(&pdlp_settings.pdlp_precision), CUOPT_PDLP_DEFAULT_PRECISION, CUOPT_PDLP_MIXED_PRECISION, CUOPT_PDLP_DEFAULT_PRECISION}, @@ -171,6 +174,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_ELIMINATE_DENSE_COLUMNS, &pdlp_settings.eliminate_dense_columns, true}, {CUOPT_CUDSS_DETERMINISTIC, &pdlp_settings.cudss_deterministic, false}, {CUOPT_DUAL_POSTSOLVE, &pdlp_settings.dual_postsolve, true}, + {CUOPT_MIP_HYPER_HEURISTIC_GPU_HEUR_WAIT_FOR_EXPLORATION, &mip_settings.gpu_heur_wait_for_exploration, false, "GPU heuristics wait for B&B root solve before starting"}, }; // String parameters string_parameters = { diff --git a/cpp/src/mip_heuristics/diversity/diversity_config.hpp b/cpp/src/mip_heuristics/diversity/diversity_config.hpp index dacf7773de..c27f857ba0 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_config.hpp +++ b/cpp/src/mip_heuristics/diversity/diversity_config.hpp @@ -26,6 +26,10 @@ struct diversity_config_t { double lp_run_time_if_feasible = 2.; double lp_run_time_if_infeasible = 1.; bool halve_population = false; + bool fj_only_run = false; + bool dry_run = false; + bool initial_solution_only = false; + int n_fp_iterations = 1000000; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu index b8dc3d33bf..b84043d773 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu +++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu @@ -5,7 +5,6 @@ */ /* clang-format on */ -#include "cuda_profiler_api.h" #include "diversity_manager.cuh" #include @@ -14,12 +13,21 @@ #include #include #include +#include #include +#include #include -#include +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif constexpr bool fj_only_run = false; @@ -55,7 +63,7 @@ diversity_manager_t::diversity_manager_t(mip_solver_context_thandle_ptr->get_stream()), ls(context, lp_optimal_solution), rins(context, *this), - timer(diversity_config.default_time_limit), + timer(0.0, cuopt::termination_checker_t::root_tag_t{}), bound_prop_recombiner(context, context.problem_ptr->n_variables, ls.constraint_prop, @@ -79,6 +87,30 @@ diversity_manager_t::diversity_manager_t(mip_solver_context_t::n_of_arms, cuopt::seed_generator::get_seed(), ls_alpha, "ls"), ls_hash_map(*context.problem_ptr) { + fp_recombiner_config_t::max_n_of_vars_from_other = + fp_recombiner_config_t::initial_n_of_vars_from_other; + ls_recombiner_config_t::max_n_of_vars_from_other = + ls_recombiner_config_t::initial_n_of_vars_from_other; + bp_recombiner_config_t::max_n_of_vars_from_other = + bp_recombiner_config_t::initial_n_of_vars_from_other; + sub_mip_recombiner_config_t::max_n_of_vars_from_other = + sub_mip_recombiner_config_t::initial_n_of_vars_from_other; + mab_ls_config_t::last_lm_config = 0; + mab_ls_config_t::last_ls_mab_option = 0; + + CUOPT_DETERMINISM_LOG( + "Deterministic solve start diversity state: seed_state=%lld fp_max=%zu " + "ls_max=%zu bp_max=%zu sub_mip_max=%zu last_lm=%d last_ls=%d " + "enabled_recombiners=%zu", + (long long)cuopt::seed_generator::peek_seed(), + fp_recombiner_config_t::max_n_of_vars_from_other, + ls_recombiner_config_t::max_n_of_vars_from_other, + bp_recombiner_config_t::max_n_of_vars_from_other, + sub_mip_recombiner_config_t::max_n_of_vars_from_other, + (int)mab_ls_config_t::last_lm_config, + (int)mab_ls_config_t::last_ls_mab_option, + recombiner_t::enabled_recombiners.size()); + int max_config = -1; int env_config_id = -1; const char* env_max_config = std::getenv("CUOPT_MAX_CONFIG"); @@ -106,6 +138,9 @@ diversity_manager_t::diversity_manager_t(mip_solver_context_t @@ -153,7 +188,7 @@ void diversity_manager_t::consume_staged_simplex_solution(lp_state_t bool diversity_manager_t::run_local_search(solution_t& solution, const weight_t& weights, - timer_t& timer, + work_limit_timer_t& timer, ls_config_t& ls_config) { raft::common::nvtx::range fun_scope("run_local_search"); @@ -174,7 +209,7 @@ void diversity_manager_t::generate_solution(f_t time_limit, bool rando sol.compute_feasibility(); // if a feasible is found, it is added to the population ls.generate_solution(sol, random_start, &population, time_limit); - population.add_solution(std::move(sol)); + population.add_solution(std::move(sol), internals::mip_solution_origin_t::LOCAL_SEARCH); } template @@ -187,7 +222,12 @@ void diversity_manager_t::add_user_given_solutions( rmm::device_uvector init_sol_assignment(*init_sol, sol.handle_ptr->get_stream()); if (problem_ptr->pre_process_assignment(init_sol_assignment)) { relaxed_lp_settings_t lp_settings; - lp_settings.time_limit = std::min(60., timer.remaining_time() / 2); + lp_settings.time_limit = std::min(60., timer.remaining_time() / 2); + if (timer.deterministic) { + lp_settings.work_limit = lp_settings.time_limit; + lp_settings.work_context = timer.work_context; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + } lp_settings.tolerance = problem_ptr->tolerances.absolute_tolerance; lp_settings.save_state = false; lp_settings.return_first_feasible = true; @@ -206,7 +246,9 @@ void diversity_manager_t::add_user_given_solutions( is_feasible, sol.get_user_objective(), sol.get_total_excess()); - population.run_solution_callbacks(sol); + if (is_feasible) { + population.run_solution_callbacks(sol, internals::mip_solution_origin_t::USER_INITIAL); + } initial_sol_vector.emplace_back(std::move(sol)); } else { CUOPT_LOG_ERROR( @@ -220,11 +262,13 @@ void diversity_manager_t::add_user_given_solutions( } template -bool diversity_manager_t::run_presolve(f_t time_limit, timer_t global_timer) +bool diversity_manager_t::run_presolve(f_t time_limit, + cuopt::termination_checker_t& global_timer) { raft::common::nvtx::range fun_scope("run_presolve"); CUOPT_LOG_INFO("Running presolve!"); - timer_t presolve_timer(time_limit); + CUOPT_LOG_INFO("Problem fingerprint before DM presolve: 0x%x", problem_ptr->get_fingerprint()); + work_limit_timer_t presolve_timer(context.gpu_heur_loop, time_limit, *context.termination); auto term_crit = ls.constraint_prop.bounds_update.solve(*problem_ptr); if (ls.constraint_prop.bounds_update.infeas_constraints_count > 0) { @@ -234,15 +278,17 @@ bool diversity_manager_t::run_presolve(f_t time_limit, timer_t global_ if (termination_criterion_t::NO_UPDATE != term_crit) { ls.constraint_prop.bounds_update.set_updated_bounds(*problem_ptr); } + bool run_probing_cache = !fj_only_run; - // Don't run probing cache in deterministic mode yet as neither B&B nor CPUFJ need it - // and it doesn't make use of work units yet - if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { run_probing_cache = false; } if (run_probing_cache) { // Run probing cache before trivial presolve to discover variable implications - const f_t max_time_on_probing = diversity_config.max_time_on_probing; - f_t time_for_probing_cache = std::min(max_time_on_probing, time_limit); - timer_t probing_timer{time_for_probing_cache}; + const f_t max_time_on_probing = + (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) + ? std::numeric_limits::infinity() + : diversity_config.max_time_on_probing; + f_t time_for_probing_cache = std::min(max_time_on_probing, time_limit); + work_limit_timer_t probing_timer( + context.gpu_heur_loop, time_for_probing_cache, *context.termination); // this function computes probing cache, finds singletons, substitutions and changes the problem bool problem_is_infeasible = compute_probing_cache(ls.constraint_prop.bounds_update, *problem_ptr, probing_timer); @@ -252,8 +298,10 @@ bool diversity_manager_t::run_presolve(f_t time_limit, timer_t global_ problem_ptr->related_vars_time_limit = context.settings.heuristic_params.related_vars_time_limit; if (!global_timer.check_time_limit()) { trivial_presolve(*problem_ptr, remap_cache_ids); } if (!problem_ptr->empty && !check_bounds_sanity(*problem_ptr)) { return false; } - // if (!presolve_timer.check_time_limit() && !context.settings.heuristics_only && - // !problem_ptr->empty) { + const bool run_clique_table = + !presolve_timer.check_time_limit() && !context.settings.heuristics_only && + !problem_ptr->empty && !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); + // if (run_clique_table) { // f_t time_limit_for_clique_table = std::min(3., presolve_timer.remaining_time() / 5); // timer_t clique_timer(time_limit_for_clique_table); // dual_simplex::user_problem_t host_problem(problem_ptr->handle_ptr); @@ -292,6 +340,10 @@ bool diversity_manager_t::run_presolve(f_t time_limit, timer_t global_ } stats.presolve_time = presolve_timer.elapsed_time(); lp_optimal_solution.resize(problem_ptr->n_variables, problem_ptr->handle_ptr->get_stream()); + thrust::fill(problem_ptr->handle_ptr->get_thrust_policy(), + lp_optimal_solution.begin(), + lp_optimal_solution.end(), + f_t(0)); lp_dual_optimal_solution.resize(problem_ptr->n_constraints, problem_ptr->handle_ptr->get_stream()); problem_ptr->handle_ptr->sync_stream(); @@ -299,7 +351,9 @@ bool diversity_manager_t::run_presolve(f_t time_limit, timer_t global_ problem_ptr->n_constraints, problem_ptr->n_variables, problem_ptr->presolve_data.objective_offset); - CUOPT_LOG_INFO("cuOpt presolve time: %.2f", stats.presolve_time); + CUOPT_LOG_INFO("cuOpt presolve time: %.2f, fingerprint: 0x%x", + stats.presolve_time, + problem_ptr->get_fingerprint()); return true; } @@ -311,24 +365,25 @@ void diversity_manager_t::generate_quick_feasible_solution() // min 1 second, max 10 seconds const f_t generate_fast_solution_time = std::min(diversity_config.max_fast_sol_time, std::max(1., timer.remaining_time() / 20.)); - timer_t sol_timer(generate_fast_solution_time); + work_limit_timer_t sol_timer( + context.gpu_heur_loop, generate_fast_solution_time, *context.termination); // do very short LP run to get somewhere close to the optimal point ls.generate_fast_solution(solution, sol_timer); if (solution.get_feasible()) { - population.run_solution_callbacks(solution); initial_sol_vector.emplace_back(std::move(solution)); problem_ptr->handle_ptr->sync_stream(); solution_t searched_sol(initial_sol_vector.back()); ls_config_t ls_config; run_local_search(searched_sol, population.weights, sol_timer, ls_config); - population.run_solution_callbacks(searched_sol); initial_sol_vector.emplace_back(std::move(searched_sol)); auto& feas_sol = initial_sol_vector.back().get_feasible() ? initial_sol_vector.back() : initial_sol_vector[initial_sol_vector.size() - 2]; - CUOPT_LOG_INFO("Generated fast solution in %f seconds with objective %f", + population.run_solution_callbacks(feas_sol, internals::mip_solution_origin_t::LOCAL_SEARCH); + CUOPT_LOG_INFO("Generated fast solution in %f seconds with objective %f, hash 0x%x", timer.elapsed_time(), - feas_sol.get_user_objective()); + feas_sol.get_user_objective(), + feas_sol.get_hash()); } problem_ptr->handle_ptr->sync_stream(); } @@ -366,8 +421,29 @@ void diversity_manager_t::run_fp_alone() { CUOPT_LOG_DEBUG("Running FP alone!"); solution_t sol(population.best_feasible()); - ls.run_fp(sol, timer, &population); - CUOPT_LOG_DEBUG("FP alone finished!"); + CUOPT_DETERMINISM_LOG( + "Deterministic FP alone input: hash=0x%x feasible=%d obj=%.16e excess=%.16e", + sol.get_hash(), + (int)sol.get_feasible(), + sol.get_user_objective(), + sol.get_total_excess()); + ls.run_fp(sol, timer, &population, diversity_config.n_fp_iterations); + CUOPT_DETERMINISM_LOG( + "Deterministic FP alone output: hash=0x%x feasible=%d obj=%.16e excess=%.16e", + sol.get_hash(), + (int)sol.get_feasible(), + sol.get_user_objective(), + sol.get_total_excess()); + if (sol.get_feasible()) { + population.add_solution(std::move(sol), internals::mip_solution_origin_t::LOCAL_SEARCH); + } + auto& best_sol = population.best_feasible(); + CUOPT_DETERMINISM_LOG( + "Deterministic FP alone population best after: hash=0x%x feasible=%d obj=%.16e excess=%.16e", + best_sol.get_hash(), + (int)best_sol.get_feasible(), + best_sol.get_user_objective(), + best_sol.get_total_excess()); } template @@ -384,17 +460,38 @@ solution_t diversity_manager_t::run_solver() raft::common::nvtx::range fun_scope("run_solver"); CUOPT_LOG_DEBUG("Determinism mode: %s", - context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC ? "deterministic" - : "opportunistic"); + (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) + ? "deterministic" + : "opportunistic"); // to automatically compute the solving time on scope exit auto timer_raii_guard = cuopt::scope_guard([&]() { stats.total_solve_time = timer.elapsed_time(); }); + auto log_return_solution = [&](const char* reason, solution_t& sol) { + CUOPT_DETERMINISM_LOG( + "Deterministic run_solver return: reason=%s hash=0x%x feasible=%d " + "obj=%.16e excess=%.16e", + reason, + sol.get_hash(), + (int)sol.get_feasible(), + sol.get_user_objective(), + sol.get_total_excess()); + }; - // Debug: Allow disabling GPU heuristics to test B&B tree determinism in isolation + const bool deterministic_bb_without_deterministic_heuristics = + (context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); const char* disable_heuristics_env = std::getenv("CUOPT_DISABLE_GPU_HEURISTICS"); - if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { - CUOPT_LOG_INFO("Running deterministic mode with CPUFJ heuristic"); + if (deterministic_bb_without_deterministic_heuristics || + (disable_heuristics_env != nullptr && std::string(disable_heuristics_env) == "1")) { + CUOPT_LOG_INFO("GPU heuristics disabled (det_bb_only=%d env=%s)", + (int)deterministic_bb_without_deterministic_heuristics, + disable_heuristics_env ? disable_heuristics_env : "unset"); + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + context.branch_and_bound_ptr != nullptr) { + auto& producer_sync = context.branch_and_bound_ptr->get_producer_sync(); + producer_sync.registration_complete(); + } population.initialize_population(); population.allocate_solutions(); @@ -412,21 +509,38 @@ solution_t diversity_manager_t::run_solver() ls.stop_cpufj_deterministic(); population.add_external_solutions_to_population(); - return population.best_feasible(); + auto& best_sol = population.best_feasible(); + log_return_solution("heuristics_disabled", best_sol); + return best_sol; } - if (disable_heuristics_env != nullptr && std::string(disable_heuristics_env) == "1") { - CUOPT_LOG_INFO("GPU heuristics disabled via CUOPT_DISABLE_GPU_HEURISTICS=1"); - population.initialize_population(); - population.allocate_solutions(); - while (!check_b_b_preemption()) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + bool gpu_heuristic_producer_registered = false; + auto gpu_heuristic_producer_guard = cuopt::scope_guard([&]() { + if (!gpu_heuristic_producer_registered || context.branch_and_bound_ptr == nullptr) { return; } + auto& producer_sync = context.branch_and_bound_ptr->get_producer_sync(); + producer_sync.deregister_producer(context.gpu_heur_loop.producer_progress_ptr()); + context.gpu_heur_loop.detach_producer_sync(); + }); + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + context.branch_and_bound_ptr != nullptr) { + if (context.settings.gpu_heur_wait_for_exploration) { + CUOPT_LOG_INFO("GPU heuristics waiting for B&B tree exploration to start..."); + auto wait_start = std::chrono::high_resolution_clock::now(); + context.branch_and_bound_ptr->wait_for_exploration_start(); + double wait_elapsed = + std::chrono::duration(std::chrono::high_resolution_clock::now() - wait_start) + .count(); + CUOPT_LOG_INFO("GPU heuristics resumed after %.2fs (B&B exploration started)", wait_elapsed); } - return population.best_feasible(); + auto& producer_sync = context.branch_and_bound_ptr->get_producer_sync(); + context.gpu_heur_loop.attach_producer_sync(&producer_sync); + producer_sync.register_producer(context.gpu_heur_loop.producer_progress_ptr()); + producer_sync.registration_complete(); + gpu_heuristic_producer_registered = true; } population.timer = timer; - const f_t time_limit = timer.remaining_time(); + const f_t time_limit = timer.deterministic ? timer.get_time_limit() : timer.remaining_time(); const auto& hp = context.settings.heuristic_params; const f_t lp_time_limit = std::min(hp.root_lp_max_time, time_limit * hp.root_lp_time_ratio); // after every change to the problem, we should resize all the relevant vars @@ -438,7 +552,7 @@ solution_t diversity_manager_t::run_solver() // have the structure ready for reusing later problem_ptr->compute_integer_fixed_problem(); recombiner_t::init_enabled_recombiners( - *problem_ptr, context.settings.heuristic_params.enabled_recombiners); + context, *problem_ptr, context.settings.heuristic_params.enabled_recombiners); mab_recombiner.resize_mab_arm_stats(recombiner_t::enabled_recombiners.size()); // test problem is not ii cuopt_func_call( @@ -448,13 +562,27 @@ solution_t diversity_manager_t::run_solver() "The problem must not be ii"); population.initialize_population(); population.allocate_solutions(); - if (check_b_b_preemption()) { return population.best_feasible(); } + if (check_b_b_preemption()) { + auto& best_sol = population.best_feasible(); + log_return_solution("preempted_after_population_init", best_sol); + return best_sol; + } add_user_given_solutions(initial_sol_vector); + CUOPT_DETERMINISM_LOG("DM bootstrap: initial_sol_vector size after user solutions = %lu", + initial_sol_vector.size()); // Run CPUFJ early to find quick initial solutions ls_cpufj_raii_guard_t ls_cpufj_raii_guard(ls); // RAII to stop cpufj threads on solve stop - ls.start_cpufj_scratch_threads(population); - if (check_b_b_preemption()) { return population.best_feasible(); } + if (!diversity_config.dry_run && + !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + ls.start_cpufj_scratch_threads(population); + } + + if (check_b_b_preemption()) { + auto& best_sol = population.best_feasible(); + log_return_solution("preempted_before_lp", best_sol); + return best_sol; + } lp_state_t& lp_state = problem_ptr->lp_state; // resize because some constructor might be called before the presolve lp_state.resize(*problem_ptr, problem_ptr->handle_ptr->get_stream()); @@ -462,30 +590,59 @@ solution_t diversity_manager_t::run_solver() if (bb_thread_solution_exists) { consume_staged_simplex_solution(lp_state); ls.lp_optimal_exists = true; - } else if (!fj_only_run) { + } else if (!diversity_config.fj_only_run) { convert_greater_to_less(*problem_ptr); f_t absolute_tolerance = context.settings.tolerances.absolute_tolerance; + f_t tolerance_divisor = + problem_ptr->tolerances.absolute_tolerance / problem_ptr->tolerances.relative_tolerance; + if (tolerance_divisor == 0) { tolerance_divisor = 1; } - pdlp_solver_settings_t pdlp_settings{}; - pdlp_settings.tolerances.absolute_dual_tolerance = absolute_tolerance; - pdlp_settings.tolerances.relative_dual_tolerance = - context.settings.tolerances.relative_tolerance; - pdlp_settings.tolerances.absolute_primal_tolerance = absolute_tolerance; - pdlp_settings.tolerances.relative_primal_tolerance = - context.settings.tolerances.relative_tolerance; - pdlp_settings.time_limit = lp_time_limit; - pdlp_settings.first_primal_feasible = false; - pdlp_settings.concurrent_halt = &global_concurrent_halt; - pdlp_settings.method = method_t::Concurrent; - pdlp_settings.inside_mip = true; - pdlp_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable2; - pdlp_settings.num_gpus = context.settings.num_gpus; - pdlp_settings.presolver = presolver_t::None; - pdlp_settings.per_constraint_residual = true; - set_pdlp_solver_mode(pdlp_settings); - timer_t lp_timer(lp_time_limit); - auto lp_result = solve_lp_with_method(*problem_ptr, pdlp_settings, lp_timer); + auto lp_result = [&]() { + // no concurrent root solve in determinism mode, reuse the work-accounted relaxed_lp machinery + // for this + if (timer.deterministic) { + relaxed_lp_settings_t lp_settings{}; + lp_settings.time_limit = lp_time_limit; + lp_settings.work_limit = lp_time_limit; + lp_settings.tolerance = absolute_tolerance; + lp_settings.check_infeasibility = true; + lp_settings.return_first_feasible = false; + lp_settings.save_state = true; + lp_settings.per_constraint_residual = true; + lp_settings.has_initial_primal = false; + lp_settings.concurrent_halt = &global_concurrent_halt; + lp_settings.work_context = &context.gpu_heur_loop; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + CUOPT_DETERMINISM_LOG( + "DM root LP config: dry_run=%d deterministic=%d work_limit=%.6f time_limit=%.6f", + (int)diversity_config.dry_run, + (int)timer.deterministic, + lp_settings.work_limit, + lp_settings.time_limit); + return get_relaxed_lp_solution( + *problem_ptr, lp_optimal_solution, lp_state, lp_settings); + } + pdlp_solver_settings_t pdlp_settings{}; + pdlp_settings.tolerances.relative_primal_tolerance = absolute_tolerance / tolerance_divisor; + pdlp_settings.tolerances.relative_dual_tolerance = absolute_tolerance / tolerance_divisor; + pdlp_settings.time_limit = lp_time_limit; + pdlp_settings.first_primal_feasible = false; + pdlp_settings.concurrent_halt = &global_concurrent_halt; + pdlp_settings.method = method_t::Concurrent; + pdlp_settings.inside_mip = true; + pdlp_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable2; + pdlp_settings.num_gpus = context.settings.num_gpus; + pdlp_settings.presolver = presolver_t::None; + timer_t lp_timer(lp_time_limit); + return solve_lp_with_method(*problem_ptr, pdlp_settings, lp_timer); + }(); + CUOPT_DETERMINISM_LOG( + "DM root LP result: status=%d iters=%d user_obj=%.12f primal_hash=0x%x", + (int)lp_result.get_termination_status(), + lp_result.get_additional_termination_information().number_of_steps_taken, + lp_result.get_objective_value(), + detail::compute_hash(lp_result.get_primal_solution(), problem_ptr->handle_ptr->get_stream())); bool use_staged_simplex_solution = false; { @@ -527,9 +684,11 @@ solution_t diversity_manager_t::run_solver() } else if (lp_result.get_termination_status() == pdlp_termination_status_t::DualInfeasible) { CUOPT_LOG_ERROR("PDLP detected dual infeasibility, continuing anyway!"); ls.lp_optimal_exists = false; - } else if (lp_result.get_termination_status() == pdlp_termination_status_t::TimeLimit) { + } else if (lp_result.get_termination_status() == pdlp_termination_status_t::TimeLimit || + lp_result.get_termination_status() == pdlp_termination_status_t::IterationLimit) { CUOPT_LOG_DEBUG( - "Initial LP run exceeded time limit, continuing solver with partial LP result!"); + "Initial LP run exceeded time/iteration limit, continuing solver with partial LP " + "result!"); // note to developer, in debug mode the LP run might be too slow and it might cause PDLP // not to bring variables within the bounds } @@ -573,50 +732,106 @@ solution_t diversity_manager_t::run_solver() if (!use_staged_simplex_solution) { // in case the pdlp returned var boudns that are out of bounds clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr); + CUOPT_DETERMINISM_LOG( + "DM root LP post-clamp: lp_optimal_solution hash=0x%x", + detail::compute_hash(lp_optimal_solution, problem_ptr->handle_ptr->get_stream())); } } if (ls.lp_optimal_exists) { solution_t lp_rounded_sol(*problem_ptr); lp_rounded_sol.copy_new_assignment(lp_optimal_solution); + CUOPT_DETERMINISM_LOG("DM bootstrap candidate (LP raw): hash=0x%x feas=%d obj=%.12f", + lp_rounded_sol.get_hash(), + (int)lp_rounded_sol.get_feasible(), + lp_rounded_sol.get_user_objective()); lp_rounded_sol.round_nearest(); lp_rounded_sol.compute_feasibility(); - population.add_solution(std::move(lp_rounded_sol)); - ls.start_cpufj_lptopt_scratch_threads(population); + CUOPT_DETERMINISM_LOG("DM bootstrap candidate (LP rounded): hash=0x%x feas=%d obj=%.12f", + lp_rounded_sol.get_hash(), + (int)lp_rounded_sol.get_feasible(), + lp_rounded_sol.get_user_objective()); + population.add_solution(std::move(lp_rounded_sol), + internals::mip_solution_origin_t::LP_ROUNDING); + if (!diversity_config.dry_run && + !(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + ls.start_cpufj_lptopt_scratch_threads(population); + } } - population.add_solutions_from_vec(std::move(initial_sol_vector)); + for (size_t i = 0; i < initial_sol_vector.size(); ++i) { + CUOPT_DETERMINISM_LOG( + "DM bootstrap candidate (initial_sol_vector[%lu]): hash=0x%x feas=%d obj=%.12f", + i, + initial_sol_vector[i].get_hash(), + (int)initial_sol_vector[i].get_feasible(), + initial_sol_vector[i].get_user_objective()); + } + population.add_solutions_from_vec(std::move(initial_sol_vector), + internals::mip_solution_origin_t::USER_INITIAL); - if (check_b_b_preemption()) { return population.best_feasible(); } + if (check_b_b_preemption()) { + auto& best_sol = population.best_feasible(); + log_return_solution("preempted_after_initial_population", best_sol); + return best_sol; + } if (context.settings.benchmark_info_ptr != nullptr) { context.settings.benchmark_info_ptr->objective_of_initial_population = population.best_feasible().get_user_objective(); } - if (fj_only_run) { + if (diversity_config.dry_run) { + auto& best_sol = population.best_feasible(); + log_return_solution("dry_run", best_sol); + return best_sol; + } + if (diversity_config.fj_only_run) { solution_t sol(*problem_ptr); run_fj_alone(sol); + log_return_solution("fj_only_run", sol); return sol; } - rins.enable(); + // RINS not supported in deterministic mode yet + if (!(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { rins.enable(); } generate_solution(timer.remaining_time(), false); + if (diversity_config.initial_solution_only) { + auto& best_sol = population.best_feasible(); + log_return_solution("initial_solution_only", best_sol); + return best_sol; + } if (timer.check_time_limit()) { rins.stop_rins(); population.add_external_solutions_to_population(); - return population.best_feasible(); + auto& best_sol = population.best_feasible(); + log_return_solution("work_limit_reached", best_sol); + return best_sol; } if (check_b_b_preemption()) { rins.stop_rins(); population.add_external_solutions_to_population(); - return population.best_feasible(); + auto& best_sol = population.best_feasible(); + log_return_solution("preempted_before_fp", best_sol); + return best_sol; } + CUOPT_LOG_DEBUG("pre-run_fp_alone: gpu_work=%g gpu_prod=%g", + context.gpu_heur_loop.current_work(), + context.gpu_heur_loop.current_producer_work()); run_fp_alone(); + CUOPT_LOG_DEBUG("post-run_fp_alone: gpu_work=%g gpu_prod=%g", + context.gpu_heur_loop.current_work(), + context.gpu_heur_loop.current_producer_work()); rins.stop_rins(); population.add_external_solutions_to_population(); - return population.best_feasible(); + auto& best_sol = population.best_feasible(); + CUOPT_LOG_DEBUG("post-fp handoff: feas=%d obj=%g hash=0x%x", + (int)best_sol.get_feasible(), + best_sol.get_user_objective(), + best_sol.get_hash()); + log_return_solution("post_fp_alone", best_sol); + return best_sol; }; template @@ -641,8 +856,10 @@ void diversity_manager_t::diversity_step(i_t max_iterations_without_im auto [sol1, sol2] = population.get_two_random(tournament); cuopt_assert(population.test_invariant(), ""); auto [lp_offspring, offspring] = recombine_and_local_search(sol1, sol2); - auto [inserted_pos_1, best_updated_1] = population.add_solution(std::move(lp_offspring)); - auto [inserted_pos_2, best_updated_2] = population.add_solution(std::move(offspring)); + auto [inserted_pos_1, best_updated_1] = population.add_solution( + std::move(lp_offspring), internals::mip_solution_origin_t::RECOMBINATION); + auto [inserted_pos_2, best_updated_2] = population.add_solution( + std::move(offspring), internals::mip_solution_origin_t::RECOMBINATION); if (best_updated_1 || best_updated_2) { recombine_stats.add_best_updated(); } cuopt_assert(population.test_invariant(), ""); if ((inserted_pos_1 != -1 && inserted_pos_1 <= 2) || @@ -684,10 +901,12 @@ void diversity_manager_t::recombine_and_ls_with_all(solution_t::recombine_and_ls_with_all(solution_t void diversity_manager_t::recombine_and_ls_with_all( - std::vector>& solutions, bool add_only_feasible) + std::vector::drained_external_solution_t>& solutions, + bool add_only_feasible) { raft::common::nvtx::range fun_scope("recombine_and_ls_with_all"); if (solutions.size() > 0) { CUOPT_LOG_DEBUG("Running recombiners on B&B solutions with size %lu", solutions.size()); // add all solutions because time limit might have been consumed and we might have exited before - for (auto& sol : solutions) { + for (auto& drained_sol : solutions) { + auto& sol = drained_sol.solution; cuopt_func_call(sol.test_feasibility(true)); - population.add_solution(std::move(solution_t(sol))); + population.add_solution(std::move(solution_t(sol)), drained_sol.origin); } - for (auto& sol : solutions) { + for (auto& drained_sol : solutions) { + auto& sol = drained_sol.solution; if (timer.check_time_limit()) { return; } solution_t ls_solution(sol); ls_config_t ls_config; @@ -759,6 +981,7 @@ diversity_manager_t::recombine_and_local_search(solution_t& sol1.get_feasible(), sol2.get_quality(population.weights), sol2.get_feasible()); + bool deterministic = (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); double best_objective_of_parents = std::min(sol1.get_objective(), sol2.get_objective()); bool at_least_one_parent_feasible = sol1.get_feasible() || sol2.get_feasible(); // randomly choose among 3 recombiners @@ -769,7 +992,7 @@ diversity_manager_t::recombine_and_local_search(solution_t& std::numeric_limits::lowest(), std::numeric_limits::lowest(), std::numeric_limits::max(), - recombiner_work_normalized_reward_t(0.0)); + recombiner_work_normalized_reward_t(deterministic, 0.0)); return std::make_pair(solution_t(sol1), solution_t(sol2)); } cuopt_assert(population.test_invariant(), ""); @@ -789,7 +1012,7 @@ diversity_manager_t::recombine_and_local_search(solution_t& std::numeric_limits::lowest(), std::numeric_limits::lowest(), std::numeric_limits::max(), - recombiner_work_normalized_reward_t(0.0)); + recombiner_work_normalized_reward_t(deterministic, 0.0)); return std::make_pair(solution_t(sol1), solution_t(sol2)); } cuopt_assert(offspring.test_number_all_integer(), "All must be integers after LS"); @@ -807,7 +1030,12 @@ diversity_manager_t::recombine_and_local_search(solution_t& : diversity_config.lp_run_time_if_infeasible; lp_run_time = std::min(lp_run_time, timer.remaining_time()); relaxed_lp_settings_t lp_settings; - lp_settings.time_limit = lp_run_time; + lp_settings.time_limit = lp_run_time; + if (timer.deterministic) { + lp_settings.work_limit = lp_settings.time_limit; + lp_settings.work_context = timer.work_context; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + } lp_settings.tolerance = context.settings.tolerances.absolute_tolerance; lp_settings.return_first_feasible = false; lp_settings.save_state = true; @@ -828,12 +1056,15 @@ diversity_manager_t::recombine_and_local_search(solution_t& offspring_qual, sol1.get_quality(population.weights), sol2.get_quality(population.weights)); f_t best_quality_of_parents = std::min(sol1.get_quality(population.weights), sol2.get_quality(population.weights)); - mab_recombiner.add_mab_reward( - mab_recombiner.last_chosen_option, - best_quality_of_parents, - population.best().get_quality(population.weights), - offspring_qual, - recombiner_work_normalized_reward_t(recombine_stats.get_last_recombiner_time())); + mab_recombiner.add_mab_reward(mab_recombiner.last_chosen_option, + best_quality_of_parents, + population.best().get_quality(population.weights), + offspring_qual, + !deterministic + ? recombiner_work_normalized_reward_t( + deterministic, recombine_stats.get_last_recombiner_time()) + : recombiner_work_normalized_reward_t( + deterministic, recombine_stats.get_last_recombiner_work())); mab_ls.add_mab_reward(mab_ls_config_t::last_ls_mab_option, best_quality_of_parents, population.best_feasible().get_quality(population.weights), @@ -878,31 +1109,50 @@ std::pair, bool> diversity_manager_t::recombine( } } } + CUOPT_DETERMINISM_LOG( + "Deterministic recombiner selection: requested=%s selected_index=%d chosen=%s " + "enabled_size=%zu last_choice_before=%d current_seed=%d", + recombiner_t::recombiner_name(recombiner_type), + (int)selected_index, + recombiner_t::recombiner_name(recombiner), + recombiner_t::enabled_recombiners.size(), + mab_recombiner.last_chosen_option, + (unsigned int)cuopt::seed_generator::get_seed()); mab_recombiner.set_last_chosen_option(selected_index); recombine_stats.add_attempt((recombiner_enum_t)recombiner); recombine_stats.start_recombiner_time(); + CUOPT_DETERMINISM_LOG("Recombining sol %x and %x with recombiner %d, weights %x", + a.get_hash(), + b.get_hash(), + recombiner, + population.weights.get_hash()); + // Refactored code using a switch statement switch (recombiner) { case recombiner_enum_t::BOUND_PROP: { - auto [sol, success] = bound_prop_recombiner.recombine(a, b, population.weights); + auto [sol, success, work] = bound_prop_recombiner.recombine(a, b, population.weights); + recombine_stats.set_recombiner_work(work); recombine_stats.stop_recombiner_time(); if (success) { recombine_stats.add_success(); } return std::make_pair(sol, success); } case recombiner_enum_t::FP: { - auto [sol, success] = fp_recombiner.recombine(a, b, population.weights); + auto [sol, success, work] = fp_recombiner.recombine(a, b, population.weights); + recombine_stats.set_recombiner_work(work); recombine_stats.stop_recombiner_time(); if (success) { recombine_stats.add_success(); } return std::make_pair(sol, success); } case recombiner_enum_t::LINE_SEGMENT: { - auto [sol, success] = line_segment_recombiner.recombine(a, b, population.weights); + auto [sol, success, work] = line_segment_recombiner.recombine(a, b, population.weights); + recombine_stats.set_recombiner_work(work); recombine_stats.stop_recombiner_time(); if (success) { recombine_stats.add_success(); } return std::make_pair(sol, success); } case recombiner_enum_t::SUB_MIP: { - auto [sol, success] = sub_mip_recombiner.recombine(a, b, population.weights); + auto [sol, success, work] = sub_mip_recombiner.recombine(a, b, population.weights); + recombine_stats.set_recombiner_work(work); recombine_stats.stop_recombiner_time(); if (success) { recombine_stats.add_success(); } return std::make_pair(sol, success); diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh index 863933de48..e1c50562d7 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_manager.cuh +++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cuh @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -35,7 +36,7 @@ template class diversity_manager_t { public: diversity_manager_t(mip_solver_context_t& context); - bool run_presolve(f_t time_limit, timer_t global_timer); + bool run_presolve(f_t time_limit, cuopt::termination_checker_t& global_timer); solution_t run_solver(); void generate_solution(f_t time_limit, bool random_start = true); void run_fj_alone(solution_t& solution); @@ -50,8 +51,9 @@ class diversity_manager_t { void diversity_step(i_t max_iterations_without_improvement); void add_user_given_solutions(std::vector>& initial_sol_vector); population_t* get_population_pointer() { return &population; } - void recombine_and_ls_with_all(std::vector>& solutions, - bool add_only_feasible = false); + void recombine_and_ls_with_all( + std::vector::drained_external_solution_t>& solutions, + bool add_only_feasible = false); void recombine_and_ls_with_all(solution_t& solution, bool add_only_feasible = false); std::pair, solution_t> recombine_and_local_search( solution_t& a, @@ -65,7 +67,7 @@ class diversity_manager_t { solution_t& sol2); bool run_local_search(solution_t& solution, const weight_t& weights, - timer_t& timer, + work_limit_timer_t& timer, ls_config_t& ls_config); void consume_staged_simplex_solution(lp_state_t& lp_state); @@ -84,7 +86,7 @@ class diversity_manager_t { std::vector staged_simplex_dual_solution; f_t staged_simplex_objective{std::numeric_limits::infinity()}; local_search_t ls; - cuopt::timer_t timer; + cuopt::work_limit_timer_t timer; bound_prop_recombiner_t bound_prop_recombiner; fp_recombiner_t fp_recombiner; line_segment_recombiner_t line_segment_recombiner; diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index c4331343de..2247dfcef4 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -271,10 +271,11 @@ void rins_t::run_rins() branch_and_bound_settings.strong_branching_simplex_iteration_limit = 200; branch_and_bound_settings.log.log = false; branch_and_bound_settings.log.log_prefix = "[RINS] "; - branch_and_bound_settings.solution_callback = [&rins_solution_queue](std::vector& solution, - f_t objective) { - rins_solution_queue.push_back(solution); - }; + branch_and_bound_settings.new_incumbent_callback = + [&rins_solution_queue](std::vector& solution, + f_t objective, + const cuopt::internals::mip_solution_callback_info_t&, + double) { rins_solution_queue.push_back(solution); }; dual_simplex::probing_implied_bound_t empty_probing(branch_and_bound_problem.num_cols); dual_simplex::branch_and_bound_t branch_and_bound( branch_and_bound_problem, branch_and_bound_settings, dual_simplex::tic(), empty_probing); @@ -347,8 +348,9 @@ void rins_t::run_rins() cuopt_assert(best_sol.assignment.size() == sol_size_before_rins, "Assignment size mismatch"); cuopt_assert(best_sol.assignment.size() == problem_copy->n_variables, "Assignment size mismatch"); - dm.population.add_external_solution( - best_sol.get_host_assignment(), best_sol.get_objective(), solution_origin_t::RINS); + dm.population.add_external_solution(best_sol.get_host_assignment(), + best_sol.get_objective(), + internals::mip_solution_origin_t::RINS); } } diff --git a/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh b/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh index 4571d0d57f..b9219b8dcb 100644 --- a/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh +++ b/cpp/src/mip_heuristics/diversity/multi_armed_bandit.cuh @@ -45,16 +45,22 @@ struct ls_work_normalized_reward_t { }; struct recombiner_work_normalized_reward_t { - double time_in_miliseconds; - recombiner_work_normalized_reward_t(double time_in_miliseconds) - : time_in_miliseconds(time_in_miliseconds) + bool deterministic; + double work; + recombiner_work_normalized_reward_t(bool deterministic, double work) + : deterministic(deterministic), work(work) { } double operator()(double factor) const { // normal recombiners take 2000 ms - return factor * (std::max(0.1, 4.0 - (time_in_miliseconds / 2000))); + if (!deterministic) { + double time_in_miliseconds = work; + return factor * (std::max(0.1, 4.0 - (time_in_miliseconds / 2000))); + } else { + return factor * (std::max(0.1, 4.0 - (work / 200))); + } } }; diff --git a/cpp/src/mip_heuristics/diversity/population.cu b/cpp/src/mip_heuristics/diversity/population.cu index bb0fdd6d11..cbdcf4fdab 100644 --- a/cpp/src/mip_heuristics/diversity/population.cu +++ b/cpp/src/mip_heuristics/diversity/population.cu @@ -8,15 +8,27 @@ #include "diversity_manager.cuh" #include "population.cuh" +#include + #include #include #include #include #include +#include #include #include +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif + namespace cuopt::linear_programming::detail { constexpr double weight_increase_ratio = 2.; @@ -44,7 +56,7 @@ population_t::population_t(std::string const& name_, rng(cuopt::seed_generator::get_seed()), early_exit_primal_generation(false), population_hash_map(*problem_ptr), - timer(0) + timer(0.0, cuopt::termination_checker_t::root_tag_t{}) { best_feasible_objective = std::numeric_limits::max(); } @@ -125,11 +137,12 @@ std::pair, solution_t> population_t::ge } template -void population_t::add_solutions_from_vec(std::vector>&& solutions) +void population_t::add_solutions_from_vec( + std::vector>&& solutions, internals::mip_solution_origin_t callback_origin) { raft::common::nvtx::range fun_scope("add_solution_from_vec"); for (auto&& sol : solutions) { - add_solution(std::move(sol)); + add_solution(std::move(sol), callback_origin); } } @@ -143,11 +156,11 @@ size_t population_t::get_external_solution_size() template void population_t::add_external_solution(const std::vector& solution, f_t objective, - solution_origin_t origin) + internals::mip_solution_origin_t origin) { std::lock_guard lock(solution_mutex); - if (origin == solution_origin_t::CPUFJ) { + if (origin == internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP) { external_solution_queue_cpufj.emplace_back(solution, objective, origin); } else { external_solution_queue.emplace_back(solution, objective, origin); @@ -165,7 +178,7 @@ void population_t::add_external_solution(const std::vector& solut } CUOPT_LOG_DEBUG("%s added a solution to population, solution queue size %lu with objective %g", - solution_origin_to_string(origin), + internals::mip_solution_origin_to_string(origin), external_solution_queue.size(), problem_ptr->get_user_obj_from_solver_obj(objective)); if (objective < best_feasible_objective) { @@ -179,9 +192,13 @@ void population_t::add_external_solution(const std::vector& solut template void population_t::add_external_solutions_to_population() { + // GPU heuristics are producer-only in the current GPU determinism implementation + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { return; } // don't do early exit checks here. mutex needs to be acquired to prevent race conditions auto new_sol_vector = get_external_solutions(); - add_solutions_from_vec(std::move(new_sol_vector)); + for (auto& drained_sol : new_sol_vector) { + add_solution(std::move(drained_sol.solution), drained_sol.origin); + } } // normally we would need a lock here but these are boolean types and race conditions are not @@ -194,10 +211,11 @@ void population_t::preempt_heuristic_solver() } template -std::vector> population_t::get_external_solutions() +std::vector::drained_external_solution_t> +population_t::get_external_solutions() { std::lock_guard lock(solution_mutex); - std::vector> return_vector; + std::vector return_vector; i_t counter = 0; f_t new_best_feasible_objective = best_feasible_objective; f_t longest_wait_time = 0; @@ -205,10 +223,10 @@ std::vector> population_t::get_external_solutions for (auto& h_entry : queue) { // ignore CPUFJ solutions if they're not better than the best feasible. // It seems they worsen results on some instances despite the potential for improved diversity - if (h_entry.origin == solution_origin_t::CPUFJ && + if (h_entry.origin == internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP && h_entry.objective > new_best_feasible_objective) { continue; - } else if (h_entry.origin != solution_origin_t::CPUFJ && + } else if (h_entry.origin != internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP && h_entry.objective > new_best_feasible_objective) { new_best_feasible_objective = h_entry.objective; } @@ -233,7 +251,7 @@ std::vector> population_t::get_external_solutions problem_ptr->n_integer_vars); } sol.handle_ptr->sync_stream(); - return_vector.emplace_back(std::move(sol)); + return_vector.emplace_back(std::move(sol), h_entry.origin); counter++; } } @@ -258,114 +276,53 @@ bool population_t::is_better_than_best_feasible(solution_t& } template -void population_t::invoke_get_solution_callback( - solution_t& sol, internals::get_solution_callback_t* callback) +void population_t::run_solution_callbacks( + solution_t& sol, internals::mip_solution_origin_t callback_origin) { - f_t user_objective = sol.get_user_objective(); - f_t user_bound = context.stats.get_solution_bound(); - solution_t temp_sol(sol); - problem_ptr->post_process_assignment(temp_sol.assignment); - if (problem_ptr->has_papilo_presolve_data()) { - problem_ptr->papilo_uncrush_assignment(temp_sol.assignment); - } + if (is_better_than_best_feasible(sol)) { + const bool deterministic_bb = (context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + context.branch_and_bound_ptr != nullptr; - std::vector user_objective_vec(1); - std::vector user_bound_vec(1); - std::vector user_assignment_vec(temp_sol.assignment.size()); - user_objective_vec[0] = user_objective; - user_bound_vec[0] = user_bound; - raft::copy(user_assignment_vec.data(), - temp_sol.assignment.data(), - temp_sol.assignment.size(), - temp_sol.handle_ptr->get_stream()); - temp_sol.handle_ptr->sync_stream(); - callback->get_solution(user_assignment_vec.data(), - user_objective_vec.data(), - user_bound_vec.data(), - callback->get_user_data()); -} - -template -void population_t::run_solution_callbacks(solution_t& sol) -{ - bool better_solution_found = is_better_than_best_feasible(sol); - auto user_callbacks = context.settings.get_mip_callbacks(); - if (better_solution_found) { - if (context.settings.benchmark_info_ptr != nullptr) { - context.settings.benchmark_info_ptr->last_improvement_of_best_feasible = timer.elapsed_time(); - } - CUOPT_LOG_DEBUG("Population: Found new best solution %g", sol.get_user_objective()); - if (problem_ptr->branch_and_bound_callback != nullptr) { - problem_ptr->branch_and_bound_callback(sol.get_host_assignment()); - } - for (auto callback : user_callbacks) { - if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) { - auto get_sol_callback = static_cast(callback); - invoke_get_solution_callback(sol, get_sol_callback); + if (deterministic_bb) { + const double work_timestamp = context.gpu_heur_loop.current_producer_work(); + cuopt_assert(std::isfinite(work_timestamp), + "Deterministic heuristic work timestamp must be finite"); + context.branch_and_bound_ptr->queue_external_solution_deterministic( + sol.get_host_assignment(), sol.get_user_objective(), work_timestamp, callback_origin); + } else { + if (context.branch_and_bound_ptr != nullptr && + context.problem_ptr->branch_and_bound_callback != nullptr) { + context.problem_ptr->branch_and_bound_callback(sol.get_host_assignment(), callback_origin); } + + const double work_timestamp = context.gpu_heur_loop.current_work(); + const auto payload = context.solution_publication.build_callback_payload( + context.problem_ptr, sol, callback_origin, work_timestamp); + context.solution_publication.publish_new_best_feasible(payload, timer.elapsed_time()); } // Save the best objective here even if callback handling later exits early. // This prevents older solutions from being reported as "new best" in subsequent callbacks. best_feasible_objective = sol.get_objective(); } - for (auto callback : user_callbacks) { - if (callback->get_type() == internals::base_solution_callback_type::SET_SOLUTION) { - auto set_sol_callback = static_cast(callback); - f_t user_bound = context.stats.get_solution_bound(); - auto callback_num_variables = problem_ptr->original_problem_ptr->get_n_variables(); - rmm::device_uvector incumbent_assignment(callback_num_variables, - sol.handle_ptr->get_stream()); - solution_t outside_sol(sol); - rmm::device_scalar d_outside_sol_objective(sol.handle_ptr->get_stream()); - auto inf = std::numeric_limits::infinity(); - d_outside_sol_objective.set_value_async(inf, sol.handle_ptr->get_stream()); - sol.handle_ptr->sync_stream(); - std::vector h_incumbent_assignment(incumbent_assignment.size()); - std::vector h_outside_sol_objective(1, inf); - std::vector h_user_bound(1, user_bound); - set_sol_callback->set_solution(h_incumbent_assignment.data(), - h_outside_sol_objective.data(), - h_user_bound.data(), - set_sol_callback->get_user_data()); - f_t outside_sol_objective = h_outside_sol_objective[0]; - // The callback might be called without setting any valid solution or objective which triggers - // asserts - if (outside_sol_objective == inf) { return; } - d_outside_sol_objective.set_value_async(outside_sol_objective, sol.handle_ptr->get_stream()); - raft::copy(incumbent_assignment.data(), - h_incumbent_assignment.data(), - incumbent_assignment.size(), - sol.handle_ptr->get_stream()); - - bool is_valid = problem_ptr->pre_process_assignment(incumbent_assignment); - if (!is_valid) { return; } - cuopt_assert(outside_sol.assignment.size() == incumbent_assignment.size(), - "Incumbent assignment size mismatch"); - raft::copy(outside_sol.assignment.data(), - incumbent_assignment.data(), - incumbent_assignment.size(), - sol.handle_ptr->get_stream()); - outside_sol.compute_feasibility(); - - CUOPT_LOG_DEBUG("Injected solution feasibility = %d objective = %g excess = %g", - outside_sol.get_feasible(), - outside_sol.get_user_objective(), - outside_sol.get_total_excess()); - if (std::abs(outside_sol.get_user_objective() - outside_sol_objective) > 1e-6) { - cuopt_func_call( - CUOPT_LOG_DEBUG("External solution objective mismatch: outside_sol.get_user_objective() " - "= %g, outside_sol_objective = %g", - outside_sol.get_user_objective(), - outside_sol_objective)); + context.solution_injection.invoke_set_solution_callbacks( + problem_ptr, + sol, + [this]( + const std::vector& assignment, f_t objective, internals::mip_solution_origin_t origin) { + const bool deterministic_bb = (context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + context.branch_and_bound_ptr != nullptr; + if (deterministic_bb) { + const double work_timestamp = context.gpu_heur_loop.current_producer_work(); + context.branch_and_bound_ptr->queue_external_solution_deterministic( + assignment, + context.problem_ptr->get_user_obj_from_solver_obj(objective), + work_timestamp, + origin); + } else { + add_external_solution(assignment, objective, origin); } - cuopt_assert(std::abs(outside_sol.get_user_objective() - outside_sol_objective) <= 1e-6, - "External solution objective mismatch"); - auto h_outside_sol = outside_sol.get_host_assignment(); - add_external_solution( - h_outside_sol, outside_sol.get_objective(), solution_origin_t::EXTERNAL); - } - } + }); } template @@ -401,7 +358,8 @@ void population_t::adjust_weights_according_to_best_feasible() } template -std::pair population_t::add_solution(solution_t&& sol) +std::pair population_t::add_solution( + solution_t&& sol, internals::mip_solution_origin_t callback_origin) { std::lock_guard lock(write_mutex); raft::common::nvtx::range fun_scope("add_solution"); @@ -411,16 +369,18 @@ std::pair population_t::add_solution(solution_t&& // for hash computation, quality calculation, and similarity comparisons. sol.handle_ptr->sync_stream(); population_hash_map.insert(sol); - double sol_cost = sol.get_quality(weights); - bool best_updated = false; - CUOPT_LOG_DEBUG("Adding solution with quality %f and objective %f n_integers %d!", + double sol_cost = sol.get_quality(weights); + bool best_updated = false; + const uint32_t candidate_hash = sol.get_hash(); + CUOPT_LOG_DEBUG("Adding solution with quality %f and objective %f n_integers %d, hash %x!", sol_cost, sol.get_user_objective(), - sol.n_assigned_integers); + sol.n_assigned_integers, + candidate_hash); // We store the best feasible found so far at index 0. if (sol.get_feasible() && (solutions[0].first == false || sol_cost + OBJECTIVE_EPSILON < indices[0].second)) { - run_solution_callbacks(sol); + run_solution_callbacks(sol, callback_origin); solutions[0].first = true; // we only have move assignment operator solution_t temp_sol(sol); @@ -706,7 +666,7 @@ void population_t::halve_the_population() clear_except_best_feasible(); var_threshold = std::max(var_threshold * 0.97, 0.5 * problem_ptr->n_integer_vars); for (auto& sol : sol_vec) { - add_solution(solution_t(sol)); + add_solution(solution_t(sol), internals::mip_solution_origin_t::LOCAL_SEARCH); } if (counter++ > max_adjustments) break; } @@ -718,7 +678,7 @@ void population_t::halve_the_population() max_var_threshold, std::min((size_t)(var_threshold * 1.02), (size_t)(0.995 * problem_ptr->n_integer_vars))); for (auto& sol : sol_vec) { - add_solution(solution_t(sol)); + add_solution(solution_t(sol), internals::mip_solution_origin_t::LOCAL_SEARCH); } if (counter++ > max_adjustments) break; } @@ -744,7 +704,7 @@ void population_t::start_threshold_adjustment() } template -void population_t::adjust_threshold(cuopt::timer_t timer) +void population_t::adjust_threshold(cuopt::work_limit_timer_t& timer) { double time_ratio = (timer.elapsed_time() - population_start_time) / (timer.get_time_limit() - population_start_time); @@ -833,23 +793,29 @@ bool population_t::test_invariant() template void population_t::print() { + std::vector hashes; + for (auto& index : indices) + hashes.push_back(solutions[index.first].second.get_hash()); + uint32_t final_hash = compute_hash(hashes); CUOPT_LOG_DEBUG(" -------------- "); - CUOPT_LOG_DEBUG("%s infeas weight %f threshold %d/%d:", + CUOPT_LOG_DEBUG("%s infeas weight %f threshold %d/%d (hash %x):", name.c_str(), infeasibility_importance, var_threshold, - problem_ptr->n_integer_vars); + problem_ptr->n_integer_vars, + final_hash); i_t i = 0; for (auto& index : indices) { if (index.first == 0 && solutions[0].first) { CUOPT_LOG_DEBUG(" Best feasible: %f", solutions[index.first].second.get_user_objective()); } - CUOPT_LOG_DEBUG("%d : %f\t%f\t%f\t%d", + CUOPT_LOG_DEBUG("%d : %f\t%f\t%f\t%d (hash %x)", i, index.second, solutions[index.first].second.get_total_excess(), solutions[index.first].second.get_user_objective(), - solutions[index.first].second.get_feasible()); + solutions[index.first].second.get_feasible(), + solutions[index.first].second.get_hash()); i++; } CUOPT_LOG_DEBUG(" -------------- "); @@ -858,8 +824,8 @@ void population_t::print() template void population_t::run_all_recombiners(solution_t& sol) { - std::vector> sol_vec; - sol_vec.emplace_back(std::move(solution_t(sol))); + std::vector::drained_external_solution_t> sol_vec; + sol_vec.emplace_back(solution_t(sol), internals::mip_solution_origin_t::LOCAL_SEARCH); dm.recombine_and_ls_with_all(sol_vec, true); } diff --git a/cpp/src/mip_heuristics/diversity/population.cuh b/cpp/src/mip_heuristics/diversity/population.cuh index c83a4bfb83..9250b7cdcb 100644 --- a/cpp/src/mip_heuristics/diversity/population.cuh +++ b/cpp/src/mip_heuristics/diversity/population.cuh @@ -25,22 +25,20 @@ namespace cuopt::linear_programming::detail { template class diversity_manager_t; -enum class solution_origin_t { BRANCH_AND_BOUND, CPUFJ, RINS, EXTERNAL }; - -constexpr const char* solution_origin_to_string(solution_origin_t origin) -{ - switch (origin) { - case solution_origin_t::BRANCH_AND_BOUND: return "B&B"; - case solution_origin_t::CPUFJ: return "CPUFJ"; - case solution_origin_t::RINS: return "RINS"; - case solution_origin_t::EXTERNAL: return "injected"; - default: return "unknown"; - } -} - template class population_t { public: + struct drained_external_solution_t { + drained_external_solution_t(solution_t&& solution_, + internals::mip_solution_origin_t origin_) + : solution(std::move(solution_)), origin(origin_) + { + } + + solution_t solution; + internals::mip_solution_origin_t origin; + }; + population_t(std::string const& name, mip_solver_context_t& context, diversity_manager_t& dm, @@ -83,6 +81,7 @@ class population_t { a.first = false; indices[0].second = std::numeric_limits::max(); indices.erase(indices.begin() + 1, indices.end()); + best_feasible_objective = std::numeric_limits::max(); } void clear_except_best_feasible() @@ -92,6 +91,7 @@ class population_t { } solutions[indices[0].first].first = true; indices.erase(indices.begin() + 1, indices.end()); + best_feasible_objective = solutions[indices[0].first].second.get_objective(); } // ------------------- @@ -103,16 +103,18 @@ class population_t { /*! \brief { Add a solution to population. Similar solutions may be ejected from the pool. } * \return { -1 = not inserted , others = inserted index} */ - std::pair add_solution(solution_t&& sol); + std::pair add_solution(solution_t&& sol, + internals::mip_solution_origin_t callback_origin); void add_external_solution(const std::vector& solution, f_t objective, - solution_origin_t origin); - std::vector> get_external_solutions(); + internals::mip_solution_origin_t origin); + std::vector get_external_solutions(); void add_external_solutions_to_population(); size_t get_external_solution_size(); void preempt_heuristic_solver(); - void add_solutions_from_vec(std::vector>&& solutions); + void add_solutions_from_vec(std::vector>&& solutions, + internals::mip_solution_origin_t callback_origin); // Updates the cstr weights according to the best solutions feasibility void compute_new_weights(); @@ -122,7 +124,7 @@ class population_t { // updates qualities of each solution void update_qualities(); // adjusts the threshold of the population - void adjust_threshold(cuopt::timer_t timer); + void adjust_threshold(cuopt::work_limit_timer_t& timer); /*! \param sol { Input solution } * \return { Index of the best solution similar to sol. If no similar is found we return * max_solutions. }*/ @@ -153,7 +155,8 @@ class population_t { std::vector> population_to_vector(); void halve_the_population(); - void run_solution_callbacks(solution_t& sol); + void run_solution_callbacks(solution_t& sol, + internals::mip_solution_origin_t callback_origin); void adjust_weights_according_to_best_feasible(); @@ -161,9 +164,6 @@ class population_t { void diversity_step(i_t max_iterations_without_improvement); - void invoke_get_solution_callback(solution_t& sol, - internals::get_solution_callback_t* callback); - // does some consistency tests bool test_invariant(); @@ -186,7 +186,9 @@ class population_t { struct external_solution_t { external_solution_t() = default; - external_solution_t(const std::vector& solution, f_t objective, solution_origin_t origin) + external_solution_t(const std::vector& solution, + f_t objective, + internals::mip_solution_origin_t origin) : solution(solution), objective(objective), origin(origin), @@ -195,7 +197,7 @@ class population_t { } std::vector solution; f_t objective; - solution_origin_t origin; + internals::mip_solution_origin_t origin; timer_t timer; // debug timer to track how long a solution has lingered in the queue }; @@ -211,7 +213,7 @@ class population_t { // be seeded from an early-FJ incumbent objective before a matching population solution exists. f_t best_feasible_objective = std::numeric_limits::max(); assignment_hash_map_t population_hash_map; - cuopt::timer_t timer; + cuopt::work_limit_timer_t timer; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh index 9d6bb3902c..687eb3ae54 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/bound_prop_recombiner.cuh @@ -29,6 +29,7 @@ class bound_prop_recombiner_t : public recombiner_t { rng(cuopt::seed_generator::get_seed()), vars_to_fix(n_vars, handle_ptr->get_stream()) { + thrust::fill(handle_ptr->get_thrust_policy(), vars_to_fix.begin(), vars_to_fix.end(), -1); } void get_probing_values_for_infeasible( @@ -131,9 +132,9 @@ class bound_prop_recombiner_t : public recombiner_t { }); } - std::pair, bool> recombine(solution_t& a, - solution_t& b, - const weight_t& weights) + std::tuple, bool, double> recombine(solution_t& a, + solution_t& b, + const weight_t& weights) { raft::common::nvtx::range fun_scope("bound_prop_recombiner"); auto& guiding_solution = a.get_feasible() ? a : b; @@ -148,10 +149,11 @@ class bound_prop_recombiner_t : public recombiner_t { i_t n_vars_from_other = n_different_vars; i_t fixed_from_guiding = 0; i_t fixed_from_other = 0; + i_t seed = cuopt::seed_generator::get_seed(); if (n_different_vars > (i_t)bp_recombiner_config_t::max_n_of_vars_from_other) { fixed_from_guiding = n_vars_from_other - bp_recombiner_config_t::max_n_of_vars_from_other; n_vars_from_other = bp_recombiner_config_t::max_n_of_vars_from_other; - thrust::default_random_engine g{(unsigned int)cuopt::seed_generator::get_seed()}; + thrust::default_random_engine g{(unsigned int)seed}; thrust::shuffle(a.handle_ptr->get_thrust_policy(), this->remaining_indices.data(), this->remaining_indices.data() + n_different_vars, @@ -160,12 +162,36 @@ class bound_prop_recombiner_t : public recombiner_t { i_t n_vars_from_guiding = a.problem_ptr->n_integer_vars - n_vars_from_other; CUOPT_LOG_DEBUG( "n_vars_from_guiding %d n_vars_from_other %d", n_vars_from_guiding, n_vars_from_other); + + // DETERMINISM DEBUG: Log everything that could affect divergence + CUOPT_DETERMINISM_LOG("BP_DET: sol_a_hash=0x%x sol_b_hash=0x%x offspring_hash=0x%x, seed %x", + a.get_hash(), + b.get_hash(), + offspring.get_hash(), + seed); + CUOPT_DETERMINISM_LOG("BP_DET: n_different_vars=%d n_vars_from_other=%d n_vars_from_guiding=%d", + n_different_vars, + n_vars_from_other, + n_vars_from_guiding); + CUOPT_DETERMINISM_LOG( + "BP_DET: remaining_indices_hash=0x%x (first %d elements)", + detail::compute_hash(make_span(this->remaining_indices), a.handle_ptr->get_stream()), + std::min((i_t)10, n_vars_from_other)); + CUOPT_DETERMINISM_LOG("BP_DET: guiding_feasible=%d other_feasible=%d expensive_to_fix=%d", + guiding_solution.get_feasible(), + other_solution.get_feasible(), + a.problem_ptr->expensive_to_fix_vars); + CUOPT_DETERMINISM_LOG( + "BP_DET: fixed_from_guiding=%d fixed_from_other=%d", fixed_from_guiding, fixed_from_other); + // if either all integers are from A(meaning all are common) or all integers are from B(meaning // all are different), return if (n_vars_from_guiding == 0 || n_vars_from_other == 0) { CUOPT_LOG_DEBUG("Returning false because all vars are common or different"); - return std::make_pair(offspring, false); + return std::make_tuple(offspring, false, 0.0); } + // TODO: REPLACE! + double work = static_cast(n_vars_from_other) / 1e8; cuopt_assert(a.problem_ptr == b.problem_ptr, "The two solutions should not refer to different problems"); @@ -175,9 +201,16 @@ class bound_prop_recombiner_t : public recombiner_t { a.handle_ptr->get_stream()); probing_config_t probing_config(a.problem_ptr->n_variables, a.handle_ptr); if (guiding_solution.get_feasible() && !a.problem_ptr->expensive_to_fix_vars) { + CUOPT_DETERMINISM_LOG("BP_DET: Taking FEASIBLE path (with variable fixing)"); this->compute_vars_to_fix(offspring, vars_to_fix, n_vars_from_other, n_vars_from_guiding); + CUOPT_DETERMINISM_LOG("BP_DET: vars_to_fix_size=%lu", vars_to_fix.size()); auto [fixed_problem, fixed_assignment, variable_map] = offspring.fix_variables(vars_to_fix); - timer_t timer(bp_recombiner_config_t::bounds_prop_time_limit); + CUOPT_DETERMINISM_LOG("BP_DET: fixed_problem_fingerprint=0x%x variable_map_size=%lu", + fixed_problem.get_fingerprint(), + variable_map.size()); + work_limit_timer_t timer(this->context.gpu_heur_loop, + bp_recombiner_config_t::bounds_prop_time_limit, + *this->context.termination); rmm::device_uvector old_assignment(offspring.assignment, offspring.handle_ptr->get_stream()); offspring.handle_ptr->sync_stream(); @@ -197,26 +230,44 @@ class bound_prop_recombiner_t : public recombiner_t { constraint_prop.single_rounding_only = true; constraint_prop.apply_round(offspring, lp_run_time_after_feasible, timer, probing_config); constraint_prop.single_rounding_only = false; - cuopt_func_call(bool feasible_after_bounds_prop = offspring.get_feasible()); + offspring.compute_feasibility(); + bool feasible_after_bounds_prop = offspring.get_feasible(); offspring.handle_ptr->sync_stream(); offspring.problem_ptr = a.problem_ptr; fixed_assignment = std::move(offspring.assignment); offspring.assignment = std::move(old_assignment); offspring.handle_ptr->sync_stream(); offspring.unfix_variables(fixed_assignment, variable_map); - cuopt_func_call(bool feasible_after_unfix = offspring.get_feasible()); - // May be triggered due to numerical issues - // TODO: investigate further - // cuopt_assert(feasible_after_unfix == feasible_after_bounds_prop, - // "Feasible after unfix should be same as feasible after bounds prop!"); + offspring.compute_feasibility(); + bool feasible_after_unfix = offspring.get_feasible(); + cuopt_func_call(f_t excess_after_unfix = offspring.get_total_excess()); + if (feasible_after_unfix != feasible_after_bounds_prop) { + CUOPT_LOG_WARN("Numerical issue in bounds prop, infeasibility after unfix"); + // might become infeasible after unfixing due to numerical issues. Check that the excess + // remains consistent + // CUOPT_LOG_ERROR("Excess: %g, %g, %g, %g, feas %d", offspring.get_total_excess(), + // offspring.compute_max_constraint_violation(), offspring.compute_max_int_violation(), + // offspring.compute_max_variable_violation(), feasible_after_unfix); + // cuopt_assert(fabs(excess_after_unfix - excess_before) < 1e-6, + // "Excess after unfix should be same as before unfix!"); + } a.handle_ptr->sync_stream(); } else { - timer_t timer(bp_recombiner_config_t::bounds_prop_time_limit); + CUOPT_DETERMINISM_LOG("BP_DET: Taking INFEASIBLE path (no variable fixing)"); + work_limit_timer_t timer(this->context.gpu_heur_loop, + bp_recombiner_config_t::bounds_prop_time_limit, + *this->context.termination); get_probing_values_for_infeasible( guiding_solution, other_solution, offspring, probing_values, n_vars_from_other); probing_config.probing_values = host_copy(probing_values, offspring.handle_ptr->get_stream()); + CUOPT_DETERMINISM_LOG( + "BP_DET: probing_values_hash=0x%x", + detail::compute_hash(make_span(probing_values), a.handle_ptr->get_stream())); constraint_prop.apply_round(offspring, lp_run_time_after_feasible, timer, probing_config); } + CUOPT_DETERMINISM_LOG("BP_DET: After apply_round: offspring_hash=0x%x feasible=%d", + offspring.get_hash(), + offspring.get_feasible()); constraint_prop.max_n_failed_repair_iterations = 1; cuopt_func_call(offspring.test_number_all_integer()); bool better_cost_than_parents = @@ -236,11 +287,17 @@ class bound_prop_recombiner_t : public recombiner_t { bp_recombiner_config_t::decrease_max_n_of_vars_from_other(); } } + CUOPT_DETERMINISM_LOG( + "BP_DET: Final offspring_hash=0x%x same_as_parents=%d better_cost=%d better_feas=%d", + offspring.get_hash(), + same_as_parents, + better_cost_than_parents, + better_feasibility_than_parents); if (better_cost_than_parents || better_feasibility_than_parents) { CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents"); - return std::make_pair(offspring, true); + return std::make_tuple(offspring, true, work); } - return std::make_pair(offspring, !same_as_parents); + return std::make_tuple(offspring, !same_as_parents, work); } rmm::device_uvector vars_to_fix; diff --git a/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh index 1cca1ba371..0fe73c9e60 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/fp_recombiner.cuh @@ -35,9 +35,9 @@ class fp_recombiner_t : public recombiner_t { { } - std::pair, bool> recombine(solution_t& a, - solution_t& b, - const weight_t& weights) + std::tuple, bool, double> recombine(solution_t& a, + solution_t& b, + const weight_t& weights) { raft::common::nvtx::range fun_scope("FP recombiner"); auto& guiding_solution = a.get_feasible() ? a : b; @@ -50,6 +50,7 @@ class fp_recombiner_t : public recombiner_t { CUOPT_LOG_DEBUG("FP rec: Number of different variables %d MAX_VARS %d", n_different_vars, fp_recombiner_config_t::max_n_of_vars_from_other); + CUOPT_DETERMINISM_LOG("FP rec: offspring hash 0x%x", offspring.get_hash()); i_t n_vars_from_other = n_different_vars; if (n_vars_from_other > (i_t)fp_recombiner_config_t::max_n_of_vars_from_other) { n_vars_from_other = fp_recombiner_config_t::max_n_of_vars_from_other; @@ -62,17 +63,34 @@ class fp_recombiner_t : public recombiner_t { i_t n_vars_from_guiding = a.problem_ptr->n_integer_vars - n_vars_from_other; if (n_vars_from_other == 0 || n_vars_from_guiding == 0) { CUOPT_LOG_DEBUG("Returning false because all vars are common or different"); - return std::make_pair(offspring, false); + return std::make_tuple(offspring, false, 0.0); } + // TODO: CHANGE + double work = static_cast(n_vars_from_other) / 1e8; CUOPT_LOG_DEBUG( "n_vars_from_guiding %d n_vars_from_other %d", n_vars_from_guiding, n_vars_from_other); this->compute_vars_to_fix(offspring, vars_to_fix, n_vars_from_other, n_vars_from_guiding); + CUOPT_DETERMINISM_LOG( + "FP rec post computevarstofix: offspring hash 0x%x, vars to fix 0x%x", + offspring.get_hash(), + detail::compute_hash(make_span(vars_to_fix), offspring.handle_ptr->get_stream())); auto [fixed_problem, fixed_assignment, variable_map] = offspring.fix_variables(vars_to_fix); + CUOPT_DETERMINISM_LOG( + "FP rec: fixed_problem hash 0x%x assigned hash 0x%x", + fixed_problem.get_fingerprint(), + detail::compute_hash(make_span(fixed_assignment), offspring.handle_ptr->get_stream())); fixed_problem.check_problem_representation(true); if (!guiding_solution.get_feasible() && !other_solution.get_feasible()) { + CUOPT_DETERMINISM_LOG("FP rec: running LP with infeasibility detection"); relaxed_lp_settings_t lp_settings; lp_settings.time_limit = fp_recombiner_config_t::infeasibility_detection_time_limit; - lp_settings.tolerance = fixed_problem.tolerances.absolute_tolerance; + if (this->context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) { + lp_settings.time_limit = std::numeric_limits::max(); + lp_settings.work_limit = fp_recombiner_config_t::infeasibility_detection_time_limit; + lp_settings.work_context = &this->context.gpu_heur_loop; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + } + lp_settings.tolerance = fixed_problem.tolerances.absolute_tolerance; lp_settings.return_first_feasible = true; lp_settings.save_state = true; lp_settings.check_infeasibility = true; @@ -83,7 +101,7 @@ class fp_recombiner_t : public recombiner_t { lp_response.get_termination_status() == pdlp_termination_status_t::DualInfeasible || lp_response.get_termination_status() == pdlp_termination_status_t::TimeLimit) { CUOPT_LOG_DEBUG("FP recombiner failed because LP found infeasible!"); - return std::make_pair(offspring, false); + return std::make_tuple(offspring, false, 0.0); } } // brute force rounding threshold is 8 @@ -96,7 +114,16 @@ class fp_recombiner_t : public recombiner_t { offspring.handle_ptr->sync_stream(); offspring.assignment = std::move(fixed_assignment); cuopt_func_call(offspring.test_variable_bounds(false)); - timer_t timer(fp_recombiner_config_t::fp_time_limit); + CUOPT_DETERMINISM_LOG( + "FP rec pre-descent: offspring_hash=0x%x fixed_assignment_hash=0x%x " + "problem_fingerprint=0x%x fixed_n_integer_vars=%d", + offspring.get_hash(), + detail::compute_hash(offspring.assignment, offspring.handle_ptr->get_stream()), + fixed_problem.get_fingerprint(), + fixed_problem.n_integer_vars); + work_limit_timer_t timer(this->context.gpu_heur_loop, + fp_recombiner_config_t::fp_time_limit, + *this->context.termination); fp.timer = timer; fp.cycle_queue.reset(offspring); fp.reset(); @@ -134,9 +161,9 @@ class fp_recombiner_t : public recombiner_t { !guiding_solution.get_feasible(); if (better_cost_than_parents || better_feasibility_than_parents) { CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents"); - return std::make_pair(offspring, true); + return std::make_tuple(offspring, true, work); } - return std::make_pair(offspring, !same_as_parents); + return std::make_tuple(offspring, !same_as_parents, work); } rmm::device_uvector vars_to_fix; // keep a copy of FP to prevent interference with generation FP diff --git a/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh index d413af86cd..80e6bc9dcd 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/line_segment_recombiner.cuh @@ -66,22 +66,26 @@ class line_segment_recombiner_t : public recombiner_t { return delta_vector; } - std::pair, bool> recombine(solution_t& a, - solution_t& b, - const weight_t& weights) + std::tuple, bool, double> recombine(solution_t& a, + solution_t& b, + const weight_t& weights) { raft::common::nvtx::range fun_scope("line_segment_recombiner"); + CUOPT_DETERMINISM_LOG("LS rec: a %d b %d", a.get_hash(), b.get_hash()); auto& guiding_solution = a.get_feasible() ? a : b; auto& other_solution = a.get_feasible() ? b : a; // copy the solution from A solution_t offspring(guiding_solution); - timer_t line_segment_timer{ls_recombiner_config_t::time_limit}; + work_limit_timer_t line_segment_timer{ + this->context.gpu_heur_loop, ls_recombiner_config_t::time_limit, *this->context.termination}; // TODO after we have the conic combination, detect the lambda change // (i.e. the integral variables flip on line segment) i_t n_points_to_search = ls_recombiner_config_t::n_points_to_search; const bool is_feasibility_run = false; i_t n_different_vars = this->assign_same_integer_values(guiding_solution, other_solution, offspring); + // TODO: CHANGE + double work = static_cast(n_different_vars) / 1e8; rmm::device_uvector delta_vector = generate_delta_vector( guiding_solution, other_solution, offspring, n_points_to_search, n_different_vars); line_segment_search.fj.copy_weights(weights, offspring.handle_ptr); @@ -117,9 +121,9 @@ class line_segment_recombiner_t : public recombiner_t { } if (better_cost_than_parents || better_feasibility_than_parents) { CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents"); - return std::make_pair(offspring, true); + return std::make_tuple(offspring, true, work); } - return std::make_pair(offspring, !same_as_parents); + return std::make_tuple(offspring, !same_as_parents, work); } line_segment_search_t& line_segment_search; diff --git a/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh b/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh index 4782e9612b..452374796b 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/recombiner.cuh @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -63,6 +64,18 @@ __global__ void assign_same_variables_kernel(typename solution_t::view template class recombiner_t { public: + static const char* recombiner_name(recombiner_enum_t recombiner) + { + switch (recombiner) { + case recombiner_enum_t::BOUND_PROP: return "BOUND_PROP"; + case recombiner_enum_t::FP: return "FP"; + case recombiner_enum_t::LINE_SEGMENT: return "LINE_SEGMENT"; + case recombiner_enum_t::SUB_MIP: return "SUB_MIP"; + case recombiner_enum_t::SIZE: return "SIZE"; + } + return "UNKNOWN"; + } + recombiner_t(mip_solver_context_t& context_, i_t n_integer_vars, const raft::handle_t* handle_ptr) @@ -92,6 +105,15 @@ class recombiner_t { cuopt::make_span(remaining_indices), n_remaining.data()); i_t remaining_variables = this->n_remaining.value(a.handle_ptr->get_stream()); + // Sort the indices to resolve nondeterministic order due to atomicAdd + thrust::sort(a.handle_ptr->get_thrust_policy(), + this->remaining_indices.data(), + this->remaining_indices.data() + remaining_variables); + + CUOPT_DETERMINISM_LOG( + "remaining indices hash 0x%x, size %d", + detail::compute_hash(make_span(this->remaining_indices), a.handle_ptr->get_stream()), + remaining_variables); auto vec_remaining_indices = host_copy(this->remaining_indices.data(), remaining_variables, a.handle_ptr->get_stream()); @@ -173,6 +195,12 @@ class recombiner_t { i_t n_vars_from_guiding) { vars_to_fix.resize(n_vars_from_guiding, offspring.handle_ptr->get_stream()); + CUOPT_DETERMINISM_LOG( + "remaining indices hash 0x%x", + detail::compute_hash(make_span(this->remaining_indices), offspring.handle_ptr->get_stream())); + CUOPT_DETERMINISM_LOG("integer_indices hash 0x%x", + detail::compute_hash(make_span(offspring.problem_ptr->integer_indices), + offspring.handle_ptr->get_stream())); // set difference needs two sorted arrays thrust::sort(offspring.handle_ptr->get_thrust_policy(), this->remaining_indices.data(), @@ -195,27 +223,54 @@ class recombiner_t { "vars_to_fix should be sorted!"); } - static void init_enabled_recombiners(const problem_t& problem, + static void init_enabled_recombiners(mip_solver_context_t& context, + const problem_t& problem, int user_enabled_mask = -1) { std::unordered_set enabled_recombiners; + const bool disable_fp_and_submip_for_expensive_fix = problem.expensive_to_fix_vars; + const i_t n_continuous_vars = problem.n_variables - problem.n_integer_vars; + const bool disable_submip_for_continuous_limit = + n_continuous_vars > (i_t)sub_mip_recombiner_config_t::max_continuous_vars; + const bool disable_submip_for_determinism = + (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) != 0; for (auto recombiner : recombiner_types) { if (user_enabled_mask >= 0 && !(user_enabled_mask & (1 << (uint32_t)recombiner))) { continue; } enabled_recombiners.insert(recombiner); } - if (problem.expensive_to_fix_vars) { + if (disable_fp_and_submip_for_expensive_fix) { enabled_recombiners.erase(recombiner_enum_t::FP); enabled_recombiners.erase(recombiner_enum_t::SUB_MIP); } // check the size of the continous vars - if (problem.n_variables - problem.n_integer_vars > - (i_t)sub_mip_recombiner_config_t::max_continuous_vars) { + if (disable_submip_for_continuous_limit) { enabled_recombiners.erase(recombiner_enum_t::SUB_MIP); } + // submip not supported in deterministic mode yet + if (disable_submip_for_determinism) { enabled_recombiners.erase(recombiner_enum_t::SUB_MIP); } recombiner_t::enabled_recombiners = std::vector(enabled_recombiners.begin(), enabled_recombiners.end()); + cuopt_assert(!recombiner_t::enabled_recombiners.empty(), "No recombiners enabled after init"); + std::string order_str; + for (size_t i = 0; i < recombiner_t::enabled_recombiners.size(); ++i) { + if (i > 0) { order_str += ','; } + order_str += recombiner_name(recombiner_t::enabled_recombiners[i]); + } + CUOPT_DETERMINISM_LOG( + "Deterministic recombiner init: expensive_to_fix=%d n_continuous=%d " + "max_continuous=%zu disable_fp_submip_expensive=%d " + "disable_submip_continuous=%d disable_submip_deterministic=%d size=%zu " + "order=[%s]", + (int)problem.expensive_to_fix_vars, + (int)n_continuous_vars, + sub_mip_recombiner_config_t::max_continuous_vars, + (int)disable_fp_and_submip_for_expensive_fix, + (int)disable_submip_for_continuous_limit, + (int)disable_submip_for_determinism, + recombiner_t::enabled_recombiners.size(), + order_str.c_str()); } mip_solver_context_t& context; diff --git a/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp b/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp index 044e313284..6cd2767f81 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp +++ b/cpp/src/mip_heuristics/diversity/recombiners/recombiner_stats.hpp @@ -75,8 +75,13 @@ struct all_recombine_stats { // enum of the last attempted recombiner std::optional last_attempt; - double last_recombiner_time; + double last_recombiner_time{0.0}; std::chrono::high_resolution_clock::time_point last_recombiner_start_time; + double last_recombiner_work{0.0}; + + void set_recombiner_work(double work) { last_recombiner_work = work; } + + double get_last_recombiner_work() { return last_recombiner_work; } void start_recombiner_time() { diff --git a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh index 5a637aae8e..052aa515b1 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh @@ -38,9 +38,9 @@ class sub_mip_recombiner_t : public recombiner_t { solution_vector.push_back(solution); } - std::pair, bool> recombine(solution_t& a, - solution_t& b, - const weight_t& weights) + std::tuple, bool, double> recombine(solution_t& a, + solution_t& b, + const weight_t& weights) { raft::common::nvtx::range fun_scope("Sub-MIP recombiner"); solution_vector.clear(); @@ -66,8 +66,10 @@ class sub_mip_recombiner_t : public recombiner_t { i_t n_vars_from_guiding = a.problem_ptr->n_integer_vars - n_vars_from_other; if (n_vars_from_other == 0 || n_vars_from_guiding == 0) { CUOPT_LOG_DEBUG("Returning false because all vars are common or different"); - return std::make_pair(offspring, false); + return std::make_tuple(offspring, false, 0.0); } + // TODO: CHANGE + double work = static_cast(n_vars_from_other) / 1e8; CUOPT_LOG_DEBUG( "n_vars_from_guiding %d n_vars_from_other %d", n_vars_from_guiding, n_vars_from_other); this->compute_vars_to_fix(offspring, vars_to_fix, n_vars_from_other, n_vars_from_guiding); @@ -112,10 +114,11 @@ class sub_mip_recombiner_t : public recombiner_t { branch_and_bound_settings.clique_cuts = 0; branch_and_bound_settings.sub_mip = 1; branch_and_bound_settings.strong_branching_simplex_iteration_limit = 200; - branch_and_bound_settings.solution_callback = [this](std::vector& solution, - f_t objective) { - this->solution_callback(solution, objective); - }; + branch_and_bound_settings.new_incumbent_callback = + [this](std::vector& solution, + f_t objective, + const cuopt::internals::mip_solution_callback_info_t&, + double) { this->solution_callback(solution, objective); }; // disable B&B logs, so that it is not interfering with the main B&B thread branch_and_bound_settings.log.log = false; @@ -185,7 +188,7 @@ class sub_mip_recombiner_t : public recombiner_t { sol.clamp_within_bounds(); // Scaling might bring some very slight variable bound violations sol.compute_feasibility(); cuopt_func_call(sol.test_variable_bounds()); - population.add_solution(std::move(sol)); + population.add_solution(std::move(sol), internals::mip_solution_origin_t::SUB_MIP); } bool better_cost_than_parents = offspring.get_quality(weights) < @@ -195,9 +198,9 @@ class sub_mip_recombiner_t : public recombiner_t { !guiding_solution.get_feasible(); if (better_cost_than_parents || better_feasibility_than_parents) { CUOPT_LOG_DEBUG("Offspring is feasible or better than both parents"); - return std::make_pair(offspring, true); + return std::make_tuple(offspring, true, work); } - return std::make_pair(offspring, !std::isnan(branch_and_bound_solution.objective)); + return std::make_tuple(offspring, !std::isnan(branch_and_bound_solution.objective), work); } rmm::device_uvector vars_to_fix; mip_solver_context_t& context; diff --git a/cpp/src/mip_heuristics/diversity/weights.cuh b/cpp/src/mip_heuristics/diversity/weights.cuh index 7502ae9210..fbe72aba8e 100644 --- a/cpp/src/mip_heuristics/diversity/weights.cuh +++ b/cpp/src/mip_heuristics/diversity/weights.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -12,6 +12,8 @@ #include #include +#include + namespace cuopt::linear_programming::detail { template @@ -25,6 +27,11 @@ struct weight_t { objective_weight.set_value_async(one, handle_ptr->get_stream()); } + uint32_t get_hash(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const + { + return compute_hash(cstr_weights, stream) ^ compute_hash(objective_weight.value(stream)); + } + rmm::device_uvector cstr_weights; rmm::device_scalar objective_weight; }; diff --git a/cpp/src/mip_heuristics/early_heuristic.cuh b/cpp/src/mip_heuristics/early_heuristic.cuh index 090cfd4901..ddab090a5b 100644 --- a/cpp/src/mip_heuristics/early_heuristic.cuh +++ b/cpp/src/mip_heuristics/early_heuristic.cuh @@ -24,8 +24,10 @@ namespace cuopt::linear_programming::detail { template -using early_incumbent_callback_t = std::function& assignment, const char* heuristic_name)>; +using early_incumbent_callback_t = std::function& assignment, + internals::mip_solution_origin_t origin)>; // CRTP base for early heuristics that run on the original (or papilo-presolved) problem // during presolve to find incumbents as early as possible. @@ -89,10 +91,14 @@ class early_heuristic_t { best_assignment_ = user_assignment; solution_found_ = true; f_t user_obj = problem_ptr_->get_user_obj_from_solver_obj(solver_obj); - // Log and callback are deferred to the shared incumbent_callback_ which enforces - // global monotonicity across all early heuristic instances. + double elapsed = + std::chrono::duration(std::chrono::steady_clock::now() - start_time_).count(); + CUOPT_LOG_INFO("Early heuristics (%s) lowered the primal bound. Objective %g. Time %.2f", + Derived::name(), + user_obj, + elapsed); if (incumbent_callback_) { - incumbent_callback_(solver_obj, user_obj, user_assignment, Derived::name()); + incumbent_callback_(solver_obj, user_obj, user_assignment, Derived::origin()); } } diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh index 911e846551..89bdff1092 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh @@ -27,6 +27,10 @@ class early_cpufj_t : public early_heuristic_t ~early_cpufj_t(); static constexpr const char* name() { return "CPUFJ"; } + static constexpr internals::mip_solution_origin_t origin() + { + return internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP; + } void start(); void stop(); diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu index 3f77427d87..59ad7ed0fd 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu @@ -52,10 +52,10 @@ void early_gpufj_t::start() fj_ptr_ = std::make_unique>(*context_ptr_, fj_settings); - fj_ptr_->improvement_callback = [this](f_t user_obj, const std::vector& h_assignment) { + fj_ptr_->set_improvement_callback([this](f_t user_obj, const std::vector& h_assignment) { f_t solver_obj = this->problem_ptr_->get_solver_obj_from_user_obj(user_obj); this->try_update_best(solver_obj, h_assignment); - }; + }); worker_thread_ = std::make_unique(&early_gpufj_t::run_worker, this); } diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh index 4a7769143e..f09fc011d5 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh @@ -30,6 +30,10 @@ class early_gpufj_t : public early_heuristic_t ~early_gpufj_t(); static constexpr const char* name() { return "GPUFJ"; } + static constexpr internals::mip_solution_origin_t origin() + { + return internals::mip_solution_origin_t::FEASIBILITY_JUMP; + } void start(); void stop(); diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu index 748dd41dfb..ed41402621 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,7 @@ #include #include +#include #include #include #include @@ -63,7 +65,8 @@ fj_t::fj_t(mip_solver_context_t& context_, fj_settings_t in_ work_id_to_nonbin_var_idx(pb_ptr->coefficients.size(), pb_ptr->handle_ptr->get_stream()), row_size_bin_prefix_sum(pb_ptr->binary_indices.size(), pb_ptr->handle_ptr->get_stream()), row_size_nonbin_prefix_sum(pb_ptr->nonbinary_indices.size(), pb_ptr->handle_ptr->get_stream()), - work_ids_for_related_vars(pb_ptr->n_variables, pb_ptr->handle_ptr->get_stream()) + work_ids_for_related_vars(pb_ptr->n_variables, pb_ptr->handle_ptr->get_stream()), + deterministic_frontier_work_by_var_d_(0, pb_ptr->handle_ptr->get_stream()) { setval_launch_dims = get_launch_dims_max_occupancy( (void*)update_assignment_kernel, TPB_setval, pb_ptr->handle_ptr); @@ -111,6 +114,158 @@ void fj_t::reset_cuda_graph() graph_created = false; } +template +bool fj_t::use_load_balancing_codepath() const +{ + bool use_load_balancing = false; + if (settings.load_balancing_mode == fj_load_balancing_mode_t::ALWAYS_OFF) { + use_load_balancing = false; + } else if (settings.load_balancing_mode == fj_load_balancing_mode_t::ALWAYS_ON) { + use_load_balancing = true; + } else if (settings.load_balancing_mode == fj_load_balancing_mode_t::AUTO) { + use_load_balancing = + pb_ptr->n_variables > settings.parameters.load_balancing_codepath_min_varcount; + } + if (settings.mode == fj_mode_t::ROUNDING) { use_load_balancing = false; } + return use_load_balancing; +} + +// precompute estimates of the amount of work performed per selected variable +// using the related_variables table to estimate the nnz touched +// will be replaced with a model estimator in the future. +template +void fj_t::initialize_deterministic_work_estimator() +{ + const i_t num_vars = pb_ptr->n_variables; + const i_t num_cstrs = pb_ptr->n_constraints; + const double total_nnz = static_cast(pb_ptr->coefficients.size()); + + deterministic_refresh_work_ = total_nnz; + deterministic_average_frontier_work_ = total_nnz; + if (num_vars == 0) { return; } + + auto stream = handle_ptr->get_stream(); + auto policy = handle_ptr->get_thrust_policy(); + + // degree[v] = number of constraints variable v appears in + rmm::device_uvector degree(num_vars, stream); + auto rev_offsets = make_span(pb_ptr->reverse_offsets); + thrust::tabulate(policy, degree.begin(), degree.end(), [rev_offsets] __device__(i_t v) -> double { + return (double)(rev_offsets[v + 1] - rev_offsets[v]); + }); + + deterministic_frontier_work_by_var_d_.resize(num_vars, stream); + + if (pb_ptr->related_variables_offsets.size() > 0 && pb_ptr->related_variables.size() > 0) { + // Exact path: segmented reduce over the precomputed related_variables table + auto degree_ptr = degree.data(); + auto related_offsets = pb_ptr->related_variables_offsets.data(); + auto degree_of_related = thrust::make_transform_iterator( + pb_ptr->related_variables.begin(), [degree_ptr, num_vars] __device__(i_t rv) -> double { + return (rv >= 0 && rv < num_vars) ? degree_ptr[rv] : 0.0; + }); + + size_t temp_bytes = 0; + cub::DeviceSegmentedReduce::Sum(nullptr, + temp_bytes, + degree_of_related, + deterministic_frontier_work_by_var_d_.data(), + num_vars, + related_offsets, + related_offsets + 1, + stream); + rmm::device_uvector temp(temp_bytes, stream); + cub::DeviceSegmentedReduce::Sum(temp.data(), + temp_bytes, + degree_of_related, + deterministic_frontier_work_by_var_d_.data(), + num_vars, + related_offsets, + related_offsets + 1, + stream); + + } else { + // SpMV path: frontier_work ≈ A^T * (A * degree) + // Overestimates by double-counting shared neighbors, but deterministic and + // load-balanced. Acceptable for a work-unit proxy. + + // Step 1: y[c] = sum of degree[v] for v in constraint c + rmm::device_uvector y(num_cstrs, stream); + auto degree_ptr = degree.data(); + auto offsets_ptr = pb_ptr->offsets.data(); + auto degree_of_var = thrust::make_transform_iterator( + pb_ptr->variables.begin(), + [degree_ptr] __device__(i_t v) -> double { return degree_ptr[v]; }); + + size_t temp_bytes = 0; + cub::DeviceSegmentedReduce::Sum(nullptr, + temp_bytes, + degree_of_var, + y.data(), + num_cstrs, + offsets_ptr, + offsets_ptr + 1, + stream); + rmm::device_uvector temp(temp_bytes, stream); + cub::DeviceSegmentedReduce::Sum(temp.data(), + temp_bytes, + degree_of_var, + y.data(), + num_cstrs, + offsets_ptr, + offsets_ptr + 1, + stream); + + // Step 2: frontier_work[v] = sum of y[c] for c in constraints_of(v) + auto rev_offs_ptr = pb_ptr->reverse_offsets.data(); + auto y_ptr = y.data(); + auto y_of_constraint = + thrust::make_transform_iterator(pb_ptr->reverse_constraints.begin(), + [y_ptr] __device__(i_t c) -> double { return y_ptr[c]; }); + + temp_bytes = 0; + cub::DeviceSegmentedReduce::Sum(nullptr, + temp_bytes, + y_of_constraint, + deterministic_frontier_work_by_var_d_.data(), + num_vars, + rev_offs_ptr, + rev_offs_ptr + 1, + stream); + temp.resize(temp_bytes, stream); + cub::DeviceSegmentedReduce::Sum(temp.data(), + temp_bytes, + y_of_constraint, + deterministic_frontier_work_by_var_d_.data(), + num_vars, + rev_offs_ptr, + rev_offs_ptr + 1, + stream); + } + + deterministic_average_frontier_work_ = + thrust::reduce(policy, + deterministic_frontier_work_by_var_d_.begin(), + deterministic_frontier_work_by_var_d_.end(), + 0.0, + thrust::plus()) / + (double)num_vars; + deterministic_frontier_work_by_var_.resize(num_vars); + raft::copy(deterministic_frontier_work_by_var_.data(), + deterministic_frontier_work_by_var_d_.data(), + num_vars, + stream); + + CUOPT_LOG_DEBUG( + "FJ determ: initialized frontier work estimator avg_frontier_nnz=%.6f refresh_nnz=%.6f " + "vars=%zu nnz=%zu load_balancing=%d", + deterministic_average_frontier_work_, + deterministic_refresh_work_, + num_vars, + pb_ptr->coefficients.size(), + (int)use_load_balancing_codepath()); +} + template fj_t::~fj_t() { @@ -189,38 +344,43 @@ fj_t::climber_data_t::view_t fj_t::climber_data_t::view() v.jump_candidates = make_span(jump_candidates); v.jump_candidate_count = make_span(jump_candidate_count); v.jump_locks = make_span(jump_locks); - v.candidate_arrived_workids = make_span(candidate_arrived_workids); - v.grid_score_buf = make_span(grid_score_buf); - v.grid_delta_buf = make_span(grid_delta_buf); - v.grid_var_buf = make_span(grid_var_buf); - v.row_size_bin_prefix_sum = make_span(fj.row_size_bin_prefix_sum); - v.row_size_nonbin_prefix_sum = make_span(fj.row_size_nonbin_prefix_sum); - v.work_id_to_bin_var_idx = make_span(fj.work_id_to_bin_var_idx); - v.work_id_to_nonbin_var_idx = make_span(fj.work_id_to_nonbin_var_idx); - v.work_ids_for_related_vars = make_span(fj.work_ids_for_related_vars); - v.fractional_variables = fractional_variables.view(); - v.saved_best_fractional_count = saved_best_fractional_count.data(); - v.handle_fractionals_only = handle_fractionals_only.data(); - v.selected_var = selected_var.data(); - v.violation_score = violation_score.data(); - v.weighted_violation_score = weighted_violation_score.data(); - v.constraints_changed_count = constraints_changed_count.data(); - v.local_minimums_reached = local_minimums_reached.data(); - v.iterations = iterations.data(); - v.best_excess = best_excess.data(); - v.best_objective = best_objective.data(); - v.saved_solution_objective = saved_solution_objective.data(); - v.incumbent_quality = incumbent_quality.data(); - v.incumbent_objective = incumbent_objective.data(); - v.weight_update_increment = fj.weight_update_increment; - v.objective_weight = fj.objective_weight.data(); - v.last_minimum_iteration = last_minimum_iteration.data(); - v.last_improving_minimum = last_improving_minimum.data(); - v.last_iter_candidates = last_iter_candidates.data(); - v.relvar_count_last_update = relvar_count_last_update.data(); - v.load_balancing_skip = load_balancing_skip.data(); - v.break_condition = break_condition.data(); - v.temp_break_condition = temp_break_condition.data(); + v.candidate_arrived_workids = make_span(candidate_arrived_workids); + v.grid_score_buf = make_span(grid_score_buf); + v.grid_delta_buf = make_span(grid_delta_buf); + v.grid_var_buf = make_span(grid_var_buf); + v.row_size_bin_prefix_sum = make_span(fj.row_size_bin_prefix_sum); + v.row_size_nonbin_prefix_sum = make_span(fj.row_size_nonbin_prefix_sum); + v.work_id_to_bin_var_idx = make_span(fj.work_id_to_bin_var_idx); + v.work_id_to_nonbin_var_idx = make_span(fj.work_id_to_nonbin_var_idx); + v.work_ids_for_related_vars = make_span(fj.work_ids_for_related_vars); + v.deterministic_frontier_work_by_var = make_span(fj.deterministic_frontier_work_by_var_d_); + v.fractional_variables = fractional_variables.view(); + v.saved_best_fractional_count = saved_best_fractional_count.data(); + v.handle_fractionals_only = handle_fractionals_only.data(); + v.selected_var = selected_var.data(); + v.violation_score = violation_score.data(); + v.weighted_violation_score = weighted_violation_score.data(); + v.constraints_changed_count = constraints_changed_count.data(); + v.local_minimums_reached = local_minimums_reached.data(); + v.iterations = iterations.data(); + v.best_excess = best_excess.data(); + v.best_objective = best_objective.data(); + v.saved_solution_objective = saved_solution_objective.data(); + v.incumbent_quality = incumbent_quality.data(); + v.incumbent_objective = incumbent_objective.data(); + v.weight_update_increment = fj.weight_update_increment; + v.objective_weight = fj.objective_weight.data(); + v.last_minimum_iteration = last_minimum_iteration.data(); + v.last_improving_minimum = last_improving_minimum.data(); + v.last_iter_candidates = last_iter_candidates.data(); + v.relvar_count_last_update = relvar_count_last_update.data(); + v.load_balancing_skip = load_balancing_skip.data(); + v.break_condition = break_condition.data(); + v.temp_break_condition = temp_break_condition.data(); + v.deterministic_batch_work = deterministic_batch_work.data(); + v.deterministic_refresh_work = fj.deterministic_refresh_work_; + v.deterministic_work_accounting = + (fj.context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); v.best_jump_idx = best_jump_idx.data(); v.small_move_tabu = small_move_tabu.data(); v.stop_threshold = fj.stop_threshold; @@ -432,9 +592,7 @@ void fj_t::climber_init(i_t climber_idx, const rmm::cuda_stream_view& f_t inf = std::numeric_limits::infinity(); climber->best_objective.set_value_async(inf, climber_stream); climber->saved_solution_objective.set_value_async(inf, climber_stream); - climber->violation_score.set_value_to_zero_async(climber_stream); - climber->weighted_violation_score.set_value_to_zero_async(climber_stream); - init_lhs_and_violation<<<256, 256, 0, climber_stream.value()>>>(view); + refresh_lhs_and_violation(climber_stream); // initialize the best_objective values according to the initial assignment f_t best_obj = compute_objective_from_vec( @@ -458,6 +616,7 @@ void fj_t::climber_init(i_t climber_idx, const rmm::cuda_stream_view& climber->last_iter_candidates.set_value_to_zero_async(climber_stream); climber->relvar_count_last_update.set_value_to_zero_async(climber_stream); climber->load_balancing_skip.set_value_to_zero_async(climber_stream); + climber->deterministic_batch_work.set_value_to_zero_async(climber_stream); climber->constraints_changed_count.set_value_to_zero_async(climber_stream); climber->iterations.set_value_to_zero_async(climber_stream); climber->full_refresh_iteration.set_value_to_zero_async(climber_stream); @@ -650,10 +809,10 @@ void fj_t::run_step_device(const rmm::cuda_stream_view& climber_stream auto [grid_setval, blocks_setval] = setval_launch_dims; auto [grid_update_changed_constraints, blocks_update_changed_constraints] = update_changed_constraints_launch_dims; - auto [grid_resetmoves, blocks_resetmoves] = resetmoves_launch_dims; - auto [grid_resetmoves_bin, blocks_resetmoves_bin] = resetmoves_bin_launch_dims; - auto [grid_update_weights, blocks_update_weights] = update_weights_launch_dims; - auto [grid_lift_move, blocks_lift_move] = lift_move_launch_dims; + auto [grid_resetmoves, blocks_resetmoves] = resetmoves_launch_dims; + auto [grid_resetmoves_bin, blocks_resetmoves_bin] = resetmoves_bin_launch_dims; + [[maybe_unused]] auto [grid_update_weights, blocks_update_weights] = update_weights_launch_dims; + [[maybe_unused]] auto [grid_lift_move, blocks_lift_move] = lift_move_launch_dims; auto& data = *climbers[climber_idx]; auto v = data.view(); @@ -669,17 +828,10 @@ void fj_t::run_step_device(const rmm::cuda_stream_view& climber_stream // as it breaks assumptions in the binary_pb codepath if (settings.mode == fj_mode_t::ROUNDING) { is_binary_pb = false; } - bool use_load_balancing = false; - if (settings.load_balancing_mode == fj_load_balancing_mode_t::ALWAYS_OFF) { - use_load_balancing = false; - } else if (settings.load_balancing_mode == fj_load_balancing_mode_t::ALWAYS_ON) { - use_load_balancing = true; - } else if (settings.load_balancing_mode == fj_load_balancing_mode_t::AUTO) { - use_load_balancing = - pb_ptr->n_variables > settings.parameters.load_balancing_codepath_min_varcount; + bool use_load_balancing = use_load_balancing_codepath(); + if (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS) { + data.deterministic_batch_work.set_value_to_zero_async(climber_stream); } - // Load-balanced codepath not updated yet to handle rounding mode - if (settings.mode == fj_mode_t::ROUNDING) { use_load_balancing = false; } cudaGraph_t graph; void* kernel_args[] = {&v}; @@ -841,9 +993,40 @@ void fj_t::refresh_lhs_and_violation(const rmm::cuda_stream_view& stre auto v = data.view(); data.violated_constraints.clear(stream); - data.violation_score.set_value_to_zero_async(stream); - data.weighted_violation_score.set_value_to_zero_async(stream); - init_lhs_and_violation<<<4096, 256, 0, stream>>>(v); + init_lhs_and_violated_constraints<<<4096, 256, 0, stream>>>(v); + // both transformreduce could be fused; but oh well hardly a bottleneck + auto violation = + thrust::transform_reduce(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(pb_ptr->n_constraints), + cuda::proclaim_return_type([v] __device__(i_t cstr_idx) { + return v.excess_score(cstr_idx, v.incumbent_lhs[cstr_idx]); + }), + (f_t)0, + thrust::plus()); + auto weighted_violation = thrust::transform_reduce( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(pb_ptr->n_constraints), + cuda::proclaim_return_type([v] __device__(i_t cstr_idx) { + return v.excess_score(cstr_idx, v.incumbent_lhs[cstr_idx]) * v.cstr_weights[cstr_idx]; + }), + (f_t)0, + thrust::plus()); + data.violation_score.set_value_async(violation, stream); + data.weighted_violation_score.set_value_async(weighted_violation, stream); + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + data.violated_constraints.sort(stream); + } +#if FJ_SINGLE_STEP + CUOPT_LOG_DEBUG("hash assignment %x, hash lhs %x, hash lhscomp %x", + detail::compute_hash(data.incumbent_assignment, stream), + detail::compute_hash(data.incumbent_lhs, stream), + detail::compute_hash(data.incumbent_lhs_sumcomp, stream)); + CUOPT_LOG_DEBUG("Violated constraints hash post sort: %x, index map %x", + detail::compute_hash(data.violated_constraints.contents, stream), + detail::compute_hash(data.violated_constraints.index_map, stream)); +#endif } template @@ -851,6 +1034,10 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) { auto& data = *climbers[climber_idx]; auto v = data.view(); // == climber_views[climber_idx] + const bool deterministic_work_estimate = + (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); + const bool use_graph = true; + const i_t iterations_per_batch = use_graph ? iterations_per_graph : 1; auto climber_stream = data.stream.view(); if (climber_idx == 0) climber_stream = handle_ptr->get_stream(); @@ -865,12 +1052,13 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) data.incumbent_quality.set_value_async(obj, handle_ptr->get_stream()); - timer_t timer(settings.time_limit); + work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); i_t steps; bool limit_reached = false; - for (steps = 0; steps < std::numeric_limits::max(); steps += iterations_per_graph) { + for (steps = 0; steps < std::numeric_limits::max(); steps += iterations_per_batch) { // to actualize time limit handle_ptr->sync_stream(); + const bool lhs_refreshed = (steps % settings.parameters.lhs_refresh_period == 0); if (timer.check_time_limit() || steps >= settings.iteration_limit || context.preempt_heuristic_solver_.load()) { limit_reached = true; @@ -879,9 +1067,11 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) // every now and then, ensure external solutions are added to the population // this is done here because FJ is called within FP and also after recombiners // so FJ is one of the most inner and most frequent functions to be called - if (steps % 10000 == 0 && context.diversity_manager_ptr != nullptr) { - context.diversity_manager_ptr->get_population_pointer() - ->add_external_solutions_to_population(); + if (steps % 10000 == 0 && context.diversity_manager_ptr != nullptr && + context.diversity_manager_ptr != nullptr) { + auto* population_ptr = context.diversity_manager_ptr->get_population_pointer(); + cuopt_assert(population_ptr != nullptr, ""); + population_ptr->add_external_solutions_to_population(); } #if !FJ_SINGLE_STEP @@ -891,7 +1081,7 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) CUOPT_LOG_TRACE( "FJ " "step %d viol %.2g [%d], obj %.8g, best %.8g, mins %d, maxw %g, " - "objw %g", + "objw %g, sol %x, delta %x, inc %x, lhs %x, lhscomp %x, viol %x, weights %x", steps, data.violation_score.value(climber_stream), data.violated_constraints.set_size.value(climber_stream), @@ -899,15 +1089,26 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) data.best_objective.value(climber_stream), data.local_minimums_reached.value(climber_stream), max_cstr_weight.value(climber_stream), - objective_weight.value(climber_stream)); + objective_weight.value(climber_stream), + solution.get_hash(), + detail::compute_hash(data.jump_move_delta, climber_stream), + detail::compute_hash(data.incumbent_assignment, climber_stream), + detail::compute_hash(data.incumbent_lhs, climber_stream), + detail::compute_hash(data.incumbent_lhs_sumcomp, climber_stream), + detail::compute_hash(data.violated_constraints.contents, climber_stream), + detail::compute_hash(cstr_left_weights, climber_stream)); } - if (!limit_reached) { run_step_device(climber_stream, climber_idx); } + if (!limit_reached) { run_step_device(climber_stream, climber_idx, use_graph); } // periodically recompute the LHS and violation scores // to correct any accumulated numerical errors - if (steps % settings.parameters.lhs_refresh_period == 0) { - refresh_lhs_and_violation(climber_stream, climber_idx); + if (lhs_refreshed) { refresh_lhs_and_violation(climber_stream, climber_idx); } + if (deterministic_work_estimate && !limit_reached) { + // TODO: replace with work predictor model + double batch_work = data.deterministic_batch_work.value(climber_stream) / 1e8; + timer.record_work(batch_work); + if (timer.check_time_limit()) { limit_reached = true; } } // periodically synchronize and check the latest solution @@ -985,6 +1186,9 @@ i_t fj_t::host_loop(solution_t& solution, i_t climber_idx) solution.get_feasible(), data.local_minimums_reached.value(climber_stream)); + // compute total time spent + double elapsed_time = timer.elapsed_time(); + CUOPT_LOG_TRACE("best fractional count %d", data.saved_best_fractional_count.value(climber_stream)); @@ -1074,7 +1278,11 @@ template i_t fj_t::solve(solution_t& solution) { raft::common::nvtx::range scope("fj_solve"); - timer_t timer(settings.time_limit); + bool deterministic = (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); + if (deterministic) { + settings.time_limit = std::max((f_t)0.0, settings.time_limit); + settings.work_limit = settings.time_limit; + } handle_ptr = const_cast(solution.handle_ptr); pb_ptr = solution.problem_ptr; last_reported_objective_ = std::numeric_limits::infinity(); @@ -1082,9 +1290,26 @@ i_t fj_t::solve(solution_t& solution) cuopt_func_call(solution.test_variable_bounds(true)); cuopt_assert(solution.test_number_all_integer(), "All integers must be rounded"); } + if (deterministic && settings.work_limit == 0.0) { + CUOPT_LOG_DEBUG("FJ: skipping solve due to exhausted deterministic work budget"); + return solution.compute_feasibility(); + } + auto total_work_start = context.gpu_heur_loop.current_work(); + auto total_time_start = std::chrono::high_resolution_clock::now(); pb_ptr->check_problem_representation(true); resize_vectors(solution.handle_ptr); + CUOPT_LOG_DEBUG( + "FJ: work_limit %f time_limit %f sol hash %x pb hash %x", + settings.work_limit < std::numeric_limits::max() ? settings.work_limit : -1.0, + settings.time_limit < std::numeric_limits::max() ? settings.time_limit : -1.0, + solution.get_hash(), + pb_ptr->get_fingerprint()); + CUOPT_LOG_DEBUG("FJ: weights hash %x, left weights hash %x, right weights hash %x", + detail::compute_hash(cstr_weights, handle_ptr->get_stream()), + detail::compute_hash(cstr_left_weights, handle_ptr->get_stream()), + detail::compute_hash(cstr_right_weights, handle_ptr->get_stream())); + bool is_initial_feasible = solution.compute_feasibility(); auto initial_solution = solution; // if we're in rounding mode, split the time/iteration limit between the first and second stage @@ -1119,11 +1344,16 @@ i_t fj_t::solve(solution_t& solution) RAFT_CHECK_CUDA(handle_ptr->get_stream()); handle_ptr->sync_stream(); + if (deterministic) { initialize_deterministic_work_estimator(); } + i_t iterations = host_loop(solution); RAFT_CHECK_CUDA(handle_ptr->get_stream()); handle_ptr->sync_stream(); - f_t effort_rate = (f_t)iterations / timer.elapsed_time(); + f_t elapsed_time = std::chrono::duration_cast>( + std::chrono::high_resolution_clock::now() - total_time_start) + .count(); + f_t effort_rate = (f_t)iterations / elapsed_time; // If we're in rounding mode and some fractionals remain: round them all // limit = total_limit * second_stage_split @@ -1141,7 +1371,7 @@ i_t fj_t::solve(solution_t& solution) } } - CUOPT_LOG_TRACE("GPU solver took %g", timer.elapsed_time()); + CUOPT_LOG_TRACE("GPU solver took %g", elapsed_time); CUOPT_LOG_TRACE("limit reached, effort rate %g steps/secm %d steps", effort_rate, iterations); reset_cuda_graph(); i_t n_integer_vars = thrust::count_if( @@ -1166,6 +1396,18 @@ i_t fj_t::solve(solution_t& solution) cuopt_assert(solution.compute_feasibility(), "Reverted solution should be feasible"); } + cuopt_func_call(solution.test_variable_bounds()); + + if (deterministic) { + auto total_work_end = context.gpu_heur_loop.current_work(); + CUOPT_LOG_DEBUG("FJ: worked %fwu for %d iterations, %g seconds", + total_work_end - total_work_start, + iterations, + elapsed_time); + } + + CUOPT_LOG_DEBUG("FJ sol hash %x", solution.get_hash()); + return is_new_feasible; } diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh index 50b451a86e..a68ba1c467 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh @@ -19,6 +19,9 @@ #include +#include +#include + #include #define FJ_DEBUG_LOAD_BALANCING 0 @@ -105,6 +108,7 @@ struct fj_settings_t { fj_mode_t mode{fj_mode_t::FIRST_FEASIBLE}; fj_candidate_selection_t candidate_selection{fj_candidate_selection_t::WEIGHTED_SCORE}; double time_limit{60.0}; + double work_limit{std::numeric_limits::infinity()}; int iteration_limit{std::numeric_limits::max()}; fj_hyper_parameters_t parameters{}; int n_of_minimums_for_exit = 7000; @@ -131,12 +135,17 @@ struct fj_move_t { bool operator!=(const fj_move_t& rhs) const { return !(*this == rhs); } }; -// TODO: use 32bit integers instead, -// as we dont need them to be floating point per the FJ2 scoring scheme // sizeof(fj_staged_score_t) <= 8 is needed to allow for atomic loads struct fj_staged_score_t { - float base{-std::numeric_limits::infinity()}; - float bonus{-std::numeric_limits::infinity()}; + int32_t base{std::numeric_limits::lowest()}; + int32_t bonus{std::numeric_limits::lowest()}; + + fj_staged_score_t() = default; + HDI fj_staged_score_t(int32_t base_, int32_t bonus_) : base(base_), bonus(bonus_) {} + fj_staged_score_t(const fj_staged_score_t&) = default; + fj_staged_score_t(fj_staged_score_t&&) = default; + fj_staged_score_t& operator=(const fj_staged_score_t&) = default; + fj_staged_score_t& operator=(fj_staged_score_t&&) = default; HDI bool operator<(fj_staged_score_t other) const noexcept { @@ -154,7 +163,7 @@ struct fj_staged_score_t { HDI static fj_staged_score_t invalid() { - return {-std::numeric_limits::infinity(), -std::numeric_limits::infinity()}; + return {std::numeric_limits::lowest(), std::numeric_limits::lowest()}; } HDI static fj_staged_score_t zero() { return {0, 0}; } @@ -268,6 +277,7 @@ class fj_t { rmm::device_uvector work_id_to_bin_var_idx; rmm::device_uvector work_id_to_nonbin_var_idx; rmm::device_uvector work_ids_for_related_vars; + rmm::device_uvector deterministic_frontier_work_by_var_d_; cudaGraphExec_t graph_instance; bool graph_created = false; @@ -326,6 +336,7 @@ class fj_t { rmm::device_scalar full_refresh_iteration; rmm::device_scalar relvar_count_last_update; rmm::device_scalar load_balancing_skip; + rmm::device_scalar deterministic_batch_work; contiguous_set_t violated_constraints; contiguous_set_t candidate_variables; @@ -420,6 +431,7 @@ class fj_t { last_iter_candidates(0, fj.handle_ptr->get_stream()), relvar_count_last_update(0, fj.handle_ptr->get_stream()), load_balancing_skip(0, fj.handle_ptr->get_stream()), + deterministic_batch_work(0.0, fj.handle_ptr->get_stream()), break_condition(0, fj.handle_ptr->get_stream()), temp_break_condition(0, fj.handle_ptr->get_stream()), cub_storage_bytes(0, fj.handle_ptr->get_stream()), @@ -490,6 +502,7 @@ class fj_t { raft::device_span row_size_nonbin_prefix_sum; raft::device_span work_id_to_bin_var_idx; raft::device_span work_id_to_nonbin_var_idx; + raft::device_span deterministic_frontier_work_by_var; i_t* selected_var; i_t* constraints_changed_count; @@ -518,6 +531,9 @@ class fj_t { i_t* relvar_count_last_update; i_t* load_balancing_skip; f_t* max_cstr_weight; + double* deterministic_batch_work; + double deterministic_refresh_work; + bool deterministic_work_accounting; fj_settings_t* settings; @@ -634,6 +650,19 @@ class fj_t { std::vector> climbers; rmm::device_uvector climber_views; fj_settings_t settings; + std::vector deterministic_frontier_work_by_var_; + double deterministic_average_frontier_work_{0.0}; + double deterministic_refresh_work_{0.0}; + + public: + void initialize_deterministic_work_estimator(); + void set_improvement_callback(fj_improvement_callback_t callback) + { + improvement_callback = std::move(callback); + } + + private: + bool use_load_balancing_codepath() const; fj_improvement_callback_t improvement_callback; f_t last_reported_objective_{std::numeric_limits::infinity()}; diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh index e57f0ec9e2..ec9b592550 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_impl_common.cuh @@ -103,7 +103,9 @@ HDI std::pair feas_score_constraint( f_t cstr_coeff, f_t c_lb, f_t c_ub, - f_t current_lhs) + f_t current_lhs, + f_t cstr_left_weight, + f_t cstr_right_weight) { cuopt_assert(isfinite(delta), "invalid delta"); cuopt_assert(cstr_coeff != 0 && isfinite(cstr_coeff), "invalid coefficient"); @@ -123,14 +125,13 @@ HDI std::pair feas_score_constraint( // TODO: broadcast left/right weights to a csr_offset-indexed table? local minimums // usually occur on a rarer basis (around 50 iteratiosn to 1 local minimum) // likely unreasonable and overkill however - f_t cstr_weight = - bound_idx == 0 ? fj.cstr_left_weights[cstr_idx] : fj.cstr_right_weights[cstr_idx]; - f_t sign = bound_idx == 0 ? -1 : 1; - f_t rhs = bounds[bound_idx] * sign; - f_t old_lhs = current_lhs * sign; - f_t new_lhs = (current_lhs + cstr_coeff * delta) * sign; - f_t old_slack = rhs - old_lhs; - f_t new_slack = rhs - new_lhs; + f_t cstr_weight = bound_idx == 0 ? cstr_left_weight : cstr_right_weight; + f_t sign = bound_idx == 0 ? -1 : 1; + f_t rhs = bounds[bound_idx] * sign; + f_t old_lhs = current_lhs * sign; + f_t new_lhs = (current_lhs + cstr_coeff * delta) * sign; + f_t old_slack = rhs - old_lhs; + f_t new_slack = rhs - new_lhs; cuopt_assert(isfinite(cstr_weight), "invalid weight"); cuopt_assert(cstr_weight >= 0, "invalid weight"); diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu index ebbb761277..90f26ac4a5 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu @@ -14,6 +14,11 @@ #include +#include +#include + +#include + #include #include "feasibility_jump_impl_common.cuh" @@ -25,6 +30,39 @@ namespace cg = cooperative_groups; namespace cuopt::linear_programming::detail { +template +DI void charge_deterministic_iteration_work(typename fj_t::climber_data_t::view_t fj, + bool full_score_refresh) +{ + if (!fj.deterministic_work_accounting || !FIRST_THREAD) { return; } + + const i_t selected_var = *fj.selected_var; + + double work = fj.deterministic_refresh_work; + if (!full_score_refresh && selected_var >= 0 && + selected_var < static_cast(fj.deterministic_frontier_work_by_var.size())) { + work = fj.deterministic_frontier_work_by_var[selected_var]; + } + + *fj.deterministic_batch_work += work; +} + +template +struct score_with_tiebreaker_comparator { + DI auto operator()(const thrust::pair& a, + const thrust::pair& b) const + { + auto a_score = a.first; + auto a_idx = a.second; + auto b_score = b.first; + auto b_idx = b.second; + + if (a_score > b_score) return a; + if (a_score == b_score && a_idx > b_idx) return a; + return b; + } +}; + template DI thrust::pair move_objective_score( const typename fj_t::climber_data_t::view_t& fj, i_t var_idx, f_t delta) @@ -139,7 +177,8 @@ DI void update_weights(typename fj_t::climber_data_t::view_t& fj) } template -__global__ void init_lhs_and_violation(typename fj_t::climber_data_t::view_t fj) +__global__ void init_lhs_and_violated_constraints( + typename fj_t::climber_data_t::view_t fj) { for (i_t cstr_idx = TH_ID_X; cstr_idx < fj.pb.n_constraints; cstr_idx += GRID_STRIDE) { auto [offset_begin, offset_end] = fj.pb.range_for_constraint(cstr_idx); @@ -152,10 +191,7 @@ __global__ void init_lhs_and_violation(typename fj_t::climber_data_t:: fj_kahan_babushka_neumaier_sum(delta_it + offset_begin, delta_it + offset_end); fj.incumbent_lhs_sumcomp[cstr_idx] = 0; - f_t th_violation = fj.excess_score(cstr_idx, fj.incumbent_lhs[cstr_idx]); - f_t weighted_violation = th_violation * fj.cstr_weights[cstr_idx]; - atomicAdd(fj.violation_score, th_violation); - atomicAdd(fj.weighted_violation_score, weighted_violation); + f_t th_violation = fj.excess_score(cstr_idx, fj.incumbent_lhs[cstr_idx]); f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx); if (th_violation < -cstr_tolerance) { fj.violated_constraints.insert(cstr_idx); } } @@ -191,8 +227,17 @@ DI typename fj_t::move_score_info_t compute_new_score( f_t c_lb = fj.pb.constraint_lower_bounds[cstr_idx]; f_t c_ub = fj.pb.constraint_upper_bounds[cstr_idx]; - auto [cstr_base_feas, cstr_bonus_robust] = feas_score_constraint( - fj, var_idx, delta, cstr_idx, cstr_coeff, c_lb, c_ub, fj.incumbent_lhs[cstr_idx]); + auto [cstr_base_feas, cstr_bonus_robust] = + feas_score_constraint(fj, + var_idx, + delta, + cstr_idx, + cstr_coeff, + c_lb, + c_ub, + fj.incumbent_lhs[cstr_idx], + fj.cstr_left_weights[cstr_idx], + fj.cstr_right_weights[cstr_idx]); base_feas += cstr_base_feas; bonus_robust += cstr_bonus_robust; @@ -349,7 +394,7 @@ DI std::pair::move_score_info_t> compute_best_mtm( return std::make_pair(best_val, best_score_info); } -template +template DI void update_jump_value(typename fj_t::climber_data_t::view_t fj, i_t var_idx) { cuopt_assert(var_idx >= 0 && var_idx < fj.pb.n_variables, "invalid variable index"); @@ -376,12 +421,11 @@ DI void update_jump_value(typename fj_t::climber_data_t::view_t fj, i_ fj.pb.check_variable_within_bounds(var_idx, fj.incumbent_assignment[var_idx] + delta), "Var not within bounds!"); } - best_score_info = compute_new_score(fj, var_idx, delta); + best_score_info = compute_new_score(fj, var_idx, delta); } else { - auto [best_val, score_info] = - compute_best_mtm(fj, var_idx); - delta = best_val - fj.incumbent_assignment[var_idx]; - best_score_info = score_info; + auto [best_val, score_info] = compute_best_mtm(fj, var_idx); + delta = best_val - fj.incumbent_assignment[var_idx]; + best_score_info = score_info; } } else { delta = round(1.0 - 2 * fj.incumbent_assignment[var_idx]); @@ -577,14 +621,16 @@ __global__ void update_assignment_kernel(typename fj_t::climber_data_t __syncthreads(); - cuopt_assert(isfinite(fj.jump_move_delta[var_idx]), "delta should be finite"); - // Kahan compensated summation - // fj.incumbent_lhs[cstr_idx] = old_lhs + cstr_coeff * fj.jump_move_delta[var_idx]; - f_t y = cstr_coeff * fj.jump_move_delta[var_idx] - fj.incumbent_lhs_sumcomp[cstr_idx]; - f_t t = old_lhs + y; - fj.incumbent_lhs_sumcomp[cstr_idx] = (t - old_lhs) - y; - fj.incumbent_lhs[cstr_idx] = t; - cuopt_assert(isfinite(fj.incumbent_lhs[cstr_idx]), "assignment should be finite"); + if (threadIdx.x == 0) { + cuopt_assert(isfinite(fj.jump_move_delta[var_idx]), "delta should be finite"); + // Kahan compensated summation + // fj.incumbent_lhs[cstr_idx] = old_lhs + cstr_coeff * fj.jump_move_delta[var_idx]; + f_t y = cstr_coeff * fj.jump_move_delta[var_idx] - fj.incumbent_lhs_sumcomp[cstr_idx]; + f_t t = old_lhs + y; + fj.incumbent_lhs_sumcomp[cstr_idx] = (t - old_lhs) - y; + fj.incumbent_lhs[cstr_idx] = t; + cuopt_assert(isfinite(fj.incumbent_lhs[cstr_idx]), "assignment should be finite"); + } } // update the assignment and objective proper @@ -626,8 +672,8 @@ __global__ void update_assignment_kernel(typename fj_t::climber_data_t #if FJ_SINGLE_STEP DEVICE_LOG_DEBUG( - "=---- FJ[%d]: updated %d [%g/%g] :%.4g+{%.4g}=%.4g score {%g,%g}, d_obj %.2g+%.2g=%.2g, " - "err_range %.2g%%, infeas %.2g, total viol %d\n", + "=---- FJ[%d]: updated %d [%g/%g] :%.4g+{%.4g}=%.4g score {%d,%d}, d_obj %.2g+%.2g=%.2g, " + "err_range %.2g%%, infeas %.2g, total viol %d, obj %x, delta %x, coef %x\n", *fj.iterations, var_idx, get_lower(fj.pb.variable_bounds[var_idx]), @@ -642,7 +688,10 @@ __global__ void update_assignment_kernel(typename fj_t::climber_data_t *fj.incumbent_objective + fj.jump_move_delta[var_idx] * fj.pb.objective_coefficients[var_idx], delta_rel_err, fj.jump_move_infeasibility[var_idx], - fj.violated_constraints.size()); + fj.violated_constraints.size(), + detail::compute_hash(*fj.incumbent_objective), + detail::compute_hash(fj.jump_move_delta[var_idx]), + detail::compute_hash(fj.pb.objective_coefficients[var_idx])); #endif // reset the score fj.jump_move_scores[var_idx] = fj_t::move_score_t::invalid(); @@ -862,6 +911,16 @@ DI void update_changed_constraints(typename fj_t::climber_data_t::view if (blockIdx.x == 0) { if (threadIdx.x == 0) { + // sort changed constraints to guarantee determinism + // TODO: usually csontraint changed few, but thats still rather dreadful... + // block-parallelize at least? but not trivial for arbitrary sizes w/ CUB + // TODO: replace once focus shifts to tuning deterministic GPU heuristics + if (fj.deterministic_work_accounting) { + thrust::sort(thrust::seq, + fj.constraints_changed.begin(), + fj.constraints_changed.begin() + *fj.constraints_changed_count); + } + for (i_t i = 0; i < *fj.constraints_changed_count; ++i) { i_t idx = fj.constraints_changed[i]; if ((idx & 1) == CONSTRAINT_FLAG_INSERT) { @@ -953,7 +1012,7 @@ __global__ void compute_iteration_related_variables_kernel( compute_iteration_related_variables(fj); } -template +template __device__ void compute_mtm_moves(typename fj_t::climber_data_t::view_t fj, bool ForceRefresh) { @@ -965,11 +1024,14 @@ __device__ void compute_mtm_moves(typename fj_t::climber_data_t::view_ if (*fj.selected_var == std::numeric_limits::max()) full_refresh = true; // always do a full sweep when looking for satisfied mtm moves - if constexpr (move_type == MTMMoveType::FJ_MTM_SATISFIED) full_refresh = true; - - // only update related variables i_t split_begin, split_end; - if (full_refresh) { + if constexpr (move_type == MTMMoveType::FJ_MTM_SATISFIED) { + full_refresh = true; + split_begin = 0; + split_end = fj.objective_vars.size(); + } + // only update related variables + else if (full_refresh) { split_begin = 0; split_end = fj.pb.n_variables; } @@ -989,12 +1051,20 @@ __device__ void compute_mtm_moves(typename fj_t::climber_data_t::view_ split_end = range.second; } + charge_deterministic_iteration_work(fj, full_refresh); + if (FIRST_THREAD) *fj.relvar_count_last_update = split_end - split_begin; for (i_t i = blockIdx.x + split_begin; i < split_end; i += gridDim.x) { - i_t var_idx = full_refresh ? i - : fj.pb.related_variables.size() == 0 ? i - : fj.pb.related_variables[i]; + // if sat MTM mode, go over objective variables only + i_t var_idx; + if constexpr (move_type == MTMMoveType::FJ_MTM_SATISFIED) { + var_idx = fj.objective_vars[i]; + } else { + var_idx = full_refresh ? i + : fj.pb.related_variables.size() == 0 ? i + : fj.pb.related_variables[i]; + } // skip if we couldnt precompute a related var table and // this variable isnt in the dynamic related variable table @@ -1017,7 +1087,7 @@ __device__ void compute_mtm_moves(typename fj_t::climber_data_t::view_ } cuopt_assert(var_idx >= 0 && var_idx < fj.pb.n_variables, ""); - update_jump_value(fj, var_idx); + update_jump_value(fj, var_idx); } } @@ -1025,7 +1095,7 @@ template __global__ void compute_mtm_moves_kernel(typename fj_t::climber_data_t::view_t fj, bool ForceRefresh) { - compute_mtm_moves(fj, ForceRefresh); + compute_mtm_moves(fj, ForceRefresh); } template @@ -1037,8 +1107,9 @@ __global__ void select_variable_kernel(typename fj_t::climber_data_t:: fj.settings->seed, *fj.iterations * fj.settings->parameters.max_sampled_moves, 0); using move_score_t = typename fj_t::move_score_t; - __shared__ alignas(move_score_t) char shmem_storage[2 * raft::WarpSize * sizeof(move_score_t)]; - auto* const shmem = (move_score_t*)shmem_storage; + __shared__ alignas(thrust::pair) char + shmem_storage[raft::WarpSize * sizeof(thrust::pair)]; + auto* const shmem = (thrust::pair*)shmem_storage; auto th_best_score = fj_t::move_score_t::invalid(); i_t th_selected_var = std::numeric_limits::max(); @@ -1075,8 +1146,11 @@ __global__ void select_variable_kernel(typename fj_t::climber_data_t:: } } // Block level reduction to get the best variable from the sample + // Use deterministic tie-breaking comparator based on var_idx auto [best_score, reduced_selected_var] = - raft::blockRankedReduce(th_best_score, shmem, th_selected_var, raft::max_op{}); + raft::blockReduce(thrust::make_pair(th_best_score, th_selected_var), + (char*)shmem, + score_with_tiebreaker_comparator{}); if (FIRST_THREAD) { // assign it to print the value outside th_best_score = best_score; @@ -1111,9 +1185,9 @@ __global__ void select_variable_kernel(typename fj_t::climber_data_t:: i_t var_range = get_upper(bounds) - get_lower(bounds); double delta_rel_err = fabs(fj.jump_move_delta[selected_var]) / var_range * 100; DEVICE_LOG_INFO( - "=---- FJ: selected %d [%g/%g] %c :%.4g+{%.4g}=%.4g score {%g,%g}, d_obj %.2g+%.2g->%.2g, " + "=---- FJ: selected %d [%g/%g] %c :%.4g+{%.4g}=%.4g score {%d,%d}, d_obj %.2g+%.2g->%.2g, " "delta_rel_err %.2g%%, " - "infeas %.2g, total viol %d, out of %d\n", + "infeas %.2g, total viol %d, out of %d, obj %x\n", selected_var, get_lower(bounds), get_upper(bounds), @@ -1130,9 +1204,18 @@ __global__ void select_variable_kernel(typename fj_t::climber_data_t:: delta_rel_err, fj.jump_move_infeasibility[selected_var], fj.violated_constraints.size(), - good_var_count); + good_var_count, + detail::compute_hash(*fj.incumbent_objective)); #endif cuopt_assert(fj.jump_move_scores[selected_var].valid(), ""); + } else { +#if FJ_SINGLE_STEP + DEVICE_LOG_INFO("=[%d]---- FJ: no var selected, obj is %g, viol %d, out of %d\n", + *fj.iterations, + *fj.incumbent_objective, + fj.violated_constraints.size(), + good_var_count); +#endif } } } @@ -1202,27 +1285,32 @@ DI thrust::tuple::move_score_t> gridwide_reduc if (blockIdx.x == 0) { using move_score_t = typename fj_t::move_score_t; - __shared__ alignas(move_score_t) char shmem_storage[2 * raft::WarpSize * sizeof(move_score_t)]; - auto* const shmem = (move_score_t*)shmem_storage; + __shared__ alignas(thrust::pair) char + shmem_storage[2 * raft::WarpSize * sizeof(thrust::pair)]; + auto* const shmem = (thrust::pair*)shmem_storage; auto th_best_score = fj_t::move_score_t::invalid(); i_t th_best_block = 0; + i_t th_best_var = -1; for (i_t i = threadIdx.x; i < gridDim.x; i += blockDim.x) { auto var_idx = fj.grid_var_buf[i]; auto move_score = fj.grid_score_buf[i]; - if (move_score > th_best_score || - (move_score == th_best_score && var_idx > fj.grid_var_buf[th_best_block])) { + if (move_score > th_best_score || (move_score == th_best_score && var_idx > th_best_var)) { th_best_score = move_score; th_best_block = i; + th_best_var = var_idx; } } // Block level reduction to get the best variable from all blocks - auto [reduced_best_score, reduced_best_block] = - raft::blockRankedReduce(th_best_score, shmem, th_best_block, raft::max_op{}); - - if (reduced_best_score.valid() && threadIdx.x == 0) { - cuopt_assert(th_best_block < gridDim.x, ""); + auto [reduced_best_score_pair, reduced_best_block] = + raft::blockRankedReduce(thrust::make_pair(th_best_score, th_best_var), + shmem, + th_best_block, + score_with_tiebreaker_comparator{}); + + if (reduced_best_score_pair.first.valid() && threadIdx.x == 0) { + cuopt_assert(reduced_best_block < gridDim.x, ""); best_var = fj.grid_var_buf[reduced_best_block]; best_delta = fj.grid_delta_buf[reduced_best_block]; best_score = fj.grid_score_buf[reduced_best_block]; @@ -1244,6 +1332,9 @@ DI thrust::tuple::move_score_t> best_random_mt raft::random::PCGenerator rng(fj.settings->seed + *fj.iterations, 0, 0); i_t cstr_idx = fj.violated_constraints.contents[rng.next_u32() % fj.violated_constraints.size()]; + cuopt_assert(fj.excess_score(cstr_idx, fj.incumbent_lhs[cstr_idx]) < 0, + "constraint isn't violated"); + auto [offset_begin, offset_end] = fj.pb.range_for_constraint(cstr_idx); return gridwide_reduce_best_move( @@ -1258,7 +1349,9 @@ DI thrust::tuple::move_score_t> best_sat_cstr_ typename fj_t::climber_data_t::view_t fj) { // compute all MTM moves within satisfied constraints - compute_mtm_moves(fj, true); + compute_mtm_moves(fj, true); + // NOTE: grid sync not required since each block only reduces over variables that it updated in + // compute_mtm_moves return gridwide_reduce_best_move( fj, fj.objective_vars.begin(), fj.objective_vars.end(), [fj] __device__(i_t var_idx) { return fj.jump_move_delta[var_idx]; @@ -1413,9 +1506,10 @@ __global__ void handle_local_minimum_kernel(typename fj_t::climber_dat if (sat_best_score.base > 0 && sat_best_score > best_score) { if (FIRST_THREAD) { - best_score = sat_best_score; - best_var = sat_best_var; - best_delta = sat_best_delta; + best_score = sat_best_score; + best_var = sat_best_var; + best_delta = sat_best_delta; + best_movetype = 'S'; } } } @@ -1427,6 +1521,15 @@ __global__ void handle_local_minimum_kernel(typename fj_t::climber_dat best_var, fj.incumbent_assignment[best_var] + best_delta), "assignment not within bounds"); fj.jump_move_delta[best_var] = best_delta; +#if FJ_SINGLE_STEP + DEVICE_LOG_DEBUG("FJ[%d] selected_var: %d, delta %g, score {%d %d}, type %c\n", + *fj.iterations, + best_var, + best_delta, + best_score.base, + best_score.bonus, + best_movetype); +#endif } } } @@ -1458,7 +1561,7 @@ __global__ void handle_local_minimum_kernel(typename fj_t::climber_dat const __grid_constant__ typename fj_t::climber_data_t::view_t fj); \ template __global__ void load_balancing_sanity_checks( \ const __grid_constant__ typename fj_t::climber_data_t::view_t fj); \ - template __global__ void init_lhs_and_violation( \ + template __global__ void init_lhs_and_violated_constraints( \ typename fj_t::climber_data_t::view_t fj); \ template __global__ void update_lift_moves_kernel( \ typename fj_t::climber_data_t::view_t fj); \ diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh index 55fd4e61f1..9b99cdeb21 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cuh @@ -52,7 +52,8 @@ __global__ void load_balancing_mtm_compute_scores( const __grid_constant__ typename fj_t::climber_data_t::view_t fj); template -__global__ void init_lhs_and_violation(typename fj_t::climber_data_t::view_t fj); +__global__ void init_lhs_and_violated_constraints( + typename fj_t::climber_data_t::view_t fj); // Update the jump move tables after the best jump value has been computed for a "heavy" variable template diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu index b16f299bf1..34634959c8 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu @@ -11,16 +11,20 @@ #include "feasibility_jump_impl_common.cuh" #include "fj_cpu.cuh" +#include #include #include +#include +#include + +#include #include -#include -#include +#include +#include #include #include -#include #include #include @@ -38,6 +42,24 @@ namespace cuopt::linear_programming::detail { +namespace { + +double read_positive_work_unit_scale(const char* env_name) +{ + const char* env_value = std::getenv(env_name); + if (env_value == nullptr || env_value[0] == '\0') { return 1.0; } + + errno = 0; + char* end_ptr = nullptr; + const double parsed_value = std::strtod(env_value, &end_ptr); + const bool valid_value = errno == 0 && end_ptr != env_value && *end_ptr == '\0' && + std::isfinite(parsed_value) && parsed_value > 0.0; + cuopt_assert(valid_value, "Invalid CPUFJ work-unit scale env var"); + return parsed_value; +} + +} // namespace + template thrust::tuple get_mtm_for_bound(const typename fj_t::climber_data_t::view_t& fj, i_t var_idx, @@ -107,99 +129,6 @@ thrust::tuple get_mtm_for_constraint( return {delta_ij, sign, slack, cstr_tolerance}; } -template -std::pair feas_score_constraint(const typename fj_t::climber_data_t::view_t& fj, - i_t var_idx, - f_t delta, - i_t cstr_idx, - f_t cstr_coeff, - f_t c_lb, - f_t c_ub, - f_t current_lhs, - f_t left_weight, - f_t right_weight) -{ - cuopt_assert(isfinite(delta), "invalid delta"); - cuopt_assert(cstr_coeff != 0 && isfinite(cstr_coeff), "invalid coefficient"); - - f_t base_feas = 0; - f_t bonus_robust = 0; - - f_t bounds[2] = {c_lb, c_ub}; - cuopt_assert(isfinite(c_lb) || isfinite(c_ub), "no range"); - for (i_t bound_idx = 0; bound_idx < 2; ++bound_idx) { - if (!isfinite(bounds[bound_idx])) continue; - - // factor to correct the lhs/rhs to turn a lb <= lhs <= ub constraint into - // two virtual leq constraints "lhs <= ub" and "-lhs <= -lb" in order to match - // the convention of the paper - - // TODO: broadcast left/right weights to a csr_offset-indexed table? local minimums - // usually occur on a rarer basis (around 50 iteratiosn to 1 local minimum) - // likely unreasonable and overkill however - f_t cstr_weight = bound_idx == 0 ? left_weight : right_weight; - f_t sign = bound_idx == 0 ? -1 : 1; - f_t rhs = bounds[bound_idx] * sign; - f_t old_lhs = current_lhs * sign; - f_t new_lhs = (current_lhs + cstr_coeff * delta) * sign; - f_t old_slack = rhs - old_lhs; - f_t new_slack = rhs - new_lhs; - - cuopt_assert(isfinite(cstr_weight), "invalid weight"); - cuopt_assert(cstr_weight >= 0, "invalid weight"); - cuopt_assert(isfinite(old_lhs), ""); - cuopt_assert(isfinite(new_lhs), ""); - cuopt_assert(isfinite(old_slack) && isfinite(new_slack), ""); - - f_t cstr_tolerance = fj.get_corrected_tolerance(cstr_idx, c_lb, c_ub); - - bool old_viol = fj.excess_score(cstr_idx, current_lhs, c_lb, c_ub) < -cstr_tolerance; - bool new_viol = - fj.excess_score(cstr_idx, current_lhs + cstr_coeff * delta, c_lb, c_ub) < -cstr_tolerance; - - bool old_sat = old_lhs < rhs + cstr_tolerance; - bool new_sat = new_lhs < rhs + cstr_tolerance; - - // equality - if (fj.pb.integer_equal(c_lb, c_ub)) { - if (!old_viol) cuopt_assert(old_sat == !old_viol, ""); - if (!new_viol) cuopt_assert(new_sat == !new_viol, ""); - } - - // if it would feasibilize this constraint - if (!old_sat && new_sat) { - cuopt_assert(old_viol, ""); - base_feas += cstr_weight; - } - // would cause this constraint to be violated - else if (old_sat && !new_sat) { - cuopt_assert(new_viol, ""); - base_feas -= cstr_weight; - } - // simple improvement - else if (!old_sat && !new_sat && old_lhs > new_lhs) { - cuopt_assert(old_viol && new_viol, ""); - base_feas += (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight); - } - // simple worsening - else if (!old_sat && !new_sat && old_lhs <= new_lhs) { - cuopt_assert(old_viol && new_viol, ""); - base_feas -= (i_t)(cstr_weight * fj.settings->parameters.excess_improvement_weight); - } - - // robustness score bonus if this would leave some strick slack - bool old_stable = old_lhs < rhs - cstr_tolerance; - bool new_stable = new_lhs < rhs - cstr_tolerance; - if (!old_stable && new_stable) { - bonus_robust += cstr_weight; - } else if (old_stable && !new_stable) { - bonus_robust -= cstr_weight; - } - } - - return {base_feas, bonus_robust}; -} - static constexpr double BIGVAL_THRESHOLD = 1e20; template @@ -1401,6 +1330,15 @@ std::unique_ptr> fj_t::create_cpu_climber( // Initialize fj_cpu with all the data init_fj_cpu(*fj_cpu, solution, left_weights, right_weights, objective_weight); + const double cpu_work_unit_scale = + context.settings.cpufj_work_unit_scale != 1.0 + ? context.settings.cpufj_work_unit_scale + : read_positive_work_unit_scale("CUOPT_CPUFJ_WORK_UNIT_SCALE"); + fj_cpu->work_unit_bias *= cpu_work_unit_scale; + if (cpu_work_unit_scale != 1.0) { + CUOPT_DETERMINISM_LOG( + "CPUFJ using work-unit scale %f (bias=%f)", cpu_work_unit_scale, fj_cpu->work_unit_bias); + } fj_cpu->settings = settings; if (randomize_params) { auto rng = std::mt19937(cuopt::seed_generator::get_seed()); @@ -1550,6 +1488,10 @@ static bool cpufj_solve_loop(fj_cpu_climber_t& fj_cpu, f_t in_time_lim fj_cpu.work_units_elapsed += biased_work; if (fj_cpu.producer_sync != nullptr) { fj_cpu.producer_sync->notify_progress(); } + + if (fj_cpu.work_units_elapsed.load(std::memory_order_relaxed) >= fj_cpu.work_budget) { + break; + } } cuopt_func_call(sanity_checks(fj_cpu)); diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh index 3263609a2b..4124bd079a 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh @@ -154,7 +154,8 @@ struct fj_cpu_climber_t { // Work unit tracking for deterministic synchronization std::atomic work_units_elapsed{0.0}; - double work_unit_bias{1.5}; // Bias factor to keep CPUFJ ahead of B&B + double work_unit_bias{1.5}; // Bias factor to keep CPUFJ ahead of B&B + double work_budget{std::numeric_limits::infinity()}; producer_sync_t* producer_sync{nullptr}; // Optional sync utility for notifying progress std::atomic halted{false}; diff --git a/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh b/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh index dfc9b3c885..8b77367ac4 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/load_balancing.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -120,16 +120,19 @@ __global__ void load_balancing_prepare_iteration(const __grid_constant__ typename fj_t::climber_data_t::view_t fj) { bool full_refresh = needs_full_refresh(fj); + charge_deterministic_iteration_work(fj, full_refresh); // alternate codepath in the case of a small related_var/total_var ratio if (!full_refresh && fj.pb.related_variables.size() > 0 && fj.pb.n_variables / fj.work_ids_for_related_vars[*fj.selected_var] >= - fj.settings->parameters.old_codepath_total_var_to_relvar_ratio_threshold) { + fj.settings->parameters.old_codepath_total_var_to_relvar_ratio_threshold && + fj.settings->load_balancing_mode != fj_load_balancing_mode_t::ALWAYS_ON) { auto range = fj.pb.range_for_related_vars(*fj.selected_var); for (i_t i = blockIdx.x + range.first; i < range.second; i += gridDim.x) { i_t var_idx = fj.pb.related_variables[i]; - update_jump_value(fj, var_idx); + update_jump_value(fj, + var_idx); } if (FIRST_THREAD) *fj.load_balancing_skip = true; @@ -334,8 +337,17 @@ __global__ void load_balancing_compute_scores_binary( auto c_lb = fj.constraint_lower_bounds_csr[csr_offset]; auto c_ub = fj.constraint_upper_bounds_csr[csr_offset]; - auto [cstr_base_feas, cstr_bonus_robust] = feas_score_constraint( - fj, var_idx, delta, cstr_idx, cstr_coeff, c_lb, c_ub, fj.incumbent_lhs[cstr_idx]); + auto [cstr_base_feas, cstr_bonus_robust] = + feas_score_constraint(fj, + var_idx, + delta, + cstr_idx, + cstr_coeff, + c_lb, + c_ub, + fj.incumbent_lhs[cstr_idx], + fj.cstr_left_weights[cstr_idx], + fj.cstr_right_weights[cstr_idx]); base_feas += cstr_base_feas; bonus_robust += cstr_bonus_robust; @@ -526,8 +538,8 @@ __launch_bounds__(TPB_loadbalance, 16) __global__ auto& score_info = candidate.score; - f_t base_feas = 0; - f_t bonus_robust = 0; + int32_t base_feas = 0; + int32_t bonus_robust = 0; // same as for the binary var kernel, compute each score compoenent per thread // and merge then via a wapr reduce @@ -535,8 +547,17 @@ __launch_bounds__(TPB_loadbalance, 16) __global__ cuopt_assert(c_lb == fj.pb.constraint_lower_bounds[cstr_idx], "bound sanity check failed"); cuopt_assert(c_ub == fj.pb.constraint_upper_bounds[cstr_idx], "bound sanity check failed"); - auto [cstr_base_feas, cstr_bonus_robust] = feas_score_constraint( - fj, var_idx, delta, cstr_idx, cstr_coeff, c_lb, c_ub, fj.incumbent_lhs[cstr_idx]); + auto [cstr_base_feas, cstr_bonus_robust] = + feas_score_constraint(fj, + var_idx, + delta, + cstr_idx, + cstr_coeff, + c_lb, + c_ub, + fj.incumbent_lhs[cstr_idx], + fj.cstr_left_weights[cstr_idx], + fj.cstr_right_weights[cstr_idx]); base_feas += cstr_base_feas; bonus_robust += cstr_bonus_robust; @@ -565,24 +586,29 @@ __launch_bounds__(TPB_loadbalance, 16) __global__ best_score_ref{fj.jump_move_scores[var_idx]}; auto best_score = best_score_ref.load(cuda::memory_order_relaxed); + cuda::atomic_ref best_delta_ref{ + fj.jump_move_delta[var_idx]}; + auto best_delta = best_delta_ref.load(cuda::memory_order_relaxed); + if (best_score < candidate.score || - (best_score == candidate.score && candidate.delta < fj.jump_move_delta[var_idx])) { + (best_score == candidate.score && candidate.delta < best_delta)) { // update the best move delta acquire_lock(&fj.jump_locks[var_idx]); // reject this move if it would increase the target variable to a numerically unstable // value - if (!fj.move_numerically_stable(fj.incumbent_assignment[var_idx], - fj.incumbent_assignment[var_idx] + delta, - base_feas, - *fj.violation_score)) { - fj.jump_move_scores[var_idx] = fj_t::move_score_t::invalid(); - } else if (fj.jump_move_scores[var_idx] < candidate.score - // determinism for ease of debugging - || (fj.jump_move_scores[var_idx] == candidate.score && - candidate.delta < fj.jump_move_delta[var_idx])) { - fj.jump_move_delta[var_idx] = candidate.delta; - fj.jump_move_scores[var_idx] = candidate.score; + // only skip updating, don't invalidate existing valid moves + if (fj.move_numerically_stable(fj.incumbent_assignment[var_idx], + fj.incumbent_assignment[var_idx] + delta, + base_feas, + *fj.violation_score)) { + if (fj.jump_move_scores[var_idx] < candidate.score + // determinism for ease of debugging + || (fj.jump_move_scores[var_idx] == candidate.score && + candidate.delta < fj.jump_move_delta[var_idx])) { + fj.jump_move_delta[var_idx] = candidate.delta; + fj.jump_move_scores[var_idx] = candidate.score; + } } release_lock(&fj.jump_locks[var_idx]); } @@ -644,7 +670,7 @@ __global__ void load_balancing_sanity_checks(const __grid_constant__ if (!(score_1 == score_1.invalid() && score_2 == score_2.invalid()) && !(v.pb.integer_equal(score_1.base, score_2.base) && v.pb.integer_equal(score_1.bonus, score_2.bonus))) { - printf("(iter %d) [%d, int:%d]: delta %g/%g was %f/%f, is %f/%f\n", + printf("(iter %d) [%d, int:%d]: delta %g/%g was %d/%d, is %d/%d\n", *v.iterations, var_idx, v.pb.is_integer_var(var_idx), diff --git a/cpp/src/mip_heuristics/feasibility_jump/utils.cuh b/cpp/src/mip_heuristics/feasibility_jump/utils.cuh index d98686bcc6..a16567b092 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/utils.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/utils.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -10,6 +10,7 @@ #include "feasibility_jump.cuh" #include +#include #include #include #include @@ -133,6 +134,23 @@ struct contiguous_set_t { validity_bitmap.resize(size, stream); } + void sort(const rmm::cuda_stream_view& stream) + { + thrust::sort( + rmm::exec_policy(stream), contents.begin(), contents.begin() + set_size.value(stream)); + thrust::fill(rmm::exec_policy(stream), index_map.begin(), index_map.end(), -1); + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(set_size.value(stream)), + [v = view()] __device__(i_t idx) { v.index_map[v.contents[idx]] = idx; }); + + // only useful for debugging and ensuring the same hashes are printed +#if FJ_SINGLE_STEP + thrust::fill( + rmm::exec_policy(stream), contents.begin() + set_size.value(stream), contents.end(), 0); +#endif + } + struct view_t { i_t* set_size; i_t* lock; diff --git a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu index 0a17e3ebfd..34034956af 100644 --- a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu +++ b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cu @@ -29,6 +29,14 @@ #include #include +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif namespace cuopt::linear_programming::detail { template @@ -52,7 +60,7 @@ feasibility_pump_t::feasibility_pump_t( context.problem_ptr->handle_ptr->get_stream()), lp_optimal_solution(lp_optimal_solution_), rng(cuopt::seed_generator::get_seed()), - timer(20.) + timer(20., *context.termination) { } @@ -147,18 +155,36 @@ bool feasibility_pump_t::linear_project_onto_polytope(solution_t temp_p(*solution.problem_ptr); auto h_integer_indices = cuopt::host_copy(solution.problem_ptr->integer_indices, solution.handle_ptr->get_stream()); + cuopt_assert(h_assignment.size() == solution.problem_ptr->n_variables, "Size mismatch"); + cuopt_assert(h_last_projection.size() == solution.problem_ptr->n_variables, "Size mismatch"); + cuopt_assert(h_variable_bounds.size() == solution.problem_ptr->n_variables, "Size mismatch"); + CUOPT_DETERMINISM_LOG( + "FP proj inputs: assign_hash=0x%x last_proj_hash=0x%x integer_idx_hash=0x%x n_vars=%d n_int=%d", + detail::compute_hash(h_assignment), + detail::compute_hash(h_last_projection), + detail::compute_hash(h_integer_indices), + solution.problem_ptr->n_variables, + solution.problem_ptr->n_integer_vars); f_t obj_offset = 0; + i_t n_at_upper = 0; + i_t n_at_lower = 0; + i_t n_interior = 0; + std::vector interior_integer_indices; + interior_integer_indices.reserve(h_integer_indices.size()); // for each integer add the variable and the distance constraints for (auto i : h_integer_indices) { + cuopt_assert(i >= 0 && i < solution.problem_ptr->n_variables, "Index out of bounds"); auto h_var_bounds = h_variable_bounds[i]; if (solution.problem_ptr->integer_equal(h_assignment[i], get_upper(h_var_bounds))) { obj_offset += get_upper(h_var_bounds); // set the objective weight to -1, u - x obj_coefficients[i] = -1; + n_at_upper++; } else if (solution.problem_ptr->integer_equal(h_assignment[i], get_lower(h_var_bounds))) { obj_offset -= get_lower(h_var_bounds); // set the objective weight to +1, x - l obj_coefficients[i] = 1; + n_at_lower++; } else { // objective weight is 1 const f_t obj_weight = 1.; @@ -183,9 +209,30 @@ bool feasibility_pump_t::linear_project_onto_polytope(solution_t constr_coeffs_2{1, 1}; h_constraints.add_constraint( constr_indices, constr_coeffs_2, h_assignment[i], (f_t)default_cont_upper); + n_interior++; + interior_integer_indices.push_back(i); } } + CUOPT_DETERMINISM_LOG( + "FP proj build: at_lower=%d at_upper=%d interior=%d interior_idx_hash=0x%x obj_hash=0x%x " + "assign_aug_hash=0x%x vars_added=%d cstr_added=%d cstr_var_hash=0x%x cstr_coeff_hash=0x%x " + "cstr_offset_hash=0x%x cstr_lb_hash=0x%x cstr_ub_hash=0x%x", + n_at_lower, + n_at_upper, + n_interior, + detail::compute_hash(interior_integer_indices), + detail::compute_hash(obj_coefficients), + detail::compute_hash(h_assignment), + h_variables.size(), + h_constraints.n_constraints(), + detail::compute_hash(h_constraints.constraint_variables), + detail::compute_hash(h_constraints.constraint_coefficients), + detail::compute_hash(h_constraints.constraint_offsets), + detail::compute_hash(h_constraints.constraint_lower_bounds), + detail::compute_hash(h_constraints.constraint_upper_bounds)); adjust_objective_with_original(solution, obj_coefficients, longer_lp_run); + CUOPT_DETERMINISM_LOG("FP proj adjusted objective hash=0x%x", + detail::compute_hash(obj_coefficients)); // commit all the changes that were done by the host if (h_variables.size() > 0) { temp_p.insert_variables(h_variables); } if (h_constraints.n_constraints() > 0) { temp_p.insert_constraints(h_constraints); } @@ -196,6 +243,12 @@ bool feasibility_pump_t::linear_project_onto_polytope(solution_tget_stream()), + temp_p.n_variables, + temp_p.n_constraints); // copy new objective coefficients raft::copy(temp_p.objective_coefficients.data(), obj_coefficients.data(), @@ -210,13 +263,19 @@ bool feasibility_pump_t::linear_project_onto_polytope(solution_t::round(solution_t& solution) { bool result; CUOPT_LOG_DEBUG("Rounding the point"); - timer_t bounds_prop_timer(std::max(0.05, std::min(0.5, timer.remaining_time() / 10.))); + const int64_t seed_before = cuopt::seed_generator::peek_seed(); + const uint32_t hash_before = solution.get_hash(); + CUOPT_DETERMINISM_LOG("FP round entry: hash=0x%x seed=%lld rem=%.6f", + hash_before, + (long long)seed_before, + timer.remaining_time()); + + f_t bounds_prop_time_limit = std::min((f_t)0.5, timer.remaining_time() / 10.); + if (timer.deterministic) { + bounds_prop_time_limit = std::max((f_t)0.0, bounds_prop_time_limit); + } else { + bounds_prop_time_limit = std::max((f_t)0.05, bounds_prop_time_limit); + } + work_limit_timer_t bounds_prop_timer( + context.gpu_heur_loop, bounds_prop_time_limit, *context.termination); const f_t lp_run_time_after_feasible = 0.; bool old_var = constraint_prop.round_all_vars; f_t old_time = constraint_prop.max_time_for_bounds_prop; @@ -257,13 +330,20 @@ bool feasibility_pump_t::round(solution_t& solution) result = constraint_prop.apply_round(solution, lp_run_time_after_feasible, bounds_prop_timer); constraint_prop.round_all_vars = old_var; constraint_prop.max_time_for_bounds_prop = old_time; - // result = solution.round_nearest(); cuopt_func_call(solution.test_variable_bounds(true)); - // copy the last rounding raft::copy(last_rounding.data(), solution.assignment.data(), solution.assignment.size(), solution.handle_ptr->get_stream()); + + const int64_t seed_after = cuopt::seed_generator::peek_seed(); + CUOPT_DETERMINISM_LOG("FP round exit: hash=0x%x seed=%lld seed_delta=%lld feasible=%d rem=%.6f", + solution.get_hash(), + (long long)seed_after, + (long long)(seed_after - seed_before), + (int)result, + timer.remaining_time()); + if (result) { CUOPT_LOG_DEBUG("New feasible solution with objective %g", solution.get_user_objective()); } @@ -308,6 +388,13 @@ bool feasibility_pump_t::test_fj_feasible(solution_t& soluti fj.settings.feasibility_run = true; fj.settings.n_of_minimums_for_exit = 5000; fj.settings.time_limit = std::min(time_limit, timer.remaining_time()); + if (timer.deterministic) { + fj.settings.time_limit = std::max((f_t)0.0, fj.settings.time_limit); + if (fj.settings.time_limit == 0.0) { + CUOPT_LOG_DEBUG("Skipping 20%% FJ run due to exhausted deterministic work budget"); + return false; + } + } cuopt_func_call(solution.test_variable_bounds(true)); is_feasible = fj.solve(solution); cuopt_func_call(solution.test_variable_bounds(true)); @@ -472,14 +559,39 @@ template bool feasibility_pump_t::run_single_fp_descent(solution_t& solution) { raft::common::nvtx::range fun_scope("run_single_fp_descent"); + i_t fp_iter = 0; + CUOPT_DETERMINISM_LOG("FP descent start: hash=0x%x feas=%d obj=%.12f timer_det=%d rem=%.6f", + solution.get_hash(), + (int)solution.get_feasible(), + solution.get_user_objective(), + (int)timer.deterministic, + timer.remaining_time()); // start by doing nearest rounding solution.round_nearest(); + CUOPT_DETERMINISM_LOG("FP descent after initial round: hash=0x%x feas=%d obj=%.12f", + solution.get_hash(), + (int)solution.get_feasible(), + solution.get_user_objective()); + cuopt_assert(last_projection.size() == solution.assignment.size(), "Size mismatch"); + // First projection in a descent has no previous projection history: initialize explicitly + raft::copy(last_projection.data(), + solution.assignment.data(), + solution.assignment.size(), + solution.handle_ptr->get_stream()); raft::copy(last_rounding.data(), solution.assignment.data(), solution.assignment.size(), solution.handle_ptr->get_stream()); while (true) { - if (context.diversity_manager_ptr->check_b_b_preemption() || timer.check_time_limit()) { + CUOPT_DETERMINISM_LOG("FP iter %d pre-projection: hash=0x%x feas=%d obj=%.12f rem=%.6f", + fp_iter, + solution.get_hash(), + (int)solution.get_feasible(), + solution.get_user_objective(), + timer.remaining_time()); + bool preempt = context.diversity_manager_ptr != nullptr && + context.diversity_manager_ptr->check_b_b_preemption(); + if (preempt || timer.check_time_limit()) { CUOPT_LOG_DEBUG("FP time limit reached!"); round(solution); return false; @@ -489,10 +601,25 @@ bool feasibility_pump_t::run_single_fp_descent(solution_t& s f_t ratio_of_assigned_integers = f_t(solution.n_assigned_integers) / solution.problem_ptr->n_integer_vars; bool is_feasible = linear_project_onto_polytope(solution, ratio_of_assigned_integers); - i_t n_integers = solution.compute_number_of_integers(); + const f_t remaining_after_projection = timer.remaining_time(); + i_t n_integers = solution.compute_number_of_integers(); CUOPT_LOG_DEBUG("after fp projection n_integers %d total n_integes %d", n_integers, solution.problem_ptr->n_integer_vars); + CUOPT_DETERMINISM_LOG( + "FP iter %d post-projection: hash=0x%x feasible_after_lp=%d obj=%.12f rem=%.6f lp_stage=%.6f", + fp_iter, + solution.get_hash(), + (int)is_feasible, + solution.get_user_objective(), + remaining_after_projection, + proj_begin - remaining_after_projection); + CUOPT_DETERMINISM_LOG("FP iter %d pre-round: hash=0x%x feas=%d obj=%.12f rem=%.6f", + fp_iter, + solution.get_hash(), + (int)is_feasible, + solution.get_user_objective(), + remaining_after_projection); bool is_cycle = true; // temp comment for presolve run if (config.check_distance_cycle) { @@ -524,30 +651,71 @@ bool feasibility_pump_t::run_single_fp_descent(solution_t& s // run the LP with full precision to check if it actually is feasible const f_t lp_verify_time_limit = 5.; relaxed_lp_settings_t lp_settings; - lp_settings.time_limit = lp_verify_time_limit; + lp_settings.time_limit = lp_verify_time_limit; + bool run_verify_lp = true; + if (timer.deterministic) { + const f_t remaining_work_limit = std::max((f_t)0.0, timer.remaining_time()); + lp_settings.work_limit = std::min(lp_verify_time_limit, remaining_work_limit); + lp_settings.time_limit = lp_settings.work_limit; + if (lp_settings.work_limit == 0.0) { + CUOPT_LOG_DEBUG( + "Skipping FP verification LP due to exhausted deterministic work budget"); + run_verify_lp = false; + } + } + lp_settings.work_context = timer.work_context; lp_settings.tolerance = solution.problem_ptr->tolerances.absolute_tolerance; lp_settings.return_first_feasible = true; lp_settings.save_state = true; - run_lp_with_vars_fixed(*solution.problem_ptr, - solution, - solution.problem_ptr->integer_indices, - lp_settings, - &constraint_prop.bounds_update); - is_feasible = solution.get_feasible(); - n_integers = solution.compute_number_of_integers(); - if (is_feasible && n_integers == solution.problem_ptr->n_integer_vars) { - CUOPT_LOG_DEBUG("Feasible solution verified with LP!"); - return true; + if (run_verify_lp) { + run_lp_with_vars_fixed(*solution.problem_ptr, + solution, + solution.problem_ptr->integer_indices, + lp_settings, + &constraint_prop.bounds_update); + is_feasible = solution.get_feasible(); + n_integers = solution.compute_number_of_integers(); + if (is_feasible && n_integers == solution.problem_ptr->n_integer_vars) { + CUOPT_LOG_TRACE("Feasible solution verified with LP!"); + return true; + } } } } cuopt_func_call(solution.test_variable_bounds(false)); is_feasible = round(solution); cuopt_func_call(solution.test_variable_bounds(true)); - proj_and_round_time = proj_begin - timer.remaining_time(); + const f_t remaining_after_round = timer.remaining_time(); + proj_and_round_time = proj_begin - remaining_after_round; + CUOPT_DETERMINISM_LOG( + "FP iter %d post-round: hash=0x%x feasible_after_round=%d obj=%.12f rem=%.6f " + "round_stage=%.6f proj_round_total=%.6f", + fp_iter, + solution.get_hash(), + (int)is_feasible, + solution.get_user_objective(), + remaining_after_round, + remaining_after_projection - remaining_after_round, + proj_and_round_time); if (!is_feasible) { const f_t time_ratio = 0.2; - is_feasible = test_fj_feasible(solution, time_ratio * proj_and_round_time); + const f_t fj_budget = time_ratio * proj_and_round_time; + CUOPT_DETERMINISM_LOG("FP iter %d pre-fj-fallback: hash=0x%x rem=%.6f fj_budget=%.6f", + fp_iter, + solution.get_hash(), + remaining_after_round, + fj_budget); + is_feasible = test_fj_feasible(solution, fj_budget); + const f_t remaining_after_fj = timer.remaining_time(); + CUOPT_DETERMINISM_LOG( + "FP iter %d post-fj-fallback: hash=0x%x feasible_after_fj=%d obj=%.12f rem=%.6f " + "fj_stage=%.6f", + fp_iter, + solution.get_hash(), + (int)is_feasible, + solution.get_user_objective(), + remaining_after_fj, + remaining_after_round - remaining_after_fj); } if (timer.check_time_limit()) { CUOPT_LOG_DEBUG("FP time limit reached!"); @@ -576,6 +744,7 @@ bool feasibility_pump_t::run_single_fp_descent(solution_t& s return false; } cycle_queue.n_iterations_without_cycle++; + fp_iter++; } // unreachable return false; diff --git a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh index df3ad405e6..d89933bd17 100644 --- a/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh +++ b/cpp/src/mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh @@ -106,7 +106,6 @@ class feasibility_pump_t { feasibility_pump_t() = delete; feasibility_pump_t(mip_solver_context_t& context, fj_t& fj, - // fj_tree_t& fj_tree_, constraint_prop_t& constraint_prop_, line_segment_search_t& line_segment_search_, rmm::device_uvector& lp_optimal_solution_); @@ -128,7 +127,7 @@ class feasibility_pump_t { bool check_distance_cycle(solution_t& solution); void reset(); void resize_vectors(problem_t& problem, const raft::handle_t* handle_ptr); - bool random_round_with_fj(solution_t& solution, timer_t& round_timer); + bool random_round_with_fj(solution_t& solution, work_limit_timer_t& round_timer); bool round_multiple_points(solution_t& solution); void relax_general_integers(solution_t& solution); void revert_relaxation(solution_t& solution); @@ -137,7 +136,6 @@ class feasibility_pump_t { mip_solver_context_t& context; // keep a reference from upstream local search fj_t& fj; - // fj_tree_t& fj_tree; line_segment_search_t& line_segment_search; cycle_queue_t cycle_queue; constraint_prop_t& constraint_prop; @@ -156,7 +154,7 @@ class feasibility_pump_t { f_t proj_begin; i_t n_fj_single_descents; i_t max_n_of_integers = 0; - cuopt::timer_t timer; + cuopt::work_limit_timer_t timer; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu index ce70aec745..094a45cd17 100644 --- a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu +++ b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cu @@ -17,8 +17,10 @@ namespace cuopt::linear_programming::detail { template line_segment_search_t::line_segment_search_t( - fj_t& fj_, constraint_prop_t& constraint_prop_) - : fj(fj_), constraint_prop(constraint_prop_) + mip_solver_context_t& context_, + fj_t& fj_, + constraint_prop_t& constraint_prop_) + : context(context_), fj(fj_), constraint_prop(constraint_prop_) { } @@ -128,7 +130,7 @@ bool line_segment_search_t::search_line_segment( const rmm::device_uvector& point_2, const rmm::device_uvector& delta_vector, bool is_feasibility_run, - cuopt::timer_t& timer) + cuopt::work_limit_timer_t& timer) { CUOPT_LOG_DEBUG("Running line segment search with a given delta vector"); cuopt_assert(point_1.size() == point_2.size(), "size mismatch"); @@ -263,7 +265,7 @@ bool line_segment_search_t::search_line_segment(solution_t& const rmm::device_uvector& point_1, const rmm::device_uvector& point_2, bool is_feasibility_run, - cuopt::timer_t& timer) + cuopt::work_limit_timer_t& timer) { CUOPT_LOG_DEBUG("Running line segment search"); cuopt_assert(point_1.size() == point_2.size(), "size mismatch"); diff --git a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh index 30e169e9d9..7a040ddbd2 100644 --- a/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh +++ b/cpp/src/mip_heuristics/local_search/line_segment_search/line_segment_search.cuh @@ -9,7 +9,7 @@ #include #include -#include +#include namespace cuopt::linear_programming::detail { @@ -26,19 +26,21 @@ template class line_segment_search_t { public: line_segment_search_t() = delete; - line_segment_search_t(fj_t& fj, constraint_prop_t& constraint_prop); + line_segment_search_t(mip_solver_context_t& context, + fj_t& fj, + constraint_prop_t& constraint_prop); bool search_line_segment(solution_t& solution, const rmm::device_uvector& point_1, const rmm::device_uvector& point_2, bool is_feasibility_run, - cuopt::timer_t& timer); + cuopt::work_limit_timer_t& timer); bool search_line_segment(solution_t& solution, const rmm::device_uvector& point_1, const rmm::device_uvector& point_2, const rmm::device_uvector& delta_vector, bool is_feasibility_run, - cuopt::timer_t& timer); + cuopt::work_limit_timer_t& timer); void save_solution_if_better(solution_t& solution, const rmm::device_uvector& point_1, @@ -49,6 +51,7 @@ class line_segment_search_t { f_t& best_feasible_cost, f_t curr_cost); + mip_solver_context_t& context; fj_t& fj; constraint_prop_t& constraint_prop; line_segment_settings_t settings; diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu index da29511d70..cb3955fb83 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cu +++ b/cpp/src/mip_heuristics/local_search/local_search.cu @@ -15,8 +15,9 @@ #include #include #include +#include #include -#include +#include #include @@ -24,6 +25,15 @@ #include +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif + namespace cuopt::linear_programming::detail { template @@ -36,7 +46,7 @@ local_search_t::local_search_t(mip_solver_context_t& context fj(context), // fj_tree(fj), constraint_prop(context), - line_segment_search(fj, constraint_prop), + line_segment_search(context, fj, constraint_prop), fp(context, fj, // fj_tree, @@ -54,18 +64,17 @@ local_search_t::local_search_t(mip_solver_context_t& context scratch_cpu_fj.push_back(std::make_unique>()); scratch_cpu_fj.back()->fj_ptr = &fj; scratch_cpu_fj_on_lp_opt.fj_ptr = &fj; + CUOPT_DETERMINISM_LOG("Deterministic solve start local_search state: seed_state=%lld", + (long long)cuopt::seed_generator::peek_seed()); fj.settings.n_of_minimums_for_exit = context.settings.heuristic_params.n_of_minimums_for_exit; } -static double local_search_best_obj = std::numeric_limits::max(); -static population_t* pop_ptr = nullptr; - template void local_search_t::start_cpufj_scratch_threads(population_t& population) { - pop_ptr = &population; - + cuopt_assert(!(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS), + "Scratch CPUFJ must remain opportunistic-only"); std::vector default_weights(context.problem_ptr->n_constraints, 1.); solution_t solution(*context.problem_ptr); @@ -88,18 +97,9 @@ void local_search_t::start_cpufj_scratch_threads(population_tlog_prefix = "******* scratch " + std::to_string(counter) + ": "; cpu_fj.fj_cpu->improvement_callback = - [&population, problem_ptr = context.problem_ptr]( - f_t obj, const std::vector& h_vec, double /*work_units*/) { - population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); - (void)problem_ptr; - if (obj < local_search_best_obj) { - CUOPT_LOG_TRACE("******* New local search best obj %g, best overall %g", - problem_ptr->get_user_obj_from_solver_obj(obj), - problem_ptr->get_user_obj_from_solver_obj( - population.is_feasible() ? population.best_feasible().get_objective() - : std::numeric_limits::max())); - local_search_best_obj = obj; - } + [&population](f_t obj, const std::vector& h_vec, double /*work_units*/) { + population.add_external_solution( + h_vec, obj, internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP); }; counter++; }; @@ -113,7 +113,8 @@ template void local_search_t::start_cpufj_lptopt_scratch_threads( population_t& population) { - pop_ptr = &population; + cuopt_assert(!(context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS), + "LP-opt CPUFJ scratch must remain opportunistic-only"); std::vector default_weights(context.problem_ptr->n_constraints, 1.); @@ -125,16 +126,9 @@ void local_search_t::start_cpufj_lptopt_scratch_threads( solution_lp, default_weights, default_weights, 0., context.preempt_heuristic_solver_); scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: "; scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback = - [this, &population](f_t obj, const std::vector& h_vec, double /*work_units*/) { - population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); - if (obj < local_search_best_obj) { - CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g", - context.problem_ptr->get_user_obj_from_solver_obj(obj), - context.problem_ptr->get_user_obj_from_solver_obj( - population.is_feasible() ? population.best_feasible().get_objective() - : std::numeric_limits::max())); - local_search_best_obj = obj; - } + [&population](f_t obj, const std::vector& h_vec, double /*work_units*/) { + population.add_external_solution( + h_vec, obj, internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP); }; // default weights @@ -182,8 +176,11 @@ void local_search_t::start_cpufj_deterministic( // Set up callback to send solutions to B&B with work unit timestamps deterministic_cpu_fj.fj_cpu->improvement_callback = - [&bb](f_t obj, const std::vector& h_vec, double work_units) { - bb.queue_external_solution_deterministic(h_vec, work_units); + [&bb, problem_ptr = context.problem_ptr]( + f_t obj, const std::vector& h_vec, double work_units) { + f_t user_obj = problem_ptr->get_user_obj_from_solver_obj(obj); + bb.queue_external_solution_deterministic( + h_vec, user_obj, work_units, cuopt::internals::mip_solution_origin_t::CPU_FEASIBILITY_JUMP); }; deterministic_cpu_fj.start_cpu_solver(); @@ -211,8 +208,9 @@ bool local_search_t::do_fj_solve(solution_t& solution, const std::string& source) { if (time_limit == 0.) return solution.get_feasible(); + const bool deterministic = (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); - timer_t timer(time_limit); + work_limit_timer_t timer(context.gpu_heur_loop, time_limit, *context.termination); const auto old_n_cstr_weights = in_fj.cstr_weights.size(); const auto expected_n_cstr_weights = static_cast(solution.problem_ptr->n_constraints); // in case this is the first time run, resize @@ -231,17 +229,24 @@ bool local_search_t::do_fj_solve(solution_t& solution, 1.); } } - auto h_weights = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream()); - auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream()); - for (auto& cpu_fj_ptr : ls_cpu_fj) { - auto& cpu_fj = *cpu_fj_ptr; - cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution, - h_weights, - h_weights, - h_objective_weight, - context.preempt_heuristic_solver_, - fj_settings_t{}, - true); + + { + auto h_weights = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream()); + auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream()); + for (auto& cpu_fj_ptr : ls_cpu_fj) { + auto& cpu_fj = *cpu_fj_ptr; + cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution, + h_weights, + h_weights, + h_objective_weight, + context.preempt_heuristic_solver_, + fj_settings_t{}, + true); + if (deterministic) { + cpu_fj.fj_cpu->work_units_elapsed = 0.0; + cpu_fj.fj_cpu->work_budget = time_limit; + } + } } auto solution_copy = solution; @@ -256,9 +261,10 @@ bool local_search_t::do_fj_solve(solution_t& solution, in_fj.settings.time_limit = timer.remaining_time(); in_fj.solve(solution); - // Stop CPU solver - for (auto& cpu_fj_ptr : ls_cpu_fj) { - cpu_fj_ptr->stop_cpu_solver(); + if (!deterministic) { + for (auto& cpu_fj_ptr : ls_cpu_fj) { + cpu_fj_ptr->stop_cpu_solver(); + } } auto gpu_fj_end = std::chrono::high_resolution_clock::now(); @@ -267,7 +273,6 @@ bool local_search_t::do_fj_solve(solution_t& solution, solution_t solution_cpu(*solution.problem_ptr); f_t best_cpu_obj = std::numeric_limits::max(); - // // Wait for CPU solver to finish for (auto& cpu_fj_ptr : ls_cpu_fj) { bool cpu_sol_found = cpu_fj_ptr->wait_for_cpu_solver(); if (cpu_sol_found) { @@ -313,8 +318,10 @@ bool local_search_t::do_fj_solve(solution_t& solution, } template -void local_search_t::generate_fast_solution(solution_t& solution, timer_t timer) +void local_search_t::generate_fast_solution(solution_t& solution, + work_limit_timer_t& timer) { + CUOPT_LOG_DEBUG("Running FJ fast sol"); thrust::fill(solution.handle_ptr->get_thrust_policy(), solution.assignment.begin(), solution.assignment.end(), @@ -325,8 +332,11 @@ void local_search_t::generate_fast_solution(solution_t& solu fj.settings.update_weights = true; fj.settings.feasibility_run = true; fj.settings.time_limit = std::min(30., timer.remaining_time()); - while (!context.diversity_manager_ptr->check_b_b_preemption() && !timer.check_time_limit()) { - timer_t constr_prop_timer = timer_t(std::min(timer.remaining_time(), 2.)); + while ((context.diversity_manager_ptr == nullptr || + !context.diversity_manager_ptr->check_b_b_preemption()) && + !timer.check_time_limit()) { + work_limit_timer_t constr_prop_timer = work_limit_timer_t( + context.gpu_heur_loop, std::min(timer.remaining_time(), 2.), *context.termination); // do constraint prop on lp optimal solution constraint_prop.apply_round(solution, 1., constr_prop_timer); if (solution.compute_feasibility()) { return; } @@ -343,7 +353,7 @@ void local_search_t::generate_fast_solution(solution_t& solu template bool local_search_t::run_local_search(solution_t& solution, const weight_t& weights, - timer_t timer, + work_limit_timer_t& timer, const ls_config_t& ls_config) { raft::common::nvtx::range fun_scope("local search"); @@ -353,11 +363,10 @@ bool local_search_t::run_local_search(solution_t& solution, if (!solution.get_feasible()) { if (ls_config.at_least_one_parent_feasible) { fj_settings.time_limit = 0.5; - timer = timer_t(fj_settings.time_limit); } else { fj_settings.time_limit = 0.25; - timer = timer_t(fj_settings.time_limit); } + timer = work_limit_timer_t(context.gpu_heur_loop, fj_settings.time_limit, *context.termination); } else { fj_settings.time_limit = std::min(1., timer.remaining_time()); } @@ -387,8 +396,9 @@ bool local_search_t::run_local_search(solution_t& solution, template bool local_search_t::run_fj_until_timer(solution_t& solution, const weight_t& weights, - timer_t timer) + work_limit_timer_t& timer) { + CUOPT_LOG_DEBUG("Running FJ until timer"); bool is_feasible; fj.settings.n_of_minimums_for_exit = 1e6; fj.settings.mode = fj_mode_t::EXIT_NON_IMPROVING; @@ -405,7 +415,7 @@ bool local_search_t::run_fj_until_timer(solution_t& solution template bool local_search_t::run_fj_annealing(solution_t& solution, - timer_t timer, + work_limit_timer_t& timer, const ls_config_t& ls_config) { raft::common::nvtx::range fun_scope("run_fj_annealing"); @@ -435,7 +445,7 @@ bool local_search_t::run_fj_annealing(solution_t& solution, template bool local_search_t::run_fj_line_segment(solution_t& solution, - timer_t timer, + work_limit_timer_t& timer, const ls_config_t& ls_config) { raft::common::nvtx::range fun_scope("run_fj_line_segment"); @@ -458,7 +468,7 @@ bool local_search_t::run_fj_line_segment(solution_t& solutio template bool local_search_t::check_fj_on_lp_optimal(solution_t& solution, bool perturb, - timer_t timer) + work_limit_timer_t& timer) { raft::common::nvtx::range fun_scope("check_fj_on_lp_optimal"); if (lp_optimal_exists) { @@ -474,15 +484,21 @@ bool local_search_t::check_fj_on_lp_optimal(solution_t& solu solution.assign_random_within_bounds(perturbation_ratio); } cuopt_func_call(solution.test_variable_bounds(false)); - f_t lp_run_time_after_feasible = std::min(1., timer.remaining_time()); - timer_t bounds_prop_timer = timer_t(std::min(timer.remaining_time(), 10.)); + f_t lp_run_time_after_feasible = std::min(1., timer.remaining_time()); + work_limit_timer_t bounds_prop_timer = work_limit_timer_t( + context.gpu_heur_loop, std::min(timer.remaining_time(), 10.), *context.termination); bool is_feasible = constraint_prop.apply_round(solution, lp_run_time_after_feasible, bounds_prop_timer); if (!is_feasible) { const f_t lp_run_time = 2.; relaxed_lp_settings_t lp_settings; lp_settings.time_limit = std::min(lp_run_time, timer.remaining_time()); - lp_settings.tolerance = solution.problem_ptr->tolerances.absolute_tolerance; + if (timer.deterministic) { + lp_settings.work_limit = lp_settings.time_limit; + lp_settings.work_context = timer.work_context; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + } + lp_settings.tolerance = solution.problem_ptr->tolerances.absolute_tolerance; run_lp_with_vars_fixed( *solution.problem_ptr, solution, solution.problem_ptr->integer_indices, lp_settings); } else { @@ -499,7 +515,8 @@ bool local_search_t::check_fj_on_lp_optimal(solution_t& solu } template -bool local_search_t::run_fj_on_zero(solution_t& solution, timer_t timer) +bool local_search_t::run_fj_on_zero(solution_t& solution, + work_limit_timer_t& timer) { raft::common::nvtx::range fun_scope("run_fj_on_zero"); thrust::fill(solution.handle_ptr->get_thrust_policy(), @@ -518,7 +535,7 @@ bool local_search_t::run_fj_on_zero(solution_t& solution, ti template bool local_search_t::run_staged_fp(solution_t& solution, - timer_t timer, + work_limit_timer_t& timer, population_t* population_ptr) { raft::common::nvtx::range fun_scope("run_staged_fp"); @@ -546,7 +563,8 @@ bool local_search_t::run_staged_fp(solution_t& solution, } CUOPT_LOG_DEBUG("Running staged FP from beginning it %d", i); fp.relax_general_integers(solution); - timer_t binary_timer(timer.remaining_time() / 3); + work_limit_timer_t binary_timer( + context.gpu_heur_loop, timer.remaining_time() / 3, *context.termination); i_t binary_it_counter = 0; for (; binary_it_counter < 100; ++binary_it_counter) { population_ptr->add_external_solutions_to_population(); @@ -626,6 +644,9 @@ void local_search_t::save_solution_and_add_cutting_plane( template void local_search_t::resize_to_new_problem() { + CUOPT_LOG_DEBUG("resize_to_new_problem: nv=%d nc=%d", + problem_with_objective_cut.n_variables, + problem_with_objective_cut.n_constraints); resize_vectors(problem_with_objective_cut, problem_with_objective_cut.handle_ptr); // hint for next PR in case load balanced is reintroduced // lb_constraint_prop.temp_problem.setup(problem_with_objective_cut); @@ -636,6 +657,9 @@ void local_search_t::resize_to_new_problem() template void local_search_t::resize_to_old_problem(problem_t* old_problem_ptr) { + CUOPT_LOG_DEBUG("resize_to_old_problem: nv=%d nc=%d", + old_problem_ptr->n_variables, + old_problem_ptr->n_constraints); resize_vectors(*old_problem_ptr, old_problem_ptr->handle_ptr); // hint for next PR in case load balanced is reintroduced // lb_constraint_prop.temp_problem.setup(*old_problem_ptr); @@ -658,7 +682,8 @@ void local_search_t::reset_alpha_and_save_solution( solution_t solution_copy(solution); solution_copy.problem_ptr = old_problem_ptr; solution_copy.resize_to_problem(); - population_ptr->add_solution(std::move(solution_copy)); + population_ptr->add_solution(std::move(solution_copy), + internals::mip_solution_origin_t::LOCAL_SEARCH); population_ptr->add_external_solutions_to_population(); if (!cutting_plane_added_for_active_run) { solution.problem_ptr = &problem_with_objective_cut; @@ -712,34 +737,53 @@ void local_search_t::reset_alpha_and_run_recombiners( template bool local_search_t::run_fp(solution_t& solution, - timer_t timer, - population_t* population_ptr) + work_limit_timer_t& timer, + population_t* population_ptr, + i_t n_fp_iterations) { raft::common::nvtx::range fun_scope("run_fp"); cuopt_assert(population_ptr != nullptr, "Population pointer must not be null"); - const i_t n_fp_iterations = 1000000; bool is_feasible = solution.compute_feasibility(); cutting_plane_added_for_active_run = is_feasible; double best_objective = is_feasible ? solution.get_objective() : std::numeric_limits::max(); rmm::device_uvector best_solution(solution.assignment, solution.handle_ptr->get_stream()); problem_t* old_problem_ptr = solution.problem_ptr; - fp.timer = timer_t(timer.remaining_time()); + fp.timer = + work_limit_timer_t(context.gpu_heur_loop, timer.remaining_time(), *context.termination); // if it has not been initialized yet, create a new problem and move it to the cut problem if (!problem_with_objective_cut.cutting_plane_added) { problem_with_objective_cut = std::move(problem_t(*old_problem_ptr)); + CUOPT_LOG_DEBUG("FP cut-problem clone: old_nv=%d old_nc=%d cut_nv=%d cut_nc=%d", + old_problem_ptr->n_variables, + old_problem_ptr->n_constraints, + problem_with_objective_cut.n_variables, + problem_with_objective_cut.n_constraints); } if (is_feasible) { CUOPT_LOG_DEBUG("FP initial solution is feasible, adding cutting plane at obj"); f_t objective_cut = best_objective - std::max(std::abs(0.001 * best_objective), OBJECTIVE_EPSILON); + CUOPT_LOG_DEBUG("FP cut-problem add: cut_obj=%g cut_nv=%d cut_nc=%d cut_added=%d fj_w=%zu", + objective_cut, + problem_with_objective_cut.n_variables, + problem_with_objective_cut.n_constraints, + (int)problem_with_objective_cut.cutting_plane_added, + fj.cstr_weights.size()); problem_with_objective_cut.add_cutting_plane_at_objective(objective_cut); + CUOPT_LOG_DEBUG("FP cut-problem post-add: cut_nv=%d cut_nc=%d", + problem_with_objective_cut.n_variables, + problem_with_objective_cut.n_constraints); // Do the copy here for proper handling of the added constraints weight fj.copy_weights( population_ptr->weights, solution.handle_ptr, problem_with_objective_cut.n_constraints); solution.problem_ptr = &problem_with_objective_cut; solution.resize_to_problem(); resize_to_new_problem(); + CUOPT_LOG_DEBUG("FP cut-problem resize done: sol_assign=%zu sol_nv=%d sol_nc=%d", + solution.assignment.size(), + solution.problem_ptr->n_variables, + solution.problem_ptr->n_constraints); } i_t last_improved_iteration = 0; for (i_t i = 0; i < n_fp_iterations && !timer.check_time_limit(); ++i) { @@ -806,14 +850,45 @@ bool local_search_t::run_fp(solution_t& solution, } } } + CUOPT_LOG_DEBUG( + "FP teardown start: assign=%zu best=%zu curr_pb=%p old_pb=%p curr_nv=%d curr_nc=%d " + "old_nv=%d old_nc=%d prevp=%zu prevd=%zu fp_rem=%g parent_rem=%g gpu_work=%g " + "gpu_prod=%g cut_added=%d", + solution.assignment.size(), + best_solution.size(), + (void*)solution.problem_ptr, + (void*)old_problem_ptr, + solution.problem_ptr->n_variables, + solution.problem_ptr->n_constraints, + old_problem_ptr->n_variables, + old_problem_ptr->n_constraints, + solution.lp_state.prev_primal.size(), + solution.lp_state.prev_dual.size(), + fp.timer.remaining_time(), + timer.remaining_time(), + context.gpu_heur_loop.current_work(), + context.gpu_heur_loop.current_producer_work(), + (int)problem_with_objective_cut.cutting_plane_added); raft::copy(solution.assignment.data(), best_solution.data(), solution.assignment.size(), solution.handle_ptr->get_stream()); + CUOPT_LOG_DEBUG("FP teardown post-copy: assign=%zu", solution.assignment.size()); solution.problem_ptr = old_problem_ptr; + CUOPT_LOG_DEBUG("FP teardown post-ptr: pb=%p nv=%d nc=%d", + (void*)solution.problem_ptr, + solution.problem_ptr->n_variables, + solution.problem_ptr->n_constraints); solution.resize_to_problem(); + CUOPT_LOG_DEBUG("FP teardown post-resize: assign=%zu prevp=%zu prevd=%zu", + solution.assignment.size(), + solution.lp_state.prev_primal.size(), + solution.lp_state.prev_dual.size()); resize_to_old_problem(old_problem_ptr); + CUOPT_LOG_DEBUG("FP teardown pre-sync"); solution.handle_ptr->sync_stream(); + CUOPT_LOG_DEBUG( + "FP teardown post-sync: hash=0x%x feas=%d", solution.get_hash(), (int)solution.get_feasible()); return is_feasible; } @@ -825,7 +900,7 @@ bool local_search_t::generate_solution(solution_t& solution, { raft::common::nvtx::range fun_scope("generate_solution"); cuopt_assert(population_ptr != nullptr, "Population pointer must not be null"); - timer_t timer(time_limit); + work_limit_timer_t timer(context.gpu_heur_loop, time_limit, *context.termination); auto n_vars = solution.problem_ptr->n_variables; auto n_binary_vars = solution.problem_ptr->get_n_binary_variables(); auto n_integer_vars = solution.problem_ptr->n_integer_vars; diff --git a/cpp/src/mip_heuristics/local_search/local_search.cuh b/cpp/src/mip_heuristics/local_search/local_search.cuh index 94493ebcb3..04b30b8ccc 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cuh +++ b/cpp/src/mip_heuristics/local_search/local_search.cuh @@ -13,13 +13,7 @@ #include #include #include -#include - -#include -#include -#include -#include -#include +#include namespace cuopt::linear_programming::dual_simplex { template @@ -58,32 +52,35 @@ class local_search_t { void start_cpufj_scratch_threads(population_t& population); void start_cpufj_lptopt_scratch_threads(population_t& population); void stop_cpufj_scratch_threads(); - void generate_fast_solution(solution_t& solution, timer_t timer); + void generate_fast_solution(solution_t& solution, work_limit_timer_t& timer); bool generate_solution(solution_t& solution, bool perturb, population_t* population_ptr, f_t time_limit = 300.); bool run_fj_until_timer(solution_t& solution, const weight_t& weights, - timer_t timer); + work_limit_timer_t& timer); bool run_local_search(solution_t& solution, const weight_t& weights, - timer_t timer, + work_limit_timer_t& timer, const ls_config_t& ls_config); bool run_fj_annealing(solution_t& solution, - timer_t timer, + work_limit_timer_t& timer, const ls_config_t& ls_config); bool run_fj_line_segment(solution_t& solution, - timer_t timer, + work_limit_timer_t& timer, const ls_config_t& ls_config); - bool run_fj_on_zero(solution_t& solution, timer_t timer); - bool check_fj_on_lp_optimal(solution_t& solution, bool perturb, timer_t timer); + bool run_fj_on_zero(solution_t& solution, work_limit_timer_t& timer); + bool check_fj_on_lp_optimal(solution_t& solution, + bool perturb, + work_limit_timer_t& timer); bool run_staged_fp(solution_t& solution, - timer_t timer, + work_limit_timer_t& timer, population_t* population_ptr); bool run_fp(solution_t& solution, - timer_t timer, - population_t* population_ptr = nullptr); + work_limit_timer_t& timer, + population_t* population_ptr = nullptr, + i_t n_fp_iterations = std::numeric_limits::max()); void resize_vectors(problem_t& problem, const raft::handle_t* handle_ptr); bool do_fj_solve(solution_t& solution, diff --git a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu index f3233cc8f4..ebea04495c 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu @@ -8,16 +8,114 @@ #include "bounds_repair.cuh" #include +#include #include #include +#include #include #include #include #include #include +#include + +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif + namespace cuopt::linear_programming::detail { +namespace { + +constexpr double bounds_repair_setup_base_work = 5e-4; +constexpr double bounds_repair_violation_base_work = 4e-4; +constexpr double bounds_repair_violation_nnz_work = 2e-6; +constexpr double bounds_repair_violation_constraint_work = 3e-6; +constexpr double bounds_repair_best_bounds_variable_work = 2e-6; +constexpr double bounds_repair_shift_base_work = 3e-4; +constexpr double bounds_repair_shift_row_entry_work = 3e-6; +constexpr double bounds_repair_shift_candidate_work = 8e-6; +constexpr double bounds_repair_shift_neighbor_entry_work = 3e-6; +constexpr double bounds_repair_shift_sort_work = 5e-6; +constexpr double bounds_repair_damage_base_work = 3e-4; +constexpr double bounds_repair_damage_neighbor_entry_work = 8e-6; +constexpr double bounds_repair_damage_sort_work = 5e-6; +constexpr double bounds_repair_move_base_work = 5e-5; +constexpr double bounds_repair_no_candidate_base_work = 4e-4; +constexpr double bounds_repair_cycle_penalty_work = 3e-4; + +template +double estimate_bounds_repair_violation_refresh_work(const problem_t& problem, + bool update_best_bounds) +{ + double estimate = bounds_repair_violation_base_work + + bounds_repair_violation_nnz_work * (double)problem.nnz + + bounds_repair_violation_constraint_work * (double)problem.n_constraints; + if (update_best_bounds) { + estimate += bounds_repair_best_bounds_variable_work * (double)problem.n_variables; + } + return estimate; +} + +template +double estimate_bounds_repair_setup_work(const problem_t& problem) +{ + return bounds_repair_setup_base_work + + estimate_bounds_repair_violation_refresh_work(problem, true); +} + +template +double estimate_bounds_repair_shift_work(const problem_t& problem, + i_t curr_cstr, + i_t n_candidates, + bool is_cycle) +{ + const auto stream = problem.handle_ptr->get_stream(); + const i_t cstr_begin = problem.offsets.element(curr_cstr, stream); + const i_t cstr_end = problem.offsets.element(curr_cstr + 1, stream); + const double row_nnz = cstr_end - cstr_begin; + const double avg_rev_degree = + problem.n_variables > 0 ? ((double)problem.nnz / (double)problem.n_variables) : 0.0; + const double sort_work = + n_candidates > 1 ? (double)n_candidates * std::log2((double)n_candidates) : 0.0; + double estimate = bounds_repair_shift_base_work + bounds_repair_shift_row_entry_work * row_nnz; + if (n_candidates == 0) { estimate = bounds_repair_no_candidate_base_work + estimate; } + estimate += bounds_repair_shift_candidate_work * (double)n_candidates; + estimate += bounds_repair_shift_neighbor_entry_work * (double)n_candidates * avg_rev_degree; + estimate += bounds_repair_shift_sort_work * sort_work; + if (is_cycle) { estimate += bounds_repair_cycle_penalty_work; } + return estimate; +} + +template +double estimate_bounds_repair_damage_work(const problem_t& problem, i_t n_candidates) +{ + if (n_candidates == 0) { return 0.0; } + const double avg_rev_degree = + problem.n_variables > 0 ? ((double)problem.nnz / (double)problem.n_variables) : 0.0; + const double sort_work = + n_candidates > 1 ? (double)n_candidates * std::log2((double)n_candidates) : 0.0; + return bounds_repair_damage_base_work + + bounds_repair_damage_neighbor_entry_work * (double)n_candidates * avg_rev_degree + + bounds_repair_damage_sort_work * sort_work; +} + +template +void record_estimated_work(timer_t& timer, double* total_estimated_work, double work) +{ + cuopt_assert(std::isfinite(work) && work >= 0.0, "Bounds repair work estimate must be finite"); + timer.record_work(work); + *total_estimated_work += work; +} + +} // namespace + template bounds_repair_t::bounds_repair_t(const problem_t& pb, bound_presolve_t& bound_presolve_) @@ -30,7 +128,8 @@ bounds_repair_t::bounds_repair_t(const problem_t& pb, violated_cstr_map(0, pb.handle_ptr->get_stream()), total_vio(pb.handle_ptr->get_stream()), gen(cuopt::seed_generator::get_seed()), - cycle_vector(MAX_CYCLE_SEQUENCE, -1) + cycle_vector(MAX_CYCLE_SEQUENCE, -1), + timer(0.0, cuopt::termination_checker_t::root_tag_t{}) { } @@ -68,8 +167,7 @@ f_t bounds_repair_t::get_ii_violation(problem_t& problem) min_act = bound_presolve.upd.min_activity.data(), max_act = bound_presolve.upd.max_activity.data(), cstr_violations_up = cstr_violations_up.data(), - cstr_violations_down = cstr_violations_down.data(), - total_vio = total_vio.data()] __device__(i_t cstr_idx) { + cstr_violations_down = cstr_violations_down.data()] __device__(i_t cstr_idx) { f_t cnst_lb = pb_v.constraint_lower_bounds[cstr_idx]; f_t cnst_ub = pb_v.constraint_upper_bounds[cstr_idx]; f_t eps = get_cstr_tolerance( @@ -79,21 +177,31 @@ f_t bounds_repair_t::get_ii_violation(problem_t& problem) f_t violation = max(curr_cstr_violation_up, curr_cstr_violation_down); if (violation >= ROUNDOFF_TOLERANCE) { violated_cstr_map[cstr_idx] = 1; - atomicAdd(total_vio, violation); } else { violated_cstr_map[cstr_idx] = 0; } cstr_violations_up[cstr_idx] = curr_cstr_violation_up; cstr_violations_down[cstr_idx] = curr_cstr_violation_down; }); - auto iter = thrust::copy_if(handle_ptr->get_thrust_policy(), + auto iter = thrust::copy_if(handle_ptr->get_thrust_policy(), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + problem.n_constraints, violated_cstr_map.data(), violated_constraints.data(), cuda::std::identity{}); - h_n_violated_cstr = iter - violated_constraints.data(); - f_t total_violation = total_vio.value(handle_ptr->get_stream()); + h_n_violated_cstr = iter - violated_constraints.data(); + // Use deterministic reduction instead of non-deterministic atomicAdd + f_t total_violation = thrust::transform_reduce( + handle_ptr->get_thrust_policy(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + problem.n_constraints, + [cstr_violations_up = cstr_violations_up.data(), + cstr_violations_down = cstr_violations_down.data()] __device__(i_t cstr_idx) -> f_t { + auto violation = max(cstr_violations_up[cstr_idx], cstr_violations_down[cstr_idx]); + return violation >= ROUNDOFF_TOLERANCE ? violation : 0.; + }, + (f_t)0, + thrust::plus()); CUOPT_LOG_TRACE( "Repair: n_violated_cstr %d total_violation %f", h_n_violated_cstr, total_violation); return total_violation; @@ -103,10 +211,13 @@ template i_t bounds_repair_t::get_random_cstr() { std::uniform_int_distribution<> dist(0, h_n_violated_cstr - 1); - // Generate random number - i_t random_number = dist(gen); - i_t cstr_idx = violated_constraints.element(random_number, handle_ptr->get_stream()); + i_t random_index = dist(gen); + i_t cstr_idx = violated_constraints.element(random_index, handle_ptr->get_stream()); CUOPT_LOG_TRACE("Repair: selected random cstr %d", cstr_idx); + CUOPT_DETERMINISM_LOG("Repair cstr select: random_index=%d cstr=%d n_violated=%d", + random_index, + cstr_idx, + h_n_violated_cstr); return cstr_idx; } @@ -190,7 +301,14 @@ i_t bounds_repair_t::compute_best_shift(problem_t& problem, } }); handle_ptr->sync_stream(); - return candidates.n_candidates.value(handle_ptr->get_stream()); + i_t n_candidates = candidates.n_candidates.value(handle_ptr->get_stream()); + + // Sort by (variable_index, bound_shift) to ensure fully deterministic ordering + auto key_iter = thrust::make_zip_iterator( + thrust::make_tuple(candidates.variable_index.begin(), candidates.bound_shift.begin())); + thrust::sort(handle_ptr->get_thrust_policy(), key_iter, key_iter + n_candidates); + + return n_candidates; } template @@ -377,36 +495,100 @@ void bounds_repair_t::apply_move(problem_t& problem, template bool bounds_repair_t::repair_problem(problem_t& problem, problem_t& original_problem, - timer_t timer_, + work_limit_timer_t& timer_, const raft::handle_t* handle_ptr_) { CUOPT_LOG_DEBUG("Running bounds repair"); handle_ptr = handle_ptr_; timer = timer_; + cuopt_assert(timer.deterministic == problem.deterministic, + "Bounds repair timer/problem determinism mismatch"); resize(problem); reset(); best_violation = get_ii_violation(problem); curr_violation = best_violation; best_bounds.update_from(problem, handle_ptr); - i_t no_candidate_in_a_row = 0; - while (h_n_violated_cstr > 0) { + double total_estimated_work = 0.0; + i_t repair_iterations = 0; + if (timer.deterministic) { + const double setup_work = estimate_bounds_repair_setup_work(problem); + record_estimated_work(timer, &total_estimated_work, setup_work); + CUOPT_DETERMINISM_LOG( + "Repair entry: pb_hash=0x%x bounds_hash=0x%x violated_hash=0x%x n_violated=%d " + "best_violation=%.6f timer_rem=%.6f total_work=%.6f setup_work=%.6f", + problem.get_fingerprint(), + detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()), + detail::compute_hash(make_span(violated_constraints, 0, h_n_violated_cstr), + handle_ptr->get_stream()), + h_n_violated_cstr, + best_violation, + timer.remaining_time(), + total_estimated_work, + setup_work); + } + i_t no_candidate_in_a_row = 0; + [[maybe_unused]] const char* exit_reason = "FEASIBLE"; + // TODO: do this better + i_t iter_limit = std::numeric_limits::max(); + if (timer.deterministic) { iter_limit = 20; } + while (h_n_violated_cstr > 0 && iter_limit-- > 0) { + repair_iterations++; CUOPT_LOG_TRACE("Bounds repair loop: n_violated %d best_violation %f curr_violation %f", h_n_violated_cstr, best_violation, curr_violation); - if (timer.check_time_limit()) { break; } + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "Repair iter entry: iter=%d pb_hash=0x%x bounds_hash=0x%x violated_hash=0x%x " + "n_violated=%d best_violation=%.6f curr_violation=%.6f timer_rem=%.6f total_work=%.6f", + repair_iterations, + problem.get_fingerprint(), + detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()), + detail::compute_hash(make_span(violated_constraints, 0, h_n_violated_cstr), + handle_ptr->get_stream()), + h_n_violated_cstr, + best_violation, + curr_violation, + timer.remaining_time(), + total_estimated_work); + } + if (timer.check_time_limit()) { + exit_reason = "TIME_LIMIT"; + break; + } i_t curr_cstr = get_random_cstr(); // best way would be to check a variable cycle, but this is easier and more performant bool is_cycle = detect_cycle(curr_cstr); if (is_cycle) { CUOPT_LOG_DEBUG("Repair: cycle detected at cstr %d", curr_cstr); } // in parallel compute the best shift and best respective damage - i_t n_candidates = compute_best_shift(problem, original_problem, curr_cstr); + i_t n_candidates = compute_best_shift(problem, original_problem, curr_cstr); + double shift_work = 0.0; + if (timer.deterministic) { + shift_work = estimate_bounds_repair_shift_work(problem, curr_cstr, n_candidates, is_cycle); + record_estimated_work(timer, &total_estimated_work, shift_work); + CUOPT_DETERMINISM_LOG( + "Repair iter shift: iter=%d curr_cstr=%d cycle=%d n_candidates=%d cand_var_hash=0x%x " + "cand_shift_hash=0x%x singleton_moved=%d shift_work=%.6f timer_rem=%.6f total_work=%.6f", + repair_iterations, + curr_cstr, + (int)is_cycle, + n_candidates, + detail::compute_hash(make_span(candidates.variable_index, 0, n_candidates), + handle_ptr->get_stream()), + detail::compute_hash(make_span(candidates.bound_shift, 0, n_candidates), + handle_ptr->get_stream()), + (int)candidates.at_least_one_singleton_moved.value(handle_ptr->get_stream()), + shift_work, + timer.remaining_time(), + total_estimated_work); + } // if no candidate is there continue with another constraint if (n_candidates == 0) { CUOPT_LOG_DEBUG("Repair: no candidate var found for cstr %d", curr_cstr); if (no_candidate_in_a_row++ == 10 || h_n_violated_cstr == 1) { CUOPT_LOG_DEBUG("Repair: no candidate var found on last violated constraint %d. Exiting...", curr_cstr); + exit_reason = "NO_CANDIDATE"; break; } continue; @@ -418,17 +600,36 @@ bool bounds_repair_t::repair_problem(problem_t& problem, // get the best damage i_t best_cstr_delta = candidates.cstr_delta.front_element(handle_ptr->get_stream()); f_t best_damage = candidates.damage.front_element(handle_ptr->get_stream()); + double damage_work = 0.0; + if (timer.deterministic) { + damage_work = estimate_bounds_repair_damage_work(problem, n_candidates); + record_estimated_work(timer, &total_estimated_work, damage_work); + CUOPT_DETERMINISM_LOG( + "Repair iter damage: iter=%d curr_cstr=%d cand_cdelta_hash=0x%x cand_damage_hash=0x%x " + "best_cstr_delta=%d best_damage=%.6f damage_work=%.6f timer_rem=%.6f total_work=%.6f", + repair_iterations, + curr_cstr, + detail::compute_hash(make_span(candidates.cstr_delta, 0, n_candidates), + handle_ptr->get_stream()), + detail::compute_hash(make_span(candidates.damage, 0, n_candidates), + handle_ptr->get_stream()), + best_cstr_delta, + best_damage, + damage_work, + timer.remaining_time(), + total_estimated_work); + } CUOPT_LOG_TRACE( "Repair: best_cstr_delta value %d best_damage %f", best_cstr_delta, best_damage); i_t best_move_idx; - // if the best damage is positive and we are within the prop (paper uses 0.75) - if ((best_cstr_delta > 0 && rand_double(0, 1, gen) < p) || is_cycle) { - // pick a random move from the candidate list + i_t n_of_eligible_candidates = -1; + + const double rand_u01 = rand_double(0, 1, gen); + const bool took_random_branch = (best_cstr_delta > 0 && rand_u01 < p) || is_cycle; + if (took_random_branch) { best_move_idx = get_random_idx(n_candidates); } else { - // filter the moves with best_damage(it can be zero or not) and then pick a candidate among - // them - i_t n_of_eligible_candidates = + n_of_eligible_candidates = find_cutoff_index(candidates, best_cstr_delta, best_damage, n_candidates); cuopt_assert(n_of_eligible_candidates > 0, ""); CUOPT_LOG_TRACE("n_of_eligible_candidates %d", n_of_eligible_candidates); @@ -440,22 +641,79 @@ bool bounds_repair_t::repair_problem(problem_t& problem, candidates.bound_shift.element(best_move_idx, handle_ptr->get_stream()), candidates.cstr_delta.element(best_move_idx, handle_ptr->get_stream()), candidates.damage.element(best_move_idx, handle_ptr->get_stream())); + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "Repair iter select: iter=%d cycle=%d rand_u01=%.12f took_random=%d " + "cutoff_idx=%d n_eligible=%d chosen_idx=%d chosen_var=%d chosen_shift=%.6f " + "chosen_cdelta=%d chosen_damage=%.6f", + repair_iterations, + (int)is_cycle, + rand_u01, + (int)took_random_branch, + (int)(took_random_branch ? -1 : n_of_eligible_candidates), + (int)(took_random_branch ? n_candidates : n_of_eligible_candidates), + best_move_idx, + candidates.variable_index.element(best_move_idx, handle_ptr->get_stream()), + candidates.bound_shift.element(best_move_idx, handle_ptr->get_stream()), + candidates.cstr_delta.element(best_move_idx, handle_ptr->get_stream()), + candidates.damage.element(best_move_idx, handle_ptr->get_stream())); + } apply_move(problem, original_problem, best_move_idx); reset(); // TODO we might optimize this to only calculate the changed constraints - curr_violation = get_ii_violation(problem); + curr_violation = get_ii_violation(problem); + const bool improved_violation = curr_violation < best_violation; + double refresh_work = 0.0; + if (timer.deterministic) { + refresh_work = bounds_repair_move_base_work + + estimate_bounds_repair_violation_refresh_work(problem, improved_violation); + record_estimated_work(timer, &total_estimated_work, refresh_work); + CUOPT_DETERMINISM_LOG( + "Repair iter post: iter=%d pb_hash=0x%x bounds_hash=0x%x violated_hash=0x%x " + "n_violated=%d curr_violation=%.6f improved=%d refresh_work=%.6f total_work=%.6f " + "timer_rem=%.6f", + repair_iterations, + problem.get_fingerprint(), + detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()), + detail::compute_hash(make_span(violated_constraints, 0, h_n_violated_cstr), + handle_ptr->get_stream()), + h_n_violated_cstr, + curr_violation, + (int)improved_violation, + refresh_work, + total_estimated_work, + timer.remaining_time()); + CUOPT_DETERMINISM_LOG( + "Repair iter work: cstr=%d candidates=%d cycle=%d improved=%d total=%.6f", + curr_cstr, + n_candidates, + (int)is_cycle, + (int)improved_violation, + total_estimated_work); + } - if (curr_violation < best_violation) { + if (improved_violation) { best_violation = curr_violation; // update best bounds best_bounds.update_from(problem, handle_ptr); } } - // fill the problem with the best bounds + if (h_n_violated_cstr > 0 && iter_limit <= 0) { exit_reason = "ITER_LIMIT"; } bool feasible = h_n_violated_cstr == 0; - // copy best bounds into problem best_bounds.update_to(problem, handle_ptr); CUOPT_LOG_DEBUG("Repair: returning with feas: %d vio %f", feasible, best_violation); + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "Repair exit: reason=%s iters=%d feasible=%d n_violated=%d best_violation=%.6f " + "total_work=%.6f timer_rem=%.6f", + exit_reason, + repair_iterations, + (int)feasible, + h_n_violated_cstr, + best_violation, + total_estimated_work, + timer.remaining_time()); + } return feasible; } diff --git a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh index 29161c5d25..8fd9d601a5 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh +++ b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh @@ -13,6 +13,9 @@ #include #include +#include +#include + namespace cuopt::linear_programming::detail { // from the paper, probability of choosing random candidate= noise parameter @@ -120,7 +123,7 @@ class bounds_repair_t { void compute_damages(problem_t& problem, i_t n_candidates); bool repair_problem(problem_t& problem, problem_t& original_problem, - timer_t timer_, + work_limit_timer_t& timer_, const raft::handle_t* handle_ptr_); void apply_move(problem_t& problem, problem_t& original_problem, @@ -144,7 +147,7 @@ class bounds_repair_t { i_t h_n_violated_cstr; const raft::handle_t* handle_ptr; std::mt19937 gen; - timer_t timer{0.}; + work_limit_timer_t timer; std::vector cycle_vector; i_t cycle_write_pos = 0; }; diff --git a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu index 8db4d7ae85..41ab0f3e91 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu @@ -5,6 +5,7 @@ */ /* clang-format on */ +#include #include #include #include @@ -16,8 +17,12 @@ #include #include #include +#include #include #include +#include + +#include namespace cuopt::linear_programming::detail { @@ -39,7 +44,8 @@ constraint_prop_t::constraint_prop_t(mip_solver_context_t& c ub_restore(context.problem_ptr->n_variables, context.problem_ptr->handle_ptr->get_stream()), assignment_restore(context.problem_ptr->n_variables, context.problem_ptr->handle_ptr->get_stream()), - rng(cuopt::seed_generator::get_seed(), 0, 0) + rng(cuopt::seed_generator::get_seed(), 0, 0), + max_timer(0.0, cuopt::termination_checker_t::root_tag_t{}) { } @@ -725,6 +731,10 @@ void constraint_prop_t::update_host_assignment(const solution_tn_variables, sol.handle_ptr->get_stream()); + sol.handle_ptr->sync_stream(); + CUOPT_DETERMINISM_LOG( + "update_host_assignment: device_hash=0x%x", + detail::compute_hash(make_span(sol.assignment), sol.handle_ptr->get_stream())); } template @@ -755,7 +765,7 @@ void constraint_prop_t::restore_original_bounds_on_unfixed( template bool constraint_prop_t::run_repair_procedure(problem_t& problem, problem_t& original_problem, - timer_t& timer, + work_limit_timer_t& timer, const raft::handle_t* handle_ptr) { // select the first probing value @@ -765,9 +775,14 @@ bool constraint_prop_t::run_repair_procedure(problem_t& prob repair_stats.repair_attempts++; f_t repair_start_time = timer.remaining_time(); i_t n_of_repairs_needed_for_feasible = 0; + // TODO: do this better + i_t iter_limit = std::numeric_limits::max(); + if ((this->context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + iter_limit = 100; + } do { n_of_repairs_needed_for_feasible++; - if (timer.check_time_limit()) { + if (timer.check_time_limit() || iter_limit-- <= 0) { CUOPT_LOG_DEBUG("Time limit is reached in repair loop!"); f_t repair_end_time = timer.remaining_time(); repair_stats.total_time_spent_on_repair += repair_start_time - repair_end_time; @@ -775,8 +790,24 @@ bool constraint_prop_t::run_repair_procedure(problem_t& prob } repair_stats.total_repair_loops++; collapse_crossing_bounds(problem, original_problem, handle_ptr); + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "run_repair_procedure pre-repair: loop=%d bounds_hash=0x%x infeas_count=%d timer_rem=%.6f", + n_of_repairs_needed_for_feasible, + detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()), + bounds_update.infeas_constraints_count, + timer.remaining_time()); + } bool bounds_repaired = bounds_repair.repair_problem(problem, original_problem, timer, handle_ptr); + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "run_repair_procedure post-repair: loop=%d repaired=%d bounds_hash=0x%x timer_rem=%.6f", + n_of_repairs_needed_for_feasible, + (int)bounds_repaired, + detail::compute_hash(make_span(problem.variable_bounds), handle_ptr->get_stream()), + timer.remaining_time()); + } if (bounds_repaired) { repair_stats.intermediate_repair_success++; CUOPT_LOG_DEBUG("Bounds repair success, running bounds prop to verify feasibility!"); @@ -841,11 +872,15 @@ bool constraint_prop_t::find_integer( solution_t& sol, solution_t& orig_sol, f_t lp_run_time_after_feasible, - timer_t& timer, + work_limit_timer_t& timer, std::optional>> probing_config) { using crit_t = termination_criterion_t; auto& unset_integer_vars = unset_vars; + CUOPT_DETERMINISM_LOG("find_integer entry: seed=%lld hash=0x%x rem=%.6f", + (long long)cuopt::seed_generator::peek_seed(), + sol.get_hash(), + timer.remaining_time()); std::mt19937 rng(cuopt::seed_generator::get_seed()); lb_restore.resize(sol.problem_ptr->n_variables, sol.handle_ptr->get_stream()); ub_restore.resize(sol.problem_ptr->n_variables, sol.handle_ptr->get_stream()); @@ -871,6 +906,7 @@ bool constraint_prop_t::find_integer( sol.problem_ptr->integer_indices.data(), sol.problem_ptr->n_integer_vars, sol.handle_ptr->get_stream()); + CUOPT_DETERMINISM_LOG("sol hash 0x%x", sol.get_hash()); } else { find_unset_integer_vars(sol, unset_integer_vars); sort_by_frac(sol, make_span(unset_integer_vars)); @@ -895,16 +931,17 @@ bool constraint_prop_t::find_integer( set_bounds_on_fixed_vars(sol); } - CUOPT_LOG_DEBUG("Bounds propagation rounding: unset vars %lu", unset_integer_vars.size()); + CUOPT_DETERMINISM_LOG("Bounds propagation rounding: unset vars %lu", unset_integer_vars.size()); if (unset_integer_vars.size() == 0) { - CUOPT_LOG_DEBUG("No integer variables provided in the bounds prop rounding"); + CUOPT_DETERMINISM_LOG("No integer variables provided in the bounds prop rounding"); expand_device_copy(orig_sol.assignment, sol.assignment, sol.handle_ptr->get_stream()); cuopt_func_call(orig_sol.test_variable_bounds()); return orig_sol.compute_feasibility(); } // this is needed for the sort inside of the loop bool problem_ii = is_problem_ii(*sol.problem_ptr); - // if the problem is ii, run the bounds prop in the beginning + CUOPT_DETERMINISM_LOG("is problem ii %d", problem_ii); + // if the problem is ii, run the bounds prop in the beginning if (problem_ii) { bool bounds_repaired = bounds_repair.repair_problem(*sol.problem_ptr, *orig_sol.problem_ptr, timer, sol.handle_ptr); @@ -925,11 +962,16 @@ bool constraint_prop_t::find_integer( sort_by_interval_and_frac(sol, make_span(unset_integer_vars), rng); } set_host_bounds(sol); + CUOPT_DETERMINISM_LOG("find_integer pre-loop: seed=%lld hash=0x%x", + (long long)cuopt::seed_generator::peek_seed(), + sol.get_hash()); size_t set_count = 0; bool timeout_happened = false; i_t n_failed_repair_iterations = 0; while (set_count < unset_integer_vars.size()) { - CUOPT_LOG_TRACE("n_set_vars %d vars to set %lu", set_count, unset_integer_vars.size()); + CUOPT_DETERMINISM_LOG("n_set_vars %d vars to set %lu", set_count, unset_integer_vars.size()); + CUOPT_DETERMINISM_LOG("unset_integer_vars size %lu", unset_integer_vars.size()); + const size_t set_count_before = set_count; update_host_assignment(sol); if (max_timer.check_time_limit()) { CUOPT_LOG_DEBUG("Second time limit is reached returning nearest rounding!"); @@ -954,7 +996,8 @@ bool constraint_prop_t::find_integer( bounds_prop_interval = 1; } } - i_t n_vars_to_set = recovery_mode ? 1 : bounds_prop_interval; + i_t n_vars_to_set = recovery_mode ? 1 : bounds_prop_interval; + const bool did_sort = n_vars_to_set != 1; // if we are not at the last stage or if we are in recovery mode, don't sort if (n_vars_to_set != 1) { sort_by_implied_slack_consumption( @@ -965,17 +1008,63 @@ bool constraint_prop_t::find_integer( unset_integer_vars.data() + set_count, n_vars_to_set, sol.handle_ptr->get_stream()); + sol.handle_ptr->sync_stream(); auto var_probe_vals = generate_bulk_rounding_vector(sol, orig_sol, host_vars_to_set, probing_config); + if (timer.deterministic) { + const auto& vids = std::get<0>(var_probe_vals); + const auto& fp = std::get<1>(var_probe_vals); + const auto& sp = std::get<2>(var_probe_vals); + std::string probe_str; + for (size_t k = 0; k < std::min(vids.size(), (size_t)8); ++k) { + char buf[128]; + snprintf(buf, sizeof(buf), " (%d,%.4f,%.4f)", vids[k], fp[k], sp[k]); + probe_str += buf; + } + CUOPT_DETERMINISM_LOG( + "find_integer loop: set_count=%zu n_vars_to_set=%d seed=%lld probes=[%s]", + set_count, + n_vars_to_set, + (long long)cuopt::seed_generator::peek_seed(), + probe_str.c_str()); + } probe( sol, orig_sol.problem_ptr, var_probe_vals, &set_count, unset_integer_vars, probing_config); + CUOPT_DETERMINISM_LOG("find_integer post-probe: seed=%lld set_count=%zu hash=0x%x", + (long long)cuopt::seed_generator::peek_seed(), + set_count, + sol.get_hash()); + [[maybe_unused]] bool repair_attempted = false; + bool bounds_repaired = false; + i_t n_fixed_vars = 0; if (!(n_failed_repair_iterations >= max_n_failed_repair_iterations) && rounding_ii && !timeout_happened) { - timer_t repair_timer{std::min(timer.remaining_time() / 5, timer.elapsed_time() / 3)}; + // timer_t repair_timer{std::min(timer.remaining_time() / 5, timer.elapsed_time() / 3)}; + work_limit_timer_t repair_timer( + context.gpu_heur_loop, timer.remaining_time() / 5, *context.termination); save_bounds(sol); - // update bounds and run repair procedure + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "find_integer pre-repair: bounds_hash=0x%x assignment_hash=0x%x infeas_count=%d " + "timer_rem=%.6f", + detail::compute_hash(make_span(sol.problem_ptr->variable_bounds), + sol.handle_ptr->get_stream()), + detail::compute_hash(make_span(sol.assignment), sol.handle_ptr->get_stream()), + bounds_update.infeas_constraints_count, + timer.remaining_time()); + } bool bounds_repaired = run_repair_procedure(*sol.problem_ptr, *orig_sol.problem_ptr, repair_timer, sol.handle_ptr); + if (timer.deterministic) { + CUOPT_DETERMINISM_LOG( + "find_integer post-repair: repaired=%d bounds_hash=0x%x assignment_hash=0x%x " + "timer_rem=%.6f", + (int)bounds_repaired, + detail::compute_hash(make_span(sol.problem_ptr->variable_bounds), + sol.handle_ptr->get_stream()), + detail::compute_hash(make_span(sol.assignment), sol.handle_ptr->get_stream()), + timer.remaining_time()); + } if (!bounds_repaired) { restore_bounds(sol); n_failed_repair_iterations++; @@ -998,7 +1087,7 @@ bool constraint_prop_t::find_integer( make_span(sol.problem_ptr->variable_bounds), make_span(orig_sol.problem_ptr->variable_bounds), make_span(sol.assignment)}); - i_t n_fixed_vars = (iter - (unset_vars.begin() + set_count)); + n_fixed_vars = (iter - (unset_vars.begin() + set_count)); CUOPT_LOG_TRACE("After repair procedure, number of additional fixed vars %d", n_fixed_vars); set_count += n_fixed_vars; } @@ -1026,7 +1115,7 @@ bool constraint_prop_t::find_integer( // which is the unchanged problem bounds multi_probe.update_host_bounds(sol.handle_ptr, make_span(sol.problem_ptr->variable_bounds)); } - CUOPT_LOG_DEBUG( + CUOPT_DETERMINISM_LOG( "Bounds propagation rounding end: ii constraint count first buffer %d, second buffer %d", multi_probe.infeas_constraints_count_0, multi_probe.infeas_constraints_count_1); @@ -1038,7 +1127,12 @@ bool constraint_prop_t::find_integer( multi_probe.infeas_constraints_count_1 == 0) && !timeout_happened && lp_run_time_after_feasible > 0) { relaxed_lp_settings_t lp_settings; - lp_settings.time_limit = lp_run_time_after_feasible; + lp_settings.time_limit = lp_run_time_after_feasible; + if (timer.deterministic) { + lp_settings.work_limit = lp_settings.time_limit; + lp_settings.work_context = timer.work_context; + cuopt_assert(lp_settings.work_context != nullptr, "Missing deterministic work context"); + } lp_settings.tolerance = orig_sol.problem_ptr->tolerances.absolute_tolerance; lp_settings.save_state = false; lp_settings.return_first_feasible = true; @@ -1050,6 +1144,10 @@ bool constraint_prop_t::find_integer( } bool res_feasible = orig_sol.compute_feasibility(); orig_sol.handle_ptr->sync_stream(); + CUOPT_DETERMINISM_LOG("find_integer exit: seed=%lld feasible=%d hash=0x%x", + (long long)cuopt::seed_generator::peek_seed(), + (int)res_feasible, + orig_sol.get_hash()); return res_feasible; } @@ -1057,11 +1155,13 @@ template bool constraint_prop_t::apply_round( solution_t& sol, f_t lp_run_time_after_feasible, - timer_t& timer, + work_limit_timer_t& timer, std::optional>> probing_config) { raft::common::nvtx::range fun_scope("constraint prop round"); - max_timer = timer_t{max_time_for_bounds_prop}; + + sol.compute_feasibility(); + max_timer = work_limit_timer_t{context.gpu_heur_loop, max_time_for_bounds_prop, timer}; if (check_brute_force_rounding(sol)) { return true; } recovery_mode = false; rounding_ii = false; @@ -1076,9 +1176,9 @@ bool constraint_prop_t::apply_round( f_t bounds_prop_end_time = max_timer.remaining_time(); repair_stats.total_time_spent_on_bounds_prop += bounds_prop_start_time - bounds_prop_end_time; - CUOPT_LOG_DEBUG( - "repair_success %lu repair_attempts %lu intermediate_repair_success %lu total_repair_loops %lu " - "total_time_spent_on_repair %f total_time_spent_bounds_prop_after_repair %f " + CUOPT_DETERMINISM_LOG( + "repair_success %lu repair_attempts %lu intermediate_repair_success %lu total_repair_loops" + "%lu total_time_spent_on_repair %f total_time_spent_bounds_prop_after_repair %f " "total_time_spent_on_bounds_prop %f", repair_stats.repair_success, repair_stats.repair_attempts, @@ -1229,6 +1329,13 @@ bool constraint_prop_t::probe( } selected_update = 0; if (first_bounds_update_ii) { selected_update = 1; } + CUOPT_DETERMINISM_LOG( + "probe result: infeas_0=%d infeas_1=%d selected_update=%d recovery=%d rounding_ii=%d", + multi_probe.infeas_constraints_count_0, + multi_probe.infeas_constraints_count_1, + selected_update, + (int)recovery_mode, + (int)rounding_ii); // if we are doing single rounding if (probing_config.has_value() && probing_config.value().get().use_balanced_probing) { cuopt_assert(std::get<0>(var_probe_vals).size() == 1, diff --git a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh index 2c609228e8..7ad4253cc4 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh +++ b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cuh @@ -43,7 +43,7 @@ struct constraint_prop_t { constraint_prop_t(mip_solver_context_t& context); bool apply_round(solution_t& sol, f_t lp_run_time_after_feasible, - timer_t& timer, + work_limit_timer_t& timer, std::optional>> probing_config = std::nullopt); void sort_by_implied_slack_consumption(solution_t& sol, @@ -56,7 +56,7 @@ struct constraint_prop_t { bool find_integer(solution_t& sol, solution_t& orig_sol, f_t lp_run_time_after_feasible, - timer_t& timer, + work_limit_timer_t& timer, std::optional>> probing_config = std::nullopt); void find_set_integer_vars(solution_t& sol, rmm::device_uvector& set_vars); @@ -121,7 +121,7 @@ struct constraint_prop_t { const raft::handle_t* handle_ptr); bool run_repair_procedure(problem_t& problem, problem_t& original_problem, - timer_t& timer, + work_limit_timer_t& timer, const raft::handle_t* handle_ptr); bool handle_fixed_vars( solution_t& sol, @@ -149,7 +149,7 @@ struct constraint_prop_t { i_t bounds_prop_interval = 1; i_t n_iter_in_recovery = 0; i_t max_n_failed_repair_iterations = 1; - timer_t max_timer{0.}; + work_limit_timer_t max_timer; bool use_probing_cache = true; static repair_stats_t repair_stats; bool single_rounding_only = false; diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu index 7d074aea5e..612ed8160b 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu @@ -8,8 +8,10 @@ #include "lb_bounds_repair.cuh" #include +#include #include #include +#include #include #include #include @@ -26,7 +28,8 @@ lb_bounds_repair_t::lb_bounds_repair_t(const raft::handle_t* handle_pt violated_cstr_map(0, handle_ptr->get_stream()), total_vio(handle_ptr->get_stream()), gen(cuopt::seed_generator::get_seed()), - cycle_vector(MAX_CYCLE_SEQUENCE, -1) + cycle_vector(MAX_CYCLE_SEQUENCE, -1), + timer(0.0, cuopt::termination_checker_t::root_tag_t{}) { } @@ -68,8 +71,7 @@ std::tuple lb_bounds_repair_t::get_ii_violation( constraint_upper_bounds = problem.constraint_upper_bounds, cnst_slack = make_span_2(lb_bound_presolve.cnst_slack), cstr_violations_up = cstr_violations_up.data(), - cstr_violations_down = cstr_violations_down.data(), - total_vio = total_vio.data()] __device__(i_t cstr_idx) { + cstr_violations_down = cstr_violations_down.data()] __device__(i_t cstr_idx) { f_t cnst_lb = constraint_lower_bounds[cstr_idx]; f_t cnst_ub = constraint_upper_bounds[cstr_idx]; f_t2 slack = cnst_slack[cstr_idx]; @@ -80,7 +82,6 @@ std::tuple lb_bounds_repair_t::get_ii_violation( f_t violation = max(curr_cstr_violation_up, curr_cstr_violation_down); if (violation >= ROUNDOFF_TOLERANCE) { violated_cstr_map[cstr_idx] = 1; - atomicAdd(total_vio, violation); } else { violated_cstr_map[cstr_idx] = 0; } @@ -94,7 +95,18 @@ std::tuple lb_bounds_repair_t::get_ii_violation( violated_constraints.data(), cuda::std::identity{}); i_t n_violated_cstr = iter - violated_constraints.data(); - f_t total_violation = total_vio.value(handle_ptr->get_stream()); + // Use deterministic reduction instead of non-deterministic atomicAdd + f_t total_violation = thrust::transform_reduce( + handle_ptr->get_thrust_policy(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + problem.n_constraints, + [cstr_violations_up = cstr_violations_up.data(), + cstr_violations_down = cstr_violations_down.data()] __device__(i_t cstr_idx) -> f_t { + auto violation = max(cstr_violations_up[cstr_idx], cstr_violations_down[cstr_idx]); + return violation >= ROUNDOFF_TOLERANCE ? violation : 0.; + }, + (f_t)0, + thrust::plus()); CUOPT_LOG_TRACE( "Repair: n_violated_cstr %d total_violation %f", n_violated_cstr, total_violation); return std::make_tuple(total_violation, n_violated_cstr); @@ -397,10 +409,11 @@ bool lb_bounds_repair_t::repair_problem( load_balanced_problem_t* problem, load_balanced_bounds_presolve_t& lb_bound_presolve, problem_t& original_problem, - timer_t timer_, + work_limit_timer_t& timer_, const raft::handle_t* handle_ptr_) { - CUOPT_LOG_DEBUG("Running bounds repair"); + nvtx::range fun_scope("LB repair_problem"); + CUOPT_LOG_DEBUG("LB Running bounds repair"); handle_ptr = handle_ptr_; timer = timer_; resize(*problem); diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh index 0b549c684d..068c0d57bf 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh +++ b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cuh @@ -58,7 +58,7 @@ class lb_bounds_repair_t { bool repair_problem(load_balanced_problem_t* problem, load_balanced_bounds_presolve_t& lb_bound_presolve, problem_t& original_problem, - timer_t timer_, + work_limit_timer_t& timer_, const raft::handle_t* handle_ptr_); void apply_move(load_balanced_problem_t* problem, problem_t& original_problem, @@ -82,7 +82,7 @@ class lb_bounds_repair_t { i_t h_n_violated_cstr; const raft::handle_t* handle_ptr; std::mt19937 gen; - timer_t timer{0.}; + work_limit_timer_t timer; std::vector cycle_vector; i_t cycle_write_pos = 0; }; diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu index bb72834ab4..d8e3bcc040 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cu @@ -33,7 +33,8 @@ lb_constraint_prop_t::lb_constraint_prop_t(mip_solver_context_thandle_ptr->get_stream()), assignment_restore(context.problem_ptr->n_variables, context.problem_ptr->handle_ptr->get_stream()), - rng(cuopt::seed_generator::get_seed(), 0, 0) + rng(cuopt::seed_generator::get_seed(), 0, 0), + max_timer(0.0, cuopt::termination_checker_t::root_tag_t{}) { } @@ -700,14 +701,15 @@ template bool lb_constraint_prop_t::apply_round( solution_t& sol, f_t lp_run_time_after_feasible, - timer_t& timer, + work_limit_timer_t& timer, std::optional>> probing_candidates) { raft::common::nvtx::range fun_scope("constraint prop round"); // this is second timer that can continue but without recovery mode const f_t max_time_for_bounds_prop = 5.; - max_timer = timer_t{max_time_for_bounds_prop}; + max_timer = + work_limit_timer_t{context.gpu_heur_loop, max_time_for_bounds_prop, *context.termination}; if (check_brute_force_rounding(sol)) { return true; } recovery_mode = false; rounding_ii = false; diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh index 20e28e7cb9..6fb88467ab 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh +++ b/cpp/src/mip_heuristics/local_search/rounding/lb_constraint_prop.cuh @@ -23,7 +23,7 @@ struct lb_constraint_prop_t { bool apply_round( solution_t& sol, f_t lp_run_time_after_feasible, - timer_t& timer, + work_limit_timer_t& timer, std::optional>> probing_candidates = std::nullopt); void sort_by_implied_slack_consumption( problem_t& original_problem, @@ -40,7 +40,7 @@ struct lb_constraint_prop_t { load_balanced_bounds_presolve_t& lb_bounds_update, solution_t& orig_sol, f_t lp_run_time_after_feasible, - timer_t& timer, + work_limit_timer_t& timer, std::optional>> probing_candidates); std::tuple probing_values( load_balanced_bounds_presolve_t& lb_bounds_update, @@ -83,7 +83,7 @@ struct lb_constraint_prop_t { bool run_repair_procedure(load_balanced_problem_t* problem, load_balanced_bounds_presolve_t& lb_bounds_update, problem_t& original_problem, - timer_t& timer, + work_limit_timer_t& timer, const raft::handle_t* handle_ptr); mip_solver_context_t& context; @@ -100,7 +100,7 @@ struct lb_constraint_prop_t { bool rounding_ii = false; i_t bounds_prop_interval = 1; i_t n_iter_in_recovery = 0; - timer_t max_timer{0.}; + work_limit_timer_t max_timer; bool use_probing_cache = true; size_t repair_attempts = 0; diff --git a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu index 4f3a015a6c..9a2bf317b7 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding.cu @@ -8,8 +8,10 @@ #include "simple_rounding.cuh" #include "simple_rounding_kernels.cuh" +#include #include #include +#include #include #include @@ -35,6 +37,8 @@ bool check_brute_force_rounding(solution_t& solution) if (n_integers_to_round == 0) { return solution.compute_feasibility(); } constexpr i_t brute_force_rounding_threshold = 8; if (n_integers_to_round <= brute_force_rounding_threshold) { + CUOPT_DETERMINISM_LOG( + "Brute-force rounding: n_to_round=%d hash=0x%x", n_integers_to_round, solution.get_hash()); solution.compute_constraints(); i_t n_configs = pow(2, n_integers_to_round); i_t n_blocks = (n_configs + TPB - 1) / TPB; @@ -42,7 +46,8 @@ bool check_brute_force_rounding(solution_t& solution) rmm::device_uvector var_map(n_integers_to_round, solution.handle_ptr->get_stream()); rmm::device_uvector constraint_buf(n_configs * solution.problem_ptr->n_constraints, solution.handle_ptr->get_stream()); - rmm::device_scalar best_config(-1, solution.handle_ptr->get_stream()); + rmm::device_scalar best_config(std::numeric_limits::max(), + solution.handle_ptr->get_stream()); thrust::copy_if( solution.handle_ptr->get_thrust_policy(), solution.problem_ptr->integer_indices.begin(), @@ -58,7 +63,13 @@ bool check_brute_force_rounding(solution_t& solution) cuopt::make_span(var_map), cuopt::make_span(constraint_buf), best_config.data()); - if (best_config.value(solution.handle_ptr->get_stream()) != -1) { + i_t best_config_val = best_config.value(solution.handle_ptr->get_stream()); + CUOPT_DETERMINISM_LOG( + "Brute-force rounding: best_config=%d (max=%d) var_map_hash=0x%x", + best_config_val, + (int)std::numeric_limits::max(), + detail::compute_hash(make_span(var_map), solution.handle_ptr->get_stream())); + if (best_config_val != std::numeric_limits::max()) { CUOPT_LOG_DEBUG("Feasible found during brute force rounding!"); // apply the feasible rounding apply_feasible_rounding_kernel<<<1, TPB, 0, solution.handle_ptr->get_stream()>>>( diff --git a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh index 2edca8fb08..a0b8468ea7 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh +++ b/cpp/src/mip_heuristics/local_search/rounding/simple_rounding_kernels.cuh @@ -131,7 +131,7 @@ __global__ void brute_force_check_kernel(typename solution_t::view_t s __shared__ i_t shbuf[raft::WarpSize]; i_t total_feasible = raft::blockReduce(th_feasible_count, (char*)shbuf); if (threadIdx.x == 0) { - if (total_feasible == solution.problem.n_constraints) { atomicExch(best_config, config); } + if (total_feasible == solution.problem.n_constraints) { atomicMin(best_config, config); } } } diff --git a/cpp/src/mip_heuristics/mip_constants.hpp b/cpp/src/mip_heuristics/mip_constants.hpp index 47d3d22de4..94b511da60 100644 --- a/cpp/src/mip_heuristics/mip_constants.hpp +++ b/cpp/src/mip_heuristics/mip_constants.hpp @@ -13,3 +13,7 @@ #define MIP_INSTANTIATE_DOUBLE CUOPT_INSTANTIATE_DOUBLE #define PDLP_INSTANTIATE_FLOAT 1 + +#define BB_BASE_WORK_SCALE 1.0 +#define GPU_HEUR_BASE_WORK_SCALE 0.4 +#define CPUFJ_BASE_WORK_SCALE 1.0 diff --git a/cpp/src/mip_heuristics/presolve/bounds_presolve.cu b/cpp/src/mip_heuristics/presolve/bounds_presolve.cu index d78f8beb16..6cc57cf153 100644 --- a/cpp/src/mip_heuristics/presolve/bounds_presolve.cu +++ b/cpp/src/mip_heuristics/presolve/bounds_presolve.cu @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -167,10 +168,14 @@ void bound_presolve_t::set_bounds( template termination_criterion_t bound_presolve_t::bound_update_loop(problem_t& pb, - timer_t timer) + work_limit_timer_t& timer) { termination_criterion_t criteria = termination_criterion_t::ITERATION_LIMIT; + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + settings.iteration_limit = std::min(settings.iteration_limit, 50); + } + i_t iter; upd.init_changed_constraints(pb.handle_ptr); for (iter = 0; iter < settings.iteration_limit; ++iter) { @@ -229,7 +234,7 @@ termination_criterion_t bound_presolve_t::solve(problem_t& p i_t var_idx) { auto& handle_ptr = pb.handle_ptr; - timer_t timer(settings.time_limit); + work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); copy_input_bounds(pb); upd.lb.set_element_async(var_idx, var_lb, handle_ptr->get_stream()); upd.ub.set_element_async(var_idx, var_ub, handle_ptr->get_stream()); @@ -242,7 +247,7 @@ termination_criterion_t bound_presolve_t::solve( const std::vector>& var_probe_val_pairs, bool use_host_bounds) { - timer_t timer(settings.time_limit); + work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb.handle_ptr; if (use_host_bounds) { update_device_bounds(handle_ptr); @@ -257,7 +262,7 @@ termination_criterion_t bound_presolve_t::solve( template termination_criterion_t bound_presolve_t::solve(problem_t& pb) { - timer_t timer(settings.time_limit); + work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb.handle_ptr; copy_input_bounds(pb); return bound_update_loop(pb, timer); diff --git a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh index 8b57cc7019..dee642ba36 100644 --- a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh +++ b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh @@ -15,6 +15,7 @@ #include #include +#include #include @@ -60,7 +61,7 @@ class bound_presolve_t { void set_updated_bounds(const raft::handle_t* handle_ptr, raft::device_span output_lb, raft::device_span output_ub); - termination_criterion_t bound_update_loop(problem_t& pb, timer_t timer); + termination_criterion_t bound_update_loop(problem_t& pb, work_limit_timer_t& timer); void set_bounds(raft::device_span var_lb, raft::device_span var_ub, const std::vector>& var_probe_vals, diff --git a/cpp/src/mip_heuristics/presolve/bounds_update_data.cu b/cpp/src/mip_heuristics/presolve/bounds_update_data.cu index 487549aa4a..b83f474791 100644 --- a/cpp/src/mip_heuristics/presolve/bounds_update_data.cu +++ b/cpp/src/mip_heuristics/presolve/bounds_update_data.cu @@ -28,6 +28,17 @@ bounds_update_data_t::bounds_update_data_t(problem_t& proble template void bounds_update_data_t::resize(problem_t& problem) { + CUOPT_LOG_DEBUG( + "bounds_update_data resize: nv=%d nc=%d min_act=%zu max_act=%zu lb=%zu ub=%zu " + "chg_c=%zu chg_v=%zu", + problem.n_variables, + problem.n_constraints, + min_activity.size(), + max_activity.size(), + lb.size(), + ub.size(), + changed_constraints.size(), + changed_variables.size()); min_activity.resize(problem.n_constraints, problem.handle_ptr->get_stream()); max_activity.resize(problem.n_constraints, problem.handle_ptr->get_stream()); lb.resize(problem.n_variables, problem.handle_ptr->get_stream()); @@ -35,6 +46,35 @@ void bounds_update_data_t::resize(problem_t& problem) changed_constraints.resize(problem.n_constraints, problem.handle_ptr->get_stream()); next_changed_constraints.resize(problem.n_constraints, problem.handle_ptr->get_stream()); changed_variables.resize(problem.n_variables, problem.handle_ptr->get_stream()); + + thrust::fill(problem.handle_ptr->get_thrust_policy(), + min_activity.begin(), + min_activity.end(), + std::numeric_limits::signaling_NaN()); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + max_activity.begin(), + max_activity.end(), + std::numeric_limits::signaling_NaN()); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + lb.begin(), + lb.end(), + std::numeric_limits::signaling_NaN()); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + ub.begin(), + ub.end(), + std::numeric_limits::signaling_NaN()); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + changed_constraints.begin(), + changed_constraints.end(), + -1); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + next_changed_constraints.begin(), + next_changed_constraints.end(), + -1); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + changed_variables.begin(), + changed_variables.end(), + -1); } template diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu index 13412614b8..24cac7129f 100644 --- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu +++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu @@ -17,6 +17,12 @@ #include "cusparse.h" #include + +#include +#include +#include +#include + #include "conditional_bound_strengthening.cuh" #include diff --git a/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu b/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu index 3a6d1bce21..308230527a 100644 --- a/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu @@ -10,7 +10,9 @@ #include #include +#include #include +#include #include #include @@ -309,7 +311,7 @@ inline std::vector compute_prioritized_integer_indices( template void compute_probing_cache(load_balanced_bounds_presolve_t& bound_presolve, load_balanced_problem_t& problem, - timer_t timer) + work_limit_timer_t& timer) { // we dont want to compute the probing cache for all variables for time and computation resources auto priority_indices = compute_prioritized_integer_indices(bound_presolve, problem); @@ -400,7 +402,7 @@ void compute_probing_cache(load_balanced_bounds_presolve_t& bound_pres template void compute_probing_cache( \ load_balanced_bounds_presolve_t & bound_presolve, \ load_balanced_problem_t & problem, \ - timer_t timer); \ + work_limit_timer_t & timer); \ template class lb_probing_cache_t; #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu index 0d16c26cae..0bf12390ca 100644 --- a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu +++ b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -526,7 +527,7 @@ bool load_balanced_bounds_presolve_t::update_bounds_from_slack( template termination_criterion_t load_balanced_bounds_presolve_t::bound_update_loop( - const raft::handle_t* handle_ptr, timer_t timer) + const raft::handle_t* handle_ptr, work_limit_timer_t& timer) { termination_criterion_t criteria = termination_criterion_t::ITERATION_LIMIT; @@ -626,7 +627,7 @@ termination_criterion_t load_balanced_bounds_presolve_t::solve(f_t var f_t var_ub, i_t var_idx) { - timer_t timer(settings.time_limit); + work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb->handle_ptr; copy_input_bounds(*pb); vars_bnd.set_element_async(2 * var_idx, var_lb, handle_ptr->get_stream()); @@ -638,7 +639,7 @@ template termination_criterion_t load_balanced_bounds_presolve_t::solve( raft::device_span input_bounds) { - timer_t timer(settings.time_limit); + work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb->handle_ptr; if (input_bounds.size() != 0) { raft::copy(vars_bnd.data(), input_bounds.data(), input_bounds.size(), handle_ptr->get_stream()); @@ -667,7 +668,7 @@ template termination_criterion_t load_balanced_bounds_presolve_t::solve( const std::vector>& var_probe_val_pairs, bool use_host_bounds) { - timer_t timer(settings.time_limit); + work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb->handle_ptr; if (use_host_bounds) { update_device_bounds(handle_ptr); diff --git a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh index ff085ca962..2b9d31061e 100644 --- a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh +++ b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cuh @@ -16,6 +16,7 @@ #include #include +#include #include "load_balanced_partition_helpers.cuh" #include "utils.cuh" @@ -159,7 +160,8 @@ class load_balanced_bounds_presolve_t { void calculate_constraint_slack_iter(const raft::handle_t* handle_ptr); bool update_bounds_from_slack(const raft::handle_t* handle_ptr); - termination_criterion_t bound_update_loop(const raft::handle_t* handle_ptr, timer_t timer); + termination_criterion_t bound_update_loop(const raft::handle_t* handle_ptr, + work_limit_timer_t& timer); bool calculate_infeasible_redundant_constraints(const raft::handle_t* handle_ptr); // void calculate_constraint_slack_on_problem_bounds(); diff --git a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh index cbcd91a7d7..f276840bdf 100644 --- a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh +++ b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -10,6 +10,7 @@ #include "load_balanced_bounds_presolve_kernels.cuh" #include "load_balanced_partition_helpers.cuh" +#include #include #include #include diff --git a/cpp/src/mip_heuristics/presolve/multi_probe.cu b/cpp/src/mip_heuristics/presolve/multi_probe.cu index 7789b3281b..6a2e88a1b2 100644 --- a/cpp/src/mip_heuristics/presolve/multi_probe.cu +++ b/cpp/src/mip_heuristics/presolve/multi_probe.cu @@ -5,10 +5,13 @@ */ /* clang-format on */ +#include #include +#include #include #include +#include #include #include #include @@ -19,6 +22,15 @@ #include "bounds_update_helpers.cuh" #include "multi_probe.cuh" +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif + namespace cuopt::linear_programming::detail { // Tobias Achterberg, Robert E. Bixby, Zonghao Gu, Edward Rothberg, Dieter Weninger (2019) Presolve @@ -263,7 +275,7 @@ void multi_probe_t::set_bounds( template termination_criterion_t multi_probe_t::bound_update_loop(problem_t& pb, const raft::handle_t* handle_ptr, - timer_t timer) + work_limit_timer_t& timer) { termination_criterion_t criteria = termination_criterion_t::ITERATION_LIMIT; skip_0 = false; @@ -279,12 +291,17 @@ termination_criterion_t multi_probe_t::bound_update_loop(problem_t::bound_update_loop(problem_t::bound_update_loop(problem_tget_stream()), + detail::compute_hash(make_span(upd_0.ub), handle_ptr->get_stream()), + detail::compute_hash(make_span(upd_1.lb), handle_ptr->get_stream()), + detail::compute_hash(make_span(upd_1.ub), handle_ptr->get_stream()), + timer.remaining_time()); + return criteria; } @@ -343,6 +371,10 @@ void multi_probe_t::update_host_bounds( [] __device__(auto i) { return thrust::make_tuple(get_lower(i), get_upper(i)); }); raft::copy(host_lb.data(), var_lb.data(), var_lb.size(), handle_ptr->get_stream()); raft::copy(host_ub.data(), var_ub.data(), var_ub.size(), handle_ptr->get_stream()); + handle_ptr->sync_stream(); + CUOPT_DETERMINISM_LOG("update_host_bounds: lb_hash=0x%x ub_hash=0x%x", + detail::compute_hash(make_span(var_lb), handle_ptr->get_stream()), + detail::compute_hash(make_span(var_ub), handle_ptr->get_stream())); } template @@ -375,7 +407,7 @@ termination_criterion_t multi_probe_t::solve_for_interval( const std::tuple, std::pair>& var_interval_vals, const raft::handle_t* handle_ptr) { - timer_t timer(settings.time_limit); + work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); copy_problem_into_probing_buffers(pb, handle_ptr); set_interval_bounds(var_interval_vals, pb, handle_ptr); @@ -389,7 +421,7 @@ termination_criterion_t multi_probe_t::solve( const std::tuple, std::vector, std::vector>& var_probe_vals, bool use_host_bounds) { - timer_t timer(settings.time_limit); + work_limit_timer_t timer(context.gpu_heur_loop, settings.time_limit, *context.termination); auto& handle_ptr = pb.handle_ptr; if (use_host_bounds) { update_device_bounds(handle_ptr); diff --git a/cpp/src/mip_heuristics/presolve/multi_probe.cuh b/cpp/src/mip_heuristics/presolve/multi_probe.cuh index a043770789..747713a53d 100644 --- a/cpp/src/mip_heuristics/presolve/multi_probe.cuh +++ b/cpp/src/mip_heuristics/presolve/multi_probe.cuh @@ -13,6 +13,7 @@ #include #include +#include #include "bounds_update_data.cuh" #include "utils.cuh" @@ -54,7 +55,7 @@ class multi_probe_t { i_t select_update); termination_criterion_t bound_update_loop(problem_t& pb, const raft::handle_t* handle_ptr, - timer_t timer); + work_limit_timer_t& timer); void set_interval_bounds( const std::tuple, std::pair>& var_interval_vals, problem_t& pb, diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu index 4f5e16ddb9..9f9e781a70 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu @@ -14,7 +14,10 @@ #include #include +#include +#include #include +#include #include #include @@ -367,7 +370,7 @@ void compute_cache_for_var(i_t var_idx, std::atomic& problem_is_infeasible, std::vector>& modification_vector, std::vector>& substitution_vector, - timer_t timer, + const work_limit_timer_t& timer, i_t device_id) { RAFT_CUDA_TRY(cudaSetDevice(device_id)); @@ -704,8 +707,11 @@ void apply_substitution_queue_to_problem( host_copy(problem.presolve_data.variable_mapping, problem.handle_ptr->get_stream()); problem.handle_ptr->sync_stream(); + // remove duplicate substitution proposals to avoid races later + std::unordered_set seen_substituted; for (const auto& [substituting_var, substitutions] : all_substitutions) { for (const auto& [substituted_var, substitution] : substitutions) { + if (!seen_substituted.insert(substitution.substituted_var).second) { continue; } CUOPT_LOG_TRACE("Applying substitution: %d -> %d", substitution.substituting_var, substitution.substituted_var); @@ -843,7 +849,7 @@ std::vector compute_priority_indices_by_implied_integers(problem_t bool compute_probing_cache(bound_presolve_t& bound_presolve, problem_t& problem, - timer_t timer) + work_limit_timer_t& timer) { raft::common::nvtx::range fun_scope("compute_probing_cache"); // we dont want to compute the probing cache for all variables for time and computation resources @@ -857,6 +863,12 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, bound_presolve.settings.iteration_limit = 50; bound_presolve.settings.time_limit = timer.remaining_time(); + // TODO: proper work unit accounting in deterministic mode for the probing cache + if ((bound_presolve.context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + bound_presolve.settings.iteration_limit = 1; + priority_indices.resize(std::min(priority_indices.size(), 2048)); + } + size_t num_threads = bound_presolve.settings.num_threads < 0 ? 0.2 * omp_get_max_threads() : bound_presolve.settings.num_threads; @@ -949,7 +961,7 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, #define INSTANTIATE(F_TYPE) \ template bool compute_probing_cache(bound_presolve_t & bound_presolve, \ problem_t & problem, \ - timer_t timer); \ + work_limit_timer_t & timer); \ template class probing_cache_t; #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cuh b/cpp/src/mip_heuristics/presolve/probing_cache.cuh index 91da6a15c8..8e1db6d5d7 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cuh +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cuh @@ -12,6 +12,7 @@ #include #include +#include namespace cuopt::linear_programming::detail { @@ -119,6 +120,6 @@ class lb_probing_cache_t { template bool compute_probing_cache(bound_presolve_t& bound_presolve, problem_t& problem, - timer_t timer); + work_limit_timer_t& timer); } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp index d94cf5aa67..7025ce2a96 100644 --- a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp +++ b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp @@ -544,7 +544,8 @@ void check_postsolve_status(const papilo::PostsolveStatus& status) template void set_presolve_methods(papilo::Presolve& presolver, problem_category_t category, - bool dual_postsolve) + bool dual_postsolve, + bool deterministic) { using uptr = std::unique_ptr>; @@ -571,7 +572,9 @@ void set_presolve_methods(papilo::Presolve& presolver, // exhaustive presolvers presolver.addPresolveMethod(uptr(new papilo::ImplIntDetection())); presolver.addPresolveMethod(uptr(new papilo::DominatedCols())); - presolver.addPresolveMethod(uptr(new papilo::Probing())); + // Papilo's Probing presolver is nondeterministic. + // TODO: push an upstream PR + if (!deterministic) { presolver.addPresolveMethod(uptr(new papilo::Probing())); } if (!dual_postsolve) { presolver.addPresolveMethod(uptr(new papilo::DualInfer())); @@ -605,17 +608,20 @@ template void set_presolve_parameters(papilo::Presolve& presolver, problem_category_t category, int nrows, - int ncols) + int ncols, + bool deterministic = false) { // It looks like a copy. But this copy has the pointers to relevant variables in papilo auto params = presolver.getParameters(); if (category == problem_category_t::MIP) { - // Papilo has work unit measurements for probing. Because of this when the first batch fails to - // produce any reductions, the algorithm stops. To avoid stopping the algorithm, we set a - // minimum badge size to a huge value. The time limit makes sure that we exit if it takes too - // long - int min_badgesize = std::max(ncols / 2, 32); - params.setParameter("probing.minbadgesize", min_badgesize); + if (!deterministic) { + // Papilo has work unit measurements for probing. Because of this when the first batch fails + // to produce any reductions, the algorithm stops. To avoid stopping the algorithm, we set a + // minimum badge size to a huge value. The time limit makes sure that we exit if it takes too + // long + int min_badgesize = std::max(ncols / 2, 32); + params.setParameter("probing.minbadgesize", min_badgesize); + } params.setParameter("cliquemerging.enabled", true); params.setParameter("cliquemerging.maxcalls", 50); } @@ -690,7 +696,7 @@ third_party_presolve_result_t third_party_presolve_t::apply( CUOPT_LOG_INFO("Calling Papilo presolver (git hash %s)", PAPILO_GITHASH); if (category == problem_category_t::MIP) { dual_postsolve = false; } papilo::Presolve papilo_presolver; - set_presolve_methods(papilo_presolver, category, dual_postsolve); + set_presolve_methods(papilo_presolver, category, dual_postsolve, deterministic_); set_presolve_options(papilo_presolver, category, absolute_tolerance, @@ -698,8 +704,11 @@ third_party_presolve_result_t third_party_presolve_t::apply( time_limit, dual_postsolve, num_cpu_threads); - set_presolve_parameters( - papilo_presolver, category, op_problem.get_n_constraints(), op_problem.get_n_variables()); + set_presolve_parameters(papilo_presolver, + category, + op_problem.get_n_constraints(), + op_problem.get_n_variables(), + deterministic_); // Disable papilo logs papilo_presolver.setVerbosityLevel(papilo::VerbosityLevel::kQuiet); diff --git a/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp b/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp index a067f604e7..9a7db11e18 100644 --- a/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp +++ b/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp @@ -79,6 +79,7 @@ class third_party_presolve_t { std::vector& full_primal) const; const std::vector& get_reduced_to_original_map() const { return reduced_to_original_map_; } const std::vector& get_original_to_reduced_map() const { return original_to_reduced_map_; } + void set_deterministic(bool d) { deterministic_ = d; } ~third_party_presolve_t(); @@ -91,6 +92,7 @@ class third_party_presolve_t { rmm::device_uvector& reduced_costs, rmm::cuda_stream_view stream_view); + bool deterministic_ = false; bool maximize_ = false; cuopt::linear_programming::presolver_t presolver_ = cuopt::linear_programming::presolver_t::PSLP; // PSLP settings diff --git a/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh b/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh index 568719dfd8..28162d7482 100644 --- a/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh +++ b/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh @@ -14,9 +14,11 @@ #include #include +#include #include #include #include +#include #include #include #include diff --git a/cpp/src/mip_heuristics/presolve/utils.cuh b/cpp/src/mip_heuristics/presolve/utils.cuh index 4870b3180c..803c00a022 100644 --- a/cpp/src/mip_heuristics/presolve/utils.cuh +++ b/cpp/src/mip_heuristics/presolve/utils.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -14,7 +14,8 @@ enum class termination_criterion_t { ITERATION_LIMIT, CONVERGENCE, INFEASIBLE, - NO_UPDATE + NO_UPDATE, + WORK_LIMIT }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/problem/presolve_data.cu b/cpp/src/mip_heuristics/problem/presolve_data.cu index 884b5f9069..9326ab82ef 100644 --- a/cpp/src/mip_heuristics/problem/presolve_data.cu +++ b/cpp/src/mip_heuristics/problem/presolve_data.cu @@ -112,8 +112,10 @@ void presolve_data_t::post_process_assignment( { raft::common::nvtx::range fun_scope("post_process_assignment"); cuopt_assert(current_assignment.size() == variable_mapping.size(), "size mismatch"); + rmm::device_uvector local_fixed(fixed_var_assignment.size(), stream); + raft::copy(local_fixed.data(), fixed_var_assignment.data(), fixed_var_assignment.size(), stream); auto assgn = make_span(current_assignment); - auto fixed_assgn = make_span(fixed_var_assignment); + auto fixed_assgn = make_span(local_fixed); auto var_map = make_span(variable_mapping); if (current_assignment.size() > 0) { thrust::for_each(rmm::exec_policy(stream), @@ -123,7 +125,7 @@ void presolve_data_t::post_process_assignment( fixed_assgn[var_map[idx]] = assgn[idx]; }); } - expand_device_copy(current_assignment, fixed_var_assignment, stream); + expand_device_copy(current_assignment, local_fixed, stream); auto h_assignment = cuopt::host_copy(current_assignment, stream); cuopt_assert(additional_var_id_per_var.size() == h_assignment.size(), "Size mismatch"); cuopt_assert(additional_var_used.size() == h_assignment.size(), "Size mismatch"); @@ -134,8 +136,6 @@ void presolve_data_t::post_process_assignment( } } - // Apply variable substitutions from probing: x_substituted = offset + coefficient * - // x_substituting for (const auto& sub : variable_substitutions) { cuopt_assert(sub.substituted_var < (i_t)h_assignment.size(), "substituted_var out of bounds"); cuopt_assert(sub.substituting_var < (i_t)h_assignment.size(), "substituting_var out of bounds"); @@ -223,23 +223,23 @@ void presolve_data_t::set_papilo_presolve_data( template void presolve_data_t::papilo_uncrush_assignment( - problem_t& problem, rmm::device_uvector& assignment) const + problem_t& problem, + rmm::device_uvector& assignment, + const raft::handle_t* handle_override) const { if (papilo_presolve_ptr == nullptr) { CUOPT_LOG_INFO("Papilo presolve data not set, skipping uncrushing assignment"); return; } + const auto* h = handle_override ? handle_override : problem.handle_ptr; cuopt_assert(assignment.size() == papilo_reduced_to_original_map.size(), "Papilo uncrush assignment size mismatch"); - auto h_assignment = cuopt::host_copy(assignment, problem.handle_ptr->get_stream()); + auto h_assignment = cuopt::host_copy(assignment, h->get_stream()); std::vector full_assignment; papilo_presolve_ptr->uncrush_primal_solution(h_assignment, full_assignment); - assignment.resize(full_assignment.size(), problem.handle_ptr->get_stream()); - raft::copy(assignment.data(), - full_assignment.data(), - full_assignment.size(), - problem.handle_ptr->get_stream()); - problem.handle_ptr->sync_stream(); + assignment.resize(full_assignment.size(), h->get_stream()); + raft::copy(assignment.data(), full_assignment.data(), full_assignment.size(), h->get_stream()); + h->sync_stream(); } #if MIP_INSTANTIATE_FLOAT || PDLP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/problem/presolve_data.cuh b/cpp/src/mip_heuristics/problem/presolve_data.cuh index cac3e71650..e62c16a16b 100644 --- a/cpp/src/mip_heuristics/problem/presolve_data.cuh +++ b/cpp/src/mip_heuristics/problem/presolve_data.cuh @@ -93,10 +93,12 @@ class presolve_data_t { rmm::cuda_stream_view stream); void post_process_assignment(problem_t& problem, rmm::device_uvector& current_assignment, - bool resize_to_original_problem = true) + bool resize_to_original_problem = true, + const raft::handle_t* handle_override = nullptr) { - post_process_assignment( - problem, current_assignment, resize_to_original_problem, problem.handle_ptr->get_stream()); + auto stream = + handle_override ? handle_override->get_stream() : problem.handle_ptr->get_stream(); + post_process_assignment(problem, current_assignment, resize_to_original_problem, stream); } void post_process_solution(problem_t& problem, solution_t& solution); @@ -107,7 +109,8 @@ class presolve_data_t { bool has_papilo_presolve_data() const { return papilo_presolve_ptr != nullptr; } i_t get_papilo_original_num_variables() const { return papilo_original_num_variables; } void papilo_uncrush_assignment(problem_t& problem, - rmm::device_uvector& assignment) const; + rmm::device_uvector& assignment, + const raft::handle_t* handle_override = nullptr) const; presolve_data_t(presolve_data_t&&) = default; presolve_data_t& operator=(presolve_data_t&&) = default; diff --git a/cpp/src/mip_heuristics/problem/problem.cu b/cpp/src/mip_heuristics/problem/problem.cu index 5d5fbc445a..0a86a0d009 100644 --- a/cpp/src/mip_heuristics/problem/problem.cu +++ b/cpp/src/mip_heuristics/problem/problem.cu @@ -27,9 +27,13 @@ #include #include #include +#include +#include +#include #include #include #include +#include #include #include @@ -64,6 +68,10 @@ void problem_t::op_problem_cstr_body(const optimization_problem_tget_thrust_policy(), + integer_fixed_variable_map.begin(), + integer_fixed_variable_map.end(), + -1); const bool is_mip = original_problem_ptr->get_problem_category() != problem_category_t::LP; if (is_mip) { @@ -136,7 +144,7 @@ problem_t::problem_t( nonbinary_indices(0, problem_.get_handle_ptr()->get_stream()), is_binary_variable(0, problem_.get_handle_ptr()->get_stream()), related_variables(0, problem_.get_handle_ptr()->get_stream()), - related_variables_offsets(n_variables, problem_.get_handle_ptr()->get_stream()), + related_variables_offsets(0, problem_.get_handle_ptr()->get_stream()), var_names(problem_.get_variable_names()), row_names(problem_.get_row_names()), objective_name(problem_.get_objective_name()), @@ -946,8 +954,12 @@ void problem_t::compute_related_variables(double time_limit) handle_ptr->sync_stream(); - // CHANGE - if (deterministic) { time_limit = std::numeric_limits::infinity(); } + if (deterministic) { + // TODO: Re-enable deterministic related-variable construction once we have a work estimator. + related_variables.resize(0, handle_ptr->get_stream()); + related_variables_offsets.resize(0, handle_ptr->get_stream()); + return; + } // previously used constants were based on 40GB of memory. Scale accordingly on smaller GPUs // We can't rely on querying free memory or allocation try/catch @@ -1418,6 +1430,12 @@ void problem_t::substitute_variables(const std::vector& var_indic raft::common::nvtx::range fun_scope("substitute_variables"); cuopt_assert((are_exclusive(var_indices, var_to_substitute_indices)), "variables and var_to_substitute_indices are not exclusive"); + { + std::vector sorted_vi(var_indices); + std::sort(sorted_vi.begin(), sorted_vi.end()); + cuopt_assert(std::adjacent_find(sorted_vi.begin(), sorted_vi.end()) == sorted_vi.end(), + "var_indices must not contain duplicates"); + } const i_t dummy_substituted_variable = var_indices[0]; cuopt_assert(var_indices.size() == var_to_substitute_indices.size(), "size mismatch"); cuopt_assert(var_indices.size() == offset_values.size(), "size mismatch"); @@ -1446,10 +1464,16 @@ void problem_t::substitute_variables(const std::vector& var_indic objective_offset_delta_per_variable.begin(), objective_offset_delta_per_variable.end(), zero_value); + const i_t n_substitutions = d_var_indices.size(); + rmm::device_uvector obj_coeff_keys(n_substitutions, handle_ptr->get_stream()); + rmm::device_uvector obj_coeff_deltas(n_substitutions, handle_ptr->get_stream()); + + CUOPT_LOG_INFO("Substituting %d variables", n_substitutions); + thrust::for_each( handle_ptr->get_thrust_policy(), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + d_var_indices.size(), + thrust::make_counting_iterator(0) + n_substitutions, [variable_fix_mask = make_span(fixing_helpers.variable_fix_mask), var_indices = make_span(d_var_indices), n_variables = n_variables, @@ -1458,20 +1482,40 @@ void problem_t::substitute_variables(const std::vector& var_indic var_to_substitute_indices = make_span(d_var_to_substitute_indices), objective_coefficients = make_span(objective_coefficients), objective_offset_delta_per_variable = make_span(objective_offset_delta_per_variable), - objective_offset = objective_offset.data(), + obj_keys = make_span(obj_coeff_keys), + obj_deltas = make_span(obj_coeff_deltas), var_flags = make_span(presolve_data.var_flags)] __device__(i_t idx) { - i_t var_idx = var_indices[idx]; - i_t substituting_var_idx = var_to_substitute_indices[idx]; - variable_fix_mask[var_idx] = idx; - f_t objective_offset_difference = objective_coefficients[var_idx] * substitute_offset[idx]; - objective_offset_delta_per_variable[idx] += objective_offset_difference; - // atomicAdd(objective_offset, objective_offset_difference); - atomicAdd(&objective_coefficients[substituting_var_idx], - objective_coefficients[var_idx] * substitute_coefficient[idx]); - // Substitution changes the constraint coefficients on x_B, invalidating - // any implied-integrality proof that relied on the original structure. + i_t var_idx = var_indices[idx]; + i_t substituting_var_idx = var_to_substitute_indices[idx]; + variable_fix_mask[var_idx] = idx; + objective_offset_delta_per_variable[idx] += + objective_coefficients[var_idx] * substitute_offset[idx]; + obj_keys[idx] = substituting_var_idx; + obj_deltas[idx] = objective_coefficients[var_idx] * substitute_coefficient[idx]; var_flags[substituting_var_idx] &= ~(i_t)VAR_IMPLIED_INTEGER; }); + + // Deterministic reduction of objective coefficient deltas per substituting variable + thrust::sort_by_key(handle_ptr->get_thrust_policy(), + obj_coeff_keys.begin(), + obj_coeff_keys.end(), + obj_coeff_deltas.begin()); + rmm::device_uvector unique_keys(n_substitutions, handle_ptr->get_stream()); + rmm::device_uvector summed_deltas(n_substitutions, handle_ptr->get_stream()); + auto [keys_end, vals_end] = thrust::reduce_by_key(handle_ptr->get_thrust_policy(), + obj_coeff_keys.begin(), + obj_coeff_keys.end(), + obj_coeff_deltas.begin(), + unique_keys.begin(), + summed_deltas.begin()); + i_t n_unique = keys_end - unique_keys.begin(); + thrust::for_each( + handle_ptr->get_thrust_policy(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(n_unique), + [obj_coeffs = make_span(objective_coefficients), + keys = unique_keys.data(), + deltas = summed_deltas.data()] __device__(i_t i) { obj_coeffs[keys[i]] += deltas[i]; }); presolve_data.objective_offset += thrust::reduce(handle_ptr->get_thrust_policy(), objective_offset_delta_per_variable.begin(), objective_offset_delta_per_variable.end(), @@ -2167,9 +2211,11 @@ void problem_t::set_papilo_presolve_data( } template -void problem_t::papilo_uncrush_assignment(rmm::device_uvector& assignment) const +void problem_t::papilo_uncrush_assignment(rmm::device_uvector& assignment, + const raft::handle_t* handle_override) const { - presolve_data.papilo_uncrush_assignment(const_cast(*this), assignment); + presolve_data.papilo_uncrush_assignment( + const_cast(*this), assignment, handle_override); } template diff --git a/cpp/src/mip_heuristics/problem/problem.cuh b/cpp/src/mip_heuristics/problem/problem.cuh index a801cc4067..a16dae3b53 100644 --- a/cpp/src/mip_heuristics/problem/problem.cuh +++ b/cpp/src/mip_heuristics/problem/problem.cuh @@ -102,10 +102,11 @@ class problem_t { bool resize_to_original_problem, rmm::cuda_stream_view stream); void post_process_assignment(rmm::device_uvector& current_assignment, - bool resize_to_original_problem = true) + bool resize_to_original_problem = true, + const raft::handle_t* handle_override = nullptr) { - post_process_assignment( - current_assignment, resize_to_original_problem, handle_ptr->get_stream()); + auto stream = handle_override ? handle_override->get_stream() : handle_ptr->get_stream(); + post_process_assignment(current_assignment, resize_to_original_problem, stream); } void post_process_solution(solution_t& solution); void set_papilo_presolve_data(const third_party_presolve_t* presolver_ptr, @@ -117,7 +118,8 @@ class problem_t { { return presolve_data.get_papilo_original_num_variables(); } - void papilo_uncrush_assignment(rmm::device_uvector& assignment) const; + void papilo_uncrush_assignment(rmm::device_uvector& assignment, + const raft::handle_t* handle_override = nullptr) const; void compute_transpose_of_problem(); f_t get_user_obj_from_solver_obj(f_t solver_obj) const; f_t get_solver_obj_from_user_obj(f_t user_obj) const; @@ -249,7 +251,8 @@ class problem_t { std::shared_ptr> integer_fixed_problem = nullptr; rmm::device_uvector integer_fixed_variable_map; - std::function&)> branch_and_bound_callback; + std::function&, cuopt::internals::mip_solution_origin_t)> + branch_and_bound_callback; std::function&, const std::vector&, const std::vector&, diff --git a/cpp/src/mip_heuristics/problem/problem_fixing.cuh b/cpp/src/mip_heuristics/problem/problem_fixing.cuh index 820b74e329..c462838d96 100644 --- a/cpp/src/mip_heuristics/problem/problem_fixing.cuh +++ b/cpp/src/mip_heuristics/problem/problem_fixing.cuh @@ -1,12 +1,13 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ #pragma once +#include #include namespace cuopt { @@ -18,6 +19,10 @@ struct problem_fixing_helpers_t { : reduction_in_rhs(n_constraints, handle_ptr->get_stream()), variable_fix_mask(n_variables, handle_ptr->get_stream()) { + thrust::fill( + handle_ptr->get_thrust_policy(), reduction_in_rhs.begin(), reduction_in_rhs.end(), f_t(0)); + thrust::fill( + handle_ptr->get_thrust_policy(), variable_fix_mask.begin(), variable_fix_mask.end(), i_t(0)); } problem_fixing_helpers_t(const problem_fixing_helpers_t& other, const raft::handle_t* handle_ptr) diff --git a/cpp/src/mip_heuristics/problem/problem_helpers.cuh b/cpp/src/mip_heuristics/problem/problem_helpers.cuh index ebc8a488ea..939702e97d 100644 --- a/cpp/src/mip_heuristics/problem/problem_helpers.cuh +++ b/cpp/src/mip_heuristics/problem/problem_helpers.cuh @@ -19,8 +19,10 @@ #include #include #include +#include #include #include +#include namespace cuopt::linear_programming::detail { template diff --git a/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu b/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu index 84415f5372..04366cf37b 100644 --- a/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu +++ b/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cu @@ -20,6 +20,17 @@ #include +#include + +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + CUOPT_LOG_INFO(__VA_ARGS__); \ + } while (0) +#endif + namespace cuopt::linear_programming::detail { template @@ -39,6 +50,9 @@ optimization_problem_solution_t get_relaxed_lp_solution( const relaxed_lp_settings_t& settings) { raft::common::nvtx::range fun_scope("get_relaxed_lp_solution"); + static std::atomic lp_call_counter{0}; + const uint64_t lp_call_id = lp_call_counter.fetch_add(1, std::memory_order_relaxed); + pdlp_solver_settings_t pdlp_settings{}; pdlp_settings.detect_infeasibility = settings.check_infeasibility; pdlp_settings.set_optimality_tolerance(settings.tolerance); @@ -48,17 +62,59 @@ optimization_problem_solution_t get_relaxed_lp_solution( pdlp_settings.tolerances.relative_primal_tolerance = settings.tolerance / tolerance_divisor; pdlp_settings.tolerances.relative_dual_tolerance = settings.tolerance / tolerance_divisor; pdlp_settings.time_limit = settings.time_limit; - pdlp_settings.concurrent_halt = settings.concurrent_halt; - pdlp_settings.per_constraint_residual = settings.per_constraint_residual; - pdlp_settings.first_primal_feasible = settings.return_first_feasible; - pdlp_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable2; - pdlp_settings.presolver = presolver_t::None; + pdlp_settings.iteration_limit = settings.iteration_limit; + + const f_t work_limit = settings.work_limit; + const bool determinism_mode = std::isfinite(work_limit); + pdlp_settings.concurrent_halt = settings.concurrent_halt; + pdlp_settings.per_constraint_residual = settings.per_constraint_residual; + pdlp_settings.first_primal_feasible = settings.return_first_feasible; + pdlp_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable2; + int estim_iters = pdlp_settings.iteration_limit; + if (determinism_mode) { + // try to estimate the iteration count based on the requested work limit + // TODO: replace with an actual model. this is a rather ugly hack to avoid having + // to touch the PDLP code for this initial PR + estim_iters = 100; + if (!std::isinf(work_limit)) { + do { + // TODO: use an actual predictor model here + double estim_ms = 313 + 200 * op_problem.n_variables - 400 * op_problem.n_constraints + + 600 * op_problem.coefficients.size() + 7100 * estim_iters; + estim_ms = std::max(0.0, estim_ms); + if (estim_ms > work_limit * 1000) { break; } + estim_iters += 100; + } while (true); + } else { + estim_iters = std::numeric_limits::max(); + } + CUOPT_DETERMINISM_LOG( + "estimated iterations %d for work limit %f", estim_iters, settings.work_limit); + pdlp_settings.iteration_limit = estim_iters; + pdlp_settings.time_limit = std::numeric_limits::infinity(); + pdlp_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable2; + pdlp_settings.presolver = presolver_t::None; + } + CUOPT_DETERMINISM_LOG( + "LP call %lu config: det=%d work_limit=%.6f time_limit=%.6f iter_limit=%d method=%d mode=%d " + "presolver=%d save_state=%d has_initial=%d assignment_hash=0x%x", + lp_call_id, + (int)determinism_mode, + settings.work_limit, + pdlp_settings.time_limit, + pdlp_settings.iteration_limit, + (int)pdlp_settings.method, + (int)pdlp_settings.pdlp_solver_mode, + (int)pdlp_settings.presolver, + (int)settings.save_state, + (int)settings.has_initial_primal, + detail::compute_hash(assignment, op_problem.handle_ptr->get_stream())); set_pdlp_solver_mode(pdlp_settings); // TODO: set Stable3 here? pdlp_solver_t lp_solver(op_problem, pdlp_settings); if (settings.has_initial_primal) { i_t prev_size = lp_state.prev_dual.size(); - CUOPT_LOG_DEBUG( + CUOPT_LOG_TRACE( "setting initial primal solution of size %d dual size %d problem vars %d cstrs %d", assignment.size(), lp_state.prev_dual.size(), @@ -72,25 +128,68 @@ optimization_problem_solution_t get_relaxed_lp_solution( lp_state.prev_dual.data(), lp_state.prev_dual.data() + op_problem.n_constraints, [prev_size, dual = make_span(lp_state.prev_dual)] __device__(i_t i) { + // early exit to avoid a false positive in compute-sanitizer initcheck + if (i >= prev_size) { return 0.0; } f_t x = dual[i]; - if (!isfinite(x) || i >= prev_size) { return 0.0; } + if (!isfinite(x)) { return 0.0; } return x; }); lp_solver.set_initial_primal_solution(assignment); lp_solver.set_initial_dual_solution(lp_state.prev_dual); } - CUOPT_LOG_DEBUG( + CUOPT_LOG_TRACE( "running LP with n_vars %d n_cstr %d", op_problem.n_variables, op_problem.n_constraints); // before LP flush the logs as it takes quite some time cuopt::default_logger().flush(); // temporarily add timer auto start_time = timer_t(pdlp_settings.time_limit); lp_solver.set_inside_mip(true); + CUOPT_DETERMINISM_LOG( + "prev solution sizes primal=%lu dual=%lu", assignment.size(), lp_state.prev_dual.size()); + if (determinism_mode) { + auto init_primal_hash = + detail::compute_hash(make_span(assignment), op_problem.handle_ptr->get_stream()); + auto init_dual_hash = + settings.has_initial_primal + ? detail::compute_hash(make_span(lp_state.prev_dual), op_problem.handle_ptr->get_stream()) + : 0u; + CUOPT_DETERMINISM_LOG("LP call %lu pre-solve state: init_primal_hash=0x%x init_dual_hash=0x%x", + lp_call_id, + init_primal_hash, + init_dual_hash); + } auto solver_response = lp_solver.run_solver(start_time); + CUOPT_DETERMINISM_LOG("post LP primal size %lu", solver_response.get_primal_solution().size()); + const int actual_iters = + solver_response.get_additional_termination_information().number_of_steps_taken; + CUOPT_DETERMINISM_LOG("LP call %lu result: status=%d iters=%d primal_hash=0x%x", + lp_call_id, + (int)solver_response.get_termination_status(), + actual_iters, + solver_response.get_primal_solution().size() != 0 + ? detail::compute_hash(solver_response.get_primal_solution(), + op_problem.handle_ptr->get_stream()) + : 0u); + + if (determinism_mode && settings.work_context != nullptr) { + double work_to_record = settings.work_limit; + if (estim_iters > 0) { + work_to_record = + settings.work_limit * std::clamp((double)actual_iters / (double)estim_iters, 0.0, 1.0); + } + CUOPT_DETERMINISM_LOG( + "LP call %lu recording %.6fwu (actual_iters=%d estim_iters=%d requested=%.6f)", + lp_call_id, + work_to_record, + actual_iters, + estim_iters, + settings.work_limit); + settings.work_context->record_work_sync_on_horizon(work_to_record); + } if (solver_response.get_primal_solution().size() != 0 && solver_response.get_dual_solution().size() != 0 && settings.save_state) { - CUOPT_LOG_DEBUG("saving initial primal solution of size %d", lp_state.prev_primal.size()); + CUOPT_LOG_TRACE("saving initial primal solution of size %d", lp_state.prev_primal.size()); lp_state.set_state(solver_response.get_primal_solution(), solver_response.get_dual_solution()); } if (solver_response.get_primal_solution().size() != 0) { @@ -100,11 +199,17 @@ optimization_problem_solution_t get_relaxed_lp_solution( solver_response.get_primal_solution().size(), op_problem.handle_ptr->get_stream()); } + CUOPT_DETERMINISM_LOG("LP call %lu assignment_after_copy hash=0x%x", + lp_call_id, + detail::compute_hash(assignment, op_problem.handle_ptr->get_stream())); if (solver_response.get_termination_status() == pdlp_termination_status_t::Optimal) { - CUOPT_LOG_DEBUG("feasible solution found with LP objective %f", + CUOPT_LOG_TRACE("feasible solution found with LP objective %f", solver_response.get_objective_value()); } else { - CUOPT_LOG_DEBUG("LP returned with reason %d", solver_response.get_termination_status()); + CUOPT_DETERMINISM_LOG( + "LP returned with reason %d, %d iterations", + solver_response.get_termination_status(), + solver_response.get_additional_termination_information().number_of_steps_taken); } return solver_response; diff --git a/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cuh b/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cuh index 9fe5fb9071..06698d79ae 100644 --- a/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cuh +++ b/cpp/src/mip_heuristics/relaxed_lp/relaxed_lp.cuh @@ -12,19 +12,23 @@ #include #include #include +#include #include "lp_state.cuh" namespace cuopt::linear_programming::detail { struct relaxed_lp_settings_t { - double tolerance = 1e-4; - double time_limit = 1.0; - bool check_infeasibility = true; - bool return_first_feasible = false; - bool save_state = true; - bool per_constraint_residual = true; - bool has_initial_primal = true; - std::atomic* concurrent_halt = nullptr; + double tolerance = 1e-4; + double time_limit = 1.0; + int iteration_limit = std::numeric_limits::max(); + double work_limit = std::numeric_limits::infinity(); + bool check_infeasibility = true; + bool return_first_feasible = false; + bool save_state = true; + bool per_constraint_residual = true; + bool has_initial_primal = true; + std::atomic* concurrent_halt = nullptr; + cuopt::work_limit_context_t* work_context = nullptr; }; template diff --git a/cpp/src/mip_heuristics/solution/solution.cu b/cpp/src/mip_heuristics/solution/solution.cu index e4192c0195..db3bd7fedc 100644 --- a/cpp/src/mip_heuristics/solution/solution.cu +++ b/cpp/src/mip_heuristics/solution/solution.cu @@ -19,6 +19,8 @@ #include #include +#include +#include #include #include #include @@ -46,8 +48,6 @@ solution_t::solution_t(problem_t& problem_) assignment(std::move(get_lower_bounds(problem_.variable_bounds, handle_ptr))), lower_excess(problem_.n_constraints, handle_ptr->get_stream()), upper_excess(problem_.n_constraints, handle_ptr->get_stream()), - lower_slack(problem_.n_constraints, handle_ptr->get_stream()), - upper_slack(problem_.n_constraints, handle_ptr->get_stream()), constraint_value(problem_.n_constraints, handle_ptr->get_stream()), obj_val(handle_ptr->get_stream()), n_feasible_constraints(handle_ptr->get_stream()), @@ -56,6 +56,22 @@ solution_t::solution_t(problem_t& problem_) clamp_within_var_bounds(assignment, problem_ptr, handle_ptr); } +template +solution_t::solution_t(problem_t& problem_, + const raft::handle_t* handle_override) + : problem_ptr(&problem_), + handle_ptr(handle_override), + assignment(std::move(get_lower_bounds(problem_.variable_bounds, handle_ptr))), + lower_excess(problem_.n_constraints, handle_ptr->get_stream()), + upper_excess(problem_.n_constraints, handle_ptr->get_stream()), + constraint_value(problem_.n_constraints, handle_ptr->get_stream()), + obj_val(handle_ptr->get_stream()), + n_feasible_constraints(handle_ptr->get_stream()), + lp_state(problem_, handle_ptr->get_stream()) +{ + clamp_within_var_bounds(assignment, problem_ptr, handle_ptr); +} + template solution_t::solution_t(const solution_t& other) : problem_ptr(other.problem_ptr), @@ -63,8 +79,6 @@ solution_t::solution_t(const solution_t& other) assignment(other.assignment, handle_ptr->get_stream()), lower_excess(other.lower_excess, handle_ptr->get_stream()), upper_excess(other.upper_excess, handle_ptr->get_stream()), - lower_slack(other.lower_slack, handle_ptr->get_stream()), - upper_slack(other.upper_slack, handle_ptr->get_stream()), constraint_value(other.constraint_value, handle_ptr->get_stream()), obj_val(other.obj_val, handle_ptr->get_stream()), n_feasible_constraints(other.n_feasible_constraints, handle_ptr->get_stream()), @@ -91,10 +105,18 @@ void solution_t::copy_from(const solution_t& other_sol) h_user_obj = other_sol.h_user_obj; h_infeasibility_cost = other_sol.h_infeasibility_cost; expand_device_copy(assignment, other_sol.assignment, handle_ptr->get_stream()); + + // excess and constraint value may be uninitialized (and computed later). Mark them as + // such + cuopt::mark_span_as_initialized(make_span(other_sol.lower_excess), handle_ptr->get_stream()); + cuopt::mark_span_as_initialized(make_span(other_sol.upper_excess), handle_ptr->get_stream()); + cuopt::mark_span_as_initialized(make_span(other_sol.constraint_value), handle_ptr->get_stream()); + cuopt::mark_span_as_initialized(make_span(other_sol.obj_val), handle_ptr->get_stream()); + cuopt::mark_span_as_initialized(make_span(other_sol.n_feasible_constraints), + handle_ptr->get_stream()); + expand_device_copy(lower_excess, other_sol.lower_excess, handle_ptr->get_stream()); expand_device_copy(upper_excess, other_sol.upper_excess, handle_ptr->get_stream()); - expand_device_copy(lower_slack, other_sol.lower_slack, handle_ptr->get_stream()); - expand_device_copy(upper_slack, other_sol.upper_slack, handle_ptr->get_stream()); expand_device_copy(constraint_value, other_sol.constraint_value, handle_ptr->get_stream()); raft::copy(obj_val.data(), other_sol.obj_val.data(), 1, handle_ptr->get_stream()); raft::copy(n_feasible_constraints.data(), @@ -113,14 +135,26 @@ void solution_t::copy_from(const solution_t& other_sol) template void solution_t::resize_to_problem() { + i_t old_n_vars = lp_state.prev_primal.size(); + i_t old_n_cstrs = lp_state.prev_dual.size(); assignment.resize(problem_ptr->n_variables, handle_ptr->get_stream()); lower_excess.resize(problem_ptr->n_constraints, handle_ptr->get_stream()); upper_excess.resize(problem_ptr->n_constraints, handle_ptr->get_stream()); - lower_slack.resize(problem_ptr->n_constraints, handle_ptr->get_stream()); - upper_slack.resize(problem_ptr->n_constraints, handle_ptr->get_stream()); constraint_value.resize(problem_ptr->n_constraints, handle_ptr->get_stream()); lp_state.prev_primal.resize(problem_ptr->n_variables, handle_ptr->get_stream()); lp_state.prev_dual.resize(problem_ptr->n_constraints, handle_ptr->get_stream()); + if (problem_ptr->n_variables > old_n_vars) { + thrust::fill(handle_ptr->get_thrust_policy(), + lp_state.prev_primal.data() + old_n_vars, + lp_state.prev_primal.data() + problem_ptr->n_variables, + f_t(0)); + } + if (problem_ptr->n_constraints > old_n_cstrs) { + thrust::fill(handle_ptr->get_thrust_policy(), + lp_state.prev_dual.data() + old_n_cstrs, + lp_state.prev_dual.data() + problem_ptr->n_constraints, + f_t(0)); + } } template @@ -131,10 +165,6 @@ void solution_t::resize_to_original_problem() handle_ptr->get_stream()); upper_excess.resize(problem_ptr->original_problem_ptr->get_n_constraints(), handle_ptr->get_stream()); - lower_slack.resize(problem_ptr->original_problem_ptr->get_n_constraints(), - handle_ptr->get_stream()); - upper_slack.resize(problem_ptr->original_problem_ptr->get_n_constraints(), - handle_ptr->get_stream()); constraint_value.resize(problem_ptr->original_problem_ptr->get_n_constraints(), handle_ptr->get_stream()); lp_state.prev_primal.resize(problem_ptr->original_problem_ptr->get_n_variables(), @@ -149,8 +179,6 @@ void solution_t::resize_copy(const solution_t& other_sol) assignment.resize(other_sol.assignment.size(), handle_ptr->get_stream()); lower_excess.resize(other_sol.lower_excess.size(), handle_ptr->get_stream()); upper_excess.resize(other_sol.upper_excess.size(), handle_ptr->get_stream()); - lower_slack.resize(other_sol.lower_slack.size(), handle_ptr->get_stream()); - upper_slack.resize(other_sol.upper_slack.size(), handle_ptr->get_stream()); constraint_value.resize(other_sol.constraint_value.size(), handle_ptr->get_stream()); lp_state.prev_primal.resize(other_sol.lp_state.prev_primal.size(), handle_ptr->get_stream()); lp_state.prev_dual.resize(other_sol.lp_state.prev_dual.size(), handle_ptr->get_stream()); @@ -165,8 +193,6 @@ typename solution_t::view_t solution_t::view() v.assignment = raft::device_span{assignment.data(), assignment.size()}; v.lower_excess = raft::device_span{lower_excess.data(), lower_excess.size()}; v.upper_excess = raft::device_span{upper_excess.data(), upper_excess.size()}; - v.lower_slack = raft::device_span{lower_slack.data(), lower_slack.size()}; - v.upper_slack = raft::device_span{upper_slack.data(), upper_slack.size()}; v.constraint_value = raft::device_span{constraint_value.data(), constraint_value.size()}; v.obj_val = obj_val.data(); v.n_feasible_constraints = n_feasible_constraints.data(); @@ -235,7 +261,7 @@ void solution_t::assign_random_within_bounds(f_t ratio_of_vars_to_rand auto variable_bounds = cuopt::host_copy(problem_ptr->variable_bounds, stream); auto variable_types = cuopt::host_copy(problem_ptr->variable_types, stream); - problem_ptr->handle_ptr->sync_stream(); + handle_ptr->sync_stream(); for (size_t i = 0; i < problem_ptr->variable_bounds.size(); ++i) { if (only_integers && variable_types[i] != var_t::INTEGER) { continue; } bool skip = unif_prob(rng) > ratio_of_vars_to_random_assign; @@ -642,6 +668,14 @@ mip_solution_t solution_t::get_solution(bool output_feasible } } +template +uint32_t solution_t::get_hash() const +{ + auto h_assignment = + host_copy(assignment.data(), problem_ptr->n_variables, handle_ptr->get_stream()); + return compute_hash(h_assignment); +} + #if MIP_INSTANTIATE_FLOAT || PDLP_INSTANTIATE_FLOAT template class solution_t; #endif diff --git a/cpp/src/mip_heuristics/solution/solution.cuh b/cpp/src/mip_heuristics/solution/solution.cuh index 9de10ed980..c0f3c539e7 100644 --- a/cpp/src/mip_heuristics/solution/solution.cuh +++ b/cpp/src/mip_heuristics/solution/solution.cuh @@ -25,6 +25,7 @@ template class solution_t { public: solution_t(problem_t& problem); + solution_t(problem_t& problem, const raft::handle_t* handle_override); solution_t(const solution_t& other); solution_t& operator=(solution_t&& other) noexcept = default; solution_t(solution_t&& other) = default; @@ -99,6 +100,7 @@ class solution_t { f_t compute_max_constraint_violation(); f_t compute_max_int_violation(); f_t compute_max_variable_violation(); + uint32_t get_hash() const; struct view_t { // let's not bloat the class for every simple getter and setters @@ -112,8 +114,6 @@ class solution_t { raft::device_span assignment; raft::device_span lower_excess; raft::device_span upper_excess; - raft::device_span lower_slack; - raft::device_span upper_slack; raft::device_span constraint_value; f_t* obj_val; i_t* n_feasible_constraints; @@ -128,8 +128,6 @@ class solution_t { rmm::device_uvector assignment; rmm::device_uvector lower_excess; rmm::device_uvector upper_excess; - rmm::device_uvector lower_slack; - rmm::device_uvector upper_slack; rmm::device_uvector constraint_value; rmm::device_scalar obj_val; rmm::device_scalar n_feasible_constraints; diff --git a/cpp/src/mip_heuristics/solution_callbacks.cuh b/cpp/src/mip_heuristics/solution_callbacks.cuh new file mode 100644 index 0000000000..b6f3ded8d6 --- /dev/null +++ b/cpp/src/mip_heuristics/solution_callbacks.cuh @@ -0,0 +1,223 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include + +#include +#include + +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +template +struct solution_callback_payload_t { + std::vector assignment{}; + f_t user_objective{}; + f_t solver_objective{}; + internals::mip_solution_callback_info_t callback_info{}; +}; + +template +void dispatch_get_solution_callbacks( + const std::vector& user_callbacks, + const std::vector& assignment, + f_t user_objective, + f_t solution_bound, + const internals::mip_solution_callback_info_t& callback_info) +{ + for (auto callback : user_callbacks) { + if (callback->get_type() != internals::base_solution_callback_type::GET_SOLUTION_EXT && + callback->get_type() != internals::base_solution_callback_type::GET_SOLUTION) { + continue; + } + + std::vector user_assignment(assignment); + std::vector user_objective_vec(1, user_objective); + std::vector user_bound_vec(1, solution_bound); + if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION_EXT) { + auto get_sol_callback_ext = static_cast(callback); + get_sol_callback_ext->get_solution(user_assignment.data(), + user_objective_vec.data(), + user_bound_vec.data(), + &callback_info, + get_sol_callback_ext->get_user_data()); + } else if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) { + auto get_sol_callback = static_cast(callback); + get_sol_callback->get_solution(user_assignment.data(), + user_objective_vec.data(), + user_bound_vec.data(), + get_sol_callback->get_user_data()); + } + } +} + +template +class solution_publication_t { + public: + solution_publication_t(const mip_solver_settings_t& settings_, + solver_stats_t& stats_) + : settings(settings_), stats(stats_) + { + } + + void reset_published_best(f_t objective = std::numeric_limits::max()) + { + best_callback_feasible_objective_ = objective; + } + + solution_callback_payload_t build_callback_payload( + problem_t* problem_ptr, + solution_t& sol, + internals::mip_solution_origin_t origin, + double work_timestamp) + { + cuopt_assert(problem_ptr != nullptr, "Callback payload problem pointer must not be null"); + cuopt_assert(work_timestamp >= 0.0, "work_timestamp must not be negative"); + solution_callback_payload_t payload{}; + payload.user_objective = sol.get_user_objective(); + payload.solver_objective = sol.get_objective(); + payload.callback_info.origin = (uint32_t)origin; + payload.callback_info.work_timestamp = work_timestamp; + solution_t temp_sol(sol); + CUOPT_LOG_DEBUG("build_callback_payload: pre_postprocess size=%zu handle=%p problem_handle=%p", + temp_sol.assignment.size(), + (void*)sol.handle_ptr, + (void*)problem_ptr->handle_ptr); + problem_ptr->post_process_assignment(temp_sol.assignment, true, sol.handle_ptr); + CUOPT_LOG_DEBUG("build_callback_payload: post_postprocess size=%zu", + temp_sol.assignment.size()); + if (problem_ptr->has_papilo_presolve_data()) { + CUOPT_LOG_DEBUG("build_callback_payload: pre_papilo size=%zu papilo_reduced_size=%zu", + temp_sol.assignment.size(), + problem_ptr->get_papilo_original_num_variables()); + problem_ptr->papilo_uncrush_assignment(temp_sol.assignment, sol.handle_ptr); + CUOPT_LOG_DEBUG("build_callback_payload: post_papilo size=%zu", temp_sol.assignment.size()); + } + payload.assignment = cuopt::host_copy(temp_sol.assignment, temp_sol.handle_ptr->get_stream()); + CUOPT_LOG_DEBUG("build_callback_payload: final payload size=%zu obj=%.6g origin=%s", + payload.assignment.size(), + payload.user_objective, + internals::mip_solution_origin_to_string(origin)); + return payload; + } + + bool publish_new_best_feasible(const solution_callback_payload_t& payload, + double elapsed_time = -1.0) + { + std::lock_guard lock(solution_callback_mutex_); + cuopt_assert(std::isfinite(payload.solver_objective), + "Feasible incumbent objective must be finite"); + if (!(payload.solver_objective < best_callback_feasible_objective_)) { return false; } + + best_callback_feasible_objective_ = payload.solver_objective; + if (settings.benchmark_info_ptr != nullptr && elapsed_time >= 0.0) { + settings.benchmark_info_ptr->last_improvement_of_best_feasible = elapsed_time; + } + invoke_get_solution_callbacks(payload); + return true; + } + + private: + void invoke_get_solution_callbacks(const solution_callback_payload_t& payload) + { + auto user_callbacks = settings.get_mip_callbacks(); + CUOPT_LOG_DEBUG("Publishing incumbent: obj=%g wut=%.6f origin=%s callbacks=%zu", + payload.user_objective, + payload.callback_info.work_timestamp, + internals::mip_solution_origin_to_string( + (internals::mip_solution_origin_t)payload.callback_info.origin), + user_callbacks.size()); + dispatch_get_solution_callbacks(user_callbacks, + payload.assignment, + payload.user_objective, + stats.get_solution_bound(), + payload.callback_info); + } + + const mip_solver_settings_t& settings; + solver_stats_t& stats; + std::mutex solution_callback_mutex_; + f_t best_callback_feasible_objective_{std::numeric_limits::max()}; +}; + +// Processes SET_SOLUTION user callbacks: invokes the callback, validates/scales/preprocesses +// the returned assignment, and returns it for the caller to reinject. +template +class solution_injection_t { + public: + solution_injection_t(const mip_solver_settings_t& settings_, + solver_stats_t& stats_) + : settings(settings_), stats(stats_) + { + } + + template + void invoke_set_solution_callbacks(problem_t* problem_ptr, + solution_t& current_incumbent, + OnInjectedFn&& on_injected) + { + auto user_callbacks = settings.get_mip_callbacks(); + for (auto callback : user_callbacks) { + if (callback->get_type() != internals::base_solution_callback_type::SET_SOLUTION) { + continue; + } + auto set_sol_callback = static_cast(callback); + f_t user_bound = stats.get_solution_bound(); + auto callback_num_variables = problem_ptr->original_problem_ptr->get_n_variables(); + rmm::device_uvector incumbent_assignment(callback_num_variables, + current_incumbent.handle_ptr->get_stream()); + auto inf = std::numeric_limits::infinity(); + current_incumbent.handle_ptr->sync_stream(); + std::vector h_incumbent_assignment(incumbent_assignment.size()); + std::vector h_outside_sol_objective(1, inf); + std::vector h_user_bound(1, user_bound); + set_sol_callback->set_solution(h_incumbent_assignment.data(), + h_outside_sol_objective.data(), + h_user_bound.data(), + set_sol_callback->get_user_data()); + f_t outside_sol_objective = h_outside_sol_objective[0]; + if (outside_sol_objective == inf) { continue; } + + raft::copy(incumbent_assignment.data(), + h_incumbent_assignment.data(), + incumbent_assignment.size(), + current_incumbent.handle_ptr->get_stream()); + bool is_valid = problem_ptr->pre_process_assignment(incumbent_assignment); + if (!is_valid) { continue; } + + solution_t outside_sol(current_incumbent); + cuopt_assert(outside_sol.assignment.size() == incumbent_assignment.size(), + "Incumbent assignment size mismatch"); + raft::copy(outside_sol.assignment.data(), + incumbent_assignment.data(), + incumbent_assignment.size(), + current_incumbent.handle_ptr->get_stream()); + outside_sol.compute_feasibility(); + + CUOPT_LOG_DEBUG("Injected solution feasibility = %d objective = %g excess = %g", + outside_sol.get_feasible(), + outside_sol.get_user_objective(), + outside_sol.get_total_excess()); + cuopt_assert(std::abs(outside_sol.get_user_objective() - outside_sol_objective) <= 1e-6, + "External solution objective mismatch"); + on_injected(outside_sol.get_host_assignment(), + outside_sol.get_objective(), + internals::mip_solution_origin_t::USER_INJECTED); + } + } + + private: + const mip_solver_settings_t& settings; + solver_stats_t& stats; +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu index be01516657..521f0e16b7 100644 --- a/cpp/src/mip_heuristics/solve.cu +++ b/cpp/src/mip_heuristics/solve.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include #include @@ -60,34 +62,21 @@ static void init_handler(const raft::handle_t* handle_ptr) handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); } -template -static void invoke_solution_callbacks( - const std::vector& mip_callbacks, - f_t objective, - std::vector& assignment, - f_t bound) -{ - std::vector obj_vec = {objective}; - std::vector bound_vec = {bound}; - for (auto callback : mip_callbacks) { - if (callback != nullptr && - callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) { - auto get_sol_callback = static_cast(callback); - get_sol_callback->get_solution( - assignment.data(), obj_vec.data(), bound_vec.data(), get_sol_callback->get_user_data()); - } - } -} - template mip_solution_t run_mip(detail::problem_t& problem, mip_solver_settings_t const& settings, - timer_t& timer, + cuopt::termination_checker_t& timer, f_t& initial_upper_bound, std::vector& initial_incumbent_assignment) { + raft::common::nvtx::range fun_scope("run_mip"); try { - raft::common::nvtx::range fun_scope("run_mip"); + auto constexpr const running_mip = true; + + // TODO ask Akif and Alice how was this passed down? + [[maybe_unused]] auto hyper_params = settings.hyper_params; + hyper_params.update_primal_weight_on_initial_solution = false; + hyper_params.update_step_size_on_initial_solution = true; if (settings.get_mip_callbacks().size() > 0) { auto callback_num_variables = problem.original_problem_ptr->get_n_variables(); if (problem.has_papilo_presolve_data()) { @@ -115,34 +104,26 @@ mip_solution_t run_mip(detail::problem_t& problem, stats.set_solution_bound(solution.get_user_objective()); // log the objective for scripts which need it CUOPT_LOG_INFO("Best feasible: %f", solution.get_user_objective()); - for (auto callback : settings.get_mip_callbacks()) { - if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) { - auto temp_sol(solution); - auto get_sol_callback = static_cast(callback); - std::vector user_objective_vec(1); - std::vector user_bound_vec(1); - user_objective_vec[0] = solution.get_user_objective(); - user_bound_vec[0] = stats.get_solution_bound(); - if (problem.has_papilo_presolve_data()) { - problem.papilo_uncrush_assignment(temp_sol.assignment); - } - std::vector user_assignment_vec(temp_sol.assignment.size()); - raft::copy(user_assignment_vec.data(), - temp_sol.assignment.data(), - temp_sol.assignment.size(), - temp_sol.handle_ptr->get_stream()); - solution.handle_ptr->sync_stream(); - get_sol_callback->get_solution(user_assignment_vec.data(), - user_objective_vec.data(), - user_bound_vec.data(), - get_sol_callback->get_user_data()); + { + detail::solution_callback_payload_t payload{}; + payload.user_objective = solution.get_user_objective(); + payload.solver_objective = solution.get_objective(); + payload.callback_info.origin = (uint32_t)internals::mip_solution_origin_t::PRESOLVE; + payload.callback_info.work_timestamp = 0.0; + detail::solution_t temp_sol(solution); + if (problem.has_papilo_presolve_data()) { + problem.papilo_uncrush_assignment(temp_sol.assignment); } + payload.assignment = temp_sol.get_host_assignment(); + detail::solution_publication_t pub(settings, stats); + pub.publish_new_best_feasible(payload); } return solution.get_solution(true, stats, false); } + // problem contains unpreprocessed data detail::problem_t scaled_problem(problem); - cuopt_func_call(auto saved_problem = scaled_problem); + CUOPT_LOG_INFO("Objective offset %f scaling_factor %f", problem.presolve_data.objective_offset, problem.presolve_data.objective_scaling_factor); @@ -151,6 +132,7 @@ mip_solution_t run_mip(detail::problem_t& problem, "Size mismatch"); cuopt_assert(problem.original_problem_ptr->get_n_constraints() == scaled_problem.n_constraints, "Size mismatch"); + // only call preprocess on scaled problem, so we can compute feasibility on the original problem scaled_problem.preprocess_problem(); scaled_problem.related_vars_time_limit = settings.heuristic_params.related_vars_time_limit; @@ -178,33 +160,37 @@ mip_solution_t run_mip(detail::problem_t& problem, // via problem.get_solver_obj_from_user_obj. std::unique_ptr> early_cpufj; bool run_early_cpufj = problem.has_papilo_presolve_data() && - settings.determinism_mode != CUOPT_MODE_DETERMINISTIC && + settings.determinism_mode == CUOPT_DETERMINISM_NONE && problem.original_problem_ptr->get_n_integers() > 0; if (run_early_cpufj) { auto early_fj_start = std::chrono::steady_clock::now(); auto* presolver_ptr = problem.presolve_data.papilo_presolve_ptr; auto mip_callbacks = settings.get_mip_callbacks(); f_t no_bound = problem.presolve_data.objective_scaling_factor >= 0 ? (f_t)-1e20 : (f_t)1e20; - auto incumbent_callback = [presolver_ptr, - mip_callbacks, - no_bound, - ctx_ptr = &solver.context, - early_fj_start](f_t solver_obj, - f_t user_obj, - const std::vector& assignment, - const char* heuristic_name) { - std::vector user_assignment; - presolver_ptr->uncrush_primal_solution(assignment, user_assignment); - ctx_ptr->initial_incumbent_assignment = user_assignment; - ctx_ptr->initial_upper_bound = user_obj; - double elapsed = - std::chrono::duration(std::chrono::steady_clock::now() - early_fj_start).count(); - CUOPT_LOG_INFO("New solution from early primal heuristics (%s). Objective %+.6e. Time %.2f", - heuristic_name, - user_obj, - elapsed); - invoke_solution_callbacks(mip_callbacks, user_obj, user_assignment, no_bound); - }; + detail::early_incumbent_callback_t incumbent_callback = + [presolver_ptr, mip_callbacks, no_bound, ctx_ptr = &solver.context, early_fj_start]( + f_t solver_obj, + f_t user_obj, + const std::vector& assignment, + internals::mip_solution_origin_t origin) { + std::vector user_assignment; + presolver_ptr->uncrush_primal_solution(assignment, user_assignment); + ctx_ptr->initial_incumbent_assignment = user_assignment; + ctx_ptr->initial_upper_bound = user_obj; + double elapsed = + std::chrono::duration(std::chrono::steady_clock::now() - early_fj_start) + .count(); + CUOPT_LOG_INFO( + "New solution from early primal heuristics (%s). Objective %+.6e. Time %.2f", + internals::mip_solution_origin_to_string(origin), + user_obj, + elapsed); + internals::mip_solution_callback_info_t callback_info{}; + callback_info.origin = (uint32_t)origin; + callback_info.work_timestamp = 0.0; + detail::dispatch_get_solution_callbacks( + mip_callbacks, user_assignment, user_obj, no_bound, callback_info); + }; early_cpufj = std::make_unique>( *problem.original_problem_ptr, settings.get_tolerances(), incumbent_callback); // Convert initial_upper_bound from user-space to the CPUFJ's solver-space (papilo-presolved). @@ -216,7 +202,6 @@ mip_solution_t run_mip(detail::problem_t& problem, solver.context.early_cpufj_ptr = early_cpufj.get(); CUOPT_LOG_DEBUG("Started early CPUFJ on papilo-presolved problem during cuOpt presolve"); } - auto presolved_sol = solver.run_solver(); bool is_feasible_on_presolved = presolved_sol.get_feasible(); presolved_sol.problem_ptr = &problem; @@ -277,6 +262,15 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, // Initialize seed generator if a specific seed is requested if (settings.seed >= 0) { cuopt::seed_generator::set_seed(settings.seed); } + CUOPT_DETERMINISM_LOG( + "Deterministic solve start settings: seed=%lld seed_state=%lld det_mode=%d " + "work_limit=%.6f max_cut_passes=%d num_cpu_threads=%d", + (long long)settings.seed, + (long long)cuopt::seed_generator::peek_seed(), + (int)settings.determinism_mode, + (double)settings.work_limit, + settings.max_cut_passes, + settings.num_cpu_threads); raft::common::nvtx::range fun_scope("Running solver"); @@ -303,7 +297,9 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, callback->template setup(op_problem.get_n_variables()); } - auto timer = timer_t(time_limit); + auto timer = + cuopt::termination_checker_t(time_limit, cuopt::termination_checker_t::root_tag_t{}); + const bool deterministic_run = (settings.determinism_mode != CUOPT_DETERMINISM_NONE); if (settings.mip_scaling != CUOPT_MIP_SCALING_OFF) { detail::mip_scaling_strategy_t scaling(op_problem); scaling.scale_problem(settings.mip_scaling != CUOPT_MIP_SCALING_NO_OBJECTIVE); @@ -311,8 +307,7 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, double presolve_time = 0.0; std::unique_ptr> presolver; std::optional> presolve_result_opt; - detail::problem_t problem( - op_problem, settings.get_tolerances(), settings.determinism_mode == CUOPT_MODE_DETERMINISTIC); + detail::problem_t problem(op_problem, settings.get_tolerances(), deterministic_run); auto run_presolve = settings.presolver != presolver_t::None; run_presolve = run_presolve && settings.initial_solutions.size() == 0; @@ -347,35 +342,41 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, std::vector early_best_user_assignment; std::mutex early_callback_mutex; - bool run_early_fj = run_presolve && settings.determinism_mode != CUOPT_MODE_DETERMINISTIC && + bool run_early_fj = run_presolve && settings.determinism_mode == CUOPT_DETERMINISM_NONE && op_problem.get_n_integers() > 0 && op_problem.get_n_constraints() > 0; f_t no_bound = problem.presolve_data.objective_scaling_factor >= 0 ? (f_t)-1e20 : (f_t)1e20; if (run_early_fj) { - auto early_fj_start = std::chrono::steady_clock::now(); - auto early_fj_callback = [&early_best_objective, - &early_best_user_obj, - &early_best_user_assignment, - &early_callback_mutex, - &early_fj_start, - mip_callbacks = settings.get_mip_callbacks(), - no_bound](f_t solver_obj, - f_t user_obj, - const std::vector& assignment, - const char* heuristic_name) { - std::lock_guard lock(early_callback_mutex); - if (solver_obj >= early_best_objective.load()) { return; } - early_best_objective.store(solver_obj); - early_best_user_obj = user_obj; - early_best_user_assignment = assignment; - double elapsed = - std::chrono::duration(std::chrono::steady_clock::now() - early_fj_start).count(); - CUOPT_LOG_INFO("New solution from early primal heuristics (%s). Objective %+.6e. Time %.2f", - heuristic_name, - user_obj, - elapsed); - auto user_assignment = assignment; - invoke_solution_callbacks(mip_callbacks, user_obj, user_assignment, no_bound); - }; + auto early_fj_start = std::chrono::steady_clock::now(); + detail::early_incumbent_callback_t early_fj_callback = + [&early_best_objective, + &early_best_user_obj, + &early_best_user_assignment, + &early_callback_mutex, + &early_fj_start, + mip_callbacks = settings.get_mip_callbacks(), + no_bound](f_t solver_obj, + f_t user_obj, + const std::vector& assignment, + internals::mip_solution_origin_t origin) { + std::lock_guard lock(early_callback_mutex); + if (solver_obj >= early_best_objective.load()) { return; } + early_best_objective.store(solver_obj); + early_best_user_obj = user_obj; + early_best_user_assignment = assignment; + internals::mip_solution_callback_info_t callback_info{}; + callback_info.origin = (uint32_t)origin; + callback_info.work_timestamp = 0.0; + double elapsed = + std::chrono::duration(std::chrono::steady_clock::now() - early_fj_start) + .count(); + CUOPT_LOG_INFO( + "New solution from early primal heuristics (%s). Objective %+.6e. Time %.2f", + internals::mip_solution_origin_to_string(origin), + user_obj, + elapsed); + detail::dispatch_get_solution_callbacks( + mip_callbacks, assignment, user_obj, no_bound, callback_info); + }; // Start early CPUFJ on original problem (will restart on presolved problem after Papilo) early_cpufj = std::make_unique>( @@ -398,10 +399,9 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, const auto& hp = settings.heuristic_params; double presolve_time_limit = std::min(hp.presolve_time_ratio * time_limit, hp.presolve_max_time); - if (settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { - presolve_time_limit = std::numeric_limits::infinity(); - } - presolver = std::make_unique>(); + if (deterministic_run) { presolve_time_limit = timer.remaining_time(); } + presolver = std::make_unique>(); + presolver->set_deterministic(deterministic_run); auto result = presolver->apply(op_problem, cuopt::linear_programming::problem_category_t::MIP, settings.presolver, @@ -428,7 +428,8 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, } presolve_result_opt.emplace(std::move(result)); - problem = detail::problem_t(presolve_result_opt->reduced_problem); + problem = detail::problem_t( + presolve_result_opt->reduced_problem, settings.get_tolerances(), deterministic_run); problem.set_papilo_presolve_data(presolver.get(), presolve_result_opt->reduced_to_original_map, presolve_result_opt->original_to_reduced_map, @@ -499,7 +500,8 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, reduced_costs.data(), reduced_costs.data() + reduced_costs.size(), std::numeric_limits::signaling_NaN()); - detail::problem_t full_problem(op_problem); + detail::problem_t full_problem( + op_problem, settings.get_tolerances(), deterministic_run); detail::solution_t full_sol(full_problem); full_sol.copy_new_assignment( cuopt::host_copy(primal_solution, op_problem.get_handle_ptr()->get_stream())); diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index ce6b602fba..35bce62acf 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,12 @@ #include #include #include -#include + +// enable to activate detailed determinism logs +#if 0 +#undef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) CUOPT_LOG_INFO(__VA_ARGS__) +#endif namespace cuopt::linear_programming::detail { @@ -42,25 +48,46 @@ static void init_handler(const raft::handle_t* handle_ptr) template mip_solver_t::mip_solver_t(const problem_t& op_problem, const mip_solver_settings_t& solver_settings, - timer_t timer) + cuopt::termination_checker_t& timer) : op_problem_(op_problem), solver_settings_(solver_settings), context(op_problem.handle_ptr, const_cast*>(&op_problem), solver_settings), timer_(timer) { + context.termination = &timer_; init_handler(op_problem.handle_ptr); } template -struct branch_and_bound_solution_helper_t { - branch_and_bound_solution_helper_t(diversity_manager_t* dm, - dual_simplex::simplex_solver_settings_t& settings) - : dm(dm), settings_(settings) {}; - - void solution_callback(std::vector& solution, f_t objective) +struct bb_callback_adapter_t { + bb_callback_adapter_t(mip_solver_context_t* context, diversity_manager_t* dm) + : context(context), dm(dm) {}; + + void new_incumbent_callback(std::vector& solution, + f_t objective, + const internals::mip_solution_callback_info_t& info, + double work_timestamp) { - dm->population.add_external_solution(solution, objective, solution_origin_t::BRANCH_AND_BOUND); - dm->rins.new_best_incumbent_callback(solution); + if (context->settings.determinism_mode & CUOPT_DETERMINISM_BB) { + // B&B calls this from its own thread. Use a dedicated per-thread stream + // to avoid racing on the heuristic thread's stream. + raft::handle_t callback_handle(rmm::cuda_stream_per_thread); + solution_t temp_sol(*context->problem_ptr, &callback_handle); + temp_sol.copy_new_assignment(solution); + temp_sol.compute_feasibility(); + const auto payload = context->solution_publication.build_callback_payload( + context->problem_ptr, + temp_sol, + (internals::mip_solution_origin_t)info.origin, + work_timestamp); + context->solution_publication.publish_new_best_feasible(payload, work_timestamp); + } + if (context->diversity_manager_ptr != nullptr && + !(context->settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS)) { + context->diversity_manager_ptr->population.add_external_solution( + solution, objective, (internals::mip_solution_origin_t)info.origin); + context->diversity_manager_ptr->rins.new_best_incumbent_callback(solution); + } } void set_simplex_solution(std::vector& solution, @@ -76,8 +103,8 @@ struct branch_and_bound_solution_helper_t { } void preempt_heuristic_solver() { dm->population.preempt_heuristic_solver(); } + mip_solver_context_t* context; diversity_manager_t* dm; - dual_simplex::simplex_solver_settings_t& settings_; }; // Extract probing cache into CPU-only CSR struct for implied bounds cuts @@ -183,6 +210,7 @@ solution_t mip_solver_t::run_solver() { // we need to keep original problem const cuopt_assert(context.problem_ptr != nullptr, "invalid problem pointer"); + cuopt_assert(context.termination != nullptr, "termination checker must be set before run_solver"); context.problem_ptr->tolerances = context.settings.get_tolerances(); cuopt_expects(context.problem_ptr->preprocess_called, error_type_t::RuntimeError, @@ -193,25 +221,28 @@ solution_t mip_solver_t::run_solver() CUOPT_LOG_INFO("Problem fully reduced in presolve"); solution_t sol(*context.problem_ptr); sol.set_problem_fully_reduced(); - for (auto callback : context.settings.get_mip_callbacks()) { - if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) { - auto get_sol_callback = static_cast(callback); - dm.population.invoke_get_solution_callback(sol, get_sol_callback); - } - } + const auto payload = context.solution_publication.build_callback_payload( + context.problem_ptr, sol, internals::mip_solution_origin_t::PRESOLVE, 0.0); + context.solution_publication.publish_new_best_feasible(payload); context.problem_ptr->post_process_solution(sol); return sol; } - dm.timer = timer_; - const bool run_presolve = context.settings.presolver != presolver_t::None; - f_t time_limit = context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC - ? std::numeric_limits::infinity() - : timer_.remaining_time(); - const auto& hp = context.settings.heuristic_params; - double presolve_time_limit = std::min(hp.presolve_time_ratio * time_limit, hp.presolve_max_time); - presolve_time_limit = context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC - ? std::numeric_limits::infinity() - : presolve_time_limit; + const bool deterministic_run = + (context.settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); + const f_t gpu_heur_work_limit = + deterministic_run ? context.settings.work_limit : timer_.get_time_limit(); + if (deterministic_run) + cuopt_assert(gpu_heur_work_limit >= 0.0, + "Deterministic GPU heuristic work limit must be non-negative"); + dm.timer = cuopt::termination_checker_t(context.gpu_heur_loop, gpu_heur_work_limit, timer_); + const bool run_presolve = context.settings.presolver != presolver_t::None; + f_t time_limit = + deterministic_run ? std::numeric_limits::infinity() : timer_.remaining_time(); + const auto& hp = context.settings.heuristic_params; + double presolve_time_limit = + deterministic_run ? timer_.remaining_time() + : std::min(hp.presolve_time_ratio * time_limit, hp.presolve_max_time); + if (std::isfinite(presolve_time_limit)) CUOPT_LOG_DEBUG("Presolve time limit: %g", presolve_time_limit); bool presolve_success = run_presolve ? dm.run_presolve(presolve_time_limit, timer_) : true; @@ -236,12 +267,9 @@ solution_t mip_solver_t::run_solver() CUOPT_LOG_INFO("Problem full reduced in presolve"); solution_t sol(*context.problem_ptr); sol.set_problem_fully_reduced(); - for (auto callback : context.settings.get_mip_callbacks()) { - if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) { - auto get_sol_callback = static_cast(callback); - dm.population.invoke_get_solution_callback(sol, get_sol_callback); - } - } + const auto payload = context.solution_publication.build_callback_payload( + context.problem_ptr, sol, internals::mip_solution_origin_t::PRESOLVE, 0.0); + context.solution_publication.publish_new_best_feasible(payload); context.problem_ptr->post_process_solution(sol); return sol; } @@ -274,12 +302,9 @@ solution_t mip_solver_t::run_solver() sol.set_problem_fully_reduced(); } if (opt_sol.get_termination_status() == pdlp_termination_status_t::Optimal) { - for (auto callback : context.settings.get_mip_callbacks()) { - if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) { - auto get_sol_callback = static_cast(callback); - dm.population.invoke_get_solution_callback(sol, get_sol_callback); - } - } + const auto payload = context.solution_publication.build_callback_payload( + context.problem_ptr, sol, internals::mip_solution_origin_t::PRESOLVE, 0.0); + context.solution_publication.publish_new_best_feasible(payload); } context.problem_ptr->post_process_solution(sol); return sol; @@ -297,7 +322,7 @@ solution_t mip_solver_t::run_solver() branch_and_bound_problem.objective_is_integral = context.problem_ptr->is_objective_integral(); dual_simplex::simplex_solver_settings_t branch_and_bound_settings; std::unique_ptr> branch_and_bound; - branch_and_bound_solution_helper_t solution_helper(&dm, branch_and_bound_settings); + bb_callback_adapter_t solution_helper(&context, &dm); dual_simplex::mip_solution_t branch_and_bound_solution(1); dual_simplex::probing_implied_bound_t probing_implied_bound; @@ -325,9 +350,9 @@ solution_t mip_solver_t::run_solver() branch_and_bound_settings.max_cut_passes = context.settings.max_cut_passes; branch_and_bound_settings.mir_cuts = context.settings.mir_cuts; branch_and_bound_settings.deterministic = - context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC; + (context.settings.determinism_mode & CUOPT_DETERMINISM_BB); - if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) { branch_and_bound_settings.work_limit = context.settings.work_limit; } else { branch_and_bound_settings.work_limit = std::numeric_limits::infinity(); @@ -355,32 +380,36 @@ solution_t mip_solver_t::run_solver() context.settings.reduced_cost_strengthening == -1 ? 2 : context.settings.reduced_cost_strengthening; + branch_and_bound_settings.bb_work_unit_scale = solver_settings_.bb_work_unit_scale; + branch_and_bound_settings.gpu_heur_wait_for_exploration = + solver_settings_.gpu_heur_wait_for_exploration; if (context.settings.num_cpu_threads < 0) { branch_and_bound_settings.num_threads = std::max(1, omp_get_max_threads() - 1); } else { branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads); } + CUOPT_LOG_INFO("Using %d CPU threads for B&B", branch_and_bound_settings.num_threads); - // Set the branch and bound -> primal heuristics callback - branch_and_bound_settings.solution_callback = - std::bind(&branch_and_bound_solution_helper_t::solution_callback, + branch_and_bound_settings.new_incumbent_callback = + std::bind(&bb_callback_adapter_t::new_incumbent_callback, &solution_helper, std::placeholders::_1, - std::placeholders::_2); - // heuristic_preemption_callback is needed in both modes to properly stop the heuristic thread - branch_and_bound_settings.heuristic_preemption_callback = std::bind( - &branch_and_bound_solution_helper_t::preempt_heuristic_solver, &solution_helper); - if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) { + std::placeholders::_2, + std::placeholders::_3, + std::placeholders::_4); + branch_and_bound_settings.heuristic_preemption_callback = + std::bind(&bb_callback_adapter_t::preempt_heuristic_solver, &solution_helper); + if (!(context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) { branch_and_bound_settings.set_simplex_solution_callback = - std::bind(&branch_and_bound_solution_helper_t::set_simplex_solution, + std::bind(&bb_callback_adapter_t::set_simplex_solution, &solution_helper, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); branch_and_bound_settings.node_processed_callback = - std::bind(&branch_and_bound_solution_helper_t::node_processed_callback, + std::bind(&bb_callback_adapter_t::node_processed_callback, &solution_helper, std::placeholders::_1, std::placeholders::_2); @@ -412,14 +441,15 @@ solution_t mip_solver_t::run_solver() [stats_ptr](f_t user_bound) { stats_ptr->set_solution_bound(user_bound); }); // Set the primal heuristics -> branch and bound callback - if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) { + if (!(context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) { branch_and_bound->set_concurrent_lp_root_solve(true); context.problem_ptr->branch_and_bound_callback = std::bind(&dual_simplex::branch_and_bound_t::set_new_solution, branch_and_bound.get(), - std::placeholders::_1); - } else if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { + std::placeholders::_1, + std::placeholders::_2); + } else if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) { branch_and_bound->set_concurrent_lp_root_solve(false); // TODO once deterministic GPU heuristics are integrated // context.problem_ptr->branch_and_bound_callback = @@ -429,18 +459,21 @@ solution_t mip_solver_t::run_solver() } context.work_unit_scheduler_.register_context(branch_and_bound->get_work_unit_context()); - // context.work_unit_scheduler_.verbose = true; - context.problem_ptr->set_root_relaxation_solution_callback = - std::bind(&dual_simplex::branch_and_bound_t::set_root_relaxation_solution, - branch_and_bound.get(), - std::placeholders::_1, - std::placeholders::_2, - std::placeholders::_3, - std::placeholders::_4, - std::placeholders::_5, - std::placeholders::_6, - std::placeholders::_7); + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) { + context.problem_ptr->set_root_relaxation_solution_callback = nullptr; + } else { + context.problem_ptr->set_root_relaxation_solution_callback = + std::bind(&dual_simplex::branch_and_bound_t::set_root_relaxation_solution, + branch_and_bound.get(), + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3, + std::placeholders::_4, + std::placeholders::_5, + std::placeholders::_6, + std::placeholders::_7); + } if (timer_.check_time_limit()) { CUOPT_LOG_INFO("Time limit reached during B&B setup"); @@ -454,10 +487,12 @@ solution_t mip_solver_t::run_solver() // std::async and std::future allow us to get the return value of bb::solve() // without having to manually manage the thread // std::future.get() performs a join() operation to wait until the return status is available - branch_and_bound_status_future = std::async(std::launch::async, - &dual_simplex::branch_and_bound_t::solve, - branch_and_bound.get(), - std::ref(branch_and_bound_solution)); + int bb_device_id = context.handle_ptr->get_device(); + branch_and_bound_status_future = + std::async(std::launch::async, [&branch_and_bound, &branch_and_bound_solution, bb_device_id] { + RAFT_CUDA_TRY(cudaSetDevice(bb_device_id)); + return branch_and_bound->solve(branch_and_bound_solution); + }); } // Start the primal heuristics @@ -470,9 +505,46 @@ solution_t mip_solver_t::run_solver() context.stats.set_solution_bound( context.problem_ptr->get_user_obj_from_solver_obj(branch_and_bound_solution.lower_bound)); } + CUOPT_LOG_DEBUG( + "B&B solution reconstruction: det_bb=%d obj_finite=%d obj=%.16e bb_status=%d " + "has_incumbent=%d sol_size=%zu", + (int)(context.settings.determinism_mode & CUOPT_DETERMINISM_BB), + (int)std::isfinite(branch_and_bound_solution.objective), + branch_and_bound_solution.objective, + (int)bb_status, + (int)branch_and_bound_solution.has_incumbent, + branch_and_bound_solution.x.size()); + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB) && + std::isfinite(branch_and_bound_solution.objective)) { + solution_t bb_sol(*context.problem_ptr); + bb_sol.copy_new_assignment(branch_and_bound_solution.x); + bb_sol.compute_feasibility(); + sol = std::move(bb_sol); + } else if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) { + // In deterministic mode, only solutions formally retired by B&B are valid output. + // Discard the GPU heuristic incumbent that B&B never processed. + sol = solution_t(*context.problem_ptr); + } if (bb_status == dual_simplex::mip_status_t::INFEASIBLE) { sol.set_problem_fully_reduced(); } context.stats.num_nodes = branch_and_bound_solution.nodes_explored; context.stats.num_simplex_iterations = branch_and_bound_solution.simplex_iterations; + + if ((context.settings.determinism_mode & CUOPT_DETERMINISM_BB)) { + double bnb_work = branch_and_bound->get_work_unit_context().current_work(); + double gpu_work = context.gpu_heur_loop.current_work(); + double bnb_scale = BB_BASE_WORK_SCALE * solver_settings_.bb_work_unit_scale; + double gpu_scale = GPU_HEUR_BASE_WORK_SCALE * solver_settings_.gpu_heur_work_unit_scale; + CUOPT_LOG_INFO( + "Work unit summary: B&B=%.2f (scale=%.3f, raw=%.2f) GPU_heur=%.2f (scale=%.3f, raw=%.2f) " + "ratio=%.2fx", + bnb_work, + bnb_scale, + bnb_scale > 0 ? bnb_work / bnb_scale : 0.0, + gpu_work, + gpu_scale, + gpu_scale > 0 ? gpu_work / gpu_scale : 0.0, + gpu_work > 0 ? bnb_work / gpu_work : 0.0); + } } sol.compute_feasibility(); diff --git a/cpp/src/mip_heuristics/solver.cuh b/cpp/src/mip_heuristics/solver.cuh index 9b9024a1dc..1c18a62c08 100644 --- a/cpp/src/mip_heuristics/solver.cuh +++ b/cpp/src/mip_heuristics/solver.cuh @@ -10,7 +10,7 @@ #include #include #include -#include +#include #pragma once namespace cuopt::linear_programming::detail { @@ -20,7 +20,7 @@ class mip_solver_t { public: explicit mip_solver_t(const problem_t& op_problem, const mip_solver_settings_t& solver_settings, - timer_t timer); + cuopt::termination_checker_t& timer); solution_t run_solver(); solver_stats_t& get_solver_stats() { return context.stats; } @@ -29,7 +29,7 @@ class mip_solver_t { // reference to the original problem const problem_t& op_problem_; const mip_solver_settings_t& solver_settings_; - timer_t timer_; + cuopt::termination_checker_t& timer_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/solver_context.cuh b/cpp/src/mip_heuristics/solver_context.cuh index b1bf3fbd70..2f36f52db3 100644 --- a/cpp/src/mip_heuristics/solver_context.cuh +++ b/cpp/src/mip_heuristics/solver_context.cuh @@ -5,16 +5,16 @@ */ /* clang-format on */ +#pragma once + #include -#include -#include +#include +#include #include #include -#include - -#pragma once +#include // Forward declare namespace cuopt::linear_programming::dual_simplex { @@ -37,12 +37,20 @@ struct mip_solver_context_t { explicit mip_solver_context_t(raft::handle_t const* handle_ptr_, problem_t* problem_ptr_, mip_solver_settings_t settings_) - : handle_ptr(handle_ptr_), problem_ptr(problem_ptr_), settings(settings_) + : handle_ptr(handle_ptr_), + problem_ptr(problem_ptr_), + settings(settings_), + solution_publication(settings, stats), + solution_injection(settings, stats) { cuopt_assert(problem_ptr != nullptr, "problem_ptr is nullptr"); stats.set_solution_bound(problem_ptr->maximize ? std::numeric_limits::infinity() : -std::numeric_limits::infinity()); - gpu_heur_loop.deterministic = settings.determinism_mode == CUOPT_MODE_DETERMINISTIC; + gpu_heur_loop.deterministic = (settings.determinism_mode & CUOPT_DETERMINISM_GPU_HEURISTICS); + cuopt_assert(settings.cpufj_work_unit_scale > 0.0, "CPUFJ work-unit scale must be positive"); + cuopt_assert(settings.gpu_heur_work_unit_scale > 0.0, + "GPU heuristic work-unit scale must be positive"); + gpu_heur_loop.work_unit_scale = GPU_HEUR_BASE_WORK_SCALE * settings.gpu_heur_work_unit_scale; } mip_solver_context_t(const mip_solver_context_t&) = delete; @@ -58,8 +66,13 @@ struct mip_solver_context_t { // Work limit context for tracking work units in deterministic mode (shared across all timers in // GPU heuristic loop) work_limit_context_t gpu_heur_loop{"GPUHeur"}; + solution_publication_t solution_publication; + solution_injection_t solution_injection; + + // Root termination checker — set by mip_solver_t after construction. + // All sub-timers should use this as parent for wall-clock safety. + cuopt::termination_checker_t* termination{nullptr}; - // synchronization every 5 seconds for deterministic mode work_unit_scheduler_t work_unit_scheduler_{5.0}; early_cpufj_t* early_cpufj_ptr{nullptr}; diff --git a/cpp/src/mip_heuristics/solver_solution.cu b/cpp/src/mip_heuristics/solver_solution.cu index 8f6f8de05f..8d179eafe6 100644 --- a/cpp/src/mip_heuristics/solver_solution.cu +++ b/cpp/src/mip_heuristics/solver_solution.cu @@ -7,6 +7,8 @@ #include #include +#include +#include #include #include @@ -238,11 +240,25 @@ void mip_solution_t::log_summary() const template void mip_solution_t::log_detailed_summary() const { + uint32_t sol_hash = 0; + if (solution_.size() > 0) { + auto host_sol = cuopt::host_copy(solution_, rmm::cuda_stream_default); + sol_hash = detail::compute_hash(host_sol); + } + + uint32_t pool_hash = 0; + for (const auto& pool_sol : solution_pool_) { + if (pool_sol.size() > 0) { + auto host_pool_sol = cuopt::host_copy(pool_sol, rmm::cuda_stream_default); + pool_hash ^= detail::compute_hash(host_pool_sol); + } + } + CUOPT_LOG_INFO( "Solution objective: %f , relative_mip_gap %f solution_bound %f presolve_time %f " "total_solve_time %f " "max constraint violation %f max int violation %f max var bounds violation %f " - "nodes %d simplex_iterations %d", + "nodes %d simplex_iterations %d solution_hash %08x pool_hash %08x pool_size %d", objective_, mip_gap_, stats_.get_solution_bound(), @@ -252,7 +268,10 @@ void mip_solution_t::log_detailed_summary() const max_int_violation_, max_variable_bound_violation_, stats_.num_nodes, - stats_.num_simplex_iterations); + stats_.num_simplex_iterations, + sol_hash, + pool_hash, + (int)solution_pool_.size()); } #if MIP_INSTANTIATE_FLOAT || PDLP_INSTANTIATE_FLOAT diff --git a/cpp/src/pdlp/cuopt_c.cpp b/cpp/src/pdlp/cuopt_c.cpp index ed2eab02f2..0af7f183d2 100644 --- a/cpp/src/pdlp/cuopt_c.cpp +++ b/cpp/src/pdlp/cuopt_c.cpp @@ -49,6 +49,39 @@ class c_get_solution_callback_t : public cuopt::internals::get_solution_callback cuOptMIPGetSolutionCallback callback_; }; +class c_get_solution_callback_ext_t : public cuopt::internals::get_solution_callback_ext_t { + public: + explicit c_get_solution_callback_ext_t(cuOptMIPGetSolutionCallbackExt callback) + : callback_(callback) + { + } + + void get_solution(void* data, + void* objective_value, + void* solution_bound, + const cuopt::internals::mip_solution_callback_info_t* callback_info, + void* user_data) override + { + if (callback_ == nullptr) { return; } + cuOptMIPSolutionCallbackInfo c_callback_info{}; + if (callback_info != nullptr) { + c_callback_info.origin = (uint32_t)callback_info->origin; + c_callback_info.work_timestamp = callback_info->work_timestamp; + } else { + c_callback_info.origin = CUOPT_MIP_SOLUTION_ORIGIN_UNKNOWN; + c_callback_info.work_timestamp = -1.0; + } + callback_(static_cast(data), + static_cast(objective_value), + static_cast(solution_bound), + &c_callback_info, + user_data); + } + + private: + cuOptMIPGetSolutionCallbackExt callback_; +}; + class c_set_solution_callback_t : public cuopt::internals::set_solution_callback_t { public: explicit c_set_solution_callback_t(cuOptMIPSetSolutionCallback callback) : callback_(callback) {} @@ -69,6 +102,11 @@ class c_set_solution_callback_t : public cuopt::internals::set_solution_callback cuOptMIPSetSolutionCallback callback_; }; +// ABI guards: these fire at compile time if the struct layout changes +// and existing field offsets are changed +static_assert(offsetof(cuOptMIPSolutionCallbackInfo, origin) == 0, "ABI break"); +static_assert(offsetof(cuOptMIPSolutionCallbackInfo, work_timestamp) == 8, "ABI break"); + // Owns solver settings and C callback wrappers for C API lifetime. struct solver_settings_handle_t { solver_settings_handle_t() : settings(new solver_settings_t()) {} @@ -767,6 +805,19 @@ cuopt_int_t cuOptSetMIPGetSolutionCallback(cuOptSolverSettings settings, return CUOPT_SUCCESS; } +cuopt_int_t cuOptSetMIPGetSolutionCallbackExt(cuOptSolverSettings settings, + cuOptMIPGetSolutionCallbackExt callback, + void* user_data) +{ + if (settings == nullptr) { return CUOPT_INVALID_ARGUMENT; } + if (callback == nullptr) { return CUOPT_INVALID_ARGUMENT; } + solver_settings_handle_t* settings_handle = get_settings_handle(settings); + auto callback_wrapper = std::make_unique(callback); + settings_handle->settings->set_mip_callback(callback_wrapper.get(), user_data); + settings_handle->callbacks.push_back(std::move(callback_wrapper)); + return CUOPT_SUCCESS; +} + cuopt_int_t cuOptSetMIPSetSolutionCallback(cuOptSolverSettings settings, cuOptMIPSetSolutionCallback callback, void* user_data) diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index b618550f6e..bdc7aff1a0 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -22,6 +22,8 @@ #include #include +#include +#include #include namespace cuopt::linear_programming::detail { @@ -684,9 +686,12 @@ template void pdlp_initial_scaling_strategy_t::unscale_solutions( rmm::device_uvector& primal_solution, rmm::device_uvector& dual_solution, - rmm::device_uvector& dual_slack) const + rmm::device_uvector& dual_slack, + cudaStream_t stream_override) const { raft::common::nvtx::range fun_scope("unscale_solutions"); + const rmm::cuda_stream_view stream = + stream_override ? rmm::cuda_stream_view{stream_override} : stream_view_; if (primal_solution.size()) { cuopt_expects(primal_solution.size() % static_cast(primal_size_h_) == 0, @@ -703,7 +708,7 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( primal_solution.data(), primal_solution.size(), cuda::std::multiplies<>{}, - stream_view_); + stream); if (hyper_params_.bound_objective_rescaling && !running_mip_) { cuopt_assert(h_bound_rescaling != f_t(0), @@ -712,7 +717,7 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( primal_solution.data(), primal_solution.size(), a_times_scalar(f_t(1.0) / h_bound_rescaling), - stream_view_); + stream); } } @@ -731,7 +736,7 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( dual_solution.data(), dual_solution.size(), cuda::std::multiplies<>{}, - stream_view_); + stream); if (hyper_params_.bound_objective_rescaling && !running_mip_) { cuopt_assert(h_bound_rescaling != f_t(0), "Numerical error: bound_rescaling_ should never equal 0"); @@ -739,7 +744,7 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( dual_solution.data(), dual_solution.size(), a_times_scalar(f_t(1.0) / h_objective_rescaling), - stream_view_); + stream); } } @@ -756,7 +761,7 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( dual_slack.data(), dual_slack.size(), batch_safe_div(), - stream_view_); + stream); if (hyper_params_.bound_objective_rescaling && !running_mip_) { cuopt_assert(h_bound_rescaling != f_t(0), "Numerical error: bound_rescaling_ should never equal 0"); @@ -764,7 +769,7 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( dual_slack.data(), dual_slack.size(), a_times_scalar{f_t(1.0) / h_objective_rescaling}, - stream_view_); + stream); } } } @@ -781,10 +786,12 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( template void pdlp_initial_scaling_strategy_t::unscale_solutions( - rmm::device_uvector& solution, rmm::device_uvector& s) const + rmm::device_uvector& solution, + rmm::device_uvector& s, + cudaStream_t stream_override) const { rmm::device_uvector dummy(0, solution.stream()); - unscale_solutions(solution, s, dummy); + unscale_solutions(solution, s, dummy, stream_override); } template diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh index 5a3dcfaca2..c537825724 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh @@ -64,10 +64,12 @@ class pdlp_initial_scaling_strategy_t { void scale_primal(rmm::device_uvector& primal_solution) const; void scale_dual(rmm::device_uvector& dual_solution) const; void unscale_solutions(rmm::device_uvector& primal_solution, - rmm::device_uvector& dual_solution) const; + rmm::device_uvector& dual_solution, + cudaStream_t stream_override = nullptr) const; void unscale_solutions(rmm::device_uvector& primal_solution, rmm::device_uvector& dual_solution, - rmm::device_uvector& dual_slack) const; + rmm::device_uvector& dual_slack, + cudaStream_t stream_override = nullptr) const; void unscale_solutions(solution_t& solution) const; const rmm::device_uvector& get_constraint_matrix_scaling_vector() const; const rmm::device_uvector& get_variable_scaling_vector() const; diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index 74df7fee01..cb16c9d662 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -30,6 +30,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 8e6e80e322..a759887fc5 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -35,6 +35,7 @@ #include #include +#include #include #include diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 2b10310260..821238fe84 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -39,6 +40,7 @@ #include #include #include +#include #include diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index d17a88dd29..c95ed67ca6 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -28,6 +28,9 @@ #include +#include +#include + #include namespace cuopt::linear_programming::detail { diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index ab0c921cc7..b4da4ffbde 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -25,6 +25,7 @@ #include #include +#include #include #include diff --git a/cpp/src/pdlp/termination_strategy/infeasibility_information.cu b/cpp/src/pdlp/termination_strategy/infeasibility_information.cu index dbb35b732d..37972ba442 100644 --- a/cpp/src/pdlp/termination_strategy/infeasibility_information.cu +++ b/cpp/src/pdlp/termination_strategy/infeasibility_information.cu @@ -24,6 +24,14 @@ #include #include +#include +#include +#include +#include +#include +#include +#include + namespace cuopt::linear_programming::detail { template infeasibility_information_t::infeasibility_information_t( diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index 138c9c2ab9..77bc6b18ce 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -24,6 +24,8 @@ #include #include +#include +#include #include #include diff --git a/cpp/src/routing/local_search/compute_compatible.cu b/cpp/src/routing/local_search/compute_compatible.cu index 8386cb087b..457e970632 100644 --- a/cpp/src/routing/local_search/compute_compatible.cu +++ b/cpp/src/routing/local_search/compute_compatible.cu @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -9,6 +9,8 @@ #include "compute_compatible.cuh" #include "local_search.cuh" +#include +#include #include #include diff --git a/cpp/src/routing/route/break_route.cuh b/cpp/src/routing/route/break_route.cuh index 68ab015646..1d5b3472f9 100644 --- a/cpp/src/routing/route/break_route.cuh +++ b/cpp/src/routing/route/break_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/capacity_route.cuh b/cpp/src/routing/route/capacity_route.cuh index a39ef46a93..388e573c1c 100644 --- a/cpp/src/routing/route/capacity_route.cuh +++ b/cpp/src/routing/route/capacity_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,9 @@ #include #include + +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/dimensions_route.cuh b/cpp/src/routing/route/dimensions_route.cuh index d1131ea550..bc08ba9819 100644 --- a/cpp/src/routing/route/dimensions_route.cuh +++ b/cpp/src/routing/route/dimensions_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -30,6 +30,8 @@ #include #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/distance_route.cuh b/cpp/src/routing/route/distance_route.cuh index e01c552080..a5f98c13ce 100644 --- a/cpp/src/routing/route/distance_route.cuh +++ b/cpp/src/routing/route/distance_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/mismatch_route.cuh b/cpp/src/routing/route/mismatch_route.cuh index d72f01735a..78975750e0 100644 --- a/cpp/src/routing/route/mismatch_route.cuh +++ b/cpp/src/routing/route/mismatch_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -15,6 +15,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/pdp_route.cuh b/cpp/src/routing/route/pdp_route.cuh index dc9b8ad699..dd20e2fec3 100644 --- a/cpp/src/routing/route/pdp_route.cuh +++ b/cpp/src/routing/route/pdp_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/prize_route.cuh b/cpp/src/routing/route/prize_route.cuh index 0330d14590..80b27061b5 100644 --- a/cpp/src/routing/route/prize_route.cuh +++ b/cpp/src/routing/route/prize_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/route.cuh b/cpp/src/routing/route/route.cuh index e6367a4836..b624acb903 100644 --- a/cpp/src/routing/route/route.cuh +++ b/cpp/src/routing/route/route.cuh @@ -11,6 +11,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/service_time_route.cuh b/cpp/src/routing/route/service_time_route.cuh index b35e53c2d8..03c48b2e42 100644 --- a/cpp/src/routing/route/service_time_route.cuh +++ b/cpp/src/routing/route/service_time_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -15,6 +15,8 @@ #include #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/tasks_route.cuh b/cpp/src/routing/route/tasks_route.cuh index 6da9e4372a..3624d647e7 100644 --- a/cpp/src/routing/route/tasks_route.cuh +++ b/cpp/src/routing/route/tasks_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -15,6 +15,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/time_route.cuh b/cpp/src/routing/route/time_route.cuh index bb5ec653e1..21448c4273 100644 --- a/cpp/src/routing/route/time_route.cuh +++ b/cpp/src/routing/route/time_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/tsp_route.cuh b/cpp/src/routing/route/tsp_route.cuh index ee1ba5370c..9b7eeeee56 100644 --- a/cpp/src/routing/route/tsp_route.cuh +++ b/cpp/src/routing/route/tsp_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -16,6 +16,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/vehicle_fixed_cost_route.cuh b/cpp/src/routing/route/vehicle_fixed_cost_route.cuh index 83ea5db481..1e246fbb6e 100644 --- a/cpp/src/routing/route/vehicle_fixed_cost_route.cuh +++ b/cpp/src/routing/route/vehicle_fixed_cost_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -14,6 +14,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/solution/route_node_map.cuh b/cpp/src/routing/solution/route_node_map.cuh index 25a6c4919b..a4a1b171aa 100644 --- a/cpp/src/routing/solution/route_node_map.cuh +++ b/cpp/src/routing/solution/route_node_map.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include diff --git a/cpp/src/routing/utilities/check_input.cu b/cpp/src/routing/utilities/check_input.cu index e902f2d460..eccc3179bb 100644 --- a/cpp/src/routing/utilities/check_input.cu +++ b/cpp/src/routing/utilities/check_input.cu @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/utilities/copy_helpers.hpp b/cpp/src/utilities/copy_helpers.hpp index 36a4659059..fc07e3b829 100644 --- a/cpp/src/utilities/copy_helpers.hpp +++ b/cpp/src/utilities/copy_helpers.hpp @@ -11,9 +11,11 @@ #include #include +#include #include #include +#include #include #include #include @@ -335,6 +337,17 @@ raft::device_span make_span(rmm::device_uvector const& container) return raft::device_span(container.data(), container.size()); } +template +raft::device_span make_span(rmm::device_scalar& scalar) +{ + return raft::device_span(scalar.data(), 1); +} + +template +raft::device_span make_span(rmm::device_scalar const& scalar) +{ + return raft::device_span(scalar.data(), 1); +} // resizes the device vector if it the std vector is larger template inline void expand_device_copy(rmm::device_uvector& device_vec, diff --git a/cpp/src/utilities/cuda_helpers.cuh b/cpp/src/utilities/cuda_helpers.cuh index 946099648d..7c591624d2 100644 --- a/cpp/src/utilities/cuda_helpers.cuh +++ b/cpp/src/utilities/cuda_helpers.cuh @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,17 @@ #include #include +#if CUDART_VERSION >= 12080 +// TODO: investigate why this is necessary? dependency conflict? file NVBUG if necessary +#include +#ifndef NVTX_NULLPTR +#define NVTX_NULLPTR nullptr +#endif +#ifndef NVTX_REINTERPRET_CAST +#define NVTX_REINTERPRET_CAST(type, value) (reinterpret_cast(value)) +#endif +#include +#endif namespace cuopt { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) @@ -237,4 +249,48 @@ inline size_t get_device_memory_size() } } +// NOTE: this marks a range of virtual memory as initialized. This is not tied to any object's +// lifetime As such, when using a pool for allocations, false negatives could occurs e.g. a range +// previously marked as initialized is now occupied by a new uninitialized object Unlikely to cause +// issues in practice - but worth noting (RAII? I'm not even sure the API allows to un-mark a range +// as initialized) +static inline void mark_memory_as_initialized(const void* ptr, size_t size, cudaStream_t stream = 0) +{ +#if CUDART_VERSION >= 12080 + + if (size == 0 || ptr == nullptr) return; + +#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) + constexpr auto PerThreadDefaultStream = true; +#else + constexpr auto PerThreadDefaultStream = false; +#endif + + nvtxMemVirtualRangeDesc_t nvtxRangeDesc = {}; + nvtxRangeDesc.size = size; + nvtxRangeDesc.ptr = ptr; + + nvtxMemMarkInitializedBatch_t nvtxRegionsDesc = {}; + nvtxRegionsDesc.extCompatID = NVTX_EXT_COMPATID_MEM; + nvtxRegionsDesc.structSize = sizeof(nvtxRegionsDesc); + nvtxRegionsDesc.regionType = NVTX_MEM_TYPE_VIRTUAL_ADDRESS; + nvtxRegionsDesc.regionDescCount = 1; + nvtxRegionsDesc.regionDescElementSize = sizeof(nvtxRangeDesc); + nvtxRegionsDesc.regionDescElements = &nvtxRangeDesc; + + nvtxMemCudaMarkInitialized( + raft::common::nvtx::detail::domain_store::value(), + stream, + PerThreadDefaultStream, + &nvtxRegionsDesc); +#endif +} + +template +static inline void mark_span_as_initialized(const raft::device_span span, + rmm::cuda_stream_view stream) +{ + mark_memory_as_initialized(span.data(), span.size() * sizeof(T), stream.value()); +} + } // namespace cuopt diff --git a/cpp/src/utilities/determinism_log.hpp b/cpp/src/utilities/determinism_log.hpp new file mode 100644 index 0000000000..71517d7d27 --- /dev/null +++ b/cpp/src/utilities/determinism_log.hpp @@ -0,0 +1,23 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#ifndef CUOPT_DETERMINISM_LOG +#define CUOPT_DETERMINISM_LOG(...) \ + do { \ + } while (0) +#endif diff --git a/cpp/src/utilities/seed_generator.cu b/cpp/src/utilities/seed_generator.cu index 1da6662bc1..612093a7a8 100644 --- a/cpp/src/utilities/seed_generator.cu +++ b/cpp/src/utilities/seed_generator.cu @@ -1,10 +1,11 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ #include -int64_t cuopt::seed_generator::seed_ = 0; +int64_t cuopt::seed_generator::base_seed_ = 0; +std::atomic cuopt::seed_generator::epoch_{0}; diff --git a/cpp/src/utilities/seed_generator.cuh b/cpp/src/utilities/seed_generator.cuh index dd5e79d847..5415e9e80b 100644 --- a/cpp/src/utilities/seed_generator.cuh +++ b/cpp/src/utilities/seed_generator.cuh @@ -1,29 +1,50 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ #pragma once +#include #include #include namespace cuopt { -// TODO: should be thread local? class seed_generator { - static int64_t seed_; + static int64_t base_seed_; + // Monotonically increasing epoch; incremented on every set_seed() call. + // Thread-local state compares against this to detect resets, even when + // the same seed value is set again (e.g., repeated solve_mip() calls). + static std::atomic epoch_; + + struct thread_state_t { + int64_t counter{0}; + int64_t last_epoch{-1}; + }; + + static thread_state_t& local_state() + { + thread_local thread_state_t state; + int64_t current_epoch = epoch_.load(std::memory_order_acquire); + if (state.last_epoch != current_epoch) { + state.counter = base_seed_; + state.last_epoch = current_epoch; + } + return state; + } public: template static void set_seed(seed_t seed) { #ifdef BENCHMARK - seed_ = std::random_device{}(); + base_seed_ = std::random_device{}(); #else - seed_ = static_cast(seed); + base_seed_ = static_cast(seed); #endif + epoch_.fetch_add(1, std::memory_order_release); } template static void set_seed(arg0 seed0, arg1 seed1, args... seeds) @@ -31,7 +52,19 @@ class seed_generator { set_seed(seed1 + ((seed0 + seed1) * (seed0 + seed1 + 1) / 2), seeds...); } - static int64_t get_seed() { return seed_++; } +#if SEED_GENERATOR_DEBUG + static int64_t get_seed(const char* caller = __builtin_FUNCTION(), + const char* file = __builtin_FILE(), + int line = __builtin_LINE()) + { + printf("SEED CALLED BY %s:%d: %s() ***\n", file, line, caller); + return local_state().counter++; + } +#else + static int64_t get_seed() { return local_state().counter++; } +#endif + + static int64_t peek_seed() { return local_state().counter; } public: seed_generator(seed_generator const&) = delete; diff --git a/cpp/src/utilities/termination_checker.hpp b/cpp/src/utilities/termination_checker.hpp new file mode 100644 index 0000000000..d2ecd41141 --- /dev/null +++ b/cpp/src/utilities/termination_checker.hpp @@ -0,0 +1,239 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include +#include + +#include + +#include "timer.hpp" +#include "work_limit_context.hpp" + +namespace cuopt { + +/** + * Unified termination checker that subsumes timer_t and work_limit_timer_t. + * + * In non-deterministic mode: checks wall-clock time. + * In deterministic mode: checks work units via work_limit_context_t. + * In BOTH modes: checks parent chain (inheriting root wall-clock limit) and user callbacks. + * + * This is the single timer type used throughout the solver. It replaces work_limit_timer_t. + */ +class termination_checker_t { + public: + struct root_tag_t {}; + + // Root constructor (top-level solver, wall-clock only) + explicit termination_checker_t(double time_limit, root_tag_t) + : deterministic(false), + work_limit(time_limit), + timer(time_limit), + work_context(nullptr), + work_units_at_start(0), + parent_(nullptr) + { + } + + // Non-deterministic constructor with parent + termination_checker_t(double time_limit_, const termination_checker_t& parent) + : deterministic(false), + work_limit(time_limit_), + timer(time_limit_), + work_context(nullptr), + work_units_at_start(0), + parent_(&parent) + { + } + + // Deterministic constructor with parent (inherits parent's termination) + termination_checker_t(work_limit_context_t& context, + double work_limit_, + const termination_checker_t& parent) + : deterministic(context.deterministic), + work_limit(work_limit_), + timer(work_limit_), + work_context(&context), + work_units_at_start(context.deterministic ? context.current_work() : 0), + parent_(&parent) + { + } + + void set_parent(const termination_checker_t* parent) { parent_ = parent; } + const termination_checker_t* get_parent() const { return parent_; } + + void set_termination_callback(bool (*cb)(void*), void* data) + { + termination_callback_ = cb; + termination_callback_data_ = data; + } + + bool check(const char* caller = __builtin_FUNCTION(), + const char* file = __builtin_FILE(), + int line = __builtin_LINE()) const noexcept + { + if (termination_callback_ != nullptr && termination_callback_(termination_callback_data_)) { + return true; + } + + if (parent_ != nullptr && parent_->check()) { return true; } + + if (deterministic) { + if (!work_context) { return false; } + double elapsed_since_start = work_context->current_work() - work_units_at_start; + bool finished_now = elapsed_since_start >= work_limit; + if (finished_now && !finished) { + finished = true; + double actual_elapsed_time = timer.elapsed_time(); + + if (work_limit > 0 && std::abs(actual_elapsed_time - work_limit) / work_limit > 0.10) { + CUOPT_LOG_TRACE( + "%s:%d: %s(): Work limit timer finished with a large discrepancy: %fs for %fwu " + "(global: %g, start: %g)", + file, + line, + caller, + actual_elapsed_time, + work_limit, + work_context->current_work(), + work_units_at_start); + } + } + return finished; + } else { + return timer.check_time_limit(); + } + } + + // Aliases for compatibility with work_limit_timer_t and timer_t interfaces + bool check_time_limit(const char* caller = __builtin_FUNCTION(), + const char* file = __builtin_FILE(), + int line = __builtin_LINE()) const noexcept + { + return check(caller, file, line); + } + + bool check_limit(const char* caller = __builtin_FUNCTION(), + const char* file = __builtin_FILE(), + int line = __builtin_LINE()) const noexcept + { + return check(caller, file, line); + } + + void record_work(double work_units, + const char* caller = __builtin_FUNCTION(), + const char* file = __builtin_FILE(), + int line = __builtin_LINE()) + { + if (deterministic && work_context) { + // debugging info + double parent_elapsed_time = parent_ != nullptr ? parent_->timer.elapsed_time() : 0.0; + double parent_time_limit = parent_ != nullptr ? parent_->timer.get_time_limit() : 0.0; + + CUOPT_LOG_TRACE("%s:%d: %s(): Recorded %f work units in %fs, total %f (parent time: %g/%g)", + file, + line, + caller, + work_units, + timer.elapsed_time(), + work_context->current_work(), + parent_elapsed_time, + parent_time_limit); + work_context->record_work_sync_on_horizon(work_units); + } + } + + double remaining_units() const noexcept + { + double local_remaining; + if (deterministic) { + if (!work_context) { + local_remaining = work_limit; + } else { + double elapsed_since_start = work_context->current_work() - work_units_at_start; + local_remaining = std::max(0.0, work_limit - elapsed_since_start); + } + } else { + local_remaining = timer.remaining_time(); + } + // don't let the root's global time limit contaminate work limits further down + if (parent_ != nullptr && !(deterministic && !parent_->deterministic)) { + local_remaining = std::min(local_remaining, parent_->remaining_units()); + } + if (!std::isfinite(local_remaining)) { + CUOPT_LOG_WARN( + "remaining_units non-finite: %g det=%d work_limit=%g start=%g " + "ctx_work=%g has_parent=%d", + local_remaining, + (int)deterministic, + work_limit, + work_units_at_start, + work_context ? work_context->current_work() : -1.0, + parent_ != nullptr); + } + return local_remaining; + } + + double remaining_time() const noexcept { return remaining_units(); } + + double elapsed_time() const noexcept + { + if (deterministic) { + if (!work_context) { return 0.0; } + return work_context->current_work() - work_units_at_start; + } else { + return timer.elapsed_time(); + } + } + + bool check_half_time() const noexcept + { + if (deterministic) { + if (!work_context) { return false; } + double elapsed_since_start = work_context->current_work() - work_units_at_start; + return elapsed_since_start >= work_limit / 2; + } else { + return timer.check_half_time(); + } + } + + double clamp_remaining_time(double desired_time) const noexcept + { + return std::min(desired_time, remaining_time()); + } + + double get_time_limit() const noexcept + { + if (deterministic) { + return work_limit; + } else { + return timer.get_time_limit(); + } + } + + double get_tic_start() const noexcept { return timer.get_tic_start(); } + + timer_t timer; + double work_limit{}; + mutable bool finished{false}; + bool deterministic{false}; + work_limit_context_t* work_context{nullptr}; + double work_units_at_start{0}; + + private: + const termination_checker_t* parent_{nullptr}; + bool (*termination_callback_)(void*) = nullptr; + void* termination_callback_data_ = nullptr; +}; + +// Backward compatibility +using work_limit_timer_t = termination_checker_t; + +} // namespace cuopt diff --git a/cpp/src/utilities/timer.hpp b/cpp/src/utilities/timer.hpp index b7ab6a63bd..ccfab4c57f 100644 --- a/cpp/src/utilities/timer.hpp +++ b/cpp/src/utilities/timer.hpp @@ -34,7 +34,21 @@ class timer_t { elapsed_time()); } - bool check_time_limit() const noexcept { return elapsed_time() >= time_limit; } + bool check_time_limit(const char* caller = __builtin_FUNCTION(), + const char* file = __builtin_FILE(), + int line = __builtin_LINE()) const noexcept + { + bool elapsed = elapsed_time() >= time_limit; + // if (elapsed) { + // printf("************ TIME LIMIT (%.2gs) REACHED BY %s:%d: %s() ***\n", + // time_limit, + // file, + // line, + // caller); + // //__builtin_trap(); + // } + return elapsed; + } bool check_half_time() const noexcept { return elapsed_time() >= time_limit / 2; } diff --git a/cpp/src/utilities/work_limit_context.hpp b/cpp/src/utilities/work_limit_context.hpp index c75a37b818..55edee85b5 100644 --- a/cpp/src/utilities/work_limit_context.hpp +++ b/cpp/src/utilities/work_limit_context.hpp @@ -17,30 +17,117 @@ #pragma once #include +#include +#include +#include +#include +#include #include #include +#include +#include +#include "producer_sync.hpp" #include "timer.hpp" #include "work_unit_scheduler.hpp" namespace cuopt { +inline double read_work_unit_scale_env_or_default(const char* env_name, double default_value) +{ + const char* env_value = std::getenv(env_name); + if (env_value == nullptr || env_value[0] == '\0') { return default_value; } + + errno = 0; + char* end_ptr = nullptr; + const double parsed_value = std::strtod(env_value, &end_ptr); + const bool valid_value = errno == 0 && end_ptr != env_value && *end_ptr == '\0' && + std::isfinite(parsed_value) && parsed_value > 0.0; + cuopt_assert(valid_value, "Invalid work-unit scale env var"); + return parsed_value; +} + struct work_limit_context_t { double global_work_units_elapsed{0.0}; double total_sync_time{0.0}; // Total time spent waiting at sync barriers (seconds) bool deterministic{false}; work_unit_scheduler_t* scheduler{nullptr}; + producer_sync_t* producer_sync{nullptr}; std::string name; + std::unique_ptr> producer_work_units_elapsed{ + std::make_unique>(0.0)}; + double producer_progress_scale{ + read_work_unit_scale_env_or_default("CUOPT_GPU_HEUR_WORK_UNIT_SCALE", 1.0)}; + double work_unit_scale{1.0}; work_limit_context_t(const std::string& name) : name(name) {} + work_limit_context_t(const work_limit_context_t&) = delete; + work_limit_context_t& operator=(const work_limit_context_t&) = delete; + work_limit_context_t(work_limit_context_t&&) = default; + work_limit_context_t& operator=(work_limit_context_t&&) = default; + + double current_work() const noexcept { return global_work_units_elapsed; } + + double current_producer_work() const noexcept + { + double result = current_work() * producer_progress_scale; + if (!std::isfinite(result)) { + CUOPT_LOG_WARN("current_producer_work non-finite: %g (work=%g scale=%g) ctx=%s", + result, + current_work(), + producer_progress_scale, + name.c_str()); + } + return result; + } + + std::atomic* producer_progress_ptr() noexcept + { + return producer_work_units_elapsed.get(); + } + + void attach_producer_sync(producer_sync_t* producer_sync_) + { + producer_sync = producer_sync_; + producer_work_units_elapsed->store(current_producer_work(), std::memory_order_release); + if (work_unit_scale != 1.0) { + CUOPT_DETERMINISM_LOG("[%s] Using work-unit scale %f", name.c_str(), work_unit_scale); + } + } + + void detach_producer_sync() noexcept { producer_sync = nullptr; } + + void set_current_work(double total_work, bool notify_producer = true) + { + if (!deterministic) return; + if (!std::isfinite(total_work)) { + CUOPT_LOG_WARN("set_current_work non-finite: %g (prev=%g) ctx=%s", + total_work, + global_work_units_elapsed, + name.c_str()); + } + cuopt_assert(total_work + 1e-12 >= global_work_units_elapsed, + "Deterministic work progress must be monotonic"); + global_work_units_elapsed = total_work; + producer_work_units_elapsed->store(current_producer_work(), std::memory_order_release); + if (notify_producer && producer_sync != nullptr) { producer_sync->notify_progress(); } + } + void record_work_sync_on_horizon(double work) { if (!deterministic) return; - global_work_units_elapsed += work; - if (scheduler) { scheduler->on_work_recorded(*this, global_work_units_elapsed); } + cuopt_assert(std::isfinite(work), "Recorded work must be finite"); + cuopt_assert(work >= 0.0, "Recorded work must be non-negative"); + const double scaled_work = work * work_unit_scale; + const double total_work = global_work_units_elapsed + scaled_work; + set_current_work(total_work, false); + if (scheduler) { scheduler->on_work_recorded(*this, total_work); } + if (producer_sync != nullptr) { producer_sync->notify_progress(); } } + + void record_work(double work) { record_work_sync_on_horizon(work); } }; } // namespace cuopt diff --git a/cpp/src/utilities/work_limit_timer.hpp b/cpp/src/utilities/work_limit_timer.hpp new file mode 100644 index 0000000000..801a3e5ee9 --- /dev/null +++ b/cpp/src/utilities/work_limit_timer.hpp @@ -0,0 +1,11 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ +#pragma once + +// work_limit_timer_t is now an alias for termination_checker_t. +// This header exists for backward compatibility. +#include "termination_checker.hpp" diff --git a/cpp/src/utilities/work_unit_scheduler.cpp b/cpp/src/utilities/work_unit_scheduler.cpp index b0e5c5f12f..5dc798ddb3 100644 --- a/cpp/src/utilities/work_unit_scheduler.cpp +++ b/cpp/src/utilities/work_unit_scheduler.cpp @@ -29,7 +29,8 @@ namespace cuopt { -work_unit_scheduler_t::work_unit_scheduler_t(double sync_interval) : sync_interval_(sync_interval) +work_unit_scheduler_t::work_unit_scheduler_t(double sync_interval, double base) + : sync_interval_(sync_interval), base_(base) { } @@ -79,15 +80,15 @@ void work_unit_scheduler_t::wait_for_next_sync(work_limit_context_t& ctx) { if (is_shutdown()) return; - double next_sync = current_sync_target(); - ctx.global_work_units_elapsed = next_sync; + double next_sync = current_sync_target(); + ctx.set_current_work(next_sync, false); wait_at_sync_point(ctx, next_sync); } double work_unit_scheduler_t::current_sync_target() const { if (sync_interval_ <= 0) return std::numeric_limits::infinity(); - return (barrier_generation_ + 1) * sync_interval_; + return base_ + (barrier_generation_ + 1) * sync_interval_; } void work_unit_scheduler_t::wait_at_sync_point(work_limit_context_t& ctx, double sync_target) diff --git a/cpp/src/utilities/work_unit_scheduler.hpp b/cpp/src/utilities/work_unit_scheduler.hpp index 84e7b95fab..286fe74686 100644 --- a/cpp/src/utilities/work_unit_scheduler.hpp +++ b/cpp/src/utilities/work_unit_scheduler.hpp @@ -26,7 +26,7 @@ struct work_limit_context_t; class work_unit_scheduler_t { public: - explicit work_unit_scheduler_t(double sync_interval = 5.0); + explicit work_unit_scheduler_t(double sync_interval = 5.0, double base = 0.0); void set_sync_interval(double interval); double get_sync_interval() const { return sync_interval_; } @@ -54,6 +54,7 @@ class work_unit_scheduler_t { void wait_at_sync_point(work_limit_context_t& ctx, double sync_target); double sync_interval_; + double base_; std::vector> contexts_; size_t barrier_generation_{0}; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a73a3361ce..fe9dd4fde9 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,4 +1,4 @@ -# cmake-format: off +# cmake-format: off # SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # cmake-format: on @@ -33,6 +33,40 @@ endif() set(CUOPT_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +if (EXISTS "${CUDAToolkit_LIBRARY_ROOT}/extras/CUPTI/lib64") + # NVIDIA installer layout: + set(cuopt_cupti_root "${CUDAToolkit_LIBRARY_ROOT}/extras/CUPTI") +else() + # Ubuntu package layout: + set(cuopt_cupti_root "${CUDAToolkit_LIBRARY_ROOT}") +endif() +message(STATUS "cuopt_cupti_root = ${cuopt_cupti_root}") + +# The CUPTI targets in FindCUDAToolkit are broken: +# - The dll locations are not specified +# - Dependent libraries nvperf_* are not linked. +# So we create our own targets: +function(cuopt_add_cupti_dep dep_name) + string(TOLOWER ${dep_name} dep_name_lower) + string(TOUPPER ${dep_name} dep_name_upper) + + add_library(cuopt::${dep_name_lower} SHARED IMPORTED) + + find_library(CUOPT_${dep_name_upper}_LIBRARY ${dep_name_lower} REQUIRED + DOC "The full path to lib${dep_name_lower}.so from the CUDA Toolkit." + HINTS "${cuopt_cupti_root}/lib64" "${cuopt_cupti_root}/lib" + ) + mark_as_advanced(CUOPT_${dep_name_upper}_LIBRARY) + + set_target_properties(cuopt::${dep_name_lower} PROPERTIES + IMPORTED_LOCATION "${CUOPT_${dep_name_upper}_LIBRARY}" + ) +endfunction() + +#cuopt_add_cupti_dep(nvperf_target) +#cuopt_add_cupti_dep(nvperf_host) +#cuopt_add_cupti_dep(cupti) + # ################################################################ ------------------------------------------------------------------ function(ConfigureTest CMAKE_TEST_NAME) add_executable(${CMAKE_TEST_NAME} ${ARGN}) diff --git a/cpp/tests/mip/CMakeLists.txt b/cpp/tests/mip/CMakeLists.txt index f2cf53ff6c..584bbc243b 100644 --- a/cpp/tests/mip/CMakeLists.txt +++ b/cpp/tests/mip/CMakeLists.txt @@ -40,15 +40,21 @@ ConfigureTest(PRESOLVE_TEST ${CMAKE_CURRENT_SOURCE_DIR}/presolve_test.cu ) # Disable for now -# ConfigureTest(FEASIBILITY_JUMP_TEST -# ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump_tests.cu -# ) +ConfigureTest(FEASIBILITY_JUMP_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump_tests.cu +) ConfigureTest(MIP_TERMINATION_STATUS_TEST ${CMAKE_CURRENT_SOURCE_DIR}/termination_test.cu ) ConfigureTest(DETERMINISM_TEST ${CMAKE_CURRENT_SOURCE_DIR}/determinism_test.cu ) +ConfigureTest(LOCAL_SEARCH_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/local_search_test.cu +) +ConfigureTest(DIVERSITY_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/diversity_test.cu +) ConfigureTest(HEURISTICS_HYPER_PARAMS_TEST ${CMAKE_CURRENT_SOURCE_DIR}/heuristics_hyper_params_test.cu ) diff --git a/cpp/tests/mip/determinism_test.cu b/cpp/tests/mip/determinism_test.cu index dcd6f7749d..53b1066fa8 100644 --- a/cpp/tests/mip/determinism_test.cu +++ b/cpp/tests/mip/determinism_test.cu @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,7 @@ #include #include +#include #include #include @@ -45,6 +47,117 @@ void expect_solutions_bitwise_equal(const mip_solution_t& sol1, } } +struct callback_solution_t { + std::vector assignment; + double objective{}; + double solution_bound{}; + internals::mip_solution_origin_t origin{internals::mip_solution_origin_t::UNKNOWN}; +}; + +class first_n_get_solution_callback_t : public cuopt::internals::get_solution_callback_ext_t { + public: + first_n_get_solution_callback_t(std::vector& solutions_in, + int n_variables_, + size_t max_solutions_, + void* expected_user_data_) + : solutions(solutions_in), + expected_user_data(expected_user_data_), + n_variables(n_variables_), + max_solutions(max_solutions_) + { + } + + void get_solution(void* data, + void* cost, + void* solution_bound, + const internals::mip_solution_callback_info_t* callback_info, + void* user_data) override + { + EXPECT_EQ(user_data, expected_user_data); + ASSERT_NE(callback_info, nullptr); + n_calls++; + + auto assignment_ptr = static_cast(data); + auto objective_ptr = static_cast(cost); + auto solution_bound_ptr = static_cast(solution_bound); + EXPECT_FALSE(std::isnan(objective_ptr[0])); + EXPECT_FALSE(std::isnan(solution_bound_ptr[0])); + + if (solutions.size() >= max_solutions) { return; } + + callback_solution_t callback_solution; + callback_solution.assignment.assign(assignment_ptr, assignment_ptr + n_variables); + callback_solution.objective = objective_ptr[0]; + callback_solution.solution_bound = solution_bound_ptr[0]; + callback_solution.origin = (internals::mip_solution_origin_t)callback_info->origin; + solutions.push_back(std::move(callback_solution)); + } + + std::vector& solutions; + void* expected_user_data; + int n_calls{0}; + int n_variables; + size_t max_solutions; +}; + +bool is_gpu_callback_origin(internals::mip_solution_origin_t origin) +{ + switch (origin) { + case internals::mip_solution_origin_t::FEASIBILITY_JUMP: + case internals::mip_solution_origin_t::LOCAL_SEARCH: + case internals::mip_solution_origin_t::QUICK_FEASIBLE: + case internals::mip_solution_origin_t::LP_ROUNDING: + case internals::mip_solution_origin_t::RECOMBINATION: + case internals::mip_solution_origin_t::SUB_MIP: return true; + default: return false; + } +} + +size_t count_callbacks_with_origin(const std::vector& callbacks, + internals::mip_solution_origin_t origin) +{ + return std::count_if(callbacks.begin(), + callbacks.end(), + [origin](const callback_solution_t& sol) { return sol.origin == origin; }); +} + +size_t count_gpu_callbacks(const std::vector& callbacks) +{ + return std::count_if(callbacks.begin(), callbacks.end(), [](const callback_solution_t& sol) { + return is_gpu_callback_origin(sol.origin); + }); +} + +size_t count_branch_and_bound_callbacks(const std::vector& callbacks) +{ + return std::count_if(callbacks.begin(), callbacks.end(), [](const callback_solution_t& sol) { + return sol.origin == internals::mip_solution_origin_t::BRANCH_AND_BOUND_NODE || + sol.origin == internals::mip_solution_origin_t::BRANCH_AND_BOUND_DIVING; + }); +} + +void expect_callback_prefixes_bitwise_equal(const std::vector& lhs, + const std::vector& rhs, + size_t prefix_size, + const std::string& label) +{ + ASSERT_GE(lhs.size(), prefix_size) << label << "Left callback prefix missing entries"; + ASSERT_GE(rhs.size(), prefix_size) << label << "Right callback prefix missing entries"; + for (size_t i = 0; i < prefix_size; ++i) { + EXPECT_EQ(lhs[i].objective, rhs[i].objective) + << label << "Callback objective differs at index " << i; + EXPECT_EQ(lhs[i].solution_bound, rhs[i].solution_bound) + << label << "Callback bound differs at index " << i; + EXPECT_EQ(lhs[i].origin, rhs[i].origin) << label << "Callback origin differs at index " << i; + ASSERT_EQ(lhs[i].assignment.size(), rhs[i].assignment.size()) + << label << "Callback assignment size differs at index " << i; + for (size_t j = 0; j < lhs[i].assignment.size(); ++j) { + EXPECT_EQ(lhs[i].assignment[j], rhs[i].assignment[j]) + << label << "Callback assignment differs at callback " << i << " variable " << j; + } + } +} + } // namespace class DeterministicBBTest : public ::testing::Test { @@ -61,9 +174,9 @@ TEST_F(DeterministicBBTest, reproducible_objective) mip_solver_settings_t settings; settings.time_limit = 60.0; - settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_BB; settings.num_cpu_threads = 8; - settings.work_limit = 4; + settings.work_limit = 2; // Ensure seed is positive int32_t auto seed = std::random_device{}() & 0x7fffffff; @@ -93,7 +206,7 @@ TEST_F(DeterministicBBTest, reproducible_infeasibility) mip_solver_settings_t settings; settings.time_limit = 60.0; - settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_BB; settings.num_cpu_threads = 8; settings.work_limit = 100; // High enough to fully explore @@ -125,7 +238,7 @@ TEST_F(DeterministicBBTest, reproducible_high_contention) mip_solver_settings_t settings; settings.time_limit = 60.0; - settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_BB; settings.num_cpu_threads = 128; // High thread count to stress contention settings.work_limit = 1; @@ -160,7 +273,7 @@ TEST_F(DeterministicBBTest, reproducible_solution_vector) mip_solver_settings_t settings; settings.time_limit = 60.0; - settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_BB; settings.num_cpu_threads = 8; settings.work_limit = 2; @@ -177,6 +290,117 @@ TEST_F(DeterministicBBTest, reproducible_solution_vector) expect_solutions_bitwise_equal(solution1, solution2, handle_); } +TEST_F(DeterministicBBTest, deterministic_callback_sequence_reproducible_with_gpu_pipeline) +{ + constexpr size_t callback_compare_count = 5; + constexpr size_t callback_capture_limit = 32; + constexpr size_t min_gpu_callback_count = 3; + + auto path = make_path_absolute("/mip/50v-10.mps"); + auto problem = mps_parser::parse_mps(path, false); + handle_.sync_stream(); + + mip_solver_settings_t settings; + settings.time_limit = 360.0; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + settings.num_cpu_threads = 2; + settings.work_limit = 4; + settings.bb_work_unit_scale = 2.0; + settings.gpu_heur_work_unit_scale = 1.0; + settings.cpufj_work_unit_scale = 1.0; + + auto seed = std::random_device{}() & 0x7fffffff; + std::cout << "Tested with seed " << seed << "\n"; + settings.seed = seed; + + const int n_variables = problem.get_variable_lower_bounds().size(); + int user_data = 7; + + std::vector callbacks_run1; + first_n_get_solution_callback_t callback_run1( + callbacks_run1, n_variables, callback_capture_limit, &user_data); + auto settings_run1 = settings; + settings_run1.set_mip_callback(&callback_run1, &user_data); + cuopt::seed_generator::set_seed(seed); + auto solution1 = solve_mip(&handle_, problem, settings_run1); + + std::vector callbacks_run2; + first_n_get_solution_callback_t callback_run2( + callbacks_run2, n_variables, callback_capture_limit, &user_data); + auto settings_run2 = settings; + settings_run2.set_mip_callback(&callback_run2, &user_data); + cuopt::seed_generator::set_seed(seed); + auto solution2 = solve_mip(&handle_, problem, settings_run2); + + EXPECT_EQ(solution1.get_termination_status(), solution2.get_termination_status()); + EXPECT_GE(callback_run1.n_calls, (int)callback_compare_count); + EXPECT_GE(callback_run2.n_calls, (int)callback_compare_count); + ASSERT_GE(callbacks_run1.size(), callback_compare_count); + ASSERT_GE(callbacks_run2.size(), callback_compare_count); + + EXPECT_GE(count_gpu_callbacks(callbacks_run1), min_gpu_callback_count); + EXPECT_GE(count_gpu_callbacks(callbacks_run2), min_gpu_callback_count); + + expect_callback_prefixes_bitwise_equal( + callbacks_run1, callbacks_run2, callback_compare_count, "Deterministic callback run 1 vs 2: "); +} + +class DeterministicGpuHeuristicsInstanceTest : public ::testing::TestWithParam { + protected: + raft::handle_t handle_; +}; + +TEST_P(DeterministicGpuHeuristicsInstanceTest, reproducible_with_gpu_heuristics) +{ + auto path = make_path_absolute(GetParam()); + auto problem = mps_parser::parse_mps(path, false); + handle_.sync_stream(); + + mip_solver_settings_t settings; + settings.time_limit = 60.0; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + settings.num_cpu_threads = 8; + settings.work_limit = 5; + + auto seed = std::random_device{}() & 0x7fffffff; + std::cout << "Tested with seed " << seed << "\n"; + settings.seed = seed; + + cuopt::seed_generator::set_seed(seed); + auto solution1 = solve_mip(&handle_, problem, settings); + cuopt::seed_generator::set_seed(seed); + auto solution2 = solve_mip(&handle_, problem, settings); + cuopt::seed_generator::set_seed(seed); + auto solution3 = solve_mip(&handle_, problem, settings); + + EXPECT_EQ(solution1.get_termination_status(), solution2.get_termination_status()); + EXPECT_EQ(solution1.get_termination_status(), solution3.get_termination_status()); + + EXPECT_DOUBLE_EQ(solution1.get_objective_value(), solution2.get_objective_value()); + EXPECT_DOUBLE_EQ(solution1.get_objective_value(), solution3.get_objective_value()); + + EXPECT_DOUBLE_EQ(solution1.get_solution_bound(), solution2.get_solution_bound()); + EXPECT_DOUBLE_EQ(solution1.get_solution_bound(), solution3.get_solution_bound()); + + expect_solutions_bitwise_equal(solution1, solution2, handle_, "GPU heur run 1 vs 2: "); + expect_solutions_bitwise_equal(solution1, solution3, handle_, "GPU heur run 1 vs 3: "); +} + +INSTANTIATE_TEST_SUITE_P( + DeterministicGpuHeuristics, + DeterministicGpuHeuristicsInstanceTest, + ::testing::Values(std::string("/mip/gen-ip054.mps"), + std::string("/mip/pk1.mps"), + // std::string("/mip/sct2.mps"), + // std::string("/mip/thor50dday.mps"), + std::string("/mip/neos5.mps")), + [](const ::testing::TestParamInfo& info) { + std::string name = info.param.substr(info.param.rfind('/') + 1); + name = name.substr(0, name.rfind('.')); + std::replace(name.begin(), name.end(), '-', '_'); + return name; + }); + // Parameterized test for different problem instances class DeterministicBBInstanceTest : public ::testing::TestWithParam> { @@ -227,9 +451,10 @@ INSTANTIATE_TEST_SUITE_P( DeterministicBB, DeterministicBBInstanceTest, ::testing::Values( - // Instance, threads, time_limit + // Instance, threads, time_limit, work limiy std::make_tuple("/mip/gen-ip054.mps", 4, 60.0, 4), std::make_tuple("/mip/swath1.mps", 8, 60.0, 4), + std::make_tuple("/mip/50v-10.mps", 8, 60.0, 4), std::make_tuple("/mip/gen-ip054.mps", 128, 120.0, 1), std::make_tuple("/mip/bb_optimality.mps", 4, 60.0, 4), std::make_tuple("/mip/neos5.mps", 16, 60.0, 1), diff --git a/cpp/tests/mip/determinism_utils.cuh b/cpp/tests/mip/determinism_utils.cuh new file mode 100644 index 0000000000..b4e0d4e01e --- /dev/null +++ b/cpp/tests/mip/determinism_utils.cuh @@ -0,0 +1,77 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include +#include + +#include + +namespace cuopt::linear_programming::test { + +static __global__ void spin_kernel(int* flag, unsigned long long timeout_clocks = 10000000) +{ + cuda::atomic_ref flag_ref(*flag); + + long long int start_clock, sample_clock; + start_clock = clock64(); + + while (flag_ref.load() == 0) { + sample_clock = clock64(); + + if (sample_clock - start_clock > timeout_clocks) { break; } + } +} + +static void launch_spin_kernel_stream_thread(rmm::cuda_stream_view stream_view, int* flag) +{ + while (true) { + int blocks = rand() % 64 + 1; + int threads = rand() % 1024 + 1; + spin_kernel<<>>(flag); + cudaStreamSynchronize(stream_view); + if (host_copy(flag, 1, stream_view)[0] != 0) { break; } + std::this_thread::sleep_for(std::chrono::milliseconds(rand() % 1000 + 1)); + } +} + +class spin_stream_raii_t { + public: + spin_stream_raii_t() + : flag(0, stream), spin_thread(launch_spin_kernel_stream_thread, stream.view(), flag.data()) + { + } + + ~spin_stream_raii_t() + { + int one = 1; + flag.set_value_async(one, stream); + spin_thread.join(); + } + + private: + rmm::cuda_stream stream; + rmm::device_scalar flag; + std::thread spin_thread; +}; + +} // namespace cuopt::linear_programming::test diff --git a/cpp/tests/mip/diversity_test.cu b/cpp/tests/mip/diversity_test.cu new file mode 100644 index 0000000000..c5e1c0842d --- /dev/null +++ b/cpp/tests/mip/diversity_test.cu @@ -0,0 +1,395 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../linear_programming/utilities/pdlp_test_utilities.cuh" +#include "determinism_utils.cuh" +#include "mip_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::test { + +void init_handler(const raft::handle_t* handle_ptr) +{ + // Init cuBlas / cuSparse context here to avoid having it during solving time + RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode( + handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream())); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode( + handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); +} + +static void setup_device_symbols(rmm::cuda_stream_view stream_view) { (void)stream_view; } + +static uint32_t test_full_run_determinism(std::string path, + unsigned long seed = std::random_device{}(), + float work_limit = 10.0f) +{ + const raft::handle_t handle_{}; + + cuopt::mps_parser::mps_data_model_t mps_problem = + cuopt::mps_parser::parse_mps(path, false); + handle_.sync_stream(); + auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem); + problem_checking_t::check_problem_representation(op_problem); + + init_handler(op_problem.get_handle_ptr()); + // run the problem constructor of MIP, so that we do bounds standardization + detail::problem_t problem(op_problem); + problem.deterministic = true; + problem.preprocess_problem(); + + setup_device_symbols(op_problem.get_handle_ptr()->get_stream()); + + auto settings = mip_solver_settings_t{}; + settings.time_limit = 3000.; + settings.work_limit = work_limit; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS; + settings.heuristics_only = true; + auto timer = cuopt::termination_checker_t(3000.0, cuopt::termination_checker_t::root_tag_t{}); + detail::mip_solver_t solver(problem, settings, timer); + problem.tolerances = settings.get_tolerances(); + + detail::diversity_manager_t diversity_manager(solver.context); + solver.context.gpu_heur_loop.deterministic = true; + diversity_manager.timer = + work_limit_timer_t(solver.context.gpu_heur_loop, settings.work_limit, timer); + diversity_manager.run_solver(); + + std::vector hashes; + auto pop = diversity_manager.get_population_pointer(); + for (const auto& sol : pop->population_to_vector()) { + hashes.push_back(sol.get_hash()); + } + + uint32_t final_hash = detail::compute_hash(hashes); + printf("%s: final hash: 0x%x, pop size %d\n", + path.c_str(), + final_hash, + (int)pop->population_to_vector().size()); + return final_hash; +} + +static uint32_t test_initial_solution_determinism(std::string path, + unsigned long seed = std::random_device{}()) +{ + const raft::handle_t handle_{}; + + cuopt::mps_parser::mps_data_model_t mps_problem = + cuopt::mps_parser::parse_mps(path, false); + handle_.sync_stream(); + auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem); + problem_checking_t::check_problem_representation(op_problem); + + init_handler(op_problem.get_handle_ptr()); + // run the problem constructor of MIP, so that we do bounds standardization + detail::problem_t problem(op_problem); + problem.deterministic = true; + problem.preprocess_problem(); + + setup_device_symbols(op_problem.get_handle_ptr()->get_stream()); + + auto settings = mip_solver_settings_t{}; + settings.time_limit = 3000.; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS; + settings.heuristics_only = true; + auto timer = cuopt::termination_checker_t(3000.0, cuopt::termination_checker_t::root_tag_t{}); + detail::mip_solver_t solver(problem, settings, timer); + problem.tolerances = settings.get_tolerances(); + + detail::diversity_manager_t diversity_manager(solver.context); + solver.context.diversity_manager_ptr = &diversity_manager; + work_limit_context_t work_limit_context("DiversityManager"); + work_limit_context.deterministic = true; + diversity_manager.timer = work_limit_timer_t(work_limit_context, 60000, timer); + diversity_manager.diversity_config.initial_solution_only = true; + diversity_manager.run_solver(); + + std::vector hashes; + auto pop = diversity_manager.get_population_pointer(); + for (const auto& sol : pop->population_to_vector()) { + hashes.push_back(sol.get_hash()); + } + + uint32_t final_hash = detail::compute_hash(hashes); + printf("%s: final hash: 0x%x, pop size %d\n", + path.c_str(), + final_hash, + (int)pop->population_to_vector().size()); + return final_hash; +} + +static uint32_t test_recombiners_determinism(std::string path, + unsigned long seed = std::random_device{}()) +{ + const raft::handle_t handle_{}; + + cuopt::mps_parser::mps_data_model_t mps_problem = + cuopt::mps_parser::parse_mps(path, false); + handle_.sync_stream(); + auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem); + problem_checking_t::check_problem_representation(op_problem); + + init_handler(op_problem.get_handle_ptr()); + // run the problem constructor of MIP, so that we do bounds standardization + detail::problem_t problem(op_problem); + problem.deterministic = true; + problem.preprocess_problem(); + + setup_device_symbols(op_problem.get_handle_ptr()->get_stream()); + + auto settings = mip_solver_settings_t{}; + settings.time_limit = 3000.; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC_GPU_HEURISTICS; + settings.heuristics_only = true; + auto timer = cuopt::termination_checker_t(3000.0, cuopt::termination_checker_t::root_tag_t{}); + detail::mip_solver_t solver(problem, settings, timer); + problem.tolerances = settings.get_tolerances(); + + detail::diversity_manager_t diversity_manager(solver.context); + solver.context.diversity_manager_ptr = &diversity_manager; + work_limit_context_t work_limit_context("DiversityManager"); + work_limit_context.deterministic = true; + diversity_manager.timer = work_limit_timer_t(work_limit_context, 60000, timer); + diversity_manager.diversity_config.dry_run = true; + diversity_manager.run_solver(); + + // Generate a population by running FJ on random starting points + // recombine a few solutions, observe the output + for (int i = diversity_manager.population.current_size(); i < 3; ++i) { + detail::solution_t random_initial_solution(problem); + random_initial_solution.assign_random_within_bounds(); + detail::fj_settings_t fj_settings; + fj_settings.feasibility_run = false; + fj_settings.iteration_limit = 1000 + i * 100; + fj_settings.seed = seed + i; + auto solution = run_fj(problem, + fj_settings, + fj_tweaks_t{}, + random_initial_solution.get_host_assignment(), + CUOPT_MODE_DETERMINISTIC) + .solution; + printf("population %d hash: 0x%x\n", i, solution.get_hash()); + diversity_manager.population.add_solution(std::move(solution), + internals::mip_solution_origin_t::FEASIBILITY_JUMP); + } + + auto pop_vector = diversity_manager.get_population_pointer()->population_to_vector(); + int pop_size = std::min(6, (int)pop_vector.size()); + + std::vector hashes; + + static std::map, uint32_t> hash_map; + + for (auto recombiner : {detail::recombiner_enum_t::LINE_SEGMENT, + detail::recombiner_enum_t::BOUND_PROP, + detail::recombiner_enum_t::FP}) { + for (int i = 1; i < pop_size; i++) { + for (int j = i + 1; j < pop_size; j++) { + printf("recombining %d and %d w/ recombiner %s\n", + i, + j, + detail::all_recombine_stats::recombiner_labels[(int)recombiner]); + auto [offspring, success] = + diversity_manager.recombine(pop_vector[i], pop_vector[j], recombiner); + auto offspring_hash = offspring.get_hash(); + printf("for %d,%d: offspring hash: 0x%x, parent 1 hash: 0x%x, parent 2 hash: 0x%x\n", + i, + j, + offspring_hash, + pop_vector[i].get_hash(), + pop_vector[j].get_hash()); + if (hash_map.find(std::make_tuple(path, i, j, recombiner)) == hash_map.end()) { + hash_map[std::make_tuple(path, i, j, recombiner)] = offspring_hash; + } else { + if (hash_map[std::make_tuple(path, i, j, recombiner)] != offspring_hash) { + printf("%s: hash mismatch for %d,%d: %d != %d\n", + path.c_str(), + i, + j, + hash_map[std::make_tuple(path, i, j, recombiner)], + offspring_hash); + ADD_FAILURE() << "hash mismatch"; + } + } + hashes.push_back(offspring_hash); + } + } + } + return detail::compute_hash(hashes); + + auto pop = diversity_manager.get_population_pointer(); + for (const auto& sol : pop->population_to_vector()) { + hashes.push_back(sol.get_hash()); + } + + uint32_t final_hash = detail::compute_hash(hashes); + printf("%s: final hash: 0x%x, pop size %d\n", + path.c_str(), + final_hash, + (int)pop->population_to_vector().size()); + return final_hash; +} + +class DiversityTestParams : public testing::TestWithParam> {}; + +TEST_P(DiversityTestParams, recombiners_deterministic) +{ + // cuopt::init_logger_t log("", true); + cuopt::default_logger().set_pattern("[%n] [%-6l] %v"); + cuopt::default_logger().set_level(rapids_logger::level_enum::debug); + cuopt::default_logger().flush_on(rapids_logger::level_enum::debug); + + spin_stream_raii_t spin_stream_1; + spin_stream_raii_t spin_stream_2; + + auto test_instance = std::get<0>(GetParam()); + std::cout << "Running: " << test_instance << std::endl; + int seed = + std::getenv("CUOPT_SEED") ? std::stoi(std::getenv("CUOPT_SEED")) : std::random_device{}(); + std::cerr << "Tested with seed " << seed << "\n"; + auto path = make_path_absolute(test_instance); + test_instance = std::getenv("CUOPT_INSTANCE") ? std::getenv("CUOPT_INSTANCE") : test_instance; + uint32_t gold_hash = 0; + for (int i = 0; i < 2; ++i) { + cuopt::seed_generator::set_seed(seed); + std::cout << "Running " << test_instance << " " << i << std::endl; + std::cout << "-------------------------------------------------------------\n"; + auto hash = test_recombiners_determinism(path, seed); + if (i == 0) { + gold_hash = hash; + std::cout << "Gold hash: " << gold_hash << std::endl; + } else { + ASSERT_EQ(hash, gold_hash); + } + } +} + +TEST_P(DiversityTestParams, initial_solution_deterministic) +{ + cuopt::default_logger().set_pattern("[%n] [%-6l] %v"); + + spin_stream_raii_t spin_stream_1; + spin_stream_raii_t spin_stream_2; + + auto test_instance = std::get<0>(GetParam()); + std::cout << "Running: " << test_instance << std::endl; + int seed = + std::getenv("CUOPT_SEED") ? std::stoi(std::getenv("CUOPT_SEED")) : std::random_device{}(); + std::cerr << "Tested with seed " << seed << "\n"; + auto path = make_path_absolute(test_instance); + test_instance = std::getenv("CUOPT_INSTANCE") ? std::getenv("CUOPT_INSTANCE") : test_instance; + uint32_t gold_hash = 0; + for (int i = 0; i < 2; ++i) { + cuopt::seed_generator::set_seed(seed); + std::cout << "Running " << test_instance << " " << i << std::endl; + std::cout << "-------------------------------------------------------------\n"; + auto hash = test_initial_solution_determinism(path, seed); + if (i == 0) { + gold_hash = hash; + std::cout << "Gold hash: " << gold_hash << std::endl; + } else { + ASSERT_EQ(hash, gold_hash); + } + } +} + +// Disabled as it takes too long to run in CI and overlaps with other determinism full run tests. +TEST_P(DiversityTestParams, DISABLED_full_run_deterministic) +{ + cuopt::init_logger_t log("", true); + // cuopt::default_logger().set_pattern("[%n] [%-6l] %v"); + cuopt::default_logger().set_level(rapids_logger::level_enum::debug); + cuopt::default_logger().flush_on(rapids_logger::level_enum::debug); + + spin_stream_raii_t spin_stream_1; + spin_stream_raii_t spin_stream_2; + + auto test_instance = std::get<0>(GetParam()); + const float work_limit = std::get<1>(GetParam()); + std::cout << "Running: " << test_instance << std::endl; + int seed = + std::getenv("CUOPT_SEED") ? std::stoi(std::getenv("CUOPT_SEED")) : std::random_device{}(); + std::cerr << "Tested with seed " << seed << "\n"; + auto path = make_path_absolute(test_instance); + if (std::getenv("CUOPT_INSTANCE")) { + test_instance = std::getenv("CUOPT_INSTANCE"); + path = make_path_absolute(test_instance); + } + uint32_t gold_hash = 0; + for (int i = 0; i < 4; ++i) { + cuopt::seed_generator::set_seed(seed); + std::cout << "Running " << test_instance << " " << i << std::endl; + std::cout << "-------------------------------------------------------------\n"; + auto hash = test_full_run_determinism(path, seed, work_limit); + if (i == 0) { + gold_hash = hash; + std::cout << "Gold hash: " << gold_hash << std::endl; + } else { + ASSERT_EQ(hash, gold_hash); + } + } +} + +INSTANTIATE_TEST_SUITE_P(DiversityTest, + DiversityTestParams, + testing::Values( + // std::make_tuple("mip/gen-ip054.mps", 5.0f), + // std::make_tuple("mip/pk1.mps", 5.0f), + std::make_tuple("mip/neos5.mps", 5.0f), + std::make_tuple("mip/gen-ip054.mps", 5.0f), + std::make_tuple("mip/pk1.mps", 5.0f), + // std::make_tuple("uccase9.mps"), + // std::make_tuple("mip/neos5.mps", 5.0f), + std::make_tuple("mip/50v-10.mps", 5.0f) + // std::make_tuple("mip/rmatr200-p5.mps", 5.0f) + )); + +} // namespace cuopt::linear_programming::test diff --git a/cpp/tests/mip/feasibility_jump_tests.cu b/cpp/tests/mip/feasibility_jump_tests.cu index 4e8a518522..05b2c03fec 100644 --- a/cpp/tests/mip/feasibility_jump_tests.cu +++ b/cpp/tests/mip/feasibility_jump_tests.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -46,28 +47,23 @@ void init_handler(const raft::handle_t* handle_ptr) handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); } -struct fj_tweaks_t { - double objective_weight = 0; -}; - -struct fj_state_t { - detail::solution_t solution; - std::vector solution_vector; - int minimums; - double incumbent_objective; - double incumbent_violation; -}; - // Helper function to setup MIP solver and run FJ with given settings and initial solution -static fj_state_t run_fj(std::string test_instance, - const detail::fj_settings_t& fj_settings, - fj_tweaks_t tweaks = {}, - std::vector initial_solution = {}) +static fj_state_t run_fj_instance(std::string test_instance, + const detail::fj_settings_t& fj_settings, + fj_tweaks_t tweaks = {}, + std::vector initial_solution = {}, + int determinism_mode = CUOPT_MODE_DETERMINISTIC) { const raft::handle_t handle_{}; std::cout << "Running: " << test_instance << std::endl; auto path = cuopt::test::get_rapids_dataset_root_dir() + ("/mip/" + test_instance); + + if (std::getenv("CUOPT_INSTANCE")) { + path = make_path_absolute(std::getenv("CUOPT_INSTANCE")); + std::cout << "Using instance from CUOPT_INSTANCE: " << path << std::endl; + } + cuopt::mps_parser::mps_data_model_t mps_problem = cuopt::mps_parser::parse_mps(path, false); handle_.sync_stream(); @@ -78,37 +74,8 @@ static fj_state_t run_fj(std::string test_instance, // run the problem constructor of MIP, so that we do bounds standardization detail::problem_t problem(op_problem); problem.preprocess_problem(); - detail::mip_scaling_strategy_t scaling(problem); - - auto settings = mip_solver_settings_t{}; - settings.time_limit = 30.; - auto timer = cuopt::timer_t(30); - detail::mip_solver_t solver(problem, settings, scaling, timer); - - detail::solution_t solution(*solver.context.problem_ptr); - if (initial_solution.size() > 0) { - expand_device_copy(solution.assignment, initial_solution, solution.handle_ptr->get_stream()); - } else { - thrust::fill(solution.handle_ptr->get_thrust_policy(), - solution.assignment.begin(), - solution.assignment.end(), - 0.0); - } - solution.clamp_within_bounds(); - - detail::fj_t fj(solver.context, fj_settings); - fj.reset_weights(solution.handle_ptr->get_stream(), 1.); - fj.objective_weight.set_value_async(tweaks.objective_weight, solution.handle_ptr->get_stream()); - solution.handle_ptr->sync_stream(); - fj.solve(solution); - auto solution_vector = host_copy(solution.assignment, solution.handle_ptr->get_stream()); - - return {solution, - solution_vector, - fj.climbers[0]->local_minimums_reached.value(solution.handle_ptr->get_stream()), - fj.climbers[0]->incumbent_objective.value(solution.handle_ptr->get_stream()), - fj.climbers[0]->violation_score.value(solution.handle_ptr->get_stream())}; + return run_fj(problem, fj_settings, tweaks, initial_solution, determinism_mode); } // FJ had a bug causing objective/violation values to explode in magnitude in certain scenarios. @@ -118,12 +85,12 @@ static bool run_fj_check_no_obj_runoff(std::string test_instance) detail::fj_settings_t fj_settings; fj_settings.time_limit = 30.; fj_settings.mode = detail::fj_mode_t::EXIT_NON_IMPROVING; - fj_settings.n_of_minimums_for_exit = 20000 * 1000; + fj_settings.n_of_minimums_for_exit = 5000; fj_settings.update_weights = true; fj_settings.feasibility_run = false; fj_settings.iteration_limit = 20000; - auto state = run_fj(test_instance, fj_settings); + auto state = run_fj_instance(test_instance, fj_settings); // ensure that the objective and the violation in the FJ state are not too large (<1e60) EXPECT_LE(state.incumbent_violation, 1e60) << "FJ violation too large"; @@ -140,12 +107,13 @@ static bool run_fj_check_objective(std::string test_instance, int iter_limit, do detail::fj_settings_t fj_settings; fj_settings.time_limit = 30.; fj_settings.mode = detail::fj_mode_t::EXIT_NON_IMPROVING; - fj_settings.n_of_minimums_for_exit = 20000 * 1000; + fj_settings.n_of_minimums_for_exit = 5000; fj_settings.update_weights = true; fj_settings.feasibility_run = obj_target == +std::numeric_limits::infinity(); fj_settings.iteration_limit = iter_limit; - auto state = run_fj(test_instance, fj_settings); + auto state = + run_fj_instance(test_instance, fj_settings, fj_tweaks_t{}, {}, CUOPT_MODE_DETERMINISTIC); auto& solution = state.solution; CUOPT_LOG_DEBUG("%s: Solution generated with FJ: is_feasible %d, objective %g (raw %g)", @@ -167,12 +135,12 @@ static bool run_fj_check_feasible(std::string test_instance) detail::fj_settings_t fj_settings; fj_settings.time_limit = 30.; fj_settings.mode = detail::fj_mode_t::EXIT_NON_IMPROVING; - fj_settings.n_of_minimums_for_exit = 20000 * 1000; + fj_settings.n_of_minimums_for_exit = 5000; fj_settings.update_weights = true; fj_settings.feasibility_run = false; fj_settings.iteration_limit = 25000; - auto state = run_fj(test_instance, fj_settings); + auto state = run_fj_instance(test_instance, fj_settings); auto& solution = state.solution; bool previous_feasible = solution.get_feasible(); @@ -183,8 +151,8 @@ static bool run_fj_check_feasible(std::string test_instance) // again but with very large obj weight to force FJ into the infeasible region fj_tweaks_t tweaks; tweaks.objective_weight = 1e6; - auto new_state = run_fj(test_instance, fj_settings, tweaks, state.solution_vector); - auto& new_solution = new_state.solution; + auto new_state = run_fj_instance(test_instance, fj_settings, tweaks, state.solution_vector); + auto& new_solution = new_state.solution; CUOPT_LOG_DEBUG("%s: Solution generated with FJ: is_feasible %d, objective %g (raw %g)", test_instance.c_str(), @@ -199,63 +167,57 @@ static bool run_fj_check_feasible(std::string test_instance) return true; } -class MIPSolveParametricTest : public testing::TestWithParam> { -}; - -TEST_P(MIPSolveParametricTest, feasibility_jump_obj_test) +static bool run_fj_check_determinism(std::string test_instance, int iter_limit) { - auto [instance, obj_target, iter_limit] = GetParam(); - EXPECT_TRUE(run_fj_check_objective(instance, iter_limit, obj_target)); -} + detail::fj_settings_t fj_settings; + fj_settings.time_limit = std::numeric_limits::max(); + fj_settings.mode = detail::fj_mode_t::EXIT_NON_IMPROVING; + fj_settings.n_of_minimums_for_exit = 5000 * 1000; + // fj_settings.work_limit = 0.5; // run for 0.5wu (~0.5s) + fj_settings.update_weights = true; + fj_settings.feasibility_run = false; + fj_settings.iteration_limit = iter_limit; + fj_settings.load_balancing_mode = detail::fj_load_balancing_mode_t::ALWAYS_ON; + fj_settings.seed = cuopt::seed_generator::get_seed(); + + auto state = run_fj_instance(test_instance, fj_settings); + auto& solution = state.solution; -INSTANTIATE_TEST_SUITE_P( - MIPSolveTest, - MIPSolveParametricTest, - testing::Values( - // Bug: https://github.com/NVIDIA/cuopt/issues/214 - // std::make_tuple("50v-10.mps", 7800, 100000), - // std::make_tuple("fiball.mps", 140, 25000), - // std::make_tuple("rmatr200-p5.mps", 7000, 10000), - std::make_tuple("gen-ip054.mps", 7500, 20000), - std::make_tuple("sct2.mps", 100, 50000), - std::make_tuple("uccase9.mps", 4000000, 50000), - // unstable, prone to failure on slight weight changes - // std::make_tuple("drayage-25-23.mps", 300000, 50000), - std::make_tuple("tr12-30.mps", 300000, 50000), - std::make_tuple("neos-3004026-krka.mps", - +std::numeric_limits::infinity(), - 35000), // feasibility - // std::make_tuple("nursesched-medium-hint03.mps", 12000, 50000), // too large - std::make_tuple("ns1208400.mps", 2, 60000), - std::make_tuple("gmu-35-50.mps", -2300000, 25000), - std::make_tuple("n2seq36q.mps", 158800, 25000), - std::make_tuple("seymour1.mps", 440, 50000), - std::make_tuple("cvs16r128-89.mps", -50, 10000) -// TEMPORARY: occasional cusparse transpose issues on ARM in CI -#ifndef __aarch64__ - , - std::make_tuple("thor50dday.mps", 250000, 1000) -#endif - )); - -TEST(mip_solve, feasibility_jump_feas_test) -{ - for (const auto& instance : {"tr12-30.mps", - "sct2.mps" -#ifndef __aarch64__ - , - "thor50dday.mps" -#endif - }) { - run_fj_check_feasible(instance); + printf("%s[seed=%x]: Solution generated with FJ: is_feasible %d, objective %g (raw %g)", + test_instance.c_str(), + fj_settings.seed, + solution.get_feasible(), + solution.get_user_objective(), + solution.get_objective()); + + static std::unordered_map first_val_map; + if (first_val_map.count(test_instance) == 0) { + first_val_map[test_instance] = solution.get_user_objective(); } + EXPECT_NEAR(solution.get_user_objective(), first_val_map[test_instance], 1.0) + << test_instance << " determinism objective mismatch"; + + return true; } -TEST(mip_solve, feasibility_jump_obj_runoff_test) +TEST(mip_solve, feasibility_jump_determinism) { - for (const auto& instance : {"minrep_inf.mps", "sct2.mps", "uccase9.mps", - /*"buildingenergy.mps"*/}) { - run_fj_check_no_obj_runoff(instance); + cuopt::init_logger_t log("", true); + + int seed = + std::getenv("CUOPT_SEED") ? std::stoi(std::getenv("CUOPT_SEED")) : std::random_device{}(); + + for (const auto& [instance, iter_limit] : {std::make_pair("thor50dday.mps", 1000), + std::make_pair("gen-ip054.mps", 1000), + std::make_pair("50v-10.mps", 1000), + std::make_pair("seymour1.mps", 1000), + std::make_pair("rmatr200-p5.mps", 1000), + std::make_pair("tr12-30.mps", 1000), + std::make_pair("sct2.mps", 1000)}) { + for (int i = 0; i < 10; i++) { + cuopt::seed_generator::set_seed(seed); + run_fj_check_determinism(instance, iter_limit); + } } } diff --git a/cpp/tests/mip/load_balancing_test.cu b/cpp/tests/mip/load_balancing_test.cu index 1f825a26f7..9fc15d0325 100644 --- a/cpp/tests/mip/load_balancing_test.cu +++ b/cpp/tests/mip/load_balancing_test.cu @@ -9,6 +9,7 @@ #include "mip_utils.cuh" #include +#include #include #include #include @@ -128,8 +129,8 @@ void test_multi_probe(std::string path) problem_checking_t::check_problem_representation(op_problem); detail::problem_t problem(op_problem); mip_solver_settings_t default_settings{}; - detail::mip_scaling_strategy_t scaling(problem); - detail::mip_solver_t solver(problem, default_settings, scaling, cuopt::timer_t(0)); + auto timer = cuopt::termination_checker_t(0.0, cuopt::termination_checker_t::root_tag_t{}); + detail::mip_solver_t solver(problem, default_settings, timer); detail::load_balanced_problem_t lb_problem(problem); detail::load_balanced_bounds_presolve_t lb_prs(lb_problem, solver.context); diff --git a/cpp/tests/mip/local_search_test.cu b/cpp/tests/mip/local_search_test.cu new file mode 100644 index 0000000000..fc9334d98d --- /dev/null +++ b/cpp/tests/mip/local_search_test.cu @@ -0,0 +1,238 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../linear_programming/utilities/pdlp_test_utilities.cuh" +#include "determinism_utils.cuh" +#include "mip_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::test { + +void init_handler(const raft::handle_t* handle_ptr) +{ + // Init cuBlas / cuSparse context here to avoid having it during solving time + RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode( + handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream())); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode( + handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); +} + +static void setup_device_symbols(rmm::cuda_stream_view stream_view) { (void)stream_view; } + +enum local_search_mode_t { + FP = 0, + STAGED_FP, + FJ_LINE_SEGMENT, + FJ_ON_ZERO, + FJ_ANNEALING, +}; + +// Helper function to setup MIP solver and run FJ with given settings and initial solution +static uint32_t run_fp(std::string test_instance, local_search_mode_t mode, double work_limit = 4.0) +{ + const raft::handle_t handle_{}; + std::cout << "Running: " << test_instance << std::endl; + + auto path = cuopt::test::get_rapids_dataset_root_dir() + ("/mip/" + test_instance); + cuopt::mps_parser::mps_data_model_t mps_problem = + cuopt::mps_parser::parse_mps(path, false); + handle_.sync_stream(); + auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem); + problem_checking_t::check_problem_representation(op_problem); + + init_handler(op_problem.get_handle_ptr()); + // run the problem constructor of MIP, so that we do bounds standardization + auto settings = mip_solver_settings_t{}; + settings.time_limit = 120.; + settings.determinism_mode = CUOPT_MODE_DETERMINISTIC; + + detail::problem_t problem(op_problem, settings.get_tolerances(), true); + problem.preprocess_problem(); + + setup_device_symbols(op_problem.get_handle_ptr()->get_stream()); + auto timer = + cuopt::termination_checker_t(settings.time_limit, cuopt::termination_checker_t::root_tag_t{}); + detail::mip_solver_t solver(problem, settings, timer); + problem.tolerances = settings.get_tolerances(); + + rmm::device_uvector lp_optimal_solution(problem.n_variables, + problem.handle_ptr->get_stream()); + thrust::fill(problem.handle_ptr->get_thrust_policy(), + lp_optimal_solution.begin(), + lp_optimal_solution.end(), + 0.0); + detail::lp_state_t& lp_state = problem.lp_state; + // resize because some constructor might be called before the presolve + lp_state.resize(problem, problem.handle_ptr->get_stream()); + detail::relaxed_lp_settings_t lp_settings{}; + lp_settings.time_limit = std::numeric_limits::max(); + lp_settings.tolerance = 1e-6; + lp_settings.return_first_feasible = false; + lp_settings.save_state = false; + // lp_settings.iteration_limit = 5; + auto lp_result = + detail::get_relaxed_lp_solution(problem, lp_optimal_solution, lp_state, lp_settings); + EXPECT_EQ(lp_result.get_termination_status(), pdlp_termination_status_t::Optimal); + clamp_within_var_bounds(lp_optimal_solution, &problem, problem.handle_ptr); + + // return detail::compute_hash(lp_optimal_solution); + + detail::local_search_t local_search(solver.context, lp_optimal_solution); + + detail::solution_t solution(problem); + solution.assign_random_within_bounds(); + solution.compute_feasibility(); + + printf("Model fingerprint: 0x%x\n", problem.get_fingerprint()); + printf("LP optimal hash: 0x%x\n", + detail::compute_hash(make_span(lp_optimal_solution), problem.handle_ptr->get_stream())); + printf("running mode: %d\n", mode); + + work_limit_context_t work_limit_context("LocalSearch"); + work_limit_context.deterministic = true; + local_search.fp.timer = work_limit_timer_t(work_limit_context, work_limit, timer); + + detail::ls_config_t ls_config{}; + + if (mode == local_search_mode_t::FP) { + bool is_feasible = false; + int iterations = 0; + while (!local_search.fp.timer.check_time_limit()) { + is_feasible = local_search.fp.run_single_fp_descent(solution); + printf("fp_loop it %d, is_feasible %d\n", iterations, is_feasible); + if (is_feasible) { + break; + } else { + is_feasible = local_search.fp.restart_fp(solution); + if (is_feasible) { break; } + } + iterations++; + } + } else if (mode == local_search_mode_t::FJ_LINE_SEGMENT) { + work_limit_timer_t wlt(work_limit_context, work_limit, timer); + local_search.run_fj_line_segment(solution, wlt, ls_config); + } else if (mode == local_search_mode_t::FJ_ON_ZERO) { + work_limit_timer_t wlt(work_limit_context, work_limit, timer); + local_search.run_fj_on_zero(solution, wlt); + } else if (mode == local_search_mode_t::FJ_ANNEALING) { + work_limit_timer_t wlt(work_limit_context, work_limit, timer); + local_search.run_fj_annealing(solution, wlt, ls_config); + } + + std::vector hashes; + hashes.push_back(detail::compute_hash(solution.get_host_assignment())); + printf("hashes: 0x%x, hash of the hash: 0x%x\n", hashes[0], detail::compute_hash(hashes)); + + return detail::compute_hash(hashes); +} + +static uint32_t run_fp_check_determinism(std::string test_instance, + local_search_mode_t mode, + unsigned long seed, + double work_limit = 4.0) +{ + cuopt::seed_generator::set_seed(seed); + + return run_fp(test_instance, mode, work_limit); +} + +class LocalSearchTestParams : public testing::TestWithParam> {}; + +TEST_P(LocalSearchTestParams, local_search_operator_determinism) +{ + cuopt::init_logger_t log("", true); + cuopt::default_logger().set_pattern("[%n] [%-6l] %v"); + cuopt::default_logger().set_level(rapids_logger::level_enum::debug); + cuopt::default_logger().flush_on(rapids_logger::level_enum::debug); + + spin_stream_raii_t spin_stream_1; + spin_stream_raii_t spin_stream_2; + + auto mode = std::get<0>(GetParam()); + + struct instance_config_t { + const char* name; + double work_limit; + }; + for (const auto& cfg : { + instance_config_t{"gen-ip054.mps", 4.0}, + instance_config_t{"50v-10.mps", 2.0}, + // instance_config_t{"n2seq36q.mps", 4.0}, + instance_config_t{"neos5.mps", 2.0}, + // instance_config_t{"neos8.mps", 2.0}, + }) { + unsigned long seed = std::getenv("CUOPT_SEED") + ? (unsigned long)std::stoi(std::getenv("CUOPT_SEED")) + : (unsigned long)std::random_device{}(); + std::cerr << "Tested with seed " << seed << " instance " << cfg.name << " work_limit " + << cfg.work_limit << "\n"; + uint32_t gold_hash = 0; + for (int i = 0; i < 5; ++i) { + uint32_t hash = run_fp_check_determinism(cfg.name, mode, seed, cfg.work_limit); + if (i == 0) { + gold_hash = hash; + printf("Gold hash: 0x%x\n", gold_hash); + } else { + ASSERT_EQ(hash, gold_hash); + printf("Hash: 0x%x\n", hash); + } + } + } +} + +INSTANTIATE_TEST_SUITE_P(LocalSearchTests, + LocalSearchTestParams, + testing::Values(std::make_tuple(local_search_mode_t::FP), + std::make_tuple(local_search_mode_t::FJ_LINE_SEGMENT), + // std::make_tuple(local_search_mode_t::FJ_ON_ZERO), + std::make_tuple(local_search_mode_t::FJ_ANNEALING))); + +} // namespace cuopt::linear_programming::test diff --git a/cpp/tests/mip/mip_utils.cuh b/cpp/tests/mip/mip_utils.cuh index 5c2b39d290..4595939e1f 100644 --- a/cpp/tests/mip/mip_utils.cuh +++ b/cpp/tests/mip/mip_utils.cuh @@ -8,9 +8,14 @@ #include #include #include +#include #include +#include +#include #include +#include #include +#include namespace cuopt::linear_programming::test { @@ -180,4 +185,54 @@ static std::tuple test_mps_file( solution.get_solution_bound()); } +struct fj_tweaks_t { + double objective_weight = 0; +}; + +struct fj_state_t { + detail::solution_t solution; + std::vector solution_vector; + int minimums; + double incumbent_objective; + double incumbent_violation; +}; + +static fj_state_t run_fj(detail::problem_t& problem, + const detail::fj_settings_t& fj_settings, + fj_tweaks_t tweaks = {}, + std::vector initial_solution = {}, + int determinism_mode = CUOPT_MODE_OPPORTUNISTIC) +{ + auto settings = mip_solver_settings_t{}; + settings.time_limit = 30.; + settings.determinism_mode = determinism_mode; + auto timer = cuopt::termination_checker_t(30.0, cuopt::termination_checker_t::root_tag_t{}); + detail::mip_solver_t solver(problem, settings, timer); + + detail::solution_t solution(*solver.context.problem_ptr); + if (initial_solution.size() > 0) { + expand_device_copy(solution.assignment, initial_solution, solution.handle_ptr->get_stream()); + } else { + thrust::fill(solution.handle_ptr->get_thrust_policy(), + solution.assignment.begin(), + solution.assignment.end(), + 0.0); + } + solution.clamp_within_bounds(); + + detail::fj_t fj(solver.context, fj_settings); + fj.reset_weights(solution.handle_ptr->get_stream(), 1.); + fj.objective_weight.set_value_async(tweaks.objective_weight, solution.handle_ptr->get_stream()); + solution.handle_ptr->sync_stream(); + + fj.solve(solution); + auto solution_vector = host_copy(solution.assignment, solution.handle_ptr->get_stream()); + + return {solution, + solution_vector, + fj.climbers[0]->local_minimums_reached.value(solution.handle_ptr->get_stream()), + fj.climbers[0]->incumbent_objective.value(solution.handle_ptr->get_stream()), + fj.climbers[0]->violation_score.value(solution.handle_ptr->get_stream())}; +} + } // namespace cuopt::linear_programming::test diff --git a/cpp/tests/mip/multi_probe_test.cu b/cpp/tests/mip/multi_probe_test.cu index 003220de9b..ee0753cb32 100644 --- a/cpp/tests/mip/multi_probe_test.cu +++ b/cpp/tests/mip/multi_probe_test.cu @@ -6,6 +6,7 @@ /* clang-format on */ #include "../linear_programming/utilities/pdlp_test_utilities.cuh" +#include "determinism_utils.cuh" #include "mip_utils.cuh" #include @@ -43,9 +44,10 @@ void init_handler(const raft::handle_t* handle_ptr) } std::tuple, std::vector, std::vector> select_k_random( - detail::problem_t& problem, int sample_size) + detail::problem_t& problem, + int sample_size, + unsigned long seed = std::random_device{}()) { - auto seed = std::random_device{}(); std::cerr << "Tested with seed " << seed << "\n"; problem.compute_n_integer_vars(); auto [v_lb, v_ub] = extract_host_bounds(problem.variable_bounds, problem.handle_ptr); @@ -138,10 +140,8 @@ multi_probe_results( std::move(h_lb_0), std::move(h_ub_0), std::move(h_lb_1), std::move(h_ub_1)); } -void test_multi_probe(std::string path) +uint32_t test_multi_probe(std::string path, unsigned long seed = std::random_device{}()) { - auto memory_resource = make_async(); - rmm::mr::set_current_device_resource(memory_resource.get()); const raft::handle_t handle_{}; cuopt::mps_parser::mps_data_model_t mps_problem = cuopt::mps_parser::parse_mps(path, false); @@ -150,12 +150,13 @@ void test_multi_probe(std::string path) problem_checking_t::check_problem_representation(op_problem); detail::problem_t problem(op_problem); mip_solver_settings_t default_settings{}; - detail::mip_solver_t solver(problem, default_settings, cuopt::timer_t(0)); + auto timer = cuopt::termination_checker_t(0.0, cuopt::termination_checker_t::root_tag_t{}); + detail::mip_solver_t solver(problem, default_settings, timer); detail::bound_presolve_t bnd_prb_0(solver.context); detail::bound_presolve_t bnd_prb_1(solver.context); detail::multi_probe_t multi_probe_prs(solver.context); - auto probe_tuple = select_k_random(problem, 100); + auto probe_tuple = select_k_random(problem, 100, seed); auto bounds_probe_vals = convert_probe_tuple(probe_tuple); auto [bnd_lb_0, bnd_ub_0, bnd_lb_1, bnd_ub_1] = @@ -174,6 +175,16 @@ void test_multi_probe(std::string path) auto mlp_min_act_1 = host_copy(multi_probe_prs.upd_1.min_activity, stream); auto mlp_max_act_1 = host_copy(multi_probe_prs.upd_1.max_activity, stream); + std::vector hashes; + hashes.push_back(detail::compute_hash(bnd_min_act_0)); + hashes.push_back(detail::compute_hash(bnd_min_act_1)); + hashes.push_back(detail::compute_hash(bnd_max_act_0)); + hashes.push_back(detail::compute_hash(bnd_max_act_1)); + hashes.push_back(detail::compute_hash(bnd_lb_0)); + hashes.push_back(detail::compute_hash(bnd_ub_0)); + hashes.push_back(detail::compute_hash(bnd_lb_1)); + hashes.push_back(detail::compute_hash(bnd_ub_1)); + for (int i = 0; i < (int)bnd_min_act_0.size(); ++i) { EXPECT_DOUBLE_EQ(bnd_min_act_0[i], mlp_min_act_0[i]); EXPECT_DOUBLE_EQ(bnd_max_act_0[i], mlp_max_act_0[i]); @@ -187,6 +198,9 @@ void test_multi_probe(std::string path) EXPECT_DOUBLE_EQ(bnd_lb_1[i], m_lb_1[i]); EXPECT_DOUBLE_EQ(bnd_ub_1[i], m_ub_1[i]); } + + // return a composite hash of all the hashes to check for determinism + return detail::compute_hash(hashes); } TEST(presolve, multi_probe) @@ -200,4 +214,29 @@ TEST(presolve, multi_probe) } } +TEST(presolve, multi_probe_deterministic) +{ + spin_stream_raii_t spin_stream_1; + + std::vector test_instances = { + "mip/50v-10-free-bound.mps", + "mip/neos5-free-bound.mps", + "mip/neos5.mps", + "mip/50v-10.mps", + }; + for (const auto& test_instance : test_instances) { + std::cout << "Running: " << test_instance << std::endl; + unsigned long seed = std::random_device{}(); + auto path = make_path_absolute(test_instance); + uint32_t gold_hash = 0; + for (int i = 0; i < 10; ++i) { + auto hash = test_multi_probe(path, seed); + if (i == 0) { + gold_hash = hash; + } else { + EXPECT_EQ(hash, gold_hash); + } + } + } +} } // namespace cuopt::linear_programming::test diff --git a/cpp/tests/mip/presolve_test.cu b/cpp/tests/mip/presolve_test.cu index cf2532d0f2..a11d1c7288 100644 --- a/cpp/tests/mip/presolve_test.cu +++ b/cpp/tests/mip/presolve_test.cu @@ -6,12 +6,22 @@ /* clang-format on */ #include "../linear_programming/utilities/pdlp_test_utilities.cuh" +#include "determinism_utils.cuh" +#include "mip_utils.cuh" +#include +#include #include +#include +#include #include +#include #include +#include #include #include +#include +#include #include #include #include @@ -29,6 +39,171 @@ namespace cuopt::linear_programming::test { +void init_handler(const raft::handle_t* handle_ptr) +{ + // Init cuBlas / cuSparse context here to avoid having it during solving time + RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode( + handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream())); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode( + handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); +} + +std::tuple, std::vector, std::vector> select_k_random( + detail::problem_t& problem, + int sample_size, + unsigned long seed = std::random_device{}()) +{ + std::cerr << "Tested with seed " << seed << "\n"; + problem.compute_n_integer_vars(); + auto [v_lb, v_ub] = extract_host_bounds(problem.variable_bounds, problem.handle_ptr); + auto int_var_id = host_copy(problem.integer_indices, problem.handle_ptr->get_stream()); + int_var_id.erase( + std::remove_if(int_var_id.begin(), + int_var_id.end(), + [v_lb_sp = v_lb, v_ub_sp = v_ub](auto id) { + return !(std::isfinite(v_lb_sp[id]) && std::isfinite(v_ub_sp[id])); + }), + int_var_id.end()); + sample_size = std::min(sample_size, static_cast(int_var_id.size())); + std::vector random_int_vars; + std::mt19937 m{seed}; + std::sample( + int_var_id.begin(), int_var_id.end(), std::back_inserter(random_int_vars), sample_size, m); + std::vector probe_0(sample_size); + std::vector probe_1(sample_size); + for (int i = 0; i < static_cast(random_int_vars.size()); ++i) { + if (i % 2) { + probe_0[i] = v_lb[random_int_vars[i]]; + probe_1[i] = v_ub[random_int_vars[i]]; + } else { + probe_1[i] = v_lb[random_int_vars[i]]; + probe_0[i] = v_ub[random_int_vars[i]]; + } + } + return std::make_tuple(std::move(random_int_vars), std::move(probe_0), std::move(probe_1)); +} + +std::pair>, std::vector>> +convert_probe_tuple(std::tuple, std::vector, std::vector>& probe) +{ + std::vector> probe_first; + std::vector> probe_second; + for (size_t i = 0; i < std::get<0>(probe).size(); ++i) { + probe_first.emplace_back(thrust::make_pair(std::get<0>(probe)[i], std::get<1>(probe)[i])); + probe_second.emplace_back(thrust::make_pair(std::get<0>(probe)[i], std::get<2>(probe)[i])); + } + return std::make_pair(std::move(probe_first), std::move(probe_second)); +} + +uint32_t test_probing_cache_determinism(std::string path, + unsigned long seed = std::random_device{}()) +{ + const raft::handle_t handle_{}; + cuopt::mps_parser::mps_data_model_t mps_problem = + cuopt::mps_parser::parse_mps(path, false); + handle_.sync_stream(); + auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem); + problem_checking_t::check_problem_representation(op_problem); + detail::problem_t problem(op_problem); + mip_solver_settings_t default_settings{}; + default_settings.mip_scaling = false; // we're not checking scaling determinism here + auto timer = cuopt::termination_checker_t(0.0, cuopt::termination_checker_t::root_tag_t{}); + detail::mip_solver_t solver(problem, default_settings, timer); + detail::bound_presolve_t bnd_prb(solver.context); + + work_limit_context_t work_limit_context("ProbingCache"); + // rely on the iteration limit + work_limit_timer_t probing_timer(work_limit_context, std::numeric_limits::max(), timer); + compute_probing_cache(bnd_prb, problem, probing_timer); + std::vector, 2>>> cached_values( + bnd_prb.probing_cache.probing_cache.begin(), bnd_prb.probing_cache.probing_cache.end()); + std::sort(cached_values.begin(), cached_values.end(), [](const auto& a, const auto& b) { + return a.first < b.first; + }); + + std::vector probed_indices; + std::vector intervals; + std::vector interval_types; + + std::vector var_to_cached_bound_keys; + std::vector var_to_cached_bound_lb; + std::vector var_to_cached_bound_ub; + for (const auto& a : cached_values) { + probed_indices.push_back(a.first); + intervals.push_back(a.second[0].val_interval.val); + intervals.push_back(a.second[1].val_interval.val); + interval_types.push_back(a.second[0].val_interval.interval_type); + interval_types.push_back(a.second[1].val_interval.interval_type); + + auto sorted_map = std::map>( + a.second[0].var_to_cached_bound_map.begin(), a.second[0].var_to_cached_bound_map.end()); + for (const auto& [var_id, cached_bound] : sorted_map) { + var_to_cached_bound_keys.push_back(var_id); + var_to_cached_bound_lb.push_back(cached_bound.lb); + var_to_cached_bound_ub.push_back(cached_bound.ub); + } + } + + std::vector hashes; + hashes.push_back(detail::compute_hash(probed_indices)); + hashes.push_back(detail::compute_hash(intervals)); + hashes.push_back(detail::compute_hash(interval_types)); + hashes.push_back(detail::compute_hash(var_to_cached_bound_keys)); + hashes.push_back(detail::compute_hash(var_to_cached_bound_lb)); + hashes.push_back(detail::compute_hash(var_to_cached_bound_ub)); + + // return a composite hash of all the hashes to check for determinism + return detail::compute_hash(hashes); +} + +uint32_t test_scaling_determinism(std::string path, unsigned long seed = std::random_device{}()) +{ + const raft::handle_t handle_{}; + cuopt::mps_parser::mps_data_model_t mps_problem = + cuopt::mps_parser::parse_mps(path, false); + handle_.sync_stream(); + auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem); + problem_checking_t::check_problem_representation(op_problem); + detail::problem_t problem(op_problem); + + pdlp_hyper_params::pdlp_hyper_params_t hyper_params{}; + hyper_params.update_primal_weight_on_initial_solution = false; + hyper_params.update_step_size_on_initial_solution = true; + // problem contains unpreprocessed data + detail::problem_t scaled_problem(problem); + + detail::pdlp_initial_scaling_strategy_t scaling( + scaled_problem.handle_ptr, + scaled_problem, + hyper_params.default_l_inf_ruiz_iterations, + (double)hyper_params.default_alpha_pock_chambolle_rescaling, + scaled_problem.reverse_coefficients, + scaled_problem.reverse_offsets, + scaled_problem.reverse_constraints, + nullptr, + hyper_params, + true); + + scaling.scale_problem(); + + // generate a random initial solution in order to ensure scaling of solution vectors is + // deterministic as well as the initial step size + std::vector initial_solution(scaled_problem.n_variables); + std::mt19937 m{seed}; + std::generate(initial_solution.begin(), initial_solution.end(), [&m]() { return m(); }); + auto d_initial_solution = device_copy(initial_solution, handle_.get_stream()); + scaling.scale_primal(d_initial_solution); + + scaled_problem.preprocess_problem(); + + detail::trivial_presolve(scaled_problem); + + std::vector hashes; + hashes.push_back(detail::compute_hash(d_initial_solution, handle_.get_stream())); + hashes.push_back(scaled_problem.get_fingerprint()); + return detail::compute_hash(hashes); +} + TEST(problem, find_implied_integers) { const raft::handle_t handle_{}; @@ -63,4 +238,63 @@ TEST(problem, find_implied_integers) ((int)detail::problem_t::var_flags_t::VAR_IMPLIED_INTEGER)); } +TEST(presolve, probing_cache_deterministic) +{ + spin_stream_raii_t spin_stream_1; + + std::vector test_instances = {"mip/50v-10-free-bound.mps", + "mip/neos5-free-bound.mps", + "mip/neos5.mps", + "mip/50v-10.mps", + "mip/gen-ip054.mps", + "mip/rmatr200-p5.mps"}; + for (const auto& test_instance : test_instances) { + std::cout << "Running: " << test_instance << std::endl; + unsigned long seed = std::random_device{}(); + std::cerr << "Tested with seed " << seed << "\n"; + auto path = make_path_absolute(test_instance); + uint32_t gold_hash = 0; + for (int i = 0; i < 10; ++i) { + auto hash = test_probing_cache_determinism(path, seed); + if (i == 0) { + gold_hash = hash; + std::cout << "Gold hash: " << gold_hash << std::endl; + } else { + EXPECT_EQ(hash, gold_hash); + } + } + } +} + +TEST(presolve, mip_scaling_deterministic) +{ + spin_stream_raii_t spin_stream_1; + spin_stream_raii_t spin_stream_2; + + std::vector test_instances = {"mip/sct2.mps", + "mip/thor50dday.mps", + "mip/uccase9.mps", + "mip/neos5-free-bound.mps", + "mip/neos5.mps", + "mip/50v-10.mps", + "mip/gen-ip054.mps", + "mip/rmatr200-p5.mps"}; + for (const auto& test_instance : test_instances) { + std::cout << "Running: " << test_instance << std::endl; + unsigned long seed = std::random_device{}(); + std::cerr << "Tested with seed " << seed << "\n"; + auto path = make_path_absolute(test_instance); + uint32_t gold_hash = 0; + for (int i = 0; i < 10; ++i) { + auto hash = test_scaling_determinism(path, seed); + if (i == 0) { + gold_hash = hash; + std::cout << "Gold hash: " << gold_hash << std::endl; + } else { + EXPECT_EQ(hash, gold_hash); + } + } + } +} + } // namespace cuopt::linear_programming::test diff --git a/datasets/mip/download_miplib_test_dataset.sh b/datasets/mip/download_miplib_test_dataset.sh index d9cefbc32d..28a2b5b6fc 100755 --- a/datasets/mip/download_miplib_test_dataset.sh +++ b/datasets/mip/download_miplib_test_dataset.sh @@ -25,6 +25,7 @@ INSTANCES=( "enlight_hard" "enlight11" "supportcase22" + "supportcase42" "pk1" ) diff --git a/dependencies.yaml b/dependencies.yaml index ecd9deb6b4..057fc2a318 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -317,7 +317,7 @@ dependencies: common: - output_types: [conda] packages: - - libcuopt-tests==26.4.*,>=0.0.0a0 + - libcuopt-tests==26.6.*,>=0.0.0a0 build_wheels: common: - output_types: [requirements, pyproject] @@ -419,7 +419,7 @@ dependencies: common: - output_types: conda packages: - - &libcuopt_unsuffixed libcuopt==26.4.*,>=0.0.0a0 + - &libcuopt_unsuffixed libcuopt==26.6.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -432,18 +432,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libcuopt-cu12==26.4.*,>=0.0.0a0 + - libcuopt-cu12==26.6.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - libcuopt-cu13==26.4.*,>=0.0.0a0 + - libcuopt-cu13==26.6.*,>=0.0.0a0 - {matrix: null, packages: [*libcuopt_unsuffixed]} depends_on_cuopt: common: - output_types: conda packages: - - &cuopt_unsuffixed cuopt==26.4.*,>=0.0.0a0 + - &cuopt_unsuffixed cuopt==26.6.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -456,18 +456,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cuopt-cu12==26.4.*,>=0.0.0a0 + - cuopt-cu12==26.6.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - cuopt-cu13==26.4.*,>=0.0.0a0 + - cuopt-cu13==26.6.*,>=0.0.0a0 - {matrix: null, packages: [*cuopt_unsuffixed]} depends_on_cuopt_server: common: - output_types: conda packages: - - &cuopt_server_unsuffixed cuopt-server==26.4.*,>=0.0.0a0 + - &cuopt_server_unsuffixed cuopt-server==26.6.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -480,18 +480,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cuopt-server-cu12==26.4.*,>=0.0.0a0 + - cuopt-server-cu12==26.6.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - cuopt-server-cu13==26.4.*,>=0.0.0a0 + - cuopt-server-cu13==26.6.*,>=0.0.0a0 - {matrix: null, packages: [*cuopt_server_unsuffixed]} depends_on_cuopt_sh_client: common: - output_types: [conda, requirements, pyproject] packages: - - &cuopt_sh_client_unsuffixed cuopt-sh-client==26.4.*,>=0.0.0a0 + - &cuopt_sh_client_unsuffixed cuopt-sh-client==26.6.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -501,7 +501,7 @@ dependencies: common: - output_types: [requirements, pyproject, conda] packages: - - cuopt-mps-parser==26.4.*,>=0.0.0a0 + - cuopt-mps-parser==26.6.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -511,12 +511,12 @@ dependencies: common: - output_types: conda packages: - - libraft-headers==26.4.*,>=0.0.0a0 + - libraft-headers==26.6.*,>=0.0.0a0 depends_on_librmm: common: - output_types: conda packages: - - &librmm_unsuffixed librmm==26.4.*,>=0.0.0a0 + - &librmm_unsuffixed librmm==26.6.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -528,12 +528,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==26.4.*,>=0.0.0a0 + - librmm-cu12==26.6.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - librmm-cu13==26.4.*,>=0.0.0a0 + - librmm-cu13==26.6.*,>=0.0.0a0 - {matrix: null, packages: [*librmm_unsuffixed]} depends_on_cupy: common: @@ -568,7 +568,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_unsuffixed rmm==26.4.*,>=0.0.0a0 + - &rmm_unsuffixed rmm==26.6.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -580,12 +580,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - rmm-cu12==26.4.*,>=0.0.0a0 + - rmm-cu12==26.6.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - rmm-cu13==26.4.*,>=0.0.0a0 + - rmm-cu13==26.6.*,>=0.0.0a0 - matrix: packages: - *rmm_unsuffixed @@ -594,7 +594,7 @@ dependencies: common: - output_types: conda packages: - - &cudf_unsuffixed cudf==26.4.*,>=0.0.0a0 + - &cudf_unsuffixed cudf==26.6.*,>=0.0.0a0 - output_types: requirements packages: - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple @@ -605,12 +605,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf-cu12==26.4.*,>=0.0.0a0 + - cudf-cu12==26.6.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - cudf-cu13==26.4.*,>=0.0.0a0 + - cudf-cu13==26.6.*,>=0.0.0a0 - matrix: packages: - *cudf_unsuffixed @@ -619,7 +619,7 @@ dependencies: common: - output_types: conda packages: - - &pylibraft_unsuffixed pylibraft==26.4.*,>=0.0.0a0 + - &pylibraft_unsuffixed pylibraft==26.6.*,>=0.0.0a0 - output_types: requirements packages: - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple @@ -630,12 +630,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibraft-cu12==26.4.*,>=0.0.0a0 + - pylibraft-cu12==26.6.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - pylibraft-cu13==26.4.*,>=0.0.0a0 + - pylibraft-cu13==26.6.*,>=0.0.0a0 - matrix: packages: - *pylibraft_unsuffixed diff --git a/docs/cuopt/source/versions1.json b/docs/cuopt/source/versions1.json index 3e986996a4..507dfe57a4 100644 --- a/docs/cuopt/source/versions1.json +++ b/docs/cuopt/source/versions1.json @@ -1,10 +1,14 @@ [ { - "version": "26.04.00", - "url": "https://docs.nvidia.com/cuopt/user-guide/26.04.00/", + "version": "26.06.00", + "url": "https://docs.nvidia.com/cuopt/user-guide/26.06.00/", "name": "latest", "preferred": true }, + { + "version": "26.04.00", + "url": "https://docs.nvidia.com/cuopt/user-guide/26.04.00/" + }, { "version": "26.02.00", "url": "https://docs.nvidia.com/cuopt/user-guide/26.02.00/" diff --git a/gemini-extension.json b/gemini-extension.json index b4c6b764a4..c5ef9883f8 100644 --- a/gemini-extension.json +++ b/gemini-extension.json @@ -1,6 +1,6 @@ { "name": "nvidia-cuopt-skills", "description": "Agent skills for NVIDIA cuOpt optimization engine: routing, LP/MILP/QP, installation, and server.", - "version": "26.04.00", + "version": "26.06.00", "contextFileName": "AGENTS.md" } diff --git a/helmchart/cuopt-server/Chart.yaml b/helmchart/cuopt-server/Chart.yaml index 074d94bec9..811ac067cb 100644 --- a/helmchart/cuopt-server/Chart.yaml +++ b/helmchart/cuopt-server/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -appVersion: 26.4.0 +appVersion: 26.6.0 description: A Helm chart for NVIDIA cuOpt Server with GPU support home: https://docs.nvidia.com/cuopt/user-guide/latest/resources.html keywords: @@ -14,4 +14,4 @@ name: cuopt-server sources: - https://docs.nvidia.com/cuopt/user-guide/latest/resources.html type: application -version: 26.4.0 +version: 26.6.0 diff --git a/helmchart/cuopt-server/values.yaml b/helmchart/cuopt-server/values.yaml index 5218596552..6adafea79e 100644 --- a/helmchart/cuopt-server/values.yaml +++ b/helmchart/cuopt-server/values.yaml @@ -7,7 +7,7 @@ replicaCount: 1 image: repository: nvidia/cuopt pullPolicy: IfNotPresent - tag: "26.4.0-cuda12.9-py3.12" + tag: "26.6.0-cuda12.9-py3.12" imagePullSecrets: [] nameOverride: "" diff --git a/merge_review_findings_release_26_04.md b/merge_review_findings_release_26_04.md new file mode 100644 index 0000000000..9491c63c79 --- /dev/null +++ b/merge_review_findings_release_26_04.md @@ -0,0 +1,50 @@ +# Merge Review Findings vs `release/26.04` + +Scope: +- Current merge state reviewed statically against `release/26.04` +- Excluding `cpp/src/branch_and_bound/pseudo_costs.cpp` +- Notes are incremental and may grow as the review continues + +## Resolved High Confidence Findings + +1. `cpp/src/mip_heuristics/solver_context.cuh` + - Restored the `release/26.04` scaling ownership model for MIP. + - Removed the extra scaling constructor parameter from `mip_solver_context_t`; callers now match the context definition again. + +2. `cpp/src/mip_heuristics/solution_callbacks.cuh` + - Removed the incorrect `pdlp_initial_scaling_strategy_t` dependency from MIP callback plumbing. + - `solution_publication_t` and `solution_injection_t` no longer try to own or apply scaling; they now operate on the release-side MIP flow and dispatch both `GET_SOLUTION` and `GET_SOLUTION_EXT`. + +3. `cpp/src/mip_heuristics/solver.cu` + - Removed `context.scaling` uses from incumbent publication and injection paths. + - Removed the stale `bb_callback_adapter_t::settings_` reference member, which was left uninitialized by the merge. + +4. `cpp/src/mip_heuristics/diversity/population.cu` + - Removed `context.scaling` from the callback/publication and injection calls so the file matches the release-side scaling model. + +5. `cpp/src/mip_heuristics/solve.cu` + - Deleted the stale local `invoke_solution_callbacks(...)` helper instead of extending it. + - Rewired the early incumbent publication paths to the determinism-side callback dispatch (`GET_SOLUTION_EXT` compatible, with origin and work timestamp metadata). + - Removed the stray `scaling.scale_problem()` / `scale_primal(...)` block from `run_mip()`, which had no scaling object in scope. + - Restored the `try` / `catch` structure in `run_mip()` after the merge splice dropped the opening `try`. + - Updated the early-heuristic gates to the bitset model by allowing them only when `determinism_mode == CUOPT_DETERMINISM_NONE`. + +6. `cpp/src/mip_heuristics/problem/problem.cuh`, `cpp/src/mip_heuristics/problem/problem.cu`, `cpp/src/mip_heuristics/problem/presolve_data.cuh` + - Repaired the half-merged `post_process_assignment(...)` overloads. + - The handle-override wrappers now forward the override stream correctly, and the stream-based implementation no longer references the nonexistent `handle_override` variable. + +7. `cpp/src/mip_heuristics/diversity/diversity_manager.cu` + - Restored the missing `tolerance_divisor` local used to derive PDLP relative tolerances in the non-deterministic root LP path. + +8. `cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh`, `cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu` + - Fixed the early GPU FJ merge splice where `early_gpufj_t` reached into the now-private `fj_t::improvement_callback`. + - Added a proper setter and updated the caller to use it. + +9. `cpp/src/mip_heuristics/solve.cu` + - Removed merge-leftover unused locals (`running_mip`, `hyper_params`) that were tripping `-Werror`. + +## Lower Confidence Risks + +1. `cpp/src/mip_heuristics/diversity/population.cu` + - In deterministic B&B mode, `run_solution_callbacks()` updates `best_feasible_objective` immediately after queueing a heuristic solution to B&B, before B&B validates or repairs it. + - If the queued solution is later rejected after crushing/validation, later heuristic candidates can be suppressed against an incumbent objective that never actually became valid. diff --git a/python/cuopt/pyproject.toml b/python/cuopt/pyproject.toml index e86b5bdd73..eff7e01769 100644 --- a/python/cuopt/pyproject.toml +++ b/python/cuopt/pyproject.toml @@ -20,18 +20,18 @@ license = "Apache-2.0" requires-python = ">=3.11" dependencies = [ "cuda-python>=13.0.1,<14.0", - "cudf==26.4.*,>=0.0.0a0", - "cuopt-mps-parser==26.4.*,>=0.0.0a0", + "cudf==26.6.*,>=0.0.0a0", + "cuopt-mps-parser==26.6.*,>=0.0.0a0", "cupy-cuda13x>=13.6.0", - "libcuopt==26.4.*,>=0.0.0a0", + "libcuopt==26.6.*,>=0.0.0a0", "numba-cuda>=0.22.1", "numba>=0.60.0,<0.65.0", "numpy>=1.23.5,<3.0", "pandas>=2.0", - "pylibraft==26.4.*,>=0.0.0a0", + "pylibraft==26.6.*,>=0.0.0a0", "pyyaml>=6.0.0", "rapids-logger==0.2.*,>=0.0.0a0", - "rmm==26.4.*,>=0.0.0a0", + "rmm==26.6.*,>=0.0.0a0", "scipy>=1.14.1", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -101,12 +101,12 @@ dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.30.4", - "cuopt-mps-parser==26.4.*,>=0.0.0a0", + "cuopt-mps-parser==26.6.*,>=0.0.0a0", "cupy-cuda13x>=13.6.0", "cython>=3.0.3", - "libcuopt==26.4.*,>=0.0.0a0", + "libcuopt==26.6.*,>=0.0.0a0", "ninja", - "pylibraft==26.4.*,>=0.0.0a0", + "pylibraft==26.6.*,>=0.0.0a0", "rapids-logger==0.2.*,>=0.0.0a0", - "rmm==26.4.*,>=0.0.0a0", + "rmm==26.6.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cuopt_self_hosted/pyproject.toml b/python/cuopt_self_hosted/pyproject.toml index 7645c99ed0..43aa80a5b3 100644 --- a/python/cuopt_self_hosted/pyproject.toml +++ b/python/cuopt_self_hosted/pyproject.toml @@ -20,7 +20,7 @@ license = "Apache-2.0" license-files = ["LICENSE"] requires-python = ">=3.11" dependencies = [ - "cuopt-mps-parser==26.4.*,>=0.0.0a0", + "cuopt-mps-parser==26.6.*,>=0.0.0a0", "msgpack-numpy==0.4.8", "msgpack==1.1.2", "requests", diff --git a/python/cuopt_server/pyproject.toml b/python/cuopt_server/pyproject.toml index d24cfcbd77..ce96c884be 100644 --- a/python/cuopt_server/pyproject.toml +++ b/python/cuopt_server/pyproject.toml @@ -21,7 +21,7 @@ license = "Apache-2.0" license-files = ["LICENSE"] requires-python = ">=3.11" dependencies = [ - "cuopt==26.4.*,>=0.0.0a0", + "cuopt==26.6.*,>=0.0.0a0", "cupy-cuda13x>=13.6.0", "fastapi", "jsonref==1.1.0", diff --git a/python/libcuopt/pyproject.toml b/python/libcuopt/pyproject.toml index de9680aefe..6ba41c60dd 100644 --- a/python/libcuopt/pyproject.toml +++ b/python/libcuopt/pyproject.toml @@ -31,8 +31,8 @@ classifiers = [ ] dependencies = [ "cuda-toolkit[cublas,cudart,curand,cusolver,cusparse,nvtx]==13.*", - "cuopt-mps-parser==26.4.*,>=0.0.0a0", - "librmm==26.4.*,>=0.0.0a0", + "cuopt-mps-parser==26.6.*,>=0.0.0a0", + "librmm==26.6.*,>=0.0.0a0", "nvidia-cudss-cu13", "nvidia-nvjitlink>=13.0,<14", "rapids-logger==0.2.*,>=0.0.0a0", @@ -76,8 +76,8 @@ dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.30.4", - "cuopt-mps-parser==26.4.*,>=0.0.0a0", - "librmm==26.4.*,>=0.0.0a0", + "cuopt-mps-parser==26.6.*,>=0.0.0a0", + "librmm==26.6.*,>=0.0.0a0", "ninja", "rapids-logger==0.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/skills/cuopt-developer/SKILL.md b/skills/cuopt-developer/SKILL.md index 12419153ac..99743f9171 100644 --- a/skills/cuopt-developer/SKILL.md +++ b/skills/cuopt-developer/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-developer -version: "26.04.00" +version: "26.06.00" description: Contribute to NVIDIA cuOpt codebase including C++/CUDA, Python, server, docs, and CI. Use when the user wants to modify solver internals, add features, submit PRs, or understand the codebase architecture. --- diff --git a/skills/cuopt-installation-api-c/SKILL.md b/skills/cuopt-installation-api-c/SKILL.md index 747382e3c7..bd4d60becc 100644 --- a/skills/cuopt-installation-api-c/SKILL.md +++ b/skills/cuopt-installation-api-c/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-installation-api-c -version: "26.04.00" +version: "26.06.00" description: Install cuOpt for C — conda, locate lib/headers, verification. Use when the user is installing or verifying the C API. Standalone; no common skill. --- diff --git a/skills/cuopt-installation-api-python/SKILL.md b/skills/cuopt-installation-api-python/SKILL.md index a3d7a5e5d2..771f5ec8b0 100644 --- a/skills/cuopt-installation-api-python/SKILL.md +++ b/skills/cuopt-installation-api-python/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-installation-api-python -version: "26.04.00" +version: "26.06.00" description: Install cuOpt for Python — pip, conda, Docker, verification. Use when the user is installing or verifying the Python API. Standalone; no common skill. --- diff --git a/skills/cuopt-installation-common/SKILL.md b/skills/cuopt-installation-common/SKILL.md index 6ceb9f9000..88534fb810 100644 --- a/skills/cuopt-installation-common/SKILL.md +++ b/skills/cuopt-installation-common/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-installation-common -version: "26.04.00" +version: "26.06.00" description: Install cuOpt — system and environment requirements only. Domain concepts; no install commands or interface guidance. --- diff --git a/skills/cuopt-installation-developer/SKILL.md b/skills/cuopt-installation-developer/SKILL.md index a002498853..1f3dff0d3f 100644 --- a/skills/cuopt-installation-developer/SKILL.md +++ b/skills/cuopt-installation-developer/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-installation-developer -version: "26.04.00" +version: "26.06.00" description: Developer installation — build cuOpt from source, run tests. Use when the user wants to set up a dev environment to contribute or modify cuOpt. --- diff --git a/skills/cuopt-lp-milp-api-c/SKILL.md b/skills/cuopt-lp-milp-api-c/SKILL.md index 53df3de63e..74b0d5dc92 100644 --- a/skills/cuopt-lp-milp-api-c/SKILL.md +++ b/skills/cuopt-lp-milp-api-c/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-lp-milp-api-c -version: "26.04.00" +version: "26.06.00" description: LP and MILP with cuOpt — C API only. Use when the user is embedding LP/MILP in C/C++. --- diff --git a/skills/cuopt-lp-milp-api-cli/SKILL.md b/skills/cuopt-lp-milp-api-cli/SKILL.md index cbdc1e7778..1f8e8a157c 100644 --- a/skills/cuopt-lp-milp-api-cli/SKILL.md +++ b/skills/cuopt-lp-milp-api-cli/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-lp-milp-api-cli -version: "26.04.00" +version: "26.06.00" description: LP and MILP with cuOpt — CLI only (MPS files, cuopt_cli). Use when the user is solving from MPS via command line. --- diff --git a/skills/cuopt-lp-milp-api-python/SKILL.md b/skills/cuopt-lp-milp-api-python/SKILL.md index a7cd9a59f2..e8435867db 100644 --- a/skills/cuopt-lp-milp-api-python/SKILL.md +++ b/skills/cuopt-lp-milp-api-python/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-lp-milp-api-python -version: "26.04.00" +version: "26.06.00" description: Solve Linear Programming (LP) and Mixed-Integer Linear Programming (MILP) with the Python API. Use when the user asks about optimization with linear constraints, integer variables, scheduling, resource allocation, facility location, or production planning. --- diff --git a/skills/cuopt-qp-api-c/SKILL.md b/skills/cuopt-qp-api-c/SKILL.md index bc1efb63d3..85014b81fd 100644 --- a/skills/cuopt-qp-api-c/SKILL.md +++ b/skills/cuopt-qp-api-c/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-qp-api-c -version: "26.04.00" +version: "26.06.00" description: Quadratic Programming (QP) with cuOpt — C API. Use when the user is embedding QP in C/C++. --- diff --git a/skills/cuopt-qp-api-cli/SKILL.md b/skills/cuopt-qp-api-cli/SKILL.md index 5f8a8e848a..7aec559126 100644 --- a/skills/cuopt-qp-api-cli/SKILL.md +++ b/skills/cuopt-qp-api-cli/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-qp-api-cli -version: "26.04.00" +version: "26.06.00" description: QP with cuOpt — CLI (e.g. cuopt_cli with QP-capable input). Use when the user is solving QP from the command line. --- diff --git a/skills/cuopt-qp-api-python/SKILL.md b/skills/cuopt-qp-api-python/SKILL.md index b85b9e3db2..39533aaeca 100644 --- a/skills/cuopt-qp-api-python/SKILL.md +++ b/skills/cuopt-qp-api-python/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-qp-api-python -version: "26.04.00" +version: "26.06.00" description: Quadratic Programming (QP) with cuOpt — Python API only (beta). Use when the user is building or solving QP in Python. --- diff --git a/skills/cuopt-routing-api-python/SKILL.md b/skills/cuopt-routing-api-python/SKILL.md index d8bf736f8f..c386107241 100644 --- a/skills/cuopt-routing-api-python/SKILL.md +++ b/skills/cuopt-routing-api-python/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-routing-api-python -version: "26.04.00" +version: "26.06.00" description: Vehicle routing (VRP, TSP, PDP) with cuOpt — Python API only. Use when the user is building or solving routing in Python. --- diff --git a/skills/cuopt-server-api-python/SKILL.md b/skills/cuopt-server-api-python/SKILL.md index b340e9883f..7d6ed175dd 100644 --- a/skills/cuopt-server-api-python/SKILL.md +++ b/skills/cuopt-server-api-python/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-server-api-python -version: "26.04.00" +version: "26.06.00" description: cuOpt REST server — start server, endpoints, Python/curl client examples. Use when the user is deploying or calling the REST API. --- diff --git a/skills/cuopt-server-common/SKILL.md b/skills/cuopt-server-common/SKILL.md index f23c9c4a5f..cc2a3728d5 100644 --- a/skills/cuopt-server-common/SKILL.md +++ b/skills/cuopt-server-common/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-server-common -version: "26.04.00" +version: "26.06.00" description: cuOpt REST server — what it does and how requests flow. Domain concepts; no deploy or client code. --- diff --git a/skills/cuopt-user-rules/SKILL.md b/skills/cuopt-user-rules/SKILL.md index 0777b9af15..87734f72a2 100644 --- a/skills/cuopt-user-rules/SKILL.md +++ b/skills/cuopt-user-rules/SKILL.md @@ -1,6 +1,6 @@ --- name: cuopt-user-rules -version: "26.04.00" +version: "26.06.00" description: Base behavior rules for using NVIDIA cuOpt. Read this FIRST before any cuOpt user task (routing, LP/MILP, QP, installation, server). Covers handling incomplete questions, clarifying data requirements, verifying understanding, and running commands safely. --- diff --git a/skills/lp-milp-formulation/SKILL.md b/skills/lp-milp-formulation/SKILL.md index 64431a04c4..e429282033 100644 --- a/skills/lp-milp-formulation/SKILL.md +++ b/skills/lp-milp-formulation/SKILL.md @@ -1,6 +1,6 @@ --- name: lp-milp-formulation -version: "26.04.00" +version: "26.06.00" description: LP/MILP concepts and going from problem text to formulation. What LP/MILP are, required formulation questions, typical modeling elements, and how to parse problem statements (parameters, constraints, decisions, objective). --- diff --git a/skills/qp-formulation/SKILL.md b/skills/qp-formulation/SKILL.md index c87b887fbc..60aed00ede 100644 --- a/skills/qp-formulation/SKILL.md +++ b/skills/qp-formulation/SKILL.md @@ -1,6 +1,6 @@ --- name: qp-formulation -version: "26.04.00" +version: "26.06.00" description: Quadratic Programming (QP) — problem form and constraints. Domain concepts; no API or interface. QP is beta. --- diff --git a/skills/routing-formulation/SKILL.md b/skills/routing-formulation/SKILL.md index 4ab8d6419d..9cf8060cdf 100644 --- a/skills/routing-formulation/SKILL.md +++ b/skills/routing-formulation/SKILL.md @@ -1,6 +1,6 @@ --- name: routing-formulation -version: "26.04.00" +version: "26.06.00" description: Vehicle routing (VRP, TSP, PDP) — problem types and data requirements. Domain concepts; no API or interface. --- diff --git a/skills/skill-evolution/SKILL.md b/skills/skill-evolution/SKILL.md index d77fba1a3f..f3605795b7 100644 --- a/skills/skill-evolution/SKILL.md +++ b/skills/skill-evolution/SKILL.md @@ -1,6 +1,6 @@ --- name: skill-evolution -version: "26.04.00" +version: "26.06.00" description: After solving a non-trivial problem, detect generalizable learnings and propose skill updates so future interactions benefit automatically. Always active — applies to every interaction. --- @@ -182,7 +182,7 @@ When skill evolution creates an entirely new skill directory, add `origin: skill ```yaml --- name: new-skill-name -version: "26.04.00" +version: "26.06.00" description: ... origin: skill-evolution --- diff --git a/sonar-project.properties b/sonar-project.properties index ae8d6bd25c..7dafbc9969 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -5,6 +5,6 @@ sonar.projectKey=GPUSW_cuOpt_Nvidia-cuOpt_cuopt sonar.projectName=NVIDIA cuOpt sonar.projectVersion=1.0 - +sonar.host.url=https://sonar.nvidia.com # Source code location sonar.sources=. diff --git a/sonarqube/sonar-branches.txt b/sonarqube/sonar-branches.txt index a75ecac679..14fe38226d 100644 --- a/sonarqube/sonar-branches.txt +++ b/sonarqube/sonar-branches.txt @@ -5,7 +5,7 @@ # Main development branches main -release/26.02 +release/26.04 # Add release branches as needed # release/v1.0