cppalliance · mborland · Mar 11, 2026 · Mar 11, 2026 · Mar 12, 2026 · Mar 12, 2026
diff --git a/include/boost/int128/detail/config.hpp b/include/boost/int128/detail/config.hpp
@@ -10,7 +10,7 @@
 #endif
 
 // Use 128-bit integers
-#if defined(BOOST_HAS_INT128) || (defined(__SIZEOF_INT128__) && !defined(_MSC_VER)) && !defined(BOOST_INT128_NO_BUILTIN_INT128)
+#if defined(BOOST_HAS_INT128) || (defined(__SIZEOF_INT128__) && !defined(_MSC_VER)) && !defined(BOOST_INT128_NO_BUILTIN_INT128) && !defined(__NVCC__)
 
 #define BOOST_INT128_HAS_INT128
 
@@ -37,7 +37,7 @@ using builtin_u128 = unsigned __int128;
 } // namespace int128
 } // namespace boost
 
-#elif __has_include(<__msvc_int128.hpp>) && _MSVC_LANG >= 202002L
+#elif __has_include(<__msvc_int128.hpp>) && _MSVC_LANG >= 202002L && !defined(__NVCC__) && !defined(__CUDACC__)
 
 #ifndef BOOST_INT128_BUILD_MODULE
 #include <__msvc_int128.hpp>
@@ -58,6 +58,10 @@ using builtin_u128 = std::_Unsigned128;
 } // namespace int128
 } // namespace boost
 
+#else
+
+#define BOOST_INT128_NO_BUILTIN_INT128
+
 #endif // builtin 128-bit detection
 
 // Determine endianness

diff --git a/include/boost/int128/detail/int128_imp.hpp b/include/boost/int128/detail/int128_imp.hpp
@@ -2215,7 +2215,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_ad
 
     return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) + static_cast<detail::builtin_i128>(rhs));
 
-    #elif defined(BOOST_INT128_HAS_BUILTIN_ADD_OVERFLOW)
+    #elif defined(BOOST_INT128_HAS_BUILTIN_ADD_OVERFLOW) && !defined(__NVCC__)
 
     std::uint64_t result_low {};
     std::uint64_t result_high {};
@@ -2224,7 +2224,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_ad
 
     return int128_t{static_cast<std::int64_t>(result_high), result_low};
 
-    #elif defined(_M_AMD64) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
+    #elif defined(_M_AMD64) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !defined(__NVCC__)
 
     if (BOOST_INT128_IS_CONSTANT_EVALUATED(lhs))
     {
@@ -2273,11 +2273,11 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_su
 
     return int128_t{static_cast<std::int64_t>(result_high), result_low};
 
-    #elif defined(__aarch64__) && !defined(__APPLE__)
+    #elif defined(__aarch64__) && !defined(__APPLE__) && defined(BOOST_INT128_HAS_INT128)
 
     return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) - static_cast<detail::builtin_i128>(rhs));
 
-    #elif defined(_M_AMD64) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
+    #elif defined(_M_AMD64) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !defined(__NVCC__)
 
     if (BOOST_INT128_IS_CONSTANT_EVALUATED(lhs))
     {
@@ -2690,7 +2690,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mu
 
     return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) * static_cast<detail::builtin_i128>(rhs));
 
-    #elif defined(_M_AMD64) && !defined(__GNUC__) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
+    #elif defined(_M_AMD64) && !defined(__GNUC__) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !defined(__NVCC__)
 
     if (BOOST_INT128_IS_CONSTANT_EVALUATED(rhs))
     {
@@ -2701,7 +2701,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mu
         return msvc_amd64_mul(lhs, rhs);
     }
 
-    #elif (defined(_M_IX86) || defined(_M_ARM) || defined(__arm__)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
+    #elif (defined(_M_IX86) || defined(_M_ARM) || defined(__arm__)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !defined(__NVCC__)
 
     if (BOOST_INT128_IS_CONSTANT_EVALUATED(rhs))
     {

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
@@ -61,3 +61,18 @@ run test_signed_ge.cu ;
 
 run test_unsigned_cstdlib_div.cu ;
 run test_signed_cstdlib_div.cu ;
+
+run test_unsigned_add_sat.cu ;
+run test_signed_add_sat.cu ;
+run test_unsigned_sub_sat.cu ;
+run test_signed_sub_sat.cu ;
+run test_unsigned_mul_sat.cu ;
+run test_signed_mul_sat.cu ;
+run test_unsigned_div_sat.cu ;
+run test_signed_div_sat.cu ;
+run test_unsigned_gcd.cu ;
+run test_signed_gcd.cu ;
+run test_unsigned_lcm.cu ;
+run test_signed_lcm.cu ;
+run test_unsigned_midpoint.cu ;
+run test_signed_midpoint.cu ;
diff --git a/test/test_signed_add_sat.cu b/test/test_signed_add_sat.cu
@@ -0,0 +1,106 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::add_sat(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::add_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    int fail_count = 0;
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            if (fail_count < 5)
+            {
+                std::cerr << "Result verification failed at element " << i << std::endl;
+                std::cerr << "  input1 high: " << input_vector[i].high << " low: " << input_vector[i].low << std::endl;
+                std::cerr << "  input2 high: " << input_vector2[i].high << " low: " << input_vector2[i].low << std::endl;
+                std::cerr << "  GPU    high: " << output_vector[i].high << " low: " << output_vector[i].low << std::endl;
+                std::cerr << "  CPU    high: " << results[i].high << " low: " << results[i].low << std::endl;
+            }
+            ++fail_count;
+        }
+    }
+    if (fail_count > 0)
+    {
+        std::cerr << "Total failures: " << fail_count << " out of " << numElements << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_div_sat.cu b/test/test_signed_div_sat.cu
@@ -0,0 +1,96 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::div_sat(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        do
+        {
+            input_vector2[i] = dist(rng);
+        } while (input_vector2[i] == 0);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::div_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_gcd.cu b/test/test_signed_gcd.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::gcd(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::gcd(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}