Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions include/boost/int128/detail/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#endif

// Use 128-bit integers
#if defined(BOOST_HAS_INT128) || (defined(__SIZEOF_INT128__) && !defined(_MSC_VER)) && !defined(BOOST_INT128_NO_BUILTIN_INT128)
#if defined(BOOST_HAS_INT128) || (defined(__SIZEOF_INT128__) && !defined(_MSC_VER)) && !defined(BOOST_INT128_NO_BUILTIN_INT128) && !defined(__NVCC__)

#define BOOST_INT128_HAS_INT128

Expand All @@ -37,7 +37,7 @@ using builtin_u128 = unsigned __int128;
} // namespace int128
} // namespace boost

#elif __has_include(<__msvc_int128.hpp>) && _MSVC_LANG >= 202002L
#elif __has_include(<__msvc_int128.hpp>) && _MSVC_LANG >= 202002L && !defined(__NVCC__) && !defined(__CUDACC__)

#ifndef BOOST_INT128_BUILD_MODULE
#include <__msvc_int128.hpp>
Expand All @@ -58,6 +58,10 @@ using builtin_u128 = std::_Unsigned128;
} // namespace int128
} // namespace boost

#else

#define BOOST_INT128_NO_BUILTIN_INT128

#endif // builtin 128-bit detection

// Determine endianness
Expand Down
12 changes: 6 additions & 6 deletions include/boost/int128/detail/int128_imp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2215,7 +2215,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_ad

return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) + static_cast<detail::builtin_i128>(rhs));

#elif defined(BOOST_INT128_HAS_BUILTIN_ADD_OVERFLOW)
#elif defined(BOOST_INT128_HAS_BUILTIN_ADD_OVERFLOW) && !defined(__NVCC__)

std::uint64_t result_low {};
std::uint64_t result_high {};
Expand All @@ -2224,7 +2224,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_ad

return int128_t{static_cast<std::int64_t>(result_high), result_low};

#elif defined(_M_AMD64) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
#elif defined(_M_AMD64) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !defined(__NVCC__)

if (BOOST_INT128_IS_CONSTANT_EVALUATED(lhs))
{
Expand Down Expand Up @@ -2273,11 +2273,11 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_su

return int128_t{static_cast<std::int64_t>(result_high), result_low};

#elif defined(__aarch64__) && !defined(__APPLE__)
#elif defined(__aarch64__) && !defined(__APPLE__) && defined(BOOST_INT128_HAS_INT128)

return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) - static_cast<detail::builtin_i128>(rhs));

#elif defined(_M_AMD64) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
#elif defined(_M_AMD64) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !defined(__NVCC__)

if (BOOST_INT128_IS_CONSTANT_EVALUATED(lhs))
{
Expand Down Expand Up @@ -2690,7 +2690,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mu

return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) * static_cast<detail::builtin_i128>(rhs));

#elif defined(_M_AMD64) && !defined(__GNUC__) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
#elif defined(_M_AMD64) && !defined(__GNUC__) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !defined(__NVCC__)

if (BOOST_INT128_IS_CONSTANT_EVALUATED(rhs))
{
Expand All @@ -2701,7 +2701,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mu
return msvc_amd64_mul(lhs, rhs);
}

#elif (defined(_M_IX86) || defined(_M_ARM) || defined(__arm__)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
#elif (defined(_M_IX86) || defined(_M_ARM) || defined(__arm__)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !defined(__NVCC__)

if (BOOST_INT128_IS_CONSTANT_EVALUATED(rhs))
{
Expand Down
15 changes: 15 additions & 0 deletions test/cuda_jamfile
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,18 @@ run test_signed_ge.cu ;

run test_unsigned_cstdlib_div.cu ;
run test_signed_cstdlib_div.cu ;

run test_unsigned_add_sat.cu ;
run test_signed_add_sat.cu ;
run test_unsigned_sub_sat.cu ;
run test_signed_sub_sat.cu ;
run test_unsigned_mul_sat.cu ;
run test_signed_mul_sat.cu ;
run test_unsigned_div_sat.cu ;
run test_signed_div_sat.cu ;
run test_unsigned_gcd.cu ;
run test_signed_gcd.cu ;
run test_unsigned_lcm.cu ;
run test_signed_lcm.cu ;
run test_unsigned_midpoint.cu ;
run test_signed_midpoint.cu ;
106 changes: 106 additions & 0 deletions test/test_signed_add_sat.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// Copyright Matt Borland 2026.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#define BOOST_INT128_ALLOW_SIGN_CONVERSION

#include <iostream>
#include <vector>
#include <random>
#include <limits>
#include <boost/int128.hpp>
#include <boost/int128/numeric.hpp>
#include <boost/int128/random.hpp>
#include <boost/random/uniform_int_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

#include <cuda_runtime.h>

using test_type = boost::int128::int128_t;

__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;

if (i < numElements)
{
out[i] = boost::int128::add_sat(in[i], in2[i]);
}
}

int main(void)
{
std::mt19937_64 rng {42};

cudaError_t err = cudaSuccess;

int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

cuda_managed_ptr<test_type> input_vector(numElements);
cuda_managed_ptr<test_type> input_vector2(numElements);
cuda_managed_ptr<test_type> output_vector(numElements);

boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
for (std::size_t i = 0; i < numElements; ++i)
{
input_vector[i] = dist(rng);
input_vector2[i] = dist(rng);
}

int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

watch w;

cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();

std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;

err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}

std::vector<test_type> results;
results.reserve(numElements);
w.reset();
for (int i = 0; i < numElements; ++i)
{
results.push_back(boost::int128::add_sat(input_vector[i], input_vector2[i]));
}
double t = w.elapsed();

int fail_count = 0;
for (int i = 0; i < numElements; ++i)
{
if (output_vector[i] != results[i])
{
if (fail_count < 5)
{
std::cerr << "Result verification failed at element " << i << std::endl;
std::cerr << " input1 high: " << input_vector[i].high << " low: " << input_vector[i].low << std::endl;
std::cerr << " input2 high: " << input_vector2[i].high << " low: " << input_vector2[i].low << std::endl;
std::cerr << " GPU high: " << output_vector[i].high << " low: " << output_vector[i].low << std::endl;
std::cerr << " CPU high: " << results[i].high << " low: " << results[i].low << std::endl;
}
++fail_count;
}
}
if (fail_count > 0)
{
std::cerr << "Total failures: " << fail_count << " out of " << numElements << std::endl;
return EXIT_FAILURE;
}

std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";

return 0;
}
96 changes: 96 additions & 0 deletions test/test_signed_div_sat.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright Matt Borland 2026.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#define BOOST_INT128_ALLOW_SIGN_CONVERSION

#include <iostream>
#include <vector>
#include <random>
#include <limits>
#include <boost/int128.hpp>
#include <boost/int128/numeric.hpp>
#include <boost/int128/random.hpp>
#include <boost/random/uniform_int_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

#include <cuda_runtime.h>

using test_type = boost::int128::int128_t;

__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;

if (i < numElements)
{
out[i] = boost::int128::div_sat(in[i], in2[i]);
}
}

int main(void)
{
std::mt19937_64 rng {42};

cudaError_t err = cudaSuccess;

int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

cuda_managed_ptr<test_type> input_vector(numElements);
cuda_managed_ptr<test_type> input_vector2(numElements);
cuda_managed_ptr<test_type> output_vector(numElements);

boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
for (std::size_t i = 0; i < numElements; ++i)
{
input_vector[i] = dist(rng);
do
{
input_vector2[i] = dist(rng);
} while (input_vector2[i] == 0);
}

int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

watch w;

cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();

std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;

err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}

std::vector<test_type> results;
results.reserve(numElements);
w.reset();
for (int i = 0; i < numElements; ++i)
{
results.push_back(boost::int128::div_sat(input_vector[i], input_vector2[i]));
}
double t = w.elapsed();

for (int i = 0; i < numElements; ++i)
{
if (output_vector[i] != results[i])
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}

std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";

return 0;
}
93 changes: 93 additions & 0 deletions test/test_signed_gcd.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Copyright Matt Borland 2026.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#define BOOST_INT128_ALLOW_SIGN_CONVERSION

#include <iostream>
#include <vector>
#include <random>
#include <limits>
#include <boost/int128.hpp>
#include <boost/int128/numeric.hpp>
#include <boost/int128/random.hpp>
#include <boost/random/uniform_int_distribution.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"

#include <cuda_runtime.h>

using test_type = boost::int128::int128_t;

__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;

if (i < numElements)
{
out[i] = boost::int128::gcd(in[i], in2[i]);
}
}

int main(void)
{
std::mt19937_64 rng {42};

cudaError_t err = cudaSuccess;

int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;

cuda_managed_ptr<test_type> input_vector(numElements);
cuda_managed_ptr<test_type> input_vector2(numElements);
cuda_managed_ptr<test_type> output_vector(numElements);

boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
for (std::size_t i = 0; i < numElements; ++i)
{
input_vector[i] = dist(rng);
input_vector2[i] = dist(rng);
}

int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;

watch w;

cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();

std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;

err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}

std::vector<test_type> results;
results.reserve(numElements);
w.reset();
for (int i = 0; i < numElements; ++i)
{
results.push_back(boost::int128::gcd(input_vector[i], input_vector2[i]));
}
double t = w.elapsed();

for (int i = 0; i < numElements; ++i)
{
if (output_vector[i] != results[i])
{
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
return EXIT_FAILURE;
}
}

std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";

return 0;
}
Loading
Loading