Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,70 @@ jobs:
run: |
cd _build && ./test/test_xsimd

build-windows-clang-cl:
name: 'clang-cl x64 AVX2'
defaults:
run:
shell: bash {0}
runs-on: windows-2025
steps:
- name: Setup compiler
uses: ilammy/msvc-dev-cmd@v1
with:
arch: amd64
- name: Check clang-cl
run: |
command -v clang-cl
clang-cl --version
- name: Setup Ninja
run: |
python3 -m pip install --upgrade pip setuptools wheel
python3 -m pip install ninja
- name: Checkout xsimd
uses: actions/checkout@v3
- name: Setup
run: |
mkdir _build
cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_CXX_FLAGS="/arch:AVX2" -G Ninja
- name: Build
run: |
cd _build && cmake --build .
- name: Testing xsimd
run: |
cd _build && ./test/test_xsimd

build-windows-clang-cl-fast-math:
name: 'clang-cl x64 /fp:fast'
defaults:
run:
shell: bash {0}
runs-on: windows-2025
steps:
- name: Setup compiler
uses: ilammy/msvc-dev-cmd@v1
with:
arch: amd64
- name: Check clang-cl
run: |
command -v clang-cl
clang-cl --version
- name: Setup Ninja
run: |
python3 -m pip install --upgrade pip setuptools wheel
python3 -m pip install ninja
- name: Checkout xsimd
uses: actions/checkout@v3
- name: Setup
run: |
mkdir _build
cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=OFF -DBUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_CXX_FLAGS="/fp:fast" -G Ninja
- name: Build
run: |
cd _build && cmake --build .
- name: Testing xsimd
run: |
cd _build && ./test/test_xsimd

build-windows-arm64:
name: 'MSVC arm64'
defaults:
Expand Down
16 changes: 16 additions & 0 deletions include/xsimd/arch/common/xsimd_common_details.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,22 @@ namespace xsimd

namespace detail
{
// Optimization barrier: makes the compiler treat `x` as escaped and memory
// as clobbered, so value-changing transforms (notably -ffast-math
// reassociation) cannot fold the computation that produced `x` into later
// arithmetic. The empty asm emits zero instructions.
template <class T>
XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept
{
#if XSIMD_WITH_INLINE_ASM
// Taking x's address with a "memory" clobber pins x's current value
// before any subsequent use.
__asm__ volatile("" : : "r"(&x) : "memory");
#else
// No GNU-style inline asm available (e.g. plain MSVC cl): no barrier is
// emitted; cast silences the unused-parameter warning.
(void)x;
#endif
}

// Convenience overload taking an architecture tag (call sites pass `A {}`).
// The arch argument participates only in overload selection; the work is
// forwarded to the memory_barrier_tag implementation above.
template <class T, class A>
XSIMD_INLINE void reassociation_barrier(T& x, A const&) noexcept
{
detail::reassociation_barrier(x, memory_barrier_tag {});
}

template <class F, class A, class T, class... Batches>
XSIMD_INLINE batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
{
Expand Down
43 changes: 30 additions & 13 deletions include/xsimd/arch/common/xsimd_common_math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,8 @@ namespace xsimd
{
batch_type k = nearbyint(a);
x = (a - k) * constants::log_2<batch_type>();
// Keep the reduced exponent offset from being reassociated before finalize().
detail::reassociation_barrier(x, A {});
return k;
}

Expand Down Expand Up @@ -937,7 +939,11 @@ namespace xsimd
template <class A, class T>
XSIMD_INLINE batch<T, A> exp10(batch<T, A> const& self, requires_arch<common>) noexcept
{
return detail::exp<detail::exp10_tag>(self);
using batch_type = batch<T, A>;
batch_type out = detail::exp<detail::exp10_tag>(self);
// Prevent -ffast-math from folding the whole exp10 batch path for literal inputs.
detail::reassociation_barrier(out, A {});
return out;
}

// exp2
Expand Down Expand Up @@ -1494,6 +1500,8 @@ namespace xsimd
batch_type R = t2 + t1;
batch_type hfsq = batch_type(0.5) * f * f;
batch_type dk = to_float(k);
// Keep the compensated k -> float conversion intact before scaling by split log(2).
detail::reassociation_barrier(dk, A {});
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>()) - hfsq + f);
#ifdef __FAST_MATH__
return r;
Expand Down Expand Up @@ -1525,6 +1533,8 @@ namespace xsimd
hx += 0x3ff00000 - 0x3fe6a09e;
k += (hx >> 20) - 0x3ff;
batch_type dk = to_float(k);
// Keep the compensated k -> double conversion intact before scaling by split log(2).
detail::reassociation_barrier(dk, A {});
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));

Expand Down Expand Up @@ -1705,6 +1715,8 @@ namespace xsimd
batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
batch_type R = t2 + t1;
batch_type dk = to_float(k);
// Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
detail::reassociation_barrier(dk, A {});
batch_type hfsq = batch_type(0.5) * f * f;
batch_type hibits = f - hfsq;
hibits &= ::xsimd::bitwise_cast<float>(i_type(0xfffff000));
Expand Down Expand Up @@ -1752,10 +1764,12 @@ namespace xsimd
#endif
hx += 0x3ff00000 - 0x3fe6a09e;
k += (hx >> 20) - 0x3ff;
batch_type dk = to_float(k);
// Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
detail::reassociation_barrier(dk, A {});
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
batch_type f = --x;
batch_type dk = to_float(k);
batch_type s = f / (batch_type(2.) + f);
batch_type z = s * s;
batch_type w = z * z;
Expand Down Expand Up @@ -1818,6 +1832,8 @@ namespace xsimd
batch_type R = t2 + t1;
batch_type hfsq = batch_type(0.5) * f * f;
batch_type dk = to_float(k);
// Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
detail::reassociation_barrier(dk, A {});
/* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
batch_type c = select(batch_bool_cast<float>(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf;
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
Expand Down Expand Up @@ -1853,6 +1869,8 @@ namespace xsimd
batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
batch_type R = t2 + t1;
batch_type dk = to_float(k);
// Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
detail::reassociation_barrier(dk, A {});
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, hfsq + R, dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
#ifdef __FAST_MATH__
return r;
Expand Down Expand Up @@ -1900,17 +1918,10 @@ namespace xsimd
batch_type s = bitofsign(self);
batch_type v = self ^ s;
batch_type t2n = constants::twotonmb<batch_type>();
// Under fast-math, reordering is possible and the compiler optimizes d
// to v. That's not what we want, so prevent compiler optimization here.
// FIXME: it may be better to emit a memory barrier here (?).
#ifdef __FAST_MATH__
batch_type d0 = v + t2n;
asm volatile("" ::"r"(&d0) : "memory");
// Prevent fast-math from collapsing (v + 2^n) - 2^n back to v.
detail::reassociation_barrier(d0.data, A {});
batch_type d = d0 - t2n;
#else
batch_type d0 = v + t2n;
batch_type d = d0 - t2n;
#endif
return s ^ select(v < t2n, d, v);
}
}
Expand Down Expand Up @@ -2192,12 +2203,18 @@ namespace xsimd
template <class A>
XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<common>) noexcept
{
return fnma(nearbyint(self / other), other, self);
batch<float, A> q = nearbyint(self / other);
// Prevent fast-math from pulling the later multiply back through the rounded quotient.
detail::reassociation_barrier(q, A {});
return fnma(q, other, self);
}
template <class A>
XSIMD_INLINE batch<double, A> remainder(batch<double, A> const& self, batch<double, A> const& other, requires_arch<common>) noexcept
{
return fnma(nearbyint(self / other), other, self);
batch<double, A> q = nearbyint(self / other);
// Prevent fast-math from pulling the later multiply back through the rounded quotient.
detail::reassociation_barrier(q, A {});
return fnma(q, other, self);
}
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE batch<T, A> remainder(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept
Expand Down
15 changes: 15 additions & 0 deletions include/xsimd/arch/common/xsimd_common_trigo.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -550,34 +550,49 @@ namespace xsimd
else if (all(x <= constants::pio2<B>()))
{
auto test = x > constants::pio4<B>();
// Keep the compensated pio2 subtraction sequence ordered under -ffast-math.
xr = x - constants::pio2_1<B>();
detail::reassociation_barrier(xr, typename B::arch_type {});
xr -= constants::pio2_2<B>();
detail::reassociation_barrier(xr, typename B::arch_type {});
xr -= constants::pio2_3<B>();
detail::reassociation_barrier(xr, typename B::arch_type {});
xr = select(test, xr, x);
return select(test, B(1.), B(0.));
}
else if (all(x <= constants::twentypi<B>()))
{
B xi = nearbyint(x * constants::twoopi<B>());
// Preserve the quadrant selection and compensated range reduction under -ffast-math.
detail::reassociation_barrier(xi, typename B::arch_type {});
xr = fnma(xi, constants::pio2_1<B>(), x);
detail::reassociation_barrier(xr, typename B::arch_type {});
xr -= xi * constants::pio2_2<B>();
detail::reassociation_barrier(xr, typename B::arch_type {});
xr -= xi * constants::pio2_3<B>();
detail::reassociation_barrier(xr, typename B::arch_type {});
return quadrant(xi);
}
else if (all(x <= constants::mediumpi<B>()))
{
B fn = nearbyint(x * constants::twoopi<B>());
// Keep the multi-term range reduction from being reassociated across correction terms.
detail::reassociation_barrier(fn, typename B::arch_type {});
B r = x - fn * constants::pio2_1<B>();
detail::reassociation_barrier(r, typename B::arch_type {});
B w = fn * constants::pio2_1t<B>();
B t = r;
w = fn * constants::pio2_2<B>();
r = t - w;
detail::reassociation_barrier(r, typename B::arch_type {});
w = fn * constants::pio2_2t<B>() - ((t - r) - w);
t = r;
w = fn * constants::pio2_3<B>();
r = t - w;
detail::reassociation_barrier(r, typename B::arch_type {});
w = fn * constants::pio2_3t<B>() - ((t - r) - w);
xr = r - w;
detail::reassociation_barrier(xr, typename B::arch_type {});
return quadrant(fn);
}
else
Expand Down
11 changes: 3 additions & 8 deletions include/xsimd/arch/xsimd_avx2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -554,11 +554,8 @@ namespace xsimd
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
// The asm barrier forces f into a register before the add, blocking
// the reorder. It emits zero instructions.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
// Barrier the intermediate before the final add.
detail::reassociation_barrier(f, A {});
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
}

Expand All @@ -575,9 +572,7 @@ namespace xsimd
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
detail::reassociation_barrier(f, A {});
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
}
}
Expand Down
16 changes: 15 additions & 1 deletion include/xsimd/arch/xsimd_common_fwd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ namespace xsimd
class batch;
template <class T, class A>
class batch_bool;
namespace kernel
{
namespace detail
{
// Dispatch tag selecting the inline-asm barrier overload of
// detail::reassociation_barrier (implemented in xsimd_common_details.hpp).
struct memory_barrier_tag
{
};
}
}
template <class T, class A, T... Vs>
struct batch_constant;
template <class T, class A, bool... Vs>
Expand Down Expand Up @@ -101,6 +110,12 @@ namespace xsimd
// Forward declarations for pack-level helpers
namespace detail
{
template <class T>
XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept;

template <class T, class A>
XSIMD_INLINE void reassociation_barrier(T& x, A const&) noexcept;

template <typename T, T... Vs>
XSIMD_INLINE constexpr bool is_identity() noexcept;
template <typename T, class A, T... Vs>
Expand All @@ -115,7 +130,6 @@ namespace xsimd
XSIMD_INLINE constexpr bool is_only_from_lo(batch_constant<T, A, Vs...>) noexcept;
template <typename T, class A, T... Vs>
XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant<T, A, Vs...>) noexcept;

}
}
}
Expand Down
4 changes: 4 additions & 0 deletions include/xsimd/arch/xsimd_sse2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,8 @@ namespace xsimd
__m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
__m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
// Prevent -ffast-math from reassociating (xH-C)+xL into xH+(xL-C).
detail::reassociation_barrier(f, A {});
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}

Expand All @@ -730,6 +732,8 @@ namespace xsimd
__m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
__m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
// Prevent -ffast-math from reassociating (xH-C)+xL into xH+(xL-C).
detail::reassociation_barrier(f, A {});
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}

Expand Down
11 changes: 3 additions & 8 deletions include/xsimd/arch/xsimd_sse4_1.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,8 @@ namespace xsimd
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
// The asm barrier forces f into a register before the add, blocking
// the reorder. It emits zero instructions.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
// Barrier the intermediate before the final add.
detail::reassociation_barrier(f, A {});
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}

Expand All @@ -81,9 +78,7 @@ namespace xsimd
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
detail::reassociation_barrier(f, A {});
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}
}
Expand Down
21 changes: 21 additions & 0 deletions include/xsimd/config/xsimd_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,27 @@
#define XSIMD_TARGET_X86 0
#endif

/**
* @ingroup xsimd_config_macro
*
* Set to 1 if GNU-style inline assembly is available, to 0 otherwise.
*/
/* Use __clang__ || __GNUC__ for GNU-style inline asm. clang-cl runs in
* MSVC-compatibility mode and does not define __GNUC__ by default, but it
still defines __clang__. Clang documents asm/__asm__ support and broad
* GCC-extension compatibility:
* https://clang.llvm.org/docs/LanguageExtensions.html
* Clang only emits __GNUC__ when GNUCVersion != 0:
* https://raw.githubusercontent.com/llvm/llvm-project/main/clang/lib/Frontend/InitPreprocessor.cpp
* and GNUCVersion defaults to 0:
* https://raw.githubusercontent.com/llvm/llvm-project/main/clang/include/clang/Basic/LangOptions.def
*/
#if defined(__clang__) || defined(__GNUC__)
#define XSIMD_WITH_INLINE_ASM 1
#else
#define XSIMD_WITH_INLINE_ASM 0
#endif

/**
* @ingroup xsimd_config_macro
*
Expand Down
Loading
Loading