diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt index fe354ded5..9b7477837 100644 --- a/src/VecSim/spaces/CMakeLists.txt +++ b/src/VecSim/spaces/CMakeLists.txt @@ -50,18 +50,40 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") list(APPEND OPTIMIZATIONS functions/AVX512F_BW_VL_VNNI.cpp) endif() + # F16C is VEX-encoded and requires AVX state, so it is only meaningful when the toolchain + # can also emit AVX/FMA. Mirrors the OPT_F16C macro condition in x86_64InstructionFlags.cmake. + set(_has_full_f16c FALSE) + if(CXX_F16C AND CXX_FMA AND CXX_AVX) + set(_has_full_f16c TRUE) + endif() + + # Base AVX2 / AVX2+FMA dispatcher TUs hold only kernels with no F16C dependency. + # SQ8↔FP16 kernels (which require F16C) live in sibling TUs functions/AVX2_F16C.cpp and + # functions/AVX2_FMA_F16C.cpp, compiled only when _has_full_f16c is true. if(CXX_AVX2) - message("Building with AVX2") - set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS -mavx2) + message("Building functions/AVX2.cpp with AVX2") + set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS "-mavx2") list(APPEND OPTIMIZATIONS functions/AVX2.cpp) endif() + if(CXX_AVX2 AND _has_full_f16c) + message("Building functions/AVX2_F16C.cpp with AVX2 and F16C") + set_source_files_properties(functions/AVX2_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mf16c") + list(APPEND OPTIMIZATIONS functions/AVX2_F16C.cpp) + endif() + if(CXX_AVX2 AND CXX_FMA) - message("Building with AVX2 and FMA") + message("Building functions/AVX2_FMA.cpp with AVX2 and FMA") set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma") list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp) endif() + if(CXX_AVX2 AND CXX_FMA AND _has_full_f16c) + message("Building functions/AVX2_FMA_F16C.cpp with AVX2, FMA, and F16C") + set_source_files_properties(functions/AVX2_FMA_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mf16c") + list(APPEND OPTIMIZATIONS functions/AVX2_FMA_F16C.cpp) + endif() + if(CXX_F16C AND CXX_FMA AND CXX_AVX) message("Building with CXX_F16C") set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx") @@ -81,11 +103,19 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") endif() if(CXX_SSE4) - message("Building with SSE4") - set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS -msse4.1) + message("Building functions/SSE4.cpp with SSE4.1") + set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS "-msse4.1") list(APPEND OPTIMIZATIONS functions/SSE4.cpp) endif() + # SSE4 SQ8↔FP16 kernels need F16C, which is VEX-encoded → require -mavx alongside -mf16c + # (mirrors the F16C.cpp recipe above). + if(CXX_SSE4 AND _has_full_f16c) + message("Building functions/SSE4_F16C.cpp with SSE4.1, AVX, and F16C") + set_source_files_properties(functions/SSE4_F16C.cpp PROPERTIES COMPILE_FLAGS "-msse4.1 -mavx -mf16c") + list(APPEND OPTIMIZATIONS functions/SSE4_F16C.cpp) + endif() + if(CXX_SSE) message("Building with SSE") set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse) diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h new file mode 100644 index 000000000..a4c1612ea --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include "VecSim/utils/alignment.h" + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * Asymmetric SQ8 (storage) ↔ FP16 (query) inner product using algebraic identity: + * IP(x, y) = Σ(x_i * y_i) + * ≈ Σ((min + delta * q_i) * y_i) + * = min * Σy_i + delta * Σ(q_i * y_i) + * = min * y_sum + delta * quantized_dot_product + * + * FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C); + * inner-loop arithmetic runs in FP32 with _mm256_fmadd_ps. + */ + +// 8-wide AVX2+FMA step: 8 SQ8 lanes + 8 FP16 lanes -> 8 FP32 fused-multiply-add. +static inline void SQ8_FP16_InnerProductStep_AVX2_FMA(const uint8_t *&pVect1, + const float16 *&pVect2, __m256 &sum256) { + __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast(pVect1)); + pVect1 += 8; + __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128); + __m256 v1_f = _mm256_cvtepi32_ps(v1_256); + + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVect2)); + __m256 v2_f = _mm256_cvtph_ps(v2_128); + pVect2 += 8; + + sum256 = _mm256_fmadd_ps(v1_f, v2_f, sum256); +} + +// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp. +// The residual block reads 8 SQ8 bytes and 16 FP16 bytes unconditionally; shorter blobs would +// under-read. +template // 0..15 +float SQ8_FP16_InnerProductImp_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); + const float16 *pVec2 = static_cast(pVec2v); + const uint8_t *pEnd1 = pVec1 + dimension; + + // Two independent accumulators break the FMA dependency chain so consecutive iterations + // can issue in parallel through both FMA ports. + __m256 sum_a = _mm256_setzero_ps(); + __m256 sum_b = _mm256_setzero_ps(); + + if constexpr (residual % 8) { + constexpr int mask = (1 << (residual % 8)) - 1; + + // Single-side mask is sufficient: SQ8 lanes beyond `residual` may hold garbage, but the + // FP16 query blend below forces those FP32 query lanes to 0, so garbage·0=0 contributes + // nothing to the dot product. SQ8 load is intentionally unmasked. + __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast(pVec1)); + pVec1 += residual % 8; + __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128); + __m256 v1_f = _mm256_cvtepi32_ps(v1_256); + + // FP16 side: load full 16-byte block (safe — dim >= 16 and metadata follows). + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); + __m256 v2_f = _mm256_cvtph_ps(v2_128); + v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask); + pVec2 += residual % 8; + + sum_a = _mm256_mul_ps(v1_f, v2_f); + } + + if constexpr (residual >= 8) { + // Route the half-residual chunk to the second accumulator for ILP. + SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b); + } + + do { + SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_a); + SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b); + } while (pVec1 < pEnd1); + + __m256 sum256 = _mm256_add_ps(sum_a, sum_b); + float quantized_dot = my_mm256_reduce_add_ps(sum256); + + const uint8_t *pVec1Base = static_cast(pVec1v); + const uint8_t *params_bytes = pVec1Base + dimension; + const float min_val = load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + + const float16 *pVec2Base = static_cast(pVec2v); + const auto *query_meta_bytes = reinterpret_cast(pVec2Base + dimension); + const float y_sum = load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + + return min_val * y_sum + delta * quantized_dot; +} + +template // 0..15 +float SQ8_FP16_InnerProductSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v, + size_t dimension) { + return 1.0f - SQ8_FP16_InnerProductImp_AVX2_FMA(pVec1v, pVec2v, dimension); +} + +template // 0..15 +float SQ8_FP16_CosineSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) { + return SQ8_FP16_InnerProductSIMD16_AVX2_FMA(pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h new file mode 100644 index 000000000..3a01d80f2 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include "VecSim/utils/alignment.h" + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * Asymmetric SQ8 (storage) ↔ FP16 (query) inner product using algebraic identity: + * IP(x, y) = Σ(x_i * y_i) + * ≈ Σ((min + delta * q_i) * y_i) + * = min * Σy_i + delta * Σ(q_i * y_i) + * = min * y_sum + delta * quantized_dot_product + * + * FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C); + * inner-loop arithmetic runs in FP32 with separate _mm256_mul_ps + _mm256_add_ps + * (no FMA tier — Haswell-era AVX2 without FMA support). + */ + +// 8-wide AVX2 step (no FMA): 8 SQ8 lanes + 8 FP16 lanes -> mul + add into sum. +static inline void SQ8_FP16_InnerProductStep_AVX2(const uint8_t *&pVect1, const float16 *&pVect2, + __m256 &sum256) { + __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast(pVect1)); + pVect1 += 8; + __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128); + __m256 v1_f = _mm256_cvtepi32_ps(v1_256); + + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVect2)); + __m256 v2_f = _mm256_cvtph_ps(v2_128); + pVect2 += 8; + + sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1_f, v2_f)); +} + +// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp. +// The residual block reads 8 SQ8 bytes and 16 FP16 bytes unconditionally; shorter blobs would +// under-read. +template // 0..15 +float SQ8_FP16_InnerProductImp_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); + const float16 *pVec2 = static_cast(pVec2v); + const uint8_t *pEnd1 = pVec1 + dimension; + + // Two independent accumulators break the mul→add dependency chain on Haswell-class CPUs + // without FMA, where the add cannot retire before the prior mul. + __m256 sum_a = _mm256_setzero_ps(); + __m256 sum_b = _mm256_setzero_ps(); + + if constexpr (residual % 8) { + constexpr int mask = (1 << (residual % 8)) - 1; + + // Single-side mask is sufficient: SQ8 lanes beyond `residual` may hold garbage, but the + // FP16 query blend below forces those FP32 query lanes to 0, so garbage·0=0 contributes + // nothing to the dot product. SQ8 load is intentionally unmasked. + __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast(pVec1)); + pVec1 += residual % 8; + __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128); + __m256 v1_f = _mm256_cvtepi32_ps(v1_256); + + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); + __m256 v2_f = _mm256_cvtph_ps(v2_128); + v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask); + pVec2 += residual % 8; + + sum_a = _mm256_mul_ps(v1_f, v2_f); + } + + if constexpr (residual >= 8) { + // Route the half-residual chunk to the second accumulator for ILP. + SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b); + } + + do { + SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_a); + SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b); + } while (pVec1 < pEnd1); + + __m256 sum256 = _mm256_add_ps(sum_a, sum_b); + float quantized_dot = my_mm256_reduce_add_ps(sum256); + + const uint8_t *pVec1Base = static_cast(pVec1v); + const uint8_t *params_bytes = pVec1Base + dimension; + const float min_val = load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + + const float16 *pVec2Base = static_cast(pVec2v); + const auto *query_meta_bytes = reinterpret_cast(pVec2Base + dimension); + const float y_sum = load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + + return min_val * y_sum + delta * quantized_dot; +} + +template // 0..15 +float SQ8_FP16_InnerProductSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) { + return 1.0f - SQ8_FP16_InnerProductImp_AVX2(pVec1v, pVec2v, dimension); +} + +template // 0..15 +float SQ8_FP16_CosineSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) { + return SQ8_FP16_InnerProductSIMD16_AVX2(pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h new file mode 100644 index 000000000..7ba9c0412 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include "VecSim/utils/alignment.h" +#include + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +// Helper: load 16 SQ8 + 16 FP16 lanes, widen both to FP32, fused-multiply-add into sum. +static inline void SQ8_FP16_InnerProductStep_AVX512(const uint8_t *&pVec1, const float16 *&pVec2, + __m512 &sum) { + // 16 uint8 -> 16 fp32 + __m128i v1_128 = _mm_loadu_si128(reinterpret_cast(pVec1)); + __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128); + __m512 v1_f = _mm512_cvtepi32_ps(v1_512); + + // 16 fp16 -> 16 fp32. _mm512_cvtph_ps is part of AVX512F. + __m256i v2_16 = _mm256_loadu_si256(reinterpret_cast(pVec2)); + __m512 v2_f = _mm512_cvtph_ps(v2_16); + + sum = _mm512_fmadd_ps(v1_f, v2_f, sum); + + pVec1 += 16; + pVec2 += 16; +} + +// Raw inner product Σ((min + delta * q_i) * y_i). Used by both InnerProduct/Cosine wrappers +// and by the L2 kernel. +// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp, which gates +// this. The residual block reads 16 SQ8 bytes and 32 FP16 bytes unconditionally; shorter blobs +// would under-read. +template // 0..15 +float SQ8_FP16_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); // SQ8 storage + const float16 *pVec2 = static_cast(pVec2v); // FP16 query + const uint8_t *pEnd1 = pVec1 + dimension; + + // Four independent accumulators break the FMA dependency chain so the inner loop can + // saturate both FMA ports on Sapphire Rapids / Zen 4. + __m512 sum0 = _mm512_setzero_ps(); + __m512 sum1 = _mm512_setzero_ps(); + __m512 sum2 = _mm512_setzero_ps(); + __m512 sum3 = _mm512_setzero_ps(); + + if constexpr (residual > 0) { + __mmask16 mask = (1U << residual) - 1; + + __m128i v1_128 = _mm_loadu_si128(reinterpret_cast(pVec1)); + __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128); + __m512 v1_f = _mm512_cvtepi32_ps(v1_512); + + // Safe to read the full 32-byte FP16 chunk: dim >= 16 and the FP16 metadata follows + // the lanes, so the load stays within the query blob. + __m256i v2_16 = _mm256_loadu_si256(reinterpret_cast(pVec2)); + __m512 v2_f = _mm512_cvtph_ps(v2_16); + + // Mask out unused lanes by folding the mask into the multiply. + sum0 = _mm512_maskz_mul_ps(mask, v1_f, v2_f); + + pVec1 += residual; + pVec2 += residual; + } + + // Main unrolled loop: 4 chunks of 16 lanes per iteration, one chunk per accumulator. + // Residual leaves `dim - residual` lanes remaining (a multiple of 16), so the + // pointer comparison stays exact. Compare via pointer subtraction (not + // `pVec1 + 64 <= pEnd1`) so we never form a pointer past one-past-the-end, + // which would be UB in C++ regardless of dereference. + while (static_cast(pEnd1 - pVec1) >= 64) { + SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum0); + SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum1); + SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum2); + SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum3); + } + + // Reduce the four accumulators into one. + __m512 sum = _mm512_add_ps(_mm512_add_ps(sum0, sum1), _mm512_add_ps(sum2, sum3)); + + // Tail: at most three remaining 16-lane chunks. + while (pVec1 < pEnd1) { + SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum); + } + + float quantized_dot = _mm512_reduce_add_ps(sum); + + // SQ8 metadata starts at byte offset `dimension`; for odd `dimension` it is not + // 4-byte aligned, so use load_unaligned. Mirrors the scalar SQ8_FP16_Impl pattern. + const uint8_t *pVec1Base = static_cast(pVec1v); + const uint8_t *params_bytes = pVec1Base + dimension; + const float min_val = load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + + // FP16 query metadata sits at byte offset 2*dimension; for odd `dimension` it is + // 2-byte aligned only. + const float16 *pVec2Base = static_cast(pVec2v); + const auto *query_meta_bytes = reinterpret_cast(pVec2Base + dimension); + const float y_sum = load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + + return min_val * y_sum + delta * quantized_dot; +} + +template // 0..15 +float SQ8_FP16_InnerProductSIMD16_AVX512F(const void *pVec1v, const void *pVec2v, + size_t dimension) { + return 1.0f - SQ8_FP16_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); +} + +template // 0..15 +float SQ8_FP16_CosineSIMD16_AVX512F(const void *pVec1v, const void *pVec2v, size_t dimension) { + // Cosine distance = 1 - IP for pre-normalised vectors. Aliases InnerProduct, matching the + // SQ8_FP32 pattern. + return SQ8_FP16_InnerProductSIMD16_AVX512F(pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8_FP16.h new file mode 100644 index 000000000..871a189dc --- /dev/null +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8_FP16.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include "VecSim/utils/alignment.h" + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * Asymmetric SQ8 (storage) ↔ FP16 (query) inner product using algebraic identity: + * IP(x, y) = Σ(x_i * y_i) + * ≈ Σ((min + delta * q_i) * y_i) + * = min * Σy_i + delta * Σ(q_i * y_i) + * = min * y_sum + delta * quantized_dot_product + * + * FP16 query lanes are widened to FP32 per 4-lane chunk via _mm_cvtph_ps (F16C); + * inner-loop arithmetic runs in FP32 with separate _mm_mul_ps + _mm_add_ps (SSE4 has no FMA). + */ + +// 4-wide SSE4+F16C step: 4 SQ8 lanes + 4 FP16 lanes -> mul + add into sum. +static inline void SQ8_FP16_InnerProductStep_SSE4(const uint8_t *&pVect1, const float16 *&pVect2, + __m128 &sum) { + // Alignment-safe 4-byte load of SQ8 lanes via load_unaligned (no strict-aliasing UB). + __m128i v1_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(load_unaligned(pVect1))); + pVect1 += 4; + __m128 v1_f = _mm_cvtepi32_ps(v1_i); + + __m128i v2_8 = _mm_loadl_epi64(reinterpret_cast(pVect2)); + __m128 v2_f = _mm_cvtph_ps(v2_8); + pVect2 += 4; + + sum = _mm_add_ps(sum, _mm_mul_ps(v1_f, v2_f)); +} + +// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp. +// Shorter blobs would underflow the residual ladder + final do-while loop. +template // 0..15 +float SQ8_FP16_InnerProductSIMD16_SSE4_IMP(const void *pVec1v, const void *pVec2v, + size_t dimension) { + const uint8_t *pVec1 = static_cast(pVec1v); + const float16 *pVec2 = static_cast(pVec2v); + const uint8_t *pEnd1 = pVec1 + dimension; + + // Two independent accumulators break the mul→add dependency chain (SSE4 lacks FMA). + __m128 sum_a = _mm_setzero_ps(); + __m128 sum_b = _mm_setzero_ps(); + + if constexpr (residual % 4) { + __m128 v1_f; + __m128 v2_f; + + if constexpr (residual % 4 == 3) { + v1_f = _mm_set_ps(0.0f, static_cast(pVec1[2]), static_cast(pVec1[1]), + static_cast(pVec1[0])); + v2_f = _mm_set_ps(0.0f, vecsim_types::FP16_to_FP32(pVec2[2]), + vecsim_types::FP16_to_FP32(pVec2[1]), + vecsim_types::FP16_to_FP32(pVec2[0])); + } else if constexpr (residual % 4 == 2) { + v1_f = + _mm_set_ps(0.0f, 0.0f, static_cast(pVec1[1]), static_cast(pVec1[0])); + v2_f = _mm_set_ps(0.0f, 0.0f, vecsim_types::FP16_to_FP32(pVec2[1]), + vecsim_types::FP16_to_FP32(pVec2[0])); + } else if constexpr (residual % 4 == 1) { + v1_f = _mm_set_ps(0.0f, 0.0f, 0.0f, static_cast(pVec1[0])); + v2_f = _mm_set_ps(0.0f, 0.0f, 0.0f, vecsim_types::FP16_to_FP32(pVec2[0])); + } + + pVec1 += residual % 4; + pVec2 += residual % 4; + + sum_a = _mm_mul_ps(v1_f, v2_f); + } + + // Alternate the residual-ladder steps across the two accumulators for ILP. + if constexpr (residual >= 4) { + SQ8_FP16_InnerProductStep_SSE4(pVec1, pVec2, sum_b); + } + if constexpr (residual >= 8) { + SQ8_FP16_InnerProductStep_SSE4(pVec1, pVec2, sum_a); + } + if constexpr (residual >= 12) { + SQ8_FP16_InnerProductStep_SSE4(pVec1, pVec2, sum_b); + } + + // Remaining lanes after the residual block are a multiple of 16, hence a multiple of 8, + // so two 4-lane steps per iteration consume the tail exactly. + do { + SQ8_FP16_InnerProductStep_SSE4(pVec1, pVec2, sum_a); + SQ8_FP16_InnerProductStep_SSE4(pVec1, pVec2, sum_b); + } while (pVec1 < pEnd1); + + __m128 sum = _mm_add_ps(sum_a, sum_b); + float PORTABLE_ALIGN16 TmpRes[4]; + _mm_store_ps(TmpRes, sum); + float quantized_dot = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + + const uint8_t *pVec1Base = static_cast(pVec1v); + const uint8_t *params_bytes = pVec1Base + dimension; + const float min_val = load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + + const float16 *pVec2Base = static_cast(pVec2v); + const auto *query_meta_bytes = reinterpret_cast(pVec2Base + dimension); + const float y_sum = load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + + return min_val * y_sum + delta * quantized_dot; +} + +template // 0..15 +float SQ8_FP16_InnerProductSIMD16_SSE4(const void *pVec1v, const void *pVec2v, size_t dimension) { + return 1.0f - SQ8_FP16_InnerProductSIMD16_SSE4_IMP(pVec1v, pVec2v, dimension); +} + +template // 0..15 +float SQ8_FP16_CosineSIMD16_SSE4(const void *pVec1v, const void *pVec2v, size_t dimension) { + return SQ8_FP16_InnerProductSIMD16_SSE4(pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 55979e25a..b57971b60 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -20,9 +20,12 @@ #include "VecSim/spaces/functions/AVX512BF16_VL.h" #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" +#include "VecSim/spaces/functions/AVX2_F16C.h" #include "VecSim/spaces/functions/AVX2_FMA.h" +#include "VecSim/spaces/functions/AVX2_FMA_F16C.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/SSE4.h" +#include "VecSim/spaces/functions/SSE4_F16C.h" #include "VecSim/spaces/functions/NEON.h" #include "VecSim/spaces/functions/NEON_DOTPROD.h" #include "VecSim/spaces/functions/NEON_HP.h" @@ -172,31 +175,106 @@ dist_func_t Cosine_SQ8_FP32_GetDistFunc(size_t dim, unsigned char *alignm } // SQ8-FP16: asymmetric inner product distance between SQ8 storage and FP16 query. -// SIMD chooser slots are added by P1b (MOD-15152) / P1c (MOD-15153); for now this always -// returns the scalar implementation. dist_func_t IP_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; if (alignment == nullptr) { alignment = &dummy_alignment; } - (void)dim; - (void)arch_opt; - return SQ8_FP16_InnerProduct; + + dist_func_t ret_dist_func = SQ8_FP16_InnerProduct; + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_X86_64 + if (dim < 16) { + return ret_dist_func; + } + // Alignment hints below refer to the SQ8 (first) operand per the GetDistFunc contract. + // AVX-512 tier only needs AVX-512F (cvtph_ps is part of AVX-512F, no VNNI/BW/VL required). +#ifdef OPT_AVX512F + if (features.avx512f) { + if (dim % 16 == 0) // SQ8 chunk = 16 bytes + *alignment = 16 * sizeof(uint8_t); + return Choose_SQ8_FP16_IP_implementation_AVX512F(dim); + } +#endif + // F16C is required by every non-AVX-512 SQ8↔FP16 tier (vcvtph2ps), so the guard is hoisted + // around all three. +#ifdef OPT_F16C +#ifdef OPT_AVX2_FMA + if (features.avx2 && features.fma3 && features.f16c) { + if (dim % 8 == 0) // SQ8 chunk = 8 bytes + *alignment = 8 * sizeof(uint8_t); + return Choose_SQ8_FP16_IP_implementation_AVX2_FMA(dim); + } +#endif +#ifdef OPT_AVX2 + if (features.avx2 && features.f16c) { + if (dim % 8 == 0) + *alignment = 8 * sizeof(uint8_t); + return Choose_SQ8_FP16_IP_implementation_AVX2(dim); + } +#endif +#ifdef OPT_SSE4 + // F16C is VEX-encoded — require AVX as well, matching the existing F16C/FP16 dispatcher. + if (features.sse4_1 && features.f16c && features.avx) { + if (dim % 4 == 0) + *alignment = 4 * sizeof(uint8_t); + return Choose_SQ8_FP16_IP_implementation_SSE4(dim); + } +#endif +#endif // OPT_F16C +#endif // x86_64 + return ret_dist_func; } // SQ8-FP16: asymmetric cosine distance between SQ8 storage and FP16 query. -// SIMD chooser slots are added by P1b (MOD-15152) / P1c (MOD-15153); for now this always -// returns the scalar implementation. dist_func_t Cosine_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; if (alignment == nullptr) { alignment = &dummy_alignment; } - (void)dim; - (void)arch_opt; - return SQ8_FP16_Cosine; + + dist_func_t ret_dist_func = SQ8_FP16_Cosine; + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_X86_64 + if (dim < 16) { + return ret_dist_func; + } +#ifdef OPT_AVX512F + if (features.avx512f) { + if (dim % 16 == 0) + *alignment = 16 * sizeof(uint8_t); + return Choose_SQ8_FP16_Cosine_implementation_AVX512F(dim); + } +#endif +#ifdef OPT_F16C +#ifdef OPT_AVX2_FMA + if (features.avx2 && features.fma3 && features.f16c) { + if (dim % 8 == 0) + *alignment = 8 * sizeof(uint8_t); + return Choose_SQ8_FP16_Cosine_implementation_AVX2_FMA(dim); + } +#endif +#ifdef OPT_AVX2 + if (features.avx2 && features.f16c) { + if (dim % 8 == 0) + *alignment = 8 * sizeof(uint8_t); + return Choose_SQ8_FP16_Cosine_implementation_AVX2(dim); + } +#endif +#ifdef OPT_SSE4 + if (features.sse4_1 && features.f16c && features.avx) { + if (dim % 4 == 0) + *alignment = 4 * sizeof(uint8_t); + return Choose_SQ8_FP16_Cosine_implementation_SSE4(dim); + } +#endif +#endif // OPT_F16C +#endif // x86_64 + return ret_dist_func; } // SQ8-to-SQ8 Inner Product distance function (both vectors are uint8 quantized with precomputed diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP16.h new file mode 100644 index 000000000..38809e9c2 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP16.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" +#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include "VecSim/utils/alignment.h" + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +template // 0..15 +float SQ8_FP16_L2SqrSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float ip = SQ8_FP16_InnerProductImp_AVX2_FMA(pVect1v, pVect2v, dimension); + + const uint8_t *pVect1 = static_cast(pVect1v); + const uint8_t *params_bytes = pVect1 + dimension; + const float x_sum_sq = load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + + const float16 *pVect2 = static_cast(pVect2v); + const auto *query_meta_bytes = reinterpret_cast(pVect2 + dimension); + const float y_sum_sq = + load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); + + return x_sum_sq + y_sum_sq - 2.0f * ip; +} diff --git a/src/VecSim/spaces/L2/L2_AVX2_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_AVX2_SQ8_FP16.h new file mode 100644 index 000000000..98bb29c05 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_AVX2_SQ8_FP16.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" +#include "VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include "VecSim/utils/alignment.h" + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +template // 0..15 +float SQ8_FP16_L2SqrSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float ip = SQ8_FP16_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); + + const uint8_t *pVect1 = static_cast(pVect1v); + const uint8_t *params_bytes = pVect1 + dimension; + const float x_sum_sq = load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + + const float16 *pVect2 = static_cast(pVect2v); + const auto *query_meta_bytes = reinterpret_cast(pVect2 + dimension); + const float y_sum_sq = + load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); + + return x_sum_sq + y_sum_sq - 2.0f * ip; +} diff --git a/src/VecSim/spaces/L2/L2_AVX512F_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_AVX512F_SQ8_FP16.h new file mode 100644 index 000000000..384870b21 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_AVX512F_SQ8_FP16.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include "VecSim/utils/alignment.h" + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +// L2² = x_sum_squares + y_sum_squares - 2 * IP(x, y), computed via the AVX-512 IP impl above. +template // 0..15 +float SQ8_FP16_L2SqrSIMD16_AVX512F(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float ip = SQ8_FP16_InnerProductImp_AVX512(pVect1v, pVect2v, dimension); + + const uint8_t *pVect1 = static_cast(pVect1v); + const uint8_t *params_bytes = pVect1 + dimension; + const float x_sum_sq = load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + + const float16 *pVect2 = static_cast(pVect2v); + const auto *query_meta_bytes = reinterpret_cast(pVect2 + dimension); + const float y_sum_sq = + load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); + + return x_sum_sq + y_sum_sq - 2.0f * ip; +} diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8_FP16.h new file mode 100644 index 000000000..75bbd46f8 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8_FP16.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_SSE4_SQ8_FP16.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include "VecSim/utils/alignment.h" + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +template // 0..15 +float SQ8_FP16_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float ip = SQ8_FP16_InnerProductSIMD16_SSE4_IMP(pVect1v, pVect2v, dimension); + + const uint8_t *pVect1 = static_cast(pVect1v); + const uint8_t *params_bytes = pVect1 + dimension; + const float x_sum_sq = load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + + const float16 *pVect2 = static_cast(pVect2v); + const auto *query_meta_bytes = reinterpret_cast(pVect2 + dimension); + const float y_sum_sq = + load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); + + return x_sum_sq + y_sum_sq - 2.0f * ip; +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index ba3dd7cab..43020399f 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -19,9 +19,12 @@ #include "VecSim/spaces/functions/AVX512FP16_VL.h" #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" +#include "VecSim/spaces/functions/AVX2_F16C.h" #include "VecSim/spaces/functions/AVX2_FMA.h" +#include "VecSim/spaces/functions/AVX2_FMA_F16C.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/SSE4.h" +#include "VecSim/spaces/functions/SSE4_F16C.h" #include "VecSim/spaces/functions/NEON.h" #include "VecSim/spaces/functions/NEON_DOTPROD.h" #include "VecSim/spaces/functions/NEON_HP.h" @@ -104,17 +107,56 @@ dist_func_t L2_SQ8_FP32_GetDistFunc(size_t dim, unsigned char *alignment, } // SQ8-FP16: asymmetric L2 distance between SQ8 storage and FP16 query. -// SIMD chooser slots are added by P1b (MOD-15152) / P1c (MOD-15153); for now this always -// returns the scalar implementation. dist_func_t L2_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; if (!alignment) { alignment = &dummy_alignment; } - (void)dim; - (void)arch_opt; - return SQ8_FP16_L2Sqr; + + dist_func_t ret_dist_func = SQ8_FP16_L2Sqr; + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_X86_64 + if (dim < 16) { + return ret_dist_func; + } + // Alignment hints below refer to the SQ8 (first) operand per the GetDistFunc contract. + // AVX-512 tier only needs AVX-512F (cvtph_ps is part of AVX-512F, no VNNI/BW/VL required). +#ifdef OPT_AVX512F + if (features.avx512f) { + if (dim % 16 == 0) + *alignment = 16 * sizeof(uint8_t); + return Choose_SQ8_FP16_L2_implementation_AVX512F(dim); + } +#endif + // F16C is required by every non-AVX-512 SQ8↔FP16 tier (vcvtph2ps), so the guard is hoisted + // around all three. +#ifdef OPT_F16C +#ifdef OPT_AVX2_FMA + if (features.avx2 && features.fma3 && features.f16c) { + if (dim % 8 == 0) + *alignment = 8 * sizeof(uint8_t); + return Choose_SQ8_FP16_L2_implementation_AVX2_FMA(dim); + } +#endif +#ifdef OPT_AVX2 + if (features.avx2 && features.f16c) { + if (dim % 8 == 0) + *alignment = 8 * sizeof(uint8_t); + return Choose_SQ8_FP16_L2_implementation_AVX2(dim); + } +#endif +#ifdef OPT_SSE4 + if (features.sse4_1 && features.f16c && features.avx) { + if (dim % 4 == 0) + *alignment = 4 * sizeof(uint8_t); + return Choose_SQ8_FP16_L2_implementation_SSE4(dim); + } +#endif +#endif // OPT_F16C +#endif // x86_64 + return ret_dist_func; } dist_func_t L2_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { diff --git a/src/VecSim/spaces/functions/AVX2_F16C.cpp b/src/VecSim/spaces/functions/AVX2_F16C.cpp new file mode 100644 index 000000000..3d298e81b --- /dev/null +++ b/src/VecSim/spaces/functions/AVX2_F16C.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#include "AVX2_F16C.h" +#include "VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h" +#include "VecSim/spaces/L2/L2_AVX2_SQ8_FP16.h" + +namespace spaces { + +#include "implementation_chooser.h" + +dist_func_t Choose_SQ8_FP16_IP_implementation_AVX2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_AVX2); + return ret_dist_func; +} +dist_func_t Choose_SQ8_FP16_Cosine_implementation_AVX2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_AVX2); + return ret_dist_func; +} +dist_func_t Choose_SQ8_FP16_L2_implementation_AVX2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_AVX2); + return ret_dist_func; +} + +#include "implementation_chooser_cleanup.h" + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX2_F16C.h b/src/VecSim/spaces/functions/AVX2_F16C.h new file mode 100644 index 000000000..95a171199 --- /dev/null +++ b/src/VecSim/spaces/functions/AVX2_F16C.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once + +#include "VecSim/spaces/spaces.h" + +// SQ8↔FP16 kernels for the AVX2 (no FMA) tier. Live in a sibling TU compiled only when the +// toolchain supports F16C (via `-mf16c`), so this header has no preprocessor guard. Callers +// still gate the calls themselves with `#ifdef OPT_F16C`. + +namespace spaces { + +dist_func_t Choose_SQ8_FP16_IP_implementation_AVX2(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_AVX2(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_AVX2(size_t dim); + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX2_FMA_F16C.cpp b/src/VecSim/spaces/functions/AVX2_FMA_F16C.cpp new file mode 100644 index 000000000..4e9dd8131 --- /dev/null +++ b/src/VecSim/spaces/functions/AVX2_FMA_F16C.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#include "AVX2_FMA_F16C.h" +#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h" +#include "VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP16.h" + +namespace spaces { + +#include "implementation_chooser.h" + +dist_func_t Choose_SQ8_FP16_IP_implementation_AVX2_FMA(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_AVX2_FMA); + return ret_dist_func; +} +dist_func_t Choose_SQ8_FP16_Cosine_implementation_AVX2_FMA(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_AVX2_FMA); + return ret_dist_func; +} +dist_func_t Choose_SQ8_FP16_L2_implementation_AVX2_FMA(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_AVX2_FMA); + return ret_dist_func; +} + +#include "implementation_chooser_cleanup.h" + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX2_FMA_F16C.h b/src/VecSim/spaces/functions/AVX2_FMA_F16C.h new file mode 100644 index 000000000..7943ff4eb --- /dev/null +++ b/src/VecSim/spaces/functions/AVX2_FMA_F16C.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once + +#include "VecSim/spaces/spaces.h" + +// SQ8↔FP16 kernels for the AVX2+FMA tier. Live in a sibling TU compiled only when the +// toolchain supports F16C (via `-mf16c`), so this header has no preprocessor guard. Callers +// still gate the calls themselves with `#ifdef OPT_F16C`. + +namespace spaces { + +dist_func_t Choose_SQ8_FP16_IP_implementation_AVX2_FMA(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_AVX2_FMA(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_AVX2_FMA(size_t dim); + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F.cpp b/src/VecSim/spaces/functions/AVX512F.cpp index e765f4c8b..feb261fb4 100644 --- a/src/VecSim/spaces/functions/AVX512F.cpp +++ b/src/VecSim/spaces/functions/AVX512F.cpp @@ -11,10 +11,12 @@ #include "VecSim/spaces/L2/L2_AVX512F_FP16.h" #include "VecSim/spaces/L2/L2_AVX512F_FP32.h" #include "VecSim/spaces/L2/L2_AVX512F_FP64.h" +#include "VecSim/spaces/L2/L2_AVX512F_SQ8_FP16.h" #include "VecSim/spaces/IP/IP_AVX512F_FP16.h" #include "VecSim/spaces/IP/IP_AVX512F_FP32.h" #include "VecSim/spaces/IP/IP_AVX512F_FP64.h" +#include "VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h" namespace spaces { @@ -56,6 +58,25 @@ dist_func_t Choose_FP16_L2_implementation_AVX512F(size_t dim) { return ret_dist_func; } +// SQ8↔FP16 kernels only use AVX-512F (cvtph_ps + FMA), so they register here rather than under +// the VNNI tier — CPUs with AVX-512F but no VNNI (Skylake-X, some Cascade Lake variants) can use +// these kernels. +dist_func_t Choose_SQ8_FP16_IP_implementation_AVX512F(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_AVX512F); + return ret_dist_func; +} +dist_func_t Choose_SQ8_FP16_Cosine_implementation_AVX512F(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_AVX512F); + return ret_dist_func; +} +dist_func_t Choose_SQ8_FP16_L2_implementation_AVX512F(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_AVX512F); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F.h b/src/VecSim/spaces/functions/AVX512F.h index fd36f312f..8d600f961 100644 --- a/src/VecSim/spaces/functions/AVX512F.h +++ b/src/VecSim/spaces/functions/AVX512F.h @@ -20,4 +20,9 @@ dist_func_t Choose_FP16_L2_implementation_AVX512F(size_t dim); dist_func_t Choose_FP32_L2_implementation_AVX512F(size_t dim); dist_func_t Choose_FP64_L2_implementation_AVX512F(size_t dim); +// SQ8↔FP16 kernels — only need AVX-512F, not VNNI/BW/VL. +dist_func_t Choose_SQ8_FP16_IP_implementation_AVX512F(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_AVX512F(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_AVX512F(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 3b8813b89..712bdda4e 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -75,6 +75,7 @@ dist_func_t Choose_SQ8_FP32_L2_implementation_AVX512F_BW_VL_VNNI(size_t d CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_L2SqrSIMD16_AVX512F_BW_VL_VNNI); return ret_dist_func; } + // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; diff --git a/src/VecSim/spaces/functions/SSE4_F16C.cpp b/src/VecSim/spaces/functions/SSE4_F16C.cpp new file mode 100644 index 000000000..91a11885f --- /dev/null +++ b/src/VecSim/spaces/functions/SSE4_F16C.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#include "SSE4_F16C.h" +#include "VecSim/spaces/IP/IP_SSE4_SQ8_FP16.h" +#include "VecSim/spaces/L2/L2_SSE4_SQ8_FP16.h" + +namespace spaces { + +#include "implementation_chooser.h" + +dist_func_t Choose_SQ8_FP16_IP_implementation_SSE4(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_SSE4); + return ret_dist_func; +} +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SSE4(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_SSE4); + return ret_dist_func; +} +dist_func_t Choose_SQ8_FP16_L2_implementation_SSE4(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_SSE4); + return ret_dist_func; +} + +#include "implementation_chooser_cleanup.h" + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/SSE4_F16C.h b/src/VecSim/spaces/functions/SSE4_F16C.h new file mode 100644 index 000000000..2459c216c --- /dev/null +++ b/src/VecSim/spaces/functions/SSE4_F16C.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once + +#include "VecSim/spaces/spaces.h" + +// SQ8↔FP16 kernels for the SSE4 tier. Live in a sibling TU compiled only when the toolchain +// supports F16C (via `-mf16c -mavx`), so this header has no preprocessor guard. Callers +// still gate the calls themselves with `#ifdef OPT_F16C`. + +namespace spaces { + +dist_func_t Choose_SQ8_FP16_IP_implementation_SSE4(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SSE4(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_SSE4(size_t dim); + +} // namespace spaces diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index d99bcc4ca..2303eac0a 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -24,9 +24,12 @@ #include "VecSim/spaces/functions/AVX512BF16_VL.h" #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" +#include "VecSim/spaces/functions/AVX2_F16C.h" #include "VecSim/spaces/functions/AVX2_FMA.h" +#include "VecSim/spaces/functions/AVX2_FMA_F16C.h" #include "VecSim/spaces/functions/F16C.h" #include "VecSim/spaces/functions/SSE4.h" +#include "VecSim/spaces/functions/SSE4_F16C.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/SSE.h" #include "VecSim/spaces/functions/NEON.h" diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp index 2133a047e..ba3030064 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp @@ -15,8 +15,9 @@ using float16 = vecsim_types::float16; /** * SQ8-to-FP16 benchmarks: SQ8 quantized storage with FP16 query. - * Only naive (scalar) benchmarks are registered for now; SIMD chooser symbols are added - * by P1b (MOD-15152, x86) and P1c (MOD-15153, ARM). + * Registers the naive (scalar) baseline plus per-ISA SIMD variants (x86: AVX-512 / AVX2+FMA / + * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels + * land via MOD-14972. */ class BM_VecSimSpaces_SQ8_FP16 : public benchmark::Fixture { protected: @@ -50,8 +51,41 @@ class BM_VecSimSpaces_SQ8_FP16 : public benchmark::Fixture { } }; -// Naive (scalar) algorithms. SIMD chooser slots will be added by P1b (MOD-15152) and -// P1c (MOD-15153), following the SQ8_FP32 layout in bm_spaces_sq8_fp32.cpp. +#ifdef CPU_FEATURES_ARCH_X86_64 +cpu_features::X86Features opt = cpu_features::GetX86Info().features; + +// AVX-512F is sufficient — _mm512_cvtph_ps is part of AVX-512F, no F16C/VNNI/BW/VL needed. +#ifdef OPT_AVX512F +bool avx512f_supported = opt.avx512f; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX512F, 16, avx512f_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX512F, 16, + avx512f_supported); +#endif + +#ifdef OPT_F16C +#ifdef OPT_AVX2_FMA +bool avx2_fma3_f16c_supported = opt.avx2 && opt.fma3 && opt.f16c; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX2_FMA, 16, + avx2_fma3_f16c_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX2_FMA, 16, + avx2_fma3_f16c_supported); +#endif + +#ifdef OPT_AVX2 +bool avx2_f16c_supported = opt.avx2 && opt.f16c; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX2, 16, avx2_f16c_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX2, 16, avx2_f16c_supported); +#endif + +#ifdef OPT_SSE4 +bool sse4_f16c_supported = opt.sse4_1 && opt.f16c && opt.avx; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SSE4, 16, sse4_f16c_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SSE4, 16, sse4_f16c_supported); +#endif +#endif // OPT_F16C +#endif // x86_64 + +// Naive (scalar) baseline — always registered as the comparison anchor. INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, InnerProduct, 16); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, Cosine, 16); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index a6bb88cef..b880b6f13 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -32,9 +32,12 @@ #include "VecSim/spaces/functions/AVX512FP16_VL.h" #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" +#include "VecSim/spaces/functions/AVX2_F16C.h" #include "VecSim/spaces/functions/AVX2_FMA.h" +#include "VecSim/spaces/functions/AVX2_FMA_F16C.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/SSE4.h" +#include "VecSim/spaces/functions/SSE4_F16C.h" #include "VecSim/spaces/functions/F16C.h" #include "VecSim/spaces/functions/NEON.h" #include "VecSim/spaces/functions/NEON_DOTPROD.h" @@ -560,9 +563,8 @@ TEST_F(SpacesTest, GetDistFuncSQ8Asymmetric) { } TEST_F(SpacesTest, GetDistFuncSQ8FP16Asymmetric) { - // SQ8 storage with FP16 query (asymmetric) - should return scalar SQ8_FP16 functions. - // SIMD chooser slots are added by P1b (MOD-15152) / P1c (MOD-15153); for now the - // dispatcher returns the scalar implementations regardless of dim or arch. + // SQ8 storage with FP16 query (asymmetric) - should return SQ8_FP16 functions. + // Per-ISA dispatcher walk coverage lives in the SQ8_FP16 SpacesOptimizationTest below. size_t dim = 128; auto l2_func = spaces::GetDistFunc(VecSimMetric_L2, dim, nullptr); auto ip_func = spaces::GetDistFunc(VecSimMetric_IP, dim, nullptr); @@ -570,9 +572,6 @@ TEST_F(SpacesTest, GetDistFuncSQ8FP16Asymmetric) { ASSERT_EQ(l2_func, L2_SQ8_FP16_GetDistFunc(dim, nullptr)); ASSERT_EQ(ip_func, IP_SQ8_FP16_GetDistFunc(dim, nullptr)); ASSERT_EQ(cosine_func, Cosine_SQ8_FP16_GetDistFunc(dim, nullptr)); - ASSERT_EQ(l2_func, SQ8_FP16_L2Sqr); - ASSERT_EQ(ip_func, SQ8_FP16_InnerProduct); - ASSERT_EQ(cosine_func, SQ8_FP16_Cosine); } #ifdef CPU_FEATURES_ARCH_X86_64 @@ -3000,8 +2999,9 @@ TEST(SQ8_FP32_EdgeCases, CosineExtremeValuesTest) { // Parameterized tests that verify the scalar SQ8_FP16 kernels against the not-optimized // baseline across multiple dimensions, including odd dimensions and SIMD-boundary residues. -// SIMD chooser slots are added by P1b (MOD-15152) / P1c (MOD-15153); the dispatcher always -// returns the scalar implementation for now. +// The SIMD-tier dispatcher coverage lives in SQ8_FP16_SpacesOptimizationTest below; this +// suite intentionally exercises the scalar reference directly to keep it as a fixed baseline +// the SIMD tiers are compared against. class SQ8_FP16_NoOptimizationSpacesTest : public testing::TestWithParam {}; TEST_P(SQ8_FP16_NoOptimizationSpacesTest, SQ8_FP16_L2SqrTest) { @@ -3070,10 +3070,318 @@ INSTANTIATE_TEST_SUITE_P(SQ8_FP16_NoOpt, SQ8_FP16_NoOptimizationSpacesTest, testing::Values(1, 5, 7, 8, 9, 15, 16, 17, 31, 32, 33, 47, 48, 49, 63, 64, 65, 127, 128)); +/* ======================== SQ8_FP16 SIMD optimisation tests ========================= */ + +// Walks down the x86 ISA tiers (AVX-512 → AVX2+FMA → AVX2 → SSE4 → scalar) and asserts +// that {IP,Cosine,L2}_SQ8_FP16_GetDistFunc returns the expected Choose_* symbol and that +// its output matches the scalar baseline within 0.01. +class SQ8_FP16_SpacesOptimizationTest : public testing::TestWithParam {}; + +TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + size_t query_count = + dim + sq8::query_metadata_count() * (sizeof(float) / sizeof(float16)); + std::vector v1_query(query_count); + test_utils::populate_sq8_fp16_query(v1_query.data(), dim, false, 1234); + + size_t quantized_size = + dim * sizeof(uint8_t) + sq8::storage_metadata_count() * sizeof(float); + std::vector v2_compressed(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_compressed.data(), dim, false, 5678); + + dist_func_t arch_opt_func; + float baseline = SQ8_FP16_L2Sqr(v2_compressed.data(), v1_query.data(), dim); + +#ifdef OPT_AVX512F + if (optimization.avx512f) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_AVX512F(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "AVX512 with dim " << dim; + optimization.avx512f = 0; + } +#endif +#ifdef OPT_AVX2_FMA +#ifdef OPT_F16C + if (optimization.avx2 && optimization.fma3 && optimization.f16c) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_AVX2_FMA(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "AVX2+FMA with dim " << dim; + optimization.fma3 = 0; + } +#endif +#endif +#ifdef OPT_AVX2 +#ifdef OPT_F16C + if (optimization.avx2 && optimization.f16c) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_AVX2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "AVX2 with dim " << dim; + optimization.avx2 = 0; + } +#endif +#endif +#ifdef OPT_SSE4 +#ifdef OPT_F16C + if (optimization.sse4_1 && optimization.f16c && optimization.avx) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SSE4(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SSE4 with dim " << dim; + optimization.sse4_1 = 0; + } +#endif +#endif + + // Scalar fallback. Init alignment to a sentinel (0xFF) so the assert below actually verifies + // that the dispatcher LEAVES THE VALUE UNTOUCHED on the scalar path — initialising to 0 then + // asserting `== 0` would pass even if the dispatcher were a no-op. + unsigned char alignment = 0xFF; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_FP16_L2Sqr) + << "Unexpected scalar fallback function for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "Scalar fallback with dim " << dim; + ASSERT_EQ(alignment, 0xFF) << "Scalar fallback must leave caller's alignment value untouched " + "(dim " + << dim << ")"; +} + +TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + size_t query_count = + dim + sq8::query_metadata_count() * (sizeof(float) / sizeof(float16)); + std::vector v1_query(query_count); + test_utils::populate_sq8_fp16_query(v1_query.data(), dim, true, 1234); + + size_t quantized_size = + dim * sizeof(uint8_t) + sq8::storage_metadata_count() * sizeof(float); + std::vector v2_compressed(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_compressed.data(), dim, true, 5678); + + dist_func_t arch_opt_func; + float baseline = SQ8_FP16_InnerProduct(v2_compressed.data(), v1_query.data(), dim); + +#ifdef OPT_AVX512F + if (optimization.avx512f) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_AVX512F(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "AVX512 with dim " << dim; + optimization.avx512f = 0; + } +#endif +#ifdef OPT_AVX2_FMA +#ifdef OPT_F16C + if (optimization.avx2 && optimization.fma3 && optimization.f16c) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_AVX2_FMA(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "AVX2+FMA with dim " << dim; + optimization.fma3 = 0; + } +#endif +#endif +#ifdef OPT_AVX2 +#ifdef OPT_F16C + if (optimization.avx2 && optimization.f16c) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_AVX2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "AVX2 with dim " << dim; + optimization.avx2 = 0; + } +#endif +#endif +#ifdef OPT_SSE4 +#ifdef OPT_F16C + if (optimization.sse4_1 && optimization.f16c && optimization.avx) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SSE4(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SSE4 with dim " << dim; + optimization.sse4_1 = 0; + } +#endif +#endif + + // Scalar fallback — see L2 test for the 0xFF sentinel rationale. + unsigned char alignment = 0xFF; + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_FP16_InnerProduct) + << "Unexpected scalar fallback function for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "Scalar fallback with dim " << dim; + ASSERT_EQ(alignment, 0xFF) << "Scalar fallback must leave caller's alignment value untouched " + "(dim " + << dim << ")"; +} + +TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + size_t query_count = + dim + sq8::query_metadata_count() * (sizeof(float) / sizeof(float16)); + std::vector v1_query(query_count); + test_utils::populate_sq8_fp16_query(v1_query.data(), dim, true, 1234); + + size_t quantized_size = + dim * sizeof(uint8_t) + sq8::storage_metadata_count() * sizeof(float); + std::vector v2_compressed(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_compressed.data(), dim, true, 5678); + + dist_func_t arch_opt_func; + float baseline = SQ8_FP16_Cosine(v2_compressed.data(), v1_query.data(), dim); + +#ifdef OPT_AVX512F + if (optimization.avx512f) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_AVX512F(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "AVX512 with dim " << dim; + optimization.avx512f = 0; + } +#endif +#ifdef OPT_AVX2_FMA +#ifdef OPT_F16C + if (optimization.avx2 && optimization.fma3 && optimization.f16c) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_AVX2_FMA(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "AVX2+FMA with dim " << dim; + optimization.fma3 = 0; + } +#endif +#endif +#ifdef OPT_AVX2 +#ifdef OPT_F16C + if (optimization.avx2 && optimization.f16c) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_AVX2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "AVX2 with dim " << dim; + optimization.avx2 = 0; + } +#endif +#endif +#ifdef OPT_SSE4 +#ifdef OPT_F16C + if (optimization.sse4_1 && optimization.f16c && optimization.avx) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_SSE4(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SSE4 with dim " << dim; + optimization.sse4_1 = 0; + } +#endif +#endif + + // Scalar fallback — see L2 test for the 0xFF sentinel rationale. + unsigned char alignment = 0xFF; + arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_FP16_Cosine) + << "Unexpected scalar fallback function for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "Scalar fallback with dim " << dim; + ASSERT_EQ(alignment, 0xFF) << "Scalar fallback must leave caller's alignment value untouched " + "(dim " + << dim << ")"; +} + +// Dim range [16, 32] covers every residual class for the 16-element chunk used by every tier. +INSTANTIATE_TEST_SUITE_P(SQ8_FP16_SIMD, SQ8_FP16_SpacesOptimizationTest, + testing::Range(16UL, 16 * 2UL + 1)); + +// Higher dimensions surface multi-iteration loop bugs (pointer stride, do-while termination +// off-by-one) that the [16, 32] range does not exercise because the AVX-512 inner loop runs at +// most twice in that range. +INSTANTIATE_TEST_SUITE_P(SQ8_FP16_SIMD_HighDim, SQ8_FP16_SpacesOptimizationTest, + testing::Values(64UL, 128UL, 256UL, 512UL, 1024UL)); + +// Surfaces which SIMD tiers were actually exercised on the current host. Without this, a CI +// runner that lacks AVX-512 silently passes with zero tier-1 coverage. Logs per-tier presence +// to stderr and GTEST_SKIPs only when no SIMD tier is available at all. +TEST(SQ8_FP16_SIMD_TierCoverage, ReportTiersExercised) { + auto opt = getCpuOptimizationFeatures(); + bool any_simd = false; + +#ifdef CPU_FEATURES_ARCH_X86_64 +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni) { + std::cerr << "[SQ8_FP16] AVX-512 F+BW+VL+VNNI tier exercised\n"; + any_simd = true; + } else { + std::cerr << "[SQ8_FP16] AVX-512 F+BW+VL+VNNI tier NOT exercised on this host\n"; + } +#endif +#if defined(OPT_AVX2_FMA) && defined(OPT_F16C) + if (opt.avx2 && opt.fma3 && opt.f16c) { + std::cerr << "[SQ8_FP16] AVX2+FMA+F16C tier exercised\n"; + any_simd = true; + } else { + std::cerr << "[SQ8_FP16] AVX2+FMA+F16C tier NOT exercised on this host\n"; + } +#endif +#if defined(OPT_AVX2) && defined(OPT_F16C) + if (opt.avx2 && opt.f16c) { + std::cerr << "[SQ8_FP16] AVX2+F16C tier exercised\n"; + any_simd = true; + } else { + std::cerr << "[SQ8_FP16] AVX2+F16C tier NOT exercised on this host\n"; + } +#endif +#if defined(OPT_SSE4) && defined(OPT_F16C) + if (opt.sse4_1 && opt.f16c && opt.avx) { + std::cerr << "[SQ8_FP16] SSE4+F16C+AVX tier exercised\n"; + any_simd = true; + } else { + std::cerr << "[SQ8_FP16] SSE4+F16C+AVX tier NOT exercised on this host\n"; + } +#endif +#endif // x86_64 + + if (!any_simd) { + GTEST_SKIP() << "No SQ8_FP16 SIMD tier available on this host — scalar fallback only."; + } +} + /* ======================== Tests SQ8_FP16 (edge cases) ========================= */ // Zero FP16 query against a non-zero SQ8 storage. IP must be exactly 1.0 (1 - 0), -// L2² must equal Σ dequantized². +// L2² must equal Σ dequantized². Math correctness on adversarial inputs is verified +// against the scalar reference; SIMD tier coverage with branchless kernels is provided +// separately by SQ8_FP16_SpacesOptimizationTest. TEST(SQ8_FP16_EdgeCases, ZeroQueryTest) { size_t dim = 64;