diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
index fe354ded5..9b7477837 100644
--- a/src/VecSim/spaces/CMakeLists.txt
+++ b/src/VecSim/spaces/CMakeLists.txt
@@ -50,18 +50,40 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 		list(APPEND OPTIMIZATIONS functions/AVX512F_BW_VL_VNNI.cpp)
 	endif()
 
+	# F16C is VEX-encoded and requires AVX state, so it is only meaningful when the toolchain
+	# can also emit AVX/FMA. Mirrors the OPT_F16C macro condition in x86_64InstructionFlags.cmake.
+	set(_has_full_f16c FALSE)
+	if(CXX_F16C AND CXX_FMA AND CXX_AVX)
+		set(_has_full_f16c TRUE)
+	endif()
+
+	# Base AVX2 / AVX2+FMA dispatcher TUs hold only kernels with no F16C dependency.
+	# SQ8↔FP16 kernels (which require F16C) live in sibling TUs functions/AVX2_F16C.cpp and
+	# functions/AVX2_FMA_F16C.cpp, compiled only when _has_full_f16c is true.
 	if(CXX_AVX2)
-		message("Building with AVX2")
-		set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+		message("Building functions/AVX2.cpp with AVX2")
+		set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS "-mavx2")
 		list(APPEND OPTIMIZATIONS functions/AVX2.cpp)
 	endif()
 
+	if(CXX_AVX2 AND _has_full_f16c)
+		message("Building functions/AVX2_F16C.cpp with AVX2 and F16C")
+		set_source_files_properties(functions/AVX2_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mf16c")
+		list(APPEND OPTIMIZATIONS functions/AVX2_F16C.cpp)
+	endif()
+
 	if(CXX_AVX2 AND CXX_FMA)
-		message("Building with AVX2 and FMA")
+		message("Building functions/AVX2_FMA.cpp with AVX2 and FMA")
 		set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
 		list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp)
 	endif()
 
+	if(CXX_AVX2 AND CXX_FMA AND _has_full_f16c)
+		message("Building functions/AVX2_FMA_F16C.cpp with AVX2, FMA, and F16C")
+		set_source_files_properties(functions/AVX2_FMA_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mf16c")
+		list(APPEND OPTIMIZATIONS functions/AVX2_FMA_F16C.cpp)
+	endif()
+
 	if(CXX_F16C AND CXX_FMA AND CXX_AVX)
 		message("Building with CXX_F16C")
 		set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx")
@@ -81,11 +103,19 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 	endif()
 
 	if(CXX_SSE4)
-		message("Building with SSE4")
-		set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS -msse4.1)
+		message("Building functions/SSE4.cpp with SSE4.1")
+		set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS "-msse4.1")
 		list(APPEND OPTIMIZATIONS functions/SSE4.cpp)
 	endif()
 
+	# SSE4 SQ8↔FP16 kernels need F16C, which is VEX-encoded → require -mavx alongside -mf16c
+	# (mirrors the F16C.cpp recipe above).
+	if(CXX_SSE4 AND _has_full_f16c)
+		message("Building functions/SSE4_F16C.cpp with SSE4.1, AVX, and F16C")
+		set_source_files_properties(functions/SSE4_F16C.cpp PROPERTIES COMPILE_FLAGS "-msse4.1 -mavx -mf16c")
+		list(APPEND OPTIMIZATIONS functions/SSE4_F16C.cpp)
+	endif()
+
 	if(CXX_SSE)
 		message("Building with SSE")
 		set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)
diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h
new file mode 100644
index 000000000..a4c1612ea
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Asymmetric SQ8 (storage) ↔ FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) = Σ(x_i * y_i)
+ *            ≈ Σ((min + delta * q_i) * y_i)
+ *            = min * Σy_i + delta * Σ(q_i * y_i)
+ *            = min * y_sum + delta * quantized_dot_product
+ *
+ * FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
+ * inner-loop arithmetic runs in FP32 with _mm256_fmadd_ps.
+ */
+
+// 8-wide AVX2+FMA step: 8 SQ8 lanes + 8 FP16 lanes -> 8 FP32 fused-multiply-add.
+static inline void SQ8_FP16_InnerProductStep_AVX2_FMA(const uint8_t *&pVect1,
+                                                      const float16 *&pVect2, __m256 &sum256) {
+    __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
+    pVect1 += 8;
+    __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+    __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+    __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
+    __m256 v2_f = _mm256_cvtph_ps(v2_128);
+    pVect2 += 8;
+
+    sum256 = _mm256_fmadd_ps(v1_f, v2_f, sum256);
+}
+
+// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp.
+// The residual block reads 8 SQ8 bytes and 16 FP16 bytes unconditionally; shorter blobs would
+// under-read.
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductImp_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
+    const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
+    const uint8_t *pEnd1 = pVec1 + dimension;
+
+    // Two independent accumulators break the FMA dependency chain so consecutive iterations
+    // can issue in parallel through both FMA ports.
+    __m256 sum_a = _mm256_setzero_ps();
+    __m256 sum_b = _mm256_setzero_ps();
+
+    if constexpr (residual % 8) {
+        constexpr int mask = (1 << (residual % 8)) - 1;
+
+        // Single-side mask is sufficient: SQ8 lanes beyond `residual` may hold garbage, but the
+        // FP16 query blend below forces those FP32 query lanes to 0, so garbage·0=0 contributes
+        // nothing to the dot product. SQ8 load is intentionally unmasked.
+        __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
+        pVec1 += residual % 8;
+        __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+        __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+        // FP16 side: load full 16-byte block (safe — dim >= 16 and metadata follows).
+        __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
+        __m256 v2_f = _mm256_cvtph_ps(v2_128);
+        v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
+        pVec2 += residual % 8;
+
+        sum_a = _mm256_mul_ps(v1_f, v2_f);
+    }
+
+    if constexpr (residual >= 8) {
+        // Route the half-residual chunk to the second accumulator for ILP.
+        SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
+    }
+
+    do {
+        SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_a);
+        SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
+    } while (pVec1 < pEnd1);
+
+    __m256 sum256 = _mm256_add_ps(sum_a, sum_b);
+    float quantized_dot = my_mm256_reduce_add_ps(sum256);
+
+    const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
+    const uint8_t *params_bytes = pVec1Base + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+
+    const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v,
+                                           size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductImp_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_CosineSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD16_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
+}
diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h
new file mode 100644
index 000000000..3a01d80f2
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Asymmetric SQ8 (storage) ↔ FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) = Σ(x_i * y_i)
+ *            ≈ Σ((min + delta * q_i) * y_i)
+ *            = min * Σy_i + delta * Σ(q_i * y_i)
+ *            = min * y_sum + delta * quantized_dot_product
+ *
+ * FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
+ * inner-loop arithmetic runs in FP32 with separate _mm256_mul_ps + _mm256_add_ps
+ * (no FMA tier — Haswell-era AVX2 without FMA support).
+ */
+
+// 8-wide AVX2 step (no FMA): 8 SQ8 lanes + 8 FP16 lanes -> mul + add into sum.
+static inline void SQ8_FP16_InnerProductStep_AVX2(const uint8_t *&pVect1, const float16 *&pVect2,
+                                                  __m256 &sum256) {
+    __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
+    pVect1 += 8;
+    __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+    __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+    __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
+    __m256 v2_f = _mm256_cvtph_ps(v2_128);
+    pVect2 += 8;
+
+    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1_f, v2_f));
+}
+
+// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp.
+// The residual block reads 8 SQ8 bytes and 16 FP16 bytes unconditionally; shorter blobs would
+// under-read.
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductImp_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
+    const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
+    const uint8_t *pEnd1 = pVec1 + dimension;
+
+    // Two independent accumulators break the mul→add dependency chain on Haswell-class CPUs
+    // without FMA, where the add cannot retire before the prior mul.
+    __m256 sum_a = _mm256_setzero_ps();
+    __m256 sum_b = _mm256_setzero_ps();
+
+    if constexpr (residual % 8) {
+        constexpr int mask = (1 << (residual % 8)) - 1;
+
+        // Single-side mask is sufficient: SQ8 lanes beyond `residual` may hold garbage, but the
+        // FP16 query blend below forces those FP32 query lanes to 0, so garbage·0=0 contributes
+        // nothing to the dot product. SQ8 load is intentionally unmasked.
+        __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
+        pVec1 += residual % 8;
+        __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+        __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+        __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
+        __m256 v2_f = _mm256_cvtph_ps(v2_128);
+        v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
+        pVec2 += residual % 8;
+
+        sum_a = _mm256_mul_ps(v1_f, v2_f);
+    }
+
+    if constexpr (residual >= 8) {
+        // Route the half-residual chunk to the second accumulator for ILP.
+        SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
+    }
+
+    do {
+        SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_a);
+        SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
+    } while (pVec1 < pEnd1);
+
+    __m256 sum256 = _mm256_add_ps(sum_a, sum_b);
+    float quantized_dot = my_mm256_reduce_add_ps(sum256);
+
+    const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
+    const uint8_t *params_bytes = pVec1Base + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+
+    const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductImp_AVX2<residual>(pVec1v, pVec2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_CosineSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD16_AVX2<residual>(pVec1v, pVec2v, dimension);
+}
diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h
new file mode 100644
index 000000000..7ba9c0412
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+#include <immintrin.h>
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+// Helper: load 16 SQ8 + 16 FP16 lanes, widen both to FP32, fused-multiply-add into sum.
+static inline void SQ8_FP16_InnerProductStep_AVX512(const uint8_t *&pVec1, const float16 *&pVec2,
+                                                    __m512 &sum) {
+    // 16 uint8 -> 16 fp32
+    __m128i v1_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec1));
+    __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128);
+    __m512 v1_f = _mm512_cvtepi32_ps(v1_512);
+
+    // 16 fp16 -> 16 fp32. _mm512_cvtph_ps is part of AVX512F.
+    __m256i v2_16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pVec2));
+    __m512 v2_f = _mm512_cvtph_ps(v2_16);
+
+    sum = _mm512_fmadd_ps(v1_f, v2_f, sum);
+
+    pVec1 += 16;
+    pVec2 += 16;
+}
+
+// Raw inner product Σ((min + delta * q_i) * y_i). Used by both InnerProduct/Cosine wrappers
+// and by the L2 kernel.
+// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp, which gates
+// this. The residual block reads 16 SQ8 bytes and 32 FP16 bytes unconditionally; shorter blobs
+// would under-read.
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v); // SQ8 storage
+    const float16 *pVec2 = static_cast<const float16 *>(pVec2v); // FP16 query
+    const uint8_t *pEnd1 = pVec1 + dimension;
+
+    // Four independent accumulators break the FMA dependency chain so the inner loop can
+    // saturate both FMA ports on Sapphire Rapids / Zen 4.
+    __m512 sum0 = _mm512_setzero_ps();
+    __m512 sum1 = _mm512_setzero_ps();
+    __m512 sum2 = _mm512_setzero_ps();
+    __m512 sum3 = _mm512_setzero_ps();
+
+    if constexpr (residual > 0) {
+        __mmask16 mask = (1U << residual) - 1;
+
+        __m128i v1_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec1));
+        __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128);
+        __m512 v1_f = _mm512_cvtepi32_ps(v1_512);
+
+        // Safe to read the full 32-byte FP16 chunk: dim >= 16 and the FP16 metadata follows
+        // the lanes, so the load stays within the query blob.
+        __m256i v2_16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pVec2));
+        __m512 v2_f = _mm512_cvtph_ps(v2_16);
+
+        // Mask out unused lanes by folding the mask into the multiply.
+        sum0 = _mm512_maskz_mul_ps(mask, v1_f, v2_f);
+
+        pVec1 += residual;
+        pVec2 += residual;
+    }
+
+    // Main unrolled loop: 4 chunks of 16 lanes per iteration, one chunk per accumulator.
+    // Residual leaves `dim - residual` lanes remaining (a multiple of 16), so the
+    // pointer comparison stays exact. Compare via pointer subtraction (not
+    // `pVec1 + 64 <= pEnd1`) so we never form a pointer past one-past-the-end,
+    // which would be UB in C++ regardless of dereference.
+    while (static_cast<size_t>(pEnd1 - pVec1) >= 64) {
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum0);
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum1);
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum2);
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum3);
+    }
+
+    // Reduce the four accumulators into one.
+    __m512 sum = _mm512_add_ps(_mm512_add_ps(sum0, sum1), _mm512_add_ps(sum2, sum3));
+
+    // Tail: at most three remaining 16-lane chunks.
+    while (pVec1 < pEnd1) {
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum);
+    }
+
+    float quantized_dot = _mm512_reduce_add_ps(sum);
+
+    // SQ8 metadata starts at byte offset `dimension`; for odd `dimension` it is not
+    // 4-byte aligned, so use load_unaligned. Mirrors the scalar SQ8_FP16_Impl pattern.
+    const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
+    const uint8_t *params_bytes = pVec1Base + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+
+    // FP16 query metadata sits at byte offset 2*dimension; for odd `dimension` it is
+    // 2-byte aligned only.
+    const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_AVX512F(const void *pVec1v, const void *pVec2v,
+                                          size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductImp_AVX512<residual>(pVec1v, pVec2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_CosineSIMD16_AVX512F(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    // Cosine distance = 1 - IP for pre-normalised vectors. Aliases InnerProduct, matching the
+    // SQ8_FP32 pattern.
+    return SQ8_FP16_InnerProductSIMD16_AVX512F<residual>(pVec1v, pVec2v, dimension);
+}
diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8_FP16.h
new file mode 100644
index 000000000..871a189dc
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8_FP16.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Asymmetric SQ8 (storage) ↔ FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) = Σ(x_i * y_i)
+ *            ≈ Σ((min + delta * q_i) * y_i)
+ *            = min * Σy_i + delta * Σ(q_i * y_i)
+ *            = min * y_sum + delta * quantized_dot_product
+ *
+ * FP16 query lanes are widened to FP32 per 4-lane chunk via _mm_cvtph_ps (F16C);
+ * inner-loop arithmetic runs in FP32 with separate _mm_mul_ps + _mm_add_ps (SSE4 has no FMA).
+ */
+
+// 4-wide SSE4+F16C step: 4 SQ8 lanes + 4 FP16 lanes -> mul + add into sum.
+static inline void SQ8_FP16_InnerProductStep_SSE4(const uint8_t *&pVect1, const float16 *&pVect2,
+                                                  __m128 &sum) {
+    // Alignment-safe 4-byte load of SQ8 lanes via load_unaligned<int32_t> (no strict-aliasing UB).
+    __m128i v1_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(load_unaligned<int32_t>(pVect1)));
+    pVect1 += 4;
+    __m128 v1_f = _mm_cvtepi32_ps(v1_i);
+
+    __m128i v2_8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
+    __m128 v2_f = _mm_cvtph_ps(v2_8);
+    pVect2 += 4;
+
+    sum = _mm_add_ps(sum, _mm_mul_ps(v1_f, v2_f));
+}
+
+// Precondition: dim >= 16. Caller is the dispatcher in IP_space.cpp / L2_space.cpp.
+// Shorter blobs would underflow the residual ladder + final do-while loop.
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_SSE4_IMP(const void *pVec1v, const void *pVec2v,
+                                           size_t dimension) {
+    const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
+    const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
+    const uint8_t *pEnd1 = pVec1 + dimension;
+
+    // Two independent accumulators break the mul→add dependency chain (SSE4 lacks FMA).
+    __m128 sum_a = _mm_setzero_ps();
+    __m128 sum_b = _mm_setzero_ps();
+
+    if constexpr (residual % 4) {
+        __m128 v1_f;
+        __m128 v2_f;
+
+        if constexpr (residual % 4 == 3) {
+            v1_f = _mm_set_ps(0.0f, static_cast<float>(pVec1[2]), static_cast<float>(pVec1[1]),
+                              static_cast<float>(pVec1[0]));
+            v2_f = _mm_set_ps(0.0f, vecsim_types::FP16_to_FP32(pVec2[2]),
+                              vecsim_types::FP16_to_FP32(pVec2[1]),
+                              vecsim_types::FP16_to_FP32(pVec2[0]));
+        } else if constexpr (residual % 4 == 2) {
+            v1_f =
+                _mm_set_ps(0.0f, 0.0f, static_cast<float>(pVec1[1]), static_cast<float>(pVec1[0]));
+            v2_f = _mm_set_ps(0.0f, 0.0f, vecsim_types::FP16_to_FP32(pVec2[1]),
+                              vecsim_types::FP16_to_FP32(pVec2[0]));
+        } else if constexpr (residual % 4 == 1) {
+            v1_f = _mm_set_ps(0.0f, 0.0f, 0.0f, static_cast<float>(pVec1[0]));
+            v2_f = _mm_set_ps(0.0f, 0.0f, 0.0f, vecsim_types::FP16_to_FP32(pVec2[0]));
+        }
+
+        pVec1 += residual % 4;
+        pVec2 += residual % 4;
+
+        sum_a = _mm_mul_ps(v1_f, v2_f);
+    }
+
+    // Alternate the residual-ladder steps across the two accumulators for ILP.
+    if constexpr (residual >= 4) {
+        SQ8_FP16_InnerProductStep_SSE4(pVec1, pVec2, sum_b);
+    }
+    if constexpr (residual >= 8) {
+        SQ8_FP16_InnerProductStep_SSE4(pVec1, pVec2, sum_a);
+    }
+    if constexpr (residual >= 12) {
+        SQ8_FP16_InnerProductStep_SSE4(pVec1, pVec2, sum_b);
+    }
+
+    // Remaining lanes after the residual block are a multiple of 16, hence a multiple of 8,
+    // so two 4-lane steps per iteration consume the tail exactly.
+    do {
+        SQ8_FP16_InnerProductStep_SSE4(pVec1, pVec2, sum_a);
+        SQ8_FP16_InnerProductStep_SSE4(pVec1, pVec2, sum_b);
+    } while (pVec1 < pEnd1);
+
+    __m128 sum = _mm_add_ps(sum_a, sum_b);
+    float PORTABLE_ALIGN16 TmpRes[4];
+    _mm_store_ps(TmpRes, sum);
+    float quantized_dot = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+
+    const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
+    const uint8_t *params_bytes = pVec1Base + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+
+    const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_SSE4(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductSIMD16_SSE4_IMP<residual>(pVec1v, pVec2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_CosineSIMD16_SSE4(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD16_SSE4<residual>(pVec1v, pVec2v, dimension);
+}
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 55979e25a..b57971b60 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -20,9 +20,12 @@
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_F16C.h"
 #include "VecSim/spaces/functions/AVX2_FMA.h"
+#include "VecSim/spaces/functions/AVX2_FMA_F16C.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE4.h"
+#include "VecSim/spaces/functions/SSE4_F16C.h"
 #include "VecSim/spaces/functions/NEON.h"
 #include "VecSim/spaces/functions/NEON_DOTPROD.h"
 #include "VecSim/spaces/functions/NEON_HP.h"
@@ -172,31 +175,106 @@ dist_func_t<float> Cosine_SQ8_FP32_GetDistFunc(size_t dim, unsigned char *alignm
 }
 
 // SQ8-FP16: asymmetric inner product distance between SQ8 storage and FP16 query.
-// SIMD chooser slots are added by P1b (MOD-15152) / P1c (MOD-15153); for now this always
-// returns the scalar implementation.
 dist_func_t<float> IP_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
                                            const void *arch_opt) {
     unsigned char dummy_alignment;
     if (alignment == nullptr) {
         alignment = &dummy_alignment;
     }
-    (void)dim;
-    (void)arch_opt;
-    return SQ8_FP16_InnerProduct;
+
+    dist_func_t<float> ret_dist_func = SQ8_FP16_InnerProduct;
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+    // Alignment hints below refer to the SQ8 (first) operand per the GetDistFunc contract.
+    // AVX-512 tier only needs AVX-512F (cvtph_ps is part of AVX-512F, no VNNI/BW/VL required).
+#ifdef OPT_AVX512F
+    if (features.avx512f) {
+        if (dim % 16 == 0) // SQ8 chunk = 16 bytes
+            *alignment = 16 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_IP_implementation_AVX512F(dim);
+    }
+#endif
+    // F16C is required by every non-AVX-512 SQ8↔FP16 tier (vcvtph2ps), so the guard is hoisted
+    // around all three.
+#ifdef OPT_F16C
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3 && features.f16c) {
+        if (dim % 8 == 0) // SQ8 chunk = 8 bytes
+            *alignment = 8 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_IP_implementation_AVX2_FMA(dim);
+    }
+#endif
+#ifdef OPT_AVX2
+    if (features.avx2 && features.f16c) {
+        if (dim % 8 == 0)
+            *alignment = 8 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_IP_implementation_AVX2(dim);
+    }
+#endif
+#ifdef OPT_SSE4
+    // F16C is VEX-encoded — require AVX as well, matching the existing F16C/FP16 dispatcher.
+    if (features.sse4_1 && features.f16c && features.avx) {
+        if (dim % 4 == 0)
+            *alignment = 4 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_IP_implementation_SSE4(dim);
+    }
+#endif
+#endif // OPT_F16C
+#endif // x86_64
+    return ret_dist_func;
 }
 
 // SQ8-FP16: asymmetric cosine distance between SQ8 storage and FP16 query.
-// SIMD chooser slots are added by P1b (MOD-15152) / P1c (MOD-15153); for now this always
-// returns the scalar implementation.
 dist_func_t<float> Cosine_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
                                                const void *arch_opt) {
     unsigned char dummy_alignment;
     if (alignment == nullptr) {
         alignment = &dummy_alignment;
     }
-    (void)dim;
-    (void)arch_opt;
-    return SQ8_FP16_Cosine;
+
+    dist_func_t<float> ret_dist_func = SQ8_FP16_Cosine;
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_AVX512F
+    if (features.avx512f) {
+        if (dim % 16 == 0)
+            *alignment = 16 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_Cosine_implementation_AVX512F(dim);
+    }
+#endif
+#ifdef OPT_F16C
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3 && features.f16c) {
+        if (dim % 8 == 0)
+            *alignment = 8 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_Cosine_implementation_AVX2_FMA(dim);
+    }
+#endif
+#ifdef OPT_AVX2
+    if (features.avx2 && features.f16c) {
+        if (dim % 8 == 0)
+            *alignment = 8 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_Cosine_implementation_AVX2(dim);
+    }
+#endif
+#ifdef OPT_SSE4
+    if (features.sse4_1 && features.f16c && features.avx) {
+        if (dim % 4 == 0)
+            *alignment = 4 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_Cosine_implementation_SSE4(dim);
+    }
+#endif
+#endif // OPT_F16C
+#endif // x86_64
+    return ret_dist_func;
 }
 
 // SQ8-to-SQ8 Inner Product distance function (both vectors are uint8 quantized with precomputed
diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP16.h
new file mode 100644
index 000000000..38809e9c2
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP16.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_L2SqrSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float ip = SQ8_FP16_InnerProductImp_AVX2_FMA<residual>(pVect1v, pVect2v, dimension);
+
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const uint8_t *params_bytes = pVect1 + dimension;
+    const float x_sum_sq = load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVect2 + dimension);
+    const float y_sum_sq =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
+
+    return x_sum_sq + y_sum_sq - 2.0f * ip;
+}
diff --git a/src/VecSim/spaces/L2/L2_AVX2_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_AVX2_SQ8_FP16.h
new file mode 100644
index 000000000..98bb29c05
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_AVX2_SQ8_FP16.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_L2SqrSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float ip = SQ8_FP16_InnerProductImp_AVX2<residual>(pVect1v, pVect2v, dimension);
+
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const uint8_t *params_bytes = pVect1 + dimension;
+    const float x_sum_sq = load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVect2 + dimension);
+    const float y_sum_sq =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
+
+    return x_sum_sq + y_sum_sq - 2.0f * ip;
+}
diff --git a/src/VecSim/spaces/L2/L2_AVX512F_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_AVX512F_SQ8_FP16.h
new file mode 100644
index 000000000..384870b21
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_AVX512F_SQ8_FP16.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+// L2² = x_sum_squares + y_sum_squares - 2 * IP(x, y), computed via the AVX-512 IP impl above.
+template <unsigned char residual> // 0..15
+float SQ8_FP16_L2SqrSIMD16_AVX512F(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float ip = SQ8_FP16_InnerProductImp_AVX512<residual>(pVect1v, pVect2v, dimension);
+
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const uint8_t *params_bytes = pVect1 + dimension;
+    const float x_sum_sq = load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVect2 + dimension);
+    const float y_sum_sq =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
+
+    return x_sum_sq + y_sum_sq - 2.0f * ip;
+}
diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8_FP16.h
new file mode 100644
index 000000000..75bbd46f8
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8_FP16.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/IP/IP_SSE4_SQ8_FP16.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float ip = SQ8_FP16_InnerProductSIMD16_SSE4_IMP<residual>(pVect1v, pVect2v, dimension);
+
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const uint8_t *params_bytes = pVect1 + dimension;
+    const float x_sum_sq = load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVect2 + dimension);
+    const float y_sum_sq =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
+
+    return x_sum_sq + y_sum_sq - 2.0f * ip;
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index ba3dd7cab..43020399f 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -19,9 +19,12 @@
 #include "VecSim/spaces/functions/AVX512FP16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_F16C.h"
 #include "VecSim/spaces/functions/AVX2_FMA.h"
+#include "VecSim/spaces/functions/AVX2_FMA_F16C.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE4.h"
+#include "VecSim/spaces/functions/SSE4_F16C.h"
 #include "VecSim/spaces/functions/NEON.h"
 #include "VecSim/spaces/functions/NEON_DOTPROD.h"
 #include "VecSim/spaces/functions/NEON_HP.h"
@@ -104,17 +107,56 @@ dist_func_t<float> L2_SQ8_FP32_GetDistFunc(size_t dim, unsigned char *alignment,
 }
 
 // SQ8-FP16: asymmetric L2 distance between SQ8 storage and FP16 query.
-// SIMD chooser slots are added by P1b (MOD-15152) / P1c (MOD-15153); for now this always
-// returns the scalar implementation.
 dist_func_t<float> L2_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
                                            const void *arch_opt) {
     unsigned char dummy_alignment;
     if (!alignment) {
         alignment = &dummy_alignment;
     }
-    (void)dim;
-    (void)arch_opt;
-    return SQ8_FP16_L2Sqr;
+
+    dist_func_t<float> ret_dist_func = SQ8_FP16_L2Sqr;
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+    // Alignment hints below refer to the SQ8 (first) operand per the GetDistFunc contract.
+    // AVX-512 tier only needs AVX-512F (cvtph_ps is part of AVX-512F, no VNNI/BW/VL required).
+#ifdef OPT_AVX512F
+    if (features.avx512f) {
+        if (dim % 16 == 0)
+            *alignment = 16 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_L2_implementation_AVX512F(dim);
+    }
+#endif
+    // F16C is required by every non-AVX-512 SQ8↔FP16 tier (vcvtph2ps), so the guard is hoisted
+    // around all three.
+#ifdef OPT_F16C
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3 && features.f16c) {
+        if (dim % 8 == 0)
+            *alignment = 8 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_L2_implementation_AVX2_FMA(dim);
+    }
+#endif
+#ifdef OPT_AVX2
+    if (features.avx2 && features.f16c) {
+        if (dim % 8 == 0)
+            *alignment = 8 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_L2_implementation_AVX2(dim);
+    }
+#endif
+#ifdef OPT_SSE4
+    if (features.sse4_1 && features.f16c && features.avx) {
+        if (dim % 4 == 0)
+            *alignment = 4 * sizeof(uint8_t);
+        return Choose_SQ8_FP16_L2_implementation_SSE4(dim);
+    }
+#endif
+#endif // OPT_F16C
+#endif // x86_64
+    return ret_dist_func;
 }
 
 dist_func_t<float> L2_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
diff --git a/src/VecSim/spaces/functions/AVX2_F16C.cpp b/src/VecSim/spaces/functions/AVX2_F16C.cpp
new file mode 100644
index 000000000..3d298e81b
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX2_F16C.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "AVX2_F16C.h"
+#include "VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h"
+#include "VecSim/spaces/L2/L2_AVX2_SQ8_FP16.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_AVX2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_AVX2);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_AVX2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_AVX2);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_AVX2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_AVX2);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX2_F16C.h b/src/VecSim/spaces/functions/AVX2_F16C.h
new file mode 100644
index 000000000..95a171199
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX2_F16C.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+// SQ8↔FP16 kernels for the AVX2 (no FMA) tier. Live in a sibling TU compiled only when the
+// toolchain supports F16C (via `-mf16c`), so this header has no preprocessor guard. Callers
+// still gate the calls themselves with `#ifdef OPT_F16C`.
+
+namespace spaces {
+
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_AVX2(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_AVX2(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_AVX2(size_t dim);
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX2_FMA_F16C.cpp b/src/VecSim/spaces/functions/AVX2_FMA_F16C.cpp
new file mode 100644
index 000000000..4e9dd8131
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX2_FMA_F16C.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "AVX2_FMA_F16C.h"
+#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h"
+#include "VecSim/spaces/L2/L2_AVX2_FMA_SQ8_FP16.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX2_FMA_F16C.h b/src/VecSim/spaces/functions/AVX2_FMA_F16C.h
new file mode 100644
index 000000000..7943ff4eb
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX2_FMA_F16C.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+// SQ8↔FP16 kernels for the AVX2+FMA tier. Live in a sibling TU compiled only when the
+// toolchain supports F16C (via `-mf16c`), so this header has no preprocessor guard. Callers
+// still gate the calls themselves with `#ifdef OPT_F16C`.
+
+namespace spaces {
+
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_AVX2_FMA(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_AVX2_FMA(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_AVX2_FMA(size_t dim);
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F.cpp b/src/VecSim/spaces/functions/AVX512F.cpp
index e765f4c8b..feb261fb4 100644
--- a/src/VecSim/spaces/functions/AVX512F.cpp
+++ b/src/VecSim/spaces/functions/AVX512F.cpp
@@ -11,10 +11,12 @@
 #include "VecSim/spaces/L2/L2_AVX512F_FP16.h"
 #include "VecSim/spaces/L2/L2_AVX512F_FP32.h"
 #include "VecSim/spaces/L2/L2_AVX512F_FP64.h"
+#include "VecSim/spaces/L2/L2_AVX512F_SQ8_FP16.h"
 
 #include "VecSim/spaces/IP/IP_AVX512F_FP16.h"
 #include "VecSim/spaces/IP/IP_AVX512F_FP32.h"
 #include "VecSim/spaces/IP/IP_AVX512F_FP64.h"
+#include "VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h"
 
 namespace spaces {
 
@@ -56,6 +58,25 @@ dist_func_t<float> Choose_FP16_L2_implementation_AVX512F(size_t dim) {
     return ret_dist_func;
 }
 
+// SQ8↔FP16 kernels only use AVX-512F (cvtph_ps + FMA), so they register here rather than under
+// the VNNI tier — CPUs with AVX-512F but no VNNI (Skylake-X, some Cascade Lake variants) can use
+// these kernels.
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_AVX512F(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_AVX512F);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_AVX512F(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_AVX512F);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_AVX512F(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_AVX512F);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F.h b/src/VecSim/spaces/functions/AVX512F.h
index fd36f312f..8d600f961 100644
--- a/src/VecSim/spaces/functions/AVX512F.h
+++ b/src/VecSim/spaces/functions/AVX512F.h
@@ -20,4 +20,9 @@ dist_func_t<float> Choose_FP16_L2_implementation_AVX512F(size_t dim);
 dist_func_t<float> Choose_FP32_L2_implementation_AVX512F(size_t dim);
 dist_func_t<double> Choose_FP64_L2_implementation_AVX512F(size_t dim);
 
+// SQ8↔FP16 kernels — only need AVX-512F, not VNNI/BW/VL.
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_AVX512F(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_AVX512F(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_AVX512F(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
index 3b8813b89..712bdda4e 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
@@ -75,6 +75,7 @@ dist_func_t<float> Choose_SQ8_FP32_L2_implementation_AVX512F_BW_VL_VNNI(size_t d
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP32_L2SqrSIMD16_AVX512F_BW_VL_VNNI);
     return ret_dist_func;
 }
+
 // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum)
 dist_func_t<float> Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
     dist_func_t<float> ret_dist_func;
diff --git a/src/VecSim/spaces/functions/SSE4_F16C.cpp b/src/VecSim/spaces/functions/SSE4_F16C.cpp
new file mode 100644
index 000000000..91a11885f
--- /dev/null
+++ b/src/VecSim/spaces/functions/SSE4_F16C.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "SSE4_F16C.h"
+#include "VecSim/spaces/IP/IP_SSE4_SQ8_FP16.h"
+#include "VecSim/spaces/L2/L2_SSE4_SQ8_FP16.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SSE4(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_SSE4);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SSE4(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_SSE4);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SSE4(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_SSE4);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/SSE4_F16C.h b/src/VecSim/spaces/functions/SSE4_F16C.h
new file mode 100644
index 000000000..2459c216c
--- /dev/null
+++ b/src/VecSim/spaces/functions/SSE4_F16C.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+// SQ8↔FP16 kernels for the SSE4 tier. Live in a sibling TU compiled only when the toolchain
+// supports F16C (via `-mf16c -mavx`), so this header has no preprocessor guard. Callers
+// still gate the calls themselves with `#ifdef OPT_F16C`.
+
+namespace spaces {
+
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SSE4(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SSE4(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SSE4(size_t dim);
+
+} // namespace spaces
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h
index d99bcc4ca..2303eac0a 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces.h
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h
@@ -24,9 +24,12 @@
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_F16C.h"
 #include "VecSim/spaces/functions/AVX2_FMA.h"
+#include "VecSim/spaces/functions/AVX2_FMA_F16C.h"
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/SSE4.h"
+#include "VecSim/spaces/functions/SSE4_F16C.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE.h"
 #include "VecSim/spaces/functions/NEON.h"
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
index 2133a047e..ba3030064 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
@@ -15,8 +15,9 @@ using float16 = vecsim_types::float16;
 
 /**
  * SQ8-to-FP16 benchmarks: SQ8 quantized storage with FP16 query.
- * Only naive (scalar) benchmarks are registered for now; SIMD chooser symbols are added
- * by P1b (MOD-15152, x86) and P1c (MOD-15153, ARM).
+ * Registers the naive (scalar) baseline plus per-ISA SIMD variants (x86: AVX-512 / AVX2+FMA /
+ * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels
+ * land via MOD-14972.
  */
 class BM_VecSimSpaces_SQ8_FP16 : public benchmark::Fixture {
 protected:
@@ -50,8 +51,41 @@ class BM_VecSimSpaces_SQ8_FP16 : public benchmark::Fixture {
     }
 };
 
-// Naive (scalar) algorithms. SIMD chooser slots will be added by P1b (MOD-15152) and
-// P1c (MOD-15153), following the SQ8_FP32 layout in bm_spaces_sq8_fp32.cpp.
+#ifdef CPU_FEATURES_ARCH_X86_64
+cpu_features::X86Features opt = cpu_features::GetX86Info().features;
+
+// AVX-512F is sufficient — _mm512_cvtph_ps is part of AVX-512F, no F16C/VNNI/BW/VL needed.
+#ifdef OPT_AVX512F
+bool avx512f_supported = opt.avx512f;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX512F, 16, avx512f_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX512F, 16,
+                                 avx512f_supported);
+#endif
+
+#ifdef OPT_F16C
+#ifdef OPT_AVX2_FMA
+bool avx2_fma3_f16c_supported = opt.avx2 && opt.fma3 && opt.f16c;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX2_FMA, 16,
+                                avx2_fma3_f16c_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX2_FMA, 16,
+                                 avx2_fma3_f16c_supported);
+#endif
+
+#ifdef OPT_AVX2
+bool avx2_f16c_supported = opt.avx2 && opt.f16c;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX2, 16, avx2_f16c_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, AVX2, 16, avx2_f16c_supported);
+#endif
+
+#ifdef OPT_SSE4
+bool sse4_f16c_supported = opt.sse4_1 && opt.f16c && opt.avx;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SSE4, 16, sse4_f16c_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SSE4, 16, sse4_f16c_supported);
+#endif
+#endif // OPT_F16C
+#endif // x86_64
+
+// Naive (scalar) baseline — always registered as the comparison anchor.
 
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, InnerProduct, 16);
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, Cosine, 16);
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index a6bb88cef..b880b6f13 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -32,9 +32,12 @@
 #include "VecSim/spaces/functions/AVX512FP16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_F16C.h"
 #include "VecSim/spaces/functions/AVX2_FMA.h"
+#include "VecSim/spaces/functions/AVX2_FMA_F16C.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE4.h"
+#include "VecSim/spaces/functions/SSE4_F16C.h"
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/NEON.h"
 #include "VecSim/spaces/functions/NEON_DOTPROD.h"
@@ -560,9 +563,8 @@ TEST_F(SpacesTest, GetDistFuncSQ8Asymmetric) {
 }
 
 TEST_F(SpacesTest, GetDistFuncSQ8FP16Asymmetric) {
-    // SQ8 storage with FP16 query (asymmetric) - should return scalar SQ8_FP16 functions.
-    // SIMD chooser slots are added by P1b (MOD-15152) / P1c (MOD-15153); for now the
-    // dispatcher returns the scalar implementations regardless of dim or arch.
+    // SQ8 storage with FP16 query (asymmetric) - should return SQ8_FP16 functions.
+    // Per-ISA dispatcher walk coverage lives in the SQ8_FP16 SpacesOptimizationTest below.
     size_t dim = 128;
     auto l2_func = spaces::GetDistFunc<sq8, float, float16>(VecSimMetric_L2, dim, nullptr);
     auto ip_func = spaces::GetDistFunc<sq8, float, float16>(VecSimMetric_IP, dim, nullptr);
@@ -570,9 +572,6 @@ TEST_F(SpacesTest, GetDistFuncSQ8FP16Asymmetric) {
     ASSERT_EQ(l2_func, L2_SQ8_FP16_GetDistFunc(dim, nullptr));
     ASSERT_EQ(ip_func, IP_SQ8_FP16_GetDistFunc(dim, nullptr));
     ASSERT_EQ(cosine_func, Cosine_SQ8_FP16_GetDistFunc(dim, nullptr));
-    ASSERT_EQ(l2_func, SQ8_FP16_L2Sqr);
-    ASSERT_EQ(ip_func, SQ8_FP16_InnerProduct);
-    ASSERT_EQ(cosine_func, SQ8_FP16_Cosine);
 }
 
 #ifdef CPU_FEATURES_ARCH_X86_64
@@ -3000,8 +2999,9 @@ TEST(SQ8_FP32_EdgeCases, CosineExtremeValuesTest) {
 
 // Parameterized tests that verify the scalar SQ8_FP16 kernels against the not-optimized
 // baseline across multiple dimensions, including odd dimensions and SIMD-boundary residues.
-// SIMD chooser slots are added by P1b (MOD-15152) / P1c (MOD-15153); the dispatcher always
-// returns the scalar implementation for now.
+// The SIMD-tier dispatcher coverage lives in SQ8_FP16_SpacesOptimizationTest below; this
+// suite intentionally exercises the scalar reference directly to keep it as a fixed baseline
+// the SIMD tiers are compared against.
 class SQ8_FP16_NoOptimizationSpacesTest : public testing::TestWithParam<size_t> {};
 
 TEST_P(SQ8_FP16_NoOptimizationSpacesTest, SQ8_FP16_L2SqrTest) {
@@ -3070,10 +3070,318 @@ INSTANTIATE_TEST_SUITE_P(SQ8_FP16_NoOpt, SQ8_FP16_NoOptimizationSpacesTest,
                          testing::Values(1, 5, 7, 8, 9, 15, 16, 17, 31, 32, 33, 47, 48, 49, 63, 64,
                                          65, 127, 128));
 
+/* ======================== SQ8_FP16 SIMD optimisation tests ========================= */
+
+// Walks down the x86 ISA tiers (AVX-512 → AVX2+FMA → AVX2 → SSE4 → scalar) and asserts
+// that {IP,Cosine,L2}_SQ8_FP16_GetDistFunc returns the expected Choose_* symbol and that
+// its output matches the scalar baseline within 0.01.
+class SQ8_FP16_SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
+
+TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+
+    size_t query_count =
+        dim + sq8::query_metadata_count<VecSimMetric_L2>() * (sizeof(float) / sizeof(float16));
+    std::vector<float16> v1_query(query_count);
+    test_utils::populate_sq8_fp16_query(v1_query.data(), dim, false, 1234);
+
+    size_t quantized_size =
+        dim * sizeof(uint8_t) + sq8::storage_metadata_count<VecSimMetric_L2>() * sizeof(float);
+    std::vector<uint8_t> v2_compressed(quantized_size);
+    test_utils::populate_float_vec_to_sq8_with_metadata(v2_compressed.data(), dim, false, 5678);
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = SQ8_FP16_L2Sqr(v2_compressed.data(), v1_query.data(), dim);
+
+#ifdef OPT_AVX512F
+    if (optimization.avx512f) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_AVX512F(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        optimization.avx512f = 0;
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+#ifdef OPT_F16C
+    if (optimization.avx2 && optimization.fma3 && optimization.f16c) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "AVX2+FMA with dim " << dim;
+        optimization.fma3 = 0;
+    }
+#endif
+#endif
+#ifdef OPT_AVX2
+#ifdef OPT_F16C
+    if (optimization.avx2 && optimization.f16c) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_AVX2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "AVX2 with dim " << dim;
+        optimization.avx2 = 0;
+    }
+#endif
+#endif
+#ifdef OPT_SSE4
+#ifdef OPT_F16C
+    if (optimization.sse4_1 && optimization.f16c && optimization.avx) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SSE4(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SSE4 with dim " << dim;
+        optimization.sse4_1 = 0;
+    }
+#endif
+#endif
+
+    // Scalar fallback. Init alignment to a sentinel (0xFF) so the assert below actually verifies
+    // that the dispatcher LEAVES THE VALUE UNTOUCHED on the scalar path — initialising to 0 then
+    // asserting `== 0` would pass even if the dispatcher were a no-op.
+    unsigned char alignment = 0xFF;
+    arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_FP16_L2Sqr)
+        << "Unexpected scalar fallback function for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+        << "Scalar fallback with dim " << dim;
+    ASSERT_EQ(alignment, 0xFF) << "Scalar fallback must leave caller's alignment value untouched "
+                                  "(dim "
+                               << dim << ")";
+}
+
+TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+
+    size_t query_count =
+        dim + sq8::query_metadata_count<VecSimMetric_L2>() * (sizeof(float) / sizeof(float16));
+    std::vector<float16> v1_query(query_count);
+    test_utils::populate_sq8_fp16_query(v1_query.data(), dim, true, 1234);
+
+    size_t quantized_size =
+        dim * sizeof(uint8_t) + sq8::storage_metadata_count<VecSimMetric_L2>() * sizeof(float);
+    std::vector<uint8_t> v2_compressed(quantized_size);
+    test_utils::populate_float_vec_to_sq8_with_metadata(v2_compressed.data(), dim, true, 5678);
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = SQ8_FP16_InnerProduct(v2_compressed.data(), v1_query.data(), dim);
+
+#ifdef OPT_AVX512F
+    if (optimization.avx512f) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_AVX512F(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        optimization.avx512f = 0;
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+#ifdef OPT_F16C
+    if (optimization.avx2 && optimization.fma3 && optimization.f16c) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "AVX2+FMA with dim " << dim;
+        optimization.fma3 = 0;
+    }
+#endif
+#endif
+#ifdef OPT_AVX2
+#ifdef OPT_F16C
+    if (optimization.avx2 && optimization.f16c) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_AVX2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "AVX2 with dim " << dim;
+        optimization.avx2 = 0;
+    }
+#endif
+#endif
+#ifdef OPT_SSE4
+#ifdef OPT_F16C
+    if (optimization.sse4_1 && optimization.f16c && optimization.avx) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SSE4(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SSE4 with dim " << dim;
+        optimization.sse4_1 = 0;
+    }
+#endif
+#endif
+
+    // Scalar fallback — see L2 test for the 0xFF sentinel rationale.
+    unsigned char alignment = 0xFF;
+    arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_FP16_InnerProduct)
+        << "Unexpected scalar fallback function for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+        << "Scalar fallback with dim " << dim;
+    ASSERT_EQ(alignment, 0xFF) << "Scalar fallback must leave caller's alignment value untouched "
+                                  "(dim "
+                               << dim << ")";
+}
+
+TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+
+    size_t query_count =
+        dim + sq8::query_metadata_count<VecSimMetric_L2>() * (sizeof(float) / sizeof(float16));
+    std::vector<float16> v1_query(query_count);
+    test_utils::populate_sq8_fp16_query(v1_query.data(), dim, true, 1234);
+
+    size_t quantized_size =
+        dim * sizeof(uint8_t) + sq8::storage_metadata_count<VecSimMetric_L2>() * sizeof(float);
+    std::vector<uint8_t> v2_compressed(quantized_size);
+    test_utils::populate_float_vec_to_sq8_with_metadata(v2_compressed.data(), dim, true, 5678);
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = SQ8_FP16_Cosine(v2_compressed.data(), v1_query.data(), dim);
+
+#ifdef OPT_AVX512F
+    if (optimization.avx512f) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_AVX512F(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        optimization.avx512f = 0;
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+#ifdef OPT_F16C
+    if (optimization.avx2 && optimization.fma3 && optimization.f16c) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "AVX2+FMA with dim " << dim;
+        optimization.fma3 = 0;
+    }
+#endif
+#endif
+#ifdef OPT_AVX2
+#ifdef OPT_F16C
+    if (optimization.avx2 && optimization.f16c) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_AVX2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "AVX2 with dim " << dim;
+        optimization.avx2 = 0;
+    }
+#endif
+#endif
+#ifdef OPT_SSE4
+#ifdef OPT_F16C
+    if (optimization.sse4_1 && optimization.f16c && optimization.avx) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_SSE4(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SSE4 with dim " << dim;
+        optimization.sse4_1 = 0;
+    }
+#endif
+#endif
+
+    // Scalar fallback — see L2 test for the 0xFF sentinel rationale.
+    unsigned char alignment = 0xFF;
+    arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_FP16_Cosine)
+        << "Unexpected scalar fallback function for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+        << "Scalar fallback with dim " << dim;
+    ASSERT_EQ(alignment, 0xFF) << "Scalar fallback must leave caller's alignment value untouched "
+                                  "(dim "
+                               << dim << ")";
+}
+
+// Dim range [16, 32] covers every residual class for the 16-element chunk used by every tier.
+INSTANTIATE_TEST_SUITE_P(SQ8_FP16_SIMD, SQ8_FP16_SpacesOptimizationTest,
+                         testing::Range(16UL, 16 * 2UL + 1));
+
+// Higher dimensions surface multi-iteration loop bugs (pointer stride, do-while termination
+// off-by-one) that the [16, 32] range does not exercise because the AVX-512 inner loop runs at
+// most twice in that range.
+INSTANTIATE_TEST_SUITE_P(SQ8_FP16_SIMD_HighDim, SQ8_FP16_SpacesOptimizationTest,
+                         testing::Values(64UL, 128UL, 256UL, 512UL, 1024UL));
+
+// Surfaces which SIMD tiers were actually exercised on the current host. Without this, a CI
+// runner that lacks AVX-512 silently passes with zero tier-1 coverage. Logs per-tier presence
+// to stderr and GTEST_SKIPs only when no SIMD tier is available at all.
+TEST(SQ8_FP16_SIMD_TierCoverage, ReportTiersExercised) {
+    auto opt = getCpuOptimizationFeatures();
+    bool any_simd = false;
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni) {
+        std::cerr << "[SQ8_FP16] AVX-512 F+BW+VL+VNNI tier exercised\n";
+        any_simd = true;
+    } else {
+        std::cerr << "[SQ8_FP16] AVX-512 F+BW+VL+VNNI tier NOT exercised on this host\n";
+    }
+#endif
+#if defined(OPT_AVX2_FMA) && defined(OPT_F16C)
+    if (opt.avx2 && opt.fma3 && opt.f16c) {
+        std::cerr << "[SQ8_FP16] AVX2+FMA+F16C tier exercised\n";
+        any_simd = true;
+    } else {
+        std::cerr << "[SQ8_FP16] AVX2+FMA+F16C tier NOT exercised on this host\n";
+    }
+#endif
+#if defined(OPT_AVX2) && defined(OPT_F16C)
+    if (opt.avx2 && opt.f16c) {
+        std::cerr << "[SQ8_FP16] AVX2+F16C tier exercised\n";
+        any_simd = true;
+    } else {
+        std::cerr << "[SQ8_FP16] AVX2+F16C tier NOT exercised on this host\n";
+    }
+#endif
+#if defined(OPT_SSE4) && defined(OPT_F16C)
+    if (opt.sse4_1 && opt.f16c && opt.avx) {
+        std::cerr << "[SQ8_FP16] SSE4+F16C+AVX tier exercised\n";
+        any_simd = true;
+    } else {
+        std::cerr << "[SQ8_FP16] SSE4+F16C+AVX tier NOT exercised on this host\n";
+    }
+#endif
+#endif // x86_64
+
+    if (!any_simd) {
+        GTEST_SKIP() << "No SQ8_FP16 SIMD tier available on this host — scalar fallback only.";
+    }
+}
+
 /* ======================== Tests SQ8_FP16 (edge cases) ========================= */
 
 // Zero FP16 query against a non-zero SQ8 storage. IP must be exactly 1.0 (1 - 0),
-// L2² must equal Σ dequantized².
+// L2² must equal Σ dequantized². Math correctness on adversarial inputs is verified
+// against the scalar reference; SIMD tier coverage with branchless kernels is provided
+// separately by SQ8_FP16_SpacesOptimizationTest.
 TEST(SQ8_FP16_EdgeCases, ZeroQueryTest) {
     size_t dim = 64;