diff --git a/.github/workflows/run-bench.yml b/.github/workflows/run-bench.yml
index 4026e3707..dbd0c67d3 100644
--- a/.github/workflows/run-bench.yml
+++ b/.github/workflows/run-bench.yml
@@ -90,6 +90,9 @@ jobs:
       - name: Set up GCC
         run: |
           sudo apt install -y gcc
+      - name: Install Meson and Ninja
+        run: |
+          sudo apt update && sudo apt install -y meson ninja-build
       - uses: actions/checkout@v4
       - name: Set up JDK ${{ matrix.jdk }}
         uses: actions/setup-java@v3
@@ -126,6 +129,9 @@ jobs:
           ref: ${{ matrix.branch }}
           fetch-depth: 0
 
+      - name: Initialize Git Submodules
+        run: git submodule update --init
+
       # ==========================================
       # Decode and write the protected dataset catalog
       #
@@ -241,9 +247,15 @@ jobs:
     needs: test-avx512
     runs-on: ubuntu-latest
     steps:
+      - name: Install Meson and Ninja
+        run: |
+          sudo apt update && sudo apt install -y meson ninja-build
       - name: Checkout repository
         uses: actions/checkout@v4
 
+      - name: Initialize Git Submodules
+        run: git submodule update --init
+
       - name: Download all benchmark results
         uses: actions/download-artifact@v4
         with:
diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml
index 7fb47f9ba..2ffd08ee9 100644
--- a/.github/workflows/unit-tests.yaml
+++ b/.github/workflows/unit-tests.yaml
@@ -40,7 +40,12 @@ jobs:
       - name: Set up GCC
         run: |
           sudo apt install -y gcc
+      - name: Install Meson and Ninja
+        run: |
+          sudo apt update && sudo apt install -y meson ninja-build
       - uses: actions/checkout@v4
+      - name: Initialize Git Submodules
+        run: git submodule update --init
       - name: Set up JDK ${{ matrix.jdk }}
         uses: actions/setup-java@v3
         with:
@@ -102,6 +107,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
+      - name: Initialize Git Submodules
+        run: git submodule update --init
       - name: Set up JDK
         uses: actions/setup-java@v3
         with:
@@ -112,6 +119,10 @@ jobs:
         if: matrix.os == 'ubuntu-latest'
         run: |
           sudo apt install -y gcc
+      - name: Install Meson and Ninja
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          sudo apt update && sudo apt install -y meson ninja-build
       - name: Compile, run tests, and package (JDK 22)
         run: mvn -B verify
         if: matrix.jdk == '22'
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..dcfe38082
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "jvector-native/src/main/c/third_party/highway"]
+	path = jvector-native/src/main/c/third_party/highway
+	url = https://github.com/google/highway.git
diff --git a/README.md b/README.md
index 10e9eb738..bb0f17dd7 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,30 @@ a dependency from any Java 11 code. When run on a Java 20+ JVM with the Vector m
 providers will be used. In general, the project is structured to be built with JDK 20+, but when `JAVA_HOME` is set to
 Java 11 -> Java 19, certain build features will still be available.
 
+### Cloning
+
+This repository uses a Git submodule for [Google Highway](https://github.com/google/highway), located at
+`jvector-native/src/main/c/third_party/highway`. After cloning, initialise it with:
+
+```bash
+git submodule update --init
+```
+
+Or clone with submodules in one step:
+
+```bash
+git clone --recurse-submodules <repo-url>
+```
+
+### Building native libraries
+
+The native SIMD library (`libjvector.so`) requires **g++ 11+** and is built by the script
+`jvector-native/src/main/c/jextract_vector_simd.sh`. To build and auto-install `g++` on Ubuntu:
+
+```bash
+./jvector-native/src/main/c/jextract_vector_simd.sh --auto-install-g++
+```
+
 Base code is in [jvector-base](./jvector-base) and will be built for Java 11 releases, restricting language features and APIs
 appropriately. Code in [jvector-twenty](./jvector-twenty) will be compiled for Java 20 language features/APIs and included in the final
 multirelease jar targeting supported JVMs. [jvector-multirelease](./jvector-multirelease) packages [jvector-base](./jvector-base) and [jvector-twenty](./jvector-twenty) as a
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQDecoder.java b/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQDecoder.java
index 85244befd..ced0124fd 100644
--- a/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQDecoder.java
+++ b/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQDecoder.java
@@ -99,9 +99,7 @@ public CosineDecoder(PQVectors cv, VectorFloat<?> query) {
                 for (int m = 0; m < pq.getSubspaceCount(); ++m) {
                     int size = pq.subvectorSizesAndOffsets[m][0];
                     var codebook = pq.codebooks[m];
-                    for (int j = 0; j < pq.getClusterCount(); ++j) {
-                        partialMagnitudes.set((m * pq.getClusterCount()) + j, VectorUtil.dotProduct(codebook, j * size, codebook, j * size, size));
-                    }
+                    VectorUtil.calculatePartialSelfMagnitudes(codebook, m, size, pq.getClusterCount(), partialMagnitudes);
                 }
                 return partialMagnitudes;
             });
@@ -117,9 +115,7 @@ public CosineDecoder(PQVectors cv, VectorFloat<?> query) {
                 int offset = pq.subvectorSizesAndOffsets[m][1];
                 int size = pq.subvectorSizesAndOffsets[m][0];
                 var codebook = pq.codebooks[m];
-                for (int j = 0; j < pq.getClusterCount(); ++j) {
-                    partialSums.set((m * pq.getClusterCount()) + j, VectorUtil.dotProduct(codebook, j * size, centeredQuery, offset, size));
-                }
+                VectorUtil.calculatePartialSums(codebook, m, size, pq.getClusterCount(), centeredQuery, offset, VectorSimilarityFunction.DOT_PRODUCT, partialSums);
             }
 
             this.bMagnitude = VectorUtil.dotProduct(centeredQuery, centeredQuery);
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtil.java b/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtil.java
index 83cb5885b..744d5ec75 100644
--- a/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtil.java
+++ b/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtil.java
@@ -182,6 +182,10 @@ public static void calculatePartialSums(VectorFloat<?> codebook, int codebookInd
     impl.calculatePartialSums(codebook, codebookIndex, size, clusterCount, query, offset, vsf, partialSums);
   }
 
+  public static void calculatePartialSelfMagnitudes(VectorFloat<?> codebook, int codebookIndex, int size, int clusterCount, VectorFloat<?> partialMagnitudes) {
+    impl.calculatePartialSelfMagnitudes(codebook, codebookIndex, size, clusterCount, partialMagnitudes);
+  }
+
   /**
    * Calculates the maximum value in the vector.
    * @param v vector
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtilSupport.java b/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtilSupport.java
index d8223ab12..118f16ca6 100644
--- a/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtilSupport.java
+++ b/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtilSupport.java
@@ -134,6 +134,13 @@ public interface VectorUtilSupport {
 
   void calculatePartialSums(VectorFloat<?> codebook, int codebookIndex, int size, int clusterCount, VectorFloat<?> query, int offset, VectorSimilarityFunction vsf, VectorFloat<?> partialSums);
 
+  default void calculatePartialSelfMagnitudes(VectorFloat<?> codebook, int codebookIndex, int size, int clusterCount, VectorFloat<?> partialMagnitudes) {
+    int codebookBase = codebookIndex * clusterCount;
+    for (int i = 0; i < clusterCount; i++) {
+      partialMagnitudes.set(codebookBase + i, dotProduct(codebook, i * size, codebook, i * size, size));
+    }
+  }
+
   float max(VectorFloat<?> v);
   float min(VectorFloat<?> v);
 
diff --git a/jvector-native/src/main/c/README.md b/jvector-native/src/main/c/README.md
new file mode 100644
index 000000000..5f3e2329d
--- /dev/null
+++ b/jvector-native/src/main/c/README.md
@@ -0,0 +1,310 @@
+<!--
+Copyright DataStax, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# JVector Native SIMD Library
+
+This directory contains the C++ source for `libjvector.so`, the native SIMD
+backend that accelerates vector operations in JVector via the Java Foreign
+Function & Memory (FFM) API.
+
+> **Platform support:** Currently enabled on **x86-64** (SSE4.2, AVX2, and
+> AVX-512). Support for **ARM** (NEON and SVE) is planned for the near future;
+> the [Google Highway](https://github.com/google/highway) library used for
+> SIMD portability already targets both AArch64 targets, which will make the
+> extension straightforward.
+
+---
+
+## Directory layout
+
+```
+jvector_simd.h              — Public C ABI (symbols exported to Java via FFM)
+jvector_simd.cpp            — Runtime ISA dispatcher; thin wrappers over the vtable
+jvector_simd_kernels.h      — Internal C++ declarations for all ISA namespaces
+jvector_simd_kernels.cpp    — Actual SIMD kernels (compiled three times, see below)
+jvector_cpuFeatures.h       — CPUID/XGETBV-based CPU feature detection
+assertHwyTargets.h          — Compile-time assertions that the expected HWY target is active
+meson.build                 — Build description
+jextract_vector_simd.sh     — Build + jextract script (the usual entry point)
+third_party/highway/        — Google Highway header-only library (git submodule)
+```
+
+---
+
+## How to build
+
+### Prerequisites
+
+| Tool | Minimum version | Notes |
+|------|----------------|-------|
+| g++ / clang++ | GCC 11+ | Must support `-march=skylake-avx512` |
+| [Meson](https://mesonbuild.com/) | 0.55 | `pip install meson` |
+| [Ninja](https://ninja-build.org/) | any | `sudo apt install ninja-build` |
+| Git submodules | — | `git submodule update --init` (needed once) |
+
+### Build steps
+
+Run the build script from this directory:
+
+```bash
+bash jextract_vector_simd.sh
+```
+
+The script:
+1. Verifies prerequisites (g++, meson, ninja, Highway submodule).
+2. Runs `meson setup build --wipe --buildtype=release` then `meson compile`.
+3. Copies the versioned `.so` to `../resources/libjvector.so` where the Java
+   `LibraryLoader` expects it.
+4. Optionally re-generates the Java FFM bindings via `jextract` (only needed
+   when `jvector_simd.h` changes — see [Updating the Java bindings](#updating-the-java-bindings)).
+
+To build without regenerating bindings (e.g. when `jextract` is not installed):
+
+```bash
+bash jextract_vector_simd.sh   # jextract step is skipped with a warning if not found
+```
+
+### Manual meson build
+
+```bash
+cd jvector-native/src/main/c
+meson setup build --wipe --buildtype=release
+meson compile -C build
+```
+
+The output is `build/libjvector.so.<version>`.
+
+---
+
+## How it is integrated into JVector
+
+```
+Java caller
+  └─ NativeVectorUtilSupport   (jvector-native/.../vector/)
+       └─ NativeSimdOps        (jvector-native/.../vector/cnative/ — FFM glue, generated by jextract)
+            └─ libjvector.so   (this library, loaded at runtime by LibraryLoader)
+                 └─ jvector_simd.cpp — dispatches to the best ISA vtable
+                      ├─ AVX3::*   (compiled with -march=skylake-avx512)
+                      ├─ AVX2::*   (compiled with -march=haswell)
+                      └─ SSE42::*  (compiled with -msse4.2, scalar fallback)
+```
+
+### Load sequence
+
+1. `NativeVectorizationProvider` calls `LibraryLoader.loadJvector()` at startup.
+2. `LibraryLoader` first tries `System.loadLibrary("jvector")` (picks up a
+   system-installed `.so`), then falls back to extracting `libjvector.so` from
+   the JAR's resources and loading it from a temp file.
+3. On first call into `NativeSimdOps`, the FFM `SymbolLookup` resolves each
+   exported symbol directly against the loaded library.
+
+### ISA dispatch
+
+Dispatch happens **once** at C++ static-init time (before `main()`):
+
+1. `populate_cpu_features()` issues CPUID / XGETBV and fills a feature array.
+2. `dispatch_kernels()` checks the feature array (AVX-512 ⊇ SKX baseline, then
+   AVX2, then SSE4.2) and returns a pointer to the matching `KernelVTable`.
+3. All public API functions are one-liner wrappers that call through
+   `kernels.<fn>`.
+
+### ISA cap (runtime override)
+
+Set the `JVECTOR_MAX_ISA` environment variable before starting the JVM to cap
+the selected ISA without recompiling:
+
+```bash
+JVECTOR_MAX_ISA=avx2  java ...   # use AVX2 even on an AVX-512 machine
+JVECTOR_MAX_ISA=sse42 java ...   # force scalar/SSE4.2 fallback
+```
+
+Accepted values (case-sensitive): `avx3`, `avx2`, `sse42`.
+An unrecognised value is silently ignored and full CPU detection is used.
+
+### Updating the Java bindings
+
+The Java FFM glue in `cnative/NativeSimdOps.java` is generated from
+`jvector_simd.h` by `jextract`. Re-run the script whenever the public C header
+changes:
+
+```bash
+bash jextract_vector_simd.sh   # requires jextract on PATH
+```
+
+After regeneration the script automatically patches the generated file to add
+`Linker.Option.critical(true)` to every downcall, which avoids heap allocation
+on each call.
+
+---
+
+## Adding a new SIMD kernel
+
+Follow these steps to add a new function, e.g. `my_new_op_f32`.
+
+### 1. Declare the function in the public header (`jvector_simd.h`)
+
+```c
+JVECTOR_SIMD_API float my_new_op_f32(const float* a, size_t length);
+```
+
+Use the `JVECTOR_SIMD_API` macro on every symbol that must be visible to Java.
+
+### 2. Add the per-ISA declaration to `jvector_simd_kernels.h`
+
+Inside the `DECLARE_SIMD_KERNELS` macro, add:
+
+```cpp
+float my_new_op_f32(const float* a, size_t length); \
+```
+
+The macro is instantiated for `AVX3`, `AVX2`, and `SSE42`, so this single
+addition covers all three namespaces.
+
+### 3. Implement the kernel in `jvector_simd_kernels.cpp`
+
+Because the file is compiled three times (once per ISA variant via different
+`-DJV_ISA=<NS>` / `-march=` flags), a single implementation body serves all
+tiers. Use Highway intrinsics so that the compiler emits the right instructions
+for each target:
+
+```cpp
+namespace JV_ISA {   // expands to AVX3 / AVX2 / SSE42 at compile time
+
+HWY_FLATTEN float my_new_op_f32(const float* HWY_RESTRICT a, size_t length)
+{
+    const ScalableTag<float> d;
+    auto sum = Zero(d);
+    size_t i = 0;
+    for (; i + Lanes(d) <= length; i += Lanes(d))
+        sum = Add(sum, LoadU(d, a + i));
+    // scalar tail ...
+    return ReduceSum(d, sum);
+}
+
+} // namespace JV_ISA
+```
+
+`HWY_FLATTEN` ensures the compiler inlines all Highway helpers into one body,
+preventing cross-ISA call overhead.
+
+#### Platform-specific override (when Highway's auto-vectorisation is suboptimal)
+
+In case Highway generates sub-optimal code for a particular ISA, you can write 
+a hand-tuned path for one tier while keeping the generic Highway path for the 
+rest. Use `HWY_STATIC_TARGET` (the compile-time target selected for this translation
+unit) to gate the specialisation:
+
+```cpp
+namespace JV_ISA {
+
+HWY_FLATTEN float my_new_op_f32(const float* HWY_RESTRICT a, size_t length)
+{
+#if HWY_STATIC_TARGET == HWY_AVX3
+    // Hand-tuned AVX-512 path: use 512-bit loads + VRANGEPS for absolute value,
+    // or any other intrinsic that Highway does not yet expose.
+    __m512 acc = _mm512_setzero_ps();
+    size_t i = 0;
+    for (; i + 16 <= length; i += 16)
+        acc = _mm512_add_ps(acc, _mm512_loadu_ps(a + i));
+    float result = _mm512_reduce_add_ps(acc);
+    for (; i < length; ++i) result += a[i];   // scalar tail
+    return result;
+#else
+    // Generic Highway path — works for SSE42, AVX2, and any future target.
+    const ScalableTag<float> d;
+    auto sum = Zero(d);
+    size_t i = 0;
+    for (; i + Lanes(d) <= length; i += Lanes(d))
+        sum = Add(sum, LoadU(d, a + i));
+    // scalar tail
+    for (; i < length; ++i) sum = Add(sum, Set(d, a[i]));
+    return ReduceSum(d, sum);
+#endif
+}
+
+} // namespace JV_ISA
+```
+
+Key points:
+
+- `HWY_STATIC_TARGET` is defined by Highway to the active compile-time target
+  (`HWY_AVX3`, `HWY_AVX2`, `HWY_SSE4`, …). Because each ISA variant is a
+  **separate compilation unit** (see `meson.build`), the preprocessor condition
+  is resolved at compile time with zero runtime overhead.
+- The available macros for x86 are `HWY_SSE4`, `HWY_AVX2`, and `HWY_AVX3`
+  (AVX-512 / Skylake-AVX512 baseline). Future ARM tiers will expose
+  `HWY_NEON` and `HWY_SVE`.
+- Only add a specialised path when benchmarks justify the extra maintenance
+  cost. The generic Highway path is usually within a few percent of hand-written
+  intrinsics.
+- Raw intrinsics (`_mm512_*`, `_mm256_*`) require the corresponding headers
+  (`<immintrin.h>`). Highway already includes them transitively, so no
+  additional `#include` is needed.
+
+### 4. Add the function pointer to the `KernelVTable` struct (`jvector_simd.cpp`)
+
+```cpp
+struct KernelVTable {
+    // ... existing fields ...
+    float (*my_new_op_f32)(const float *, size_t);
+};
+```
+
+### 5. Fill the vtable for each ISA in the `DEFINE_ISA_VTABLE` macro
+
+```cpp
+#define DEFINE_ISA_VTABLE(ISA)          \
+    static const KernelVTable ISA##_vtable = { \
+        /* ... existing entries ... */  \
+        ISA::my_new_op_f32,             \
+    }
+```
+
+Order must match the struct field order exactly.
+
+### 6. Add the public wrapper (`jvector_simd.cpp`)
+
+```cpp
+float my_new_op_f32(const float *a, size_t length)
+{
+    return kernels.my_new_op_f32(a, length);
+}
+```
+
+### 7. Rebuild and regenerate Java bindings
+
+```bash
+bash jextract_vector_simd.sh
+```
+
+This rebuilds the `.so` and regenerates `NativeSimdOps.java` from the updated
+header.
+
+### 8. Add the Java call site
+
+In `NativeVectorUtilSupport.java` (or wherever appropriate), call through
+`NativeSimdOps`:
+
+```java
+public float myNewOp(VectorFloat<?> a) {
+    return NativeSimdOps.my_new_op_f32(
+        ((MemorySegmentVectorFloat) a).get(), a.length());
+}
+```
+
+Override the corresponding method in the `VectorUtilSupport` interface if one
+exists, or add a new interface method and update `PanamaVectorUtilSupport` with
+a pure-Java fallback first.
diff --git a/jvector-native/src/main/c/jvector_simd_check.c b/jvector-native/src/main/c/assertHwyTargets.h
similarity index 52%
rename from jvector-native/src/main/c/jvector_simd_check.c
rename to jvector-native/src/main/c/assertHwyTargets.h
index 50736e85a..a096b197b 100644
--- a/jvector-native/src/main/c/jvector_simd_check.c
+++ b/jvector-native/src/main/c/assertHwyTargets.h
@@ -14,17 +14,14 @@
  * limitations under the License.
  */
 
-#include <cpuid.h>
-#include "jvector_simd.h"
-
-bool check_avx512_compatibility(void) {
-    /* __builtin_cpu_init required when this is used in ifunc
-       resolver/__attribute__((constructor)) context, otherwise the CPU
-       features may not be detected correctly. */
-    __builtin_cpu_init();
-    return (__builtin_cpu_supports("avx512f") &&
-        __builtin_cpu_supports("avx512cd") &&
-        __builtin_cpu_supports("avx512dq") &&
-        __builtin_cpu_supports("avx512bw") &&
-        __builtin_cpu_supports("avx512vl"));
-}
+#if defined(__x86_64__) || defined(_M_X64)
+#if defined(JV_REQUIRE_HWY_AVX3)
+#if HWY_STATIC_TARGET != HWY_AVX3
+#error "Highway did not select HWY_AVX3 for the AVX-512 build. Check compiler flags, compiler support, and Highway blocklists."
+#endif
+#elif defined(JV_REQUIRE_HWY_AVX2)
+#if HWY_STATIC_TARGET != HWY_AVX2
+#error "Highway did not select HWY_AVX2 for the AVX2 build. Check compiler flags, compiler support, and Highway blocklists."
+#endif
+#endif //
+#endif // __X86_64__
diff --git a/jvector-native/src/main/c/jextract_vector_simd.sh b/jvector-native/src/main/c/jextract_vector_simd.sh
index d44d375dd..18719ba3d 100755
--- a/jvector-native/src/main/c/jextract_vector_simd.sh
+++ b/jvector-native/src/main/c/jextract_vector_simd.sh
@@ -17,48 +17,80 @@ set -e
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if [ "$1" == "--auto-install-gcc" ] ; then AUTO_INSTALL_GCC=true ; shift ; fi
+if [ "$1" == "--auto-install-gcc" ] || [ "$1" == "--auto-install-g++" ] ; then AUTO_INSTALL_GCC=true ; shift ; fi
 printf "AUTO_INSTALL_GCC=%s\n" "${AUTO_INSTALL_GCC}"
 
 mkdir -p ../resources
-# compile jvector_simd_check.c as x86-64
-# compile jvector_simd.c as skylake-avx512
+# compile jvector_simd_check.cpp as x86-64
+# compile jvector_simd.cpp as skylake-avx512
 # produce one shared library
 
+# Check that the Google Highway submodule has been initialised
+HIGHWAY_DIR="third_party/highway"
+if [ ! -f "${HIGHWAY_DIR}/hwy/highway.h" ]; then
+  echo "ERROR: Google Highway submodule not found at ${HIGHWAY_DIR}."
+  echo "       Run the following command from the repository root to fix this:"
+  echo ""
+  echo "         git submodule update --init"
+  echo ""
+  exit 1
+fi
+
 # Desired minimum GCC version
 MIN_GCC_VERSION=11
 
-if ! command -v gcc &> /dev/null; then
+if ! command -v g++ &> /dev/null; then
   if [ "$AUTO_INSTALL_GCC" == "true" ]
   then
     LSB_RELEASE=$(lsb_release --id --short)
     printf "LSB_RELEASE=%s\n" "${LSB_RELEASE}"
     if [ "${LSB_RELEASE}" == "Ubuntu" ]
-    then sudo apt update && sudo apt install -y gcc
-    else printf "distribution %s needs a gcc install command in %s\n" "${LSB_RELEASE}" "${0}" ; exit 2
+    then sudo apt update && sudo apt install -y g++
+    else printf "distribution %s needs a g++ install command in %s\n" "${LSB_RELEASE}" "${0}" ; exit 2
     fi
   else
-    echo "GCC is not installed. Please install GCC 11+ to build supporting native libraries."
+    echo "g++ is not installed. Please install g++ 11+ to build supporting native libraries."
     exit 2
   fi
 fi
 
-# Check if GCC is installed
-CURRENT_GCC_VERSION=$(gcc -dumpversion)
+# Check g++ version
+CURRENT_GPP_VERSION=$(g++ -dumpversion)
 
 # Check if the current GCC version is greater than or equal to the minimum required version
-if [ "$(printf '%s\n' "$MIN_GCC_VERSION" "$CURRENT_GCC_VERSION" | sort -V | head -n1)" = "$MIN_GCC_VERSION" ]; then
-    rm -rf ../resources/libjvector.so
-    gcc -fPIC -O3 -march=icelake-server -c jvector_simd.c -o jvector_simd.o
-    gcc -fPIC -O3 -march=x86-64 -c jvector_simd_check.c -o jvector_simd_check.o
-    gcc -shared -o ../resources/libjvector.so jvector_simd_check.o jvector_simd.o
-
-    rm -rf jvector_common.o
-    rm -rf jvector_simd.o
-    rm -rf jvector_simd_check.o
-else
-    echo "WARNING: GCC version $CURRENT_GCC_VERSION is too old. Please upgrade to GCC $MIN_GCC_VERSION or newer."
+if [ "$(printf '%s\n' "$MIN_GCC_VERSION" "$CURRENT_GPP_VERSION" | sort -V | head -n1)" != "$MIN_GCC_VERSION" ]; then
+    echo "WARNING: g++ version $CURRENT_GPP_VERSION is too old. Please upgrade to g++ $MIN_GCC_VERSION or newer."
+    exit 1
+fi
+
+# Check meson and ninja are available
+if ! command -v meson &> /dev/null; then
+    echo "meson is not installed. Please install it: pip install meson  or  sudo apt install meson"
+    exit 2
+fi
+if ! command -v ninja &> /dev/null; then
+    echo "ninja is not installed. Please install it: sudo apt install ninja-build"
+    exit 2
+fi
+
+BUILD_DIR="build"
+rm -rf ../resources/libjvector.so
+
+# Configure (--wipe resets any stale configuration) then compile
+meson setup "${BUILD_DIR}" \
+    --wipe \
+    --buildtype=release
+
+meson compile -C "${BUILD_DIR}"
+
+# The versioned .so (e.g. libjvector.so.0.1.0) is the real file; symlinks point to it.
+# Copy it to ../resources/ as the plain libjvector.so for Java System.load().
+SOFILE=$(find "${BUILD_DIR}" -maxdepth 1 -name 'libjvector.so.*' -type f | head -1)
+if [ -z "${SOFILE}" ]; then
+    echo "ERROR: libjvector.so not found in ${BUILD_DIR} after build."
+    exit 1
 fi
+cp "${SOFILE}" ../resources/libjvector.so
 
 # Generate Java source code
 # Should only be run when c header changes
@@ -77,4 +109,4 @@ jextract \
   jvector_simd.h
 
 # Set critical linker option with heap-based segments for all generated methods
-sed -i 's/DESC)/DESC, Linker.Option.critical(true))/g' ../java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java
\ No newline at end of file
+sed -i 's/DESC)/DESC, Linker.Option.critical(true))/g' ../java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java
diff --git a/jvector-native/src/main/c/jvector_cpuFeatures.h b/jvector-native/src/main/c/jvector_cpuFeatures.h
new file mode 100644
index 000000000..524a7cc30
--- /dev/null
+++ b/jvector-native/src/main/c/jvector_cpuFeatures.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CPU_FEATURES_H
+#define CPU_FEATURES_H
+
+#include <array>
+#include <cstdint>
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__GNUC__) || defined(__clang__)
+#include <cpuid.h>
+#endif
+
+// Features needed by the ISA dispatch table.  Extend as new targets are added.
+//
+// ICX  = Intel Ice Lake-SP (Xeon Scalable 3rd Gen)
+// SPR  = Intel Sapphire Rapids (Xeon Scalable 4th Gen)
+enum class CpuFeature : uint32_t {
+    // ---- Base AVX2 / AVX-512 foundation (all SKUs) ----------------------
+    AVX2 = 0,
+    AVX512F = 1,
+    AVX512BW = 2,
+    AVX512CD = 3,
+    AVX512DQ = 4,
+    AVX512VL = 5,
+    // ---- ICX additions --------------------------------------------------
+    AVX512_VNNI = 6, // INT8 dot-product (CPUID 7.0 ECX[11])
+    AVX512_VBMI = 7, // byte permute/shuffle (CPUID 7.0 ECX[1])
+    AVX512_VBMI2 = 8, // byte/word expand+compress (CPUID 7.0 ECX[6])
+    AVX512_IFMA = 9, // 52-bit integer multiply-add (CPUID 7.0 EBX[21])
+    AVX512_BITALG = 10, // bit-manipulation (CPUID 7.0 ECX[12])
+    AVX512_VPOPCNTDQ = 11, // vector popcount dword/qword (CPUID 7.0 ECX[14])
+    GFNI = 12, // Galois-field instructions (CPUID 7.0 ECX[8])
+    VAES = 13, // 256/512-bit AES (CPUID 7.0 ECX[9])
+    VPCLMULQDQ = 14, // wide carry-less multiply (CPUID 7.0 ECX[10])
+    // ---- SPR additions --------------------------------------------------
+    AVX512_FP16 = 15, // FP16 arithmetic (CPUID 7.0 EDX[23])
+    AVX512_BF16 = 16, // BFloat16 arithmetic (CPUID 7.1 EAX[5])
+    AVX_VNNI = 17, // VEX-encoded VNNI 256-bit (CPUID 7.1 EAX[4])
+    COUNT
+};
+
+// Populate `features` by issuing CPUID and XGETBV.
+// All entries are false on non-x86 architectures.
+inline void
+populate_cpu_features(std::array<bool, static_cast<uint32_t>(CpuFeature::COUNT)>
+                              &features) noexcept
+{
+    features.fill(false);
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \
+        || defined(_M_X64)
+
+    // Portable CPUID: GCC/Clang use <cpuid.h>; MSVC uses <intrin.h>.
+    auto run_cpuid = [](uint32_t leaf,
+                        uint32_t subleaf,
+                        uint32_t &eax,
+                        uint32_t &ebx,
+                        uint32_t &ecx,
+                        uint32_t &edx) noexcept {
+#if defined(_MSC_VER)
+        int info[4];
+        __cpuidex(info, static_cast<int>(leaf), static_cast<int>(subleaf));
+        eax = static_cast<uint32_t>(info[0]);
+        ebx = static_cast<uint32_t>(info[1]);
+        ecx = static_cast<uint32_t>(info[2]);
+        edx = static_cast<uint32_t>(info[3]);
+#else
+        __cpuid_count(leaf, subleaf, eax, ebx, ecx, edx);
+#endif
+    };
+
+    // Read XCR0.  Must only be called when OSXSAVE (CPUID.1:ECX[27]) is set;
+    // otherwise executing XGETBV raises #UD.
+    auto read_xcr0 = []() noexcept -> uint64_t {
+#if defined(_MSC_VER)
+        return static_cast<uint64_t>(_xgetbv(0));
+#else
+        uint32_t lo, hi;
+        __asm__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0u));
+        return (static_cast<uint64_t>(hi) << 32) | lo;
+#endif
+    };
+
+    uint32_t eax, ebx, ecx, edx;
+
+    // CPUID leaf 1 — check OSXSAVE: OS enabled XSAVE/XRSTOR (ECX bit 27).
+    run_cpuid(1u, 0u, eax, ebx, ecx, edx);
+    const bool osxsave = (ecx >> 27) & 1u;
+
+    // XCR0 encodes which register state the OS saves on context switch.
+    // Read it only if the CPU and OS have declared XSAVE support.
+    uint64_t xcr0 = 0u;
+    if (osxsave) { xcr0 = read_xcr0(); }
+
+    // Bits 1‥2: XMM and YMM state — required for AVX / AVX2.
+    const bool ymm_enabled = (xcr0 & 0x06u) == 0x06u;
+    // Bits 5‥7: opmask, ZMM_Hi256, Hi16_ZMM — required for AVX-512.
+    const bool zmm_enabled = ymm_enabled && ((xcr0 & 0xe0u) == 0xe0u);
+
+    // CPUID leaf 7, subleaf 0 — extended feature flags.
+    // EAX returns the maximum supported subleaf index for leaf 7.
+    run_cpuid(7u, 0u, eax, ebx, ecx, edx);
+    const uint32_t leaf7_max_subleaf = eax;
+
+    // ---- Base AVX2 / AVX-512 foundation ---------------------------------
+    features[static_cast<uint32_t>(CpuFeature::AVX2)]
+            = ymm_enabled && ((ebx >> 5) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::AVX512F)]
+            = zmm_enabled && ((ebx >> 16) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::AVX512BW)]
+            = zmm_enabled && ((ebx >> 30) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::AVX512CD)]
+            = zmm_enabled && ((ebx >> 28) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::AVX512DQ)]
+            = zmm_enabled && ((ebx >> 17) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::AVX512VL)]
+            = zmm_enabled && ((ebx >> 31) & 1u);
+
+    // ---- ICX: leaf 7.0 EBX additions ------------------------------------
+    features[static_cast<uint32_t>(CpuFeature::AVX512_IFMA)]
+            = zmm_enabled && ((ebx >> 21) & 1u);
+
+    // ---- ICX: leaf 7.0 ECX additions ------------------------------------
+    features[static_cast<uint32_t>(CpuFeature::AVX512_VBMI)]
+            = zmm_enabled && ((ecx >> 1) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::AVX512_VBMI2)]
+            = zmm_enabled && ((ecx >> 6) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::GFNI)]
+            = zmm_enabled && ((ecx >> 8) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::VAES)]
+            = zmm_enabled && ((ecx >> 9) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::VPCLMULQDQ)]
+            = zmm_enabled && ((ecx >> 10) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::AVX512_VNNI)]
+            = zmm_enabled && ((ecx >> 11) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::AVX512_BITALG)]
+            = zmm_enabled && ((ecx >> 12) & 1u);
+    features[static_cast<uint32_t>(CpuFeature::AVX512_VPOPCNTDQ)]
+            = zmm_enabled && ((ecx >> 14) & 1u);
+
+    // ---- SPR: leaf 7.0 EDX additions ------------------------------------
+    features[static_cast<uint32_t>(CpuFeature::AVX512_FP16)]
+            = zmm_enabled && ((edx >> 23) & 1u);
+
+    // ---- SPR: leaf 7.1 EAX additions (guarded by max subleaf) ----------
+    if (leaf7_max_subleaf >= 1u) {
+        run_cpuid(7u, 1u, eax, ebx, ecx, edx);
+        features[static_cast<uint32_t>(CpuFeature::AVX_VNNI)]
+                = ymm_enabled && ((eax >> 4) & 1u);
+        features[static_cast<uint32_t>(CpuFeature::AVX512_BF16)]
+                = zmm_enabled && ((eax >> 5) & 1u);
+    }
+
+#endif // x86 / x86_64
+}
+
+#endif // CPU_FEATURES_H
diff --git a/jvector-native/src/main/c/jvector_simd.c b/jvector-native/src/main/c/jvector_simd.c
deleted file mode 100644
index 6bfde3f9a..000000000
--- a/jvector-native/src/main/c/jvector_simd.c
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Copyright DataStax, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <immintrin.h>
-#include <inttypes.h>
-#include <math.h>
-#include "jvector_simd.h"
-
-__m512i initialIndexRegister;
-__m512i indexIncrement;
-__m512i maskSeventhBit;
-__m512i maskEighthBit;
-
-__attribute__((constructor))
-void initialize_constants() {
-    if (check_avx512_compatibility()) {
-        initialIndexRegister = _mm512_setr_epi32(-16, -15, -14, -13, -12, -11, -10, -9,
-                                             -8, -7, -6, -5, -4, -3, -2, -1);
-        indexIncrement = _mm512_set1_epi32(16);
-        maskSeventhBit = _mm512_set1_epi16(0x0040);
-        maskEighthBit = _mm512_set1_epi16(0x0080);
-    }
-}
-
-float dot_product_f32_64(const float* a, int aoffset, const float* b, int boffset) {
-
-     __m128 va = _mm_castsi128_ps(_mm_loadl_epi64((__m128i *)(a + aoffset)));
-     __m128 vb = _mm_castsi128_ps(_mm_loadl_epi64((__m128i *)(b + boffset)));
-     __m128 r  = _mm_mul_ps(va, vb); // Perform element-wise multiplication
-
-    // Horizontal sum of the vector to get dot product
-    __attribute__((aligned(16))) float result[4];
-    _mm_store_ps(result, r);
-    return result[0] + result[1];
-}
-
-float dot_product_f32_128(const float* a, int aoffset, const float* b, int boffset, int length) {
-    float dot = 0.0;
-    int ao = aoffset;
-    int bo = boffset;
-    int alim = aoffset + length;
-    int blim = boffset + length;
-    int simd_length = length - (length % 4);
-
-    if (length >= 4) {
-        __m128 sum = _mm_setzero_ps();
-
-        for(; ao < aoffset + simd_length; ao += 4, bo += 4) {
-            // Load float32
-            __m128 va = _mm_loadu_ps(a + ao);
-            __m128 vb = _mm_loadu_ps(b + bo);
-
-            // Multiply and accumulate
-            sum = _mm_fmadd_ps(va, vb, sum);
-        }
-
-        // Horizontal sum of the vector to get dot product
-        __attribute__((aligned(16))) float result[4];
-        _mm_store_ps(result, sum);
-
-        for(int i = 0; i < 4; ++i) {
-            dot += result[i];
-        }
-    }
-
-    for (; ao < alim && bo < blim; ao++, bo++) {
-        dot += a[ao] * b[bo];
-    }
-
-    return dot;
-}
-
-float dot_product_f32_256(const float* a, int aoffset, const float* b, int boffset, int length) {
-    float dot = 0.0;
-    int ao = aoffset;
-    int bo = boffset;
-    int alim = aoffset + length;
-    int blim = boffset + length;
-    int simd_length = length - (length % 8);
-
-    if (length >= 8) {
-        __m256 sum = _mm256_setzero_ps();
-
-        for(; ao < aoffset + simd_length; ao += 8, bo += 8) {
-            // Load float32
-            __m256 va = _mm256_loadu_ps(a + ao);
-            __m256 vb = _mm256_loadu_ps(b + bo);
-
-            // Multiply and accumulate
-            sum = _mm256_fmadd_ps(va, vb, sum);
-        }
-
-        // Horizontal sum of the vector to get dot product
-        __attribute__((aligned(32))) float result[8];
-        _mm256_store_ps(result, sum);
-
-        for(int i = 0; i < 8; ++i) {
-            dot += result[i];
-        }
-    }
-
-    for (; ao < alim && bo < blim; ao++, bo++) {
-        dot += a[ao] * b[bo];
-    }
-
-    return dot;
-}
-
-float dot_product_f32_512(const float* a, int aoffset, const float* b, int boffset, int length) {
-    float dot = 0.0;
-    int ao = aoffset;
-    int bo = boffset;
-    int alim = aoffset + length;
-    int blim = boffset + length;
-    int simd_length = length - (length % 16);
-
-    if (length >= 16) {
-        __m512 sum = _mm512_setzero_ps();
-        for(; ao < aoffset + simd_length; ao += 16, bo += 16) {
-            // Load float32
-            __m512 va = _mm512_loadu_ps(a + ao);
-            __m512 vb = _mm512_loadu_ps(b + bo);
-
-            // Multiply and accumulate
-            sum = _mm512_fmadd_ps(va, vb, sum);
-        }
-
-        // Horizontal sum of the vector to get dot product
-        dot = _mm512_reduce_add_ps(sum);
-    }
-
-    for (; ao < alim && bo < blim; ao++, bo++) {
-        dot += a[ao] * b[bo];
-    }
-
-    return dot;
-}
-
-float dot_product_f32(int preferred_size, const float* a, int aoffset, const float* b, int boffset, int length) {
-    if (length == 2)
-        return dot_product_f32_64(a, aoffset, b, boffset);
-    if (length <= 7)
-        return dot_product_f32_128(a, aoffset, b, boffset, length);
-
-    return (preferred_size == 512 && length >= 16)
-           ? dot_product_f32_512(a, aoffset, b, boffset, length)
-           : dot_product_f32_256(a, aoffset, b, boffset, length);
-}
-
-float euclidean_f32_64(const float* a, int aoffset, const float* b, int boffset) {
-     __m128 va = _mm_castsi128_ps(_mm_loadl_epi64((__m128i *)(a + aoffset)));
-     __m128 vb = _mm_castsi128_ps(_mm_loadl_epi64((__m128i *)(b + boffset)));
-     __m128 r  = _mm_sub_ps(va, vb);
-     r = _mm_mul_ps(r, r);
-
-    // Horizontal sum of the vector to get square distance
-    __attribute__((aligned(8))) float result[2];
-    _mm_store_ps(result, r);
-    return result[0] + result[1];
-}
-
-float euclidean_f32_128(const float* a, int aoffset, const float* b, int boffset, int length) {
-    float squareDistance = 0.0;
-    int ao = aoffset;
-    int bo = boffset;
-    int alim = aoffset + length;
-    int blim = boffset + length;
-    int simd_length = length - (length % 4);
-
-    if (length >= 4) {
-        __m128 sum = _mm_setzero_ps();
-
-        for(; ao < aoffset + simd_length; ao += 4, bo += 4) {
-            // Load float32
-            __m128 va = _mm_loadu_ps(a + ao);
-            __m128 vb = _mm_loadu_ps(b + bo);
-            __m128 diff = _mm_sub_ps(va, vb);
-            // Multiply and accumulate
-            sum = _mm_fmadd_ps(diff, diff, sum);
-        }
-
-        // Horizontal sum of the vector to get dot product
-        __attribute__((aligned(16))) float result[4];
-        _mm_store_ps(result, sum);
-
-        for(int i = 0; i < 4; ++i) {
-            squareDistance += result[i];
-        }
-    }
-
-    for (; ao < alim && bo < blim; ao++, bo++) {
-        float diff = a[ao] - b[bo];
-        squareDistance += diff * diff;
-    }
-
-    return squareDistance;
-}
-
-float euclidean_f32_256(const float* a, int aoffset, const float* b, int boffset, int length) {
-    float squareDistance = 0.0;
-    int ao = aoffset;
-    int bo = boffset;
-    int alim = aoffset + length;
-    int blim = boffset + length;
-    int simd_length = length - (length % 8);
-
-    if (length >= 8) {
-        __m256 sum = _mm256_setzero_ps();
-
-        for(; ao < aoffset + simd_length; ao += 8, bo += 8) {
-            // Load float32
-            __m256 va = _mm256_loadu_ps(a + ao);
-            __m256 vb = _mm256_loadu_ps(b + bo);
-            __m256 diff = _mm256_sub_ps(va, vb);
-
-            // Multiply and accumulate
-            sum = _mm256_fmadd_ps(diff, diff, sum);
-        }
-
-        __attribute__((aligned(32))) float result[8];
-        _mm256_store_ps(result, sum);
-
-        for(int i = 0; i < 8; ++i) {
-            squareDistance += result[i];
-        }
-    }
-
-    for (; ao < alim && bo < blim; ao++, bo++) {
-        float diff = a[ao] - b[bo];
-        squareDistance += diff * diff;
-    }
-
-    return squareDistance;
-}
-
-float euclidean_f32_512(const float* a, int aoffset, const float* b, int boffset, int length) {
-    float squareDistance = 0.0;
-    int ao = aoffset;
-    int bo = boffset;
-    int alim = aoffset + length;
-    int blim = boffset + length;
-    int simd_length = length - (length % 16);
-
-    if (length >= 16) {
-        __m512 sum = _mm512_setzero_ps();
-        for(; ao < aoffset + simd_length; ao += 16, bo += 16) {
-            // Load float32
-            __m512 va = _mm512_loadu_ps(a + ao);
-            __m512 vb = _mm512_loadu_ps(b + bo);
-            __m512 diff = _mm512_sub_ps(va, vb);
-
-            // Multiply and accumulate
-            sum = _mm512_fmadd_ps(diff, diff, sum);
-        }
-
-        // Horizontal sum of the vector to get dot product
-        squareDistance = _mm512_reduce_add_ps(sum);
-    }
-
-    for (; ao < alim && bo < blim; ao++, bo++) {
-        float diff = a[ao] - b[bo];
-        squareDistance += diff * diff;
-    }
-
-    return squareDistance;
-}
-
-float euclidean_f32(int preferred_size, const float* a, int aoffset, const float* b, int boffset, int length) {
-    if (length == 2)
-        return euclidean_f32_64(a, aoffset, b, boffset);
-    if (length <= 7)
-        return euclidean_f32_128(a, aoffset, b, boffset, length);
-
-    return (preferred_size == 512 && length >= 16)
-           ? euclidean_f32_512(a, aoffset, b, boffset, length)
-           : euclidean_f32_256(a, aoffset, b, boffset, length);
-}
-
-float assemble_and_sum_f32_512(const float* data, int dataBase, const unsigned char* baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) {
-    __m512 sum = _mm512_setzero_ps();
-    int i = 0;
-    int limit = baseOffsetsLength - (baseOffsetsLength % 16);
-    __m512i indexRegister = initialIndexRegister;
-    __m512i dataBaseVec = _mm512_set1_epi32(dataBase);
-    baseOffsets = baseOffsets + baseOffsetsOffset;
-
-    for (; i < limit; i += 16) {
-        __m128i baseOffsetsRaw = _mm_loadu_si128((__m128i *)(baseOffsets + i));
-        __m512i baseOffsetsInt = _mm512_cvtepu8_epi32(baseOffsetsRaw);
-        // we have base offsets int, which we need to scale to index into data.
-        // first, we want to initialize a vector with the lane number added as an index
-        indexRegister = _mm512_add_epi32(indexRegister, indexIncrement);
-        // then we want to multiply by dataBase
-        __m512i scale = _mm512_mullo_epi32(indexRegister, dataBaseVec);
-        // then we want to add the base offsets
-        __m512i convOffsets = _mm512_add_epi32(scale, baseOffsetsInt);
-
-        __m512 partials = _mm512_i32gather_ps(convOffsets, data, 4);
-        sum = _mm512_add_ps(sum, partials);
-    }
-
-    float res = _mm512_reduce_add_ps(sum);
-    for (; i < baseOffsetsLength; i++) {
-        res += data[dataBase * i + baseOffsets[i]];
-    }
-
-    return res;
-}
-
-float pq_decoded_cosine_similarity_f32_512(const unsigned char* baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float* partialSums, const float* aMagnitude, float bMagnitude) {
-    __m512 sum = _mm512_setzero_ps();
-    __m512 vaMagnitude = _mm512_setzero_ps();
-    int i = 0;
-    int limit = baseOffsetsLength - (baseOffsetsLength % 16);
-    __m512i indexRegister = initialIndexRegister;
-    __m512i scale = _mm512_set1_epi32(clusterCount);
-    baseOffsets = baseOffsets + baseOffsetsOffset;
-
-
-    for (; i < limit; i += 16) {
-        // Load and convert baseOffsets to integers
-        __m128i baseOffsetsRaw = _mm_loadu_si128((__m128i *)(baseOffsets + i));
-        __m512i baseOffsetsInt = _mm512_cvtepu8_epi32(baseOffsetsRaw);
-
-        indexRegister = _mm512_add_epi32(indexRegister, indexIncrement);
-        // Scale the baseOffsets by the cluster count
-        __m512i scaledOffsets = _mm512_mullo_epi32(indexRegister, scale);
-
-        // Calculate the final convOffsets by adding the scaled indexes and the base offsets
-        __m512i convOffsets = _mm512_add_epi32(scaledOffsets, baseOffsetsInt);
-
-        // Gather and sum values for partial sums and a magnitude
-        __m512 partialSumVals = _mm512_i32gather_ps(convOffsets, partialSums, 4);
-        sum = _mm512_add_ps(sum, partialSumVals);
-
-        __m512 aMagnitudeVals = _mm512_i32gather_ps(convOffsets, aMagnitude, 4);
-        vaMagnitude = _mm512_add_ps(vaMagnitude, aMagnitudeVals);
-    }
-
-    // Reduce sums
-    float sumResult = _mm512_reduce_add_ps(sum);
-    float aMagnitudeResult = _mm512_reduce_add_ps(vaMagnitude);
-
-    // Handle the remaining elements
-    for (; i < baseOffsetsLength; i++) {
-        int offset = clusterCount * i + baseOffsets[i];
-        sumResult += partialSums[offset];
-        aMagnitudeResult += aMagnitude[offset];
-    }
-
-    return sumResult / sqrtf(aMagnitudeResult * bMagnitude);
-}
-
-void calculate_partial_sums_dot_f32_512(const float* codebook, int codebookIndex, int size, int clusterCount, const float* query, int queryOffset, float* partialSums) {
-    int codebookBase = codebookIndex * clusterCount;
-    for (int i = 0; i < clusterCount; i++) {
-      partialSums[codebookBase + i] = dot_product_f32(512, codebook, i * size, query, queryOffset, size);
-    }
-}
-
-void calculate_partial_sums_euclidean_f32_512(const float* codebook, int codebookIndex, int size, int clusterCount, const float* query, int queryOffset, float* partialSums) {
-    int codebookBase = codebookIndex * clusterCount;
-    for (int i = 0; i < clusterCount; i++) {
-      partialSums[codebookBase + i] = euclidean_f32(512, codebook, i * size, query, queryOffset, size);
-    }
-}
-
-/* Bulk shuffles for Fused ADC
- * These shuffles take an array of transposed PQ neighbors (in shuffles) and an of quantized partial distances to shuffle.
- * Partial distance quantization depends on the best distance and delta used to quantize.
- * The shuffles for each codebook will be loaded as bytes (supporting up to 256 cluster PQ) and zero-padded to align
- * with 16-bit quantized partial distances. These partial distances will be loaded into SIMD registers, supporting 32 partials
- * per register. Each permutation will take 2 registers, so we need four total permutations to look up against all
- * 256 partial distances. These four permutations will be blended based on the top two bits of each shuffle, allowing 256
- * entry codebook lookup. Quantized partials are quantized based on bounds provided during the search that suggest total
- * distances above the maximum value of an unsigned 16-bit integer will be irrelevant. This allows us to use saturating
- * arithmetic, eliminating the need to widen lanes during accumulation. The total quantized distance is then de-quantized
- * and transformed into the appropriate similarity score.
- *
- * In the case of cosine, we have an additional set of partials used for partial squared magnitudes. These are quantized \
- * with a different pair of delta/base, so they will be aggregated and dequantized separately.
- */
-
-
-__attribute__((always_inline)) inline __m512i lookup_partial_sums(__m512i shuffle, const char* quantizedPartials, int i) {
-    __m512i partialsVecA = _mm512_loadu_epi16(quantizedPartials + i * 512);
-    __m512i partialsVecB = _mm512_loadu_epi16(quantizedPartials + i * 512 + 64);
-    __m512i partialsVecC = _mm512_loadu_epi16(quantizedPartials + i * 512 + 128);
-    __m512i partialsVecD = _mm512_loadu_epi16(quantizedPartials + i * 512 + 192);
-    __m512i partialsVecE = _mm512_loadu_epi16(quantizedPartials + i * 512 + 256);
-    __m512i partialsVecF = _mm512_loadu_epi16(quantizedPartials + i * 512 + 320);
-    __m512i partialsVecG = _mm512_loadu_epi16(quantizedPartials + i * 512 + 384);
-    __m512i partialsVecH = _mm512_loadu_epi16(quantizedPartials + i * 512 + 448);
-
-    __m512i partialsVecAB = _mm512_permutex2var_epi16(partialsVecA, shuffle, partialsVecB);
-    __m512i partialsVecCD = _mm512_permutex2var_epi16(partialsVecC, shuffle, partialsVecD);
-    __m512i partialsVecEF = _mm512_permutex2var_epi16(partialsVecE, shuffle, partialsVecF);
-    __m512i partialsVecGH = _mm512_permutex2var_epi16(partialsVecG, shuffle, partialsVecH);
-
-    __mmask32 maskSeven = _mm512_test_epi16_mask(shuffle, maskSeventhBit);
-    __mmask32 maskEight = _mm512_test_epi16_mask(shuffle, maskEighthBit);
-    __m512i partialsVecABCD = _mm512_mask_blend_epi16(maskSeven, partialsVecAB, partialsVecCD);
-    __m512i partialsVecEFGH = _mm512_mask_blend_epi16(maskSeven, partialsVecEF, partialsVecGH);
-    __m512i partialSumsVec = _mm512_mask_blend_epi16(maskEight, partialsVecABCD, partialsVecEFGH);
-
-    return partialSumsVec;
-}
-
-// dequantize a 256-bit vector containing 16 unsigned 16-bit integers into a 512-bit vector containing 16 32-bit floats
-__attribute__((always_inline)) inline __m512 dequantize(__m256i quantizedVec, float delta, float base) {
-    __m512i quantizedVecWidened = _mm512_cvtepu16_epi32(quantizedVec);
-    __m512 floatVec = _mm512_cvtepi32_ps(quantizedVecWidened);
-    __m512 deltaVec = _mm512_set1_ps(delta);
-    __m512 baseVec = _mm512_set1_ps(base);
-    __m512 dequantizedVec = _mm512_fmadd_ps(floatVec, deltaVec, baseVec);
-    return dequantizedVec;
-}
-
-void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartials, float delta, float minDistance, float* results) {
-    __m512i sum = _mm512_setzero_epi32();
-
-    for (int i = 0; i < codebookCount; i++) {
-         __m256i smallShuffle = _mm256_loadu_epi8(shuffles + i * 32);
-         __m512i shuffle = _mm512_cvtepu8_epi16(smallShuffle);
-        __m512i partialsVec = lookup_partial_sums(shuffle, quantizedPartials, i);
-
-        sum = _mm512_adds_epu16(sum, partialsVec);
-    }
-
-    __m256i quantizedResultsLeftRaw = _mm512_extracti32x8_epi32(sum, 0);
-    __m256i quantizedResultsRightRaw = _mm512_extracti32x8_epi32(sum, 1);
-    __m512 resultsLeft = dequantize(quantizedResultsLeftRaw, delta, minDistance);
-    __m512 resultsRight = dequantize(quantizedResultsRightRaw, delta, minDistance);
-
-    __m512 ones = _mm512_set1_ps(1.0);
-    resultsLeft = _mm512_add_ps(resultsLeft, ones);
-    resultsRight = _mm512_add_ps(resultsRight, ones);
-    resultsLeft = _mm512_rcp14_ps(resultsLeft);
-    resultsRight = _mm512_rcp14_ps(resultsRight);
-    _mm512_storeu_ps(results, resultsLeft);
-    _mm512_storeu_ps(results + 16, resultsRight);
-}
-
-void bulk_quantized_shuffle_dot_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartials, float delta, float best, float* results) {
-    __m512i sum = _mm512_setzero_epi32();
-
-    for (int i = 0; i < codebookCount; i++) {
-         __m256i smallShuffle = _mm256_loadu_epi8(shuffles + i * 32);
-         __m512i shuffle = _mm512_cvtepu8_epi16(smallShuffle);
-        __m512i partialsVec = lookup_partial_sums(shuffle, quantizedPartials, i);
-        sum = _mm512_adds_epu16(sum, partialsVec);
-    }
-
-    __m256i quantizedResultsLeftRaw = _mm512_extracti32x8_epi32(sum, 0);
-    __m256i quantizedResultsRightRaw = _mm512_extracti32x8_epi32(sum, 1);
-    __m512 resultsLeft = dequantize(quantizedResultsLeftRaw, delta, best);
-    __m512 resultsRight = dequantize(quantizedResultsRightRaw, delta, best);
-
-    __m512 ones = _mm512_set1_ps(1.0);
-    resultsLeft = _mm512_add_ps(resultsLeft, ones);
-    resultsRight = _mm512_add_ps(resultsRight, ones);
-    resultsLeft = _mm512_div_ps(resultsLeft, _mm512_set1_ps(2.0));
-    resultsRight = _mm512_div_ps(resultsRight, _mm512_set1_ps(2.0));
-    _mm512_storeu_ps(results, resultsLeft);
-    _mm512_storeu_ps(results + 16, resultsRight);
-}
-
-void bulk_quantized_shuffle_cosine_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartialSums, float sumDelta, float minDistance, const char* quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float* results) {
-    __m512i sum = _mm512_setzero_epi32();
-    __m512i magnitude = _mm512_setzero_epi32();
-
-    for (int i = 0; i < codebookCount; i++) {
-        __m256i smallShuffle = _mm256_loadu_epi8((shuffles + i * 32));
-        __m512i shuffle = _mm512_cvtepu8_epi16(smallShuffle);
-        __m512i partialSumsVec = lookup_partial_sums(shuffle, quantizedPartialSums, i);
-        sum = _mm512_adds_epu16(sum, partialSumsVec);
-
-        __m512i partialMagnitudesVec = lookup_partial_sums(shuffle, quantizedPartialMagnitudes, i);
-        magnitude = _mm512_adds_epu16(magnitude, partialMagnitudesVec);
-    }
-
-    __m256i quantizedSumsLeftRaw = _mm512_extracti32x8_epi32(sum, 0);
-    __m256i quantizedSumsRightRaw = _mm512_extracti32x8_epi32(sum, 1);
-    __m512 sumsLeft = dequantize(quantizedSumsLeftRaw, sumDelta, minDistance);
-    __m512 sumsRight = dequantize(quantizedSumsRightRaw, sumDelta, minDistance);
-
-    __m256i quantizedMagnitudesLeftRaw = _mm512_extracti32x8_epi32(magnitude, 0);
-    __m256i quantizedMagnitudesRightRaw = _mm512_extracti32x8_epi32(magnitude, 1);
-    __m512 magnitudesLeft = dequantize(quantizedMagnitudesLeftRaw, magnitudeDelta, minMagnitude);
-    __m512 magnitudesRight = dequantize(quantizedMagnitudesRightRaw, magnitudeDelta, minMagnitude);
-
-    __m512 queryMagnitudeSquaredVec = _mm512_set1_ps(queryMagnitudeSquared);
-    magnitudesLeft = _mm512_mul_ps(magnitudesLeft, queryMagnitudeSquaredVec);
-    magnitudesRight = _mm512_mul_ps(magnitudesRight, queryMagnitudeSquaredVec);
-    magnitudesLeft = _mm512_sqrt_ps(magnitudesLeft);
-    magnitudesRight = _mm512_sqrt_ps(magnitudesRight);
-    __m512 resultsLeft = _mm512_div_ps(sumsLeft, magnitudesLeft);
-    __m512 resultsRight = _mm512_div_ps(sumsRight, magnitudesRight);
-
-    __m512 ones = _mm512_set1_ps(1.0);
-    resultsLeft = _mm512_add_ps(resultsLeft, ones);
-    resultsRight = _mm512_add_ps(resultsRight, ones);
-    resultsLeft = _mm512_div_ps(resultsLeft, _mm512_set1_ps(2.0));
-    resultsRight = _mm512_div_ps(resultsRight, _mm512_set1_ps(2.0));
-    _mm512_storeu_ps(results, resultsLeft);
-    _mm512_storeu_ps(results + 16, resultsRight);
-}
-
-// Partial sum calculations that also record best distances, as this is necessary for Fused ADC quantization
-void calculate_partial_sums_best_dot_f32_512(const float* codebook, int codebookIndex, int size, int clusterCount, const float* query, int queryOffset, float* partialSums, float* partialBestDistances) {
-    float best = -INFINITY;
-    int codebookBase = codebookIndex * clusterCount;
-    for (int i = 0; i < clusterCount; i++) {
-      float val = dot_product_f32(512, codebook, i * size, query, queryOffset, size);
-      partialSums[codebookBase + i] = val;
-      if (val > best) {
-        best = val;
-      }
-    }
-    partialBestDistances[codebookIndex] = best;
-}
-
-void calculate_partial_sums_best_euclidean_f32_512(const float* codebook, int codebookIndex, int size, int clusterCount, const float* query, int queryOffset, float* partialSums, float* partialBestDistances) {
-    float best = INFINITY;
-    int codebookBase = codebookIndex * clusterCount;
-    for (int i = 0; i < clusterCount; i++) {
-      float val = euclidean_f32(512, codebook, i * size, query, queryOffset, size);
-      partialSums[codebookBase + i] = val;
-      if (val < best) {
-        best = val;
-      }
-    }
-    partialBestDistances[codebookIndex] = best;
-}
\ No newline at end of file
diff --git a/jvector-native/src/main/c/jvector_simd.cpp b/jvector-native/src/main/c/jvector_simd.cpp
new file mode 100644
index 000000000..69bc5977a
--- /dev/null
+++ b/jvector-native/src/main/c/jvector_simd.cpp
@@ -0,0 +1,361 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Runtime SIMD dispatch: selects AVX3 (AVX-512), AVX2, or SSE42 at startup.
+// SSE42 is the baseline and is assumed always available without a CPUID check.
+// AVX2 and AVX3 are probed via CPUID.  Function pointers are resolved once at
+// static-init time; each public call is a single indirect branch.
+#include "jvector_simd.h"
+#include "jvector_simd_kernels.h" // AVX3::, AVX2::, SSE42:: kernel declarations
+#include "jvector_cpuFeatures.h"  // populate_cpu_features(), CpuFeature enum
+
+#include <array>
+#include <cstdlib> // std::getenv
+#include <cstring> // std::strcmp
+
+// Everything in this anonymous namespace is translation-unit private; none of
+// these symbols are exported from the shared library.
+namespace {
+
+// Enumerates the ISA tiers in ascending capability order so that numeric
+// comparisons (e.g. max_isa != MaxIsa::AVX2) correctly gate higher tiers.
+// Unset (-1) means "no override; use best available CPU capability".
+enum class MaxIsa { Unset = -1, SSE42 = 0, AVX2 = 1, AVX3 = 2 };
+
+// Reads the JVECTOR_MAX_ISA environment variable and maps it to a MaxIsa
+// value.  This lets callers cap the ISA at runtime without recompiling —
+// useful for benchmarking or working around CPU errata.
+// Accepted values (case-sensitive): "avx3", "avx2", "sse42".
+static MaxIsa read_max_isa() noexcept
+{
+    const char *val = std::getenv("JVECTOR_MAX_ISA");
+    if (!val) return MaxIsa::Unset;
+    if (std::strcmp(val, "avx3")  == 0) return MaxIsa::AVX3;
+    if (std::strcmp(val, "avx2")  == 0) return MaxIsa::AVX2;
+    if (std::strcmp(val, "sse42") == 0) return MaxIsa::SSE42;
+    return MaxIsa::Unset; // unrecognised value: ignore and use CPU detection
+}
+
+// KernelVTable holds one function pointer per public kernel.  Storing them
+// together in a struct means a single pointer comparison during dispatch_kernels()
+// selects all kernels for the chosen ISA in one shot.
+struct KernelVTable {
+    /* Vector similarity */
+    float (*cosine_f32)(const float *, size_t, const float *, size_t, size_t);
+    float (*dot_product_f32)(const float *, size_t, const float *, size_t, size_t);
+    float (*euclidean_f32)(const float *, size_t, const float *, size_t, size_t);
+    /* Element-wise in-place arithmetic */
+    void  (*add_in_place_f32)(float *, const float *, size_t);
+    void  (*add_scalar_in_place_f32)(float *, float, size_t);
+    void  (*sub_in_place_f32)(float *, const float *, size_t);
+    void  (*sub_scalar_in_place_f32)(float *, float, size_t);
+    float (*max_f32)(const float *, size_t);
+    void  (*min_in_place_f32)(float *, const float *, size_t);
+    /* PQ kernels */
+    float (*assemble_and_sum_f32)(const float *, int,
+                                  const unsigned char *, int, size_t);
+    float (*assemble_and_sum_pq_f32)(const float *, size_t,
+                                     const unsigned char *, int,
+                                     const unsigned char *, int, int);
+    float (*pq_decoded_cosine_similarity_f32)(const unsigned char *, int,
+                                              size_t, int,
+                                              const float *, const float *,
+                                              float);
+    void  (*calculate_partial_sums_dot_f32)(const float *, int,
+                                            size_t, int,
+                                            const float *, int, float *);
+    void  (*calculate_partial_sums_euclidean_f32)(const float *, int,
+                                                  size_t, int,
+                                                  const float *, int, float *);
+    void  (*calculate_partial_sums_self_magnitude_f32)(const float *, int,
+                                                       size_t, int, float *);
+    /* NVQ kernels */
+    void    (*nvq_quantize_8bit)(const float *, size_t,
+                                 float, float, float, float,
+                                 unsigned char *);
+    float   (*nvq_loss)(const float *, size_t,
+                        float, float, float, float, int);
+    float   (*nvq_uniform_loss)(const float *, size_t, float, float, int);
+    float   (*nvq_square_l2_distance_8bit)(const float *,
+                                           const unsigned char *, size_t,
+                                           float, float, float, float);
+    float   (*nvq_dot_product_8bit)(const float *,
+                                    const unsigned char *, size_t,
+                                    float, float, float, float);
+    int64_t (*nvq_cosine_8bit_packed)(const float *,
+                                      const unsigned char *, size_t,
+                                      float, float, float, float,
+                                      const float *);
+    void    (*nvq_shuffle_query_in_place_8bit)(float *, size_t);
+};
+
+// One pre-filled vtable per ISA.  These are constant data; no heap allocation.
+#define DEFINE_ISA_VTABLE(ISA)                                          \
+    static const KernelVTable ISA##_vtable = {                          \
+        ISA::cosine_f32,                                                 \
+        ISA::dot_product_f32,                                            \
+        ISA::euclidean_f32,                                              \
+        ISA::add_in_place_f32,                                           \
+        ISA::add_scalar_in_place_f32,                                    \
+        ISA::sub_in_place_f32,                                           \
+        ISA::sub_scalar_in_place_f32,                                    \
+        ISA::max_f32,                                                    \
+        ISA::min_in_place_f32,                                           \
+        ISA::assemble_and_sum_f32,                                       \
+        ISA::assemble_and_sum_pq_f32,                                    \
+        ISA::pq_decoded_cosine_similarity_f32,                           \
+        ISA::calculate_partial_sums_dot_f32,                             \
+        ISA::calculate_partial_sums_euclidean_f32,                       \
+        ISA::calculate_partial_sums_self_magnitude_f32,                  \
+        ISA::nvq_quantize_8bit,                                          \
+        ISA::nvq_loss,                                                   \
+        ISA::nvq_uniform_loss,                                           \
+        ISA::nvq_square_l2_distance_8bit,                                \
+        ISA::nvq_dot_product_8bit,                                       \
+        ISA::nvq_cosine_8bit_packed,                                     \
+        ISA::nvq_shuffle_query_in_place_8bit,                            \
+    }
+
+DEFINE_ISA_VTABLE(AVX3);
+DEFINE_ISA_VTABLE(AVX2);
+DEFINE_ISA_VTABLE(SSE42);
+
+// Selects and returns the best vtable for the current CPU and environment.
+// Called exactly once during static initialisation (before main()).
+static KernelVTable dispatch_kernels() noexcept
+{
+    // Check whether the caller has capped the ISA via the environment variable.
+    const MaxIsa max_isa = read_max_isa();
+
+    // Populate a boolean feature array by issuing CPUID and reading XCR0.
+    std::array<bool, static_cast<uint32_t>(CpuFeature::COUNT)> features;
+    populate_cpu_features(features);
+
+    auto has = [&](CpuFeature f) noexcept {
+        return features[static_cast<uint32_t>(f)];
+    };
+
+    // AVX3 tier requires the full Skylake-AVX512 (SKX) baseline:
+    // AVX512F (foundation) + BW (byte/word) + CD (conflict detect)
+    // + DQ (dword/qword) + VL (vector length extensions).
+    if (max_isa != MaxIsa::SSE42 && max_isa != MaxIsa::AVX2
+        && has(CpuFeature::AVX512F) && has(CpuFeature::AVX512BW)
+        && has(CpuFeature::AVX512CD) && has(CpuFeature::AVX512DQ)
+        && has(CpuFeature::AVX512VL)) {
+        return AVX3_vtable;
+    }
+    // AVX2 tier: 256-bit integer/FP SIMD, available on Haswell and later.
+    if (max_isa != MaxIsa::SSE42 && has(CpuFeature::AVX2)) {
+        return AVX2_vtable;
+    }
+    // SSE42 is the baseline — assumed always present, no CPUID check needed.
+    return SSE42_vtable;
+}
+
+// 'kernels' is initialised once at static-init time to the vtable chosen by
+// dispatch_kernels().  After that every public API call goes through one
+// indirect branch to the right ISA implementation — no runtime comparisons.
+static const KernelVTable kernels = dispatch_kernels();
+
+} // namespace
+
+// ---- Public API ------------------------------------------------------------
+// Each function is a thin wrapper that forwards to the pre-resolved function
+// pointer in `kernels`.
+
+/* Vector similarity */
+
+float cosine_f32(const float *a, size_t aoffset,
+                 const float *b, size_t boffset, size_t length)
+{
+    return kernels.cosine_f32(a, aoffset, b, boffset, length);
+}
+
+float dot_product_f32(const float *a, size_t aoffset,
+                      const float *b, size_t boffset, size_t length)
+{
+    return kernels.dot_product_f32(a, aoffset, b, boffset, length);
+}
+
+float euclidean_f32(const float *a, size_t aoffset,
+                    const float *b, size_t boffset, size_t length)
+{
+    return kernels.euclidean_f32(a, aoffset, b, boffset, length);
+}
+
+/* Element-wise in-place arithmetic */
+
+void add_in_place_f32(float *v1, const float *v2, size_t length)
+{
+    kernels.add_in_place_f32(v1, v2, length);
+}
+
+void add_scalar_in_place_f32(float *v1, float value, size_t length)
+{
+    kernels.add_scalar_in_place_f32(v1, value, length);
+}
+
+void sub_in_place_f32(float *v1, const float *v2, size_t length)
+{
+    kernels.sub_in_place_f32(v1, v2, length);
+}
+
+void sub_scalar_in_place_f32(float *v1, float value, size_t length)
+{
+    kernels.sub_scalar_in_place_f32(v1, value, length);
+}
+
+float max_f32(const float *v, size_t length)
+{
+    return kernels.max_f32(v, length);
+}
+
+void min_in_place_f32(float *v1, const float *v2, size_t length)
+{
+    kernels.min_in_place_f32(v1, v2, length);
+}
+
+/* PQ kernels */
+
+float assemble_and_sum_f32(const float *data, int dataBase,
+                            const unsigned char *baseOffsets,
+                            int baseOffsetsOffset, size_t baseOffsetsLength)
+{
+    return kernels.assemble_and_sum_f32(
+            data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength);
+}
+
+float assemble_and_sum_pq_f32(const float *data, size_t subspaceCount,
+                               const unsigned char *baseOffsets1,
+                               int baseOffsetsOffset1,
+                               const unsigned char *baseOffsets2,
+                               int baseOffsetsOffset2, int clusterCount)
+{
+    return kernels.assemble_and_sum_pq_f32(data, subspaceCount,
+                                           baseOffsets1, baseOffsetsOffset1,
+                                           baseOffsets2, baseOffsetsOffset2,
+                                           clusterCount);
+}
+
+float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets,
+                                        int baseOffsetsOffset,
+                                        size_t baseOffsetsLength,
+                                        int clusterCount,
+                                        const float *partialSums,
+                                        const float *aMagnitude,
+                                        float bMagnitude)
+{
+    return kernels.pq_decoded_cosine_similarity_f32(baseOffsets,
+                                                    baseOffsetsOffset,
+                                                    baseOffsetsLength,
+                                                    clusterCount,
+                                                    partialSums,
+                                                    aMagnitude,
+                                                    bMagnitude);
+}
+
+void calculate_partial_sums_dot_f32(const float *codebook, int codebookIndex,
+                                     size_t size, int clusterCount,
+                                     const float *query, int queryOffset,
+                                     float *partialSums)
+{
+    kernels.calculate_partial_sums_dot_f32(codebook, codebookIndex,
+                                           size, clusterCount,
+                                           query, queryOffset, partialSums);
+}
+
+void calculate_partial_sums_euclidean_f32(const float *codebook,
+                                           int codebookIndex,
+                                           size_t size, int clusterCount,
+                                           const float *query, int queryOffset,
+                                           float *partialSums)
+{
+    kernels.calculate_partial_sums_euclidean_f32(codebook, codebookIndex,
+                                                 size, clusterCount,
+                                                 query, queryOffset,
+                                                 partialSums);
+}
+
+void calculate_partial_sums_self_magnitude_f32(const float *codebook,
+                                               int codebookIndex,
+                                               size_t size, int clusterCount,
+                                               float *partialSums)
+{
+    kernels.calculate_partial_sums_self_magnitude_f32(codebook, codebookIndex,
+                                                      size, clusterCount,
+                                                      partialSums);
+}
+
+/* NVQ kernels */
+
+void nvq_quantize_8bit(const float *vector, size_t length,
+                        float alpha, float x0,
+                        float minValue, float maxValue,
+                        unsigned char *destination)
+{
+    kernels.nvq_quantize_8bit(vector, length, alpha, x0,
+                              minValue, maxValue, destination);
+}
+
+float nvq_loss(const float *vector, size_t length,
+               float alpha, float x0,
+               float minValue, float maxValue, int nBits)
+{
+    return kernels.nvq_loss(vector, length, alpha, x0, minValue, maxValue, nBits);
+}
+
+float nvq_uniform_loss(const float *vector, size_t length,
+                        float minValue, float maxValue, int nBits)
+{
+    return kernels.nvq_uniform_loss(vector, length, minValue, maxValue, nBits);
+}
+
+float nvq_square_l2_distance_8bit(const float *vector,
+                                   const unsigned char *quantized,
+                                   size_t length,
+                                   float alpha, float x0,
+                                   float minValue, float maxValue)
+{
+    return kernels.nvq_square_l2_distance_8bit(vector, quantized, length,
+                                               alpha, x0, minValue, maxValue);
+}
+
+float nvq_dot_product_8bit(const float *vector,
+                            const unsigned char *quantized,
+                            size_t length,
+                            float alpha, float x0,
+                            float minValue, float maxValue)
+{
+    return kernels.nvq_dot_product_8bit(vector, quantized, length,
+                                        alpha, x0, minValue, maxValue);
+}
+
+int64_t nvq_cosine_8bit_packed(const float *vector,
+                                const unsigned char *quantized,
+                                size_t length,
+                                float alpha, float x0,
+                                float minValue, float maxValue,
+                                const float *centroid)
+{
+    return kernels.nvq_cosine_8bit_packed(vector, quantized, length,
+                                          alpha, x0, minValue, maxValue,
+                                          centroid);
+}
+
+void nvq_shuffle_query_in_place_8bit(float *vector, size_t length)
+{
+    kernels.nvq_shuffle_query_in_place_8bit(vector, length);
+}
diff --git a/jvector-native/src/main/c/jvector_simd.h b/jvector-native/src/main/c/jvector_simd.h
index 39da01316..5000059f7 100644
--- a/jvector-native/src/main/c/jvector_simd.h
+++ b/jvector-native/src/main/c/jvector_simd.h
@@ -15,23 +15,50 @@
  */
 
 #include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
 
 #ifndef VECTOR_SIMD_DOT_H
 #define VECTOR_SIMD_DOT_H
 
-// check CPU support
-bool check_avx512_compatibility(void);
+// Mark a symbol as part of the public ABI even when the library is built
+// with -fvisibility=hidden.
+#define JVECTOR_SIMD_API __attribute__((visibility("default")))
 
-//F32
-float dot_product_f32(int preferred_size, const float* a, int aoffset, const float* b, int boffset, int length);
-float euclidean_f32(int preferred_size, const float* a, int aoffset, const float* b, int boffset, int length);
-void bulk_quantized_shuffle_dot_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartials, float delta, float minDistance, float* results);
-void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartials, float delta, float minDistance, float* results);
-void bulk_quantized_shuffle_cosine_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartialSums, float sumDelta, float minDistance, const char* quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float* results);
-float assemble_and_sum_f32_512(const float* data, int dataBase, const unsigned char* baseOffsets, int baseOffsetsOffset, int baseOffsetsLength);
-float pq_decoded_cosine_similarity_f32_512(const unsigned char* baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float* partialSums, const float* aMagnitude, float bMagnitude);
-void calculate_partial_sums_dot_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums);
-void calculate_partial_sums_euclidean_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums);
-void calculate_partial_sums_best_dot_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums, float* partialBestDistances);
-void calculate_partial_sums_best_euclidean_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums, float* partialBestDistances);
-#endif
\ No newline at end of file
+// APIs exposed to Java via FFI
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* PQ kernels */
+JVECTOR_SIMD_API float assemble_and_sum_f32(const float* data, int dataBase, const unsigned char* baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength);
+JVECTOR_SIMD_API float assemble_and_sum_pq_f32(const float* data, size_t subspaceCount, const unsigned char* baseOffsets1, int baseOffsetsOffset1, const unsigned char* baseOffsets2, int baseOffsetsOffset2, int clusterCount);
+JVECTOR_SIMD_API float pq_decoded_cosine_similarity_f32(const unsigned char* baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength, int clusterCount, const float* partialSums, const float* aMagnitude, float bMagnitude);
+JVECTOR_SIMD_API void calculate_partial_sums_euclidean_f32(const float* codebook, int codebookBase, size_t size, int clusterCount, const float* query, int queryOffset, float* partialSums);
+JVECTOR_SIMD_API void calculate_partial_sums_dot_f32(const float* codebook, int codebookBase, size_t size, int clusterCount, const float* query, int queryOffset, float* partialSums);
+JVECTOR_SIMD_API void calculate_partial_sums_self_magnitude_f32(const float* codebook, int codebookBase, size_t size, int clusterCount, float* partialSums);
+
+/* Vector similarity kernels */
+JVECTOR_SIMD_API float dot_product_f32(const float* a, size_t aoffset, const float* b, size_t boffset, size_t length);
+JVECTOR_SIMD_API float cosine_f32(const float* a, size_t aoffset, const float* b, size_t boffset, size_t length);
+JVECTOR_SIMD_API float euclidean_f32(const float* a, size_t aoffset, const float* b, size_t boffset, size_t length);
+
+/* NVQ kernels */
+JVECTOR_SIMD_API void    nvq_quantize_8bit(const float* vector, size_t length, float alpha, float x0, float minValue, float maxValue, unsigned char* destination);
+JVECTOR_SIMD_API float   nvq_loss(const float* vector, size_t length, float alpha, float x0, float minValue, float maxValue, int nBits);
+JVECTOR_SIMD_API float   nvq_uniform_loss(const float* vector, size_t length, float minValue, float maxValue, int nBits);
+JVECTOR_SIMD_API float   nvq_square_l2_distance_8bit(const float* vector, const unsigned char* quantized, size_t length, float alpha, float x0, float minValue, float maxValue);
+JVECTOR_SIMD_API float   nvq_dot_product_8bit(const float* vector, const unsigned char* quantized, size_t length, float alpha, float x0, float minValue, float maxValue);
+JVECTOR_SIMD_API int64_t nvq_cosine_8bit_packed(const float* vector, const unsigned char* quantized, size_t length, float alpha, float x0, float minValue, float maxValue, const float* centroid);
+JVECTOR_SIMD_API void    nvq_shuffle_query_in_place_8bit(float* vector, size_t length);
+
+/* Element-wise in-place vector arithmetic */
+JVECTOR_SIMD_API void    add_in_place_f32(float* v1, const float* v2, size_t length);
+JVECTOR_SIMD_API void    add_scalar_in_place_f32(float* v1, float value, size_t length);
+JVECTOR_SIMD_API void    sub_in_place_f32(float* v1, const float* v2, size_t length);
+JVECTOR_SIMD_API void    sub_scalar_in_place_f32(float* v1, float value, size_t length);
+JVECTOR_SIMD_API float   max_f32(const float* v, size_t length);
+JVECTOR_SIMD_API void    min_in_place_f32(float* v1, const float* v2, size_t length);
+#ifdef __cplusplus
+}
+#endif // extern "C"
+#endif // VECTOR_SIMD_DOT_H
diff --git a/jvector-native/src/main/c/jvector_simd_kernels.cpp b/jvector-native/src/main/c/jvector_simd_kernels.cpp
new file mode 100644
index 000000000..20e8930ca
--- /dev/null
+++ b/jvector-native/src/main/c/jvector_simd_kernels.cpp
@@ -0,0 +1,1643 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <inttypes.h>
+#include <float.h>
+#include <math.h>
+#include <cstring>
+#include "jvector_simd.h"
+#include "hwy/highway.h"
+#include "assertHwyTargets.h"
+
+// =============================================================================
+// Highway macro usage in this file
+// =============================================================================
+//
+// HWY_INLINE
+//   Expands to `inline __attribute__((always_inline))` on GCC/Clang.
+//   Used on every helper that participates in a hot SIMD loop to prevent the
+//   compiler from ever emitting a real call and to keep register pressure
+//   visible across the inlined body.  Prefer this over plain `inline` for any
+//   function that contains SIMD intrinsics.
+//
+// HWY_FLATTEN
+//   Expands to `__attribute__((flatten))`, which asks the compiler to inline
+//   *all* callees into the annotated function.  Used on the public entry-points
+//   (assemble_and_sum_f32, pq_decoded_cosine_similarity_f32, the three
+//   distance wrappers) so that the multi-target Highway dispatch stub sees a
+//   single monolithic body with no residual call overhead.
+//
+// HWY_RESTRICT
+//   Portable spelling of `__restrict__` / `restrict`.  Tells the compiler that
+//   a pointer does not alias any other pointer in scope, so loads through it
+//   remain valid across stores made through a different pointer.  Applied to:
+//     - Load helper parameters (e.g. LoadDup256) — ensures the loaded value is
+//       treated as loop-invariant even when the caller stores to an accumulator.
+//     - calculate_partial_sums_f32 inputs (codebook, query) and output
+//       (partialSums) — prevents the compiler from reloading read-only inputs
+//       after each write to partialSums.
+//   Not needed when inputs and output already have different types (e.g.
+//   float* vs unsigned char*), because C++ strict-aliasing rules already
+//   guarantee they cannot alias.
+//
+// =============================================================================
+// Highway API tutorial — intrinsics used in this file
+// =============================================================================
+//
+// --- Tags (describe vector type and width) ---
+//
+// ScalableTag<T>
+//   Represents the full native SIMD width for type T.
+//   e.g. ScalableTag<float> is 8 lanes on AVX2, 16 lanes on AVX-512.
+//   Used in the main loop bodies where we want the widest available vector.
+//
+// CappedTag<T, N>  /  HWY_CAPPED(T, N)
+//   A tag capped to at most N lanes, even on wider ISAs.
+//   Used in the small-vector fast paths (e.g. size==4, size==8) so that we
+//   avoid wasting the extra lanes of a wide register on tiny inputs.
+//
+// Half<D>
+//   Produces a tag whose lane count is half that of D.
+//   Used in LoadDup256 to load 8 floats into the lower half of a 512-bit
+//   register before Combine duplicates them into the upper half.
+//
+// Rebind<NewT, Tag>  /  RebindToSigned<Tag>
+//   Produce a new tag of the same width but a different element type.
+//   Used in assemble_and_sum_f32 and pq_decoded_cosine_similarity to
+//   reinterpret the float-width register as uint8/uint16/int32 during the
+//   index promotion pipeline.
+//
+// Lanes(tag)   — runtime lane count for the given tag.
+// MaxLanes(tag) — compile-time upper bound on lane count (used in static_assert
+//                 and constexpr branches).
+//
+// --- Vector type ---
+//
+// Vec<Tag>
+//   The SIMD vector type corresponding to a tag.
+//   All arithmetic and load/store operations return or accept Vec<Tag>.
+//
+// --- Initialisation ---
+//
+// Zero(tag)         — vector of all zeros; used to initialise accumulators.
+// Set(tag, scalar)  — broadcast a scalar to every lane.
+// Iota(tag, start)  — fill lanes with start, start+1, start+2, …
+//                     Used to build the running index vector for GatherIndex.
+//
+// --- Loads ---
+//
+// LoadU(tag, ptr)        — unaligned load of Lanes(tag) elements from ptr.
+// LoadN(tag, ptr, n)     — load n elements; remaining lanes are zero-padded.
+//                          Used for loop tails without a branch per element.
+// LoadDup128(tag, ptr)   — load 128 bits and broadcast across the full vector.
+//                          Used for size==2 and size==4 query vectors so the
+//                          same query chunk lines up with every centroid chunk.
+// MaskedLoad(mask, tag, ptr)
+//                        — load only the lanes where mask is set; others zero.
+//                          Used in CosineDistance tail handling.
+//
+// --- Store ---
+//
+// StoreU(vec, tag, ptr)  — unaligned store of Lanes(tag) elements to ptr.
+//
+// --- Arithmetic ---
+//
+// Add(a, b)              — lane-wise addition.
+// Sub(a, b)              — lane-wise subtraction.
+// Mul(a, b)              — lane-wise multiplication.
+// MulAdd(a, b, c)        — fused multiply-add: (a * b) + c.
+//                          Preferred over separate Mul+Add for FMA throughput.
+//
+// --- Type promotion ---
+//
+// PromoteTo(narrower_tag, vec)
+//   Zero-extends each element to the wider type.
+//   Used twice in the gather pipeline: u8 → u16 → i32, so that byte offsets
+//   become 32-bit gather indices without sign-extension artefacts.
+//
+// --- Gather ---
+//
+// GatherIndex(tag, base_ptr, index_vec)
+//   Loads one element per lane using per-lane 32-bit indices (in elements,
+//   not bytes).  Used to collect PQ lookup-table entries and codebook floats
+//   whose positions are determined at runtime by the encoded offsets.
+//
+// --- Reductions ---
+//
+// ReduceSum(tag, vec)    — horizontal sum of all lanes; returns a scalar.
+//
+// --- Horizontal-reduction shuffles (used in calculate_partial_sums_f32) ---
+//
+// Shuffle2301(vec)
+//   Swaps adjacent 32-bit lane pairs within each 64-bit group: [0,1,2,3] → [1,0,3,2].
+//   (Name "2301" encodes source indices from lane 3 down to lane 0 per _MM_SHUFFLE
+//   convention: lane3←2, lane2←3, lane1←0, lane0←1.)
+//   Adding a vector with its Shuffle2301 partner sums adjacent lane pairs:
+//   result[0]=result[1]=(0+1) and result[2]=result[3]=(2+3).
+//
+// Shuffle1032(vec)
+//   Swaps the two 64-bit halves within each 128-bit block: [0,1,2,3] → [2,3,0,1].
+//   Used as the second reduction step: adding those pairwise sums together
+//   yields the full 4-lane horizontal sum in every lane.
+//
+// SwapAdjacentBlocks(vec)
+//   Swaps the two 128-bit halves of each 256-bit block.
+//   On AVX-512 (512-bit vector = four 128-bit blocks) this is used as the
+//   first step of the size==8 horizontal reduction before Shuffle1032/2301.
+//
+// --- Masks ---
+//
+// FirstN(tag, n)
+//   Returns a mask with the first n lanes set and the rest clear.
+//   Used with MaskedLoad to handle the tail of a vector that doesn't fill
+//   a full register.
+//
+// --- Combine ---
+//
+// Combine(d, hi, lo)
+//   Concatenates two half-width vectors into one full-width vector.
+//   Used in LoadDup256 to duplicate 8 floats across both halves of a
+//   512-bit register (lo = hi = the same 256-bit load).
+//
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Loads 8 floats from ptr and broadcasts them to fill the full vector D.
+// On ISAs where D is exactly 8 lanes (e.g. AVX2) this is a plain LoadU.
+// On wider ISAs (e.g. AVX-512, 16 lanes) the 8 floats are loaded into the
+// 256-bit half-tag and then Combine'd to duplicate them into both halves.
+// NOTE: not designed for ISAs wider than 512-bit (would need additional Combine levels).
+// HWY_RESTRICT tells the compiler that ptr does not alias any accumulator or
+// other pointer visible at the call site, allowing it to treat all loads from
+// ptr as invariant across iterations and hoist them freely.
+template <class D>
+HWY_INLINE hn::Vec<D> LoadDup256(D d, const float *HWY_RESTRICT ptr)
+{
+    static_assert(hn::MaxLanes(d) <= 16,
+                  "LoadDup256 is not implemented for ISAs wider than 512-bit");
+    if constexpr (hn::MaxLanes(d) > 8) {
+        const hn::Half<D> dh;
+        const auto half = hn::LoadU(dh, ptr);
+        return hn::Combine(d, half, half);
+    }
+    else {
+        return hn::LoadU(d, ptr);
+    }
+}
+// =============================================================================
+// Base Fp32 kernels
+// =============================================================================
+
+// Dot product kernel templated on the Highway tag type, shared by the
+// full-width ScalableTag path and the HWY_CAPPED fast paths.
+// The 4x unrolled loop hides FMA latency; for capped tags the loop body
+// is never entered for small sizes and the single-vector path handles them.
+template <class Tag>
+HWY_INLINE float DotProductImpl(Tag tag, const float *a, const float *b, size_t size)
+{
+    const size_t lanes = hn::Lanes(tag);
+    auto acc0 = hn::Zero(tag), acc1 = hn::Zero(tag);
+    auto acc2 = hn::Zero(tag), acc3 = hn::Zero(tag);
+    size_t ii = 0;
+    for (; ii + 4 * lanes <= size; ii += 4 * lanes) {
+        acc0 = hn::MulAdd(hn::LoadU(tag, a + ii + 0*lanes), hn::LoadU(tag, b + ii + 0*lanes), acc0);
+        acc1 = hn::MulAdd(hn::LoadU(tag, a + ii + 1*lanes), hn::LoadU(tag, b + ii + 1*lanes), acc1);
+        acc2 = hn::MulAdd(hn::LoadU(tag, a + ii + 2*lanes), hn::LoadU(tag, b + ii + 2*lanes), acc2);
+        acc3 = hn::MulAdd(hn::LoadU(tag, a + ii + 3*lanes), hn::LoadU(tag, b + ii + 3*lanes), acc3);
+    }
+    auto acc = hn::Add(hn::Add(acc0, acc1), hn::Add(acc2, acc3));
+    for (; ii + lanes <= size; ii += lanes) {
+        acc = hn::MulAdd(hn::LoadU(tag, a + ii), hn::LoadU(tag, b + ii), acc);
+    }
+    if (ii < size) {
+        acc = hn::MulAdd(hn::LoadN(tag, a + ii, size - ii),
+                         hn::LoadN(tag, b + ii, size - ii), acc);
+    }
+    return hn::ReduceSum(tag, acc);
+}
+
+// L2 square distance kernel templated on the Highway tag type, shared by
+// the full-width ScalableTag path and the HWY_CAPPED fast paths.
+template <class Tag>
+HWY_INLINE float L2SquareDistanceImpl(Tag tag, const float *a, const float *b, size_t size)
+{
+    const size_t lanes = hn::Lanes(tag);
+    auto acc0 = hn::Zero(tag), acc1 = hn::Zero(tag);
+    auto acc2 = hn::Zero(tag), acc3 = hn::Zero(tag);
+    size_t ii = 0;
+    for (; ii + 4 * lanes <= size; ii += 4 * lanes) {
+        auto d0 = hn::LoadU(tag, a + ii + 0*lanes) - hn::LoadU(tag, b + ii + 0*lanes);
+        auto d1 = hn::LoadU(tag, a + ii + 1*lanes) - hn::LoadU(tag, b + ii + 1*lanes);
+        auto d2 = hn::LoadU(tag, a + ii + 2*lanes) - hn::LoadU(tag, b + ii + 2*lanes);
+        auto d3 = hn::LoadU(tag, a + ii + 3*lanes) - hn::LoadU(tag, b + ii + 3*lanes);
+        acc0 = hn::MulAdd(d0, d0, acc0);
+        acc1 = hn::MulAdd(d1, d1, acc1);
+        acc2 = hn::MulAdd(d2, d2, acc2);
+        acc3 = hn::MulAdd(d3, d3, acc3);
+    }
+    auto acc = hn::Add(hn::Add(acc0, acc1), hn::Add(acc2, acc3));
+    for (; ii + lanes <= size; ii += lanes) {
+        auto d = hn::LoadU(tag, a + ii) - hn::LoadU(tag, b + ii);
+        acc = hn::MulAdd(d, d, acc);
+    }
+    if (ii < size) {
+        auto d = hn::LoadN(tag, a + ii, size - ii) - hn::LoadN(tag, b + ii, size - ii);
+        acc = hn::MulAdd(d, d, acc);
+    }
+    return hn::ReduceSum(tag, acc);
+}
+
+// Cosine distance kernel templated on the Highway tag type, shared by
+// the full-width ScalableTag path and the HWY_CAPPED fast paths.
+template <class Tag>
+HWY_INLINE float CosineDistanceImpl(Tag tag, const float *a, const float *b, size_t size)
+{
+    const size_t lanes = hn::Lanes(tag);
+    auto sum_ab = hn::Zero(tag), sum_aa = hn::Zero(tag), sum_bb = hn::Zero(tag);
+    size_t ii = 0;
+    for (; ii + lanes <= size; ii += lanes) {
+        auto va = hn::LoadU(tag, a + ii);
+        auto vb = hn::LoadU(tag, b + ii);
+        sum_ab = hn::MulAdd(va, vb, sum_ab);
+        sum_aa = hn::MulAdd(va, va, sum_aa);
+        sum_bb = hn::MulAdd(vb, vb, sum_bb);
+    }
+    if (ii < size) {
+        auto va = hn::LoadN(tag, a + ii, size - ii);
+        auto vb = hn::LoadN(tag, b + ii, size - ii);
+        sum_ab = hn::MulAdd(va, vb, sum_ab);
+        sum_aa = hn::MulAdd(va, va, sum_aa);
+        sum_bb = hn::MulAdd(vb, vb, sum_bb);
+    }
+    return hn::ReduceSum(tag, sum_ab)
+           / sqrtf(hn::ReduceSum(tag, sum_aa) * hn::ReduceSum(tag, sum_bb));
+}
+
+// Returns the dot product sum(a[ii] * b[ii]).
+//
+// Short-vector fast paths: when the register width is wider than the vector
+// (e.g. a 4-element input on AVX-512), using the full register wastes lanes
+// and can hurt latency.  Capped tags keep execution in narrow registers.
+HWY_INLINE float DotProduct(const float *a,
+                            size_t aoffset,
+                            const float *b,
+                            size_t boffset,
+                            size_t length)
+{
+    a += aoffset;
+    b += boffset;
+#if HWY_MAX_BYTES > 16
+    if (length <= 4) { return DotProductImpl(HWY_CAPPED(float, 4){}, a, b, length); }
+#if HWY_MAX_BYTES > 32
+    if (length <= 8) { return DotProductImpl(HWY_CAPPED(float, 8){}, a, b, length); }
+#endif
+#endif
+    return DotProductImpl(hn::ScalableTag<float>{}, a, b, length);
+}
+
+HWY_INLINE float CosineDistance(
+        const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+{
+    const float *ap = a + aoffset;
+    const float *bp = b + boffset;
+#if HWY_MAX_BYTES > 16
+    if (length <= 4) { return CosineDistanceImpl(HWY_CAPPED(float, 4){}, ap, bp, length); }
+#if HWY_MAX_BYTES > 32
+    if (length <= 8) { return CosineDistanceImpl(HWY_CAPPED(float, 8){}, ap, bp, length); }
+#endif
+#endif
+    return CosineDistanceImpl(hn::ScalableTag<float>{}, ap, bp, length);
+}
+
+HWY_INLINE float L2SquareDistance(const float *a,
+                                  size_t aoffset,
+                                  const float *b,
+                                  size_t boffset,
+                                  size_t length)
+{
+    a += aoffset;
+    b += boffset;
+#if HWY_MAX_BYTES > 16
+    if (length <= 4) { return L2SquareDistanceImpl(HWY_CAPPED(float, 4){}, a, b, length); }
+#if HWY_MAX_BYTES > 32
+    if (length <= 8) { return L2SquareDistanceImpl(HWY_CAPPED(float, 8){}, a, b, length); }
+#endif
+#endif
+    return L2SquareDistanceImpl(hn::ScalableTag<float>{}, a, b, length);
+}
+
+namespace JV_ISA {
+
+HWY_FLATTEN float cosine_f32(
+        const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+{
+    return CosineDistance(a, aoffset, b, boffset, length);
+}
+
+HWY_FLATTEN float dot_product_f32(
+        const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+{
+    return DotProduct(a, aoffset, b, boffset, length);
+}
+
+HWY_FLATTEN float euclidean_f32(
+        const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+{
+    return L2SquareDistance(a, aoffset, b, boffset, length);
+}
+
+// =============================================================================
+// Element-wise in-place arithmetic and reduction kernels
+// =============================================================================
+//
+// rename-registers: extra GCC register-renaming pass that breaks false WAR/WAW
+//   hazards between short-lived zmm values, enabling more ILP in the SIMD loops.
+// #pragma GCC unroll 4: unroll by 4 to hide the 4-cycle FMA latency and keep
+//   both AVX-512 FMA ports saturated across independent load–op–store chains.
+//
+__attribute__((optimize("rename-registers")))
+HWY_FLATTEN void add_in_place_f32(float *HWY_RESTRICT v1,
+                                   const float *HWY_RESTRICT v2,
+                                   size_t length)
+{
+    hn::ScalableTag<float> d;
+    const size_t lanes = hn::Lanes(d);
+    size_t i = 0;
+#pragma GCC unroll 4
+    for (; i + lanes <= length; i += lanes) {
+        auto a = hn::LoadU(d, v1 + i);
+        auto b = hn::LoadU(d, v2 + i);
+        hn::StoreU(hn::Add(a, b), d, v1 + i);
+    }
+    if (i < length) {
+        const size_t rem = length - i;
+        auto a = hn::LoadN(d, v1 + i, rem);
+        auto b = hn::LoadN(d, v2 + i, rem);
+        hn::StoreN(hn::Add(a, b), d, v1 + i, rem);
+    }
+}
+
+__attribute__((optimize("rename-registers")))
+HWY_FLATTEN void add_scalar_in_place_f32(float *HWY_RESTRICT v1,
+                                          float value,
+                                          size_t length)
+{
+    hn::ScalableTag<float> d;
+    const size_t lanes = hn::Lanes(d);
+    const auto vval = hn::Set(d, value);
+    size_t i = 0;
+#pragma GCC unroll 4
+    for (; i + lanes <= length; i += lanes) {
+        auto a = hn::LoadU(d, v1 + i);
+        hn::StoreU(hn::Add(a, vval), d, v1 + i);
+    }
+    if (i < length) {
+        const size_t rem = length - i;
+        auto a = hn::LoadN(d, v1 + i, rem);
+        hn::StoreN(hn::Add(a, vval), d, v1 + i, rem);
+    }
+}
+
+__attribute__((optimize("rename-registers")))
+HWY_FLATTEN void sub_in_place_f32(float *HWY_RESTRICT v1,
+                                   const float *HWY_RESTRICT v2,
+                                   size_t length)
+{
+    hn::ScalableTag<float> d;
+    const size_t lanes = hn::Lanes(d);
+    size_t i = 0;
+#pragma GCC unroll 4
+    for (; i + lanes <= length; i += lanes) {
+        auto a = hn::LoadU(d, v1 + i);
+        auto b = hn::LoadU(d, v2 + i);
+        hn::StoreU(hn::Sub(a, b), d, v1 + i);
+    }
+    if (i < length) {
+        const size_t rem = length - i;
+        auto a = hn::LoadN(d, v1 + i, rem);
+        auto b = hn::LoadN(d, v2 + i, rem);
+        hn::StoreN(hn::Sub(a, b), d, v1 + i, rem);
+    }
+}
+
+__attribute__((optimize("rename-registers")))
+HWY_FLATTEN void sub_scalar_in_place_f32(float *HWY_RESTRICT v1,
+                                          float value,
+                                          size_t length)
+{
+    hn::ScalableTag<float> d;
+    const size_t lanes = hn::Lanes(d);
+    const auto vval = hn::Set(d, value);
+    size_t i = 0;
+#pragma GCC unroll 4
+    for (; i + lanes <= length; i += lanes) {
+        auto a = hn::LoadU(d, v1 + i);
+        hn::StoreU(hn::Sub(a, vval), d, v1 + i);
+    }
+    if (i < length) {
+        const size_t rem = length - i;
+        auto a = hn::LoadN(d, v1 + i, rem);
+        hn::StoreN(hn::Sub(a, vval), d, v1 + i, rem);
+    }
+}
+
+__attribute__((optimize("rename-registers")))
+HWY_FLATTEN float max_f32(const float *HWY_RESTRICT v, size_t length)
+{
+    hn::ScalableTag<float> d;
+    const size_t lanes = hn::Lanes(d);
+    auto accum = hn::Set(d, -FLT_MAX);
+    size_t i = 0;
+#pragma GCC unroll 4
+    for (; i + lanes <= length; i += lanes) {
+        accum = hn::Max(accum, hn::LoadU(d, v + i));
+    }
+    float result = hn::ReduceMax(d, accum);
+    for (; i < length; i++) {
+        if (v[i] > result) result = v[i];
+    }
+    return result;
+}
+
+__attribute__((optimize("rename-registers")))
+HWY_FLATTEN void min_in_place_f32(float *HWY_RESTRICT v1,
+                                   const float *HWY_RESTRICT v2,
+                                   size_t length)
+{
+    hn::ScalableTag<float> d;
+    const size_t lanes = hn::Lanes(d);
+    size_t i = 0;
+#pragma GCC unroll 4
+    for (; i + lanes <= length; i += lanes) {
+        auto a = hn::LoadU(d, v1 + i);
+        auto b = hn::LoadU(d, v2 + i);
+        hn::StoreU(hn::Min(a, b), d, v1 + i);
+    }
+    if (i < length) {
+        const size_t rem = length - i;
+        auto a = hn::LoadN(d, v1 + i, rem);
+        auto b = hn::LoadN(d, v2 + i, rem);
+        hn::StoreN(hn::Min(a, b), d, v1 + i, rem);
+    }
+}
+
+// =============================================================================
+// PQ kernels
+// =============================================================================
+
+enum class DistanceType { DotProduct, Euclidean };
+
+// Computes the per-element score vector for a single SIMD register pair (c, qq).
+// For DotProduct: score[ii] = c[ii] * qq[ii]
+// For Euclidean:  score[ii] = (c[ii] - qq[ii])^2
+template <DistanceType DT, class D>
+HWY_INLINE hn::Vec<D> partial_sum_score(const hn::Vec<D> &c,
+                                        const hn::Vec<D> &qq)
+{
+    if constexpr (DT == DistanceType::DotProduct) { return hn::Mul(c, qq); }
+    else if constexpr (DT == DistanceType::Euclidean) {
+        const hn::Vec<D> diff = hn::Sub(c, qq);
+        return hn::Mul(diff, diff);
+    }
+    else {
+        static_assert(DT == DistanceType::DotProduct
+                              || DT == DistanceType::Euclidean,
+                      "Unsupported DistanceType");
+        // Unreachable, but silences compiler warnings about missing return.
+        return hn::Zero(c);
+    }
+}
+
+// Scalar fallback: returns dot_product_f32 or euclidean_f32 depending on DT.
+template <DistanceType DT>
+HWY_INLINE float distance_func(const float *codebook,
+                               int clusterOffset,
+                               const float *query,
+                               int queryOffset,
+                               size_t size)
+{
+    if constexpr (DT == DistanceType::DotProduct)
+        return DotProduct(codebook, clusterOffset, query, queryOffset, size);
+    else
+        return L2SquareDistance(
+                codebook, clusterOffset, query, queryOffset, size);
+}
+
+template <DistanceType DT>
+// HWY_RESTRICT on codebook and query informs the compiler that writes to
+// partialSums cannot alias either read-only input, so it need not reload
+// codebook/query values after each store to partialSums.
+HWY_INLINE void calculate_partial_sums_f32(const float *HWY_RESTRICT codebook,
+                                           int codebookIndex,
+                                           size_t size,
+                                           int clusterCount,
+                                           const float *HWY_RESTRICT query,
+                                           int queryOffset,
+                                           float *HWY_RESTRICT partialSums)
+{
+    int codebookBase = codebookIndex * clusterCount;
+    using FloatTag = hn::ScalableTag<float>;
+    FloatTag tag;
+    constexpr size_t kLanes = hn::MaxLanes(tag);
+    alignas(64) float tmp[kLanes];
+    int ii = 0;
+
+    if constexpr (kLanes >= 2) {
+        if (size == 2) {
+            float qtmp[4] = {query[queryOffset],
+                             query[queryOffset + 1],
+                             query[queryOffset],
+                             query[queryOffset + 1]};
+            hn::Vec<FloatTag> queryVec = hn::LoadDup128(tag, qtmp);
+
+            constexpr size_t kBlock = 2;
+            constexpr int centroids_per_iter = kLanes / kBlock;
+
+            for (; ii + centroids_per_iter <= clusterCount;
+                 ii += centroids_per_iter) {
+                const float *cptr = codebook + ii * 2;
+                hn::Vec<FloatTag> centroidVec = hn::LoadU(tag, cptr);
+                hn::Vec<FloatTag> score = partial_sum_score<DT, FloatTag>(
+                        centroidVec, queryVec);
+                hn::Vec<FloatTag> swapped = hn::Shuffle2301(score);
+                hn::Vec<FloatTag> sum = score + swapped;
+                hn::StoreU(sum, tag, tmp);
+#pragma GCC unroll 8
+                for (int jj = 0; jj < centroids_per_iter; ++jj) {
+                    partialSums[codebookBase + ii + jj] = tmp[jj * 2];
+                }
+            }
+        }
+    }
+    if constexpr (kLanes >= 4) {
+        if (size == 4) {
+            constexpr int centroids_per_iter = static_cast<int>(kLanes / 4);
+            hn::Vec<FloatTag> queryVec
+                    = hn::LoadDup128(tag, query + queryOffset);
+
+            for (; ii + centroids_per_iter <= clusterCount;
+                 ii += centroids_per_iter) {
+                const float *cptr = codebook + ii * size;
+                hn::Vec<FloatTag> centroidVec = hn::LoadU(tag, cptr);
+                hn::Vec<FloatTag> sum = partial_sum_score<DT, FloatTag>(
+                        centroidVec, queryVec);
+                hn::Vec<FloatTag> temp = hn::Shuffle2301(sum);
+                sum = hn::Add(sum, temp);
+                temp = hn::Shuffle1032(sum);
+                sum = hn::Add(sum, temp);
+                hn::StoreU(sum, tag, tmp);
+#pragma GCC unroll 4
+                for (int jj = 0; jj < centroids_per_iter; ++jj) {
+                    partialSums[codebookBase + ii + jj] = tmp[jj * 4];
+                }
+            }
+        }
+    }
+    if constexpr (kLanes >= 8) {
+        if (size == 8) {
+            hn::Vec<FloatTag> queryVec = LoadDup256(tag, query + queryOffset);
+            constexpr int centroids_per_iter = static_cast<int>(kLanes / 8);
+
+            for (; ii + centroids_per_iter <= clusterCount;
+                 ii += centroids_per_iter) {
+                const float *cptr = codebook + ii * size;
+                hn::Vec<FloatTag> centroidVec = hn::LoadU(tag, cptr);
+                hn::Vec<FloatTag> sum = partial_sum_score<DT, FloatTag>(
+                        centroidVec, queryVec);
+                hn::Vec<FloatTag> temp = hn::SwapAdjacentBlocks(sum);
+                sum = hn::Add(sum, temp);
+                temp = hn::Shuffle1032(sum);
+                sum = hn::Add(sum, temp);
+                temp = hn::Shuffle2301(sum);
+                sum = hn::Add(sum, temp);
+                hn::StoreU(sum, tag, tmp);
+#pragma GCC unroll 2
+                for (int jj = 0; jj < centroids_per_iter; ++jj) {
+                    partialSums[codebookBase + ii + jj] = tmp[jj * 8];
+                }
+            }
+        }
+    }
+    if constexpr (kLanes == 16) {
+        // Don't have to worry about making this work on 1024-bit lanes just yet
+        if (size == 16) {
+            const hn::Vec<FloatTag> queryVec
+                    = hn::LoadU(tag, query + queryOffset);
+            for (; ii < clusterCount; ++ii) {
+                const hn::Vec<FloatTag> centroidVec
+                        = hn::LoadU(tag, codebook + ii * size);
+                partialSums[codebookBase + ii] = hn::ReduceSum(
+                        tag,
+                        partial_sum_score<DT, FloatTag>(centroidVec, queryVec));
+            }
+        }
+    }
+    for (; ii < clusterCount; ii++) {
+        partialSums[codebookBase + ii] = distance_func<DT>(
+                codebook, ii * size, query, queryOffset, size);
+    }
+}
+
+// HWY_RESTRICT is not needed here: `data` (float*) and `baseOffsets` (unsigned char*)
+// have different types, so the compiler already treats them as non-aliasing under
+// C++ strict-aliasing rules. Neither pointer is written through, so there are no
+// stores that could force a reload of the other.
+
+// Inner kernel templated on the Highway tag type so it works for both the
+// full-width ScalableTag path and the capped HWY_CAPPED fast paths.
+// gatherIndices[k] = (ii + k) * dataBase + baseOffsets[ii + k]
+template <class FloatTag>
+HWY_INLINE float AssembleAndSumImpl(
+        const float         *HWY_RESTRICT data,
+        int                  dataBase,
+        const uint8_t       *HWY_RESTRICT baseOffsets,
+        size_t               baseOffsetsLength,
+        FloatTag             floatTag)
+{
+    const hn::RebindToSigned<FloatTag>                  int32Tag;
+    const hn::Rebind<uint16_t, hn::RebindToSigned<FloatTag>> uint16Tag;
+    const hn::Rebind<uint8_t,  hn::RebindToSigned<FloatTag>> uint8Tag;
+    const size_t lanes = hn::Lanes(floatTag);
+
+    // Precompute scaleVec = [0, db, 2*db, ..., (lanes-1)*db] once.
+    // This eliminates a 512-bit VPMULLD on every iteration; the per-iteration
+    // base (ii*dataBase) is a scalar that the compiler strength-reduces to an add.
+    const auto scaleVec = hn::Mul(hn::Iota(int32Tag, 0),
+                                  hn::Set(int32Tag, dataBase));
+    auto sumVec = hn::Zero(floatTag);
+
+    size_t ii = 0;
+    for (; ii + lanes <= baseOffsetsLength; ii += lanes) {
+        // Load `lanes` bytes and zero-extend to i32 via u8→u16→i32.
+        const auto offsetVec = hn::PromoteTo(int32Tag,
+                                   hn::PromoteTo(uint16Tag,
+                                       hn::LoadU(uint8Tag, baseOffsets + ii)));
+        const auto base = hn::Set(int32Tag,
+                                  static_cast<int32_t>(ii * static_cast<size_t>(dataBase)));
+        sumVec = hn::Add(sumVec, hn::GatherIndex(floatTag, data,
+                             hn::Add(hn::Add(base, scaleVec), offsetVec)));
+    }
+
+    float res = hn::ReduceSum(floatTag, sumVec);
+    for (; ii < baseOffsetsLength; ii++) {
+        res += data[dataBase * ii + baseOffsets[ii]];
+    }
+    return res;
+}
+
+HWY_FLATTEN float assemble_and_sum_f32(const float *data,
+                                           int dataBase,
+                                           const unsigned char *baseOffsets,
+                                           int baseOffsetsOffset,
+                                           size_t baseOffsetsLength)
+{
+    baseOffsets += baseOffsetsOffset;
+
+#if HWY_MAX_BYTES > 16
+    if (baseOffsetsLength <= 4) {
+        return AssembleAndSumImpl(data, dataBase, baseOffsets, baseOffsetsLength,
+                                  HWY_CAPPED(float, 4){});
+    }
+#if HWY_MAX_BYTES > 32
+    if (baseOffsetsLength <= 8) {
+        return AssembleAndSumImpl(data, dataBase, baseOffsets, baseOffsetsLength,
+                                  HWY_CAPPED(float, 8){});
+    }
+#endif
+#endif
+
+    return AssembleAndSumImpl(data, dataBase, baseOffsets, baseOffsetsLength,
+                              hn::ScalableTag<float>{});
+}
+
+// Inner kernel for the triangular-table PQ gather, templated on the Highway tag
+// type so it works for both the full-width ScalableTag path and HWY_CAPPED fast paths.
+// gatherIndex[j] = laneIdx[j]*blockSize + triangularIndex(c1[j], c2[j])
+template <class FloatTag>
+HWY_INLINE float AssembleAndSumPQImpl(
+        const float    *HWY_RESTRICT data,
+        size_t          subspaceCount,
+        const uint8_t  *HWY_RESTRICT baseOffsets1,
+        const uint8_t  *HWY_RESTRICT baseOffsets2,
+        int             k,
+        int             blockSize,
+        FloatTag        d_f)
+{
+    const hn::RebindToSigned<FloatTag>                       d_i;
+    const hn::Rebind<uint16_t, hn::RebindToSigned<FloatTag>> d_u16;
+    const hn::Rebind<uint8_t,  hn::RebindToSigned<FloatTag>> d_u8;
+    const size_t lanes = hn::Lanes(d_f);
+
+    const auto vk         = hn::Set(d_i, k);
+    const auto vBlockSize = hn::Set(d_i, blockSize);
+    auto sumVec  = hn::Zero(d_f);
+
+    // Precompute laneIdxScaled = [0, bs, 2*bs, ..., (lanes-1)*bs] and advance
+    // it by a fixed increment each iteration.  Replaces Mul(laneIdx, vBlockSize)
+    // (VPMULLD, 3-cycle latency on critical path) with VPADDD (1-cycle latency).
+    auto laneIdxScaled = hn::Mul(hn::Iota(d_i, 0), vBlockSize);
+    const auto laneScaledInc = hn::Set(d_i, static_cast<int32_t>(lanes) * blockSize);
+
+    size_t ii = 0;
+    for (; ii + lanes <= subspaceCount; ii += lanes) {
+        // Load `lanes` u8 ordinals and zero-extend to i32 via u8→u16→i32.
+        const auto c1 = hn::PromoteTo(d_i, hn::PromoteTo(d_u16, hn::LoadU(d_u8, baseOffsets1 + ii)));
+        const auto c2 = hn::PromoteTo(d_i, hn::PromoteTo(d_u16, hn::LoadU(d_u8, baseOffsets2 + ii)));
+        const auto r  = hn::Min(c1, c2);
+        const auto c  = hn::Max(c1, c2);
+        // triangular = r*(r-1)/2; always even & non-negative so ShiftRight<1> is exact.
+        const auto triangular = hn::ShiftRight<1>(hn::Mul(r, hn::Sub(r, hn::Set(d_i, 1))));
+        const auto offsetRow  = hn::Sub(hn::Mul(r, vk), triangular);
+        // gatherIndex = laneIdxScaled + offsetRow + (c - r)
+        const auto gatherIdx = hn::Add(laneIdxScaled,
+                                       hn::Add(offsetRow, hn::Sub(c, r)));
+        sumVec       = hn::Add(sumVec, hn::GatherIndex(d_f, data, gatherIdx));
+        laneIdxScaled = hn::Add(laneIdxScaled, laneScaledInc);
+    }
+
+    float res = hn::ReduceSum(d_f, sumVec);
+    for (; ii < subspaceCount; ii++) {
+        int c1v = baseOffsets1[ii], c2v = baseOffsets2[ii];
+        int r   = c1v < c2v ? c1v : c2v;
+        int cv  = c1v > c2v ? c1v : c2v;
+        res += data[ii * blockSize + r * k - r * (r - 1) / 2 + (cv - r)];
+    }
+    return res;
+}
+
+// For each of the M subspaces, looks up data[i*blockSize + triangularIndex(c1[i], c2[i])]
+// where blockSize = k*(k+1)/2 and triangularIndex(r,c) = r*k - r*(r-1)/2 + (c-r),
+// r = min(c1,c2), c = max(c1,c2).  Vectorised via a gather over i32 indices built
+// from integer min/max and an arithmetic right-shift for the triangular number.
+// On ISAs with >128-bit registers, capped fast-paths are used when subspaceCount
+// fits in 4 or 8 lanes to avoid wasting the extra lanes of a wide register.
+HWY_FLATTEN float assemble_and_sum_pq_f32(
+        const float    *HWY_RESTRICT data,
+        size_t          subspaceCount,
+        const uint8_t  *HWY_RESTRICT baseOffsets1, int baseOffsetsOffset1,
+        const uint8_t  *HWY_RESTRICT baseOffsets2, int baseOffsetsOffset2,
+        int             clusterCount)
+{
+    baseOffsets1 += baseOffsetsOffset1;
+    baseOffsets2 += baseOffsetsOffset2;
+
+    const int k         = clusterCount;
+    const int blockSize = k * (k + 1) / 2;
+
+#if HWY_MAX_BYTES > 16
+    if (subspaceCount <= 4) {
+        return AssembleAndSumPQImpl(data, subspaceCount, baseOffsets1, baseOffsets2,
+                                    k, blockSize, HWY_CAPPED(float, 4){});
+    }
+#if HWY_MAX_BYTES > 32
+    if (subspaceCount <= 8) {
+        return AssembleAndSumPQImpl(data, subspaceCount, baseOffsets1, baseOffsets2,
+                                    k, blockSize, HWY_CAPPED(float, 8){});
+    }
+#endif
+#endif
+
+    return AssembleAndSumPQImpl(data, subspaceCount, baseOffsets1, baseOffsets2,
+                                k, blockSize, hn::ScalableTag<float>{});
+}
+
+// HWY_RESTRICT is not needed here: `baseOffsets` (unsigned char*) is a different type
+// from `partialSums` and `aMagnitude` (float*), so strict-aliasing already guarantees
+// the compiler that writes through one cannot affect loads from the other. All three
+// pointers are read-only within the loop, so there are no stores to reason about anyway.
+HWY_FLATTEN float
+pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets,
+                                     int baseOffsetsOffset,
+                                     size_t baseOffsetsLength,
+                                     int clusterCount,
+                                     const float *partialSums,
+                                     const float *aMagnitude,
+                                     float bMagnitude)
+{
+    using FloatTag = hn::ScalableTag<float>;
+    using Int32Tag = hn::RebindToSigned<FloatTag>;
+    using Uint16Tag = hn::Rebind<uint16_t, Int32Tag>;
+    using Uint8Tag = hn::Rebind<uint8_t, Int32Tag>;
+
+    const FloatTag floatTag;
+    const Int32Tag int32Tag;
+    const Uint16Tag uint16Tag;
+    const Uint8Tag uint8Tag;
+    const size_t kLanes = hn::Lanes(floatTag);
+
+    baseOffsets += baseOffsetsOffset;
+
+    auto sumVec = hn::Zero(floatTag);
+    auto magnitudeVec = hn::Zero(floatTag);
+
+    // Precompute scaleVec = [0, cc, 2*cc, ..., (kLanes-1)*cc] once.
+    // Eliminates a 512-bit VPMULLD on every iteration; the per-iteration scalar
+    // base (ii*clusterCount) is strength-reduced to an add by the compiler.
+    const auto scaleVec = hn::Mul(hn::Iota(int32Tag, 0),
+                                  hn::Set(int32Tag, clusterCount));
+
+    size_t ii = 0;
+    for (; ii + kLanes <= baseOffsetsLength; ii += kLanes) {
+        // Load kLanes bytes and zero-extend to int32 via two PromoteTo steps (u8→u16→i32)
+        const auto u8Vec     = hn::LoadU(uint8Tag, baseOffsets + ii);
+        const auto u16Vec    = hn::PromoteTo(uint16Tag, u8Vec);
+        const auto offsetVec = hn::PromoteTo(int32Tag, u16Vec);
+
+        // gatherIndices[k] = (ii + k) * clusterCount + baseOffsets[ii + k]
+        const auto base = hn::Set(int32Tag,
+                                  static_cast<int32_t>(ii * static_cast<size_t>(clusterCount)));
+        const auto gatherIndices = hn::Add(hn::Add(base, scaleVec), offsetVec);
+
+        sumVec       = hn::Add(sumVec,       hn::GatherIndex(floatTag, partialSums, gatherIndices));
+        magnitudeVec = hn::Add(magnitudeVec, hn::GatherIndex(floatTag, aMagnitude,  gatherIndices));
+    }
+
+    float sumResult = hn::ReduceSum(floatTag, sumVec);
+    float aMagnitudeResult = hn::ReduceSum(floatTag, magnitudeVec);
+
+    // Handle the remaining elements
+    for (; ii < baseOffsetsLength; ii++) {
+        int offset = clusterCount * static_cast<int>(ii) + baseOffsets[ii];
+        sumResult += partialSums[offset];
+        aMagnitudeResult += aMagnitude[offset];
+    }
+
+    return sumResult / sqrtf(aMagnitudeResult * bMagnitude);
+}
+
+HWY_FLATTEN void calculate_partial_sums_dot_f32(const float *codebook,
+                                                    int codebookIndex,
+                                                    size_t size,
+                                                    int clusterCount,
+                                                    const float *query,
+                                                    int queryOffset,
+                                                    float *partialSums)
+{
+    calculate_partial_sums_f32<DistanceType::DotProduct>(codebook,
+                                                         codebookIndex,
+                                                         size,
+                                                         clusterCount,
+                                                         query,
+                                                         queryOffset,
+                                                         partialSums);
+}
+
+HWY_FLATTEN void calculate_partial_sums_euclidean_f32(const float *codebook,
+                                                          int codebookIndex,
+                                                          size_t size,
+                                                          int clusterCount,
+                                                          const float *query,
+                                                          int queryOffset,
+                                                          float *partialSums)
+{
+    calculate_partial_sums_f32<DistanceType::Euclidean>(codebook,
+                                                        codebookIndex,
+                                                        size,
+                                                        clusterCount,
+                                                        query,
+                                                        queryOffset,
+                                                        partialSums);
+}
+
+// Computes partialSums[codebookIndex * clusterCount + i] = dot(centroid_i, centroid_i)
+// i.e. the squared L2 norm of each codebook centroid. This is the per-subspace
+// contribution to aMagnitude in the PQ cosine similarity formula.
+//
+// Fast paths for size == 2/4/8/16 pack multiple centroids into one SIMD register
+// (kLanes/size per iteration), eliminating the per-centroid ReduceSum overhead.
+// They mirror the structure of calculate_partial_sums_f32 but replace the
+// centroid×query multiply with centroid×centroid (self-square).
+HWY_FLATTEN void calculate_partial_sums_self_magnitude_f32(
+        const float *HWY_RESTRICT codebook,
+        int codebookIndex,
+        size_t size,
+        int clusterCount,
+        float *HWY_RESTRICT partialSums)
+{
+    const int codebookBase = codebookIndex * clusterCount;
+    using FloatTag = hn::ScalableTag<float>;
+    FloatTag tag;
+    constexpr size_t kLanes = hn::MaxLanes(tag);
+    alignas(64) float tmp[kLanes];
+    int ii = 0;
+
+    if constexpr (kLanes >= 2) {
+        if (size == 2) {
+            constexpr size_t kBlock = 2;
+            constexpr int centroids_per_iter = kLanes / kBlock;
+
+            for (; ii + centroids_per_iter <= clusterCount;
+                 ii += centroids_per_iter) {
+                const float *cptr = codebook + ii * 2;
+                const hn::Vec<FloatTag> cv = hn::LoadU(tag, cptr);
+                hn::Vec<FloatTag> sum = hn::Mul(cv, cv);
+                hn::Vec<FloatTag> swapped = hn::Shuffle2301(sum);
+                sum = hn::Add(sum, swapped);
+                hn::StoreU(sum, tag, tmp);
+#pragma GCC unroll 8
+                for (int jj = 0; jj < centroids_per_iter; ++jj) {
+                    partialSums[codebookBase + ii + jj] = tmp[jj * 2];
+                }
+            }
+        }
+    }
+    if constexpr (kLanes >= 4) {
+        if (size == 4) {
+            constexpr int centroids_per_iter = static_cast<int>(kLanes / 4);
+
+            for (; ii + centroids_per_iter <= clusterCount;
+                 ii += centroids_per_iter) {
+                const float *cptr = codebook + ii * size;
+                const hn::Vec<FloatTag> cv = hn::LoadU(tag, cptr);
+                hn::Vec<FloatTag> sum = hn::Mul(cv, cv);
+                hn::Vec<FloatTag> temp = hn::Shuffle2301(sum);
+                sum = hn::Add(sum, temp);
+                temp = hn::Shuffle1032(sum);
+                sum = hn::Add(sum, temp);
+                hn::StoreU(sum, tag, tmp);
+#pragma GCC unroll 4
+                for (int jj = 0; jj < centroids_per_iter; ++jj) {
+                    partialSums[codebookBase + ii + jj] = tmp[jj * 4];
+                }
+            }
+        }
+    }
+    if constexpr (kLanes >= 8) {
+        if (size == 8) {
+            constexpr int centroids_per_iter = static_cast<int>(kLanes / 8);
+
+            for (; ii + centroids_per_iter <= clusterCount;
+                 ii += centroids_per_iter) {
+                const float *cptr = codebook + ii * size;
+                const hn::Vec<FloatTag> cv = hn::LoadU(tag, cptr);
+                hn::Vec<FloatTag> sum = hn::Mul(cv, cv);
+                hn::Vec<FloatTag> temp = hn::SwapAdjacentBlocks(sum);
+                sum = hn::Add(sum, temp);
+                temp = hn::Shuffle1032(sum);
+                sum = hn::Add(sum, temp);
+                temp = hn::Shuffle2301(sum);
+                sum = hn::Add(sum, temp);
+                hn::StoreU(sum, tag, tmp);
+#pragma GCC unroll 2
+                for (int jj = 0; jj < centroids_per_iter; ++jj) {
+                    partialSums[codebookBase + ii + jj] = tmp[jj * 8];
+                }
+            }
+        }
+    }
+    if constexpr (kLanes == 16) {
+        // AVX-512 only: one full register holds exactly one size==16 centroid.
+        if (size == 16) {
+            for (; ii < clusterCount; ++ii) {
+                const hn::Vec<FloatTag> cv
+                        = hn::LoadU(tag, codebook + ii * size);
+                partialSums[codebookBase + ii]
+                        = hn::ReduceSum(tag, hn::Mul(cv, cv));
+            }
+        }
+    }
+    // General fallback: one centroid at a time, vector-accumulate then reduce.
+    for (; ii < clusterCount; ii++) {
+        const float *cptr = codebook + ii * size;
+        auto accVec = hn::Zero(tag);
+        size_t j = 0;
+        for (; j + kLanes <= size; j += kLanes) {
+            const auto v = hn::LoadU(tag, cptr + j);
+            accVec = hn::MulAdd(v, v, accVec);
+        }
+        float sum = hn::ReduceSum(tag, accVec);
+        for (; j < size; j++) {
+            sum += cptr[j] * cptr[j];
+        }
+        partialSums[codebookBase + ii] = sum;
+    }
+}
+
+// =============================================================================
+// NVQ kernels
+// =============================================================================
+//
+// Bit-manipulation helpers used by all NVQ public kernels:
+//
+// logisticNQT — approximate sigmoid via IEEE 754 bit tricks (2^x approximation).
+// logitNQT    — inverse: fast log2 via exponent extraction.
+// Both exploit the float bit layout to avoid transcendental instructions.
+//
+
+// logisticNQT: approximate sigmoid using integer bit manipulation.
+// Computes an approximation of the logistic function:
+//   result ≈ 1 / (1 + exp(-alpha * (v - x0)))
+// using the identity sigmoid(x) = 2^x / (2^x + 1) and a fast bit-hack for 2^x:
+//   given x = p + f where p is integer and f ∈ [0,1):
+//     2^x ≈ reinterpret_float(bits_of((f*0.5+1.0) << 23  +  p << 23))
+template <class D>
+HWY_INLINE hn::Vec<D> logisticNQT(D d, hn::Vec<D> v, float alpha, float x0)
+{
+    const hn::RebindToSigned<D> di;
+
+    // temp = alpha * v - alpha * x0
+    auto temp = hn::MulAdd(v, hn::Set(d, alpha), hn::Set(d, -alpha * x0));
+
+    // p = (int)(temp + 1) where temp >= 0, else (int)(temp)
+    // Mirrors Java: p = (int) floor(temp + 1); truncation == floor for temp >= 0.
+    const auto isPositive = hn::Not(hn::IsNegative(temp));
+    auto selected = hn::IfThenElse(isPositive,
+                                   hn::Add(temp, hn::Set(d, 1.0f)),
+                                   temp);
+    auto p = hn::ConvertTo(di, selected);  // truncate towards zero
+
+    // e = (float) p
+    auto e = hn::ConvertTo(d, p);
+
+    // m = reinterpret_bits((temp - e) * 0.5 + 1.0)
+    // (temp - e) is in (-1, 1), so the result is in (0.5, 1.5) — a mantissa value.
+    auto m = hn::BitCast(di,
+                         hn::MulAdd(hn::Sub(temp, e),
+                                    hn::Set(d, 0.5f),
+                                    hn::Set(d, 1.0f)));
+
+    // Reconstruct: (m_bits + (p << 23)) reinterpreted as float  =  m_mantissa * 2^p
+    auto result = hn::BitCast(d, hn::Add(m, hn::ShiftLeft<23>(p)));
+
+    // Sigmoid: result / (result + 1)
+    return hn::Div(result, hn::Add(result, hn::Set(d, 1.0f)));
+}
+
+// logitNQT: inverse of logisticNQT — fast log2 via IEEE 754 exponent extraction.
+// Computes approximately:
+//   inverseAlpha * (log2(v / (1-v)) - 1) + x0
+// The "-1" offset comes from subtracting 128 instead of 127 from the biased exponent,
+// matching the Java implementation exactly.
+template <class D>
+HWY_INLINE hn::Vec<D> logitNQT(D d, hn::Vec<D> v, float inverseAlpha, float x0)
+{
+    const hn::RebindToSigned<D> di;
+
+    // z = v / (1 - v)
+    auto z = hn::Div(v, hn::Sub(hn::Set(d, 1.0f), v));
+
+    // Reinterpret float bits as int32 to extract exponent and mantissa fields.
+    auto temp = hn::BitCast(di, z);
+
+    // p = (biased_exponent >> 23) - 128
+    // Masking with 0x7f800000 isolates the 8 exponent bits; shifting by 23 places
+    // them in the low byte. Subtracting 128 (vs. the standard 127 bias) is intentional
+    // and matches the Java source.
+    auto p = hn::Sub(hn::ShiftRight<23>(hn::And(temp, hn::Set(di, 0x7f800000))),
+                     hn::Set(di, 128));
+
+    // m = reinterpret as float: set exponent to 127 (i.e., 2^0) and keep mantissa
+    // → value in [1.0, 2.0)
+    auto m = hn::BitCast(d, hn::Add(hn::And(temp, hn::Set(di, 0x007fffff)),
+                                    hn::Set(di, 0x3f800000)));
+
+    // return (m + (float)p) * inverseAlpha + x0
+    return hn::MulAdd(hn::Add(m, hn::ConvertTo(d, p)),
+                      hn::Set(d, inverseAlpha),
+                      hn::Set(d, x0));
+}
+
+// Single-element wrappers used only for the two setup constants (logisticBias,
+// logisticScale) computed at the start of each NVQ kernel.  They delegate to
+// the vector templates via CappedTag<float,1> so there is no duplication of
+// the bit-manipulation logic.  They are NOT called in any hot loop — all tail
+// elements are handled by LoadN + FirstN-masked vector operations below.
+static HWY_INLINE float logisticNQT_scalar(float value, float alpha, float x0)
+{
+    const hn::CappedTag<float, 1> d1;
+    return hn::GetLane(logisticNQT(d1, hn::Set(d1, value), alpha, x0));
+}
+
+// Public kernels — called from Java via FFI.
+//
+// All six functions mirror the @Override methods in PanamaVectorUtilSupport.
+// They share the same mathematical logic; the Highway vector loops replace the
+// Panama FloatVector loops and the scalar tails are identical to the Java ones.
+//
+// Byte↔float conversion pipeline (mirrors Java nvqDequantize8bit):
+//   LoadU(uint8Tag, ptr)   — fill 4N-lane u8 vector from N bytes at ptr
+//   PromoteTo(uint16Tag)   — lower N u8s → N u16s (2N-lane vector)
+//   PromoteTo(int32Tag)    — lower N u16s → N i32s (N-lane vector)
+//   ConvertTo(floatTag)    — i32 → float
+//   MulAdd(scale, bias)    — byte * logisticScale + logisticBias
+//   logitNQT(...)          — inverse logistic
+//
+// Float→byte pipeline (nvq_quantize_8bit):
+//   logisticNQT(...)       — forward logistic
+//   scale and shift
+//   ConvertTo(int32) after +0.5  — round toward nearest for positive values
+//   StoreU to tmp[] + scalar clamp and byte-cast
+//
+// Cosine packing: nvq_cosine_8bit_packed returns an int64_t whose low 32 bits
+// are the IEEE-754 bits of `sum` and whose high 32 bits are `bMagnitude`, so
+// the caller can unpack with Float.intBitsToFloat without any heap allocation.
+// =============================================================================
+
+HWY_FLATTEN void nvq_quantize_8bit(const float *HWY_RESTRICT vector,
+                                   size_t length,
+                                   float alpha, float x0,
+                                   float minValue, float maxValue,
+                                   uint8_t *HWY_RESTRICT destination)
+{
+    using FloatTag = hn::ScalableTag<float>;
+    using Int32Tag = hn::RebindToSigned<FloatTag>;
+    FloatTag d_f;
+    Int32Tag d_i;
+    constexpr size_t kLanes = hn::MaxLanes(d_f);
+    alignas(64) int32_t tmp[kLanes];
+
+    float delta            = maxValue - minValue;
+    float scaledAlpha      = alpha / delta;
+    float scaledX0         = x0 * delta;
+    float logisticBias     = logisticNQT_scalar(minValue, scaledAlpha, scaledX0);
+    float invLogisticScale = 255.0f / (logisticNQT_scalar(maxValue, scaledAlpha, scaledX0) - logisticBias);
+
+    size_t i = 0;
+    for (; i + kLanes <= length; i += kLanes) {
+        auto arr = hn::LoadU(d_f, vector + i);
+        arr = logisticNQT(d_f, arr, scaledAlpha, scaledX0);
+        arr = hn::Add(hn::Mul(hn::Sub(arr, hn::Set(d_f, logisticBias)),
+                              hn::Set(d_f, invLogisticScale)),
+                      hn::Set(d_f, 0.5f));
+        auto fi = hn::ConvertTo(d_i, arr);
+        hn::StoreU(fi, d_i, tmp);
+        for (size_t j = 0; j < kLanes; j++) {
+            int v = tmp[j];
+            destination[i + j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+        }
+    }
+    // Tail: LoadN zero-pads lanes beyond `remaining`; only write the first
+    // `remaining` bytes from tmp[] so the padding lanes are never observed.
+    const size_t remaining = length - i;
+    if (remaining > 0) {
+        auto arr = hn::LoadN(d_f, vector + i, remaining);
+        arr = logisticNQT(d_f, arr, scaledAlpha, scaledX0);
+        arr = hn::Add(hn::Mul(hn::Sub(arr, hn::Set(d_f, logisticBias)),
+                              hn::Set(d_f, invLogisticScale)),
+                      hn::Set(d_f, 0.5f));
+        hn::StoreU(hn::ConvertTo(d_i, arr), d_i, tmp);
+        for (size_t j = 0; j < remaining; j++) {
+            int v = tmp[j];
+            destination[i + j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+        }
+    }
+}
+
+HWY_FLATTEN float nvq_loss(const float *HWY_RESTRICT vector,
+                           size_t length,
+                           float alpha, float x0,
+                           float minValue, float maxValue,
+                           int nBits)
+{
+    using FloatTag = hn::ScalableTag<float>;
+    using Int32Tag = hn::RebindToSigned<FloatTag>;
+    FloatTag d_f;
+    Int32Tag d_i;
+    constexpr size_t kLanes = hn::MaxLanes(d_f);
+
+    int   constant        = (1 << nBits) - 1;
+    float delta           = maxValue - minValue;
+    float scaledAlpha     = alpha / delta;
+    float invScaledAlpha  = delta / alpha;   // 1 / scaledAlpha
+    float scaledX0        = x0 * delta;
+    float logisticBias    = logisticNQT_scalar(minValue, scaledAlpha, scaledX0);
+    float logisticScale   = (logisticNQT_scalar(maxValue, scaledAlpha, scaledX0) - logisticBias) / (float)constant;
+    float invLogisticScale = 1.0f / logisticScale;
+
+    auto squaredSum = hn::Zero(d_f);
+
+    size_t i = 0;
+    for (; i + kLanes <= length; i += kLanes) {
+        auto arr    = hn::LoadU(d_f, vector + i);
+        auto recArr = logisticNQT(d_f, arr, scaledAlpha, scaledX0);
+        recArr = hn::Mul(hn::Sub(recArr, hn::Set(d_f, logisticBias)),
+                         hn::Set(d_f, invLogisticScale));
+        // Round to nearest integer (add 0.5, truncate toward zero)
+        auto recInt = hn::ConvertTo(d_i, hn::Add(recArr, hn::Set(d_f, 0.5f)));
+        recArr = hn::ConvertTo(d_f, recInt);
+        recArr = hn::MulAdd(recArr, hn::Set(d_f, logisticScale), hn::Set(d_f, logisticBias));
+        recArr = logitNQT(d_f, recArr, invScaledAlpha, scaledX0);
+        auto diff = hn::Sub(arr, recArr);
+        squaredSum = hn::MulAdd(diff, diff, squaredSum);
+    }
+
+    float result = hn::ReduceSum(d_f, squaredSum);
+
+    // Tail: LoadN zero-pads; mask the diff so padding lanes don't contribute.
+    const size_t remaining = length - i;
+    if (remaining > 0) {
+        const auto mask = hn::FirstN(d_f, remaining);
+        auto arr    = hn::LoadN(d_f, vector + i, remaining);
+        auto recArr = logisticNQT(d_f, arr, scaledAlpha, scaledX0);
+        recArr = hn::Mul(hn::Sub(recArr, hn::Set(d_f, logisticBias)),
+                         hn::Set(d_f, invLogisticScale));
+        auto recInt = hn::ConvertTo(d_i, hn::Add(recArr, hn::Set(d_f, 0.5f)));
+        recArr = hn::ConvertTo(d_f, recInt);
+        recArr = hn::MulAdd(recArr, hn::Set(d_f, logisticScale), hn::Set(d_f, logisticBias));
+        recArr = logitNQT(d_f, recArr, invScaledAlpha, scaledX0);
+        auto diff = hn::IfThenElseZero(mask, hn::Sub(arr, recArr));
+        result += hn::ReduceSum(d_f, hn::Mul(diff, diff));
+    }
+
+    return result;
+}
+
+HWY_FLATTEN float nvq_uniform_loss(const float *HWY_RESTRICT vector,
+                                   size_t length,
+                                   float minValue, float maxValue,
+                                   int nBits)
+{
+    using FloatTag = hn::ScalableTag<float>;
+    using Int32Tag = hn::RebindToSigned<FloatTag>;
+    FloatTag d_f;
+    Int32Tag d_i;
+    constexpr size_t kLanes = hn::MaxLanes(d_f);
+
+    float constant = (float)((1 << nBits) - 1);
+    float delta    = maxValue - minValue;
+
+    auto squaredSum = hn::Zero(d_f);
+
+    size_t i = 0;
+    for (; i + kLanes <= length; i += kLanes) {
+        auto arr    = hn::LoadU(d_f, vector + i);
+        auto recArr = hn::Mul(hn::Sub(arr, hn::Set(d_f, minValue)),
+                              hn::Set(d_f, constant / delta));
+        auto recInt = hn::ConvertTo(d_i, hn::Add(recArr, hn::Set(d_f, 0.5f)));
+        recArr = hn::ConvertTo(d_f, recInt);
+        recArr = hn::MulAdd(recArr, hn::Set(d_f, delta / constant), hn::Set(d_f, minValue));
+        auto diff = hn::Sub(arr, recArr);
+        squaredSum = hn::MulAdd(diff, diff, squaredSum);
+    }
+
+    float result = hn::ReduceSum(d_f, squaredSum);
+
+    // Tail: LoadN zero-pads; mask the diff so padding lanes don't contribute.
+    const size_t remaining = length - i;
+    if (remaining > 0) {
+        const auto mask = hn::FirstN(d_f, remaining);
+        auto arr    = hn::LoadN(d_f, vector + i, remaining);
+        auto recArr = hn::Mul(hn::Sub(arr, hn::Set(d_f, minValue)),
+                              hn::Set(d_f, constant / delta));
+        auto recInt = hn::ConvertTo(d_i, hn::Add(recArr, hn::Set(d_f, 0.5f)));
+        recArr = hn::ConvertTo(d_f, recInt);
+        recArr = hn::MulAdd(recArr, hn::Set(d_f, delta / constant), hn::Set(d_f, minValue));
+        auto diff = hn::IfThenElseZero(mask, hn::Sub(arr, recArr));
+        result += hn::ReduceSum(d_f, hn::Mul(diff, diff));
+    }
+
+    return result;
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Two dequantization helpers — used by the three NVQ 8-bit scoring kernels.
+//
+// dequantize_bytes  (tail / non-hot path)
+//   Loads kLanes uint8 values via a narrow u8 tag, widens to int32 through
+//   two PromoteTo calls, converts to float, then applies scale+bias+logitNQT.
+//   Used only for the unaligned tail elements after the main FastLanes loop;
+//   those elements' query floats are in their original (un-shuffled) order.
+//
+// dequantize_bytes_fastlanes  (hot path)
+//   Accepts a Vec<Int32Tag> that is a full-width byte vector BitCast-ed to
+//   kLanes int32 lanes (i.e., 4*kLanes bytes loaded in one 512-bit register).
+//   Extracts the `part`-th byte from each int32 lane via shift-right + AND,
+//   converts to float, and applies scale+bias+logitNQT.  All operations run
+//   at the native SIMD width — no widening, no register-width changes.
+//   Mirrors the Panama FastLanes strategy:
+//   https://www.vldb.org/pvldb/vol16/p2132-afroozeh.pdf
+// ─────────────────────────────────────────────────────────────────────────────
+template <class FloatTag,
+          class Int32Tag  = hn::RebindToSigned<FloatTag>,
+          class Uint16Tag = hn::Rebind<uint16_t, Int32Tag>,
+          class Uint8Tag  = hn::Rebind<uint8_t,  Int32Tag>>
+HWY_INLINE hn::Vec<FloatTag>
+dequantize_bytes(FloatTag d_f, Int32Tag d_i, Uint16Tag d_u16, Uint8Tag d_u8,
+                 const uint8_t *HWY_RESTRICT quantized, size_t i,
+                 float logisticScale, float logisticBias,
+                 float invScaledAlpha, float scaledX0)
+{
+    const auto b_u8  = hn::LoadU(d_u8,  quantized + i);
+    const auto b_u16 = hn::PromoteTo(d_u16, b_u8);
+    const auto b_i32 = hn::PromoteTo(d_i,   b_u16);
+    auto vb = hn::ConvertTo(d_f, b_i32);
+    vb = hn::MulAdd(vb, hn::Set(d_f, logisticScale), hn::Set(d_f, logisticBias));
+    return logitNQT(d_f, vb, invScaledAlpha, scaledX0);
+}
+
+template <class FloatTag,
+          class Int32Tag = hn::RebindToSigned<FloatTag>>
+HWY_INLINE hn::Vec<FloatTag>
+dequantize_bytes_fastlanes(FloatTag d_f, Int32Tag d_i,
+                           hn::Vec<Int32Tag> as_ints, int part,
+                           float logisticScale, float logisticBias,
+                           float invScaledAlpha, float scaledX0)
+{
+    // Extract the `part`-th byte from each int32 lane, then convert to float.
+    // ShiftRightSame on a signed tag does arithmetic shift, but the AND with
+    // 0xFF zeroes the sign-extended upper bits, giving an unsigned 0-255 value.
+    auto shifted = hn::ShiftRightSame(as_ints, 8 * part);
+    auto masked  = hn::And(shifted, hn::Set(d_i, 0xFF));
+    auto vb      = hn::ConvertTo(d_f, masked);
+    vb = hn::MulAdd(vb, hn::Set(d_f, logisticScale), hn::Set(d_f, logisticBias));
+    return logitNQT(d_f, vb, invScaledAlpha, scaledX0);
+}
+
+HWY_FLATTEN float nvq_square_l2_distance_8bit(const float    *HWY_RESTRICT vector,
+                                              const uint8_t  *HWY_RESTRICT quantized,
+                                              size_t length,
+                                              float alpha, float x0,
+                                              float minValue, float maxValue)
+{
+    using FloatTag   = hn::ScalableTag<float>;
+    using Int32Tag   = hn::RebindToSigned<FloatTag>;
+    using Uint8x4Tag = hn::ScalableTag<uint8_t>;   // 4*kLanes lanes — same total width as FloatTag
+    // Tail-path tags (narrower, used only outside the hot loop)
+    using Uint16Tag  = hn::Rebind<uint16_t, Int32Tag>;
+    using Uint8Tag   = hn::Rebind<uint8_t,  Int32Tag>;
+    FloatTag   d_f;
+    Int32Tag   d_i;
+    Uint8x4Tag d_b;
+    Uint16Tag  d_u16;
+    Uint8Tag   d_u8;
+    constexpr size_t kLanes = hn::MaxLanes(d_f);
+
+    float delta          = maxValue - minValue;
+    float scaledAlpha    = alpha / delta;
+    float invScaledAlpha = delta / alpha;
+    float scaledX0       = x0 * delta;
+    float logisticBias   = logisticNQT_scalar(minValue, scaledAlpha, scaledX0);
+    float logisticScale  = (logisticNQT_scalar(maxValue, scaledAlpha, scaledX0) - logisticBias) / 255.0f;
+
+    auto squaredSum = hn::Zero(d_f);
+
+    // FastLanes main loop: load 4*kLanes bytes per iteration (full native width),
+    // reinterpret as kLanes int32 values, and extract one byte per int32 via
+    // shift+mask for each of the 4 parts.  Requires the query vector to have
+    // been pre-shuffled by nvq_shuffle_query_in_place_8bit.
+    size_t i = 0;
+    for (; i + 4 * kLanes <= length; i += 4 * kLanes) {
+        auto bytes   = hn::LoadU(d_b, quantized + i);
+        auto as_ints = hn::BitCast(d_i, bytes);
+        for (int part = 0; part < 4; ++part) {
+            auto va   = hn::LoadU(d_f, vector + i + part * kLanes);
+            auto vb   = dequantize_bytes_fastlanes(d_f, d_i, as_ints, part,
+                                                   logisticScale, logisticBias,
+                                                   invScaledAlpha, scaledX0);
+            auto diff = hn::Sub(va, vb);
+            squaredSum = hn::MulAdd(diff, diff, squaredSum);
+        }
+    }
+
+    float result = hn::ReduceSum(d_f, squaredSum);
+
+    // kLanes-aligned tail: query floats are un-shuffled here, so use the
+    // sequential PromoteTo path which reads bytes in natural order.
+    for (; i + kLanes <= length; i += kLanes) {
+        auto va   = hn::LoadU(d_f, vector + i);
+        auto vb   = dequantize_bytes(d_f, d_i, d_u16, d_u8, quantized, i,
+                                     logisticScale, logisticBias, invScaledAlpha, scaledX0);
+        auto diff = hn::Sub(va, vb);
+        result += hn::ReduceSum(d_f, hn::Mul(diff, diff));
+    }
+
+    // Sub-kLanes tail: LoadN zero-pads; mask diff to exclude padding lanes.
+    const size_t remaining = length - i;
+    if (remaining > 0) {
+        const auto mask  = hn::FirstN(d_f, remaining);
+        auto va          = hn::LoadN(d_f,  vector    + i, remaining);
+        const auto b_u8  = hn::LoadN(d_u8, quantized + i, remaining);
+        const auto b_u16 = hn::PromoteTo(d_u16, b_u8);
+        const auto b_i32 = hn::PromoteTo(d_i,   b_u16);
+        auto vb          = hn::MulAdd(hn::ConvertTo(d_f, b_i32),
+                                      hn::Set(d_f, logisticScale),
+                                      hn::Set(d_f, logisticBias));
+        vb = logitNQT(d_f, vb, invScaledAlpha, scaledX0);
+        auto diff = hn::IfThenElseZero(mask, hn::Sub(va, vb));
+        result += hn::ReduceSum(d_f, hn::Mul(diff, diff));
+    }
+
+    return result;
+}
+
+HWY_FLATTEN float nvq_dot_product_8bit(const float   *HWY_RESTRICT vector,
+                                       const uint8_t *HWY_RESTRICT quantized,
+                                       size_t length,
+                                       float alpha, float x0,
+                                       float minValue, float maxValue)
+{
+    using FloatTag   = hn::ScalableTag<float>;
+    using Int32Tag   = hn::RebindToSigned<FloatTag>;
+    using Uint8x4Tag = hn::ScalableTag<uint8_t>;   // 4*kLanes lanes — same total width as FloatTag
+    using Uint16Tag  = hn::Rebind<uint16_t, Int32Tag>;
+    using Uint8Tag   = hn::Rebind<uint8_t,  Int32Tag>;
+    FloatTag   d_f;
+    Int32Tag   d_i;
+    Uint8x4Tag d_b;
+    Uint16Tag  d_u16;
+    Uint8Tag   d_u8;
+    constexpr size_t kLanes = hn::MaxLanes(d_f);
+
+    float delta          = maxValue - minValue;
+    float scaledAlpha    = alpha / delta;
+    float invScaledAlpha = delta / alpha;
+    float scaledX0       = x0 * delta;
+    float logisticBias   = logisticNQT_scalar(minValue, scaledAlpha, scaledX0);
+    float logisticScale  = (logisticNQT_scalar(maxValue, scaledAlpha, scaledX0) - logisticBias) / 255.0f;
+
+    auto dotProd = hn::Zero(d_f);
+
+    // FastLanes main loop: full-width byte load, shift+mask extraction.
+    size_t i = 0;
+    for (; i + 4 * kLanes <= length; i += 4 * kLanes) {
+        auto bytes   = hn::LoadU(d_b, quantized + i);
+        auto as_ints = hn::BitCast(d_i, bytes);
+        for (int part = 0; part < 4; ++part) {
+            auto va = hn::LoadU(d_f, vector + i + part * kLanes);
+            auto vb = dequantize_bytes_fastlanes(d_f, d_i, as_ints, part,
+                                                 logisticScale, logisticBias,
+                                                 invScaledAlpha, scaledX0);
+            dotProd = hn::MulAdd(va, vb, dotProd);
+        }
+    }
+
+    float result = hn::ReduceSum(d_f, dotProd);
+
+    // kLanes-aligned tail: un-shuffled query, sequential byte access.
+    for (; i + kLanes <= length; i += kLanes) {
+        auto va = hn::LoadU(d_f, vector + i);
+        auto vb = dequantize_bytes(d_f, d_i, d_u16, d_u8, quantized, i,
+                                   logisticScale, logisticBias, invScaledAlpha, scaledX0);
+        result += hn::ReduceSum(d_f, hn::Mul(va, vb));
+    }
+
+    // Sub-kLanes tail: LoadN zero-pads va; 0 * vb = 0 for padding lanes.
+    const size_t remaining = length - i;
+    if (remaining > 0) {
+        auto va          = hn::LoadN(d_f,  vector    + i, remaining);
+        const auto b_u8  = hn::LoadN(d_u8, quantized + i, remaining);
+        const auto b_u16 = hn::PromoteTo(d_u16, b_u8);
+        const auto b_i32 = hn::PromoteTo(d_i,   b_u16);
+        auto vb          = hn::MulAdd(hn::ConvertTo(d_f, b_i32),
+                                      hn::Set(d_f, logisticScale),
+                                      hn::Set(d_f, logisticBias));
+        vb = logitNQT(d_f, vb, invScaledAlpha, scaledX0);
+        result += hn::ReduceSum(d_f, hn::Mul(va, vb));
+    }
+
+    return result;
+}
+
+// nvq_shuffle_query_in_place_8bit
+//
+// Pre-processes the float query vector in-place so that nvq_cosine_8bit_packed,
+// nvq_dot_product_8bit, and nvq_square_l2_distance_8bit can use the FastLanes
+// byte-extraction strategy.  Only complete blocks of 4*kLanes floats are
+// transposed; the remaining tail elements are left in their original order and
+// processed by the sequential tail path inside each scoring kernel.
+//
+// The permutation applied is an in-place matrix transpose of 4×kLanes blocks:
+// after the shuffle, shuffled[j*kLanes + k] == original[k*4 + j], so that
+// when the scoring kernel extracts byte `j` from int32 slot `k` (via
+// ShiftRight(8*j) & 0xFF), it lines up with query float at position j*kLanes+k.
+HWY_FLATTEN void nvq_shuffle_query_in_place_8bit(float *HWY_RESTRICT vector,
+                                                  size_t length)
+{
+    using FloatTag = hn::ScalableTag<float>;
+    FloatTag d_f;
+    const size_t kLanes = hn::Lanes(d_f);
+    const size_t step   = 4 * kLanes;   // block size = number of bytes in a full-width vector
+    const size_t mn1    = step - 1;
+
+    // Maximum step across all ISAs compiled for: 4 * 16 = 64 (AVX-512).
+    // Declared outside the loop so the stack frame is allocated once.
+    constexpr size_t kMaxStep = 4 * hn::MaxLanes(FloatTag{});
+    bool visited[kMaxStep];
+
+    size_t offset = 0;
+    while (offset + step <= length) {
+        float *arr = vector + offset;
+        memset(visited, 0, step);  // only zero the portion we will inspect
+
+        // In-place cyclic transposition: for each unvisited cycle, rotate
+        // elements along the cycle defined by a -> (kLanes * a) % mn1.
+        // This maps shuffled[p] = original[(p % kLanes) * 4 + (p / kLanes)],
+        // which is the inverse of the FastLanes interleaving.
+        for (size_t cycle = 1; cycle < step; ++cycle) {
+            if (visited[cycle]) continue;
+            size_t a = cycle;
+            do {
+                a = (a == mn1) ? mn1 : (kLanes * a) % mn1;
+                float temp = arr[a];
+                arr[a]     = arr[cycle];
+                arr[cycle] = temp;
+                visited[a] = true;
+            } while (a != cycle);
+        }
+        offset += step;
+    }
+}
+
+// Returns sum and bMagnitude packed into a single int64_t:
+//   bits [31:0]  = IEEE-754 bits of sum
+//   bits [63:32] = IEEE-754 bits of bMagnitude
+// The Java caller unpacks with Float.intBitsToFloat — no heap allocation needed.
+HWY_FLATTEN int64_t nvq_cosine_8bit_packed(const float   *HWY_RESTRICT vector,
+                                           const uint8_t *HWY_RESTRICT quantized,
+                                           size_t length,
+                                           float alpha, float x0,
+                                           float minValue, float maxValue,
+                                           const float   *HWY_RESTRICT centroid)
+{
+    using FloatTag   = hn::ScalableTag<float>;
+    using Int32Tag   = hn::RebindToSigned<FloatTag>;
+    using Uint8x4Tag = hn::ScalableTag<uint8_t>;   // 4*kLanes lanes — same total width as FloatTag
+    using Uint16Tag  = hn::Rebind<uint16_t, Int32Tag>;
+    using Uint8Tag   = hn::Rebind<uint8_t,  Int32Tag>;
+    FloatTag   d_f;
+    Int32Tag   d_i;
+    Uint8x4Tag d_b;
+    Uint16Tag  d_u16;
+    Uint8Tag   d_u8;
+    constexpr size_t kLanes = hn::MaxLanes(d_f);
+
+    float delta          = maxValue - minValue;
+    float scaledAlpha    = alpha / delta;
+    float invScaledAlpha = delta / alpha;
+    float scaledX0       = x0 * delta;
+    float logisticBias   = logisticNQT_scalar(minValue, scaledAlpha, scaledX0);
+    float logisticScale  = (logisticNQT_scalar(maxValue, scaledAlpha, scaledX0) - logisticBias) / 255.0f;
+
+    auto sumVec  = hn::Zero(d_f);
+    auto bMagVec = hn::Zero(d_f);
+
+    // FastLanes main loop: full-width byte load, shift+mask extraction.
+    size_t i = 0;
+    for (; i + 4 * kLanes <= length; i += 4 * kLanes) {
+        auto bytes   = hn::LoadU(d_b, quantized + i);
+        auto as_ints = hn::BitCast(d_i, bytes);
+        for (int part = 0; part < 4; ++part) {
+            auto va = hn::LoadU(d_f, vector   + i + part * kLanes);
+            auto vc = hn::LoadU(d_f, centroid + i + part * kLanes);
+            auto vb = dequantize_bytes_fastlanes(d_f, d_i, as_ints, part,
+                                                 logisticScale, logisticBias,
+                                                 invScaledAlpha, scaledX0);
+            vb = hn::Add(vb, vc);
+            sumVec  = hn::MulAdd(va, vb, sumVec);
+            bMagVec = hn::MulAdd(vb, vb, bMagVec);
+        }
+    }
+
+    float sum  = hn::ReduceSum(d_f, sumVec);
+    float bMag = hn::ReduceSum(d_f, bMagVec);
+
+    // kLanes-aligned tail: un-shuffled query and centroid, sequential bytes.
+    for (; i + kLanes <= length; i += kLanes) {
+        auto va = hn::LoadU(d_f, vector   + i);
+        auto vc = hn::LoadU(d_f, centroid + i);
+        auto vb = dequantize_bytes(d_f, d_i, d_u16, d_u8, quantized, i,
+                                   logisticScale, logisticBias, invScaledAlpha, scaledX0);
+        vb = hn::Add(vb, vc);
+        sum  += hn::ReduceSum(d_f, hn::Mul(va, vb));
+        bMag += hn::ReduceSum(d_f, hn::Mul(vb, vb));
+    }
+
+    // Sub-kLanes tail: LoadN zero-pads va and vc.  Mask vb before adding the
+    // centroid so padding lanes contribute 0 to both sum and bMagnitude.
+    const size_t remaining = length - i;
+    if (remaining > 0) {
+        const auto mask  = hn::FirstN(d_f, remaining);
+        auto va          = hn::LoadN(d_f,  vector    + i, remaining);
+        auto vc          = hn::LoadN(d_f,  centroid  + i, remaining);
+        const auto b_u8  = hn::LoadN(d_u8, quantized + i, remaining);
+        const auto b_u16 = hn::PromoteTo(d_u16, b_u8);
+        const auto b_i32 = hn::PromoteTo(d_i,   b_u16);
+        auto vb          = hn::MulAdd(hn::ConvertTo(d_f, b_i32),
+                                      hn::Set(d_f, logisticScale),
+                                      hn::Set(d_f, logisticBias));
+        vb = logitNQT(d_f, vb, invScaledAlpha, scaledX0);
+        auto vb_c = hn::Add(hn::IfThenElseZero(mask, vb), vc);
+        sum  += hn::ReduceSum(d_f, hn::Mul(va, vb_c));
+        bMag += hn::ReduceSum(d_f, hn::Mul(vb_c, vb_c));
+    }
+
+    int32_t sum_bits, bmag_bits;
+    memcpy(&sum_bits,  &sum,  sizeof(float));
+    memcpy(&bmag_bits, &bMag, sizeof(float));
+    return ((int64_t)bmag_bits << 32) | (int64_t)(uint32_t)sum_bits;
+}
+
+}  // namespace JV_ISA
diff --git a/jvector-native/src/main/c/jvector_simd_kernels.h b/jvector-native/src/main/c/jvector_simd_kernels.h
new file mode 100644
index 000000000..d6790c991
--- /dev/null
+++ b/jvector-native/src/main/c/jvector_simd_kernels.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// headerfile for the simd kernels
+#ifndef SIMD_KERNELS_H
+#define SIMD_KERNELS_H
+
+#include <cstddef>
+#include <cstdint>
+
+#define DECLARE_SIMD_KERNELS(ISA) \
+    namespace ISA { \
+    /* Base Fp32 distance kernels */ \
+    float cosine_f32(const float *a, \
+                     size_t aoffset, \
+                     const float *b, \
+                     size_t boffset, \
+                     size_t length); \
+    float dot_product_f32(const float *a, \
+                          size_t aoffset, \
+                          const float *b, \
+                          size_t boffset, \
+                          size_t length); \
+    float euclidean_f32(const float *a, \
+                        size_t aoffset, \
+                        const float *b, \
+                        size_t boffset, \
+                        size_t length); \
+    /* Element-wise in-place arithmetic */ \
+    void add_in_place_f32(float *v1, \
+                          const float *v2, \
+                          size_t length); \
+    void add_scalar_in_place_f32(float *v1, \
+                                 float value, \
+                                 size_t length); \
+    void sub_in_place_f32(float *v1, \
+                          const float *v2, \
+                          size_t length); \
+    void sub_scalar_in_place_f32(float *v1, \
+                                 float value, \
+                                 size_t length); \
+    float max_f32(const float *v, \
+                  size_t length); \
+    void min_in_place_f32(float *v1, \
+                          const float *v2, \
+                          size_t length); \
+    /* PQ kernels */ \
+    float assemble_and_sum_f32(const float *data, \
+                               int dataBase, \
+                               const unsigned char *baseOffsets, \
+                               int baseOffsetsOffset, \
+                               size_t baseOffsetsLength); \
+    float assemble_and_sum_pq_f32(const float *data, \
+                                  size_t subspaceCount, \
+                                  const unsigned char *baseOffsets1, \
+                                  int baseOffsetsOffset1, \
+                                  const unsigned char *baseOffsets2, \
+                                  int baseOffsetsOffset2, \
+                                  int clusterCount); \
+    float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, \
+                                           int baseOffsetsOffset, \
+                                           size_t baseOffsetsLength, \
+                                           int clusterCount, \
+                                           const float *partialSums, \
+                                           const float *aMagnitude, \
+                                           float bMagnitude); \
+    void calculate_partial_sums_dot_f32(const float *codebook, \
+                                        int codebookIndex, \
+                                        size_t size, \
+                                        int clusterCount, \
+                                        const float *query, \
+                                        int queryOffset, \
+                                        float *partialSums); \
+    void calculate_partial_sums_euclidean_f32(const float *codebook, \
+                                              int codebookIndex, \
+                                              size_t size, \
+                                              int clusterCount, \
+                                              const float *query, \
+                                              int queryOffset, \
+                                              float *partialSums); \
+    void calculate_partial_sums_self_magnitude_f32(const float *codebook, \
+                                                   int codebookIndex, \
+                                                   size_t size, \
+                                                   int clusterCount, \
+                                                   float *partialSums); \
+    /* NVQ kernels */ \
+    void nvq_quantize_8bit(const float *vector, \
+                           size_t length, \
+                           float alpha, float x0, \
+                           float minValue, float maxValue, \
+                           unsigned char *destination); \
+    float nvq_loss(const float *vector, \
+                   size_t length, \
+                   float alpha, float x0, \
+                   float minValue, float maxValue, \
+                   int nBits); \
+    float nvq_uniform_loss(const float *vector, \
+                           size_t length, \
+                           float minValue, float maxValue, \
+                           int nBits); \
+    float nvq_square_l2_distance_8bit(const float *vector, \
+                                      const unsigned char *quantized, \
+                                      size_t length, \
+                                      float alpha, float x0, \
+                                      float minValue, float maxValue); \
+    float nvq_dot_product_8bit(const float *vector, \
+                               const unsigned char *quantized, \
+                               size_t length, \
+                               float alpha, float x0, \
+                               float minValue, float maxValue); \
+    int64_t nvq_cosine_8bit_packed(const float *vector, \
+                                   const unsigned char *quantized, \
+                                   size_t length, \
+                                   float alpha, float x0, \
+                                   float minValue, float maxValue, \
+                                   const float *centroid); \
+    void nvq_shuffle_query_in_place_8bit(float *vector, \
+                                         size_t length); \
+    }
+
+DECLARE_SIMD_KERNELS(AVX3)
+DECLARE_SIMD_KERNELS(AVX2)
+DECLARE_SIMD_KERNELS(SSE42)
+
+#endif // SIMD_KERNELS_H
diff --git a/jvector-native/src/main/c/meson.build b/jvector-native/src/main/c/meson.build
new file mode 100644
index 000000000..c2ffe20cb
--- /dev/null
+++ b/jvector-native/src/main/c/meson.build
@@ -0,0 +1,128 @@
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+project('jvector_simd_kernels', 'cpp',
+  version: '0.1.0',
+  meson_version: '>= 0.55.0',
+  default_options: ['cpp_std=c++17']
+)
+
+# Highway headers are used as an include directory only.
+# HWY_COMPILE_ONLY_STATIC / HWY_COMPILE_ONLY_SCALAR bypass the
+# dynamic-dispatch runtime, so only headers are needed at compile time.
+hwy_inc = include_directories('third_party/highway')
+
+# Each ISA variant: name, JV_ISA namespace, and extra compiler flags
+isa_variants = [
+  {
+    'name'     : 'avx3',
+    'namespace': 'AVX3',
+    'args'     : ['-march=skylake-avx512',
+                  '-DHWY_COMPILE_ONLY_STATIC',
+                  '-DJV_REQUIRE_HWY_AVX3'],
+  },
+  {
+    'name'     : 'avx2',
+    'namespace': 'AVX2',
+    'args'     : ['-march=haswell', 
+                  '-maes',
+                  '-DHWY_COMPILE_ONLY_STATIC',
+                  '-DJV_REQUIRE_HWY_AVX2'],
+  },
+  {
+    'name'     : 'sse42',
+    'namespace': 'SSE42',
+    'args'     : ['-msse4.2', '-mpclmul', '-maes',
+                  '-DHWY_COMPILE_ONLY_STATIC',
+                  '-DJV_REQUIRE_HWY_SCALAR'],
+  },
+]
+
+isa_libs = []
+foreach isa : isa_variants
+  lib = static_library(
+    'simdKernels_' + isa['name'],
+    sources            : 'jvector_simd_kernels.cpp',
+    include_directories: hwy_inc,
+    cpp_args           : isa['args'] + ['-DJV_ISA=' + isa['namespace'], '-fvisibility=hidden']
+  )
+  isa_libs += lib
+endforeach
+
+# vectorUtil provides a single runtime-dispatch entry point.
+# link_whole pulls every ISA object into the shared library so that no
+# symbols are dropped when consumers link against it.
+# abort.cc defines hwy::Abort, referenced by the SSE4 scalar fallbacks in
+# generic_ops-inl.h. It was extracted from targets.cc specifically for
+# projects that manage their own dispatch (google/highway#1935).
+vectorutil_lib = shared_library(
+  'jvector',
+  sources            : ['jvector_simd.cpp',
+                        'third_party/highway/hwy/abort.cc'],
+  include_directories: [include_directories('.'), hwy_inc],
+  cpp_args           : ['-DJVECTOR_BUILD', '-fvisibility=hidden'],
+  link_whole         : isa_libs,
+  version            : meson.project_version(),
+  install            : true,
+)
+
+# Dependency object for use by executables/tests in this build tree.
+vectorutil_dep = declare_dependency(
+  link_with          : vectorutil_lib,
+  include_directories: include_directories('.'),
+)
+
+## Example driver that exercises the runtime-dispatch API.
+#executable(
+#  'cpp_driver',
+#  sources     : 'examples/cpp_driver.cpp',
+#  dependencies: vectorutil_dep,
+#)
+#
+## ---- Tests -----------------------------------------------------------------
+#gtest_dep = dependency('gtest_main', required: true)
+#
+#test_exe = executable(
+#  'test_kernels',
+#  sources     : [
+#    'tests/test_kernels.cpp',
+#    'tests/test_cpuFeatures.cpp',
+#  ],
+#  dependencies: [vectorutil_dep, gtest_dep],
+#)
+#
+#test('kernels',     test_exe, protocol: 'gtest', suite: 'kernels')
+#test('cpu_features', test_exe, protocol: 'gtest', suite: 'cpu',
+#     args: ['--gtest_filter=CpuFeaturesTest.*'])
+#
+## ---- Benchmarks ------------------------------------------------------------
+#gbench_dep = dependency('benchmark', required: false)
+#if gbench_dep.found()
+#  executable(
+#    'bench_kernels',
+#    sources     : 'benchmarks/bench_kernels.cpp',
+#    dependencies: [vectorutil_dep, gbench_dep],
+#    cpp_args    : ['-O3'],
+#  )
+#endif
+#
+#rust_enabled = add_languages('rust', required: false)
+#if rust_enabled
+#  executable(
+#    'rust_driver',
+#    'examples/rust_driver.rs',
+#    link_with: vectorutil_lib,
+#  )
+#endif
+#
\ No newline at end of file
diff --git a/jvector-native/src/main/c/third_party/highway b/jvector-native/src/main/c/third_party/highway
new file mode 160000
index 000000000..16f5f5c76
--- /dev/null
+++ b/jvector-native/src/main/c/third_party/highway
@@ -0,0 +1 @@
+Subproject commit 16f5f5c768064d0bb52952888a14236bec75c104
diff --git a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorUtilSupport.java b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorUtilSupport.java
index 48cd7d66e..6518ac3ef 100644
--- a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorUtilSupport.java
+++ b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorUtilSupport.java
@@ -76,7 +76,37 @@ public float assembleAndSum(VectorFloat<?> data, int dataBase, ByteSequence<?> b
     {
         assert baseOffsets.offset() == 0 : "Base offsets are expected to have an offset of 0. Found: " + baseOffsets.offset();
         // baseOffsets is a pointer into a PQ chunk - we need to index into it by baseOffsetsOffset and provide baseOffsetsLength to the native code
-        return NativeSimdOps.assemble_and_sum_f32_512(((MemorySegmentVectorFloat) data).get(), dataBase, ((MemorySegmentByteSequence) baseOffsets).get(), baseOffsetsOffset, baseOffsetsLength);
+        return NativeSimdOps.assemble_and_sum_f32(((MemorySegmentVectorFloat) data).get(), dataBase, ((MemorySegmentByteSequence) baseOffsets).get(), baseOffsetsOffset, (long) baseOffsetsLength);
+    }
+
+    @Override
+    public void addInPlace(VectorFloat<?> v1, VectorFloat<?> v2) {
+        NativeSimdOps.add_in_place_f32(((MemorySegmentVectorFloat) v1).get(), ((MemorySegmentVectorFloat) v2).get(), v1.length());
+    }
+
+    @Override
+    public void addInPlace(VectorFloat<?> v1, float value) {
+        NativeSimdOps.add_scalar_in_place_f32(((MemorySegmentVectorFloat) v1).get(), value, v1.length());
+    }
+
+    @Override
+    public void subInPlace(VectorFloat<?> v1, VectorFloat<?> v2) {
+        NativeSimdOps.sub_in_place_f32(((MemorySegmentVectorFloat) v1).get(), ((MemorySegmentVectorFloat) v2).get(), v1.length());
+    }
+
+    @Override
+    public void subInPlace(VectorFloat<?> vector, float value) {
+        NativeSimdOps.sub_scalar_in_place_f32(((MemorySegmentVectorFloat) vector).get(), value, vector.length());
+    }
+
+    @Override
+    public float max(VectorFloat<?> v) {
+        return NativeSimdOps.max_f32(((MemorySegmentVectorFloat) v).get(), v.length());
+    }
+
+    @Override
+    public void minInPlace(VectorFloat<?> v1, VectorFloat<?> v2) {
+        NativeSimdOps.min_in_place_f32(((MemorySegmentVectorFloat) v1).get(), ((MemorySegmentVectorFloat) v2).get(), v1.length());
     }
 
     @Override
@@ -89,8 +119,14 @@ public float assembleAndSumPQ(
             int vector2OrdinalOffset,
             int clusterCount                    // = k
     ) {
-        //Use the non-panama solution for now
-        return assembleAndSumPQ_128(codebookPartialSums, subspaceCount, vector1Ordinals, vector1OrdinalOffset, vector2Ordinals, vector2OrdinalOffset, clusterCount);
+        assert vector1Ordinals.offset() == 0 : "vector1Ordinals offset must be 0. Found: " + vector1Ordinals.offset();
+        assert vector2Ordinals.offset() == 0 : "vector2Ordinals offset must be 0. Found: " + vector2Ordinals.offset();
+        return NativeSimdOps.assemble_and_sum_pq_f32(
+                ((MemorySegmentVectorFloat) codebookPartialSums).get(),
+                (long) subspaceCount,
+                ((MemorySegmentByteSequence) vector1Ordinals).get(), vector1OrdinalOffset,
+                ((MemorySegmentByteSequence) vector2Ordinals).get(), vector2OrdinalOffset,
+                clusterCount);
     }
 
     @Override
@@ -102,6 +138,137 @@ public float pqDecodedCosineSimilarity(ByteSequence<?> encoded, int clusterCount
     public float pqDecodedCosineSimilarity(ByteSequence<?> encoded, int encodedOffset, int encodedLength, int clusterCount, VectorFloat<?> partialSums, VectorFloat<?> aMagnitude, float bMagnitude) {
         assert encoded.offset() == 0 : "Bulk shuffle shuffles are expected to have an offset of 0. Found: " + encoded.offset();
         // encoded is a pointer into a PQ chunk - we need to index into it by encodedOffset and provide encodedLength to the native code
-        return NativeSimdOps.pq_decoded_cosine_similarity_f32_512(((MemorySegmentByteSequence) encoded).get(), encodedOffset, encodedLength, clusterCount, ((MemorySegmentVectorFloat) partialSums).get(), ((MemorySegmentVectorFloat) aMagnitude).get(), bMagnitude);
+        return NativeSimdOps.pq_decoded_cosine_similarity_f32(((MemorySegmentByteSequence) encoded).get(), encodedOffset, (long) encodedLength, clusterCount, ((MemorySegmentVectorFloat) partialSums).get(), ((MemorySegmentVectorFloat) aMagnitude).get(), bMagnitude);
+    }
+
+    @Override
+    public float squareDistance(VectorFloat<?> v1, VectorFloat<?> v2) {
+        return NativeSimdOps.euclidean_f32(((MemorySegmentVectorFloat) v1).get(), 0,
+                                          ((MemorySegmentVectorFloat) v2).get(), 0,
+                                          v1.length());
+    }
+
+    @Override
+    public float squareDistance(VectorFloat<?> v1, int v1offset, VectorFloat<?> v2, int v2offset, int length) {
+        return NativeSimdOps.euclidean_f32(((MemorySegmentVectorFloat) v1).get(), v1offset,
+                                          ((MemorySegmentVectorFloat) v2).get(), v2offset,
+                                          length);
+    }
+
+    @Override
+    public float cosine(VectorFloat<?> v1, VectorFloat<?> v2) {
+        return NativeSimdOps.cosine_f32(((MemorySegmentVectorFloat) v1).get(), 0,
+                                       ((MemorySegmentVectorFloat) v2).get(), 0,
+                                       v1.length());
+    }
+
+    @Override
+    public float cosine(VectorFloat<?> v1, int v1offset, VectorFloat<?> v2, int v2offset, int length) {
+        return NativeSimdOps.cosine_f32(((MemorySegmentVectorFloat) v1).get(), v1offset,
+                                       ((MemorySegmentVectorFloat) v2).get(), v2offset,
+                                       length);
+    }
+
+    @Override
+    public float dotProduct(VectorFloat<?> v1, VectorFloat<?> v2) {
+        return NativeSimdOps.dot_product_f32(((MemorySegmentVectorFloat) v1).get(), 0,
+                                             ((MemorySegmentVectorFloat) v2).get(), 0,
+                                             v1.length());
+    }
+
+    @Override
+    public float dotProduct(VectorFloat<?> v1, int v1offset, VectorFloat<?> v2, int v2offset, int length) {
+        return NativeSimdOps.dot_product_f32(((MemorySegmentVectorFloat) v1).get(), v1offset,
+                                             ((MemorySegmentVectorFloat) v2).get(), v2offset,
+                                             length);
+    }
+
+    @Override
+    public void calculatePartialSums(VectorFloat<?> codebook, int codebookIndex, int size, int clusterCount, VectorFloat<?> query, int queryOffset, VectorSimilarityFunction vsf, VectorFloat<?> partialSums) {
+        var nativeCodebook = ((MemorySegmentVectorFloat) codebook).get();
+        var nativeQuery = ((MemorySegmentVectorFloat) query).get();
+        var nativePartialSums = ((MemorySegmentVectorFloat) partialSums).get();
+        switch (vsf) {
+            case EUCLIDEAN -> NativeSimdOps.calculate_partial_sums_euclidean_f32(nativeCodebook, codebookIndex, (long) size, clusterCount, nativeQuery, queryOffset, nativePartialSums);
+            case DOT_PRODUCT -> NativeSimdOps.calculate_partial_sums_dot_f32(nativeCodebook, codebookIndex, (long) size, clusterCount, nativeQuery, queryOffset, nativePartialSums);
+            default -> throw new UnsupportedOperationException("Unsupported similarity function " + vsf);
+        }
+    }
+
+    @Override
+    public void calculatePartialSelfMagnitudes(VectorFloat<?> codebook, int codebookIndex, int size, int clusterCount, VectorFloat<?> partialMagnitudes) {
+        NativeSimdOps.calculate_partial_sums_self_magnitude_f32(
+                ((MemorySegmentVectorFloat) codebook).get(),
+                codebookIndex,
+                (long) size,
+                clusterCount,
+                ((MemorySegmentVectorFloat) partialMagnitudes).get());
+    }
+
+    @Override
+    public void nvqShuffleQueryInPlace8bit(VectorFloat<?> vector) {
+        NativeSimdOps.nvq_shuffle_query_in_place_8bit(
+                ((MemorySegmentVectorFloat) vector).get(),
+                (long) vector.length());
+    }
+
+    @Override
+    public void nvqQuantize8bit(VectorFloat<?> vector, float alpha, float x0, float minValue, float maxValue, ByteSequence<?> destination) {
+        NativeSimdOps.nvq_quantize_8bit(
+                ((MemorySegmentVectorFloat) vector).get(),
+                (long) vector.length(),
+                alpha, x0, minValue, maxValue,
+                ((MemorySegmentByteSequence) destination).get());
+    }
+
+    @Override
+    public float nvqLoss(VectorFloat<?> vector, float alpha, float x0, float minValue, float maxValue, int nBits) {
+        return NativeSimdOps.nvq_loss(
+                ((MemorySegmentVectorFloat) vector).get(),
+                (long) vector.length(),
+                alpha, x0, minValue, maxValue, nBits);
+    }
+
+    @Override
+    public float nvqUniformLoss(VectorFloat<?> vector, float minValue, float maxValue, int nBits) {
+        return NativeSimdOps.nvq_uniform_loss(
+                ((MemorySegmentVectorFloat) vector).get(),
+                (long) vector.length(),
+                minValue, maxValue, nBits);
+    }
+
+    @Override
+    public float nvqSquareL2Distance8bit(VectorFloat<?> vector, ByteSequence<?> quantizedVector,
+                                         float alpha, float x0, float minValue, float maxValue) {
+        return NativeSimdOps.nvq_square_l2_distance_8bit(
+                ((MemorySegmentVectorFloat) vector).get(),
+                ((MemorySegmentByteSequence) quantizedVector).get(),
+                (long) vector.length(),
+                alpha, x0, minValue, maxValue);
+    }
+
+    @Override
+    public float nvqDotProduct8bit(VectorFloat<?> vector, ByteSequence<?> quantizedVector,
+                                   float alpha, float x0, float minValue, float maxValue) {
+        return NativeSimdOps.nvq_dot_product_8bit(
+                ((MemorySegmentVectorFloat) vector).get(),
+                ((MemorySegmentByteSequence) quantizedVector).get(),
+                (long) vector.length(),
+                alpha, x0, minValue, maxValue);
+    }
+
+    @Override
+    public float[] nvqCosine8bit(VectorFloat<?> vector, ByteSequence<?> quantizedVector,
+                                 float alpha, float x0, float minValue, float maxValue,
+                                 VectorFloat<?> centroid) {
+        long packed = NativeSimdOps.nvq_cosine_8bit_packed(
+                ((MemorySegmentVectorFloat) vector).get(),
+                ((MemorySegmentByteSequence) quantizedVector).get(),
+                (long) vector.length(),
+                alpha, x0, minValue, maxValue,
+                ((MemorySegmentVectorFloat) centroid).get());
+        float sum  = Float.intBitsToFloat((int)(packed & 0xFFFFFFFFL));
+        float bMag = Float.intBitsToFloat((int)(packed >>> 32));
+        return new float[]{sum, bMag};
     }
 }
diff --git a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorizationProvider.java b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorizationProvider.java
index 5194e1108..f73f237a6 100644
--- a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorizationProvider.java
+++ b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorizationProvider.java
@@ -18,7 +18,6 @@
 
 import io.github.jbellis.jvector.annotations.Experimental;
 import io.github.jbellis.jvector.vector.cnative.LibraryLoader;
-import io.github.jbellis.jvector.vector.cnative.NativeSimdOps;
 import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
 
 /**
@@ -31,13 +30,14 @@ public class NativeVectorizationProvider extends VectorizationProvider {
     private final VectorTypeSupport vectorTypeSupport;
 
     public NativeVectorizationProvider() {
+        var arch = System.getProperty("os.arch", "");
+        if (!arch.equals("amd64") && !arch.equals("x86_64")) {
+            throw new UnsupportedOperationException("Native SIMD operations are only supported on x86_64.");
+        }
         var libraryLoaded = LibraryLoader.loadJvector();
         if (!libraryLoaded) {
             throw new UnsupportedOperationException("Failed to load supporting native library.");
         }
-        if (!NativeSimdOps.check_avx512_compatibility()) {
-            throw new UnsupportedOperationException("Native SIMD operations are not supported on this platform due to missing CPU support.");
-        }
         this.vectorUtilSupport = new NativeVectorUtilSupport();
         this.vectorTypeSupport = new MemorySegmentVectorProvider();
     }
diff --git a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java
index 5bf5b2f01..fc65c56b6 100644
--- a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java
+++ b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java
@@ -22,15 +22,15 @@ public class NativeSimdOps {
     static final boolean TRACE_DOWNCALLS = Boolean.getBoolean("jextract.trace.downcalls");
 
     static void traceDowncall(String name, Object... args) {
-        String traceArgs = Arrays.stream(args)
-                .map(Object::toString)
-                .collect(Collectors.joining(", "));
-        System.out.printf("%s(%s)\n", name, traceArgs);
+         String traceArgs = Arrays.stream(args)
+                       .map(Object::toString)
+                       .collect(Collectors.joining(", "));
+         System.out.printf("%s(%s)\n", name, traceArgs);
     }
 
     static MemorySegment findOrThrow(String symbol) {
         return SYMBOL_LOOKUP.find(symbol)
-                .orElseThrow(() -> new UnsatisfiedLinkError("unresolved symbol: " + symbol));
+            .orElseThrow(() -> new UnsatisfiedLinkError("unresolved symbol: " + symbol));
     }
 
     static MethodHandle upcallHandle(Class<?> fi, String name, FunctionDescriptor fdesc) {
@@ -95,756 +95,2946 @@ public static int false_() {
     public static int __bool_true_false_are_defined() {
         return __bool_true_false_are_defined;
     }
+    private static final int _STDINT_H = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _STDINT_H 1
+     * }
+     */
+    public static int _STDINT_H() {
+        return _STDINT_H;
+    }
+    private static final int _FEATURES_H = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _FEATURES_H 1
+     * }
+     */
+    public static int _FEATURES_H() {
+        return _FEATURES_H;
+    }
+    private static final int _DEFAULT_SOURCE = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _DEFAULT_SOURCE 1
+     * }
+     */
+    public static int _DEFAULT_SOURCE() {
+        return _DEFAULT_SOURCE;
+    }
+    private static final int __GLIBC_USE_ISOC2X = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_USE_ISOC2X 0
+     * }
+     */
+    public static int __GLIBC_USE_ISOC2X() {
+        return __GLIBC_USE_ISOC2X;
+    }
+    private static final int __USE_ISOC11 = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_ISOC11 1
+     * }
+     */
+    public static int __USE_ISOC11() {
+        return __USE_ISOC11;
+    }
+    private static final int __USE_ISOC99 = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_ISOC99 1
+     * }
+     */
+    public static int __USE_ISOC99() {
+        return __USE_ISOC99;
+    }
+    private static final int __USE_ISOC95 = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_ISOC95 1
+     * }
+     */
+    public static int __USE_ISOC95() {
+        return __USE_ISOC95;
+    }
+    private static final int __USE_POSIX_IMPLICITLY = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_POSIX_IMPLICITLY 1
+     * }
+     */
+    public static int __USE_POSIX_IMPLICITLY() {
+        return __USE_POSIX_IMPLICITLY;
+    }
+    private static final int _POSIX_SOURCE = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _POSIX_SOURCE 1
+     * }
+     */
+    public static int _POSIX_SOURCE() {
+        return _POSIX_SOURCE;
+    }
+    private static final int __USE_POSIX = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_POSIX 1
+     * }
+     */
+    public static int __USE_POSIX() {
+        return __USE_POSIX;
+    }
+    private static final int __USE_POSIX2 = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_POSIX2 1
+     * }
+     */
+    public static int __USE_POSIX2() {
+        return __USE_POSIX2;
+    }
+    private static final int __USE_POSIX199309 = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_POSIX199309 1
+     * }
+     */
+    public static int __USE_POSIX199309() {
+        return __USE_POSIX199309;
+    }
+    private static final int __USE_POSIX199506 = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_POSIX199506 1
+     * }
+     */
+    public static int __USE_POSIX199506() {
+        return __USE_POSIX199506;
+    }
+    private static final int __USE_XOPEN2K = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_XOPEN2K 1
+     * }
+     */
+    public static int __USE_XOPEN2K() {
+        return __USE_XOPEN2K;
+    }
+    private static final int __USE_XOPEN2K8 = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_XOPEN2K8 1
+     * }
+     */
+    public static int __USE_XOPEN2K8() {
+        return __USE_XOPEN2K8;
+    }
+    private static final int _ATFILE_SOURCE = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _ATFILE_SOURCE 1
+     * }
+     */
+    public static int _ATFILE_SOURCE() {
+        return _ATFILE_SOURCE;
+    }
+    private static final int __WORDSIZE = (int)64L;
+    /**
+     * {@snippet lang=c :
+     * #define __WORDSIZE 64
+     * }
+     */
+    public static int __WORDSIZE() {
+        return __WORDSIZE;
+    }
+    private static final int __WORDSIZE_TIME64_COMPAT32 = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __WORDSIZE_TIME64_COMPAT32 1
+     * }
+     */
+    public static int __WORDSIZE_TIME64_COMPAT32() {
+        return __WORDSIZE_TIME64_COMPAT32;
+    }
+    private static final int __SYSCALL_WORDSIZE = (int)64L;
+    /**
+     * {@snippet lang=c :
+     * #define __SYSCALL_WORDSIZE 64
+     * }
+     */
+    public static int __SYSCALL_WORDSIZE() {
+        return __SYSCALL_WORDSIZE;
+    }
+    private static final int __USE_MISC = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_MISC 1
+     * }
+     */
+    public static int __USE_MISC() {
+        return __USE_MISC;
+    }
+    private static final int __USE_ATFILE = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_ATFILE 1
+     * }
+     */
+    public static int __USE_ATFILE() {
+        return __USE_ATFILE;
+    }
+    private static final int __USE_FORTIFY_LEVEL = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __USE_FORTIFY_LEVEL 0
+     * }
+     */
+    public static int __USE_FORTIFY_LEVEL() {
+        return __USE_FORTIFY_LEVEL;
+    }
+    private static final int __GLIBC_USE_DEPRECATED_GETS = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_USE_DEPRECATED_GETS 0
+     * }
+     */
+    public static int __GLIBC_USE_DEPRECATED_GETS() {
+        return __GLIBC_USE_DEPRECATED_GETS;
+    }
+    private static final int __GLIBC_USE_DEPRECATED_SCANF = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_USE_DEPRECATED_SCANF 0
+     * }
+     */
+    public static int __GLIBC_USE_DEPRECATED_SCANF() {
+        return __GLIBC_USE_DEPRECATED_SCANF;
+    }
+    private static final int __GLIBC_USE_C2X_STRTOL = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_USE_C2X_STRTOL 0
+     * }
+     */
+    public static int __GLIBC_USE_C2X_STRTOL() {
+        return __GLIBC_USE_C2X_STRTOL;
+    }
+    private static final int _STDC_PREDEF_H = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _STDC_PREDEF_H 1
+     * }
+     */
+    public static int _STDC_PREDEF_H() {
+        return _STDC_PREDEF_H;
+    }
+    private static final int __STDC_IEC_559__ = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __STDC_IEC_559__ 1
+     * }
+     */
+    public static int __STDC_IEC_559__() {
+        return __STDC_IEC_559__;
+    }
+    private static final int __STDC_IEC_559_COMPLEX__ = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __STDC_IEC_559_COMPLEX__ 1
+     * }
+     */
+    public static int __STDC_IEC_559_COMPLEX__() {
+        return __STDC_IEC_559_COMPLEX__;
+    }
+    private static final int __GNU_LIBRARY__ = (int)6L;
+    /**
+     * {@snippet lang=c :
+     * #define __GNU_LIBRARY__ 6
+     * }
+     */
+    public static int __GNU_LIBRARY__() {
+        return __GNU_LIBRARY__;
+    }
+    private static final int __GLIBC__ = (int)2L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC__ 2
+     * }
+     */
+    public static int __GLIBC__() {
+        return __GLIBC__;
+    }
+    private static final int __GLIBC_MINOR__ = (int)39L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_MINOR__ 39
+     * }
+     */
+    public static int __GLIBC_MINOR__() {
+        return __GLIBC_MINOR__;
+    }
+    private static final int _SYS_CDEFS_H = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _SYS_CDEFS_H 1
+     * }
+     */
+    public static int _SYS_CDEFS_H() {
+        return _SYS_CDEFS_H;
+    }
+    private static final int __glibc_c99_flexarr_available = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __glibc_c99_flexarr_available 1
+     * }
+     */
+    public static int __glibc_c99_flexarr_available() {
+        return __glibc_c99_flexarr_available;
+    }
+    private static final int __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI 0
+     * }
+     */
+    public static int __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI() {
+        return __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI;
+    }
+    private static final int __HAVE_GENERIC_SELECTION = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __HAVE_GENERIC_SELECTION 1
+     * }
+     */
+    public static int __HAVE_GENERIC_SELECTION() {
+        return __HAVE_GENERIC_SELECTION;
+    }
+    private static final int __GLIBC_USE_LIB_EXT2 = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_USE_LIB_EXT2 0
+     * }
+     */
+    public static int __GLIBC_USE_LIB_EXT2() {
+        return __GLIBC_USE_LIB_EXT2;
+    }
+    private static final int __GLIBC_USE_IEC_60559_BFP_EXT = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_USE_IEC_60559_BFP_EXT 0
+     * }
+     */
+    public static int __GLIBC_USE_IEC_60559_BFP_EXT() {
+        return __GLIBC_USE_IEC_60559_BFP_EXT;
+    }
+    private static final int __GLIBC_USE_IEC_60559_BFP_EXT_C2X = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_USE_IEC_60559_BFP_EXT_C2X 0
+     * }
+     */
+    public static int __GLIBC_USE_IEC_60559_BFP_EXT_C2X() {
+        return __GLIBC_USE_IEC_60559_BFP_EXT_C2X;
+    }
+    private static final int __GLIBC_USE_IEC_60559_EXT = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_USE_IEC_60559_EXT 0
+     * }
+     */
+    public static int __GLIBC_USE_IEC_60559_EXT() {
+        return __GLIBC_USE_IEC_60559_EXT;
+    }
+    private static final int __GLIBC_USE_IEC_60559_FUNCS_EXT = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_USE_IEC_60559_FUNCS_EXT 0
+     * }
+     */
+    public static int __GLIBC_USE_IEC_60559_FUNCS_EXT() {
+        return __GLIBC_USE_IEC_60559_FUNCS_EXT;
+    }
+    private static final int __GLIBC_USE_IEC_60559_FUNCS_EXT_C2X = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_USE_IEC_60559_FUNCS_EXT_C2X 0
+     * }
+     */
+    public static int __GLIBC_USE_IEC_60559_FUNCS_EXT_C2X() {
+        return __GLIBC_USE_IEC_60559_FUNCS_EXT_C2X;
+    }
+    private static final int __GLIBC_USE_IEC_60559_TYPES_EXT = (int)0L;
+    /**
+     * {@snippet lang=c :
+     * #define __GLIBC_USE_IEC_60559_TYPES_EXT 0
+     * }
+     */
+    public static int __GLIBC_USE_IEC_60559_TYPES_EXT() {
+        return __GLIBC_USE_IEC_60559_TYPES_EXT;
+    }
+    private static final int _BITS_TYPES_H = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _BITS_TYPES_H 1
+     * }
+     */
+    public static int _BITS_TYPES_H() {
+        return _BITS_TYPES_H;
+    }
+    private static final int _BITS_TYPESIZES_H = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _BITS_TYPESIZES_H 1
+     * }
+     */
+    public static int _BITS_TYPESIZES_H() {
+        return _BITS_TYPESIZES_H;
+    }
+    private static final int __OFF_T_MATCHES_OFF64_T = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __OFF_T_MATCHES_OFF64_T 1
+     * }
+     */
+    public static int __OFF_T_MATCHES_OFF64_T() {
+        return __OFF_T_MATCHES_OFF64_T;
+    }
+    private static final int __INO_T_MATCHES_INO64_T = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __INO_T_MATCHES_INO64_T 1
+     * }
+     */
+    public static int __INO_T_MATCHES_INO64_T() {
+        return __INO_T_MATCHES_INO64_T;
+    }
+    private static final int __RLIM_T_MATCHES_RLIM64_T = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __RLIM_T_MATCHES_RLIM64_T 1
+     * }
+     */
+    public static int __RLIM_T_MATCHES_RLIM64_T() {
+        return __RLIM_T_MATCHES_RLIM64_T;
+    }
+    private static final int __STATFS_MATCHES_STATFS64 = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __STATFS_MATCHES_STATFS64 1
+     * }
+     */
+    public static int __STATFS_MATCHES_STATFS64() {
+        return __STATFS_MATCHES_STATFS64;
+    }
+    private static final int __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64 = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64 1
+     * }
+     */
+    public static int __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64() {
+        return __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64;
+    }
+    private static final int __FD_SETSIZE = (int)1024L;
+    /**
+     * {@snippet lang=c :
+     * #define __FD_SETSIZE 1024
+     * }
+     */
+    public static int __FD_SETSIZE() {
+        return __FD_SETSIZE;
+    }
+    private static final int _BITS_TIME64_H = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _BITS_TIME64_H 1
+     * }
+     */
+    public static int _BITS_TIME64_H() {
+        return _BITS_TIME64_H;
+    }
+    private static final int _BITS_WCHAR_H = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _BITS_WCHAR_H 1
+     * }
+     */
+    public static int _BITS_WCHAR_H() {
+        return _BITS_WCHAR_H;
+    }
+    private static final int _BITS_STDINT_INTN_H = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _BITS_STDINT_INTN_H 1
+     * }
+     */
+    public static int _BITS_STDINT_INTN_H() {
+        return _BITS_STDINT_INTN_H;
+    }
+    private static final int _BITS_STDINT_UINTN_H = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _BITS_STDINT_UINTN_H 1
+     * }
+     */
+    public static int _BITS_STDINT_UINTN_H() {
+        return _BITS_STDINT_UINTN_H;
+    }
+    private static final int _BITS_STDINT_LEAST_H = (int)1L;
+    /**
+     * {@snippet lang=c :
+     * #define _BITS_STDINT_LEAST_H 1
+     * }
+     */
+    public static int _BITS_STDINT_LEAST_H() {
+        return _BITS_STDINT_LEAST_H;
+    }
+    /**
+     * {@snippet lang=c :
+     * typedef long ptrdiff_t
+     * }
+     */
+    public static final OfLong ptrdiff_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long size_t
+     * }
+     */
+    public static final OfLong size_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef int wchar_t
+     * }
+     */
+    public static final OfInt wchar_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned char __u_char
+     * }
+     */
+    public static final OfByte __u_char = NativeSimdOps.C_CHAR;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned short __u_short
+     * }
+     */
+    public static final OfShort __u_short = NativeSimdOps.C_SHORT;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned int __u_int
+     * }
+     */
+    public static final OfInt __u_int = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __u_long
+     * }
+     */
+    public static final OfLong __u_long = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef signed char __int8_t
+     * }
+     */
+    public static final OfByte __int8_t = NativeSimdOps.C_CHAR;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned char __uint8_t
+     * }
+     */
+    public static final OfByte __uint8_t = NativeSimdOps.C_CHAR;
+    /**
+     * {@snippet lang=c :
+     * typedef short __int16_t
+     * }
+     */
+    public static final OfShort __int16_t = NativeSimdOps.C_SHORT;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned short __uint16_t
+     * }
+     */
+    public static final OfShort __uint16_t = NativeSimdOps.C_SHORT;
+    /**
+     * {@snippet lang=c :
+     * typedef int __int32_t
+     * }
+     */
+    public static final OfInt __int32_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned int __uint32_t
+     * }
+     */
+    public static final OfInt __uint32_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef long __int64_t
+     * }
+     */
+    public static final OfLong __int64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __uint64_t
+     * }
+     */
+    public static final OfLong __uint64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef __int8_t __int_least8_t
+     * }
+     */
+    public static final OfByte __int_least8_t = NativeSimdOps.C_CHAR;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint8_t __uint_least8_t
+     * }
+     */
+    public static final OfByte __uint_least8_t = NativeSimdOps.C_CHAR;
+    /**
+     * {@snippet lang=c :
+     * typedef __int16_t __int_least16_t
+     * }
+     */
+    public static final OfShort __int_least16_t = NativeSimdOps.C_SHORT;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint16_t __uint_least16_t
+     * }
+     */
+    public static final OfShort __uint_least16_t = NativeSimdOps.C_SHORT;
+    /**
+     * {@snippet lang=c :
+     * typedef __int32_t __int_least32_t
+     * }
+     */
+    public static final OfInt __int_least32_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint32_t __uint_least32_t
+     * }
+     */
+    public static final OfInt __uint_least32_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef __int64_t __int_least64_t
+     * }
+     */
+    public static final OfLong __int_least64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint64_t __uint_least64_t
+     * }
+     */
+    public static final OfLong __uint_least64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long __quad_t
+     * }
+     */
+    public static final OfLong __quad_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __u_quad_t
+     * }
+     */
+    public static final OfLong __u_quad_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long __intmax_t
+     * }
+     */
+    public static final OfLong __intmax_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __uintmax_t
+     * }
+     */
+    public static final OfLong __uintmax_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __dev_t
+     * }
+     */
+    public static final OfLong __dev_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned int __uid_t
+     * }
+     */
+    public static final OfInt __uid_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned int __gid_t
+     * }
+     */
+    public static final OfInt __gid_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __ino_t
+     * }
+     */
+    public static final OfLong __ino_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __ino64_t
+     * }
+     */
+    public static final OfLong __ino64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned int __mode_t
+     * }
+     */
+    public static final OfInt __mode_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __nlink_t
+     * }
+     */
+    public static final OfLong __nlink_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long __off_t
+     * }
+     */
+    public static final OfLong __off_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long __off64_t
+     * }
+     */
+    public static final OfLong __off64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef int __pid_t
+     * }
+     */
+    public static final OfInt __pid_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef long __clock_t
+     * }
+     */
+    public static final OfLong __clock_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __rlim_t
+     * }
+     */
+    public static final OfLong __rlim_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __rlim64_t
+     * }
+     */
+    public static final OfLong __rlim64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned int __id_t
+     * }
+     */
+    public static final OfInt __id_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef long __time_t
+     * }
+     */
+    public static final OfLong __time_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned int __useconds_t
+     * }
+     */
+    public static final OfInt __useconds_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef long __suseconds_t
+     * }
+     */
+    public static final OfLong __suseconds_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long __suseconds64_t
+     * }
+     */
+    public static final OfLong __suseconds64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef int __daddr_t
+     * }
+     */
+    public static final OfInt __daddr_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef int __key_t
+     * }
+     */
+    public static final OfInt __key_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef int __clockid_t
+     * }
+     */
+    public static final OfInt __clockid_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef void *__timer_t
+     * }
+     */
+    public static final AddressLayout __timer_t = NativeSimdOps.C_POINTER;
+    /**
+     * {@snippet lang=c :
+     * typedef long __blksize_t
+     * }
+     */
+    public static final OfLong __blksize_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long __blkcnt_t
+     * }
+     */
+    public static final OfLong __blkcnt_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long __blkcnt64_t
+     * }
+     */
+    public static final OfLong __blkcnt64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __fsblkcnt_t
+     * }
+     */
+    public static final OfLong __fsblkcnt_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __fsblkcnt64_t
+     * }
+     */
+    public static final OfLong __fsblkcnt64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __fsfilcnt_t
+     * }
+     */
+    public static final OfLong __fsfilcnt_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __fsfilcnt64_t
+     * }
+     */
+    public static final OfLong __fsfilcnt64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long __fsword_t
+     * }
+     */
+    public static final OfLong __fsword_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long __ssize_t
+     * }
+     */
+    public static final OfLong __ssize_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long __syscall_slong_t
+     * }
+     */
+    public static final OfLong __syscall_slong_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long __syscall_ulong_t
+     * }
+     */
+    public static final OfLong __syscall_ulong_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef __off64_t __loff_t
+     * }
+     */
+    public static final OfLong __loff_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef char *__caddr_t
+     * }
+     */
+    public static final AddressLayout __caddr_t = NativeSimdOps.C_POINTER;
+    /**
+     * {@snippet lang=c :
+     * typedef long __intptr_t
+     * }
+     */
+    public static final OfLong __intptr_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned int __socklen_t
+     * }
+     */
+    public static final OfInt __socklen_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef int __sig_atomic_t
+     * }
+     */
+    public static final OfInt __sig_atomic_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef __int8_t int8_t
+     * }
+     */
+    public static final OfByte int8_t = NativeSimdOps.C_CHAR;
+    /**
+     * {@snippet lang=c :
+     * typedef __int16_t int16_t
+     * }
+     */
+    public static final OfShort int16_t = NativeSimdOps.C_SHORT;
+    /**
+     * {@snippet lang=c :
+     * typedef __int32_t int32_t
+     * }
+     */
+    public static final OfInt int32_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef __int64_t int64_t
+     * }
+     */
+    public static final OfLong int64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint8_t uint8_t
+     * }
+     */
+    public static final OfByte uint8_t = NativeSimdOps.C_CHAR;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint16_t uint16_t
+     * }
+     */
+    public static final OfShort uint16_t = NativeSimdOps.C_SHORT;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint32_t uint32_t
+     * }
+     */
+    public static final OfInt uint32_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint64_t uint64_t
+     * }
+     */
+    public static final OfLong uint64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef __int_least8_t int_least8_t
+     * }
+     */
+    public static final OfByte int_least8_t = NativeSimdOps.C_CHAR;
+    /**
+     * {@snippet lang=c :
+     * typedef __int_least16_t int_least16_t
+     * }
+     */
+    public static final OfShort int_least16_t = NativeSimdOps.C_SHORT;
+    /**
+     * {@snippet lang=c :
+     * typedef __int_least32_t int_least32_t
+     * }
+     */
+    public static final OfInt int_least32_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef __int_least64_t int_least64_t
+     * }
+     */
+    public static final OfLong int_least64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint_least8_t uint_least8_t
+     * }
+     */
+    public static final OfByte uint_least8_t = NativeSimdOps.C_CHAR;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint_least16_t uint_least16_t
+     * }
+     */
+    public static final OfShort uint_least16_t = NativeSimdOps.C_SHORT;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint_least32_t uint_least32_t
+     * }
+     */
+    public static final OfInt uint_least32_t = NativeSimdOps.C_INT;
+    /**
+     * {@snippet lang=c :
+     * typedef __uint_least64_t uint_least64_t
+     * }
+     */
+    public static final OfLong uint_least64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef signed char int_fast8_t
+     * }
+     */
+    public static final OfByte int_fast8_t = NativeSimdOps.C_CHAR;
+    /**
+     * {@snippet lang=c :
+     * typedef long int_fast16_t
+     * }
+     */
+    public static final OfLong int_fast16_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long int_fast32_t
+     * }
+     */
+    public static final OfLong int_fast32_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long int_fast64_t
+     * }
+     */
+    public static final OfLong int_fast64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned char uint_fast8_t
+     * }
+     */
+    public static final OfByte uint_fast8_t = NativeSimdOps.C_CHAR;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long uint_fast16_t
+     * }
+     */
+    public static final OfLong uint_fast16_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long uint_fast32_t
+     * }
+     */
+    public static final OfLong uint_fast32_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long uint_fast64_t
+     * }
+     */
+    public static final OfLong uint_fast64_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef long intptr_t
+     * }
+     */
+    public static final OfLong intptr_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef unsigned long uintptr_t
+     * }
+     */
+    public static final OfLong uintptr_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef __intmax_t intmax_t
+     * }
+     */
+    public static final OfLong intmax_t = NativeSimdOps.C_LONG;
+    /**
+     * {@snippet lang=c :
+     * typedef __uintmax_t uintmax_t
+     * }
+     */
+    public static final OfLong uintmax_t = NativeSimdOps.C_LONG;
+
+    private static class assemble_and_sum_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_LONG
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("assemble_and_sum_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * float assemble_and_sum_f32(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength)
+     * }
+     */
+    public static FunctionDescriptor assemble_and_sum_f32$descriptor() {
+        return assemble_and_sum_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * float assemble_and_sum_f32(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength)
+     * }
+     */
+    public static MethodHandle assemble_and_sum_f32$handle() {
+        return assemble_and_sum_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * float assemble_and_sum_f32(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength)
+     * }
+     */
+    public static MemorySegment assemble_and_sum_f32$address() {
+        return assemble_and_sum_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * float assemble_and_sum_f32(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength)
+     * }
+     */
+    public static float assemble_and_sum_f32(MemorySegment data, int dataBase, MemorySegment baseOffsets, int baseOffsetsOffset, long baseOffsetsLength) {
+        var mh$ = assemble_and_sum_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("assemble_and_sum_f32", data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength);
+            }
+            return (float)mh$.invokeExact(data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class assemble_and_sum_pq_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_INT
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("assemble_and_sum_pq_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * float assemble_and_sum_pq_f32(const float *data, size_t subspaceCount, const unsigned char *baseOffsets1, int baseOffsetsOffset1, const unsigned char *baseOffsets2, int baseOffsetsOffset2, int clusterCount)
+     * }
+     */
+    public static FunctionDescriptor assemble_and_sum_pq_f32$descriptor() {
+        return assemble_and_sum_pq_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * float assemble_and_sum_pq_f32(const float *data, size_t subspaceCount, const unsigned char *baseOffsets1, int baseOffsetsOffset1, const unsigned char *baseOffsets2, int baseOffsetsOffset2, int clusterCount)
+     * }
+     */
+    public static MethodHandle assemble_and_sum_pq_f32$handle() {
+        return assemble_and_sum_pq_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * float assemble_and_sum_pq_f32(const float *data, size_t subspaceCount, const unsigned char *baseOffsets1, int baseOffsetsOffset1, const unsigned char *baseOffsets2, int baseOffsetsOffset2, int clusterCount)
+     * }
+     */
+    public static MemorySegment assemble_and_sum_pq_f32$address() {
+        return assemble_and_sum_pq_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * float assemble_and_sum_pq_f32(const float *data, size_t subspaceCount, const unsigned char *baseOffsets1, int baseOffsetsOffset1, const unsigned char *baseOffsets2, int baseOffsetsOffset2, int clusterCount)
+     * }
+     */
+    public static float assemble_and_sum_pq_f32(MemorySegment data, long subspaceCount, MemorySegment baseOffsets1, int baseOffsetsOffset1, MemorySegment baseOffsets2, int baseOffsetsOffset2, int clusterCount) {
+        var mh$ = assemble_and_sum_pq_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("assemble_and_sum_pq_f32", data, subspaceCount, baseOffsets1, baseOffsetsOffset1, baseOffsets2, baseOffsetsOffset2, clusterCount);
+            }
+            return (float)mh$.invokeExact(data, subspaceCount, baseOffsets1, baseOffsetsOffset1, baseOffsets2, baseOffsetsOffset2, clusterCount);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class pq_decoded_cosine_similarity_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_FLOAT
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("pq_decoded_cosine_similarity_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude)
+     * }
+     */
+    public static FunctionDescriptor pq_decoded_cosine_similarity_f32$descriptor() {
+        return pq_decoded_cosine_similarity_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude)
+     * }
+     */
+    public static MethodHandle pq_decoded_cosine_similarity_f32$handle() {
+        return pq_decoded_cosine_similarity_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude)
+     * }
+     */
+    public static MemorySegment pq_decoded_cosine_similarity_f32$address() {
+        return pq_decoded_cosine_similarity_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude)
+     * }
+     */
+    public static float pq_decoded_cosine_similarity_f32(MemorySegment baseOffsets, int baseOffsetsOffset, long baseOffsetsLength, int clusterCount, MemorySegment partialSums, MemorySegment aMagnitude, float bMagnitude) {
+        var mh$ = pq_decoded_cosine_similarity_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("pq_decoded_cosine_similarity_f32", baseOffsets, baseOffsetsOffset, baseOffsetsLength, clusterCount, partialSums, aMagnitude, bMagnitude);
+            }
+            return (float)mh$.invokeExact(baseOffsets, baseOffsetsOffset, baseOffsetsLength, clusterCount, partialSums, aMagnitude, bMagnitude);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class calculate_partial_sums_euclidean_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_POINTER
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_euclidean_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * void calculate_partial_sums_euclidean_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * }
+     */
+    public static FunctionDescriptor calculate_partial_sums_euclidean_f32$descriptor() {
+        return calculate_partial_sums_euclidean_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * void calculate_partial_sums_euclidean_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * }
+     */
+    public static MethodHandle calculate_partial_sums_euclidean_f32$handle() {
+        return calculate_partial_sums_euclidean_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * void calculate_partial_sums_euclidean_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * }
+     */
+    public static MemorySegment calculate_partial_sums_euclidean_f32$address() {
+        return calculate_partial_sums_euclidean_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * void calculate_partial_sums_euclidean_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * }
+     */
+    public static void calculate_partial_sums_euclidean_f32(MemorySegment codebook, int codebookBase, long size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums) {
+        var mh$ = calculate_partial_sums_euclidean_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("calculate_partial_sums_euclidean_f32", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums);
+            }
+            mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class calculate_partial_sums_dot_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_POINTER
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_dot_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * void calculate_partial_sums_dot_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * }
+     */
+    public static FunctionDescriptor calculate_partial_sums_dot_f32$descriptor() {
+        return calculate_partial_sums_dot_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * void calculate_partial_sums_dot_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * }
+     */
+    public static MethodHandle calculate_partial_sums_dot_f32$handle() {
+        return calculate_partial_sums_dot_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * void calculate_partial_sums_dot_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * }
+     */
+    public static MemorySegment calculate_partial_sums_dot_f32$address() {
+        return calculate_partial_sums_dot_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * void calculate_partial_sums_dot_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * }
+     */
+    public static void calculate_partial_sums_dot_f32(MemorySegment codebook, int codebookBase, long size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums) {
+        var mh$ = calculate_partial_sums_dot_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("calculate_partial_sums_dot_f32", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums);
+            }
+            mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class calculate_partial_sums_self_magnitude_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_INT,
+            NativeSimdOps.C_POINTER
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_self_magnitude_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * void calculate_partial_sums_self_magnitude_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, float *partialSums)
+     * }
+     */
+    public static FunctionDescriptor calculate_partial_sums_self_magnitude_f32$descriptor() {
+        return calculate_partial_sums_self_magnitude_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * void calculate_partial_sums_self_magnitude_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, float *partialSums)
+     * }
+     */
+    public static MethodHandle calculate_partial_sums_self_magnitude_f32$handle() {
+        return calculate_partial_sums_self_magnitude_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * void calculate_partial_sums_self_magnitude_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, float *partialSums)
+     * }
+     */
+    public static MemorySegment calculate_partial_sums_self_magnitude_f32$address() {
+        return calculate_partial_sums_self_magnitude_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * void calculate_partial_sums_self_magnitude_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, float *partialSums)
+     * }
+     */
+    public static void calculate_partial_sums_self_magnitude_f32(MemorySegment codebook, int codebookBase, long size, int clusterCount, MemorySegment partialSums) {
+        var mh$ = calculate_partial_sums_self_magnitude_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("calculate_partial_sums_self_magnitude_f32", codebook, codebookBase, size, clusterCount, partialSums);
+            }
+            mh$.invokeExact(codebook, codebookBase, size, clusterCount, partialSums);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class dot_product_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_LONG
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("dot_product_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * float dot_product_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static FunctionDescriptor dot_product_f32$descriptor() {
+        return dot_product_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * float dot_product_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static MethodHandle dot_product_f32$handle() {
+        return dot_product_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * float dot_product_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static MemorySegment dot_product_f32$address() {
+        return dot_product_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * float dot_product_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static float dot_product_f32(MemorySegment a, long aoffset, MemorySegment b, long boffset, long length) {
+        var mh$ = dot_product_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("dot_product_f32", a, aoffset, b, boffset, length);
+            }
+            return (float)mh$.invokeExact(a, aoffset, b, boffset, length);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class cosine_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_LONG
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("cosine_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * float cosine_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static FunctionDescriptor cosine_f32$descriptor() {
+        return cosine_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * float cosine_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static MethodHandle cosine_f32$handle() {
+        return cosine_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * float cosine_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static MemorySegment cosine_f32$address() {
+        return cosine_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * float cosine_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static float cosine_f32(MemorySegment a, long aoffset, MemorySegment b, long boffset, long length) {
+        var mh$ = cosine_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("cosine_f32", a, aoffset, b, boffset, length);
+            }
+            return (float)mh$.invokeExact(a, aoffset, b, boffset, length);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class euclidean_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_LONG
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("euclidean_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * float euclidean_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static FunctionDescriptor euclidean_f32$descriptor() {
+        return euclidean_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * float euclidean_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static MethodHandle euclidean_f32$handle() {
+        return euclidean_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * float euclidean_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static MemorySegment euclidean_f32$address() {
+        return euclidean_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * float euclidean_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length)
+     * }
+     */
+    public static float euclidean_f32(MemorySegment a, long aoffset, MemorySegment b, long boffset, long length) {
+        var mh$ = euclidean_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("euclidean_f32", a, aoffset, b, boffset, length);
+            }
+            return (float)mh$.invokeExact(a, aoffset, b, boffset, length);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class nvq_quantize_8bit {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_quantize_8bit");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * void nvq_quantize_8bit(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, unsigned char *destination)
+     * }
+     */
+    public static FunctionDescriptor nvq_quantize_8bit$descriptor() {
+        return nvq_quantize_8bit.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * void nvq_quantize_8bit(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, unsigned char *destination)
+     * }
+     */
+    public static MethodHandle nvq_quantize_8bit$handle() {
+        return nvq_quantize_8bit.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * void nvq_quantize_8bit(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, unsigned char *destination)
+     * }
+     */
+    public static MemorySegment nvq_quantize_8bit$address() {
+        return nvq_quantize_8bit.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * void nvq_quantize_8bit(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, unsigned char *destination)
+     * }
+     */
+    public static void nvq_quantize_8bit(MemorySegment vector, long length, float alpha, float x0, float minValue, float maxValue, MemorySegment destination) {
+        var mh$ = nvq_quantize_8bit.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("nvq_quantize_8bit", vector, length, alpha, x0, minValue, maxValue, destination);
+            }
+            mh$.invokeExact(vector, length, alpha, x0, minValue, maxValue, destination);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class nvq_loss {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_INT
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_loss");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * float nvq_loss(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, int nBits)
+     * }
+     */
+    public static FunctionDescriptor nvq_loss$descriptor() {
+        return nvq_loss.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * float nvq_loss(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, int nBits)
+     * }
+     */
+    public static MethodHandle nvq_loss$handle() {
+        return nvq_loss.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * float nvq_loss(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, int nBits)
+     * }
+     */
+    public static MemorySegment nvq_loss$address() {
+        return nvq_loss.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * float nvq_loss(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, int nBits)
+     * }
+     */
+    public static float nvq_loss(MemorySegment vector, long length, float alpha, float x0, float minValue, float maxValue, int nBits) {
+        var mh$ = nvq_loss.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("nvq_loss", vector, length, alpha, x0, minValue, maxValue, nBits);
+            }
+            return (float)mh$.invokeExact(vector, length, alpha, x0, minValue, maxValue, nBits);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class nvq_uniform_loss {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_INT
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_uniform_loss");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * float nvq_uniform_loss(const float *vector, size_t length, float minValue, float maxValue, int nBits)
+     * }
+     */
+    public static FunctionDescriptor nvq_uniform_loss$descriptor() {
+        return nvq_uniform_loss.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * float nvq_uniform_loss(const float *vector, size_t length, float minValue, float maxValue, int nBits)
+     * }
+     */
+    public static MethodHandle nvq_uniform_loss$handle() {
+        return nvq_uniform_loss.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * float nvq_uniform_loss(const float *vector, size_t length, float minValue, float maxValue, int nBits)
+     * }
+     */
+    public static MemorySegment nvq_uniform_loss$address() {
+        return nvq_uniform_loss.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * float nvq_uniform_loss(const float *vector, size_t length, float minValue, float maxValue, int nBits)
+     * }
+     */
+    public static float nvq_uniform_loss(MemorySegment vector, long length, float minValue, float maxValue, int nBits) {
+        var mh$ = nvq_uniform_loss.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("nvq_uniform_loss", vector, length, minValue, maxValue, nBits);
+            }
+            return (float)mh$.invokeExact(vector, length, minValue, maxValue, nBits);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class nvq_square_l2_distance_8bit {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_square_l2_distance_8bit");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * float nvq_square_l2_distance_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue)
+     * }
+     */
+    public static FunctionDescriptor nvq_square_l2_distance_8bit$descriptor() {
+        return nvq_square_l2_distance_8bit.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * float nvq_square_l2_distance_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue)
+     * }
+     */
+    public static MethodHandle nvq_square_l2_distance_8bit$handle() {
+        return nvq_square_l2_distance_8bit.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * float nvq_square_l2_distance_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue)
+     * }
+     */
+    public static MemorySegment nvq_square_l2_distance_8bit$address() {
+        return nvq_square_l2_distance_8bit.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * float nvq_square_l2_distance_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue)
+     * }
+     */
+    public static float nvq_square_l2_distance_8bit(MemorySegment vector, MemorySegment quantized, long length, float alpha, float x0, float minValue, float maxValue) {
+        var mh$ = nvq_square_l2_distance_8bit.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("nvq_square_l2_distance_8bit", vector, quantized, length, alpha, x0, minValue, maxValue);
+            }
+            return (float)mh$.invokeExact(vector, quantized, length, alpha, x0, minValue, maxValue);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class nvq_dot_product_8bit {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_dot_product_8bit");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * float nvq_dot_product_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue)
+     * }
+     */
+    public static FunctionDescriptor nvq_dot_product_8bit$descriptor() {
+        return nvq_dot_product_8bit.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * float nvq_dot_product_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue)
+     * }
+     */
+    public static MethodHandle nvq_dot_product_8bit$handle() {
+        return nvq_dot_product_8bit.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * float nvq_dot_product_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue)
+     * }
+     */
+    public static MemorySegment nvq_dot_product_8bit$address() {
+        return nvq_dot_product_8bit.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * float nvq_dot_product_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue)
+     * }
+     */
+    public static float nvq_dot_product_8bit(MemorySegment vector, MemorySegment quantized, long length, float alpha, float x0, float minValue, float maxValue) {
+        var mh$ = nvq_dot_product_8bit.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("nvq_dot_product_8bit", vector, quantized, length, alpha, x0, minValue, maxValue);
+            }
+            return (float)mh$.invokeExact(vector, quantized, length, alpha, x0, minValue, maxValue);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class nvq_cosine_8bit_packed {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_cosine_8bit_packed");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * int64_t nvq_cosine_8bit_packed(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue, const float *centroid)
+     * }
+     */
+    public static FunctionDescriptor nvq_cosine_8bit_packed$descriptor() {
+        return nvq_cosine_8bit_packed.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * int64_t nvq_cosine_8bit_packed(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue, const float *centroid)
+     * }
+     */
+    public static MethodHandle nvq_cosine_8bit_packed$handle() {
+        return nvq_cosine_8bit_packed.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * int64_t nvq_cosine_8bit_packed(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue, const float *centroid)
+     * }
+     */
+    public static MemorySegment nvq_cosine_8bit_packed$address() {
+        return nvq_cosine_8bit_packed.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * int64_t nvq_cosine_8bit_packed(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue, const float *centroid)
+     * }
+     */
+    public static long nvq_cosine_8bit_packed(MemorySegment vector, MemorySegment quantized, long length, float alpha, float x0, float minValue, float maxValue, MemorySegment centroid) {
+        var mh$ = nvq_cosine_8bit_packed.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("nvq_cosine_8bit_packed", vector, quantized, length, alpha, x0, minValue, maxValue, centroid);
+            }
+            return (long)mh$.invokeExact(vector, quantized, length, alpha, x0, minValue, maxValue, centroid);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class nvq_shuffle_query_in_place_8bit {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_shuffle_query_in_place_8bit");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * void nvq_shuffle_query_in_place_8bit(float *vector, size_t length)
+     * }
+     */
+    public static FunctionDescriptor nvq_shuffle_query_in_place_8bit$descriptor() {
+        return nvq_shuffle_query_in_place_8bit.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * void nvq_shuffle_query_in_place_8bit(float *vector, size_t length)
+     * }
+     */
+    public static MethodHandle nvq_shuffle_query_in_place_8bit$handle() {
+        return nvq_shuffle_query_in_place_8bit.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * void nvq_shuffle_query_in_place_8bit(float *vector, size_t length)
+     * }
+     */
+    public static MemorySegment nvq_shuffle_query_in_place_8bit$address() {
+        return nvq_shuffle_query_in_place_8bit.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * void nvq_shuffle_query_in_place_8bit(float *vector, size_t length)
+     * }
+     */
+    public static void nvq_shuffle_query_in_place_8bit(MemorySegment vector, long length) {
+        var mh$ = nvq_shuffle_query_in_place_8bit.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("nvq_shuffle_query_in_place_8bit", vector, length);
+            }
+            mh$.invokeExact(vector, length);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class add_in_place_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("add_in_place_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
 
-    private static class check_avx512_compatibility {
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * void add_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static FunctionDescriptor add_in_place_f32$descriptor() {
+        return add_in_place_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * void add_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static MethodHandle add_in_place_f32$handle() {
+        return add_in_place_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * void add_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static MemorySegment add_in_place_f32$address() {
+        return add_in_place_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * void add_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static void add_in_place_f32(MemorySegment v1, MemorySegment v2, long length) {
+        var mh$ = add_in_place_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("add_in_place_f32", v1, v2, length);
+            }
+            mh$.invokeExact(v1, v2, length);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class add_scalar_in_place_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_LONG
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("add_scalar_in_place_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * void add_scalar_in_place_f32(float *v1, float value, size_t length)
+     * }
+     */
+    public static FunctionDescriptor add_scalar_in_place_f32$descriptor() {
+        return add_scalar_in_place_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * void add_scalar_in_place_f32(float *v1, float value, size_t length)
+     * }
+     */
+    public static MethodHandle add_scalar_in_place_f32$handle() {
+        return add_scalar_in_place_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * void add_scalar_in_place_f32(float *v1, float value, size_t length)
+     * }
+     */
+    public static MemorySegment add_scalar_in_place_f32$address() {
+        return add_scalar_in_place_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * void add_scalar_in_place_f32(float *v1, float value, size_t length)
+     * }
+     */
+    public static void add_scalar_in_place_f32(MemorySegment v1, float value, long length) {
+        var mh$ = add_scalar_in_place_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("add_scalar_in_place_f32", v1, value, length);
+            }
+            mh$.invokeExact(v1, value, length);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class sub_in_place_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("sub_in_place_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * void sub_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static FunctionDescriptor sub_in_place_f32$descriptor() {
+        return sub_in_place_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * void sub_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static MethodHandle sub_in_place_f32$handle() {
+        return sub_in_place_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * void sub_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static MemorySegment sub_in_place_f32$address() {
+        return sub_in_place_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * void sub_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static void sub_in_place_f32(MemorySegment v1, MemorySegment v2, long length) {
+        var mh$ = sub_in_place_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("sub_in_place_f32", v1, v2, length);
+            }
+            mh$.invokeExact(v1, v2, length);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class sub_scalar_in_place_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_LONG
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("sub_scalar_in_place_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * void sub_scalar_in_place_f32(float *v1, float value, size_t length)
+     * }
+     */
+    public static FunctionDescriptor sub_scalar_in_place_f32$descriptor() {
+        return sub_scalar_in_place_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * void sub_scalar_in_place_f32(float *v1, float value, size_t length)
+     * }
+     */
+    public static MethodHandle sub_scalar_in_place_f32$handle() {
+        return sub_scalar_in_place_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * void sub_scalar_in_place_f32(float *v1, float value, size_t length)
+     * }
+     */
+    public static MemorySegment sub_scalar_in_place_f32$address() {
+        return sub_scalar_in_place_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * void sub_scalar_in_place_f32(float *v1, float value, size_t length)
+     * }
+     */
+    public static void sub_scalar_in_place_f32(MemorySegment v1, float value, long length) {
+        var mh$ = sub_scalar_in_place_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("sub_scalar_in_place_f32", v1, value, length);
+            }
+            mh$.invokeExact(v1, value, length);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class max_f32 {
         public static final FunctionDescriptor DESC = FunctionDescriptor.of(
-                NativeSimdOps.C_BOOL    );
+            NativeSimdOps.C_FLOAT,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG
+        );
+
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("max_f32");
+
+        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    }
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * float max_f32(const float *v, size_t length)
+     * }
+     */
+    public static FunctionDescriptor max_f32$descriptor() {
+        return max_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * float max_f32(const float *v, size_t length)
+     * }
+     */
+    public static MethodHandle max_f32$handle() {
+        return max_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * float max_f32(const float *v, size_t length)
+     * }
+     */
+    public static MemorySegment max_f32$address() {
+        return max_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * float max_f32(const float *v, size_t length)
+     * }
+     */
+    public static float max_f32(MemorySegment v, long length) {
+        var mh$ = max_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("max_f32", v, length);
+            }
+            return (float)mh$.invokeExact(v, length);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+
+    private static class min_in_place_f32 {
+        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_POINTER,
+            NativeSimdOps.C_LONG
+        );
 
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("check_avx512_compatibility");
+        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("min_in_place_f32");
 
         public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
     }
-
+
+    /**
+     * Function descriptor for:
+     * {@snippet lang=c :
+     * void min_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static FunctionDescriptor min_in_place_f32$descriptor() {
+        return min_in_place_f32.DESC;
+    }
+
+    /**
+     * Downcall method handle for:
+     * {@snippet lang=c :
+     * void min_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static MethodHandle min_in_place_f32$handle() {
+        return min_in_place_f32.HANDLE;
+    }
+
+    /**
+     * Address for:
+     * {@snippet lang=c :
+     * void min_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static MemorySegment min_in_place_f32$address() {
+        return min_in_place_f32.ADDR;
+    }
+
+    /**
+     * {@snippet lang=c :
+     * void min_in_place_f32(float *v1, const float *v2, size_t length)
+     * }
+     */
+    public static void min_in_place_f32(MemorySegment v1, MemorySegment v2, long length) {
+        var mh$ = min_in_place_f32.HANDLE;
+        try {
+            if (TRACE_DOWNCALLS) {
+                traceDowncall("min_in_place_f32", v1, v2, length);
+            }
+            mh$.invokeExact(v1, v2, length);
+        } catch (Throwable ex$) {
+           throw new AssertionError("should not reach here", ex$);
+        }
+    }
+    private static final MemorySegment NULL = MemorySegment.ofAddress(0L);
+    /**
+     * {@snippet lang=c :
+     * #define NULL (void*) 0
+     * }
+     */
+    public static MemorySegment NULL() {
+        return NULL;
+    }
+    private static final long _POSIX_C_SOURCE = 200809L;
+    /**
+     * {@snippet lang=c :
+     * #define _POSIX_C_SOURCE 200809
+     * }
+     */
+    public static long _POSIX_C_SOURCE() {
+        return _POSIX_C_SOURCE;
+    }
+    private static final int __TIMESIZE = (int)64L;
+    /**
+     * {@snippet lang=c :
+     * #define __TIMESIZE 64
+     * }
+     */
+    public static int __TIMESIZE() {
+        return __TIMESIZE;
+    }
+    private static final long __STDC_IEC_60559_BFP__ = 201404L;
+    /**
+     * {@snippet lang=c :
+     * #define __STDC_IEC_60559_BFP__ 201404
+     * }
+     */
+    public static long __STDC_IEC_60559_BFP__() {
+        return __STDC_IEC_60559_BFP__;
+    }
+    private static final long __STDC_IEC_60559_COMPLEX__ = 201404L;
+    /**
+     * {@snippet lang=c :
+     * #define __STDC_IEC_60559_COMPLEX__ 201404
+     * }
+     */
+    public static long __STDC_IEC_60559_COMPLEX__() {
+        return __STDC_IEC_60559_COMPLEX__;
+    }
+    private static final long __STDC_ISO_10646__ = 201706L;
+    /**
+     * {@snippet lang=c :
+     * #define __STDC_ISO_10646__ 201706
+     * }
+     */
+    public static long __STDC_ISO_10646__() {
+        return __STDC_ISO_10646__;
+    }
+    private static final int __WCHAR_MAX = (int)2147483647L;
+    /**
+     * {@snippet lang=c :
+     * #define __WCHAR_MAX 2147483647
+     * }
+     */
+    public static int __WCHAR_MAX() {
+        return __WCHAR_MAX;
+    }
+    private static final int __WCHAR_MIN = (int)-2147483648L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * _Bool check_avx512_compatibility()
+     * #define __WCHAR_MIN -2147483648
      * }
      */
-    public static FunctionDescriptor check_avx512_compatibility$descriptor() {
-        return check_avx512_compatibility.DESC;
+    public static int __WCHAR_MIN() {
+        return __WCHAR_MIN;
     }
-
+    private static final int INT8_MIN = (int)-128L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * _Bool check_avx512_compatibility()
+     * #define INT8_MIN -128
      * }
      */
-    public static MethodHandle check_avx512_compatibility$handle() {
-        return check_avx512_compatibility.HANDLE;
+    public static int INT8_MIN() {
+        return INT8_MIN;
     }
-
+    private static final int INT16_MIN = (int)-32768L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * _Bool check_avx512_compatibility()
+     * #define INT16_MIN -32768
      * }
      */
-    public static MemorySegment check_avx512_compatibility$address() {
-        return check_avx512_compatibility.ADDR;
+    public static int INT16_MIN() {
+        return INT16_MIN;
     }
-
+    private static final int INT32_MIN = (int)-2147483648L;
     /**
      * {@snippet lang=c :
-     * _Bool check_avx512_compatibility()
+     * #define INT32_MIN -2147483648
      * }
      */
-    public static boolean check_avx512_compatibility() {
-        var mh$ = check_avx512_compatibility.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("check_avx512_compatibility");
-            }
-            return (boolean)mh$.invokeExact();
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
-    }
-
-    private static class dot_product_f32 {
-        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT
-        );
-
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("dot_product_f32");
-
-        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    public static int INT32_MIN() {
+        return INT32_MIN;
     }
-
+    private static final long INT64_MIN = -9223372036854775808L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * float dot_product_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length)
+     * #define INT64_MIN -9223372036854775808
      * }
      */
-    public static FunctionDescriptor dot_product_f32$descriptor() {
-        return dot_product_f32.DESC;
+    public static long INT64_MIN() {
+        return INT64_MIN;
     }
-
+    private static final int INT8_MAX = (int)127L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * float dot_product_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length)
+     * #define INT8_MAX 127
      * }
      */
-    public static MethodHandle dot_product_f32$handle() {
-        return dot_product_f32.HANDLE;
+    public static int INT8_MAX() {
+        return INT8_MAX;
     }
-
+    private static final int INT16_MAX = (int)32767L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * float dot_product_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length)
+     * #define INT16_MAX 32767
      * }
      */
-    public static MemorySegment dot_product_f32$address() {
-        return dot_product_f32.ADDR;
+    public static int INT16_MAX() {
+        return INT16_MAX;
     }
-
+    private static final int INT32_MAX = (int)2147483647L;
     /**
      * {@snippet lang=c :
-     * float dot_product_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length)
+     * #define INT32_MAX 2147483647
      * }
      */
-    public static float dot_product_f32(int preferred_size, MemorySegment a, int aoffset, MemorySegment b, int boffset, int length) {
-        var mh$ = dot_product_f32.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("dot_product_f32", preferred_size, a, aoffset, b, boffset, length);
-            }
-            return (float)mh$.invokeExact(preferred_size, a, aoffset, b, boffset, length);
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
-    }
-
-    private static class euclidean_f32 {
-        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT
-        );
-
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("euclidean_f32");
-
-        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    public static int INT32_MAX() {
+        return INT32_MAX;
     }
-
+    private static final long INT64_MAX = 9223372036854775807L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * float euclidean_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length)
+     * #define INT64_MAX 9223372036854775807
      * }
      */
-    public static FunctionDescriptor euclidean_f32$descriptor() {
-        return euclidean_f32.DESC;
+    public static long INT64_MAX() {
+        return INT64_MAX;
     }
-
+    private static final int UINT8_MAX = (int)255L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * float euclidean_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length)
+     * #define UINT8_MAX 255
      * }
      */
-    public static MethodHandle euclidean_f32$handle() {
-        return euclidean_f32.HANDLE;
+    public static int UINT8_MAX() {
+        return UINT8_MAX;
     }
-
+    private static final int UINT16_MAX = (int)65535L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * float euclidean_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length)
+     * #define UINT16_MAX 65535
      * }
      */
-    public static MemorySegment euclidean_f32$address() {
-        return euclidean_f32.ADDR;
+    public static int UINT16_MAX() {
+        return UINT16_MAX;
     }
-
+    private static final int UINT32_MAX = (int)4294967295L;
     /**
      * {@snippet lang=c :
-     * float euclidean_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length)
+     * #define UINT32_MAX 4294967295
      * }
      */
-    public static float euclidean_f32(int preferred_size, MemorySegment a, int aoffset, MemorySegment b, int boffset, int length) {
-        var mh$ = euclidean_f32.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("euclidean_f32", preferred_size, a, aoffset, b, boffset, length);
-            }
-            return (float)mh$.invokeExact(preferred_size, a, aoffset, b, boffset, length);
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
-    }
-
-    private static class bulk_quantized_shuffle_dot_f32_512 {
-        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_POINTER
-        );
-
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("bulk_quantized_shuffle_dot_f32_512");
-
-        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    public static int UINT32_MAX() {
+        return UINT32_MAX;
     }
-
+    private static final long UINT64_MAX = -1L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_dot_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results)
+     * #define UINT64_MAX -1
      * }
      */
-    public static FunctionDescriptor bulk_quantized_shuffle_dot_f32_512$descriptor() {
-        return bulk_quantized_shuffle_dot_f32_512.DESC;
+    public static long UINT64_MAX() {
+        return UINT64_MAX;
     }
-
+    private static final int INT_LEAST8_MIN = (int)-128L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_dot_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results)
+     * #define INT_LEAST8_MIN -128
      * }
      */
-    public static MethodHandle bulk_quantized_shuffle_dot_f32_512$handle() {
-        return bulk_quantized_shuffle_dot_f32_512.HANDLE;
+    public static int INT_LEAST8_MIN() {
+        return INT_LEAST8_MIN;
     }
-
+    private static final int INT_LEAST16_MIN = (int)-32768L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_dot_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results)
+     * #define INT_LEAST16_MIN -32768
      * }
      */
-    public static MemorySegment bulk_quantized_shuffle_dot_f32_512$address() {
-        return bulk_quantized_shuffle_dot_f32_512.ADDR;
+    public static int INT_LEAST16_MIN() {
+        return INT_LEAST16_MIN;
     }
-
+    private static final int INT_LEAST32_MIN = (int)-2147483648L;
     /**
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_dot_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results)
+     * #define INT_LEAST32_MIN -2147483648
      * }
      */
-    public static void bulk_quantized_shuffle_dot_f32_512(MemorySegment shuffles, int codebookCount, MemorySegment quantizedPartials, float delta, float minDistance, MemorySegment results) {
-        var mh$ = bulk_quantized_shuffle_dot_f32_512.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("bulk_quantized_shuffle_dot_f32_512", shuffles, codebookCount, quantizedPartials, delta, minDistance, results);
-            }
-            mh$.invokeExact(shuffles, codebookCount, quantizedPartials, delta, minDistance, results);
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
-    }
-
-    private static class bulk_quantized_shuffle_euclidean_f32_512 {
-        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_POINTER
-        );
-
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("bulk_quantized_shuffle_euclidean_f32_512");
-
-        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    public static int INT_LEAST32_MIN() {
+        return INT_LEAST32_MIN;
     }
-
+    private static final long INT_LEAST64_MIN = -9223372036854775808L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results)
+     * #define INT_LEAST64_MIN -9223372036854775808
      * }
      */
-    public static FunctionDescriptor bulk_quantized_shuffle_euclidean_f32_512$descriptor() {
-        return bulk_quantized_shuffle_euclidean_f32_512.DESC;
+    public static long INT_LEAST64_MIN() {
+        return INT_LEAST64_MIN;
     }
-
+    private static final int INT_LEAST8_MAX = (int)127L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results)
+     * #define INT_LEAST8_MAX 127
      * }
      */
-    public static MethodHandle bulk_quantized_shuffle_euclidean_f32_512$handle() {
-        return bulk_quantized_shuffle_euclidean_f32_512.HANDLE;
+    public static int INT_LEAST8_MAX() {
+        return INT_LEAST8_MAX;
     }
-
+    private static final int INT_LEAST16_MAX = (int)32767L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results)
+     * #define INT_LEAST16_MAX 32767
      * }
      */
-    public static MemorySegment bulk_quantized_shuffle_euclidean_f32_512$address() {
-        return bulk_quantized_shuffle_euclidean_f32_512.ADDR;
+    public static int INT_LEAST16_MAX() {
+        return INT_LEAST16_MAX;
     }
-
+    private static final int INT_LEAST32_MAX = (int)2147483647L;
     /**
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results)
+     * #define INT_LEAST32_MAX 2147483647
      * }
      */
-    public static void bulk_quantized_shuffle_euclidean_f32_512(MemorySegment shuffles, int codebookCount, MemorySegment quantizedPartials, float delta, float minDistance, MemorySegment results) {
-        var mh$ = bulk_quantized_shuffle_euclidean_f32_512.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("bulk_quantized_shuffle_euclidean_f32_512", shuffles, codebookCount, quantizedPartials, delta, minDistance, results);
-            }
-            mh$.invokeExact(shuffles, codebookCount, quantizedPartials, delta, minDistance, results);
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
-    }
-
-    private static class bulk_quantized_shuffle_cosine_f32_512 {
-        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_POINTER
-        );
-
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("bulk_quantized_shuffle_cosine_f32_512");
-
-        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    public static int INT_LEAST32_MAX() {
+        return INT_LEAST32_MAX;
     }
-
+    private static final long INT_LEAST64_MAX = 9223372036854775807L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_cosine_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartialSums, float sumDelta, float minDistance, const char *quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float *results)
+     * #define INT_LEAST64_MAX 9223372036854775807
      * }
      */
-    public static FunctionDescriptor bulk_quantized_shuffle_cosine_f32_512$descriptor() {
-        return bulk_quantized_shuffle_cosine_f32_512.DESC;
+    public static long INT_LEAST64_MAX() {
+        return INT_LEAST64_MAX;
     }
-
+    private static final int UINT_LEAST8_MAX = (int)255L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_cosine_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartialSums, float sumDelta, float minDistance, const char *quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float *results)
+     * #define UINT_LEAST8_MAX 255
      * }
      */
-    public static MethodHandle bulk_quantized_shuffle_cosine_f32_512$handle() {
-        return bulk_quantized_shuffle_cosine_f32_512.HANDLE;
+    public static int UINT_LEAST8_MAX() {
+        return UINT_LEAST8_MAX;
     }
-
+    private static final int UINT_LEAST16_MAX = (int)65535L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_cosine_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartialSums, float sumDelta, float minDistance, const char *quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float *results)
+     * #define UINT_LEAST16_MAX 65535
      * }
      */
-    public static MemorySegment bulk_quantized_shuffle_cosine_f32_512$address() {
-        return bulk_quantized_shuffle_cosine_f32_512.ADDR;
+    public static int UINT_LEAST16_MAX() {
+        return UINT_LEAST16_MAX;
     }
-
+    private static final int UINT_LEAST32_MAX = (int)4294967295L;
     /**
      * {@snippet lang=c :
-     * void bulk_quantized_shuffle_cosine_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartialSums, float sumDelta, float minDistance, const char *quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float *results)
+     * #define UINT_LEAST32_MAX 4294967295
      * }
      */
-    public static void bulk_quantized_shuffle_cosine_f32_512(MemorySegment shuffles, int codebookCount, MemorySegment quantizedPartialSums, float sumDelta, float minDistance, MemorySegment quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, MemorySegment results) {
-        var mh$ = bulk_quantized_shuffle_cosine_f32_512.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("bulk_quantized_shuffle_cosine_f32_512", shuffles, codebookCount, quantizedPartialSums, sumDelta, minDistance, quantizedPartialMagnitudes, magnitudeDelta, minMagnitude, queryMagnitudeSquared, results);
-            }
-            mh$.invokeExact(shuffles, codebookCount, quantizedPartialSums, sumDelta, minDistance, quantizedPartialMagnitudes, magnitudeDelta, minMagnitude, queryMagnitudeSquared, results);
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
-    }
-
-    private static class assemble_and_sum_f32_512 {
-        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT
-        );
-
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("assemble_and_sum_f32_512");
-
-        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    public static int UINT_LEAST32_MAX() {
+        return UINT_LEAST32_MAX;
     }
-
+    private static final long UINT_LEAST64_MAX = -1L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * float assemble_and_sum_f32_512(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength)
+     * #define UINT_LEAST64_MAX -1
      * }
      */
-    public static FunctionDescriptor assemble_and_sum_f32_512$descriptor() {
-        return assemble_and_sum_f32_512.DESC;
+    public static long UINT_LEAST64_MAX() {
+        return UINT_LEAST64_MAX;
     }
-
+    private static final int INT_FAST8_MIN = (int)-128L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * float assemble_and_sum_f32_512(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength)
+     * #define INT_FAST8_MIN -128
      * }
      */
-    public static MethodHandle assemble_and_sum_f32_512$handle() {
-        return assemble_and_sum_f32_512.HANDLE;
+    public static int INT_FAST8_MIN() {
+        return INT_FAST8_MIN;
     }
-
+    private static final long INT_FAST16_MIN = -9223372036854775808L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * float assemble_and_sum_f32_512(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength)
+     * #define INT_FAST16_MIN -9223372036854775808
      * }
      */
-    public static MemorySegment assemble_and_sum_f32_512$address() {
-        return assemble_and_sum_f32_512.ADDR;
+    public static long INT_FAST16_MIN() {
+        return INT_FAST16_MIN;
     }
-
+    private static final long INT_FAST32_MIN = -9223372036854775808L;
     /**
      * {@snippet lang=c :
-     * float assemble_and_sum_f32_512(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength)
+     * #define INT_FAST32_MIN -9223372036854775808
      * }
      */
-    public static float assemble_and_sum_f32_512(MemorySegment data, int dataBase, MemorySegment baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) {
-        var mh$ = assemble_and_sum_f32_512.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("assemble_and_sum_f32_512", data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength);
-            }
-            return (float)mh$.invokeExact(data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength);
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
-    }
-
-    private static class pq_decoded_cosine_similarity_f32_512 {
-        public static final FunctionDescriptor DESC = FunctionDescriptor.of(
-                NativeSimdOps.C_FLOAT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_FLOAT
-        );
-
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("pq_decoded_cosine_similarity_f32_512");
-
-        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    public static long INT_FAST32_MIN() {
+        return INT_FAST32_MIN;
     }
-
+    private static final long INT_FAST64_MIN = -9223372036854775808L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * float pq_decoded_cosine_similarity_f32_512(const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude)
+     * #define INT_FAST64_MIN -9223372036854775808
      * }
      */
-    public static FunctionDescriptor pq_decoded_cosine_similarity_f32_512$descriptor() {
-        return pq_decoded_cosine_similarity_f32_512.DESC;
+    public static long INT_FAST64_MIN() {
+        return INT_FAST64_MIN;
     }
-
+    private static final int INT_FAST8_MAX = (int)127L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * float pq_decoded_cosine_similarity_f32_512(const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude)
+     * #define INT_FAST8_MAX 127
      * }
      */
-    public static MethodHandle pq_decoded_cosine_similarity_f32_512$handle() {
-        return pq_decoded_cosine_similarity_f32_512.HANDLE;
+    public static int INT_FAST8_MAX() {
+        return INT_FAST8_MAX;
     }
-
+    private static final long INT_FAST16_MAX = 9223372036854775807L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * float pq_decoded_cosine_similarity_f32_512(const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude)
+     * #define INT_FAST16_MAX 9223372036854775807
      * }
      */
-    public static MemorySegment pq_decoded_cosine_similarity_f32_512$address() {
-        return pq_decoded_cosine_similarity_f32_512.ADDR;
+    public static long INT_FAST16_MAX() {
+        return INT_FAST16_MAX;
     }
-
+    private static final long INT_FAST32_MAX = 9223372036854775807L;
     /**
      * {@snippet lang=c :
-     * float pq_decoded_cosine_similarity_f32_512(const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude)
+     * #define INT_FAST32_MAX 9223372036854775807
      * }
      */
-    public static float pq_decoded_cosine_similarity_f32_512(MemorySegment baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, MemorySegment partialSums, MemorySegment aMagnitude, float bMagnitude) {
-        var mh$ = pq_decoded_cosine_similarity_f32_512.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("pq_decoded_cosine_similarity_f32_512", baseOffsets, baseOffsetsOffset, baseOffsetsLength, clusterCount, partialSums, aMagnitude, bMagnitude);
-            }
-            return (float)mh$.invokeExact(baseOffsets, baseOffsetsOffset, baseOffsetsLength, clusterCount, partialSums, aMagnitude, bMagnitude);
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
+    public static long INT_FAST32_MAX() {
+        return INT_FAST32_MAX;
     }
-
-    private static class calculate_partial_sums_dot_f32_512 {
-        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER
-        );
-
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_dot_f32_512");
-
-        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    private static final long INT_FAST64_MAX = 9223372036854775807L;
+    /**
+     * {@snippet lang=c :
+     * #define INT_FAST64_MAX 9223372036854775807
+     * }
+     */
+    public static long INT_FAST64_MAX() {
+        return INT_FAST64_MAX;
     }
-
+    private static final int UINT_FAST8_MAX = (int)255L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * #define UINT_FAST8_MAX 255
      * }
      */
-    public static FunctionDescriptor calculate_partial_sums_dot_f32_512$descriptor() {
-        return calculate_partial_sums_dot_f32_512.DESC;
+    public static int UINT_FAST8_MAX() {
+        return UINT_FAST8_MAX;
     }
-
+    private static final long UINT_FAST16_MAX = -1L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * #define UINT_FAST16_MAX -1
      * }
      */
-    public static MethodHandle calculate_partial_sums_dot_f32_512$handle() {
-        return calculate_partial_sums_dot_f32_512.HANDLE;
+    public static long UINT_FAST16_MAX() {
+        return UINT_FAST16_MAX;
     }
-
+    private static final long UINT_FAST32_MAX = -1L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * #define UINT_FAST32_MAX -1
      * }
      */
-    public static MemorySegment calculate_partial_sums_dot_f32_512$address() {
-        return calculate_partial_sums_dot_f32_512.ADDR;
+    public static long UINT_FAST32_MAX() {
+        return UINT_FAST32_MAX;
     }
-
+    private static final long UINT_FAST64_MAX = -1L;
     /**
      * {@snippet lang=c :
-     * void calculate_partial_sums_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * #define UINT_FAST64_MAX -1
      * }
      */
-    public static void calculate_partial_sums_dot_f32_512(MemorySegment codebook, int codebookBase, int size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums) {
-        var mh$ = calculate_partial_sums_dot_f32_512.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("calculate_partial_sums_dot_f32_512", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums);
-            }
-            mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums);
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
+    public static long UINT_FAST64_MAX() {
+        return UINT_FAST64_MAX;
     }
-
-    private static class calculate_partial_sums_euclidean_f32_512 {
-        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER
-        );
-
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_euclidean_f32_512");
-
-        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    private static final long INTPTR_MIN = -9223372036854775808L;
+    /**
+     * {@snippet lang=c :
+     * #define INTPTR_MIN -9223372036854775808
+     * }
+     */
+    public static long INTPTR_MIN() {
+        return INTPTR_MIN;
     }
-
+    private static final long INTPTR_MAX = 9223372036854775807L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * #define INTPTR_MAX 9223372036854775807
      * }
      */
-    public static FunctionDescriptor calculate_partial_sums_euclidean_f32_512$descriptor() {
-        return calculate_partial_sums_euclidean_f32_512.DESC;
+    public static long INTPTR_MAX() {
+        return INTPTR_MAX;
     }
-
+    private static final long UINTPTR_MAX = -1L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * #define UINTPTR_MAX -1
      * }
      */
-    public static MethodHandle calculate_partial_sums_euclidean_f32_512$handle() {
-        return calculate_partial_sums_euclidean_f32_512.HANDLE;
+    public static long UINTPTR_MAX() {
+        return UINTPTR_MAX;
     }
-
+    private static final long INTMAX_MIN = -9223372036854775808L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * #define INTMAX_MIN -9223372036854775808
      * }
      */
-    public static MemorySegment calculate_partial_sums_euclidean_f32_512$address() {
-        return calculate_partial_sums_euclidean_f32_512.ADDR;
+    public static long INTMAX_MIN() {
+        return INTMAX_MIN;
     }
-
+    private static final long INTMAX_MAX = 9223372036854775807L;
     /**
      * {@snippet lang=c :
-     * void calculate_partial_sums_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums)
+     * #define INTMAX_MAX 9223372036854775807
      * }
      */
-    public static void calculate_partial_sums_euclidean_f32_512(MemorySegment codebook, int codebookBase, int size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums) {
-        var mh$ = calculate_partial_sums_euclidean_f32_512.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("calculate_partial_sums_euclidean_f32_512", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums);
-            }
-            mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums);
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
+    public static long INTMAX_MAX() {
+        return INTMAX_MAX;
     }
-
-    private static class calculate_partial_sums_best_dot_f32_512 {
-        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_POINTER
-        );
-
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_best_dot_f32_512");
-
-        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    private static final long UINTMAX_MAX = -1L;
+    /**
+     * {@snippet lang=c :
+     * #define UINTMAX_MAX -1
+     * }
+     */
+    public static long UINTMAX_MAX() {
+        return UINTMAX_MAX;
     }
-
+    private static final long PTRDIFF_MIN = -9223372036854775808L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_best_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances)
+     * #define PTRDIFF_MIN -9223372036854775808
      * }
      */
-    public static FunctionDescriptor calculate_partial_sums_best_dot_f32_512$descriptor() {
-        return calculate_partial_sums_best_dot_f32_512.DESC;
+    public static long PTRDIFF_MIN() {
+        return PTRDIFF_MIN;
     }
-
+    private static final long PTRDIFF_MAX = 9223372036854775807L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_best_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances)
+     * #define PTRDIFF_MAX 9223372036854775807
      * }
      */
-    public static MethodHandle calculate_partial_sums_best_dot_f32_512$handle() {
-        return calculate_partial_sums_best_dot_f32_512.HANDLE;
+    public static long PTRDIFF_MAX() {
+        return PTRDIFF_MAX;
     }
-
+    private static final int SIG_ATOMIC_MIN = (int)-2147483648L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_best_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances)
+     * #define SIG_ATOMIC_MIN -2147483648
      * }
      */
-    public static MemorySegment calculate_partial_sums_best_dot_f32_512$address() {
-        return calculate_partial_sums_best_dot_f32_512.ADDR;
+    public static int SIG_ATOMIC_MIN() {
+        return SIG_ATOMIC_MIN;
     }
-
+    private static final int SIG_ATOMIC_MAX = (int)2147483647L;
     /**
      * {@snippet lang=c :
-     * void calculate_partial_sums_best_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances)
+     * #define SIG_ATOMIC_MAX 2147483647
      * }
      */
-    public static void calculate_partial_sums_best_dot_f32_512(MemorySegment codebook, int codebookBase, int size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums, MemorySegment partialBestDistances) {
-        var mh$ = calculate_partial_sums_best_dot_f32_512.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("calculate_partial_sums_best_dot_f32_512", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums, partialBestDistances);
-            }
-            mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums, partialBestDistances);
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
+    public static int SIG_ATOMIC_MAX() {
+        return SIG_ATOMIC_MAX;
     }
-
-    private static class calculate_partial_sums_best_euclidean_f32_512 {
-        public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid(
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_INT,
-                NativeSimdOps.C_POINTER,
-                NativeSimdOps.C_POINTER
-        );
-
-        public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_best_euclidean_f32_512");
-
-        public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true));
+    private static final long SIZE_MAX = -1L;
+    /**
+     * {@snippet lang=c :
+     * #define SIZE_MAX -1
+     * }
+     */
+    public static long SIZE_MAX() {
+        return SIZE_MAX;
     }
-
+    private static final int WCHAR_MIN = (int)-2147483648L;
     /**
-     * Function descriptor for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_best_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances)
+     * #define WCHAR_MIN -2147483648
      * }
      */
-    public static FunctionDescriptor calculate_partial_sums_best_euclidean_f32_512$descriptor() {
-        return calculate_partial_sums_best_euclidean_f32_512.DESC;
+    public static int WCHAR_MIN() {
+        return WCHAR_MIN;
     }
-
+    private static final int WCHAR_MAX = (int)2147483647L;
     /**
-     * Downcall method handle for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_best_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances)
+     * #define WCHAR_MAX 2147483647
      * }
      */
-    public static MethodHandle calculate_partial_sums_best_euclidean_f32_512$handle() {
-        return calculate_partial_sums_best_euclidean_f32_512.HANDLE;
+    public static int WCHAR_MAX() {
+        return WCHAR_MAX;
     }
-
+    private static final int WINT_MIN = (int)0L;
     /**
-     * Address for:
      * {@snippet lang=c :
-     * void calculate_partial_sums_best_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances)
+     * #define WINT_MIN 0
      * }
      */
-    public static MemorySegment calculate_partial_sums_best_euclidean_f32_512$address() {
-        return calculate_partial_sums_best_euclidean_f32_512.ADDR;
+    public static int WINT_MIN() {
+        return WINT_MIN;
     }
-
+    private static final int WINT_MAX = (int)4294967295L;
     /**
      * {@snippet lang=c :
-     * void calculate_partial_sums_best_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances)
+     * #define WINT_MAX 4294967295
      * }
      */
-    public static void calculate_partial_sums_best_euclidean_f32_512(MemorySegment codebook, int codebookBase, int size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums, MemorySegment partialBestDistances) {
-        var mh$ = calculate_partial_sums_best_euclidean_f32_512.HANDLE;
-        try {
-            if (TRACE_DOWNCALLS) {
-                traceDowncall("calculate_partial_sums_best_euclidean_f32_512", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums, partialBestDistances);
-            }
-            mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums, partialBestDistances);
-        } catch (Throwable ex$) {
-            throw new AssertionError("should not reach here", ex$);
-        }
+    public static int WINT_MAX() {
+        return WINT_MAX;
     }
-}
\ No newline at end of file
+}
+
diff --git a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/__fsid_t.java b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/__fsid_t.java
new file mode 100644
index 000000000..fcf526e28
--- /dev/null
+++ b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/__fsid_t.java
@@ -0,0 +1,160 @@
+// Generated by jextract
+
+package io.github.jbellis.jvector.vector.cnative;
+
+import java.lang.invoke.*;
+import java.lang.foreign.*;
+import java.nio.ByteOrder;
+import java.util.*;
+import java.util.function.*;
+import java.util.stream.*;
+
+import static java.lang.foreign.ValueLayout.*;
+import static java.lang.foreign.MemoryLayout.PathElement.*;
+
+/**
+ * {@snippet lang=c :
+ * struct {
+ *     int __val[2];
+ * }
+ * }
+ */
+public class __fsid_t {
+
+    __fsid_t() {
+        // Should not be called directly
+    }
+
+    private static final GroupLayout $LAYOUT = MemoryLayout.structLayout(
+        MemoryLayout.sequenceLayout(2, NativeSimdOps.C_INT).withName("__val")
+    ).withName("$anon$155:12");
+
+    /**
+     * The layout of this struct
+     */
+    public static final GroupLayout layout() {
+        return $LAYOUT;
+    }
+
+    private static final SequenceLayout __val$LAYOUT = (SequenceLayout)$LAYOUT.select(groupElement("__val"));
+
+    /**
+     * Layout for field:
+     * {@snippet lang=c :
+     * int __val[2]
+     * }
+     */
+    public static final SequenceLayout __val$layout() {
+        return __val$LAYOUT;
+    }
+
+    private static final long __val$OFFSET = 0;
+
+    /**
+     * Offset for field:
+     * {@snippet lang=c :
+     * int __val[2]
+     * }
+     */
+    public static final long __val$offset() {
+        return __val$OFFSET;
+    }
+
+    /**
+     * Getter for field:
+     * {@snippet lang=c :
+     * int __val[2]
+     * }
+     */
+    public static MemorySegment __val(MemorySegment struct) {
+        return struct.asSlice(__val$OFFSET, __val$LAYOUT.byteSize());
+    }
+
+    /**
+     * Setter for field:
+     * {@snippet lang=c :
+     * int __val[2]
+     * }
+     */
+    public static void __val(MemorySegment struct, MemorySegment fieldValue) {
+        MemorySegment.copy(fieldValue, 0L, struct, __val$OFFSET, __val$LAYOUT.byteSize());
+    }
+
+    private static long[] __val$DIMS = { 2 };
+
+    /**
+     * Dimensions for array field:
+     * {@snippet lang=c :
+     * int __val[2]
+     * }
+     */
+    public static long[] __val$dimensions() {
+        return __val$DIMS;
+    }
+    private static final VarHandle __val$ELEM_HANDLE = __val$LAYOUT.varHandle(sequenceElement());
+
+    /**
+     * Indexed getter for field:
+     * {@snippet lang=c :
+     * int __val[2]
+     * }
+     */
+    public static int __val(MemorySegment struct, long index0) {
+        return (int)__val$ELEM_HANDLE.get(struct, 0L, index0);
+    }
+
+    /**
+     * Indexed setter for field:
+     * {@snippet lang=c :
+     * int __val[2]
+     * }
+     */
+    public static void __val(MemorySegment struct, long index0, int fieldValue) {
+        __val$ELEM_HANDLE.set(struct, 0L, index0, fieldValue);
+    }
+
+    /**
+     * Obtains a slice of {@code arrayParam} which selects the array element at {@code index}.
+     * The returned segment has address {@code arrayParam.address() + index * layout().byteSize()}
+     */
+    public static MemorySegment asSlice(MemorySegment array, long index) {
+        return array.asSlice(layout().byteSize() * index);
+    }
+
+    /**
+     * The size (in bytes) of this struct
+     */
+    public static long sizeof() { return layout().byteSize(); }
+
+    /**
+     * Allocate a segment of size {@code layout().byteSize()} using {@code allocator}
+     */
+    public static MemorySegment allocate(SegmentAllocator allocator) {
+        return allocator.allocate(layout());
+    }
+
+    /**
+     * Allocate an array of size {@code elementCount} using {@code allocator}.
+     * The returned segment has size {@code elementCount * layout().byteSize()}.
+     */
+    public static MemorySegment allocateArray(long elementCount, SegmentAllocator allocator) {
+        return allocator.allocate(MemoryLayout.sequenceLayout(elementCount, layout()));
+    }
+
+    /**
+     * Reinterprets {@code addr} using target {@code arena} and {@code cleanupAction} (if any).
+     * The returned segment has size {@code layout().byteSize()}
+     */
+    public static MemorySegment reinterpret(MemorySegment addr, Arena arena, Consumer<MemorySegment> cleanup) {
+        return reinterpret(addr, 1, arena, cleanup);
+    }
+
+    /**
+     * Reinterprets {@code addr} using target {@code arena} and {@code cleanupAction} (if any).
+     * The returned segment has size {@code elementCount * layout().byteSize()}
+     */
+    public static MemorySegment reinterpret(MemorySegment addr, long elementCount, Arena arena, Consumer<MemorySegment> cleanup) {
+        return addr.reinterpret(layout().byteSize() * elementCount, arena, cleanup);
+    }
+}
+
diff --git a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/max_align_t.java b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/max_align_t.java
new file mode 100644
index 000000000..dccb93436
--- /dev/null
+++ b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/max_align_t.java
@@ -0,0 +1,129 @@
+// Generated by jextract
+
+package io.github.jbellis.jvector.vector.cnative;
+
+import java.lang.invoke.*;
+import java.lang.foreign.*;
+import java.nio.ByteOrder;
+import java.util.*;
+import java.util.function.*;
+import java.util.stream.*;
+
+import static java.lang.foreign.ValueLayout.*;
+import static java.lang.foreign.MemoryLayout.PathElement.*;
+
+/**
+ * {@snippet lang=c :
+ * struct {
+ *     long long __clang_max_align_nonce1;
+ *     long double __clang_max_align_nonce2;
+ * }
+ * }
+ */
+public class max_align_t {
+
+    max_align_t() {
+        // Should not be called directly
+    }
+
+    private static final GroupLayout $LAYOUT = MemoryLayout.structLayout(
+        NativeSimdOps.C_LONG_LONG.withName("__clang_max_align_nonce1"),
+        MemoryLayout.paddingLayout(24)
+    ).withName("$anon$19:9");
+
+    /**
+     * The layout of this struct
+     */
+    public static final GroupLayout layout() {
+        return $LAYOUT;
+    }
+
+    private static final OfLong __clang_max_align_nonce1$LAYOUT = (OfLong)$LAYOUT.select(groupElement("__clang_max_align_nonce1"));
+
+    /**
+     * Layout for field:
+     * {@snippet lang=c :
+     * long long __clang_max_align_nonce1
+     * }
+     */
+    public static final OfLong __clang_max_align_nonce1$layout() {
+        return __clang_max_align_nonce1$LAYOUT;
+    }
+
+    private static final long __clang_max_align_nonce1$OFFSET = 0;
+
+    /**
+     * Offset for field:
+     * {@snippet lang=c :
+     * long long __clang_max_align_nonce1
+     * }
+     */
+    public static final long __clang_max_align_nonce1$offset() {
+        return __clang_max_align_nonce1$OFFSET;
+    }
+
+    /**
+     * Getter for field:
+     * {@snippet lang=c :
+     * long long __clang_max_align_nonce1
+     * }
+     */
+    public static long __clang_max_align_nonce1(MemorySegment struct) {
+        return struct.get(__clang_max_align_nonce1$LAYOUT, __clang_max_align_nonce1$OFFSET);
+    }
+
+    /**
+     * Setter for field:
+     * {@snippet lang=c :
+     * long long __clang_max_align_nonce1
+     * }
+     */
+    public static void __clang_max_align_nonce1(MemorySegment struct, long fieldValue) {
+        struct.set(__clang_max_align_nonce1$LAYOUT, __clang_max_align_nonce1$OFFSET, fieldValue);
+    }
+
+    /**
+     * Obtains a slice of {@code arrayParam} which selects the array element at {@code index}.
+     * The returned segment has address {@code arrayParam.address() + index * layout().byteSize()}
+     */
+    public static MemorySegment asSlice(MemorySegment array, long index) {
+        return array.asSlice(layout().byteSize() * index);
+    }
+
+    /**
+     * The size (in bytes) of this struct
+     */
+    public static long sizeof() { return layout().byteSize(); }
+
+    /**
+     * Allocate a segment of size {@code layout().byteSize()} using {@code allocator}
+     */
+    public static MemorySegment allocate(SegmentAllocator allocator) {
+        return allocator.allocate(layout());
+    }
+
+    /**
+     * Allocate an array of size {@code elementCount} using {@code allocator}.
+     * The returned segment has size {@code elementCount * layout().byteSize()}.
+     */
+    public static MemorySegment allocateArray(long elementCount, SegmentAllocator allocator) {
+        return allocator.allocate(MemoryLayout.sequenceLayout(elementCount, layout()));
+    }
+
+    /**
+     * Reinterprets {@code addr} using target {@code arena} and {@code cleanupAction} (if any).
+     * The returned segment has size {@code layout().byteSize()}
+     */
+    public static MemorySegment reinterpret(MemorySegment addr, Arena arena, Consumer<MemorySegment> cleanup) {
+        return reinterpret(addr, 1, arena, cleanup);
+    }
+
+    /**
+     * Reinterprets {@code addr} using target {@code arena} and {@code cleanupAction} (if any).
+     * The returned segment has size {@code elementCount * layout().byteSize()}
+     */
+    public static MemorySegment reinterpret(MemorySegment addr, long elementCount, Arena arena, Consumer<MemorySegment> cleanup) {
+        return addr.reinterpret(layout().byteSize() * elementCount, arena, cleanup);
+    }
+}
+
diff --git a/jvector-native/src/test/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOpsTest.java b/jvector-native/src/test/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOpsTest.java
deleted file mode 100644
index 68dfca187..000000000
--- a/jvector-native/src/test/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOpsTest.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright DataStax, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.github.jbellis.jvector.vector.cnative;
-
-import org.junit.Assume;
-import org.junit.Test;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.List;
-
-import static org.junit.Assert.assertEquals;
-
-public class NativeSimdOpsTest {
-
-    /**
-     * Reads /proc/cpuinfo and returns true if all AVX-512 flags required by
-     * check_avx512_compatibility() are present: avx512f, avx512cd, avx512dq,
-     * avx512bw, avx512vl.
-     */
-    private static boolean cpuinfoReportsAvx512() throws IOException {
-        List<String> lines = Files.readAllLines(Path.of("/proc/cpuinfo"));
-        List<String> required = List.of("avx512f", "avx512cd", "avx512dq", "avx512bw", "avx512vl");
-        for (String line : lines) {
-            if (line.startsWith("flags")) {
-                String[] flags = line.split("\\s+");
-                List<String> flagList = List.of(flags);
-                return flagList.containsAll(required);
-            }
-        }
-        return false;
-    }
-
-    @Test
-    public void testCheckAvx512CompatibilityMatchesCpuinfo() throws IOException {
-        boolean libraryLoaded = LibraryLoader.loadJvector();
-        Assume.assumeTrue("Native jvector library not available; skipping AVX-512 check", libraryLoaded);
-
-        boolean expectedFromCpuinfo = cpuinfoReportsAvx512();
-        boolean actualFromNative = NativeSimdOps.check_avx512_compatibility();
-
-        assertEquals(
-                "check_avx512_compatibility() should match AVX-512 flag presence in /proc/cpuinfo",
-                expectedFromCpuinfo,
-                actualFromNative);
-    }
-}
diff --git a/rat-excludes.txt b/rat-excludes.txt
index 436c97822..d12b98e8d 100644
--- a/rat-excludes.txt
+++ b/rat-excludes.txt
@@ -9,6 +9,7 @@ package.json
 .github/workflows/run-bench.yml
 .mvn/wrapper/maven-wrapper.properties
 .mvn/jvm.config
+.gitmodules
 README.md
 UPGRADING.md
 CHANGELOG.md
@@ -31,4 +32,8 @@ docs/**/*.md
 yaml-configs/**/*.md
 local_datasets/**
 **/datasets/**
+jvector-native/src/main/c/build/**
+jvector-native/src/main/c/third_party/**
+src/main/c/build/**
+src/main/c/third_party/**