diff --git a/.github/workflows/run-bench.yml b/.github/workflows/run-bench.yml index 4026e3707..dbd0c67d3 100644 --- a/.github/workflows/run-bench.yml +++ b/.github/workflows/run-bench.yml @@ -90,6 +90,9 @@ jobs: - name: Set up GCC run: | sudo apt install -y gcc + - name: Install Meson and Ninja + run: | + sudo apt update && sudo apt install -y meson ninja-build - uses: actions/checkout@v4 - name: Set up JDK ${{ matrix.jdk }} uses: actions/setup-java@v3 @@ -126,6 +129,9 @@ jobs: ref: ${{ matrix.branch }} fetch-depth: 0 + - name: Initialize Git Submodules + run: git submodule update --init + # ========================================== # Decode and write the protected dataset catalog # @@ -241,9 +247,15 @@ jobs: needs: test-avx512 runs-on: ubuntu-latest steps: + - name: Install Meson and Ninja + run: | + sudo apt update && sudo apt install -y meson ninja-build - name: Checkout repository uses: actions/checkout@v4 + - name: Initialize Git Submodules + run: git submodule update --init + - name: Download all benchmark results uses: actions/download-artifact@v4 with: diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 7fb47f9ba..2ffd08ee9 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -40,7 +40,12 @@ jobs: - name: Set up GCC run: | sudo apt install -y gcc + - name: Install Meson and Ninja + run: | + sudo apt update && sudo apt install -y meson ninja-build - uses: actions/checkout@v4 + - name: Initialize Git Submodules + run: git submodule update --init - name: Set up JDK ${{ matrix.jdk }} uses: actions/setup-java@v3 with: @@ -102,6 +107,8 @@ jobs: runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 + - name: Initialize Git Submodules + run: git submodule update --init - name: Set up JDK uses: actions/setup-java@v3 with: @@ -112,6 +119,10 @@ jobs: if: matrix.os == 'ubuntu-latest' run: | sudo apt install -y gcc + - name: Install Meson and Ninja + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt update && sudo apt install -y meson ninja-build - name: Compile, run tests, and package (JDK 22) run: mvn -B verify if: matrix.jdk == '22' diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..dcfe38082 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "jvector-native/src/main/c/third_party/highway"] + path = jvector-native/src/main/c/third_party/highway + url = https://github.com/google/highway.git diff --git a/README.md b/README.md index 10e9eb738..bb0f17dd7 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,30 @@ a dependency from any Java 11 code. When run on a Java 20+ JVM with the Vector m providers will be used. In general, the project is structured to be built with JDK 20+, but when `JAVA_HOME` is set to Java 11 -> Java 19, certain build features will still be available. +### Cloning + +This repository uses a Git submodule for [Google Highway](https://github.com/google/highway), located at +`jvector-native/src/main/c/third_party/highway`. After cloning, initialise it with: + +```bash +git submodule update --init +``` + +Or clone with submodules in one step: + +```bash +git clone --recurse-submodules +``` + +### Building native libraries + +The native SIMD library (`libjvector.so`) requires **g++ 11+** and is built by the script +`jvector-native/src/main/c/jextract_vector_simd.sh`. To build and auto-install `g++` on Ubuntu: + +```bash +./jvector-native/src/main/c/jextract_vector_simd.sh --auto-install-g++ +``` + Base code is in [jvector-base](./jvector-base) and will be built for Java 11 releases, restricting language features and APIs appropriately. Code in [jvector-twenty](./jvector-twenty) will be compiled for Java 20 language features/APIs and included in the final multirelease jar targeting supported JVMs. [jvector-multirelease](./jvector-multirelease) packages [jvector-base](./jvector-base) and [jvector-twenty](./jvector-twenty) as a diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQDecoder.java b/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQDecoder.java index 85244befd..ced0124fd 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQDecoder.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/quantization/PQDecoder.java @@ -99,9 +99,7 @@ public CosineDecoder(PQVectors cv, VectorFloat query) { for (int m = 0; m < pq.getSubspaceCount(); ++m) { int size = pq.subvectorSizesAndOffsets[m][0]; var codebook = pq.codebooks[m]; - for (int j = 0; j < pq.getClusterCount(); ++j) { - partialMagnitudes.set((m * pq.getClusterCount()) + j, VectorUtil.dotProduct(codebook, j * size, codebook, j * size, size)); - } + VectorUtil.calculatePartialSelfMagnitudes(codebook, m, size, pq.getClusterCount(), partialMagnitudes); } return partialMagnitudes; }); @@ -117,9 +115,7 @@ public CosineDecoder(PQVectors cv, VectorFloat query) { int offset = pq.subvectorSizesAndOffsets[m][1]; int size = pq.subvectorSizesAndOffsets[m][0]; var codebook = pq.codebooks[m]; - for (int j = 0; j < pq.getClusterCount(); ++j) { - partialSums.set((m * pq.getClusterCount()) + j, VectorUtil.dotProduct(codebook, j * size, centeredQuery, offset, size)); - } + VectorUtil.calculatePartialSums(codebook, m, size, pq.getClusterCount(), centeredQuery, offset, VectorSimilarityFunction.DOT_PRODUCT, partialSums); } this.bMagnitude = VectorUtil.dotProduct(centeredQuery, centeredQuery); diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtil.java b/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtil.java index 83cb5885b..744d5ec75 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtil.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtil.java @@ -182,6 +182,10 @@ public static void calculatePartialSums(VectorFloat codebook, int codebookInd impl.calculatePartialSums(codebook, codebookIndex, size, clusterCount, query, offset, vsf, partialSums); } + public static void calculatePartialSelfMagnitudes(VectorFloat codebook, int codebookIndex, int size, int clusterCount, VectorFloat partialMagnitudes) { + impl.calculatePartialSelfMagnitudes(codebook, codebookIndex, size, clusterCount, partialMagnitudes); + } + /** * Calculates the maximum value in the vector. * @param v vector diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtilSupport.java b/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtilSupport.java index d8223ab12..118f16ca6 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtilSupport.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/vector/VectorUtilSupport.java @@ -134,6 +134,13 @@ public interface VectorUtilSupport { void calculatePartialSums(VectorFloat codebook, int codebookIndex, int size, int clusterCount, VectorFloat query, int offset, VectorSimilarityFunction vsf, VectorFloat partialSums); + default void calculatePartialSelfMagnitudes(VectorFloat codebook, int codebookIndex, int size, int clusterCount, VectorFloat partialMagnitudes) { + int codebookBase = codebookIndex * clusterCount; + for (int i = 0; i < clusterCount; i++) { + partialMagnitudes.set(codebookBase + i, dotProduct(codebook, i * size, codebook, i * size, size)); + } + } + float max(VectorFloat v); float min(VectorFloat v); diff --git a/jvector-native/src/main/c/README.md b/jvector-native/src/main/c/README.md new file mode 100644 index 000000000..5f3e2329d --- /dev/null +++ b/jvector-native/src/main/c/README.md @@ -0,0 +1,310 @@ + + +# JVector Native SIMD Library + +This directory contains the C++ source for `libjvector.so`, the native SIMD +backend that accelerates vector operations in JVector via the Java Foreign +Function & Memory (FFM) API. + +> **Platform support:** Currently enabled on **x86-64** (SSE4.2, AVX2, and +> AVX-512). Support for **ARM** (NEON and SVE) is planned for the near future; +> the [Google Highway](https://github.com/google/highway) library used for +> SIMD portability already targets both AArch64 targets, which will make the +> extension straightforward. + +--- + +## Directory layout + +``` +jvector_simd.h — Public C ABI (symbols exported to Java via FFM) +jvector_simd.cpp — Runtime ISA dispatcher; thin wrappers over the vtable +jvector_simd_kernels.h — Internal C++ declarations for all ISA namespaces +jvector_simd_kernels.cpp — Actual SIMD kernels (compiled three times, see below) +jvector_cpuFeatures.h — CPUID/XGETBV-based CPU feature detection +assertHwyTargets.h — Compile-time assertions that the expected HWY target is active +meson.build — Build description +jextract_vector_simd.sh — Build + jextract script (the usual entry point) +third_party/highway/ — Google Highway header-only library (git submodule) +``` + +--- + +## How to build + +### Prerequisites + +| Tool | Minimum version | Notes | +|------|----------------|-------| +| g++ / clang++ | GCC 11+ | Must support `-march=skylake-avx512` | +| [Meson](https://mesonbuild.com/) | 0.55 | `pip install meson` | +| [Ninja](https://ninja-build.org/) | any | `sudo apt install ninja-build` | +| Git submodules | — | `git submodule update --init` (needed once) | + +### Build steps + +Run the build script from this directory: + +```bash +bash jextract_vector_simd.sh +``` + +The script: +1. Verifies prerequisites (g++, meson, ninja, Highway submodule). +2. Runs `meson setup build --wipe --buildtype=release` then `meson compile`. +3. Copies the versioned `.so` to `../resources/libjvector.so` where the Java + `LibraryLoader` expects it. +4. Optionally re-generates the Java FFM bindings via `jextract` (only needed + when `jvector_simd.h` changes — see [Updating the Java bindings](#updating-the-java-bindings)). + +To build without regenerating bindings (e.g. when `jextract` is not installed): + +```bash +bash jextract_vector_simd.sh # jextract step is skipped with a warning if not found +``` + +### Manual meson build + +```bash +cd jvector-native/src/main/c +meson setup build --wipe --buildtype=release +meson compile -C build +``` + +The output is `build/libjvector.so.`. + +--- + +## How it is integrated into JVector + +``` +Java caller + └─ NativeVectorUtilSupport (jvector-native/.../vector/) + └─ NativeSimdOps (jvector-native/.../vector/cnative/ — FFM glue, generated by jextract) + └─ libjvector.so (this library, loaded at runtime by LibraryLoader) + └─ jvector_simd.cpp — dispatches to the best ISA vtable + ├─ AVX3::* (compiled with -march=skylake-avx512) + ├─ AVX2::* (compiled with -march=haswell) + └─ SSE42::* (compiled with -msse4.2, scalar fallback) +``` + +### Load sequence + +1. `NativeVectorizationProvider` calls `LibraryLoader.loadJvector()` at startup. +2. `LibraryLoader` first tries `System.loadLibrary("jvector")` (picks up a + system-installed `.so`), then falls back to extracting `libjvector.so` from + the JAR's resources and loading it from a temp file. +3. On first call into `NativeSimdOps`, the FFM `SymbolLookup` resolves each + exported symbol directly against the loaded library. + +### ISA dispatch + +Dispatch happens **once** at C++ static-init time (before `main()`): + +1. `populate_cpu_features()` issues CPUID / XGETBV and fills a feature array. +2. `dispatch_kernels()` checks the feature array (AVX-512 ⊇ SKX baseline, then + AVX2, then SSE4.2) and returns a pointer to the matching `KernelVTable`. +3. All public API functions are one-liner wrappers that call through + `kernels.`. + +### ISA cap (runtime override) + +Set the `JVECTOR_MAX_ISA` environment variable before starting the JVM to cap +the selected ISA without recompiling: + +```bash +JVECTOR_MAX_ISA=avx2 java ... # use AVX2 even on an AVX-512 machine +JVECTOR_MAX_ISA=sse42 java ... # force scalar/SSE4.2 fallback +``` + +Accepted values (case-sensitive): `avx3`, `avx2`, `sse42`. +An unrecognised value is silently ignored and full CPU detection is used. + +### Updating the Java bindings + +The Java FFM glue in `cnative/NativeSimdOps.java` is generated from +`jvector_simd.h` by `jextract`. Re-run the script whenever the public C header +changes: + +```bash +bash jextract_vector_simd.sh # requires jextract on PATH +``` + +After regeneration the script automatically patches the generated file to add +`Linker.Option.critical(true)` to every downcall, which avoids heap allocation +on each call. + +--- + +## Adding a new SIMD kernel + +Follow these steps to add a new function, e.g. `my_new_op_f32`. + +### 1. Declare the function in the public header (`jvector_simd.h`) + +```c +JVECTOR_SIMD_API float my_new_op_f32(const float* a, size_t length); +``` + +Use the `JVECTOR_SIMD_API` macro on every symbol that must be visible to Java. + +### 2. Add the per-ISA declaration to `jvector_simd_kernels.h` + +Inside the `DECLARE_SIMD_KERNELS` macro, add: + +```cpp +float my_new_op_f32(const float* a, size_t length); \ +``` + +The macro is instantiated for `AVX3`, `AVX2`, and `SSE42`, so this single +addition covers all three namespaces. + +### 3. Implement the kernel in `jvector_simd_kernels.cpp` + +Because the file is compiled three times (once per ISA variant via different +`-DJV_ISA=` / `-march=` flags), a single implementation body serves all +tiers. Use Highway intrinsics so that the compiler emits the right instructions +for each target: + +```cpp +namespace JV_ISA { // expands to AVX3 / AVX2 / SSE42 at compile time + +HWY_FLATTEN float my_new_op_f32(const float* HWY_RESTRICT a, size_t length) +{ + const ScalableTag d; + auto sum = Zero(d); + size_t i = 0; + for (; i + Lanes(d) <= length; i += Lanes(d)) + sum = Add(sum, LoadU(d, a + i)); + // scalar tail ... + return ReduceSum(d, sum); +} + +} // namespace JV_ISA +``` + +`HWY_FLATTEN` ensures the compiler inlines all Highway helpers into one body, +preventing cross-ISA call overhead. + +#### Platform-specific override (when Highway's auto-vectorisation is suboptimal) + +In case Highway generates sub-optimal code for a particular ISA, you can write +a hand-tuned path for one tier while keeping the generic Highway path for the +rest. Use `HWY_STATIC_TARGET` (the compile-time target selected for this translation +unit) to gate the specialisation: + +```cpp +namespace JV_ISA { + +HWY_FLATTEN float my_new_op_f32(const float* HWY_RESTRICT a, size_t length) +{ +#if HWY_STATIC_TARGET == HWY_AVX3 + // Hand-tuned AVX-512 path: use 512-bit loads + VRANGEPS for absolute value, + // or any other intrinsic that Highway does not yet expose. + __m512 acc = _mm512_setzero_ps(); + size_t i = 0; + for (; i + 16 <= length; i += 16) + acc = _mm512_add_ps(acc, _mm512_loadu_ps(a + i)); + float result = _mm512_reduce_add_ps(acc); + for (; i < length; ++i) result += a[i]; // scalar tail + return result; +#else + // Generic Highway path — works for SSE42, AVX2, and any future target. + const ScalableTag d; + auto sum = Zero(d); + size_t i = 0; + for (; i + Lanes(d) <= length; i += Lanes(d)) + sum = Add(sum, LoadU(d, a + i)); + // scalar tail + for (; i < length; ++i) sum = Add(sum, Set(d, a[i])); + return ReduceSum(d, sum); +#endif +} + +} // namespace JV_ISA +``` + +Key points: + +- `HWY_STATIC_TARGET` is defined by Highway to the active compile-time target + (`HWY_AVX3`, `HWY_AVX2`, `HWY_SSE4`, …). Because each ISA variant is a + **separate compilation unit** (see `meson.build`), the preprocessor condition + is resolved at compile time with zero runtime overhead. +- The available macros for x86 are `HWY_SSE4`, `HWY_AVX2`, and `HWY_AVX3` + (AVX-512 / Skylake-AVX512 baseline). Future ARM tiers will expose + `HWY_NEON` and `HWY_SVE`. +- Only add a specialised path when benchmarks justify the extra maintenance + cost. The generic Highway path is usually within a few percent of hand-written + intrinsics. +- Raw intrinsics (`_mm512_*`, `_mm256_*`) require the corresponding headers + (``). Highway already includes them transitively, so no + additional `#include` is needed. + +### 4. Add the function pointer to the `KernelVTable` struct (`jvector_simd.cpp`) + +```cpp +struct KernelVTable { + // ... existing fields ... + float (*my_new_op_f32)(const float *, size_t); +}; +``` + +### 5. Fill the vtable for each ISA in the `DEFINE_ISA_VTABLE` macro + +```cpp +#define DEFINE_ISA_VTABLE(ISA) \ + static const KernelVTable ISA##_vtable = { \ + /* ... existing entries ... */ \ + ISA::my_new_op_f32, \ + } +``` + +Order must match the struct field order exactly. + +### 6. Add the public wrapper (`jvector_simd.cpp`) + +```cpp +float my_new_op_f32(const float *a, size_t length) +{ + return kernels.my_new_op_f32(a, length); +} +``` + +### 7. Rebuild and regenerate Java bindings + +```bash +bash jextract_vector_simd.sh +``` + +This rebuilds the `.so` and regenerates `NativeSimdOps.java` from the updated +header. + +### 8. Add the Java call site + +In `NativeVectorUtilSupport.java` (or wherever appropriate), call through +`NativeSimdOps`: + +```java +public float myNewOp(VectorFloat a) { + return NativeSimdOps.my_new_op_f32( + ((MemorySegmentVectorFloat) a).get(), a.length()); +} +``` + +Override the corresponding method in the `VectorUtilSupport` interface if one +exists, or add a new interface method and update `PanamaVectorUtilSupport` with +a pure-Java fallback first. diff --git a/jvector-native/src/main/c/jvector_simd_check.c b/jvector-native/src/main/c/assertHwyTargets.h similarity index 52% rename from jvector-native/src/main/c/jvector_simd_check.c rename to jvector-native/src/main/c/assertHwyTargets.h index 50736e85a..a096b197b 100644 --- a/jvector-native/src/main/c/jvector_simd_check.c +++ b/jvector-native/src/main/c/assertHwyTargets.h @@ -14,17 +14,14 @@ * limitations under the License. */ -#include -#include "jvector_simd.h" - -bool check_avx512_compatibility(void) { - /* __builtin_cpu_init required when this is used in ifunc - resolver/__attribute__((constructor)) context, otherwise the CPU - features may not be detected correctly. */ - __builtin_cpu_init(); - return (__builtin_cpu_supports("avx512f") && - __builtin_cpu_supports("avx512cd") && - __builtin_cpu_supports("avx512dq") && - __builtin_cpu_supports("avx512bw") && - __builtin_cpu_supports("avx512vl")); -} +#if defined(__x86_64__) || defined(_M_X64) +#if defined(JV_REQUIRE_HWY_AVX3) +#if HWY_STATIC_TARGET != HWY_AVX3 +#error "Highway did not select HWY_AVX3 for the AVX-512 build. Check compiler flags, compiler support, and Highway blocklists." +#endif +#elif defined(JV_REQUIRE_HWY_AVX2) +#if HWY_STATIC_TARGET != HWY_AVX2 +#error "Highway did not select HWY_AVX2 for the AVX2 build. Check compiler flags, compiler support, and Highway blocklists." +#endif +#endif // +#endif // __X86_64__ diff --git a/jvector-native/src/main/c/jextract_vector_simd.sh b/jvector-native/src/main/c/jextract_vector_simd.sh index d44d375dd..18719ba3d 100755 --- a/jvector-native/src/main/c/jextract_vector_simd.sh +++ b/jvector-native/src/main/c/jextract_vector_simd.sh @@ -17,48 +17,80 @@ set -e # See the License for the specific language governing permissions and # limitations under the License. -if [ "$1" == "--auto-install-gcc" ] ; then AUTO_INSTALL_GCC=true ; shift ; fi +if [ "$1" == "--auto-install-gcc" ] || [ "$1" == "--auto-install-g++" ] ; then AUTO_INSTALL_GCC=true ; shift ; fi printf "AUTO_INSTALL_GCC=%s\n" "${AUTO_INSTALL_GCC}" mkdir -p ../resources -# compile jvector_simd_check.c as x86-64 -# compile jvector_simd.c as skylake-avx512 +# compile jvector_simd_check.cpp as x86-64 +# compile jvector_simd.cpp as skylake-avx512 # produce one shared library +# Check that the Google Highway submodule has been initialised +HIGHWAY_DIR="third_party/highway" +if [ ! -f "${HIGHWAY_DIR}/hwy/highway.h" ]; then + echo "ERROR: Google Highway submodule not found at ${HIGHWAY_DIR}." + echo " Run the following command from the repository root to fix this:" + echo "" + echo " git submodule update --init" + echo "" + exit 1 +fi + # Desired minimum GCC version MIN_GCC_VERSION=11 -if ! command -v gcc &> /dev/null; then +if ! command -v g++ &> /dev/null; then if [ "$AUTO_INSTALL_GCC" == "true" ] then LSB_RELEASE=$(lsb_release --id --short) printf "LSB_RELEASE=%s\n" "${LSB_RELEASE}" if [ "${LSB_RELEASE}" == "Ubuntu" ] - then sudo apt update && sudo apt install -y gcc - else printf "distribution %s needs a gcc install command in %s\n" "${LSB_RELEASE}" "${0}" ; exit 2 + then sudo apt update && sudo apt install -y g++ + else printf "distribution %s needs a g++ install command in %s\n" "${LSB_RELEASE}" "${0}" ; exit 2 fi else - echo "GCC is not installed. Please install GCC 11+ to build supporting native libraries." + echo "g++ is not installed. Please install g++ 11+ to build supporting native libraries." exit 2 fi fi -# Check if GCC is installed -CURRENT_GCC_VERSION=$(gcc -dumpversion) +# Check g++ version +CURRENT_GPP_VERSION=$(g++ -dumpversion) # Check if the current GCC version is greater than or equal to the minimum required version -if [ "$(printf '%s\n' "$MIN_GCC_VERSION" "$CURRENT_GCC_VERSION" | sort -V | head -n1)" = "$MIN_GCC_VERSION" ]; then - rm -rf ../resources/libjvector.so - gcc -fPIC -O3 -march=icelake-server -c jvector_simd.c -o jvector_simd.o - gcc -fPIC -O3 -march=x86-64 -c jvector_simd_check.c -o jvector_simd_check.o - gcc -shared -o ../resources/libjvector.so jvector_simd_check.o jvector_simd.o - - rm -rf jvector_common.o - rm -rf jvector_simd.o - rm -rf jvector_simd_check.o -else - echo "WARNING: GCC version $CURRENT_GCC_VERSION is too old. Please upgrade to GCC $MIN_GCC_VERSION or newer." +if [ "$(printf '%s\n' "$MIN_GCC_VERSION" "$CURRENT_GPP_VERSION" | sort -V | head -n1)" != "$MIN_GCC_VERSION" ]; then + echo "WARNING: g++ version $CURRENT_GPP_VERSION is too old. Please upgrade to g++ $MIN_GCC_VERSION or newer." + exit 1 +fi + +# Check meson and ninja are available +if ! command -v meson &> /dev/null; then + echo "meson is not installed. Please install it: pip install meson or sudo apt install meson" + exit 2 +fi +if ! command -v ninja &> /dev/null; then + echo "ninja is not installed. Please install it: sudo apt install ninja-build" + exit 2 +fi + +BUILD_DIR="build" +rm -rf ../resources/libjvector.so + +# Configure (--wipe resets any stale configuration) then compile +meson setup "${BUILD_DIR}" \ + --wipe \ + --buildtype=release + +meson compile -C "${BUILD_DIR}" + +# The versioned .so (e.g. libjvector.so.0.1.0) is the real file; symlinks point to it. +# Copy it to ../resources/ as the plain libjvector.so for Java System.load(). +SOFILE=$(find "${BUILD_DIR}" -maxdepth 1 -name 'libjvector.so.*' -type f | head -1) +if [ -z "${SOFILE}" ]; then + echo "ERROR: libjvector.so not found in ${BUILD_DIR} after build." + exit 1 fi +cp "${SOFILE}" ../resources/libjvector.so # Generate Java source code # Should only be run when c header changes @@ -77,4 +109,4 @@ jextract \ jvector_simd.h # Set critical linker option with heap-based segments for all generated methods -sed -i 's/DESC)/DESC, Linker.Option.critical(true))/g' ../java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java \ No newline at end of file +sed -i 's/DESC)/DESC, Linker.Option.critical(true))/g' ../java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java diff --git a/jvector-native/src/main/c/jvector_cpuFeatures.h b/jvector-native/src/main/c/jvector_cpuFeatures.h new file mode 100644 index 000000000..524a7cc30 --- /dev/null +++ b/jvector-native/src/main/c/jvector_cpuFeatures.h @@ -0,0 +1,173 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CPU_FEATURES_H +#define CPU_FEATURES_H + +#include +#include + +#if defined(_MSC_VER) +#include +#elif defined(__GNUC__) || defined(__clang__) +#include +#endif + +// Features needed by the ISA dispatch table. Extend as new targets are added. +// +// ICX = Intel Ice Lake-SP (Xeon Scalable 3rd Gen) +// SPR = Intel Sapphire Rapids (Xeon Scalable 4th Gen) +enum class CpuFeature : uint32_t { + // ---- Base AVX2 / AVX-512 foundation (all SKUs) ---------------------- + AVX2 = 0, + AVX512F = 1, + AVX512BW = 2, + AVX512CD = 3, + AVX512DQ = 4, + AVX512VL = 5, + // ---- ICX additions -------------------------------------------------- + AVX512_VNNI = 6, // INT8 dot-product (CPUID 7.0 ECX[11]) + AVX512_VBMI = 7, // byte permute/shuffle (CPUID 7.0 ECX[1]) + AVX512_VBMI2 = 8, // byte/word expand+compress (CPUID 7.0 ECX[6]) + AVX512_IFMA = 9, // 52-bit integer multiply-add (CPUID 7.0 EBX[21]) + AVX512_BITALG = 10, // bit-manipulation (CPUID 7.0 ECX[12]) + AVX512_VPOPCNTDQ = 11, // vector popcount dword/qword (CPUID 7.0 ECX[14]) + GFNI = 12, // Galois-field instructions (CPUID 7.0 ECX[8]) + VAES = 13, // 256/512-bit AES (CPUID 7.0 ECX[9]) + VPCLMULQDQ = 14, // wide carry-less multiply (CPUID 7.0 ECX[10]) + // ---- SPR additions -------------------------------------------------- + AVX512_FP16 = 15, // FP16 arithmetic (CPUID 7.0 EDX[23]) + AVX512_BF16 = 16, // BFloat16 arithmetic (CPUID 7.1 EAX[5]) + AVX_VNNI = 17, // VEX-encoded VNNI 256-bit (CPUID 7.1 EAX[4]) + COUNT +}; + +// Populate `features` by issuing CPUID and XGETBV. +// All entries are false on non-x86 architectures. +inline void +populate_cpu_features(std::array(CpuFeature::COUNT)> + &features) noexcept +{ + features.fill(false); + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \ + || defined(_M_X64) + + // Portable CPUID: GCC/Clang use ; MSVC uses . + auto run_cpuid = [](uint32_t leaf, + uint32_t subleaf, + uint32_t &eax, + uint32_t &ebx, + uint32_t &ecx, + uint32_t &edx) noexcept { +#if defined(_MSC_VER) + int info[4]; + __cpuidex(info, static_cast(leaf), static_cast(subleaf)); + eax = static_cast(info[0]); + ebx = static_cast(info[1]); + ecx = static_cast(info[2]); + edx = static_cast(info[3]); +#else + __cpuid_count(leaf, subleaf, eax, ebx, ecx, edx); +#endif + }; + + // Read XCR0. Must only be called when OSXSAVE (CPUID.1:ECX[27]) is set; + // otherwise executing XGETBV raises #UD. + auto read_xcr0 = []() noexcept -> uint64_t { +#if defined(_MSC_VER) + return static_cast(_xgetbv(0)); +#else + uint32_t lo, hi; + __asm__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0u)); + return (static_cast(hi) << 32) | lo; +#endif + }; + + uint32_t eax, ebx, ecx, edx; + + // CPUID leaf 1 — check OSXSAVE: OS enabled XSAVE/XRSTOR (ECX bit 27). + run_cpuid(1u, 0u, eax, ebx, ecx, edx); + const bool osxsave = (ecx >> 27) & 1u; + + // XCR0 encodes which register state the OS saves on context switch. + // Read it only if the CPU and OS have declared XSAVE support. + uint64_t xcr0 = 0u; + if (osxsave) { xcr0 = read_xcr0(); } + + // Bits 1‥2: XMM and YMM state — required for AVX / AVX2. + const bool ymm_enabled = (xcr0 & 0x06u) == 0x06u; + // Bits 5‥7: opmask, ZMM_Hi256, Hi16_ZMM — required for AVX-512. + const bool zmm_enabled = ymm_enabled && ((xcr0 & 0xe0u) == 0xe0u); + + // CPUID leaf 7, subleaf 0 — extended feature flags. + // EAX returns the maximum supported subleaf index for leaf 7. + run_cpuid(7u, 0u, eax, ebx, ecx, edx); + const uint32_t leaf7_max_subleaf = eax; + + // ---- Base AVX2 / AVX-512 foundation --------------------------------- + features[static_cast(CpuFeature::AVX2)] + = ymm_enabled && ((ebx >> 5) & 1u); + features[static_cast(CpuFeature::AVX512F)] + = zmm_enabled && ((ebx >> 16) & 1u); + features[static_cast(CpuFeature::AVX512BW)] + = zmm_enabled && ((ebx >> 30) & 1u); + features[static_cast(CpuFeature::AVX512CD)] + = zmm_enabled && ((ebx >> 28) & 1u); + features[static_cast(CpuFeature::AVX512DQ)] + = zmm_enabled && ((ebx >> 17) & 1u); + features[static_cast(CpuFeature::AVX512VL)] + = zmm_enabled && ((ebx >> 31) & 1u); + + // ---- ICX: leaf 7.0 EBX additions ------------------------------------ + features[static_cast(CpuFeature::AVX512_IFMA)] + = zmm_enabled && ((ebx >> 21) & 1u); + + // ---- ICX: leaf 7.0 ECX additions ------------------------------------ + features[static_cast(CpuFeature::AVX512_VBMI)] + = zmm_enabled && ((ecx >> 1) & 1u); + features[static_cast(CpuFeature::AVX512_VBMI2)] + = zmm_enabled && ((ecx >> 6) & 1u); + features[static_cast(CpuFeature::GFNI)] + = zmm_enabled && ((ecx >> 8) & 1u); + features[static_cast(CpuFeature::VAES)] + = zmm_enabled && ((ecx >> 9) & 1u); + features[static_cast(CpuFeature::VPCLMULQDQ)] + = zmm_enabled && ((ecx >> 10) & 1u); + features[static_cast(CpuFeature::AVX512_VNNI)] + = zmm_enabled && ((ecx >> 11) & 1u); + features[static_cast(CpuFeature::AVX512_BITALG)] + = zmm_enabled && ((ecx >> 12) & 1u); + features[static_cast(CpuFeature::AVX512_VPOPCNTDQ)] + = zmm_enabled && ((ecx >> 14) & 1u); + + // ---- SPR: leaf 7.0 EDX additions ------------------------------------ + features[static_cast(CpuFeature::AVX512_FP16)] + = zmm_enabled && ((edx >> 23) & 1u); + + // ---- SPR: leaf 7.1 EAX additions (guarded by max subleaf) ---------- + if (leaf7_max_subleaf >= 1u) { + run_cpuid(7u, 1u, eax, ebx, ecx, edx); + features[static_cast(CpuFeature::AVX_VNNI)] + = ymm_enabled && ((eax >> 4) & 1u); + features[static_cast(CpuFeature::AVX512_BF16)] + = zmm_enabled && ((eax >> 5) & 1u); + } + +#endif // x86 / x86_64 +} + +#endif // CPU_FEATURES_H diff --git a/jvector-native/src/main/c/jvector_simd.c b/jvector-native/src/main/c/jvector_simd.c deleted file mode 100644 index 6bfde3f9a..000000000 --- a/jvector-native/src/main/c/jvector_simd.c +++ /dev/null @@ -1,547 +0,0 @@ -/* - * Copyright DataStax, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include "jvector_simd.h" - -__m512i initialIndexRegister; -__m512i indexIncrement; -__m512i maskSeventhBit; -__m512i maskEighthBit; - -__attribute__((constructor)) -void initialize_constants() { - if (check_avx512_compatibility()) { - initialIndexRegister = _mm512_setr_epi32(-16, -15, -14, -13, -12, -11, -10, -9, - -8, -7, -6, -5, -4, -3, -2, -1); - indexIncrement = _mm512_set1_epi32(16); - maskSeventhBit = _mm512_set1_epi16(0x0040); - maskEighthBit = _mm512_set1_epi16(0x0080); - } -} - -float dot_product_f32_64(const float* a, int aoffset, const float* b, int boffset) { - - __m128 va = _mm_castsi128_ps(_mm_loadl_epi64((__m128i *)(a + aoffset))); - __m128 vb = _mm_castsi128_ps(_mm_loadl_epi64((__m128i *)(b + boffset))); - __m128 r = _mm_mul_ps(va, vb); // Perform element-wise multiplication - - // Horizontal sum of the vector to get dot product - __attribute__((aligned(16))) float result[4]; - _mm_store_ps(result, r); - return result[0] + result[1]; -} - -float dot_product_f32_128(const float* a, int aoffset, const float* b, int boffset, int length) { - float dot = 0.0; - int ao = aoffset; - int bo = boffset; - int alim = aoffset + length; - int blim = boffset + length; - int simd_length = length - (length % 4); - - if (length >= 4) { - __m128 sum = _mm_setzero_ps(); - - for(; ao < aoffset + simd_length; ao += 4, bo += 4) { - // Load float32 - __m128 va = _mm_loadu_ps(a + ao); - __m128 vb = _mm_loadu_ps(b + bo); - - // Multiply and accumulate - sum = _mm_fmadd_ps(va, vb, sum); - } - - // Horizontal sum of the vector to get dot product - __attribute__((aligned(16))) float result[4]; - _mm_store_ps(result, sum); - - for(int i = 0; i < 4; ++i) { - dot += result[i]; - } - } - - for (; ao < alim && bo < blim; ao++, bo++) { - dot += a[ao] * b[bo]; - } - - return dot; -} - -float dot_product_f32_256(const float* a, int aoffset, const float* b, int boffset, int length) { - float dot = 0.0; - int ao = aoffset; - int bo = boffset; - int alim = aoffset + length; - int blim = boffset + length; - int simd_length = length - (length % 8); - - if (length >= 8) { - __m256 sum = _mm256_setzero_ps(); - - for(; ao < aoffset + simd_length; ao += 8, bo += 8) { - // Load float32 - __m256 va = _mm256_loadu_ps(a + ao); - __m256 vb = _mm256_loadu_ps(b + bo); - - // Multiply and accumulate - sum = _mm256_fmadd_ps(va, vb, sum); - } - - // Horizontal sum of the vector to get dot product - __attribute__((aligned(32))) float result[8]; - _mm256_store_ps(result, sum); - - for(int i = 0; i < 8; ++i) { - dot += result[i]; - } - } - - for (; ao < alim && bo < blim; ao++, bo++) { - dot += a[ao] * b[bo]; - } - - return dot; -} - -float dot_product_f32_512(const float* a, int aoffset, const float* b, int boffset, int length) { - float dot = 0.0; - int ao = aoffset; - int bo = boffset; - int alim = aoffset + length; - int blim = boffset + length; - int simd_length = length - (length % 16); - - if (length >= 16) { - __m512 sum = _mm512_setzero_ps(); - for(; ao < aoffset + simd_length; ao += 16, bo += 16) { - // Load float32 - __m512 va = _mm512_loadu_ps(a + ao); - __m512 vb = _mm512_loadu_ps(b + bo); - - // Multiply and accumulate - sum = _mm512_fmadd_ps(va, vb, sum); - } - - // Horizontal sum of the vector to get dot product - dot = _mm512_reduce_add_ps(sum); - } - - for (; ao < alim && bo < blim; ao++, bo++) { - dot += a[ao] * b[bo]; - } - - return dot; -} - -float dot_product_f32(int preferred_size, const float* a, int aoffset, const float* b, int boffset, int length) { - if (length == 2) - return dot_product_f32_64(a, aoffset, b, boffset); - if (length <= 7) - return dot_product_f32_128(a, aoffset, b, boffset, length); - - return (preferred_size == 512 && length >= 16) - ? dot_product_f32_512(a, aoffset, b, boffset, length) - : dot_product_f32_256(a, aoffset, b, boffset, length); -} - -float euclidean_f32_64(const float* a, int aoffset, const float* b, int boffset) { - __m128 va = _mm_castsi128_ps(_mm_loadl_epi64((__m128i *)(a + aoffset))); - __m128 vb = _mm_castsi128_ps(_mm_loadl_epi64((__m128i *)(b + boffset))); - __m128 r = _mm_sub_ps(va, vb); - r = _mm_mul_ps(r, r); - - // Horizontal sum of the vector to get square distance - __attribute__((aligned(8))) float result[2]; - _mm_store_ps(result, r); - return result[0] + result[1]; -} - -float euclidean_f32_128(const float* a, int aoffset, const float* b, int boffset, int length) { - float squareDistance = 0.0; - int ao = aoffset; - int bo = boffset; - int alim = aoffset + length; - int blim = boffset + length; - int simd_length = length - (length % 4); - - if (length >= 4) { - __m128 sum = _mm_setzero_ps(); - - for(; ao < aoffset + simd_length; ao += 4, bo += 4) { - // Load float32 - __m128 va = _mm_loadu_ps(a + ao); - __m128 vb = _mm_loadu_ps(b + bo); - __m128 diff = _mm_sub_ps(va, vb); - // Multiply and accumulate - sum = _mm_fmadd_ps(diff, diff, sum); - } - - // Horizontal sum of the vector to get dot product - __attribute__((aligned(16))) float result[4]; - _mm_store_ps(result, sum); - - for(int i = 0; i < 4; ++i) { - squareDistance += result[i]; - } - } - - for (; ao < alim && bo < blim; ao++, bo++) { - float diff = a[ao] - b[bo]; - squareDistance += diff * diff; - } - - return squareDistance; -} - -float euclidean_f32_256(const float* a, int aoffset, const float* b, int boffset, int length) { - float squareDistance = 0.0; - int ao = aoffset; - int bo = boffset; - int alim = aoffset + length; - int blim = boffset + length; - int simd_length = length - (length % 8); - - if (length >= 8) { - __m256 sum = _mm256_setzero_ps(); - - for(; ao < aoffset + simd_length; ao += 8, bo += 8) { - // Load float32 - __m256 va = _mm256_loadu_ps(a + ao); - __m256 vb = _mm256_loadu_ps(b + bo); - __m256 diff = _mm256_sub_ps(va, vb); - - // Multiply and accumulate - sum = _mm256_fmadd_ps(diff, diff, sum); - } - - __attribute__((aligned(32))) float result[8]; - _mm256_store_ps(result, sum); - - for(int i = 0; i < 8; ++i) { - squareDistance += result[i]; - } - } - - for (; ao < alim && bo < blim; ao++, bo++) { - float diff = a[ao] - b[bo]; - squareDistance += diff * diff; - } - - return squareDistance; -} - -float euclidean_f32_512(const float* a, int aoffset, const float* b, int boffset, int length) { - float squareDistance = 0.0; - int ao = aoffset; - int bo = boffset; - int alim = aoffset + length; - int blim = boffset + length; - int simd_length = length - (length % 16); - - if (length >= 16) { - __m512 sum = _mm512_setzero_ps(); - for(; ao < aoffset + simd_length; ao += 16, bo += 16) { - // Load float32 - __m512 va = _mm512_loadu_ps(a + ao); - __m512 vb = _mm512_loadu_ps(b + bo); - __m512 diff = _mm512_sub_ps(va, vb); - - // Multiply and accumulate - sum = _mm512_fmadd_ps(diff, diff, sum); - } - - // Horizontal sum of the vector to get dot product - squareDistance = _mm512_reduce_add_ps(sum); - } - - for (; ao < alim && bo < blim; ao++, bo++) { - float diff = a[ao] - b[bo]; - squareDistance += diff * diff; - } - - return squareDistance; -} - -float euclidean_f32(int preferred_size, const float* a, int aoffset, const float* b, int boffset, int length) { - if (length == 2) - return euclidean_f32_64(a, aoffset, b, boffset); - if (length <= 7) - return euclidean_f32_128(a, aoffset, b, boffset, length); - - return (preferred_size == 512 && length >= 16) - ? euclidean_f32_512(a, aoffset, b, boffset, length) - : euclidean_f32_256(a, aoffset, b, boffset, length); -} - -float assemble_and_sum_f32_512(const float* data, int dataBase, const unsigned char* baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) { - __m512 sum = _mm512_setzero_ps(); - int i = 0; - int limit = baseOffsetsLength - (baseOffsetsLength % 16); - __m512i indexRegister = initialIndexRegister; - __m512i dataBaseVec = _mm512_set1_epi32(dataBase); - baseOffsets = baseOffsets + baseOffsetsOffset; - - for (; i < limit; i += 16) { - __m128i baseOffsetsRaw = _mm_loadu_si128((__m128i *)(baseOffsets + i)); - __m512i baseOffsetsInt = _mm512_cvtepu8_epi32(baseOffsetsRaw); - // we have base offsets int, which we need to scale to index into data. - // first, we want to initialize a vector with the lane number added as an index - indexRegister = _mm512_add_epi32(indexRegister, indexIncrement); - // then we want to multiply by dataBase - __m512i scale = _mm512_mullo_epi32(indexRegister, dataBaseVec); - // then we want to add the base offsets - __m512i convOffsets = _mm512_add_epi32(scale, baseOffsetsInt); - - __m512 partials = _mm512_i32gather_ps(convOffsets, data, 4); - sum = _mm512_add_ps(sum, partials); - } - - float res = _mm512_reduce_add_ps(sum); - for (; i < baseOffsetsLength; i++) { - res += data[dataBase * i + baseOffsets[i]]; - } - - return res; -} - -float pq_decoded_cosine_similarity_f32_512(const unsigned char* baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float* partialSums, const float* aMagnitude, float bMagnitude) { - __m512 sum = _mm512_setzero_ps(); - __m512 vaMagnitude = _mm512_setzero_ps(); - int i = 0; - int limit = baseOffsetsLength - (baseOffsetsLength % 16); - __m512i indexRegister = initialIndexRegister; - __m512i scale = _mm512_set1_epi32(clusterCount); - baseOffsets = baseOffsets + baseOffsetsOffset; - - - for (; i < limit; i += 16) { - // Load and convert baseOffsets to integers - __m128i baseOffsetsRaw = _mm_loadu_si128((__m128i *)(baseOffsets + i)); - __m512i baseOffsetsInt = _mm512_cvtepu8_epi32(baseOffsetsRaw); - - indexRegister = _mm512_add_epi32(indexRegister, indexIncrement); - // Scale the baseOffsets by the cluster count - __m512i scaledOffsets = _mm512_mullo_epi32(indexRegister, scale); - - // Calculate the final convOffsets by adding the scaled indexes and the base offsets - __m512i convOffsets = _mm512_add_epi32(scaledOffsets, baseOffsetsInt); - - // Gather and sum values for partial sums and a magnitude - __m512 partialSumVals = _mm512_i32gather_ps(convOffsets, partialSums, 4); - sum = _mm512_add_ps(sum, partialSumVals); - - __m512 aMagnitudeVals = _mm512_i32gather_ps(convOffsets, aMagnitude, 4); - vaMagnitude = _mm512_add_ps(vaMagnitude, aMagnitudeVals); - } - - // Reduce sums - float sumResult = _mm512_reduce_add_ps(sum); - float aMagnitudeResult = _mm512_reduce_add_ps(vaMagnitude); - - // Handle the remaining elements - for (; i < baseOffsetsLength; i++) { - int offset = clusterCount * i + baseOffsets[i]; - sumResult += partialSums[offset]; - aMagnitudeResult += aMagnitude[offset]; - } - - return sumResult / sqrtf(aMagnitudeResult * bMagnitude); -} - -void calculate_partial_sums_dot_f32_512(const float* codebook, int codebookIndex, int size, int clusterCount, const float* query, int queryOffset, float* partialSums) { - int codebookBase = codebookIndex * clusterCount; - for (int i = 0; i < clusterCount; i++) { - partialSums[codebookBase + i] = dot_product_f32(512, codebook, i * size, query, queryOffset, size); - } -} - -void calculate_partial_sums_euclidean_f32_512(const float* codebook, int codebookIndex, int size, int clusterCount, const float* query, int queryOffset, float* partialSums) { - int codebookBase = codebookIndex * clusterCount; - for (int i = 0; i < clusterCount; i++) { - partialSums[codebookBase + i] = euclidean_f32(512, codebook, i * size, query, queryOffset, size); - } -} - -/* Bulk shuffles for Fused ADC - * These shuffles take an array of transposed PQ neighbors (in shuffles) and an of quantized partial distances to shuffle. - * Partial distance quantization depends on the best distance and delta used to quantize. - * The shuffles for each codebook will be loaded as bytes (supporting up to 256 cluster PQ) and zero-padded to align - * with 16-bit quantized partial distances. These partial distances will be loaded into SIMD registers, supporting 32 partials - * per register. Each permutation will take 2 registers, so we need four total permutations to look up against all - * 256 partial distances. These four permutations will be blended based on the top two bits of each shuffle, allowing 256 - * entry codebook lookup. Quantized partials are quantized based on bounds provided during the search that suggest total - * distances above the maximum value of an unsigned 16-bit integer will be irrelevant. This allows us to use saturating - * arithmetic, eliminating the need to widen lanes during accumulation. The total quantized distance is then de-quantized - * and transformed into the appropriate similarity score. - * - * In the case of cosine, we have an additional set of partials used for partial squared magnitudes. These are quantized \ - * with a different pair of delta/base, so they will be aggregated and dequantized separately. - */ - - -__attribute__((always_inline)) inline __m512i lookup_partial_sums(__m512i shuffle, const char* quantizedPartials, int i) { - __m512i partialsVecA = _mm512_loadu_epi16(quantizedPartials + i * 512); - __m512i partialsVecB = _mm512_loadu_epi16(quantizedPartials + i * 512 + 64); - __m512i partialsVecC = _mm512_loadu_epi16(quantizedPartials + i * 512 + 128); - __m512i partialsVecD = _mm512_loadu_epi16(quantizedPartials + i * 512 + 192); - __m512i partialsVecE = _mm512_loadu_epi16(quantizedPartials + i * 512 + 256); - __m512i partialsVecF = _mm512_loadu_epi16(quantizedPartials + i * 512 + 320); - __m512i partialsVecG = _mm512_loadu_epi16(quantizedPartials + i * 512 + 384); - __m512i partialsVecH = _mm512_loadu_epi16(quantizedPartials + i * 512 + 448); - - __m512i partialsVecAB = _mm512_permutex2var_epi16(partialsVecA, shuffle, partialsVecB); - __m512i partialsVecCD = _mm512_permutex2var_epi16(partialsVecC, shuffle, partialsVecD); - __m512i partialsVecEF = _mm512_permutex2var_epi16(partialsVecE, shuffle, partialsVecF); - __m512i partialsVecGH = _mm512_permutex2var_epi16(partialsVecG, shuffle, partialsVecH); - - __mmask32 maskSeven = _mm512_test_epi16_mask(shuffle, maskSeventhBit); - __mmask32 maskEight = _mm512_test_epi16_mask(shuffle, maskEighthBit); - __m512i partialsVecABCD = _mm512_mask_blend_epi16(maskSeven, partialsVecAB, partialsVecCD); - __m512i partialsVecEFGH = _mm512_mask_blend_epi16(maskSeven, partialsVecEF, partialsVecGH); - __m512i partialSumsVec = _mm512_mask_blend_epi16(maskEight, partialsVecABCD, partialsVecEFGH); - - return partialSumsVec; -} - -// dequantize a 256-bit vector containing 16 unsigned 16-bit integers into a 512-bit vector containing 16 32-bit floats -__attribute__((always_inline)) inline __m512 dequantize(__m256i quantizedVec, float delta, float base) { - __m512i quantizedVecWidened = _mm512_cvtepu16_epi32(quantizedVec); - __m512 floatVec = _mm512_cvtepi32_ps(quantizedVecWidened); - __m512 deltaVec = _mm512_set1_ps(delta); - __m512 baseVec = _mm512_set1_ps(base); - __m512 dequantizedVec = _mm512_fmadd_ps(floatVec, deltaVec, baseVec); - return dequantizedVec; -} - -void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartials, float delta, float minDistance, float* results) { - __m512i sum = _mm512_setzero_epi32(); - - for (int i = 0; i < codebookCount; i++) { - __m256i smallShuffle = _mm256_loadu_epi8(shuffles + i * 32); - __m512i shuffle = _mm512_cvtepu8_epi16(smallShuffle); - __m512i partialsVec = lookup_partial_sums(shuffle, quantizedPartials, i); - - sum = _mm512_adds_epu16(sum, partialsVec); - } - - __m256i quantizedResultsLeftRaw = _mm512_extracti32x8_epi32(sum, 0); - __m256i quantizedResultsRightRaw = _mm512_extracti32x8_epi32(sum, 1); - __m512 resultsLeft = dequantize(quantizedResultsLeftRaw, delta, minDistance); - __m512 resultsRight = dequantize(quantizedResultsRightRaw, delta, minDistance); - - __m512 ones = _mm512_set1_ps(1.0); - resultsLeft = _mm512_add_ps(resultsLeft, ones); - resultsRight = _mm512_add_ps(resultsRight, ones); - resultsLeft = _mm512_rcp14_ps(resultsLeft); - resultsRight = _mm512_rcp14_ps(resultsRight); - _mm512_storeu_ps(results, resultsLeft); - _mm512_storeu_ps(results + 16, resultsRight); -} - -void bulk_quantized_shuffle_dot_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartials, float delta, float best, float* results) { - __m512i sum = _mm512_setzero_epi32(); - - for (int i = 0; i < codebookCount; i++) { - __m256i smallShuffle = _mm256_loadu_epi8(shuffles + i * 32); - __m512i shuffle = _mm512_cvtepu8_epi16(smallShuffle); - __m512i partialsVec = lookup_partial_sums(shuffle, quantizedPartials, i); - sum = _mm512_adds_epu16(sum, partialsVec); - } - - __m256i quantizedResultsLeftRaw = _mm512_extracti32x8_epi32(sum, 0); - __m256i quantizedResultsRightRaw = _mm512_extracti32x8_epi32(sum, 1); - __m512 resultsLeft = dequantize(quantizedResultsLeftRaw, delta, best); - __m512 resultsRight = dequantize(quantizedResultsRightRaw, delta, best); - - __m512 ones = _mm512_set1_ps(1.0); - resultsLeft = _mm512_add_ps(resultsLeft, ones); - resultsRight = _mm512_add_ps(resultsRight, ones); - resultsLeft = _mm512_div_ps(resultsLeft, _mm512_set1_ps(2.0)); - resultsRight = _mm512_div_ps(resultsRight, _mm512_set1_ps(2.0)); - _mm512_storeu_ps(results, resultsLeft); - _mm512_storeu_ps(results + 16, resultsRight); -} - -void bulk_quantized_shuffle_cosine_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartialSums, float sumDelta, float minDistance, const char* quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float* results) { - __m512i sum = _mm512_setzero_epi32(); - __m512i magnitude = _mm512_setzero_epi32(); - - for (int i = 0; i < codebookCount; i++) { - __m256i smallShuffle = _mm256_loadu_epi8((shuffles + i * 32)); - __m512i shuffle = _mm512_cvtepu8_epi16(smallShuffle); - __m512i partialSumsVec = lookup_partial_sums(shuffle, quantizedPartialSums, i); - sum = _mm512_adds_epu16(sum, partialSumsVec); - - __m512i partialMagnitudesVec = lookup_partial_sums(shuffle, quantizedPartialMagnitudes, i); - magnitude = _mm512_adds_epu16(magnitude, partialMagnitudesVec); - } - - __m256i quantizedSumsLeftRaw = _mm512_extracti32x8_epi32(sum, 0); - __m256i quantizedSumsRightRaw = _mm512_extracti32x8_epi32(sum, 1); - __m512 sumsLeft = dequantize(quantizedSumsLeftRaw, sumDelta, minDistance); - __m512 sumsRight = dequantize(quantizedSumsRightRaw, sumDelta, minDistance); - - __m256i quantizedMagnitudesLeftRaw = _mm512_extracti32x8_epi32(magnitude, 0); - __m256i quantizedMagnitudesRightRaw = _mm512_extracti32x8_epi32(magnitude, 1); - __m512 magnitudesLeft = dequantize(quantizedMagnitudesLeftRaw, magnitudeDelta, minMagnitude); - __m512 magnitudesRight = dequantize(quantizedMagnitudesRightRaw, magnitudeDelta, minMagnitude); - - __m512 queryMagnitudeSquaredVec = _mm512_set1_ps(queryMagnitudeSquared); - magnitudesLeft = _mm512_mul_ps(magnitudesLeft, queryMagnitudeSquaredVec); - magnitudesRight = _mm512_mul_ps(magnitudesRight, queryMagnitudeSquaredVec); - magnitudesLeft = _mm512_sqrt_ps(magnitudesLeft); - magnitudesRight = _mm512_sqrt_ps(magnitudesRight); - __m512 resultsLeft = _mm512_div_ps(sumsLeft, magnitudesLeft); - __m512 resultsRight = _mm512_div_ps(sumsRight, magnitudesRight); - - __m512 ones = _mm512_set1_ps(1.0); - resultsLeft = _mm512_add_ps(resultsLeft, ones); - resultsRight = _mm512_add_ps(resultsRight, ones); - resultsLeft = _mm512_div_ps(resultsLeft, _mm512_set1_ps(2.0)); - resultsRight = _mm512_div_ps(resultsRight, _mm512_set1_ps(2.0)); - _mm512_storeu_ps(results, resultsLeft); - _mm512_storeu_ps(results + 16, resultsRight); -} - -// Partial sum calculations that also record best distances, as this is necessary for Fused ADC quantization -void calculate_partial_sums_best_dot_f32_512(const float* codebook, int codebookIndex, int size, int clusterCount, const float* query, int queryOffset, float* partialSums, float* partialBestDistances) { - float best = -INFINITY; - int codebookBase = codebookIndex * clusterCount; - for (int i = 0; i < clusterCount; i++) { - float val = dot_product_f32(512, codebook, i * size, query, queryOffset, size); - partialSums[codebookBase + i] = val; - if (val > best) { - best = val; - } - } - partialBestDistances[codebookIndex] = best; -} - -void calculate_partial_sums_best_euclidean_f32_512(const float* codebook, int codebookIndex, int size, int clusterCount, const float* query, int queryOffset, float* partialSums, float* partialBestDistances) { - float best = INFINITY; - int codebookBase = codebookIndex * clusterCount; - for (int i = 0; i < clusterCount; i++) { - float val = euclidean_f32(512, codebook, i * size, query, queryOffset, size); - partialSums[codebookBase + i] = val; - if (val < best) { - best = val; - } - } - partialBestDistances[codebookIndex] = best; -} \ No newline at end of file diff --git a/jvector-native/src/main/c/jvector_simd.cpp b/jvector-native/src/main/c/jvector_simd.cpp new file mode 100644 index 000000000..69bc5977a --- /dev/null +++ b/jvector-native/src/main/c/jvector_simd.cpp @@ -0,0 +1,361 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Runtime SIMD dispatch: selects AVX3 (AVX-512), AVX2, or SSE42 at startup. +// SSE42 is the baseline and is assumed always available without a CPUID check. +// AVX2 and AVX3 are probed via CPUID. Function pointers are resolved once at +// static-init time; each public call is a single indirect branch. +#include "jvector_simd.h" +#include "jvector_simd_kernels.h" // AVX3::, AVX2::, SSE42:: kernel declarations +#include "jvector_cpuFeatures.h" // populate_cpu_features(), CpuFeature enum + +#include +#include // std::getenv +#include // std::strcmp + +// Everything in this anonymous namespace is translation-unit private; none of +// these symbols are exported from the shared library. +namespace { + +// Enumerates the ISA tiers in ascending capability order so that numeric +// comparisons (e.g. max_isa != MaxIsa::AVX2) correctly gate higher tiers. +// Unset (-1) means "no override; use best available CPU capability". +enum class MaxIsa { Unset = -1, SSE42 = 0, AVX2 = 1, AVX3 = 2 }; + +// Reads the JVECTOR_MAX_ISA environment variable and maps it to a MaxIsa +// value. This lets callers cap the ISA at runtime without recompiling — +// useful for benchmarking or working around CPU errata. +// Accepted values (case-sensitive): "avx3", "avx2", "sse42". +static MaxIsa read_max_isa() noexcept +{ + const char *val = std::getenv("JVECTOR_MAX_ISA"); + if (!val) return MaxIsa::Unset; + if (std::strcmp(val, "avx3") == 0) return MaxIsa::AVX3; + if (std::strcmp(val, "avx2") == 0) return MaxIsa::AVX2; + if (std::strcmp(val, "sse42") == 0) return MaxIsa::SSE42; + return MaxIsa::Unset; // unrecognised value: ignore and use CPU detection +} + +// KernelVTable holds one function pointer per public kernel. Storing them +// together in a struct means a single pointer comparison during dispatch_kernels() +// selects all kernels for the chosen ISA in one shot. +struct KernelVTable { + /* Vector similarity */ + float (*cosine_f32)(const float *, size_t, const float *, size_t, size_t); + float (*dot_product_f32)(const float *, size_t, const float *, size_t, size_t); + float (*euclidean_f32)(const float *, size_t, const float *, size_t, size_t); + /* Element-wise in-place arithmetic */ + void (*add_in_place_f32)(float *, const float *, size_t); + void (*add_scalar_in_place_f32)(float *, float, size_t); + void (*sub_in_place_f32)(float *, const float *, size_t); + void (*sub_scalar_in_place_f32)(float *, float, size_t); + float (*max_f32)(const float *, size_t); + void (*min_in_place_f32)(float *, const float *, size_t); + /* PQ kernels */ + float (*assemble_and_sum_f32)(const float *, int, + const unsigned char *, int, size_t); + float (*assemble_and_sum_pq_f32)(const float *, size_t, + const unsigned char *, int, + const unsigned char *, int, int); + float (*pq_decoded_cosine_similarity_f32)(const unsigned char *, int, + size_t, int, + const float *, const float *, + float); + void (*calculate_partial_sums_dot_f32)(const float *, int, + size_t, int, + const float *, int, float *); + void (*calculate_partial_sums_euclidean_f32)(const float *, int, + size_t, int, + const float *, int, float *); + void (*calculate_partial_sums_self_magnitude_f32)(const float *, int, + size_t, int, float *); + /* NVQ kernels */ + void (*nvq_quantize_8bit)(const float *, size_t, + float, float, float, float, + unsigned char *); + float (*nvq_loss)(const float *, size_t, + float, float, float, float, int); + float (*nvq_uniform_loss)(const float *, size_t, float, float, int); + float (*nvq_square_l2_distance_8bit)(const float *, + const unsigned char *, size_t, + float, float, float, float); + float (*nvq_dot_product_8bit)(const float *, + const unsigned char *, size_t, + float, float, float, float); + int64_t (*nvq_cosine_8bit_packed)(const float *, + const unsigned char *, size_t, + float, float, float, float, + const float *); + void (*nvq_shuffle_query_in_place_8bit)(float *, size_t); +}; + +// One pre-filled vtable per ISA. These are constant data; no heap allocation. +#define DEFINE_ISA_VTABLE(ISA) \ + static const KernelVTable ISA##_vtable = { \ + ISA::cosine_f32, \ + ISA::dot_product_f32, \ + ISA::euclidean_f32, \ + ISA::add_in_place_f32, \ + ISA::add_scalar_in_place_f32, \ + ISA::sub_in_place_f32, \ + ISA::sub_scalar_in_place_f32, \ + ISA::max_f32, \ + ISA::min_in_place_f32, \ + ISA::assemble_and_sum_f32, \ + ISA::assemble_and_sum_pq_f32, \ + ISA::pq_decoded_cosine_similarity_f32, \ + ISA::calculate_partial_sums_dot_f32, \ + ISA::calculate_partial_sums_euclidean_f32, \ + ISA::calculate_partial_sums_self_magnitude_f32, \ + ISA::nvq_quantize_8bit, \ + ISA::nvq_loss, \ + ISA::nvq_uniform_loss, \ + ISA::nvq_square_l2_distance_8bit, \ + ISA::nvq_dot_product_8bit, \ + ISA::nvq_cosine_8bit_packed, \ + ISA::nvq_shuffle_query_in_place_8bit, \ + } + +DEFINE_ISA_VTABLE(AVX3); +DEFINE_ISA_VTABLE(AVX2); +DEFINE_ISA_VTABLE(SSE42); + +// Selects and returns the best vtable for the current CPU and environment. +// Called exactly once during static initialisation (before main()). +static KernelVTable dispatch_kernels() noexcept +{ + // Check whether the caller has capped the ISA via the environment variable. + const MaxIsa max_isa = read_max_isa(); + + // Populate a boolean feature array by issuing CPUID and reading XCR0. + std::array(CpuFeature::COUNT)> features; + populate_cpu_features(features); + + auto has = [&](CpuFeature f) noexcept { + return features[static_cast(f)]; + }; + + // AVX3 tier requires the full Skylake-AVX512 (SKX) baseline: + // AVX512F (foundation) + BW (byte/word) + CD (conflict detect) + // + DQ (dword/qword) + VL (vector length extensions). + if (max_isa != MaxIsa::SSE42 && max_isa != MaxIsa::AVX2 + && has(CpuFeature::AVX512F) && has(CpuFeature::AVX512BW) + && has(CpuFeature::AVX512CD) && has(CpuFeature::AVX512DQ) + && has(CpuFeature::AVX512VL)) { + return AVX3_vtable; + } + // AVX2 tier: 256-bit integer/FP SIMD, available on Haswell and later. + if (max_isa != MaxIsa::SSE42 && has(CpuFeature::AVX2)) { + return AVX2_vtable; + } + // SSE42 is the baseline — assumed always present, no CPUID check needed. + return SSE42_vtable; +} + +// 'kernels' is initialised once at static-init time to the vtable chosen by +// dispatch_kernels(). After that every public API call goes through one +// indirect branch to the right ISA implementation — no runtime comparisons. +static const KernelVTable kernels = dispatch_kernels(); + +} // namespace + +// ---- Public API ------------------------------------------------------------ +// Each function is a thin wrapper that forwards to the pre-resolved function +// pointer in `kernels`. + +/* Vector similarity */ + +float cosine_f32(const float *a, size_t aoffset, + const float *b, size_t boffset, size_t length) +{ + return kernels.cosine_f32(a, aoffset, b, boffset, length); +} + +float dot_product_f32(const float *a, size_t aoffset, + const float *b, size_t boffset, size_t length) +{ + return kernels.dot_product_f32(a, aoffset, b, boffset, length); +} + +float euclidean_f32(const float *a, size_t aoffset, + const float *b, size_t boffset, size_t length) +{ + return kernels.euclidean_f32(a, aoffset, b, boffset, length); +} + +/* Element-wise in-place arithmetic */ + +void add_in_place_f32(float *v1, const float *v2, size_t length) +{ + kernels.add_in_place_f32(v1, v2, length); +} + +void add_scalar_in_place_f32(float *v1, float value, size_t length) +{ + kernels.add_scalar_in_place_f32(v1, value, length); +} + +void sub_in_place_f32(float *v1, const float *v2, size_t length) +{ + kernels.sub_in_place_f32(v1, v2, length); +} + +void sub_scalar_in_place_f32(float *v1, float value, size_t length) +{ + kernels.sub_scalar_in_place_f32(v1, value, length); +} + +float max_f32(const float *v, size_t length) +{ + return kernels.max_f32(v, length); +} + +void min_in_place_f32(float *v1, const float *v2, size_t length) +{ + kernels.min_in_place_f32(v1, v2, length); +} + +/* PQ kernels */ + +float assemble_and_sum_f32(const float *data, int dataBase, + const unsigned char *baseOffsets, + int baseOffsetsOffset, size_t baseOffsetsLength) +{ + return kernels.assemble_and_sum_f32( + data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength); +} + +float assemble_and_sum_pq_f32(const float *data, size_t subspaceCount, + const unsigned char *baseOffsets1, + int baseOffsetsOffset1, + const unsigned char *baseOffsets2, + int baseOffsetsOffset2, int clusterCount) +{ + return kernels.assemble_and_sum_pq_f32(data, subspaceCount, + baseOffsets1, baseOffsetsOffset1, + baseOffsets2, baseOffsetsOffset2, + clusterCount); +} + +float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, + int baseOffsetsOffset, + size_t baseOffsetsLength, + int clusterCount, + const float *partialSums, + const float *aMagnitude, + float bMagnitude) +{ + return kernels.pq_decoded_cosine_similarity_f32(baseOffsets, + baseOffsetsOffset, + baseOffsetsLength, + clusterCount, + partialSums, + aMagnitude, + bMagnitude); +} + +void calculate_partial_sums_dot_f32(const float *codebook, int codebookIndex, + size_t size, int clusterCount, + const float *query, int queryOffset, + float *partialSums) +{ + kernels.calculate_partial_sums_dot_f32(codebook, codebookIndex, + size, clusterCount, + query, queryOffset, partialSums); +} + +void calculate_partial_sums_euclidean_f32(const float *codebook, + int codebookIndex, + size_t size, int clusterCount, + const float *query, int queryOffset, + float *partialSums) +{ + kernels.calculate_partial_sums_euclidean_f32(codebook, codebookIndex, + size, clusterCount, + query, queryOffset, + partialSums); +} + +void calculate_partial_sums_self_magnitude_f32(const float *codebook, + int codebookIndex, + size_t size, int clusterCount, + float *partialSums) +{ + kernels.calculate_partial_sums_self_magnitude_f32(codebook, codebookIndex, + size, clusterCount, + partialSums); +} + +/* NVQ kernels */ + +void nvq_quantize_8bit(const float *vector, size_t length, + float alpha, float x0, + float minValue, float maxValue, + unsigned char *destination) +{ + kernels.nvq_quantize_8bit(vector, length, alpha, x0, + minValue, maxValue, destination); +} + +float nvq_loss(const float *vector, size_t length, + float alpha, float x0, + float minValue, float maxValue, int nBits) +{ + return kernels.nvq_loss(vector, length, alpha, x0, minValue, maxValue, nBits); +} + +float nvq_uniform_loss(const float *vector, size_t length, + float minValue, float maxValue, int nBits) +{ + return kernels.nvq_uniform_loss(vector, length, minValue, maxValue, nBits); +} + +float nvq_square_l2_distance_8bit(const float *vector, + const unsigned char *quantized, + size_t length, + float alpha, float x0, + float minValue, float maxValue) +{ + return kernels.nvq_square_l2_distance_8bit(vector, quantized, length, + alpha, x0, minValue, maxValue); +} + +float nvq_dot_product_8bit(const float *vector, + const unsigned char *quantized, + size_t length, + float alpha, float x0, + float minValue, float maxValue) +{ + return kernels.nvq_dot_product_8bit(vector, quantized, length, + alpha, x0, minValue, maxValue); +} + +int64_t nvq_cosine_8bit_packed(const float *vector, + const unsigned char *quantized, + size_t length, + float alpha, float x0, + float minValue, float maxValue, + const float *centroid) +{ + return kernels.nvq_cosine_8bit_packed(vector, quantized, length, + alpha, x0, minValue, maxValue, + centroid); +} + +void nvq_shuffle_query_in_place_8bit(float *vector, size_t length) +{ + kernels.nvq_shuffle_query_in_place_8bit(vector, length); +} diff --git a/jvector-native/src/main/c/jvector_simd.h b/jvector-native/src/main/c/jvector_simd.h index 39da01316..5000059f7 100644 --- a/jvector-native/src/main/c/jvector_simd.h +++ b/jvector-native/src/main/c/jvector_simd.h @@ -15,23 +15,50 @@ */ #include +#include +#include #ifndef VECTOR_SIMD_DOT_H #define VECTOR_SIMD_DOT_H -// check CPU support -bool check_avx512_compatibility(void); +// Mark a symbol as part of the public ABI even when the library is built +// with -fvisibility=hidden. +#define JVECTOR_SIMD_API __attribute__((visibility("default"))) -//F32 -float dot_product_f32(int preferred_size, const float* a, int aoffset, const float* b, int boffset, int length); -float euclidean_f32(int preferred_size, const float* a, int aoffset, const float* b, int boffset, int length); -void bulk_quantized_shuffle_dot_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartials, float delta, float minDistance, float* results); -void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartials, float delta, float minDistance, float* results); -void bulk_quantized_shuffle_cosine_f32_512(const unsigned char* shuffles, int codebookCount, const char* quantizedPartialSums, float sumDelta, float minDistance, const char* quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float* results); -float assemble_and_sum_f32_512(const float* data, int dataBase, const unsigned char* baseOffsets, int baseOffsetsOffset, int baseOffsetsLength); -float pq_decoded_cosine_similarity_f32_512(const unsigned char* baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float* partialSums, const float* aMagnitude, float bMagnitude); -void calculate_partial_sums_dot_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums); -void calculate_partial_sums_euclidean_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums); -void calculate_partial_sums_best_dot_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums, float* partialBestDistances); -void calculate_partial_sums_best_euclidean_f32_512(const float* codebook, int codebookBase, int size, int clusterCount, const float* query, int queryOffset, float* partialSums, float* partialBestDistances); -#endif \ No newline at end of file +// APIs exposed to Java via FFI +#ifdef __cplusplus +extern "C" { +#endif +/* PQ kernels */ +JVECTOR_SIMD_API float assemble_and_sum_f32(const float* data, int dataBase, const unsigned char* baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength); +JVECTOR_SIMD_API float assemble_and_sum_pq_f32(const float* data, size_t subspaceCount, const unsigned char* baseOffsets1, int baseOffsetsOffset1, const unsigned char* baseOffsets2, int baseOffsetsOffset2, int clusterCount); +JVECTOR_SIMD_API float pq_decoded_cosine_similarity_f32(const unsigned char* baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength, int clusterCount, const float* partialSums, const float* aMagnitude, float bMagnitude); +JVECTOR_SIMD_API void calculate_partial_sums_euclidean_f32(const float* codebook, int codebookBase, size_t size, int clusterCount, const float* query, int queryOffset, float* partialSums); +JVECTOR_SIMD_API void calculate_partial_sums_dot_f32(const float* codebook, int codebookBase, size_t size, int clusterCount, const float* query, int queryOffset, float* partialSums); +JVECTOR_SIMD_API void calculate_partial_sums_self_magnitude_f32(const float* codebook, int codebookBase, size_t size, int clusterCount, float* partialSums); + +/* Vector similarity kernels */ +JVECTOR_SIMD_API float dot_product_f32(const float* a, size_t aoffset, const float* b, size_t boffset, size_t length); +JVECTOR_SIMD_API float cosine_f32(const float* a, size_t aoffset, const float* b, size_t boffset, size_t length); +JVECTOR_SIMD_API float euclidean_f32(const float* a, size_t aoffset, const float* b, size_t boffset, size_t length); + +/* NVQ kernels */ +JVECTOR_SIMD_API void nvq_quantize_8bit(const float* vector, size_t length, float alpha, float x0, float minValue, float maxValue, unsigned char* destination); +JVECTOR_SIMD_API float nvq_loss(const float* vector, size_t length, float alpha, float x0, float minValue, float maxValue, int nBits); +JVECTOR_SIMD_API float nvq_uniform_loss(const float* vector, size_t length, float minValue, float maxValue, int nBits); +JVECTOR_SIMD_API float nvq_square_l2_distance_8bit(const float* vector, const unsigned char* quantized, size_t length, float alpha, float x0, float minValue, float maxValue); +JVECTOR_SIMD_API float nvq_dot_product_8bit(const float* vector, const unsigned char* quantized, size_t length, float alpha, float x0, float minValue, float maxValue); +JVECTOR_SIMD_API int64_t nvq_cosine_8bit_packed(const float* vector, const unsigned char* quantized, size_t length, float alpha, float x0, float minValue, float maxValue, const float* centroid); +JVECTOR_SIMD_API void nvq_shuffle_query_in_place_8bit(float* vector, size_t length); + +/* Element-wise in-place vector arithmetic */ +JVECTOR_SIMD_API void add_in_place_f32(float* v1, const float* v2, size_t length); +JVECTOR_SIMD_API void add_scalar_in_place_f32(float* v1, float value, size_t length); +JVECTOR_SIMD_API void sub_in_place_f32(float* v1, const float* v2, size_t length); +JVECTOR_SIMD_API void sub_scalar_in_place_f32(float* v1, float value, size_t length); +JVECTOR_SIMD_API float max_f32(const float* v, size_t length); +JVECTOR_SIMD_API void min_in_place_f32(float* v1, const float* v2, size_t length); +#ifdef __cplusplus +} +#endif // extern "C" +#endif // VECTOR_SIMD_DOT_H diff --git a/jvector-native/src/main/c/jvector_simd_kernels.cpp b/jvector-native/src/main/c/jvector_simd_kernels.cpp new file mode 100644 index 000000000..20e8930ca --- /dev/null +++ b/jvector-native/src/main/c/jvector_simd_kernels.cpp @@ -0,0 +1,1643 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "jvector_simd.h" +#include "hwy/highway.h" +#include "assertHwyTargets.h" + +// ============================================================================= +// Highway macro usage in this file +// ============================================================================= +// +// HWY_INLINE +// Expands to `inline __attribute__((always_inline))` on GCC/Clang. +// Used on every helper that participates in a hot SIMD loop to prevent the +// compiler from ever emitting a real call and to keep register pressure +// visible across the inlined body. Prefer this over plain `inline` for any +// function that contains SIMD intrinsics. +// +// HWY_FLATTEN +// Expands to `__attribute__((flatten))`, which asks the compiler to inline +// *all* callees into the annotated function. Used on the public entry-points +// (assemble_and_sum_f32, pq_decoded_cosine_similarity_f32, the three +// distance wrappers) so that the multi-target Highway dispatch stub sees a +// single monolithic body with no residual call overhead. +// +// HWY_RESTRICT +// Portable spelling of `__restrict__` / `restrict`. Tells the compiler that +// a pointer does not alias any other pointer in scope, so loads through it +// remain valid across stores made through a different pointer. Applied to: +// - Load helper parameters (e.g. LoadDup256) — ensures the loaded value is +// treated as loop-invariant even when the caller stores to an accumulator. +// - calculate_partial_sums_f32 inputs (codebook, query) and output +// (partialSums) — prevents the compiler from reloading read-only inputs +// after each write to partialSums. +// Not needed when inputs and output already have different types (e.g. +// float* vs unsigned char*), because C++ strict-aliasing rules already +// guarantee they cannot alias. +// +// ============================================================================= +// Highway API tutorial — intrinsics used in this file +// ============================================================================= +// +// --- Tags (describe vector type and width) --- +// +// ScalableTag +// Represents the full native SIMD width for type T. +// e.g. ScalableTag is 8 lanes on AVX2, 16 lanes on AVX-512. +// Used in the main loop bodies where we want the widest available vector. +// +// CappedTag / HWY_CAPPED(T, N) +// A tag capped to at most N lanes, even on wider ISAs. +// Used in the small-vector fast paths (e.g. size==4, size==8) so that we +// avoid wasting the extra lanes of a wide register on tiny inputs. +// +// Half +// Produces a tag whose lane count is half that of D. +// Used in LoadDup256 to load 8 floats into the lower half of a 512-bit +// register before Combine duplicates them into the upper half. +// +// Rebind / RebindToSigned +// Produce a new tag of the same width but a different element type. +// Used in assemble_and_sum_f32 and pq_decoded_cosine_similarity to +// reinterpret the float-width register as uint8/uint16/int32 during the +// index promotion pipeline. +// +// Lanes(tag) — runtime lane count for the given tag. +// MaxLanes(tag) — compile-time upper bound on lane count (used in static_assert +// and constexpr branches). +// +// --- Vector type --- +// +// Vec +// The SIMD vector type corresponding to a tag. +// All arithmetic and load/store operations return or accept Vec. +// +// --- Initialisation --- +// +// Zero(tag) — vector of all zeros; used to initialise accumulators. +// Set(tag, scalar) — broadcast a scalar to every lane. +// Iota(tag, start) — fill lanes with start, start+1, start+2, … +// Used to build the running index vector for GatherIndex. +// +// --- Loads --- +// +// LoadU(tag, ptr) — unaligned load of Lanes(tag) elements from ptr. +// LoadN(tag, ptr, n) — load n elements; remaining lanes are zero-padded. +// Used for loop tails without a branch per element. +// LoadDup128(tag, ptr) — load 128 bits and broadcast across the full vector. +// Used for size==2 and size==4 query vectors so the +// same query chunk lines up with every centroid chunk. +// MaskedLoad(mask, tag, ptr) +// — load only the lanes where mask is set; others zero. +// Used in CosineDistance tail handling. +// +// --- Store --- +// +// StoreU(vec, tag, ptr) — unaligned store of Lanes(tag) elements to ptr. +// +// --- Arithmetic --- +// +// Add(a, b) — lane-wise addition. +// Sub(a, b) — lane-wise subtraction. +// Mul(a, b) — lane-wise multiplication. +// MulAdd(a, b, c) — fused multiply-add: (a * b) + c. +// Preferred over separate Mul+Add for FMA throughput. +// +// --- Type promotion --- +// +// PromoteTo(narrower_tag, vec) +// Zero-extends each element to the wider type. +// Used twice in the gather pipeline: u8 → u16 → i32, so that byte offsets +// become 32-bit gather indices without sign-extension artefacts. +// +// --- Gather --- +// +// GatherIndex(tag, base_ptr, index_vec) +// Loads one element per lane using per-lane 32-bit indices (in elements, +// not bytes). Used to collect PQ lookup-table entries and codebook floats +// whose positions are determined at runtime by the encoded offsets. +// +// --- Reductions --- +// +// ReduceSum(tag, vec) — horizontal sum of all lanes; returns a scalar. +// +// --- Horizontal-reduction shuffles (used in calculate_partial_sums_f32) --- +// +// Shuffle2301(vec) +// Swaps adjacent 32-bit lane pairs within each 64-bit group: [0,1,2,3] → [1,0,3,2]. +// (Name "2301" encodes source indices from lane 3 down to lane 0 per _MM_SHUFFLE +// convention: lane3←2, lane2←3, lane1←0, lane0←1.) +// Adding a vector with its Shuffle2301 partner sums adjacent lane pairs: +// result[0]=result[1]=(0+1) and result[2]=result[3]=(2+3). +// +// Shuffle1032(vec) +// Swaps the two 64-bit halves within each 128-bit block: [0,1,2,3] → [2,3,0,1]. +// Used as the second reduction step: adding those pairwise sums together +// yields the full 4-lane horizontal sum in every lane. +// +// SwapAdjacentBlocks(vec) +// Swaps the two 128-bit halves of each 256-bit block. +// On AVX-512 (512-bit vector = four 128-bit blocks) this is used as the +// first step of the size==8 horizontal reduction before Shuffle1032/2301. +// +// --- Masks --- +// +// FirstN(tag, n) +// Returns a mask with the first n lanes set and the rest clear. +// Used with MaskedLoad to handle the tail of a vector that doesn't fill +// a full register. +// +// --- Combine --- +// +// Combine(d, hi, lo) +// Concatenates two half-width vectors into one full-width vector. +// Used in LoadDup256 to duplicate 8 floats across both halves of a +// 512-bit register (lo = hi = the same 256-bit load). +// + +namespace hn = hwy::HWY_NAMESPACE; + +// Loads 8 floats from ptr and broadcasts them to fill the full vector D. +// On ISAs where D is exactly 8 lanes (e.g. AVX2) this is a plain LoadU. +// On wider ISAs (e.g. AVX-512, 16 lanes) the 8 floats are loaded into the +// 256-bit half-tag and then Combine'd to duplicate them into both halves. +// NOTE: not designed for ISAs wider than 512-bit (would need additional Combine levels). +// HWY_RESTRICT tells the compiler that ptr does not alias any accumulator or +// other pointer visible at the call site, allowing it to treat all loads from +// ptr as invariant across iterations and hoist them freely. +template +HWY_INLINE hn::Vec LoadDup256(D d, const float *HWY_RESTRICT ptr) +{ + static_assert(hn::MaxLanes(d) <= 16, + "LoadDup256 is not implemented for ISAs wider than 512-bit"); + if constexpr (hn::MaxLanes(d) > 8) { + const hn::Half dh; + const auto half = hn::LoadU(dh, ptr); + return hn::Combine(d, half, half); + } + else { + return hn::LoadU(d, ptr); + } +} +// ============================================================================= +// Base Fp32 kernels +// ============================================================================= + +// Dot product kernel templated on the Highway tag type, shared by the +// full-width ScalableTag path and the HWY_CAPPED fast paths. +// The 4x unrolled loop hides FMA latency; for capped tags the loop body +// is never entered for small sizes and the single-vector path handles them. +template +HWY_INLINE float DotProductImpl(Tag tag, const float *a, const float *b, size_t size) +{ + const size_t lanes = hn::Lanes(tag); + auto acc0 = hn::Zero(tag), acc1 = hn::Zero(tag); + auto acc2 = hn::Zero(tag), acc3 = hn::Zero(tag); + size_t ii = 0; + for (; ii + 4 * lanes <= size; ii += 4 * lanes) { + acc0 = hn::MulAdd(hn::LoadU(tag, a + ii + 0*lanes), hn::LoadU(tag, b + ii + 0*lanes), acc0); + acc1 = hn::MulAdd(hn::LoadU(tag, a + ii + 1*lanes), hn::LoadU(tag, b + ii + 1*lanes), acc1); + acc2 = hn::MulAdd(hn::LoadU(tag, a + ii + 2*lanes), hn::LoadU(tag, b + ii + 2*lanes), acc2); + acc3 = hn::MulAdd(hn::LoadU(tag, a + ii + 3*lanes), hn::LoadU(tag, b + ii + 3*lanes), acc3); + } + auto acc = hn::Add(hn::Add(acc0, acc1), hn::Add(acc2, acc3)); + for (; ii + lanes <= size; ii += lanes) { + acc = hn::MulAdd(hn::LoadU(tag, a + ii), hn::LoadU(tag, b + ii), acc); + } + if (ii < size) { + acc = hn::MulAdd(hn::LoadN(tag, a + ii, size - ii), + hn::LoadN(tag, b + ii, size - ii), acc); + } + return hn::ReduceSum(tag, acc); +} + +// L2 square distance kernel templated on the Highway tag type, shared by +// the full-width ScalableTag path and the HWY_CAPPED fast paths. +template +HWY_INLINE float L2SquareDistanceImpl(Tag tag, const float *a, const float *b, size_t size) +{ + const size_t lanes = hn::Lanes(tag); + auto acc0 = hn::Zero(tag), acc1 = hn::Zero(tag); + auto acc2 = hn::Zero(tag), acc3 = hn::Zero(tag); + size_t ii = 0; + for (; ii + 4 * lanes <= size; ii += 4 * lanes) { + auto d0 = hn::LoadU(tag, a + ii + 0*lanes) - hn::LoadU(tag, b + ii + 0*lanes); + auto d1 = hn::LoadU(tag, a + ii + 1*lanes) - hn::LoadU(tag, b + ii + 1*lanes); + auto d2 = hn::LoadU(tag, a + ii + 2*lanes) - hn::LoadU(tag, b + ii + 2*lanes); + auto d3 = hn::LoadU(tag, a + ii + 3*lanes) - hn::LoadU(tag, b + ii + 3*lanes); + acc0 = hn::MulAdd(d0, d0, acc0); + acc1 = hn::MulAdd(d1, d1, acc1); + acc2 = hn::MulAdd(d2, d2, acc2); + acc3 = hn::MulAdd(d3, d3, acc3); + } + auto acc = hn::Add(hn::Add(acc0, acc1), hn::Add(acc2, acc3)); + for (; ii + lanes <= size; ii += lanes) { + auto d = hn::LoadU(tag, a + ii) - hn::LoadU(tag, b + ii); + acc = hn::MulAdd(d, d, acc); + } + if (ii < size) { + auto d = hn::LoadN(tag, a + ii, size - ii) - hn::LoadN(tag, b + ii, size - ii); + acc = hn::MulAdd(d, d, acc); + } + return hn::ReduceSum(tag, acc); +} + +// Cosine distance kernel templated on the Highway tag type, shared by +// the full-width ScalableTag path and the HWY_CAPPED fast paths. +template +HWY_INLINE float CosineDistanceImpl(Tag tag, const float *a, const float *b, size_t size) +{ + const size_t lanes = hn::Lanes(tag); + auto sum_ab = hn::Zero(tag), sum_aa = hn::Zero(tag), sum_bb = hn::Zero(tag); + size_t ii = 0; + for (; ii + lanes <= size; ii += lanes) { + auto va = hn::LoadU(tag, a + ii); + auto vb = hn::LoadU(tag, b + ii); + sum_ab = hn::MulAdd(va, vb, sum_ab); + sum_aa = hn::MulAdd(va, va, sum_aa); + sum_bb = hn::MulAdd(vb, vb, sum_bb); + } + if (ii < size) { + auto va = hn::LoadN(tag, a + ii, size - ii); + auto vb = hn::LoadN(tag, b + ii, size - ii); + sum_ab = hn::MulAdd(va, vb, sum_ab); + sum_aa = hn::MulAdd(va, va, sum_aa); + sum_bb = hn::MulAdd(vb, vb, sum_bb); + } + return hn::ReduceSum(tag, sum_ab) + / sqrtf(hn::ReduceSum(tag, sum_aa) * hn::ReduceSum(tag, sum_bb)); +} + +// Returns the dot product sum(a[ii] * b[ii]). +// +// Short-vector fast paths: when the register width is wider than the vector +// (e.g. a 4-element input on AVX-512), using the full register wastes lanes +// and can hurt latency. Capped tags keep execution in narrow registers. +HWY_INLINE float DotProduct(const float *a, + size_t aoffset, + const float *b, + size_t boffset, + size_t length) +{ + a += aoffset; + b += boffset; +#if HWY_MAX_BYTES > 16 + if (length <= 4) { return DotProductImpl(HWY_CAPPED(float, 4){}, a, b, length); } +#if HWY_MAX_BYTES > 32 + if (length <= 8) { return DotProductImpl(HWY_CAPPED(float, 8){}, a, b, length); } +#endif +#endif + return DotProductImpl(hn::ScalableTag{}, a, b, length); +} + +HWY_INLINE float CosineDistance( + const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) +{ + const float *ap = a + aoffset; + const float *bp = b + boffset; +#if HWY_MAX_BYTES > 16 + if (length <= 4) { return CosineDistanceImpl(HWY_CAPPED(float, 4){}, ap, bp, length); } +#if HWY_MAX_BYTES > 32 + if (length <= 8) { return CosineDistanceImpl(HWY_CAPPED(float, 8){}, ap, bp, length); } +#endif +#endif + return CosineDistanceImpl(hn::ScalableTag{}, ap, bp, length); +} + +HWY_INLINE float L2SquareDistance(const float *a, + size_t aoffset, + const float *b, + size_t boffset, + size_t length) +{ + a += aoffset; + b += boffset; +#if HWY_MAX_BYTES > 16 + if (length <= 4) { return L2SquareDistanceImpl(HWY_CAPPED(float, 4){}, a, b, length); } +#if HWY_MAX_BYTES > 32 + if (length <= 8) { return L2SquareDistanceImpl(HWY_CAPPED(float, 8){}, a, b, length); } +#endif +#endif + return L2SquareDistanceImpl(hn::ScalableTag{}, a, b, length); +} + +namespace JV_ISA { + +HWY_FLATTEN float cosine_f32( + const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) +{ + return CosineDistance(a, aoffset, b, boffset, length); +} + +HWY_FLATTEN float dot_product_f32( + const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) +{ + return DotProduct(a, aoffset, b, boffset, length); +} + +HWY_FLATTEN float euclidean_f32( + const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) +{ + return L2SquareDistance(a, aoffset, b, boffset, length); +} + +// ============================================================================= +// Element-wise in-place arithmetic and reduction kernels +// ============================================================================= +// +// rename-registers: extra GCC register-renaming pass that breaks false WAR/WAW +// hazards between short-lived zmm values, enabling more ILP in the SIMD loops. +// #pragma GCC unroll 4: unroll by 4 to hide the 4-cycle FMA latency and keep +// both AVX-512 FMA ports saturated across independent load–op–store chains. +// +__attribute__((optimize("rename-registers"))) +HWY_FLATTEN void add_in_place_f32(float *HWY_RESTRICT v1, + const float *HWY_RESTRICT v2, + size_t length) +{ + hn::ScalableTag d; + const size_t lanes = hn::Lanes(d); + size_t i = 0; +#pragma GCC unroll 4 + for (; i + lanes <= length; i += lanes) { + auto a = hn::LoadU(d, v1 + i); + auto b = hn::LoadU(d, v2 + i); + hn::StoreU(hn::Add(a, b), d, v1 + i); + } + if (i < length) { + const size_t rem = length - i; + auto a = hn::LoadN(d, v1 + i, rem); + auto b = hn::LoadN(d, v2 + i, rem); + hn::StoreN(hn::Add(a, b), d, v1 + i, rem); + } +} + +__attribute__((optimize("rename-registers"))) +HWY_FLATTEN void add_scalar_in_place_f32(float *HWY_RESTRICT v1, + float value, + size_t length) +{ + hn::ScalableTag d; + const size_t lanes = hn::Lanes(d); + const auto vval = hn::Set(d, value); + size_t i = 0; +#pragma GCC unroll 4 + for (; i + lanes <= length; i += lanes) { + auto a = hn::LoadU(d, v1 + i); + hn::StoreU(hn::Add(a, vval), d, v1 + i); + } + if (i < length) { + const size_t rem = length - i; + auto a = hn::LoadN(d, v1 + i, rem); + hn::StoreN(hn::Add(a, vval), d, v1 + i, rem); + } +} + +__attribute__((optimize("rename-registers"))) +HWY_FLATTEN void sub_in_place_f32(float *HWY_RESTRICT v1, + const float *HWY_RESTRICT v2, + size_t length) +{ + hn::ScalableTag d; + const size_t lanes = hn::Lanes(d); + size_t i = 0; +#pragma GCC unroll 4 + for (; i + lanes <= length; i += lanes) { + auto a = hn::LoadU(d, v1 + i); + auto b = hn::LoadU(d, v2 + i); + hn::StoreU(hn::Sub(a, b), d, v1 + i); + } + if (i < length) { + const size_t rem = length - i; + auto a = hn::LoadN(d, v1 + i, rem); + auto b = hn::LoadN(d, v2 + i, rem); + hn::StoreN(hn::Sub(a, b), d, v1 + i, rem); + } +} + +__attribute__((optimize("rename-registers"))) +HWY_FLATTEN void sub_scalar_in_place_f32(float *HWY_RESTRICT v1, + float value, + size_t length) +{ + hn::ScalableTag d; + const size_t lanes = hn::Lanes(d); + const auto vval = hn::Set(d, value); + size_t i = 0; +#pragma GCC unroll 4 + for (; i + lanes <= length; i += lanes) { + auto a = hn::LoadU(d, v1 + i); + hn::StoreU(hn::Sub(a, vval), d, v1 + i); + } + if (i < length) { + const size_t rem = length - i; + auto a = hn::LoadN(d, v1 + i, rem); + hn::StoreN(hn::Sub(a, vval), d, v1 + i, rem); + } +} + +__attribute__((optimize("rename-registers"))) +HWY_FLATTEN float max_f32(const float *HWY_RESTRICT v, size_t length) +{ + hn::ScalableTag d; + const size_t lanes = hn::Lanes(d); + auto accum = hn::Set(d, -FLT_MAX); + size_t i = 0; +#pragma GCC unroll 4 + for (; i + lanes <= length; i += lanes) { + accum = hn::Max(accum, hn::LoadU(d, v + i)); + } + float result = hn::ReduceMax(d, accum); + for (; i < length; i++) { + if (v[i] > result) result = v[i]; + } + return result; +} + +__attribute__((optimize("rename-registers"))) +HWY_FLATTEN void min_in_place_f32(float *HWY_RESTRICT v1, + const float *HWY_RESTRICT v2, + size_t length) +{ + hn::ScalableTag d; + const size_t lanes = hn::Lanes(d); + size_t i = 0; +#pragma GCC unroll 4 + for (; i + lanes <= length; i += lanes) { + auto a = hn::LoadU(d, v1 + i); + auto b = hn::LoadU(d, v2 + i); + hn::StoreU(hn::Min(a, b), d, v1 + i); + } + if (i < length) { + const size_t rem = length - i; + auto a = hn::LoadN(d, v1 + i, rem); + auto b = hn::LoadN(d, v2 + i, rem); + hn::StoreN(hn::Min(a, b), d, v1 + i, rem); + } +} + +// ============================================================================= +// PQ kernels +// ============================================================================= + +enum class DistanceType { DotProduct, Euclidean }; + +// Computes the per-element score vector for a single SIMD register pair (c, qq). +// For DotProduct: score[ii] = c[ii] * qq[ii] +// For Euclidean: score[ii] = (c[ii] - qq[ii])^2 +template +HWY_INLINE hn::Vec partial_sum_score(const hn::Vec &c, + const hn::Vec &qq) +{ + if constexpr (DT == DistanceType::DotProduct) { return hn::Mul(c, qq); } + else if constexpr (DT == DistanceType::Euclidean) { + const hn::Vec diff = hn::Sub(c, qq); + return hn::Mul(diff, diff); + } + else { + static_assert(DT == DistanceType::DotProduct + || DT == DistanceType::Euclidean, + "Unsupported DistanceType"); + // Unreachable, but silences compiler warnings about missing return. + return hn::Zero(c); + } +} + +// Scalar fallback: returns dot_product_f32 or euclidean_f32 depending on DT. +template +HWY_INLINE float distance_func(const float *codebook, + int clusterOffset, + const float *query, + int queryOffset, + size_t size) +{ + if constexpr (DT == DistanceType::DotProduct) + return DotProduct(codebook, clusterOffset, query, queryOffset, size); + else + return L2SquareDistance( + codebook, clusterOffset, query, queryOffset, size); +} + +template +// HWY_RESTRICT on codebook and query informs the compiler that writes to +// partialSums cannot alias either read-only input, so it need not reload +// codebook/query values after each store to partialSums. +HWY_INLINE void calculate_partial_sums_f32(const float *HWY_RESTRICT codebook, + int codebookIndex, + size_t size, + int clusterCount, + const float *HWY_RESTRICT query, + int queryOffset, + float *HWY_RESTRICT partialSums) +{ + int codebookBase = codebookIndex * clusterCount; + using FloatTag = hn::ScalableTag; + FloatTag tag; + constexpr size_t kLanes = hn::MaxLanes(tag); + alignas(64) float tmp[kLanes]; + int ii = 0; + + if constexpr (kLanes >= 2) { + if (size == 2) { + float qtmp[4] = {query[queryOffset], + query[queryOffset + 1], + query[queryOffset], + query[queryOffset + 1]}; + hn::Vec queryVec = hn::LoadDup128(tag, qtmp); + + constexpr size_t kBlock = 2; + constexpr int centroids_per_iter = kLanes / kBlock; + + for (; ii + centroids_per_iter <= clusterCount; + ii += centroids_per_iter) { + const float *cptr = codebook + ii * 2; + hn::Vec centroidVec = hn::LoadU(tag, cptr); + hn::Vec score = partial_sum_score( + centroidVec, queryVec); + hn::Vec swapped = hn::Shuffle2301(score); + hn::Vec sum = score + swapped; + hn::StoreU(sum, tag, tmp); +#pragma GCC unroll 8 + for (int jj = 0; jj < centroids_per_iter; ++jj) { + partialSums[codebookBase + ii + jj] = tmp[jj * 2]; + } + } + } + } + if constexpr (kLanes >= 4) { + if (size == 4) { + constexpr int centroids_per_iter = static_cast(kLanes / 4); + hn::Vec queryVec + = hn::LoadDup128(tag, query + queryOffset); + + for (; ii + centroids_per_iter <= clusterCount; + ii += centroids_per_iter) { + const float *cptr = codebook + ii * size; + hn::Vec centroidVec = hn::LoadU(tag, cptr); + hn::Vec sum = partial_sum_score( + centroidVec, queryVec); + hn::Vec temp = hn::Shuffle2301(sum); + sum = hn::Add(sum, temp); + temp = hn::Shuffle1032(sum); + sum = hn::Add(sum, temp); + hn::StoreU(sum, tag, tmp); +#pragma GCC unroll 4 + for (int jj = 0; jj < centroids_per_iter; ++jj) { + partialSums[codebookBase + ii + jj] = tmp[jj * 4]; + } + } + } + } + if constexpr (kLanes >= 8) { + if (size == 8) { + hn::Vec queryVec = LoadDup256(tag, query + queryOffset); + constexpr int centroids_per_iter = static_cast(kLanes / 8); + + for (; ii + centroids_per_iter <= clusterCount; + ii += centroids_per_iter) { + const float *cptr = codebook + ii * size; + hn::Vec centroidVec = hn::LoadU(tag, cptr); + hn::Vec sum = partial_sum_score( + centroidVec, queryVec); + hn::Vec temp = hn::SwapAdjacentBlocks(sum); + sum = hn::Add(sum, temp); + temp = hn::Shuffle1032(sum); + sum = hn::Add(sum, temp); + temp = hn::Shuffle2301(sum); + sum = hn::Add(sum, temp); + hn::StoreU(sum, tag, tmp); +#pragma GCC unroll 2 + for (int jj = 0; jj < centroids_per_iter; ++jj) { + partialSums[codebookBase + ii + jj] = tmp[jj * 8]; + } + } + } + } + if constexpr (kLanes == 16) { + // Don't have to worry about making this work on 1024-bit lanes just yet + if (size == 16) { + const hn::Vec queryVec + = hn::LoadU(tag, query + queryOffset); + for (; ii < clusterCount; ++ii) { + const hn::Vec centroidVec + = hn::LoadU(tag, codebook + ii * size); + partialSums[codebookBase + ii] = hn::ReduceSum( + tag, + partial_sum_score(centroidVec, queryVec)); + } + } + } + for (; ii < clusterCount; ii++) { + partialSums[codebookBase + ii] = distance_func
( + codebook, ii * size, query, queryOffset, size); + } +} + +// HWY_RESTRICT is not needed here: `data` (float*) and `baseOffsets` (unsigned char*) +// have different types, so the compiler already treats them as non-aliasing under +// C++ strict-aliasing rules. Neither pointer is written through, so there are no +// stores that could force a reload of the other. + +// Inner kernel templated on the Highway tag type so it works for both the +// full-width ScalableTag path and the capped HWY_CAPPED fast paths. +// gatherIndices[k] = (ii + k) * dataBase + baseOffsets[ii + k] +template +HWY_INLINE float AssembleAndSumImpl( + const float *HWY_RESTRICT data, + int dataBase, + const uint8_t *HWY_RESTRICT baseOffsets, + size_t baseOffsetsLength, + FloatTag floatTag) +{ + const hn::RebindToSigned int32Tag; + const hn::Rebind> uint16Tag; + const hn::Rebind> uint8Tag; + const size_t lanes = hn::Lanes(floatTag); + + // Precompute scaleVec = [0, db, 2*db, ..., (lanes-1)*db] once. + // This eliminates a 512-bit VPMULLD on every iteration; the per-iteration + // base (ii*dataBase) is a scalar that the compiler strength-reduces to an add. + const auto scaleVec = hn::Mul(hn::Iota(int32Tag, 0), + hn::Set(int32Tag, dataBase)); + auto sumVec = hn::Zero(floatTag); + + size_t ii = 0; + for (; ii + lanes <= baseOffsetsLength; ii += lanes) { + // Load `lanes` bytes and zero-extend to i32 via u8→u16→i32. + const auto offsetVec = hn::PromoteTo(int32Tag, + hn::PromoteTo(uint16Tag, + hn::LoadU(uint8Tag, baseOffsets + ii))); + const auto base = hn::Set(int32Tag, + static_cast(ii * static_cast(dataBase))); + sumVec = hn::Add(sumVec, hn::GatherIndex(floatTag, data, + hn::Add(hn::Add(base, scaleVec), offsetVec))); + } + + float res = hn::ReduceSum(floatTag, sumVec); + for (; ii < baseOffsetsLength; ii++) { + res += data[dataBase * ii + baseOffsets[ii]]; + } + return res; +} + +HWY_FLATTEN float assemble_and_sum_f32(const float *data, + int dataBase, + const unsigned char *baseOffsets, + int baseOffsetsOffset, + size_t baseOffsetsLength) +{ + baseOffsets += baseOffsetsOffset; + +#if HWY_MAX_BYTES > 16 + if (baseOffsetsLength <= 4) { + return AssembleAndSumImpl(data, dataBase, baseOffsets, baseOffsetsLength, + HWY_CAPPED(float, 4){}); + } +#if HWY_MAX_BYTES > 32 + if (baseOffsetsLength <= 8) { + return AssembleAndSumImpl(data, dataBase, baseOffsets, baseOffsetsLength, + HWY_CAPPED(float, 8){}); + } +#endif +#endif + + return AssembleAndSumImpl(data, dataBase, baseOffsets, baseOffsetsLength, + hn::ScalableTag{}); +} + +// Inner kernel for the triangular-table PQ gather, templated on the Highway tag +// type so it works for both the full-width ScalableTag path and HWY_CAPPED fast paths. +// gatherIndex[j] = laneIdx[j]*blockSize + triangularIndex(c1[j], c2[j]) +template +HWY_INLINE float AssembleAndSumPQImpl( + const float *HWY_RESTRICT data, + size_t subspaceCount, + const uint8_t *HWY_RESTRICT baseOffsets1, + const uint8_t *HWY_RESTRICT baseOffsets2, + int k, + int blockSize, + FloatTag d_f) +{ + const hn::RebindToSigned d_i; + const hn::Rebind> d_u16; + const hn::Rebind> d_u8; + const size_t lanes = hn::Lanes(d_f); + + const auto vk = hn::Set(d_i, k); + const auto vBlockSize = hn::Set(d_i, blockSize); + auto sumVec = hn::Zero(d_f); + + // Precompute laneIdxScaled = [0, bs, 2*bs, ..., (lanes-1)*bs] and advance + // it by a fixed increment each iteration. Replaces Mul(laneIdx, vBlockSize) + // (VPMULLD, 3-cycle latency on critical path) with VPADDD (1-cycle latency). + auto laneIdxScaled = hn::Mul(hn::Iota(d_i, 0), vBlockSize); + const auto laneScaledInc = hn::Set(d_i, static_cast(lanes) * blockSize); + + size_t ii = 0; + for (; ii + lanes <= subspaceCount; ii += lanes) { + // Load `lanes` u8 ordinals and zero-extend to i32 via u8→u16→i32. + const auto c1 = hn::PromoteTo(d_i, hn::PromoteTo(d_u16, hn::LoadU(d_u8, baseOffsets1 + ii))); + const auto c2 = hn::PromoteTo(d_i, hn::PromoteTo(d_u16, hn::LoadU(d_u8, baseOffsets2 + ii))); + const auto r = hn::Min(c1, c2); + const auto c = hn::Max(c1, c2); + // triangular = r*(r-1)/2; always even & non-negative so ShiftRight<1> is exact. + const auto triangular = hn::ShiftRight<1>(hn::Mul(r, hn::Sub(r, hn::Set(d_i, 1)))); + const auto offsetRow = hn::Sub(hn::Mul(r, vk), triangular); + // gatherIndex = laneIdxScaled + offsetRow + (c - r) + const auto gatherIdx = hn::Add(laneIdxScaled, + hn::Add(offsetRow, hn::Sub(c, r))); + sumVec = hn::Add(sumVec, hn::GatherIndex(d_f, data, gatherIdx)); + laneIdxScaled = hn::Add(laneIdxScaled, laneScaledInc); + } + + float res = hn::ReduceSum(d_f, sumVec); + for (; ii < subspaceCount; ii++) { + int c1v = baseOffsets1[ii], c2v = baseOffsets2[ii]; + int r = c1v < c2v ? c1v : c2v; + int cv = c1v > c2v ? c1v : c2v; + res += data[ii * blockSize + r * k - r * (r - 1) / 2 + (cv - r)]; + } + return res; +} + +// For each of the M subspaces, looks up data[i*blockSize + triangularIndex(c1[i], c2[i])] +// where blockSize = k*(k+1)/2 and triangularIndex(r,c) = r*k - r*(r-1)/2 + (c-r), +// r = min(c1,c2), c = max(c1,c2). Vectorised via a gather over i32 indices built +// from integer min/max and an arithmetic right-shift for the triangular number. +// On ISAs with >128-bit registers, capped fast-paths are used when subspaceCount +// fits in 4 or 8 lanes to avoid wasting the extra lanes of a wide register. +HWY_FLATTEN float assemble_and_sum_pq_f32( + const float *HWY_RESTRICT data, + size_t subspaceCount, + const uint8_t *HWY_RESTRICT baseOffsets1, int baseOffsetsOffset1, + const uint8_t *HWY_RESTRICT baseOffsets2, int baseOffsetsOffset2, + int clusterCount) +{ + baseOffsets1 += baseOffsetsOffset1; + baseOffsets2 += baseOffsetsOffset2; + + const int k = clusterCount; + const int blockSize = k * (k + 1) / 2; + +#if HWY_MAX_BYTES > 16 + if (subspaceCount <= 4) { + return AssembleAndSumPQImpl(data, subspaceCount, baseOffsets1, baseOffsets2, + k, blockSize, HWY_CAPPED(float, 4){}); + } +#if HWY_MAX_BYTES > 32 + if (subspaceCount <= 8) { + return AssembleAndSumPQImpl(data, subspaceCount, baseOffsets1, baseOffsets2, + k, blockSize, HWY_CAPPED(float, 8){}); + } +#endif +#endif + + return AssembleAndSumPQImpl(data, subspaceCount, baseOffsets1, baseOffsets2, + k, blockSize, hn::ScalableTag{}); +} + +// HWY_RESTRICT is not needed here: `baseOffsets` (unsigned char*) is a different type +// from `partialSums` and `aMagnitude` (float*), so strict-aliasing already guarantees +// the compiler that writes through one cannot affect loads from the other. All three +// pointers are read-only within the loop, so there are no stores to reason about anyway. +HWY_FLATTEN float +pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, + int baseOffsetsOffset, + size_t baseOffsetsLength, + int clusterCount, + const float *partialSums, + const float *aMagnitude, + float bMagnitude) +{ + using FloatTag = hn::ScalableTag; + using Int32Tag = hn::RebindToSigned; + using Uint16Tag = hn::Rebind; + using Uint8Tag = hn::Rebind; + + const FloatTag floatTag; + const Int32Tag int32Tag; + const Uint16Tag uint16Tag; + const Uint8Tag uint8Tag; + const size_t kLanes = hn::Lanes(floatTag); + + baseOffsets += baseOffsetsOffset; + + auto sumVec = hn::Zero(floatTag); + auto magnitudeVec = hn::Zero(floatTag); + + // Precompute scaleVec = [0, cc, 2*cc, ..., (kLanes-1)*cc] once. + // Eliminates a 512-bit VPMULLD on every iteration; the per-iteration scalar + // base (ii*clusterCount) is strength-reduced to an add by the compiler. + const auto scaleVec = hn::Mul(hn::Iota(int32Tag, 0), + hn::Set(int32Tag, clusterCount)); + + size_t ii = 0; + for (; ii + kLanes <= baseOffsetsLength; ii += kLanes) { + // Load kLanes bytes and zero-extend to int32 via two PromoteTo steps (u8→u16→i32) + const auto u8Vec = hn::LoadU(uint8Tag, baseOffsets + ii); + const auto u16Vec = hn::PromoteTo(uint16Tag, u8Vec); + const auto offsetVec = hn::PromoteTo(int32Tag, u16Vec); + + // gatherIndices[k] = (ii + k) * clusterCount + baseOffsets[ii + k] + const auto base = hn::Set(int32Tag, + static_cast(ii * static_cast(clusterCount))); + const auto gatherIndices = hn::Add(hn::Add(base, scaleVec), offsetVec); + + sumVec = hn::Add(sumVec, hn::GatherIndex(floatTag, partialSums, gatherIndices)); + magnitudeVec = hn::Add(magnitudeVec, hn::GatherIndex(floatTag, aMagnitude, gatherIndices)); + } + + float sumResult = hn::ReduceSum(floatTag, sumVec); + float aMagnitudeResult = hn::ReduceSum(floatTag, magnitudeVec); + + // Handle the remaining elements + for (; ii < baseOffsetsLength; ii++) { + int offset = clusterCount * static_cast(ii) + baseOffsets[ii]; + sumResult += partialSums[offset]; + aMagnitudeResult += aMagnitude[offset]; + } + + return sumResult / sqrtf(aMagnitudeResult * bMagnitude); +} + +HWY_FLATTEN void calculate_partial_sums_dot_f32(const float *codebook, + int codebookIndex, + size_t size, + int clusterCount, + const float *query, + int queryOffset, + float *partialSums) +{ + calculate_partial_sums_f32(codebook, + codebookIndex, + size, + clusterCount, + query, + queryOffset, + partialSums); +} + +HWY_FLATTEN void calculate_partial_sums_euclidean_f32(const float *codebook, + int codebookIndex, + size_t size, + int clusterCount, + const float *query, + int queryOffset, + float *partialSums) +{ + calculate_partial_sums_f32(codebook, + codebookIndex, + size, + clusterCount, + query, + queryOffset, + partialSums); +} + +// Computes partialSums[codebookIndex * clusterCount + i] = dot(centroid_i, centroid_i) +// i.e. the squared L2 norm of each codebook centroid. This is the per-subspace +// contribution to aMagnitude in the PQ cosine similarity formula. +// +// Fast paths for size == 2/4/8/16 pack multiple centroids into one SIMD register +// (kLanes/size per iteration), eliminating the per-centroid ReduceSum overhead. +// They mirror the structure of calculate_partial_sums_f32 but replace the +// centroid×query multiply with centroid×centroid (self-square). +HWY_FLATTEN void calculate_partial_sums_self_magnitude_f32( + const float *HWY_RESTRICT codebook, + int codebookIndex, + size_t size, + int clusterCount, + float *HWY_RESTRICT partialSums) +{ + const int codebookBase = codebookIndex * clusterCount; + using FloatTag = hn::ScalableTag; + FloatTag tag; + constexpr size_t kLanes = hn::MaxLanes(tag); + alignas(64) float tmp[kLanes]; + int ii = 0; + + if constexpr (kLanes >= 2) { + if (size == 2) { + constexpr size_t kBlock = 2; + constexpr int centroids_per_iter = kLanes / kBlock; + + for (; ii + centroids_per_iter <= clusterCount; + ii += centroids_per_iter) { + const float *cptr = codebook + ii * 2; + const hn::Vec cv = hn::LoadU(tag, cptr); + hn::Vec sum = hn::Mul(cv, cv); + hn::Vec swapped = hn::Shuffle2301(sum); + sum = hn::Add(sum, swapped); + hn::StoreU(sum, tag, tmp); +#pragma GCC unroll 8 + for (int jj = 0; jj < centroids_per_iter; ++jj) { + partialSums[codebookBase + ii + jj] = tmp[jj * 2]; + } + } + } + } + if constexpr (kLanes >= 4) { + if (size == 4) { + constexpr int centroids_per_iter = static_cast(kLanes / 4); + + for (; ii + centroids_per_iter <= clusterCount; + ii += centroids_per_iter) { + const float *cptr = codebook + ii * size; + const hn::Vec cv = hn::LoadU(tag, cptr); + hn::Vec sum = hn::Mul(cv, cv); + hn::Vec temp = hn::Shuffle2301(sum); + sum = hn::Add(sum, temp); + temp = hn::Shuffle1032(sum); + sum = hn::Add(sum, temp); + hn::StoreU(sum, tag, tmp); +#pragma GCC unroll 4 + for (int jj = 0; jj < centroids_per_iter; ++jj) { + partialSums[codebookBase + ii + jj] = tmp[jj * 4]; + } + } + } + } + if constexpr (kLanes >= 8) { + if (size == 8) { + constexpr int centroids_per_iter = static_cast(kLanes / 8); + + for (; ii + centroids_per_iter <= clusterCount; + ii += centroids_per_iter) { + const float *cptr = codebook + ii * size; + const hn::Vec cv = hn::LoadU(tag, cptr); + hn::Vec sum = hn::Mul(cv, cv); + hn::Vec temp = hn::SwapAdjacentBlocks(sum); + sum = hn::Add(sum, temp); + temp = hn::Shuffle1032(sum); + sum = hn::Add(sum, temp); + temp = hn::Shuffle2301(sum); + sum = hn::Add(sum, temp); + hn::StoreU(sum, tag, tmp); +#pragma GCC unroll 2 + for (int jj = 0; jj < centroids_per_iter; ++jj) { + partialSums[codebookBase + ii + jj] = tmp[jj * 8]; + } + } + } + } + if constexpr (kLanes == 16) { + // AVX-512 only: one full register holds exactly one size==16 centroid. + if (size == 16) { + for (; ii < clusterCount; ++ii) { + const hn::Vec cv + = hn::LoadU(tag, codebook + ii * size); + partialSums[codebookBase + ii] + = hn::ReduceSum(tag, hn::Mul(cv, cv)); + } + } + } + // General fallback: one centroid at a time, vector-accumulate then reduce. + for (; ii < clusterCount; ii++) { + const float *cptr = codebook + ii * size; + auto accVec = hn::Zero(tag); + size_t j = 0; + for (; j + kLanes <= size; j += kLanes) { + const auto v = hn::LoadU(tag, cptr + j); + accVec = hn::MulAdd(v, v, accVec); + } + float sum = hn::ReduceSum(tag, accVec); + for (; j < size; j++) { + sum += cptr[j] * cptr[j]; + } + partialSums[codebookBase + ii] = sum; + } +} + +// ============================================================================= +// NVQ kernels +// ============================================================================= +// +// Bit-manipulation helpers used by all NVQ public kernels: +// +// logisticNQT — approximate sigmoid via IEEE 754 bit tricks (2^x approximation). +// logitNQT — inverse: fast log2 via exponent extraction. +// Both exploit the float bit layout to avoid transcendental instructions. +// + +// logisticNQT: approximate sigmoid using integer bit manipulation. +// Computes an approximation of the logistic function: +// result ≈ 1 / (1 + exp(-alpha * (v - x0))) +// using the identity sigmoid(x) = 2^x / (2^x + 1) and a fast bit-hack for 2^x: +// given x = p + f where p is integer and f ∈ [0,1): +// 2^x ≈ reinterpret_float(bits_of((f*0.5+1.0) << 23 + p << 23)) +template +HWY_INLINE hn::Vec logisticNQT(D d, hn::Vec v, float alpha, float x0) +{ + const hn::RebindToSigned di; + + // temp = alpha * v - alpha * x0 + auto temp = hn::MulAdd(v, hn::Set(d, alpha), hn::Set(d, -alpha * x0)); + + // p = (int)(temp + 1) where temp >= 0, else (int)(temp) + // Mirrors Java: p = (int) floor(temp + 1); truncation == floor for temp >= 0. + const auto isPositive = hn::Not(hn::IsNegative(temp)); + auto selected = hn::IfThenElse(isPositive, + hn::Add(temp, hn::Set(d, 1.0f)), + temp); + auto p = hn::ConvertTo(di, selected); // truncate towards zero + + // e = (float) p + auto e = hn::ConvertTo(d, p); + + // m = reinterpret_bits((temp - e) * 0.5 + 1.0) + // (temp - e) is in (-1, 1), so the result is in (0.5, 1.5) — a mantissa value. + auto m = hn::BitCast(di, + hn::MulAdd(hn::Sub(temp, e), + hn::Set(d, 0.5f), + hn::Set(d, 1.0f))); + + // Reconstruct: (m_bits + (p << 23)) reinterpreted as float = m_mantissa * 2^p + auto result = hn::BitCast(d, hn::Add(m, hn::ShiftLeft<23>(p))); + + // Sigmoid: result / (result + 1) + return hn::Div(result, hn::Add(result, hn::Set(d, 1.0f))); +} + +// logitNQT: inverse of logisticNQT — fast log2 via IEEE 754 exponent extraction. +// Computes approximately: +// inverseAlpha * (log2(v / (1-v)) - 1) + x0 +// The "-1" offset comes from subtracting 128 instead of 127 from the biased exponent, +// matching the Java implementation exactly. +template +HWY_INLINE hn::Vec logitNQT(D d, hn::Vec v, float inverseAlpha, float x0) +{ + const hn::RebindToSigned di; + + // z = v / (1 - v) + auto z = hn::Div(v, hn::Sub(hn::Set(d, 1.0f), v)); + + // Reinterpret float bits as int32 to extract exponent and mantissa fields. + auto temp = hn::BitCast(di, z); + + // p = (biased_exponent >> 23) - 128 + // Masking with 0x7f800000 isolates the 8 exponent bits; shifting by 23 places + // them in the low byte. Subtracting 128 (vs. the standard 127 bias) is intentional + // and matches the Java source. + auto p = hn::Sub(hn::ShiftRight<23>(hn::And(temp, hn::Set(di, 0x7f800000))), + hn::Set(di, 128)); + + // m = reinterpret as float: set exponent to 127 (i.e., 2^0) and keep mantissa + // → value in [1.0, 2.0) + auto m = hn::BitCast(d, hn::Add(hn::And(temp, hn::Set(di, 0x007fffff)), + hn::Set(di, 0x3f800000))); + + // return (m + (float)p) * inverseAlpha + x0 + return hn::MulAdd(hn::Add(m, hn::ConvertTo(d, p)), + hn::Set(d, inverseAlpha), + hn::Set(d, x0)); +} + +// Single-element wrappers used only for the two setup constants (logisticBias, +// logisticScale) computed at the start of each NVQ kernel. They delegate to +// the vector templates via CappedTag so there is no duplication of +// the bit-manipulation logic. They are NOT called in any hot loop — all tail +// elements are handled by LoadN + FirstN-masked vector operations below. +static HWY_INLINE float logisticNQT_scalar(float value, float alpha, float x0) +{ + const hn::CappedTag d1; + return hn::GetLane(logisticNQT(d1, hn::Set(d1, value), alpha, x0)); +} + +// Public kernels — called from Java via FFI. +// +// All six functions mirror the @Override methods in PanamaVectorUtilSupport. +// They share the same mathematical logic; the Highway vector loops replace the +// Panama FloatVector loops and the scalar tails are identical to the Java ones. +// +// Byte↔float conversion pipeline (mirrors Java nvqDequantize8bit): +// LoadU(uint8Tag, ptr) — fill 4N-lane u8 vector from N bytes at ptr +// PromoteTo(uint16Tag) — lower N u8s → N u16s (2N-lane vector) +// PromoteTo(int32Tag) — lower N u16s → N i32s (N-lane vector) +// ConvertTo(floatTag) — i32 → float +// MulAdd(scale, bias) — byte * logisticScale + logisticBias +// logitNQT(...) — inverse logistic +// +// Float→byte pipeline (nvq_quantize_8bit): +// logisticNQT(...) — forward logistic +// scale and shift +// ConvertTo(int32) after +0.5 — round toward nearest for positive values +// StoreU to tmp[] + scalar clamp and byte-cast +// +// Cosine packing: nvq_cosine_8bit_packed returns an int64_t whose low 32 bits +// are the IEEE-754 bits of `sum` and whose high 32 bits are `bMagnitude`, so +// the caller can unpack with Float.intBitsToFloat without any heap allocation. +// ============================================================================= + +HWY_FLATTEN void nvq_quantize_8bit(const float *HWY_RESTRICT vector, + size_t length, + float alpha, float x0, + float minValue, float maxValue, + uint8_t *HWY_RESTRICT destination) +{ + using FloatTag = hn::ScalableTag; + using Int32Tag = hn::RebindToSigned; + FloatTag d_f; + Int32Tag d_i; + constexpr size_t kLanes = hn::MaxLanes(d_f); + alignas(64) int32_t tmp[kLanes]; + + float delta = maxValue - minValue; + float scaledAlpha = alpha / delta; + float scaledX0 = x0 * delta; + float logisticBias = logisticNQT_scalar(minValue, scaledAlpha, scaledX0); + float invLogisticScale = 255.0f / (logisticNQT_scalar(maxValue, scaledAlpha, scaledX0) - logisticBias); + + size_t i = 0; + for (; i + kLanes <= length; i += kLanes) { + auto arr = hn::LoadU(d_f, vector + i); + arr = logisticNQT(d_f, arr, scaledAlpha, scaledX0); + arr = hn::Add(hn::Mul(hn::Sub(arr, hn::Set(d_f, logisticBias)), + hn::Set(d_f, invLogisticScale)), + hn::Set(d_f, 0.5f)); + auto fi = hn::ConvertTo(d_i, arr); + hn::StoreU(fi, d_i, tmp); + for (size_t j = 0; j < kLanes; j++) { + int v = tmp[j]; + destination[i + j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); + } + } + // Tail: LoadN zero-pads lanes beyond `remaining`; only write the first + // `remaining` bytes from tmp[] so the padding lanes are never observed. + const size_t remaining = length - i; + if (remaining > 0) { + auto arr = hn::LoadN(d_f, vector + i, remaining); + arr = logisticNQT(d_f, arr, scaledAlpha, scaledX0); + arr = hn::Add(hn::Mul(hn::Sub(arr, hn::Set(d_f, logisticBias)), + hn::Set(d_f, invLogisticScale)), + hn::Set(d_f, 0.5f)); + hn::StoreU(hn::ConvertTo(d_i, arr), d_i, tmp); + for (size_t j = 0; j < remaining; j++) { + int v = tmp[j]; + destination[i + j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); + } + } +} + +HWY_FLATTEN float nvq_loss(const float *HWY_RESTRICT vector, + size_t length, + float alpha, float x0, + float minValue, float maxValue, + int nBits) +{ + using FloatTag = hn::ScalableTag; + using Int32Tag = hn::RebindToSigned; + FloatTag d_f; + Int32Tag d_i; + constexpr size_t kLanes = hn::MaxLanes(d_f); + + int constant = (1 << nBits) - 1; + float delta = maxValue - minValue; + float scaledAlpha = alpha / delta; + float invScaledAlpha = delta / alpha; // 1 / scaledAlpha + float scaledX0 = x0 * delta; + float logisticBias = logisticNQT_scalar(minValue, scaledAlpha, scaledX0); + float logisticScale = (logisticNQT_scalar(maxValue, scaledAlpha, scaledX0) - logisticBias) / (float)constant; + float invLogisticScale = 1.0f / logisticScale; + + auto squaredSum = hn::Zero(d_f); + + size_t i = 0; + for (; i + kLanes <= length; i += kLanes) { + auto arr = hn::LoadU(d_f, vector + i); + auto recArr = logisticNQT(d_f, arr, scaledAlpha, scaledX0); + recArr = hn::Mul(hn::Sub(recArr, hn::Set(d_f, logisticBias)), + hn::Set(d_f, invLogisticScale)); + // Round to nearest integer (add 0.5, truncate toward zero) + auto recInt = hn::ConvertTo(d_i, hn::Add(recArr, hn::Set(d_f, 0.5f))); + recArr = hn::ConvertTo(d_f, recInt); + recArr = hn::MulAdd(recArr, hn::Set(d_f, logisticScale), hn::Set(d_f, logisticBias)); + recArr = logitNQT(d_f, recArr, invScaledAlpha, scaledX0); + auto diff = hn::Sub(arr, recArr); + squaredSum = hn::MulAdd(diff, diff, squaredSum); + } + + float result = hn::ReduceSum(d_f, squaredSum); + + // Tail: LoadN zero-pads; mask the diff so padding lanes don't contribute. + const size_t remaining = length - i; + if (remaining > 0) { + const auto mask = hn::FirstN(d_f, remaining); + auto arr = hn::LoadN(d_f, vector + i, remaining); + auto recArr = logisticNQT(d_f, arr, scaledAlpha, scaledX0); + recArr = hn::Mul(hn::Sub(recArr, hn::Set(d_f, logisticBias)), + hn::Set(d_f, invLogisticScale)); + auto recInt = hn::ConvertTo(d_i, hn::Add(recArr, hn::Set(d_f, 0.5f))); + recArr = hn::ConvertTo(d_f, recInt); + recArr = hn::MulAdd(recArr, hn::Set(d_f, logisticScale), hn::Set(d_f, logisticBias)); + recArr = logitNQT(d_f, recArr, invScaledAlpha, scaledX0); + auto diff = hn::IfThenElseZero(mask, hn::Sub(arr, recArr)); + result += hn::ReduceSum(d_f, hn::Mul(diff, diff)); + } + + return result; +} + +HWY_FLATTEN float nvq_uniform_loss(const float *HWY_RESTRICT vector, + size_t length, + float minValue, float maxValue, + int nBits) +{ + using FloatTag = hn::ScalableTag; + using Int32Tag = hn::RebindToSigned; + FloatTag d_f; + Int32Tag d_i; + constexpr size_t kLanes = hn::MaxLanes(d_f); + + float constant = (float)((1 << nBits) - 1); + float delta = maxValue - minValue; + + auto squaredSum = hn::Zero(d_f); + + size_t i = 0; + for (; i + kLanes <= length; i += kLanes) { + auto arr = hn::LoadU(d_f, vector + i); + auto recArr = hn::Mul(hn::Sub(arr, hn::Set(d_f, minValue)), + hn::Set(d_f, constant / delta)); + auto recInt = hn::ConvertTo(d_i, hn::Add(recArr, hn::Set(d_f, 0.5f))); + recArr = hn::ConvertTo(d_f, recInt); + recArr = hn::MulAdd(recArr, hn::Set(d_f, delta / constant), hn::Set(d_f, minValue)); + auto diff = hn::Sub(arr, recArr); + squaredSum = hn::MulAdd(diff, diff, squaredSum); + } + + float result = hn::ReduceSum(d_f, squaredSum); + + // Tail: LoadN zero-pads; mask the diff so padding lanes don't contribute. + const size_t remaining = length - i; + if (remaining > 0) { + const auto mask = hn::FirstN(d_f, remaining); + auto arr = hn::LoadN(d_f, vector + i, remaining); + auto recArr = hn::Mul(hn::Sub(arr, hn::Set(d_f, minValue)), + hn::Set(d_f, constant / delta)); + auto recInt = hn::ConvertTo(d_i, hn::Add(recArr, hn::Set(d_f, 0.5f))); + recArr = hn::ConvertTo(d_f, recInt); + recArr = hn::MulAdd(recArr, hn::Set(d_f, delta / constant), hn::Set(d_f, minValue)); + auto diff = hn::IfThenElseZero(mask, hn::Sub(arr, recArr)); + result += hn::ReduceSum(d_f, hn::Mul(diff, diff)); + } + + return result; +} + +// ───────────────────────────────────────────────────────────────────────────── +// Two dequantization helpers — used by the three NVQ 8-bit scoring kernels. +// +// dequantize_bytes (tail / non-hot path) +// Loads kLanes uint8 values via a narrow u8 tag, widens to int32 through +// two PromoteTo calls, converts to float, then applies scale+bias+logitNQT. +// Used only for the unaligned tail elements after the main FastLanes loop; +// those elements' query floats are in their original (un-shuffled) order. +// +// dequantize_bytes_fastlanes (hot path) +// Accepts a Vec that is a full-width byte vector BitCast-ed to +// kLanes int32 lanes (i.e., 4*kLanes bytes loaded in one 512-bit register). +// Extracts the `part`-th byte from each int32 lane via shift-right + AND, +// converts to float, and applies scale+bias+logitNQT. All operations run +// at the native SIMD width — no widening, no register-width changes. +// Mirrors the Panama FastLanes strategy: +// https://www.vldb.org/pvldb/vol16/p2132-afroozeh.pdf +// ───────────────────────────────────────────────────────────────────────────── +template , + class Uint16Tag = hn::Rebind, + class Uint8Tag = hn::Rebind> +HWY_INLINE hn::Vec +dequantize_bytes(FloatTag d_f, Int32Tag d_i, Uint16Tag d_u16, Uint8Tag d_u8, + const uint8_t *HWY_RESTRICT quantized, size_t i, + float logisticScale, float logisticBias, + float invScaledAlpha, float scaledX0) +{ + const auto b_u8 = hn::LoadU(d_u8, quantized + i); + const auto b_u16 = hn::PromoteTo(d_u16, b_u8); + const auto b_i32 = hn::PromoteTo(d_i, b_u16); + auto vb = hn::ConvertTo(d_f, b_i32); + vb = hn::MulAdd(vb, hn::Set(d_f, logisticScale), hn::Set(d_f, logisticBias)); + return logitNQT(d_f, vb, invScaledAlpha, scaledX0); +} + +template > +HWY_INLINE hn::Vec +dequantize_bytes_fastlanes(FloatTag d_f, Int32Tag d_i, + hn::Vec as_ints, int part, + float logisticScale, float logisticBias, + float invScaledAlpha, float scaledX0) +{ + // Extract the `part`-th byte from each int32 lane, then convert to float. + // ShiftRightSame on a signed tag does arithmetic shift, but the AND with + // 0xFF zeroes the sign-extended upper bits, giving an unsigned 0-255 value. + auto shifted = hn::ShiftRightSame(as_ints, 8 * part); + auto masked = hn::And(shifted, hn::Set(d_i, 0xFF)); + auto vb = hn::ConvertTo(d_f, masked); + vb = hn::MulAdd(vb, hn::Set(d_f, logisticScale), hn::Set(d_f, logisticBias)); + return logitNQT(d_f, vb, invScaledAlpha, scaledX0); +} + +HWY_FLATTEN float nvq_square_l2_distance_8bit(const float *HWY_RESTRICT vector, + const uint8_t *HWY_RESTRICT quantized, + size_t length, + float alpha, float x0, + float minValue, float maxValue) +{ + using FloatTag = hn::ScalableTag; + using Int32Tag = hn::RebindToSigned; + using Uint8x4Tag = hn::ScalableTag; // 4*kLanes lanes — same total width as FloatTag + // Tail-path tags (narrower, used only outside the hot loop) + using Uint16Tag = hn::Rebind; + using Uint8Tag = hn::Rebind; + FloatTag d_f; + Int32Tag d_i; + Uint8x4Tag d_b; + Uint16Tag d_u16; + Uint8Tag d_u8; + constexpr size_t kLanes = hn::MaxLanes(d_f); + + float delta = maxValue - minValue; + float scaledAlpha = alpha / delta; + float invScaledAlpha = delta / alpha; + float scaledX0 = x0 * delta; + float logisticBias = logisticNQT_scalar(minValue, scaledAlpha, scaledX0); + float logisticScale = (logisticNQT_scalar(maxValue, scaledAlpha, scaledX0) - logisticBias) / 255.0f; + + auto squaredSum = hn::Zero(d_f); + + // FastLanes main loop: load 4*kLanes bytes per iteration (full native width), + // reinterpret as kLanes int32 values, and extract one byte per int32 via + // shift+mask for each of the 4 parts. Requires the query vector to have + // been pre-shuffled by nvq_shuffle_query_in_place_8bit. + size_t i = 0; + for (; i + 4 * kLanes <= length; i += 4 * kLanes) { + auto bytes = hn::LoadU(d_b, quantized + i); + auto as_ints = hn::BitCast(d_i, bytes); + for (int part = 0; part < 4; ++part) { + auto va = hn::LoadU(d_f, vector + i + part * kLanes); + auto vb = dequantize_bytes_fastlanes(d_f, d_i, as_ints, part, + logisticScale, logisticBias, + invScaledAlpha, scaledX0); + auto diff = hn::Sub(va, vb); + squaredSum = hn::MulAdd(diff, diff, squaredSum); + } + } + + float result = hn::ReduceSum(d_f, squaredSum); + + // kLanes-aligned tail: query floats are un-shuffled here, so use the + // sequential PromoteTo path which reads bytes in natural order. + for (; i + kLanes <= length; i += kLanes) { + auto va = hn::LoadU(d_f, vector + i); + auto vb = dequantize_bytes(d_f, d_i, d_u16, d_u8, quantized, i, + logisticScale, logisticBias, invScaledAlpha, scaledX0); + auto diff = hn::Sub(va, vb); + result += hn::ReduceSum(d_f, hn::Mul(diff, diff)); + } + + // Sub-kLanes tail: LoadN zero-pads; mask diff to exclude padding lanes. + const size_t remaining = length - i; + if (remaining > 0) { + const auto mask = hn::FirstN(d_f, remaining); + auto va = hn::LoadN(d_f, vector + i, remaining); + const auto b_u8 = hn::LoadN(d_u8, quantized + i, remaining); + const auto b_u16 = hn::PromoteTo(d_u16, b_u8); + const auto b_i32 = hn::PromoteTo(d_i, b_u16); + auto vb = hn::MulAdd(hn::ConvertTo(d_f, b_i32), + hn::Set(d_f, logisticScale), + hn::Set(d_f, logisticBias)); + vb = logitNQT(d_f, vb, invScaledAlpha, scaledX0); + auto diff = hn::IfThenElseZero(mask, hn::Sub(va, vb)); + result += hn::ReduceSum(d_f, hn::Mul(diff, diff)); + } + + return result; +} + +HWY_FLATTEN float nvq_dot_product_8bit(const float *HWY_RESTRICT vector, + const uint8_t *HWY_RESTRICT quantized, + size_t length, + float alpha, float x0, + float minValue, float maxValue) +{ + using FloatTag = hn::ScalableTag; + using Int32Tag = hn::RebindToSigned; + using Uint8x4Tag = hn::ScalableTag; // 4*kLanes lanes — same total width as FloatTag + using Uint16Tag = hn::Rebind; + using Uint8Tag = hn::Rebind; + FloatTag d_f; + Int32Tag d_i; + Uint8x4Tag d_b; + Uint16Tag d_u16; + Uint8Tag d_u8; + constexpr size_t kLanes = hn::MaxLanes(d_f); + + float delta = maxValue - minValue; + float scaledAlpha = alpha / delta; + float invScaledAlpha = delta / alpha; + float scaledX0 = x0 * delta; + float logisticBias = logisticNQT_scalar(minValue, scaledAlpha, scaledX0); + float logisticScale = (logisticNQT_scalar(maxValue, scaledAlpha, scaledX0) - logisticBias) / 255.0f; + + auto dotProd = hn::Zero(d_f); + + // FastLanes main loop: full-width byte load, shift+mask extraction. + size_t i = 0; + for (; i + 4 * kLanes <= length; i += 4 * kLanes) { + auto bytes = hn::LoadU(d_b, quantized + i); + auto as_ints = hn::BitCast(d_i, bytes); + for (int part = 0; part < 4; ++part) { + auto va = hn::LoadU(d_f, vector + i + part * kLanes); + auto vb = dequantize_bytes_fastlanes(d_f, d_i, as_ints, part, + logisticScale, logisticBias, + invScaledAlpha, scaledX0); + dotProd = hn::MulAdd(va, vb, dotProd); + } + } + + float result = hn::ReduceSum(d_f, dotProd); + + // kLanes-aligned tail: un-shuffled query, sequential byte access. + for (; i + kLanes <= length; i += kLanes) { + auto va = hn::LoadU(d_f, vector + i); + auto vb = dequantize_bytes(d_f, d_i, d_u16, d_u8, quantized, i, + logisticScale, logisticBias, invScaledAlpha, scaledX0); + result += hn::ReduceSum(d_f, hn::Mul(va, vb)); + } + + // Sub-kLanes tail: LoadN zero-pads va; 0 * vb = 0 for padding lanes. + const size_t remaining = length - i; + if (remaining > 0) { + auto va = hn::LoadN(d_f, vector + i, remaining); + const auto b_u8 = hn::LoadN(d_u8, quantized + i, remaining); + const auto b_u16 = hn::PromoteTo(d_u16, b_u8); + const auto b_i32 = hn::PromoteTo(d_i, b_u16); + auto vb = hn::MulAdd(hn::ConvertTo(d_f, b_i32), + hn::Set(d_f, logisticScale), + hn::Set(d_f, logisticBias)); + vb = logitNQT(d_f, vb, invScaledAlpha, scaledX0); + result += hn::ReduceSum(d_f, hn::Mul(va, vb)); + } + + return result; +} + +// nvq_shuffle_query_in_place_8bit +// +// Pre-processes the float query vector in-place so that nvq_cosine_8bit_packed, +// nvq_dot_product_8bit, and nvq_square_l2_distance_8bit can use the FastLanes +// byte-extraction strategy. Only complete blocks of 4*kLanes floats are +// transposed; the remaining tail elements are left in their original order and +// processed by the sequential tail path inside each scoring kernel. +// +// The permutation applied is an in-place matrix transpose of 4×kLanes blocks: +// after the shuffle, shuffled[j*kLanes + k] == original[k*4 + j], so that +// when the scoring kernel extracts byte `j` from int32 slot `k` (via +// ShiftRight(8*j) & 0xFF), it lines up with query float at position j*kLanes+k. +HWY_FLATTEN void nvq_shuffle_query_in_place_8bit(float *HWY_RESTRICT vector, + size_t length) +{ + using FloatTag = hn::ScalableTag; + FloatTag d_f; + const size_t kLanes = hn::Lanes(d_f); + const size_t step = 4 * kLanes; // block size = number of bytes in a full-width vector + const size_t mn1 = step - 1; + + // Maximum step across all ISAs compiled for: 4 * 16 = 64 (AVX-512). + // Declared outside the loop so the stack frame is allocated once. + constexpr size_t kMaxStep = 4 * hn::MaxLanes(FloatTag{}); + bool visited[kMaxStep]; + + size_t offset = 0; + while (offset + step <= length) { + float *arr = vector + offset; + memset(visited, 0, step); // only zero the portion we will inspect + + // In-place cyclic transposition: for each unvisited cycle, rotate + // elements along the cycle defined by a -> (kLanes * a) % mn1. + // This maps shuffled[p] = original[(p % kLanes) * 4 + (p / kLanes)], + // which is the inverse of the FastLanes interleaving. + for (size_t cycle = 1; cycle < step; ++cycle) { + if (visited[cycle]) continue; + size_t a = cycle; + do { + a = (a == mn1) ? mn1 : (kLanes * a) % mn1; + float temp = arr[a]; + arr[a] = arr[cycle]; + arr[cycle] = temp; + visited[a] = true; + } while (a != cycle); + } + offset += step; + } +} + +// Returns sum and bMagnitude packed into a single int64_t: +// bits [31:0] = IEEE-754 bits of sum +// bits [63:32] = IEEE-754 bits of bMagnitude +// The Java caller unpacks with Float.intBitsToFloat — no heap allocation needed. +HWY_FLATTEN int64_t nvq_cosine_8bit_packed(const float *HWY_RESTRICT vector, + const uint8_t *HWY_RESTRICT quantized, + size_t length, + float alpha, float x0, + float minValue, float maxValue, + const float *HWY_RESTRICT centroid) +{ + using FloatTag = hn::ScalableTag; + using Int32Tag = hn::RebindToSigned; + using Uint8x4Tag = hn::ScalableTag; // 4*kLanes lanes — same total width as FloatTag + using Uint16Tag = hn::Rebind; + using Uint8Tag = hn::Rebind; + FloatTag d_f; + Int32Tag d_i; + Uint8x4Tag d_b; + Uint16Tag d_u16; + Uint8Tag d_u8; + constexpr size_t kLanes = hn::MaxLanes(d_f); + + float delta = maxValue - minValue; + float scaledAlpha = alpha / delta; + float invScaledAlpha = delta / alpha; + float scaledX0 = x0 * delta; + float logisticBias = logisticNQT_scalar(minValue, scaledAlpha, scaledX0); + float logisticScale = (logisticNQT_scalar(maxValue, scaledAlpha, scaledX0) - logisticBias) / 255.0f; + + auto sumVec = hn::Zero(d_f); + auto bMagVec = hn::Zero(d_f); + + // FastLanes main loop: full-width byte load, shift+mask extraction. + size_t i = 0; + for (; i + 4 * kLanes <= length; i += 4 * kLanes) { + auto bytes = hn::LoadU(d_b, quantized + i); + auto as_ints = hn::BitCast(d_i, bytes); + for (int part = 0; part < 4; ++part) { + auto va = hn::LoadU(d_f, vector + i + part * kLanes); + auto vc = hn::LoadU(d_f, centroid + i + part * kLanes); + auto vb = dequantize_bytes_fastlanes(d_f, d_i, as_ints, part, + logisticScale, logisticBias, + invScaledAlpha, scaledX0); + vb = hn::Add(vb, vc); + sumVec = hn::MulAdd(va, vb, sumVec); + bMagVec = hn::MulAdd(vb, vb, bMagVec); + } + } + + float sum = hn::ReduceSum(d_f, sumVec); + float bMag = hn::ReduceSum(d_f, bMagVec); + + // kLanes-aligned tail: un-shuffled query and centroid, sequential bytes. + for (; i + kLanes <= length; i += kLanes) { + auto va = hn::LoadU(d_f, vector + i); + auto vc = hn::LoadU(d_f, centroid + i); + auto vb = dequantize_bytes(d_f, d_i, d_u16, d_u8, quantized, i, + logisticScale, logisticBias, invScaledAlpha, scaledX0); + vb = hn::Add(vb, vc); + sum += hn::ReduceSum(d_f, hn::Mul(va, vb)); + bMag += hn::ReduceSum(d_f, hn::Mul(vb, vb)); + } + + // Sub-kLanes tail: LoadN zero-pads va and vc. Mask vb before adding the + // centroid so padding lanes contribute 0 to both sum and bMagnitude. + const size_t remaining = length - i; + if (remaining > 0) { + const auto mask = hn::FirstN(d_f, remaining); + auto va = hn::LoadN(d_f, vector + i, remaining); + auto vc = hn::LoadN(d_f, centroid + i, remaining); + const auto b_u8 = hn::LoadN(d_u8, quantized + i, remaining); + const auto b_u16 = hn::PromoteTo(d_u16, b_u8); + const auto b_i32 = hn::PromoteTo(d_i, b_u16); + auto vb = hn::MulAdd(hn::ConvertTo(d_f, b_i32), + hn::Set(d_f, logisticScale), + hn::Set(d_f, logisticBias)); + vb = logitNQT(d_f, vb, invScaledAlpha, scaledX0); + auto vb_c = hn::Add(hn::IfThenElseZero(mask, vb), vc); + sum += hn::ReduceSum(d_f, hn::Mul(va, vb_c)); + bMag += hn::ReduceSum(d_f, hn::Mul(vb_c, vb_c)); + } + + int32_t sum_bits, bmag_bits; + memcpy(&sum_bits, &sum, sizeof(float)); + memcpy(&bmag_bits, &bMag, sizeof(float)); + return ((int64_t)bmag_bits << 32) | (int64_t)(uint32_t)sum_bits; +} + +} // namespace JV_ISA diff --git a/jvector-native/src/main/c/jvector_simd_kernels.h b/jvector-native/src/main/c/jvector_simd_kernels.h new file mode 100644 index 000000000..d6790c991 --- /dev/null +++ b/jvector-native/src/main/c/jvector_simd_kernels.h @@ -0,0 +1,138 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// headerfile for the simd kernels +#ifndef SIMD_KERNELS_H +#define SIMD_KERNELS_H + +#include +#include + +#define DECLARE_SIMD_KERNELS(ISA) \ + namespace ISA { \ + /* Base Fp32 distance kernels */ \ + float cosine_f32(const float *a, \ + size_t aoffset, \ + const float *b, \ + size_t boffset, \ + size_t length); \ + float dot_product_f32(const float *a, \ + size_t aoffset, \ + const float *b, \ + size_t boffset, \ + size_t length); \ + float euclidean_f32(const float *a, \ + size_t aoffset, \ + const float *b, \ + size_t boffset, \ + size_t length); \ + /* Element-wise in-place arithmetic */ \ + void add_in_place_f32(float *v1, \ + const float *v2, \ + size_t length); \ + void add_scalar_in_place_f32(float *v1, \ + float value, \ + size_t length); \ + void sub_in_place_f32(float *v1, \ + const float *v2, \ + size_t length); \ + void sub_scalar_in_place_f32(float *v1, \ + float value, \ + size_t length); \ + float max_f32(const float *v, \ + size_t length); \ + void min_in_place_f32(float *v1, \ + const float *v2, \ + size_t length); \ + /* PQ kernels */ \ + float assemble_and_sum_f32(const float *data, \ + int dataBase, \ + const unsigned char *baseOffsets, \ + int baseOffsetsOffset, \ + size_t baseOffsetsLength); \ + float assemble_and_sum_pq_f32(const float *data, \ + size_t subspaceCount, \ + const unsigned char *baseOffsets1, \ + int baseOffsetsOffset1, \ + const unsigned char *baseOffsets2, \ + int baseOffsetsOffset2, \ + int clusterCount); \ + float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, \ + int baseOffsetsOffset, \ + size_t baseOffsetsLength, \ + int clusterCount, \ + const float *partialSums, \ + const float *aMagnitude, \ + float bMagnitude); \ + void calculate_partial_sums_dot_f32(const float *codebook, \ + int codebookIndex, \ + size_t size, \ + int clusterCount, \ + const float *query, \ + int queryOffset, \ + float *partialSums); \ + void calculate_partial_sums_euclidean_f32(const float *codebook, \ + int codebookIndex, \ + size_t size, \ + int clusterCount, \ + const float *query, \ + int queryOffset, \ + float *partialSums); \ + void calculate_partial_sums_self_magnitude_f32(const float *codebook, \ + int codebookIndex, \ + size_t size, \ + int clusterCount, \ + float *partialSums); \ + /* NVQ kernels */ \ + void nvq_quantize_8bit(const float *vector, \ + size_t length, \ + float alpha, float x0, \ + float minValue, float maxValue, \ + unsigned char *destination); \ + float nvq_loss(const float *vector, \ + size_t length, \ + float alpha, float x0, \ + float minValue, float maxValue, \ + int nBits); \ + float nvq_uniform_loss(const float *vector, \ + size_t length, \ + float minValue, float maxValue, \ + int nBits); \ + float nvq_square_l2_distance_8bit(const float *vector, \ + const unsigned char *quantized, \ + size_t length, \ + float alpha, float x0, \ + float minValue, float maxValue); \ + float nvq_dot_product_8bit(const float *vector, \ + const unsigned char *quantized, \ + size_t length, \ + float alpha, float x0, \ + float minValue, float maxValue); \ + int64_t nvq_cosine_8bit_packed(const float *vector, \ + const unsigned char *quantized, \ + size_t length, \ + float alpha, float x0, \ + float minValue, float maxValue, \ + const float *centroid); \ + void nvq_shuffle_query_in_place_8bit(float *vector, \ + size_t length); \ + } + +DECLARE_SIMD_KERNELS(AVX3) +DECLARE_SIMD_KERNELS(AVX2) +DECLARE_SIMD_KERNELS(SSE42) + +#endif // SIMD_KERNELS_H diff --git a/jvector-native/src/main/c/meson.build b/jvector-native/src/main/c/meson.build new file mode 100644 index 000000000..c2ffe20cb --- /dev/null +++ b/jvector-native/src/main/c/meson.build @@ -0,0 +1,128 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +project('jvector_simd_kernels', 'cpp', + version: '0.1.0', + meson_version: '>= 0.55.0', + default_options: ['cpp_std=c++17'] +) + +# Highway headers are used as an include directory only. +# HWY_COMPILE_ONLY_STATIC / HWY_COMPILE_ONLY_SCALAR bypass the +# dynamic-dispatch runtime, so only headers are needed at compile time. +hwy_inc = include_directories('third_party/highway') + +# Each ISA variant: name, JV_ISA namespace, and extra compiler flags +isa_variants = [ + { + 'name' : 'avx3', + 'namespace': 'AVX3', + 'args' : ['-march=skylake-avx512', + '-DHWY_COMPILE_ONLY_STATIC', + '-DJV_REQUIRE_HWY_AVX3'], + }, + { + 'name' : 'avx2', + 'namespace': 'AVX2', + 'args' : ['-march=haswell', + '-maes', + '-DHWY_COMPILE_ONLY_STATIC', + '-DJV_REQUIRE_HWY_AVX2'], + }, + { + 'name' : 'sse42', + 'namespace': 'SSE42', + 'args' : ['-msse4.2', '-mpclmul', '-maes', + '-DHWY_COMPILE_ONLY_STATIC', + '-DJV_REQUIRE_HWY_SCALAR'], + }, +] + +isa_libs = [] +foreach isa : isa_variants + lib = static_library( + 'simdKernels_' + isa['name'], + sources : 'jvector_simd_kernels.cpp', + include_directories: hwy_inc, + cpp_args : isa['args'] + ['-DJV_ISA=' + isa['namespace'], '-fvisibility=hidden'] + ) + isa_libs += lib +endforeach + +# vectorUtil provides a single runtime-dispatch entry point. +# link_whole pulls every ISA object into the shared library so that no +# symbols are dropped when consumers link against it. +# abort.cc defines hwy::Abort, referenced by the SSE4 scalar fallbacks in +# generic_ops-inl.h. It was extracted from targets.cc specifically for +# projects that manage their own dispatch (google/highway#1935). +vectorutil_lib = shared_library( + 'jvector', + sources : ['jvector_simd.cpp', + 'third_party/highway/hwy/abort.cc'], + include_directories: [include_directories('.'), hwy_inc], + cpp_args : ['-DJVECTOR_BUILD', '-fvisibility=hidden'], + link_whole : isa_libs, + version : meson.project_version(), + install : true, +) + +# Dependency object for use by executables/tests in this build tree. +vectorutil_dep = declare_dependency( + link_with : vectorutil_lib, + include_directories: include_directories('.'), +) + +## Example driver that exercises the runtime-dispatch API. +#executable( +# 'cpp_driver', +# sources : 'examples/cpp_driver.cpp', +# dependencies: vectorutil_dep, +#) +# +## ---- Tests ----------------------------------------------------------------- +#gtest_dep = dependency('gtest_main', required: true) +# +#test_exe = executable( +# 'test_kernels', +# sources : [ +# 'tests/test_kernels.cpp', +# 'tests/test_cpuFeatures.cpp', +# ], +# dependencies: [vectorutil_dep, gtest_dep], +#) +# +#test('kernels', test_exe, protocol: 'gtest', suite: 'kernels') +#test('cpu_features', test_exe, protocol: 'gtest', suite: 'cpu', +# args: ['--gtest_filter=CpuFeaturesTest.*']) +# +## ---- Benchmarks ------------------------------------------------------------ +#gbench_dep = dependency('benchmark', required: false) +#if gbench_dep.found() +# executable( +# 'bench_kernels', +# sources : 'benchmarks/bench_kernels.cpp', +# dependencies: [vectorutil_dep, gbench_dep], +# cpp_args : ['-O3'], +# ) +#endif +# +#rust_enabled = add_languages('rust', required: false) +#if rust_enabled +# executable( +# 'rust_driver', +# 'examples/rust_driver.rs', +# link_with: vectorutil_lib, +# ) +#endif +# \ No newline at end of file diff --git a/jvector-native/src/main/c/third_party/highway b/jvector-native/src/main/c/third_party/highway new file mode 160000 index 000000000..16f5f5c76 --- /dev/null +++ b/jvector-native/src/main/c/third_party/highway @@ -0,0 +1 @@ +Subproject commit 16f5f5c768064d0bb52952888a14236bec75c104 diff --git a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorUtilSupport.java b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorUtilSupport.java index 48cd7d66e..6518ac3ef 100644 --- a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorUtilSupport.java +++ b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorUtilSupport.java @@ -76,7 +76,37 @@ public float assembleAndSum(VectorFloat data, int dataBase, ByteSequence b { assert baseOffsets.offset() == 0 : "Base offsets are expected to have an offset of 0. Found: " + baseOffsets.offset(); // baseOffsets is a pointer into a PQ chunk - we need to index into it by baseOffsetsOffset and provide baseOffsetsLength to the native code - return NativeSimdOps.assemble_and_sum_f32_512(((MemorySegmentVectorFloat) data).get(), dataBase, ((MemorySegmentByteSequence) baseOffsets).get(), baseOffsetsOffset, baseOffsetsLength); + return NativeSimdOps.assemble_and_sum_f32(((MemorySegmentVectorFloat) data).get(), dataBase, ((MemorySegmentByteSequence) baseOffsets).get(), baseOffsetsOffset, (long) baseOffsetsLength); + } + + @Override + public void addInPlace(VectorFloat v1, VectorFloat v2) { + NativeSimdOps.add_in_place_f32(((MemorySegmentVectorFloat) v1).get(), ((MemorySegmentVectorFloat) v2).get(), v1.length()); + } + + @Override + public void addInPlace(VectorFloat v1, float value) { + NativeSimdOps.add_scalar_in_place_f32(((MemorySegmentVectorFloat) v1).get(), value, v1.length()); + } + + @Override + public void subInPlace(VectorFloat v1, VectorFloat v2) { + NativeSimdOps.sub_in_place_f32(((MemorySegmentVectorFloat) v1).get(), ((MemorySegmentVectorFloat) v2).get(), v1.length()); + } + + @Override + public void subInPlace(VectorFloat vector, float value) { + NativeSimdOps.sub_scalar_in_place_f32(((MemorySegmentVectorFloat) vector).get(), value, vector.length()); + } + + @Override + public float max(VectorFloat v) { + return NativeSimdOps.max_f32(((MemorySegmentVectorFloat) v).get(), v.length()); + } + + @Override + public void minInPlace(VectorFloat v1, VectorFloat v2) { + NativeSimdOps.min_in_place_f32(((MemorySegmentVectorFloat) v1).get(), ((MemorySegmentVectorFloat) v2).get(), v1.length()); } @Override @@ -89,8 +119,14 @@ public float assembleAndSumPQ( int vector2OrdinalOffset, int clusterCount // = k ) { - //Use the non-panama solution for now - return assembleAndSumPQ_128(codebookPartialSums, subspaceCount, vector1Ordinals, vector1OrdinalOffset, vector2Ordinals, vector2OrdinalOffset, clusterCount); + assert vector1Ordinals.offset() == 0 : "vector1Ordinals offset must be 0. Found: " + vector1Ordinals.offset(); + assert vector2Ordinals.offset() == 0 : "vector2Ordinals offset must be 0. Found: " + vector2Ordinals.offset(); + return NativeSimdOps.assemble_and_sum_pq_f32( + ((MemorySegmentVectorFloat) codebookPartialSums).get(), + (long) subspaceCount, + ((MemorySegmentByteSequence) vector1Ordinals).get(), vector1OrdinalOffset, + ((MemorySegmentByteSequence) vector2Ordinals).get(), vector2OrdinalOffset, + clusterCount); } @Override @@ -102,6 +138,137 @@ public float pqDecodedCosineSimilarity(ByteSequence encoded, int clusterCount public float pqDecodedCosineSimilarity(ByteSequence encoded, int encodedOffset, int encodedLength, int clusterCount, VectorFloat partialSums, VectorFloat aMagnitude, float bMagnitude) { assert encoded.offset() == 0 : "Bulk shuffle shuffles are expected to have an offset of 0. Found: " + encoded.offset(); // encoded is a pointer into a PQ chunk - we need to index into it by encodedOffset and provide encodedLength to the native code - return NativeSimdOps.pq_decoded_cosine_similarity_f32_512(((MemorySegmentByteSequence) encoded).get(), encodedOffset, encodedLength, clusterCount, ((MemorySegmentVectorFloat) partialSums).get(), ((MemorySegmentVectorFloat) aMagnitude).get(), bMagnitude); + return NativeSimdOps.pq_decoded_cosine_similarity_f32(((MemorySegmentByteSequence) encoded).get(), encodedOffset, (long) encodedLength, clusterCount, ((MemorySegmentVectorFloat) partialSums).get(), ((MemorySegmentVectorFloat) aMagnitude).get(), bMagnitude); + } + + @Override + public float squareDistance(VectorFloat v1, VectorFloat v2) { + return NativeSimdOps.euclidean_f32(((MemorySegmentVectorFloat) v1).get(), 0, + ((MemorySegmentVectorFloat) v2).get(), 0, + v1.length()); + } + + @Override + public float squareDistance(VectorFloat v1, int v1offset, VectorFloat v2, int v2offset, int length) { + return NativeSimdOps.euclidean_f32(((MemorySegmentVectorFloat) v1).get(), v1offset, + ((MemorySegmentVectorFloat) v2).get(), v2offset, + length); + } + + @Override + public float cosine(VectorFloat v1, VectorFloat v2) { + return NativeSimdOps.cosine_f32(((MemorySegmentVectorFloat) v1).get(), 0, + ((MemorySegmentVectorFloat) v2).get(), 0, + v1.length()); + } + + @Override + public float cosine(VectorFloat v1, int v1offset, VectorFloat v2, int v2offset, int length) { + return NativeSimdOps.cosine_f32(((MemorySegmentVectorFloat) v1).get(), v1offset, + ((MemorySegmentVectorFloat) v2).get(), v2offset, + length); + } + + @Override + public float dotProduct(VectorFloat v1, VectorFloat v2) { + return NativeSimdOps.dot_product_f32(((MemorySegmentVectorFloat) v1).get(), 0, + ((MemorySegmentVectorFloat) v2).get(), 0, + v1.length()); + } + + @Override + public float dotProduct(VectorFloat v1, int v1offset, VectorFloat v2, int v2offset, int length) { + return NativeSimdOps.dot_product_f32(((MemorySegmentVectorFloat) v1).get(), v1offset, + ((MemorySegmentVectorFloat) v2).get(), v2offset, + length); + } + + @Override + public void calculatePartialSums(VectorFloat codebook, int codebookIndex, int size, int clusterCount, VectorFloat query, int queryOffset, VectorSimilarityFunction vsf, VectorFloat partialSums) { + var nativeCodebook = ((MemorySegmentVectorFloat) codebook).get(); + var nativeQuery = ((MemorySegmentVectorFloat) query).get(); + var nativePartialSums = ((MemorySegmentVectorFloat) partialSums).get(); + switch (vsf) { + case EUCLIDEAN -> NativeSimdOps.calculate_partial_sums_euclidean_f32(nativeCodebook, codebookIndex, (long) size, clusterCount, nativeQuery, queryOffset, nativePartialSums); + case DOT_PRODUCT -> NativeSimdOps.calculate_partial_sums_dot_f32(nativeCodebook, codebookIndex, (long) size, clusterCount, nativeQuery, queryOffset, nativePartialSums); + default -> throw new UnsupportedOperationException("Unsupported similarity function " + vsf); + } + } + + @Override + public void calculatePartialSelfMagnitudes(VectorFloat codebook, int codebookIndex, int size, int clusterCount, VectorFloat partialMagnitudes) { + NativeSimdOps.calculate_partial_sums_self_magnitude_f32( + ((MemorySegmentVectorFloat) codebook).get(), + codebookIndex, + (long) size, + clusterCount, + ((MemorySegmentVectorFloat) partialMagnitudes).get()); + } + + @Override + public void nvqShuffleQueryInPlace8bit(VectorFloat vector) { + NativeSimdOps.nvq_shuffle_query_in_place_8bit( + ((MemorySegmentVectorFloat) vector).get(), + (long) vector.length()); + } + + @Override + public void nvqQuantize8bit(VectorFloat vector, float alpha, float x0, float minValue, float maxValue, ByteSequence destination) { + NativeSimdOps.nvq_quantize_8bit( + ((MemorySegmentVectorFloat) vector).get(), + (long) vector.length(), + alpha, x0, minValue, maxValue, + ((MemorySegmentByteSequence) destination).get()); + } + + @Override + public float nvqLoss(VectorFloat vector, float alpha, float x0, float minValue, float maxValue, int nBits) { + return NativeSimdOps.nvq_loss( + ((MemorySegmentVectorFloat) vector).get(), + (long) vector.length(), + alpha, x0, minValue, maxValue, nBits); + } + + @Override + public float nvqUniformLoss(VectorFloat vector, float minValue, float maxValue, int nBits) { + return NativeSimdOps.nvq_uniform_loss( + ((MemorySegmentVectorFloat) vector).get(), + (long) vector.length(), + minValue, maxValue, nBits); + } + + @Override + public float nvqSquareL2Distance8bit(VectorFloat vector, ByteSequence quantizedVector, + float alpha, float x0, float minValue, float maxValue) { + return NativeSimdOps.nvq_square_l2_distance_8bit( + ((MemorySegmentVectorFloat) vector).get(), + ((MemorySegmentByteSequence) quantizedVector).get(), + (long) vector.length(), + alpha, x0, minValue, maxValue); + } + + @Override + public float nvqDotProduct8bit(VectorFloat vector, ByteSequence quantizedVector, + float alpha, float x0, float minValue, float maxValue) { + return NativeSimdOps.nvq_dot_product_8bit( + ((MemorySegmentVectorFloat) vector).get(), + ((MemorySegmentByteSequence) quantizedVector).get(), + (long) vector.length(), + alpha, x0, minValue, maxValue); + } + + @Override + public float[] nvqCosine8bit(VectorFloat vector, ByteSequence quantizedVector, + float alpha, float x0, float minValue, float maxValue, + VectorFloat centroid) { + long packed = NativeSimdOps.nvq_cosine_8bit_packed( + ((MemorySegmentVectorFloat) vector).get(), + ((MemorySegmentByteSequence) quantizedVector).get(), + (long) vector.length(), + alpha, x0, minValue, maxValue, + ((MemorySegmentVectorFloat) centroid).get()); + float sum = Float.intBitsToFloat((int)(packed & 0xFFFFFFFFL)); + float bMag = Float.intBitsToFloat((int)(packed >>> 32)); + return new float[]{sum, bMag}; } } diff --git a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorizationProvider.java b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorizationProvider.java index 5194e1108..f73f237a6 100644 --- a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorizationProvider.java +++ b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/NativeVectorizationProvider.java @@ -18,7 +18,6 @@ import io.github.jbellis.jvector.annotations.Experimental; import io.github.jbellis.jvector.vector.cnative.LibraryLoader; -import io.github.jbellis.jvector.vector.cnative.NativeSimdOps; import io.github.jbellis.jvector.vector.types.VectorTypeSupport; /** @@ -31,13 +30,14 @@ public class NativeVectorizationProvider extends VectorizationProvider { private final VectorTypeSupport vectorTypeSupport; public NativeVectorizationProvider() { + var arch = System.getProperty("os.arch", ""); + if (!arch.equals("amd64") && !arch.equals("x86_64")) { + throw new UnsupportedOperationException("Native SIMD operations are only supported on x86_64."); + } var libraryLoaded = LibraryLoader.loadJvector(); if (!libraryLoaded) { throw new UnsupportedOperationException("Failed to load supporting native library."); } - if (!NativeSimdOps.check_avx512_compatibility()) { - throw new UnsupportedOperationException("Native SIMD operations are not supported on this platform due to missing CPU support."); - } this.vectorUtilSupport = new NativeVectorUtilSupport(); this.vectorTypeSupport = new MemorySegmentVectorProvider(); } diff --git a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java index 5bf5b2f01..fc65c56b6 100644 --- a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java +++ b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOps.java @@ -22,15 +22,15 @@ public class NativeSimdOps { static final boolean TRACE_DOWNCALLS = Boolean.getBoolean("jextract.trace.downcalls"); static void traceDowncall(String name, Object... args) { - String traceArgs = Arrays.stream(args) - .map(Object::toString) - .collect(Collectors.joining(", ")); - System.out.printf("%s(%s)\n", name, traceArgs); + String traceArgs = Arrays.stream(args) + .map(Object::toString) + .collect(Collectors.joining(", ")); + System.out.printf("%s(%s)\n", name, traceArgs); } static MemorySegment findOrThrow(String symbol) { return SYMBOL_LOOKUP.find(symbol) - .orElseThrow(() -> new UnsatisfiedLinkError("unresolved symbol: " + symbol)); + .orElseThrow(() -> new UnsatisfiedLinkError("unresolved symbol: " + symbol)); } static MethodHandle upcallHandle(Class fi, String name, FunctionDescriptor fdesc) { @@ -95,756 +95,2946 @@ public static int false_() { public static int __bool_true_false_are_defined() { return __bool_true_false_are_defined; } + private static final int _STDINT_H = (int)1L; + /** + * {@snippet lang=c : + * #define _STDINT_H 1 + * } + */ + public static int _STDINT_H() { + return _STDINT_H; + } + private static final int _FEATURES_H = (int)1L; + /** + * {@snippet lang=c : + * #define _FEATURES_H 1 + * } + */ + public static int _FEATURES_H() { + return _FEATURES_H; + } + private static final int _DEFAULT_SOURCE = (int)1L; + /** + * {@snippet lang=c : + * #define _DEFAULT_SOURCE 1 + * } + */ + public static int _DEFAULT_SOURCE() { + return _DEFAULT_SOURCE; + } + private static final int __GLIBC_USE_ISOC2X = (int)0L; + /** + * {@snippet lang=c : + * #define __GLIBC_USE_ISOC2X 0 + * } + */ + public static int __GLIBC_USE_ISOC2X() { + return __GLIBC_USE_ISOC2X; + } + private static final int __USE_ISOC11 = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_ISOC11 1 + * } + */ + public static int __USE_ISOC11() { + return __USE_ISOC11; + } + private static final int __USE_ISOC99 = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_ISOC99 1 + * } + */ + public static int __USE_ISOC99() { + return __USE_ISOC99; + } + private static final int __USE_ISOC95 = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_ISOC95 1 + * } + */ + public static int __USE_ISOC95() { + return __USE_ISOC95; + } + private static final int __USE_POSIX_IMPLICITLY = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_POSIX_IMPLICITLY 1 + * } + */ + public static int __USE_POSIX_IMPLICITLY() { + return __USE_POSIX_IMPLICITLY; + } + private static final int _POSIX_SOURCE = (int)1L; + /** + * {@snippet lang=c : + * #define _POSIX_SOURCE 1 + * } + */ + public static int _POSIX_SOURCE() { + return _POSIX_SOURCE; + } + private static final int __USE_POSIX = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_POSIX 1 + * } + */ + public static int __USE_POSIX() { + return __USE_POSIX; + } + private static final int __USE_POSIX2 = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_POSIX2 1 + * } + */ + public static int __USE_POSIX2() { + return __USE_POSIX2; + } + private static final int __USE_POSIX199309 = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_POSIX199309 1 + * } + */ + public static int __USE_POSIX199309() { + return __USE_POSIX199309; + } + private static final int __USE_POSIX199506 = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_POSIX199506 1 + * } + */ + public static int __USE_POSIX199506() { + return __USE_POSIX199506; + } + private static final int __USE_XOPEN2K = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_XOPEN2K 1 + * } + */ + public static int __USE_XOPEN2K() { + return __USE_XOPEN2K; + } + private static final int __USE_XOPEN2K8 = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_XOPEN2K8 1 + * } + */ + public static int __USE_XOPEN2K8() { + return __USE_XOPEN2K8; + } + private static final int _ATFILE_SOURCE = (int)1L; + /** + * {@snippet lang=c : + * #define _ATFILE_SOURCE 1 + * } + */ + public static int _ATFILE_SOURCE() { + return _ATFILE_SOURCE; + } + private static final int __WORDSIZE = (int)64L; + /** + * {@snippet lang=c : + * #define __WORDSIZE 64 + * } + */ + public static int __WORDSIZE() { + return __WORDSIZE; + } + private static final int __WORDSIZE_TIME64_COMPAT32 = (int)1L; + /** + * {@snippet lang=c : + * #define __WORDSIZE_TIME64_COMPAT32 1 + * } + */ + public static int __WORDSIZE_TIME64_COMPAT32() { + return __WORDSIZE_TIME64_COMPAT32; + } + private static final int __SYSCALL_WORDSIZE = (int)64L; + /** + * {@snippet lang=c : + * #define __SYSCALL_WORDSIZE 64 + * } + */ + public static int __SYSCALL_WORDSIZE() { + return __SYSCALL_WORDSIZE; + } + private static final int __USE_MISC = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_MISC 1 + * } + */ + public static int __USE_MISC() { + return __USE_MISC; + } + private static final int __USE_ATFILE = (int)1L; + /** + * {@snippet lang=c : + * #define __USE_ATFILE 1 + * } + */ + public static int __USE_ATFILE() { + return __USE_ATFILE; + } + private static final int __USE_FORTIFY_LEVEL = (int)0L; + /** + * {@snippet lang=c : + * #define __USE_FORTIFY_LEVEL 0 + * } + */ + public static int __USE_FORTIFY_LEVEL() { + return __USE_FORTIFY_LEVEL; + } + private static final int __GLIBC_USE_DEPRECATED_GETS = (int)0L; + /** + * {@snippet lang=c : + * #define __GLIBC_USE_DEPRECATED_GETS 0 + * } + */ + public static int __GLIBC_USE_DEPRECATED_GETS() { + return __GLIBC_USE_DEPRECATED_GETS; + } + private static final int __GLIBC_USE_DEPRECATED_SCANF = (int)0L; + /** + * {@snippet lang=c : + * #define __GLIBC_USE_DEPRECATED_SCANF 0 + * } + */ + public static int __GLIBC_USE_DEPRECATED_SCANF() { + return __GLIBC_USE_DEPRECATED_SCANF; + } + private static final int __GLIBC_USE_C2X_STRTOL = (int)0L; + /** + * {@snippet lang=c : + * #define __GLIBC_USE_C2X_STRTOL 0 + * } + */ + public static int __GLIBC_USE_C2X_STRTOL() { + return __GLIBC_USE_C2X_STRTOL; + } + private static final int _STDC_PREDEF_H = (int)1L; + /** + * {@snippet lang=c : + * #define _STDC_PREDEF_H 1 + * } + */ + public static int _STDC_PREDEF_H() { + return _STDC_PREDEF_H; + } + private static final int __STDC_IEC_559__ = (int)1L; + /** + * {@snippet lang=c : + * #define __STDC_IEC_559__ 1 + * } + */ + public static int __STDC_IEC_559__() { + return __STDC_IEC_559__; + } + private static final int __STDC_IEC_559_COMPLEX__ = (int)1L; + /** + * {@snippet lang=c : + * #define __STDC_IEC_559_COMPLEX__ 1 + * } + */ + public static int __STDC_IEC_559_COMPLEX__() { + return __STDC_IEC_559_COMPLEX__; + } + private static final int __GNU_LIBRARY__ = (int)6L; + /** + * {@snippet lang=c : + * #define __GNU_LIBRARY__ 6 + * } + */ + public static int __GNU_LIBRARY__() { + return __GNU_LIBRARY__; + } + private static final int __GLIBC__ = (int)2L; + /** + * {@snippet lang=c : + * #define __GLIBC__ 2 + * } + */ + public static int __GLIBC__() { + return __GLIBC__; + } + private static final int __GLIBC_MINOR__ = (int)39L; + /** + * {@snippet lang=c : + * #define __GLIBC_MINOR__ 39 + * } + */ + public static int __GLIBC_MINOR__() { + return __GLIBC_MINOR__; + } + private static final int _SYS_CDEFS_H = (int)1L; + /** + * {@snippet lang=c : + * #define _SYS_CDEFS_H 1 + * } + */ + public static int _SYS_CDEFS_H() { + return _SYS_CDEFS_H; + } + private static final int __glibc_c99_flexarr_available = (int)1L; + /** + * {@snippet lang=c : + * #define __glibc_c99_flexarr_available 1 + * } + */ + public static int __glibc_c99_flexarr_available() { + return __glibc_c99_flexarr_available; + } + private static final int __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI = (int)0L; + /** + * {@snippet lang=c : + * #define __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI 0 + * } + */ + public static int __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI() { + return __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI; + } + private static final int __HAVE_GENERIC_SELECTION = (int)1L; + /** + * {@snippet lang=c : + * #define __HAVE_GENERIC_SELECTION 1 + * } + */ + public static int __HAVE_GENERIC_SELECTION() { + return __HAVE_GENERIC_SELECTION; + } + private static final int __GLIBC_USE_LIB_EXT2 = (int)0L; + /** + * {@snippet lang=c : + * #define __GLIBC_USE_LIB_EXT2 0 + * } + */ + public static int __GLIBC_USE_LIB_EXT2() { + return __GLIBC_USE_LIB_EXT2; + } + private static final int __GLIBC_USE_IEC_60559_BFP_EXT = (int)0L; + /** + * {@snippet lang=c : + * #define __GLIBC_USE_IEC_60559_BFP_EXT 0 + * } + */ + public static int __GLIBC_USE_IEC_60559_BFP_EXT() { + return __GLIBC_USE_IEC_60559_BFP_EXT; + } + private static final int __GLIBC_USE_IEC_60559_BFP_EXT_C2X = (int)0L; + /** + * {@snippet lang=c : + * #define __GLIBC_USE_IEC_60559_BFP_EXT_C2X 0 + * } + */ + public static int __GLIBC_USE_IEC_60559_BFP_EXT_C2X() { + return __GLIBC_USE_IEC_60559_BFP_EXT_C2X; + } + private static final int __GLIBC_USE_IEC_60559_EXT = (int)0L; + /** + * {@snippet lang=c : + * #define __GLIBC_USE_IEC_60559_EXT 0 + * } + */ + public static int __GLIBC_USE_IEC_60559_EXT() { + return __GLIBC_USE_IEC_60559_EXT; + } + private static final int __GLIBC_USE_IEC_60559_FUNCS_EXT = (int)0L; + /** + * {@snippet lang=c : + * #define __GLIBC_USE_IEC_60559_FUNCS_EXT 0 + * } + */ + public static int __GLIBC_USE_IEC_60559_FUNCS_EXT() { + return __GLIBC_USE_IEC_60559_FUNCS_EXT; + } + private static final int __GLIBC_USE_IEC_60559_FUNCS_EXT_C2X = (int)0L; + /** + * {@snippet lang=c : + * #define __GLIBC_USE_IEC_60559_FUNCS_EXT_C2X 0 + * } + */ + public static int __GLIBC_USE_IEC_60559_FUNCS_EXT_C2X() { + return __GLIBC_USE_IEC_60559_FUNCS_EXT_C2X; + } + private static final int __GLIBC_USE_IEC_60559_TYPES_EXT = (int)0L; + /** + * {@snippet lang=c : + * #define __GLIBC_USE_IEC_60559_TYPES_EXT 0 + * } + */ + public static int __GLIBC_USE_IEC_60559_TYPES_EXT() { + return __GLIBC_USE_IEC_60559_TYPES_EXT; + } + private static final int _BITS_TYPES_H = (int)1L; + /** + * {@snippet lang=c : + * #define _BITS_TYPES_H 1 + * } + */ + public static int _BITS_TYPES_H() { + return _BITS_TYPES_H; + } + private static final int _BITS_TYPESIZES_H = (int)1L; + /** + * {@snippet lang=c : + * #define _BITS_TYPESIZES_H 1 + * } + */ + public static int _BITS_TYPESIZES_H() { + return _BITS_TYPESIZES_H; + } + private static final int __OFF_T_MATCHES_OFF64_T = (int)1L; + /** + * {@snippet lang=c : + * #define __OFF_T_MATCHES_OFF64_T 1 + * } + */ + public static int __OFF_T_MATCHES_OFF64_T() { + return __OFF_T_MATCHES_OFF64_T; + } + private static final int __INO_T_MATCHES_INO64_T = (int)1L; + /** + * {@snippet lang=c : + * #define __INO_T_MATCHES_INO64_T 1 + * } + */ + public static int __INO_T_MATCHES_INO64_T() { + return __INO_T_MATCHES_INO64_T; + } + private static final int __RLIM_T_MATCHES_RLIM64_T = (int)1L; + /** + * {@snippet lang=c : + * #define __RLIM_T_MATCHES_RLIM64_T 1 + * } + */ + public static int __RLIM_T_MATCHES_RLIM64_T() { + return __RLIM_T_MATCHES_RLIM64_T; + } + private static final int __STATFS_MATCHES_STATFS64 = (int)1L; + /** + * {@snippet lang=c : + * #define __STATFS_MATCHES_STATFS64 1 + * } + */ + public static int __STATFS_MATCHES_STATFS64() { + return __STATFS_MATCHES_STATFS64; + } + private static final int __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64 = (int)1L; + /** + * {@snippet lang=c : + * #define __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64 1 + * } + */ + public static int __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64() { + return __KERNEL_OLD_TIMEVAL_MATCHES_TIMEVAL64; + } + private static final int __FD_SETSIZE = (int)1024L; + /** + * {@snippet lang=c : + * #define __FD_SETSIZE 1024 + * } + */ + public static int __FD_SETSIZE() { + return __FD_SETSIZE; + } + private static final int _BITS_TIME64_H = (int)1L; + /** + * {@snippet lang=c : + * #define _BITS_TIME64_H 1 + * } + */ + public static int _BITS_TIME64_H() { + return _BITS_TIME64_H; + } + private static final int _BITS_WCHAR_H = (int)1L; + /** + * {@snippet lang=c : + * #define _BITS_WCHAR_H 1 + * } + */ + public static int _BITS_WCHAR_H() { + return _BITS_WCHAR_H; + } + private static final int _BITS_STDINT_INTN_H = (int)1L; + /** + * {@snippet lang=c : + * #define _BITS_STDINT_INTN_H 1 + * } + */ + public static int _BITS_STDINT_INTN_H() { + return _BITS_STDINT_INTN_H; + } + private static final int _BITS_STDINT_UINTN_H = (int)1L; + /** + * {@snippet lang=c : + * #define _BITS_STDINT_UINTN_H 1 + * } + */ + public static int _BITS_STDINT_UINTN_H() { + return _BITS_STDINT_UINTN_H; + } + private static final int _BITS_STDINT_LEAST_H = (int)1L; + /** + * {@snippet lang=c : + * #define _BITS_STDINT_LEAST_H 1 + * } + */ + public static int _BITS_STDINT_LEAST_H() { + return _BITS_STDINT_LEAST_H; + } + /** + * {@snippet lang=c : + * typedef long ptrdiff_t + * } + */ + public static final OfLong ptrdiff_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long size_t + * } + */ + public static final OfLong size_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef int wchar_t + * } + */ + public static final OfInt wchar_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef unsigned char __u_char + * } + */ + public static final OfByte __u_char = NativeSimdOps.C_CHAR; + /** + * {@snippet lang=c : + * typedef unsigned short __u_short + * } + */ + public static final OfShort __u_short = NativeSimdOps.C_SHORT; + /** + * {@snippet lang=c : + * typedef unsigned int __u_int + * } + */ + public static final OfInt __u_int = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef unsigned long __u_long + * } + */ + public static final OfLong __u_long = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef signed char __int8_t + * } + */ + public static final OfByte __int8_t = NativeSimdOps.C_CHAR; + /** + * {@snippet lang=c : + * typedef unsigned char __uint8_t + * } + */ + public static final OfByte __uint8_t = NativeSimdOps.C_CHAR; + /** + * {@snippet lang=c : + * typedef short __int16_t + * } + */ + public static final OfShort __int16_t = NativeSimdOps.C_SHORT; + /** + * {@snippet lang=c : + * typedef unsigned short __uint16_t + * } + */ + public static final OfShort __uint16_t = NativeSimdOps.C_SHORT; + /** + * {@snippet lang=c : + * typedef int __int32_t + * } + */ + public static final OfInt __int32_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef unsigned int __uint32_t + * } + */ + public static final OfInt __uint32_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef long __int64_t + * } + */ + public static final OfLong __int64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __uint64_t + * } + */ + public static final OfLong __uint64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef __int8_t __int_least8_t + * } + */ + public static final OfByte __int_least8_t = NativeSimdOps.C_CHAR; + /** + * {@snippet lang=c : + * typedef __uint8_t __uint_least8_t + * } + */ + public static final OfByte __uint_least8_t = NativeSimdOps.C_CHAR; + /** + * {@snippet lang=c : + * typedef __int16_t __int_least16_t + * } + */ + public static final OfShort __int_least16_t = NativeSimdOps.C_SHORT; + /** + * {@snippet lang=c : + * typedef __uint16_t __uint_least16_t + * } + */ + public static final OfShort __uint_least16_t = NativeSimdOps.C_SHORT; + /** + * {@snippet lang=c : + * typedef __int32_t __int_least32_t + * } + */ + public static final OfInt __int_least32_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef __uint32_t __uint_least32_t + * } + */ + public static final OfInt __uint_least32_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef __int64_t __int_least64_t + * } + */ + public static final OfLong __int_least64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef __uint64_t __uint_least64_t + * } + */ + public static final OfLong __uint_least64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long __quad_t + * } + */ + public static final OfLong __quad_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __u_quad_t + * } + */ + public static final OfLong __u_quad_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long __intmax_t + * } + */ + public static final OfLong __intmax_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __uintmax_t + * } + */ + public static final OfLong __uintmax_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __dev_t + * } + */ + public static final OfLong __dev_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned int __uid_t + * } + */ + public static final OfInt __uid_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef unsigned int __gid_t + * } + */ + public static final OfInt __gid_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef unsigned long __ino_t + * } + */ + public static final OfLong __ino_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __ino64_t + * } + */ + public static final OfLong __ino64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned int __mode_t + * } + */ + public static final OfInt __mode_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef unsigned long __nlink_t + * } + */ + public static final OfLong __nlink_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long __off_t + * } + */ + public static final OfLong __off_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long __off64_t + * } + */ + public static final OfLong __off64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef int __pid_t + * } + */ + public static final OfInt __pid_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef long __clock_t + * } + */ + public static final OfLong __clock_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __rlim_t + * } + */ + public static final OfLong __rlim_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __rlim64_t + * } + */ + public static final OfLong __rlim64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned int __id_t + * } + */ + public static final OfInt __id_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef long __time_t + * } + */ + public static final OfLong __time_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned int __useconds_t + * } + */ + public static final OfInt __useconds_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef long __suseconds_t + * } + */ + public static final OfLong __suseconds_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long __suseconds64_t + * } + */ + public static final OfLong __suseconds64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef int __daddr_t + * } + */ + public static final OfInt __daddr_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef int __key_t + * } + */ + public static final OfInt __key_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef int __clockid_t + * } + */ + public static final OfInt __clockid_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef void *__timer_t + * } + */ + public static final AddressLayout __timer_t = NativeSimdOps.C_POINTER; + /** + * {@snippet lang=c : + * typedef long __blksize_t + * } + */ + public static final OfLong __blksize_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long __blkcnt_t + * } + */ + public static final OfLong __blkcnt_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long __blkcnt64_t + * } + */ + public static final OfLong __blkcnt64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __fsblkcnt_t + * } + */ + public static final OfLong __fsblkcnt_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __fsblkcnt64_t + * } + */ + public static final OfLong __fsblkcnt64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __fsfilcnt_t + * } + */ + public static final OfLong __fsfilcnt_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __fsfilcnt64_t + * } + */ + public static final OfLong __fsfilcnt64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long __fsword_t + * } + */ + public static final OfLong __fsword_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long __ssize_t + * } + */ + public static final OfLong __ssize_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long __syscall_slong_t + * } + */ + public static final OfLong __syscall_slong_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long __syscall_ulong_t + * } + */ + public static final OfLong __syscall_ulong_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef __off64_t __loff_t + * } + */ + public static final OfLong __loff_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef char *__caddr_t + * } + */ + public static final AddressLayout __caddr_t = NativeSimdOps.C_POINTER; + /** + * {@snippet lang=c : + * typedef long __intptr_t + * } + */ + public static final OfLong __intptr_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned int __socklen_t + * } + */ + public static final OfInt __socklen_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef int __sig_atomic_t + * } + */ + public static final OfInt __sig_atomic_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef __int8_t int8_t + * } + */ + public static final OfByte int8_t = NativeSimdOps.C_CHAR; + /** + * {@snippet lang=c : + * typedef __int16_t int16_t + * } + */ + public static final OfShort int16_t = NativeSimdOps.C_SHORT; + /** + * {@snippet lang=c : + * typedef __int32_t int32_t + * } + */ + public static final OfInt int32_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef __int64_t int64_t + * } + */ + public static final OfLong int64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef __uint8_t uint8_t + * } + */ + public static final OfByte uint8_t = NativeSimdOps.C_CHAR; + /** + * {@snippet lang=c : + * typedef __uint16_t uint16_t + * } + */ + public static final OfShort uint16_t = NativeSimdOps.C_SHORT; + /** + * {@snippet lang=c : + * typedef __uint32_t uint32_t + * } + */ + public static final OfInt uint32_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef __uint64_t uint64_t + * } + */ + public static final OfLong uint64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef __int_least8_t int_least8_t + * } + */ + public static final OfByte int_least8_t = NativeSimdOps.C_CHAR; + /** + * {@snippet lang=c : + * typedef __int_least16_t int_least16_t + * } + */ + public static final OfShort int_least16_t = NativeSimdOps.C_SHORT; + /** + * {@snippet lang=c : + * typedef __int_least32_t int_least32_t + * } + */ + public static final OfInt int_least32_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef __int_least64_t int_least64_t + * } + */ + public static final OfLong int_least64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef __uint_least8_t uint_least8_t + * } + */ + public static final OfByte uint_least8_t = NativeSimdOps.C_CHAR; + /** + * {@snippet lang=c : + * typedef __uint_least16_t uint_least16_t + * } + */ + public static final OfShort uint_least16_t = NativeSimdOps.C_SHORT; + /** + * {@snippet lang=c : + * typedef __uint_least32_t uint_least32_t + * } + */ + public static final OfInt uint_least32_t = NativeSimdOps.C_INT; + /** + * {@snippet lang=c : + * typedef __uint_least64_t uint_least64_t + * } + */ + public static final OfLong uint_least64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef signed char int_fast8_t + * } + */ + public static final OfByte int_fast8_t = NativeSimdOps.C_CHAR; + /** + * {@snippet lang=c : + * typedef long int_fast16_t + * } + */ + public static final OfLong int_fast16_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long int_fast32_t + * } + */ + public static final OfLong int_fast32_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long int_fast64_t + * } + */ + public static final OfLong int_fast64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned char uint_fast8_t + * } + */ + public static final OfByte uint_fast8_t = NativeSimdOps.C_CHAR; + /** + * {@snippet lang=c : + * typedef unsigned long uint_fast16_t + * } + */ + public static final OfLong uint_fast16_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long uint_fast32_t + * } + */ + public static final OfLong uint_fast32_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long uint_fast64_t + * } + */ + public static final OfLong uint_fast64_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef long intptr_t + * } + */ + public static final OfLong intptr_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef unsigned long uintptr_t + * } + */ + public static final OfLong uintptr_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef __intmax_t intmax_t + * } + */ + public static final OfLong intmax_t = NativeSimdOps.C_LONG; + /** + * {@snippet lang=c : + * typedef __uintmax_t uintmax_t + * } + */ + public static final OfLong uintmax_t = NativeSimdOps.C_LONG; + + private static class assemble_and_sum_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.of( + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_INT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_INT, + NativeSimdOps.C_LONG + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("assemble_and_sum_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * float assemble_and_sum_f32(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength) + * } + */ + public static FunctionDescriptor assemble_and_sum_f32$descriptor() { + return assemble_and_sum_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * float assemble_and_sum_f32(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength) + * } + */ + public static MethodHandle assemble_and_sum_f32$handle() { + return assemble_and_sum_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * float assemble_and_sum_f32(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength) + * } + */ + public static MemorySegment assemble_and_sum_f32$address() { + return assemble_and_sum_f32.ADDR; + } + + /** + * {@snippet lang=c : + * float assemble_and_sum_f32(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength) + * } + */ + public static float assemble_and_sum_f32(MemorySegment data, int dataBase, MemorySegment baseOffsets, int baseOffsetsOffset, long baseOffsetsLength) { + var mh$ = assemble_and_sum_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("assemble_and_sum_f32", data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength); + } + return (float)mh$.invokeExact(data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class assemble_and_sum_pq_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.of( + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_INT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_INT, + NativeSimdOps.C_INT + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("assemble_and_sum_pq_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * float assemble_and_sum_pq_f32(const float *data, size_t subspaceCount, const unsigned char *baseOffsets1, int baseOffsetsOffset1, const unsigned char *baseOffsets2, int baseOffsetsOffset2, int clusterCount) + * } + */ + public static FunctionDescriptor assemble_and_sum_pq_f32$descriptor() { + return assemble_and_sum_pq_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * float assemble_and_sum_pq_f32(const float *data, size_t subspaceCount, const unsigned char *baseOffsets1, int baseOffsetsOffset1, const unsigned char *baseOffsets2, int baseOffsetsOffset2, int clusterCount) + * } + */ + public static MethodHandle assemble_and_sum_pq_f32$handle() { + return assemble_and_sum_pq_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * float assemble_and_sum_pq_f32(const float *data, size_t subspaceCount, const unsigned char *baseOffsets1, int baseOffsetsOffset1, const unsigned char *baseOffsets2, int baseOffsetsOffset2, int clusterCount) + * } + */ + public static MemorySegment assemble_and_sum_pq_f32$address() { + return assemble_and_sum_pq_f32.ADDR; + } + + /** + * {@snippet lang=c : + * float assemble_and_sum_pq_f32(const float *data, size_t subspaceCount, const unsigned char *baseOffsets1, int baseOffsetsOffset1, const unsigned char *baseOffsets2, int baseOffsetsOffset2, int clusterCount) + * } + */ + public static float assemble_and_sum_pq_f32(MemorySegment data, long subspaceCount, MemorySegment baseOffsets1, int baseOffsetsOffset1, MemorySegment baseOffsets2, int baseOffsetsOffset2, int clusterCount) { + var mh$ = assemble_and_sum_pq_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("assemble_and_sum_pq_f32", data, subspaceCount, baseOffsets1, baseOffsetsOffset1, baseOffsets2, baseOffsetsOffset2, clusterCount); + } + return (float)mh$.invokeExact(data, subspaceCount, baseOffsets1, baseOffsetsOffset1, baseOffsets2, baseOffsetsOffset2, clusterCount); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class pq_decoded_cosine_similarity_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.of( + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_INT, + NativeSimdOps.C_LONG, + NativeSimdOps.C_INT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_FLOAT + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("pq_decoded_cosine_similarity_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude) + * } + */ + public static FunctionDescriptor pq_decoded_cosine_similarity_f32$descriptor() { + return pq_decoded_cosine_similarity_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude) + * } + */ + public static MethodHandle pq_decoded_cosine_similarity_f32$handle() { + return pq_decoded_cosine_similarity_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude) + * } + */ + public static MemorySegment pq_decoded_cosine_similarity_f32$address() { + return pq_decoded_cosine_similarity_f32.ADDR; + } + + /** + * {@snippet lang=c : + * float pq_decoded_cosine_similarity_f32(const unsigned char *baseOffsets, int baseOffsetsOffset, size_t baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude) + * } + */ + public static float pq_decoded_cosine_similarity_f32(MemorySegment baseOffsets, int baseOffsetsOffset, long baseOffsetsLength, int clusterCount, MemorySegment partialSums, MemorySegment aMagnitude, float bMagnitude) { + var mh$ = pq_decoded_cosine_similarity_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("pq_decoded_cosine_similarity_f32", baseOffsets, baseOffsetsOffset, baseOffsetsLength, clusterCount, partialSums, aMagnitude, bMagnitude); + } + return (float)mh$.invokeExact(baseOffsets, baseOffsetsOffset, baseOffsetsLength, clusterCount, partialSums, aMagnitude, bMagnitude); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class calculate_partial_sums_euclidean_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( + NativeSimdOps.C_POINTER, + NativeSimdOps.C_INT, + NativeSimdOps.C_LONG, + NativeSimdOps.C_INT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_INT, + NativeSimdOps.C_POINTER + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_euclidean_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * void calculate_partial_sums_euclidean_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * } + */ + public static FunctionDescriptor calculate_partial_sums_euclidean_f32$descriptor() { + return calculate_partial_sums_euclidean_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * void calculate_partial_sums_euclidean_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * } + */ + public static MethodHandle calculate_partial_sums_euclidean_f32$handle() { + return calculate_partial_sums_euclidean_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * void calculate_partial_sums_euclidean_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * } + */ + public static MemorySegment calculate_partial_sums_euclidean_f32$address() { + return calculate_partial_sums_euclidean_f32.ADDR; + } + + /** + * {@snippet lang=c : + * void calculate_partial_sums_euclidean_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * } + */ + public static void calculate_partial_sums_euclidean_f32(MemorySegment codebook, int codebookBase, long size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums) { + var mh$ = calculate_partial_sums_euclidean_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("calculate_partial_sums_euclidean_f32", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums); + } + mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class calculate_partial_sums_dot_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( + NativeSimdOps.C_POINTER, + NativeSimdOps.C_INT, + NativeSimdOps.C_LONG, + NativeSimdOps.C_INT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_INT, + NativeSimdOps.C_POINTER + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_dot_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * void calculate_partial_sums_dot_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * } + */ + public static FunctionDescriptor calculate_partial_sums_dot_f32$descriptor() { + return calculate_partial_sums_dot_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * void calculate_partial_sums_dot_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * } + */ + public static MethodHandle calculate_partial_sums_dot_f32$handle() { + return calculate_partial_sums_dot_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * void calculate_partial_sums_dot_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * } + */ + public static MemorySegment calculate_partial_sums_dot_f32$address() { + return calculate_partial_sums_dot_f32.ADDR; + } + + /** + * {@snippet lang=c : + * void calculate_partial_sums_dot_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * } + */ + public static void calculate_partial_sums_dot_f32(MemorySegment codebook, int codebookBase, long size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums) { + var mh$ = calculate_partial_sums_dot_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("calculate_partial_sums_dot_f32", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums); + } + mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class calculate_partial_sums_self_magnitude_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( + NativeSimdOps.C_POINTER, + NativeSimdOps.C_INT, + NativeSimdOps.C_LONG, + NativeSimdOps.C_INT, + NativeSimdOps.C_POINTER + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_self_magnitude_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * void calculate_partial_sums_self_magnitude_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, float *partialSums) + * } + */ + public static FunctionDescriptor calculate_partial_sums_self_magnitude_f32$descriptor() { + return calculate_partial_sums_self_magnitude_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * void calculate_partial_sums_self_magnitude_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, float *partialSums) + * } + */ + public static MethodHandle calculate_partial_sums_self_magnitude_f32$handle() { + return calculate_partial_sums_self_magnitude_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * void calculate_partial_sums_self_magnitude_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, float *partialSums) + * } + */ + public static MemorySegment calculate_partial_sums_self_magnitude_f32$address() { + return calculate_partial_sums_self_magnitude_f32.ADDR; + } + + /** + * {@snippet lang=c : + * void calculate_partial_sums_self_magnitude_f32(const float *codebook, int codebookBase, size_t size, int clusterCount, float *partialSums) + * } + */ + public static void calculate_partial_sums_self_magnitude_f32(MemorySegment codebook, int codebookBase, long size, int clusterCount, MemorySegment partialSums) { + var mh$ = calculate_partial_sums_self_magnitude_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("calculate_partial_sums_self_magnitude_f32", codebook, codebookBase, size, clusterCount, partialSums); + } + mh$.invokeExact(codebook, codebookBase, size, clusterCount, partialSums); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class dot_product_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.of( + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_LONG + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("dot_product_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * float dot_product_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static FunctionDescriptor dot_product_f32$descriptor() { + return dot_product_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * float dot_product_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static MethodHandle dot_product_f32$handle() { + return dot_product_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * float dot_product_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static MemorySegment dot_product_f32$address() { + return dot_product_f32.ADDR; + } + + /** + * {@snippet lang=c : + * float dot_product_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static float dot_product_f32(MemorySegment a, long aoffset, MemorySegment b, long boffset, long length) { + var mh$ = dot_product_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("dot_product_f32", a, aoffset, b, boffset, length); + } + return (float)mh$.invokeExact(a, aoffset, b, boffset, length); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class cosine_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.of( + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_LONG + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("cosine_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * float cosine_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static FunctionDescriptor cosine_f32$descriptor() { + return cosine_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * float cosine_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static MethodHandle cosine_f32$handle() { + return cosine_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * float cosine_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static MemorySegment cosine_f32$address() { + return cosine_f32.ADDR; + } + + /** + * {@snippet lang=c : + * float cosine_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static float cosine_f32(MemorySegment a, long aoffset, MemorySegment b, long boffset, long length) { + var mh$ = cosine_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("cosine_f32", a, aoffset, b, boffset, length); + } + return (float)mh$.invokeExact(a, aoffset, b, boffset, length); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class euclidean_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.of( + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_LONG + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("euclidean_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * float euclidean_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static FunctionDescriptor euclidean_f32$descriptor() { + return euclidean_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * float euclidean_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static MethodHandle euclidean_f32$handle() { + return euclidean_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * float euclidean_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static MemorySegment euclidean_f32$address() { + return euclidean_f32.ADDR; + } + + /** + * {@snippet lang=c : + * float euclidean_f32(const float *a, size_t aoffset, const float *b, size_t boffset, size_t length) + * } + */ + public static float euclidean_f32(MemorySegment a, long aoffset, MemorySegment b, long boffset, long length) { + var mh$ = euclidean_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("euclidean_f32", a, aoffset, b, boffset, length); + } + return (float)mh$.invokeExact(a, aoffset, b, boffset, length); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class nvq_quantize_8bit { + public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_quantize_8bit"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * void nvq_quantize_8bit(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, unsigned char *destination) + * } + */ + public static FunctionDescriptor nvq_quantize_8bit$descriptor() { + return nvq_quantize_8bit.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * void nvq_quantize_8bit(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, unsigned char *destination) + * } + */ + public static MethodHandle nvq_quantize_8bit$handle() { + return nvq_quantize_8bit.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * void nvq_quantize_8bit(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, unsigned char *destination) + * } + */ + public static MemorySegment nvq_quantize_8bit$address() { + return nvq_quantize_8bit.ADDR; + } + + /** + * {@snippet lang=c : + * void nvq_quantize_8bit(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, unsigned char *destination) + * } + */ + public static void nvq_quantize_8bit(MemorySegment vector, long length, float alpha, float x0, float minValue, float maxValue, MemorySegment destination) { + var mh$ = nvq_quantize_8bit.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("nvq_quantize_8bit", vector, length, alpha, x0, minValue, maxValue, destination); + } + mh$.invokeExact(vector, length, alpha, x0, minValue, maxValue, destination); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class nvq_loss { + public static final FunctionDescriptor DESC = FunctionDescriptor.of( + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_INT + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_loss"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * float nvq_loss(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, int nBits) + * } + */ + public static FunctionDescriptor nvq_loss$descriptor() { + return nvq_loss.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * float nvq_loss(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, int nBits) + * } + */ + public static MethodHandle nvq_loss$handle() { + return nvq_loss.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * float nvq_loss(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, int nBits) + * } + */ + public static MemorySegment nvq_loss$address() { + return nvq_loss.ADDR; + } + + /** + * {@snippet lang=c : + * float nvq_loss(const float *vector, size_t length, float alpha, float x0, float minValue, float maxValue, int nBits) + * } + */ + public static float nvq_loss(MemorySegment vector, long length, float alpha, float x0, float minValue, float maxValue, int nBits) { + var mh$ = nvq_loss.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("nvq_loss", vector, length, alpha, x0, minValue, maxValue, nBits); + } + return (float)mh$.invokeExact(vector, length, alpha, x0, minValue, maxValue, nBits); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class nvq_uniform_loss { + public static final FunctionDescriptor DESC = FunctionDescriptor.of( + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_INT + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_uniform_loss"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * float nvq_uniform_loss(const float *vector, size_t length, float minValue, float maxValue, int nBits) + * } + */ + public static FunctionDescriptor nvq_uniform_loss$descriptor() { + return nvq_uniform_loss.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * float nvq_uniform_loss(const float *vector, size_t length, float minValue, float maxValue, int nBits) + * } + */ + public static MethodHandle nvq_uniform_loss$handle() { + return nvq_uniform_loss.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * float nvq_uniform_loss(const float *vector, size_t length, float minValue, float maxValue, int nBits) + * } + */ + public static MemorySegment nvq_uniform_loss$address() { + return nvq_uniform_loss.ADDR; + } + + /** + * {@snippet lang=c : + * float nvq_uniform_loss(const float *vector, size_t length, float minValue, float maxValue, int nBits) + * } + */ + public static float nvq_uniform_loss(MemorySegment vector, long length, float minValue, float maxValue, int nBits) { + var mh$ = nvq_uniform_loss.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("nvq_uniform_loss", vector, length, minValue, maxValue, nBits); + } + return (float)mh$.invokeExact(vector, length, minValue, maxValue, nBits); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class nvq_square_l2_distance_8bit { + public static final FunctionDescriptor DESC = FunctionDescriptor.of( + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_square_l2_distance_8bit"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * float nvq_square_l2_distance_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue) + * } + */ + public static FunctionDescriptor nvq_square_l2_distance_8bit$descriptor() { + return nvq_square_l2_distance_8bit.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * float nvq_square_l2_distance_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue) + * } + */ + public static MethodHandle nvq_square_l2_distance_8bit$handle() { + return nvq_square_l2_distance_8bit.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * float nvq_square_l2_distance_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue) + * } + */ + public static MemorySegment nvq_square_l2_distance_8bit$address() { + return nvq_square_l2_distance_8bit.ADDR; + } + + /** + * {@snippet lang=c : + * float nvq_square_l2_distance_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue) + * } + */ + public static float nvq_square_l2_distance_8bit(MemorySegment vector, MemorySegment quantized, long length, float alpha, float x0, float minValue, float maxValue) { + var mh$ = nvq_square_l2_distance_8bit.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("nvq_square_l2_distance_8bit", vector, quantized, length, alpha, x0, minValue, maxValue); + } + return (float)mh$.invokeExact(vector, quantized, length, alpha, x0, minValue, maxValue); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class nvq_dot_product_8bit { + public static final FunctionDescriptor DESC = FunctionDescriptor.of( + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_dot_product_8bit"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * float nvq_dot_product_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue) + * } + */ + public static FunctionDescriptor nvq_dot_product_8bit$descriptor() { + return nvq_dot_product_8bit.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * float nvq_dot_product_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue) + * } + */ + public static MethodHandle nvq_dot_product_8bit$handle() { + return nvq_dot_product_8bit.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * float nvq_dot_product_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue) + * } + */ + public static MemorySegment nvq_dot_product_8bit$address() { + return nvq_dot_product_8bit.ADDR; + } + + /** + * {@snippet lang=c : + * float nvq_dot_product_8bit(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue) + * } + */ + public static float nvq_dot_product_8bit(MemorySegment vector, MemorySegment quantized, long length, float alpha, float x0, float minValue, float maxValue) { + var mh$ = nvq_dot_product_8bit.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("nvq_dot_product_8bit", vector, quantized, length, alpha, x0, minValue, maxValue); + } + return (float)mh$.invokeExact(vector, quantized, length, alpha, x0, minValue, maxValue); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class nvq_cosine_8bit_packed { + public static final FunctionDescriptor DESC = FunctionDescriptor.of( + NativeSimdOps.C_LONG, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_cosine_8bit_packed"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * int64_t nvq_cosine_8bit_packed(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue, const float *centroid) + * } + */ + public static FunctionDescriptor nvq_cosine_8bit_packed$descriptor() { + return nvq_cosine_8bit_packed.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * int64_t nvq_cosine_8bit_packed(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue, const float *centroid) + * } + */ + public static MethodHandle nvq_cosine_8bit_packed$handle() { + return nvq_cosine_8bit_packed.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * int64_t nvq_cosine_8bit_packed(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue, const float *centroid) + * } + */ + public static MemorySegment nvq_cosine_8bit_packed$address() { + return nvq_cosine_8bit_packed.ADDR; + } + + /** + * {@snippet lang=c : + * int64_t nvq_cosine_8bit_packed(const float *vector, const unsigned char *quantized, size_t length, float alpha, float x0, float minValue, float maxValue, const float *centroid) + * } + */ + public static long nvq_cosine_8bit_packed(MemorySegment vector, MemorySegment quantized, long length, float alpha, float x0, float minValue, float maxValue, MemorySegment centroid) { + var mh$ = nvq_cosine_8bit_packed.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("nvq_cosine_8bit_packed", vector, quantized, length, alpha, x0, minValue, maxValue, centroid); + } + return (long)mh$.invokeExact(vector, quantized, length, alpha, x0, minValue, maxValue, centroid); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class nvq_shuffle_query_in_place_8bit { + public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("nvq_shuffle_query_in_place_8bit"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * void nvq_shuffle_query_in_place_8bit(float *vector, size_t length) + * } + */ + public static FunctionDescriptor nvq_shuffle_query_in_place_8bit$descriptor() { + return nvq_shuffle_query_in_place_8bit.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * void nvq_shuffle_query_in_place_8bit(float *vector, size_t length) + * } + */ + public static MethodHandle nvq_shuffle_query_in_place_8bit$handle() { + return nvq_shuffle_query_in_place_8bit.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * void nvq_shuffle_query_in_place_8bit(float *vector, size_t length) + * } + */ + public static MemorySegment nvq_shuffle_query_in_place_8bit$address() { + return nvq_shuffle_query_in_place_8bit.ADDR; + } + + /** + * {@snippet lang=c : + * void nvq_shuffle_query_in_place_8bit(float *vector, size_t length) + * } + */ + public static void nvq_shuffle_query_in_place_8bit(MemorySegment vector, long length) { + var mh$ = nvq_shuffle_query_in_place_8bit.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("nvq_shuffle_query_in_place_8bit", vector, length); + } + mh$.invokeExact(vector, length); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class add_in_place_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( + NativeSimdOps.C_POINTER, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("add_in_place_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } - private static class check_avx512_compatibility { + /** + * Function descriptor for: + * {@snippet lang=c : + * void add_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static FunctionDescriptor add_in_place_f32$descriptor() { + return add_in_place_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * void add_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static MethodHandle add_in_place_f32$handle() { + return add_in_place_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * void add_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static MemorySegment add_in_place_f32$address() { + return add_in_place_f32.ADDR; + } + + /** + * {@snippet lang=c : + * void add_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static void add_in_place_f32(MemorySegment v1, MemorySegment v2, long length) { + var mh$ = add_in_place_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("add_in_place_f32", v1, v2, length); + } + mh$.invokeExact(v1, v2, length); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class add_scalar_in_place_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( + NativeSimdOps.C_POINTER, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_LONG + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("add_scalar_in_place_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * void add_scalar_in_place_f32(float *v1, float value, size_t length) + * } + */ + public static FunctionDescriptor add_scalar_in_place_f32$descriptor() { + return add_scalar_in_place_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * void add_scalar_in_place_f32(float *v1, float value, size_t length) + * } + */ + public static MethodHandle add_scalar_in_place_f32$handle() { + return add_scalar_in_place_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * void add_scalar_in_place_f32(float *v1, float value, size_t length) + * } + */ + public static MemorySegment add_scalar_in_place_f32$address() { + return add_scalar_in_place_f32.ADDR; + } + + /** + * {@snippet lang=c : + * void add_scalar_in_place_f32(float *v1, float value, size_t length) + * } + */ + public static void add_scalar_in_place_f32(MemorySegment v1, float value, long length) { + var mh$ = add_scalar_in_place_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("add_scalar_in_place_f32", v1, value, length); + } + mh$.invokeExact(v1, value, length); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class sub_in_place_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( + NativeSimdOps.C_POINTER, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("sub_in_place_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * void sub_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static FunctionDescriptor sub_in_place_f32$descriptor() { + return sub_in_place_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * void sub_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static MethodHandle sub_in_place_f32$handle() { + return sub_in_place_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * void sub_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static MemorySegment sub_in_place_f32$address() { + return sub_in_place_f32.ADDR; + } + + /** + * {@snippet lang=c : + * void sub_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static void sub_in_place_f32(MemorySegment v1, MemorySegment v2, long length) { + var mh$ = sub_in_place_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("sub_in_place_f32", v1, v2, length); + } + mh$.invokeExact(v1, v2, length); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class sub_scalar_in_place_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( + NativeSimdOps.C_POINTER, + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_LONG + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("sub_scalar_in_place_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * void sub_scalar_in_place_f32(float *v1, float value, size_t length) + * } + */ + public static FunctionDescriptor sub_scalar_in_place_f32$descriptor() { + return sub_scalar_in_place_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * void sub_scalar_in_place_f32(float *v1, float value, size_t length) + * } + */ + public static MethodHandle sub_scalar_in_place_f32$handle() { + return sub_scalar_in_place_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * void sub_scalar_in_place_f32(float *v1, float value, size_t length) + * } + */ + public static MemorySegment sub_scalar_in_place_f32$address() { + return sub_scalar_in_place_f32.ADDR; + } + + /** + * {@snippet lang=c : + * void sub_scalar_in_place_f32(float *v1, float value, size_t length) + * } + */ + public static void sub_scalar_in_place_f32(MemorySegment v1, float value, long length) { + var mh$ = sub_scalar_in_place_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("sub_scalar_in_place_f32", v1, value, length); + } + mh$.invokeExact(v1, value, length); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class max_f32 { public static final FunctionDescriptor DESC = FunctionDescriptor.of( - NativeSimdOps.C_BOOL ); + NativeSimdOps.C_FLOAT, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG + ); + + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("max_f32"); + + public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + } + + /** + * Function descriptor for: + * {@snippet lang=c : + * float max_f32(const float *v, size_t length) + * } + */ + public static FunctionDescriptor max_f32$descriptor() { + return max_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * float max_f32(const float *v, size_t length) + * } + */ + public static MethodHandle max_f32$handle() { + return max_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * float max_f32(const float *v, size_t length) + * } + */ + public static MemorySegment max_f32$address() { + return max_f32.ADDR; + } + + /** + * {@snippet lang=c : + * float max_f32(const float *v, size_t length) + * } + */ + public static float max_f32(MemorySegment v, long length) { + var mh$ = max_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("max_f32", v, length); + } + return (float)mh$.invokeExact(v, length); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + + private static class min_in_place_f32 { + public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( + NativeSimdOps.C_POINTER, + NativeSimdOps.C_POINTER, + NativeSimdOps.C_LONG + ); - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("check_avx512_compatibility"); + public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("min_in_place_f32"); public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); } - + + /** + * Function descriptor for: + * {@snippet lang=c : + * void min_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static FunctionDescriptor min_in_place_f32$descriptor() { + return min_in_place_f32.DESC; + } + + /** + * Downcall method handle for: + * {@snippet lang=c : + * void min_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static MethodHandle min_in_place_f32$handle() { + return min_in_place_f32.HANDLE; + } + + /** + * Address for: + * {@snippet lang=c : + * void min_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static MemorySegment min_in_place_f32$address() { + return min_in_place_f32.ADDR; + } + + /** + * {@snippet lang=c : + * void min_in_place_f32(float *v1, const float *v2, size_t length) + * } + */ + public static void min_in_place_f32(MemorySegment v1, MemorySegment v2, long length) { + var mh$ = min_in_place_f32.HANDLE; + try { + if (TRACE_DOWNCALLS) { + traceDowncall("min_in_place_f32", v1, v2, length); + } + mh$.invokeExact(v1, v2, length); + } catch (Throwable ex$) { + throw new AssertionError("should not reach here", ex$); + } + } + private static final MemorySegment NULL = MemorySegment.ofAddress(0L); + /** + * {@snippet lang=c : + * #define NULL (void*) 0 + * } + */ + public static MemorySegment NULL() { + return NULL; + } + private static final long _POSIX_C_SOURCE = 200809L; + /** + * {@snippet lang=c : + * #define _POSIX_C_SOURCE 200809 + * } + */ + public static long _POSIX_C_SOURCE() { + return _POSIX_C_SOURCE; + } + private static final int __TIMESIZE = (int)64L; + /** + * {@snippet lang=c : + * #define __TIMESIZE 64 + * } + */ + public static int __TIMESIZE() { + return __TIMESIZE; + } + private static final long __STDC_IEC_60559_BFP__ = 201404L; + /** + * {@snippet lang=c : + * #define __STDC_IEC_60559_BFP__ 201404 + * } + */ + public static long __STDC_IEC_60559_BFP__() { + return __STDC_IEC_60559_BFP__; + } + private static final long __STDC_IEC_60559_COMPLEX__ = 201404L; + /** + * {@snippet lang=c : + * #define __STDC_IEC_60559_COMPLEX__ 201404 + * } + */ + public static long __STDC_IEC_60559_COMPLEX__() { + return __STDC_IEC_60559_COMPLEX__; + } + private static final long __STDC_ISO_10646__ = 201706L; + /** + * {@snippet lang=c : + * #define __STDC_ISO_10646__ 201706 + * } + */ + public static long __STDC_ISO_10646__() { + return __STDC_ISO_10646__; + } + private static final int __WCHAR_MAX = (int)2147483647L; + /** + * {@snippet lang=c : + * #define __WCHAR_MAX 2147483647 + * } + */ + public static int __WCHAR_MAX() { + return __WCHAR_MAX; + } + private static final int __WCHAR_MIN = (int)-2147483648L; /** - * Function descriptor for: * {@snippet lang=c : - * _Bool check_avx512_compatibility() + * #define __WCHAR_MIN -2147483648 * } */ - public static FunctionDescriptor check_avx512_compatibility$descriptor() { - return check_avx512_compatibility.DESC; + public static int __WCHAR_MIN() { + return __WCHAR_MIN; } - + private static final int INT8_MIN = (int)-128L; /** - * Downcall method handle for: * {@snippet lang=c : - * _Bool check_avx512_compatibility() + * #define INT8_MIN -128 * } */ - public static MethodHandle check_avx512_compatibility$handle() { - return check_avx512_compatibility.HANDLE; + public static int INT8_MIN() { + return INT8_MIN; } - + private static final int INT16_MIN = (int)-32768L; /** - * Address for: * {@snippet lang=c : - * _Bool check_avx512_compatibility() + * #define INT16_MIN -32768 * } */ - public static MemorySegment check_avx512_compatibility$address() { - return check_avx512_compatibility.ADDR; + public static int INT16_MIN() { + return INT16_MIN; } - + private static final int INT32_MIN = (int)-2147483648L; /** * {@snippet lang=c : - * _Bool check_avx512_compatibility() + * #define INT32_MIN -2147483648 * } */ - public static boolean check_avx512_compatibility() { - var mh$ = check_avx512_compatibility.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("check_avx512_compatibility"); - } - return (boolean)mh$.invokeExact(); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } - } - - private static class dot_product_f32 { - public static final FunctionDescriptor DESC = FunctionDescriptor.of( - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT - ); - - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("dot_product_f32"); - - public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + public static int INT32_MIN() { + return INT32_MIN; } - + private static final long INT64_MIN = -9223372036854775808L; /** - * Function descriptor for: * {@snippet lang=c : - * float dot_product_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length) + * #define INT64_MIN -9223372036854775808 * } */ - public static FunctionDescriptor dot_product_f32$descriptor() { - return dot_product_f32.DESC; + public static long INT64_MIN() { + return INT64_MIN; } - + private static final int INT8_MAX = (int)127L; /** - * Downcall method handle for: * {@snippet lang=c : - * float dot_product_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length) + * #define INT8_MAX 127 * } */ - public static MethodHandle dot_product_f32$handle() { - return dot_product_f32.HANDLE; + public static int INT8_MAX() { + return INT8_MAX; } - + private static final int INT16_MAX = (int)32767L; /** - * Address for: * {@snippet lang=c : - * float dot_product_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length) + * #define INT16_MAX 32767 * } */ - public static MemorySegment dot_product_f32$address() { - return dot_product_f32.ADDR; + public static int INT16_MAX() { + return INT16_MAX; } - + private static final int INT32_MAX = (int)2147483647L; /** * {@snippet lang=c : - * float dot_product_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length) + * #define INT32_MAX 2147483647 * } */ - public static float dot_product_f32(int preferred_size, MemorySegment a, int aoffset, MemorySegment b, int boffset, int length) { - var mh$ = dot_product_f32.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("dot_product_f32", preferred_size, a, aoffset, b, boffset, length); - } - return (float)mh$.invokeExact(preferred_size, a, aoffset, b, boffset, length); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } - } - - private static class euclidean_f32 { - public static final FunctionDescriptor DESC = FunctionDescriptor.of( - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT - ); - - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("euclidean_f32"); - - public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + public static int INT32_MAX() { + return INT32_MAX; } - + private static final long INT64_MAX = 9223372036854775807L; /** - * Function descriptor for: * {@snippet lang=c : - * float euclidean_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length) + * #define INT64_MAX 9223372036854775807 * } */ - public static FunctionDescriptor euclidean_f32$descriptor() { - return euclidean_f32.DESC; + public static long INT64_MAX() { + return INT64_MAX; } - + private static final int UINT8_MAX = (int)255L; /** - * Downcall method handle for: * {@snippet lang=c : - * float euclidean_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length) + * #define UINT8_MAX 255 * } */ - public static MethodHandle euclidean_f32$handle() { - return euclidean_f32.HANDLE; + public static int UINT8_MAX() { + return UINT8_MAX; } - + private static final int UINT16_MAX = (int)65535L; /** - * Address for: * {@snippet lang=c : - * float euclidean_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length) + * #define UINT16_MAX 65535 * } */ - public static MemorySegment euclidean_f32$address() { - return euclidean_f32.ADDR; + public static int UINT16_MAX() { + return UINT16_MAX; } - + private static final int UINT32_MAX = (int)4294967295L; /** * {@snippet lang=c : - * float euclidean_f32(int preferred_size, const float *a, int aoffset, const float *b, int boffset, int length) + * #define UINT32_MAX 4294967295 * } */ - public static float euclidean_f32(int preferred_size, MemorySegment a, int aoffset, MemorySegment b, int boffset, int length) { - var mh$ = euclidean_f32.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("euclidean_f32", preferred_size, a, aoffset, b, boffset, length); - } - return (float)mh$.invokeExact(preferred_size, a, aoffset, b, boffset, length); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } - } - - private static class bulk_quantized_shuffle_dot_f32_512 { - public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_POINTER - ); - - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("bulk_quantized_shuffle_dot_f32_512"); - - public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + public static int UINT32_MAX() { + return UINT32_MAX; } - + private static final long UINT64_MAX = -1L; /** - * Function descriptor for: * {@snippet lang=c : - * void bulk_quantized_shuffle_dot_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results) + * #define UINT64_MAX -1 * } */ - public static FunctionDescriptor bulk_quantized_shuffle_dot_f32_512$descriptor() { - return bulk_quantized_shuffle_dot_f32_512.DESC; + public static long UINT64_MAX() { + return UINT64_MAX; } - + private static final int INT_LEAST8_MIN = (int)-128L; /** - * Downcall method handle for: * {@snippet lang=c : - * void bulk_quantized_shuffle_dot_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results) + * #define INT_LEAST8_MIN -128 * } */ - public static MethodHandle bulk_quantized_shuffle_dot_f32_512$handle() { - return bulk_quantized_shuffle_dot_f32_512.HANDLE; + public static int INT_LEAST8_MIN() { + return INT_LEAST8_MIN; } - + private static final int INT_LEAST16_MIN = (int)-32768L; /** - * Address for: * {@snippet lang=c : - * void bulk_quantized_shuffle_dot_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results) + * #define INT_LEAST16_MIN -32768 * } */ - public static MemorySegment bulk_quantized_shuffle_dot_f32_512$address() { - return bulk_quantized_shuffle_dot_f32_512.ADDR; + public static int INT_LEAST16_MIN() { + return INT_LEAST16_MIN; } - + private static final int INT_LEAST32_MIN = (int)-2147483648L; /** * {@snippet lang=c : - * void bulk_quantized_shuffle_dot_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results) + * #define INT_LEAST32_MIN -2147483648 * } */ - public static void bulk_quantized_shuffle_dot_f32_512(MemorySegment shuffles, int codebookCount, MemorySegment quantizedPartials, float delta, float minDistance, MemorySegment results) { - var mh$ = bulk_quantized_shuffle_dot_f32_512.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("bulk_quantized_shuffle_dot_f32_512", shuffles, codebookCount, quantizedPartials, delta, minDistance, results); - } - mh$.invokeExact(shuffles, codebookCount, quantizedPartials, delta, minDistance, results); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } - } - - private static class bulk_quantized_shuffle_euclidean_f32_512 { - public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_POINTER - ); - - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("bulk_quantized_shuffle_euclidean_f32_512"); - - public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + public static int INT_LEAST32_MIN() { + return INT_LEAST32_MIN; } - + private static final long INT_LEAST64_MIN = -9223372036854775808L; /** - * Function descriptor for: * {@snippet lang=c : - * void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results) + * #define INT_LEAST64_MIN -9223372036854775808 * } */ - public static FunctionDescriptor bulk_quantized_shuffle_euclidean_f32_512$descriptor() { - return bulk_quantized_shuffle_euclidean_f32_512.DESC; + public static long INT_LEAST64_MIN() { + return INT_LEAST64_MIN; } - + private static final int INT_LEAST8_MAX = (int)127L; /** - * Downcall method handle for: * {@snippet lang=c : - * void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results) + * #define INT_LEAST8_MAX 127 * } */ - public static MethodHandle bulk_quantized_shuffle_euclidean_f32_512$handle() { - return bulk_quantized_shuffle_euclidean_f32_512.HANDLE; + public static int INT_LEAST8_MAX() { + return INT_LEAST8_MAX; } - + private static final int INT_LEAST16_MAX = (int)32767L; /** - * Address for: * {@snippet lang=c : - * void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results) + * #define INT_LEAST16_MAX 32767 * } */ - public static MemorySegment bulk_quantized_shuffle_euclidean_f32_512$address() { - return bulk_quantized_shuffle_euclidean_f32_512.ADDR; + public static int INT_LEAST16_MAX() { + return INT_LEAST16_MAX; } - + private static final int INT_LEAST32_MAX = (int)2147483647L; /** * {@snippet lang=c : - * void bulk_quantized_shuffle_euclidean_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartials, float delta, float minDistance, float *results) + * #define INT_LEAST32_MAX 2147483647 * } */ - public static void bulk_quantized_shuffle_euclidean_f32_512(MemorySegment shuffles, int codebookCount, MemorySegment quantizedPartials, float delta, float minDistance, MemorySegment results) { - var mh$ = bulk_quantized_shuffle_euclidean_f32_512.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("bulk_quantized_shuffle_euclidean_f32_512", shuffles, codebookCount, quantizedPartials, delta, minDistance, results); - } - mh$.invokeExact(shuffles, codebookCount, quantizedPartials, delta, minDistance, results); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } - } - - private static class bulk_quantized_shuffle_cosine_f32_512 { - public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_POINTER - ); - - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("bulk_quantized_shuffle_cosine_f32_512"); - - public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + public static int INT_LEAST32_MAX() { + return INT_LEAST32_MAX; } - + private static final long INT_LEAST64_MAX = 9223372036854775807L; /** - * Function descriptor for: * {@snippet lang=c : - * void bulk_quantized_shuffle_cosine_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartialSums, float sumDelta, float minDistance, const char *quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float *results) + * #define INT_LEAST64_MAX 9223372036854775807 * } */ - public static FunctionDescriptor bulk_quantized_shuffle_cosine_f32_512$descriptor() { - return bulk_quantized_shuffle_cosine_f32_512.DESC; + public static long INT_LEAST64_MAX() { + return INT_LEAST64_MAX; } - + private static final int UINT_LEAST8_MAX = (int)255L; /** - * Downcall method handle for: * {@snippet lang=c : - * void bulk_quantized_shuffle_cosine_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartialSums, float sumDelta, float minDistance, const char *quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float *results) + * #define UINT_LEAST8_MAX 255 * } */ - public static MethodHandle bulk_quantized_shuffle_cosine_f32_512$handle() { - return bulk_quantized_shuffle_cosine_f32_512.HANDLE; + public static int UINT_LEAST8_MAX() { + return UINT_LEAST8_MAX; } - + private static final int UINT_LEAST16_MAX = (int)65535L; /** - * Address for: * {@snippet lang=c : - * void bulk_quantized_shuffle_cosine_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartialSums, float sumDelta, float minDistance, const char *quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float *results) + * #define UINT_LEAST16_MAX 65535 * } */ - public static MemorySegment bulk_quantized_shuffle_cosine_f32_512$address() { - return bulk_quantized_shuffle_cosine_f32_512.ADDR; + public static int UINT_LEAST16_MAX() { + return UINT_LEAST16_MAX; } - + private static final int UINT_LEAST32_MAX = (int)4294967295L; /** * {@snippet lang=c : - * void bulk_quantized_shuffle_cosine_f32_512(const unsigned char *shuffles, int codebookCount, const char *quantizedPartialSums, float sumDelta, float minDistance, const char *quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, float *results) + * #define UINT_LEAST32_MAX 4294967295 * } */ - public static void bulk_quantized_shuffle_cosine_f32_512(MemorySegment shuffles, int codebookCount, MemorySegment quantizedPartialSums, float sumDelta, float minDistance, MemorySegment quantizedPartialMagnitudes, float magnitudeDelta, float minMagnitude, float queryMagnitudeSquared, MemorySegment results) { - var mh$ = bulk_quantized_shuffle_cosine_f32_512.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("bulk_quantized_shuffle_cosine_f32_512", shuffles, codebookCount, quantizedPartialSums, sumDelta, minDistance, quantizedPartialMagnitudes, magnitudeDelta, minMagnitude, queryMagnitudeSquared, results); - } - mh$.invokeExact(shuffles, codebookCount, quantizedPartialSums, sumDelta, minDistance, quantizedPartialMagnitudes, magnitudeDelta, minMagnitude, queryMagnitudeSquared, results); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } - } - - private static class assemble_and_sum_f32_512 { - public static final FunctionDescriptor DESC = FunctionDescriptor.of( - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT - ); - - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("assemble_and_sum_f32_512"); - - public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + public static int UINT_LEAST32_MAX() { + return UINT_LEAST32_MAX; } - + private static final long UINT_LEAST64_MAX = -1L; /** - * Function descriptor for: * {@snippet lang=c : - * float assemble_and_sum_f32_512(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) + * #define UINT_LEAST64_MAX -1 * } */ - public static FunctionDescriptor assemble_and_sum_f32_512$descriptor() { - return assemble_and_sum_f32_512.DESC; + public static long UINT_LEAST64_MAX() { + return UINT_LEAST64_MAX; } - + private static final int INT_FAST8_MIN = (int)-128L; /** - * Downcall method handle for: * {@snippet lang=c : - * float assemble_and_sum_f32_512(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) + * #define INT_FAST8_MIN -128 * } */ - public static MethodHandle assemble_and_sum_f32_512$handle() { - return assemble_and_sum_f32_512.HANDLE; + public static int INT_FAST8_MIN() { + return INT_FAST8_MIN; } - + private static final long INT_FAST16_MIN = -9223372036854775808L; /** - * Address for: * {@snippet lang=c : - * float assemble_and_sum_f32_512(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) + * #define INT_FAST16_MIN -9223372036854775808 * } */ - public static MemorySegment assemble_and_sum_f32_512$address() { - return assemble_and_sum_f32_512.ADDR; + public static long INT_FAST16_MIN() { + return INT_FAST16_MIN; } - + private static final long INT_FAST32_MIN = -9223372036854775808L; /** * {@snippet lang=c : - * float assemble_and_sum_f32_512(const float *data, int dataBase, const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) + * #define INT_FAST32_MIN -9223372036854775808 * } */ - public static float assemble_and_sum_f32_512(MemorySegment data, int dataBase, MemorySegment baseOffsets, int baseOffsetsOffset, int baseOffsetsLength) { - var mh$ = assemble_and_sum_f32_512.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("assemble_and_sum_f32_512", data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength); - } - return (float)mh$.invokeExact(data, dataBase, baseOffsets, baseOffsetsOffset, baseOffsetsLength); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } - } - - private static class pq_decoded_cosine_similarity_f32_512 { - public static final FunctionDescriptor DESC = FunctionDescriptor.of( - NativeSimdOps.C_FLOAT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_FLOAT - ); - - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("pq_decoded_cosine_similarity_f32_512"); - - public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + public static long INT_FAST32_MIN() { + return INT_FAST32_MIN; } - + private static final long INT_FAST64_MIN = -9223372036854775808L; /** - * Function descriptor for: * {@snippet lang=c : - * float pq_decoded_cosine_similarity_f32_512(const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude) + * #define INT_FAST64_MIN -9223372036854775808 * } */ - public static FunctionDescriptor pq_decoded_cosine_similarity_f32_512$descriptor() { - return pq_decoded_cosine_similarity_f32_512.DESC; + public static long INT_FAST64_MIN() { + return INT_FAST64_MIN; } - + private static final int INT_FAST8_MAX = (int)127L; /** - * Downcall method handle for: * {@snippet lang=c : - * float pq_decoded_cosine_similarity_f32_512(const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude) + * #define INT_FAST8_MAX 127 * } */ - public static MethodHandle pq_decoded_cosine_similarity_f32_512$handle() { - return pq_decoded_cosine_similarity_f32_512.HANDLE; + public static int INT_FAST8_MAX() { + return INT_FAST8_MAX; } - + private static final long INT_FAST16_MAX = 9223372036854775807L; /** - * Address for: * {@snippet lang=c : - * float pq_decoded_cosine_similarity_f32_512(const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude) + * #define INT_FAST16_MAX 9223372036854775807 * } */ - public static MemorySegment pq_decoded_cosine_similarity_f32_512$address() { - return pq_decoded_cosine_similarity_f32_512.ADDR; + public static long INT_FAST16_MAX() { + return INT_FAST16_MAX; } - + private static final long INT_FAST32_MAX = 9223372036854775807L; /** * {@snippet lang=c : - * float pq_decoded_cosine_similarity_f32_512(const unsigned char *baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, const float *partialSums, const float *aMagnitude, float bMagnitude) + * #define INT_FAST32_MAX 9223372036854775807 * } */ - public static float pq_decoded_cosine_similarity_f32_512(MemorySegment baseOffsets, int baseOffsetsOffset, int baseOffsetsLength, int clusterCount, MemorySegment partialSums, MemorySegment aMagnitude, float bMagnitude) { - var mh$ = pq_decoded_cosine_similarity_f32_512.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("pq_decoded_cosine_similarity_f32_512", baseOffsets, baseOffsetsOffset, baseOffsetsLength, clusterCount, partialSums, aMagnitude, bMagnitude); - } - return (float)mh$.invokeExact(baseOffsets, baseOffsetsOffset, baseOffsetsLength, clusterCount, partialSums, aMagnitude, bMagnitude); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } + public static long INT_FAST32_MAX() { + return INT_FAST32_MAX; } - - private static class calculate_partial_sums_dot_f32_512 { - public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER - ); - - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_dot_f32_512"); - - public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + private static final long INT_FAST64_MAX = 9223372036854775807L; + /** + * {@snippet lang=c : + * #define INT_FAST64_MAX 9223372036854775807 + * } + */ + public static long INT_FAST64_MAX() { + return INT_FAST64_MAX; } - + private static final int UINT_FAST8_MAX = (int)255L; /** - * Function descriptor for: * {@snippet lang=c : - * void calculate_partial_sums_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * #define UINT_FAST8_MAX 255 * } */ - public static FunctionDescriptor calculate_partial_sums_dot_f32_512$descriptor() { - return calculate_partial_sums_dot_f32_512.DESC; + public static int UINT_FAST8_MAX() { + return UINT_FAST8_MAX; } - + private static final long UINT_FAST16_MAX = -1L; /** - * Downcall method handle for: * {@snippet lang=c : - * void calculate_partial_sums_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * #define UINT_FAST16_MAX -1 * } */ - public static MethodHandle calculate_partial_sums_dot_f32_512$handle() { - return calculate_partial_sums_dot_f32_512.HANDLE; + public static long UINT_FAST16_MAX() { + return UINT_FAST16_MAX; } - + private static final long UINT_FAST32_MAX = -1L; /** - * Address for: * {@snippet lang=c : - * void calculate_partial_sums_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * #define UINT_FAST32_MAX -1 * } */ - public static MemorySegment calculate_partial_sums_dot_f32_512$address() { - return calculate_partial_sums_dot_f32_512.ADDR; + public static long UINT_FAST32_MAX() { + return UINT_FAST32_MAX; } - + private static final long UINT_FAST64_MAX = -1L; /** * {@snippet lang=c : - * void calculate_partial_sums_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * #define UINT_FAST64_MAX -1 * } */ - public static void calculate_partial_sums_dot_f32_512(MemorySegment codebook, int codebookBase, int size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums) { - var mh$ = calculate_partial_sums_dot_f32_512.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("calculate_partial_sums_dot_f32_512", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums); - } - mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } + public static long UINT_FAST64_MAX() { + return UINT_FAST64_MAX; } - - private static class calculate_partial_sums_euclidean_f32_512 { - public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER - ); - - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_euclidean_f32_512"); - - public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + private static final long INTPTR_MIN = -9223372036854775808L; + /** + * {@snippet lang=c : + * #define INTPTR_MIN -9223372036854775808 + * } + */ + public static long INTPTR_MIN() { + return INTPTR_MIN; } - + private static final long INTPTR_MAX = 9223372036854775807L; /** - * Function descriptor for: * {@snippet lang=c : - * void calculate_partial_sums_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * #define INTPTR_MAX 9223372036854775807 * } */ - public static FunctionDescriptor calculate_partial_sums_euclidean_f32_512$descriptor() { - return calculate_partial_sums_euclidean_f32_512.DESC; + public static long INTPTR_MAX() { + return INTPTR_MAX; } - + private static final long UINTPTR_MAX = -1L; /** - * Downcall method handle for: * {@snippet lang=c : - * void calculate_partial_sums_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * #define UINTPTR_MAX -1 * } */ - public static MethodHandle calculate_partial_sums_euclidean_f32_512$handle() { - return calculate_partial_sums_euclidean_f32_512.HANDLE; + public static long UINTPTR_MAX() { + return UINTPTR_MAX; } - + private static final long INTMAX_MIN = -9223372036854775808L; /** - * Address for: * {@snippet lang=c : - * void calculate_partial_sums_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * #define INTMAX_MIN -9223372036854775808 * } */ - public static MemorySegment calculate_partial_sums_euclidean_f32_512$address() { - return calculate_partial_sums_euclidean_f32_512.ADDR; + public static long INTMAX_MIN() { + return INTMAX_MIN; } - + private static final long INTMAX_MAX = 9223372036854775807L; /** * {@snippet lang=c : - * void calculate_partial_sums_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums) + * #define INTMAX_MAX 9223372036854775807 * } */ - public static void calculate_partial_sums_euclidean_f32_512(MemorySegment codebook, int codebookBase, int size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums) { - var mh$ = calculate_partial_sums_euclidean_f32_512.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("calculate_partial_sums_euclidean_f32_512", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums); - } - mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } + public static long INTMAX_MAX() { + return INTMAX_MAX; } - - private static class calculate_partial_sums_best_dot_f32_512 { - public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_POINTER - ); - - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_best_dot_f32_512"); - - public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + private static final long UINTMAX_MAX = -1L; + /** + * {@snippet lang=c : + * #define UINTMAX_MAX -1 + * } + */ + public static long UINTMAX_MAX() { + return UINTMAX_MAX; } - + private static final long PTRDIFF_MIN = -9223372036854775808L; /** - * Function descriptor for: * {@snippet lang=c : - * void calculate_partial_sums_best_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances) + * #define PTRDIFF_MIN -9223372036854775808 * } */ - public static FunctionDescriptor calculate_partial_sums_best_dot_f32_512$descriptor() { - return calculate_partial_sums_best_dot_f32_512.DESC; + public static long PTRDIFF_MIN() { + return PTRDIFF_MIN; } - + private static final long PTRDIFF_MAX = 9223372036854775807L; /** - * Downcall method handle for: * {@snippet lang=c : - * void calculate_partial_sums_best_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances) + * #define PTRDIFF_MAX 9223372036854775807 * } */ - public static MethodHandle calculate_partial_sums_best_dot_f32_512$handle() { - return calculate_partial_sums_best_dot_f32_512.HANDLE; + public static long PTRDIFF_MAX() { + return PTRDIFF_MAX; } - + private static final int SIG_ATOMIC_MIN = (int)-2147483648L; /** - * Address for: * {@snippet lang=c : - * void calculate_partial_sums_best_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances) + * #define SIG_ATOMIC_MIN -2147483648 * } */ - public static MemorySegment calculate_partial_sums_best_dot_f32_512$address() { - return calculate_partial_sums_best_dot_f32_512.ADDR; + public static int SIG_ATOMIC_MIN() { + return SIG_ATOMIC_MIN; } - + private static final int SIG_ATOMIC_MAX = (int)2147483647L; /** * {@snippet lang=c : - * void calculate_partial_sums_best_dot_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances) + * #define SIG_ATOMIC_MAX 2147483647 * } */ - public static void calculate_partial_sums_best_dot_f32_512(MemorySegment codebook, int codebookBase, int size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums, MemorySegment partialBestDistances) { - var mh$ = calculate_partial_sums_best_dot_f32_512.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("calculate_partial_sums_best_dot_f32_512", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums, partialBestDistances); - } - mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums, partialBestDistances); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } + public static int SIG_ATOMIC_MAX() { + return SIG_ATOMIC_MAX; } - - private static class calculate_partial_sums_best_euclidean_f32_512 { - public static final FunctionDescriptor DESC = FunctionDescriptor.ofVoid( - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_INT, - NativeSimdOps.C_POINTER, - NativeSimdOps.C_POINTER - ); - - public static final MemorySegment ADDR = NativeSimdOps.findOrThrow("calculate_partial_sums_best_euclidean_f32_512"); - - public static final MethodHandle HANDLE = Linker.nativeLinker().downcallHandle(ADDR, DESC, Linker.Option.critical(true)); + private static final long SIZE_MAX = -1L; + /** + * {@snippet lang=c : + * #define SIZE_MAX -1 + * } + */ + public static long SIZE_MAX() { + return SIZE_MAX; } - + private static final int WCHAR_MIN = (int)-2147483648L; /** - * Function descriptor for: * {@snippet lang=c : - * void calculate_partial_sums_best_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances) + * #define WCHAR_MIN -2147483648 * } */ - public static FunctionDescriptor calculate_partial_sums_best_euclidean_f32_512$descriptor() { - return calculate_partial_sums_best_euclidean_f32_512.DESC; + public static int WCHAR_MIN() { + return WCHAR_MIN; } - + private static final int WCHAR_MAX = (int)2147483647L; /** - * Downcall method handle for: * {@snippet lang=c : - * void calculate_partial_sums_best_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances) + * #define WCHAR_MAX 2147483647 * } */ - public static MethodHandle calculate_partial_sums_best_euclidean_f32_512$handle() { - return calculate_partial_sums_best_euclidean_f32_512.HANDLE; + public static int WCHAR_MAX() { + return WCHAR_MAX; } - + private static final int WINT_MIN = (int)0L; /** - * Address for: * {@snippet lang=c : - * void calculate_partial_sums_best_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances) + * #define WINT_MIN 0 * } */ - public static MemorySegment calculate_partial_sums_best_euclidean_f32_512$address() { - return calculate_partial_sums_best_euclidean_f32_512.ADDR; + public static int WINT_MIN() { + return WINT_MIN; } - + private static final int WINT_MAX = (int)4294967295L; /** * {@snippet lang=c : - * void calculate_partial_sums_best_euclidean_f32_512(const float *codebook, int codebookBase, int size, int clusterCount, const float *query, int queryOffset, float *partialSums, float *partialBestDistances) + * #define WINT_MAX 4294967295 * } */ - public static void calculate_partial_sums_best_euclidean_f32_512(MemorySegment codebook, int codebookBase, int size, int clusterCount, MemorySegment query, int queryOffset, MemorySegment partialSums, MemorySegment partialBestDistances) { - var mh$ = calculate_partial_sums_best_euclidean_f32_512.HANDLE; - try { - if (TRACE_DOWNCALLS) { - traceDowncall("calculate_partial_sums_best_euclidean_f32_512", codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums, partialBestDistances); - } - mh$.invokeExact(codebook, codebookBase, size, clusterCount, query, queryOffset, partialSums, partialBestDistances); - } catch (Throwable ex$) { - throw new AssertionError("should not reach here", ex$); - } + public static int WINT_MAX() { + return WINT_MAX; } -} \ No newline at end of file +} + diff --git a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/__fsid_t.java b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/__fsid_t.java new file mode 100644 index 000000000..fcf526e28 --- /dev/null +++ b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/__fsid_t.java @@ -0,0 +1,160 @@ +// Generated by jextract + +package io.github.jbellis.jvector.vector.cnative; + +import java.lang.invoke.*; +import java.lang.foreign.*; +import java.nio.ByteOrder; +import java.util.*; +import java.util.function.*; +import java.util.stream.*; + +import static java.lang.foreign.ValueLayout.*; +import static java.lang.foreign.MemoryLayout.PathElement.*; + +/** + * {@snippet lang=c : + * struct { + * int __val[2]; + * } + * } + */ +public class __fsid_t { + + __fsid_t() { + // Should not be called directly + } + + private static final GroupLayout $LAYOUT = MemoryLayout.structLayout( + MemoryLayout.sequenceLayout(2, NativeSimdOps.C_INT).withName("__val") + ).withName("$anon$155:12"); + + /** + * The layout of this struct + */ + public static final GroupLayout layout() { + return $LAYOUT; + } + + private static final SequenceLayout __val$LAYOUT = (SequenceLayout)$LAYOUT.select(groupElement("__val")); + + /** + * Layout for field: + * {@snippet lang=c : + * int __val[2] + * } + */ + public static final SequenceLayout __val$layout() { + return __val$LAYOUT; + } + + private static final long __val$OFFSET = 0; + + /** + * Offset for field: + * {@snippet lang=c : + * int __val[2] + * } + */ + public static final long __val$offset() { + return __val$OFFSET; + } + + /** + * Getter for field: + * {@snippet lang=c : + * int __val[2] + * } + */ + public static MemorySegment __val(MemorySegment struct) { + return struct.asSlice(__val$OFFSET, __val$LAYOUT.byteSize()); + } + + /** + * Setter for field: + * {@snippet lang=c : + * int __val[2] + * } + */ + public static void __val(MemorySegment struct, MemorySegment fieldValue) { + MemorySegment.copy(fieldValue, 0L, struct, __val$OFFSET, __val$LAYOUT.byteSize()); + } + + private static long[] __val$DIMS = { 2 }; + + /** + * Dimensions for array field: + * {@snippet lang=c : + * int __val[2] + * } + */ + public static long[] __val$dimensions() { + return __val$DIMS; + } + private static final VarHandle __val$ELEM_HANDLE = __val$LAYOUT.varHandle(sequenceElement()); + + /** + * Indexed getter for field: + * {@snippet lang=c : + * int __val[2] + * } + */ + public static int __val(MemorySegment struct, long index0) { + return (int)__val$ELEM_HANDLE.get(struct, 0L, index0); + } + + /** + * Indexed setter for field: + * {@snippet lang=c : + * int __val[2] + * } + */ + public static void __val(MemorySegment struct, long index0, int fieldValue) { + __val$ELEM_HANDLE.set(struct, 0L, index0, fieldValue); + } + + /** + * Obtains a slice of {@code arrayParam} which selects the array element at {@code index}. + * The returned segment has address {@code arrayParam.address() + index * layout().byteSize()} + */ + public static MemorySegment asSlice(MemorySegment array, long index) { + return array.asSlice(layout().byteSize() * index); + } + + /** + * The size (in bytes) of this struct + */ + public static long sizeof() { return layout().byteSize(); } + + /** + * Allocate a segment of size {@code layout().byteSize()} using {@code allocator} + */ + public static MemorySegment allocate(SegmentAllocator allocator) { + return allocator.allocate(layout()); + } + + /** + * Allocate an array of size {@code elementCount} using {@code allocator}. + * The returned segment has size {@code elementCount * layout().byteSize()}. + */ + public static MemorySegment allocateArray(long elementCount, SegmentAllocator allocator) { + return allocator.allocate(MemoryLayout.sequenceLayout(elementCount, layout())); + } + + /** + * Reinterprets {@code addr} using target {@code arena} and {@code cleanupAction} (if any). + * The returned segment has size {@code layout().byteSize()} + */ + public static MemorySegment reinterpret(MemorySegment addr, Arena arena, Consumer cleanup) { + return reinterpret(addr, 1, arena, cleanup); + } + + /** + * Reinterprets {@code addr} using target {@code arena} and {@code cleanupAction} (if any). + * The returned segment has size {@code elementCount * layout().byteSize()} + */ + public static MemorySegment reinterpret(MemorySegment addr, long elementCount, Arena arena, Consumer cleanup) { + return addr.reinterpret(layout().byteSize() * elementCount, arena, cleanup); + } +} + diff --git a/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/max_align_t.java b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/max_align_t.java new file mode 100644 index 000000000..dccb93436 --- /dev/null +++ b/jvector-native/src/main/java/io/github/jbellis/jvector/vector/cnative/max_align_t.java @@ -0,0 +1,129 @@ +// Generated by jextract + +package io.github.jbellis.jvector.vector.cnative; + +import java.lang.invoke.*; +import java.lang.foreign.*; +import java.nio.ByteOrder; +import java.util.*; +import java.util.function.*; +import java.util.stream.*; + +import static java.lang.foreign.ValueLayout.*; +import static java.lang.foreign.MemoryLayout.PathElement.*; + +/** + * {@snippet lang=c : + * struct { + * long long __clang_max_align_nonce1; + * long double __clang_max_align_nonce2; + * } + * } + */ +public class max_align_t { + + max_align_t() { + // Should not be called directly + } + + private static final GroupLayout $LAYOUT = MemoryLayout.structLayout( + NativeSimdOps.C_LONG_LONG.withName("__clang_max_align_nonce1"), + MemoryLayout.paddingLayout(24) + ).withName("$anon$19:9"); + + /** + * The layout of this struct + */ + public static final GroupLayout layout() { + return $LAYOUT; + } + + private static final OfLong __clang_max_align_nonce1$LAYOUT = (OfLong)$LAYOUT.select(groupElement("__clang_max_align_nonce1")); + + /** + * Layout for field: + * {@snippet lang=c : + * long long __clang_max_align_nonce1 + * } + */ + public static final OfLong __clang_max_align_nonce1$layout() { + return __clang_max_align_nonce1$LAYOUT; + } + + private static final long __clang_max_align_nonce1$OFFSET = 0; + + /** + * Offset for field: + * {@snippet lang=c : + * long long __clang_max_align_nonce1 + * } + */ + public static final long __clang_max_align_nonce1$offset() { + return __clang_max_align_nonce1$OFFSET; + } + + /** + * Getter for field: + * {@snippet lang=c : + * long long __clang_max_align_nonce1 + * } + */ + public static long __clang_max_align_nonce1(MemorySegment struct) { + return struct.get(__clang_max_align_nonce1$LAYOUT, __clang_max_align_nonce1$OFFSET); + } + + /** + * Setter for field: + * {@snippet lang=c : + * long long __clang_max_align_nonce1 + * } + */ + public static void __clang_max_align_nonce1(MemorySegment struct, long fieldValue) { + struct.set(__clang_max_align_nonce1$LAYOUT, __clang_max_align_nonce1$OFFSET, fieldValue); + } + + /** + * Obtains a slice of {@code arrayParam} which selects the array element at {@code index}. + * The returned segment has address {@code arrayParam.address() + index * layout().byteSize()} + */ + public static MemorySegment asSlice(MemorySegment array, long index) { + return array.asSlice(layout().byteSize() * index); + } + + /** + * The size (in bytes) of this struct + */ + public static long sizeof() { return layout().byteSize(); } + + /** + * Allocate a segment of size {@code layout().byteSize()} using {@code allocator} + */ + public static MemorySegment allocate(SegmentAllocator allocator) { + return allocator.allocate(layout()); + } + + /** + * Allocate an array of size {@code elementCount} using {@code allocator}. + * The returned segment has size {@code elementCount * layout().byteSize()}. + */ + public static MemorySegment allocateArray(long elementCount, SegmentAllocator allocator) { + return allocator.allocate(MemoryLayout.sequenceLayout(elementCount, layout())); + } + + /** + * Reinterprets {@code addr} using target {@code arena} and {@code cleanupAction} (if any). + * The returned segment has size {@code layout().byteSize()} + */ + public static MemorySegment reinterpret(MemorySegment addr, Arena arena, Consumer cleanup) { + return reinterpret(addr, 1, arena, cleanup); + } + + /** + * Reinterprets {@code addr} using target {@code arena} and {@code cleanupAction} (if any). + * The returned segment has size {@code elementCount * layout().byteSize()} + */ + public static MemorySegment reinterpret(MemorySegment addr, long elementCount, Arena arena, Consumer cleanup) { + return addr.reinterpret(layout().byteSize() * elementCount, arena, cleanup); + } +} + diff --git a/jvector-native/src/test/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOpsTest.java b/jvector-native/src/test/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOpsTest.java deleted file mode 100644 index 68dfca187..000000000 --- a/jvector-native/src/test/java/io/github/jbellis/jvector/vector/cnative/NativeSimdOpsTest.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright DataStax, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.github.jbellis.jvector.vector.cnative; - -import org.junit.Assume; -import org.junit.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; - -import static org.junit.Assert.assertEquals; - -public class NativeSimdOpsTest { - - /** - * Reads /proc/cpuinfo and returns true if all AVX-512 flags required by - * check_avx512_compatibility() are present: avx512f, avx512cd, avx512dq, - * avx512bw, avx512vl. - */ - private static boolean cpuinfoReportsAvx512() throws IOException { - List lines = Files.readAllLines(Path.of("/proc/cpuinfo")); - List required = List.of("avx512f", "avx512cd", "avx512dq", "avx512bw", "avx512vl"); - for (String line : lines) { - if (line.startsWith("flags")) { - String[] flags = line.split("\\s+"); - List flagList = List.of(flags); - return flagList.containsAll(required); - } - } - return false; - } - - @Test - public void testCheckAvx512CompatibilityMatchesCpuinfo() throws IOException { - boolean libraryLoaded = LibraryLoader.loadJvector(); - Assume.assumeTrue("Native jvector library not available; skipping AVX-512 check", libraryLoaded); - - boolean expectedFromCpuinfo = cpuinfoReportsAvx512(); - boolean actualFromNative = NativeSimdOps.check_avx512_compatibility(); - - assertEquals( - "check_avx512_compatibility() should match AVX-512 flag presence in /proc/cpuinfo", - expectedFromCpuinfo, - actualFromNative); - } -} diff --git a/rat-excludes.txt b/rat-excludes.txt index 436c97822..d12b98e8d 100644 --- a/rat-excludes.txt +++ b/rat-excludes.txt @@ -9,6 +9,7 @@ package.json .github/workflows/run-bench.yml .mvn/wrapper/maven-wrapper.properties .mvn/jvm.config +.gitmodules README.md UPGRADING.md CHANGELOG.md @@ -31,4 +32,8 @@ docs/**/*.md yaml-configs/**/*.md local_datasets/** **/datasets/** +jvector-native/src/main/c/build/** +jvector-native/src/main/c/third_party/** +src/main/c/build/** +src/main/c/third_party/**