From 43821b00a8d6800a861a8a255413247555a2394d Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sun, 7 Jun 2026 18:21:18 -0400 Subject: [PATCH] update fastpfor and build setup --- .github/workflows/buildwheels.yml | 83 + .gitignore | 32 + CMakeLists.txt | 23 +- README.md | 4 +- cmake_modules/DetectCPUFeatures.cmake | 12 +- headers/VarIntG8IU.h | 7 +- headers/blockpacking.h | 1 + headers/codecfactory.h | 45 +- headers/codecs.h | 4 + headers/common.h | 10 + headers/cpubenchmark.h | 43 +- headers/deltautil.h | 12 +- headers/externalvector.h | 6 +- headers/fastpfor.h | 6 +- headers/fastpfor_neon.h | 284 ++++ headers/horizontalbitpacking.h | 2 +- headers/newpfor.h | 2 +- headers/simdfastpfor.h | 141 +- headers/simdgroupsimple.h | 2117 ++++++++++++++++--------- headers/simdnewpfor.h | 2 +- headers/simdvariablebyte.h | 10 +- headers/simple16.h | 9 +- headers/simple8b.h | 6 +- headers/simple8b_rle.h | 4 +- headers/simple9.h | 27 +- headers/simple9_rle.h | 4 +- headers/streamvariablebyte.h | 6 +- headers/synthetic.h | 6 +- headers/util.h | 6 + headers/vsencoding.h | 2 +- python_bindings/MANIFEST.in | 5 +- python_bindings/pyproject.toml | 64 + python_bindings/setup.py | 84 +- src/benchbitpacking.cpp | 2 - src/codecfactory.cpp | 227 ++- src/codecs.cpp | 82 +- src/horizontalbitpacking.cpp | 322 ++-- src/simdbitpacking.cpp | 68 +- src/simdunalignedbitpacking.cpp | 68 +- src/streamvbyte.c | 47 +- src/varintdecode.c | 45 +- 41 files changed, 2696 insertions(+), 1234 deletions(-) create mode 100644 .github/workflows/buildwheels.yml create mode 100644 .gitignore create mode 100644 headers/fastpfor_neon.h create mode 100644 python_bindings/pyproject.toml diff --git a/.github/workflows/buildwheels.yml b/.github/workflows/buildwheels.yml new file mode 100644 index 0000000..d6a9e2d --- /dev/null +++ b/.github/workflows/buildwheels.yml @@ -0,0 +1,83 @@ +name: Build and upload wheels + +on: + push: + tags: + - 'v[0-9]+\.[0-9]+\.[0-9]+' + workflow_dispatch: # Allows manual triggering + +# The Python project lives in the python_bindings/ subdirectory and reaches the +# C++ sources through a `fastpfor` symlink to the repository root. That symlink +# is fine for local builds but does not survive being copied into cibuildwheel's +# build containers (and is not preserved on Windows checkouts). To stay portable +# we build a self-contained sdist first (it bundles the real header/source +# files) and build every wheel from that extracted sdist. +jobs: + build_sdist: + name: Build source distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + + - name: Build sdist + run: uv build --sdist + working-directory: python_bindings + + - uses: actions/upload-artifact@v4 + with: + name: sdist-artifact + path: python_bindings/dist/*.tar.gz + if-no-files-found: error + + build_wheels: + name: Build wheels on ${{ matrix.os }} + needs: build_sdist + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + # Native runners for each architecture we publish: x86_64 Linux, + # aarch64 Linux, x86_64 macOS, Apple Silicon macOS, and x86_64 Windows. + os: [ubuntu-latest, ubuntu-24.04-arm, macos-13, macos-latest, windows-latest] + + steps: + - uses: actions/download-artifact@v4 + with: + name: sdist-artifact + path: dist + + - name: Unpack sdist + shell: bash + run: | + mkdir -p sdist_src + tar -xzf dist/*.tar.gz -C sdist_src --strip-components=1 + + - name: Build wheels + uses: pypa/cibuildwheel@v3.3.1 + with: + package-dir: sdist_src + + - uses: actions/upload-artifact@v4 + with: + name: wheel-artifact-${{ matrix.os }} + path: ./wheelhouse/*.whl + if-no-files-found: error + + upload_pypi: + needs: [build_wheels, build_sdist] + runs-on: ubuntu-latest + # Only upload to PyPI when triggered by a tag (not manual workflow_dispatch). + if: startsWith(github.ref, 'refs/tags/') + steps: + - uses: actions/download-artifact@v4 + with: + path: dist + merge-multiple: true + + - uses: pypa/gh-action-pypi-publish@v1.13.0 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..26344f5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,32 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C/C++ extensions and objects +*.so +*.o +*.a + +# Distribution / packaging +.Python +env/ +venv/ +build/ +develop-eggs/ +dist/ +eggs/ +.eggs/ +sdist/ +var/ +wheels/ +wheelhouse/ +*.egg-info/ +*.egg + +# CMake build directories +cmake-build-*/ + +# Editor / OS cruft +.DS_Store +*.swp diff --git a/CMakeLists.txt b/CMakeLists.txt index c2244ce..83b34f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ # will be re-used by the Python build process # 2) Build and run unit and performance tests. # -cmake_minimum_required(VERSION 2.8.7) +cmake_minimum_required(VERSION 3.10) if (NOT CMAKE_BUILD_TYPE) message(STATUS "No build type selected, default to Release") set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) @@ -54,8 +54,10 @@ MESSAGE( STATUS "CMAKE_C_COMPILER: " ${CMAKE_C_COMPILER} ) MESSAGE( STATUS "CXX_COMPILER_VERSION: " ${CXX_COMPILER_VERSION} ) if( SUPPORT_SSE42 ) MESSAGE( STATUS "SSE 4.2 support detected" ) +elseif( SUPPORT_NEON ) + MESSAGE( STATUS "ARM NEON detected: SSE intrinsics are emulated via fastpfor_neon.h" ) else() - MESSAGE( STATUS "SSE 4.2 support not detected" ) + MESSAGE( STATUS "Neither SSE 4.2 nor ARM NEON support detected" ) endif() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") @@ -79,14 +81,21 @@ elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") set (CMAKE_CXX_FLAGS_DEBUG "-Wall -ggdb -std=c++11 -DHAVE_CXX0X -march=native") set (CMAKE_C_FLAGS_RELEASE "-Wall -Ofast -DNDEBUG -std=c99 -march=native") set (CMAKE_C_FLAGS_DEBUG "-Wall -ggdb -std=c99 -march=native") -elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") +elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") if (CXX_COMPILER_VERSION VERSION_LESS 4.2.1) message(STATUS "Clang version must be at least 4.2.1!" ) endif() - set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native") - set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native") - set (CMAKE_C_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c99 -msse4.1 -march=native") - set (CMAKE_C_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c99 -msse4.1 -march=native") + # -msse4.1 is x86-only and rejected on ARM; -march=native already enables the + # available SIMD (SSE/AVX on x86, NEON on ARM), so only add it off ARM. + if (SUPPORT_NEON) + set (SIMD_FLAGS "-march=native") + else() + set (SIMD_FLAGS "-msse4.1 -march=native") + endif() + set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X ${SIMD_FLAGS}") + set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c++11 -DHAVE_CXX0X ${SIMD_FLAGS}") + set (CMAKE_C_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c99 ${SIMD_FLAGS}") + set (CMAKE_C_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c99 ${SIMD_FLAGS}") elseif(WIN32) # TODO add support for later versions? if(NOT MSVC12) diff --git a/README.md b/README.md index 431f455..64e98cd 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ or via pip: ``` pip install pyfastpfor ``` -Due to some compilation quirks this currently seem to work with GCC only. I will fix it in some not so distant future. You may also need to install Python dev-files. On Ubuntu, for Python 3 you can do it as follows: +The bindings build with GCC or Clang, on both x86-64 (SSE/AVX) and ARM/aarch64 (NEON, including Apple Silicon). You may also need to install Python dev-files. On Ubuntu, for Python 3 you can do it as follows: ``` sudo apt-get install python3-dev @@ -28,7 +28,7 @@ sudo apt-get install python3-dev # Documentation -The library supports all the codecs implemented in the original [FastPFor](https://github.com/lemire/FastPFor) library by July 2023. To get a list of codecs, use the function ``getCodecList``. +The library supports all the codecs implemented in the original [FastPFor](https://github.com/lemire/FastPFor) library (v0.5.0). To get a list of codecs, use the function ``getCodecList``. Typical light-weight compression does not take context into account and, consequently, works well only for small integers. When integers are large, data differencing is a common trick to make integers small. In particular, we often deal with sorted lists of integers, which can be represented by differences between neighboring numbers. diff --git a/cmake_modules/DetectCPUFeatures.cmake b/cmake_modules/DetectCPUFeatures.cmake index 412f53b..6724eb4 100644 --- a/cmake_modules/DetectCPUFeatures.cmake +++ b/cmake_modules/DetectCPUFeatures.cmake @@ -47,7 +47,17 @@ else() check_cxx_source_compiles("${AVXPROG}" SUPPORT_AVX) set(CMAKE_REQUIRED_FLAGS "-march=native -mavx2") check_cxx_source_compiles("${AVX2PROG}" SUPPORT_AVX2) -endif() +endif() set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) + +# On ARM the SSE intrinsics used throughout FastPFor are provided by the +# fastpfor_neon.h shim (mapped onto NEON), so there is no SSE4.2 support but the +# code still compiles. Flag ARM builds so the SIMD-specific compiler options +# (e.g. -msse4.1) can be skipped. +if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm" + OR CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" + OR CMAKE_GENERATOR_PLATFORM MATCHES "ARM64" OR CMAKE_GENERATOR_PLATFORM MATCHES "ARM") + set(SUPPORT_NEON ON) +endif () diff --git a/headers/VarIntG8IU.h b/headers/VarIntG8IU.h index c7d772a..222ff5b 100644 --- a/headers/VarIntG8IU.h +++ b/headers/VarIntG8IU.h @@ -2,7 +2,7 @@ * This code is released under the * Apache License Version 2.0 http://www.apache.org/licenses/. */ -#if !defined(__SSSE3__) && !(defined(_MSC_VER) && defined(__AVX__)) +#if (!defined(__SSSE3__) && !(defined(_MSC_VER) && defined(__AVX__))) && !(defined(__ARM_NEON) || defined(__aarch64__)) #ifndef _MSC_VER #pragma message \ "Disabling varintg8iu due to lack of SSSE3 support, try adding -mssse3 or the equivalent on your compiler" @@ -12,7 +12,12 @@ #else #ifndef VARINTG8IU_H__ #define VARINTG8IU_H__ +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #include +#elif defined(__aarch64__) +/* GCC-compatible compiler, targeting ARM with native NEON */ +#include "fastpfor_neon.h" +#endif #include "codecs.h" #ifdef __GNUC__ #define PREDICT_FALSE(x) (__builtin_expect(x, 0)) diff --git a/headers/blockpacking.h b/headers/blockpacking.h index 20f032b..2d88076 100644 --- a/headers/blockpacking.h +++ b/headers/blockpacking.h @@ -286,6 +286,7 @@ class ByteAlignedPacking : public IntegerCODEC { const uint32_t *decodeArray(const uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue) { + (void)length; const uint32_t actuallength = *in++; const uint8_t *inbyte = reinterpret_cast(in); const uint32_t *const initout(out); diff --git a/headers/codecfactory.h b/headers/codecfactory.h index 1a3bf32..3021d4e 100644 --- a/headers/codecfactory.h +++ b/headers/codecfactory.h @@ -21,15 +21,54 @@ class CODECFactory { public: CODECFactory(); - std::vector> allSchemes(); + std::vector> allSchemes() const; - std::vector allNames(); + std::vector allNames() const; - std::shared_ptr &getFromName(std::string name); + std::shared_ptr const& getFromName(std::string name) const; private: CodecMap scodecmap; }; +std::unique_ptr fastbinarypacking8_codec(); +std::unique_ptr fastbinarypacking16_codec(); +std::unique_ptr fastbinarypacking32_codec(); +std::unique_ptr BP32_codec(); +std::unique_ptr vsencoding_codec(); +std::unique_ptr fastpfor128_codec(); +std::unique_ptr fastpfor256_codec(); +std::unique_ptr simdfastpfor128_codec(); +std::unique_ptr simdfastpfor256_codec(); +std::unique_ptr simplepfor_codec(); +std::unique_ptr simdsimplepfor_codec(); +std::unique_ptr pfor_codec(); +std::unique_ptr simdpfor_codec(); +std::unique_ptr pfor2008_codec(); +std::unique_ptr simdnewpfor_codec(); +std::unique_ptr newpfor_codec(); +std::unique_ptr optpfor_codec(); +std::unique_ptr simdoptpfor_codec(); +std::unique_ptr varint_codec(); +std::unique_ptr vbyte_codec(); +std::unique_ptr maskedvbyte_codec(); +std::unique_ptr streamvbyte_codec(); +std::unique_ptr varintgb_codec(); +std::unique_ptr simple16_codec(); +std::unique_ptr simple9_codec(); +std::unique_ptr simple9_rle_codec(); +std::unique_ptr simple8b_codec(); +std::unique_ptr simple8b_rle_codec(); +#ifdef VARINTG8IU_H__ +std::unique_ptr varintg8iu_codec(); +#endif +#ifdef USESNAPPY +std::unique_ptr snappy_codec(); +#endif +std::unique_ptr simdbinarypacking_codec(); +std::unique_ptr simdgroupsimple_codec(); +std::unique_ptr simdgroupsimple_ringbuf_codec(); +std::unique_ptr copy_codec(); + } // namespace FastPForLib #endif /* CODECFACTORY_H_ */ diff --git a/headers/codecs.h b/headers/codecs.h index adfcec1..a8681c5 100644 --- a/headers/codecs.h +++ b/headers/codecs.h @@ -53,6 +53,10 @@ class IntegerCODEC { * of the variable nvalue gets updated with the number actually use * (if nvalue exceeds the original value, there might be a buffer * overrun). + * + * NOTE: Decoding can be performed with an unknown input length. This + * case is indicated by a length of 0; however, nvalue must be provided + * in order for the decoder knows how many values to decode. */ virtual const uint32_t *decodeArray(const uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue) = 0; diff --git a/headers/common.h b/headers/common.h index f741c5b..9827997 100644 --- a/headers/common.h +++ b/headers/common.h @@ -10,7 +10,12 @@ // C headers (sorted) #include #include +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #include +#elif defined(__GNUC__) && defined(__aarch64__) +#include "fastpfor_neon.h" +#endif + #include #include #include @@ -44,7 +49,12 @@ #ifdef _MSC_VER #include #include + +#if (defined(_M_IX86) || defined(_M_AMD64)) #include +#elif defined(_M_ARM64) +#include "fastpfor_neon.h" +#endif #define __attribute__(n) #define __restrict__ __restrict diff --git a/headers/cpubenchmark.h b/headers/cpubenchmark.h index 5fbfdc7..ea5ebf7 100644 --- a/headers/cpubenchmark.h +++ b/headers/cpubenchmark.h @@ -42,11 +42,34 @@ static __inline__ unsigned long long stopRDTSCP(void) { "%rdx"); return (static_cast(cycles_high) << 32) | cycles_low; } -#elif defined(_MSC_VER) +#elif (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) static inline unsigned long long startRDTSC(void) { return __rdtsc(); } static inline unsigned long long stopRDTSCP(void) { return __rdtsc(); } +#elif defined(_MSC_VER) && defined(_M_ARM64) + // oriented by zeromq implementation for msc arm/arm64 + // https://github.com/zeromq/libzmq/blob/master/src/clock.cpp + inline unsigned long long rdtsc() { + const int64_t pmccntr_el0 = (((3 & 1) << 14) | // op0 + ((3 & 7) << 11) | // op1 + ((9 & 15) << 7) | // crn + ((13 & 15) << 3) | // crm + ((0 & 7) << 0)); // op2 + + return _ReadStatusReg (pmccntr_el0); + } + + static inline unsigned long long startRDTSC(void) { return rdtsc(); } + + static inline unsigned long long stopRDTSCP(void) { return rdtsc(); } +#elif (defined(_MSC_VER) && (defined(_M_ARM64))) +// Taken from microsoft documentation (see +// https://learn.microsoft.com/en-us/cpp/build/overview-of-arm-abi-conventions?view=msvc-170 + +static inline unsigned long long startRDTSC(void) { return __rdpmccntr64(); } + +static inline unsigned long long stopRDTSCP(void) { return __rdpmccntr64(); } #elif defined(__i386__) || defined(__x86_64__) @@ -66,15 +89,25 @@ inline unsigned long long rdtsc() { static __inline__ unsigned long long startRDTSC(void) { return rdtsc(); } static __inline__ unsigned long long stopRDTSCP(void) { return rdtsc(); } +#elif (defined(__GNUC__) && (defined(__aarch64__))) + inline uint64_t rdtsc() { + uint64_t cycles; + asm volatile("mrs %0, cntvct_el0" + : "=r"(cycles)); /* output */ + return cycles; + } + + static __inline__ uint64_t startRDTSC(void) { return rdtsc(); } -#elif(defined(__arm__) || defined(__ppc__) || defined(__ppc64__)) + static __inline__ uint64_t stopRDTSCP(void) { return rdtsc(); } +#elif(defined(__arm__) || defined(__ppc__) || defined(__ppc64__)) || (defined(_MSC_VER) && defined(_M_ARM64)) // for PPC we should be able to use tbl, but I could not find // an equivalent to rdtsc for ARM. -inline uint64 rdtsc() { return 0; } -static __inline__ ticks startRDTSC(void) { return 0; } -static __inline__ ticks stopRDTSCP(void) { return 0; } +inline uint64_t rdtsc() { return 0; } +static __inline__ uint64_t startRDTSC(void) { return 0; } +static __inline__ uint64_t stopRDTSCP(void) { return 0; } #else #error Unknown architecture #endif diff --git a/headers/deltautil.h b/headers/deltautil.h index 789d492..37ebfa8 100644 --- a/headers/deltautil.h +++ b/headers/deltautil.h @@ -25,7 +25,7 @@ namespace FastPForLib { */ struct algostats { - algostats(std::shared_ptr &a, bool simd = false) + algostats(const std::shared_ptr &a, bool simd = false) : algo(a), deltaspeed(), compspeed(), decompspeed(), inversedeltaspeed(), bitsperint(), deltatime(0), comptime(0), decomptime(0), inversedeltatime(0), output(), input(), SIMDDeltas(simd) {} @@ -493,11 +493,11 @@ class Delta { << recoveredsize << std::endl; throw std::logic_error("arrays don't have same size: bug."); } - for (size_t i = 0; i < datas[k].size(); i++) { - if (datas[k][i] != recov[i]) { - std::cout << "difference at index " << i << ":" << std::endl; - std::cout << " expected: " << datas[k][i] << std::endl; - std::cout << " actual: " << recov[i] << std::endl; + for (size_t j = 0; j < datas[k].size(); j++) { + if (datas[k][j] != recov[j]) { + std::cout << "difference at index " << j << ":" << std::endl; + std::cout << " expected: " << datas[k][j] << std::endl; + std::cout << " actual: " << recov[j] << std::endl; throw std::logic_error("we have a bug"); } } diff --git a/headers/externalvector.h b/headers/externalvector.h index e8a7533..a3eee73 100644 --- a/headers/externalvector.h +++ b/headers/externalvector.h @@ -310,7 +310,7 @@ class externalvector { void loadACopy(std::vector &buffer, size_t begin, size_t end) const { buffer.resize(end - begin); - int result = fseek(fd, begin * sizeofdata * sizeof(Type), SEEK_SET); + int result = fseek(fd, (long)(begin * sizeofdata * sizeof(Type)), SEEK_SET); if (result != 0) { std::cerr << "could not seek to " << begin << std::endl; throw std::runtime_error("bad seek"); @@ -335,7 +335,7 @@ class externalvector { throw std::runtime_error("file not open"); } DataType ans(sizeofdata); - int result = fseek(fd, pos * sizeofdata * sizeof(Type), SEEK_SET); + int result = fseek(fd, (long)(pos * sizeofdata * sizeof(Type)), SEEK_SET); if (result != 0) { std::cerr << "could not seek to " << pos << std::endl; throw std::runtime_error("bad seek"); @@ -367,7 +367,7 @@ class externalvector { } void copyAt(const std::vector &buffer, size_t begin) { - int result = fseek(fd, begin * sizeofdata * sizeof(Type), SEEK_SET); + int result = fseek(fd, (long)(begin * sizeofdata * sizeof(Type)), SEEK_SET); if (result != 0) { std::cerr << "could not seek to " << begin << std::endl; throw std::runtime_error("bad seek"); diff --git a/headers/fastpfor.h b/headers/fastpfor.h index a086eef..7d0eb6a 100644 --- a/headers/fastpfor.h +++ b/headers/fastpfor.h @@ -236,11 +236,11 @@ class FastPForImpl { inexcept += (sizeof(IntType) + sizeof(uint32_t) - 1) / sizeof(uint32_t); for (uint32_t k = 2; k <= sizeof(IntType) * 8; ++k) { if ((bitmap & (1ULL << (k - 1))) != 0) { - uint32_t nvalue = *inexcept; - datatobepacked[k].resize((nvalue + PACKSIZE - 1) / PACKSIZE * PACKSIZE); + uint32_t tnvalue = *inexcept; + datatobepacked[k].resize((tnvalue + PACKSIZE - 1) / PACKSIZE * PACKSIZE); inexcept = packingvector<32>::unpackmetight( inexcept, datatobepacked[k].data(), datatobepacked[k].size(), k); - datatobepacked[k].resize(nvalue); + datatobepacked[k].resize(tnvalue); } } length = inexcept - initin; diff --git a/headers/fastpfor_neon.h b/headers/fastpfor_neon.h new file mode 100644 index 0000000..a211766 --- /dev/null +++ b/headers/fastpfor_neon.h @@ -0,0 +1,284 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire + */ + +/** + * Native ARM NEON implementations of the (small) subset of x86 SSE intrinsics + * used by FastPFOR. This replaces the SIMDe emulation layer on aarch64 targets: + * every operation below maps directly to native NEON instructions. + * + * The mappings follow the well-known SSE->NEON correspondences (the same ones + * used by projects such as sse2neon). Shift-by-amount operations are expressed + * with the NEON variable-shift instruction so they accept both compile-time + * constants (which the compiler folds to immediate shifts) and runtime counts. + * + * This header is valid in both C99 and C++ so it can be shared by the C codecs + * (streamvbyte.c, varintdecode.c) and the C++ headers. + */ +#ifndef FASTPFOR_NEON_H_ +#define FASTPFOR_NEON_H_ + +#if !(defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64))) +#error "fastpfor_neon.h is only for ARM (aarch64 / ARM64) targets" +#endif + +#include +#include +#include + +typedef int64x2_t __m128i; +typedef float32x4_t __m128; + +/* ----------------------------- load / store ----------------------------- */ + +static inline __m128i _mm_loadu_si128(const __m128i *p) { + return vld1q_s64((const int64_t *)p); +} +static inline __m128i _mm_lddqu_si128(const __m128i *p) { + return vld1q_s64((const int64_t *)p); +} +static inline __m128i _mm_load_si128(const __m128i *p) { + return vld1q_s64((const int64_t *)p); +} +static inline __m128i _mm_loadl_epi64(const __m128i *p) { + return vcombine_s64(vld1_s64((const int64_t *)p), vdup_n_s64(0)); +} +static inline void _mm_storeu_si128(__m128i *p, __m128i a) { + vst1q_s64((int64_t *)p, a); +} +static inline void _mm_store_si128(__m128i *p, __m128i a) { + vst1q_s64((int64_t *)p, a); +} +static inline void _mm_stream_si128(__m128i *p, __m128i a) { + vst1q_s64((int64_t *)p, a); +} +static inline void _mm_storel_epi64(__m128i *p, __m128i a) { + vst1_s64((int64_t *)p, vget_low_s64(a)); +} + +/* --------------------------------- set ---------------------------------- */ + +static inline __m128i _mm_setzero_si128(void) { return vdupq_n_s64(0); } +static inline __m128i _mm_set1_epi32(int a) { + return vreinterpretq_s64_s32(vdupq_n_s32(a)); +} +static inline __m128i _mm_set1_epi16(short a) { + return vreinterpretq_s64_s16(vdupq_n_s16(a)); +} +static inline __m128i _mm_set1_epi8(signed char a) { + return vreinterpretq_s64_s8(vdupq_n_s8(a)); +} +static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0) { + int64_t d[2]; + d[0] = e0; + d[1] = e1; + return vld1q_s64(d); +} +static inline __m128i +_mm_set_epi8(signed char e15, signed char e14, signed char e13, signed char e12, + signed char e11, signed char e10, signed char e9, signed char e8, + signed char e7, signed char e6, signed char e5, signed char e4, + signed char e3, signed char e2, signed char e1, signed char e0) { + int8_t d[16]; + d[0] = e0; d[1] = e1; d[2] = e2; d[3] = e3; + d[4] = e4; d[5] = e5; d[6] = e6; d[7] = e7; + d[8] = e8; d[9] = e9; d[10] = e10; d[11] = e11; + d[12] = e12; d[13] = e13; d[14] = e14; d[15] = e15; + return vreinterpretq_s64_s8(vld1q_s8(d)); +} +static inline __m128i +_mm_setr_epi8(signed char e0, signed char e1, signed char e2, signed char e3, + signed char e4, signed char e5, signed char e6, signed char e7, + signed char e8, signed char e9, signed char e10, signed char e11, + signed char e12, signed char e13, signed char e14, + signed char e15) { + int8_t d[16]; + d[0] = e0; d[1] = e1; d[2] = e2; d[3] = e3; + d[4] = e4; d[5] = e5; d[6] = e6; d[7] = e7; + d[8] = e8; d[9] = e9; d[10] = e10; d[11] = e11; + d[12] = e12; d[13] = e13; d[14] = e14; d[15] = e15; + return vreinterpretq_s64_s8(vld1q_s8(d)); +} +static inline __m128i _mm_setr_epi16(short e0, short e1, short e2, short e3, + short e4, short e5, short e6, short e7) { + int16_t d[8]; + d[0] = e0; d[1] = e1; d[2] = e2; d[3] = e3; + d[4] = e4; d[5] = e5; d[6] = e6; d[7] = e7; + return vreinterpretq_s64_s16(vld1q_s16(d)); +} + +/* ----------------------------- bitwise / arith -------------------------- */ + +static inline __m128i _mm_and_si128(__m128i a, __m128i b) { + return vandq_s64(a, b); +} +static inline __m128i _mm_or_si128(__m128i a, __m128i b) { + return vorrq_s64(a, b); +} +static inline __m128i _mm_add_epi32(__m128i a, __m128i b) { + return vreinterpretq_s64_s32( + vaddq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} +static inline __m128i _mm_sub_epi32(__m128i a, __m128i b) { + return vreinterpretq_s64_s32( + vsubq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} +static inline __m128i _mm_mullo_epi32(__m128i a, __m128i b) { + return vreinterpretq_s64_s32( + vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} +static inline __m128i _mm_mullo_epi16(__m128i a, __m128i b) { + return vreinterpretq_s64_s16( + vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))); +} + +/* --------------------------------- shifts ------------------------------- */ +/* Variable-shift form: accepts runtime counts; the compiler lowers a + * constant count to a native immediate shift. A right shift is a left shift + * by a negative amount (NEON semantics); counts >= element width yield 0, + * matching SSE. */ + +static inline __m128i _mm_slli_epi32(__m128i a, int imm) { + return vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(imm))); +} +static inline __m128i _mm_srli_epi32(__m128i a, int imm) { + return vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-imm))); +} +static inline __m128i _mm_srli_epi16(__m128i a, int imm) { + return vreinterpretq_s64_u16( + vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16((int16_t)-imm))); +} +static inline __m128i _mm_slli_epi64(__m128i a, int imm) { + return vreinterpretq_s64_u64( + vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(imm))); +} +static inline __m128i _mm_srli_epi64(__m128i a, int imm) { + return vreinterpretq_s64_u64( + vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-imm))); +} + +/* Whole-register byte shifts. The byte count is always a compile-time + * constant in FastPFOR, so vextq_u8 (which needs an immediate) is used. */ +#define _mm_srli_si128(a, imm) \ + vreinterpretq_s64_u8( \ + vextq_u8(vreinterpretq_u8_s64(a), vdupq_n_u8(0), (imm))) +#define _mm_slli_si128(a, imm) \ + vreinterpretq_s64_u8( \ + vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s64(a), (16 - (imm)))) + +/* ------------------------------- compares ------------------------------- */ + +static inline __m128i _mm_cmplt_epi32(__m128i a, __m128i b) { + return vreinterpretq_s64_u32( + vcltq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} +static inline __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) { + return vreinterpretq_s64_u8( + vceqq_u8(vreinterpretq_u8_s64(a), vreinterpretq_u8_s64(b))); +} + +/* ------------------------------- shuffles ------------------------------- */ + +static inline __m128i _mm_shuffle_epi8(__m128i a, __m128i b) { + /* pshufb: a byte of the index with its high bit set produces 0. Masking the + * index with 0x8F leaves the low nibble and the high bit; vqtbl1q_u8 then + * yields 0 for any index >= 16 (i.e. when the high bit was set). */ + uint8x16_t tbl = vreinterpretq_u8_s64(a); + uint8x16_t idx = vandq_u8(vreinterpretq_u8_s64(b), vdupq_n_u8(0x8F)); + return vreinterpretq_s64_u8(vqtbl1q_u8(tbl, idx)); +} +static inline __m128i _mm_shuffle_epi32(__m128i a, const int imm) { + /* Permute the four 32-bit lanes. `imm` is a compile-time constant at every + * call site, so the broadcast fast paths fold away and the general case + * builds a constant byte index that compiles to a single TBL. */ + uint32x4_t v = vreinterpretq_u32_s64(a); + if (imm == 0x00) + return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 0)); + if (imm == 0x55) + return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 1)); + if (imm == 0xAA) + return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 2)); + if (imm == 0xFF) + return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 3)); + { + const uint8_t b0 = (uint8_t)((imm & 3) * 4); + const uint8_t b1 = (uint8_t)(((imm >> 2) & 3) * 4); + const uint8_t b2 = (uint8_t)(((imm >> 4) & 3) * 4); + const uint8_t b3 = (uint8_t)(((imm >> 6) & 3) * 4); + const uint8x16_t idx = { + b0, (uint8_t)(b0 + 1), (uint8_t)(b0 + 2), (uint8_t)(b0 + 3), + b1, (uint8_t)(b1 + 1), (uint8_t)(b1 + 2), (uint8_t)(b1 + 3), + b2, (uint8_t)(b2 + 1), (uint8_t)(b2 + 2), (uint8_t)(b2 + 3), + b3, (uint8_t)(b3 + 1), (uint8_t)(b3 + 2), (uint8_t)(b3 + 3)}; + return vreinterpretq_s64_u8(vqtbl1q_u8(vreinterpretq_u8_s64(a), idx)); + } +} +static inline __m128i _mm_blend_epi16(__m128i a, __m128i b, const int imm) { + /* Per-16-bit-lane select from `a` (0) or `b` (1). `imm` is constant at every + * call site, so this compound-literal mask folds to a constant vector load + * feeding a single BSL, with no stack round-trip. */ + const uint16x8_t mask = { + (imm & 0x01) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x02) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x04) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x08) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x10) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x20) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x40) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x80) ? (uint16_t)0xFFFF : (uint16_t)0}; + return vreinterpretq_s64_u16(vbslq_u16(mask, vreinterpretq_u16_s64(b), + vreinterpretq_u16_s64(a))); +} + +/* --------------------------- extract / convert -------------------------- */ + +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_s64(a), (imm)) + +static inline int _mm_cvtsi128_si32(__m128i a) { + return vgetq_lane_s32(vreinterpretq_s32_s64(a), 0); +} +static inline __m128i _mm_cvtepu8_epi16(__m128i a) { + return vreinterpretq_s64_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s64(a)))); +} +static inline __m128i _mm_cvtepu16_epi32(__m128i a) { + return vreinterpretq_s64_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_s64(a)))); +} +static inline __m128i _mm_cvtepi8_epi32(__m128i a) { + int16x8_t t16 = vmovl_s8(vget_low_s8(vreinterpretq_s8_s64(a))); + return vreinterpretq_s64_s32(vmovl_s16(vget_low_s16(t16))); +} + +/* -------------------------------- masks --------------------------------- */ + +static inline int _mm_movemask_epi8(__m128i a) { + uint8x16_t input = vreinterpretq_u8_s64(a); + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + return vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8); +} + +/* --------------------------------- float -------------------------------- */ + +static inline __m128 _mm_castsi128_ps(__m128i a) { + return vreinterpretq_f32_s64(a); +} +static inline int _mm_movemask_ps(__m128 a) { + static const int32_t shifts[4] = {0, 1, 2, 3}; + uint32x4_t signs = vshrq_n_u32(vreinterpretq_u32_f32(a), 31); + uint32x4_t weighted = vshlq_u32(signs, vld1q_s32(shifts)); + return (int)vaddvq_u32(weighted); +} + +#endif /* FASTPFOR_NEON_H_ */ diff --git a/headers/horizontalbitpacking.h b/headers/horizontalbitpacking.h index 88bbf5c..90cef10 100644 --- a/headers/horizontalbitpacking.h +++ b/headers/horizontalbitpacking.h @@ -21,8 +21,8 @@ #ifndef HORIZONTALBITPACKING_H_ #define HORIZONTALBITPACKING_H_ -#if !defined(__SSE4_1__) && !(defined(_MSC_VER) && defined(__AVX__)) +#if (!defined(__SSE4_1__) && !(defined(_MSC_VER) && defined(__AVX__))) && (!(defined(__ARM_NEON) || defined(__aarch64__))) #ifndef _MSC_VER #pragma message "No SSSE4.1 support? try adding -msse4.1 or the equivalent on your compiler" #else diff --git a/headers/newpfor.h b/headers/newpfor.h index 74f8842..15b1e1d 100644 --- a/headers/newpfor.h +++ b/headers/newpfor.h @@ -280,7 +280,7 @@ NewPFor::decodeBlock( out += 32; } - for (uint32_t e = 0, lpos = -1; e < nExceptions; e++) { + for (uint32_t e = 0, lpos = 0u-1; e < nExceptions; e++) { lpos += exceptions[e] + 1; beginout[lpos] |= (exceptions[e + nExceptions] + 1) << b; } diff --git a/headers/simdfastpfor.h b/headers/simdfastpfor.h index 29e8179..275b5c6 100644 --- a/headers/simdfastpfor.h +++ b/headers/simdfastpfor.h @@ -7,13 +7,14 @@ #ifndef SIMDFASTPFOR_H_ #define SIMDFASTPFOR_H_ -#include "codecs.h" #include "common.h" -#include "memutil.h" +#include "codecs.h" #include "simdbitpacking.h" -#include "simple8b.h" #include "usimdbitpacking.h" +#include "memutil.h" #include "util.h" +#include "simple8b.h" +#include "usimdbitpacking.h" namespace FastPForLib { @@ -40,11 +41,12 @@ namespace FastPForLib { * patented. * */ -template // BlockSizeInUnitsOfPackSize can have value 4 or 8 +template // BlockSizeInUnitsOfPackSize can have value 4 or 8 class SIMDFastPFor : public IntegerCODEC { public: - using IntegerCODEC::decodeArray; using IntegerCODEC::encodeArray; + using IntegerCODEC::decodeArray; /** * ps (page size) should be a multiple of BlockSize, any "large" @@ -64,7 +66,8 @@ class SIMDFastPFor : public IntegerCODEC { BlockSize = BlockSizeInUnitsOfPackSize * PACKSIZE }; - static uint32_t *packblockupsimd(const uint32_t *source, uint32_t *out, const uint32_t bit) { + static uint32_t *packblockupsimd(const uint32_t *source, uint32_t *out, + const uint32_t bit) { for (int k = 0; k < BlockSize; k += 128) { SIMD_fastpack_32(source, reinterpret_cast<__m128i *>(out), bit); out += 4 * bit; @@ -73,7 +76,8 @@ class SIMDFastPFor : public IntegerCODEC { return out; } - static const uint32_t *unpackblocksimd(const uint32_t *source, uint32_t *out, const uint32_t bit) { + static const uint32_t *unpackblocksimd(const uint32_t *source, uint32_t *out, + const uint32_t bit) { for (int k = 0; k < BlockSize; k += 128) { SIMD_fastunpack_32(reinterpret_cast(source), out, bit); source += 4 * bit; @@ -83,7 +87,8 @@ class SIMDFastPFor : public IntegerCODEC { } template - static const uint32_t *unpackmesimd(const uint32_t *in, STLContainer &out, const uint32_t bit) { + static const uint32_t *unpackmesimd(const uint32_t *in, STLContainer &out, + const uint32_t bit) { const uint32_t size = *in; ++in; out.resize((size + 32 - 1) / 32 * 32); @@ -111,7 +116,8 @@ class SIMDFastPFor : public IntegerCODEC { } template - static uint32_t *packmeupwithoutmasksimd(STLContainer &source, uint32_t *out, const uint32_t bit) { + static uint32_t *packmeupwithoutmasksimd(STLContainer &source, uint32_t *out, + const uint32_t bit) { const uint32_t size = static_cast(source.size()); *out = size; out++; @@ -162,7 +168,8 @@ class SIMDFastPFor : public IntegerCODEC { const uint32_t *const finalout(out + nvalue); while (out != finalout) { size_t thisnvalue(0); - size_t thissize = static_cast(finalout > PageSize + out ? PageSize : (finalout - out)); + size_t thissize = static_cast( + finalout > PageSize + out ? PageSize : (finalout - out)); __decodeArray(in, thisnvalue, out, thissize); in += thisnvalue; @@ -181,7 +188,8 @@ class SIMDFastPFor : public IntegerCODEC { * BlockSizeInUnitsOfPackSize * PACKSIZE. (This was done * to simplify slightly the implementation.) */ - void encodeArray(const uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue) override { + void encodeArray(const uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) override { checkifdivisibleby(length, BlockSize); #ifndef NDEBUG const uint32_t *const initout(out); @@ -192,7 +200,8 @@ class SIMDFastPFor : public IntegerCODEC { const size_t oldnvalue = nvalue; nvalue = 1; while (in != finalin) { - size_t thissize = static_cast(finalin > PageSize + in ? PageSize : (finalin - in)); + size_t thissize = static_cast( + finalin > PageSize + in ? PageSize : (finalin - in)); size_t thisnvalue(0); __encodeArray(in, thissize, out, thisnvalue); nvalue += thisnvalue; @@ -202,16 +211,17 @@ class SIMDFastPFor : public IntegerCODEC { assert(out == nvalue + initout); if (oldnvalue < nvalue) fprintf(stderr, - "It is possible we have a buffer overrun. You reported having allocated " - "%zu bytes for the compressed data but we needed " - "%zu bytes. Please increase the available memory " - "for compressed data or check the value of the last parameter provided " - "to the encodeArray method.\n", - oldnvalue * sizeof(uint32_t), nvalue * sizeof(uint32_t)); + "It is possible we have a buffer overrun. You reported having allocated " + "%zu bytes for the compressed data but we needed " + "%zu bytes. Please increase the available memory " + "for compressed data or check the value of the last parameter provided " + "to the encodeArray method.\n", + oldnvalue * sizeof(uint32_t), nvalue * sizeof(uint32_t)); resetBuffer(); // if you don't do this, the buffer has a memory } - void getBestBFromData(const uint32_t *in, uint8_t &bestb, uint8_t &bestcexcept, uint8_t &maxb) { + void getBestBFromData(const uint32_t *in, uint8_t &bestb, + uint8_t &bestcexcept, uint8_t &maxb) { uint32_t freqs[33]; for (uint32_t k = 0; k <= 32; ++k) freqs[k] = 0; @@ -227,7 +237,8 @@ class SIMDFastPFor : public IntegerCODEC { bestcexcept = static_cast(cexcept); for (uint32_t b = bestb - 1; b < 32; --b) { cexcept += freqs[b + 1]; - uint32_t thiscost = cexcept * overheadofeachexcept + cexcept * (maxb - b) + b * BlockSize + + uint32_t thiscost = cexcept * overheadofeachexcept + + cexcept * (maxb - b) + b * BlockSize + 8; // the extra 8 is the cost of storing maxbits if (thiscost < bestcost) { bestcost = thiscost; @@ -237,21 +248,24 @@ class SIMDFastPFor : public IntegerCODEC { } } - void __encodeArray(const uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue) { + void __encodeArray(const uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { uint32_t *const initout = out; // keep track of this checkifdivisibleby(length, BlockSize); uint32_t *const headerout = out++; // keep track of this for (uint32_t k = 0; k < 32 + 1; ++k) datatobepacked[k].clear(); uint8_t *bc = &bytescontainer[0]; - for (const uint32_t *const final = in + length; (in + BlockSize <= final); in += BlockSize) { + for (const uint32_t *const final = in + length; (in + BlockSize <= final); + in += BlockSize) { uint8_t bestb, bestcexcept, maxb; getBestBFromData(in, bestb, bestcexcept, maxb); *bc++ = bestb; *bc++ = bestcexcept; if (bestcexcept > 0) { *bc++ = maxb; - std::vector &thisexceptioncontainer = datatobepacked[maxb - bestb]; + std::vector &thisexceptioncontainer = + datatobepacked[maxb - bestb]; const uint32_t maxval = 1U << bestb; for (uint32_t k = 0; k < BlockSize; ++k) { if (in[k] >= maxval) { @@ -264,13 +278,14 @@ class SIMDFastPFor : public IntegerCODEC { out = packblockupsimd(in, out, bestb); } headerout[0] = static_cast(out - headerout); - const uint32_t bytescontainersize = static_cast(bc - &bytescontainer[0]); + const uint32_t bytescontainersize = + static_cast(bc - &bytescontainer[0]); *(out++) = bytescontainersize; memcpy(out, &bytescontainer[0], bytescontainersize); - uint8_t *pad8 = (uint8_t *)out + bytescontainersize; + uint8_t* pad8 = (uint8_t*)out + bytescontainersize; out += (bytescontainersize + sizeof(uint32_t) - 1) / sizeof(uint32_t); - while (pad8 < (uint8_t *)out) - *pad8++ = 0; // clear padding bytes + while (pad8 < (uint8_t*)out) + *pad8++ = 0; // clear padding bytes uint32_t bitmap = 0; for (uint32_t k = 2; k <= 32; ++k) { @@ -285,7 +300,8 @@ class SIMDFastPFor : public IntegerCODEC { nvalue = out - initout; } - void __decodeArray(const uint32_t *in, size_t &length, uint32_t *out, const size_t nvalue) { + void __decodeArray(const uint32_t *in, size_t &length, uint32_t *out, + const size_t nvalue) { const uint32_t *const initin = in; const uint32_t *const headerin = in++; const uint32_t wheremeta = headerin[0]; @@ -300,7 +316,8 @@ class SIMDFastPFor : public IntegerCODEC { } } length = inexcept - initin; - std::vector::const_iterator unpackpointers[32 + 1]; + std::vector::const_iterator + unpackpointers[32 + 1]; for (uint32_t k = 1; k <= 32; ++k) { unpackpointers[k] = datatobepacked[k].begin(); } @@ -316,7 +333,8 @@ class SIMDFastPFor : public IntegerCODEC { out[pos] |= static_cast(1) << b; } } else { - std::vector::const_iterator &exceptionsptr = unpackpointers[maxbits - b]; + std::vector::const_iterator &exceptionsptr = + unpackpointers[maxbits - b]; for (uint32_t k = 0; k < cexcept; ++k) { const uint8_t pos = *(bytep++); out[pos] |= (*(exceptionsptr++)) << b; @@ -327,7 +345,9 @@ class SIMDFastPFor : public IntegerCODEC { assert(in == headerin + wheremeta); } - std::string name() const override { return std::string("SIMDFastPFor") + std::to_string(BlockSize); } + std::string name() const override { + return std::string("SIMDFastPFor") + std::to_string(BlockSize); + } }; /** @@ -343,10 +363,11 @@ class SIMDFastPFor : public IntegerCODEC { * http://arxiv.org/abs/1209.2137 * */ -template > class SIMDSimplePFor : public IntegerCODEC { +template > +class SIMDSimplePFor : public IntegerCODEC { public: - using IntegerCODEC::decodeArray; using IntegerCODEC::encodeArray; + using IntegerCODEC::decodeArray; EXCEPTIONCODER ecoder; /** @@ -354,7 +375,8 @@ template > class SIMDSimplePFor : public I * value should do. */ SIMDSimplePFor(uint32_t ps = 65536) - : ecoder(), PageSize(ps), bitsPageSize(gccbits(PageSize)), datatobepacked(PageSize), + : ecoder(), PageSize(ps), bitsPageSize(gccbits(PageSize)), + datatobepacked(PageSize), bytescontainer(PageSize + 3 * PageSize / BlockSize) { assert(ps / BlockSize * BlockSize == ps); assert(gccbits(static_cast(BlockSizeInUnitsOfPackSize * PACKSIZE - 1)) <= 8); @@ -374,7 +396,8 @@ template > class SIMDSimplePFor : public I std::vector datatobepacked; std::vector bytescontainer; - const uint32_t *decodeArray(const uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue) override { + const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) override { const uint32_t *const initin(in); const size_t mynvalue = *in; ++in; @@ -384,7 +407,8 @@ template > class SIMDSimplePFor : public I const uint32_t *const finalout(out + nvalue); while (out != finalout) { size_t thisnvalue = length - (in - initin); - size_t thissize = static_cast(finalout > PageSize + out ? PageSize : (finalout - out)); + size_t thissize = static_cast( + finalout > PageSize + out ? PageSize : (finalout - out)); __decodeArray(in, thisnvalue, out, thissize); in += thisnvalue; @@ -399,7 +423,8 @@ template > class SIMDSimplePFor : public I * BlockSizeInUnitsOfPackSize * PACKSIZE. (This was done * to simplify slightly the implementation.) */ - void encodeArray(const uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue) override { + void encodeArray(const uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) override { checkifdivisibleby(length, BlockSize); const uint32_t *const initout(out); const uint32_t *const finalin(in + length); @@ -408,7 +433,8 @@ template > class SIMDSimplePFor : public I const size_t oldnvalue = nvalue; nvalue = 1; while (in != finalin) { - size_t thissize = static_cast(finalin > PageSize + in ? PageSize : (finalin - in)); + size_t thissize = static_cast( + finalin > PageSize + in ? PageSize : (finalin - in)); size_t thisnvalue = oldnvalue - (out - initout); __encodeArray(in, thissize, out, thisnvalue); nvalue += thisnvalue; @@ -418,15 +444,16 @@ template > class SIMDSimplePFor : public I assert(out == nvalue + initout); if (oldnvalue < nvalue) fprintf(stderr, - "It is possible we have a buffer overrun. You reported having allocated " - "%zu bytes for the compressed data but we needed " - "%zu bytes. Please increase the available memory " - "for compressed data or check the value of the last parameter provided " - "to the encodeArray method.\n", - oldnvalue * sizeof(uint32_t), nvalue * sizeof(uint32_t)); + "It is possible we have a buffer overrun. You reported having allocated " + "%zu bytes for the compressed data but we needed " + "%zu bytes. Please increase the available memory " + "for compressed data or check the value of the last parameter provided " + "to the encodeArray method.\n", + oldnvalue * sizeof(uint32_t), nvalue * sizeof(uint32_t)); } - void getBestBFromData(const uint32_t *in, uint8_t &bestb, uint8_t &bestcexcept, uint8_t &maxb) { + void getBestBFromData(const uint32_t *in, uint8_t &bestb, + uint8_t &bestcexcept, uint8_t &maxb) { uint32_t freqs[33]; for (uint32_t k = 0; k <= 32; ++k) freqs[k] = 0; @@ -442,7 +469,8 @@ template > class SIMDSimplePFor : public I bestcexcept = static_cast(cexcept); for (uint32_t b = bestb - 1; b < 32; --b) { cexcept += freqs[b + 1]; - uint32_t thiscost = cexcept * overheadofeachexcept + cexcept * (maxb - b) + b * BlockSize; + uint32_t thiscost = + cexcept * overheadofeachexcept + cexcept * (maxb - b) + b * BlockSize; if (thiscost < bestcost) { bestcost = thiscost; bestb = static_cast(b); @@ -451,13 +479,15 @@ template > class SIMDSimplePFor : public I } } - void __encodeArray(const uint32_t *in, const size_t length, uint32_t *out, size_t &nvalue) { + void __encodeArray(const uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { uint32_t *const initout = out; // keep track of this checkifdivisibleby(length, BlockSize); uint32_t *const headerout = out++; // keep track of this datatobepacked.clear(); uint8_t *bc = &bytescontainer[0]; - for (const uint32_t *const final = in + length; (in + BlockSize <= final); in += BlockSize) { + for (const uint32_t *const final = in + length; (in + BlockSize <= final); + in += BlockSize) { uint8_t bestb, bestcexcept, maxb; getBestBFromData(in, bestb, bestcexcept, maxb); *bc++ = bestb; @@ -478,21 +508,24 @@ template > class SIMDSimplePFor : public I // out = packblockup(in, out, bestb); } headerout[0] = static_cast(out - headerout); - const uint32_t bytescontainersize = static_cast(bc - &bytescontainer[0]); + const uint32_t bytescontainersize = + static_cast(bc - &bytescontainer[0]); *(out++) = bytescontainersize; memcpy(out, &bytescontainer[0], bytescontainersize); - uint8_t *pad8 = (uint8_t *)out + bytescontainersize; + uint8_t* pad8 = (uint8_t*)out + bytescontainersize; out += (bytescontainersize + sizeof(uint32_t) - 1) / sizeof(uint32_t); - while (pad8 < (uint8_t *)out) - *pad8++ = 0; // clear padding bytes + while (pad8 < (uint8_t*)out) + *pad8++ = 0; // clear padding bytes size_t outcap = 0; - ecoder.encodeArray(datatobepacked.data(), datatobepacked.size(), out, outcap); + ecoder.encodeArray(datatobepacked.data(), datatobepacked.size(), out, + outcap); out += outcap; nvalue = out - initout; } - void __decodeArray(const uint32_t *in, size_t &length, uint32_t *out, const size_t nvalue) { + void __decodeArray(const uint32_t *in, size_t &length, uint32_t *out, + const size_t nvalue) { const uint32_t *const initin = in; const uint32_t *const headerin = in++; const uint32_t wheremeta = headerin[0]; diff --git a/headers/simdgroupsimple.h b/headers/simdgroupsimple.h index c210e47..501f29e 100644 --- a/headers/simdgroupsimple.h +++ b/headers/simdgroupsimple.h @@ -99,796 +99,1353 @@ namespace FastPForLib { * be compressed is a multiple of four, so it should be used with * CompositeCodec. */ -template -class SIMDGroupSimple : public IntegerCODEC { -public: - using IntegerCODEC::encodeArray; - using IntegerCODEC::decodeArray; - - // Tell CompositeCodec that this implementation can only handle input sizes - // which are multiples of four. - static const uint32_t BlockSize = 4; - - // The header consists of three 32-bit integers. - static const uint32_t countHeader32 = 3; - - // Lookup table. Key: a selector, value: the number of quads to be packed - // into one compressed block with the specified selector. - static const uint8_t tableNum[]; - // Lookup table. Key: a selector, value: the mask required in the pattern - // selection algorithm. Note that unlike in the paper, we look up the mask - // directly instead of the bit width. - static const uint32_t tableMask[]; - - /** - * Extracts the pos-th 4-bit selector from the selectors area, which starts - * at inSelArea8. Note that, as described in the original paper, two - * selectors are stored in each byte in the selectors area. - */ - inline static uint8_t extractSel(const uint8_t *const &inSelArea8, - const size_t &pos) { - // We either need to extract the lower or the upper four bits of the - // respective selector byte. - return (pos & 1) - ? ((inSelArea8[pos >> 1]) >> 4) - : ((inSelArea8[pos >> 1]) & 0b1111); - } - - /** - * Utility function to calculate the number of padding bytes needed after the - * selectors area in order to guarantee the 16-byte alignment required for - * SSE-store instructions in the data area. - */ - inline static size_t getCountPadBytes(const size_t &countSelArea8) { - const size_t offset = (countHeader32 * sizeof(uint32_t) + - countSelArea8 + sizeof(uint8_t)) % sizeof(__m128i); - return offset ? (sizeof(__m128i) - offset) : 0; - } - - /** - * This function is used to compress the n quads, i.e. 4x n integers, in the - * last input block, if that last block is not "full". Note that this - * function is called at most once per array to compress. Hence, top - * efficiency is not that crucial here. - */ - inline static void comprIncompleteBlock(const uint8_t &n, const __m128i *&in, - __m128i *&out) { - // Since we have to produce exactly one compressed vector anyway, we can - // use the highest bit width allowing us to pack all n values. - const unsigned b = 32 / n; - __m128i comprBlock = _mm_loadu_si128(in++); - for (size_t k = 1; k < n; k++) - comprBlock = _mm_or_si128(comprBlock, - _mm_slli_epi32(_mm_loadu_si128(in++), k * b)); - _mm_storeu_si128(out++, comprBlock); - } - - /** - * The following ten functions pack a certain amount of uncompressed data. - * The function unrolledPacking_#n_#b packs #n quads, i.e., 4x #n integers, - * into one 128-bit compressed block. - */ - - inline static __m128i unrolledPacking_32_1(const __m128i *&in) { - __m128i res = _mm_loadu_si128(in++); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 1)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 2)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 3)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 4)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 5)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 6)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 7)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 8)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 9)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 10)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 11)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 12)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 13)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 14)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 15)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 16)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 17)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 18)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 19)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 20)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 21)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 22)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 23)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 25)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 26)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 27)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 28)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 29)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 30)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 31)); - return res; - } - - inline static __m128i unrolledPacking_16_2(const __m128i *&in) { - __m128i res = _mm_loadu_si128(in++); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 2)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 4)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 6)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 8)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 10)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 12)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 14)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 16)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 18)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 20)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 22)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 26)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 28)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 30)); - return res; - } - - inline static __m128i unrolledPacking_10_3(const __m128i *&in) { - __m128i res = _mm_loadu_si128(in++); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 3)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 6)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 9)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 12)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 15)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 18)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 21)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 27)); - return res; - } - - inline static __m128i unrolledPacking_8_4(const __m128i *&in) { - __m128i res = _mm_loadu_si128(in++); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 4)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 8)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 12)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 16)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 20)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 28)); - return res; - } - - inline static __m128i unrolledPacking_6_5(const __m128i *&in) { - __m128i res = _mm_loadu_si128(in++); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 5)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 10)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 15)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 20)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 25)); - return res; - } - - inline static __m128i unrolledPacking_5_6(const __m128i *&in) { - __m128i res = _mm_loadu_si128(in++); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 6)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 12)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 18)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); - return res; - } - - inline static __m128i unrolledPacking_4_8(const __m128i *&in) { - __m128i res = _mm_loadu_si128(in++); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 8)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 16)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); - return res; - } - - inline static __m128i unrolledPacking_3_10(const __m128i *&in) { - __m128i res = _mm_loadu_si128(in++); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 10)); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 20)); - return res; - } - - inline static __m128i unrolledPacking_2_16(const __m128i *&in) { - __m128i res = _mm_loadu_si128(in++); - res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 16)); - return res; - } - - inline static __m128i unrolledPacking_1_32(const __m128i *&in) { - return _mm_loadu_si128(in++); - } - - /** - * Compresses n quads, i.e. 4x n integers. Thereby, n must correspond to one - * of the ten compression modes presented in the original paper. - */ - inline static void comprCompleteBlock(const uint8_t &n, const __m128i *&in, - __m128i *&out) { - __m128i res = _mm_setzero_si128(); - - // In the following, b means the bit width. - - switch (n) { - case 32: // b = 1 - res = unrolledPacking_32_1(in); - break; - case 16: // b = 2 - res = unrolledPacking_16_2(in); - break; - case 10: // b = 3 - res = unrolledPacking_10_3(in); - break; - case 8: // b = 4 - res = unrolledPacking_8_4(in); - break; - case 6: // b = 5 - res = unrolledPacking_6_5(in); - break; - case 5: // b = 6 - res = unrolledPacking_5_6(in); - break; - case 4: // b = 8 - res = unrolledPacking_4_8(in); - break; - case 3: // b = 10 - res = unrolledPacking_3_10(in); - break; - case 2: // b = 16 - res = unrolledPacking_2_16(in); - break; - case 1: // b = 32 - res = unrolledPacking_1_32(in); - break; - } - - _mm_storeu_si128(out++, res); - } - - /** - * This function is used to decompress the n quads, i.e. 4x n integers, in - * the last input block, if that last block is not "full". Note that this - * function is called at most once per array to decompress. Hence, top - * efficiency is not that crucial here. - */ - inline static void decomprIncompleteBlock(const uint8_t &n, + template + class SIMDGroupSimple : public IntegerCODEC { + public: + using IntegerCODEC::encodeArray; + using IntegerCODEC::decodeArray; + + // Tell CompositeCodec that this implementation can only handle input sizes + // which are multiples of four. + static const uint32_t BlockSize = 4; + + // The header consists of three 32-bit integers. + static const uint32_t countHeader32 = 3; + + // Lookup table. Key: a selector, value: the number of quads to be packed + // into one compressed block with the specified selector. + static const uint8_t tableNum[]; + // Lookup table. Key: a selector, value: the mask required in the pattern + // selection algorithm. Note that unlike in the paper, we look up the mask + // directly instead of the bit width. + static const uint32_t tableMask[]; + + /** + * Extracts the pos-th 4-bit selector from the selectors area, which starts + * at inSelArea8. Note that, as described in the original paper, two + * selectors are stored in each byte in the selectors area. + */ + inline static uint8_t extractSel(const uint8_t *const &inSelArea8, + const size_t &pos) { + // We either need to extract the lower or the upper four bits of the + // respective selector byte. + return (pos & 1) + ? ((inSelArea8[pos >> 1]) >> 4) + : ((inSelArea8[pos >> 1]) & 0b1111); + } + + /** + * Utility function to calculate the number of padding bytes needed after the + * selectors area in order to guarantee the 16-byte alignment required for + * SSE-store instructions in the data area. + */ + inline static size_t getCountPadBytes(const size_t &countSelArea8) { + const size_t offset = (countHeader32 * sizeof(uint32_t) + + countSelArea8 + sizeof(uint8_t)) % sizeof(__m128i); + return offset ? (sizeof(__m128i) - offset) : 0; + } + + /** + * This function is used to compress the n quads, i.e. 4x n integers, in the + * last input block, if that last block is not "full". Note that this + * function is called at most once per array to compress. Hence, top + * efficiency is not that crucial here. + */ + +#if (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) + + inline static void comprIncompleteBlock(const uint8_t &n, const __m128i *&in, + __m128i *&out) { + // Since we have to produce exactly one compressed vector anyway, we can + // use the highest bit width allowing us to pack all n values. + const unsigned b = 32 / n; + __m128i comprBlock = _mm_loadu_si128(in++); + for (size_t k = 1; k < n; k++) + comprBlock = _mm_or_si128(comprBlock, + _mm_slli_epi32(_mm_loadu_si128(in++), (int)(k * b))); + _mm_storeu_si128(out++, comprBlock); + } + +#elif (defined(__GNUC__) && (defined(__aarch64__))) || (defined(_MSC_VER) && defined(_M_ARM64)) + inline static void comprIncompleteBlock(const uint8_t &n, const __m128i *&in, + __m128i *&out) { + // Since we have to produce exactly one compressed vector anyway, we can + // use the highest bit width allowing us to pack all n values. + const unsigned b = 32 / n; + __m128i comprBlock = _mm_load_si128(in++); + for (size_t k = 1; k < n; k++) + comprBlock = _mm_or_si128(comprBlock, + mm_slli_epi32_unrolled(_mm_load_si128(in++), (unsigned int)(k * b))); + _mm_store_si128(out++, comprBlock); + } + + inline static __m128i mm_slli_epi32_unrolled(__m128i comprBlock, unsigned int n) { + switch (n) { + case 0: return _mm_slli_epi32(comprBlock, 0); + case 1: return _mm_slli_epi32(comprBlock, 1); + case 2: return _mm_slli_epi32(comprBlock, 2); + case 3: return _mm_slli_epi32(comprBlock, 3); + case 4: return _mm_slli_epi32(comprBlock, 4); + case 5: return _mm_slli_epi32(comprBlock, 5); + case 6: return _mm_slli_epi32(comprBlock, 6); + case 7: return _mm_slli_epi32(comprBlock, 7); + case 8: return _mm_slli_epi32(comprBlock, 8); + case 9: return _mm_slli_epi32(comprBlock, 9); + case 10: return _mm_slli_epi32(comprBlock, 10); + case 11: return _mm_slli_epi32(comprBlock, 11); + case 12: return _mm_slli_epi32(comprBlock, 12); + case 13: return _mm_slli_epi32(comprBlock, 13); + case 14: return _mm_slli_epi32(comprBlock, 14); + case 15: return _mm_slli_epi32(comprBlock, 15); + case 16: return _mm_slli_epi32(comprBlock, 16); + case 17: return _mm_slli_epi32(comprBlock, 17); + case 18: return _mm_slli_epi32(comprBlock, 18); + case 19: return _mm_slli_epi32(comprBlock, 19); + case 20: return _mm_slli_epi32(comprBlock, 20); + case 21: return _mm_slli_epi32(comprBlock, 21); + case 22: return _mm_slli_epi32(comprBlock, 22); + case 23: return _mm_slli_epi32(comprBlock, 23); + case 24: return _mm_slli_epi32(comprBlock, 24); + case 25: return _mm_slli_epi32(comprBlock, 25); + case 26: return _mm_slli_epi32(comprBlock, 26); + case 27: return _mm_slli_epi32(comprBlock, 27); + case 28: return _mm_slli_epi32(comprBlock, 28); + case 29: return _mm_slli_epi32(comprBlock, 29); + case 30: return _mm_slli_epi32(comprBlock, 30); + case 31: return _mm_slli_epi32(comprBlock, 31); + case 32: return _mm_slli_epi32(comprBlock, 32); + case 33: return _mm_slli_epi32(comprBlock, 33); + case 34: return _mm_slli_epi32(comprBlock, 34); + case 35: return _mm_slli_epi32(comprBlock, 35); + case 36: return _mm_slli_epi32(comprBlock, 36); + case 37: return _mm_slli_epi32(comprBlock, 37); + case 38: return _mm_slli_epi32(comprBlock, 38); + case 39: return _mm_slli_epi32(comprBlock, 39); + case 40: return _mm_slli_epi32(comprBlock, 40); + case 41: return _mm_slli_epi32(comprBlock, 41); + case 42: return _mm_slli_epi32(comprBlock, 42); + case 43: return _mm_slli_epi32(comprBlock, 43); + case 44: return _mm_slli_epi32(comprBlock, 44); + case 45: return _mm_slli_epi32(comprBlock, 45); + case 46: return _mm_slli_epi32(comprBlock, 46); + case 47: return _mm_slli_epi32(comprBlock, 47); + case 48: return _mm_slli_epi32(comprBlock, 48); + case 49: return _mm_slli_epi32(comprBlock, 49); + case 50: return _mm_slli_epi32(comprBlock, 50); + case 51: return _mm_slli_epi32(comprBlock, 51); + case 52: return _mm_slli_epi32(comprBlock, 52); + case 53: return _mm_slli_epi32(comprBlock, 53); + case 54: return _mm_slli_epi32(comprBlock, 54); + case 55: return _mm_slli_epi32(comprBlock, 55); + case 56: return _mm_slli_epi32(comprBlock, 56); + case 57: return _mm_slli_epi32(comprBlock, 57); + case 58: return _mm_slli_epi32(comprBlock, 58); + case 59: return _mm_slli_epi32(comprBlock, 59); + case 60: return _mm_slli_epi32(comprBlock, 60); + case 61: return _mm_slli_epi32(comprBlock, 61); + case 62: return _mm_slli_epi32(comprBlock, 62); + case 63: return _mm_slli_epi32(comprBlock, 63); + case 64: return _mm_slli_epi32(comprBlock, 64); + case 65: return _mm_slli_epi32(comprBlock, 65); + case 66: return _mm_slli_epi32(comprBlock, 66); + case 67: return _mm_slli_epi32(comprBlock, 67); + case 68: return _mm_slli_epi32(comprBlock, 68); + case 69: return _mm_slli_epi32(comprBlock, 69); + case 70: return _mm_slli_epi32(comprBlock, 70); + case 71: return _mm_slli_epi32(comprBlock, 71); + case 72: return _mm_slli_epi32(comprBlock, 72); + case 73: return _mm_slli_epi32(comprBlock, 73); + case 74: return _mm_slli_epi32(comprBlock, 74); + case 75: return _mm_slli_epi32(comprBlock, 75); + case 76: return _mm_slli_epi32(comprBlock, 76); + case 77: return _mm_slli_epi32(comprBlock, 77); + case 78: return _mm_slli_epi32(comprBlock, 78); + case 79: return _mm_slli_epi32(comprBlock, 79); + case 80: return _mm_slli_epi32(comprBlock, 80); + case 81: return _mm_slli_epi32(comprBlock, 81); + case 82: return _mm_slli_epi32(comprBlock, 82); + case 83: return _mm_slli_epi32(comprBlock, 83); + case 84: return _mm_slli_epi32(comprBlock, 84); + case 85: return _mm_slli_epi32(comprBlock, 85); + case 86: return _mm_slli_epi32(comprBlock, 86); + case 87: return _mm_slli_epi32(comprBlock, 87); + case 88: return _mm_slli_epi32(comprBlock, 88); + case 89: return _mm_slli_epi32(comprBlock, 89); + case 90: return _mm_slli_epi32(comprBlock, 90); + case 91: return _mm_slli_epi32(comprBlock, 91); + case 92: return _mm_slli_epi32(comprBlock, 92); + case 93: return _mm_slli_epi32(comprBlock, 93); + case 94: return _mm_slli_epi32(comprBlock, 94); + case 95: return _mm_slli_epi32(comprBlock, 95); + case 96: return _mm_slli_epi32(comprBlock, 96); + case 97: return _mm_slli_epi32(comprBlock, 97); + case 98: return _mm_slli_epi32(comprBlock, 98); + case 99: return _mm_slli_epi32(comprBlock, 99); + case 100: return _mm_slli_epi32(comprBlock, 100); + case 101: return _mm_slli_epi32(comprBlock, 101); + case 102: return _mm_slli_epi32(comprBlock, 102); + case 103: return _mm_slli_epi32(comprBlock, 103); + case 104: return _mm_slli_epi32(comprBlock, 104); + case 105: return _mm_slli_epi32(comprBlock, 105); + case 106: return _mm_slli_epi32(comprBlock, 106); + case 107: return _mm_slli_epi32(comprBlock, 107); + case 108: return _mm_slli_epi32(comprBlock, 108); + case 109: return _mm_slli_epi32(comprBlock, 109); + case 110: return _mm_slli_epi32(comprBlock, 110); + case 111: return _mm_slli_epi32(comprBlock, 111); + case 112: return _mm_slli_epi32(comprBlock, 112); + case 113: return _mm_slli_epi32(comprBlock, 113); + case 114: return _mm_slli_epi32(comprBlock, 114); + case 115: return _mm_slli_epi32(comprBlock, 115); + case 116: return _mm_slli_epi32(comprBlock, 116); + case 117: return _mm_slli_epi32(comprBlock, 117); + case 118: return _mm_slli_epi32(comprBlock, 118); + case 119: return _mm_slli_epi32(comprBlock, 119); + case 120: return _mm_slli_epi32(comprBlock, 120); + case 121: return _mm_slli_epi32(comprBlock, 121); + case 122: return _mm_slli_epi32(comprBlock, 122); + case 123: return _mm_slli_epi32(comprBlock, 123); + case 124: return _mm_slli_epi32(comprBlock, 124); + case 125: return _mm_slli_epi32(comprBlock, 125); + case 126: return _mm_slli_epi32(comprBlock, 126); + case 127: return _mm_slli_epi32(comprBlock, 127); + case 128: return _mm_slli_epi32(comprBlock, 128); + case 129: return _mm_slli_epi32(comprBlock, 129); + case 130: return _mm_slli_epi32(comprBlock, 130); + case 131: return _mm_slli_epi32(comprBlock, 131); + case 132: return _mm_slli_epi32(comprBlock, 132); + case 133: return _mm_slli_epi32(comprBlock, 133); + case 134: return _mm_slli_epi32(comprBlock, 134); + case 135: return _mm_slli_epi32(comprBlock, 135); + case 136: return _mm_slli_epi32(comprBlock, 136); + case 137: return _mm_slli_epi32(comprBlock, 137); + case 138: return _mm_slli_epi32(comprBlock, 138); + case 139: return _mm_slli_epi32(comprBlock, 139); + case 140: return _mm_slli_epi32(comprBlock, 140); + case 141: return _mm_slli_epi32(comprBlock, 141); + case 142: return _mm_slli_epi32(comprBlock, 142); + case 143: return _mm_slli_epi32(comprBlock, 143); + case 144: return _mm_slli_epi32(comprBlock, 144); + case 145: return _mm_slli_epi32(comprBlock, 145); + case 146: return _mm_slli_epi32(comprBlock, 146); + case 147: return _mm_slli_epi32(comprBlock, 147); + case 148: return _mm_slli_epi32(comprBlock, 148); + case 149: return _mm_slli_epi32(comprBlock, 149); + case 150: return _mm_slli_epi32(comprBlock, 150); + case 151: return _mm_slli_epi32(comprBlock, 151); + case 152: return _mm_slli_epi32(comprBlock, 152); + case 153: return _mm_slli_epi32(comprBlock, 153); + case 154: return _mm_slli_epi32(comprBlock, 154); + case 155: return _mm_slli_epi32(comprBlock, 155); + case 156: return _mm_slli_epi32(comprBlock, 156); + case 157: return _mm_slli_epi32(comprBlock, 157); + case 158: return _mm_slli_epi32(comprBlock, 158); + case 159: return _mm_slli_epi32(comprBlock, 159); + case 160: return _mm_slli_epi32(comprBlock, 160); + case 161: return _mm_slli_epi32(comprBlock, 161); + case 162: return _mm_slli_epi32(comprBlock, 162); + case 163: return _mm_slli_epi32(comprBlock, 163); + case 164: return _mm_slli_epi32(comprBlock, 164); + case 165: return _mm_slli_epi32(comprBlock, 165); + case 166: return _mm_slli_epi32(comprBlock, 166); + case 167: return _mm_slli_epi32(comprBlock, 167); + case 168: return _mm_slli_epi32(comprBlock, 168); + case 169: return _mm_slli_epi32(comprBlock, 169); + case 170: return _mm_slli_epi32(comprBlock, 170); + case 171: return _mm_slli_epi32(comprBlock, 171); + case 172: return _mm_slli_epi32(comprBlock, 172); + case 173: return _mm_slli_epi32(comprBlock, 173); + case 174: return _mm_slli_epi32(comprBlock, 174); + case 175: return _mm_slli_epi32(comprBlock, 175); + case 176: return _mm_slli_epi32(comprBlock, 176); + case 177: return _mm_slli_epi32(comprBlock, 177); + case 178: return _mm_slli_epi32(comprBlock, 178); + case 179: return _mm_slli_epi32(comprBlock, 179); + case 180: return _mm_slli_epi32(comprBlock, 180); + case 181: return _mm_slli_epi32(comprBlock, 181); + case 182: return _mm_slli_epi32(comprBlock, 182); + case 183: return _mm_slli_epi32(comprBlock, 183); + case 184: return _mm_slli_epi32(comprBlock, 184); + case 185: return _mm_slli_epi32(comprBlock, 185); + case 186: return _mm_slli_epi32(comprBlock, 186); + case 187: return _mm_slli_epi32(comprBlock, 187); + case 188: return _mm_slli_epi32(comprBlock, 188); + case 189: return _mm_slli_epi32(comprBlock, 189); + case 190: return _mm_slli_epi32(comprBlock, 190); + case 191: return _mm_slli_epi32(comprBlock, 191); + case 192: return _mm_slli_epi32(comprBlock, 192); + case 193: return _mm_slli_epi32(comprBlock, 193); + case 194: return _mm_slli_epi32(comprBlock, 194); + case 195: return _mm_slli_epi32(comprBlock, 195); + case 196: return _mm_slli_epi32(comprBlock, 196); + case 197: return _mm_slli_epi32(comprBlock, 197); + case 198: return _mm_slli_epi32(comprBlock, 198); + case 199: return _mm_slli_epi32(comprBlock, 199); + case 200: return _mm_slli_epi32(comprBlock, 200); + case 201: return _mm_slli_epi32(comprBlock, 201); + case 202: return _mm_slli_epi32(comprBlock, 202); + case 203: return _mm_slli_epi32(comprBlock, 203); + case 204: return _mm_slli_epi32(comprBlock, 204); + case 205: return _mm_slli_epi32(comprBlock, 205); + case 206: return _mm_slli_epi32(comprBlock, 206); + case 207: return _mm_slli_epi32(comprBlock, 207); + case 208: return _mm_slli_epi32(comprBlock, 208); + case 209: return _mm_slli_epi32(comprBlock, 209); + case 210: return _mm_slli_epi32(comprBlock, 210); + case 211: return _mm_slli_epi32(comprBlock, 211); + case 212: return _mm_slli_epi32(comprBlock, 212); + case 213: return _mm_slli_epi32(comprBlock, 213); + case 214: return _mm_slli_epi32(comprBlock, 214); + case 215: return _mm_slli_epi32(comprBlock, 215); + case 216: return _mm_slli_epi32(comprBlock, 216); + case 217: return _mm_slli_epi32(comprBlock, 217); + case 218: return _mm_slli_epi32(comprBlock, 218); + case 219: return _mm_slli_epi32(comprBlock, 219); + case 220: return _mm_slli_epi32(comprBlock, 220); + case 221: return _mm_slli_epi32(comprBlock, 221); + case 222: return _mm_slli_epi32(comprBlock, 222); + case 223: return _mm_slli_epi32(comprBlock, 223); + case 224: return _mm_slli_epi32(comprBlock, 224); + case 225: return _mm_slli_epi32(comprBlock, 225); + case 226: return _mm_slli_epi32(comprBlock, 226); + case 227: return _mm_slli_epi32(comprBlock, 227); + case 228: return _mm_slli_epi32(comprBlock, 228); + case 229: return _mm_slli_epi32(comprBlock, 229); + case 230: return _mm_slli_epi32(comprBlock, 230); + case 231: return _mm_slli_epi32(comprBlock, 231); + case 232: return _mm_slli_epi32(comprBlock, 232); + case 233: return _mm_slli_epi32(comprBlock, 233); + case 234: return _mm_slli_epi32(comprBlock, 234); + case 235: return _mm_slli_epi32(comprBlock, 235); + case 236: return _mm_slli_epi32(comprBlock, 236); + case 237: return _mm_slli_epi32(comprBlock, 237); + case 238: return _mm_slli_epi32(comprBlock, 238); + case 239: return _mm_slli_epi32(comprBlock, 239); + case 240: return _mm_slli_epi32(comprBlock, 240); + case 241: return _mm_slli_epi32(comprBlock, 241); + case 242: return _mm_slli_epi32(comprBlock, 242); + case 243: return _mm_slli_epi32(comprBlock, 243); + case 244: return _mm_slli_epi32(comprBlock, 244); + case 245: return _mm_slli_epi32(comprBlock, 245); + case 246: return _mm_slli_epi32(comprBlock, 246); + case 247: return _mm_slli_epi32(comprBlock, 247); + case 248: return _mm_slli_epi32(comprBlock, 248); + case 249: return _mm_slli_epi32(comprBlock, 249); + case 250: return _mm_slli_epi32(comprBlock, 250); + case 251: return _mm_slli_epi32(comprBlock, 251); + case 252: return _mm_slli_epi32(comprBlock, 252); + case 253: return _mm_slli_epi32(comprBlock, 253); + case 254: return _mm_slli_epi32(comprBlock, 254); + case 255: return _mm_slli_epi32(comprBlock, 255); + default: return _mm_srli_epi32(comprBlock, 255); + } + } +#endif + + /** + * The following ten functions pack a certain amount of uncompressed data. + * The function unrolledPacking_#n_#b packs #n quads, i.e., 4x #n integers, + * into one 128-bit compressed block. + */ + + inline static __m128i unrolledPacking_32_1(const __m128i *&in) { + __m128i res = _mm_loadu_si128(in++); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 1)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 2)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 3)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 4)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 5)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 6)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 7)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 8)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 9)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 10)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 11)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 12)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 13)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 14)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 15)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 16)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 17)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 18)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 19)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 20)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 21)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 22)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 23)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 25)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 26)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 27)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 28)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 29)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 30)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 31)); + return res; + } + + inline static __m128i unrolledPacking_16_2(const __m128i *&in) { + __m128i res = _mm_loadu_si128(in++); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 2)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 4)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 6)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 8)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 10)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 12)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 14)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 16)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 18)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 20)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 22)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 26)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 28)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 30)); + return res; + } + + inline static __m128i unrolledPacking_10_3(const __m128i *&in) { + __m128i res = _mm_loadu_si128(in++); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 3)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 6)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 9)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 12)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 15)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 18)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 21)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 27)); + return res; + } + + inline static __m128i unrolledPacking_8_4(const __m128i *&in) { + __m128i res = _mm_loadu_si128(in++); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 4)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 8)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 12)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 16)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 20)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 28)); + return res; + } + + inline static __m128i unrolledPacking_6_5(const __m128i *&in) { + __m128i res = _mm_loadu_si128(in++); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 5)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 10)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 15)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 20)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 25)); + return res; + } + + inline static __m128i unrolledPacking_5_6(const __m128i *&in) { + __m128i res = _mm_loadu_si128(in++); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 6)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 12)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 18)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); + return res; + } + + inline static __m128i unrolledPacking_4_8(const __m128i *&in) { + __m128i res = _mm_loadu_si128(in++); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 8)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 16)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 24)); + return res; + } + + inline static __m128i unrolledPacking_3_10(const __m128i *&in) { + __m128i res = _mm_loadu_si128(in++); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 10)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 20)); + return res; + } + + inline static __m128i unrolledPacking_2_16(const __m128i *&in) { + __m128i res = _mm_loadu_si128(in++); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_loadu_si128(in++), 16)); + return res; + } + + inline static __m128i unrolledPacking_1_32(const __m128i *&in) { + return _mm_loadu_si128(in++); + } + + /** + * Compresses n quads, i.e. 4x n integers. Thereby, n must correspond to one + * of the ten compression modes presented in the original paper. + */ + inline static void comprCompleteBlock(const uint8_t &n, const __m128i *&in, + __m128i *&out) { + __m128i res = _mm_setzero_si128(); + + // In the following, b means the bit width. + + switch (n) { + case 32: // b = 1 + res = unrolledPacking_32_1(in); + break; + case 16: // b = 2 + res = unrolledPacking_16_2(in); + break; + case 10: // b = 3 + res = unrolledPacking_10_3(in); + break; + case 8: // b = 4 + res = unrolledPacking_8_4(in); + break; + case 6: // b = 5 + res = unrolledPacking_6_5(in); + break; + case 5: // b = 6 + res = unrolledPacking_5_6(in); + break; + case 4: // b = 8 + res = unrolledPacking_4_8(in); + break; + case 3: // b = 10 + res = unrolledPacking_3_10(in); + break; + case 2: // b = 16 + res = unrolledPacking_2_16(in); + break; + case 1: // b = 32 + res = unrolledPacking_1_32(in); + break; + } + + _mm_storeu_si128(out++, res); + } + + /** + * This function is used to decompress the n quads, i.e. 4x n integers, in + * the last input block, if that last block is not "full". Note that this + * function is called at most once per array to decompress. Hence, top + * efficiency is not that crucial here. + */ + +#if (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) + + inline static void decomprIncompleteBlock(const uint8_t &n, + const __m128i *&in, + __m128i *&out) { + // We choose the bit width consistent with comprIncompleteBlock(). + const unsigned b = 32 / n; + const __m128i mask = _mm_set1_epi32((static_cast(1) << b) - 1); + const __m128i comprBlock = _mm_loadu_si128(in++); + for (size_t k = 0; k < n; k++) + _mm_storeu_si128(out++, + _mm_and_si128(_mm_srli_epi32(comprBlock, (int)(k * b)), mask)); + } + +#elif (defined(__GNUC__) && (defined(__aarch64__))) || (defined(_MSC_VER) && defined(_M_ARM64)) + inline static void decomprIncompleteBlock(const uint8_t &n, const __m128i *&in, __m128i *&out) { // We choose the bit width consistent with comprIncompleteBlock(). const unsigned b = 32 / n; const __m128i mask = _mm_set1_epi32((static_cast(1) << b) - 1); - const __m128i comprBlock = _mm_loadu_si128(in++); + const __m128i comprBlock = _mm_load_si128(in++); for (size_t k = 0; k < n; k++) - _mm_storeu_si128(out++, - _mm_and_si128(_mm_srli_epi32(comprBlock, k * b), mask)); - } - - /** - * The following ten functions unpack a certain amount of compressed data. - * The function unrolledUnpacking_#n_#b unpacks #n quads, i.e., 4x #n - * integers, from one 128-bit compressed block. - */ - - inline static void unrolledUnpacking_32_1(const __m128i &comprBlock, - __m128i *&out) { - const __m128i mask = _mm_set1_epi32(1); - _mm_storeu_si128(out++, _mm_and_si128( comprBlock, mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 1), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 2), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 3), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 4), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 5), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 6), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 7), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 8), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 9), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 10), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 11), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 12), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 13), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 14), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 15), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 16), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 17), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 18), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 19), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 20), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 21), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 22), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 23), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 25), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 26), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 27), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 28), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 29), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 30), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 31), mask)); + _mm_store_si128(out++, + _mm_and_si128(mm_srli_epi32_unrolled(comprBlock, (unsigned int)(k * b)), mask)); } - inline static void unrolledUnpacking_16_2(const __m128i &comprBlock, - __m128i *&out) { - const __m128i mask = _mm_set1_epi32((static_cast(1) << 2) - 1); - _mm_storeu_si128(out++, _mm_and_si128( comprBlock , mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 2), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 4), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 6), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 8), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 10), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 12), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 14), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 16), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 18), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 20), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 22), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 26), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 28), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 30), mask)); + inline static __m128i mm_srli_epi32_unrolled(__m128i comprBlock, unsigned int n) { + switch (n) { + case 0:return _mm_srli_epi32(comprBlock, 0); + case 1:return _mm_srli_epi32(comprBlock, 1); + case 2:return _mm_srli_epi32(comprBlock, 2); + case 3:return _mm_srli_epi32(comprBlock, 3); + case 4:return _mm_srli_epi32(comprBlock, 4); + case 5:return _mm_srli_epi32(comprBlock, 5); + case 6:return _mm_srli_epi32(comprBlock, 6); + case 7:return _mm_srli_epi32(comprBlock, 7); + case 8:return _mm_srli_epi32(comprBlock, 8); + case 9:return _mm_srli_epi32(comprBlock, 9); + case 10:return _mm_srli_epi32(comprBlock, 10); + case 11:return _mm_srli_epi32(comprBlock, 11); + case 12:return _mm_srli_epi32(comprBlock, 12); + case 13:return _mm_srli_epi32(comprBlock, 13); + case 14:return _mm_srli_epi32(comprBlock, 14); + case 15:return _mm_srli_epi32(comprBlock, 15); + case 16:return _mm_srli_epi32(comprBlock, 16); + case 17:return _mm_srli_epi32(comprBlock, 17); + case 18:return _mm_srli_epi32(comprBlock, 18); + case 19:return _mm_srli_epi32(comprBlock, 19); + case 20:return _mm_srli_epi32(comprBlock, 20); + case 21:return _mm_srli_epi32(comprBlock, 21); + case 22:return _mm_srli_epi32(comprBlock, 22); + case 23:return _mm_srli_epi32(comprBlock, 23); + case 24:return _mm_srli_epi32(comprBlock, 24); + case 25:return _mm_srli_epi32(comprBlock, 25); + case 26:return _mm_srli_epi32(comprBlock, 26); + case 27:return _mm_srli_epi32(comprBlock, 27); + case 28:return _mm_srli_epi32(comprBlock, 28); + case 29:return _mm_srli_epi32(comprBlock, 29); + case 30:return _mm_srli_epi32(comprBlock, 30); + case 31:return _mm_srli_epi32(comprBlock, 31); + case 32:return _mm_srli_epi32(comprBlock, 32); + case 33:return _mm_srli_epi32(comprBlock, 33); + case 34:return _mm_srli_epi32(comprBlock, 34); + case 35:return _mm_srli_epi32(comprBlock, 35); + case 36:return _mm_srli_epi32(comprBlock, 36); + case 37:return _mm_srli_epi32(comprBlock, 37); + case 38:return _mm_srli_epi32(comprBlock, 38); + case 39:return _mm_srli_epi32(comprBlock, 39); + case 40:return _mm_srli_epi32(comprBlock, 40); + case 41:return _mm_srli_epi32(comprBlock, 41); + case 42:return _mm_srli_epi32(comprBlock, 42); + case 43:return _mm_srli_epi32(comprBlock, 43); + case 44:return _mm_srli_epi32(comprBlock, 44); + case 45:return _mm_srli_epi32(comprBlock, 45); + case 46:return _mm_srli_epi32(comprBlock, 46); + case 47:return _mm_srli_epi32(comprBlock, 47); + case 48:return _mm_srli_epi32(comprBlock, 48); + case 49:return _mm_srli_epi32(comprBlock, 49); + case 50:return _mm_srli_epi32(comprBlock, 50); + case 51:return _mm_srli_epi32(comprBlock, 51); + case 52:return _mm_srli_epi32(comprBlock, 52); + case 53:return _mm_srli_epi32(comprBlock, 53); + case 54:return _mm_srli_epi32(comprBlock, 54); + case 55:return _mm_srli_epi32(comprBlock, 55); + case 56:return _mm_srli_epi32(comprBlock, 56); + case 57:return _mm_srli_epi32(comprBlock, 57); + case 58:return _mm_srli_epi32(comprBlock, 58); + case 59:return _mm_srli_epi32(comprBlock, 59); + case 60:return _mm_srli_epi32(comprBlock, 60); + case 61:return _mm_srli_epi32(comprBlock, 61); + case 62:return _mm_srli_epi32(comprBlock, 62); + case 63:return _mm_srli_epi32(comprBlock, 63); + case 64:return _mm_srli_epi32(comprBlock, 64); + case 65:return _mm_srli_epi32(comprBlock, 65); + case 66:return _mm_srli_epi32(comprBlock, 66); + case 67:return _mm_srli_epi32(comprBlock, 67); + case 68:return _mm_srli_epi32(comprBlock, 68); + case 69:return _mm_srli_epi32(comprBlock, 69); + case 70:return _mm_srli_epi32(comprBlock, 70); + case 71:return _mm_srli_epi32(comprBlock, 71); + case 72:return _mm_srli_epi32(comprBlock, 72); + case 73:return _mm_srli_epi32(comprBlock, 73); + case 74:return _mm_srli_epi32(comprBlock, 74); + case 75:return _mm_srli_epi32(comprBlock, 75); + case 76:return _mm_srli_epi32(comprBlock, 76); + case 77:return _mm_srli_epi32(comprBlock, 77); + case 78:return _mm_srli_epi32(comprBlock, 78); + case 79:return _mm_srli_epi32(comprBlock, 79); + case 80:return _mm_srli_epi32(comprBlock, 80); + case 81:return _mm_srli_epi32(comprBlock, 81); + case 82:return _mm_srli_epi32(comprBlock, 82); + case 83:return _mm_srli_epi32(comprBlock, 83); + case 84:return _mm_srli_epi32(comprBlock, 84); + case 85:return _mm_srli_epi32(comprBlock, 85); + case 86:return _mm_srli_epi32(comprBlock, 86); + case 87:return _mm_srli_epi32(comprBlock, 87); + case 88:return _mm_srli_epi32(comprBlock, 88); + case 89:return _mm_srli_epi32(comprBlock, 89); + case 90:return _mm_srli_epi32(comprBlock, 90); + case 91:return _mm_srli_epi32(comprBlock, 91); + case 92:return _mm_srli_epi32(comprBlock, 92); + case 93:return _mm_srli_epi32(comprBlock, 93); + case 94:return _mm_srli_epi32(comprBlock, 94); + case 95:return _mm_srli_epi32(comprBlock, 95); + case 96:return _mm_srli_epi32(comprBlock, 96); + case 97:return _mm_srli_epi32(comprBlock, 97); + case 98:return _mm_srli_epi32(comprBlock, 98); + case 99:return _mm_srli_epi32(comprBlock, 99); + case 100:return _mm_srli_epi32(comprBlock, 100); + case 101:return _mm_srli_epi32(comprBlock, 101); + case 102:return _mm_srli_epi32(comprBlock, 102); + case 103:return _mm_srli_epi32(comprBlock, 103); + case 104:return _mm_srli_epi32(comprBlock, 104); + case 105:return _mm_srli_epi32(comprBlock, 105); + case 106:return _mm_srli_epi32(comprBlock, 106); + case 107:return _mm_srli_epi32(comprBlock, 107); + case 108:return _mm_srli_epi32(comprBlock, 108); + case 109:return _mm_srli_epi32(comprBlock, 109); + case 110:return _mm_srli_epi32(comprBlock, 110); + case 111:return _mm_srli_epi32(comprBlock, 111); + case 112:return _mm_srli_epi32(comprBlock, 112); + case 113:return _mm_srli_epi32(comprBlock, 113); + case 114:return _mm_srli_epi32(comprBlock, 114); + case 115:return _mm_srli_epi32(comprBlock, 115); + case 116:return _mm_srli_epi32(comprBlock, 116); + case 117:return _mm_srli_epi32(comprBlock, 117); + case 118:return _mm_srli_epi32(comprBlock, 118); + case 119:return _mm_srli_epi32(comprBlock, 119); + case 120:return _mm_srli_epi32(comprBlock, 120); + case 121:return _mm_srli_epi32(comprBlock, 121); + case 122:return _mm_srli_epi32(comprBlock, 122); + case 123:return _mm_srli_epi32(comprBlock, 123); + case 124:return _mm_srli_epi32(comprBlock, 124); + case 125:return _mm_srli_epi32(comprBlock, 125); + case 126:return _mm_srli_epi32(comprBlock, 126); + case 127:return _mm_srli_epi32(comprBlock, 127); + case 128:return _mm_srli_epi32(comprBlock, 128); + case 129:return _mm_srli_epi32(comprBlock, 129); + case 130:return _mm_srli_epi32(comprBlock, 130); + case 131:return _mm_srli_epi32(comprBlock, 131); + case 132:return _mm_srli_epi32(comprBlock, 132); + case 133:return _mm_srli_epi32(comprBlock, 133); + case 134:return _mm_srli_epi32(comprBlock, 134); + case 135:return _mm_srli_epi32(comprBlock, 135); + case 136:return _mm_srli_epi32(comprBlock, 136); + case 137:return _mm_srli_epi32(comprBlock, 137); + case 138:return _mm_srli_epi32(comprBlock, 138); + case 139:return _mm_srli_epi32(comprBlock, 139); + case 140:return _mm_srli_epi32(comprBlock, 140); + case 141:return _mm_srli_epi32(comprBlock, 141); + case 142:return _mm_srli_epi32(comprBlock, 142); + case 143:return _mm_srli_epi32(comprBlock, 143); + case 144:return _mm_srli_epi32(comprBlock, 144); + case 145:return _mm_srli_epi32(comprBlock, 145); + case 146:return _mm_srli_epi32(comprBlock, 146); + case 147:return _mm_srli_epi32(comprBlock, 147); + case 148:return _mm_srli_epi32(comprBlock, 148); + case 149:return _mm_srli_epi32(comprBlock, 149); + case 150:return _mm_srli_epi32(comprBlock, 150); + case 151:return _mm_srli_epi32(comprBlock, 151); + case 152:return _mm_srli_epi32(comprBlock, 152); + case 153:return _mm_srli_epi32(comprBlock, 153); + case 154:return _mm_srli_epi32(comprBlock, 154); + case 155:return _mm_srli_epi32(comprBlock, 155); + case 156:return _mm_srli_epi32(comprBlock, 156); + case 157:return _mm_srli_epi32(comprBlock, 157); + case 158:return _mm_srli_epi32(comprBlock, 158); + case 159:return _mm_srli_epi32(comprBlock, 159); + case 160:return _mm_srli_epi32(comprBlock, 160); + case 161:return _mm_srli_epi32(comprBlock, 161); + case 162:return _mm_srli_epi32(comprBlock, 162); + case 163:return _mm_srli_epi32(comprBlock, 163); + case 164:return _mm_srli_epi32(comprBlock, 164); + case 165:return _mm_srli_epi32(comprBlock, 165); + case 166:return _mm_srli_epi32(comprBlock, 166); + case 167:return _mm_srli_epi32(comprBlock, 167); + case 168:return _mm_srli_epi32(comprBlock, 168); + case 169:return _mm_srli_epi32(comprBlock, 169); + case 170:return _mm_srli_epi32(comprBlock, 170); + case 171:return _mm_srli_epi32(comprBlock, 171); + case 172:return _mm_srli_epi32(comprBlock, 172); + case 173:return _mm_srli_epi32(comprBlock, 173); + case 174:return _mm_srli_epi32(comprBlock, 174); + case 175:return _mm_srli_epi32(comprBlock, 175); + case 176:return _mm_srli_epi32(comprBlock, 176); + case 177:return _mm_srli_epi32(comprBlock, 177); + case 178:return _mm_srli_epi32(comprBlock, 178); + case 179:return _mm_srli_epi32(comprBlock, 179); + case 180:return _mm_srli_epi32(comprBlock, 180); + case 181:return _mm_srli_epi32(comprBlock, 181); + case 182:return _mm_srli_epi32(comprBlock, 182); + case 183:return _mm_srli_epi32(comprBlock, 183); + case 184:return _mm_srli_epi32(comprBlock, 184); + case 185:return _mm_srli_epi32(comprBlock, 185); + case 186:return _mm_srli_epi32(comprBlock, 186); + case 187:return _mm_srli_epi32(comprBlock, 187); + case 188:return _mm_srli_epi32(comprBlock, 188); + case 189:return _mm_srli_epi32(comprBlock, 189); + case 190:return _mm_srli_epi32(comprBlock, 190); + case 191:return _mm_srli_epi32(comprBlock, 191); + case 192:return _mm_srli_epi32(comprBlock, 192); + case 193:return _mm_srli_epi32(comprBlock, 193); + case 194:return _mm_srli_epi32(comprBlock, 194); + case 195:return _mm_srli_epi32(comprBlock, 195); + case 196:return _mm_srli_epi32(comprBlock, 196); + case 197:return _mm_srli_epi32(comprBlock, 197); + case 198:return _mm_srli_epi32(comprBlock, 198); + case 199:return _mm_srli_epi32(comprBlock, 199); + case 200:return _mm_srli_epi32(comprBlock, 200); + case 201:return _mm_srli_epi32(comprBlock, 201); + case 202:return _mm_srli_epi32(comprBlock, 202); + case 203:return _mm_srli_epi32(comprBlock, 203); + case 204:return _mm_srli_epi32(comprBlock, 204); + case 205:return _mm_srli_epi32(comprBlock, 205); + case 206:return _mm_srli_epi32(comprBlock, 206); + case 207:return _mm_srli_epi32(comprBlock, 207); + case 208:return _mm_srli_epi32(comprBlock, 208); + case 209:return _mm_srli_epi32(comprBlock, 209); + case 210:return _mm_srli_epi32(comprBlock, 210); + case 211:return _mm_srli_epi32(comprBlock, 211); + case 212:return _mm_srli_epi32(comprBlock, 212); + case 213:return _mm_srli_epi32(comprBlock, 213); + case 214:return _mm_srli_epi32(comprBlock, 214); + case 215:return _mm_srli_epi32(comprBlock, 215); + case 216:return _mm_srli_epi32(comprBlock, 216); + case 217:return _mm_srli_epi32(comprBlock, 217); + case 218:return _mm_srli_epi32(comprBlock, 218); + case 219:return _mm_srli_epi32(comprBlock, 219); + case 220:return _mm_srli_epi32(comprBlock, 220); + case 221:return _mm_srli_epi32(comprBlock, 221); + case 222:return _mm_srli_epi32(comprBlock, 222); + case 223:return _mm_srli_epi32(comprBlock, 223); + case 224:return _mm_srli_epi32(comprBlock, 224); + case 225:return _mm_srli_epi32(comprBlock, 225); + case 226:return _mm_srli_epi32(comprBlock, 226); + case 227:return _mm_srli_epi32(comprBlock, 227); + case 228:return _mm_srli_epi32(comprBlock, 228); + case 229:return _mm_srli_epi32(comprBlock, 229); + case 230:return _mm_srli_epi32(comprBlock, 230); + case 231:return _mm_srli_epi32(comprBlock, 231); + case 232:return _mm_srli_epi32(comprBlock, 232); + case 233:return _mm_srli_epi32(comprBlock, 233); + case 234:return _mm_srli_epi32(comprBlock, 234); + case 235:return _mm_srli_epi32(comprBlock, 235); + case 236:return _mm_srli_epi32(comprBlock, 236); + case 237:return _mm_srli_epi32(comprBlock, 237); + case 238:return _mm_srli_epi32(comprBlock, 238); + case 239:return _mm_srli_epi32(comprBlock, 239); + case 240:return _mm_srli_epi32(comprBlock, 240); + case 241:return _mm_srli_epi32(comprBlock, 241); + case 242:return _mm_srli_epi32(comprBlock, 242); + case 243:return _mm_srli_epi32(comprBlock, 243); + case 244:return _mm_srli_epi32(comprBlock, 244); + case 245:return _mm_srli_epi32(comprBlock, 245); + case 246:return _mm_srli_epi32(comprBlock, 246); + case 247:return _mm_srli_epi32(comprBlock, 247); + case 248:return _mm_srli_epi32(comprBlock, 248); + case 249:return _mm_srli_epi32(comprBlock, 249); + case 250:return _mm_srli_epi32(comprBlock, 250); + case 251:return _mm_srli_epi32(comprBlock, 251); + case 252:return _mm_srli_epi32(comprBlock, 252); + case 253:return _mm_srli_epi32(comprBlock, 253); + case 254:return _mm_srli_epi32(comprBlock, 254); + case 255:return _mm_srli_epi32(comprBlock, 255); + default:return _mm_srli_epi32(comprBlock, 255); break; } - - inline static void unrolledUnpacking_10_3(const __m128i &comprBlock, - __m128i *&out) { - const __m128i mask = _mm_set1_epi32((static_cast(1) << 3) - 1); - _mm_storeu_si128(out++, _mm_and_si128( comprBlock , mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 3), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 6), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 9), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 12), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 15), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 18), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 21), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 27), mask)); - } - - inline static void unrolledUnpacking_8_4(const __m128i &comprBlock, - __m128i *&out) { - const __m128i mask = _mm_set1_epi32((static_cast(1) << 4) - 1); - _mm_storeu_si128(out++, _mm_and_si128( comprBlock, mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 4), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 8), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 12), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 16), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 20), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 28), mask)); - } - - inline static void unrolledUnpacking_6_5(const __m128i &comprBlock, - __m128i *&out) { - const __m128i mask = _mm_set1_epi32((static_cast(1) << 5) - 1); - _mm_storeu_si128(out++, _mm_and_si128( comprBlock, mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 5), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 10), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 15), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 20), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 25), mask)); - } - - inline static void unrolledUnpacking_5_6(const __m128i &comprBlock, - __m128i *&out) { - const __m128i mask = _mm_set1_epi32((static_cast(1) << 6) - 1); - _mm_storeu_si128(out++, _mm_and_si128( comprBlock, mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 6), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 12), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 18), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); - } - - inline static void unrolledUnpacking_4_8(const __m128i &comprBlock, - __m128i *&out) { - const __m128i mask = _mm_set1_epi32((static_cast(1) << 8) - 1); - _mm_storeu_si128(out++, _mm_and_si128( comprBlock, mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 8), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 16), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); - } - - inline static void unrolledUnpacking_3_10(const __m128i &comprBlock, - __m128i *&out) { - const __m128i mask = _mm_set1_epi32((static_cast(1) << 10) - 1); - _mm_storeu_si128(out++, _mm_and_si128( comprBlock, mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 10), mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 20), mask)); - } - - inline static void unrolledUnpacking_2_16(const __m128i &comprBlock, - __m128i *&out) { - const __m128i mask = _mm_set1_epi32((static_cast(1) << 16) - 1); - _mm_storeu_si128(out++, _mm_and_si128( comprBlock, mask)); - _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 16), mask)); - } - - inline static void unrolledUnpacking_1_32(const __m128i &comprBlock, - __m128i *&out) { - _mm_storeu_si128(out++, comprBlock); - } - - /** - * Decompresses n quads, i.e. 4x n integers. Thereby, n must correspond to - * one of the ten compression modes presented in the original paper. - */ - inline static void decomprCompleteBlock(const uint8_t &n,const __m128i *&in, - __m128i *&out) { - const __m128i comprBlock = _mm_loadu_si128(in++); - - switch (n) { - case 32: // b = 1 - unrolledUnpacking_32_1(comprBlock, out); - break; - case 16: // b = 2 - unrolledUnpacking_16_2(comprBlock, out); - break; - case 10: // b = 3 - unrolledUnpacking_10_3(comprBlock, out); - break; - case 8: // b = 4 - unrolledUnpacking_8_4(comprBlock, out); - break; - case 6: // b = 5 - unrolledUnpacking_6_5(comprBlock, out); - break; - case 5: // b = 6 - unrolledUnpacking_5_6(comprBlock, out); - break; - case 4: // b = 8 - unrolledUnpacking_4_8(comprBlock, out); - break; - case 3: // b = 10 - unrolledUnpacking_3_10(comprBlock, out); - break; - case 2: // b = 16 - unrolledUnpacking_2_16(comprBlock, out); - break; - case 1: // b = 32 - unrolledUnpacking_1_32(comprBlock, out); - break; - } - } - - /** - * The original variant of the compression part of the algorithm. - */ - inline static void encodeArrayInternal_woRingBuf(const uint32_t *in, - const size_t len, - uint32_t *out, - size_t &nvalue) { - // The start of the header. - uint32_t *const outHeader32 = out; - // The start of the selectors area. - uint8_t *outSelArea8 = - reinterpret_cast(outHeader32 + countHeader32); - uint8_t *const initOutSelArea8 = outSelArea8; - - // The number of input quads, i.e., groups of four integers. Note that we - // assume the number of input integers to be a multiple of four. - const size_t countIn128 = len * sizeof(uint32_t) / sizeof(__m128i); - - // Step 1: Generation of the quad max array - // ======================================== - uint32_t *quadMaxArray = new uint32_t[countIn128]; - for (size_t i = 0; i < len; i += 4) { - const uint32_t pseudoQuadMax = in[i] | in[i + 1] | in[i + 2] | in[i + 3]; - quadMaxArray[i >> 2] = pseudoQuadMax; - } - - // Step 2: Pattern selection algorithm - // =================================== - // As described in the paper. - size_t l = countIn128; - size_t j = 0; - size_t pos = 0; - // Whether we have an even number of selectors so far. - bool even = true; - while (l > 0) { - uint8_t i; - for (i = 0; i <= 9; i++) { - const uint8_t n = tableNum[i]; - // Unlike the original pattern selection algorithm, we look up the mask - // directly instead of calculating it from a looked up bit width. - const uint32_t mask = tableMask[i]; - pos = 0; - const size_t maxPos = std::min(static_cast(n), l); - while (pos < maxPos && quadMaxArray[j + pos] <= mask) - pos++; - if (pos == maxPos) - break; - } - l -= pos; - j += pos; - // Store the selector. - if (even) - *outSelArea8 = i; - else - *outSelArea8++ |= (i << 4); - even = !even; - } - if (!even) - // The last used byte in the selectors area was touched, but not finished. - outSelArea8++; - // The number of quads in the last block. - const uint8_t countQuadsLastBlock = static_cast(pos); - *outSelArea8 = countQuadsLastBlock; - - delete[] quadMaxArray; - - // The number of bytes actually used for the selectors area. - const size_t countSelArea8Used = outSelArea8 - initOutSelArea8; - // The total number of selectors. - const int countSels = countSelArea8Used * 2 - (even ? 0 : 1); - - // The number of bytes that could be required for the selectors area in the - // worst case. - const size_t countSelArea8WorstCase = countIn128 / 2 + (countIn128 & 1); - - // Depending on whether we want to leave the "pessimistic gap" between the - // selectors area and the data area, we either reserve the true or the - // worst-case number of bytes for the selectors area. Note that this has no - // effect on the amount of data that we actually have to write. - const size_t countSelArea8 = - pessimisticGap ? countSelArea8WorstCase : countSelArea8Used; - - const size_t countPadBytes = getCountPadBytes(countSelArea8); - // The start of the data area. - __m128i *outDataArea128 = reinterpret_cast<__m128i *>(initOutSelArea8 + - countSelArea8 + sizeof(uint8_t) + countPadBytes); - const __m128i *const initOutDataArea128 = outDataArea128; - uint8_t* pad8 = (uint8_t*)outDataArea128 - countPadBytes; - while (pad8 < (uint8_t*)outDataArea128) - *pad8++ = 0; // clear padding bytes - - const __m128i *in128 = reinterpret_cast(in); - - // Step 3: Packing the uncompressed integers - // ========================================= - // Iterate over the selectors obtained from the pattern selection algorithm - // and compress the blocks accordingly. The last block is always treated - // specially, since it might not be "full". - for (int m = 0; m < countSels - 1; m++) { - const uint8_t i = extractSel(initOutSelArea8, m); - const size_t n = tableNum[i]; - comprCompleteBlock(n, in128, outDataArea128); - } - if (countQuadsLastBlock) - comprIncompleteBlock(countQuadsLastBlock, in128, outDataArea128); - - // Write some meta data to the header. - outHeader32[0] = len; - outHeader32[1] = countSels; - outHeader32[2] = countSelArea8; - - // The position of the last byte written to the output relative to the - // start of the output. Note that the actual number of written bytes might - // be slightly lower due to the inserted padding. However, it might even be - // significantly lower, if pessimisticGap is true. - const size_t nbytes = countHeader32 * sizeof(uint32_t) + - countSelArea8 + sizeof(uint8_t) + countPadBytes + - (outDataArea128 - initOutDataArea128) * sizeof(__m128i); - // Rounding the number of bytes to full 32-bit integers. - nvalue = div_roundup(nbytes, sizeof(uint32_t)); - } - - /** - * The variant of the compression part using a ring buffer for the pseudo - * quad max values. - */ - inline static void encodeArrayInternal_wRingBuf(const uint32_t *in, - const size_t len, - uint32_t *out, - size_t &nvalue) { - // The start of the header. - uint32_t *const outHeader32 = out; - // The start of the selectors area. - uint8_t *outSelArea8 = - reinterpret_cast(outHeader32 + countHeader32); - uint8_t *const initOutSelArea8 = outSelArea8; - - // The number of input quads, i.e., groups of four integers. Note that we - // assume the number of input integers to be a multiple of four. - const size_t countIn128 = len * sizeof(uint32_t) / sizeof(__m128i); - - // Maximum size of the quad max ring buffer. Note that to determine the - // next selector, we need to consider at most 32 pseudo quad max values, - // since that is the maximum number of input quads to be packed into one - // compressed block. - const size_t rbMaxSize = 32; - // The quad max ring buffer. - uint32_t quadMaxRb[rbMaxSize]; - // The current position and number of valid elements in the ring buffer. - size_t rbPos = 0; - size_t rbSize = 0; - - // The number of bytes that could be required for the selectors area in the - // worst case. In this implementation we immediately compress a block when - // we have determined the selector. Hence, we do not know the total number - // of selectors before we start the actual compression, such that we need - // to assume the worst case in order to guarantee that the selectors area - // and the data area do not overlap. - const size_t countSelArea8WorstCase = countIn128 / 2 + (countIn128 & 1); - size_t countPadBytes_wGap = getCountPadBytes(countSelArea8WorstCase); - - // The start of the data area. - __m128i *outDataArea128_wGap = - reinterpret_cast<__m128i *>(initOutSelArea8 + countSelArea8WorstCase + - sizeof(uint8_t) + countPadBytes_wGap); - __m128i *const initOutDataArea128_wGap = outDataArea128_wGap; - - const __m128i *in128 = reinterpret_cast(in); - const __m128i *const endIn128 = in128 + countIn128; - - // The following loop interleaves all three steps of the original - // algorithm: (1) the generation of the pseudo quad max values, (2) the - // pattern selection algorithm, and (3) the packing of the input blocks. - - // Whether we have an even number of selectors so far. - bool even = true; - size_t pos = 0; - while (in128 < endIn128) { - // Step 1: Refill the quad max ring buffer. - const size_t countRemainingIn128 = static_cast(endIn128 - in128); - const size_t rbSizeToReach = std::min(rbMaxSize, countRemainingIn128); - for (; rbSize < rbSizeToReach; rbSize++) { - const uint32_t *const in32 = - reinterpret_cast(in128 + rbSize); - const uint32_t pseudoQuadMax = in32[0] | in32[1] | in32[2] | in32[3]; - quadMaxRb[(rbPos + rbSize) % rbMaxSize] = pseudoQuadMax; - } - - // Step 2: Determine the next selector. - pos = 0; - uint8_t i; - uint8_t n; - for (i = 0; i <= 9; i++) { - n = tableNum[i]; - const uint32_t mask = tableMask[i]; - pos = 0; - const size_t maxPos = std::min(static_cast(n), rbSize); - while (pos < maxPos && quadMaxRb[(rbPos + pos) % rbMaxSize] <= mask) - pos++; - if (pos == maxPos) - break; - } - // Store the selector. - if (even) - *outSelArea8 = i; - else - *outSelArea8++ |= (i << 4); - even = !even; - - // Step 3: Compress the block. - if (pos == n) { - comprCompleteBlock(n, in128, outDataArea128_wGap); - rbPos = (rbPos + n) % rbMaxSize; - rbSize -= n; - // Refilling the ring buffer only here (and once before the loop) does - // not seem to yield any benefit. - } - else - // This can only happen for the last block/selector - comprIncompleteBlock(rbSize, in128, outDataArea128_wGap); - } - if (!even) - // The last used byte in the selectors area was touched, but not finished. - outSelArea8++; - - // The number of quads in the last, possibly non-"full" block. - const uint8_t countQuadsLastBlock = static_cast(pos); - *outSelArea8 = countQuadsLastBlock; - - // The number of bytes actually used for the selectors area. - const size_t countSelArea8Used = outSelArea8 - initOutSelArea8; - // The total number of selectors. - const size_t countSels = countSelArea8Used * 2 - (even ? 0 : 1); - - // Up to here, we have a gap between the last used byte of the selectors - // area and the first byte of the data area (unless all data elements were - // packed with 32 bits each, which is the worst case). If specified so, we - // remove this gap by copying the data area directly behind the used bytes - // of the selectors area. - const size_t countSelArea8 = - pessimisticGap ? countSelArea8WorstCase : countSelArea8Used; - const size_t countDataArea128 = - outDataArea128_wGap - initOutDataArea128_wGap; - size_t actualPaddingBytes; - if (pessimisticGap) - actualPaddingBytes = countPadBytes_wGap; - else { - const size_t countPadBytes_woGap = getCountPadBytes(countSelArea8Used); - actualPaddingBytes = countPadBytes_woGap; - __m128i *const outDataArea128_woGap = - reinterpret_cast<__m128i *>(initOutSelArea8 + countSelArea8Used + - sizeof(uint8_t) + countPadBytes_woGap); - if (outDataArea128_woGap != outDataArea128_wGap) - for (unsigned i = 0; i < countDataArea128; i++) - _mm_storeu_si128(outDataArea128_woGap + i, - _mm_loadu_si128(initOutDataArea128_wGap + i)); - } - - // Write some meta data to the header. - outHeader32[0] = len; - outHeader32[1] = countSels; - outHeader32[2] = countSelArea8; - - // The position of the last byte written to the output relative to the - // start of the output. Note that the actual number of written bytes might - // be slightly lower due to the inserted padding. However, it might even be - // significantly lower, if pessimisticGap is true. - const size_t nbytes = countHeader32 * sizeof(uint32_t) + - countSelArea8 + sizeof(uint8_t) + actualPaddingBytes + - countDataArea128 * sizeof(__m128i); - // Rounding the number of bytes to full 32-bit integers. - nvalue = div_roundup(nbytes, sizeof(uint32_t)); - } - - void encodeArray(const uint32_t *in, const size_t len, uint32_t *out, - size_t &nvalue) { - checkifdivisibleby(len, BlockSize); - - if (useRingBuf) - encodeArrayInternal_wRingBuf(in, len, out, nvalue); - else - encodeArrayInternal_woRingBuf(in, len, out, nvalue); - } - - const uint32_t *decodeArray(const uint32_t *in, const size_t, - uint32_t *out, size_t &nvalue) { - // The start of the header. - const uint32_t *const inHeader32 = in; - nvalue = inHeader32[0]; - const int countSels = inHeader32[1]; - // The number of bytes reserved for the selectors area. This contains the - // bytes actually used for the selectors as well as the "pessimistic gap", - // if specified so. - const size_t countSelArea8 = inHeader32[2]; - - // The start of the selectors area. - const uint8_t *const inSelArea8 = - reinterpret_cast(inHeader32 + countHeader32); - - // The number of bytes actually used within the selectors area. - const size_t countSelArea8Used = countSels / 2 + (countSels & 1); - - const size_t countPadBytes = getCountPadBytes(countSelArea8); - // The start of the data area. - const __m128i *inDataArea128 = - reinterpret_cast(inSelArea8 + countSelArea8 + - sizeof(uint8_t) + countPadBytes); - - __m128i *out128 = reinterpret_cast<__m128i *>(out); - - // Iterate over the selectors and unpack the compressed blocks accordingly. - // The last block is always treated specially, since it might not be "full". - for (int m = 0; m < countSels - 1; m++) { - const uint8_t i = extractSel(inSelArea8, m); - const size_t n = tableNum[i]; - decomprCompleteBlock(n, inDataArea128, out128); - } - const uint8_t countQuadsLastBlock = inSelArea8[countSelArea8Used]; - if (countQuadsLastBlock) - decomprIncompleteBlock(countQuadsLastBlock, inDataArea128, out128); - - return reinterpret_cast(inDataArea128); - } - - virtual std::string name() const { - return useRingBuf ? "SIMDGroupSimple_RingBuf" : "SIMDGroupSimple"; } -}; - -template -const uint8_t SIMDGroupSimple::tableNum[] = { - 32, 16, 10, 8, 6, 5, 4, 3, 2, 1 -}; -template -const uint32_t SIMDGroupSimple::tableMask[] = { - (static_cast(1) << 1) - 1, - (static_cast(1) << 2) - 1, - (static_cast(1) << 3) - 1, - (static_cast(1) << 4) - 1, - (static_cast(1) << 5) - 1, - (static_cast(1) << 6) - 1, - (static_cast(1) << 8) - 1, - (static_cast(1) << 10) - 1, - (static_cast(1) << 16) - 1, - (static_cast(1) << 32) - 1, -}; +#endif + + /** + * The following ten functions unpack a certain amount of compressed data. + * The function unrolledUnpacking_#n_#b unpacks #n quads, i.e., 4x #n + * integers, from one 128-bit compressed block. + */ + + inline static void unrolledUnpacking_32_1(const __m128i &comprBlock, + __m128i *&out) { + const __m128i mask = _mm_set1_epi32(1); + _mm_storeu_si128(out++, _mm_and_si128(comprBlock, mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 1), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 2), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 3), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 4), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 5), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 6), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 7), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 8), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 9), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 10), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 11), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 12), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 13), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 14), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 15), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 16), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 17), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 18), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 19), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 20), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 21), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 22), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 23), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 25), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 26), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 27), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 28), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 29), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 30), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 31), mask)); + } + + inline static void unrolledUnpacking_16_2(const __m128i &comprBlock, + __m128i *&out) { + const __m128i mask = _mm_set1_epi32((static_cast(1) << 2) - 1); + _mm_storeu_si128(out++, _mm_and_si128(comprBlock, mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 2), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 4), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 6), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 8), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 10), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 12), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 14), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 16), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 18), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 20), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 22), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 26), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 28), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 30), mask)); + } + + inline static void unrolledUnpacking_10_3(const __m128i &comprBlock, + __m128i *&out) { + const __m128i mask = _mm_set1_epi32((static_cast(1) << 3) - 1); + _mm_storeu_si128(out++, _mm_and_si128(comprBlock, mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 3), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 6), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 9), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 12), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 15), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 18), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 21), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 27), mask)); + } + + inline static void unrolledUnpacking_8_4(const __m128i &comprBlock, + __m128i *&out) { + const __m128i mask = _mm_set1_epi32((static_cast(1) << 4) - 1); + _mm_storeu_si128(out++, _mm_and_si128(comprBlock, mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 4), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 8), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 12), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 16), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 20), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 28), mask)); + } + + inline static void unrolledUnpacking_6_5(const __m128i &comprBlock, + __m128i *&out) { + const __m128i mask = _mm_set1_epi32((static_cast(1) << 5) - 1); + _mm_storeu_si128(out++, _mm_and_si128(comprBlock, mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 5), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 10), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 15), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 20), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 25), mask)); + } + + inline static void unrolledUnpacking_5_6(const __m128i &comprBlock, + __m128i *&out) { + const __m128i mask = _mm_set1_epi32((static_cast(1) << 6) - 1); + _mm_storeu_si128(out++, _mm_and_si128(comprBlock, mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 6), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 12), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 18), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); + } + + inline static void unrolledUnpacking_4_8(const __m128i &comprBlock, + __m128i *&out) { + const __m128i mask = _mm_set1_epi32((static_cast(1) << 8) - 1); + _mm_storeu_si128(out++, _mm_and_si128(comprBlock, mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 8), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 16), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 24), mask)); + } + + inline static void unrolledUnpacking_3_10(const __m128i &comprBlock, + __m128i *&out) { + const __m128i mask = _mm_set1_epi32((static_cast(1) << 10) - 1); + _mm_storeu_si128(out++, _mm_and_si128(comprBlock, mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 10), mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 20), mask)); + } + + inline static void unrolledUnpacking_2_16(const __m128i &comprBlock, + __m128i *&out) { + const __m128i mask = _mm_set1_epi32((static_cast(1) << 16) - 1); + _mm_storeu_si128(out++, _mm_and_si128(comprBlock, mask)); + _mm_storeu_si128(out++, _mm_and_si128(_mm_srli_epi32(comprBlock, 16), mask)); + } + + inline static void unrolledUnpacking_1_32(const __m128i &comprBlock, + __m128i *&out) { + _mm_storeu_si128(out++, comprBlock); + } + + /** + * Decompresses n quads, i.e. 4x n integers. Thereby, n must correspond to + * one of the ten compression modes presented in the original paper. + */ + inline static void decomprCompleteBlock(const uint8_t &n, const __m128i *&in, + __m128i *&out) { + const __m128i comprBlock = _mm_loadu_si128(in++); + + switch (n) { + case 32: // b = 1 + unrolledUnpacking_32_1(comprBlock, out); + break; + case 16: // b = 2 + unrolledUnpacking_16_2(comprBlock, out); + break; + case 10: // b = 3 + unrolledUnpacking_10_3(comprBlock, out); + break; + case 8: // b = 4 + unrolledUnpacking_8_4(comprBlock, out); + break; + case 6: // b = 5 + unrolledUnpacking_6_5(comprBlock, out); + break; + case 5: // b = 6 + unrolledUnpacking_5_6(comprBlock, out); + break; + case 4: // b = 8 + unrolledUnpacking_4_8(comprBlock, out); + break; + case 3: // b = 10 + unrolledUnpacking_3_10(comprBlock, out); + break; + case 2: // b = 16 + unrolledUnpacking_2_16(comprBlock, out); + break; + case 1: // b = 32 + unrolledUnpacking_1_32(comprBlock, out); + break; + } + } + + /** + * The original variant of the compression part of the algorithm. + */ + inline static void encodeArrayInternal_woRingBuf(const uint32_t *in, + const size_t len, + uint32_t *out, + size_t &nvalue) { + // The start of the header. + uint32_t *const outHeader32 = out; + // The start of the selectors area. + uint8_t *outSelArea8 = + reinterpret_cast(outHeader32 + countHeader32); + uint8_t *const initOutSelArea8 = outSelArea8; + + // The number of input quads, i.e., groups of four integers. Note that we + // assume the number of input integers to be a multiple of four. + const size_t countIn128 = len * sizeof(uint32_t) / sizeof(__m128i); + + // Step 1: Generation of the quad max array + // ======================================== + uint32_t *quadMaxArray = new uint32_t[countIn128]; + for (size_t i = 0; i < len; i += 4) { + const uint32_t pseudoQuadMax = in[i] | in[i + 1] | in[i + 2] | in[i + 3]; + quadMaxArray[i >> 2] = pseudoQuadMax; + } + + // Step 2: Pattern selection algorithm + // =================================== + // As described in the paper. + size_t l = countIn128; + size_t j = 0; + size_t pos = 0; + // Whether we have an even number of selectors so far. + bool even = true; + while (l > 0) { + uint8_t i; + for (i = 0; i <= 9; i++) { + const uint8_t n = tableNum[i]; + // Unlike the original pattern selection algorithm, we look up the mask + // directly instead of calculating it from a looked up bit width. + const uint32_t mask = tableMask[i]; + pos = 0; + const size_t maxPos = std::min(static_cast(n), l); + while (pos < maxPos && quadMaxArray[j + pos] <= mask) + pos++; + if (pos == maxPos) + break; + } + l -= pos; + j += pos; + // Store the selector. + if (even) + *outSelArea8 = i; + else + *outSelArea8++ |= (i << 4); + even = !even; + } + if (!even) + // The last used byte in the selectors area was touched, but not finished. + outSelArea8++; + // The number of quads in the last block. + const uint8_t countQuadsLastBlock = static_cast(pos); + *outSelArea8 = countQuadsLastBlock; + + delete[] quadMaxArray; + + // The number of bytes actually used for the selectors area. + const size_t countSelArea8Used = outSelArea8 - initOutSelArea8; + // The total number of selectors. + const int countSels = int(countSelArea8Used * 2 - (even ? 0 : 1)); + + // The number of bytes that could be required for the selectors area in the + // worst case. + const size_t countSelArea8WorstCase = countIn128 / 2 + (countIn128 & 1); + + // Depending on whether we want to leave the "pessimistic gap" between the + // selectors area and the data area, we either reserve the true or the + // worst-case number of bytes for the selectors area. Note that this has no + // effect on the amount of data that we actually have to write. + const size_t countSelArea8 = + pessimisticGap ? countSelArea8WorstCase : countSelArea8Used; + + const size_t countPadBytes = getCountPadBytes(countSelArea8); + // The start of the data area. + __m128i *outDataArea128 = reinterpret_cast<__m128i *>(initOutSelArea8 + + countSelArea8 + sizeof(uint8_t) + countPadBytes); + const __m128i *const initOutDataArea128 = outDataArea128; + uint8_t *pad8 = (uint8_t *) outDataArea128 - countPadBytes; + while (pad8 < (uint8_t *) outDataArea128) + *pad8++ = 0; // clear padding bytes + + const __m128i *in128 = reinterpret_cast(in); + + // Step 3: Packing the uncompressed integers + // ========================================= + // Iterate over the selectors obtained from the pattern selection algorithm + // and compress the blocks accordingly. The last block is always treated + // specially, since it might not be "full". + for (int m = 0; m < countSels - 1; m++) { + const uint8_t i = extractSel(initOutSelArea8, m); + const size_t n = tableNum[i]; + comprCompleteBlock(uint8_t(n), in128, outDataArea128); + } + if (countQuadsLastBlock) + comprIncompleteBlock(countQuadsLastBlock, in128, outDataArea128); + + // Write some meta data to the header. + outHeader32[0] = uint32_t(len); + outHeader32[1] = uint32_t(countSels); + outHeader32[2] = uint32_t(countSelArea8); + + // The position of the last byte written to the output relative to the + // start of the output. Note that the actual number of written bytes might + // be slightly lower due to the inserted padding. However, it might even be + // significantly lower, if pessimisticGap is true. + const size_t nbytes = countHeader32 * sizeof(uint32_t) + + countSelArea8 + sizeof(uint8_t) + countPadBytes + + (outDataArea128 - initOutDataArea128) * sizeof(__m128i); + // Rounding the number of bytes to full 32-bit integers. + nvalue = div_roundup(uint32_t(nbytes), sizeof(uint32_t)); + } + + /** + * The variant of the compression part using a ring buffer for the pseudo + * quad max values. + */ + inline static void encodeArrayInternal_wRingBuf(const uint32_t *in, + const size_t len, + uint32_t *out, + size_t &nvalue) { + // The start of the header. + uint32_t *const outHeader32 = out; + // The start of the selectors area. + uint8_t *outSelArea8 = + reinterpret_cast(outHeader32 + countHeader32); + uint8_t *const initOutSelArea8 = outSelArea8; + + // The number of input quads, i.e., groups of four integers. Note that we + // assume the number of input integers to be a multiple of four. + const size_t countIn128 = len * sizeof(uint32_t) / sizeof(__m128i); + + // Maximum size of the quad max ring buffer. Note that to determine the + // next selector, we need to consider at most 32 pseudo quad max values, + // since that is the maximum number of input quads to be packed into one + // compressed block. + const size_t rbMaxSize = 32; + // The quad max ring buffer. + uint32_t quadMaxRb[rbMaxSize]; + // The current position and number of valid elements in the ring buffer. + size_t rbPos = 0; + size_t rbSize = 0; + + // The number of bytes that could be required for the selectors area in the + // worst case. In this implementation we immediately compress a block when + // we have determined the selector. Hence, we do not know the total number + // of selectors before we start the actual compression, such that we need + // to assume the worst case in order to guarantee that the selectors area + // and the data area do not overlap. + const size_t countSelArea8WorstCase = countIn128 / 2 + (countIn128 & 1); + size_t countPadBytes_wGap = getCountPadBytes(countSelArea8WorstCase); + + // The start of the data area. + __m128i *outDataArea128_wGap = + reinterpret_cast<__m128i *>(initOutSelArea8 + countSelArea8WorstCase + + sizeof(uint8_t) + countPadBytes_wGap); + __m128i *const initOutDataArea128_wGap = outDataArea128_wGap; + + const __m128i *in128 = reinterpret_cast(in); + const __m128i *const endIn128 = in128 + countIn128; + + // The following loop interleaves all three steps of the original + // algorithm: (1) the generation of the pseudo quad max values, (2) the + // pattern selection algorithm, and (3) the packing of the input blocks. + + // Whether we have an even number of selectors so far. + bool even = true; + size_t pos = 0; + while (in128 < endIn128) { + // Step 1: Refill the quad max ring buffer. + const size_t countRemainingIn128 = static_cast(endIn128 - in128); + const size_t rbSizeToReach = std::min(rbMaxSize, countRemainingIn128); + for (; rbSize < rbSizeToReach; rbSize++) { + const uint32_t *const in32 = + reinterpret_cast(in128 + rbSize); + const uint32_t pseudoQuadMax = in32[0] | in32[1] | in32[2] | in32[3]; + quadMaxRb[(rbPos + rbSize) % rbMaxSize] = pseudoQuadMax; + } + + // Step 2: Determine the next selector. + pos = 0; + uint8_t i; + uint8_t n = 0; + for (i = 0; i <= 9; i++) { + n = tableNum[i]; + const uint32_t mask = tableMask[i]; + pos = 0; + const size_t maxPos = std::min(static_cast(n), rbSize); + while (pos < maxPos && quadMaxRb[(rbPos + pos) % rbMaxSize] <= mask) + pos++; + if (pos == maxPos) + break; + } + // Store the selector. + if (even) + *outSelArea8 = i; + else + *outSelArea8++ |= (i << 4); + even = !even; + + // Step 3: Compress the block. + if (pos == n) { + comprCompleteBlock(n, in128, outDataArea128_wGap); + rbPos = (rbPos + n) % rbMaxSize; + rbSize -= n; + // Refilling the ring buffer only here (and once before the loop) does + // not seem to yield any benefit. + } else + // This can only happen for the last block/selector + comprIncompleteBlock(uint8_t(rbSize), in128, outDataArea128_wGap); + } + if (!even) + // The last used byte in the selectors area was touched, but not finished. + outSelArea8++; + + // The number of quads in the last, possibly non-"full" block. + const uint8_t countQuadsLastBlock = static_cast(pos); + *outSelArea8 = countQuadsLastBlock; + + // The number of bytes actually used for the selectors area. + const size_t countSelArea8Used = outSelArea8 - initOutSelArea8; + // The total number of selectors. + const size_t countSels = countSelArea8Used * 2 - (even ? 0 : 1); + + // Up to here, we have a gap between the last used byte of the selectors + // area and the first byte of the data area (unless all data elements were + // packed with 32 bits each, which is the worst case). If specified so, we + // remove this gap by copying the data area directly behind the used bytes + // of the selectors area. + const size_t countSelArea8 = + pessimisticGap ? countSelArea8WorstCase : countSelArea8Used; + const size_t countDataArea128 = + outDataArea128_wGap - initOutDataArea128_wGap; + size_t actualPaddingBytes; + if (pessimisticGap) + actualPaddingBytes = countPadBytes_wGap; + else { + const size_t countPadBytes_woGap = getCountPadBytes(countSelArea8Used); + actualPaddingBytes = countPadBytes_woGap; + __m128i *const outDataArea128_woGap = + reinterpret_cast<__m128i *>(initOutSelArea8 + countSelArea8Used + + sizeof(uint8_t) + countPadBytes_woGap); + if (outDataArea128_woGap != outDataArea128_wGap) + for (unsigned i = 0; i < countDataArea128; i++) + _mm_storeu_si128(outDataArea128_woGap + i, + _mm_loadu_si128(initOutDataArea128_wGap + i)); + } + + // Write some meta data to the header. + outHeader32[0] = uint32_t(len); + outHeader32[1] = uint32_t(countSels); + outHeader32[2] = uint32_t(countSelArea8); + + // The position of the last byte written to the output relative to the + // start of the output. Note that the actual number of written bytes might + // be slightly lower due to the inserted padding. However, it might even be + // significantly lower, if pessimisticGap is true. + const size_t nbytes = countHeader32 * sizeof(uint32_t) + + countSelArea8 + sizeof(uint8_t) + actualPaddingBytes + + countDataArea128 * sizeof(__m128i); + // Rounding the number of bytes to full 32-bit integers. + nvalue = div_roundup(uint32_t(nbytes), sizeof(uint32_t)); + } + + void encodeArray(const uint32_t *in, const size_t len, uint32_t *out, + size_t &nvalue) { + checkifdivisibleby(len, BlockSize); + + if (useRingBuf) + encodeArrayInternal_wRingBuf(in, len, out, nvalue); + else + encodeArrayInternal_woRingBuf(in, len, out, nvalue); + } + + const uint32_t *decodeArray(const uint32_t *in, const size_t, + uint32_t *out, size_t &nvalue) { + // The start of the header. + const uint32_t *const inHeader32 = in; + nvalue = inHeader32[0]; + const int countSels = inHeader32[1]; + // The number of bytes reserved for the selectors area. This contains the + // bytes actually used for the selectors as well as the "pessimistic gap", + // if specified so. + const size_t countSelArea8 = inHeader32[2]; + + // The start of the selectors area. + const uint8_t *const inSelArea8 = + reinterpret_cast(inHeader32 + countHeader32); + + // The number of bytes actually used within the selectors area. + const size_t countSelArea8Used = countSels / 2 + (countSels & 1); + + const size_t countPadBytes = getCountPadBytes(countSelArea8); + // The start of the data area. + const __m128i *inDataArea128 = + reinterpret_cast(inSelArea8 + countSelArea8 + + sizeof(uint8_t) + countPadBytes); + + __m128i *out128 = reinterpret_cast<__m128i *>(out); + + // Iterate over the selectors and unpack the compressed blocks accordingly. + // The last block is always treated specially, since it might not be "full". + for (int m = 0; m < countSels - 1; m++) { + const uint8_t i = extractSel(inSelArea8, m); + const size_t n = tableNum[i]; + decomprCompleteBlock(uint8_t(n), inDataArea128, out128); + } + const uint8_t countQuadsLastBlock = inSelArea8[countSelArea8Used]; + if (countQuadsLastBlock) + decomprIncompleteBlock(countQuadsLastBlock, inDataArea128, out128); + + return reinterpret_cast(inDataArea128); + } + + virtual std::string name() const { + return useRingBuf ? "SIMDGroupSimple_RingBuf" : "SIMDGroupSimple"; + } + }; + + template + const uint8_t SIMDGroupSimple::tableNum[] = { + 32, 16, 10, 8, 6, 5, 4, 3, 2, 1 + }; + template + const uint32_t SIMDGroupSimple::tableMask[] = { + (static_cast(1) << 1) - 1, + (static_cast(1) << 2) - 1, + (static_cast(1) << 3) - 1, + (static_cast(1) << 4) - 1, + (static_cast(1) << 5) - 1, + (static_cast(1) << 6) - 1, + (static_cast(1) << 8) - 1, + (static_cast(1) << 10) - 1, + (static_cast(1) << 16) - 1, + (static_cast(1) << 32) - 1, + }; } // namespace FastPForLib diff --git a/headers/simdnewpfor.h b/headers/simdnewpfor.h index 52ab2a2..4f06cf2 100644 --- a/headers/simdnewpfor.h +++ b/headers/simdnewpfor.h @@ -273,7 +273,7 @@ SIMDNewPFor::decodeArray( in += 4 * b; out += 128; - for (uint32_t e = 0, lpos = -1; e < nExceptions; e++) { + for (uint32_t e = 0, lpos = 0u-1; e < nExceptions; e++) { lpos += exceptions[e] + 1; beginout[lpos] |= (exceptions[e + nExceptions] + 1) << b; } diff --git a/headers/simdvariablebyte.h b/headers/simdvariablebyte.h index 6be0426..b16a1ec 100644 --- a/headers/simdvariablebyte.h +++ b/headers/simdvariablebyte.h @@ -13,6 +13,8 @@ #include "common.h" #include "codecs.h" +namespace FastPForLib { + #ifdef __cplusplus extern "C" { #endif @@ -26,7 +28,7 @@ size_t masked_vbyte_read_loop_fromcompressedsize(const uint8_t *in, /** * SIMD-accelerated version of VariableByteAlt. */ -class MaskedVByte : public FastPForLib::IntegerCODEC { +class MaskedVByte : public IntegerCODEC { public: MaskedVByte() {} @@ -77,7 +79,7 @@ class MaskedVByte : public FastPForLib::IntegerCODEC { ++bout; } } - while (FastPForLib::needPaddingTo32Bits(bout)) { + while (needPaddingTo32Bits(bout)) { *bout++ = 0xFFU; } const size_t storageinbytes = bout - initbout; @@ -89,9 +91,11 @@ class MaskedVByte : public FastPForLib::IntegerCODEC { const uint8_t *inbyte = reinterpret_cast(in); nvalue = masked_vbyte_read_loop_fromcompressedsize(inbyte, out, length * 4); return reinterpret_cast(inbyte); - return in + length; } std::string name() const { return "MaskedVByte"; } }; + +} // namespace FastPForLib + #endif /* SIMDVARIABLEBYTE_H_ */ diff --git a/headers/simple16.h b/headers/simple16.h index 25dceb8..bef1b6d 100644 --- a/headers/simple16.h +++ b/headers/simple16.h @@ -7,6 +7,9 @@ * Fabrizio Silvestri * Rossano Venturini * which was available under the Apache License, Version 2.0. + * + * The Simple16 scheme may overflow the buffer when decoding. + * This is a limitation of the original implementation.. */ #ifndef SIMPLE16_H_ @@ -487,7 +490,7 @@ void Simple16::encodeArray(const uint32_t *in, const size_t length, NumberOfValuesCoded += base; } else if (tryme<1, 3, 4, 4, 3, 3>(in, ValuesRemaining)) { out[0] = 6; - NumberOfValuesCoded = (ValuesRemaining < 1) ? ValuesRemaining : 1; + NumberOfValuesCoded = (ValuesRemaining < 1) ? uint32_t(ValuesRemaining) : 1; for (uint32_t i = 0; i < NumberOfValuesCoded; i++) bit_writer(out, *in++, 3); uint32_t fill = 3 * NumberOfValuesCoded; @@ -593,7 +596,7 @@ void Simple16::encodeArray(const uint32_t *in, const size_t length, assert(which(out) == 12); } else if (tryme<1, 10, 2, 9>(in, ValuesRemaining)) { out[0] = 13; - NumberOfValuesCoded = (ValuesRemaining < 1) ? ValuesRemaining : 1; + NumberOfValuesCoded = (ValuesRemaining < 1) ? uint32_t(ValuesRemaining) : 1; for (uint32_t i = 0; i < NumberOfValuesCoded; i++) bit_writer(out, *in++, 10); const uint32_t base = NumberOfValuesCoded; @@ -745,7 +748,7 @@ const uint32_t *Simple16::decodeArray(const uint32_t *in, printf("simple16 stats[%u]=%f\n", k, stats[k] * 1.0 / sum); } #endif - ASSERT(in <= endin, std::to_string(in - endin)); + ASSERT(len == 0 || in <= endin, std::to_string(in - endin)); return in; } diff --git a/headers/simple8b.h b/headers/simple8b.h index 4e89dad..4e4874c 100644 --- a/headers/simple8b.h +++ b/headers/simple8b.h @@ -410,7 +410,7 @@ void Simple8b::encodeArray(const uint32_t *in, const size_t length, assert(which(out64) == 14); } else if (tryme<1, 60>(in, ValuesRemaining)) { out64[0] = 15; - NumberOfValuesCoded = (ValuesRemaining < 1) ? ValuesRemaining : 1; + NumberOfValuesCoded = (ValuesRemaining < 1) ? uint32_t(ValuesRemaining) : 1; for (uint32_t i = 0; i < NumberOfValuesCoded; i++) bit_writer(out64, *in++, 60); out64[0] <<= 64 - SIMPLE8B_LOGDESC - 60 * NumberOfValuesCoded; @@ -637,9 +637,9 @@ const uint32_t *Simple8b::decodeArray(const uint32_t *in, printf("simple8b stats[%u]=%f\n", k, stats[k] * 1.0 / sum); } #endif - assert(in64 <= finalin64); + assert(len == 0 || in64 <= finalin64); in = reinterpret_cast(in64); - assert(in <= endin); + assert(len == 0 || in <= endin); // check that we don't overrun the buffer too much? ASSERT(out < end + 240, std::to_string(out - end)); nvalue = MarkLength ? actualvalue : out - initout; diff --git a/headers/simple8b_rle.h b/headers/simple8b_rle.h index f7256b2..a541c5a 100644 --- a/headers/simple8b_rle.h +++ b/headers/simple8b_rle.h @@ -267,7 +267,7 @@ template class Simple8b_RLE : public IntegerCODEC { // this may lead to unaligned access. Performance may be affected. // not much of an effect in practice on recent Intel processors. uint64_t *out64 = reinterpret_cast(out); - auto count = Simple8b_Codec::Compress(in, 0, length, out64, 0); + auto count = Simple8b_Codec::Compress(in, 0, uint32_t(length), out64, 0); nvalue = count * 2; } @@ -296,7 +296,7 @@ template class Simple8b_RLE : public IntegerCODEC { uint32_t pos = 0; - pos = Simple8b_Codec::Decompress(in64, 0, out, 0, nvalue); + pos = Simple8b_Codec::Decompress(in64, 0, out, 0, uint32_t(nvalue)); assert(in64 + pos <= finalin64); in = reinterpret_cast(in64 + pos); diff --git a/headers/simple9.h b/headers/simple9.h index ff85fc5..1aa7c35 100644 --- a/headers/simple9.h +++ b/headers/simple9.h @@ -7,6 +7,9 @@ * Fabrizio Silvestri * Rossano Venturini * which was available under the Apache License, Version 2.0. + * + * The Simple9 scheme may overflow the buffer when decoding. + * This is a limitation of the original implementation.. */ #ifndef SIMPLE9_H_ #define SIMPLE9_H_ @@ -34,7 +37,7 @@ class Simple9 : public IntegerCODEC { } template static bool tryme(const uint32_t *n, size_t len) { - const uint32_t min = (len < num1) ? len : num1; + const uint32_t min = (len < num1) ? uint32_t(len) : num1; for (uint32_t i = 0; i < min; i++) { if ((n[i]) >= (1U << log1)) return false; @@ -91,7 +94,7 @@ void Simple9::encodeArray(const uint32_t *in, uint32_t NumberOfValuesCoded; const uint32_t *const initout(out); if (MarkLength) - *(out++) = length; + *(out++) = uint32_t(length); size_t ValuesRemaining(length); // precompute const bool becareful = false; @@ -187,12 +190,12 @@ void Simple9::encodeArray(const uint32_t *in, while (ValuesRemaining > 0) { if (hacked && trymefull<28, 0>(in)) { out[0] = 9 << (32 - SIMPLE9_LOGDESC); - NumberOfValuesCoded = (ValuesRemaining < 28) ? ValuesRemaining : 28; + NumberOfValuesCoded = (ValuesRemaining < 28) ? uint32_t(ValuesRemaining) : 28; in += NumberOfValuesCoded; } else if (tryme<28, 1>(in, ValuesRemaining)) { out[0] = 0; - NumberOfValuesCoded = (ValuesRemaining < 28) ? ValuesRemaining : 28; + NumberOfValuesCoded = (ValuesRemaining < 28) ? uint32_t(ValuesRemaining) : 28; for (uint32_t i = 0; i < NumberOfValuesCoded; i++) bit_writer(out, *in++, 1); *out <<= 28 - NumberOfValuesCoded; @@ -201,7 +204,7 @@ void Simple9::encodeArray(const uint32_t *in, } else if (tryme<14, 2>(in, ValuesRemaining)) { out[0] = 1; - NumberOfValuesCoded = (ValuesRemaining < 14) ? ValuesRemaining : 14; + NumberOfValuesCoded = (ValuesRemaining < 14) ? uint32_t(ValuesRemaining) : 14; for (uint32_t i = 0; i < NumberOfValuesCoded; i++) bit_writer(out, *in++, 2); *out <<= 28 - 2 * NumberOfValuesCoded; @@ -209,7 +212,7 @@ void Simple9::encodeArray(const uint32_t *in, assert(which(out) == 1); } else if (tryme<9, 3>(in, ValuesRemaining)) { out[0] = 2; - NumberOfValuesCoded = (ValuesRemaining < 9) ? ValuesRemaining : 9; + NumberOfValuesCoded = (ValuesRemaining < 9) ? uint32_t(ValuesRemaining) : 9; for (uint32_t i = 0; i < NumberOfValuesCoded; i++) bit_writer(out, *in++, 3); *out <<= 28 - 3 * NumberOfValuesCoded; @@ -218,7 +221,7 @@ void Simple9::encodeArray(const uint32_t *in, } else if (tryme<7, 4>(in, ValuesRemaining)) { out[0] = 3; - NumberOfValuesCoded = (ValuesRemaining < 7) ? ValuesRemaining : 7; + NumberOfValuesCoded = (ValuesRemaining < 7) ? uint32_t(ValuesRemaining) : 7; for (uint32_t i = 0; i < NumberOfValuesCoded; i++) bit_writer(out, *in++, 4); *out <<= 28 - 4 * NumberOfValuesCoded; @@ -227,7 +230,7 @@ void Simple9::encodeArray(const uint32_t *in, } else if (tryme<5, 5>(in, ValuesRemaining)) { out[0] = 4; - NumberOfValuesCoded = (ValuesRemaining < 5) ? ValuesRemaining : 5; + NumberOfValuesCoded = (ValuesRemaining < 5) ? uint32_t(ValuesRemaining) : 5; for (uint32_t i = 0; i < NumberOfValuesCoded; i++) bit_writer(out, *in++, 5); *out <<= 28 - 5 * NumberOfValuesCoded; @@ -235,7 +238,7 @@ void Simple9::encodeArray(const uint32_t *in, assert(which(out) == 4); } else if (tryme<4, 7>(in, ValuesRemaining)) { out[0] = 5; - NumberOfValuesCoded = (ValuesRemaining < 4) ? ValuesRemaining : 4; + NumberOfValuesCoded = (ValuesRemaining < 4) ? uint32_t(ValuesRemaining) : 4; for (uint32_t i = 0; i < NumberOfValuesCoded; i++) bit_writer(out, *in++, 7); @@ -244,7 +247,7 @@ void Simple9::encodeArray(const uint32_t *in, assert(which(out) == 5); } else if (tryme<3, 9>(in, ValuesRemaining)) { out[0] = 6; - NumberOfValuesCoded = (ValuesRemaining < 3) ? ValuesRemaining : 3; + NumberOfValuesCoded = (ValuesRemaining < 3) ? uint32_t(ValuesRemaining) : 3; for (uint32_t i = 0; i < NumberOfValuesCoded; i++) bit_writer(out, *in++, 9); *out <<= 28 - 9 * NumberOfValuesCoded; @@ -252,7 +255,7 @@ void Simple9::encodeArray(const uint32_t *in, assert(which(out) == 6); } else if (tryme<2, 14>(in, ValuesRemaining)) { out[0] = 7; - NumberOfValuesCoded = (ValuesRemaining < 2) ? ValuesRemaining : 2; + NumberOfValuesCoded = (ValuesRemaining < 2) ? uint32_t(ValuesRemaining) : 2; for (uint32_t i = 0; i < NumberOfValuesCoded; i++) bit_writer(out, *in++, 14); *out <<= 28 - 14 * NumberOfValuesCoded; @@ -292,7 +295,7 @@ Simple9::decodeArray(const uint32_t *in, const size_t /* len if (MarkLength) if ((*in) > nvalue) throw NotEnoughStorage(*in); - const uint32_t actualvalue = MarkLength ? *(in++) : nvalue; + const uint32_t actualvalue = MarkLength ? *(in++) : (uint32_t)nvalue; if (nvalue < actualvalue) fprintf(stderr, "possible overrun\n"); nvalue = actualvalue; diff --git a/headers/simple9_rle.h b/headers/simple9_rle.h index 4655fd3..7e09f95 100644 --- a/headers/simple9_rle.h +++ b/headers/simple9_rle.h @@ -272,7 +272,7 @@ template class Simple9_RLE : public IntegerCODEC { if (MarkLength) { *out++ = static_cast(length); } - auto count = Simple9_Codec::Compress(input, 0, length, out, 0); + auto count = Simple9_Codec::Compress(input, 0, uint32_t(length), out, 0); nvalue = count; } @@ -289,7 +289,7 @@ template class Simple9_RLE : public IntegerCODEC { fprintf(stderr, "possible overrun\n"); } auto count = actualvalue; - Simple9_Codec::Decompress(input, 0, out, 0, count); + Simple9_Codec::Decompress(input, 0, out, 0, uint32_t(count)); nvalue = MarkLength ? actualvalue : count; input += count; return input; diff --git a/headers/streamvariablebyte.h b/headers/streamvariablebyte.h index dd1cbd1..c486fbf 100644 --- a/headers/streamvariablebyte.h +++ b/headers/streamvariablebyte.h @@ -18,7 +18,7 @@ namespace FastPForLib { */ extern "C" { -uint64_t svb_encode(uint8_t *out, const uint32_t *in, uint32_t count, int delta, +size_t svb_encode(uint8_t *out, const uint32_t *in, uint32_t count, int delta, int type); uint8_t *svb_decode_avx_simple(uint32_t *out, uint8_t *keyPtr, uint8_t *dataPtr, uint64_t count); @@ -46,11 +46,13 @@ class StreamVByte : public IntegerCODEC { public: void encodeArray(const uint32_t *in, const size_t count, uint32_t *out, size_t &nvalue) { - uint64_t bytesWritten = svb_encode( + size_t bytesWritten = svb_encode( (uint8_t *)out, in, static_cast(std::min( count, std::numeric_limits::max())), 0, 1); nvalue = static_cast(bytesWritten + 3) / 4; + for (size_t i = bytesWritten; i < nvalue * 4; ++i) + reinterpret_cast(out)[i] = 0; } const uint32_t *decodeArray(const uint32_t *in, const size_t /* count */, diff --git a/headers/synthetic.h b/headers/synthetic.h index 9dd9043..5c33286 100644 --- a/headers/synthetic.h +++ b/headers/synthetic.h @@ -171,10 +171,10 @@ class ZipfianGenerator { init(_items, _zipfianconstant); } - double zeta(int n, double theta) { + double zeta(int nn, double ttheta) { double sum = 0; - for (long i = 0; i < n; i++) { - sum += 1.0 / (pow(static_cast(i + 1), theta)); + for (long i = 0; i < nn; i++) { + sum += 1.0 / (pow(static_cast(i + 1), ttheta)); } return sum; } diff --git a/headers/util.h b/headers/util.h index d783283..0ea3699 100644 --- a/headers/util.h +++ b/headers/util.h @@ -122,6 +122,8 @@ __attribute__((const)) inline uint32_t gccbits(const uint64_t v) { return static_cast(index + 32 + 1); } #endif +#elif defined(__aarch64__) + return 64 - __builtin_clzll(v); #else uint32_t answer; __asm__("bsr %1, %0;" : "=r"(answer) : "r"(v)); @@ -173,6 +175,8 @@ inline void checkifdivisibleby(size_t a, uint32_t x) { __attribute__((const)) inline uint32_t asmbits(const uint32_t v) { #ifdef _MSC_VER return gccbits(v); +#elif defined(__aarch64__) + return gccbits(v); #else if (v == 0) return 0; @@ -185,6 +189,8 @@ __attribute__((const)) inline uint32_t asmbits(const uint32_t v) { __attribute__((const)) inline uint32_t asmbits(const uint64_t v) { #ifdef _MSC_VER return gccbits(v); +#elif defined(__aarch64__) + return gccbits(v); #else if (v == 0) return 0; uint64_t answer; diff --git a/headers/vsencoding.h b/headers/vsencoding.h index 0fc1aad..db7f100 100644 --- a/headers/vsencoding.h +++ b/headers/vsencoding.h @@ -324,7 +324,7 @@ class VSEncodingBlocks : public IntegerCODEC { // VSEncodingBlocks::TAIL_MERGIN]; }; -#ifdef _MSC_VER +#if defined(_MSC_VER) || (defined(__GNUC__) && defined(__aarch64__)) inline void __vseblocks_copy16(const uint32_t *src, uint32_t *dest) { memcpy(dest, src, 16 * sizeof(uint32_t)); } diff --git a/python_bindings/MANIFEST.in b/python_bindings/MANIFEST.in index a940e84..1cd069a 100644 --- a/python_bindings/MANIFEST.in +++ b/python_bindings/MANIFEST.in @@ -1,9 +1,8 @@ include pyfastpfor.cc -recursive-include ./fastpfor/src *.cpp *.c *.cc +include pyproject.toml +recursive-include ./fastpfor/src *.cpp *.c recursive-include ./fastpfor/headers *.h -include README.md include requirements.txt -include LICENSE global-exclude fastpfor/src/benchbitpacking.cpp global-exclude fastpfor/src/csv2maropu.cpp diff --git a/python_bindings/pyproject.toml b/python_bindings/pyproject.toml new file mode 100644 index 0000000..50f6ddf --- /dev/null +++ b/python_bindings/pyproject.toml @@ -0,0 +1,64 @@ +[build-system] +requires = ["setuptools>=77.0.0", "wheel", "pybind11>=2.4", "numpy"] +build-backend = "setuptools.build_meta" + +[project] +name = "pyfastpfor" +version = "1.5.0" +description = "Python bindings for the FastPFor library (fast integer compression)" +readme = {text = "Python bindings for FastPFor: a research library with fast integer-compression schemes for arrays of 32-bit integers. It exploits SIMD instructions (SSE on x86, NEON on ARM) and can decode billions of integers per second.", content-type = "text/markdown"} +requires-python = ">=3.9" +license = "Apache-2.0" +authors = [ + {name = "Daniel Lemire", email = "daniel@lemire.me"}, + {name = "Leonid Boytsov"}, +] +keywords = ["integer compression", "simd", "fastpfor", "bitpacking", "varint", "performance"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Programming Language :: C++", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Scientific/Engineering", + "Topic :: Software Development :: Libraries", +] +dependencies = ["numpy"] + +[project.urls] +Homepage = "https://github.com/searchivarius/PyFastPFor" +Repository = "https://github.com/searchivarius/PyFastPFor" +Issues = "https://github.com/searchivarius/PyFastPFor/issues" + +[tool.setuptools] +# pyfastpfor is a single C++ extension module with no pure-Python packages, so +# disable setuptools' automatic package discovery (which would otherwise pick up +# the `fastpfor` source symlink). +packages = [] +py-modules = [] + +[tool.cibuildwheel] +# Skip PyPy (the pybind11 extension targets CPython) and the slow/rare 32-bit +# and musllinux targets to keep the wheel matrix lean. +skip = ["pp*", "*-musllinux*", "*_i686", "*-win32"] +build-frontend = "build" +# Published wheels must run on any CPU of the target architecture, so build with +# a portable SIMD baseline rather than -march=native (see setup.py:simd_flags). +environment = { PYFASTPFOR_PORTABLE = "1" } +# Smoke-test every built wheel: import it and exercise the codec factory. +test-command = 'python -c "import pyfastpfor; assert pyfastpfor.getCodecList(); print(pyfastpfor.__version__, len(pyfastpfor.getCodecList()), \"codecs\")"' + +[tool.cibuildwheel.linux] +archs = ["native"] + +[tool.cibuildwheel.macos] +archs = ["native"] + +[tool.cibuildwheel.windows] +archs = ["AMD64"] diff --git a/python_bindings/setup.py b/python_bindings/setup.py index e78b58a..9e46eda 100755 --- a/python_bindings/setup.py +++ b/python_bindings/setup.py @@ -1,10 +1,12 @@ import os +import platform from setuptools import setup, Extension from setuptools.command.build_ext import build_ext import sys import setuptools -__version__ = '1.4.0' +# Package metadata (name, version, dependencies, ...) lives in pyproject.toml. +# This file only declares the extension module and its custom build logic. maindir = os.path.join(".", "fastpfor") library_file = os.path.join(maindir, "libFastPFor.a") @@ -13,8 +15,6 @@ libraries = [] extra_objects = [] -requirements_list = ['pybind11>=2.4', 'numpy'] - if os.path.exists(library_file): # if we have a prebuilt library file, use that. extra_objects.append(library_file) @@ -55,6 +55,32 @@ def has_flag(compiler, flagname): return True +def simd_flags(compiler): + """Return the SIMD/architecture compile flags. + + FastPFor's SIMD code requires SSE4.2 on x86 (provided natively) and NEON on + ARM (provided through the fastpfor_neon.h shim, and part of the ARMv8-A + baseline, so no special flag is needed). + + By default we use ``-march=native`` for the best performance, which is the + right choice for a source install built on the machine that runs it. For + redistributable wheels this is unsafe (the build machine may support + instructions the user's CPU lacks), so set ``PYFASTPFOR_PORTABLE=1`` to use + a portable baseline instead; the CI wheel builds do exactly that. + """ + portable = os.environ.get('PYFASTPFOR_PORTABLE', '') not in ('', '0', 'false', 'False') + machine = platform.machine().lower() + is_x86 = machine in ('x86_64', 'amd64', 'x86', 'i386', 'i686') + + if not portable and has_flag(compiler, '-march=native'): + return ['-march=native'] + if is_x86 and has_flag(compiler, '-msse4.2'): + # Portable x86 baseline: SSE4.2 is the minimum FastPFor requires. + return ['-msse4.2'] + # On ARM/aarch64 NEON is part of the baseline, so no extra flag is needed. + return [] + + def cpp_flag(compiler): """Return the -std=c++[11/14] compiler flag. @@ -73,10 +99,14 @@ def cpp_flag(compiler): class BuildExt(build_ext): """A custom build extension for adding compiler-specific options.""" + # Note: language-specific standard flags (-std=c++11 / -std=c99) are NOT + # listed here. They are applied per source file in _compile_with_std below, + # because this extension mixes C and C++ sources and a C++ standard flag is + # rejected by the compiler on C sources (and vice versa). c_opts = { 'msvc': ['/EHsc', '/openmp', '/O2'], - 'unix': ['-O3', '-march=native', '-std=c99'], - #'unix': ['-O0', '-march=native', '-g'], + 'unix': ['-O3'], + #'unix': ['-O0', '-g'], } link_opts = { 'unix': [], @@ -84,18 +114,18 @@ class BuildExt(build_ext): } if sys.platform == 'darwin': - c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] - link_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] + c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.9'] + link_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.9'] else: c_opts['unix'].append("-fopenmp") link_opts['unix'].extend(['-fopenmp', '-pthread']) def build_extensions(self): ct = self.compiler.compiler_type - opts = self.c_opts.get(ct, []) + opts = list(self.c_opts.get(ct, [])) if ct == 'unix': opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) - opts.append(cpp_flag(self.compiler)) + opts.extend(simd_flags(self.compiler)) if has_flag(self.compiler, '-fvisibility=hidden'): opts.append('-fvisibility=hidden') elif ct == 'msvc': @@ -117,20 +147,38 @@ def build_extensions(self): np.get_include() ]) + if ct == 'unix': + self._patch_compiler_for_mixed_languages() + build_ext.build_extensions(self) + def _patch_compiler_for_mixed_languages(self): + """Apply the right -std flag to each source based on its language. + + distutils applies one set of compile args to every source in an + extension, but here C++ sources (.cc/.cpp) need -std=c++11 while C + sources (.c) need -std=c99. We wrap the compiler's _compile method to + add the appropriate standard flag (and drop C++-only flags on C). + """ + compiler = self.compiler + original_compile = compiler._compile + cxx_std = cpp_flag(compiler) + cxx_only = ('-stdlib=libc++',) + + def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + postargs = list(extra_postargs) + if src.endswith(('.cpp', '.cxx', '.cc', '.c++')): + postargs.append(cxx_std) + elif src.endswith('.c'): + postargs = [a for a in postargs if a not in cxx_only] + postargs.append('-std=c99') + return original_compile(obj, src, ext, cc_args, postargs, pp_opts) + + compiler._compile = _compile + setup( - name='pyfastpfor', - version=__version__, - description='Python bindings for the FastPFor library (fast integer compression)', - author='Lemire et al. for FastPFor', - url='https://github.com/searchivarius/PyFastPFor', - long_description="""Pythong bindings for FastPFor: A research library with integer compression schemes. FastPFor is broadly applicable to the compression of arrays of 32-bit integers where most integers are small. The library seeks to exploit SIMD instructions (SSE) whenever possible. This library can decode at least 4 billions of compressed integers per second on most desktop or laptop processors. That is, it can decompress data at a rate of 15 GB/s. This is significantly faster than generic codecs like gzip, LZO, Snappy or LZ4.""", ext_modules=ext_modules, - install_requires=requirements_list, - setup_requires=requirements_list, cmdclass={'build_ext': BuildExt}, - test_suite="tests", zip_safe=False, ) diff --git a/src/benchbitpacking.cpp b/src/benchbitpacking.cpp index 88de06f..2338cc3 100644 --- a/src/benchbitpacking.cpp +++ b/src/benchbitpacking.cpp @@ -271,7 +271,6 @@ void simplebenchmark(uint32_t N = 1U << 16, uint32_t T = 1U << 9) { } } -/* int main() { cout << "# cache-to-cache" << endl; simplebenchmark(1U << 16, 1U << 9); @@ -281,4 +280,3 @@ int main() { return 0; } -*/ diff --git a/src/codecfactory.cpp b/src/codecfactory.cpp index 812659f..e94a0e0 100644 --- a/src/codecfactory.cpp +++ b/src/codecfactory.cpp @@ -29,7 +29,7 @@ #include "simdgroupsimple.h" namespace FastPForLib { -std::vector> CODECFactory::allSchemes() { +std::vector> CODECFactory::allSchemes() const { std::vector> ans; for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { ans.push_back(i->second); @@ -37,7 +37,7 @@ std::vector> CODECFactory::allSchemes() { return ans; } -std::vector CODECFactory::allNames() { +std::vector CODECFactory::allNames() const { std::vector ans; for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { ans.push_back(i->first); @@ -45,80 +45,173 @@ std::vector CODECFactory::allNames() { return ans; } -std::shared_ptr &CODECFactory::getFromName(std::string name) { - if (scodecmap.find(name) == scodecmap.end()) { - fprintf(stderr, "name %s does not refer to a CODEC.\n" - "possible choices:\n", name.c_str()); - for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { - fprintf(stderr, "%s\n", i->first.c_str()); - } - fprintf(stderr, "for now, I'm going to just return 'copy'\n"); - return scodecmap["copy"]; +std::shared_ptr const& CODECFactory::getFromName(std::string name) const { + auto it = scodecmap.find(name); + if (it != scodecmap.end()) + return it->second; + + fprintf(stderr, "name %s does not refer to a CODEC.\n" + "possible choices:\n", name.c_str()); + for (auto i = scodecmap.begin(); i != scodecmap.end(); ++i) { + fprintf(stderr, "%s\n", i->first.c_str()); } - return scodecmap[name]; + fprintf(stderr, "for now, I'm going to just return 'copy'\n"); + return scodecmap.at("copy"); +} + +// std::make_unique equivalent +template +static std::unique_ptr make_unique(Args&&... args) { + return std::unique_ptr(new T(std::forward(args)...)); +} + +std::unique_ptr fastbinarypacking8_codec() { + return make_unique, VariableByte>>(); +} +std::unique_ptr fastbinarypacking16_codec() { + return make_unique, VariableByte>>(); +} +std::unique_ptr fastbinarypacking32_codec() { + return make_unique, VariableByte>>(); +} +std::unique_ptr BP32_codec() { + return make_unique>(); +} +std::unique_ptr vsencoding_codec() { + return make_unique(1U << 16); +} +std::unique_ptr fastpfor128_codec() { + return make_unique, VariableByte>>(); +} +std::unique_ptr fastpfor256_codec() { + return make_unique, VariableByte>>(); +} +std::unique_ptr simdfastpfor128_codec() { + return make_unique, VariableByte>>(); +} +std::unique_ptr simdfastpfor256_codec() { + return make_unique, VariableByte>>(); +} +std::unique_ptr simplepfor_codec() { + return make_unique, VariableByte>>(); +} +std::unique_ptr simdsimplepfor_codec() { + return make_unique, VariableByte>>(); +} +std::unique_ptr pfor_codec() { + return make_unique>(); +} +std::unique_ptr simdpfor_codec() { + return make_unique>(); +} +std::unique_ptr pfor2008_codec() { + return make_unique>(); +} +std::unique_ptr simdnewpfor_codec() { + return make_unique>, VariableByte>>(); +} +std::unique_ptr newpfor_codec() { + return make_unique>, VariableByte>>(); +} +std::unique_ptr optpfor_codec() { + return make_unique>, VariableByte>>(); +} +std::unique_ptr simdoptpfor_codec() { + return make_unique>, VariableByte>>(); +} +std::unique_ptr varint_codec() { + return make_unique(); +} +std::unique_ptr vbyte_codec() { + return make_unique(); +} +std::unique_ptr maskedvbyte_codec() { + return make_unique(); +} +std::unique_ptr streamvbyte_codec() { + return make_unique(); +} +std::unique_ptr varintgb_codec() { + return make_unique>(); +} +std::unique_ptr simple16_codec() { + return make_unique>(); +} +std::unique_ptr simple9_codec() { + return make_unique>(); +} +std::unique_ptr simple9_rle_codec() { + return make_unique>(); +} +std::unique_ptr simple8b_codec() { + return make_unique>(); +} +std::unique_ptr simple8b_rle_codec() { + return make_unique>(); +} +#ifdef VARINTG8IU_H__ +std::unique_ptr varintg8iu_codec() { + return make_unique(); +} +#endif +#ifdef USESNAPPY +std::unique_ptr snappy_codec() { + return make_unique(); +} +#endif +std::unique_ptr simdbinarypacking_codec() { + return make_unique>(); +} +std::unique_ptr simdgroupsimple_codec() { + return make_unique, VariableByte>>(); +} +std::unique_ptr simdgroupsimple_ringbuf_codec() { + return make_unique, VariableByte>>(); +} +std::unique_ptr copy_codec() { + return make_unique(); } static CodecMap initializefactory() { CodecMap map; - map["fastbinarypacking8"] = std::shared_ptr( - new CompositeCodec, VariableByte>); - map["fastbinarypacking16"] = std::shared_ptr( - new CompositeCodec, VariableByte>); - map["fastbinarypacking32"] = std::shared_ptr( - new CompositeCodec, VariableByte>); - map["BP32"] = - std::shared_ptr(new CompositeCodec); - map["vsencoding"] = - std::shared_ptr(new vsencoding::VSEncodingBlocks(1U << 16)); - map["fastpfor128"] = std::shared_ptr( - new CompositeCodec, VariableByte>()); - map["fastpfor256"] = std::shared_ptr( - new CompositeCodec, VariableByte>()); - map["simdfastpfor128"] = std::shared_ptr( - new CompositeCodec, VariableByte>()); - map["simdfastpfor256"] = std::shared_ptr( - new CompositeCodec, VariableByte>()); - map["simplepfor"] = std::shared_ptr( - new CompositeCodec, VariableByte>()); - map["simdsimplepfor"] = std::shared_ptr( - new CompositeCodec, VariableByte>()); - map["pfor"] = - std::shared_ptr(new CompositeCodec()); - map["simdpfor"] = std::shared_ptr( - new CompositeCodec()); - map["pfor2008"] = std::shared_ptr( - new CompositeCodec()); - map["simdnewpfor"] = std::shared_ptr( - new CompositeCodec>, VariableByte>()); - map["newpfor"] = std::shared_ptr( - new CompositeCodec>, VariableByte>()); - map["optpfor"] = std::shared_ptr( - new CompositeCodec>, VariableByte>()); - map["simdoptpfor"] = std::shared_ptr( - new CompositeCodec>, VariableByte>()); - map["varint"] = std::shared_ptr(new VariableByte()); - map["vbyte"] = std::shared_ptr(new VByte()); - map["maskedvbyte"] = std::shared_ptr(new MaskedVByte()); - map["streamvbyte"] = std::shared_ptr(new StreamVByte()); - map["varintgb"] = std::shared_ptr(new VarIntGB<>()); - map["simple16"] = std::shared_ptr(new Simple16()); - map["simple9"] = std::shared_ptr(new Simple9()); - map["simple9_rle"] = std::shared_ptr(new Simple9_RLE()); - map["simple8b"] = std::shared_ptr(new Simple8b()); - map["simple8b_rle"] = std::shared_ptr(new Simple8b_RLE()); + map["fastbinarypacking8"] = fastbinarypacking8_codec(); + map["fastbinarypacking16"] = fastbinarypacking16_codec(); + map["fastbinarypacking32"] = fastbinarypacking32_codec(); + map["BP32"] = BP32_codec(); + map["vsencoding"] = vsencoding_codec(); + map["fastpfor128"] = fastpfor128_codec(); + map["fastpfor256"] = fastpfor256_codec(); + map["simdfastpfor128"] = simdfastpfor128_codec(); + map["simdfastpfor256"] = simdfastpfor256_codec(); + map["simplepfor"] = simplepfor_codec(); + map["simdsimplepfor"] = simdsimplepfor_codec(); + map["pfor"] = pfor_codec(); + map["simdpfor"] = simdpfor_codec(); + map["pfor2008"] = pfor2008_codec(); + map["simdnewpfor"] = simdnewpfor_codec(); + map["newpfor"] = newpfor_codec(); + map["optpfor"] = optpfor_codec(); + map["simdoptpfor"] = simdoptpfor_codec(); + map["varint"] = varint_codec(); + map["vbyte"] = vbyte_codec(); + map["maskedvbyte"] = maskedvbyte_codec(); + map["streamvbyte"] = streamvbyte_codec(); + map["varintgb"] = varintgb_codec(); + map["simple16"] = simple16_codec(); + map["simple9"] = simple9_codec(); + map["simple9_rle"] = simple9_rle_codec(); + map["simple8b"] = simple8b_codec(); + map["simple8b_rle"] = simple8b_rle_codec(); #ifdef VARINTG8IU_H__ - map["varintg8iu"] = std::shared_ptr(new VarIntG8IU()); + map["varintg8iu"] = varintg8iu_codec(); #endif #ifdef USESNAPPY - map["snappy"] = std::shared_ptr(new JustSnappy()); + map["snappy"] = snappy_codec(); #endif - map["simdbinarypacking"] = std::shared_ptr( - new CompositeCodec()); - map["simdgroupsimple"] = std::shared_ptr( - new CompositeCodec, VariableByte>()); - map["simdgroupsimple_ringbuf"] = std::shared_ptr( - new CompositeCodec, VariableByte>()); - map["copy"] = std::shared_ptr(new JustCopy()); + map["simdbinarypacking"] = simdbinarypacking_codec(); + map["simdgroupsimple"] = simdgroupsimple_codec(); + map["simdgroupsimple_ringbuf"] = simdgroupsimple_ringbuf_codec(); + map["copy"] = copy_codec(); return map; } diff --git a/src/codecs.cpp b/src/codecs.cpp index ebc7c3e..4454c76 100644 --- a/src/codecs.cpp +++ b/src/codecs.cpp @@ -154,8 +154,8 @@ int main(int argc, char **argv) { const uint32_t N = 4194304 * 16; vector> datas; cout << "# zipfian 1 data generation..." << endl; - for (uint32_t k = 0; k < (1U << 1); ++k) - datas.push_back(generateZipfianArray32(N, 1.0, 1U << 20)); + for (uint32_t k = 0; k < (1ULL << 1); ++k) + datas.push_back(generateZipfianArray32(N, 1.0, 1ULL << 20)); if (splitlongarrays) splitLongArrays(datas); processparameters pp(false, fulldisplay, displayhistogram, @@ -166,9 +166,9 @@ int main(int argc, char **argv) { } else if (strcmp(parameter, "zipfian2") == 0) { const uint32_t N = 4194304 * 16; vector> datas; - for (uint32_t k = 0; k < (1U << 1); ++k) + for (uint32_t k = 0; k < (1ULL << 1); ++k) cout << "# zipfian 2 data generation..." << endl; - datas.push_back(generateZipfianArray32(N, 2.0, 1U << 20)); + datas.push_back(generateZipfianArray32(N, 2.0, 1ULL << 20)); if (splitlongarrays) splitLongArrays(datas); processparameters pp(false, fulldisplay, displayhistogram, @@ -180,8 +180,8 @@ int main(int argc, char **argv) { cout << "# dense uniform data generation..." << endl; vector> datas; UniformDataGenerator clu; - for (uint32_t k = 0; k < (1U << 5); ++k) - datas.push_back(clu.generateUniform((1U << 18), 1U << 27)); + for (uint32_t k = 0; k < (1ULL << 5); ++k) + datas.push_back(clu.generateUniform((1ULL << 18), 1ULL << 27)); cout << "# generated " << datas.size() << " arrays" << endl; if (splitlongarrays) splitLongArrays(datas); @@ -194,9 +194,9 @@ int main(int argc, char **argv) { cout << "# sparse uniform data generation..." << endl; vector> datas; UniformDataGenerator clu; - for (uint32_t k = 0; k < (1U << 14); - ++k) // by original paper should be 1U<<19 - datas.push_back(clu.generateUniform((1U << 9), 1U << 27)); + for (uint32_t k = 0; k < (1ULL << 14); + ++k) // by original paper should be 1ULL<<19 + datas.push_back(clu.generateUniform((1ULL << 9), 1ULL << 27)); cout << "# generated " << datas.size() << " arrays" << endl; if (splitlongarrays) splitLongArrays(datas); @@ -209,9 +209,9 @@ int main(int argc, char **argv) { cout << "# dense cluster data generation..." << endl; vector> datas; ClusteredDataGenerator clu; - for (uint32_t k = 0; k < (1U << 5); - ++k) // by original paper should be 1U<<10 - datas.push_back(clu.generateClustered((1U << 18), 1U << 27)); + for (uint32_t k = 0; k < (1ULL << 5); + ++k) // by original paper should be 1ULL<<10 + datas.push_back(clu.generateClustered((1ULL << 18), 1ULL << 27)); cout << "# generated " << datas.size() << " arrays" << endl; if (splitlongarrays) splitLongArrays(datas); @@ -224,9 +224,9 @@ int main(int argc, char **argv) { cout << "# sparse cluster data generation..." << endl; vector> datas; ClusteredDataGenerator clu; - for (uint32_t k = 0; k < (1U << 14); - ++k) // by original paper should be 1U<<19 - datas.push_back(clu.generateClustered((1U << 9), 1U << 27)); + for (uint32_t k = 0; k < (1ULL << 14); + ++k) // by original paper should be 1ULL<<19 + datas.push_back(clu.generateClustered((1ULL << 9), 1ULL << 27)); cout << "# generated " << datas.size() << " arrays" << endl; if (splitlongarrays) splitLongArrays(datas); @@ -239,9 +239,9 @@ int main(int argc, char **argv) { cout << "# dense uniform data generation..." << endl; vector> datas; UniformDataGenerator clu; - for (uint32_t k = 0; k < (1U << 3); - ++k) // by original paper should be 1U<<10 - datas.push_back(clu.generateUniform((1U << 22), 1U << 29)); + for (uint32_t k = 0; k < (1ULL << 3); + ++k) // by original paper should be 1ULL<<10 + datas.push_back(clu.generateUniform((1ULL << 22), 1ULL << 29)); cout << "# generated " << datas.size() << " arrays" << endl; if (splitlongarrays) splitLongArrays(datas); @@ -254,8 +254,8 @@ int main(int argc, char **argv) { cout << "# sparse uniform data generation..." << endl; vector> datas; UniformDataGenerator clu; - for (uint32_t k = 0; k < (1U << 13); ++k) - datas.push_back(clu.generateUniform((1U << 12), 1U << 29)); + for (uint32_t k = 0; k < (1ULL << 13); ++k) + datas.push_back(clu.generateUniform((1ULL << 12), 1ULL << 29)); cout << "# generated " << datas.size() << " arrays" << endl; if (splitlongarrays) splitLongArrays(datas); @@ -269,7 +269,7 @@ int main(int argc, char **argv) { vector> datas; ClusteredDataGenerator clu; for (uint32_t k = 0; k < 1; ++k) - datas.push_back(clu.generateClustered((1U << 23), 1U << 26)); + datas.push_back(clu.generateClustered((1ULL << 23), 1ULL << 26)); cout << "# generated " << datas.size() << " arrays" << endl; if (splitlongarrays) splitLongArrays(datas); @@ -282,8 +282,8 @@ int main(int argc, char **argv) { cout << "# sparse cluster data generation..." << endl; vector> datas; ClusteredDataGenerator clu; - for (uint32_t k = 0; k < (1U << 13); ++k) - datas.push_back(clu.generateClustered((1U << 12), 1U << 26)); + for (uint32_t k = 0; k < (1ULL << 13); ++k) + datas.push_back(clu.generateClustered((1ULL << 12), 1ULL << 26)); cout << "# generated " << datas.size() << " arrays" << endl; if (splitlongarrays) splitLongArrays(datas); @@ -297,10 +297,10 @@ int main(int argc, char **argv) { ClusteredDataGenerator clu; for (uint32_t K = 15; K <= 25; K += 5) { vector> datas; - for (uint32_t k = 0; k < (1U << (25 - K)); ++k) - datas.push_back(clu.generateClustered((1U << K), 1U << 29)); + for (uint32_t k = 0; k < (1ULL << (25 - K)); ++k) + datas.push_back(clu.generateClustered((1ULL << K), 1ULL << 29)); cout << "# generated " << datas.size() << " arrays" << endl; - cout << "# their size is " << (1U << K) << endl; + cout << "# their size is " << (1ULL << K) << endl; const uint32_t p = 29 - K; if (splitlongarrays) splitLongArrays(datas); @@ -315,10 +315,10 @@ int main(int argc, char **argv) { UniformDataGenerator clu; for (uint32_t K = 15; K <= 25; K += 5) { vector> datas; - for (uint32_t k = 0; k < (1U << (25 - K)); ++k) - datas.push_back(clu.generateUniform((1U << K), 1U << 29)); + for (uint32_t k = 0; k < (1ULL << (25 - K)); ++k) + datas.push_back(clu.generateUniform((1ULL << K), 1ULL << 29)); cout << "# generated " << datas.size() << " arrays" << endl; - cout << "# their size is " << (1U << K) << endl; + cout << "# their size is " << (1ULL << K) << endl; const uint32_t p = 29 - K; if (splitlongarrays) splitLongArrays(datas); @@ -333,10 +333,10 @@ int main(int argc, char **argv) { ClusteredDataGenerator clu; for (uint32_t K = 15; K <= 20; K += 5) { vector> datas; - for (size_t k = 0; k < static_cast(1U << (20 - K)); ++k) - datas.push_back(clu.generateClustered((1U << K), 1U << 29)); + for (size_t k = 0; k < static_cast(1ULL << (20 - K)); ++k) + datas.push_back(clu.generateClustered((1ULL << K), 1ULL << 29)); cout << "# generated " << datas.size() << " arrays" << endl; - cout << "# their size is " << (1U << K) << endl; + cout << "# their size is " << (1ULL << K) << endl; const uint32_t p = 29 - K; if (splitlongarrays) splitLongArrays(datas); @@ -351,10 +351,10 @@ int main(int argc, char **argv) { UniformDataGenerator clu; for (uint32_t K = 15; K <= 20; K += 5) { vector> datas; - for (size_t k = 0; k < static_cast(1U << (20 - K)); ++k) - datas.push_back(clu.generateUniform((1U << K), 1U << 29)); + for (size_t k = 0; k < static_cast(1ULL << (20 - K)); ++k) + datas.push_back(clu.generateUniform((1ULL << K), 1ULL << 29)); cout << "# generated " << datas.size() << " arrays" << endl; - cout << "# their size is " << (1U << K) << endl; + cout << "# their size is " << (1ULL << K) << endl; const uint32_t p = 29 - K; if (splitlongarrays) splitLongArrays(datas); @@ -369,12 +369,12 @@ int main(int argc, char **argv) { ClusteredDataGenerator clu; for (uint32_t K = 15; K <= 25; K += 5) { vector> datas; - for (size_t k = 0; k < static_cast(1U << (25 - K)); ++k) + for (size_t k = 0; k < static_cast(1ULL << (25 - K)); ++k) datas.push_back( - diffs(clu.generateClustered((1U << K), 1U << 29), false)); + diffs(clu.generateClustered((1ULL << K), 1ULL << 29), false)); cout << "# generated " << datas.size() << " arrays and applied delta coding" << endl; - cout << "# their size is " << (1U << K) << endl; + cout << "# their size is " << (1ULL << K) << endl; const uint32_t p = 29 - K; if (splitlongarrays) splitLongArrays(datas); @@ -389,12 +389,12 @@ int main(int argc, char **argv) { UniformDataGenerator clu; for (uint32_t K = 15; K <= 25; K += 5) { vector> datas; - for (size_t k = 0; k < static_cast(1U << (25 - K)); ++k) + for (size_t k = 0; k < static_cast(1ULL << (25 - K)); ++k) datas.push_back( - diffs(clu.generateUniform((1U << K), 1U << 29), false)); + diffs(clu.generateUniform((1ULL << K), 1ULL << 29), false)); cout << "# generated " << datas.size() << " arrays and applied delta coding" << endl; - cout << "# their size is " << (1U << K) << endl; + cout << "# their size is " << (1ULL << K) << endl; const uint32_t p = 29 - K; if (splitlongarrays) splitLongArrays(datas); diff --git a/src/horizontalbitpacking.cpp b/src/horizontalbitpacking.cpp index 086aa8a..6537f54 100644 --- a/src/horizontalbitpacking.cpp +++ b/src/horizontalbitpacking.cpp @@ -31,18 +31,18 @@ static void SIMD_nullunpacker32(const uint8_t *, uint32_t *out) { static void simdhunpack1(const uint8_t * in,uint32_t * out) { __m128i* pCurr = reinterpret_cast<__m128i*>(out); const static __m128i andmask = _mm_set_epi8 -(-128,1<<6,1<<5,1<<4,1<<3,1<<2,1<<1,1<<0,-128,1<<6,1<<5,1<<4,1<<3,1<<2,1<<1,1<<0); +((char)-128,(char)(1<<6),(char)(1<<5),(char)(1<<4),(char)(1<<3),(char)(1<<2),(char)(1<<1),(char)(1<<0),(char)-128,(char)(1<<6),(char)(1<<5),(char)(1<<4),(char)(1<<3),(char)(1<<2),(char)(1<<1),(char)(1<<0)); const static __m128i shufmask = _mm_set_epi8 -(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0); - const static __m128i allones = _mm_set1_epi8 (1); +((char)1,(char)1,(char)1,(char)1,(char)1,(char)1,(char)1,(char)1,(char)0,(char)0,(char)0,(char)0,(char)0,(char)0,(char)0,(char)0); + const static __m128i allones = _mm_set1_epi8 ((char)1); const static __m128i finalshufmask1 = _mm_set_epi8 -(-128,-128,-128,3,-128,-128,-128,2,-128,-128,-128,1,-128,-128,-128,0); +((char)-128,(char)-128,(char)-128,(char)3,(char)-128,(char)-128,(char)-128,(char)2,(char)-128,(char)-128,(char)-128,(char)1,(char)-128,(char)-128,(char)-128,(char)0); const static __m128i finalshufmask2 = _mm_set_epi8 -(-128,-128,-128,7,-128,-128,-128,6,-128,-128,-128,5,-128,-128,-128,4); +((char)-128,(char)-128,(char)-128,(char)7,(char)-128,(char)-128,(char)-128,(char)6,(char)-128,(char)-128,(char)-128,(char)5,(char)-128,(char)-128,(char)-128,(char)4); const static __m128i finalshufmask3 = _mm_set_epi8 -(-128,-128,-128,11,-128,-128,-128,10,-128,-128,-128,9,-128,-128,-128,8); +((char)-128,(char)-128,(char)-128,(char)11,(char)-128,(char)-128,(char)-128,(char)10,(char)-128,(char)-128,(char)-128,(char)9,(char)-128,(char)-128,(char)-128,(char)8); const static __m128i finalshufmask4 = _mm_set_epi8 -(-128,-128,-128,15,-128,-128,-128,14,-128,-128,-128,13,-128,-128,-128,12); +((char)-128,(char)-128,(char)-128,(char)15,(char)-128,(char)-128,(char)-128,(char)14,(char)-128,(char)-128,(char)-128,(char)13,(char)-128,(char)-128,(char)-128,(char)12); for (uint32_t j = 0; j<8;++j) { __m128i ba = _mm_loadu_si128(reinterpret_cast(in + 2 * j)); @@ -65,11 +65,11 @@ j)); */ -const static __m128i shufkey1_1 = {-9187202500191551488, -9187202500191551488}; -const static __m128i shufkey2_1 = {-9187202500191551488, -9187202500191551488}; -const static __m128i multi1_1 = {17179869192, 4294967298}; -const static __m128i multi2_1 = {17179869192, 4294967298}; -const static __m128i mask_1 = {4294967297, 4294967297}; +const static __m128i shufkey1_1 = _mm_set_epi64x(-9187202500191551488LL, -9187202500191551488LL); +const static __m128i shufkey2_1 = _mm_set_epi64x(-9187202500191551488LL, -9187202500191551488LL); +const static __m128i multi1_1 = _mm_set_epi64x(17179869192LL, 4294967298LL); +const static __m128i multi2_1 = _mm_set_epi64x(17179869192LL, 4294967298LL); +const static __m128i mask_1 = _mm_set_epi64x(4294967297LL, 4294967297LL); static void simdhunpack1(const uint8_t *in, uint32_t *out) { enum { b = 1 }; @@ -91,11 +91,11 @@ static void simdhunpack1(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_2 = {-9187202500191551488, -9187202500191551488}; -const static __m128i shufkey2_2 = {-9187202500191551488, -9187202500191551488}; -const static __m128i multi1_2 = {68719476800, 4294967300}; -const static __m128i multi2_2 = {68719476800, 4294967300}; -const static __m128i mask_2 = {12884901891, 12884901891}; +const static __m128i shufkey1_2 = _mm_set_epi64x(-9187202500191551488LL, -9187202500191551488LL); +const static __m128i shufkey2_2 = _mm_set_epi64x(-9187202500191551488LL, -9187202500191551488LL); +const static __m128i multi1_2 = _mm_set_epi64x(68719476800LL, 4294967300LL); +const static __m128i multi2_2 = _mm_set_epi64x(68719476800LL, 4294967300LL); +const static __m128i mask_2 = _mm_set_epi64x(12884901891LL, 12884901891LL); static void simdhunpack2(const uint8_t *in, uint32_t *out) { enum { b = 2 }; @@ -117,11 +117,11 @@ static void simdhunpack2(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_3 = {-9187202500191551488, -9187202495896616704}; -const static __m128i shufkey2_3 = {-9187342138168279040, -9187202495896584191}; -const static __m128i multi1_3 = {34359738432, 137438953473}; -const static __m128i multi2_3 = {4294967304, 17179869216}; -const static __m128i mask_3 = {30064771079, 30064771079}; +const static __m128i shufkey1_3 = _mm_set_epi64x(-9187202500191551488LL, -9187202495896616704LL); +const static __m128i shufkey2_3 = _mm_set_epi64x(-9187342138168279040LL, -9187202495896584191LL); +const static __m128i multi1_3 = _mm_set_epi64x(34359738432LL, 137438953473LL); +const static __m128i multi2_3 = _mm_set_epi64x(4294967304LL, 17179869216LL); +const static __m128i mask_3 = _mm_set_epi64x(30064771079LL, 30064771079LL); static void simdhunpack3(const uint8_t *in, uint32_t *out) { enum { b = 3 }; @@ -151,17 +151,17 @@ static void simdhunpack4(const uint8_t * in,uint32_t * out) { enum{ b = 4}; __m128i* pCurr = reinterpret_cast<__m128i*>(out); - const static __m128i shuf8 = _mm_set_epi8 (13,12, 15,14, 9,8, 11,10, 5,4, -7,6, 1,0, 3,2); + const static __m128i shuf8 = _mm_set_epi8 ((char)13,(char)12, (char)15,(char)14, (char)9,(char)8, (char)11,(char)10, (char)5,(char)4, +(char)7,(char)6, (char)1,(char)0, (char)3,(char)2); const static __m128i shufoneoutoftwo = _mm_set1_epi32 (252645135); - const static __m128i shufmid8_1 = _mm_set_epi8 (-128, -128, -128, 3, -128, --128, -128, 1, -128, -128, -128, 2, -128, -128, -128, 0); - const static __m128i shufmid8_2 = _mm_set_epi8 (-128, -128, -128, 7, -128, --128, -128, 5, -128, -128, -128, 6, -128, -128, -128, 4); - const static __m128i shufmid8_3 = _mm_set_epi8 (-128, -128, -128, 11, -128, --128, -128, 9, -128, -128, -128, 10, -128, -128, -128, 8); - const static __m128i shufmid8_4 = _mm_set_epi8 (-128, -128, -128, 15, -128, --128, -128, 13, -128, -128, -128, 14, -128, -128, -128, 12); + const static __m128i shufmid8_1 = _mm_set_epi8 ((char)-128, (char)-128, (char)-128, (char)3, (char)-128, +(char)-128, (char)-128, (char)1, (char)-128, (char)-128, (char)-128, (char)2, (char)-128, (char)-128, (char)-128, (char)0); + const static __m128i shufmid8_2 = _mm_set_epi8 ((char)-128, (char)-128, (char)-128, (char)7, (char)-128, +(char)-128, (char)-128, (char)5, (char)-128, (char)-128, (char)-128, (char)6, (char)-128, (char)-128, (char)-128, (char)4); + const static __m128i shufmid8_3 = _mm_set_epi8 ((char)-128, (char)-128, (char)-128, (char)11, (char)-128, +(char)-128, (char)-128, (char)9, (char)-128, (char)-128, (char)-128, (char)10, (char)-128, (char)-128, (char)-128, (char)8); + const static __m128i shufmid8_4 = _mm_set_epi8 ((char)-128, (char)-128, (char)-128, (char)15, (char)-128, +(char)-128, (char)-128, (char)13, (char)-128, (char)-128, (char)-128, (char)14, (char)-128, (char)-128, (char)-128, (char)12); for (uint32_t j = 0; j<4;++j) { const __m128i ba = _mm_loadu_si128(reinterpret_cast(in + 16 * j)); @@ -199,11 +199,11 @@ static void simdhunpack4(const uint8_t * in,uint32_t * out) { */ -const static __m128i shufkey1_4 = {-9187202500191551488, -9187202495896584191}; -const static __m128i shufkey2_4 = {-9187202500191551488, -9187202495896584191}; -const static __m128i multi1_4 = {4294967312, 4294967312}; -const static __m128i multi2_4 = {4294967312, 4294967312}; -const static __m128i mask_4 = {64424509455, 64424509455}; +const static __m128i shufkey1_4 = _mm_set_epi64x(-9187202500191551488LL, -9187202495896584191LL); +const static __m128i shufkey2_4 = _mm_set_epi64x(-9187202500191551488LL, -9187202495896584191LL); +const static __m128i multi1_4 = _mm_set_epi64x(4294967312LL, 4294967312LL); +const static __m128i multi2_4 = _mm_set_epi64x(4294967312LL, 4294967312LL); +const static __m128i mask_4 = _mm_set_epi64x(64424509455LL, 64424509455LL); static void simdhunpack4(const uint8_t *in, uint32_t *out) { enum { b = 4 }; @@ -225,11 +225,11 @@ static void simdhunpack4(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_5 = {-9187342138168279040, -9187341034361683967}; -const static __m128i shufkey2_5 = {-9187202495896616704, -9187202491601649151}; -const static __m128i multi1_5 = {17179869312, 4294967328}; -const static __m128i multi2_5 = {137438953476, 34359738369}; -const static __m128i mask_5 = {133143986207, 133143986207}; +const static __m128i shufkey1_5 = _mm_set_epi64x(-9187342138168279040LL, -9187341034361683967LL); +const static __m128i shufkey2_5 = _mm_set_epi64x(-9187202495896616704LL, -9187202491601649151LL); +const static __m128i multi1_5 = _mm_set_epi64x(17179869312LL, 4294967328LL); +const static __m128i multi2_5 = _mm_set_epi64x(137438953476LL, 34359738369LL); +const static __m128i mask_5 = _mm_set_epi64x(133143986207LL, 133143986207LL); static void simdhunpack5(const uint8_t *in, uint32_t *out) { enum { b = 5 }; @@ -251,11 +251,11 @@ static void simdhunpack5(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_6 = {-9187342138168279040, -9187202491601649151}; -const static __m128i shufkey2_6 = {-9187342138168279040, -9187202491601649151}; -const static __m128i multi1_6 = {4294967360, 68719476740}; -const static __m128i multi2_6 = {4294967360, 68719476740}; -const static __m128i mask_6 = {270582939711, 270582939711}; +const static __m128i shufkey1_6 = _mm_set_epi64x(-9187342138168279040LL, -9187202491601649151LL); +const static __m128i shufkey2_6 = _mm_set_epi64x(-9187342138168279040LL, -9187202491601649151LL); +const static __m128i multi1_6 = _mm_set_epi64x(4294967360LL, 68719476740LL); +const static __m128i multi2_6 = _mm_set_epi64x(4294967360LL, 68719476740LL); +const static __m128i mask_6 = _mm_set_epi64x(270582939711LL, 270582939711LL); static void simdhunpack6(const uint8_t *in, uint32_t *out) { enum { b = 6 }; @@ -277,11 +277,11 @@ static void simdhunpack6(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_7 = {-9187342138168279040, -9187339930555121151}; -const static __m128i shufkey2_7 = {-9187341034361716480, -9187202487306681598}; -const static __m128i multi1_7 = {4294967424, 17179869186}; -const static __m128i multi2_7 = {8589934593, 34359738372}; -const static __m128i mask_7 = {545460846719, 545460846719}; +const static __m128i shufkey1_7 = _mm_set_epi64x(-9187342138168279040LL, -9187339930555121151LL); +const static __m128i shufkey2_7 = _mm_set_epi64x(-9187341034361716480LL, -9187202487306681598LL); +const static __m128i multi1_7 = _mm_set_epi64x(4294967424LL, 17179869186LL); +const static __m128i multi2_7 = _mm_set_epi64x(8589934593LL, 34359738372LL); +const static __m128i mask_7 = _mm_set_epi64x(545460846719LL, 545460846719LL); static void simdhunpack7(const uint8_t *in, uint32_t *out) { enum { b = 7 }; @@ -303,8 +303,8 @@ static void simdhunpack7(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_8 = {-9187202495896584192, -9187202487306649598}; -const static __m128i shufkey2_8 = {-9187202495896584192, -9187202487306649598}; +const static __m128i shufkey1_8 = _mm_set_epi64x(-9187202495896584192LL, -9187202487306649598LL); +const static __m128i shufkey2_8 = _mm_set_epi64x(-9187202495896584192LL, -9187202487306649598LL); static void simdhunpack8(const uint8_t *in, uint32_t *out) { enum { b = 8 }; @@ -318,11 +318,11 @@ static void simdhunpack8(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_9 = {-9187341034361716480, -9187338826748525822}; -const static __m128i shufkey2_9 = {-9187341034361716480, -9187338826748525822}; -const static __m128i multi1_9 = {17179869192, 4294967298}; -const static __m128i multi2_9 = {17179869192, 4294967298}; -const static __m128i mask_9 = {2194728288767, 2194728288767}; +const static __m128i shufkey1_9 = _mm_set_epi64x(-9187341034361716480LL, -9187338826748525822LL); +const static __m128i shufkey2_9 = _mm_set_epi64x(-9187341034361716480LL, -9187338826748525822LL); +const static __m128i multi1_9 = _mm_set_epi64x(17179869192LL, 4294967298LL); +const static __m128i multi2_9 = _mm_set_epi64x(17179869192LL, 4294967298LL); +const static __m128i mask_9 = _mm_set_epi64x(2194728288767LL, 2194728288767LL); static void simdhunpack9(const uint8_t *in, uint32_t *out) { enum { b = 9 }; @@ -344,11 +344,11 @@ static void simdhunpack9(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_10 = {-9187341034361716480, -9187338826748525822}; -const static __m128i shufkey2_10 = {-9187341034361716480, -9187338826748525822}; -const static __m128i multi1_10 = {68719476800, 4294967300}; -const static __m128i multi2_10 = {68719476800, 4294967300}; -const static __m128i mask_10 = {4393751544831, 4393751544831}; +const static __m128i shufkey1_10 = _mm_set_epi64x(-9187341034361716480LL, -9187338826748525822LL); +const static __m128i shufkey2_10 = _mm_set_epi64x(-9187341034361716480LL, -9187338826748525822LL); +const static __m128i multi1_10 = _mm_set_epi64x(68719476800LL, 4294967300LL); +const static __m128i multi2_10 = _mm_set_epi64x(68719476800LL, 4294967300LL); +const static __m128i mask_10 = _mm_set_epi64x(4393751544831LL, 4393751544831LL); static void simdhunpack10(const uint8_t *in, uint32_t *out) { enum { b = 10 }; @@ -370,11 +370,11 @@ static void simdhunpack10(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_11 = {-9187341034361716480, -9187337722950057214}; -const static __m128i shufkey2_11 = {-9222525406450548480, -9187337722941930493}; -const static __m128i multi1_11 = {34359738432, 137438953473}; -const static __m128i multi2_11 = {4294967304, 17179869216}; -const static __m128i mask_11 = {8791798056959, 8791798056959}; +const static __m128i shufkey1_11 = _mm_set_epi64x(-9187341034361716480LL, -9187337722950057214LL); +const static __m128i shufkey2_11 = _mm_set_epi64x(-9222525406450548480LL, -9187337722941930493LL); +const static __m128i multi1_11 = _mm_set_epi64x(34359738432LL, 137438953473LL); +const static __m128i multi2_11 = _mm_set_epi64x(4294967304LL, 17179869216LL); +const static __m128i mask_11 = _mm_set_epi64x(8791798056959LL, 8791798056959LL); static void simdhunpack11(const uint8_t *in, uint32_t *out) { enum { b = 11 }; @@ -396,11 +396,11 @@ static void simdhunpack11(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_12 = {-9187341034361716480, -9187337722941930493}; -const static __m128i shufkey2_12 = {-9187341034361716480, -9187337722941930493}; -const static __m128i multi1_12 = {4294967312, 4294967312}; -const static __m128i multi2_12 = {4294967312, 4294967312}; -const static __m128i mask_12 = {17587891081215, 17587891081215}; +const static __m128i shufkey1_12 = _mm_set_epi64x(-9187341034361716480LL, -9187337722941930493LL); +const static __m128i shufkey2_12 = _mm_set_epi64x(-9187341034361716480LL, -9187337722941930493LL); +const static __m128i multi1_12 = _mm_set_epi64x(4294967312LL, 4294967312LL); +const static __m128i multi2_12 = _mm_set_epi64x(4294967312LL, 4294967312LL); +const static __m128i mask_12 = _mm_set_epi64x(17587891081215LL, 17587891081215LL); static void simdhunpack12(const uint8_t *in, uint32_t *out) { enum { b = 12 }; @@ -422,11 +422,11 @@ static void simdhunpack12(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_13 = {-9222525406450548480, -9221677670100630525}; -const static __m128i shufkey2_13 = {-9187339930563378944, -9187336619143396349}; -const static __m128i multi1_13 = {17179869312, 4294967328}; -const static __m128i multi2_13 = {137438953476, 34359738369}; -const static __m128i mask_13 = {35180077129727, 35180077129727}; +const static __m128i shufkey1_13 = _mm_set_epi64x(-9222525406450548480LL, -9221677670100630525LL); +const static __m128i shufkey2_13 = _mm_set_epi64x(-9187339930563378944LL, -9187336619143396349LL); +const static __m128i multi1_13 = _mm_set_epi64x(17179869312LL, 4294967328LL); +const static __m128i multi2_13 = _mm_set_epi64x(137438953476LL, 34359738369LL); +const static __m128i mask_13 = _mm_set_epi64x(35180077129727LL, 35180077129727LL); static void simdhunpack13(const uint8_t *in, uint32_t *out) { enum { b = 13 }; @@ -448,11 +448,11 @@ static void simdhunpack13(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_14 = {-9222525406450548480, -9187336619143396349}; -const static __m128i shufkey2_14 = {-9222525406450548480, -9187336619143396349}; -const static __m128i multi1_14 = {4294967360, 68719476740}; -const static __m128i multi2_14 = {4294967360, 68719476740}; -const static __m128i mask_14 = {70364449226751, 70364449226751}; +const static __m128i shufkey1_14 = _mm_set_epi64x(-9222525406450548480LL, -9187336619143396349LL); +const static __m128i shufkey2_14 = _mm_set_epi64x(-9222525406450548480LL, -9187336619143396349LL); +const static __m128i multi1_14 = _mm_set_epi64x(4294967360LL, 68719476740LL); +const static __m128i multi2_14 = _mm_set_epi64x(4294967360LL, 68719476740LL); +const static __m128i mask_14 = _mm_set_epi64x(70364449226751LL, 70364449226751LL); static void simdhunpack14(const uint8_t *in, uint32_t *out) { enum { b = 14 }; @@ -474,11 +474,11 @@ static void simdhunpack14(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_15 = {-9222525406450548480, -9221395091325385725}; -const static __m128i shufkey2_15 = {-9222242827675500288, -9187335515336735484}; -const static __m128i multi1_15 = {4294967424, 17179869186}; -const static __m128i multi2_15 = {8589934593, 34359738372}; -const static __m128i mask_15 = {140733193420799, 140733193420799}; +const static __m128i shufkey1_15 = _mm_set_epi64x(-9222525406450548480LL, -9221395091325385725LL); +const static __m128i shufkey2_15 = _mm_set_epi64x(-9222242827675500288LL, -9187335515336735484LL); +const static __m128i multi1_15 = _mm_set_epi64x(4294967424LL, 17179869186LL); +const static __m128i multi2_15 = _mm_set_epi64x(8589934593LL, 34359738372LL); +const static __m128i mask_15 = _mm_set_epi64x(140733193420799LL, 140733193420799LL); static void simdhunpack15(const uint8_t *in, uint32_t *out) { enum { b = 15 }; @@ -500,8 +500,8 @@ static void simdhunpack15(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_16 = {-9187339930555121408, -9187335515328740092}; -const static __m128i shufkey2_16 = {-9187339930555121408, -9187335515328740092}; +const static __m128i shufkey1_16 = _mm_set_epi64x(-9187339930555121408LL, -9187335515328740092LL); +const static __m128i shufkey2_16 = _mm_set_epi64x(-9187339930555121408LL, -9187335515328740092LL); static void simdhunpack16(const uint8_t *in, uint32_t *out) { enum { b = 16 }; @@ -515,11 +515,11 @@ static void simdhunpack16(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_17 = {-9222242827675500288, -9221112512542014204}; -const static __m128i shufkey2_17 = {-9222242827675500288, -9221112512542014204}; -const static __m128i multi1_17 = {17179869192, 4294967298}; -const static __m128i multi2_17 = {17179869192, 4294967298}; -const static __m128i mask_17 = {562945658585087, 562945658585087}; +const static __m128i shufkey1_17 = _mm_set_epi64x(-9222242827675500288LL, -9221112512542014204LL); +const static __m128i shufkey2_17 = _mm_set_epi64x(-9222242827675500288LL, -9221112512542014204LL); +const static __m128i multi1_17 = _mm_set_epi64x(17179869192LL, 4294967298LL); +const static __m128i multi2_17 = _mm_set_epi64x(17179869192LL, 4294967298LL); +const static __m128i mask_17 = _mm_set_epi64x(562945658585087LL, 562945658585087LL); static void simdhunpack17(const uint8_t *in, uint32_t *out) { enum { b = 17 }; @@ -541,11 +541,11 @@ static void simdhunpack17(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_18 = {-9222242827675500288, -9221112512542014204}; -const static __m128i shufkey2_18 = {-9222242827675500288, -9221112512542014204}; -const static __m128i multi1_18 = {68719476800, 4294967300}; -const static __m128i multi2_18 = {68719476800, 4294967300}; -const static __m128i mask_18 = {1125895612137471, 1125895612137471}; +const static __m128i shufkey1_18 = _mm_set_epi64x(-9222242827675500288LL, -9221112512542014204LL); +const static __m128i shufkey2_18 = _mm_set_epi64x(-9222242827675500288LL, -9221112512542014204LL); +const static __m128i multi1_18 = _mm_set_epi64x(68719476800LL, 4294967300LL); +const static __m128i multi2_18 = _mm_set_epi64x(68719476800LL, 4294967300LL); +const static __m128i mask_18 = _mm_set_epi64x(1125895612137471LL, 1125895612137471LL); static void simdhunpack18(const uint8_t *in, uint32_t *out) { enum { b = 18 }; @@ -567,11 +567,11 @@ static void simdhunpack18(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_19 = {-9222242827675500288, -9220829935788751612}; -const static __m128i shufkey2_19 = {361417179368915200, -9220829933758642683}; -const static __m128i multi1_19 = {34359738432, 137438953473}; -const static __m128i multi2_19 = {4294967304, 17179869216}; -const static __m128i mask_19 = {2251795519242239, 2251795519242239}; +const static __m128i shufkey1_19 = _mm_set_epi64x(-9222242827675500288LL, -9220829935788751612LL); +const static __m128i shufkey2_19 = _mm_set_epi64x(361417179368915200LL, -9220829933758642683LL); +const static __m128i multi1_19 = _mm_set_epi64x(34359738432LL, 137438953473LL); +const static __m128i multi2_19 = _mm_set_epi64x(4294967304LL, 17179869216LL); +const static __m128i mask_19 = _mm_set_epi64x(2251795519242239LL, 2251795519242239LL); static void simdhunpack19(const uint8_t *in, uint32_t *out) { enum { b = 19 }; @@ -593,11 +593,11 @@ static void simdhunpack19(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_20 = {-9222242827675500288, -9220829933758642683}; -const static __m128i shufkey2_20 = {-9222242827675500288, -9220829933758642683}; -const static __m128i multi1_20 = {4294967312, 4294967312}; -const static __m128i multi2_20 = {4294967312, 4294967312}; -const static __m128i mask_20 = {4503595333451775, 4503595333451775}; +const static __m128i shufkey1_20 = _mm_set_epi64x(-9222242827675500288LL, -9220829933758642683LL); +const static __m128i shufkey2_20 = _mm_set_epi64x(-9222242827675500288LL, -9220829933758642683LL); +const static __m128i multi1_20 = _mm_set_epi64x(4294967312LL, 4294967312LL); +const static __m128i multi2_20 = _mm_set_epi64x(4294967312LL, 4294967312LL); +const static __m128i mask_20 = _mm_set_epi64x(4503595333451775LL, 4503595333451775LL); static void simdhunpack20(const uint8_t *in, uint32_t *out) { enum { b = 20 }; @@ -619,11 +619,11 @@ static void simdhunpack20(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_21 = {361417179368915200, 723118043475412485}; -const static __m128i shufkey2_21 = {-9221960250989346560, -9220547356988602875}; -const static __m128i multi1_21 = {17179869312, 4294967328}; -const static __m128i multi2_21 = {137438953476, 34359738369}; -const static __m128i mask_21 = {9007194961870847, 9007194961870847}; +const static __m128i shufkey1_21 = _mm_set_epi64x(361417179368915200LL, 723118043475412485LL); +const static __m128i shufkey2_21 = _mm_set_epi64x(-9221960250989346560LL, -9220547356988602875LL); +const static __m128i multi1_21 = _mm_set_epi64x(17179869312LL, 4294967328LL); +const static __m128i multi2_21 = _mm_set_epi64x(137438953476LL, 34359738369LL); +const static __m128i mask_21 = _mm_set_epi64x(9007194961870847LL, 9007194961870847LL); static void simdhunpack21(const uint8_t *in, uint32_t *out) { enum { b = 21 }; @@ -645,11 +645,11 @@ static void simdhunpack21(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_22 = {361417179368915200, -9220547356988602875}; -const static __m128i shufkey2_22 = {361417179368915200, -9220547356988602875}; -const static __m128i multi1_22 = {4294967360, 68719476740}; -const static __m128i multi2_22 = {4294967360, 68719476740}; -const static __m128i mask_22 = {18014394218708991, 18014394218708991}; +const static __m128i shufkey1_22 = _mm_set_epi64x(361417179368915200LL, -9220547356988602875LL); +const static __m128i shufkey2_22 = _mm_set_epi64x(361417179368915200LL, -9220547356988602875LL); +const static __m128i multi1_22 = _mm_set_epi64x(4294967360LL, 68719476740LL); +const static __m128i multi2_22 = _mm_set_epi64x(4294967360LL, 68719476740LL); +const static __m128i mask_22 = _mm_set_epi64x(18014394218708991LL, 18014394218708991LL); static void simdhunpack22(const uint8_t *in, uint32_t *out) { enum { b = 22 }; @@ -671,11 +671,11 @@ static void simdhunpack22(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_23 = {361417179368915200, 795458214283380229}; -const static __m128i shufkey2_23 = {433757350092996864, -9220264778188454138}; -const static __m128i multi1_23 = {4294967424, 17179869186}; -const static __m128i multi2_23 = {8589934593, 34359738372}; -const static __m128i mask_23 = {36028792732385279, 36028792732385279}; +const static __m128i shufkey1_23 = _mm_set_epi64x(361417179368915200LL, 795458214283380229LL); +const static __m128i shufkey2_23 = _mm_set_epi64x(433757350092996864LL, -9220264778188454138LL); +const static __m128i multi1_23 = _mm_set_epi64x(4294967424LL, 17179869186LL); +const static __m128i multi2_23 = _mm_set_epi64x(8589934593LL, 34359738372LL); +const static __m128i mask_23 = _mm_set_epi64x(36028792732385279LL, 36028792732385279LL); static void simdhunpack23(const uint8_t *in, uint32_t *out) { enum { b = 23 }; @@ -697,8 +697,8 @@ static void simdhunpack23(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_24 = {-9221960248892194560, -9220264776191965434}; -const static __m128i shufkey2_24 = {-9221960248892194560, -9220264776191965434}; +const static __m128i shufkey1_24 = _mm_set_epi64x(-9221960248892194560LL, -9220264776191965434LL); +const static __m128i shufkey2_24 = _mm_set_epi64x(-9221960248892194560LL, -9220264776191965434LL); static void simdhunpack24(const uint8_t *in, uint32_t *out) { enum { b = 24 }; @@ -712,11 +712,11 @@ static void simdhunpack24(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_25 = {433757350092996864, 867798387121456902}; -const static __m128i shufkey2_25 = {433757350092996864, 867798387121456902}; -const static __m128i multi1_25 = {17179869192, 4294967298}; -const static __m128i multi2_25 = {17179869192, 4294967298}; -const static __m128i mask_25 = {144115183814443007, 144115183814443007}; +const static __m128i shufkey1_25 = _mm_set_epi64x(433757350092996864LL, 867798387121456902LL); +const static __m128i shufkey2_25 = _mm_set_epi64x(433757350092996864LL, 867798387121456902LL); +const static __m128i multi1_25 = _mm_set_epi64x(17179869192LL, 4294967298LL); +const static __m128i multi2_25 = _mm_set_epi64x(17179869192LL, 4294967298LL); +const static __m128i mask_25 = _mm_set_epi64x(144115183814443007LL, 144115183814443007LL); static void simdhunpack25(const uint8_t *in, uint32_t *out) { enum { b = 25 }; @@ -738,11 +738,11 @@ static void simdhunpack25(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_26 = {433757350092996864, 867798387121456902}; -const static __m128i shufkey2_26 = {433757350092996864, 867798387121456902}; -const static __m128i multi1_26 = {68719476800, 4294967300}; -const static __m128i multi2_26 = {68719476800, 4294967300}; -const static __m128i mask_26 = {288230371923853311, 288230371923853311}; +const static __m128i shufkey1_26 = _mm_set_epi64x(433757350092996864LL, 867798387121456902LL); +const static __m128i shufkey2_26 = _mm_set_epi64x(433757350092996864LL, 867798387121456902LL); +const static __m128i multi1_26 = _mm_set_epi64x(68719476800LL, 4294967300LL); +const static __m128i multi2_26 = _mm_set_epi64x(68719476800LL, 4294967300LL); +const static __m128i mask_26 = _mm_set_epi64x(288230371923853311LL, 288230371923853311LL); static void simdhunpack26(const uint8_t *in, uint32_t *out) { enum { b = 26 }; @@ -764,11 +764,11 @@ static void simdhunpack26(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_27 = {433757350092996864, 940138559942690566}; -const static __m128i shufkey2_27 = {506097522914230528, 940138559959533575}; -const static __m128i multi1_27 = {4294967304, 17179869192}; -const static __m128i multi2_27 = {137438953474, 4294967304}; -const static __m128i mask_27 = {576460748142673919, 576460748142673919}; +const static __m128i shufkey1_27 = _mm_set_epi64x(433757350092996864LL, 940138559942690566LL); +const static __m128i shufkey2_27 = _mm_set_epi64x(506097522914230528LL, 940138559959533575LL); +const static __m128i multi1_27 = _mm_set_epi64x(4294967304LL, 17179869192LL); +const static __m128i multi2_27 = _mm_set_epi64x(137438953474LL, 4294967304LL); +const static __m128i mask_27 = _mm_set_epi64x(576460748142673919LL, 576460748142673919LL); static void simdhunpack27(const uint8_t *in, uint32_t *out) { enum { b = 27 }; @@ -792,11 +792,11 @@ static void simdhunpack27(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_28 = {433757350092996864, 940138559959533575}; -const static __m128i shufkey2_28 = {433757350092996864, 940138559959533575}; -const static __m128i multi1_28 = {4294967312, 4294967312}; -const static __m128i multi2_28 = {4294967312, 4294967312}; -const static __m128i mask_28 = {1152921500580315135, 1152921500580315135}; +const static __m128i shufkey1_28 = _mm_set_epi64x(433757350092996864LL, 940138559959533575LL); +const static __m128i shufkey2_28 = _mm_set_epi64x(433757350092996864LL, 940138559959533575LL); +const static __m128i multi1_28 = _mm_set_epi64x(4294967312LL, 4294967312LL); +const static __m128i multi2_28 = _mm_set_epi64x(4294967312LL, 4294967312LL); +const static __m128i mask_28 = _mm_set_epi64x(1152921500580315135LL, 1152921500580315135LL); static void simdhunpack28(const uint8_t *in, uint32_t *out) { enum { b = 28 }; @@ -818,10 +818,10 @@ static void simdhunpack28(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey2_29 = {506097522914230528, 1012478732780767239}; -const static __m128i multi1_29 = {17179869188, 4294967297}; -const static __m128i multi2_29 = {17179869192, 4294967298}; -const static __m128i mask_29 = {2305843005455597567, 2305843005455597567}; +const static __m128i shufkey2_29 = _mm_set_epi64x(506097522914230528LL, 1012478732780767239LL); +const static __m128i multi1_29 = _mm_set_epi64x(17179869188LL, 4294967297LL); +const static __m128i multi2_29 = _mm_set_epi64x(17179869192LL, 4294967298LL); +const static __m128i mask_29 = _mm_set_epi64x(2305843005455597567LL, 2305843005455597567LL); static void simdhunpack29(const uint8_t *in, uint32_t *out) { enum { b = 29 }; @@ -833,10 +833,10 @@ static void simdhunpack29(const uint8_t *in, uint32_t *out) { _mm_loadu_si128(reinterpret_cast(in + b * j)); __m128i ca = _mm_blend_epi16(tmp, _mm_slli_epi64(tmp, 3), 236); const static __m128i x2 = - _mm_set_epi8(14, 13, 12, 11, 10, 9, 8, 7, 7, 6, 5, 4, 3, 2, 1, 0); + _mm_set_epi8((char)14, (char)13, (char)12, (char)11, (char)10, (char)9, (char)8, (char)7, (char)7, (char)6, (char)5, (char)4, (char)3, (char)2, (char)1, (char)0); ca = _mm_shuffle_epi8(ca, x2); const static __m128i x1 = - _mm_set_epi8(14, 13, 12, 11, 10, 9, 8, 7, 7, 6, 5, 4, 3, 2, 1, 0); + _mm_set_epi8((char)14, (char)13, (char)12, (char)11, (char)10, (char)9, (char)8, (char)7, (char)7, (char)6, (char)5, (char)4, (char)3, (char)2, (char)1, (char)0); ca = _mm_blend_epi16(ca, _mm_shuffle_epi8(tmp, x1), 51); ca = _mm_mullo_epi32(ca, multi1_29); ca = _mm_srli_epi32(ca, shift1); @@ -851,11 +851,11 @@ static void simdhunpack29(const uint8_t *in, uint32_t *out) { _mm_storeu_si128(pCurr++, ca); } } -const static __m128i shufkey1_30 = {433757350092996864, 1012478732780767239}; -const static __m128i shufkey2_30 = {433757350092996864, 1012478732780767239}; -const static __m128i multi1_30 = {4294967300, 4294967300}; -const static __m128i multi2_30 = {4294967300, 4294967300}; -const static __m128i mask_30 = {4611686015206162431, 4611686015206162431}; +const static __m128i shufkey1_30 = _mm_set_epi64x(433757350092996864LL, 1012478732780767239LL); +const static __m128i shufkey2_30 = _mm_set_epi64x(433757350092996864LL, 1012478732780767239LL); +const static __m128i multi1_30 = _mm_set_epi64x(4294967300LL, 4294967300LL); +const static __m128i multi2_30 = _mm_set_epi64x(4294967300LL, 4294967300LL); +const static __m128i mask_30 = _mm_set_epi64x(4611686015206162431LL, 4611686015206162431LL); static void simdhunpack30(const uint8_t *in, uint32_t *out) { enum { b = 30 }; @@ -907,8 +907,8 @@ static void simdhunpack31(const uint8_t *in, uint32_t *out) { } } -const static __m128i shufkey1_32 = {506097522914230528, 1084818905618843912}; -const static __m128i shufkey2_32 = {506097522914230528, 1084818905618843912}; +const static __m128i shufkey1_32 = _mm_set_epi64x(506097522914230528LL, 1084818905618843912LL); +const static __m128i shufkey2_32 = _mm_set_epi64x(506097522914230528LL, 1084818905618843912LL); static void simdhunpack32(const uint8_t *in, uint32_t *out) { enum { b = 32 }; diff --git a/src/simdbitpacking.cpp b/src/simdbitpacking.cpp index 974d45b..49a8930 100644 --- a/src/simdbitpacking.cpp +++ b/src/simdbitpacking.cpp @@ -8930,7 +8930,7 @@ static void __SIMD_fastunpack1_32(const __m128i *__restrict__ in, __m128i InReg2 = InReg1; __m128i OutReg1, OutReg2, OutReg3, OutReg4; const __m128i mask = _mm_set1_epi32(1); - +#if (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) unsigned shift = 0; for (unsigned i = 0; i < 8; ++i) { @@ -8943,6 +8943,72 @@ static void __SIMD_fastunpack1_32(const __m128i *__restrict__ in, _mm_storeu_si128(out++, OutReg3); _mm_storeu_si128(out++, OutReg4); } +#elif (defined(__GNUC__) && (defined(__aarch64__))) || (defined(_MSC_VER) && defined(_M_ARM64)) + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 0), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 1), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 2), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 3), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 4), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 5), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 6), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 7), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 8), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 9), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 10), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 11), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 12), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 13), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 14), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 15), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 16), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 17), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 18), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 19), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 20), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 21), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 22), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 23), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 24), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 25), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 26), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 27), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 28), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 29), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 30), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 31), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); +#endif } static void __SIMD_fastunpack2_32(const __m128i *__restrict__ in, diff --git a/src/simdunalignedbitpacking.cpp b/src/simdunalignedbitpacking.cpp index 469ca6e..5e3646e 100644 --- a/src/simdunalignedbitpacking.cpp +++ b/src/simdunalignedbitpacking.cpp @@ -8930,7 +8930,7 @@ static void __SIMD_fastunpack1_32(const __m128i *__restrict__ in, __m128i InReg2 = InReg1; __m128i OutReg1, OutReg2, OutReg3, OutReg4; const __m128i mask = _mm_set1_epi32(1); - +#if (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) unsigned shift = 0; for (unsigned i = 0; i < 8; ++i) { @@ -8943,6 +8943,72 @@ static void __SIMD_fastunpack1_32(const __m128i *__restrict__ in, _mm_storeu_si128(out++, OutReg3); _mm_storeu_si128(out++, OutReg4); } +#elif (defined(__GNUC__) && (defined(__aarch64__))) || (defined(_MSC_VER) && defined(_M_ARM64)) + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 0), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 1), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 2), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 3), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 4), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 5), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 6), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 7), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 8), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 9), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 10), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 11), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 12), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 13), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 14), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 15), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 16), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 17), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 18), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 19), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 20), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 21), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 22), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 23), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 24), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 25), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 26), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 27), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); + OutReg1 = _mm_and_si128(_mm_srli_epi32(InReg1, 28), mask); + OutReg2 = _mm_and_si128(_mm_srli_epi32(InReg2, 29), mask); + OutReg3 = _mm_and_si128(_mm_srli_epi32(InReg1, 30), mask); + OutReg4 = _mm_and_si128(_mm_srli_epi32(InReg2, 31), mask); + _mm_store_si128(out++, OutReg1); + _mm_store_si128(out++, OutReg2); + _mm_store_si128(out++, OutReg3); + _mm_store_si128(out++, OutReg4); +#endif } static void __SIMD_fastunpack2_32(const __m128i *__restrict__ in, diff --git a/src/streamvbyte.c b/src/streamvbyte.c index 655b916..94376e3 100644 --- a/src/streamvbyte.c +++ b/src/streamvbyte.c @@ -6,26 +6,31 @@ // no particular alignment is assumed or guaranteed for any elements #if defined(_MSC_VER) - /* Microsoft C/C++-compatible compiler */ - #include - #include - #include - #define __restrict__ __restrict + /* Microsoft C/C++-compatible compiler */ + #if (defined(_M_IX86) || defined(_M_AMD64)) + #include + #elif defined(_M_ARM64) + #include "fastpfor_neon.h" + #endif + + #include + #include + #define __restrict__ __restrict #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) - /* GCC-compatible compiler, targeting x86/x86-64 */ - #include -#elif defined(__GNUC__) && defined(__ARM_NEON__) - /* GCC-compatible compiler, targeting ARM with NEON */ - #include + /* GCC-compatible compiler, targeting x86/x86-64 */ + #include +#elif defined(__aarch64__) + /* GCC-compatible compiler, targeting ARM with NEON */ + #include "fastpfor_neon.h" #elif defined(__GNUC__) && defined(__IWMMXT__) - /* GCC-compatible compiler, targeting ARM with WMMX */ - #include + /* GCC-compatible compiler, targeting ARM with WMMX */ + #include #elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__)) - /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ - #include + /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ + #include #elif defined(__GNUC__) && defined(__SPE__) - /* GCC-compatible compiler, targeting PowerPC with SPE */ - #include + /* GCC-compatible compiler, targeting PowerPC with SPE */ + #include #endif #include @@ -256,7 +261,7 @@ uint8_t *svb_insert_scalar_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, // first insert the new key uint8_t code = _encode_data(new_key - prev, &dataPtr); - *keyPtr = key | (code << shift); + *keyPtr = (uint8_t)(key | (code << shift)); // then update the current key shift += 2; @@ -285,7 +290,7 @@ uint8_t *svb_insert_scalar_d1_init(uint8_t *keyPtr, uint8_t *dataPtr, uint8_t code = _encode_data(new_key - prev, &dataPtr); key &= ~(3 << shift); key |= code << shift; - *keyPtr = key; // write last key (no increment needed) + *keyPtr = (uint8_t)(key); // write last key (no increment needed) *position = count; return dataPtrBegin + dataSize + code + 1; @@ -618,7 +623,7 @@ static const int8_t shuffleTable[256][16] = { // static char HighTo32[16] = {8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, // 15, -1, -1}; // Byte Order: {0x0706050403020100, 0x0F0E0D0C0B0A0908} -#if !defined(_MSC_VER) || defined(__clang__) +#if !defined(_MSC_VER) || defined(__clang__) || (defined(_MSC_VER) && defined(_M_ARM64)) static const xmm_t High16To32 = { (long long)0xFFFF0B0AFFFF0908, (long long)0xFFFF0F0EFFFF0D0C}; #else static const xmm_t High16To32 = {8, 9, -1, -1, 10, 11, -1, -1, @@ -879,7 +884,7 @@ uint8_t *svb_decode_avx_simple(uint32_t *out, uint8_t *__restrict__ keyPtr, return svb_decode_scalar(out, keyPtr + consumedkeys, dataPtr, count & 31); } -uint64_t svb_encode(uint8_t *out, const uint32_t *in, uint32_t count, int delta, +size_t svb_encode(uint8_t *out, const uint32_t *in, uint32_t count, int delta, int type) { *(uint32_t *)out = count; // first 4 bytes is number of ints uint8_t *keyPtr = out + 4; // keys come immediately after 32-bit count @@ -898,7 +903,7 @@ uint64_t svb_encode(uint8_t *out, const uint32_t *in, uint32_t count, int delta, abort(); } -uint64_t svb_decode(uint32_t *out, uint8_t *in, int delta, int type) { +size_t svb_decode(uint32_t *out, uint8_t *in, int delta, int type) { uint32_t count = *(uint32_t *)in; // first 4 bytes is number of ints if (count == 0) return 0; diff --git a/src/varintdecode.c b/src/varintdecode.c index e106a2a..15f0cff 100644 --- a/src/varintdecode.c +++ b/src/varintdecode.c @@ -5,22 +5,27 @@ #if defined(_MSC_VER) /* Microsoft C/C++-compatible compiler */ - #include + #if (defined(_M_IX86) || defined(_M_AMD64)) + #include + #elif defined(_M_ARM64) + #include "fastpfor_neon.h" + #endif #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) - /* GCC-compatible compiler, targeting x86/x86-64 */ - #include -#elif defined(__GNUC__) && defined(__ARM_NEON__) - /* GCC-compatible compiler, targeting ARM with NEON */ - #include + /* GCC-compatible compiler, targeting x86/x86-64 */ + #include + +#elif defined(__aarch64__) + /* GCC-compatible compiler, targeting ARM with NEON */ + #include "fastpfor_neon.h" #elif defined(__GNUC__) && defined(__IWMMXT__) - /* GCC-compatible compiler, targeting ARM with WMMX */ - #include + /* GCC-compatible compiler, targeting ARM with WMMX */ + #include #elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__)) - /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ - #include + /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ + #include #elif defined(__GNUC__) && defined(__SPE__) - /* GCC-compatible compiler, targeting PowerPC with SPE */ - #include + /* GCC-compatible compiler, targeting PowerPC with SPE */ + #include #endif #include @@ -1205,7 +1210,7 @@ size_t masked_vbyte_read_loop(const uint8_t *in, uint32_t *out, } while (count + 112 < length); // 112 == 48 + 48 ahead for scanning + up to // 16 remaining in sig sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; + availablebytes = (int)(scanned - consumed); } while (availablebytes + count < length) { if (availablebytes < 16) { @@ -1236,7 +1241,7 @@ size_t masked_vbyte_read_loop(const uint8_t *in, uint32_t *out, uint64_t eaten = masked_vbyte_read_group(in + consumed, out + count, sig, &ints_read); consumed += eaten; - availablebytes -= eaten; + availablebytes -= (int)eaten; sig >>= eaten; count += ints_read; } @@ -1323,7 +1328,7 @@ size_t altmasked_vbyte_read_loop(const uint8_t *in, uint32_t *out, } } sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; + availablebytes = (int)(scanned - consumed); } while (1) { if (availablebytes < 16) { @@ -1354,7 +1359,7 @@ size_t altmasked_vbyte_read_loop(const uint8_t *in, uint32_t *out, uint64_t eaten = masked_vbyte_read_group(in + consumed, out + count, sig, &ints_read); consumed += eaten; - availablebytes -= eaten; + availablebytes -= (int)eaten; sig >>= eaten; count += ints_read; } @@ -1442,7 +1447,7 @@ size_t masked_vbyte_read_loop_fromcompressedsize(const uint8_t *in, } } sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; + availablebytes = (int)(scanned - consumed); } while (1) { if (availablebytes < 16) { @@ -1472,7 +1477,7 @@ size_t masked_vbyte_read_loop_fromcompressedsize(const uint8_t *in, uint64_t bytes = masked_vbyte_read_group(in + consumed, out, sig, &ints_read); consumed += bytes; - availablebytes -= bytes; + availablebytes -= (int)bytes; sig >>= bytes; out += ints_read; } @@ -1570,7 +1575,7 @@ size_t altmasked_vbyte_read_loop_fromcompressedsize(const uint8_t *in, } } sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; + availablebytes = (int)(scanned - consumed); } while (1) { if (availablebytes < 16) { @@ -1600,7 +1605,7 @@ size_t altmasked_vbyte_read_loop_fromcompressedsize(const uint8_t *in, uint64_t bytes = masked_vbyte_read_group(in + consumed, out, sig, &ints_read); consumed += bytes; - availablebytes -= bytes; + availablebytes -= (int)bytes; sig >>= bytes; out += ints_read; }