fast-pack · lemire · Jun 7, 2026
diff --git a/.github/workflows/buildwheels.yml b/.github/workflows/buildwheels.yml
@@ -0,0 +1,83 @@
+name: Build and upload wheels
+
+on:
+  push:
+    tags:
+      - 'v[0-9]+\.[0-9]+\.[0-9]+'
+  workflow_dispatch: # Allows manual triggering
+
+# The Python project lives in the python_bindings/ subdirectory and reaches the
+# C++ sources through a `fastpfor` symlink to the repository root. That symlink
+# is fine for local builds but does not survive being copied into cibuildwheel's
+# build containers (and is not preserved on Windows checkouts). To stay portable
+# we build a self-contained sdist first (it bundles the real header/source
+# files) and build every wheel from that extracted sdist.
+jobs:
+  build_sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Build sdist
+        run: uv build --sdist
+        working-directory: python_bindings
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: sdist-artifact
+          path: python_bindings/dist/*.tar.gz
+          if-no-files-found: error
+
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    needs: build_sdist
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        # Native runners for each architecture we publish: x86_64 Linux,
+        # aarch64 Linux, x86_64 macOS, Apple Silicon macOS, and x86_64 Windows.
+        os: [ubuntu-latest, ubuntu-24.04-arm, macos-15-intel, macos-latest, windows-latest]
+
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          name: sdist-artifact
+          path: dist
+
+      - name: Unpack sdist
+        shell: bash
+        run: |
+          mkdir -p sdist_src
+          tar -xzf dist/*.tar.gz -C sdist_src --strip-components=1
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v3.3.1
+        with:
+          package-dir: sdist_src
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: wheel-artifact-${{ matrix.os }}
+          path: ./wheelhouse/*.whl
+          if-no-files-found: error
+
+  upload_pypi:
+    needs: [build_wheels, build_sdist]
+    runs-on: ubuntu-latest
+    # Only upload to PyPI when triggered by a tag (not manual workflow_dispatch).
+    if: startsWith(github.ref, 'refs/tags/')
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          path: dist
+          merge-multiple: true
+
+      - uses: pypa/gh-action-pypi-publish@v1.13.0
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,32 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C/C++ extensions and objects
+*.so
+*.o
+*.a
+
+# Distribution / packaging
+.Python
+env/
+venv/
+build/
+develop-eggs/
+dist/
+eggs/
+.eggs/
+sdist/
+var/
+wheels/
+wheelhouse/
+*.egg-info/
+*.egg
+
+# CMake build directories
+cmake-build-*/
+
+# Editor / OS cruft
+.DS_Store
+*.swp
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -9,7 +9,7 @@
 #      will be re-used by the Python build process
 #   2) Build and run unit and performance tests.
 #
-cmake_minimum_required(VERSION 2.8.7)
+cmake_minimum_required(VERSION 3.10)
 if (NOT CMAKE_BUILD_TYPE)
 		message(STATUS "No build type selected, default to Release")
 		set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
@@ -54,8 +54,10 @@ MESSAGE( STATUS "CMAKE_C_COMPILER: " ${CMAKE_C_COMPILER} )
 MESSAGE( STATUS "CXX_COMPILER_VERSION: " ${CXX_COMPILER_VERSION} )
 if( SUPPORT_SSE42 )
     MESSAGE( STATUS "SSE 4.2 support detected" )
+elseif( SUPPORT_NEON )
+    MESSAGE( STATUS "ARM NEON detected: SSE intrinsics are emulated via fastpfor_neon.h" )
 else()
-    MESSAGE( STATUS "SSE 4.2 support not detected" )
+    MESSAGE( STATUS "Neither SSE 4.2 nor ARM NEON support detected" )
 endif()
 
 if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
@@ -79,14 +81,21 @@ elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
     set (CMAKE_CXX_FLAGS_DEBUG   "-Wall -ggdb   -std=c++11 -DHAVE_CXX0X  -march=native")
     set (CMAKE_C_FLAGS_RELEASE "-Wall -Ofast  -DNDEBUG -std=c99  -march=native")
     set (CMAKE_C_FLAGS_DEBUG   "-Wall -ggdb   -std=c99  -march=native")
-elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
     if (CXX_COMPILER_VERSION VERSION_LESS 4.2.1)
         message(STATUS  "Clang version must be at least 4.2.1!" )
     endif()
-    set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native")
-    set (CMAKE_CXX_FLAGS_DEBUG   "-Wall -Wcast-align -ggdb  -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native")
-    set (CMAKE_C_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c99 -msse4.1 -march=native")
-    set (CMAKE_C_FLAGS_DEBUG   "-Wall -Wcast-align -ggdb  -std=c99 -msse4.1 -march=native")
+    # -msse4.1 is x86-only and rejected on ARM; -march=native already enables the
+    # available SIMD (SSE/AVX on x86, NEON on ARM), so only add it off ARM.
+    if (SUPPORT_NEON)
+        set (SIMD_FLAGS "-march=native")
+    else()
+        set (SIMD_FLAGS "-msse4.1 -march=native")
+    endif()
+    set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X ${SIMD_FLAGS}")
+    set (CMAKE_CXX_FLAGS_DEBUG   "-Wall -Wcast-align -ggdb  -std=c++11 -DHAVE_CXX0X ${SIMD_FLAGS}")
+    set (CMAKE_C_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c99 ${SIMD_FLAGS}")
+    set (CMAKE_C_FLAGS_DEBUG   "-Wall -Wcast-align -ggdb  -std=c99 ${SIMD_FLAGS}")
 elseif(WIN32)
     # TODO add support for later versions?
     if(NOT MSVC12)

diff --git a/README.md b/README.md
@@ -19,7 +19,7 @@ or via pip:
 ```
 pip install pyfastpfor
 ```
-Due to some compilation quirks this currently seem to work with GCC only. I will fix it in some not so distant future. You may also need to install Python dev-files. On Ubuntu, for Python 3 you can do it as follows:
+The bindings build with GCC or Clang, on both x86-64 (SSE/AVX) and ARM/aarch64 (NEON, including Apple Silicon). You may also need to install Python dev-files. On Ubuntu, for Python 3 you can do it as follows:
 
 ```
 sudo apt-get install python3-dev
@@ -28,7 +28,7 @@ sudo apt-get install python3-dev
 
 # Documentation
 
-The library supports all the codecs implemented in the original [FastPFor](https://github.com/lemire/FastPFor) library by July 2023. To get a list of codecs, use the function ``getCodecList``. 
+The library supports all the codecs implemented in the original [FastPFor](https://github.com/lemire/FastPFor) library (v0.5.0). To get a list of codecs, use the function ``getCodecList``. 
 
 Typical light-weight compression does not take context into account and, consequently, works well only for small integers. When integers are large, data differencing is a common trick to make integers small. In particular, we often deal with sorted lists of integers, which can be represented by differences between neighboring numbers. 
 

diff --git a/cmake_modules/DetectCPUFeatures.cmake b/cmake_modules/DetectCPUFeatures.cmake
@@ -47,7 +47,17 @@ else()
 	check_cxx_source_compiles("${AVXPROG}" SUPPORT_AVX)
 	set(CMAKE_REQUIRED_FLAGS "-march=native -mavx2")
 	check_cxx_source_compiles("${AVX2PROG}" SUPPORT_AVX2)
-endif()	
+endif()
 
 set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
+
+# On ARM the SSE intrinsics used throughout FastPFor are provided by the
+# fastpfor_neon.h shim (mapped onto NEON), so there is no SSE4.2 support but the
+# code still compiles. Flag ARM builds so the SIMD-specific compiler options
+# (e.g. -msse4.1) can be skipped. MATCHES is case-sensitive, and Windows hosts
+# report the processor in upper case ("ARM64"), so both spellings are accepted.
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|ARM|aarch64)"
+    OR CMAKE_GENERATOR_PLATFORM MATCHES "ARM")
+	set(SUPPORT_NEON ON)
+endif ()
 
diff --git a/headers/VarIntG8IU.h b/headers/VarIntG8IU.h
@@ -2,7 +2,7 @@
  * This code is released under the
  * Apache License Version 2.0 http://www.apache.org/licenses/.
  */
-#if !defined(__SSSE3__) && !(defined(_MSC_VER) && defined(__AVX__))
+#if (!defined(__SSSE3__) && !(defined(_MSC_VER) && defined(__AVX__))) && !(defined(__ARM_NEON) || defined(__aarch64__))
 #ifndef _MSC_VER
 #pragma message                                                                \
     "Disabling varintg8iu due to lack of SSSE3 support, try adding -mssse3 or the equivalent on your compiler"
@@ -12,7 +12,12 @@
 #else
 #ifndef VARINTG8IU_H__
 #define VARINTG8IU_H__
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
 #include <emmintrin.h>
+#elif defined(__aarch64__)
+/* GCC-compatible compiler, targeting ARM with native NEON */
+#include "fastpfor_neon.h"
+#endif
 #include "codecs.h"
 #ifdef __GNUC__
 #define PREDICT_FALSE(x) (__builtin_expect(x, 0))

diff --git a/headers/blockpacking.h b/headers/blockpacking.h
@@ -286,6 +286,7 @@ class ByteAlignedPacking : public IntegerCODEC {
 
   const uint32_t *decodeArray(const uint32_t *in, const size_t length,
                               uint32_t *out, size_t &nvalue) {
+    (void)length;
     const uint32_t actuallength = *in++;
     const uint8_t *inbyte = reinterpret_cast<const uint8_t *>(in);
     const uint32_t *const initout(out);

diff --git a/headers/codecfactory.h b/headers/codecfactory.h
@@ -21,15 +21,54 @@ class CODECFactory {
 public:
   CODECFactory();
 
-  std::vector<std::shared_ptr<IntegerCODEC>> allSchemes();
+  std::vector<std::shared_ptr<IntegerCODEC>> allSchemes() const;
 
-  std::vector<std::string> allNames();
+  std::vector<std::string> allNames() const;
 
-  std::shared_ptr<IntegerCODEC> &getFromName(std::string name);
+  std::shared_ptr<IntegerCODEC> const& getFromName(std::string name) const;
 private:
   CodecMap scodecmap;
 };
 
+std::unique_ptr<IntegerCODEC> fastbinarypacking8_codec();
+std::unique_ptr<IntegerCODEC> fastbinarypacking16_codec();
+std::unique_ptr<IntegerCODEC> fastbinarypacking32_codec();
+std::unique_ptr<IntegerCODEC> BP32_codec();
+std::unique_ptr<IntegerCODEC> vsencoding_codec();
+std::unique_ptr<IntegerCODEC> fastpfor128_codec();
+std::unique_ptr<IntegerCODEC> fastpfor256_codec();
+std::unique_ptr<IntegerCODEC> simdfastpfor128_codec();
+std::unique_ptr<IntegerCODEC> simdfastpfor256_codec();
+std::unique_ptr<IntegerCODEC> simplepfor_codec();
+std::unique_ptr<IntegerCODEC> simdsimplepfor_codec();
+std::unique_ptr<IntegerCODEC> pfor_codec();
+std::unique_ptr<IntegerCODEC> simdpfor_codec();
+std::unique_ptr<IntegerCODEC> pfor2008_codec();
+std::unique_ptr<IntegerCODEC> simdnewpfor_codec();
+std::unique_ptr<IntegerCODEC> newpfor_codec();
+std::unique_ptr<IntegerCODEC> optpfor_codec();
+std::unique_ptr<IntegerCODEC> simdoptpfor_codec();
+std::unique_ptr<IntegerCODEC> varint_codec();
+std::unique_ptr<IntegerCODEC> vbyte_codec();
+std::unique_ptr<IntegerCODEC> maskedvbyte_codec();
+std::unique_ptr<IntegerCODEC> streamvbyte_codec();
+std::unique_ptr<IntegerCODEC> varintgb_codec();
+std::unique_ptr<IntegerCODEC> simple16_codec();
+std::unique_ptr<IntegerCODEC> simple9_codec();
+std::unique_ptr<IntegerCODEC> simple9_rle_codec();
+std::unique_ptr<IntegerCODEC> simple8b_codec();
+std::unique_ptr<IntegerCODEC> simple8b_rle_codec();
+#ifdef VARINTG8IU_H__
+std::unique_ptr<IntegerCODEC> varintg8iu_codec();
+#endif
+#ifdef USESNAPPY
+std::unique_ptr<IntegerCODEC> snappy_codec();
+#endif
+std::unique_ptr<IntegerCODEC> simdbinarypacking_codec();
+std::unique_ptr<IntegerCODEC> simdgroupsimple_codec();
+std::unique_ptr<IntegerCODEC> simdgroupsimple_ringbuf_codec();
+std::unique_ptr<IntegerCODEC> copy_codec();
+
 } // namespace FastPForLib
 
 #endif /* CODECFACTORY_H_ */
diff --git a/headers/codecs.h b/headers/codecs.h
@@ -53,6 +53,10 @@ class IntegerCODEC {
    * of the variable nvalue gets updated with the number actually use
    * (if nvalue exceeds the original value, there might be a buffer
    * overrun).
+   *
+   * NOTE: Decoding can be performed with an unknown input length. This
+   * case is indicated by a length of 0; however, nvalue must still be
+   * provided so that the decoder knows how many values to decode.
    */
   virtual const uint32_t *decodeArray(const uint32_t *in, const size_t length,
                                       uint32_t *out, size_t &nvalue) = 0;

diff --git a/headers/common.h b/headers/common.h
@@ -10,7 +10,12 @@
 // C headers (sorted)
 #include <errno.h>
 #include <fcntl.h>
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
 #include <immintrin.h>
+#elif defined(__GNUC__) && defined(__aarch64__)
+#include "fastpfor_neon.h"
+#endif
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -44,7 +49,12 @@
 #ifdef _MSC_VER
 #include <iso646.h>
 #include <stdint.h>
+
+#if (defined(_M_IX86) || defined(_M_AMD64))
 #include <intrin.h>
+#elif defined(_M_ARM64)
+#include "fastpfor_neon.h"
+#endif
 
 #define __attribute__(n)
 #define __restrict__ __restrict

diff --git a/headers/cpubenchmark.h b/headers/cpubenchmark.h
@@ -42,11 +42,29 @@ static __inline__ unsigned long long stopRDTSCP(void) {
                  "%rdx");
   return (static_cast<unsigned long long>(cycles_high) << 32) | cycles_low;
 }
-#elif defined(_MSC_VER)
+#elif (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64)))
 
 static inline unsigned long long startRDTSC(void) { return __rdtsc(); }
 
 static inline unsigned long long stopRDTSCP(void) { return __rdtsc(); }
+#elif defined(_MSC_VER) && defined(_M_ARM64)
+  // Modeled on the zeromq implementation for MSVC on ARM/ARM64:
+  // https://github.com/zeromq/libzmq/blob/master/src/clock.cpp
+  // The constant packs the op0/op1/crn/crm/op2 fields of the pmccntr_el0
+  // system register into the single operand passed to _ReadStatusReg.
+  inline unsigned long long rdtsc() {
+    const int64_t pmccntr_el0 = (((3 & 1) << 14) |  // op0
+                                 ((3 & 7) << 11) |  // op1
+                                 ((9 & 15) << 7) |  // crn
+                                 ((13 & 15) << 3) | // crm
+                                 ((0 & 7) << 0));   // op2
+
+    return _ReadStatusReg (pmccntr_el0);
+  }
+
+  static inline unsigned long long startRDTSC(void) { return rdtsc(); }
+
+  static inline unsigned long long stopRDTSCP(void) { return rdtsc(); }
 
 #elif defined(__i386__) || defined(__x86_64__)
 
@@ -66,15 +84,28 @@ inline unsigned long long rdtsc() {
 static __inline__ unsigned long long startRDTSC(void) { return rdtsc(); }
 
 static __inline__ unsigned long long stopRDTSCP(void) { return rdtsc(); }
+#elif (defined(__GNUC__) && (defined(__aarch64__)))
+    // Reads the cntvct_el0 counter register.
+    // NOTE(review): presumably a fixed-frequency timer rather than a CPU
+    // cycle count -- confirm before comparing with x86 RDTSC numbers.
+    inline uint64_t rdtsc() {
+        uint64_t cycles;
+        asm volatile("mrs %0, cntvct_el0"
+                : "=r"(cycles)); /* output */
+        return cycles;
+    }
+
+    static __inline__ uint64_t startRDTSC(void) { return rdtsc(); }
 
+    static __inline__ uint64_t stopRDTSCP(void) { return rdtsc(); }
 #elif(defined(__arm__) || defined(__ppc__) || defined(__ppc64__))
 
 // for PPC we should be able to use tbl, but I could not find
 // an equivalent to rdtsc for ARM.
 
-inline uint64 rdtsc() { return 0; }
-static __inline__ ticks startRDTSC(void) { return 0; }
-static __inline__ ticks stopRDTSCP(void) { return 0; }
+inline uint64_t rdtsc() { return 0; }
+static __inline__ uint64_t startRDTSC(void) { return 0; }
+static __inline__ uint64_t stopRDTSCP(void) { return 0; }
 #else
 #error Unknown architecture
 #endif