Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions .github/workflows/buildwheels.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
name: Build and upload wheels

# Runs when a version tag of the form vX.Y.Z is pushed, or when started by hand
# from the Actions tab. Only tag-triggered runs publish to PyPI (see the
# upload_pypi job below).
on:
  push:
    tags:
      - 'v[0-9]+\.[0-9]+\.[0-9]+'
  workflow_dispatch: # Allows manual triggering

# The Python project lives in the python_bindings/ subdirectory and reaches the
# C++ sources through a `fastpfor` symlink to the repository root. That symlink
# is fine for local builds but does not survive being copied into cibuildwheel's
# build containers (and is not preserved on Windows checkouts). To stay portable
# we build a self-contained sdist first (it bundles the real header/source
# files) and build every wheel from that extracted sdist.
jobs:
  # Step 1: produce the sdist once, on Linux, and publish it as an artifact so
  # that every wheel job builds from exactly the same sources.
  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v5

      - name: Build sdist
        run: uv build --sdist
        working-directory: python_bindings

      - uses: actions/upload-artifact@v4
        with:
          name: sdist-artifact
          path: python_bindings/dist/*.tar.gz
          if-no-files-found: error

  # Step 2: build the wheels for every platform from the unpacked sdist. No
  # repository checkout is needed here; the sdist artifact is the only input.
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    needs: build_sdist
    runs-on: ${{ matrix.os }}
    strategy:
      # Keep building the remaining platforms when one of them fails.
      fail-fast: false
      matrix:
        # Native runners for each architecture we publish: x86_64 Linux,
        # aarch64 Linux, x86_64 macOS, Apple Silicon macOS, and x86_64 Windows.
        # The Intel macOS job uses the macos-15-intel label because the
        # macos-13 runner image has been retired by GitHub.
        os: [ubuntu-latest, ubuntu-24.04-arm, macos-15-intel, macos-latest, windows-latest]

    steps:
      - uses: actions/download-artifact@v4
        with:
          name: sdist-artifact
          path: dist

      # bash is requested explicitly so the same commands also run on Windows.
      - name: Unpack sdist
        shell: bash
        run: |
          mkdir -p sdist_src
          tar -xzf dist/*.tar.gz -C sdist_src --strip-components=1

      - name: Build wheels
        uses: pypa/cibuildwheel@v3.3.1
        with:
          package-dir: sdist_src

      # One artifact per runner; the names must be unique across the matrix.
      - uses: actions/upload-artifact@v4
        with:
          name: wheel-artifact-${{ matrix.os }}
          path: ./wheelhouse/*.whl
          if-no-files-found: error

  # Step 3: collect the sdist and all wheels into dist/ and publish them.
  upload_pypi:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    # Only upload to PyPI when triggered by a tag (not manual workflow_dispatch).
    if: startsWith(github.ref, 'refs/tags/')
    steps:
      # No artifact name is given, so every artifact of this run is downloaded;
      # merge-multiple flattens them into the single dist/ directory.
      - uses: actions/download-artifact@v4
        with:
          path: dist
          merge-multiple: true

      - uses: pypa/gh-action-pypi-publish@v1.13.0
        with:
          user: __token__
          password: ${{ secrets.PYPI_API_TOKEN }}
32 changes: 32 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C/C++ extensions and objects
*.so
*.o
*.a

# Distribution / packaging
.Python
env/
venv/
build/
develop-eggs/
dist/
eggs/
.eggs/
sdist/
var/
wheels/
wheelhouse/
*.egg-info/
*.egg

# CMake build directories
cmake-build-*/

# Editor / OS cruft
.DS_Store
*.swp
23 changes: 16 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# will be re-used by the Python build process
# 2) Build and run unit and performance tests.
#
cmake_minimum_required(VERSION 2.8.7)
cmake_minimum_required(VERSION 3.10)
if (NOT CMAKE_BUILD_TYPE)
message(STATUS "No build type selected, default to Release")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
Expand Down Expand Up @@ -54,8 +54,10 @@ MESSAGE( STATUS "CMAKE_C_COMPILER: " ${CMAKE_C_COMPILER} )
MESSAGE( STATUS "CXX_COMPILER_VERSION: " ${CXX_COMPILER_VERSION} )
if( SUPPORT_SSE42 )
MESSAGE( STATUS "SSE 4.2 support detected" )
elseif( SUPPORT_NEON )
MESSAGE( STATUS "ARM NEON detected: SSE intrinsics are emulated via fastpfor_neon.h" )
else()
MESSAGE( STATUS "SSE 4.2 support not detected" )
MESSAGE( STATUS "Neither SSE 4.2 nor ARM NEON support detected" )
endif()

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
Expand All @@ -79,14 +81,21 @@ elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
set (CMAKE_CXX_FLAGS_DEBUG "-Wall -ggdb -std=c++11 -DHAVE_CXX0X -march=native")
set (CMAKE_C_FLAGS_RELEASE "-Wall -Ofast -DNDEBUG -std=c99 -march=native")
set (CMAKE_C_FLAGS_DEBUG "-Wall -ggdb -std=c99 -march=native")
elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
if (CXX_COMPILER_VERSION VERSION_LESS 4.2.1)
message(STATUS "Clang version must be at least 4.2.1!" )
endif()
set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native")
set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native")
set (CMAKE_C_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c99 -msse4.1 -march=native")
set (CMAKE_C_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c99 -msse4.1 -march=native")
# -msse4.1 is x86-only and is rejected on ARM; -march=native already enables the
# available SIMD (SSE/AVX on x86, NEON on ARM), so add -msse4.1 only on non-ARM targets.
if (SUPPORT_NEON)
set (SIMD_FLAGS "-march=native")
else()
set (SIMD_FLAGS "-msse4.1 -march=native")
endif()
set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X ${SIMD_FLAGS}")
set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c++11 -DHAVE_CXX0X ${SIMD_FLAGS}")
set (CMAKE_C_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c99 ${SIMD_FLAGS}")
set (CMAKE_C_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c99 ${SIMD_FLAGS}")
elseif(WIN32)
# TODO add support for later versions?
if(NOT MSVC12)
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ or via pip:
```
pip install pyfastpfor
```
Due to some compilation quirks this currently seem to work with GCC only. I will fix it in some not so distant future. You may also need to install Python dev-files. On Ubuntu, for Python 3 you can do it as follows:
The bindings build with GCC or Clang, on both x86-64 (SSE/AVX) and ARM/aarch64 (NEON, including Apple Silicon). You may also need to install Python dev-files. On Ubuntu, for Python 3 you can do it as follows:

```
sudo apt-get install python3-dev
Expand All @@ -28,7 +28,7 @@ sudo apt-get install python3-dev

# Documentation

The library supports all the codecs implemented in the original [FastPFor](https://github.com/lemire/FastPFor) library by July 2023. To get a list of codecs, use the function ``getCodecList``.
The library supports all the codecs implemented in the original [FastPFor](https://github.com/lemire/FastPFor) library (v0.5.0). To get a list of codecs, use the function ``getCodecList``.

Typical light-weight compression does not take context into account and, consequently, works well only for small integers. When integers are large, data differencing is a common trick to make integers small. In particular, we often deal with sorted lists of integers, which can be represented by differences between neighboring numbers.

Expand Down
12 changes: 11 additions & 1 deletion cmake_modules/DetectCPUFeatures.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,17 @@ else()
check_cxx_source_compiles("${AVXPROG}" SUPPORT_AVX)
set(CMAKE_REQUIRED_FLAGS "-march=native -mavx2")
check_cxx_source_compiles("${AVX2PROG}" SUPPORT_AVX2)
endif()
endif()

set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})

# On ARM the SSE intrinsics used throughout FastPFor are provided by the
# fastpfor_neon.h shim (mapped onto NEON), so there is no SSE4.2 support but the
# code still compiles. Flag ARM builds so the SIMD-specific compiler options
# (e.g. -msse4.1) can be skipped.
#
# The processor / platform names are compared case-insensitively: the spelling
# differs between hosts (e.g. "aarch64" or "arm64" on Linux/macOS, upper-case
# "ARM64" on Windows), and CMAKE_GENERATOR_PLATFORM is empty for generators
# that have no platform selection, so it cannot be relied on alone.
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" fastpfor_system_processor)
string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" fastpfor_generator_platform)
if ("${fastpfor_system_processor}" MATCHES "arm|aarch64"
    OR "${fastpfor_generator_platform}" MATCHES "arm")
  set(SUPPORT_NEON ON)
endif ()
# Do not leak the helper variables into the including scope.
unset(fastpfor_system_processor)
unset(fastpfor_generator_platform)

7 changes: 6 additions & 1 deletion headers/VarIntG8IU.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* This code is released under the
* Apache License Version 2.0 http://www.apache.org/licenses/.
*/
#if !defined(__SSSE3__) && !(defined(_MSC_VER) && defined(__AVX__))
#if (!defined(__SSSE3__) && !(defined(_MSC_VER) && defined(__AVX__))) && !(defined(__ARM_NEON) || defined(__aarch64__))
#ifndef _MSC_VER
#pragma message \
"Disabling varintg8iu due to lack of SSSE3 support, try adding -mssse3 or the equivalent on your compiler"
Expand All @@ -12,7 +12,12 @@
#else
#ifndef VARINTG8IU_H__
#define VARINTG8IU_H__
/* Pull in the SIMD intrinsics used by this codec.
 * - x86 / x86-64 (GCC-compatible compilers and MSVC): the native SSE2 header.
 * - 64-bit ARM (GCC-compatible compilers and MSVC): the fastpfor_neon.h shim.
 * The x86 test does not depend on __GNUC__, so that MSVC x86 builds, which the
 * enclosing guard admits via _MSC_VER && __AVX__, still get <emmintrin.h>. */
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
#include <emmintrin.h>
#elif defined(__aarch64__) || defined(_M_ARM64)
#include "fastpfor_neon.h"
#endif
#include "codecs.h"
#ifdef __GNUC__
#define PREDICT_FALSE(x) (__builtin_expect(x, 0))
Expand Down
1 change: 1 addition & 0 deletions headers/blockpacking.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ class ByteAlignedPacking : public IntegerCODEC {

const uint32_t *decodeArray(const uint32_t *in, const size_t length,
uint32_t *out, size_t &nvalue) {
(void)length;
const uint32_t actuallength = *in++;
const uint8_t *inbyte = reinterpret_cast<const uint8_t *>(in);
const uint32_t *const initout(out);
Expand Down
45 changes: 42 additions & 3 deletions headers/codecfactory.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,54 @@ class CODECFactory {
public:
CODECFactory();

std::vector<std::shared_ptr<IntegerCODEC>> allSchemes();
std::vector<std::shared_ptr<IntegerCODEC>> allSchemes() const;

std::vector<std::string> allNames();
std::vector<std::string> allNames() const;

std::shared_ptr<IntegerCODEC> &getFromName(std::string name);
std::shared_ptr<IntegerCODEC> const& getFromName(std::string name) const;
private:
CodecMap scodecmap;
};

std::unique_ptr<IntegerCODEC> fastbinarypacking8_codec();
std::unique_ptr<IntegerCODEC> fastbinarypacking16_codec();
std::unique_ptr<IntegerCODEC> fastbinarypacking32_codec();
std::unique_ptr<IntegerCODEC> BP32_codec();
std::unique_ptr<IntegerCODEC> vsencoding_codec();
std::unique_ptr<IntegerCODEC> fastpfor128_codec();
std::unique_ptr<IntegerCODEC> fastpfor256_codec();
std::unique_ptr<IntegerCODEC> simdfastpfor128_codec();
std::unique_ptr<IntegerCODEC> simdfastpfor256_codec();
std::unique_ptr<IntegerCODEC> simplepfor_codec();
std::unique_ptr<IntegerCODEC> simdsimplepfor_codec();
std::unique_ptr<IntegerCODEC> pfor_codec();
std::unique_ptr<IntegerCODEC> simdpfor_codec();
std::unique_ptr<IntegerCODEC> pfor2008_codec();
std::unique_ptr<IntegerCODEC> simdnewpfor_codec();
std::unique_ptr<IntegerCODEC> newpfor_codec();
std::unique_ptr<IntegerCODEC> optpfor_codec();
std::unique_ptr<IntegerCODEC> simdoptpfor_codec();
std::unique_ptr<IntegerCODEC> varint_codec();
std::unique_ptr<IntegerCODEC> vbyte_codec();
std::unique_ptr<IntegerCODEC> maskedvbyte_codec();
std::unique_ptr<IntegerCODEC> streamvbyte_codec();
std::unique_ptr<IntegerCODEC> varintgb_codec();
std::unique_ptr<IntegerCODEC> simple16_codec();
std::unique_ptr<IntegerCODEC> simple9_codec();
std::unique_ptr<IntegerCODEC> simple9_rle_codec();
std::unique_ptr<IntegerCODEC> simple8b_codec();
std::unique_ptr<IntegerCODEC> simple8b_rle_codec();
#ifdef VARINTG8IU_H__
std::unique_ptr<IntegerCODEC> varintg8iu_codec();
#endif
#ifdef USESNAPPY
std::unique_ptr<IntegerCODEC> snappy_codec();
#endif
std::unique_ptr<IntegerCODEC> simdbinarypacking_codec();
std::unique_ptr<IntegerCODEC> simdgroupsimple_codec();
std::unique_ptr<IntegerCODEC> simdgroupsimple_ringbuf_codec();
std::unique_ptr<IntegerCODEC> copy_codec();

} // namespace FastPForLib

#endif /* CODECFACTORY_H_ */
4 changes: 4 additions & 0 deletions headers/codecs.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ class IntegerCODEC {
* of the variable nvalue gets updated with the number actually used
* (if nvalue exceeds the original value, there might be a buffer
* overrun).
*
* NOTE: Decoding can be performed with an unknown input length. This
* case is indicated by a length of 0; however, nvalue must be provided
* so that the decoder knows how many values to decode.
*/
virtual const uint32_t *decodeArray(const uint32_t *in, const size_t length,
uint32_t *out, size_t &nvalue) = 0;
Expand Down
10 changes: 10 additions & 0 deletions headers/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
// C headers (sorted)
#include <errno.h>
#include <fcntl.h>
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#include <immintrin.h>
#elif defined(__GNUC__) && defined(__aarch64__)
#include "fastpfor_neon.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -44,7 +49,12 @@
#ifdef _MSC_VER
#include <iso646.h>
#include <stdint.h>

#if (defined(_M_IX86) || defined(_M_AMD64))
#include <intrin.h>
#elif defined(_M_ARM64)
#include "fastpfor_neon.h"
#endif

#define __attribute__(n)
#define __restrict__ __restrict
Expand Down
43 changes: 38 additions & 5 deletions headers/cpubenchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,34 @@ static __inline__ unsigned long long stopRDTSCP(void) {
"%rdx");
return (static_cast<unsigned long long>(cycles_high) << 32) | cycles_low;
}
#elif defined(_MSC_VER)
#elif (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64)))

static inline unsigned long long startRDTSC(void) { return __rdtsc(); }

static inline unsigned long long stopRDTSCP(void) { return __rdtsc(); }
#elif defined(_MSC_VER) && defined(_M_ARM64)
// MSVC targeting ARM64: __rdtsc() is only used on the x86 branch above, so the
// counter is read through _ReadStatusReg instead.
// oriented by zeromq implementation for msc arm/arm64
// https://github.com/zeromq/libzmq/blob/master/src/clock.cpp
//
// This is the only MSVC/ARM64 branch in the chain: a second #elif with the
// same condition could never be selected, so it is not repeated here.
inline unsigned long long rdtsc() {
  // System-register id packed from its op0/op1/crn/crm/op2 fields; the
  // variable name identifies the register as PMCCNTR_EL0.
  const int64_t pmccntr_el0 = (((3 & 1) << 14) |  // op0
                               ((3 & 7) << 11) |  // op1
                               ((9 & 15) << 7) |  // crn
                               ((13 & 15) << 3) | // crm
                               ((0 & 7) << 0));   // op2

  return _ReadStatusReg(pmccntr_el0);
}

// Both ends of a measurement read the same counter on this platform.
static inline unsigned long long startRDTSC(void) { return rdtsc(); }

static inline unsigned long long stopRDTSCP(void) { return rdtsc(); }

#elif defined(__i386__) || defined(__x86_64__)

Expand All @@ -66,15 +89,25 @@ inline unsigned long long rdtsc() {
static __inline__ unsigned long long startRDTSC(void) { return rdtsc(); }

static __inline__ unsigned long long stopRDTSCP(void) { return rdtsc(); }
#elif (defined(__GNUC__) && (defined(__aarch64__)))
inline uint64_t rdtsc() {
uint64_t cycles;
asm volatile("mrs %0, cntvct_el0"
: "=r"(cycles)); /* output */
return cycles;
}

static __inline__ uint64_t startRDTSC(void) { return rdtsc(); }

#elif(defined(__arm__) || defined(__ppc__) || defined(__ppc64__))
static __inline__ uint64_t stopRDTSCP(void) { return rdtsc(); }
#elif(defined(__arm__) || defined(__ppc__) || defined(__ppc64__)) || (defined(_MSC_VER) && defined(_M_ARM64))

// for PPC we should be able to use tbl, but I could not find
// an equivalent to rdtsc for ARM.

inline uint64 rdtsc() { return 0; }
static __inline__ ticks startRDTSC(void) { return 0; }
static __inline__ ticks stopRDTSCP(void) { return 0; }
inline uint64_t rdtsc() { return 0; }
static __inline__ uint64_t startRDTSC(void) { return 0; }
static __inline__ uint64_t stopRDTSCP(void) { return 0; }
#else
#error Unknown architecture
#endif
Expand Down
Loading