Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
0658c3f
ALP: Add AlpConstants with encoding tables and format constants
sfc-gh-pgaur Mar 7, 2026
d656aaf
ALP: Add AlpEncoderDecoder with core encode/decode logic
sfc-gh-pgaur Mar 7, 2026
388d2a9
ALP: Add AlpCompression for single-vector compress/decompress
sfc-gh-pgaur Mar 7, 2026
9bb8a86
ALP: Add AlpSampler for sampling-based encoding preset generation
sfc-gh-pgaur Mar 7, 2026
60607ed
ALP: Add AlpWrapper for page-level encode/decode with 7-byte header
sfc-gh-pgaur Mar 7, 2026
a2c3992
ALP: Add incremental AlpValuesWriter for float and double columns
sfc-gh-pgaur Mar 7, 2026
27508d9
ALP: Add lazy AlpValuesReader for float and double columns
sfc-gh-pgaur Mar 7, 2026
1036d09
ALP: Add cross-implementation tests and fix encode/decode to match C++
sfc-gh-pgaur Mar 7, 2026
1d4e46d
ALP: Add encoding benchmark for float and double throughput
sfc-gh-pgaur Mar 7, 2026
1cb5ea5
ALP: Wire up Encoding.ALP in column Encoding enum and AlpValuesWriter
sfc-gh-pgaur Mar 7, 2026
0d7cd79
ALP: Apply spotless formatting fixes
sfc-gh-pgaur Mar 7, 2026
09ca686
ALP: Add cross-language interop test reading C++ ALP-encoded parquet …
sfc-gh-pgaur Mar 7, 2026
6d0443d
ALP: Add writer pipeline integration and bidirectional cross-language…
sfc-gh-pgaur Mar 7, 2026
00dbf38
ALP: Add float32 cross-language tests and generator support
sfc-gh-pgaur Mar 8, 2026
385d053
ALP: Add codec-level and pipeline throughput benchmarks
sfc-gh-pgaur Mar 8, 2026
abbe36d
ALP: Add pipeline integration, reader buffer reuse, and validation
sfc-gh-pgaur Mar 8, 2026
05996ba
ALP: Add encoding comparison benchmark (ALP vs ZSTD vs BSS+ZSTD)
sfc-gh-pgaur Mar 8, 2026
8f4cb19
ALP: Pre-allocate long[] encoded buffer for double vector decodes
sfc-gh-pgaur Mar 8, 2026
78cb02d
ALP: Use unpack32Values for long bit unpacking
sfc-gh-pgaur Mar 8, 2026
af73f5e
ALP: Inline decode with hoisted multipliers for double decompression
sfc-gh-pgaur Mar 9, 2026
9c1526b
ALP: Use real Spotify dataset in codec throughput benchmark
sfc-gh-pgaur Mar 9, 2026
a598134
ALP: Remove duplicate CSV test data from parquet-column
sfc-gh-pgaur Mar 9, 2026
5c3d7fa
ALP: Gzip compress CSV test data and regenerate float parquet files
sfc-gh-pgaur Mar 9, 2026
48d5e73
ALP: Use real Spotify dataset in encoding comparison benchmark
sfc-gh-pgaur Mar 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.values.ValuesReader;
import org.apache.parquet.column.values.alp.AlpValuesReaderForDouble;
import org.apache.parquet.column.values.alp.AlpValuesReaderForFloat;
import org.apache.parquet.column.values.bitpacking.ByteBitPackingValuesReader;
import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForDouble;
import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForFLBA;
Expand Down Expand Up @@ -147,6 +149,24 @@ public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valu
}
},

/**
 * Adaptive Lossless floating-Point (ALP) encoding.
 *
 * <p>Readers are only available for FLOAT and DOUBLE columns; asking for a reader
 * on any other column type fails with a {@link ParquetDecodingException}.
 */
ALP {
    @Override
    public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) {
        // Pick the reader matching the column's physical type; the default branch
        // throws, so the local is always assigned when the switch completes.
        final ValuesReader alpReader;
        switch (descriptor.getType()) {
            case DOUBLE:
                alpReader = new AlpValuesReaderForDouble();
                break;
            case FLOAT:
                alpReader = new AlpValuesReaderForFloat();
                break;
            default:
                throw new ParquetDecodingException(
                        "Encoding ALP is only supported for type FLOAT and DOUBLE, got " + descriptor.getType());
        }
        return alpReader;
    }
},

/**
* @deprecated This is no longer used, and has been replaced by {@link #RLE}
* which is combination of bit packing and rle
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public class ParquetProperties {
public static final int DEFAULT_DICTIONARY_PAGE_SIZE = DEFAULT_PAGE_SIZE;
public static final boolean DEFAULT_IS_DICTIONARY_ENABLED = true;
public static final boolean DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED = false;
public static final boolean DEFAULT_IS_ALP_ENABLED = false;
public static final WriterVersion DEFAULT_WRITER_VERSION = WriterVersion.PARQUET_1_0;
public static final boolean DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK = true;
public static final int DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK = 100;
Expand Down Expand Up @@ -132,6 +133,7 @@ public static WriterVersion fromString(String name) {
private final int pageRowCountLimit;
private final boolean pageWriteChecksumEnabled;
private final ColumnProperty<ByteStreamSplitMode> byteStreamSplitEnabled;
private final ColumnProperty<Boolean> alpEnabled;
private final Map<String, String> extraMetaData;
private final ColumnProperty<Boolean> statistics;
private final ColumnProperty<Boolean> sizeStatistics;
Expand Down Expand Up @@ -164,6 +166,7 @@ private ParquetProperties(Builder builder) {
this.pageRowCountLimit = builder.pageRowCountLimit;
this.pageWriteChecksumEnabled = builder.pageWriteChecksumEnabled;
this.byteStreamSplitEnabled = builder.byteStreamSplitEnabled.build();
this.alpEnabled = builder.alpEnabled.build();
this.extraMetaData = builder.extraMetaData;
this.statistics = builder.statistics.build();
this.sizeStatistics = builder.sizeStatistics.build();
Expand Down Expand Up @@ -259,6 +262,20 @@ public boolean isByteStreamSplitEnabled(ColumnDescriptor column) {
}
}

/**
 * Reports whether ALP encoding is enabled for the given column.
 *
 * <p>The per-column ALP setting is only consulted for FLOAT and DOUBLE columns;
 * every other primitive type yields {@code false}.
 *
 * @param column the column to check
 * @return {@code true} if the column is FLOAT or DOUBLE and ALP is enabled for it
 */
public boolean isAlpEnabled(ColumnDescriptor column) {
    boolean enabled = false;
    switch (column.getPrimitiveType().getPrimitiveTypeName()) {
        case FLOAT:
        case DOUBLE:
            // Only floating-point columns look up the configured value.
            enabled = alpEnabled.getValue(column);
            break;
        default:
            break;
    }
    return enabled;
}

/**
 * @return the {@link ByteBufferAllocator} held by these properties
 */
public ByteBufferAllocator getAllocator() {
    return this.allocator;
}
Expand Down Expand Up @@ -416,6 +433,7 @@ public static class Builder {
private int pageRowCountLimit = DEFAULT_PAGE_ROW_COUNT_LIMIT;
private boolean pageWriteChecksumEnabled = DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED;
private final ColumnProperty.Builder<ByteStreamSplitMode> byteStreamSplitEnabled;
private final ColumnProperty.Builder<Boolean> alpEnabled;
private Map<String, String> extraMetaData = new HashMap<>();
private final ColumnProperty.Builder<Boolean> statistics;
private final ColumnProperty.Builder<Boolean> sizeStatistics;
Expand All @@ -427,6 +445,7 @@ private Builder() {
DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED
? ByteStreamSplitMode.FLOATING_POINT
: ByteStreamSplitMode.NONE);
alpEnabled = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_IS_ALP_ENABLED);
bloomFilterEnabled = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_ENABLED);
bloomFilterNDVs = ColumnProperty.<Long>builder().withDefaultValue(null);
bloomFilterFPPs = ColumnProperty.<Double>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_FPP);
Expand Down Expand Up @@ -457,6 +476,7 @@ private Builder(ParquetProperties toCopy) {
this.numBloomFilterCandidates = ColumnProperty.builder(toCopy.numBloomFilterCandidates);
this.maxBloomFilterBytes = toCopy.maxBloomFilterBytes;
this.byteStreamSplitEnabled = ColumnProperty.builder(toCopy.byteStreamSplitEnabled);
this.alpEnabled = ColumnProperty.builder(toCopy.alpEnabled);
this.extraMetaData = toCopy.extraMetaData;
this.statistics = ColumnProperty.builder(toCopy.statistics);
this.sizeStatistics = ColumnProperty.builder(toCopy.sizeStatistics);
Expand Down Expand Up @@ -534,6 +554,29 @@ public Builder withExtendedByteStreamSplitEncoding(boolean enable) {
return this;
}

/**
 * Turns ALP encoding on or off for FLOAT and DOUBLE columns by setting the
 * default value of the ALP column property.
 *
 * @param enable {@code true} to enable ALP encoding, {@code false} to disable it
 * @return this builder for method chaining.
 */
public Builder withAlpEncoding(boolean enable) {
    alpEnabled.withDefaultValue(enable);
    return this;
}

/**
 * Turns ALP encoding on or off for a single column by recording a value for
 * that column path in the ALP column property.
 *
 * @param columnPath the path of the column (dot-string)
 * @param enable     {@code true} to enable ALP encoding for the column, {@code false} to disable it
 * @return this builder for method chaining.
 */
public Builder withAlpEncoding(String columnPath, boolean enable) {
    alpEnabled.withValue(columnPath, enable);
    return this;
}

/**
* Set the Parquet format dictionary page size.
*
Expand Down
Loading