diff --git a/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java b/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java index 874c99fded..1da687083d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/Encoding.java @@ -29,6 +29,8 @@ import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.page.DictionaryPage; import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.column.values.alp.AlpValuesReaderForDouble; +import org.apache.parquet.column.values.alp.AlpValuesReaderForFloat; import org.apache.parquet.column.values.bitpacking.ByteBitPackingValuesReader; import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForDouble; import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForFLBA; @@ -147,6 +149,24 @@ public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valu } }, + /** + * Adaptive Lossless floating-Point (ALP) encoding for FLOAT and DOUBLE columns. 
+ */ + ALP { + @Override + public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) { + switch (descriptor.getType()) { + case FLOAT: + return new AlpValuesReaderForFloat(); + case DOUBLE: + return new AlpValuesReaderForDouble(); + default: + throw new ParquetDecodingException( + "Encoding ALP is only supported for type FLOAT and DOUBLE, got " + descriptor.getType()); + } + } + }, + /** * @deprecated This is no longer used, and has been replaced by {@link #RLE} * which is combination of bit packing and rle diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java index f29214b458..cab6ed6ce1 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java @@ -50,6 +50,7 @@ public class ParquetProperties { public static final int DEFAULT_DICTIONARY_PAGE_SIZE = DEFAULT_PAGE_SIZE; public static final boolean DEFAULT_IS_DICTIONARY_ENABLED = true; public static final boolean DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED = false; + public static final boolean DEFAULT_IS_ALP_ENABLED = false; public static final WriterVersion DEFAULT_WRITER_VERSION = WriterVersion.PARQUET_1_0; public static final boolean DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK = true; public static final int DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK = 100; @@ -132,6 +133,7 @@ public static WriterVersion fromString(String name) { private final int pageRowCountLimit; private final boolean pageWriteChecksumEnabled; private final ColumnProperty byteStreamSplitEnabled; + private final ColumnProperty alpEnabled; private final Map extraMetaData; private final ColumnProperty statistics; private final ColumnProperty sizeStatistics; @@ -164,6 +166,7 @@ private ParquetProperties(Builder builder) { this.pageRowCountLimit = builder.pageRowCountLimit; 
this.pageWriteChecksumEnabled = builder.pageWriteChecksumEnabled; this.byteStreamSplitEnabled = builder.byteStreamSplitEnabled.build(); + this.alpEnabled = builder.alpEnabled.build(); this.extraMetaData = builder.extraMetaData; this.statistics = builder.statistics.build(); this.sizeStatistics = builder.sizeStatistics.build(); @@ -259,6 +262,20 @@ public boolean isByteStreamSplitEnabled(ColumnDescriptor column) { } } + /** + * Returns true if ALP encoding is enabled for the given column. + * ALP encoding is only applicable to FLOAT and DOUBLE columns. + */ + public boolean isAlpEnabled(ColumnDescriptor column) { + switch (column.getPrimitiveType().getPrimitiveTypeName()) { + case FLOAT: + case DOUBLE: + return alpEnabled.getValue(column); + default: + return false; + } + } + public ByteBufferAllocator getAllocator() { return allocator; } @@ -416,6 +433,7 @@ public static class Builder { private int pageRowCountLimit = DEFAULT_PAGE_ROW_COUNT_LIMIT; private boolean pageWriteChecksumEnabled = DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED; private final ColumnProperty.Builder byteStreamSplitEnabled; + private final ColumnProperty.Builder alpEnabled; private Map extraMetaData = new HashMap<>(); private final ColumnProperty.Builder statistics; private final ColumnProperty.Builder sizeStatistics; @@ -427,6 +445,7 @@ private Builder() { DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED ? 
ByteStreamSplitMode.FLOATING_POINT : ByteStreamSplitMode.NONE); + alpEnabled = ColumnProperty.builder().withDefaultValue(DEFAULT_IS_ALP_ENABLED); bloomFilterEnabled = ColumnProperty.builder().withDefaultValue(DEFAULT_BLOOM_FILTER_ENABLED); bloomFilterNDVs = ColumnProperty.builder().withDefaultValue(null); bloomFilterFPPs = ColumnProperty.builder().withDefaultValue(DEFAULT_BLOOM_FILTER_FPP); @@ -457,6 +476,7 @@ private Builder(ParquetProperties toCopy) { this.numBloomFilterCandidates = ColumnProperty.builder(toCopy.numBloomFilterCandidates); this.maxBloomFilterBytes = toCopy.maxBloomFilterBytes; this.byteStreamSplitEnabled = ColumnProperty.builder(toCopy.byteStreamSplitEnabled); + this.alpEnabled = ColumnProperty.builder(toCopy.alpEnabled); this.extraMetaData = toCopy.extraMetaData; this.statistics = ColumnProperty.builder(toCopy.statistics); this.sizeStatistics = ColumnProperty.builder(toCopy.sizeStatistics); @@ -534,6 +554,29 @@ public Builder withExtendedByteStreamSplitEncoding(boolean enable) { return this; } + /** + * Enable or disable ALP encoding for FLOAT and DOUBLE columns. + * + * @param enable whether ALP encoding should be enabled + * @return this builder for method chaining. + */ + public Builder withAlpEncoding(boolean enable) { + this.alpEnabled.withDefaultValue(enable); + return this; + } + + /** + * Enable or disable ALP encoding for the specified column. + * + * @param columnPath the path of the column (dot-string) + * @param enable whether ALP encoding should be enabled + * @return this builder for method chaining. + */ + public Builder withAlpEncoding(String columnPath, boolean enable) { + this.alpEnabled.withValue(columnPath, enable); + return this; + } + /** * Set the Parquet format dictionary page size. 
* diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpCompression.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpCompression.java new file mode 100644 index 0000000000..ca738e343c --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpCompression.java @@ -0,0 +1,514 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.apache.parquet.column.values.alp.AlpConstants.*; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import org.apache.parquet.column.values.bitpacking.BytePacker; +import org.apache.parquet.column.values.bitpacking.BytePackerForLong; +import org.apache.parquet.column.values.bitpacking.Packer; + +/** + * ALP compression and decompression for single vectors of floating-point values. + * + *

+ * <p>Compression pipeline: find best (exponent, factor) → encode to integers →
+ * Frame of Reference → bit-pack. Decompression reverses this.
+ *
+ * <p>
Mirrors C++ {@code AlpCompression} with overloaded static methods for float and double. + */ +final class AlpCompression { + + private AlpCompression() {} + + // ========== AlpEncodingPreset ========== + + /** Preset containing candidate (exponent, factor) combinations from sampling. */ + static final class AlpEncodingPreset { + final int[][] combinations; // each element is {exponent, factor} + + AlpEncodingPreset(int[][] combinations) { + this.combinations = combinations; + } + } + + // ========== FloatCompressedVector ========== + + /** + * A compressed ALP vector for float data. + * + *

Wire format (little-endian): + * [AlpInfo(4B)][ForInfo(5B)][PackedValues][ExcPositions][ExcValues] + */ + static final class FloatCompressedVector { + int exponent; + int factor; + int numExceptions; + int frameOfReference; + int bitWidth; + int numElements; + byte[] packedValues; + short[] exceptionPositions; + float[] exceptionValues; + + int storedSize() { + return ALP_INFO_SIZE + FLOAT_FOR_INFO_SIZE + dataStoredSize(); + } + + int dataStoredSize() { + return AlpEncoderDecoder.bitPackedSize(numElements, bitWidth) + + numExceptions * (POSITION_SIZE + Float.BYTES); + } + + void store(byte[] output, int offset) { + ByteBuffer buf = ByteBuffer.wrap(output, offset, storedSize()).order(ByteOrder.LITTLE_ENDIAN); + buf.put((byte) exponent); + buf.put((byte) factor); + buf.putShort((short) numExceptions); + buf.putInt(frameOfReference); + buf.put((byte) bitWidth); + storeDataTo(buf); + } + + void storeDataOnly(byte[] output, int offset) { + ByteBuffer buf = ByteBuffer.wrap(output, offset, dataStoredSize()).order(ByteOrder.LITTLE_ENDIAN); + storeDataTo(buf); + } + + private void storeDataTo(ByteBuffer buf) { + int bps = AlpEncoderDecoder.bitPackedSize(numElements, bitWidth); + buf.put(packedValues, 0, bps); + for (int i = 0; i < numExceptions; i++) { + buf.putShort(exceptionPositions[i]); + } + for (int i = 0; i < numExceptions; i++) { + buf.putFloat(exceptionValues[i]); + } + } + + static FloatCompressedVector load(byte[] input, int offset, int numElements) { + ByteBuffer buf = + ByteBuffer.wrap(input, offset, input.length - offset).order(ByteOrder.LITTLE_ENDIAN); + FloatCompressedVector v = new FloatCompressedVector(); + v.numElements = numElements; + v.exponent = buf.get() & 0xFF; + v.factor = buf.get() & 0xFF; + v.numExceptions = buf.getShort() & 0xFFFF; + v.frameOfReference = buf.getInt(); + v.bitWidth = buf.get() & 0xFF; + int bps = AlpEncoderDecoder.bitPackedSize(numElements, v.bitWidth); + v.packedValues = new byte[bps]; + buf.get(v.packedValues); + 
v.exceptionPositions = new short[v.numExceptions]; + for (int i = 0; i < v.numExceptions; i++) { + v.exceptionPositions[i] = buf.getShort(); + } + v.exceptionValues = new float[v.numExceptions]; + for (int i = 0; i < v.numExceptions; i++) { + v.exceptionValues[i] = buf.getFloat(); + } + return v; + } + } + + // ========== DoubleCompressedVector ========== + + /** + * A compressed ALP vector for double data. + * + *

Wire format (little-endian): + * [AlpInfo(4B)][ForInfo(9B)][PackedValues][ExcPositions][ExcValues] + */ + static final class DoubleCompressedVector { + int exponent; + int factor; + int numExceptions; + long frameOfReference; + int bitWidth; + int numElements; + byte[] packedValues; + short[] exceptionPositions; + double[] exceptionValues; + + int storedSize() { + return ALP_INFO_SIZE + DOUBLE_FOR_INFO_SIZE + dataStoredSize(); + } + + int dataStoredSize() { + return AlpEncoderDecoder.bitPackedSize(numElements, bitWidth) + + numExceptions * (POSITION_SIZE + Double.BYTES); + } + + void store(byte[] output, int offset) { + ByteBuffer buf = ByteBuffer.wrap(output, offset, storedSize()).order(ByteOrder.LITTLE_ENDIAN); + buf.put((byte) exponent); + buf.put((byte) factor); + buf.putShort((short) numExceptions); + buf.putLong(frameOfReference); + buf.put((byte) bitWidth); + storeDataTo(buf); + } + + void storeDataOnly(byte[] output, int offset) { + ByteBuffer buf = ByteBuffer.wrap(output, offset, dataStoredSize()).order(ByteOrder.LITTLE_ENDIAN); + storeDataTo(buf); + } + + private void storeDataTo(ByteBuffer buf) { + int bps = AlpEncoderDecoder.bitPackedSize(numElements, bitWidth); + buf.put(packedValues, 0, bps); + for (int i = 0; i < numExceptions; i++) { + buf.putShort(exceptionPositions[i]); + } + for (int i = 0; i < numExceptions; i++) { + buf.putDouble(exceptionValues[i]); + } + } + + static DoubleCompressedVector load(byte[] input, int offset, int numElements) { + ByteBuffer buf = + ByteBuffer.wrap(input, offset, input.length - offset).order(ByteOrder.LITTLE_ENDIAN); + DoubleCompressedVector v = new DoubleCompressedVector(); + v.numElements = numElements; + v.exponent = buf.get() & 0xFF; + v.factor = buf.get() & 0xFF; + v.numExceptions = buf.getShort() & 0xFFFF; + v.frameOfReference = buf.getLong(); + v.bitWidth = buf.get() & 0xFF; + int bps = AlpEncoderDecoder.bitPackedSize(numElements, v.bitWidth); + v.packedValues = new byte[bps]; + buf.get(v.packedValues); + 
v.exceptionPositions = new short[v.numExceptions]; + for (int i = 0; i < v.numExceptions; i++) { + v.exceptionPositions[i] = buf.getShort(); + } + v.exceptionValues = new double[v.numExceptions]; + for (int i = 0; i < v.numExceptions; i++) { + v.exceptionValues[i] = buf.getDouble(); + } + return v; + } + } + + // ========== Compress float ========== + + static FloatCompressedVector compressFloatVector(float[] input, int count, AlpEncodingPreset preset) { + if (count == 0) { + FloatCompressedVector r = new FloatCompressedVector(); + r.packedValues = new byte[0]; + r.exceptionPositions = new short[0]; + r.exceptionValues = new float[0]; + return r; + } + + // 1. Find best (exponent, factor) from preset + AlpEncoderDecoder.EncodingParams params = + AlpEncoderDecoder.findBestFloatParamsWithPresets(input, 0, count, preset.combinations); + int exponent = params.exponent; + int factor = params.factor; + + // 2. Encode all values to integers + int[] encoded = new int[count]; + for (int i = 0; i < count; i++) { + encoded[i] = AlpEncoderDecoder.encodeFloat(input[i], exponent, factor); + } + + // 3. Detect exceptions via bitwise round-trip check + int numExceptions = 0; + short[] excPositions = new short[count]; + for (int i = 0; i < count; i++) { + float decoded = AlpEncoderDecoder.decodeFloat(encoded[i], exponent, factor); + if (Float.floatToRawIntBits(decoded) != Float.floatToRawIntBits(input[i])) { + excPositions[numExceptions++] = (short) i; + } + } + + // 4. Find first non-exception value as placeholder (0 if all are exceptions) + int placeholder = 0; + int excIdx = 0; + for (int i = 0; i < count; i++) { + if (excIdx < numExceptions && (excPositions[excIdx] & 0xFFFF) == i) { + excIdx++; + } else { + placeholder = encoded[i]; + break; + } + } + + // 5. 
Replace exceptions with placeholder, collect original values + float[] excValues = new float[numExceptions]; + for (int i = 0; i < numExceptions; i++) { + int pos = excPositions[i] & 0xFFFF; + excValues[i] = input[pos]; + encoded[pos] = placeholder; + } + + // 6. FOR encoding + int min = encoded[0]; + int max = encoded[0]; + for (int i = 1; i < count; i++) { + if (encoded[i] < min) min = encoded[i]; + if (encoded[i] > max) max = encoded[i]; + } + for (int i = 0; i < count; i++) { + encoded[i] -= min; + } + int maxDelta = max - min; + + // 7. Bit packing + int bitWidth = AlpEncoderDecoder.bitWidthForInt(maxDelta); + int bps = AlpEncoderDecoder.bitPackedSize(count, bitWidth); + byte[] packedValues = new byte[bps]; + if (bitWidth > 0) { + packInts(encoded, count, bitWidth, packedValues); + } + + // Build result + FloatCompressedVector result = new FloatCompressedVector(); + result.exponent = exponent; + result.factor = factor; + result.numExceptions = numExceptions; + result.frameOfReference = min; + result.bitWidth = bitWidth; + result.numElements = count; + result.packedValues = packedValues; + result.exceptionPositions = Arrays.copyOf(excPositions, numExceptions); + result.exceptionValues = excValues; + return result; + } + + // ========== Decompress float ========== + + static void decompressFloatVector(FloatCompressedVector v, float[] output) { + // 1. Unpack integers + int[] encoded = new int[v.numElements]; + if (v.bitWidth > 0) { + unpackInts(v.packedValues, v.numElements, v.bitWidth, encoded); + } + + // 2. Fused unFOR + decode + for (int i = 0; i < v.numElements; i++) { + int unfored = encoded[i] + v.frameOfReference; + output[i] = AlpEncoderDecoder.decodeFloat(unfored, v.exponent, v.factor); + } + + // 3. 
Patch exceptions + for (int i = 0; i < v.numExceptions; i++) { + output[v.exceptionPositions[i] & 0xFFFF] = v.exceptionValues[i]; + } + } + + // ========== Compress double ========== + + static DoubleCompressedVector compressDoubleVector(double[] input, int count, AlpEncodingPreset preset) { + if (count == 0) { + DoubleCompressedVector r = new DoubleCompressedVector(); + r.packedValues = new byte[0]; + r.exceptionPositions = new short[0]; + r.exceptionValues = new double[0]; + return r; + } + + AlpEncoderDecoder.EncodingParams params = + AlpEncoderDecoder.findBestDoubleParamsWithPresets(input, 0, count, preset.combinations); + int exponent = params.exponent; + int factor = params.factor; + + long[] encoded = new long[count]; + for (int i = 0; i < count; i++) { + encoded[i] = AlpEncoderDecoder.encodeDouble(input[i], exponent, factor); + } + + int numExceptions = 0; + short[] excPositions = new short[count]; + for (int i = 0; i < count; i++) { + double decoded = AlpEncoderDecoder.decodeDouble(encoded[i], exponent, factor); + if (Double.doubleToRawLongBits(decoded) != Double.doubleToRawLongBits(input[i])) { + excPositions[numExceptions++] = (short) i; + } + } + + long placeholder = 0; + int excIdx = 0; + for (int i = 0; i < count; i++) { + if (excIdx < numExceptions && (excPositions[excIdx] & 0xFFFF) == i) { + excIdx++; + } else { + placeholder = encoded[i]; + break; + } + } + + double[] excValues = new double[numExceptions]; + for (int i = 0; i < numExceptions; i++) { + int pos = excPositions[i] & 0xFFFF; + excValues[i] = input[pos]; + encoded[pos] = placeholder; + } + + long min = encoded[0]; + long max = encoded[0]; + for (int i = 1; i < count; i++) { + if (encoded[i] < min) min = encoded[i]; + if (encoded[i] > max) max = encoded[i]; + } + for (int i = 0; i < count; i++) { + encoded[i] -= min; + } + long maxDelta = max - min; + + int bitWidth = AlpEncoderDecoder.bitWidthForLong(maxDelta); + int bps = AlpEncoderDecoder.bitPackedSize(count, bitWidth); + byte[] 
packedValues = new byte[bps]; + if (bitWidth > 0) { + packLongs(encoded, count, bitWidth, packedValues); + } + + DoubleCompressedVector result = new DoubleCompressedVector(); + result.exponent = exponent; + result.factor = factor; + result.numExceptions = numExceptions; + result.frameOfReference = min; + result.bitWidth = bitWidth; + result.numElements = count; + result.packedValues = packedValues; + result.exceptionPositions = Arrays.copyOf(excPositions, numExceptions); + result.exceptionValues = excValues; + return result; + } + + // ========== Decompress double ========== + + static void decompressDoubleVector(DoubleCompressedVector v, double[] output) { + long[] encoded = new long[v.numElements]; + decompressDoubleVector(v, output, encoded); + } + + static void decompressDoubleVector(DoubleCompressedVector v, double[] output, long[] encodedBuffer) { + if (v.bitWidth > 0) { + unpackLongs(v.packedValues, v.numElements, v.bitWidth, encodedBuffer); + } else { + for (int i = 0; i < v.numElements; i++) { + encodedBuffer[i] = 0; + } + } + + // Fused unFOR + decode with hoisted multipliers + final long frameOfRef = v.frameOfReference; + final double factorMul = DOUBLE_POW10[v.factor]; + final double expMul = DOUBLE_POW10_NEGATIVE[v.exponent]; + final int numElements = v.numElements; + for (int i = 0; i < numElements; i++) { + output[i] = (double) (encodedBuffer[i] + frameOfRef) * factorMul * expMul; + } + + for (int i = 0; i < v.numExceptions; i++) { + output[v.exceptionPositions[i] & 0xFFFF] = v.exceptionValues[i]; + } + } + + // ========== Bit packing helpers ========== + + @SuppressWarnings("deprecation") + static void packInts(int[] values, int count, int bitWidth, byte[] output) { + BytePacker packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidth); + int fullGroups = count / 8; + for (int g = 0; g < fullGroups; g++) { + packer.pack8Values(values, g * 8, output, g * bitWidth); + } + int remaining = count % 8; + if (remaining > 0) { + int[] padded = new int[8]; + 
System.arraycopy(values, fullGroups * 8, padded, 0, remaining); + byte[] tmp = new byte[bitWidth]; + packer.pack8Values(padded, 0, tmp, 0); + int tailBytes = AlpEncoderDecoder.bitPackedSize(count, bitWidth) - fullGroups * bitWidth; + System.arraycopy(tmp, 0, output, fullGroups * bitWidth, tailBytes); + } + } + + @SuppressWarnings("deprecation") + static void unpackInts(byte[] packed, int count, int bitWidth, int[] output) { + BytePacker packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidth); + int fullGroups = count / 8; + for (int g = 0; g < fullGroups; g++) { + packer.unpack8Values(packed, g * bitWidth, output, g * 8); + } + int remaining = count % 8; + if (remaining > 0) { + byte[] tmp = new byte[bitWidth]; + int available = packed.length - fullGroups * bitWidth; + System.arraycopy(packed, fullGroups * bitWidth, tmp, 0, Math.min(available, bitWidth)); + int[] padded = new int[8]; + packer.unpack8Values(tmp, 0, padded, 0); + System.arraycopy(padded, 0, output, fullGroups * 8, remaining); + } + } + + @SuppressWarnings("deprecation") + static void packLongs(long[] values, int count, int bitWidth, byte[] output) { + BytePackerForLong packer = Packer.LITTLE_ENDIAN.newBytePackerForLong(bitWidth); + int fullGroups = count / 8; + for (int g = 0; g < fullGroups; g++) { + packer.pack8Values(values, g * 8, output, g * bitWidth); + } + int remaining = count % 8; + if (remaining > 0) { + long[] padded = new long[8]; + System.arraycopy(values, fullGroups * 8, padded, 0, remaining); + byte[] tmp = new byte[bitWidth]; + packer.pack8Values(padded, 0, tmp, 0); + int tailBytes = AlpEncoderDecoder.bitPackedSize(count, bitWidth) - fullGroups * bitWidth; + System.arraycopy(tmp, 0, output, fullGroups * bitWidth, tailBytes); + } + } + + static void unpackLongs(byte[] packed, int count, int bitWidth, long[] output) { + BytePackerForLong packer = Packer.LITTLE_ENDIAN.newBytePackerForLong(bitWidth); + + // Process 32 values at a time (4x fewer calls than unpack8Values) + int fullGroups32 = 
count / 32; + for (int g = 0; g < fullGroups32; g++) { + packer.unpack32Values(packed, g * bitWidth * 4, output, g * 32); + } + + // Process remaining 8 at a time + int processed = fullGroups32 * 32; + int byteOffset = fullGroups32 * bitWidth * 4; + int remaining8 = (count - processed) / 8; + for (int g = 0; g < remaining8; g++) { + packer.unpack8Values(packed, byteOffset + g * bitWidth, output, processed + g * 8); + } + + // Handle tail (< 8 values) + int tailStart = processed + remaining8 * 8; + int tailCount = count - tailStart; + if (tailCount > 0) { + int tailByteOffset = byteOffset + remaining8 * bitWidth; + byte[] tmp = new byte[bitWidth]; + int available = packed.length - tailByteOffset; + System.arraycopy(packed, tailByteOffset, tmp, 0, Math.min(available, bitWidth)); + long[] padded = new long[8]; + packer.unpack8Values(tmp, 0, padded, 0); + System.arraycopy(padded, 0, output, tailStart, tailCount); + } + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpConstants.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpConstants.java new file mode 100644 index 0000000000..3be531d383 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpConstants.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import org.apache.parquet.Preconditions; + +/** + * Constants for the ALP (Adaptive Lossless floating-Point) encoding. + * + *

+ * <p>ALP encoding converts floating-point values to integers using decimal scaling,
+ * then applies Frame of Reference encoding and bit-packing.
+ * Values that cannot be losslessly converted are stored as exceptions.
+ *
+ * <p>
Based on the paper: "ALP: Adaptive Lossless floating-Point Compression" (SIGMOD 2024) + * + * @see ALP Paper + */ +public final class AlpConstants { + + private AlpConstants() { + // Utility class + } + + // ========== Page header (7 bytes, matching C++ AlpHeader) ========== + // [compression_mode(1)][integer_encoding(1)][log_vector_size(1)][num_elements(4)] + public static final int HEADER_SIZE = 7; + public static final int COMPRESSION_MODE_ALP = 0; + public static final int INTEGER_ENCODING_FOR = 0; + + // ========== Vector sizing ========== + public static final int DEFAULT_VECTOR_SIZE = 1024; + public static final int DEFAULT_VECTOR_SIZE_LOG = 10; + + // Capped at 15 (vectorSize=32768) because num_exceptions is uint16, + // so vectorSize must not exceed 65535 to avoid overflow when all values are exceptions. + static final int MAX_LOG_VECTOR_SIZE = 15; + static final int MIN_LOG_VECTOR_SIZE = 3; + + // ========== Sampler constants (matching C++ AlpConstants) ========== + static final int SAMPLER_VECTOR_SIZE = 4096; + static final int SAMPLER_ROWGROUP_SIZE = 122880; + static final int SAMPLER_SAMPLES_PER_VECTOR = 256; + static final int SAMPLER_SAMPLE_VECTORS_PER_ROWGROUP = 8; + static final int MAX_COMBINATIONS = 5; + static final int SAMPLING_EARLY_EXIT_THRESHOLD = 4; + + // ========== Float-specific ========== + static final int FLOAT_MAX_EXPONENT = 10; + static final float MAGIC_FLOAT = 12_582_912.0f; // 2^22 + 2^23 + static final float FLOAT_ENCODING_UPPER_LIMIT = 2147483520.0f; + static final float FLOAT_ENCODING_LOWER_LIMIT = -2147483520.0f; + static final int FLOAT_NEGATIVE_ZERO_BITS = 0x80000000; + + static final float[] FLOAT_POW10 = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; + + // Negative powers of 10 as float, matching C++ PowerOfTenFloat(-power). + // Used in the two-step encode/decode to match C++ floating-point rounding behavior. 
+ static final float[] FLOAT_POW10_NEGATIVE = { + 1e0f, 1e-1f, 1e-2f, 1e-3f, 1e-4f, 1e-5f, 1e-6f, 1e-7f, 1e-8f, 1e-9f, 1e-10f + }; + + // ========== Double-specific ========== + static final int DOUBLE_MAX_EXPONENT = 18; + static final double MAGIC_DOUBLE = 6_755_399_441_055_744.0; // 2^51 + 2^52 + static final double DOUBLE_ENCODING_UPPER_LIMIT = 9223372036854774784.0; + static final double DOUBLE_ENCODING_LOWER_LIMIT = -9223372036854774784.0; + static final long DOUBLE_NEGATIVE_ZERO_BITS = 0x8000000000000000L; + + static final double[] DOUBLE_POW10 = { + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18 + }; + + // Negative powers of 10 as double, matching C++ PowerOfTenDouble(-power). + static final double[] DOUBLE_POW10_NEGATIVE = { + 1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11, 1e-12, 1e-13, 1e-14, 1e-15, 1e-16, + 1e-17, 1e-18 + }; + + // ========== Per-vector metadata sizes ========== + public static final int ALP_INFO_SIZE = 4; // exponent(1) + factor(1) + num_exceptions(2) + public static final int FLOAT_FOR_INFO_SIZE = 5; // frame_of_reference(4) + bit_width(1) + public static final int DOUBLE_FOR_INFO_SIZE = 9; // frame_of_reference(8) + bit_width(1) + + // ========== Offset and position types ========== + // OffsetType = int (4 bytes), PositionType = short (2 bytes) — matching C++ uint32_t and uint16_t + public static final int OFFSET_SIZE = Integer.BYTES; + public static final int POSITION_SIZE = Short.BYTES; + + /** Returns 10^power as a long, for power in [0, 18]. 
*/ + static long integerPow10(int power) { + Preconditions.checkArgument(power >= 0 && power <= 18, "power must be in [0, 18], got: %s", power); + return INTEGER_POW10[power]; + } + + private static final long[] INTEGER_POW10 = { + 1L, + 10L, + 100L, + 1_000L, + 10_000L, + 100_000L, + 1_000_000L, + 10_000_000L, + 100_000_000L, + 1_000_000_000L, + 10_000_000_000L, + 100_000_000_000L, + 1_000_000_000_000L, + 10_000_000_000_000L, + 100_000_000_000_000L, + 1_000_000_000_000_000L, + 10_000_000_000_000_000L, + 100_000_000_000_000_000L, + 1_000_000_000_000_000_000L, + }; + + /** Validates vector size: must be a power of 2 in [2^MIN_LOG .. 2^MAX_LOG]. */ + public static int validateVectorSize(int vectorSize) { + Preconditions.checkArgument( + vectorSize > 0 && (vectorSize & (vectorSize - 1)) == 0, + "Vector size must be a power of 2, got: %s", + vectorSize); + int logSize = Integer.numberOfTrailingZeros(vectorSize); + Preconditions.checkArgument( + logSize >= MIN_LOG_VECTOR_SIZE && logSize <= MAX_LOG_VECTOR_SIZE, + "Vector size log2 must be between %s and %s, got: %s (vectorSize=%s)", + MIN_LOG_VECTOR_SIZE, + MAX_LOG_VECTOR_SIZE, + logSize, + vectorSize); + return vectorSize; + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpEncoderDecoder.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpEncoderDecoder.java new file mode 100644 index 0000000000..da489ec35e --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpEncoderDecoder.java @@ -0,0 +1,287 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.apache.parquet.column.values.alp.AlpConstants.*; + +/** + * Core ALP (Adaptive Lossless floating-Point) encoding and decoding logic. + * + *

+ * <p>ALP works by converting floating-point values to integers using decimal scaling,
+ * then applying Frame of Reference encoding and bit-packing.
+ * Values that cannot be losslessly converted are stored as exceptions.
+ *
+ * <p>Encoding formula: encoded = round(value * 10^exponent * 10^(-factor))
+ *
+ * <p>Decoding formula: value = encoded * 10^factor * 10^(-exponent)
+ *
+ * <p>Exception conditions:
+ * <ul>
+ *   <li>NaN values</li>
+ *   <li>Infinity values</li>
+ *   <li>Negative zero (-0.0)</li>
+ *   <li>Out of integer range</li>
+ *   <li>Round-trip failure (decode(encode(v)) != v)</li>
+ * </ul>
+ */ +final class AlpEncoderDecoder { + + private AlpEncoderDecoder() { + // Utility class + } + + // ========== Float exception detection ========== + + /** NaN, Inf, and -0.0 can never be encoded regardless of exponent/factor. */ + static boolean isFloatException(float value) { + if (Float.isNaN(value)) { + return true; + } + if (Float.isInfinite(value)) { + return true; + } + return Float.floatToRawIntBits(value) == FLOAT_NEGATIVE_ZERO_BITS; + } + + /** Check round-trip: encode then decode, and see if we get the same bits back. */ + static boolean isFloatException(float value, int exponent, int factor) { + if (isFloatException(value)) { + return true; + } + float scaled = value * FLOAT_POW10[exponent] * FLOAT_POW10_NEGATIVE[factor]; + if (scaled > FLOAT_ENCODING_UPPER_LIMIT || scaled < FLOAT_ENCODING_LOWER_LIMIT) { + return true; + } + int encoded = encodeFloat(value, exponent, factor); + float decoded = decodeFloat(encoded, exponent, factor); + return Float.floatToRawIntBits(value) != Float.floatToRawIntBits(decoded); + } + + // ========== Float encode/decode ========== + // Two-step multiplication matching C++ to produce identical floating-point rounding. + // C++ encode: value * 10^exponent * 10^(-factor) + // C++ decode: (float)encoded * 10^factor * 10^(-exponent) + + /** Encode: round(value * 10^exponent * 10^(-factor)) */ + static int encodeFloat(float value, int exponent, int factor) { + return fastRoundFloat(value * FLOAT_POW10[exponent] * FLOAT_POW10_NEGATIVE[factor]); + } + + /** Decode: encoded * 10^factor * 10^(-exponent) */ + static float decodeFloat(int encoded, int exponent, int factor) { + return (float) encoded * FLOAT_POW10[factor] * FLOAT_POW10_NEGATIVE[exponent]; + } + + // Uses the 2^22+2^23 magic-number trick to round without branching on the FPU. 
+ static int fastRoundFloat(float value) { + if (value >= 0) { + return (int) ((value + MAGIC_FLOAT) - MAGIC_FLOAT); + } else { + return (int) ((value - MAGIC_FLOAT) + MAGIC_FLOAT); + } + } + + // ========== Double exception detection ========== + + static boolean isDoubleException(double value) { + if (Double.isNaN(value)) { + return true; + } + if (Double.isInfinite(value)) { + return true; + } + return Double.doubleToRawLongBits(value) == DOUBLE_NEGATIVE_ZERO_BITS; + } + + static boolean isDoubleException(double value, int exponent, int factor) { + if (isDoubleException(value)) { + return true; + } + double scaled = value * DOUBLE_POW10[exponent] * DOUBLE_POW10_NEGATIVE[factor]; + if (scaled > DOUBLE_ENCODING_UPPER_LIMIT || scaled < DOUBLE_ENCODING_LOWER_LIMIT) { + return true; + } + long encoded = encodeDouble(value, exponent, factor); + double decoded = decodeDouble(encoded, exponent, factor); + return Double.doubleToRawLongBits(value) != Double.doubleToRawLongBits(decoded); + } + + // ========== Double encode/decode ========== + // Two-step multiplication matching C++ to produce identical floating-point rounding. + + /** Encode: round(value * 10^exponent * 10^(-factor)) */ + static long encodeDouble(double value, int exponent, int factor) { + return fastRoundDouble(value * DOUBLE_POW10[exponent] * DOUBLE_POW10_NEGATIVE[factor]); + } + + /** Decode: encoded * 10^factor * 10^(-exponent) */ + static double decodeDouble(long encoded, int exponent, int factor) { + return (double) encoded * DOUBLE_POW10[factor] * DOUBLE_POW10_NEGATIVE[exponent]; + } + + // Same trick but with 2^51+2^52 for double precision. + static long fastRoundDouble(double value) { + if (value >= 0) { + return (long) ((value + MAGIC_DOUBLE) - MAGIC_DOUBLE); + } else { + return (long) ((value - MAGIC_DOUBLE) + MAGIC_DOUBLE); + } + } + + // ========== Bit width ========== + + /** Number of bits needed to represent maxDelta as an unsigned value. 
*/ + static int bitWidthForInt(int maxDelta) { + if (maxDelta == 0) { + return 0; + } + return Integer.SIZE - Integer.numberOfLeadingZeros(maxDelta); + } + + static int bitWidthForLong(long maxDelta) { + if (maxDelta == 0) { + return 0; + } + return Long.SIZE - Long.numberOfLeadingZeros(maxDelta); + } + + /** Packed data size in bytes: ceil(numElements * bitWidth / 8). */ + static int bitPackedSize(int numElements, int bitWidth) { + return (numElements * bitWidth + 7) / 8; + } + + // ========== Encoding params ========== + + static class EncodingParams { + final int exponent; + final int factor; + final int numExceptions; + + EncodingParams(int exponent, int factor, int numExceptions) { + this.exponent = exponent; + this.factor = factor; + this.numExceptions = numExceptions; + } + } + + /** Try all (exponent, factor) combos and pick the one with fewest exceptions. */ + static EncodingParams findBestFloatParams(float[] values, int offset, int length) { + int bestExponent = 0; + int bestFactor = 0; + int bestExceptions = length; + + for (int e = 0; e <= FLOAT_MAX_EXPONENT; e++) { + for (int f = 0; f <= e; f++) { + int exceptions = 0; + for (int i = 0; i < length; i++) { + if (isFloatException(values[offset + i], e, f)) { + exceptions++; + } + } + if (exceptions < bestExceptions) { + bestExponent = e; + bestFactor = f; + bestExceptions = exceptions; + if (bestExceptions == 0) { + return new EncodingParams(bestExponent, bestFactor, bestExceptions); + } + } + } + } + return new EncodingParams(bestExponent, bestFactor, bestExceptions); + } + + /** Same as findBestFloatParams but only tries the cached preset combos. 
*/ + static EncodingParams findBestFloatParamsWithPresets(float[] values, int offset, int length, int[][] presets) { + int bestExponent = presets[0][0]; + int bestFactor = presets[0][1]; + int bestExceptions = length; + + for (int[] preset : presets) { + int e = preset[0]; + int f = preset[1]; + int exceptions = 0; + for (int i = 0; i < length; i++) { + if (isFloatException(values[offset + i], e, f)) { + exceptions++; + } + } + if (exceptions < bestExceptions) { + bestExponent = e; + bestFactor = f; + bestExceptions = exceptions; + if (bestExceptions == 0) { + return new EncodingParams(bestExponent, bestFactor, bestExceptions); + } + } + } + return new EncodingParams(bestExponent, bestFactor, bestExceptions); + } + + static EncodingParams findBestDoubleParams(double[] values, int offset, int length) { + int bestExponent = 0; + int bestFactor = 0; + int bestExceptions = length; + + for (int e = 0; e <= DOUBLE_MAX_EXPONENT; e++) { + for (int f = 0; f <= e; f++) { + int exceptions = 0; + for (int i = 0; i < length; i++) { + if (isDoubleException(values[offset + i], e, f)) { + exceptions++; + } + } + if (exceptions < bestExceptions) { + bestExponent = e; + bestFactor = f; + bestExceptions = exceptions; + if (bestExceptions == 0) { + return new EncodingParams(bestExponent, bestFactor, bestExceptions); + } + } + } + } + return new EncodingParams(bestExponent, bestFactor, bestExceptions); + } + + static EncodingParams findBestDoubleParamsWithPresets(double[] values, int offset, int length, int[][] presets) { + int bestExponent = presets[0][0]; + int bestFactor = presets[0][1]; + int bestExceptions = length; + + for (int[] preset : presets) { + int e = preset[0]; + int f = preset[1]; + int exceptions = 0; + for (int i = 0; i < length; i++) { + if (isDoubleException(values[offset + i], e, f)) { + exceptions++; + } + } + if (exceptions < bestExceptions) { + bestExponent = e; + bestFactor = f; + bestExceptions = exceptions; + if (bestExceptions == 0) { + return new 
EncodingParams(bestExponent, bestFactor, bestExceptions); + } + } + } + return new EncodingParams(bestExponent, bestFactor, bestExceptions); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpSampler.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpSampler.java new file mode 100644 index 0000000000..8acb1c42ca --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpSampler.java @@ -0,0 +1,344 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.apache.parquet.column.values.alp.AlpConstants.*; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * ALP sampler that collects representative samples and generates encoding presets. + * + *

Usage: call {@code addSample()} to feed data, then {@code finalize()} to get the preset. + * Mirrors C++ {@code AlpSampler} with separate inner classes for float and double. + */ +final class AlpSampler { + + private AlpSampler() {} + + // ========== FloatSampler ========== + + static final class FloatSampler { + private final long sampleVectorSize = SAMPLER_VECTOR_SIZE; + private final long rowgroupSize = SAMPLER_ROWGROUP_SIZE; + private final long samplesPerVector = SAMPLER_SAMPLES_PER_VECTOR; + private final long sampleVectorsPerRowgroup = SAMPLER_SAMPLE_VECTORS_PER_ROWGROUP; + private final long rowgroupSampleJump; + + private long vectorsSampledCount; + private long totalValuesCount; + private long vectorsCount; + private final List rowgroupSample = new ArrayList<>(); + + FloatSampler() { + rowgroupSampleJump = (rowgroupSize / sampleVectorsPerRowgroup) / sampleVectorSize; + } + + /** Add a sample of arbitrary size (split into vectors internally). */ + void addSample(float[] data, int count) { + for (int i = 0; i < count; i += (int) sampleVectorSize) { + int elements = (int) Math.min(count - i, sampleVectorSize); + addSampleVector(data, i, elements); + } + } + + private void addSampleVector(float[] data, int offset, int length) { + boolean mustSkip = mustSkipSamplingFromCurrentVector(vectorsCount, vectorsSampledCount, length); + vectorsCount++; + totalValuesCount += length; + if (mustSkip) { + return; + } + + int numLookup = (int) Math.min(length, DEFAULT_VECTOR_SIZE); + int increment = (int) Math.max(1, (int) Math.ceil((double) numLookup / samplesPerVector)); + + // Take equidistant subsample + List sample = new ArrayList<>(); + for (int i = 0; i < numLookup; i += increment) { + sample.add(data[offset + i]); + } + + float[] sampleArray = new float[sample.size()]; + for (int i = 0; i < sample.size(); i++) { + sampleArray[i] = sample.get(i); + } + + rowgroupSample.add(sampleArray); + vectorsSampledCount++; + } + + private boolean 
mustSkipSamplingFromCurrentVector( + long vectorsCount, long vectorsSampledCount, int currentVectorSize) { + if ((vectorsCount % rowgroupSampleJump) != 0) { + return true; + } + return currentVectorSize < SAMPLER_SAMPLES_PER_VECTOR && vectorsSampledCount != 0; + } + + /** Finalize sampling and return the encoding preset. */ + AlpCompression.AlpEncodingPreset finalizeSampling() { + return createFloatEncodingPreset(rowgroupSample); + } + } + + // ========== DoubleSampler ========== + + static final class DoubleSampler { + private final long sampleVectorSize = SAMPLER_VECTOR_SIZE; + private final long rowgroupSize = SAMPLER_ROWGROUP_SIZE; + private final long samplesPerVector = SAMPLER_SAMPLES_PER_VECTOR; + private final long sampleVectorsPerRowgroup = SAMPLER_SAMPLE_VECTORS_PER_ROWGROUP; + private final long rowgroupSampleJump; + + private long vectorsSampledCount; + private long totalValuesCount; + private long vectorsCount; + private final List rowgroupSample = new ArrayList<>(); + + DoubleSampler() { + rowgroupSampleJump = (rowgroupSize / sampleVectorsPerRowgroup) / sampleVectorSize; + } + + void addSample(double[] data, int count) { + for (int i = 0; i < count; i += (int) sampleVectorSize) { + int elements = (int) Math.min(count - i, sampleVectorSize); + addSampleVector(data, i, elements); + } + } + + private void addSampleVector(double[] data, int offset, int length) { + boolean mustSkip = mustSkipSamplingFromCurrentVector(vectorsCount, vectorsSampledCount, length); + vectorsCount++; + totalValuesCount += length; + if (mustSkip) { + return; + } + + int numLookup = (int) Math.min(length, DEFAULT_VECTOR_SIZE); + int increment = (int) Math.max(1, (int) Math.ceil((double) numLookup / samplesPerVector)); + + List sample = new ArrayList<>(); + for (int i = 0; i < numLookup; i += increment) { + sample.add(data[offset + i]); + } + + double[] sampleArray = new double[sample.size()]; + for (int i = 0; i < sample.size(); i++) { + sampleArray[i] = sample.get(i); + } + + 
rowgroupSample.add(sampleArray); + vectorsSampledCount++; + } + + private boolean mustSkipSamplingFromCurrentVector( + long vectorsCount, long vectorsSampledCount, int currentVectorSize) { + if ((vectorsCount % rowgroupSampleJump) != 0) { + return true; + } + return currentVectorSize < SAMPLER_SAMPLES_PER_VECTOR && vectorsSampledCount != 0; + } + + AlpCompression.AlpEncodingPreset finalizeSampling() { + return createDoubleEncodingPreset(rowgroupSample); + } + } + + // ========== CreateEncodingPreset (float) ========== + + /** + * Estimate compressed size in bits for a given (exponent, factor) on sample data. + * Returns -1 if the combination yields almost all exceptions (< 2 non-exceptions). + */ + private static long estimateFloatCompressedSize( + float[] sample, int exponent, int factor, boolean penalizeExceptions) { + int minEncoded = Integer.MAX_VALUE; + int maxEncoded = Integer.MIN_VALUE; + int numExceptions = 0; + int numNonExceptions = 0; + + for (float value : sample) { + int encoded = AlpEncoderDecoder.encodeFloat(value, exponent, factor); + float decoded = AlpEncoderDecoder.decodeFloat(encoded, exponent, factor); + if (Float.floatToRawIntBits(decoded) == Float.floatToRawIntBits(value)) { + numNonExceptions++; + if (encoded < minEncoded) minEncoded = encoded; + if (encoded > maxEncoded) maxEncoded = encoded; + } else { + numExceptions++; + } + } + + if (penalizeExceptions && numNonExceptions < 2) { + return -1; + } + + long delta; + if (numNonExceptions >= 2) { + // Unsigned difference + delta = Integer.toUnsignedLong(maxEncoded) - Integer.toUnsignedLong(minEncoded); + } else { + delta = 0; + } + int bitsPerValue = (delta == 0) ? 
0 : (64 - Long.numberOfLeadingZeros(delta)); + long estimatedSize = (long) sample.length * bitsPerValue; + estimatedSize += (long) numExceptions * (32 + POSITION_SIZE * 8); + return estimatedSize; + } + + static AlpCompression.AlpEncodingPreset createFloatEncodingPreset(List vectorsSampled) { + // For each sampled vector, find the best (e,f) combo by estimated compressed size. + // Count how many times each best combo appears across all sampled vectors. + Map bestCombosCount = new HashMap<>(); // key = e<<8|f, value = [count] + + for (float[] sample : vectorsSampled) { + long bestSize = Long.MAX_VALUE; + int bestE = FLOAT_MAX_EXPONENT; + int bestF = FLOAT_MAX_EXPONENT; + + for (int e = 0; e <= FLOAT_MAX_EXPONENT; e++) { + for (int f = 0; f <= e; f++) { + long size = estimateFloatCompressedSize(sample, e, f, true); + if (size < 0) continue; + if (size < bestSize + || (size == bestSize && e > bestE) + || (size == bestSize && e == bestE && f > bestF)) { + bestSize = size; + bestE = e; + bestF = f; + } + } + } + long key = ((long) bestE << 8) | bestF; + bestCombosCount.computeIfAbsent(key, k -> new int[1])[0]++; + } + + // Sort by appearance count (descending), then by exponent/factor (descending) + List> sorted = new ArrayList<>(bestCombosCount.entrySet()); + sorted.sort((a, b) -> { + int cmpCount = Integer.compare(b.getValue()[0], a.getValue()[0]); + if (cmpCount != 0) return cmpCount; + int eA = (int) (a.getKey() >> 8); + int fA = (int) (a.getKey() & 0xFF); + int eB = (int) (b.getKey() >> 8); + int fB = (int) (b.getKey() & 0xFF); + if (eA != eB) return Integer.compare(eB, eA); + return Integer.compare(fB, fA); + }); + + int k = Math.min(MAX_COMBINATIONS, sorted.size()); + int[][] combinations = new int[k][2]; + for (int i = 0; i < k; i++) { + long key = sorted.get(i).getKey(); + combinations[i][0] = (int) (key >> 8); + combinations[i][1] = (int) (key & 0xFF); + } + return new AlpCompression.AlpEncodingPreset(combinations); + } + + // ========== CreateEncodingPreset 
(double) ========== + + private static long estimateDoubleCompressedSize( + double[] sample, int exponent, int factor, boolean penalizeExceptions) { + long minEncoded = Long.MAX_VALUE; + long maxEncoded = Long.MIN_VALUE; + int numExceptions = 0; + int numNonExceptions = 0; + + for (double value : sample) { + long encoded = AlpEncoderDecoder.encodeDouble(value, exponent, factor); + double decoded = AlpEncoderDecoder.decodeDouble(encoded, exponent, factor); + if (Double.doubleToRawLongBits(decoded) == Double.doubleToRawLongBits(value)) { + numNonExceptions++; + if (encoded < minEncoded) minEncoded = encoded; + if (encoded > maxEncoded) maxEncoded = encoded; + } else { + numExceptions++; + } + } + + if (penalizeExceptions && numNonExceptions < 2) { + return -1; + } + + // For bit width: unsigned difference. Use Long.compareUnsigned logic. + int bitsPerValue; + if (numNonExceptions < 2) { + bitsPerValue = 0; + } else { + // Unsigned subtraction: maxEncoded - minEncoded as unsigned + long delta = maxEncoded - minEncoded; + bitsPerValue = (delta == 0) ? 
0 : (64 - Long.numberOfLeadingZeros(delta)); + } + long estimatedSize = (long) sample.length * bitsPerValue; + estimatedSize += (long) numExceptions * (64 + POSITION_SIZE * 8); + return estimatedSize; + } + + static AlpCompression.AlpEncodingPreset createDoubleEncodingPreset(List vectorsSampled) { + Map bestCombosCount = new HashMap<>(); + + for (double[] sample : vectorsSampled) { + long bestSize = Long.MAX_VALUE; + int bestE = DOUBLE_MAX_EXPONENT; + int bestF = DOUBLE_MAX_EXPONENT; + + for (int e = 0; e <= DOUBLE_MAX_EXPONENT; e++) { + for (int f = 0; f <= e; f++) { + long size = estimateDoubleCompressedSize(sample, e, f, true); + if (size < 0) continue; + if (size < bestSize + || (size == bestSize && e > bestE) + || (size == bestSize && e == bestE && f > bestF)) { + bestSize = size; + bestE = e; + bestF = f; + } + } + } + long key = ((long) bestE << 8) | bestF; + bestCombosCount.computeIfAbsent(key, k -> new int[1])[0]++; + } + + List> sorted = new ArrayList<>(bestCombosCount.entrySet()); + sorted.sort((a, b) -> { + int cmpCount = Integer.compare(b.getValue()[0], a.getValue()[0]); + if (cmpCount != 0) return cmpCount; + int eA = (int) (a.getKey() >> 8); + int fA = (int) (a.getKey() & 0xFF); + int eB = (int) (b.getKey() >> 8); + int fB = (int) (b.getKey() & 0xFF); + if (eA != eB) return Integer.compare(eB, eA); + return Integer.compare(fB, fA); + }); + + int k = Math.min(MAX_COMBINATIONS, sorted.size()); + int[][] combinations = new int[k][2]; + for (int i = 0; i < k; i++) { + long key = sorted.get(i).getKey(); + combinations[i][0] = (int) (key >> 8); + combinations[i][1] = (int) (key & 0xFF); + } + return new AlpCompression.AlpEncodingPreset(combinations); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReader.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReader.java new file mode 100644 index 0000000000..0f07033280 --- /dev/null +++ 
b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReader.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.apache.parquet.column.values.alp.AlpConstants.*; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.io.ParquetDecodingException; + +/** + * Abstract base class for ALP values readers with lazy per-vector decoding. + * + *

On {@link #initFromPage}, reads the 7-byte header and offset array but does NOT + * decode any vectors. Vectors are decoded on demand when values are accessed. + * {@link #skip()} is O(1) — it just advances the index. + * + *

Reuses the decoded buffer across vectors to reduce allocations. + * Validates header fields (compression mode, integer encoding, log vector size bounds, + * element count) and skip bounds. + */ +abstract class AlpValuesReader extends ValuesReader { + + protected int vectorSize; + protected int totalCount; + protected int numVectors; + protected int currentIndex; + protected int decodedVectorIndex = -1; + protected int[] vectorOffsets; + protected byte[] rawData; // all data after header (offsets + vectors) + + @Override + public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IOException { + int available = (int) stream.available(); + if (available < HEADER_SIZE) { + throw new ParquetDecodingException("ALP page too small for header: " + available + " bytes"); + } + + // Read header + byte[] headerBytes = new byte[HEADER_SIZE]; + stream.read(headerBytes); + ByteBuffer header = ByteBuffer.wrap(headerBytes).order(ByteOrder.LITTLE_ENDIAN); + + int compressionMode = header.get() & 0xFF; + int integerEncoding = header.get() & 0xFF; + int logVectorSize = header.get() & 0xFF; + totalCount = header.getInt(); + + if (compressionMode != COMPRESSION_MODE_ALP) { + throw new ParquetDecodingException("Unsupported ALP compression mode: " + compressionMode); + } + if (integerEncoding != INTEGER_ENCODING_FOR) { + throw new ParquetDecodingException("Unsupported ALP integer encoding: " + integerEncoding); + } + if (logVectorSize < MIN_LOG_VECTOR_SIZE || logVectorSize > MAX_LOG_VECTOR_SIZE) { + throw new ParquetDecodingException("Invalid ALP log vector size: " + logVectorSize + + ", must be between " + MIN_LOG_VECTOR_SIZE + " and " + MAX_LOG_VECTOR_SIZE); + } + if (totalCount < 0) { + throw new ParquetDecodingException("Invalid ALP element count: " + totalCount); + } + + vectorSize = 1 << logVectorSize; + numVectors = (totalCount + vectorSize - 1) / vectorSize; + currentIndex = 0; + decodedVectorIndex = -1; + + if (numVectors == 0) { + vectorOffsets = new int[0]; 
+ rawData = new byte[0]; + allocateDecodedBuffer(vectorSize); + return; + } + + // Read remaining data (offsets + vectors) + int remaining = (int) stream.available(); + rawData = new byte[remaining]; + stream.read(rawData); + + // Parse offsets from rawData + ByteBuffer body = ByteBuffer.wrap(rawData, 0, numVectors * OFFSET_SIZE).order(ByteOrder.LITTLE_ENDIAN); + vectorOffsets = new int[numVectors]; + for (int i = 0; i < numVectors; i++) { + vectorOffsets[i] = body.getInt(); + } + + allocateDecodedBuffer(vectorSize); + } + + @Override + public void skip() { + currentIndex++; + } + + @Override + public void skip(int n) { + if (n < 0 || currentIndex + n > totalCount) { + throw new ParquetDecodingException(String.format( + "Cannot skip %d elements. Current index: %d, total count: %d", n, currentIndex, totalCount)); + } + currentIndex += n; + } + + /** Number of elements in the given vector (last vector may be partial). */ + protected int elementsInVector(int vectorIdx) { + if (vectorIdx < totalCount / vectorSize) { + return vectorSize; + } + int rem = totalCount % vectorSize; + return (rem == 0) ? vectorSize : rem; + } + + /** Allocate the decoded buffer once; called from initFromPage. */ + protected abstract void allocateDecodedBuffer(int capacity); +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForDouble.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForDouble.java new file mode 100644 index 0000000000..5886d6ae9b --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForDouble.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import org.apache.parquet.io.ParquetDecodingException; + +/** + * ALP values reader for double columns with lazy per-vector decoding. + * + *

Reuses the decoded buffer across vectors to reduce allocations. + */ +public class AlpValuesReaderForDouble extends AlpValuesReader { + + private double[] decodedBuffer; + private long[] encodedLongBuffer; + + @Override + protected void allocateDecodedBuffer(int capacity) { + this.decodedBuffer = new double[capacity]; + this.encodedLongBuffer = new long[capacity]; + } + + @Override + public double readDouble() { + if (currentIndex >= totalCount) { + throw new ParquetDecodingException("ALP double reader exhausted at index " + currentIndex); + } + int vectorIdx = currentIndex / vectorSize; + int posInVector = currentIndex % vectorSize; + ensureVectorDecoded(vectorIdx); + currentIndex++; + return decodedBuffer[posInVector]; + } + + private void ensureVectorDecoded(int vectorIdx) { + if (vectorIdx == decodedVectorIndex) { + return; + } + int numElements = elementsInVector(vectorIdx); + int dataOffset = vectorOffsets[vectorIdx]; + AlpCompression.DoubleCompressedVector cv = + AlpCompression.DoubleCompressedVector.load(rawData, dataOffset, numElements); + AlpCompression.decompressDoubleVector(cv, decodedBuffer, encodedLongBuffer); + decodedVectorIndex = vectorIdx; + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForFloat.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForFloat.java new file mode 100644 index 0000000000..210b77f592 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesReaderForFloat.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import org.apache.parquet.io.ParquetDecodingException; + +/** + * ALP values reader for float columns with lazy per-vector decoding. + * + *

Reuses the decoded buffer across vectors to reduce allocations. + */ +public class AlpValuesReaderForFloat extends AlpValuesReader { + + private float[] decodedBuffer; + + @Override + protected void allocateDecodedBuffer(int capacity) { + this.decodedBuffer = new float[capacity]; + } + + @Override + public float readFloat() { + if (currentIndex >= totalCount) { + throw new ParquetDecodingException("ALP float reader exhausted at index " + currentIndex); + } + int vectorIdx = currentIndex / vectorSize; + int posInVector = currentIndex % vectorSize; + ensureVectorDecoded(vectorIdx); + currentIndex++; + return decodedBuffer[posInVector]; + } + + private void ensureVectorDecoded(int vectorIdx) { + if (vectorIdx == decodedVectorIndex) { + return; + } + int numElements = elementsInVector(vectorIdx); + int dataOffset = vectorOffsets[vectorIdx]; + AlpCompression.FloatCompressedVector cv = + AlpCompression.FloatCompressedVector.load(rawData, dataOffset, numElements); + AlpCompression.decompressFloatVector(cv, decodedBuffer); + decodedVectorIndex = vectorIdx; + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesWriter.java new file mode 100644 index 0000000000..f7c435cc1a --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpValuesWriter.java @@ -0,0 +1,375 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.apache.parquet.column.values.alp.AlpConstants.*; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.List; +import org.apache.parquet.bytes.ByteBufferAllocator; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.CapacityByteArrayOutputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.values.ValuesWriter; + +/** + * Incremental ALP values writer for float and double columns. + * + *

Buffers values into fixed-size vectors (default 1024). When a vector is full, + * it is compressed via {@link AlpCompression} and stored. On {@link #getBytes()}, + * assembles the ALP page: [Header(7B)][Offsets...][Vector0][Vector1]... + * + *

Uses {@link CapacityByteArrayOutputStream} for encoded vector storage + * and {@link BytesInput#concat} for zero-copy page assembly, integrating + * with the Parquet pipeline's memory management. + * + *

Sampling: the first vector's data is used to create an encoding preset via + * {@link AlpSampler}. The preset is cached for subsequent vectors. + */ +public abstract class AlpValuesWriter extends ValuesWriter { + + protected final int vectorSize; + protected int bufferedCount; // values in current partial vector + protected int totalCount; // total values written + protected AlpCompression.AlpEncodingPreset preset; + protected boolean presetReady; + + protected AlpValuesWriter(int vectorSize) { + this.vectorSize = AlpConstants.validateVectorSize(vectorSize); + } + + protected AlpValuesWriter() { + this(DEFAULT_VECTOR_SIZE); + } + + @Override + public Encoding getEncoding() { + return Encoding.ALP; + } + + // ========== FloatAlpValuesWriter ========== + + public static class FloatAlpValuesWriter extends AlpValuesWriter { + private float[] vectorBuffer; + private float[] samplerBuffer; + private int samplerCount; + private CapacityByteArrayOutputStream encodedVectors; + private final List vectorByteSizes = new ArrayList<>(); + + public FloatAlpValuesWriter(int initialCapacity, int pageSize, ByteBufferAllocator allocator) { + this(initialCapacity, pageSize, allocator, DEFAULT_VECTOR_SIZE); + } + + public FloatAlpValuesWriter( + int initialCapacity, int pageSize, ByteBufferAllocator allocator, int vectorSize) { + super(vectorSize); + this.vectorBuffer = new float[this.vectorSize]; + this.samplerBuffer = new float[SAMPLER_ROWGROUP_SIZE]; + this.encodedVectors = new CapacityByteArrayOutputStream(initialCapacity, pageSize, allocator); + } + + /** No-arg constructor for tests and benchmarks. 
*/ + public FloatAlpValuesWriter() { + this(DEFAULT_VECTOR_SIZE); + } + + public FloatAlpValuesWriter(int vectorSize) { + super(vectorSize); + this.vectorBuffer = new float[this.vectorSize]; + this.samplerBuffer = new float[SAMPLER_ROWGROUP_SIZE]; + this.encodedVectors = + new CapacityByteArrayOutputStream(64, 1024 * 1024, HeapByteBufferAllocator.getInstance()); + } + + @Override + public void writeFloat(float v) { + // Collect for sampling if preset not ready + if (!presetReady && samplerCount < samplerBuffer.length) { + samplerBuffer[samplerCount++] = v; + } + + vectorBuffer[bufferedCount++] = v; + totalCount++; + + if (bufferedCount == vectorSize) { + ensurePreset(); + flushVector(); + } + } + + private void ensurePreset() { + if (!presetReady) { + AlpSampler.FloatSampler sampler = new AlpSampler.FloatSampler(); + sampler.addSample(samplerBuffer, samplerCount); + preset = sampler.finalizeSampling(); + presetReady = true; + samplerBuffer = null; // free sampling buffer + } + } + + private void flushVector() { + AlpCompression.FloatCompressedVector cv = + AlpCompression.compressFloatVector(vectorBuffer, bufferedCount, preset); + int size = cv.storedSize(); + byte[] encoded = new byte[size]; + cv.store(encoded, 0); + encodedVectors.write(encoded, 0, size); + vectorByteSizes.add(size); + bufferedCount = 0; + } + + @Override + public BytesInput getBytes() { + // Flush any partial vector + if (bufferedCount > 0) { + ensurePreset(); + flushVector(); + } + + if (totalCount == 0) { + byte[] header = new byte[HEADER_SIZE]; + writeAlpHeader(header, vectorSize, 0); + return BytesInput.from(header); + } + + // Build header + byte[] header = new byte[HEADER_SIZE]; + writeAlpHeader(header, vectorSize, totalCount); + + // Build offset array + int numVectors = vectorByteSizes.size(); + int offsetsSectionSize = numVectors * OFFSET_SIZE; + ByteBuffer offsets = ByteBuffer.allocate(offsetsSectionSize).order(ByteOrder.LITTLE_ENDIAN); + int currentOffset = offsetsSectionSize; + for 
(int i = 0; i < numVectors; i++) { + offsets.putInt(currentOffset); + currentOffset += vectorByteSizes.get(i); + } + + return BytesInput.concat( + BytesInput.from(header), BytesInput.from(offsets.array()), BytesInput.from(encodedVectors)); + } + + @Override + public long getBufferedSize() { + long size = HEADER_SIZE + encodedVectors.size(); + size += (long) vectorByteSizes.size() * OFFSET_SIZE; + size += (long) bufferedCount * Float.BYTES; + return size; + } + + @Override + public long getAllocatedSize() { + long size = (long) vectorBuffer.length * Float.BYTES; + if (samplerBuffer != null) { + size += (long) samplerBuffer.length * Float.BYTES; + } + size += encodedVectors.getCapacity(); + return size; + } + + @Override + public String memUsageString(String prefix) { + return String.format( + "%s ALPFloatWriter: %d values, %d vectors, %d bytes allocated", + prefix, totalCount, vectorByteSizes.size(), getAllocatedSize()); + } + + @Override + public void reset() { + bufferedCount = 0; + totalCount = 0; + encodedVectors.reset(); + vectorByteSizes.clear(); + preset = null; + presetReady = false; + vectorBuffer = new float[vectorSize]; + samplerBuffer = new float[SAMPLER_ROWGROUP_SIZE]; + samplerCount = 0; + } + + @Override + public void close() { + encodedVectors.close(); + } + } + + // ========== DoubleAlpValuesWriter ========== + + public static class DoubleAlpValuesWriter extends AlpValuesWriter { + private double[] vectorBuffer; + private double[] samplerBuffer; + private int samplerCount; + private CapacityByteArrayOutputStream encodedVectors; + private final List vectorByteSizes = new ArrayList<>(); + + public DoubleAlpValuesWriter(int initialCapacity, int pageSize, ByteBufferAllocator allocator) { + this(initialCapacity, pageSize, allocator, DEFAULT_VECTOR_SIZE); + } + + public DoubleAlpValuesWriter( + int initialCapacity, int pageSize, ByteBufferAllocator allocator, int vectorSize) { + super(vectorSize); + this.vectorBuffer = new double[this.vectorSize]; + 
this.samplerBuffer = new double[SAMPLER_ROWGROUP_SIZE]; + this.encodedVectors = new CapacityByteArrayOutputStream(initialCapacity, pageSize, allocator); + } + + /** No-arg constructor for tests and benchmarks. */ + public DoubleAlpValuesWriter() { + this(DEFAULT_VECTOR_SIZE); + } + + public DoubleAlpValuesWriter(int vectorSize) { + super(vectorSize); + this.vectorBuffer = new double[this.vectorSize]; + this.samplerBuffer = new double[SAMPLER_ROWGROUP_SIZE]; + this.encodedVectors = + new CapacityByteArrayOutputStream(64, 1024 * 1024, HeapByteBufferAllocator.getInstance()); + } + + @Override + public void writeDouble(double v) { + if (!presetReady && samplerCount < samplerBuffer.length) { + samplerBuffer[samplerCount++] = v; + } + + vectorBuffer[bufferedCount++] = v; + totalCount++; + + if (bufferedCount == vectorSize) { + ensurePreset(); + flushVector(); + } + } + + private void ensurePreset() { + if (!presetReady) { + AlpSampler.DoubleSampler sampler = new AlpSampler.DoubleSampler(); + sampler.addSample(samplerBuffer, samplerCount); + preset = sampler.finalizeSampling(); + presetReady = true; + samplerBuffer = null; + } + } + + private void flushVector() { + AlpCompression.DoubleCompressedVector cv = + AlpCompression.compressDoubleVector(vectorBuffer, bufferedCount, preset); + int size = cv.storedSize(); + byte[] encoded = new byte[size]; + cv.store(encoded, 0); + encodedVectors.write(encoded, 0, size); + vectorByteSizes.add(size); + bufferedCount = 0; + } + + @Override + public BytesInput getBytes() { + if (bufferedCount > 0) { + ensurePreset(); + flushVector(); + } + + if (totalCount == 0) { + byte[] header = new byte[HEADER_SIZE]; + writeAlpHeader(header, vectorSize, 0); + return BytesInput.from(header); + } + + byte[] header = new byte[HEADER_SIZE]; + writeAlpHeader(header, vectorSize, totalCount); + + int numVectors = vectorByteSizes.size(); + int offsetsSectionSize = numVectors * OFFSET_SIZE; + ByteBuffer offsets = 
ByteBuffer.allocate(offsetsSectionSize).order(ByteOrder.LITTLE_ENDIAN); + int currentOffset = offsetsSectionSize; + for (int i = 0; i < numVectors; i++) { + offsets.putInt(currentOffset); + currentOffset += vectorByteSizes.get(i); + } + + return BytesInput.concat( + BytesInput.from(header), BytesInput.from(offsets.array()), BytesInput.from(encodedVectors)); + } + + @Override + public long getBufferedSize() { + long size = HEADER_SIZE + encodedVectors.size(); + size += (long) vectorByteSizes.size() * OFFSET_SIZE; + size += (long) bufferedCount * Double.BYTES; + return size; + } + + @Override + public long getAllocatedSize() { + long size = (long) vectorBuffer.length * Double.BYTES; + if (samplerBuffer != null) { + size += (long) samplerBuffer.length * Double.BYTES; + } + size += encodedVectors.getCapacity(); + return size; + } + + @Override + public String memUsageString(String prefix) { + return String.format( + "%s ALPDoubleWriter: %d values, %d vectors, %d bytes allocated", + prefix, totalCount, vectorByteSizes.size(), getAllocatedSize()); + } + + @Override + public void reset() { + bufferedCount = 0; + totalCount = 0; + encodedVectors.reset(); + vectorByteSizes.clear(); + preset = null; + presetReady = false; + vectorBuffer = new double[vectorSize]; + samplerBuffer = new double[SAMPLER_ROWGROUP_SIZE]; + samplerCount = 0; + } + + @Override + public void close() { + encodedVectors.close(); + } + } + + // ========== Header helpers ========== + + static void writeAlpHeader(byte[] output, int vectorSize, int numElements) { + int logVs = Integer.numberOfTrailingZeros(vectorSize); + output[0] = (byte) COMPRESSION_MODE_ALP; + output[1] = (byte) INTEGER_ENCODING_FOR; + output[2] = (byte) logVs; + writeLittleEndianInt(output, 3, numElements); + } + + static void writeLittleEndianInt(byte[] output, int pos, int value) { + output[pos] = (byte) value; + output[pos + 1] = (byte) (value >> 8); + output[pos + 2] = (byte) (value >> 16); + output[pos + 3] = (byte) (value >> 24); 
+ } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpWrapper.java b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpWrapper.java new file mode 100644 index 0000000000..a10704d7a4 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpWrapper.java @@ -0,0 +1,320 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.apache.parquet.column.values.alp.AlpConstants.*; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.List; +import org.apache.parquet.Preconditions; + +/** + * Top-level API for ALP page-level encoding and decoding. + * + *

Page layout (offset-based interleaved, matching C++ AlpWrapper): + *

+ * [Header(7B)][Offset0..OffsetN-1][Vector0][Vector1]...[VectorN-1]
+ * 
+ * where each Vector = [AlpInfo(4B)][ForInfo(5B/9B)][PackedValues][ExcPositions][ExcValues] + * + *

Header format (7 bytes, little-endian): + *

+ * [compression_mode(1B)][integer_encoding(1B)][log_vector_size(1B)][num_elements(4B LE)]
+ * 
+ */ +public final class AlpWrapper { + + private AlpWrapper() {} + + // ========== Sampling presets ========== + + /** Create a sampling-based encoding preset for float data. */ + public static AlpCompression.AlpEncodingPreset createFloatSamplingPreset(float[] data, int count) { + AlpSampler.FloatSampler sampler = new AlpSampler.FloatSampler(); + sampler.addSample(data, count); + return sampler.finalizeSampling(); + } + + /** Create a sampling-based encoding preset for double data. */ + public static AlpCompression.AlpEncodingPreset createDoubleSamplingPreset(double[] data, int count) { + AlpSampler.DoubleSampler sampler = new AlpSampler.DoubleSampler(); + sampler.addSample(data, count); + return sampler.finalizeSampling(); + } + + // ========== Encode floats ========== + + /** + * Encode float data into ALP compressed page format. + * + * @param input the float values to encode + * @param count number of values + * @param output output byte array (must be at least maxCompressedSizeFloat(count) bytes) + * @param preset the encoding preset from sampling + * @return the number of compressed bytes written + */ + public static int encodeFloats(float[] input, int count, byte[] output, AlpCompression.AlpEncodingPreset preset) { + Preconditions.checkArgument(count >= 0, "count must be non-negative, got: %s", count); + if (count == 0) { + writeHeader(output, 0, COMPRESSION_MODE_ALP, INTEGER_ENCODING_FOR, DEFAULT_VECTOR_SIZE_LOG, 0); + return HEADER_SIZE; + } + + int vectorSize = DEFAULT_VECTOR_SIZE; + int numVectors = (count + vectorSize - 1) / vectorSize; + + // Phase 1: Compress all vectors + List vectors = new ArrayList<>(numVectors); + for (int i = 0; i < numVectors; i++) { + int offset = i * vectorSize; + int elementsInVector = Math.min(vectorSize, count - offset); + float[] vectorInput = new float[elementsInVector]; + System.arraycopy(input, offset, vectorInput, 0, elementsInVector); + vectors.add(AlpCompression.compressFloatVector(vectorInput, elementsInVector, 
preset)); + } + + // Phase 2: Calculate offsets + int offsetsSectionSize = numVectors * OFFSET_SIZE; + int[] vectorOffsets = new int[numVectors]; + int currentOffset = offsetsSectionSize; + for (int i = 0; i < numVectors; i++) { + vectorOffsets[i] = currentOffset; + currentOffset += + ALP_INFO_SIZE + FLOAT_FOR_INFO_SIZE + vectors.get(i).dataStoredSize(); + } + int bodySize = currentOffset; + int totalSize = HEADER_SIZE + bodySize; + + // Phase 3: Write header + writeHeader(output, 0, COMPRESSION_MODE_ALP, INTEGER_ENCODING_FOR, DEFAULT_VECTOR_SIZE_LOG, count); + + // Phase 4: Write offsets + ByteBuffer buf = ByteBuffer.wrap(output, HEADER_SIZE, bodySize).order(ByteOrder.LITTLE_ENDIAN); + for (int offset : vectorOffsets) { + buf.putInt(offset); + } + + // Phase 5: Write interleaved vectors + for (int i = 0; i < numVectors; i++) { + AlpCompression.FloatCompressedVector v = vectors.get(i); + int pos = HEADER_SIZE + vectorOffsets[i]; + v.store(output, pos); + } + + return totalSize; + } + + // ========== Encode doubles ========== + + public static int encodeDoubles(double[] input, int count, byte[] output, AlpCompression.AlpEncodingPreset preset) { + Preconditions.checkArgument(count >= 0, "count must be non-negative, got: %s", count); + if (count == 0) { + writeHeader(output, 0, COMPRESSION_MODE_ALP, INTEGER_ENCODING_FOR, DEFAULT_VECTOR_SIZE_LOG, 0); + return HEADER_SIZE; + } + + int vectorSize = DEFAULT_VECTOR_SIZE; + int numVectors = (count + vectorSize - 1) / vectorSize; + + List vectors = new ArrayList<>(numVectors); + for (int i = 0; i < numVectors; i++) { + int offset = i * vectorSize; + int elementsInVector = Math.min(vectorSize, count - offset); + double[] vectorInput = new double[elementsInVector]; + System.arraycopy(input, offset, vectorInput, 0, elementsInVector); + vectors.add(AlpCompression.compressDoubleVector(vectorInput, elementsInVector, preset)); + } + + int offsetsSectionSize = numVectors * OFFSET_SIZE; + int[] vectorOffsets = new int[numVectors]; + 
int currentOffset = offsetsSectionSize; + for (int i = 0; i < numVectors; i++) { + vectorOffsets[i] = currentOffset; + currentOffset += + ALP_INFO_SIZE + DOUBLE_FOR_INFO_SIZE + vectors.get(i).dataStoredSize(); + } + int bodySize = currentOffset; + int totalSize = HEADER_SIZE + bodySize; + + writeHeader(output, 0, COMPRESSION_MODE_ALP, INTEGER_ENCODING_FOR, DEFAULT_VECTOR_SIZE_LOG, count); + + ByteBuffer buf = ByteBuffer.wrap(output, HEADER_SIZE, bodySize).order(ByteOrder.LITTLE_ENDIAN); + for (int offset : vectorOffsets) { + buf.putInt(offset); + } + + for (int i = 0; i < numVectors; i++) { + AlpCompression.DoubleCompressedVector v = vectors.get(i); + int pos = HEADER_SIZE + vectorOffsets[i]; + v.store(output, pos); + } + + return totalSize; + } + + // ========== Decode floats ========== + + /** + * Decode ALP compressed page to float values. + * + * @param compressed the compressed page bytes + * @param compSize number of compressed bytes + * @param output output float array (must hold numElements values) + * @param numElements number of elements to decode + */ + public static void decodeFloats(byte[] compressed, int compSize, float[] output, int numElements) { + Preconditions.checkArgument(compSize >= HEADER_SIZE, "compressed size too small for header: %s", compSize); + + ByteBuffer header = ByteBuffer.wrap(compressed, 0, HEADER_SIZE).order(ByteOrder.LITTLE_ENDIAN); + int compressionMode = header.get() & 0xFF; + int integerEncoding = header.get() & 0xFF; + int logVectorSize = header.get() & 0xFF; + int storedNumElements = header.getInt(); + + Preconditions.checkArgument( + compressionMode == COMPRESSION_MODE_ALP, "unsupported compression mode: %s", compressionMode); + Preconditions.checkArgument( + integerEncoding == INTEGER_ENCODING_FOR, "unsupported integer encoding: %s", integerEncoding); + + int vectorSize = 1 << logVectorSize; + int numVectors = (storedNumElements + vectorSize - 1) / vectorSize; + + if (numVectors == 0) return; + + // Read offsets + 
ByteBuffer body = + ByteBuffer.wrap(compressed, HEADER_SIZE, compSize - HEADER_SIZE).order(ByteOrder.LITTLE_ENDIAN); + int[] vectorOffsets = new int[numVectors]; + for (int i = 0; i < numVectors; i++) { + vectorOffsets[i] = body.getInt(); + } + + // Decode each vector + int outputOffset = 0; + for (int vi = 0; vi < numVectors; vi++) { + int elementsInVector; + if (vi < storedNumElements / vectorSize) { + elementsInVector = vectorSize; + } else { + elementsInVector = storedNumElements % vectorSize; + if (elementsInVector == 0) elementsInVector = vectorSize; + } + + int vectorPos = HEADER_SIZE + vectorOffsets[vi]; + AlpCompression.FloatCompressedVector cv = + AlpCompression.FloatCompressedVector.load(compressed, vectorPos, elementsInVector); + + float[] vectorOutput = new float[elementsInVector]; + AlpCompression.decompressFloatVector(cv, vectorOutput); + System.arraycopy( + vectorOutput, 0, output, outputOffset, Math.min(elementsInVector, numElements - outputOffset)); + outputOffset += elementsInVector; + } + } + + // ========== Decode doubles ========== + + public static void decodeDoubles(byte[] compressed, int compSize, double[] output, int numElements) { + Preconditions.checkArgument(compSize >= HEADER_SIZE, "compressed size too small for header: %s", compSize); + + ByteBuffer header = ByteBuffer.wrap(compressed, 0, HEADER_SIZE).order(ByteOrder.LITTLE_ENDIAN); + int compressionMode = header.get() & 0xFF; + int integerEncoding = header.get() & 0xFF; + int logVectorSize = header.get() & 0xFF; + int storedNumElements = header.getInt(); + + Preconditions.checkArgument( + compressionMode == COMPRESSION_MODE_ALP, "unsupported compression mode: %s", compressionMode); + Preconditions.checkArgument( + integerEncoding == INTEGER_ENCODING_FOR, "unsupported integer encoding: %s", integerEncoding); + + int vectorSize = 1 << logVectorSize; + int numVectors = (storedNumElements + vectorSize - 1) / vectorSize; + + if (numVectors == 0) return; + + ByteBuffer body = + 
ByteBuffer.wrap(compressed, HEADER_SIZE, compSize - HEADER_SIZE).order(ByteOrder.LITTLE_ENDIAN); + int[] vectorOffsets = new int[numVectors]; + for (int i = 0; i < numVectors; i++) { + vectorOffsets[i] = body.getInt(); + } + + int outputOffset = 0; + for (int vi = 0; vi < numVectors; vi++) { + int elementsInVector; + if (vi < storedNumElements / vectorSize) { + elementsInVector = vectorSize; + } else { + elementsInVector = storedNumElements % vectorSize; + if (elementsInVector == 0) elementsInVector = vectorSize; + } + + int vectorPos = HEADER_SIZE + vectorOffsets[vi]; + AlpCompression.DoubleCompressedVector cv = + AlpCompression.DoubleCompressedVector.load(compressed, vectorPos, elementsInVector); + + double[] vectorOutput = new double[elementsInVector]; + AlpCompression.decompressDoubleVector(cv, vectorOutput); + System.arraycopy( + vectorOutput, 0, output, outputOffset, Math.min(elementsInVector, numElements - outputOffset)); + outputOffset += elementsInVector; + } + } + + // ========== Max compressed size ========== + + /** Maximum compressed size for float data of given element count. */ + public static long maxCompressedSizeFloat(int numElements) { + long size = HEADER_SIZE; + long numVectors = (numElements + DEFAULT_VECTOR_SIZE - 1) / DEFAULT_VECTOR_SIZE; + size += numVectors * OFFSET_SIZE; + size += numVectors * (ALP_INFO_SIZE + FLOAT_FOR_INFO_SIZE); + // Worst case: all values bit-packed at full width + all exceptions + size += (long) numElements * Float.BYTES; // packed values worst case + size += (long) numElements * Float.BYTES; // exception values + size += (long) numElements * POSITION_SIZE; // exception positions + return size; + } + + /** Maximum compressed size for double data of given element count. 
*/ + public static long maxCompressedSizeDouble(int numElements) { + long size = HEADER_SIZE; + long numVectors = (numElements + DEFAULT_VECTOR_SIZE - 1) / DEFAULT_VECTOR_SIZE; + size += numVectors * OFFSET_SIZE; + size += numVectors * (ALP_INFO_SIZE + DOUBLE_FOR_INFO_SIZE); + size += (long) numElements * Double.BYTES; + size += (long) numElements * Double.BYTES; + size += (long) numElements * POSITION_SIZE; + return size; + } + + // ========== Header helpers ========== + + private static void writeHeader( + byte[] output, int offset, int compressionMode, int integerEncoding, int logVectorSize, int numElements) { + ByteBuffer buf = ByteBuffer.wrap(output, offset, HEADER_SIZE).order(ByteOrder.LITTLE_ENDIAN); + buf.put((byte) compressionMode); + buf.put((byte) integerEncoding); + buf.put((byte) logVectorSize); + buf.putInt(numElements); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/factory/DefaultV2ValuesWriterFactory.java b/parquet-column/src/main/java/org/apache/parquet/column/values/factory/DefaultV2ValuesWriterFactory.java index c50b4e49c5..535e3a2fdc 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/factory/DefaultV2ValuesWriterFactory.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/factory/DefaultV2ValuesWriterFactory.java @@ -25,6 +25,7 @@ import org.apache.parquet.column.Encoding; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.alp.AlpValuesWriter; import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesWriter; import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriterForInteger; import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriterForLong; @@ -159,7 +160,12 @@ private ValuesWriter getInt96ValuesWriter(ColumnDescriptor path) { private ValuesWriter getDoubleValuesWriter(ColumnDescriptor path) { final ValuesWriter 
fallbackWriter; - if (this.parquetProperties.isByteStreamSplitEnabled(path)) { + if (this.parquetProperties.isAlpEnabled(path)) { + fallbackWriter = new AlpValuesWriter.DoubleAlpValuesWriter( + parquetProperties.getInitialSlabSize(), + parquetProperties.getPageSizeThreshold(), + parquetProperties.getAllocator()); + } else if (this.parquetProperties.isByteStreamSplitEnabled(path)) { fallbackWriter = new ByteStreamSplitValuesWriter.DoubleByteStreamSplitValuesWriter( parquetProperties.getInitialSlabSize(), parquetProperties.getPageSizeThreshold(), @@ -176,7 +182,12 @@ private ValuesWriter getDoubleValuesWriter(ColumnDescriptor path) { private ValuesWriter getFloatValuesWriter(ColumnDescriptor path) { final ValuesWriter fallbackWriter; - if (this.parquetProperties.isByteStreamSplitEnabled(path)) { + if (this.parquetProperties.isAlpEnabled(path)) { + fallbackWriter = new AlpValuesWriter.FloatAlpValuesWriter( + parquetProperties.getInitialSlabSize(), + parquetProperties.getPageSizeThreshold(), + parquetProperties.getAllocator()); + } else if (this.parquetProperties.isByteStreamSplitEnabled(path)) { fallbackWriter = new ByteStreamSplitValuesWriter.FloatByteStreamSplitValuesWriter( parquetProperties.getInitialSlabSize(), parquetProperties.getPageSizeThreshold(), diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpCompressionTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpCompressionTest.java new file mode 100644 index 0000000000..9f840016da --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpCompressionTest.java @@ -0,0 +1,372 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.junit.Assert.*; + +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import org.junit.Test; + +public class AlpCompressionTest { + + // ========== Helpers ========== + + private static AlpCompression.AlpEncodingPreset allFloatCombos() { + List combos = new ArrayList<>(); + for (int e = 0; e <= AlpConstants.FLOAT_MAX_EXPONENT; e++) { + for (int f = 0; f <= e; f++) { + combos.add(new int[] {e, f}); + } + } + return new AlpCompression.AlpEncodingPreset(combos.toArray(new int[0][])); + } + + private static AlpCompression.AlpEncodingPreset allDoubleCombos() { + List combos = new ArrayList<>(); + for (int e = 0; e <= AlpConstants.DOUBLE_MAX_EXPONENT; e++) { + for (int f = 0; f <= e; f++) { + combos.add(new int[] {e, f}); + } + } + return new AlpCompression.AlpEncodingPreset(combos.toArray(new int[0][])); + } + + private static void assertFloatRoundTrip(float[] input) { + AlpCompression.AlpEncodingPreset preset = allFloatCombos(); + AlpCompression.FloatCompressedVector cv = AlpCompression.compressFloatVector(input, input.length, preset); + float[] output = new float[input.length]; + AlpCompression.decompressFloatVector(cv, output); + for (int i = 0; i < input.length; i++) { + assertEquals( + "Mismatch at index " + i, Float.floatToRawIntBits(input[i]), Float.floatToRawIntBits(output[i])); + } 
+ } + + private static void assertDoubleRoundTrip(double[] input) { + AlpCompression.AlpEncodingPreset preset = allDoubleCombos(); + AlpCompression.DoubleCompressedVector cv = AlpCompression.compressDoubleVector(input, input.length, preset); + double[] output = new double[input.length]; + AlpCompression.decompressDoubleVector(cv, output); + for (int i = 0; i < input.length; i++) { + assertEquals( + "Mismatch at index " + i, + Double.doubleToRawLongBits(input[i]), + Double.doubleToRawLongBits(output[i])); + } + } + + // ========== Float compress/decompress ========== + + @Test + public void testFloatConstantValues() { + float[] input = new float[100]; + for (int i = 0; i < 100; i++) { + input[i] = 3.14f; + } + assertFloatRoundTrip(input); + + // Constant → bitWidth should be 0 + AlpCompression.FloatCompressedVector cv = + AlpCompression.compressFloatVector(input, input.length, allFloatCombos()); + assertEquals(0, cv.bitWidth); + } + + @Test + public void testFloatDecimalValues() { + float[] input = new float[100]; + for (int i = 0; i < 100; i++) { + input[i] = i * 0.1f; + } + assertFloatRoundTrip(input); + } + + @Test + public void testFloatIntegerValues() { + float[] input = new float[100]; + for (int i = 0; i < 100; i++) { + input[i] = i; + } + assertFloatRoundTrip(input); + } + + @Test + public void testFloatRandomValues() { + Random rng = new Random(42); + float[] input = new float[200]; + for (int i = 0; i < 200; i++) { + input[i] = Math.round(rng.nextFloat() * 10000) / 100.0f; + } + assertFloatRoundTrip(input); + } + + @Test + public void testFloatSpecialValues() { + float[] input = { + 1.0f, Float.NaN, 2.0f, Float.POSITIVE_INFINITY, 3.0f, Float.NEGATIVE_INFINITY, 4.0f, -0.0f, 5.0f + }; + assertFloatRoundTrip(input); + + AlpCompression.FloatCompressedVector cv = + AlpCompression.compressFloatVector(input, input.length, allFloatCombos()); + // NaN, +Inf, -Inf, -0.0 should be exceptions + assertTrue(cv.numExceptions >= 4); + } + + @Test + public void 
testFloatSingleElement() { + assertFloatRoundTrip(new float[] {42.5f}); + } + + @Test + public void testFloatEmptyVector() { + AlpCompression.FloatCompressedVector cv = AlpCompression.compressFloatVector(new float[0], 0, allFloatCombos()); + assertEquals(0, cv.numElements); + assertEquals(0, cv.numExceptions); + } + + @Test + public void testFloatAllExceptions() { + float[] input = new float[16]; + for (int i = 0; i < 16; i++) { + input[i] = Float.NaN; + } + assertFloatRoundTrip(input); + + AlpCompression.FloatCompressedVector cv = + AlpCompression.compressFloatVector(input, input.length, allFloatCombos()); + assertEquals(16, cv.numExceptions); + assertEquals(0, cv.bitWidth); + } + + @Test + public void testFloatExactVectorSize() { + float[] input = new float[AlpConstants.DEFAULT_VECTOR_SIZE]; + for (int i = 0; i < input.length; i++) { + input[i] = i * 0.01f; + } + assertFloatRoundTrip(input); + } + + @Test + public void testFloatNonMultipleOf8() { + // 13 elements — tests tail handling in pack/unpack + float[] input = new float[13]; + for (int i = 0; i < 13; i++) { + input[i] = i * 1.5f; + } + assertFloatRoundTrip(input); + } + + // ========== Float store/load ========== + + @Test + public void testFloatStoreLoadRoundTrip() { + float[] input = new float[50]; + for (int i = 0; i < 50; i++) { + input[i] = i * 0.3f; + } + + AlpCompression.FloatCompressedVector cv = + AlpCompression.compressFloatVector(input, input.length, allFloatCombos()); + + byte[] buf = new byte[cv.storedSize()]; + cv.store(buf, 0); + + AlpCompression.FloatCompressedVector loaded = AlpCompression.FloatCompressedVector.load(buf, 0, input.length); + + float[] output = new float[input.length]; + AlpCompression.decompressFloatVector(loaded, output); + + for (int i = 0; i < input.length; i++) { + assertEquals( + "Mismatch at index " + i, Float.floatToRawIntBits(input[i]), Float.floatToRawIntBits(output[i])); + } + } + + @Test + public void testFloatStoreLoadMetadata() { + float[] input = {1.1f, 2.2f, 
3.3f, Float.NaN, 5.5f}; + AlpCompression.FloatCompressedVector cv = + AlpCompression.compressFloatVector(input, input.length, allFloatCombos()); + + byte[] buf = new byte[cv.storedSize()]; + cv.store(buf, 0); + + AlpCompression.FloatCompressedVector loaded = AlpCompression.FloatCompressedVector.load(buf, 0, input.length); + + assertEquals(cv.exponent, loaded.exponent); + assertEquals(cv.factor, loaded.factor); + assertEquals(cv.numExceptions, loaded.numExceptions); + assertEquals(cv.frameOfReference, loaded.frameOfReference); + assertEquals(cv.bitWidth, loaded.bitWidth); + assertEquals(cv.numElements, loaded.numElements); + } + + // ========== Double compress/decompress ========== + + @Test + public void testDoubleConstantValues() { + double[] input = new double[100]; + for (int i = 0; i < 100; i++) { + input[i] = 3.14; + } + assertDoubleRoundTrip(input); + + AlpCompression.DoubleCompressedVector cv = + AlpCompression.compressDoubleVector(input, input.length, allDoubleCombos()); + assertEquals(0, cv.bitWidth); + } + + @Test + public void testDoubleDecimalValues() { + double[] input = new double[100]; + for (int i = 0; i < 100; i++) { + input[i] = i * 0.01; + } + assertDoubleRoundTrip(input); + } + + @Test + public void testDoubleRandomValues() { + Random rng = new Random(42); + double[] input = new double[200]; + for (int i = 0; i < 200; i++) { + input[i] = Math.round(rng.nextDouble() * 10000) / 100.0; + } + assertDoubleRoundTrip(input); + } + + @Test + public void testDoubleSpecialValues() { + double[] input = {1.0, Double.NaN, 2.0, Double.POSITIVE_INFINITY, 3.0, Double.NEGATIVE_INFINITY, 4.0, -0.0, 5.0 + }; + assertDoubleRoundTrip(input); + + AlpCompression.DoubleCompressedVector cv = + AlpCompression.compressDoubleVector(input, input.length, allDoubleCombos()); + assertTrue(cv.numExceptions >= 4); + } + + @Test + public void testDoubleSingleElement() { + assertDoubleRoundTrip(new double[] {42.5}); + } + + @Test + public void testDoubleAllExceptions() { + 
double[] input = new double[16]; + for (int i = 0; i < 16; i++) { + input[i] = Double.NaN; + } + assertDoubleRoundTrip(input); + + AlpCompression.DoubleCompressedVector cv = + AlpCompression.compressDoubleVector(input, input.length, allDoubleCombos()); + assertEquals(16, cv.numExceptions); + } + + @Test + public void testDoubleNonMultipleOf8() { + double[] input = new double[13]; + for (int i = 0; i < 13; i++) { + input[i] = i * 1.5; + } + assertDoubleRoundTrip(input); + } + + // ========== Double store/load ========== + + @Test + public void testDoubleStoreLoadRoundTrip() { + double[] input = new double[50]; + for (int i = 0; i < 50; i++) { + input[i] = i * 0.3; + } + + AlpCompression.DoubleCompressedVector cv = + AlpCompression.compressDoubleVector(input, input.length, allDoubleCombos()); + + byte[] buf = new byte[cv.storedSize()]; + cv.store(buf, 0); + + AlpCompression.DoubleCompressedVector loaded = AlpCompression.DoubleCompressedVector.load(buf, 0, input.length); + + double[] output = new double[input.length]; + AlpCompression.decompressDoubleVector(loaded, output); + + for (int i = 0; i < input.length; i++) { + assertEquals( + "Mismatch at index " + i, + Double.doubleToRawLongBits(input[i]), + Double.doubleToRawLongBits(output[i])); + } + } + + // ========== Bit packing helpers ========== + + @Test + public void testPackUnpackInts() { + int[] values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + int bitWidth = 4; + byte[] packed = new byte[AlpEncoderDecoder.bitPackedSize(values.length, bitWidth)]; + AlpCompression.packInts(values, values.length, bitWidth, packed); + + int[] unpacked = new int[values.length]; + AlpCompression.unpackInts(packed, values.length, bitWidth, unpacked); + assertArrayEquals(values, unpacked); + } + + @Test + public void testPackUnpackIntsNonMultipleOf8() { + int[] values = {5, 10, 15, 20, 25}; + int bitWidth = 5; + byte[] packed = new byte[AlpEncoderDecoder.bitPackedSize(values.length, bitWidth)]; + 
AlpCompression.packInts(values, values.length, bitWidth, packed); + + int[] unpacked = new int[values.length]; + AlpCompression.unpackInts(packed, values.length, bitWidth, unpacked); + assertArrayEquals(values, unpacked); + } + + @Test + public void testPackUnpackLongs() { + long[] values = {0, 1, 2, 3, 4, 5, 6, 7, 100, 200, 300, 400, 500, 600, 700, 800}; + int bitWidth = 10; + byte[] packed = new byte[AlpEncoderDecoder.bitPackedSize(values.length, bitWidth)]; + AlpCompression.packLongs(values, values.length, bitWidth, packed); + + long[] unpacked = new long[values.length]; + AlpCompression.unpackLongs(packed, values.length, bitWidth, unpacked); + assertArrayEquals(values, unpacked); + } + + @Test + public void testPackUnpackLongsNonMultipleOf8() { + long[] values = {10, 20, 30}; + int bitWidth = 6; + byte[] packed = new byte[AlpEncoderDecoder.bitPackedSize(values.length, bitWidth)]; + AlpCompression.packLongs(values, values.length, bitWidth, packed); + + long[] unpacked = new long[values.length]; + AlpCompression.unpackLongs(packed, values.length, bitWidth, unpacked); + assertArrayEquals(values, unpacked); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpConstantsTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpConstantsTest.java new file mode 100644 index 0000000000..10c1ea1454 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpConstantsTest.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.junit.Assert.*; + +import org.junit.Test; + +public class AlpConstantsTest { + + @Test + public void testHeaderSize() { + assertEquals(7, AlpConstants.HEADER_SIZE); + } + + @Test + public void testFloatPow10Table() { + assertEquals(11, AlpConstants.FLOAT_POW10.length); + assertEquals(1.0f, AlpConstants.FLOAT_POW10[0], 0.0f); + assertEquals(10.0f, AlpConstants.FLOAT_POW10[1], 0.0f); + assertEquals(1e10f, AlpConstants.FLOAT_POW10[10], 0.0f); + } + + @Test + public void testDoublePow10Table() { + assertEquals(19, AlpConstants.DOUBLE_POW10.length); + assertEquals(1.0, AlpConstants.DOUBLE_POW10[0], 0.0); + assertEquals(10.0, AlpConstants.DOUBLE_POW10[1], 0.0); + assertEquals(1e18, AlpConstants.DOUBLE_POW10[18], 0.0); + } + + @Test + public void testIntegerPow10() { + assertEquals(1L, AlpConstants.integerPow10(0)); + assertEquals(10L, AlpConstants.integerPow10(1)); + assertEquals(100L, AlpConstants.integerPow10(2)); + assertEquals(1_000_000_000L, AlpConstants.integerPow10(9)); + assertEquals(1_000_000_000_000_000_000L, AlpConstants.integerPow10(18)); + } + + @Test(expected = IllegalArgumentException.class) + public void testIntegerPow10NegativePower() { + AlpConstants.integerPow10(-1); + } + + @Test(expected = IllegalArgumentException.class) + public void testIntegerPow10TooLargePower() { + AlpConstants.integerPow10(19); + } + + @Test + public void testValidateVectorSize() { + assertEquals(8, AlpConstants.validateVectorSize(8)); + assertEquals(1024, 
AlpConstants.validateVectorSize(1024)); + assertEquals(32768, AlpConstants.validateVectorSize(32768)); + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateVectorSizeNotPowerOf2() { + AlpConstants.validateVectorSize(100); + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateVectorSizeTooSmall() { + AlpConstants.validateVectorSize(4); // 2^2 < MIN_LOG=3 + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateVectorSizeTooLarge() { + AlpConstants.validateVectorSize(65536); // 2^16 > MAX_LOG=15 + } + + @Test + public void testEncodingLimits() { + assertTrue(AlpConstants.FLOAT_ENCODING_UPPER_LIMIT > 0); + assertTrue(AlpConstants.FLOAT_ENCODING_LOWER_LIMIT < 0); + assertTrue(AlpConstants.DOUBLE_ENCODING_UPPER_LIMIT > 0); + assertTrue(AlpConstants.DOUBLE_ENCODING_LOWER_LIMIT < 0); + } + + @Test + public void testMetadataSizes() { + assertEquals(4, AlpConstants.ALP_INFO_SIZE); + assertEquals(5, AlpConstants.FLOAT_FOR_INFO_SIZE); + assertEquals(9, AlpConstants.DOUBLE_FOR_INFO_SIZE); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpCrossImplTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpCrossImplTest.java new file mode 100644 index 0000000000..86fd318b97 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpCrossImplTest.java @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.junit.Assert.*; + +import org.junit.Test; + +/** + * Cross-implementation tests: decode C++ reference blobs with the Java decoder. + * + *

Reference blobs were generated by the C++ Arrow ALP implementation using + * {@code generate_reference_blobs.cc}. These tests verify that the Java decoder + * produces bit-identical output for the same compressed bytes. + */ +public class AlpCrossImplTest { + + // ========== Auto-generated reference data from C++ ========== + + // === Float decimal values (16 elements) === + private static final float[] FLOAT_DECIMAL_INPUT = { + Float.intBitsToFloat(0x00000000), Float.intBitsToFloat(0x3DCCCCCD), + Float.intBitsToFloat(0x3E4CCCCD), Float.intBitsToFloat(0x3E99999A), + Float.intBitsToFloat(0x3ECCCCCD), Float.intBitsToFloat(0x3F000000), + Float.intBitsToFloat(0x3F19999A), Float.intBitsToFloat(0x3F333333), + Float.intBitsToFloat(0x3F4CCCCD), Float.intBitsToFloat(0x3F666667), + Float.intBitsToFloat(0x3F800000), Float.intBitsToFloat(0x3F8CCCCD), + Float.intBitsToFloat(0x3F99999A), Float.intBitsToFloat(0x3FA66667), + Float.intBitsToFloat(0x3FB33333), Float.intBitsToFloat(0x3FC00000) + }; + + // FLOAT_DECIMAL_COMPRESSED (28 bytes) + private static final byte[] FLOAT_DECIMAL_COMPRESSED = { + (byte) 0x00, (byte) 0x00, (byte) 0x0A, (byte) 0x10, (byte) 0x00, (byte) 0x00, (byte) 0x00, + (byte) 0x04, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x07, (byte) 0x06, (byte) 0x00, + (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x04, (byte) 0x10, + (byte) 0x32, (byte) 0x54, (byte) 0x76, (byte) 0x98, (byte) 0xBA, (byte) 0xDC, (byte) 0xFE + }; + + // === Float integer values (16 elements) === + private static final float[] FLOAT_INTEGER_INPUT = { + Float.intBitsToFloat(0x00000000), Float.intBitsToFloat(0x41200000), + Float.intBitsToFloat(0x41A00000), Float.intBitsToFloat(0x41F00000), + Float.intBitsToFloat(0x42200000), Float.intBitsToFloat(0x42480000), + Float.intBitsToFloat(0x42700000), Float.intBitsToFloat(0x428C0000), + Float.intBitsToFloat(0x42A00000), Float.intBitsToFloat(0x42B40000), + Float.intBitsToFloat(0x42C80000), Float.intBitsToFloat(0x42DC0000), + 
Float.intBitsToFloat(0x42F00000), Float.intBitsToFloat(0x43020000), + Float.intBitsToFloat(0x430C0000), Float.intBitsToFloat(0x43160000) + }; + + // FLOAT_INTEGER_COMPRESSED (36 bytes) + private static final byte[] FLOAT_INTEGER_COMPRESSED = { + (byte) 0x00, (byte) 0x00, (byte) 0x0A, (byte) 0x10, (byte) 0x00, (byte) 0x00, (byte) 0x00, + (byte) 0x04, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x07, (byte) 0x07, (byte) 0x00, + (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x08, (byte) 0x00, + (byte) 0x0A, (byte) 0x14, (byte) 0x1E, (byte) 0x28, (byte) 0x32, (byte) 0x3C, (byte) 0x46, + (byte) 0x50, (byte) 0x5A, (byte) 0x64, (byte) 0x6E, (byte) 0x78, (byte) 0x82, (byte) 0x8C, + (byte) 0x96 + }; + + // === Float with special values (16 elements) === + private static final float[] FLOAT_SPECIAL_INPUT = { + Float.intBitsToFloat(0x00000000), Float.intBitsToFloat(0x3FC00000), + Float.intBitsToFloat(0x40400000), Float.intBitsToFloat(0x7FC00000), + Float.intBitsToFloat(0x40C00000), Float.intBitsToFloat(0x40F00000), + Float.intBitsToFloat(0x41100000), Float.intBitsToFloat(0x7F800000), + Float.intBitsToFloat(0x41400000), Float.intBitsToFloat(0x41580000), + Float.intBitsToFloat(0x41700000), Float.intBitsToFloat(0xFF800000), + Float.intBitsToFloat(0x41900000), Float.intBitsToFloat(0x419C0000), + Float.intBitsToFloat(0x80000000), Float.intBitsToFloat(0x41B40000) + }; + + // FLOAT_SPECIAL_COMPRESSED (60 bytes) + private static final byte[] FLOAT_SPECIAL_COMPRESSED = { + (byte) 0x00, (byte) 0x00, (byte) 0x0A, (byte) 0x10, (byte) 0x00, (byte) 0x00, (byte) 0x00, + (byte) 0x04, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x07, (byte) 0x06, (byte) 0x04, + (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x08, (byte) 0x00, + (byte) 0x0F, (byte) 0x1E, (byte) 0x00, (byte) 0x3C, (byte) 0x4B, (byte) 0x5A, (byte) 0x00, + (byte) 0x78, (byte) 0x87, (byte) 0x96, (byte) 0x00, (byte) 0xB4, (byte) 0xC3, (byte) 0x00, + (byte) 0xE1, (byte) 0x03, 
(byte) 0x00, (byte) 0x07, (byte) 0x00, (byte) 0x0B, (byte) 0x00, + (byte) 0x0E, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0xC0, (byte) 0x7F, (byte) 0x00, + (byte) 0x00, (byte) 0x80, (byte) 0x7F, (byte) 0x00, (byte) 0x00, (byte) 0x80, (byte) 0xFF, + (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x80 + }; + + // === Double decimal values (16 elements) === + private static final double[] DOUBLE_DECIMAL_INPUT = { + Double.longBitsToDouble(0x0000000000000000L), + Double.longBitsToDouble(0x3F847AE147AE147BL), + Double.longBitsToDouble(0x3F947AE147AE147BL), + Double.longBitsToDouble(0x3F9EB851EB851EB8L), + Double.longBitsToDouble(0x3FA47AE147AE147BL), + Double.longBitsToDouble(0x3FA999999999999AL), + Double.longBitsToDouble(0x3FAEB851EB851EB8L), + Double.longBitsToDouble(0x3FB1EB851EB851ECL), + Double.longBitsToDouble(0x3FB47AE147AE147BL), + Double.longBitsToDouble(0x3FB70A3D70A3D70AL), + Double.longBitsToDouble(0x3FB999999999999AL), + Double.longBitsToDouble(0x3FBC28F5C28F5C29L), + Double.longBitsToDouble(0x3FBEB851EB851EB8L), + Double.longBitsToDouble(0x3FC0A3D70A3D70A4L), + Double.longBitsToDouble(0x3FC1EB851EB851ECL), + Double.longBitsToDouble(0x3FC3333333333333L) + }; + + // DOUBLE_DECIMAL_COMPRESSED (32 bytes) + private static final byte[] DOUBLE_DECIMAL_COMPRESSED = { + (byte) 0x00, (byte) 0x00, (byte) 0x0A, (byte) 0x10, (byte) 0x00, (byte) 0x00, (byte) 0x00, + (byte) 0x04, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x0E, (byte) 0x0C, (byte) 0x00, + (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, + (byte) 0x00, (byte) 0x00, (byte) 0x04, (byte) 0x10, (byte) 0x32, (byte) 0x54, (byte) 0x76, + (byte) 0x98, (byte) 0xBA, (byte) 0xDC, (byte) 0xFE + }; + + // === Double integer values (16 elements) === + private static final double[] DOUBLE_INTEGER_INPUT = { + Double.longBitsToDouble(0x0000000000000000L), + Double.longBitsToDouble(0x4024000000000000L), + Double.longBitsToDouble(0x4034000000000000L), + 
Double.longBitsToDouble(0x403E000000000000L), + Double.longBitsToDouble(0x4044000000000000L), + Double.longBitsToDouble(0x4049000000000000L), + Double.longBitsToDouble(0x404E000000000000L), + Double.longBitsToDouble(0x4051800000000000L), + Double.longBitsToDouble(0x4054000000000000L), + Double.longBitsToDouble(0x4056800000000000L), + Double.longBitsToDouble(0x4059000000000000L), + Double.longBitsToDouble(0x405B800000000000L), + Double.longBitsToDouble(0x405E000000000000L), + Double.longBitsToDouble(0x4060400000000000L), + Double.longBitsToDouble(0x4061800000000000L), + Double.longBitsToDouble(0x4062C00000000000L) + }; + + // DOUBLE_INTEGER_COMPRESSED (40 bytes) + private static final byte[] DOUBLE_INTEGER_COMPRESSED = { + (byte) 0x00, (byte) 0x00, (byte) 0x0A, (byte) 0x10, (byte) 0x00, (byte) 0x00, (byte) 0x00, + (byte) 0x04, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x10, (byte) 0x10, (byte) 0x00, + (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, + (byte) 0x00, (byte) 0x00, (byte) 0x08, (byte) 0x00, (byte) 0x0A, (byte) 0x14, (byte) 0x1E, + (byte) 0x28, (byte) 0x32, (byte) 0x3C, (byte) 0x46, (byte) 0x50, (byte) 0x5A, (byte) 0x64, + (byte) 0x6E, (byte) 0x78, (byte) 0x82, (byte) 0x8C, (byte) 0x96 + }; + + // === Float constant (16 elements) === + private static final float[] FLOAT_CONSTANT_INPUT = { + Float.intBitsToFloat(0x4048F5C3), Float.intBitsToFloat(0x4048F5C3), + Float.intBitsToFloat(0x4048F5C3), Float.intBitsToFloat(0x4048F5C3), + Float.intBitsToFloat(0x4048F5C3), Float.intBitsToFloat(0x4048F5C3), + Float.intBitsToFloat(0x4048F5C3), Float.intBitsToFloat(0x4048F5C3), + Float.intBitsToFloat(0x4048F5C3), Float.intBitsToFloat(0x4048F5C3), + Float.intBitsToFloat(0x4048F5C3), Float.intBitsToFloat(0x4048F5C3), + Float.intBitsToFloat(0x4048F5C3), Float.intBitsToFloat(0x4048F5C3), + Float.intBitsToFloat(0x4048F5C3), Float.intBitsToFloat(0x4048F5C3) + }; + + // FLOAT_CONSTANT_COMPRESSED (20 bytes) + private static final 
byte[] FLOAT_CONSTANT_COMPRESSED = { + (byte) 0x00, (byte) 0x00, (byte) 0x0A, (byte) 0x10, (byte) 0x00, (byte) 0x00, (byte) 0x00, + (byte) 0x04, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x0A, (byte) 0x08, (byte) 0x00, + (byte) 0x00, (byte) 0x3A, (byte) 0x01, (byte) 0x00, (byte) 0x00, (byte) 0x00 + }; + + // === Double constant (16 elements) === + private static final double[] DOUBLE_CONSTANT_INPUT = { + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL), + Double.longBitsToDouble(0x40091EB851EB851FL) + }; + + // DOUBLE_CONSTANT_COMPRESSED (24 bytes) + private static final byte[] DOUBLE_CONSTANT_COMPRESSED = { + (byte) 0x00, (byte) 0x00, (byte) 0x0A, (byte) 0x10, (byte) 0x00, (byte) 0x00, (byte) 0x00, + (byte) 0x04, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x12, (byte) 0x10, (byte) 0x00, + (byte) 0x00, (byte) 0x3A, (byte) 0x01, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x00, + (byte) 0x00, (byte) 0x00, (byte) 0x00 + }; + + // ========== Tests: decode C++ blobs, verify output matches input ========== + + @Test + public void testDecodeFloatDecimal() { + float[] output = new float[FLOAT_DECIMAL_INPUT.length]; + AlpWrapper.decodeFloats( + FLOAT_DECIMAL_COMPRESSED, FLOAT_DECIMAL_COMPRESSED.length, + output, FLOAT_DECIMAL_INPUT.length); + assertFloatArrayBitEqual(FLOAT_DECIMAL_INPUT, output); + } + + 
@Test + public void testDecodeFloatInteger() { + float[] output = new float[FLOAT_INTEGER_INPUT.length]; + AlpWrapper.decodeFloats( + FLOAT_INTEGER_COMPRESSED, FLOAT_INTEGER_COMPRESSED.length, + output, FLOAT_INTEGER_INPUT.length); + assertFloatArrayBitEqual(FLOAT_INTEGER_INPUT, output); + } + + @Test + public void testDecodeFloatSpecial() { + float[] output = new float[FLOAT_SPECIAL_INPUT.length]; + AlpWrapper.decodeFloats( + FLOAT_SPECIAL_COMPRESSED, FLOAT_SPECIAL_COMPRESSED.length, + output, FLOAT_SPECIAL_INPUT.length); + assertFloatArrayBitEqual(FLOAT_SPECIAL_INPUT, output); + } + + @Test + public void testDecodeDoubleDecimal() { + double[] output = new double[DOUBLE_DECIMAL_INPUT.length]; + AlpWrapper.decodeDoubles( + DOUBLE_DECIMAL_COMPRESSED, DOUBLE_DECIMAL_COMPRESSED.length, + output, DOUBLE_DECIMAL_INPUT.length); + assertDoubleArrayBitEqual(DOUBLE_DECIMAL_INPUT, output); + } + + @Test + public void testDecodeDoubleInteger() { + double[] output = new double[DOUBLE_INTEGER_INPUT.length]; + AlpWrapper.decodeDoubles( + DOUBLE_INTEGER_COMPRESSED, DOUBLE_INTEGER_COMPRESSED.length, + output, DOUBLE_INTEGER_INPUT.length); + assertDoubleArrayBitEqual(DOUBLE_INTEGER_INPUT, output); + } + + @Test + public void testDecodeFloatConstant() { + float[] output = new float[FLOAT_CONSTANT_INPUT.length]; + AlpWrapper.decodeFloats( + FLOAT_CONSTANT_COMPRESSED, FLOAT_CONSTANT_COMPRESSED.length, + output, FLOAT_CONSTANT_INPUT.length); + assertFloatArrayBitEqual(FLOAT_CONSTANT_INPUT, output); + } + + @Test + public void testDecodeDoubleConstant() { + double[] output = new double[DOUBLE_CONSTANT_INPUT.length]; + AlpWrapper.decodeDoubles( + DOUBLE_CONSTANT_COMPRESSED, DOUBLE_CONSTANT_COMPRESSED.length, + output, DOUBLE_CONSTANT_INPUT.length); + assertDoubleArrayBitEqual(DOUBLE_CONSTANT_INPUT, output); + } + + // ========== Helpers ========== + + private static void assertFloatArrayBitEqual(float[] expected, float[] actual) { + assertEquals("array length mismatch", expected.length, 
actual.length); + for (int i = 0; i < expected.length; i++) { + assertEquals( + "Mismatch at index " + i + + ": expected " + expected[i] + " (0x" + + Integer.toHexString(Float.floatToRawIntBits(expected[i])) + + "), got " + actual[i] + " (0x" + + Integer.toHexString(Float.floatToRawIntBits(actual[i])) + ")", + Float.floatToRawIntBits(expected[i]), + Float.floatToRawIntBits(actual[i])); + } + } + + private static void assertDoubleArrayBitEqual(double[] expected, double[] actual) { + assertEquals("array length mismatch", expected.length, actual.length); + for (int i = 0; i < expected.length; i++) { + assertEquals( + "Mismatch at index " + i + + ": expected " + expected[i] + " (0x" + + Long.toHexString(Double.doubleToRawLongBits(expected[i])) + + "), got " + actual[i] + " (0x" + + Long.toHexString(Double.doubleToRawLongBits(actual[i])) + ")", + Double.doubleToRawLongBits(expected[i]), + Double.doubleToRawLongBits(actual[i])); + } + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpEncoderDecoderTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpEncoderDecoderTest.java new file mode 100644 index 0000000000..2c2279fdd9 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpEncoderDecoderTest.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.junit.Assert.*; + +import org.junit.Test; + +public class AlpEncoderDecoderTest { + + @Test + public void testFloatRoundTrip() { + float[] testValues = {0.0f, 1.0f, -1.0f, 3.14159f, 100.5f, 0.001f, 1234567.0f}; + for (float value : testValues) { + for (int e = 0; e <= AlpConstants.FLOAT_MAX_EXPONENT; e++) { + for (int f = 0; f <= e; f++) { + if (!AlpEncoderDecoder.isFloatException(value, e, f)) { + int encoded = AlpEncoderDecoder.encodeFloat(value, e, f); + float decoded = AlpEncoderDecoder.decodeFloat(encoded, e, f); + assertEquals(Float.floatToRawIntBits(value), Float.floatToRawIntBits(decoded)); + } + } + } + } + } + + @Test + public void testFloatExceptionDetection() { + assertTrue(AlpEncoderDecoder.isFloatException(Float.NaN)); + assertTrue(AlpEncoderDecoder.isFloatException(Float.POSITIVE_INFINITY)); + assertTrue(AlpEncoderDecoder.isFloatException(Float.NEGATIVE_INFINITY)); + assertTrue(AlpEncoderDecoder.isFloatException(-0.0f)); + assertFalse(AlpEncoderDecoder.isFloatException(1.0f)); + assertFalse(AlpEncoderDecoder.isFloatException(0.0f)); + } + + @Test + public void testFloatEncoding() { + assertEquals(123, AlpEncoderDecoder.encodeFloat(1.23f, 2, 0)); + assertEquals(123, AlpEncoderDecoder.encodeFloat(12.3f, 2, 1)); + assertEquals(0, AlpEncoderDecoder.encodeFloat(0.0f, 5, 0)); + } + + @Test + public void testFastRoundFloat() { + assertEquals(5, AlpEncoderDecoder.fastRoundFloat(5.4f)); + assertEquals(6, AlpEncoderDecoder.fastRoundFloat(5.5f)); + assertEquals(-5, 
AlpEncoderDecoder.fastRoundFloat(-5.4f)); + assertEquals(-6, AlpEncoderDecoder.fastRoundFloat(-5.5f)); + assertEquals(0, AlpEncoderDecoder.fastRoundFloat(0.0f)); + } + + @Test + public void testDoubleRoundTrip() { + double[] testValues = {0.0, 1.0, -1.0, 3.14159265358979, 100.5, 0.001}; + for (double value : testValues) { + for (int e = 0; e <= Math.min(AlpConstants.DOUBLE_MAX_EXPONENT, 10); e++) { + for (int f = 0; f <= e; f++) { + if (!AlpEncoderDecoder.isDoubleException(value, e, f)) { + long encoded = AlpEncoderDecoder.encodeDouble(value, e, f); + double decoded = AlpEncoderDecoder.decodeDouble(encoded, e, f); + assertEquals(Double.doubleToRawLongBits(value), Double.doubleToRawLongBits(decoded)); + } + } + } + } + } + + @Test + public void testDoubleExceptionDetection() { + assertTrue(AlpEncoderDecoder.isDoubleException(Double.NaN)); + assertTrue(AlpEncoderDecoder.isDoubleException(Double.POSITIVE_INFINITY)); + assertTrue(AlpEncoderDecoder.isDoubleException(Double.NEGATIVE_INFINITY)); + assertTrue(AlpEncoderDecoder.isDoubleException(-0.0)); + assertFalse(AlpEncoderDecoder.isDoubleException(1.0)); + assertFalse(AlpEncoderDecoder.isDoubleException(0.0)); + } + + @Test + public void testBitWidthForInt() { + assertEquals(0, AlpEncoderDecoder.bitWidthForInt(0)); + assertEquals(1, AlpEncoderDecoder.bitWidthForInt(1)); + assertEquals(8, AlpEncoderDecoder.bitWidthForInt(255)); + assertEquals(9, AlpEncoderDecoder.bitWidthForInt(256)); + assertEquals(31, AlpEncoderDecoder.bitWidthForInt(Integer.MAX_VALUE)); + } + + @Test + public void testBitWidthForLong() { + assertEquals(0, AlpEncoderDecoder.bitWidthForLong(0L)); + assertEquals(1, AlpEncoderDecoder.bitWidthForLong(1L)); + assertEquals(63, AlpEncoderDecoder.bitWidthForLong(Long.MAX_VALUE)); + } + + @Test + public void testBitPackedSize() { + assertEquals(0, AlpEncoderDecoder.bitPackedSize(1024, 0)); + assertEquals(128, AlpEncoderDecoder.bitPackedSize(1024, 1)); + assertEquals(1024, AlpEncoderDecoder.bitPackedSize(1024, 
8)); + assertEquals(1, AlpEncoderDecoder.bitPackedSize(3, 2)); // ceil(6/8)=1 + } + + @Test + public void testFindBestFloatParams() { + float[] values = {1.23f, 4.56f, 7.89f, 10.11f, 12.13f}; + AlpEncoderDecoder.EncodingParams params = AlpEncoderDecoder.findBestFloatParams(values, 0, values.length); + assertNotNull(params); + assertEquals(0, params.numExceptions); + } + + @Test + public void testFindBestFloatParamsAllExceptions() { + float[] values = {Float.NaN, Float.NaN, Float.NaN}; + AlpEncoderDecoder.EncodingParams params = AlpEncoderDecoder.findBestFloatParams(values, 0, values.length); + assertEquals(values.length, params.numExceptions); + } + + @Test + public void testFindBestDoubleParams() { + double[] values = {1.23, 4.56, 7.89, 10.11, 12.13}; + AlpEncoderDecoder.EncodingParams params = AlpEncoderDecoder.findBestDoubleParams(values, 0, values.length); + assertNotNull(params); + assertEquals(0, params.numExceptions); + } + + @Test + public void testFindBestParamsWithPresets() { + float[] values = {1.23f, 4.56f, 7.89f}; + AlpEncoderDecoder.EncodingParams fullResult = AlpEncoderDecoder.findBestFloatParams(values, 0, values.length); + int[][] presets = {{fullResult.exponent, fullResult.factor}, {0, 0}, {1, 0}}; + AlpEncoderDecoder.EncodingParams presetResult = + AlpEncoderDecoder.findBestFloatParamsWithPresets(values, 0, values.length, presets); + assertTrue(presetResult.numExceptions <= fullResult.numExceptions); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpSamplerTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpSamplerTest.java new file mode 100644 index 0000000000..2373d5ef6b --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpSamplerTest.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.junit.Assert.*; + +import java.util.Random; +import org.junit.Test; + +public class AlpSamplerTest { + + // ========== Float sampler ========== + + @Test + public void testFloatDecimalData() { + // 2-decimal-place data should pick exponent=2, factor=0 + float[] data = new float[5000]; + for (int i = 0; i < data.length; i++) { + data[i] = i * 0.01f; + } + AlpSampler.FloatSampler sampler = new AlpSampler.FloatSampler(); + sampler.addSample(data, data.length); + AlpCompression.AlpEncodingPreset preset = sampler.finalizeSampling(); + + assertNotNull(preset.combinations); + assertTrue(preset.combinations.length >= 1); + assertTrue(preset.combinations.length <= AlpConstants.MAX_COMBINATIONS); + } + + @Test + public void testFloatIntegerData() { + float[] data = new float[5000]; + for (int i = 0; i < data.length; i++) { + data[i] = i; + } + AlpSampler.FloatSampler sampler = new AlpSampler.FloatSampler(); + sampler.addSample(data, data.length); + AlpCompression.AlpEncodingPreset preset = sampler.finalizeSampling(); + + assertNotNull(preset.combinations); + assertTrue(preset.combinations.length >= 1); + // For integer data, any (e,e) combo works since multiplier = 10^e/10^e = 1. 
+ // Tiebreaker prefers bigger exponents, matching C++ behavior. + int bestE = preset.combinations[0][0]; + int bestF = preset.combinations[0][1]; + assertEquals("Integer data: exponent should equal factor", bestE, bestF); + } + + @Test + public void testFloatPresetProducesValidRoundTrip() { + float[] data = new float[5000]; + Random rng = new Random(42); + for (int i = 0; i < data.length; i++) { + data[i] = Math.round(rng.nextFloat() * 10000) / 100.0f; + } + + AlpSampler.FloatSampler sampler = new AlpSampler.FloatSampler(); + sampler.addSample(data, data.length); + AlpCompression.AlpEncodingPreset preset = sampler.finalizeSampling(); + + // Compress and decompress a vector using the preset + int vectorSize = Math.min(1024, data.length); + float[] vector = new float[vectorSize]; + System.arraycopy(data, 0, vector, 0, vectorSize); + + AlpCompression.FloatCompressedVector cv = AlpCompression.compressFloatVector(vector, vectorSize, preset); + float[] output = new float[vectorSize]; + AlpCompression.decompressFloatVector(cv, output); + + for (int i = 0; i < vectorSize; i++) { + assertEquals("Mismatch at " + i, Float.floatToRawIntBits(vector[i]), Float.floatToRawIntBits(output[i])); + } + } + + @Test + public void testFloatSmallDataset() { + // Fewer values than SAMPLER_VECTOR_SIZE + float[] data = {1.1f, 2.2f, 3.3f, 4.4f, 5.5f}; + AlpSampler.FloatSampler sampler = new AlpSampler.FloatSampler(); + sampler.addSample(data, data.length); + AlpCompression.AlpEncodingPreset preset = sampler.finalizeSampling(); + + assertNotNull(preset.combinations); + assertTrue(preset.combinations.length >= 1); + } + + @Test + public void testFloatMultipleSamples() { + AlpSampler.FloatSampler sampler = new AlpSampler.FloatSampler(); + for (int batch = 0; batch < 10; batch++) { + float[] data = new float[1000]; + for (int i = 0; i < 1000; i++) { + data[i] = (batch * 1000 + i) * 0.1f; + } + sampler.addSample(data, data.length); + } + AlpCompression.AlpEncodingPreset preset = 
sampler.finalizeSampling(); + assertNotNull(preset.combinations); + assertTrue(preset.combinations.length >= 1); + } + + // ========== Double sampler ========== + + @Test + public void testDoubleDecimalData() { + double[] data = new double[5000]; + for (int i = 0; i < data.length; i++) { + data[i] = i * 0.01; + } + AlpSampler.DoubleSampler sampler = new AlpSampler.DoubleSampler(); + sampler.addSample(data, data.length); + AlpCompression.AlpEncodingPreset preset = sampler.finalizeSampling(); + + assertNotNull(preset.combinations); + assertTrue(preset.combinations.length >= 1); + assertTrue(preset.combinations.length <= AlpConstants.MAX_COMBINATIONS); + } + + @Test + public void testDoubleIntegerData() { + double[] data = new double[5000]; + for (int i = 0; i < data.length; i++) { + data[i] = i; + } + AlpSampler.DoubleSampler sampler = new AlpSampler.DoubleSampler(); + sampler.addSample(data, data.length); + AlpCompression.AlpEncodingPreset preset = sampler.finalizeSampling(); + + assertNotNull(preset.combinations); + assertTrue(preset.combinations.length >= 1); + int bestE = preset.combinations[0][0]; + int bestF = preset.combinations[0][1]; + assertEquals("Integer data: exponent should equal factor", bestE, bestF); + } + + @Test + public void testDoublePresetProducesValidRoundTrip() { + double[] data = new double[5000]; + Random rng = new Random(42); + for (int i = 0; i < data.length; i++) { + data[i] = Math.round(rng.nextDouble() * 10000) / 100.0; + } + + AlpSampler.DoubleSampler sampler = new AlpSampler.DoubleSampler(); + sampler.addSample(data, data.length); + AlpCompression.AlpEncodingPreset preset = sampler.finalizeSampling(); + + int vectorSize = Math.min(1024, data.length); + double[] vector = new double[vectorSize]; + System.arraycopy(data, 0, vector, 0, vectorSize); + + AlpCompression.DoubleCompressedVector cv = AlpCompression.compressDoubleVector(vector, vectorSize, preset); + double[] output = new double[vectorSize]; + 
AlpCompression.decompressDoubleVector(cv, output); + + for (int i = 0; i < vectorSize; i++) { + assertEquals( + "Mismatch at " + i, Double.doubleToRawLongBits(vector[i]), Double.doubleToRawLongBits(output[i])); + } + } + + @Test + public void testDoubleSmallDataset() { + double[] data = {1.1, 2.2, 3.3, 4.4, 5.5}; + AlpSampler.DoubleSampler sampler = new AlpSampler.DoubleSampler(); + sampler.addSample(data, data.length); + AlpCompression.AlpEncodingPreset preset = sampler.finalizeSampling(); + + assertNotNull(preset.combinations); + assertTrue(preset.combinations.length >= 1); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpValuesReaderTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpValuesReaderTest.java new file mode 100644 index 0000000000..994c496608 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpValuesReaderTest.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.column.values.alp; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Random; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.junit.Test; + +public class AlpValuesReaderTest { + + private static ByteBufferInputStream toInputStream(byte[] data) { + return ByteBufferInputStream.wrap(ByteBuffer.wrap(data)); + } + + // ========== Float writer → reader round-trip ========== + + private static void assertFloatWriterReaderRoundTrip(float[] values) throws IOException { + // Write + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + for (float v : values) { + writer.writeFloat(v); + } + byte[] compressed = writer.getBytes().toByteArray(); + + // Read + AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat(); + reader.initFromPage(values.length, toInputStream(compressed)); + + for (int i = 0; i < values.length; i++) { + float actual = reader.readFloat(); + assertEquals("Mismatch at " + i, Float.floatToRawIntBits(values[i]), Float.floatToRawIntBits(actual)); + } + } + + @Test + public void testFloatReaderSingleVector() throws IOException { + float[] values = new float[100]; + for (int i = 0; i < 100; i++) { + values[i] = i * 0.1f; + } + assertFloatWriterReaderRoundTrip(values); + } + + @Test + public void testFloatReaderMultipleVectors() throws IOException { + float[] values = new float[2500]; + Random rng = new Random(42); + for (int i = 0; i < 2500; i++) { + values[i] = Math.round(rng.nextFloat() * 10000) / 100.0f; + } + assertFloatWriterReaderRoundTrip(values); + } + + @Test + public void testFloatReaderExactVectorSize() throws IOException { + float[] values = new float[AlpConstants.DEFAULT_VECTOR_SIZE]; + for (int i = 0; i < values.length; i++) { + values[i] = i * 0.5f; + } + assertFloatWriterReaderRoundTrip(values); + } + + @Test + public void testFloatReaderSpecialValues() throws IOException { + float[] 
values = { + 1.0f, Float.NaN, 2.0f, Float.POSITIVE_INFINITY, 3.0f, Float.NEGATIVE_INFINITY, 4.0f, -0.0f, 5.0f + }; + assertFloatWriterReaderRoundTrip(values); + } + + // ========== Float skip ========== + + @Test + public void testFloatReaderSkip() throws IOException { + float[] values = new float[50]; + for (int i = 0; i < 50; i++) { + values[i] = i * 0.3f; + } + + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + for (float v : values) { + writer.writeFloat(v); + } + byte[] compressed = writer.getBytes().toByteArray(); + + AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat(); + reader.initFromPage(values.length, toInputStream(compressed)); + + // Skip first 10, read next 5 + reader.skip(10); + for (int i = 10; i < 15; i++) { + assertEquals(Float.floatToRawIntBits(values[i]), Float.floatToRawIntBits(reader.readFloat())); + } + + // Skip 20 more, read next + reader.skip(20); + assertEquals(Float.floatToRawIntBits(values[35]), Float.floatToRawIntBits(reader.readFloat())); + } + + @Test + public void testFloatReaderSkipAcrossVectors() throws IOException { + float[] values = new float[2500]; + for (int i = 0; i < 2500; i++) { + values[i] = i * 0.01f; + } + + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + for (float v : values) { + writer.writeFloat(v); + } + byte[] compressed = writer.getBytes().toByteArray(); + + AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat(); + reader.initFromPage(values.length, toInputStream(compressed)); + + // Skip into second vector + reader.skip(1500); + assertEquals(Float.floatToRawIntBits(values[1500]), Float.floatToRawIntBits(reader.readFloat())); + } + + // ========== Double writer → reader round-trip ========== + + private static void assertDoubleWriterReaderRoundTrip(double[] values) throws IOException { + AlpValuesWriter.DoubleAlpValuesWriter writer = new AlpValuesWriter.DoubleAlpValuesWriter(); + for (double v : values) { + 
writer.writeDouble(v); + } + byte[] compressed = writer.getBytes().toByteArray(); + + AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble(); + reader.initFromPage(values.length, toInputStream(compressed)); + + for (int i = 0; i < values.length; i++) { + double actual = reader.readDouble(); + assertEquals("Mismatch at " + i, Double.doubleToRawLongBits(values[i]), Double.doubleToRawLongBits(actual)); + } + } + + @Test + public void testDoubleReaderSingleVector() throws IOException { + double[] values = new double[100]; + for (int i = 0; i < 100; i++) { + values[i] = i * 0.1; + } + assertDoubleWriterReaderRoundTrip(values); + } + + @Test + public void testDoubleReaderMultipleVectors() throws IOException { + double[] values = new double[2500]; + Random rng = new Random(42); + for (int i = 0; i < 2500; i++) { + values[i] = Math.round(rng.nextDouble() * 10000) / 100.0; + } + assertDoubleWriterReaderRoundTrip(values); + } + + @Test + public void testDoubleReaderSpecialValues() throws IOException { + double[] values = {1.0, Double.NaN, 2.0, Double.POSITIVE_INFINITY, 3.0, Double.NEGATIVE_INFINITY, 4.0, -0.0, 5.0 + }; + assertDoubleWriterReaderRoundTrip(values); + } + + @Test + public void testDoubleReaderSkip() throws IOException { + double[] values = new double[50]; + for (int i = 0; i < 50; i++) { + values[i] = i * 0.3; + } + + AlpValuesWriter.DoubleAlpValuesWriter writer = new AlpValuesWriter.DoubleAlpValuesWriter(); + for (double v : values) { + writer.writeDouble(v); + } + byte[] compressed = writer.getBytes().toByteArray(); + + AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble(); + reader.initFromPage(values.length, toInputStream(compressed)); + + reader.skip(10); + for (int i = 10; i < 15; i++) { + assertEquals(Double.doubleToRawLongBits(values[i]), Double.doubleToRawLongBits(reader.readDouble())); + } + } + + // ========== Partial last vector ========== + + @Test + public void testFloatReaderPartialLastVector() throws IOException { + // 1030 
= 1024 + 6 → 2 vectors, last has 6 elements + float[] values = new float[1030]; + for (int i = 0; i < 1030; i++) { + values[i] = i * 0.1f; + } + assertFloatWriterReaderRoundTrip(values); + } + + @Test + public void testDoubleReaderPartialLastVector() throws IOException { + double[] values = new double[1030]; + for (int i = 0; i < 1030; i++) { + values[i] = i * 0.1; + } + assertDoubleWriterReaderRoundTrip(values); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpValuesWriterTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpValuesWriterTest.java new file mode 100644 index 0000000000..029e8d02b0 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpValuesWriterTest.java @@ -0,0 +1,260 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.column.values.alp; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.util.Random; +import org.apache.parquet.bytes.BytesInput; +import org.junit.Test; + +public class AlpValuesWriterTest { + + // ========== Float writer ========== + + @Test + public void testFloatWriterRoundTrip() throws IOException { + float[] values = new float[100]; + for (int i = 0; i < 100; i++) { + values[i] = i * 0.1f; + } + + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + for (float v : values) { + writer.writeFloat(v); + } + + BytesInput bytes = writer.getBytes(); + byte[] compressed = bytes.toByteArray(); + assertTrue(compressed.length >= AlpConstants.HEADER_SIZE); + + float[] output = new float[values.length]; + AlpWrapper.decodeFloats(compressed, compressed.length, output, values.length); + + for (int i = 0; i < values.length; i++) { + assertEquals("Mismatch at " + i, Float.floatToRawIntBits(values[i]), Float.floatToRawIntBits(output[i])); + } + } + + @Test + public void testFloatWriterMultipleVectors() throws IOException { + // 2500 values = 2 full vectors + 1 partial + float[] values = new float[2500]; + Random rng = new Random(42); + for (int i = 0; i < 2500; i++) { + values[i] = Math.round(rng.nextFloat() * 10000) / 100.0f; + } + + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + for (float v : values) { + writer.writeFloat(v); + } + + byte[] compressed = writer.getBytes().toByteArray(); + + float[] output = new float[values.length]; + AlpWrapper.decodeFloats(compressed, compressed.length, output, values.length); + + for (int i = 0; i < values.length; i++) { + assertEquals("Mismatch at " + i, Float.floatToRawIntBits(values[i]), Float.floatToRawIntBits(output[i])); + } + } + + @Test + public void testFloatWriterExactVectorSize() throws IOException { + float[] values = new float[AlpConstants.DEFAULT_VECTOR_SIZE]; + for (int i = 0; i < 
values.length; i++) { + values[i] = i * 0.5f; + } + + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + for (float v : values) { + writer.writeFloat(v); + } + + byte[] compressed = writer.getBytes().toByteArray(); + + float[] output = new float[values.length]; + AlpWrapper.decodeFloats(compressed, compressed.length, output, values.length); + + for (int i = 0; i < values.length; i++) { + assertEquals(Float.floatToRawIntBits(values[i]), Float.floatToRawIntBits(output[i])); + } + } + + @Test + public void testFloatWriterSpecialValues() throws IOException { + float[] values = { + 1.0f, Float.NaN, 2.0f, Float.POSITIVE_INFINITY, 3.0f, Float.NEGATIVE_INFINITY, 4.0f, -0.0f, 5.0f + }; + + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + for (float v : values) { + writer.writeFloat(v); + } + + byte[] compressed = writer.getBytes().toByteArray(); + + float[] output = new float[values.length]; + AlpWrapper.decodeFloats(compressed, compressed.length, output, values.length); + + for (int i = 0; i < values.length; i++) { + assertEquals("Mismatch at " + i, Float.floatToRawIntBits(values[i]), Float.floatToRawIntBits(output[i])); + } + } + + @Test + public void testFloatWriterEmpty() throws IOException { + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + byte[] compressed = writer.getBytes().toByteArray(); + assertEquals(AlpConstants.HEADER_SIZE, compressed.length); + } + + @Test + public void testFloatWriterReset() throws IOException { + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + for (int i = 0; i < 100; i++) { + writer.writeFloat(i * 0.1f); + } + + byte[] first = writer.getBytes().toByteArray(); + assertTrue(first.length > AlpConstants.HEADER_SIZE); + + writer.reset(); + + // Write different data + for (int i = 0; i < 50; i++) { + writer.writeFloat(i * 2.0f); + } + + byte[] second = 
writer.getBytes().toByteArray(); + assertTrue(second.length > AlpConstants.HEADER_SIZE); + + // Verify second batch round-trips correctly + float[] output = new float[50]; + AlpWrapper.decodeFloats(second, second.length, output, 50); + for (int i = 0; i < 50; i++) { + assertEquals(Float.floatToRawIntBits(i * 2.0f), Float.floatToRawIntBits(output[i])); + } + } + + // ========== Double writer ========== + + @Test + public void testDoubleWriterRoundTrip() throws IOException { + double[] values = new double[100]; + for (int i = 0; i < 100; i++) { + values[i] = i * 0.1; + } + + AlpValuesWriter.DoubleAlpValuesWriter writer = new AlpValuesWriter.DoubleAlpValuesWriter(); + for (double v : values) { + writer.writeDouble(v); + } + + byte[] compressed = writer.getBytes().toByteArray(); + + double[] output = new double[values.length]; + AlpWrapper.decodeDoubles(compressed, compressed.length, output, values.length); + + for (int i = 0; i < values.length; i++) { + assertEquals( + "Mismatch at " + i, Double.doubleToRawLongBits(values[i]), Double.doubleToRawLongBits(output[i])); + } + } + + @Test + public void testDoubleWriterMultipleVectors() throws IOException { + double[] values = new double[2500]; + Random rng = new Random(42); + for (int i = 0; i < 2500; i++) { + values[i] = Math.round(rng.nextDouble() * 10000) / 100.0; + } + + AlpValuesWriter.DoubleAlpValuesWriter writer = new AlpValuesWriter.DoubleAlpValuesWriter(); + for (double v : values) { + writer.writeDouble(v); + } + + byte[] compressed = writer.getBytes().toByteArray(); + + double[] output = new double[values.length]; + AlpWrapper.decodeDoubles(compressed, compressed.length, output, values.length); + + for (int i = 0; i < values.length; i++) { + assertEquals( + "Mismatch at " + i, Double.doubleToRawLongBits(values[i]), Double.doubleToRawLongBits(output[i])); + } + } + + @Test + public void testDoubleWriterSpecialValues() throws IOException { + double[] values = {1.0, Double.NaN, 2.0, Double.POSITIVE_INFINITY, 3.0, 
Double.NEGATIVE_INFINITY, 4.0, -0.0, 5.0 + }; + + AlpValuesWriter.DoubleAlpValuesWriter writer = new AlpValuesWriter.DoubleAlpValuesWriter(); + for (double v : values) { + writer.writeDouble(v); + } + + byte[] compressed = writer.getBytes().toByteArray(); + + double[] output = new double[values.length]; + AlpWrapper.decodeDoubles(compressed, compressed.length, output, values.length); + + for (int i = 0; i < values.length; i++) { + assertEquals( + "Mismatch at " + i, Double.doubleToRawLongBits(values[i]), Double.doubleToRawLongBits(output[i])); + } + } + + @Test + public void testDoubleWriterEmpty() throws IOException { + AlpValuesWriter.DoubleAlpValuesWriter writer = new AlpValuesWriter.DoubleAlpValuesWriter(); + byte[] compressed = writer.getBytes().toByteArray(); + assertEquals(AlpConstants.HEADER_SIZE, compressed.length); + } + + // ========== Buffered size / allocated size ========== + + @Test + public void testBufferedSizeGrowsWithValues() { + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + long initial = writer.getBufferedSize(); + for (int i = 0; i < 10; i++) { + writer.writeFloat(i * 0.1f); + } + assertTrue(writer.getBufferedSize() > initial); + } + + @Test + public void testAllocatedSizeNonNegative() { + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + assertTrue(writer.getAllocatedSize() > 0); + } + + @Test + public void testMemUsageString() { + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + String usage = writer.memUsageString("TEST"); + assertTrue(usage.startsWith("TEST")); + assertTrue(usage.contains("ALP")); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpWrapperTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpWrapperTest.java new file mode 100644 index 0000000000..d6aa034b62 --- /dev/null +++ 
b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/AlpWrapperTest.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp; + +import static org.junit.Assert.*; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Random; +import org.junit.Test; + +public class AlpWrapperTest { + + // ========== Float round-trip ========== + + private static void assertFloatPageRoundTrip(float[] input) { + AlpCompression.AlpEncodingPreset preset = AlpWrapper.createFloatSamplingPreset(input, input.length); + byte[] compressed = new byte[(int) AlpWrapper.maxCompressedSizeFloat(input.length)]; + int compSize = AlpWrapper.encodeFloats(input, input.length, compressed, preset); + assertTrue(compSize > 0); + assertTrue(compSize <= compressed.length); + + float[] output = new float[input.length]; + AlpWrapper.decodeFloats(compressed, compSize, output, input.length); + + for (int i = 0; i < input.length; i++) { + assertEquals( + "Mismatch at index " + i, Float.floatToRawIntBits(input[i]), Float.floatToRawIntBits(output[i])); + } + } + + @Test + public void testFloatSingleVector() { + float[] input = new float[100]; + for (int i = 0; i 
< 100; i++) { + input[i] = i * 0.1f; + } + assertFloatPageRoundTrip(input); + } + + @Test + public void testFloatMultipleVectors() { + // 2500 values = 2 full vectors (1024) + 1 partial (452) + float[] input = new float[2500]; + for (int i = 0; i < 2500; i++) { + input[i] = i * 0.01f; + } + assertFloatPageRoundTrip(input); + } + + @Test + public void testFloatExactVectorSize() { + float[] input = new float[AlpConstants.DEFAULT_VECTOR_SIZE]; + for (int i = 0; i < input.length; i++) { + input[i] = i * 0.5f; + } + assertFloatPageRoundTrip(input); + } + + @Test + public void testFloatExactTwoVectors() { + float[] input = new float[2 * AlpConstants.DEFAULT_VECTOR_SIZE]; + for (int i = 0; i < input.length; i++) { + input[i] = (i % 100) * 0.3f; + } + assertFloatPageRoundTrip(input); + } + + @Test + public void testFloatSpecialValues() { + float[] input = new float[20]; + for (int i = 0; i < 20; i++) { + input[i] = i * 1.5f; + } + input[3] = Float.NaN; + input[7] = Float.POSITIVE_INFINITY; + input[11] = Float.NEGATIVE_INFINITY; + input[15] = -0.0f; + assertFloatPageRoundTrip(input); + } + + @Test + public void testFloatEmptyInput() { + byte[] compressed = new byte[AlpConstants.HEADER_SIZE]; + int compSize = AlpWrapper.encodeFloats( + new float[0], 0, compressed, new AlpCompression.AlpEncodingPreset(new int[][] {{0, 0}})); + assertEquals(AlpConstants.HEADER_SIZE, compSize); + } + + @Test + public void testFloatRandomLargeDataset() { + Random rng = new Random(42); + float[] input = new float[5000]; + for (int i = 0; i < 5000; i++) { + input[i] = Math.round(rng.nextFloat() * 10000) / 100.0f; + } + assertFloatPageRoundTrip(input); + } + + // ========== Double round-trip ========== + + private static void assertDoublePageRoundTrip(double[] input) { + AlpCompression.AlpEncodingPreset preset = AlpWrapper.createDoubleSamplingPreset(input, input.length); + byte[] compressed = new byte[(int) AlpWrapper.maxCompressedSizeDouble(input.length)]; + int compSize = 
AlpWrapper.encodeDoubles(input, input.length, compressed, preset); + assertTrue(compSize > 0); + + double[] output = new double[input.length]; + AlpWrapper.decodeDoubles(compressed, compSize, output, input.length); + + for (int i = 0; i < input.length; i++) { + assertEquals( + "Mismatch at index " + i, + Double.doubleToRawLongBits(input[i]), + Double.doubleToRawLongBits(output[i])); + } + } + + @Test + public void testDoubleSingleVector() { + double[] input = new double[100]; + for (int i = 0; i < 100; i++) { + input[i] = i * 0.1; + } + assertDoublePageRoundTrip(input); + } + + @Test + public void testDoubleMultipleVectors() { + double[] input = new double[2500]; + for (int i = 0; i < 2500; i++) { + input[i] = i * 0.01; + } + assertDoublePageRoundTrip(input); + } + + @Test + public void testDoubleSpecialValues() { + double[] input = new double[20]; + for (int i = 0; i < 20; i++) { + input[i] = i * 1.5; + } + input[3] = Double.NaN; + input[7] = Double.POSITIVE_INFINITY; + input[11] = Double.NEGATIVE_INFINITY; + input[15] = -0.0; + assertDoublePageRoundTrip(input); + } + + @Test + public void testDoubleRandomLargeDataset() { + Random rng = new Random(42); + double[] input = new double[5000]; + for (int i = 0; i < 5000; i++) { + input[i] = Math.round(rng.nextDouble() * 10000) / 100.0; + } + assertDoublePageRoundTrip(input); + } + + // ========== Wire format verification ========== + + @Test + public void testHeaderFormat() { + float[] input = {1.0f, 2.0f, 3.0f}; + AlpCompression.AlpEncodingPreset preset = AlpWrapper.createFloatSamplingPreset(input, input.length); + byte[] compressed = new byte[(int) AlpWrapper.maxCompressedSizeFloat(input.length)]; + int compSize = AlpWrapper.encodeFloats(input, input.length, compressed, preset); + + // Verify 7-byte header + ByteBuffer header = + ByteBuffer.wrap(compressed, 0, AlpConstants.HEADER_SIZE).order(ByteOrder.LITTLE_ENDIAN); + assertEquals(AlpConstants.COMPRESSION_MODE_ALP, header.get() & 0xFF); + 
assertEquals(AlpConstants.INTEGER_ENCODING_FOR, header.get() & 0xFF); + assertEquals(AlpConstants.DEFAULT_VECTOR_SIZE_LOG, header.get() & 0xFF); + assertEquals(3, header.getInt()); // num_elements + } + + @Test + public void testOffsetLayout() { + // 2048 elements = 2 vectors + float[] input = new float[2048]; + for (int i = 0; i < 2048; i++) { + input[i] = i * 0.5f; + } + AlpCompression.AlpEncodingPreset preset = AlpWrapper.createFloatSamplingPreset(input, input.length); + byte[] compressed = new byte[(int) AlpWrapper.maxCompressedSizeFloat(input.length)]; + AlpWrapper.encodeFloats(input, input.length, compressed, preset); + + // After header (7B), offsets section should have 2 int offsets (8 bytes) + ByteBuffer body = ByteBuffer.wrap( + compressed, AlpConstants.HEADER_SIZE, compressed.length - AlpConstants.HEADER_SIZE) + .order(ByteOrder.LITTLE_ENDIAN); + int offset0 = body.getInt(); + int offset1 = body.getInt(); + + // First vector starts right after offsets (2 * 4 = 8) + assertEquals(8, offset0); + // Second vector starts after first vector's data + assertTrue(offset1 > offset0); + } + + // ========== Max compressed size ========== + + @Test + public void testMaxCompressedSize() { + assertTrue(AlpWrapper.maxCompressedSizeFloat(0) >= AlpConstants.HEADER_SIZE); + assertTrue(AlpWrapper.maxCompressedSizeFloat(1024) > AlpConstants.HEADER_SIZE); + assertTrue(AlpWrapper.maxCompressedSizeDouble(0) >= AlpConstants.HEADER_SIZE); + assertTrue(AlpWrapper.maxCompressedSizeDouble(1024) > AlpConstants.HEADER_SIZE); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/benchmark/AlpCodecThroughput.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/benchmark/AlpCodecThroughput.java new file mode 100644 index 0000000000..a64051a6c1 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/benchmark/AlpCodecThroughput.java @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.alp.benchmark; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.zip.GZIPInputStream; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.values.alp.AlpValuesReaderForDouble; +import org.apache.parquet.column.values.alp.AlpValuesReaderForFloat; +import org.apache.parquet.column.values.alp.AlpValuesWriter; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Codec-level ALP throughput benchmark using real Spotify dataset columns. + * + *

Comparable to C++ encoding_alp_benchmark.cc. Measures encode and decode + * throughput at the codec level (no Parquet pipeline overhead). Uses the same + * Spotify audio features dataset as the C++ benchmark for direct comparison. + * + *

The CSV source has 15K rows per column; values are tiled to 1M for stable + * measurement. + */ +public class AlpCodecThroughput { + + private static final int TARGET_VALUES = 1_000_000; + private static final int WARMUP = 10; + private static final int MEASURED = 30; + + private static final String CSV_DIR = "parquet-hadoop/src/test/resources"; + private static final String DOUBLE_CSV = "alp_spotify1_expect.csv.gz"; + private static final String FLOAT_CSV = "alp_float_spotify1_expect.csv.gz"; + + // Spotify column names matching C++ benchmark + private static final String[] COLUMNS = { + "valence", "danceability", "energy", "loudness", "speechiness", + "acousticness", "instrumentalness", "liveness", "tempo" + }; + + private static double[][] doubleColumns; + private static float[][] floatColumns; + private static byte[][] doubleCompressed; + private static byte[][] floatCompressed; + + @BeforeClass + public static void setup() throws IOException { + Path csvDir = findCsvDir(); + + double[][] rawDoubles = loadDoubleCsv(csvDir.resolve(DOUBLE_CSV)); + doubleColumns = new double[rawDoubles.length][]; + doubleCompressed = new byte[rawDoubles.length][]; + for (int c = 0; c < rawDoubles.length; c++) { + doubleColumns[c] = tile(rawDoubles[c], TARGET_VALUES); + doubleCompressed[c] = compressDoubles(doubleColumns[c]); + } + + float[][] rawFloats = loadFloatCsv(csvDir.resolve(FLOAT_CSV)); + floatColumns = new float[rawFloats.length][]; + floatCompressed = new byte[rawFloats.length][]; + for (int c = 0; c < rawFloats.length; c++) { + floatColumns[c] = tile(rawFloats[c], TARGET_VALUES); + floatCompressed[c] = compressFloats(floatColumns[c]); + } + } + + @Test + public void measureThroughput() throws IOException { + System.out.println(); + System.out.printf("=== ALP Codec-Level Throughput (%dK values, Spotify dataset) ===%n", + TARGET_VALUES / 1000); + System.out.println(); + + // Double columns + System.out.printf("%-30s %10s %10s %10s %10s%n", + "Double Column", "Enc MB/s", 
"Dec MB/s", "Raw KB", "Comp KB"); + System.out.println("------------------------------" + + " ---------- ---------- ---------- ----------"); + for (int c = 0; c < doubleColumns.length; c++) { + benchDouble(COLUMNS[c], doubleColumns[c], doubleCompressed[c]); + } + + System.out.println(); + + // Float columns + System.out.printf("%-30s %10s %10s %10s %10s%n", + "Float Column", "Enc MB/s", "Dec MB/s", "Raw KB", "Comp KB"); + System.out.println("------------------------------" + + " ---------- ---------- ---------- ----------"); + for (int c = 0; c < floatColumns.length; c++) { + benchFloat(COLUMNS[c], floatColumns[c], floatCompressed[c]); + } + + System.out.println(); + } + + // ========== CSV loading ========== + + /** + * Find the CSV directory. Searches from the working directory upward for the + * parquet-hadoop test resources directory, so the benchmark works whether run + * from the project root or from parquet-column/. + */ + private static Path findCsvDir() throws IOException { + Path dir = Paths.get("").toAbsolutePath(); + for (int i = 0; i < 3; i++) { + Path candidate = dir.resolve(CSV_DIR); + if (Files.isDirectory(candidate) && Files.exists(candidate.resolve(DOUBLE_CSV))) { + return candidate; + } + dir = dir.getParent(); + if (dir == null) break; + } + throw new IOException("Cannot find CSV directory '" + CSV_DIR + + "'. 
Run from the parquet-java project root."); + } + + private static double[][] loadDoubleCsv(Path csvPath) throws IOException { + try (InputStream is = new GZIPInputStream(new FileInputStream(csvPath.toFile()))) { + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + String header = br.readLine(); + int numCols = header.split(",").length; + + List rows = new ArrayList<>(); + String line; + while ((line = br.readLine()) != null) { + String[] parts = line.split(","); + double[] row = new double[numCols]; + for (int i = 0; i < numCols; i++) { + row[i] = Double.parseDouble(parts[i]); + } + rows.add(row); + } + + double[][] columns = new double[numCols][rows.size()]; + for (int r = 0; r < rows.size(); r++) { + double[] row = rows.get(r); + for (int c = 0; c < numCols; c++) { + columns[c][r] = row[c]; + } + } + return columns; + } + } + + private static float[][] loadFloatCsv(Path csvPath) throws IOException { + try (InputStream is = new GZIPInputStream(new FileInputStream(csvPath.toFile()))) { + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + String header = br.readLine(); + int numCols = header.split(",").length; + + List rows = new ArrayList<>(); + String line; + while ((line = br.readLine()) != null) { + String[] parts = line.split(","); + float[] row = new float[numCols]; + for (int i = 0; i < numCols; i++) { + row[i] = Float.parseFloat(parts[i]); + } + rows.add(row); + } + + float[][] columns = new float[numCols][rows.size()]; + for (int r = 0; r < rows.size(); r++) { + float[] row = rows.get(r); + for (int c = 0; c < numCols; c++) { + columns[c][r] = row[c]; + } + } + return columns; + } + } + + // ========== Tiling ========== + + private static double[] tile(double[] source, int targetSize) { + double[] result = new double[targetSize]; + for (int i = 0; i < targetSize; i++) { + result[i] = source[i % source.length]; + } + return result; + } + + private static float[] tile(float[] source, int targetSize) { + float[] result = new 
float[targetSize]; + for (int i = 0; i < targetSize; i++) { + result[i] = source[i % source.length]; + } + return result; + } + + // ========== Benchmark methods ========== + + private void benchDouble(String name, double[] data, byte[] compressed) throws IOException { + long rawBytes = (long) data.length * Double.BYTES; + + for (int i = 0; i < WARMUP; i++) { + compressDoubles(data); + } + long encNanos = 0; + for (int i = 0; i < MEASURED; i++) { + long t0 = System.nanoTime(); + compressDoubles(data); + encNanos += System.nanoTime() - t0; + } + + for (int i = 0; i < WARMUP; i++) { + decompressDoubles(compressed, data.length); + } + long decNanos = 0; + for (int i = 0; i < MEASURED; i++) { + long t0 = System.nanoTime(); + decompressDoubles(compressed, data.length); + decNanos += System.nanoTime() - t0; + } + + double encMBps = (rawBytes * MEASURED / (encNanos / 1e9)) / (1024.0 * 1024.0); + double decMBps = (rawBytes * MEASURED / (decNanos / 1e9)) / (1024.0 * 1024.0); + + System.out.printf("%-30s %10.1f %10.1f %10d %10d%n", + name, encMBps, decMBps, rawBytes / 1024, compressed.length / 1024); + } + + private void benchFloat(String name, float[] data, byte[] compressed) throws IOException { + long rawBytes = (long) data.length * Float.BYTES; + + for (int i = 0; i < WARMUP; i++) { + compressFloats(data); + } + long encNanos = 0; + for (int i = 0; i < MEASURED; i++) { + long t0 = System.nanoTime(); + compressFloats(data); + encNanos += System.nanoTime() - t0; + } + + for (int i = 0; i < WARMUP; i++) { + decompressFloats(compressed, data.length); + } + long decNanos = 0; + for (int i = 0; i < MEASURED; i++) { + long t0 = System.nanoTime(); + decompressFloats(compressed, data.length); + decNanos += System.nanoTime() - t0; + } + + double encMBps = (rawBytes * MEASURED / (encNanos / 1e9)) / (1024.0 * 1024.0); + double decMBps = (rawBytes * MEASURED / (decNanos / 1e9)) / (1024.0 * 1024.0); + + System.out.printf("%-30s %10.1f %10.1f %10d %10d%n", + name, encMBps, decMBps, 
rawBytes / 1024, compressed.length / 1024); + } + + // ========== Compress / Decompress ========== + + private static byte[] compressDoubles(double[] values) throws IOException { + AlpValuesWriter.DoubleAlpValuesWriter writer = new AlpValuesWriter.DoubleAlpValuesWriter(); + for (double v : values) { + writer.writeDouble(v); + } + return writer.getBytes().toByteArray(); + } + + private static byte[] compressFloats(float[] values) throws IOException { + AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter(); + for (float v : values) { + writer.writeFloat(v); + } + return writer.getBytes().toByteArray(); + } + + private static void decompressDoubles(byte[] compressed, int numValues) throws IOException { + AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble(); + reader.initFromPage(numValues, ByteBufferInputStream.wrap(ByteBuffer.wrap(compressed))); + for (int i = 0; i < numValues; i++) { + reader.readDouble(); + } + } + + private static void decompressFloats(byte[] compressed, int numValues) throws IOException { + AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat(); + reader.initFromPage(numValues, ByteBufferInputStream.wrap(ByteBuffer.wrap(compressed))); + for (int i = 0; i < numValues; i++) { + reader.readFloat(); + } + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/alp/benchmark/AlpEncodingBenchmark.java b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/benchmark/AlpEncodingBenchmark.java new file mode 100644 index 0000000000..c0d655d894 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/alp/benchmark/AlpEncodingBenchmark.java @@ -0,0 +1,321 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.column.values.alp.benchmark;

import com.carrotsearch.junitbenchmarks.BenchmarkOptions;
import com.carrotsearch.junitbenchmarks.BenchmarkRule;
import com.carrotsearch.junitbenchmarks.annotation.AxisRange;
import com.carrotsearch.junitbenchmarks.annotation.BenchmarkMethodChart;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Random;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.column.values.alp.AlpValuesReaderForDouble;
import org.apache.parquet.column.values.alp.AlpValuesReaderForFloat;
import org.apache.parquet.column.values.alp.AlpValuesWriter;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;

/**
 * Benchmark for ALP (Adaptive Lossless floating-Point) encoding.
 *
 * <p>Measures encode and decode throughput for float and double values across
 * multiple data patterns: decimal, integer, constant, and mixed with special
 * values. Also reports compressed size for compression ratio analysis.
 *
 * <p>Mirrors the C++ parquet-encoding-alp-benchmark for cross-language
 * performance comparison.
 */
@AxisRange(min = 0, max = 1)
@BenchmarkMethodChart(filePrefix = "benchmark-alp-encoding")
public class AlpEncodingBenchmark {

  private static final int NUM_VALUES = 50_000; // matching C++ benchmark element count

  @Rule
  public org.junit.rules.TestRule benchmarkRun = new BenchmarkRule();

  // ========== Float data & compressed blobs ==========
  private static float[] floatDecimalData;
  private static float[] floatIntegerData;
  private static float[] floatConstantData;
  private static float[] floatMixedData;

  private static byte[] floatDecimalCompressed;
  private static byte[] floatIntegerCompressed;
  private static byte[] floatConstantCompressed;
  private static byte[] floatMixedCompressed;

  // ========== Double data & compressed blobs ==========
  private static double[] doubleDecimalData;
  private static double[] doubleIntegerData;
  private static double[] doubleConstantData;
  private static double[] doubleMixedData;

  private static byte[] doubleDecimalCompressed;
  private static byte[] doubleIntegerCompressed;
  private static byte[] doubleConstantCompressed;
  private static byte[] doubleMixedCompressed;

  /**
   * Builds every dataset (fixed seed, so results are reproducible run to run),
   * pre-compresses each one once for the decode benchmarks, and prints the
   * compression ratios. The fill order of the loops matches the original code
   * exactly so the RNG stream — and therefore the datasets — are unchanged.
   */
  @BeforeClass
  public static void prepare() throws IOException {
    Random rng = new Random(42);

    // --- Float datasets ---
    floatDecimalData = new float[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      // 2 decimal digits of precision — the sweet spot for ALP's decimal encoding
      floatDecimalData[i] = Math.round(rng.nextFloat() * 10000) / 100.0f;
    }

    floatIntegerData = new float[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      floatIntegerData[i] = (float) (rng.nextInt(100000));
    }

    floatConstantData = new float[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      floatConstantData[i] = 3.14f;
    }

    floatMixedData = new float[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      floatMixedData[i] = Math.round(rng.nextFloat() * 10000) / 100.0f;
    }
    injectSpecialValues(floatMixedData);

    // --- Double datasets ---
    doubleDecimalData = new double[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      doubleDecimalData[i] = Math.round(rng.nextDouble() * 10000) / 100.0;
    }

    doubleIntegerData = new double[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      doubleIntegerData[i] = (double) (rng.nextInt(100000));
    }

    doubleConstantData = new double[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      doubleConstantData[i] = 3.14;
    }

    doubleMixedData = new double[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      doubleMixedData[i] = Math.round(rng.nextDouble() * 10000) / 100.0;
    }
    injectSpecialValues(doubleMixedData);

    // --- Pre-compress all datasets ---
    floatDecimalCompressed = compressFloats(floatDecimalData);
    floatIntegerCompressed = compressFloats(floatIntegerData);
    floatConstantCompressed = compressFloats(floatConstantData);
    floatMixedCompressed = compressFloats(floatMixedData);

    doubleDecimalCompressed = compressDoubles(doubleDecimalData);
    doubleIntegerCompressed = compressDoubles(doubleIntegerData);
    doubleConstantCompressed = compressDoubles(doubleConstantData);
    doubleMixedCompressed = compressDoubles(doubleMixedData);

    // --- Print compression ratios ---
    System.out.println("=== ALP Compression Ratios ===");
    printRatio("Float decimal", floatDecimalCompressed.length, (long) NUM_VALUES * Float.BYTES);
    printRatio("Float integer", floatIntegerCompressed.length, (long) NUM_VALUES * Float.BYTES);
    printRatio("Float constant", floatConstantCompressed.length, (long) NUM_VALUES * Float.BYTES);
    printRatio("Float mixed", floatMixedCompressed.length, (long) NUM_VALUES * Float.BYTES);
    printRatio("Double decimal", doubleDecimalCompressed.length, (long) NUM_VALUES * Double.BYTES);
    printRatio("Double integer", doubleIntegerCompressed.length, (long) NUM_VALUES * Double.BYTES);
    printRatio("Double constant", doubleConstantCompressed.length, (long) NUM_VALUES * Double.BYTES);
    printRatio("Double mixed", doubleMixedCompressed.length, (long) NUM_VALUES * Double.BYTES);
  }

  /**
   * Overwrites every 50th element (2% of values) with one of NaN / +Inf / -Inf / -0.0
   * in rotation, exercising ALP's exception path for non-encodable values.
   */
  private static void injectSpecialValues(float[] data) {
    for (int i = 0; i < data.length; i += 50) {
      switch (i % 200) {
        case 0:
          data[i] = Float.NaN;
          break;
        case 50:
          data[i] = Float.POSITIVE_INFINITY;
          break;
        case 100:
          data[i] = Float.NEGATIVE_INFINITY;
          break;
        case 150:
          data[i] = -0.0f;
          break;
        default:
          break; // unreachable for step-50 indices; kept for switch hygiene
      }
    }
  }

  /** Double twin of {@link #injectSpecialValues(float[])}; same 2% rotation. */
  private static void injectSpecialValues(double[] data) {
    for (int i = 0; i < data.length; i += 50) {
      switch (i % 200) {
        case 0:
          data[i] = Double.NaN;
          break;
        case 50:
          data[i] = Double.POSITIVE_INFINITY;
          break;
        case 100:
          data[i] = Double.NEGATIVE_INFINITY;
          break;
        case 150:
          data[i] = -0.0;
          break;
        default:
          break;
      }
    }
  }

  /**
   * Prints one compression-ratio row. Sizes are {@code long} so the method stays
   * correct if NUM_VALUES is ever raised past the int-multiplication range.
   */
  private static void printRatio(String label, long compressedSize, long rawSize) {
    double ratio = 100.0 * compressedSize / rawSize;
    System.out.printf("  %-20s: %6d / %6d bytes = %5.1f%%%n", label, compressedSize, rawSize, ratio);
  }

  // ========== Float encode benchmarks ==========

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void encodeFloatDecimal() throws IOException {
    compressFloats(floatDecimalData);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void encodeFloatInteger() throws IOException {
    compressFloats(floatIntegerData);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void encodeFloatConstant() throws IOException {
    compressFloats(floatConstantData);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void encodeFloatMixed() throws IOException {
    compressFloats(floatMixedData);
  }

  // ========== Float decode benchmarks ==========

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void decodeFloatDecimal() throws IOException {
    decompressFloats(floatDecimalCompressed, NUM_VALUES);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void decodeFloatInteger() throws IOException {
    decompressFloats(floatIntegerCompressed, NUM_VALUES);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void decodeFloatConstant() throws IOException {
    decompressFloats(floatConstantCompressed, NUM_VALUES);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void decodeFloatMixed() throws IOException {
    decompressFloats(floatMixedCompressed, NUM_VALUES);
  }

  // ========== Double encode benchmarks ==========

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void encodeDoubleDecimal() throws IOException {
    compressDoubles(doubleDecimalData);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void encodeDoubleInteger() throws IOException {
    compressDoubles(doubleIntegerData);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void encodeDoubleConstant() throws IOException {
    compressDoubles(doubleConstantData);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void encodeDoubleMixed() throws IOException {
    compressDoubles(doubleMixedData);
  }

  // ========== Double decode benchmarks ==========

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void decodeDoubleDecimal() throws IOException {
    decompressDoubles(doubleDecimalCompressed, NUM_VALUES);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void decodeDoubleInteger() throws IOException {
    decompressDoubles(doubleIntegerCompressed, NUM_VALUES);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void decodeDoubleConstant() throws IOException {
    decompressDoubles(doubleConstantCompressed, NUM_VALUES);
  }

  @BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
  @Test
  public void decodeDoubleMixed() throws IOException {
    decompressDoubles(doubleMixedCompressed, NUM_VALUES);
  }

  // ========== Helpers ==========

  /** Encodes all values with the ALP float writer; returns the compressed page bytes. */
  private static byte[] compressFloats(float[] values) throws IOException {
    AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter();
    for (float v : values) {
      writer.writeFloat(v);
    }
    return writer.getBytes().toByteArray();
  }

  /** Encodes all values with the ALP double writer; returns the compressed page bytes. */
  private static byte[] compressDoubles(double[] values) throws IOException {
    AlpValuesWriter.DoubleAlpValuesWriter writer = new AlpValuesWriter.DoubleAlpValuesWriter();
    for (double v : values) {
      writer.writeDouble(v);
    }
    return writer.getBytes().toByteArray();
  }

  /** Decodes {@code numValues} floats from an ALP page, discarding values (timing only). */
  private static void decompressFloats(byte[] compressed, int numValues) throws IOException {
    AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
    reader.initFromPage(numValues, ByteBufferInputStream.wrap(ByteBuffer.wrap(compressed)));
    for (int i = 0; i < numValues; i++) {
      reader.readFloat();
    }
  }

  /** Decodes {@code numValues} doubles from an ALP page, discarding values (timing only). */
  private static void decompressDoubles(byte[] compressed, int numValues) throws IOException {
    AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
    reader.initFromPage(numValues, ByteBufferInputStream.wrap(ByteBuffer.wrap(compressed)));
    for (int i = 0; i < numValues; i++) {
      reader.readDouble();
    }
  }
}
* diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/AlpDecompressionThroughput.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/AlpDecompressionThroughput.java new file mode 100644 index 0000000000..ae5b62ca17 --- /dev/null +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/AlpDecompressionThroughput.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.hadoop; + +import java.io.IOException; +import java.net.URISyntaxException; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.junit.Test; + +/** + * Measures ALP decompression throughput in bytes/second for real-world datasets. + * Reports raw (uncompressed) bytes/second and compressed bytes/second. 
+ */ +public class AlpDecompressionThroughput { + + private static final int WARMUP_ITERS = 5; + private static final int MEASURED_ITERS = 20; + + private static Path resourcePath(String name) { + try { + return new Path(AlpDecompressionThroughput.class.getResource("/" + name).toURI()); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + } + + @Test + public void measureDecompressionThroughput() throws IOException { + System.out.println(); + System.out.println("=== ALP Decompression Throughput ==="); + System.out.printf( + "%-40s %8s %6s %6s %12s %12s %12s%n", + "File", "Rows", "Cols", "Type", "Compressed", "Raw MB/s", "Comp MB/s"); + System.out.println( + "---------------------------------------- -------- ------ ------ ------------ ------------ ------------"); + + // Double datasets + benchmarkFile("alp_arade.parquet", 15000, 4, "double", 8); + benchmarkFile("alp_spotify1.parquet", 15000, 9, "double", 8); + benchmarkFile("alp_java_arade.parquet", 15000, 4, "double", 8); + benchmarkFile("alp_java_spotify1.parquet", 15000, 9, "double", 8); + + // Float datasets + benchmarkFile("alp_float_arade.parquet", 15000, 4, "float", 4); + benchmarkFile("alp_float_spotify1.parquet", 15000, 9, "float", 4); + benchmarkFile("alp_java_float_arade.parquet", 15000, 4, "float", 4); + benchmarkFile("alp_java_float_spotify1.parquet", 15000, 9, "float", 4); + + System.out.println(); + } + + private void benchmarkFile(String fileName, int expectedRows, int numCols, String type, int bytesPerValue) + throws IOException { + Path path = resourcePath(fileName); + + // Get compressed file size from parquet metadata + long compressedSize = 0; + try (ParquetFileReader pfr = ParquetFileReader.open( + org.apache.parquet.hadoop.util.HadoopInputFile.fromPath( + path, new org.apache.hadoop.conf.Configuration()))) { + ParquetMetadata footer = pfr.getFooter(); + for (org.apache.parquet.hadoop.metadata.BlockMetaData block : footer.getBlocks()) { + compressedSize += 
block.getTotalByteSize(); + } + } + + long rawBytes = (long) expectedRows * numCols * bytesPerValue; + + // Warmup + for (int i = 0; i < WARMUP_ITERS; i++) { + readAllValues(path, type, numCols); + } + + // Measured runs + long totalNanos = 0; + for (int i = 0; i < MEASURED_ITERS; i++) { + long start = System.nanoTime(); + readAllValues(path, type, numCols); + totalNanos += System.nanoTime() - start; + } + + double avgSeconds = (totalNanos / (double) MEASURED_ITERS) / 1_000_000_000.0; + double rawMBps = (rawBytes / avgSeconds) / (1024.0 * 1024.0); + double compMBps = (compressedSize / avgSeconds) / (1024.0 * 1024.0); + + System.out.printf( + "%-40s %8d %6d %6s %12d %10.1f %10.1f%n", + fileName, expectedRows, numCols, type, compressedSize, rawMBps, compMBps); + } + + private void readAllValues(Path path, String type, int numCols) throws IOException { + try (ParquetReader reader = + ParquetReader.builder(new GroupReadSupport(), path).build()) { + Group group; + if ("double".equals(type)) { + while ((group = reader.read()) != null) { + for (int c = 0; c < numCols; c++) { + group.getDouble(c, 0); + } + } + } else { + while ((group = reader.read()) != null) { + for (int c = 0; c < numCols; c++) { + group.getFloat(c, 0); + } + } + } + } + } +} diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/EncodingCompressionBenchmark.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/EncodingCompressionBenchmark.java new file mode 100644 index 0000000000..27d053acaa --- /dev/null +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/EncodingCompressionBenchmark.java @@ -0,0 +1,471 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop;

import com.github.luben.zstd.Zstd;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.HeapByteBufferAllocator;
import org.apache.parquet.column.values.alp.AlpValuesReaderForDouble;
import org.apache.parquet.column.values.alp.AlpValuesReaderForFloat;
import org.apache.parquet.column.values.alp.AlpValuesWriter;
import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForDouble;
import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForFloat;
import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesWriter;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Codec-level benchmark comparing ALP, ZSTD (plain), and ByteStreamSplit+ZSTD.
 *
 * <p>Comparable to C++ encoding_alp_benchmark.cc. Reports encode and decode
 * throughput in MB/s plus compression ratio for each encoding strategy.
 */
public class EncodingCompressionBenchmark {

  private static final int TILE_FACTOR = 6; // 15K * 6 = 90K values
  private static final int WARMUP = 10;
  private static final int MEASURED = 30;

  private static final String CSV_DIR = "parquet-hadoop/src/test/resources";
  private static final String DOUBLE_CSV = "alp_spotify1_expect.csv.gz";
  private static final String FLOAT_CSV = "alp_float_spotify1_expect.csv.gz";

  private static final String[] COLUMNS = {
    "valence", "danceability", "energy", "loudness", "speechiness",
    "acousticness", "instrumentalness", "liveness", "tempo"
  };

  private static double[][] doubleColumns;
  private static float[][] floatColumns;

  /** Loads both CSV datasets once and tiles each column up to the benchmark size. */
  @BeforeClass
  public static void setup() throws IOException {
    Path csvDir = findCsvDir();

    double[][] rawDoubles = loadDoubleCsv(csvDir.resolve(DOUBLE_CSV));
    doubleColumns = new double[rawDoubles.length][];
    for (int c = 0; c < rawDoubles.length; c++) {
      doubleColumns[c] = tile(rawDoubles[c], rawDoubles[c].length * TILE_FACTOR);
    }

    float[][] rawFloats = loadFloatCsv(csvDir.resolve(FLOAT_CSV));
    floatColumns = new float[rawFloats.length][];
    for (int c = 0; c < rawFloats.length; c++) {
      floatColumns[c] = tile(rawFloats[c], rawFloats[c].length * TILE_FACTOR);
    }
  }

  /** Runs the full benchmark matrix and prints one table per element type. */
  @Test
  public void measureThroughput() throws IOException {
    System.out.println();
    System.out.printf(
        "=== Encoding/Compression Benchmark (%d values per column, Spotify dataset, %d iters) ===%n",
        doubleColumns[0].length, MEASURED);
    System.out.println();

    String hdr = String.format(
        "%-35s %10s %10s %10s %10s %8s", "Column / Encoding", "Enc MB/s", "Dec MB/s", "Raw KB", "Comp KB", "Ratio");
    String sep = "-".repeat(hdr.length());

    // --- Double columns ---
    System.out.println("=== DOUBLE (8 bytes/value) ===");
    System.out.println(hdr);
    System.out.println(sep);

    for (int c = 0; c < doubleColumns.length; c++) {
      benchAllDouble(COLUMNS[c], doubleColumns[c]);
      System.out.println();
    }

    // --- Float columns ---
    System.out.println("=== FLOAT (4 bytes/value) ===");
    System.out.println(hdr);
    System.out.println(sep);

    for (int c = 0; c < floatColumns.length; c++) {
      benchAllFloat(COLUMNS[c], floatColumns[c]);
      System.out.println();
    }
  }

  // ---- Double benchmarks ----

  /** Benchmarks the four encoding strategies (ALP, PLAIN+ZSTD, BSS, BSS+ZSTD) on one column. */
  private void benchAllDouble(String datasetName, double[] data) throws IOException {
    long rawBytes = (long) data.length * Double.BYTES;

    // ALP
    byte[] alpComp = alpEncodeDoubles(data);
    benchEncodeDecode(
        datasetName + " ALP",
        rawBytes,
        alpComp.length,
        () -> alpEncodeDoubles(data),
        () -> alpDecodeDoubles(alpComp, data.length));

    // PLAIN + ZSTD
    byte[] plainBytes = plainEncodeDoubles(data);
    byte[] plainZstd = Zstd.compress(plainBytes);
    benchEncodeDecode(
        datasetName + " PLAIN+ZSTD",
        rawBytes,
        plainZstd.length,
        () -> Zstd.compress(plainEncodeDoubles(data)),
        () -> {
          byte[] dec = Zstd.decompress(plainZstd, plainBytes.length);
          consumeDoublesFromPlain(dec, data.length);
        });

    // BSS
    byte[] bssBytes = bssEncodeDoubles(data);
    benchEncodeDecode(
        datasetName + " BSS",
        rawBytes,
        bssBytes.length,
        () -> bssEncodeDoubles(data),
        () -> bssDecodeDoubles(bssBytes, data.length));

    // BSS + ZSTD
    byte[] bssZstd = Zstd.compress(bssBytes);
    benchEncodeDecode(
        datasetName + " BSS+ZSTD",
        rawBytes,
        bssZstd.length,
        () -> Zstd.compress(bssEncodeDoubles(data)),
        () -> {
          byte[] dec = Zstd.decompress(bssZstd, bssBytes.length);
          bssDecodeDoubles(dec, data.length);
        });
  }

  // ---- Float benchmarks ----

  /** Float twin of {@link #benchAllDouble(String, double[])}. */
  private void benchAllFloat(String datasetName, float[] data) throws IOException {
    long rawBytes = (long) data.length * Float.BYTES;

    // ALP
    byte[] alpComp = alpEncodeFloats(data);
    benchEncodeDecode(
        datasetName + " ALP",
        rawBytes,
        alpComp.length,
        () -> alpEncodeFloats(data),
        () -> alpDecodeFloats(alpComp, data.length));

    // PLAIN + ZSTD
    byte[] plainBytes = plainEncodeFloats(data);
    byte[] plainZstd = Zstd.compress(plainBytes);
    benchEncodeDecode(
        datasetName + " PLAIN+ZSTD",
        rawBytes,
        plainZstd.length,
        () -> Zstd.compress(plainEncodeFloats(data)),
        () -> {
          byte[] dec = Zstd.decompress(plainZstd, plainBytes.length);
          consumeFloatsFromPlain(dec, data.length);
        });

    // BSS
    byte[] bssBytes = bssEncodeFloats(data);
    benchEncodeDecode(
        datasetName + " BSS",
        rawBytes,
        bssBytes.length,
        () -> bssEncodeFloats(data),
        () -> bssDecodeFloats(bssBytes, data.length));

    // BSS + ZSTD
    byte[] bssZstd = Zstd.compress(bssBytes);
    benchEncodeDecode(
        datasetName + " BSS+ZSTD",
        rawBytes,
        bssZstd.length,
        () -> Zstd.compress(bssEncodeFloats(data)),
        () -> {
          byte[] dec = Zstd.decompress(bssZstd, bssBytes.length);
          bssDecodeFloats(dec, data.length);
        });
  }

  // ---- Benchmark harness ----

  /** A benchmark task that may throw; used for both encode and decode closures. */
  @FunctionalInterface
  interface BenchRunnable {
    void run() throws Exception;
  }

  /**
   * Warms up then times {@code enc} and {@code dec} independently and prints one result row.
   *
   * @param rawBytes uncompressed size of the dataset, used as the throughput numerator
   * @param compBytes compressed size, reported and used for the ratio column
   */
  private void benchEncodeDecode(String name, long rawBytes, long compBytes, BenchRunnable enc, BenchRunnable dec)
      throws IOException {
    try {
      // Warmup
      for (int i = 0; i < WARMUP; i++) {
        enc.run();
      }
      long encNanos = 0;
      for (int i = 0; i < MEASURED; i++) {
        long t0 = System.nanoTime();
        enc.run();
        encNanos += System.nanoTime() - t0;
      }

      for (int i = 0; i < WARMUP; i++) {
        dec.run();
      }
      long decNanos = 0;
      for (int i = 0; i < MEASURED; i++) {
        long t0 = System.nanoTime();
        dec.run();
        decNanos += System.nanoTime() - t0;
      }

      double encMBps = (rawBytes * (double) MEASURED / (encNanos / 1e9)) / (1024.0 * 1024.0);
      double decMBps = (rawBytes * (double) MEASURED / (decNanos / 1e9)) / (1024.0 * 1024.0);
      double ratio = (double) compBytes / rawBytes * 100.0;

      System.out.printf(
          "%-35s %10.1f %10.1f %10d %10d %7.1f%%%n",
          name, encMBps, decMBps, rawBytes / 1024, compBytes / 1024, ratio);
    } catch (Exception e) {
      throw new IOException("Benchmark failed for " + name, e);
    }
  }

  // ---- ALP encode/decode ----

  private static byte[] alpEncodeDoubles(double[] values) throws IOException {
    AlpValuesWriter.DoubleAlpValuesWriter writer = new AlpValuesWriter.DoubleAlpValuesWriter();
    for (double v : values) {
      writer.writeDouble(v);
    }
    return writer.getBytes().toByteArray();
  }

  private static void alpDecodeDoubles(byte[] compressed, int numValues) throws IOException {
    AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
    reader.initFromPage(numValues, ByteBufferInputStream.wrap(ByteBuffer.wrap(compressed)));
    for (int i = 0; i < numValues; i++) {
      reader.readDouble();
    }
  }

  private static byte[] alpEncodeFloats(float[] values) throws IOException {
    AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter();
    for (float v : values) {
      writer.writeFloat(v);
    }
    return writer.getBytes().toByteArray();
  }

  private static void alpDecodeFloats(byte[] compressed, int numValues) throws IOException {
    AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
    reader.initFromPage(numValues, ByteBufferInputStream.wrap(ByteBuffer.wrap(compressed)));
    for (int i = 0; i < numValues; i++) {
      reader.readFloat();
    }
  }

  // ---- PLAIN encode/decode ----

  private static byte[] plainEncodeDoubles(double[] values) {
    ByteBuffer buf = ByteBuffer.allocate(values.length * Double.BYTES).order(ByteOrder.LITTLE_ENDIAN);
    for (double v : values) {
      buf.putDouble(v);
    }
    return buf.array();
  }

  private static void consumeDoublesFromPlain(byte[] raw, int numValues) {
    ByteBuffer buf = ByteBuffer.wrap(raw).order(ByteOrder.LITTLE_ENDIAN);
    for (int i = 0; i < numValues; i++) {
      buf.getDouble();
    }
  }

  private static byte[] plainEncodeFloats(float[] values) {
    ByteBuffer buf = ByteBuffer.allocate(values.length * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
    for (float v : values) {
      buf.putFloat(v);
    }
    return buf.array();
  }

  private static void consumeFloatsFromPlain(byte[] raw, int numValues) {
    ByteBuffer buf = ByteBuffer.wrap(raw).order(ByteOrder.LITTLE_ENDIAN);
    for (int i = 0; i < numValues; i++) {
      buf.getFloat();
    }
  }

  // ---- ByteStreamSplit encode/decode ----
  // NOTE: the previous bssDecode*FromRaw variants were byte-identical duplicates of
  // bssDecode* and have been removed; all callers now share one decode path per type.

  private static byte[] bssEncodeDoubles(double[] values) throws IOException {
    ByteStreamSplitValuesWriter.DoubleByteStreamSplitValuesWriter writer =
        new ByteStreamSplitValuesWriter.DoubleByteStreamSplitValuesWriter(
            64 * 1024, 8 * 1024 * 1024, HeapByteBufferAllocator.getInstance());
    for (double v : values) {
      writer.writeDouble(v);
    }
    return writer.getBytes().toByteArray();
  }

  private static void bssDecodeDoubles(byte[] encoded, int numValues) throws IOException {
    ByteStreamSplitValuesReaderForDouble reader = new ByteStreamSplitValuesReaderForDouble();
    reader.initFromPage(numValues, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded)));
    for (int i = 0; i < numValues; i++) {
      reader.readDouble();
    }
  }

  private static byte[] bssEncodeFloats(float[] values) throws IOException {
    ByteStreamSplitValuesWriter.FloatByteStreamSplitValuesWriter writer =
        new ByteStreamSplitValuesWriter.FloatByteStreamSplitValuesWriter(
            64 * 1024, 8 * 1024 * 1024, HeapByteBufferAllocator.getInstance());
    for (float v : values) {
      writer.writeFloat(v);
    }
    return writer.getBytes().toByteArray();
  }

  private static void bssDecodeFloats(byte[] encoded, int numValues) throws IOException {
    ByteStreamSplitValuesReaderForFloat reader = new ByteStreamSplitValuesReaderForFloat();
    reader.initFromPage(numValues, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded)));
    for (int i = 0; i < numValues; i++) {
      reader.readFloat();
    }
  }

  // ---- CSV loading and tiling ----

  /**
   * Walks up from the working directory (at most 3 levels) looking for the test
   * resources directory, so the benchmark works whether launched from the module
   * or from the repository root.
   */
  private static Path findCsvDir() throws IOException {
    Path dir = Paths.get("").toAbsolutePath();
    for (int i = 0; i < 3; i++) {
      Path candidate = dir.resolve(CSV_DIR);
      if (Files.isDirectory(candidate) && Files.exists(candidate.resolve(DOUBLE_CSV))) {
        return candidate;
      }
      dir = dir.getParent();
      if (dir == null) break;
    }
    throw new IOException("Cannot find CSV directory '" + CSV_DIR
        + "'. Run from the parquet-java project root.");
  }

  /**
   * Loads a gzipped CSV into column-major double arrays. The header row defines
   * the column count; an empty file raises IOException rather than NPE.
   */
  private static double[][] loadDoubleCsv(Path csvPath) throws IOException {
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
        new GZIPInputStream(Files.newInputStream(csvPath)), StandardCharsets.UTF_8))) {
      String header = br.readLine();
      if (header == null) {
        throw new IOException("Empty CSV file: " + csvPath);
      }
      int numCols = header.split(",").length;

      List<double[]> rows = new ArrayList<>();
      String line;
      while ((line = br.readLine()) != null) {
        String[] parts = line.split(",");
        double[] row = new double[numCols];
        for (int i = 0; i < numCols; i++) {
          row[i] = Double.parseDouble(parts[i]);
        }
        rows.add(row);
      }

      double[][] columns = new double[numCols][rows.size()];
      for (int r = 0; r < rows.size(); r++) {
        double[] row = rows.get(r);
        for (int c = 0; c < numCols; c++) {
          columns[c][r] = row[c];
        }
      }
      return columns;
    }
  }

  /**
   * Float twin of {@link #loadDoubleCsv(Path)}. Kept separate (not a cast of the
   * double loader) because Float.parseFloat rounds directly to float, avoiding
   * double-rounding differences.
   */
  private static float[][] loadFloatCsv(Path csvPath) throws IOException {
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
        new GZIPInputStream(Files.newInputStream(csvPath)), StandardCharsets.UTF_8))) {
      String header = br.readLine();
      if (header == null) {
        throw new IOException("Empty CSV file: " + csvPath);
      }
      int numCols = header.split(",").length;

      List<float[]> rows = new ArrayList<>();
      String line;
      while ((line = br.readLine()) != null) {
        String[] parts = line.split(",");
        float[] row = new float[numCols];
        for (int i = 0; i < numCols; i++) {
          row[i] = Float.parseFloat(parts[i]);
        }
        rows.add(row);
      }

      float[][] columns = new float[numCols][rows.size()];
      for (int r = 0; r < rows.size(); r++) {
        float[] row = rows.get(r);
        for (int c = 0; c < numCols; c++) {
          columns[c][r] = row[c];
        }
      }
      return columns;
    }
  }

  /**
   * Repeats {@code source} cyclically up to {@code targetSize}, offsetting each copy
   * by its copy index so the tiled data is not a trivially repeating pattern (which
   * would overstate compression ratios).
   */
  private static double[] tile(double[] source, int targetSize) {
    double[] result = new double[targetSize];
    int len = source.length;
    for (int i = 0; i < targetSize; i++) {
      int copyIdx = i / len; // 0 for first copy, 1 for second, etc.
      result[i] = source[i % len] + copyIdx;
    }
    return result;
  }

  /** Float twin of {@link #tile(double[], int)}; same per-copy offset. */
  private static float[] tile(float[] source, int targetSize) {
    float[] result = new float[targetSize];
    int len = source.length;
    for (int i = 0; i < targetSize; i++) {
      int copyIdx = i / len;
      result[i] = source[i % len] + copyIdx;
    }
    return result;
  }
}
+ */ +package org.apache.parquet.hadoop; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.zip.GZIPInputStream; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; + +/** + * Standalone utility to generate ALP-encoded parquet files from CSV test data. + * + *

Reads the existing expect CSV files (alp_spotify1_expect.csv.gz, alp_arade_expect.csv.gz) + * from test resources and writes ALP-encoded parquet files using the Java ALP encoder. + * + *

Usage: java GenerateAlpParquet [output_directory] + * If no output directory is specified, files are written to the current directory. + */ +public class GenerateAlpParquet { + + public static void main(String[] args) throws IOException { + String outputDir = args.length > 0 ? args[0] : "."; + Files.createDirectories(Paths.get(outputDir)); + + generateAlpParquet("/alp_arade_expect.csv.gz", outputDir + "/alp_java_arade.parquet"); + System.out.println("Generated: " + outputDir + "/alp_java_arade.parquet"); + + generateAlpParquet("/alp_spotify1_expect.csv.gz", outputDir + "/alp_java_spotify1.parquet"); + System.out.println("Generated: " + outputDir + "/alp_java_spotify1.parquet"); + + generateAlpParquetFloat( + "/alp_float_arade_expect.csv.gz", outputDir + "/alp_java_float_arade.parquet"); + System.out.println("Generated: " + outputDir + "/alp_java_float_arade.parquet"); + + generateAlpParquetFloat( + "/alp_float_spotify1_expect.csv.gz", outputDir + "/alp_java_float_spotify1.parquet"); + System.out.println("Generated: " + outputDir + "/alp_java_float_spotify1.parquet"); + } + + private static void generateAlpParquet(String csvResource, String outputPath) throws IOException { + // Read CSV + String[] columnNames; + List rows = new ArrayList<>(); + + try (InputStream raw = GenerateAlpParquet.class.getResourceAsStream(csvResource); + InputStream is = new GZIPInputStream(raw); + BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { + // Parse header + String header = br.readLine(); + columnNames = header.split(","); + + // Parse data rows + String line; + while ((line = br.readLine()) != null) { + String[] parts = line.split(","); + double[] values = new double[parts.length]; + for (int i = 0; i < parts.length; i++) { + values[i] = Double.parseDouble(parts[i]); + } + rows.add(values); + } + } + + // Build schema: all required DOUBLE columns + Types.MessageTypeBuilder schemaBuilder = Types.buildMessage(); + for (String name : 
columnNames) { + schemaBuilder.required(PrimitiveTypeName.DOUBLE).named(name); + } + MessageType schema = schemaBuilder.named("schema"); + + // Delete output file if it exists + java.io.File outFile = new java.io.File(outputPath); + if (outFile.exists()) { + outFile.delete(); + } + + // Write ALP-encoded parquet + Path path = new Path(outFile.getAbsolutePath()); + SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema); + try (ParquetWriter writer = ExampleParquetWriter.builder(path) + .withType(schema) + .withWriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0) + .withAlpEncoding(true) + .withDictionaryEncoding(false) + .build()) { + for (double[] row : rows) { + Group group = groupFactory.newGroup(); + for (int c = 0; c < columnNames.length; c++) { + group.append(columnNames[c], row[c]); + } + writer.write(group); + } + } + } + + private static void generateAlpParquetFloat(String csvResource, String outputPath) + throws IOException { + // Read CSV into float values + String[] columnNames; + List rows = new ArrayList<>(); + + try (InputStream raw = GenerateAlpParquet.class.getResourceAsStream(csvResource); + InputStream is = new GZIPInputStream(raw); + BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { + // Parse header + String header = br.readLine(); + columnNames = header.split(","); + + // Parse data rows + String line; + while ((line = br.readLine()) != null) { + String[] parts = line.split(","); + float[] values = new float[parts.length]; + for (int i = 0; i < parts.length; i++) { + values[i] = Float.parseFloat(parts[i]); + } + rows.add(values); + } + } + + // Build schema: all required FLOAT columns + Types.MessageTypeBuilder schemaBuilder = Types.buildMessage(); + for (String name : columnNames) { + schemaBuilder.required(PrimitiveTypeName.FLOAT).named(name); + } + MessageType schema = schemaBuilder.named("schema"); + + // Delete output file if it exists + java.io.File 
outFile = new java.io.File(outputPath); + if (outFile.exists()) { + outFile.delete(); + } + + // Write ALP-encoded parquet + Path path = new Path(outFile.getAbsolutePath()); + SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema); + try (ParquetWriter writer = ExampleParquetWriter.builder(path) + .withType(schema) + .withWriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0) + .withAlpEncoding(true) + .withDictionaryEncoding(false) + .build()) { + for (float[] row : rows) { + Group group = groupFactory.newGroup(); + for (int c = 0; c < columnNames.length; c++) { + group.append(columnNames[c], row[c]); + } + writer.write(group); + } + } + } +} diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInteropAlpEncoding.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInteropAlpEncoding.java new file mode 100644 index 0000000000..7a7dfbb7ec --- /dev/null +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInteropAlpEncoding.java @@ -0,0 +1,425 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.hadoop; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.zip.GZIPInputStream; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.junit.Test; + +/** + * Integration test for reading ALP (Adaptive Lossless floating-Point) encoded + * parquet files generated by the C++ implementation and verifying correctness + * against expected CSV data. + * + *

The test parquet files were generated using the generate_alp_parquet C++ + * utility from Arrow, which encodes floating-point CSV datasets using ALP encoding. + */ +public class TestInteropAlpEncoding { + + private static Path resourcePath(String name) { + try { + return new Path(TestInteropAlpEncoding.class.getResource("/" + name).toURI()); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + } + + /** + * Read the ALP-encoded arade parquet file (4 double columns, 15000 rows) + * and verify all values match the expected CSV. + */ + @Test + public void testReadAlpAradeParquet() throws IOException { + Path parquetPath = resourcePath("alp_arade.parquet"); + String[] columnNames = {"value1", "value2", "value3", "value4"}; + int expectedRows = 15000; + + // Read expected values from CSV + double[][] expected = readExpectedCsv("/alp_arade_expect.csv.gz", columnNames.length, expectedRows); + + // Read parquet file using GroupReadSupport + List rows = readParquetGroups(parquetPath); + assertEquals("Row count should match", expectedRows, rows.size()); + + // Verify ALP encoding is used in metadata + verifyAlpEncoding(parquetPath); + + // Compare all values + for (int r = 0; r < expectedRows; r++) { + Group group = rows.get(r); + for (int c = 0; c < columnNames.length; c++) { + double actual = group.getDouble(columnNames[c], 0); + assertEquals( + String.format("Mismatch at row %d, column %s", r, columnNames[c]), + Double.doubleToLongBits(expected[c][r]), + Double.doubleToLongBits(actual)); + } + } + } + + /** + * Read the ALP-encoded spotify1 parquet file (9 double columns, 15000 rows) + * and verify all values match the expected CSV. 
+ */ + @Test + public void testReadAlpSpotify1Parquet() throws IOException { + Path parquetPath = resourcePath("alp_spotify1.parquet"); + String[] columnNames = { + "danceability", + "energy", + "loudness", + "speechiness", + "acousticness", + "instrumentalness", + "liveness", + "valence", + "tempo" + }; + int expectedRows = 15000; + + // Read expected values from CSV + double[][] expected = readExpectedCsv("/alp_spotify1_expect.csv.gz", columnNames.length, expectedRows); + + // Read parquet file using GroupReadSupport + List rows = readParquetGroups(parquetPath); + assertEquals("Row count should match", expectedRows, rows.size()); + + // Verify ALP encoding is used in metadata + verifyAlpEncoding(parquetPath); + + // Compare all values + for (int r = 0; r < expectedRows; r++) { + Group group = rows.get(r); + for (int c = 0; c < columnNames.length; c++) { + double actual = group.getDouble(columnNames[c], 0); + assertEquals( + String.format("Mismatch at row %d, column %s", r, columnNames[c]), + Double.doubleToLongBits(expected[c][r]), + Double.doubleToLongBits(actual)); + } + } + } + + /** + * Read the Java-generated ALP-encoded arade parquet file and verify all values + * match the expected CSV. 
+ */ + @Test + public void testReadAlpJavaAradeParquet() throws IOException { + Path parquetPath = resourcePath("alp_java_arade.parquet"); + String[] columnNames = {"value1", "value2", "value3", "value4"}; + int expectedRows = 15000; + + double[][] expected = readExpectedCsv("/alp_arade_expect.csv.gz", columnNames.length, expectedRows); + + List rows = readParquetGroups(parquetPath); + assertEquals("Row count should match", expectedRows, rows.size()); + + verifyAlpEncoding(parquetPath); + + for (int r = 0; r < expectedRows; r++) { + Group group = rows.get(r); + for (int c = 0; c < columnNames.length; c++) { + double actual = group.getDouble(columnNames[c], 0); + assertEquals( + String.format("Mismatch at row %d, column %s", r, columnNames[c]), + Double.doubleToLongBits(expected[c][r]), + Double.doubleToLongBits(actual)); + } + } + } + + /** + * Read the Java-generated ALP-encoded spotify1 parquet file and verify all values + * match the expected CSV. + */ + @Test + public void testReadAlpJavaSpotify1Parquet() throws IOException { + Path parquetPath = resourcePath("alp_java_spotify1.parquet"); + String[] columnNames = { + "danceability", + "energy", + "loudness", + "speechiness", + "acousticness", + "instrumentalness", + "liveness", + "valence", + "tempo" + }; + int expectedRows = 15000; + + double[][] expected = readExpectedCsv("/alp_spotify1_expect.csv.gz", columnNames.length, expectedRows); + + List rows = readParquetGroups(parquetPath); + assertEquals("Row count should match", expectedRows, rows.size()); + + verifyAlpEncoding(parquetPath); + + for (int r = 0; r < expectedRows; r++) { + Group group = rows.get(r); + for (int c = 0; c < columnNames.length; c++) { + double actual = group.getDouble(columnNames[c], 0); + assertEquals( + String.format("Mismatch at row %d, column %s", r, columnNames[c]), + Double.doubleToLongBits(expected[c][r]), + Double.doubleToLongBits(actual)); + } + } + } + + /** + * Read the ALP-encoded float32 arade parquet file (C++ generated) + 
* and verify all values match the expected CSV. + */ + @Test + public void testReadAlpFloatAradeParquet() throws IOException { + Path parquetPath = resourcePath("alp_float_arade.parquet"); + String[] columnNames = {"value1", "value2", "value3", "value4"}; + int expectedRows = 15000; + + float[][] expected = readExpectedCsvFloat("/alp_float_arade_expect.csv.gz", columnNames.length, expectedRows); + + List rows = readParquetGroups(parquetPath); + assertEquals("Row count should match", expectedRows, rows.size()); + + verifyAlpEncoding(parquetPath); + + for (int r = 0; r < expectedRows; r++) { + Group group = rows.get(r); + for (int c = 0; c < columnNames.length; c++) { + float actual = group.getFloat(columnNames[c], 0); + assertEquals( + String.format("Mismatch at row %d, column %s", r, columnNames[c]), + Float.floatToIntBits(expected[c][r]), + Float.floatToIntBits(actual)); + } + } + } + + /** + * Read the ALP-encoded float32 spotify1 parquet file (C++ generated) + * and verify all values match the expected CSV. 
+ */ + @Test + public void testReadAlpFloatSpotify1Parquet() throws IOException { + Path parquetPath = resourcePath("alp_float_spotify1.parquet"); + String[] columnNames = { + "danceability", + "energy", + "loudness", + "speechiness", + "acousticness", + "instrumentalness", + "liveness", + "valence", + "tempo" + }; + int expectedRows = 15000; + + float[][] expected = readExpectedCsvFloat("/alp_float_spotify1_expect.csv.gz", columnNames.length, expectedRows); + + List rows = readParquetGroups(parquetPath); + assertEquals("Row count should match", expectedRows, rows.size()); + + verifyAlpEncoding(parquetPath); + + for (int r = 0; r < expectedRows; r++) { + Group group = rows.get(r); + for (int c = 0; c < columnNames.length; c++) { + float actual = group.getFloat(columnNames[c], 0); + assertEquals( + String.format("Mismatch at row %d, column %s", r, columnNames[c]), + Float.floatToIntBits(expected[c][r]), + Float.floatToIntBits(actual)); + } + } + } + + /** + * Read the Java-generated ALP-encoded float32 arade parquet file + * and verify all values match the expected CSV. 
+ */ + @Test + public void testReadAlpJavaFloatAradeParquet() throws IOException { + Path parquetPath = resourcePath("alp_java_float_arade.parquet"); + String[] columnNames = {"value1", "value2", "value3", "value4"}; + int expectedRows = 15000; + + float[][] expected = readExpectedCsvFloat("/alp_float_arade_expect.csv.gz", columnNames.length, expectedRows); + + List rows = readParquetGroups(parquetPath); + assertEquals("Row count should match", expectedRows, rows.size()); + + verifyAlpEncoding(parquetPath); + + for (int r = 0; r < expectedRows; r++) { + Group group = rows.get(r); + for (int c = 0; c < columnNames.length; c++) { + float actual = group.getFloat(columnNames[c], 0); + assertEquals( + String.format("Mismatch at row %d, column %s", r, columnNames[c]), + Float.floatToIntBits(expected[c][r]), + Float.floatToIntBits(actual)); + } + } + } + + /** + * Read the Java-generated ALP-encoded float32 spotify1 parquet file + * and verify all values match the expected CSV. + */ + @Test + public void testReadAlpJavaFloatSpotify1Parquet() throws IOException { + Path parquetPath = resourcePath("alp_java_float_spotify1.parquet"); + String[] columnNames = { + "danceability", + "energy", + "loudness", + "speechiness", + "acousticness", + "instrumentalness", + "liveness", + "valence", + "tempo" + }; + int expectedRows = 15000; + + float[][] expected = readExpectedCsvFloat("/alp_float_spotify1_expect.csv.gz", columnNames.length, expectedRows); + + List rows = readParquetGroups(parquetPath); + assertEquals("Row count should match", expectedRows, rows.size()); + + verifyAlpEncoding(parquetPath); + + for (int r = 0; r < expectedRows; r++) { + Group group = rows.get(r); + for (int c = 0; c < columnNames.length; c++) { + float actual = group.getFloat(columnNames[c], 0); + assertEquals( + String.format("Mismatch at row %d, column %s", r, columnNames[c]), + Float.floatToIntBits(expected[c][r]), + Float.floatToIntBits(actual)); + } + } + } + + private List readParquetGroups(Path 
path) throws IOException { + List rows = new ArrayList<>(); + try (ParquetReader reader = + ParquetReader.builder(new GroupReadSupport(), path).build()) { + Group group; + while ((group = reader.read()) != null) { + rows.add(group); + } + } + return rows; + } + + private void verifyAlpEncoding(Path path) throws IOException { + try (ParquetFileReader reader = ParquetFileReader.open(org.apache.parquet.hadoop.util.HadoopInputFile.fromPath( + path, new org.apache.hadoop.conf.Configuration()))) { + List blocks = reader.getFooter().getBlocks(); + for (BlockMetaData block : blocks) { + for (ColumnChunkMetaData column : block.getColumns()) { + assertNotNull( + "Column " + column.getPath() + " should have encoding stats", column.getEncodingStats()); + boolean hasAlp = column.getEncodings().contains(Encoding.ALP); + assertEquals("Column " + column.getPath() + " should use ALP encoding", true, hasAlp); + } + } + } + } + + /** + * Parse expected CSV into column arrays. + * CSV format: header row, then data rows with comma-separated double values. 
+ */ + private double[][] readExpectedCsv(String resourcePath, int numColumns, int expectedRows) throws IOException { + double[][] columns = new double[numColumns][expectedRows]; + try (InputStream raw = getClass().getResourceAsStream(resourcePath); + InputStream is = new GZIPInputStream(raw); + BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { + assertNotNull("CSV resource not found: " + resourcePath, raw); + + // Skip header + String header = br.readLine(); + assertNotNull("CSV should have a header", header); + + int row = 0; + String line; + while ((line = br.readLine()) != null) { + String[] parts = line.split(","); + assertEquals("CSV row " + row + " should have " + numColumns + " columns", numColumns, parts.length); + for (int c = 0; c < numColumns; c++) { + columns[c][row] = Double.parseDouble(parts[c]); + } + row++; + } + assertEquals("CSV should have " + expectedRows + " data rows", expectedRows, row); + } + return columns; + } + + /** + * Parse expected CSV into float column arrays. + * CSV format: header row, then data rows with comma-separated float values. 
+ */ + private float[][] readExpectedCsvFloat(String resourcePath, int numColumns, int expectedRows) throws IOException { + float[][] columns = new float[numColumns][expectedRows]; + try (InputStream raw = getClass().getResourceAsStream(resourcePath); + InputStream is = new GZIPInputStream(raw); + BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { + assertNotNull("CSV resource not found: " + resourcePath, raw); + + // Skip header + String header = br.readLine(); + assertNotNull("CSV should have a header", header); + + int row = 0; + String line; + while ((line = br.readLine()) != null) { + String[] parts = line.split(","); + assertEquals("CSV row " + row + " should have " + numColumns + " columns", numColumns, parts.length); + for (int c = 0; c < numColumns; c++) { + columns[c][row] = Float.parseFloat(parts[c]); + } + row++; + } + assertEquals("CSV should have " + expectedRows + " data rows", expectedRows, row); + } + return columns; + } +} diff --git a/parquet-hadoop/src/test/resources/alp_arade.parquet b/parquet-hadoop/src/test/resources/alp_arade.parquet new file mode 100644 index 0000000000..156557e40a Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_arade.parquet differ diff --git a/parquet-hadoop/src/test/resources/alp_arade_expect.csv.gz b/parquet-hadoop/src/test/resources/alp_arade_expect.csv.gz new file mode 100644 index 0000000000..e7cc6def5e Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_arade_expect.csv.gz differ diff --git a/parquet-hadoop/src/test/resources/alp_float_arade.parquet b/parquet-hadoop/src/test/resources/alp_float_arade.parquet new file mode 100644 index 0000000000..489cc18d0e Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_float_arade.parquet differ diff --git a/parquet-hadoop/src/test/resources/alp_float_arade_expect.csv.gz b/parquet-hadoop/src/test/resources/alp_float_arade_expect.csv.gz new file mode 100644 index 0000000000..d594128f76 
Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_float_arade_expect.csv.gz differ diff --git a/parquet-hadoop/src/test/resources/alp_float_spotify1.parquet b/parquet-hadoop/src/test/resources/alp_float_spotify1.parquet new file mode 100644 index 0000000000..011420c166 Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_float_spotify1.parquet differ diff --git a/parquet-hadoop/src/test/resources/alp_float_spotify1_expect.csv.gz b/parquet-hadoop/src/test/resources/alp_float_spotify1_expect.csv.gz new file mode 100644 index 0000000000..1b8f9534dc Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_float_spotify1_expect.csv.gz differ diff --git a/parquet-hadoop/src/test/resources/alp_java_arade.parquet b/parquet-hadoop/src/test/resources/alp_java_arade.parquet new file mode 100644 index 0000000000..0e7e07c428 Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_java_arade.parquet differ diff --git a/parquet-hadoop/src/test/resources/alp_java_float_arade.parquet b/parquet-hadoop/src/test/resources/alp_java_float_arade.parquet new file mode 100644 index 0000000000..c21a2e3394 Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_java_float_arade.parquet differ diff --git a/parquet-hadoop/src/test/resources/alp_java_float_spotify1.parquet b/parquet-hadoop/src/test/resources/alp_java_float_spotify1.parquet new file mode 100644 index 0000000000..29ed04aff3 Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_java_float_spotify1.parquet differ diff --git a/parquet-hadoop/src/test/resources/alp_java_spotify1.parquet b/parquet-hadoop/src/test/resources/alp_java_spotify1.parquet new file mode 100644 index 0000000000..d7d9aadf7b Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_java_spotify1.parquet differ diff --git a/parquet-hadoop/src/test/resources/alp_spotify1.parquet b/parquet-hadoop/src/test/resources/alp_spotify1.parquet new file mode 100644 index 
0000000000..0ed223ab7e Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_spotify1.parquet differ diff --git a/parquet-hadoop/src/test/resources/alp_spotify1_expect.csv.gz b/parquet-hadoop/src/test/resources/alp_spotify1_expect.csv.gz new file mode 100644 index 0000000000..bad6e31dba Binary files /dev/null and b/parquet-hadoop/src/test/resources/alp_spotify1_expect.csv.gz differ diff --git a/pom.xml b/pom.xml index d27788932c..7b6fa0c7b7 100644 --- a/pom.xml +++ b/pom.xml @@ -475,6 +475,7 @@ .github/PULL_REQUEST_TEMPLATE.md **/*.parquet + **/*.csv **/*.avro **/*.json **/*.avsc