From 5c73eeda2439da105c401e85dd7f2b4e11f846d7 Mon Sep 17 00:00:00 2001 From: jmestwa-coder Date: Sat, 23 May 2026 16:03:20 +0530 Subject: [PATCH] reject too-short statistics values in FormatStatValue --- cpp/src/parquet/types.cc | 37 +++++++++++++++++++++++++++++++++++ cpp/src/parquet/types_test.cc | 23 ++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index fb4eb92a7544..824913117b5f 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -184,6 +184,43 @@ std::string FormatFloat16Value(::std::string_view val) { std::string FormatStatValue(Type::type parquet_type, ::std::string_view val, const std::shared_ptr& logical_type) { + // Statistics blobs come from the file's Thrift-encoded metadata and may have + // arbitrary length under a malicious writer. Reject any value that is shorter + // than what the physical (and, for FLOAT16, logical) type requires so the + // memcpy/byte loads below cannot run past the buffer. + size_t required = 0; + switch (parquet_type) { + case Type::BOOLEAN: + required = sizeof(bool); + break; + case Type::INT32: + case Type::FLOAT: + required = 4; + break; + case Type::INT64: + case Type::DOUBLE: + required = 8; + break; + case Type::INT96: + required = 3 * sizeof(int32_t); + break; + case Type::BYTE_ARRAY: + case Type::FIXED_LEN_BYTE_ARRAY: + if (logical_type != nullptr && logical_type->is_float16()) { + required = 2; + } + break; + case Type::UNDEFINED: + default: + break; + } + if (val.size() < required) { + std::stringstream ss; + ss << "Statistics value of " << val.size() << " bytes is too small for " + << TypeToString(parquet_type); + throw ParquetException(ss.str()); + } + std::stringstream result; const char* bytes = val.data(); switch (parquet_type) { diff --git a/cpp/src/parquet/types_test.cc b/cpp/src/parquet/types_test.cc index 6c77662d58f7..6803dfd136a0 100644 --- a/cpp/src/parquet/types_test.cc +++ b/cpp/src/parquet/types_test.cc @@ -20,6 +20,7 @@ #include #include "arrow/util/endian.h" +#include "parquet/exception.h" #include "parquet/types.h" namespace parquet { @@ -179,6 +180,28 @@ TEST(TypePrinter, StatisticsTypes) { FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin, LogicalType::Float16())); } +TEST(TypePrinter, StatisticsTypesShortValue) { + // A maliciously crafted Parquet file may set a statistics min/max shorter + // than the column's physical type. FormatStatValue must reject it instead of + // memcpy'ing past the buffer. + ASSERT_THROW(FormatStatValue(Type::BOOLEAN, std::string()), ParquetException); + ASSERT_THROW(FormatStatValue(Type::INT32, std::string("abc")), ParquetException); + ASSERT_THROW(FormatStatValue(Type::INT64, std::string("abcdefg")), ParquetException); + ASSERT_THROW(FormatStatValue(Type::FLOAT, std::string()), ParquetException); + ASSERT_THROW(FormatStatValue(Type::DOUBLE, std::string("1234567")), ParquetException); + ASSERT_THROW(FormatStatValue(Type::INT96, std::string("12345678901")), + ParquetException); + ASSERT_THROW(FormatStatValue(Type::INT32, std::string("abc"), + LogicalType::Decimal(6, 2)), + ParquetException); + ASSERT_THROW(FormatStatValue(Type::INT64, std::string("abcdefg"), + LogicalType::Decimal(18, 4)), + ParquetException); + ASSERT_THROW(FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, std::string("a"), + LogicalType::Float16()), + ParquetException); +} + TEST(TestInt96Timestamp, Decoding) { auto check = [](int32_t julian_day, uint64_t nanoseconds) { #if ARROW_LITTLE_ENDIAN