From fc204ac67739082bc9430d7e4a6ae1d3e18cf6b4 Mon Sep 17 00:00:00 2001 From: rootvector2 Date: Thu, 21 May 2026 18:06:22 +0530 Subject: [PATCH] [C++][Parquet] Saturate ApplicationVersion components instead of atoi UB The version components in a Parquet file's `created_by` string are attacker-controlled. std::atoi exhibits undefined behavior when the parsed value overflows int (see [c.strtoint]). Replace the three atoi calls in ApplicationVersionParser with a saturating helper that uses std::strtoul and clamps to INT_MAX when the parsed value would not fit in int. Add a regression test exercising overflow inputs. --- cpp/src/parquet/metadata.cc | 27 ++++++++++++++++++++++++--- cpp/src/parquet/metadata_test.cc | 28 ++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 46cd1e442461..2984048c72ce 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -18,7 +18,10 @@ #include "parquet/metadata.h" #include +#include #include +#include +#include #include #include #include @@ -1274,6 +1277,24 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m : application_(std::move(application)), version{major, minor, patch, "", "", ""} {} namespace { +// Parse a digit-only ASCII string into an int, saturating to INT_MAX on +// overflow. The version components in a file's `created_by` string are +// attacker-controlled, and std::atoi has undefined behavior when the +// converted value falls outside the range of int (see [c.strtoint]). +int ParseUnsignedVersionComponent(const std::string& s) { + if (s.empty()) { + return 0; + } + errno = 0; + char* end = nullptr; + unsigned long value = std::strtoul(s.c_str(), &end, 10); + if (errno == ERANGE || + value > static_cast(std::numeric_limits::max())) { + return std::numeric_limits::max(); + } + return static_cast(value); +} + // Parse the application version format and set parsed values to // ApplicationVersion. // @@ -1441,7 +1462,7 @@ class ApplicationVersionParser { } auto version_major_string = version_string_.substr( version_major_start, version_major_end - version_major_start); - application_version_.version.major = atoi(version_major_string.c_str()); + application_version_.version.major = ParseUnsignedVersionComponent(version_major_string); return true; } @@ -1466,7 +1487,7 @@ class ApplicationVersionParser { } auto version_minor_string = version_string_.substr( version_minor_start, version_minor_end - version_minor_start); - application_version_.version.minor = atoi(version_minor_string.c_str()); + application_version_.version.minor = ParseUnsignedVersionComponent(version_minor_string); return true; } @@ -1484,7 +1505,7 @@ class ApplicationVersionParser { } auto version_patch_string = version_string_.substr( version_patch_start, version_patch_end - version_patch_start); - application_version_.version.patch = atoi(version_patch_string.c_str()); + application_version_.version.patch = ParseUnsignedVersionComponent(version_patch_string); version_parsing_position_ = version_patch_end; return true; } diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index 572f053179cd..e0e76ada3edc 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -17,6 +17,8 @@ #include "parquet/metadata.h" +#include + #include #include "arrow/util/key_value_metadata.h" @@ -731,6 +733,32 @@ TEST(ApplicationVersion, VersionNoUnknownBuildInfoPreRelease) { ASSERT_EQ("cd-cdh5.5.0", version.version.build_info); } +TEST(ApplicationVersion, VersionComponentOverflow) { + // Version components in `created_by` are attacker-controlled. std::atoi + // exhibits undefined behavior when the converted value overflows int, so + // the parser must clamp instead of calling atoi. + ApplicationVersion version( + "parquet-mr version 99999999999999999999.88888888888888888888." + "77777777777777777777 (build abcd)"); + + ASSERT_EQ("parquet-mr", version.application_); + ASSERT_EQ("abcd", version.build_); + ASSERT_EQ(std::numeric_limits::max(), version.version.major); + ASSERT_EQ(std::numeric_limits::max(), version.version.minor); + ASSERT_EQ(std::numeric_limits::max(), version.version.patch); + + // Boundary cases: INT_MAX is representable, INT_MAX+1 saturates. + ApplicationVersion at_max("parquet-mr version 2147483647.2147483647.2147483647"); + ASSERT_EQ(std::numeric_limits::max(), at_max.version.major); + ASSERT_EQ(std::numeric_limits::max(), at_max.version.minor); + ASSERT_EQ(std::numeric_limits::max(), at_max.version.patch); + + ApplicationVersion just_over("parquet-mr version 2147483648.2147483648.2147483648"); + ASSERT_EQ(std::numeric_limits::max(), just_over.version.major); + ASSERT_EQ(std::numeric_limits::max(), just_over.version.minor); + ASSERT_EQ(std::numeric_limits::max(), just_over.version.patch); +} + TEST(ApplicationVersion, FullWithSpaces) { ApplicationVersion version( " parquet-mr \t version \v 1.5.3ab-cdh5.5.0+cd \r (build \n abcd \f) ");