From 6c131e738355fc377bbc5b6a0f9560906698da09 Mon Sep 17 00:00:00 2001 From: Sriniketh24 Date: Sat, 23 May 2026 08:08:30 +0530 Subject: [PATCH] GH-36388: [C++][Python] Return error from MakeArrayFromScalar on offset overflow MakeArrayFromScalar silently created an invalid array with negative offsets when the total data size (value_size * repetition_count) exceeded the maximum value of the offset type. For 32-bit offset types like StringType and BinaryType, this threshold is INT32_MAX (~2 GB). The root cause was in CreateOffsetsBuffer where the running offset accumulated via OffsetType addition without checking for overflow, wrapping around to negative values. Added an early overflow check in CreateOffsetsBuffer that computes the total size in int64_t and compares against the offset type's maximum. On overflow, a Status::Invalid error is returned with a message suggesting the use of large_* types. This is AI-assisted work by Claude. --- cpp/src/arrow/array/array_test.cc | 27 +++++++++++++++++++++++++++ cpp/src/arrow/array/util.cc | 13 +++++++++++++ python/pyarrow/tests/test_array.py | 10 ++++++++++ 3 files changed, 50 insertions(+) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 64ea3fd71a73..fecbf79aa03e 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -767,6 +767,33 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { } } +TEST_F(TestArray, TestMakeArrayFromScalarOffsetOverflow) { + // Regression test for GH-36388: MakeArrayFromScalar should return an error + // when the total data size would overflow 32-bit offsets instead of silently + // producing an invalid array with negative offsets. + + // A single-byte string repeated 2^31 times overflows int32 offsets + auto scalar = MakeScalar("x"); + int64_t length = static_cast(1) << 31; + ASSERT_RAISES(Invalid, MakeArrayFromScalar(*scalar, length)); + + // A two-byte string repeated just over INT32_MAX/2 times also overflows + auto scalar2 = MakeScalar("xy"); + int64_t length2 = (static_cast(1) << 30) + 1; + ASSERT_RAISES(Invalid, MakeArrayFromScalar(*scalar2, length2)); + + // Binary type has the same issue + auto bin_scalar = std::make_shared(Buffer::FromString("abc")); + int64_t length3 = (static_cast(std::numeric_limits::max()) / 3) + 1; + ASSERT_RAISES(Invalid, MakeArrayFromScalar(*bin_scalar, length3)); + + // Large string type should NOT overflow (uses 64-bit offsets) + auto large_scalar = std::make_shared("x"); + // Just verify it doesn't raise for a small count (we can't allocate 2^31 bytes here) + ASSERT_OK_AND_ASSIGN(auto arr, MakeArrayFromScalar(*large_scalar, 16)); + ASSERT_EQ(arr->length(), 16); +} + TEST_F(TestArray, TestMakeArrayFromScalarSliced) { // Regression test for ARROW-13437 auto scalars = GetScalars(); diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 1c19bd5a5468..91b5819a750f 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -853,6 +853,19 @@ class RepeatedArrayFactory { template Status CreateOffsetsBuffer(OffsetType value_length, std::shared_ptr* out) { + // Check that the total data size does not overflow the offset type. + // For 32-bit offset types (e.g. StringType, BinaryType), value_length * length_ + // must fit in int32_t, otherwise the offsets wrap around and produce an invalid + // array with negative offsets. + if (value_length > 0 && length_ > 0) { + int64_t total_size = static_cast(value_length) * length_; + if (total_size > static_cast(std::numeric_limits::max())) { + return Status::Invalid( + "Cannot create array: total data size (", total_size, + " bytes) would overflow the offset type. Consider using a large_* " + "type (e.g. large_string, large_binary) for data exceeding 2 GB."); + } + } TypedBufferBuilder builder(pool_); RETURN_NOT_OK(builder.Resize(length_ + 1)); OffsetType offset = 0; diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a103519dc5ac..ba1c486e0c67 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -442,6 +442,16 @@ def test_array_from_dictionary_scalar(): assert result.equals(expected) +def test_repeat_offset_overflow(): + # GH-36388: pa.repeat should raise an error when the total data size + # would overflow 32-bit offsets, instead of returning an invalid array. + with pytest.raises(pa.ArrowInvalid, match="overflow"): + pa.repeat("x", 2**31) + + with pytest.raises(pa.ArrowInvalid, match="overflow"): + pa.repeat("xy", 2**30 + 1) + + def test_array_getitem(): arr = pa.array(range(10, 15)) lst = arr.to_pylist()