diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index 64ea3fd71a73..fecbf79aa03e 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -767,6 +767,33 @@ TEST_F(TestArray, TestMakeArrayFromScalar) {
   }
 }
 
+TEST_F(TestArray, TestMakeArrayFromScalarOffsetOverflow) {
+  // Regression test for GH-36388: MakeArrayFromScalar should return an error
+  // when the total data size would overflow 32-bit offsets instead of silently
+  // producing an invalid array with negative offsets.
+
+  // A single-byte string repeated 2^31 times overflows int32 offsets
+  auto scalar = MakeScalar("x");
+  int64_t length = static_cast<int64_t>(1) << 31;
+  ASSERT_RAISES(Invalid, MakeArrayFromScalar(*scalar, length));
+
+  // A two-byte string repeated just over INT32_MAX/2 times also overflows
+  auto scalar2 = MakeScalar("xy");
+  int64_t length2 = (static_cast<int64_t>(1) << 30) + 1;
+  ASSERT_RAISES(Invalid, MakeArrayFromScalar(*scalar2, length2));
+
+  // Binary type has the same issue
+  auto bin_scalar = std::make_shared<BinaryScalar>(Buffer::FromString("abc"));
+  int64_t length3 = (static_cast<int64_t>(std::numeric_limits<int32_t>::max()) / 3) + 1;
+  ASSERT_RAISES(Invalid, MakeArrayFromScalar(*bin_scalar, length3));
+
+  // Large string type should NOT overflow (uses 64-bit offsets)
+  auto large_scalar = std::make_shared<LargeStringScalar>("x");
+  // Just verify it doesn't raise for a small count (we can't allocate 2^31 bytes here)
+  ASSERT_OK_AND_ASSIGN(auto arr, MakeArrayFromScalar(*large_scalar, 16));
+  ASSERT_EQ(arr->length(), 16);
+}
+
 TEST_F(TestArray, TestMakeArrayFromScalarSliced) {
   // Regression test for ARROW-13437
   auto scalars = GetScalars();
diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc
index 1c19bd5a5468..91b5819a750f 100644
--- a/cpp/src/arrow/array/util.cc
+++ b/cpp/src/arrow/array/util.cc
@@ -853,6 +853,22 @@ class RepeatedArrayFactory {
 
   template <typename OffsetType>
   Status CreateOffsetsBuffer(OffsetType value_length, std::shared_ptr<Buffer>* out) {
+    // Reject repeats whose total data size cannot be represented by OffsetType.
+    // For 32-bit offset types (e.g. StringType, BinaryType) the offsets would
+    // otherwise wrap around and produce an invalid array with negative offsets.
+    // The bound is checked by division because value_length * length_ can itself
+    // overflow int64_t (undefined behavior) for large inputs.
+    if (value_length > 0 && length_ > 0) {
+      const int64_t max_length =
+          static_cast<int64_t>(std::numeric_limits<OffsetType>::max() / value_length);
+      if (length_ > max_length) {
+        return Status::Invalid(
+            "Cannot create array: total data size (", length_, " values of ",
+            value_length,
+            " bytes each) would overflow the offset type. Consider using a large_* "
+            "type (e.g. large_string, large_binary) for data exceeding 2 GB.");
+      }
+    }
     TypedBufferBuilder<OffsetType> builder(pool_);
     RETURN_NOT_OK(builder.Resize(length_ + 1));
     OffsetType offset = 0;
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index a103519dc5ac..ba1c486e0c67 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -442,6 +442,16 @@ def test_array_from_dictionary_scalar():
     assert result.equals(expected)
 
 
+def test_repeat_offset_overflow():
+    # GH-36388: pa.repeat should raise an error when the total data size
+    # would overflow 32-bit offsets, instead of returning an invalid array.
+    with pytest.raises(pa.ArrowInvalid, match="overflow"):
+        pa.repeat("x", 2**31)
+
+    with pytest.raises(pa.ArrowInvalid, match="overflow"):
+        pa.repeat("xy", 2**30 + 1)
+
+
 def test_array_getitem():
     arr = pa.array(range(10, 15))
     lst = arr.to_pylist()