Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,33 @@ TEST_F(TestArray, TestMakeArrayFromScalar) {
}
}

TEST_F(TestArray, TestMakeArrayFromScalarOffsetOverflow) {
  // Regression test for GH-36388. Repeating a variable-width scalar so often
  // that the concatenated data no longer fits behind 32-bit offsets has to be
  // reported as an Invalid status, not returned as an array whose offsets
  // have wrapped around to negative values.

  // 1 byte * 2^31 repetitions: one past the largest int32 offset.
  const int64_t one_byte_count = int64_t{1} << 31;
  auto one_byte_string = MakeScalar("x");
  ASSERT_RAISES(Invalid, MakeArrayFromScalar(*one_byte_string, one_byte_count));

  // 2 bytes * (2^30 + 1) repetitions: the byte total again exceeds INT32_MAX.
  const int64_t two_byte_count = (int64_t{1} << 30) + 1;
  auto two_byte_string = MakeScalar("xy");
  ASSERT_RAISES(Invalid, MakeArrayFromScalar(*two_byte_string, two_byte_count));

  // Binary scalars also use 32-bit offsets, so the same check applies:
  // 3 bytes * (INT32_MAX / 3 + 1) repetitions is just above the limit.
  const int64_t three_byte_count =
      int64_t{std::numeric_limits<int32_t>::max()} / 3 + 1;
  auto three_byte_binary =
      std::make_shared<BinaryScalar>(Buffer::FromString("abc"));
  ASSERT_RAISES(Invalid, MakeArrayFromScalar(*three_byte_binary, three_byte_count));

  // Large string uses 64-bit offsets and must keep working. Only a small
  // repetition count is exercised, since allocating 2^31 bytes in a unit
  // test is not practical.
  auto large_string = std::make_shared<LargeStringScalar>("x");
  ASSERT_OK_AND_ASSIGN(auto repeated, MakeArrayFromScalar(*large_string, 16));
  ASSERT_EQ(repeated->length(), 16);
}

TEST_F(TestArray, TestMakeArrayFromScalarSliced) {
// Regression test for ARROW-13437
auto scalars = GetScalars();
Expand Down
13 changes: 13 additions & 0 deletions cpp/src/arrow/array/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,19 @@ class RepeatedArrayFactory {

template <typename OffsetType>
Status CreateOffsetsBuffer(OffsetType value_length, std::shared_ptr<Buffer>* out) {
// Check that the total data size does not overflow the offset type.
// For 32-bit offset types (e.g. StringType, BinaryType), value_length * length_
// must fit in int32_t, otherwise the offsets wrap around and produce an invalid
// array with negative offsets.
if (value_length > 0 && length_ > 0) {
int64_t total_size = static_cast<int64_t>(value_length) * length_;
if (total_size > static_cast<int64_t>(std::numeric_limits<OffsetType>::max())) {
return Status::Invalid(
"Cannot create array: total data size (", total_size,
" bytes) would overflow the offset type. Consider using a large_* "
"type (e.g. large_string, large_binary) for data exceeding 2 GB.");
}
}
TypedBufferBuilder<OffsetType> builder(pool_);
RETURN_NOT_OK(builder.Resize(length_ + 1));
OffsetType offset = 0;
Expand Down
10 changes: 10 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,16 @@ def test_array_from_dictionary_scalar():
assert result.equals(expected)


def test_repeat_offset_overflow():
    # GH-36388: when the repeated data would need more bytes than a 32-bit
    # offset can address, pa.repeat must raise ArrowInvalid rather than hand
    # back an invalid array.
    overflowing_cases = (
        ("x", 2**31),       # 1 byte repeated 2**31 times
        ("xy", 2**30 + 1),  # 2 bytes repeated just over 2**30 times
    )
    for value, count in overflowing_cases:
        with pytest.raises(pa.ArrowInvalid, match="overflow"):
            pa.repeat(value, count)


def test_array_getitem():
arr = pa.array(range(10, 15))
lst = arr.to_pylist()
Expand Down