From 08c39f36d3af50d4f6c423ee780ef2db06953ee6 Mon Sep 17 00:00:00 2001
From: "Ankit.Ahlawat@ibm.com"
Date: Fri, 22 May 2026 12:24:57 +0530
Subject: [PATCH 1/2] GH-50012[Python]: Fix list_ storage crashes when values exceed int32 offsets

---
 python/pyarrow/array.pxi                    | 10 ++-
 python/pyarrow/tests/test_extension_type.py | 75 +++++++++++++++++++++
 2 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ecdbb342d3e2..7c3b634ea6bc 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -401,7 +401,15 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
             result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)
 
     if extension_type is not None:
-        result = ExtensionArray.from_storage(extension_type, result)
+        if isinstance(result, ChunkedArray):
+            # Handle ChunkedArray case (e.g., when data overflows int32)
+            chunks = []
+            for chunk in result.chunks:
+                ext_chunk = ExtensionArray.from_storage(extension_type, chunk)
+                chunks.append(ext_chunk)
+            result = chunked_array(chunks, type=extension_type)
+        else:
+            result = ExtensionArray.from_storage(extension_type, result)
     return result
 
 
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index 465b556876b4..679370a55ea0 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -2120,3 +2120,78 @@ def test_json(storage_type, pickle_module):
                 pa.ArrowInvalid,
                 match=f"Invalid storage type for JsonExtensionType: {storage_type}"):
             pa.json_(storage_type)
+
+
+class ListExtensionType(pa.ExtensionType):
+    """Extension type with a list field for testing int32 overflow."""
+
+    def __init__(self):
+        super().__init__(
+            pa.struct({"data": pa.list_(pa.uint8())}),
+            "test_list_ext",
+        )
+
+    def __arrow_ext_serialize__(self):
+        return b""
+
+    @classmethod
+    def __arrow_ext_deserialize__(cls, storage_type, serialized):
+        return cls()
+
+
+@pytest.mark.large_memory
+def test_extension_type_list_overflow():
+    """
+    Test that extension types with list fields handle int32 offset overflow.
+    """
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Create data that exceeds int32 max cumulative values
+    # 5 rows × 500M values = 2.5B > int32 max (2,147,483,647)
+    arr = np.zeros(500_000_000, dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(5)]
+
+    result = pa.Table.from_pylist(rows, schema=schema)
+
+    assert result.num_rows == 5
+    assert result.num_columns == 1
+    assert result.schema[0].type == ListExtensionType()
+
+    col = result.column(0)
+    assert isinstance(col, pa.ChunkedArray)
+    assert col.type == ListExtensionType()
+
+    for chunk_idx in range(col.num_chunks):
+        chunk_data = col.chunk(chunk_idx)
+        assert chunk_data.type == ListExtensionType()
+
+
+def test_extension_type_no_overflow():
+    """Test that extension types work normally when there's no overflow."""
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        # Already registered
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Small data that won't overflow
+    arr = np.array([1, 2, 3], dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(3)]
+
+    result = pa.Table.from_pylist(rows, schema=schema)
+
+    assert result.num_rows == 3
+    assert result.num_columns == 1
+    assert result.schema[0].type == ListExtensionType()
+
+    # The column should be a ChunkedArray with a single chunk
+    col = result.column(0)
+    assert isinstance(col, pa.ChunkedArray)
+    assert col.type == ListExtensionType()
From c554be546fbf388548d4c34921b58b55c089fdaa Mon Sep 17 00:00:00 2001
From: "Ankit.Ahlawat@ibm.com"
Date: Fri, 22 May 2026 13:02:59 +0530
Subject: [PATCH 2/2] GH-50012[Python] Add NumPy decorator to extension type tests

---
 python/pyarrow/tests/test_extension_type.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index 679370a55ea0..e9387c7088cc 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -2140,6 +2140,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized):
 
 
 @pytest.mark.large_memory
+@pytest.mark.numpy
 def test_extension_type_list_overflow():
     """
     Test that extension types with list fields handle int32 offset overflow.
@@ -2171,6 +2172,7 @@ def test_extension_type_list_overflow():
         assert chunk_data.type == ListExtensionType()
 
 
+@pytest.mark.numpy
 def test_extension_type_no_overflow():
     """Test that extension types work normally when there's no overflow."""
     try: