diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ecdbb342d3e..7c3b634ea6b 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -401,7 +401,16 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
             result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)
 
     if extension_type is not None:
-        result = ExtensionArray.from_storage(extension_type, result)
+        if isinstance(result, ChunkedArray):
+            # The converted storage may come back as a ChunkedArray (e.g.
+            # when the data overflows int32 offsets), so wrap each storage
+            # chunk individually.
+            result = chunked_array(
+                [ExtensionArray.from_storage(extension_type, chunk)
+                 for chunk in result.chunks],
+                type=extension_type)
+        else:
+            result = ExtensionArray.from_storage(extension_type, result)
     return result
 
 
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index 465b556876b..e9387c7088c 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -2120,3 +2120,89 @@ def test_json(storage_type, pickle_module):
                 pa.ArrowInvalid,
                 match=f"Invalid storage type for JsonExtensionType: {storage_type}"):
             pa.json_(storage_type)
+
+
+class ListExtensionType(pa.ExtensionType):
+    """Extension type with a list field for testing int32 overflow."""
+
+    def __init__(self):
+        # Storage is struct<data: list<uint8>>; the list child uses
+        # int32 offsets, which is what the overflow test relies on.
+        super().__init__(
+            pa.struct({"data": pa.list_(pa.uint8())}),
+            "test_list_ext",
+        )
+
+    def __arrow_ext_serialize__(self):
+        # The type has no parameters, so there is nothing to serialize
+        return b""
+
+    @classmethod
+    def __arrow_ext_deserialize__(cls, storage_type, serialized):
+        return cls()
+
+
+@pytest.mark.large_memory
+@pytest.mark.numpy
+def test_extension_type_list_overflow():
+    """
+    Test that extension types with list fields handle int32 offset overflow.
+    """
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        # Already registered
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Create data that exceeds int32 max cumulative values
+    # 5 rows × 500M values = 2.5B > int32 max (2,147,483,647)
+    arr = np.zeros(500_000_000, dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(5)]
+
+    result = pa.Table.from_pylist(rows, schema=schema)
+
+    assert result.num_rows == 5
+    assert result.num_columns == 1
+    assert result.schema[0].type == ListExtensionType()
+
+    col = result.column(0)
+    assert isinstance(col, pa.ChunkedArray)
+    assert col.type == ListExtensionType()
+    # The overflow must split the column into several chunks, otherwise
+    # the chunked code path under test is never exercised
+    assert col.num_chunks > 1
+
+    for chunk in col.chunks:
+        assert chunk.type == ListExtensionType()
+
+
+@pytest.mark.numpy
+def test_extension_type_no_overflow():
+    """Test that extension types work normally when there's no overflow."""
+    try:
+        pa.register_extension_type(ListExtensionType())
+    except pa.ArrowKeyError:
+        # Already registered
+        pass
+
+    schema = pa.schema({"col": ListExtensionType()})
+
+    # Small data that won't overflow
+    arr = np.array([1, 2, 3], dtype=np.uint8)
+    rows = [{"col": {"data": arr}} for _ in range(3)]
+
+    result = pa.Table.from_pylist(rows, schema=schema)
+
+    assert result.num_rows == 3
+    assert result.num_columns == 1
+    assert result.schema[0].type == ListExtensionType()
+
+    # The column should be a ChunkedArray with a single chunk
+    col = result.column(0)
+    assert isinstance(col, pa.ChunkedArray)
+    assert col.type == ListExtensionType()
+    assert col.num_chunks == 1
+    # The storage values must survive the round trip unchanged
+    assert col.chunk(0).storage.to_pylist() == [{"data": [1, 2, 3]}] * 3