From 4ed5538dc89c01a5332c71a4ff79ea1a9fa23bb7 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 19 Dec 2025 12:54:21 -0800 Subject: [PATCH 1/2] PERF: Use PyArrow dictionary_encode for all Arrow merge keys Previously, only string Arrow types used the optimized PyArrow dictionary_encode() path for merge key factorization. Numeric Arrow types fell back to to_numpy() + hashtable factorization. This extends the PyArrow-native path to all Arrow dtypes, giving 5-25% speedup for int64 keys and 3-11% for float64 keys. --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f7fd4da2968a7..9ba5020e34d46 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2832,7 +2832,7 @@ def _factorize_keys( rk = ensure_int64(rk.codes) elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: - if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( + if isinstance(lk.dtype, ArrowDtype) or ( isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): import pyarrow as pa From bdc18aa3bf11b6c6cd4d78172b612b4b012ba9b7 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Sat, 20 Dec 2025 22:35:40 -0800 Subject: [PATCH 2/2] sort entry --- doc/source/whatsnew/v3.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7d65ca781d81e..8be96529c94db 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1091,6 +1091,7 @@ Performance improvements - Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) - Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`) - Performance improvement in :meth:`DataFrame.astype` when converting to extension floating 
dtypes, e.g. "Float64" (:issue:`60066`) +- Performance improvement in :meth:`DataFrame.merge` for Arrow-backed merge keys by using the PyArrow-native factorization path for all Arrow dtypes, not only strings (:issue:`63435`) - Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`) - Performance improvement in :meth:`DataFrame.to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) - Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`) @@ -1098,6 +1099,7 @@ Performance improvements - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`) + .. --------------------------------------------------------------------------- .. _whatsnew_300.bug_fixes: