From 0e1d5095b03910efa45f9bd6b94143f64fbee96b Mon Sep 17 00:00:00 2001 From: fotinosk Date: Fri, 3 Apr 2026 13:13:38 +0100 Subject: [PATCH 1/4] feat(dict converter): add conversion for large bin and string --- cpp/src/arrow/util/converter.h | 2 ++ python/pyarrow/src/arrow/python/python_to_arrow.cc | 2 +- python/pyarrow/tests/test_dict_array_converter.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 python/pyarrow/tests/test_dict_array_converter.py diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index c23d6ccd9886..d987bf3061fe 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -238,7 +238,9 @@ struct MakeConverterImpl { DICTIONARY_CASE(FloatType); DICTIONARY_CASE(DoubleType); DICTIONARY_CASE(BinaryType); + DICTIONARY_CASE(LargeBinaryType); DICTIONARY_CASE(StringType); + DICTIONARY_CASE(LargeStringType); DICTIONARY_CASE(FixedSizeBinaryType); #undef DICTIONARY_CASE default: diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index e7ce54abcd8f..081d67bf82f9 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -826,7 +826,7 @@ class PyDictionaryConverter> } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->value_type_, this->options_, value, view_)); - return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); + return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); } } diff --git a/python/pyarrow/tests/test_dict_array_converter.py b/python/pyarrow/tests/test_dict_array_converter.py new file mode 100644 index 000000000000..46611834dfe0 --- /dev/null +++ b/python/pyarrow/tests/test_dict_array_converter.py @@ -0,0 +1,12 @@ +import pytest + +import pyarrow as pa + + +def test_arrow_missing_function(): + + pa.array([], pa.dictionary(pa.int32(), pa.string())) + pa.array([], pa.dictionary(pa.int32(), pa.binary())) + + pa.array([], pa.dictionary(pa.int32(), pa.large_string())) + pa.array([], pa.dictionary(pa.int32(), pa.large_binary())) From 2067d769cf1a2cd730137f4d797b91d20bcf75d8 Mon Sep 17 00:00:00 2001 From: fotinosk Date: Fri, 3 Apr 2026 13:53:44 +0100 Subject: [PATCH 2/4] feat(dict converter): add conversion for large bin and string --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 081d67bf82f9..17441ffddddd 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -826,7 +827,17 @@ class PyDictionaryConverter> } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->value_type_, this->options_, value, view_)); - return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); + // return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); + + // if constexpr ( + // std::is_same::value || + // std::is_same::value) { + // return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); + // } else { + // return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); + // } + + return this->value_builder_->Append(std::string_view(view_.bytes, view_.size)); } } From 866c28ba78ef6a1c80efda242f886cc7ba723db5 Mon Sep 17 00:00:00 2001 From: fotinosk Date: Fri, 3 Apr 2026 14:10:38 +0100 Subject: [PATCH 3/4] cleanup --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 17441ffddddd..72cbb91712fa 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -827,16 +827,6 @@ class PyDictionaryConverter> } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->value_type_, this->options_, value, view_)); - // return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); - - // if constexpr ( - // std::is_same::value || - // std::is_same::value) { - // return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); - // } else { - // return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); - // } - return this->value_builder_->Append(std::string_view(view_.bytes, view_.size)); } } From f0f29ef7cfdb9cc36ac81bc10b9b7f3620dbec13 Mon Sep 17 00:00:00 2001 From: fotinosk Date: Fri, 3 Apr 2026 14:31:30 +0100 Subject: [PATCH 4/4] feat(dict converter): add tests --- python/pyarrow/tests/test_array.py | 16 ++++++++++++++++ .../pyarrow/tests/test_dict_array_converter.py | 12 ------------ 2 files changed, 16 insertions(+), 12 deletions(-) delete mode 100644 python/pyarrow/tests/test_dict_array_converter.py diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a103519dc5ac..1b727d5cf8b8 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -4468,3 +4468,19 @@ def test_dunders_checked_overflow(): arr ** pa.scalar(2, type=pa.int8()) with pytest.raises(pa.ArrowInvalid, match=error_match): arr / (-arr) + + +def test_dictionary_large_string_and_binary(): + # Test dictionary with large_string values + arr_str = pa.array( + ["a", "b", "a"], type=pa.dictionary(pa.int32(), pa.large_string()) + ) + assert arr_str.type.value_type == pa.large_string() + assert arr_str.to_pylist() == ["a", "b", "a"] + + # Test dictionary with large_binary values + arr_bin = pa.array( + [b"x", b"y", b"x"], type=pa.dictionary(pa.int32(), pa.large_binary()) + ) + assert arr_bin.type.value_type == pa.large_binary() + assert arr_bin.to_pylist() == [b"x", b"y", b"x"] diff --git a/python/pyarrow/tests/test_dict_array_converter.py b/python/pyarrow/tests/test_dict_array_converter.py deleted file mode 100644 index 46611834dfe0..000000000000 --- a/python/pyarrow/tests/test_dict_array_converter.py +++ /dev/null @@ -1,12 +0,0 @@ -import pytest - -import pyarrow as pa - - -def test_arrow_missing_function(): - - pa.array([], pa.dictionary(pa.int32(), pa.string())) - pa.array([], pa.dictionary(pa.int32(), pa.binary())) - - pa.array([], pa.dictionary(pa.int32(), pa.large_string())) - pa.array([], pa.dictionary(pa.int32(), pa.large_binary()))