Fix error value_counts result with pyarrow categorical columns #60949


Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -782,6 +782,7 @@ Sparse
 
 ExtensionArray
 ^^^^^^^^^^^^^^
+- Bug in :class:`Categorical` when constructing with an :class:`Index` with :class:`ArrowDtype` (:issue:`60563`)
 - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
 - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
 - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`)
7 changes: 6 additions & 1 deletion pandas/core/arrays/categorical.py
@@ -447,7 +447,12 @@ def __init__(
 if isinstance(values.dtype, ArrowDtype) and issubclass(
     values.dtype.type, CategoricalDtypeType
 ):
-    arr = values._pa_array.combine_chunks()
+    from pandas import Index
+
+    if isinstance(values, Index):
+        arr = values._data._pa_array.combine_chunks()
+    else:
+        arr = values._pa_array.combine_chunks()
Comment on lines +450 to +455
Member

Suggested change
-from pandas import Index
-if isinstance(values, Index):
-    arr = values._data._pa_array.combine_chunks()
-else:
-    arr = values._pa_array.combine_chunks()
+arr = values.array._pa_array.combine_chunks()

Contributor Author

@chilin0525 Feb 19, 2025

@mroeschke Thanks for the review. After applying this change, 2 test cases in pandas/tests/extension/test_arrow.py fail 🤔 (tested locally):

Below are the error messages from the 2 failing test cases; values.array raises an AttributeError when values is an ArrowExtensionArray object:

  • test_sort_values_dictionary:

        [1/1] Generating write_version_file with a custom command
    + /usr/local/bin/ninja
    ============================= test session starts ==============================
    platform linux -- Python 3.10.8, pytest-8.3.4, pluggy-1.5.0
    PyQt5 5.15.11 -- Qt runtime 5.15.16 -- Qt compiled 5.15.14
    rootdir: /home/pandas
    configfile: pyproject.toml
    plugins: xdist-3.6.1, hypothesis-6.125.3, qt-4.4.0, anyio-4.8.0, localserver-0.9.0.post0, cython-0.3.1, cov-6.0.0
    collected 26414 items / 26413 deselected / 1 selected
    
    pandas/tests/extension/test_arrow.py F
    
    =================================== FAILURES ===================================
    _________________________ test_sort_values_dictionary __________________________
    
        def test_sort_values_dictionary():
            df = pd.DataFrame(
                {
                    "a": pd.Series(
                        ["x", "y"], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.string()))
                    ),
                    "b": [1, 2],
                },
            )
            expected = df.copy()
    >       result = df.sort_values(by=["a", "b"])
    
    pandas/tests/extension/test_arrow.py:1699: 
    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
    pandas/core/frame.py:7077: in sort_values
        indexer = lexsort_indexer(
    pandas/core/sorting.py:350: in lexsort_indexer
        cat = Categorical(k, ordered=True)
    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
    
    self = <[AttributeError("'NoneType' object has no attribute 'categories'") raised in repr()] Categorical object at 0x7f5aa42d8a90>
    values = <ArrowExtensionArray>
    ['x', 'y']
    Length: 2, dtype: dictionary<values=string, indices=int32, ordered=0>[pyarrow]
    categories = None, ordered = True
    dtype = CategoricalDtype(categories=None, ordered=True, categories_dtype=None)
    copy = True
    
        def __init__(
            self,
            values,
            categories=None,
            ordered=None,
            dtype: Dtype | None = None,
            copy: bool = True,
        ) -> None:
            dtype = CategoricalDtype._from_values_or_dtype(
                values, categories, ordered, dtype
            )
            # At this point, dtype is always a CategoricalDtype, but
            # we may have dtype.categories be None, and we need to
            # infer categories in a factorization step further below
        
            if not is_list_like(values):
                # GH#38433
                raise TypeError("Categorical input must be list-like")
        
            # null_mask indicates missing values we want to exclude from inference.
            # This means: only missing values in list-likes (not arrays/ndframes).
            null_mask = np.array(False)
        
            # sanitize input
            vdtype = getattr(values, "dtype", None)
            if isinstance(vdtype, CategoricalDtype):
                if dtype.categories is None:
                    dtype = CategoricalDtype(values.categories, dtype.ordered)
            elif isinstance(values, range):
                from pandas.core.indexes.range import RangeIndex
        
                values = RangeIndex(values)
            elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
                values = com.convert_to_list_like(values)
                if isinstance(values, list) and len(values) == 0:
                    # By convention, empty lists result in object dtype:
                    values = np.array([], dtype=object)
                elif isinstance(values, np.ndarray):
                    if values.ndim > 1:
                        # preempt sanitize_array from raising ValueError
                        raise NotImplementedError(
                            "> 1 ndim Categorical are not supported at this time"
                        )
                    values = sanitize_array(values, None)
                else:
                    # i.e. must be a list
                    arr = sanitize_array(values, None)
                    null_mask = isna(arr)
                    if null_mask.any():
                        # We remove null values here, then below will re-insert
                        #  them, grep "full_codes"
                        arr_list = [values[idx] for idx in np.where(~null_mask)[0]]
        
                        # GH#44900 Do not cast to float if we have only missing values
                        if arr_list or arr.dtype == "object":
                            sanitize_dtype = None
                        else:
                            sanitize_dtype = arr.dtype
        
                        arr = sanitize_array(arr_list, None, dtype=sanitize_dtype)
                    values = arr
        
            if dtype.categories is None:
                if isinstance(values.dtype, ArrowDtype) and issubclass(
                    values.dtype.type, CategoricalDtypeType
                ):
                    # from pandas import Index
        
                    # if isinstance(values, Index):
                    #     arr = values._data._pa_array.combine_chunks()
                    # else:
                    #     arr = values._pa_array.combine_chunks()
    >               arr = values.array._pa_array.combine_chunks()
    E               AttributeError: 'ArrowExtensionArray' object has no attribute 'array'
    
    pandas/core/arrays/categorical.py:456: AttributeError
    ---------------- generated xml file: /home/pandas/test-data.xml ----------------
    ============================= slowest 30 durations =============================
    
    (3 durations < 0.005s hidden.  Use -vv to show these durations.)
    =========================== short test summary info ============================
    FAILED pandas/tests/extension/test_arrow.py::test_sort_values_dictionary - At...
    ===================== 1 failed, 26413 deselected in 1.61s ======================
    
  • test_dictionary_astype_categorical:

        [1/1] Generating write_version_file with a custom command
    + /usr/local/bin/ninja
    ============================= test session starts ==============================
    platform linux -- Python 3.10.8, pytest-8.3.4, pluggy-1.5.0
    PyQt5 5.15.11 -- Qt runtime 5.15.16 -- Qt compiled 5.15.14
    rootdir: /home/pandas
    configfile: pyproject.toml
    plugins: xdist-3.6.1, hypothesis-6.125.3, qt-4.4.0, anyio-4.8.0, localserver-0.9.0.post0, cython-0.3.1, cov-6.0.0
    collected 26414 items / 26413 deselected / 1 selected
    
    pandas/tests/extension/test_arrow.py F
    
    =================================== FAILURES ===================================
    ______________________ test_dictionary_astype_categorical ______________________
    
        def test_dictionary_astype_categorical():
            # GH#56672
            arrs = [
                pa.array(np.array(["a", "x", "c", "a"])).dictionary_encode(),
                pa.array(np.array(["a", "d", "c"])).dictionary_encode(),
            ]
            ser = pd.Series(ArrowExtensionArray(pa.chunked_array(arrs)))
    >       result = ser.astype("category")
    
    pandas/tests/extension/test_arrow.py:3339: 
    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
    pandas/core/generic.py:6435: in astype
        new_data = self._mgr.astype(dtype=dtype, errors=errors)
    pandas/core/internals/managers.py:588: in astype
        return self.apply("astype", dtype=dtype, errors=errors)
    pandas/core/internals/managers.py:438: in apply
        applied = getattr(b, f)(**kwargs)
    pandas/core/internals/blocks.py:610: in astype
        new_values = astype_array_safe(values, dtype, errors=errors)
    pandas/core/dtypes/astype.py:234: in astype_array_safe
        new_values = astype_array(values, dtype, copy=copy)
    pandas/core/dtypes/astype.py:176: in astype_array
        values = values.astype(dtype, copy=copy)
    pandas/core/arrays/base.py:769: in astype
        return cls._from_sequence(self, dtype=dtype, copy=copy)
    pandas/core/arrays/categorical.py:531: in _from_sequence
        return cls(scalars, dtype=dtype, copy=copy)
    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
    
    self = <[AttributeError("'NoneType' object has no attribute 'categories'") raised in repr()] Categorical object at 0x7feb7f621940>
    values = <ArrowExtensionArray>
    ['a', 'x', 'c', 'a', 'a', 'd', 'c']
    Length: 7, dtype: dictionary<values=string, indices=int32, ordered=0>[pyarrow]
    categories = None, ordered = None
    dtype = CategoricalDtype(categories=None, ordered=None, categories_dtype=None)
    copy = False
    
        def __init__(
            self,
            values,
            categories=None,
            ordered=None,
            dtype: Dtype | None = None,
            copy: bool = True,
        ) -> None:
            dtype = CategoricalDtype._from_values_or_dtype(
                values, categories, ordered, dtype
            )
            # At this point, dtype is always a CategoricalDtype, but
            # we may have dtype.categories be None, and we need to
            # infer categories in a factorization step further below
        
            if not is_list_like(values):
                # GH#38433
                raise TypeError("Categorical input must be list-like")
        
            # null_mask indicates missing values we want to exclude from inference.
            # This means: only missing values in list-likes (not arrays/ndframes).
            null_mask = np.array(False)
        
            # sanitize input
            vdtype = getattr(values, "dtype", None)
            if isinstance(vdtype, CategoricalDtype):
                if dtype.categories is None:
                    dtype = CategoricalDtype(values.categories, dtype.ordered)
            elif isinstance(values, range):
                from pandas.core.indexes.range import RangeIndex
        
                values = RangeIndex(values)
            elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
                values = com.convert_to_list_like(values)
                if isinstance(values, list) and len(values) == 0:
                    # By convention, empty lists result in object dtype:
                    values = np.array([], dtype=object)
                elif isinstance(values, np.ndarray):
                    if values.ndim > 1:
                        # preempt sanitize_array from raising ValueError
                        raise NotImplementedError(
                            "> 1 ndim Categorical are not supported at this time"
                        )
                    values = sanitize_array(values, None)
                else:
                    # i.e. must be a list
                    arr = sanitize_array(values, None)
                    null_mask = isna(arr)
                    if null_mask.any():
                        # We remove null values here, then below will re-insert
                        #  them, grep "full_codes"
                        arr_list = [values[idx] for idx in np.where(~null_mask)[0]]
        
                        # GH#44900 Do not cast to float if we have only missing values
                        if arr_list or arr.dtype == "object":
                            sanitize_dtype = None
                        else:
                            sanitize_dtype = arr.dtype
        
                        arr = sanitize_array(arr_list, None, dtype=sanitize_dtype)
                    values = arr
        
            if dtype.categories is None:
                if isinstance(values.dtype, ArrowDtype) and issubclass(
                    values.dtype.type, CategoricalDtypeType
                ):
                    # from pandas import Index
        
                    # if isinstance(values, Index):
                    #     arr = values._data._pa_array.combine_chunks()
                    # else:
                    #     arr = values._pa_array.combine_chunks()
    >               arr = values.array._pa_array.combine_chunks()
    E               AttributeError: 'ArrowExtensionArray' object has no attribute 'array'
    
    pandas/core/arrays/categorical.py:456: AttributeError
    ---------------- generated xml file: /home/pandas/test-data.xml ----------------
    ============================= slowest 30 durations =============================
    
    (3 durations < 0.005s hidden.  Use -vv to show these durations.)
    =========================== short test summary info ============================
    FAILED pandas/tests/extension/test_arrow.py::test_dictionary_astype_categorical
    ===================== 1 failed, 26413 deselected in 1.67s ======================
    

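For reference, a minimal standalone snippet (not the pandas internals; the variable names here are only for illustration) showing why values.array only exists while the values are still wrapped in an Index or Series:

    import pandas as pd
    import pyarrow as pa
    from pandas.arrays import ArrowExtensionArray

    dict_dtype = pd.ArrowDtype(pa.dictionary(pa.int32(), pa.string()))

    # An Index wrapping the Arrow dictionary data exposes .array ...
    idx = pd.Index(["x", "y"], dtype=dict_dtype)
    print(hasattr(idx, "array"))  # True -> idx.array is the underlying ArrowExtensionArray

    # ... but the bare ArrowExtensionArray does not, hence the AttributeError above.
    arr = ArrowExtensionArray(pa.array(["x", "y"]).dictionary_encode())
    print(hasattr(arr, "array"))  # False
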
Member

Ah ok. It appears something is passing in an ArrowExtensionArray here already. OK what you have here is fine then

     categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype)
     codes = arr.indices.to_numpy()
     dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
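For context, a rough standalone sketch (public pyarrow/pandas APIs only, separate from the internals shown above) of the dictionary-to-Categorical decoding these lines perform:

    import pandas as pd
    import pyarrow as pa

    # Dictionary-encode a small string array: values are stored once, rows become integer codes.
    pa_arr = pa.array(["a1", "a2", "a1"]).dictionary_encode()

    categories = pa_arr.dictionary.to_pandas(types_mapper=pd.ArrowDtype)  # Arrow-backed categories
    codes = pa_arr.indices.to_numpy()                                     # integer codes, e.g. [0, 1, 0]
    cat = pd.Categorical.from_codes(codes, categories=categories)

    print(cat)  # ['a1', 'a2', 'a1'], categories backed by string[pyarrow]
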
17 changes: 17 additions & 0 deletions pandas/tests/extension/test_arrow.py
@@ -3511,3 +3511,20 @@ def test_map_numeric_na_action():
     result = ser.map(lambda x: 42, na_action="ignore")
     expected = pd.Series([42.0, 42.0, np.nan], dtype="float64")
     tm.assert_series_equal(result, expected)
+
+
+def test_categorical_from_arrow_dictionary():
+    # GH 60563
+    df = pd.DataFrame(
+        {"A": ["a1", "a2"]}, dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8()))
+    )
+    result = df.value_counts(dropna=False)
+    expected = pd.Series(
+        [1, 1],
+        index=pd.MultiIndex.from_arrays(
+            [pd.Index(["a1", "a2"], dtype=ArrowDtype(pa.string()), name="A")]
+        ),
+        name="count",
+        dtype="int64",
+    )
+    tm.assert_series_equal(result, expected)
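
A minimal usage sketch of the underlying fix this test exercises (the GH 60563 whatsnew entry above: constructing a Categorical from an Index with ArrowDtype); the dtype noted in the comment is an assumption based on the test's expected index dtype:

    import pandas as pd
    import pyarrow as pa

    idx = pd.Index(["a1", "a2"], dtype=pd.ArrowDtype(pa.dictionary(pa.int32(), pa.utf8())))
    cat = pd.Categorical(idx)    # previously hit the buggy construction path
    print(cat.categories.dtype)  # expected: string[pyarrow]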