Skip to content

Commit 01024ef

Browse files
committed
Implement DataFrame.__array_ufunc__
For some cases, this will preserve extension types of arrays by calling the ufunc blockwise.

```python
In [1]: import pandas as pd; import numpy as np

In [2]: df = pd.DataFrame({"A": pd.array([0, 1], dtype="Sparse")})

In [3]: np.sin(df).dtypes
Out[3]:
A    Sparse[float64, nan]
dtype: object
```

We don't currently handle the multi-input case well (aside from ufuncs that are implemented as dunder ops like `np.add`). For these, we fall back to the old implementation of converting to an ndarray.
1 parent da3a2d3 commit 01024ef

File tree

7 files changed

+272
-79
lines changed

7 files changed

+272
-79
lines changed

doc/source/whatsnew/v1.2.0.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,8 @@ Other enhancements
189189
- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
190190
- :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
191191
- Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
192+
- Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the extension types when possible (:issue:`23743`).
193+
- Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`).
192194
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
193195

194196
.. _whatsnew_120.api_breaking.python:
@@ -289,6 +291,7 @@ Deprecations
289291
- Deprecated :meth:`Index.is_all_dates` (:issue:`27744`)
290292
- Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`)
291293
- :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`)
294+
- Using "outer" ufuncs on DataFrames to return a 4d ndarray is now deprecated. Convert to an ndarray first (:issue:`23743`)
292295
- :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`)
293296

294297
.. ---------------------------------------------------------------------------

pandas/core/frame.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -556,6 +556,10 @@ def __init__(
556556

557557
NDFrame.__init__(self, mgr)
558558

559+
# ----------------------------------------------------------------------
560+
# Array interface
561+
_HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
562+
559563
# ----------------------------------------------------------------------
560564

561565
@property

pandas/core/generic.py

Lines changed: 98 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,11 @@
8787
from pandas.core.dtypes.missing import isna, notna
8888

8989
import pandas as pd
90-
from pandas.core import missing, nanops
90+
from pandas.core import missing, nanops, ops
9191
import pandas.core.algorithms as algos
9292
from pandas.core.base import PandasObject, SelectionMixin
9393
import pandas.core.common as com
94-
from pandas.core.construction import create_series_with_explicit_dtype
94+
from pandas.core.construction import create_series_with_explicit_dtype, extract_array
9595
from pandas.core.flags import Flags
9696
from pandas.core.indexes import base as ibase
9797
from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index
@@ -1912,6 +1912,102 @@ def __array_wrap__(
19121912
self, method="__array_wrap__"
19131913
)
19141914

1915+
@ops.defer_or_dispatch_ufunc
def __array_ufunc__(
    self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
):
    """
    Apply a NumPy ufunc to this object and rebuild a pandas result.

    Parameters
    ----------
    ufunc : callable
        The ufunc being applied.
    method : str
        Name of the attribute of `ufunc` to invoke, e.g. ``"__call__"``
        or ``"outer"``.
    *inputs
        Positional inputs to the ufunc. Inputs whose type is an
        ``NDFrame`` subclass are aligned to the union of their axes
        before the ufunc is evaluated.
    **kwargs
        Keyword arguments forwarded to ``getattr(ufunc, method)``.

    Returns
    -------
    object
        Scalars are returned unchanged. A result whose ``ndim`` differs
        from ``self.ndim`` is returned as produced by the ufunc. Any other
        result is wrapped with ``self._constructor``. When
        ``ufunc.nout > 1`` a tuple of such values is returned.

    Raises
    ------
    NotImplementedError
        If more than one input is an ``NDFrame`` and the inputs are not
        all of the same type, or if ``method == "outer"`` yields a result
        of different dimensionality for a one-dimensional object.

    Warns
    -----
    FutureWarning
        If ``method == "outer"`` yields a result of different
        dimensionality for a two-dimensional object.
    """
    # TODO: check handling of the "outer" method.
    # align all the inputs.
    types = tuple(type(x) for x in inputs)
    alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)]

    if len(alignable) > 1:
        # This triggers alignment.
        # At the moment, there aren't any ufuncs with more than two inputs
        # so this ends up just being x1.index | x2.index, but we write
        # it to handle *args.

        if len(set(types)) > 1:
            # We currently don't handle ufunc(DataFrame, Series)
            # well. Previously this raised an internal ValueError. We might
            # support it someday, so raise a NotImplementedError.
            raise NotImplementedError(
                "Cannot apply ufunc {} to mixed DataFrame and Series "
                "inputs.".format(ufunc)
            )
        # Work on our own list: the entries are replaced in place below.
        axes = list(self.axes)
        for obj in alignable[1:]:
            # this relies on the fact that we aren't handling mixed
            # series / frame ufuncs.
            for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)):
                axes[i] = ax1 | ax2

        reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes))
        inputs = tuple(
            x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
            for x, t in zip(inputs, types)
        )
    else:
        reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))

    if self.ndim == 1:
        # Keep the name only when every named input agrees on it.
        names = [x.name for x in inputs if hasattr(x, "name")]
        name = names[0] if len(set(names)) == 1 else None
        reconstruct_kwargs = {"name": name}
    else:
        reconstruct_kwargs = {}

    def reconstruct(result):
        if lib.is_scalar(result):
            return result
        if result.ndim != self.ndim:
            if method == "outer":
                if self.ndim == 2:
                    # we already deprecated for Series
                    msg = (
                        "outer method for ufunc {} is not implemented on "
                        "pandas objects. Returning an ndarray, but in the "
                        "future this will raise a 'NotImplementedError'. "
                        "Consider explicitly converting the DataFrame "
                        "to an array with '.to_numpy()' first."
                    )
                    warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4)
                    return result
                raise NotImplementedError
            return result
        if isinstance(result, BlockManager):
            # we went through BlockManager.apply
            return self._constructor(result, **reconstruct_kwargs, copy=False)
        # we converted an array, lost our axes
        return self._constructor(
            result, **reconstruct_axes, **reconstruct_kwargs, copy=False
        )

    func = getattr(ufunc, method)

    if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1):
        # Just give up on preserving types in the complex case.
        # In theory we could preserve them for them.
        # * nout>1 is doable if BlockManager.apply took nout and
        #   returned a Tuple[BlockManager].
        # * len(inputs) > 1 is doable when we know that we have
        #   aligned blocks / dtypes.
        inputs = tuple(np.asarray(x) for x in inputs)
        # kwargs must be forwarded, otherwise e.g. ``where=`` is ignored.
        result = func(*inputs, **kwargs)
    elif self.ndim == 1:
        # ufunc(series, ...)
        inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
        result = func(*inputs, **kwargs)
    elif method == "__call__" and not kwargs:
        # ufunc(dataframe): plain elementwise call, safe to run per block.
        mgr = inputs[0]._mgr
        result = mgr.apply(func)
    else:
        # Other ufunc methods (e.g. ``accumulate``) and keyword arguments
        # (e.g. ``axis``) refer to the whole 2-D array, so they cannot be
        # evaluated block by block; fall back to a single ndarray call.
        result = func(np.asarray(inputs[0]), **kwargs)

    if ufunc.nout > 1:
        return tuple(reconstruct(x) for x in result)
    return reconstruct(result)
2010+
19152011
# ideally we would define this to avoid the getattr checks, but
19162012
# is slower
19172013
# @property

pandas/core/ops/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,10 @@
2727
logical_op,
2828
)
2929
from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401
30-
from pandas.core.ops.common import unpack_zerodim_and_defer
30+
from pandas.core.ops.common import ( # noqa:F401
31+
defer_or_dispatch_ufunc,
32+
unpack_zerodim_and_defer,
33+
)
3134
from pandas.core.ops.docstrings import (
3235
_arith_doc_FRAME,
3336
_flex_comp_doc_FRAME,

pandas/core/ops/common.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
Boilerplate functions used in defining binary operations.
33
"""
44
from functools import wraps
5-
from typing import Callable
5+
from typing import Any, Callable
6+
7+
import numpy as np
68

79
from pandas._libs.lib import item_from_zerodim
10+
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
811
from pandas._typing import F
912

1013
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
@@ -65,3 +68,51 @@ def new_method(self, other):
6568
return method(self, other)
6669

6770
return new_method
71+
72+
73+
def defer_or_dispatch_ufunc(meth: Callable) -> Callable:
    """
    Boilerplate for ``__array_ufunc__`` implementations.

    The returned wrapper runs two checks before calling `meth`:

    * ``maybe_dispatch_ufunc_to_dunder_op`` is tried first; if it returns
      anything other than ``NotImplemented``, that value is returned.
    * ``NotImplemented`` is returned if any input has an
      ``__array_priority__`` greater than that of ``self``, or has an
      ``__array_ufunc__`` that is neither ``np.ndarray``'s nor the one of
      ``type(self)`` while not being an instance of ``self._HANDLED_TYPES``.

    Parameters
    ----------
    meth : callable
        ``__array_ufunc__`` implementation, called as
        ``meth(self, ufunc, method, *inputs, **kwargs)``.

    Returns
    -------
    callable
        Wrapper around `meth` accepting the same arguments.
    """

    @wraps(meth)
    def new_method(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any):
        cls = type(self)

        # for binary ops, use our custom dunder methods
        result = maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        # Determine if we should defer.
        # Implementations listed here never cause a deferral.
        no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__)

        for item in inputs:
            higher_priority = (
                hasattr(item, "__array_priority__")
                and item.__array_priority__ > self.__array_priority__
            )
            has_array_ufunc = (
                hasattr(item, "__array_ufunc__")
                and type(item).__array_ufunc__ not in no_defer
                and not isinstance(item, self._HANDLED_TYPES)
            )
            if higher_priority or has_array_ufunc:
                return NotImplemented

        return meth(self, ufunc, method, *inputs, **kwargs)

    return new_method

pandas/core/series.py

Lines changed: 0 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -682,81 +682,6 @@ def view(self, dtype=None) -> "Series":
682682
# NDArray Compat
683683
_HANDLED_TYPES = (Index, ExtensionArray, np.ndarray)
684684

685-
def __array_ufunc__(
686-
self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
687-
):
688-
# TODO: handle DataFrame
689-
cls = type(self)
690-
691-
# for binary ops, use our custom dunder methods
692-
result = ops.maybe_dispatch_ufunc_to_dunder_op(
693-
self, ufunc, method, *inputs, **kwargs
694-
)
695-
if result is not NotImplemented:
696-
return result
697-
698-
# Determine if we should defer.
699-
no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__)
700-
701-
for item in inputs:
702-
higher_priority = (
703-
hasattr(item, "__array_priority__")
704-
and item.__array_priority__ > self.__array_priority__
705-
)
706-
has_array_ufunc = (
707-
hasattr(item, "__array_ufunc__")
708-
and type(item).__array_ufunc__ not in no_defer
709-
and not isinstance(item, self._HANDLED_TYPES)
710-
)
711-
if higher_priority or has_array_ufunc:
712-
return NotImplemented
713-
714-
# align all the inputs.
715-
names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
716-
types = tuple(type(x) for x in inputs)
717-
# TODO: dataframe
718-
alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)]
719-
720-
if len(alignable) > 1:
721-
# This triggers alignment.
722-
# At the moment, there aren't any ufuncs with more than two inputs
723-
# so this ends up just being x1.index | x2.index, but we write
724-
# it to handle *args.
725-
index = alignable[0].index
726-
for s in alignable[1:]:
727-
index |= s.index
728-
inputs = tuple(
729-
x.reindex(index) if issubclass(t, Series) else x
730-
for x, t in zip(inputs, types)
731-
)
732-
else:
733-
index = self.index
734-
735-
inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
736-
result = getattr(ufunc, method)(*inputs, **kwargs)
737-
738-
name = names[0] if len(set(names)) == 1 else None
739-
740-
def construct_return(result):
741-
if lib.is_scalar(result):
742-
return result
743-
elif result.ndim > 1:
744-
# e.g. np.subtract.outer
745-
if method == "outer":
746-
# GH#27198
747-
raise NotImplementedError
748-
return result
749-
return self._constructor(result, index=index, name=name, copy=False)
750-
751-
if type(result) is tuple:
752-
# multiple return values
753-
return tuple(construct_return(x) for x in result)
754-
elif method == "at":
755-
# no return value
756-
return None
757-
else:
758-
return construct_return(result)
759-
760685
def __array__(self, dtype=None) -> np.ndarray:
761686
"""
762687
Return the values as a NumPy array.

0 commit comments

Comments
 (0)