Skip to content

Commit 00bff1d

Browse files
authored
Merge pull request #240 from pandas-dev/master
Sync Fork from Upstream Repo
2 parents 9e845dc + 9731fd0 commit 00bff1d

File tree

13 files changed

+78
-67
lines changed

13 files changed

+78
-67
lines changed

asv_bench/benchmarks/reshape.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def setup(self, dtype):
111111
values = np.take(list(string.ascii_letters), indices)
112112
values = [pd.Categorical(v) for v in values.T]
113113

114-
self.df = DataFrame(values, index, columns)
114+
self.df = DataFrame({i: cat for i, cat in enumerate(values)}, index, columns)
115115
self.df2 = self.df.iloc[:-1]
116116

117117
def time_full_product(self, dtype):

doc/source/whatsnew/v1.3.2.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ including other versions of pandas.
1414

1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
17+
- Performance regression in :meth:`DataFrame.isin` and :meth:`Series.isin` for nullable data types (:issue:`42714`)
1718
-
1819
-
1920

doc/source/whatsnew/v1.4.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ Indexing
225225
- Bug in :meth:`Series.loc` with a :class:`MultiIndex` whose first level contains only ``np.nan`` values (:issue:`42055`)
226226
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`)
227227
- Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
228-
-
228+
- Bug in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
229229

230230
Missing
231231
^^^^^^^

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ dependencies:
108108
- fsspec>=0.7.4, <2021.6.0 # for generic remote file operations
109109
- gcsfs>=0.6.0 # file IO when using 'gcs://...' path
110110
- sqlalchemy # pandas.read_sql, DataFrame.to_sql
111-
- xarray # DataFrame.to_xarray
111+
- xarray<0.19 # DataFrame.to_xarray
112112
- cftime # Needed for downstream xarray.CFTimeIndex test
113113
- pyreadstat # pandas.read_spss
114114
- tabulate>=0.8.3 # DataFrame.to_markdown

pandas/core/arrays/masked.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -417,7 +417,7 @@ def isin(self, values) -> BooleanArray: # type: ignore[override]
417417
# see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
418418
result[self._mask] = values_have_NA
419419

420-
mask = np.zeros_like(self, dtype=bool)
420+
mask = np.zeros(self._data.shape, dtype=bool)
421421
return BooleanArray(result, mask, copy=False)
422422

423423
def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3745,7 +3745,7 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None:
37453745
# try to set first as we want an invalid
37463746
# value exception to occur first
37473747
if len(self):
3748-
self._check_setitem_copy(stacklevel=5)
3748+
self._check_setitem_copy()
37493749

37503750
def _iset_item(self, loc: int, value) -> None:
37513751
arraylike = self._sanitize_column(value)

pandas/core/generic.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
doc,
6868
rewrite_axis_style_signature,
6969
)
70+
from pandas.util._exceptions import find_stack_level
7071
from pandas.util._validators import (
7172
validate_ascending,
7273
validate_bool_kwarg,
@@ -3506,7 +3507,7 @@ def _maybe_update_cacher(
35063507
"""
35073508

35083509
if verify_is_copy:
3509-
self._check_setitem_copy(stacklevel=5, t="referent")
3510+
self._check_setitem_copy(t="referent")
35103511

35113512
if clear:
35123513
self._clear_item_cache()
@@ -3853,26 +3854,21 @@ def _check_is_chained_assignment_possible(self) -> bool_t:
38533854
setting.
38543855
"""
38553856
if self._is_copy:
3856-
self._check_setitem_copy(stacklevel=4, t="referent")
3857+
self._check_setitem_copy(t="referent")
38573858
return False
38583859

38593860
@final
3860-
def _check_setitem_copy(self, stacklevel=4, t="setting", force=False):
3861+
def _check_setitem_copy(self, t="setting", force=False):
38613862
"""
38623863
38633864
Parameters
38643865
----------
3865-
stacklevel : int, default 4
3866-
the level to show of the stack when the error is output
38673866
t : str, the type of setting error
38683867
force : bool, default False
38693868
If True, then force showing an error.
38703869
38713870
validate if we are doing a setitem on a chained copy.
38723871
3873-
If you call this function, be sure to set the stacklevel such that the
3874-
user will see the error *at the level of setting*
3875-
38763872
It is technically possible to figure out that we are setting on
38773873
a copy even WITH a multi-dtyped pandas object. In other words, some
38783874
blocks may be views while other are not. Currently _is_view will ALWAYS
@@ -3931,7 +3927,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False):
39313927
if value == "raise":
39323928
raise com.SettingWithCopyError(t)
39333929
elif value == "warn":
3934-
warnings.warn(t, com.SettingWithCopyWarning, stacklevel=stacklevel)
3930+
warnings.warn(t, com.SettingWithCopyWarning, stacklevel=find_stack_level())
39353931

39363932
def __delitem__(self, key) -> None:
39373933
"""

pandas/core/internals/blocks.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1255,7 +1255,7 @@ def where(self, other, cond, errors="raise") -> list[Block]:
12551255

12561256
return result_blocks
12571257

1258-
def _unstack(self, unstacker, fill_value, new_placement):
1258+
def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool):
12591259
"""
12601260
Return a list of unstacked blocks of self
12611261
@@ -1264,6 +1264,7 @@ def _unstack(self, unstacker, fill_value, new_placement):
12641264
unstacker : reshape._Unstacker
12651265
fill_value : int
12661266
Only used in ExtensionBlock._unstack
1267+
allow_fill : bool
12671268
12681269
Returns
12691270
-------
@@ -1638,7 +1639,7 @@ def where(self, other, cond, errors="raise") -> list[Block]:
16381639

16391640
return [self.make_block_same_class(result)]
16401641

1641-
def _unstack(self, unstacker, fill_value, new_placement):
1642+
def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool):
16421643
# ExtensionArray-safe unstack.
16431644
# We override ObjectBlock._unstack, which unstacks directly on the
16441645
# values of the array. For EA-backed blocks, this would require
@@ -1655,7 +1656,7 @@ def _unstack(self, unstacker, fill_value, new_placement):
16551656
blocks = [
16561657
# TODO: could cast to object depending on fill_value?
16571658
self.make_block_same_class(
1658-
self.values.take(indices, allow_fill=True, fill_value=fill_value),
1659+
self.values.take(indices, allow_fill=allow_fill, fill_value=fill_value),
16591660
BlockPlacement(place),
16601661
)
16611662
for indices, place in zip(new_values.T, new_placement)

pandas/core/internals/managers.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1368,6 +1368,8 @@ def unstack(self, unstacker, fill_value) -> BlockManager:
13681368
new_columns = unstacker.get_new_columns(self.items)
13691369
new_index = unstacker.new_index
13701370

1371+
allow_fill = not unstacker.mask.all()
1372+
13711373
new_blocks: list[Block] = []
13721374
columns_mask: list[np.ndarray] = []
13731375

@@ -1377,7 +1379,10 @@ def unstack(self, unstacker, fill_value) -> BlockManager:
13771379
new_placement = new_columns.get_indexer(new_items)
13781380

13791381
blocks, mask = blk._unstack(
1380-
unstacker, fill_value, new_placement=new_placement
1382+
unstacker,
1383+
fill_value,
1384+
new_placement=new_placement,
1385+
allow_fill=allow_fill,
13811386
)
13821387

13831388
new_blocks.extend(blocks)

pandas/core/series.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1214,7 +1214,7 @@ def _check_is_chained_assignment_possible(self) -> bool:
12141214
if self._is_view and self._is_cached:
12151215
ref = self._get_cacher()
12161216
if ref is not None and ref._is_mixed_type:
1217-
self._check_setitem_copy(stacklevel=4, t="referent", force=True)
1217+
self._check_setitem_copy(t="referent", force=True)
12181218
return True
12191219
return super()._check_is_chained_assignment_possible()
12201220

@@ -1233,14 +1233,15 @@ def _maybe_update_cacher(
12331233
# a copy
12341234
if ref is None:
12351235
del self._cacher
1236+
elif len(self) == len(ref) and self.name in ref.columns:
1237+
# GH#42530 self.name must be in ref.columns
1238+
# to ensure column still in dataframe
1239+
# otherwise, either self or ref has swapped in new arrays
1240+
ref._maybe_cache_changed(cacher[0], self)
12361241
else:
1237-
if len(self) == len(ref):
1238-
# otherwise, either self or ref has swapped in new arrays
1239-
ref._maybe_cache_changed(cacher[0], self)
1240-
else:
1241-
# GH#33675 we have swapped in a new array, so parent
1242-
# reference to self is now invalid
1243-
ref._item_cache.pop(cacher[0], None)
1242+
# GH#33675 we have swapped in a new array, so parent
1243+
# reference to self is now invalid
1244+
ref._item_cache.pop(cacher[0], None)
12441245

12451246
super()._maybe_update_cacher(clear=clear, verify_is_copy=verify_is_copy)
12461247

pandas/tests/series/indexing/test_setitem.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from pandas import (
1010
Categorical,
11+
DataFrame,
1112
DatetimeIndex,
1213
Index,
1314
IntervalIndex,
@@ -945,3 +946,17 @@ def test_setitem_int_as_positional_fallback_deprecation():
945946
with tm.assert_produces_warning(FutureWarning, match=msg):
946947
ser3[4] = 99
947948
tm.assert_series_equal(ser3, expected3)
949+
950+
951+
def test_setitem_with_bool_indexer():
952+
# GH#42530
953+
954+
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
955+
result = df.pop("b")
956+
result[[True, False, False]] = 9
957+
expected = Series(data=[9, 5, 6], name="b")
958+
tm.assert_series_equal(result, expected)
959+
960+
df.loc[[True, False, False], "a"] = 10
961+
expected = DataFrame({"a": [10, 2, 3]})
962+
tm.assert_frame_equal(df, expected)

requirements-dev.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ s3fs>=0.4.0
7272
fsspec>=0.7.4, <2021.6.0
7373
gcsfs>=0.6.0
7474
sqlalchemy
75-
xarray
75+
xarray<0.19
7676
cftime
7777
pyreadstat
7878
tabulate>=0.8.3

scripts/generate_pip_deps_from_conda.py

Lines changed: 32 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@
1313
$ python scripts/generate_pip_deps_from_conda.py --compare
1414
"""
1515
import argparse
16-
import os
16+
import pathlib
1717
import re
1818
import sys
1919

2020
import yaml
2121

2222
EXCLUDE = {"python", "c-compiler", "cxx-compiler"}
23-
RENAME = {"pytables": "tables", "pyqt": "pyqt5", "dask-core": "dask"}
23+
RENAME = {"pytables": "tables", "dask-core": "dask"}
2424

2525

26-
def conda_package_to_pip(package):
26+
def conda_package_to_pip(package: str):
2727
"""
2828
Convert a conda package to its pip equivalent.
2929
@@ -36,17 +36,13 @@ def conda_package_to_pip(package):
3636
package = re.sub("(?<=[^<>])=", "==", package).strip()
3737

3838
for compare in ("<=", ">=", "=="):
39-
if compare not in package:
40-
continue
39+
if compare in package:
40+
pkg, version = package.split(compare)
41+
if pkg in EXCLUDE:
42+
return
4143

42-
pkg, version = package.split(compare)
43-
if pkg in EXCLUDE:
44-
return
45-
46-
if pkg in RENAME:
47-
return "".join((RENAME[pkg], compare, version))
48-
49-
break
44+
if pkg in RENAME:
45+
return "".join((RENAME[pkg], compare, version))
5046

5147
if package in EXCLUDE:
5248
return
@@ -57,16 +53,18 @@ def conda_package_to_pip(package):
5753
return package
5854

5955

60-
def main(conda_fname, pip_fname, compare=False):
56+
def generate_pip_from_conda(
57+
conda_path: pathlib.Path, pip_path: pathlib.Path, compare: bool = False
58+
) -> bool:
6159
"""
6260
Generate the pip dependencies file from the conda file, or compare that
6361
they are synchronized (``compare=True``).
6462
6563
Parameters
6664
----------
67-
conda_fname : str
65+
conda_path : pathlib.Path
6866
Path to the conda file with dependencies (e.g. `environment.yml`).
69-
pip_fname : str
67+
pip_path : pathlib.Path
7068
Path to the pip file with dependencies (e.g. `requirements-dev.txt`).
7169
compare : bool, default False
7270
Whether to generate the pip file (``False``) or to compare if the
@@ -78,8 +76,8 @@ def main(conda_fname, pip_fname, compare=False):
7876
bool
7977
True if the comparison fails, False otherwise
8078
"""
81-
with open(conda_fname) as conda_fd:
82-
deps = yaml.safe_load(conda_fd)["dependencies"]
79+
with conda_path.open() as file:
80+
deps = yaml.safe_load(file)["dependencies"]
8381

8482
pip_deps = []
8583
for dep in deps:
@@ -88,24 +86,23 @@ def main(conda_fname, pip_fname, compare=False):
8886
if conda_dep:
8987
pip_deps.append(conda_dep)
9088
elif isinstance(dep, dict) and len(dep) == 1 and "pip" in dep:
91-
pip_deps += dep["pip"]
89+
pip_deps.extend(dep["pip"])
9290
else:
9391
raise ValueError(f"Unexpected dependency {dep}")
9492

95-
fname = os.path.split(conda_fname)[1]
9693
header = (
97-
f"# This file is auto-generated from {fname}, do not modify.\n"
94+
f"# This file is auto-generated from {conda_path.name}, do not modify.\n"
9895
"# See that file for comments about the need/usage of each dependency.\n\n"
9996
)
10097
pip_content = header + "\n".join(pip_deps) + "\n"
10198

10299
if compare:
103-
with open(pip_fname) as pip_fd:
104-
return pip_content != pip_fd.read()
105-
else:
106-
with open(pip_fname, "w") as pip_fd:
107-
pip_fd.write(pip_content)
108-
return False
100+
with pip_path.open() as file:
101+
return pip_content != file.read()
102+
103+
with pip_path.open("w") as file:
104+
file.write(pip_content)
105+
return False
109106

110107

111108
if __name__ == "__main__":
@@ -117,25 +114,20 @@ def main(conda_fname, pip_fname, compare=False):
117114
action="store_true",
118115
help="compare whether the two files are equivalent",
119116
)
120-
argparser.add_argument(
121-
"--azure", action="store_true", help="show the output in azure-pipelines format"
122-
)
123117
args = argparser.parse_args()
124118

125-
repo_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
126-
res = main(
127-
os.path.join(repo_path, "environment.yml"),
128-
os.path.join(repo_path, "requirements-dev.txt"),
119+
conda_fname = "environment.yml"
120+
pip_fname = "requirements-dev.txt"
121+
repo_path = pathlib.Path(__file__).parent.parent.absolute()
122+
res = generate_pip_from_conda(
123+
pathlib.Path(repo_path, conda_fname),
124+
pathlib.Path(repo_path, pip_fname),
129125
compare=args.compare,
130126
)
131127
if res:
132128
msg = (
133-
f"`requirements-dev.txt` has to be generated with `{sys.argv[0]}` after "
134-
"`environment.yml` is modified.\n"
129+
f"`{pip_fname}` has to be generated with `{__file__}` after "
130+
f"`{conda_fname}` is modified.\n"
135131
)
136-
if args.azure:
137-
msg = (
138-
f"##vso[task.logissue type=error;sourcepath=requirements-dev.txt]{msg}"
139-
)
140132
sys.stderr.write(msg)
141133
sys.exit(res)

0 commit comments

Comments
 (0)