Merge remote-tracking branch 'upstream/main' into read-csv-from-directory

fangchenli · fangchenli · commit a2b65e134905 · 2025-06-02T12:19:07.000-07:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ ci:
     skip: [pyright, mypy]
 repos:
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.8
+    rev: v0.11.12
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -74,7 +74,7 @@ repos:
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v3.19.1
+    rev: v3.20.0
     hooks:
     -   id: pyupgrade
         args: [--py310-plus]
@@ -95,14 +95,14 @@ repos:
     - id: sphinx-lint
       args: ["--enable", "all", "--disable", "line-too-long"]
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v20.1.3
+    rev: v20.1.5
     hooks:
     - id: clang-format
       files: ^pandas/_libs/src|^pandas/_libs/include
       args: [-i]
       types_or: [c, c++]
 -   repo: https://github.com/trim21/pre-commit-mirror-meson
-    rev: v1.8.0
+    rev: v1.8.1
     hooks:
     - id: meson-fmt
       args: ['--inplace']
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
@@ -1461,16 +1461,33 @@ Looking up values by index/column labels
 
 Sometimes you want to extract a set of values given a sequence of row labels
 and column labels, this can be achieved by ``pandas.factorize``  and NumPy indexing.
-For instance:
 
-.. ipython:: python
+For heterogeneous column types, we subset columns to avoid unnecessary NumPy conversions:
+
+.. code-block:: python
+
+   def pd_lookup_het(df, row_labels, col_labels):
+      rows = df.index.get_indexer(row_labels)
+      cols = df.columns.get_indexer(col_labels)
+      sub = df.take(np.unique(cols), axis=1)
+      sub = sub.take(np.unique(rows), axis=0)
+      rows = sub.index.get_indexer(row_labels)
+      values = sub.melt()["value"]
+      cols = sub.columns.get_indexer(col_labels)
+      flat_index = rows + cols * len(sub)
+      result = values[flat_index]
+      return result
+
+For homogeneous column types, it is fastest to select only the needed columns and index the NumPy array directly:
+
+.. code-block:: python
 
-    df = pd.DataFrame({'col': ["A", "A", "B", "B"],
-                       'A': [80, 23, np.nan, 22],
-                       'B': [80, 55, 76, 67]})
-    df
-    idx, cols = pd.factorize(df['col'])
-    df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
+   def pd_lookup_hom(df, row_labels, col_labels):
+       rows = df.index.get_indexer(row_labels)
+       df = df.loc[:, sorted(set(col_labels))]
+       cols = df.columns.get_indexer(col_labels)
+       result = df.to_numpy()[rows, cols]
+       return result
 
 Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method
 which was deprecated in version 1.2.0 and removed in version 2.0.0.
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -891,6 +891,7 @@ Other
 - Bug in :meth:`DataFrame.query` which raised an exception when querying integer column names using backticks. (:issue:`60494`)
 - Bug in :meth:`DataFrame.shift` where passing a ``freq`` on a DataFrame with no columns did not shift the index correctly. (:issue:`60102`)
 - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
+- Bug in :meth:`DataFrame.sort_values` where sorting by a column explicitly named ``None`` raised a ``KeyError`` instead of sorting by the column as expected. (:issue:`61512`)
 - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`)
 - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
 - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1645,11 +1645,7 @@ def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool:
         axis_int = self._get_axis_number(axis)
         other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)
 
-        return (
-            key is not None
-            and is_hashable(key)
-            and any(key in self.axes[ax] for ax in other_axes)
-        )
+        return is_hashable(key) and any(key in self.axes[ax] for ax in other_axes)
 
     @final
     def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool:
diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py
@@ -633,6 +633,15 @@ def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Ax
     """
     Autocorrelation plot for time series.
 
+    This function generates an autocorrelation plot for a given time series,
+    which helps to identify any periodic structure or correlation within the
+    data across various lags. It shows the correlation of a time series with a
+    delayed copy of itself as a function of delay. Autocorrelation plots are useful for
+    checking randomness in a data set. If the data are random, the autocorrelations
+    should be near zero for any and all time-lag separations. If the data are not
+    random, then one or more of the autocorrelations will be significantly
+    non-zero.
+
     Parameters
     ----------
     series : Series
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -178,25 +178,9 @@ def test_error_invalid_values(data, all_arithmetic_operators):
     ops = getattr(s, op)
 
     # invalid scalars
-    msg = "|".join(
-        [
-            r"can only perform ops with numeric values",
-            r"IntegerArray cannot perform the operation mod",
-            r"unsupported operand type",
-            r"can only concatenate str \(not \"int\"\) to str",
-            "not all arguments converted during string",
-            "ufunc '.*' not supported for the input types, and the inputs could not",
-            "ufunc '.*' did not contain a loop with signature matching types",
-            "Addition/subtraction of integers and integer-arrays with Timestamp",
-            "has no kernel",
-            "not implemented",
-            "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.",
-            "not supported for dtype",
-        ]
-    )
-    with pytest.raises(TypeError, match=msg):
+    with tm.external_error_raised(TypeError):
         ops("foo")
-    with pytest.raises(TypeError, match=msg):
+    with tm.external_error_raised(TypeError):
         ops(pd.Timestamp("20180101"))
 
     # invalid array-likes
@@ -214,25 +198,10 @@ def test_error_invalid_values(data, all_arithmetic_operators):
         #  more-correct than np.nan here.
         tm.assert_series_equal(res, expected)
     else:
-        with pytest.raises(TypeError, match=msg):
+        with tm.external_error_raised(TypeError):
             ops(str_ser)
 
-    msg = "|".join(
-        [
-            "can only perform ops with numeric values",
-            "cannot perform .* with this index type: DatetimeArray",
-            "Addition/subtraction of integers and integer-arrays "
-            "with DatetimeArray is no longer supported. *",
-            "unsupported operand type",
-            r"can only concatenate str \(not \"int\"\) to str",
-            "not all arguments converted during string",
-            "cannot subtract DatetimeArray from ndarray",
-            "has no kernel",
-            "not implemented",
-            "not supported for dtype",
-        ]
-    )
-    with pytest.raises(TypeError, match=msg):
+    with tm.external_error_raised(TypeError):
         ops(pd.Series(pd.date_range("20180101", periods=len(s))))
 
 
diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py
@@ -630,6 +630,13 @@ def test_sort_values_no_op_reset_index(self):
         expected = DataFrame({"A": [10, 20], "B": [1, 5]})
         tm.assert_frame_equal(result, expected)
 
+    def test_sort_by_column_named_none(self):
+        # GH#61512
+        df = DataFrame([[3, 1], [2, 2]], columns=[None, "C1"])
+        result = df.sort_values(by=None)
+        expected = DataFrame([[2, 2], [3, 1]], columns=[None, "C1"], index=[1, 0])
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameSortKey:  # test key sorting (issue 27237)
     def test_sort_values_inplace_key(self, sort_by_key):