Skip to content

Commit a119020

Browse files
authored
Merge branch 'main' into refactor-io-sql-execute
2 parents 71e0d24 + 02de814 commit a119020

20 files changed

+161
-66
lines changed

.pre-commit-config.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,11 @@ repos:
106106
hooks:
107107
- id: meson-fmt
108108
args: ['--inplace']
109+
- repo: https://github.com/shellcheck-py/shellcheck-py
110+
rev: v0.10.0.1
111+
hooks:
112+
- id: shellcheck
113+
args: ["--severity=warning"]
109114
- repo: local
110115
hooks:
111116
- id: pyright

ci/code_checks.sh

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,15 @@ else
2424
fi
2525

2626
[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \
27-
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; }
27+
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 1; }
2828

29-
BASE_DIR="$(dirname $0)/.."
29+
BASE_DIR="$(dirname "$0")/.."
3030
RET=0
3131

3232
### CODE ###
3333
if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then
3434

35-
MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo $MSG
35+
MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo "$MSG"
3636
python -W error -c "
3737
import sys
3838
import pandas
@@ -49,24 +49,24 @@ if mods:
4949
sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods)))
5050
sys.exit(len(mods))
5151
"
52-
RET=$(($RET + $?)) ; echo $MSG "DONE"
52+
RET=$(($RET + $?)) ; echo "$MSG" "DONE"
5353

5454
fi
5555

5656
### DOCTESTS ###
5757
if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
5858

59-
MSG='Python and Cython Doctests' ; echo $MSG
59+
MSG='Python and Cython Doctests' ; echo "$MSG"
6060
python -c 'import pandas as pd; pd.test(run_doctests=True)'
61-
RET=$(($RET + $?)) ; echo $MSG "DONE"
61+
RET=$(($RET + $?)) ; echo "$MSG" "DONE"
6262

6363
fi
6464

6565
### DOCSTRINGS ###
6666
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
6767

68-
MSG='Validate Docstrings' ; echo $MSG
69-
$BASE_DIR/scripts/validate_docstrings.py \
68+
MSG='Validate Docstrings' ; echo "$MSG"
69+
"$BASE_DIR"/scripts/validate_docstrings.py \
7070
--format=actions \
7171
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
7272
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
@@ -265,7 +265,7 @@ fi
265265
if [[ -z "$CHECK" || "$CHECK" == "notebooks" ]]; then
266266

267267
MSG='Notebooks' ; echo $MSG
268-
jupyter nbconvert --execute $(find doc/source -name '*.ipynb') --to notebook
268+
jupyter nbconvert --execute "$(find doc/source -name '*.ipynb')" --to notebook
269269
RET=$(($RET + $?)) ; echo $MSG "DONE"
270270

271271
fi

ci/run_tests.sh

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,8 @@
33
# Workaround for pytest-xdist (it collects different tests in the workers if PYTHONHASHSEED is not set)
44
# https://github.com/pytest-dev/pytest/issues/920
55
# https://github.com/pytest-dev/pytest/issues/1075
6-
export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
7-
8-
# May help reproduce flaky CI builds if set in subsequent runs
9-
echo PYTHONHASHSEED=$PYTHONHASHSEED
6+
PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
7+
export PYTHONHASHSEED
108

119
COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml"
1210

@@ -16,5 +14,5 @@ if [[ "$PATTERN" ]]; then
1614
PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\""
1715
fi
1816

19-
echo $PYTEST_CMD
17+
echo "$PYTEST_CMD"
2018
sh -c "$PYTEST_CMD"

ci/upload_wheels.sh

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#!/bin/bash
12
# Modified from numpy's https://github.com/numpy/numpy/blob/main/tools/wheels/upload_wheels.sh
23

34
set_upload_vars() {
@@ -19,20 +20,20 @@ set_upload_vars() {
1920
fi
2021
}
2122
upload_wheels() {
22-
echo ${PWD}
23+
echo "${PWD}"
2324
if [[ ${ANACONDA_UPLOAD} == true ]]; then
24-
if [ -z ${TOKEN} ]; then
25+
if [ -z "${TOKEN}" ]; then
2526
echo no token set, not uploading
2627
else
2728
# sdists are located under dist folder when built through setup.py
2829
if compgen -G "./dist/*.gz"; then
2930
echo "Found sdist"
30-
anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./dist/*.gz
31+
anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./dist/*.gz
3132
echo "Uploaded sdist"
3233
fi
3334
if compgen -G "./wheelhouse/*.whl"; then
3435
echo "Found wheel"
35-
anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./wheelhouse/*.whl
36+
anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./wheelhouse/*.whl
3637
echo "Uploaded wheel"
3738
fi
3839
echo "PyPI-style index: https://pypi.anaconda.org/$ANACONDA_ORG/simple"

pandas/_libs/groupby.pyx

Lines changed: 63 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -753,16 +753,20 @@ def group_sum(
753753

754754
if uses_mask:
755755
isna_entry = mask[i, j]
756-
isna_result = result_mask[lab, j]
757756
else:
758757
isna_entry = _treat_as_na(val, is_datetimelike)
759-
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
760758

761-
if not skipna and isna_result:
762-
# If sum is already NA, don't add to it. This is important for
763-
# datetimelike because adding a value to NPY_NAT may not result
764-
# in a NPY_NAT
765-
continue
759+
if not skipna:
760+
if uses_mask:
761+
isna_result = result_mask[lab, j]
762+
else:
763+
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
764+
765+
if isna_result:
766+
# If sum is already NA, don't add to it. This is important for
767+
# datetimelike because adding a value to NPY_NAT may not result
768+
# in a NPY_NAT
769+
continue
766770

767771
if not isna_entry:
768772
nobs[lab, j] += 1
@@ -845,14 +849,18 @@ def group_prod(
845849

846850
if uses_mask:
847851
isna_entry = mask[i, j]
848-
isna_result = result_mask[lab, j]
849852
else:
850853
isna_entry = _treat_as_na(val, False)
851-
isna_result = _treat_as_na(prodx[lab, j], False)
852854

853-
if not skipna and isna_result:
854-
# If prod is already NA, no need to update it
855-
continue
855+
if not skipna:
856+
if uses_mask:
857+
isna_result = result_mask[lab, j]
858+
else:
859+
isna_result = _treat_as_na(prodx[lab, j], False)
860+
861+
if isna_result:
862+
# If prod is already NA, no need to update it
863+
continue
856864

857865
if not isna_entry:
858866
nobs[lab, j] += 1
@@ -919,22 +927,30 @@ def group_var(
919927

920928
if uses_mask:
921929
isna_entry = mask[i, j]
922-
isna_result = result_mask[lab, j]
923930
elif is_datetimelike:
924931
# With group_var, we cannot just use _treat_as_na bc
925932
# datetimelike dtypes get cast to float64 instead of
926933
# to int64.
927934
isna_entry = val == NPY_NAT
928-
isna_result = out[lab, j] == NPY_NAT
929935
else:
930936
isna_entry = _treat_as_na(val, is_datetimelike)
931-
isna_result = _treat_as_na(out[lab, j], is_datetimelike)
932937

933-
if not skipna and isna_result:
934-
# If aggregate is already NA, don't add to it. This is important for
935-
# datetimelike because adding a value to NPY_NAT may not result
936-
# in a NPY_NAT
937-
continue
938+
if not skipna:
939+
if uses_mask:
940+
isna_result = result_mask[lab, j]
941+
elif is_datetimelike:
942+
# With group_var, we cannot just use _treat_as_na bc
943+
# datetimelike dtypes get cast to float64 instead of
944+
# to int64.
945+
isna_result = out[lab, j] == NPY_NAT
946+
else:
947+
isna_result = _treat_as_na(out[lab, j], is_datetimelike)
948+
949+
if isna_result:
950+
# If aggregate is already NA, don't add to it. This is
951+
# important for datetimelike because adding a value to NPY_NAT
952+
# may not result in a NPY_NAT
953+
continue
938954

939955
if not isna_entry:
940956
nobs[lab, j] += 1
@@ -1232,22 +1248,30 @@ def group_mean(
12321248

12331249
if uses_mask:
12341250
isna_entry = mask[i, j]
1235-
isna_result = result_mask[lab, j]
12361251
elif is_datetimelike:
12371252
# With group_mean, we cannot just use _treat_as_na bc
12381253
# datetimelike dtypes get cast to float64 instead of
12391254
# to int64.
12401255
isna_entry = val == NPY_NAT
1241-
isna_result = sumx[lab, j] == NPY_NAT
12421256
else:
12431257
isna_entry = _treat_as_na(val, is_datetimelike)
1244-
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
12451258

1246-
if not skipna and isna_result:
1247-
# If sum is already NA, don't add to it. This is important for
1248-
# datetimelike because adding a value to NPY_NAT may not result
1249-
# in NPY_NAT
1250-
continue
1259+
if not skipna:
1260+
if uses_mask:
1261+
isna_result = result_mask[lab, j]
1262+
elif is_datetimelike:
1263+
# With group_mean, we cannot just use _treat_as_na bc
1264+
# datetimelike dtypes get cast to float64 instead of
1265+
# to int64.
1266+
isna_result = sumx[lab, j] == NPY_NAT
1267+
else:
1268+
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
1269+
1270+
if isna_result:
1271+
# If sum is already NA, don't add to it. This is important for
1272+
# datetimelike because adding a value to NPY_NAT may not result
1273+
# in NPY_NAT
1274+
continue
12511275

12521276
if not isna_entry:
12531277
nobs[lab, j] += 1
@@ -1909,15 +1933,20 @@ cdef group_min_max(
19091933

19101934
if uses_mask:
19111935
isna_entry = mask[i, j]
1912-
isna_result = result_mask[lab, j]
19131936
else:
19141937
isna_entry = _treat_as_na(val, is_datetimelike)
1915-
isna_result = _treat_as_na(group_min_or_max[lab, j],
1916-
is_datetimelike)
19171938

1918-
if not skipna and isna_result:
1919-
# If current min/max is already NA, it will always be NA
1920-
continue
1939+
if not skipna:
1940+
if uses_mask:
1941+
isna_result = result_mask[lab, j]
1942+
else:
1943+
isna_result = _treat_as_na(
1944+
group_min_or_max[lab, j], is_datetimelike
1945+
)
1946+
1947+
if isna_result:
1948+
# If current min/max is already NA, it will always be NA
1949+
continue
19211950

19221951
if not isna_entry:
19231952
nobs[lab, j] += 1

pandas/_libs/lib.pyx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1522,6 +1522,11 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
15221522
"""
15231523
Return a string label of the type of a scalar or list-like of values.
15241524

1525+
This method inspects the elements of the provided input and determines
1526+
classification of its data type. It is particularly useful for
1527+
handling heterogeneous data inputs where explicit dtype conversion may not
1528+
be possible or necessary.
1529+
15251530
Parameters
15261531
----------
15271532
value : scalar, list, ndarray, or pandas type

pandas/core/_numba/kernels/min_max_.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def grouped_min_max(
9898
for i in range(N):
9999
lab = labels[i]
100100
val = values[i]
101-
if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])):
101+
if lab < 0 or (not skipna and nobs[lab] >= 1 and np.isnan(output[lab])):
102102
continue
103103

104104
if values.dtype.kind == "i" or not np.isnan(val):

pandas/core/arrays/sparse/accessor.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,11 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate):
279279
"""
280280
DataFrame accessor for sparse data.
281281
282+
It allows users to interact with a `DataFrame` that contains sparse data types
283+
(`SparseDtype`). It provides methods and attributes to efficiently work with sparse
284+
storage, reducing memory usage while maintaining compatibility with standard pandas
285+
operations.
286+
282287
Parameters
283288
----------
284289
data : scipy.sparse.spmatrix

pandas/core/dtypes/dtypes.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,11 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
161161
"""
162162
Type for categorical data with the categories and orderedness.
163163
164+
It is a dtype representation for categorical data, which allows users to define
165+
a fixed set of values and optionally impose an ordering. This is particularly
166+
useful for handling categorical variables efficiently, as it can significantly
167+
reduce memory usage compared to using object dtypes.
168+
164169
Parameters
165170
----------
166171
categories : sequence, optional

pandas/core/frame.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8029,10 +8029,15 @@ def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> b
80298029
return False
80308030

80318031
if (
8032-
isinstance(self.columns, MultiIndex)
8033-
or isinstance(right.columns, MultiIndex)
8034-
) and not self.columns.equals(right.columns):
8032+
(
8033+
isinstance(self.columns, MultiIndex)
8034+
or isinstance(right.columns, MultiIndex)
8035+
)
8036+
and not self.columns.equals(right.columns)
8037+
and fill_value is None
8038+
):
80358039
# GH#60498 Reindex if MultiIndex columns are not matching
8040+
# GH#60903 Don't reindex if fill_value is provided
80368041
return True
80378042

80388043
if fill_value is None and level is None and axis == 1:

pandas/io/formats/printing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def _pprint_seq(
112112
if isinstance(seq, set):
113113
fmt = "{{{body}}}"
114114
elif isinstance(seq, frozenset):
115-
fmt = "frozenset({body})"
115+
fmt = "frozenset({{{body}}})"
116116
else:
117117
fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
118118

pandas/io/formats/style.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,12 @@ class Styler(StylerRenderer):
117117
r"""
118118
Helps style a DataFrame or Series according to the data with HTML and CSS.
119119
120+
This class provides methods for styling and formatting a Pandas DataFrame or Series.
121+
The styled output can be rendered as HTML or LaTeX, and it supports CSS-based
122+
styling, allowing users to control colors, font styles, and other visual aspects of
123+
tabular data. It is particularly useful for presenting DataFrame objects in a
124+
Jupyter Notebook environment or when exporting styled tables for reports and presentations.
125+
120126
Parameters
121127
----------
122128
data : Series or DataFrame

pandas/io/json/_json.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,12 @@ def read_json(
520520
"""
521521
Convert a JSON string to pandas object.
522522
523+
This method reads JSON files or JSON-like data and converts them into pandas
524+
objects. It supports a variety of input formats, including line-delimited JSON,
525+
compressed files, and various data representations (table, records, index-based,
526+
etc.). When `chunksize` is specified, an iterator is returned instead of loading
527+
the entire data into memory.
528+
523529
Parameters
524530
----------
525531
path_or_buf : a str path, path object or file-like object

pandas/io/pytables.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1131,6 +1131,12 @@ def put(
11311131
"""
11321132
Store object in HDFStore.
11331133
1134+
This method writes a pandas DataFrame or Series into an HDF5 file using
1135+
either the fixed or table format. The `table` format allows additional
1136+
operations like incremental appends and queries but may have performance
1137+
trade-offs. The `fixed` format provides faster read/write operations but
1138+
does not support appends or queries.
1139+
11341140
Parameters
11351141
----------
11361142
key : str

0 commit comments

Comments
 (0)