Skip to content

Commit a119020

Browse files
authored
Merge branch 'main' into refactor-io-sql-execute
2 parents 71e0d24 + 02de814 commit a119020

20 files changed

+161
-66
lines changed

.pre-commit-config.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,11 @@ repos:
106106
hooks:
107107
- id: meson-fmt
108108
args: ['--inplace']
109+
- repo: https://github.com/shellcheck-py/shellcheck-py
110+
rev: v0.10.0.1
111+
hooks:
112+
- id: shellcheck
113+
args: ["--severity=warning"]
109114
- repo: local
110115
hooks:
111116
- id: pyright

ci/code_checks.sh

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,15 @@ else
2424
fi
2525

2626
[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \
27-
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; }
27+
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 1; }
2828

29-
BASE_DIR="$(dirname $0)/.."
29+
BASE_DIR="$(dirname "$0")/.."
3030
RET=0
3131

3232
### CODE ###
3333
if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then
3434

35-
MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo $MSG
35+
MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo "$MSG"
3636
python -W error -c "
3737
import sys
3838
import pandas
@@ -49,24 +49,24 @@ if mods:
4949
sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods)))
5050
sys.exit(len(mods))
5151
"
52-
RET=$(($RET + $?)) ; echo $MSG "DONE"
52+
RET=$(($RET + $?)) ; echo "$MSG" "DONE"
5353

5454
fi
5555

5656
### DOCTESTS ###
5757
if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
5858

59-
MSG='Python and Cython Doctests' ; echo $MSG
59+
MSG='Python and Cython Doctests' ; echo "$MSG"
6060
python -c 'import pandas as pd; pd.test(run_doctests=True)'
61-
RET=$(($RET + $?)) ; echo $MSG "DONE"
61+
RET=$(($RET + $?)) ; echo "$MSG" "DONE"
6262

6363
fi
6464

6565
### DOCSTRINGS ###
6666
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
6767

68-
MSG='Validate Docstrings' ; echo $MSG
69-
$BASE_DIR/scripts/validate_docstrings.py \
68+
MSG='Validate Docstrings' ; echo "$MSG"
69+
"$BASE_DIR"/scripts/validate_docstrings.py \
7070
--format=actions \
7171
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
7272
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
@@ -265,7 +265,7 @@ fi
265265
if [[ -z "$CHECK" || "$CHECK" == "notebooks" ]]; then
266266

267267
MSG='Notebooks' ; echo $MSG
268-
jupyter nbconvert --execute $(find doc/source -name '*.ipynb') --to notebook
268+
jupyter nbconvert --execute "$(find doc/source -name '*.ipynb')" --to notebook
269269
RET=$(($RET + $?)) ; echo $MSG "DONE"
270270

271271
fi

ci/run_tests.sh

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,8 @@
33
# Workaround for pytest-xdist (it collects different tests in the workers if PYTHONHASHSEED is not set)
44
# https://github.com/pytest-dev/pytest/issues/920
55
# https://github.com/pytest-dev/pytest/issues/1075
6-
export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
7-
8-
# May help reproduce flaky CI builds if set in subsequent runs
9-
echo PYTHONHASHSEED=$PYTHONHASHSEED
6+
PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
7+
export PYTHONHASHSEED
108

119
COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml"
1210

@@ -16,5 +14,5 @@ if [[ "$PATTERN" ]]; then
1614
PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\""
1715
fi
1816

19-
echo $PYTEST_CMD
17+
echo "$PYTEST_CMD"
2018
sh -c "$PYTEST_CMD"

ci/upload_wheels.sh

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#!/bin/bash
12
# Modified from numpy's https://github.com/numpy/numpy/blob/main/tools/wheels/upload_wheels.sh
23

34
set_upload_vars() {
@@ -19,20 +20,20 @@ set_upload_vars() {
1920
fi
2021
}
2122
upload_wheels() {
22-
echo ${PWD}
23+
echo "${PWD}"
2324
if [[ ${ANACONDA_UPLOAD} == true ]]; then
24-
if [ -z ${TOKEN} ]; then
25+
if [ -z "${TOKEN}" ]; then
2526
echo no token set, not uploading
2627
else
2728
# sdists are located under dist folder when built through setup.py
2829
if compgen -G "./dist/*.gz"; then
2930
echo "Found sdist"
30-
anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./dist/*.gz
31+
anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./dist/*.gz
3132
echo "Uploaded sdist"
3233
fi
3334
if compgen -G "./wheelhouse/*.whl"; then
3435
echo "Found wheel"
35-
anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./wheelhouse/*.whl
36+
anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./wheelhouse/*.whl
3637
echo "Uploaded wheel"
3738
fi
3839
echo "PyPI-style index: https://pypi.anaconda.org/$ANACONDA_ORG/simple"

pandas/_libs/groupby.pyx

Lines changed: 63 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -753,16 +753,20 @@ def group_sum(
753753

754754
if uses_mask:
755755
isna_entry = mask[i, j]
756-
isna_result = result_mask[lab, j]
757756
else:
758757
isna_entry = _treat_as_na(val, is_datetimelike)
759-
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
760758

761-
if not skipna and isna_result:
762-
# If sum is already NA, don't add to it. This is important for
763-
# datetimelike because adding a value to NPY_NAT may not result
764-
# in a NPY_NAT
765-
continue
759+
if not skipna:
760+
if uses_mask:
761+
isna_result = result_mask[lab, j]
762+
else:
763+
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
764+
765+
if isna_result:
766+
# If sum is already NA, don't add to it. This is important for
767+
# datetimelike because adding a value to NPY_NAT may not result
768+
# in a NPY_NAT
769+
continue
766770

767771
if not isna_entry:
768772
nobs[lab, j] += 1
@@ -845,14 +849,18 @@ def group_prod(
845849

846850
if uses_mask:
847851
isna_entry = mask[i, j]
848-
isna_result = result_mask[lab, j]
849852
else:
850853
isna_entry = _treat_as_na(val, False)
851-
isna_result = _treat_as_na(prodx[lab, j], False)
852854

853-
if not skipna and isna_result:
854-
# If prod is already NA, no need to update it
855-
continue
855+
if not skipna:
856+
if uses_mask:
857+
isna_result = result_mask[lab, j]
858+
else:
859+
isna_result = _treat_as_na(prodx[lab, j], False)
860+
861+
if isna_result:
862+
# If prod is already NA, no need to update it
863+
continue
856864

857865
if not isna_entry:
858866
nobs[lab, j] += 1
@@ -919,22 +927,30 @@ def group_var(
919927

920928
if uses_mask:
921929
isna_entry = mask[i, j]
922-
isna_result = result_mask[lab, j]
923930
elif is_datetimelike:
924931
# With group_var, we cannot just use _treat_as_na bc
925932
# datetimelike dtypes get cast to float64 instead of
926933
# to int64.
927934
isna_entry = val == NPY_NAT
928-
isna_result = out[lab, j] == NPY_NAT
929935
else:
930936
isna_entry = _treat_as_na(val, is_datetimelike)
931-
isna_result = _treat_as_na(out[lab, j], is_datetimelike)
932937

933-
if not skipna and isna_result:
934-
# If aggregate is already NA, don't add to it. This is important for
935-
# datetimelike because adding a value to NPY_NAT may not result
936-
# in a NPY_NAT
937-
continue
938+
if not skipna:
939+
if uses_mask:
940+
isna_result = result_mask[lab, j]
941+
elif is_datetimelike:
942+
# With group_var, we cannot just use _treat_as_na bc
943+
# datetimelike dtypes get cast to float64 instead of
944+
# to int64.
945+
isna_result = out[lab, j] == NPY_NAT
946+
else:
947+
isna_result = _treat_as_na(out[lab, j], is_datetimelike)
948+
949+
if isna_result:
950+
# If aggregate is already NA, don't add to it. This is
951+
# important for datetimelike because adding a value to NPY_NAT
952+
# may not result in a NPY_NAT
953+
continue
938954

939955
if not isna_entry:
940956
nobs[lab, j] += 1
@@ -1232,22 +1248,30 @@ def group_mean(
12321248

12331249
if uses_mask:
12341250
isna_entry = mask[i, j]
1235-
isna_result = result_mask[lab, j]
12361251
elif is_datetimelike:
12371252
# With group_mean, we cannot just use _treat_as_na bc
12381253
# datetimelike dtypes get cast to float64 instead of
12391254
# to int64.
12401255
isna_entry = val == NPY_NAT
1241-
isna_result = sumx[lab, j] == NPY_NAT
12421256
else:
12431257
isna_entry = _treat_as_na(val, is_datetimelike)
1244-
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
12451258

1246-
if not skipna and isna_result:
1247-
# If sum is already NA, don't add to it. This is important for
1248-
# datetimelike because adding a value to NPY_NAT may not result
1249-
# in NPY_NAT
1250-
continue
1259+
if not skipna:
1260+
if uses_mask:
1261+
isna_result = result_mask[lab, j]
1262+
elif is_datetimelike:
1263+
# With group_mean, we cannot just use _treat_as_na bc
1264+
# datetimelike dtypes get cast to float64 instead of
1265+
# to int64.
1266+
isna_result = sumx[lab, j] == NPY_NAT
1267+
else:
1268+
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
1269+
1270+
if isna_result:
1271+
# If sum is already NA, don't add to it. This is important for
1272+
# datetimelike because adding a value to NPY_NAT may not result
1273+
# in NPY_NAT
1274+
continue
12511275

12521276
if not isna_entry:
12531277
nobs[lab, j] += 1
@@ -1909,15 +1933,20 @@ cdef group_min_max(
19091933

19101934
if uses_mask:
19111935
isna_entry = mask[i, j]
1912-
isna_result = result_mask[lab, j]
19131936
else:
19141937
isna_entry = _treat_as_na(val, is_datetimelike)
1915-
isna_result = _treat_as_na(group_min_or_max[lab, j],
1916-
is_datetimelike)
19171938

1918-
if not skipna and isna_result:
1919-
# If current min/max is already NA, it will always be NA
1920-
continue
1939+
if not skipna:
1940+
if uses_mask:
1941+
isna_result = result_mask[lab, j]
1942+
else:
1943+
isna_result = _treat_as_na(
1944+
group_min_or_max[lab, j], is_datetimelike
1945+
)
1946+
1947+
if isna_result:
1948+
# If current min/max is already NA, it will always be NA
1949+
continue
19211950

19221951
if not isna_entry:
19231952
nobs[lab, j] += 1

pandas/_libs/lib.pyx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1522,6 +1522,11 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
15221522
"""
15231523
Return a string label of the type of a scalar or list-like of values.
15241524

1525+
This method inspects the elements of the provided input and determines
1526+
classification of its data type. It is particularly useful for
1527+
handling heterogeneous data inputs where explicit dtype conversion may not
1528+
be possible or necessary.
1529+
15251530
Parameters
15261531
----------
15271532
value : scalar, list, ndarray, or pandas type

pandas/core/_numba/kernels/min_max_.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def grouped_min_max(
9898
for i in range(N):
9999
lab = labels[i]
100100
val = values[i]
101-
if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])):
101+
if lab < 0 or (not skipna and nobs[lab] >= 1 and np.isnan(output[lab])):
102102
continue
103103

104104
if values.dtype.kind == "i" or not np.isnan(val):

pandas/core/arrays/sparse/accessor.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,11 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate):
279279
"""
280280
DataFrame accessor for sparse data.
281281
282+
It allows users to interact with a `DataFrame` that contains sparse data types
283+
(`SparseDtype`). It provides methods and attributes to efficiently work with sparse
284+
storage, reducing memory usage while maintaining compatibility with standard pandas
285+
operations.
286+
282287
Parameters
283288
----------
284289
data : scipy.sparse.spmatrix

pandas/core/dtypes/dtypes.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,11 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
161161
"""
162162
Type for categorical data with the categories and orderedness.
163163
164+
It is a dtype representation for categorical data, which allows users to define
165+
a fixed set of values and optionally impose an ordering. This is particularly
166+
useful for handling categorical variables efficiently, as it can significantly
167+
reduce memory usage compared to using object dtypes.
168+
164169
Parameters
165170
----------
166171
categories : sequence, optional

pandas/core/frame.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8029,10 +8029,15 @@ def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> b
80298029
return False
80308030

80318031
if (
8032-
isinstance(self.columns, MultiIndex)
8033-
or isinstance(right.columns, MultiIndex)
8034-
) and not self.columns.equals(right.columns):
8032+
(
8033+
isinstance(self.columns, MultiIndex)
8034+
or isinstance(right.columns, MultiIndex)
8035+
)
8036+
and not self.columns.equals(right.columns)
8037+
and fill_value is None
8038+
):
80358039
# GH#60498 Reindex if MultiIndex columns are not matching
8040+
# GH#60903 Don't reindex if fill_value is provided
80368041
return True
80378042

80388043
if fill_value is None and level is None and axis == 1:

pandas/io/formats/printing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def _pprint_seq(
112112
if isinstance(seq, set):
113113
fmt = "{{{body}}}"
114114
elif isinstance(seq, frozenset):
115-
fmt = "frozenset({body})"
115+
fmt = "frozenset({{{body}}})"
116116
else:
117117
fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
118118

pandas/io/formats/style.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,12 @@ class Styler(StylerRenderer):
117117
r"""
118118
Helps style a DataFrame or Series according to the data with HTML and CSS.
119119
120+
This class provides methods for styling and formatting a Pandas DataFrame or Series.
121+
The styled output can be rendered as HTML or LaTeX, and it supports CSS-based
122+
styling, allowing users to control colors, font styles, and other visual aspects of
123+
tabular data. It is particularly useful for presenting DataFrame objects in a
124+
Jupyter Notebook environment or when exporting styled tables for reports and presentations.
125+
120126
Parameters
121127
----------
122128
data : Series or DataFrame

pandas/io/json/_json.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,12 @@ def read_json(
520520
"""
521521
Convert a JSON string to pandas object.
522522
523+
This method reads JSON files or JSON-like data and converts them into pandas
524+
objects. It supports a variety of input formats, including line-delimited JSON,
525+
compressed files, and various data representations (table, records, index-based,
526+
etc.). When `chunksize` is specified, an iterator is returned instead of loading
527+
the entire data into memory.
528+
523529
Parameters
524530
----------
525531
path_or_buf : a str path, path object or file-like object

pandas/io/pytables.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1131,6 +1131,12 @@ def put(
11311131
"""
11321132
Store object in HDFStore.
11331133
1134+
This method writes a pandas DataFrame or Series into an HDF5 file using
1135+
either the fixed or table format. The `table` format allows additional
1136+
operations like incremental appends and queries but may have performance
1137+
trade-offs. The `fixed` format provides faster read/write operations but
1138+
does not support appends or queries.
1139+
11341140
Parameters
11351141
----------
11361142
key : str

0 commit comments

Comments
 (0)