Skip to content

Commit def5ed6

Browse files
committed
Merge branch 'master' into feature/series-info
2 parents f7cb4f8 + 4aa0783 commit def5ed6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+890
-816
lines changed

.pre-commit-config.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,36 @@ repos:
119119
entry: python scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module"
120120
types: [python]
121121
exclude: ^(asv_bench|pandas/tests|doc)/
122+
- id: FrameOrSeriesUnion
123+
name: Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias
124+
entry: Union\[.*(Series.*DataFrame|DataFrame.*Series).*\]
125+
language: pygrep
126+
types: [python]
127+
exclude: ^pandas/_typing\.py$
128+
- id: type-not-class
129+
name: Check for use of foo.__class__ instead of type(foo)
130+
entry: \.__class__
131+
language: pygrep
132+
files: \.(py|pyx)$
133+
- id: unwanted-typing
134+
name: Check for use of comment-based annotation syntax and missing error codes
135+
entry: |
136+
(?x)
137+
\#\ type:\ (?!ignore)|
138+
\#\ type:\s?ignore(?!\[)
139+
language: pygrep
140+
types: [python]
141+
- id: no-os-remove
142+
name: Check code for instances of os.remove
143+
entry: os\.remove
144+
language: pygrep
145+
types: [python]
146+
files: ^pandas/tests/
147+
exclude: |
148+
(?x)^
149+
pandas/tests/io/excel/test_writers\.py|
150+
pandas/tests/io/pytables/common\.py|
151+
pandas/tests/io/pytables/test_store\.py$
122152
- repo: https://github.com/asottile/yesqa
123153
rev: v1.2.2
124154
hooks:

ci/code_checks.sh

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -122,29 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
122122
RET=$(($RET + $?)) ; echo $MSG "DONE"
123123

124124
# -------------------------------------------------------------------------
125-
# Type annotations
126-
127-
MSG='Check for use of comment-based annotation syntax' ; echo $MSG
128-
invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas
129-
RET=$(($RET + $?)) ; echo $MSG "DONE"
130-
131-
MSG='Check for missing error codes with # type: ignore' ; echo $MSG
132-
invgrep -R --include="*.py" -P '# type:\s?ignore(?!\[)' pandas
133-
RET=$(($RET + $?)) ; echo $MSG "DONE"
134-
135-
MSG='Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias' ; echo $MSG
136-
invgrep -R --include="*.py" --exclude=_typing.py -E 'Union\[.*(Series.*DataFrame|DataFrame.*Series).*\]' pandas
137-
RET=$(($RET + $?)) ; echo $MSG "DONE"
138-
139-
# -------------------------------------------------------------------------
140-
MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG
141-
invgrep -R --include=*.{py,pyx} '\.__class__' pandas
142-
RET=$(($RET + $?)) ; echo $MSG "DONE"
143-
144-
MSG='Check code for instances of os.remove' ; echo $MSG
145-
invgrep -R --include="*.py*" --exclude "common.py" --exclude "test_writers.py" --exclude "test_store.py" -E "os\.remove" pandas/tests/
146-
RET=$(($RET + $?)) ; echo $MSG "DONE"
147-
148125
MSG='Check for inconsistent use of pandas namespace in tests' ; echo $MSG
149126
for class in "Series" "DataFrame" "Index" "MultiIndex" "Timestamp" "Timedelta" "TimedeltaIndex" "DatetimeIndex" "Categorical"; do
150127
check_namespace ${class}

doc/source/user_guide/indexing.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,17 @@ above example, ``s.loc[1:6]`` would raise ``KeyError``.
422422
For the rationale behind this behavior, see
423423
:ref:`Endpoints are inclusive <advanced.endpoints_are_inclusive>`.
424424

425+
.. ipython:: python
426+
427+
s = pd.Series(list('abcdef'), index=[0, 3, 2, 5, 4, 2])
428+
s.loc[3:5]
429+
430+
Also, if the index has duplicate labels *and* either the start or the stop label is duplicated,
431+
an error will be raised. For instance, in the above example, ``s.loc[2:5]`` would raise a ``KeyError``.
432+
433+
For more information about duplicate labels, see
434+
:ref:`Duplicate Labels <duplicates>`.
435+
425436
.. _indexing.integer:
426437

427438
Selection by position

doc/source/whatsnew/v1.1.5.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ Fixed regressions
2323

2424
Bug fixes
2525
~~~~~~~~~
26-
-
26+
- Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`)
2727
-
2828

2929
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.2.0.rst

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ Other enhancements
231231
- :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`)
232232
- :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`)
233233
- :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`)
234+
- :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`)
234235

235236
.. _whatsnew_120.api_breaking.python:
236237

@@ -400,11 +401,13 @@ Datetimelike
400401
- Bug in :meth:`TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`)
401402
- Bug in :meth:`DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37299`)
402403
- Bug in adding a :class:`BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`)
404+
- Bug in :func:`to_datetime` with a read-only array incorrectly raising (:issue:`34857`)
403405

404406
Timedelta
405407
^^^^^^^^^
406408
- Bug in :class:`TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`)
407409
- Bug in parsing of ISO 8601 durations in :class:`Timedelta`, :meth:`pd.to_datetime` (:issue:`37159`, fixes :issue:`29773` and :issue:`36204`)
410+
- Bug in :func:`to_timedelta` with a read-only array incorrectly raising (:issue:`34857`)
408411

409412
Timezones
410413
^^^^^^^^^
@@ -434,7 +437,7 @@ Numeric
434437
Conversion
435438
^^^^^^^^^^
436439

437-
-
440+
- Bug in :meth:`DataFrame.to_dict` with ``orient='records'`` not returning python native datetime objects for datetimelike columns (:issue:`21256`)
438441
-
439442

440443
Strings
@@ -499,6 +502,7 @@ I/O
499502
- Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`)
500503
- Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty :class:`DataFrame` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
501504
- Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
505+
- :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`)
502506
- Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`)
503507

504508
Plotting
@@ -545,6 +549,7 @@ Reshaping
545549
- Bug in :func:`join` returned a non deterministic level-order for the resulting :class:`MultiIndex` (:issue:`36910`)
546550
- Bug in :meth:`DataFrame.combine_first()` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
547551
- Fixed regression in :func:`merge` on merging DatetimeIndex with empty DataFrame (:issue:`36895`)
552+
- Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`)
548553

549554
Sparse
550555
^^^^^^
@@ -567,7 +572,7 @@ Other
567572
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`)
568573
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`)
569574
- Fixed bug in metadata propagation incorrectly copying DataFrame columns as metadata when the column name overlaps with the metadata name (:issue:`37037`)
570-
- Fixed metadata propagation in the :class:`Series.dt` and :class:`Series.str` accessors and :class:`DataFrame.duplicated` and :class:`DataFrame.stack` and :class:`DataFrame.unstack` and :class:`DataFrame.pivot` methods (:issue:`28283`)
575+
- Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`) (:issue:`37381`)
571576
- Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`)
572577
- Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`)
573578
- Bug in ``accessor.DirNamesMixin``, where ``dir(obj)`` wouldn't show attributes defined on the instance (:issue:`37173`).

pandas/_libs/join.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ ctypedef fused join_t:
268268

269269
@cython.wraparound(False)
270270
@cython.boundscheck(False)
271-
def left_join_indexer_unique(join_t[:] left, join_t[:] right):
271+
def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right):
272272
cdef:
273273
Py_ssize_t i, j, nleft, nright
274274
ndarray[int64_t] indexer

pandas/_libs/parsers.pyx

Lines changed: 14 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,18 @@
11
# Copyright (c) 2012, Lambda Foundry, Inc.
22
# See LICENSE for the license
3-
import bz2
43
from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC
54
from errno import ENOENT
6-
import gzip
7-
import io
8-
import os
95
import sys
106
import time
117
import warnings
12-
import zipfile
138

149
from libc.stdlib cimport free
1510
from libc.string cimport strcasecmp, strlen, strncpy
1611

1712
import cython
1813
from cython import Py_ssize_t
1914

20-
from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString
15+
from cpython.bytes cimport PyBytes_AsString
2116
from cpython.exc cimport PyErr_Fetch, PyErr_Occurred
2217
from cpython.object cimport PyObject
2318
from cpython.ref cimport Py_XDECREF
@@ -67,7 +62,6 @@ from pandas._libs.khash cimport (
6762
khiter_t,
6863
)
6964

70-
from pandas.compat import get_lzma_file, import_lzma
7165
from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning
7266

7367
from pandas.core.dtypes.common import (
@@ -82,11 +76,10 @@ from pandas.core.dtypes.common import (
8276
)
8377
from pandas.core.dtypes.concat import union_categoricals
8478

85-
lzma = import_lzma()
86-
8779
cdef:
8880
float64_t INF = <float64_t>np.inf
8981
float64_t NEGINF = -INF
82+
int64_t DEFAULT_CHUNKSIZE = 256 * 1024
9083

9184

9285
cdef extern from "headers/portable.h":
@@ -275,14 +268,15 @@ cdef extern from "parser/io.h":
275268
size_t *bytes_read, int *status)
276269

277270

278-
DEFAULT_CHUNKSIZE = 256 * 1024
279-
280-
281271
cdef class TextReader:
282272
"""
283273
284274
# source: StringIO or file object
285275
276+
.. versionchanged:: 1.2.0
277+
Removed the 'compression', 'memory_map', and 'encoding' arguments.
278+
These arguments are outsourced to CParserWrapper.
279+
'source' has to be a file handle.
286280
"""
287281

288282
cdef:
@@ -299,16 +293,14 @@ cdef class TextReader:
299293

300294
cdef public:
301295
int64_t leading_cols, table_width, skipfooter, buffer_lines
302-
bint allow_leading_cols, mangle_dupe_cols, memory_map, low_memory
296+
bint allow_leading_cols, mangle_dupe_cols, low_memory
303297
bint delim_whitespace
304298
object delimiter, converters
305299
object na_values
306300
object header, orig_header, names, header_start, header_end
307301
object index_col
308302
object skiprows
309303
object dtype
310-
object encoding
311-
object compression
312304
object usecols
313305
list dtype_cast_order
314306
set unnamed_cols
@@ -321,18 +313,15 @@ cdef class TextReader:
321313
header_end=0,
322314
index_col=None,
323315
names=None,
324-
bint memory_map=False,
325316
tokenize_chunksize=DEFAULT_CHUNKSIZE,
326317
bint delim_whitespace=False,
327-
compression=None,
328318
converters=None,
329319
bint skipinitialspace=False,
330320
escapechar=None,
331321
bint doublequote=True,
332322
quotechar=b'"',
333323
quoting=0,
334324
lineterminator=None,
335-
encoding=None,
336325
comment=None,
337326
decimal=b'.',
338327
thousands=None,
@@ -356,15 +345,7 @@ cdef class TextReader:
356345
bint skip_blank_lines=True):
357346

358347
# set encoding for native Python and C library
359-
if encoding is not None:
360-
if not isinstance(encoding, bytes):
361-
encoding = encoding.encode('utf-8')
362-
encoding = encoding.lower()
363-
self.c_encoding = <char*>encoding
364-
else:
365-
self.c_encoding = NULL
366-
367-
self.encoding = encoding
348+
self.c_encoding = NULL
368349

369350
self.parser = parser_new()
370351
self.parser.chunksize = tokenize_chunksize
@@ -374,9 +355,6 @@ cdef class TextReader:
374355
# For timekeeping
375356
self.clocks = []
376357

377-
self.compression = compression
378-
self.memory_map = memory_map
379-
380358
self.parser.usecols = (usecols is not None)
381359

382360
self._setup_parser_source(source)
@@ -562,11 +540,6 @@ cdef class TextReader:
562540
parser_del(self.parser)
563541

564542
def close(self):
565-
# we need to properly close an open derived
566-
# filehandle here, e.g. and UTFRecoder
567-
if self.handle is not None:
568-
self.handle.close()
569-
570543
# also preemptively free all allocated memory
571544
parser_free(self.parser)
572545
if self.true_set:
@@ -614,82 +587,15 @@ cdef class TextReader:
614587
cdef:
615588
void *ptr
616589

617-
self.parser.cb_io = NULL
618-
self.parser.cb_cleanup = NULL
619-
620-
if self.compression:
621-
if self.compression == 'gzip':
622-
if isinstance(source, str):
623-
source = gzip.GzipFile(source, 'rb')
624-
else:
625-
source = gzip.GzipFile(fileobj=source)
626-
elif self.compression == 'bz2':
627-
source = bz2.BZ2File(source, 'rb')
628-
elif self.compression == 'zip':
629-
zip_file = zipfile.ZipFile(source)
630-
zip_names = zip_file.namelist()
631-
632-
if len(zip_names) == 1:
633-
file_name = zip_names.pop()
634-
source = zip_file.open(file_name)
635-
636-
elif len(zip_names) == 0:
637-
raise ValueError(f'Zero files found in compressed '
638-
f'zip file {source}')
639-
else:
640-
raise ValueError(f'Multiple files found in compressed '
641-
f'zip file {zip_names}')
642-
elif self.compression == 'xz':
643-
if isinstance(source, str):
644-
source = get_lzma_file(lzma)(source, 'rb')
645-
else:
646-
source = get_lzma_file(lzma)(filename=source)
647-
else:
648-
raise ValueError(f'Unrecognized compression type: '
649-
f'{self.compression}')
650-
651-
if (self.encoding and hasattr(source, "read") and
652-
not hasattr(source, "encoding")):
653-
source = io.TextIOWrapper(
654-
source, self.encoding.decode('utf-8'), newline='')
655-
656-
self.encoding = b'utf-8'
657-
self.c_encoding = <char*>self.encoding
658-
659-
self.handle = source
660-
661-
if isinstance(source, str):
662-
encoding = sys.getfilesystemencoding() or "utf-8"
663-
usource = source
664-
source = source.encode(encoding)
665-
666-
if self.memory_map:
667-
ptr = new_mmap(source)
668-
if ptr == NULL:
669-
# fall back
670-
ptr = new_file_source(source, self.parser.chunksize)
671-
self.parser.cb_io = &buffer_file_bytes
672-
self.parser.cb_cleanup = &del_file_source
673-
else:
674-
self.parser.cb_io = &buffer_mmap_bytes
675-
self.parser.cb_cleanup = &del_mmap
676-
else:
677-
ptr = new_file_source(source, self.parser.chunksize)
678-
self.parser.cb_io = &buffer_file_bytes
679-
self.parser.cb_cleanup = &del_file_source
680-
self.parser.source = ptr
681-
682-
elif hasattr(source, 'read'):
683-
# e.g., StringIO
684-
685-
ptr = new_rd_source(source)
686-
self.parser.source = ptr
687-
self.parser.cb_io = &buffer_rd_bytes
688-
self.parser.cb_cleanup = &del_rd_source
689-
else:
590+
if not hasattr(source, "read"):
690591
raise IOError(f'Expected file path name or file-like object, '
691592
f'got {type(source)} type')
692593

594+
ptr = new_rd_source(source)
595+
self.parser.source = ptr
596+
self.parser.cb_io = &buffer_rd_bytes
597+
self.parser.cb_cleanup = &del_rd_source
598+
693599
cdef _get_header(self):
694600
# header is now a list of lists, so field_count should use header[0]
695601

0 commit comments

Comments
 (0)