
Commit 6c1ab7f

Merge remote-tracking branch 'upstream/master' into issue20452
2 parents: cb19683 + 402ad45

File tree

20 files changed: +534 -228 lines


asv_bench/benchmarks/io/csv.py

Lines changed: 0 additions & 32 deletions
@@ -118,38 +118,6 @@ def time_read_uint64_na_values(self):
                           na_values=self.na_values)
 
 
-class S3(object):
-    # Make sure that we can read part of a file from S3 without
-    # needing to download the entire thing. Use the timeit.default_timer
-    # to measure wall time instead of CPU time -- we want to see
-    # how long it takes to download the data.
-    timer = timeit.default_timer
-    params = ([None, "gzip", "bz2"], ["python", "c"])
-    param_names = ["compression", "engine"]
-
-    def setup(self, compression, engine):
-        if compression == "bz2" and engine == "c" and PY2:
-            # The Python 2 C parser can't read bz2 from open files.
-            raise NotImplementedError
-        try:
-            import s3fs  # noqa
-        except ImportError:
-            # Skip these benchmarks if `s3fs` is not installed.
-            raise NotImplementedError
-
-        ext = ""
-        if compression == "gzip":
-            ext = ".gz"
-        elif compression == "bz2":
-            ext = ".bz2"
-        self.big_fname = "s3://pandas-test/large_random.csv" + ext
-
-    def time_read_csv_10_rows(self, compression, engine):
-        # Read a small number of rows from a huge (100,000 x 50) table.
-        read_csv(self.big_fname, nrows=10, compression=compression,
-                 engine=engine)
-
-
 class ReadCSVThousands(BaseIO):
 
     goal_time = 0.2
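For readers unfamiliar with the asv conventions the deleted class relied on: ``params``/``param_names`` define a cross-product of benchmark cases, asv calls ``setup`` once per case, and raising ``NotImplementedError`` there skips that case. A minimal sketch of the pattern (illustrative names, not part of the pandas suite):

import timeit


class ExampleBenchmark(object):
    # asv runs every combination of these parameter lists and passes
    # one value from each list to setup() and to each time_* method.
    params = ([None, "gzip", "bz2"], ["python", "c"])
    param_names = ["compression", "engine"]

    # Wall-clock timer: appropriate when I/O, not CPU, dominates.
    timer = timeit.default_timer

    def setup(self, compression, engine):
        # Raising NotImplementedError tells asv to skip this combination.
        if compression == "bz2" and engine == "c":
            raise NotImplementedError

    def time_noop(self, compression, engine):
        pass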

doc/source/install.rst

Lines changed: 2 additions & 2 deletions
@@ -212,7 +212,7 @@ Recommended Dependencies
   ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups.
   If installed, must be Version 2.4.6 or higher.
 
-* `bottleneck <http://berkeleyanalytics.com/bottleneck>`__: for accelerating certain types of ``nan``
+* `bottleneck <https://github.com/kwgoodman/bottleneck>`__: for accelerating certain types of ``nan``
   evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. If installed,
   must be Version 1.0.0 or higher.
 
@@ -233,7 +233,7 @@ Optional Dependencies
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
 * `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
-* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/necessary>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ are available for compression support.
+* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 
 * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL

doc/source/whatsnew/v0.23.0.txt

Lines changed: 4 additions & 1 deletion
@@ -326,6 +326,7 @@ Other Enhancements
 - ``Resampler`` objects now have a functioning :attr:`~pandas.core.resample.Resampler.pipe` method.
   Previously, calls to ``pipe`` were diverted to the ``mean`` method (:issue:`17905`).
 - :func:`~pandas.api.types.is_scalar` now returns ``True`` for ``DateOffset`` objects (:issue:`18943`).
+- :func:`DataFrame.pivot` now accepts a list for the ``values=`` kwarg (:issue:`17160`).
 - Added :func:`pandas.api.extensions.register_dataframe_accessor`,
   :func:`pandas.api.extensions.register_series_accessor`, and
   :func:`pandas.api.extensions.register_index_accessor`, accessor for libraries downstream of pandas

@@ -716,6 +717,7 @@ Other API Changes
 - :func:`Series.str.replace` now takes an optional `regex` keyword which, when set to ``False``, uses literal string replacement rather than regex replacement (:issue:`16808`)
 - :func:`DatetimeIndex.strftime` and :func:`PeriodIndex.strftime` now return an ``Index`` instead of a numpy array to be consistent with similar accessors (:issue:`20127`)
 - Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is specified (:issue:`19714`, :issue:`20391`).
+- :func:`DataFrame.to_dict` with ``orient='index'`` no longer casts int columns to float for a DataFrame with only int and float columns (:issue:`18580`)
 
 .. _whatsnew_0230.deprecations:
 
@@ -903,11 +905,12 @@ Timezones
 - :func:`Timestamp.replace` will now handle Daylight Savings transitions gracefully (:issue:`18319`)
 - Bug in tz-aware :class:`DatetimeIndex` where addition/subtraction with a :class:`TimedeltaIndex` or array with ``dtype='timedelta64[ns]'`` was incorrect (:issue:`17558`)
 - Bug in :func:`DatetimeIndex.insert` where inserting ``NaT`` into a timezone-aware index incorrectly raised (:issue:`16357`)
-- Bug in the :class:`DataFrame` constructor, where tz-aware Datetimeindex and a given column name will result in an empty ``DataFrame`` (:issue:`19157`)
+- Bug in :class:`DataFrame` constructor, where tz-aware Datetimeindex and a given column name will result in an empty ``DataFrame`` (:issue:`19157`)
 - Bug in :func:`Timestamp.tz_localize` where localizing a timestamp near the minimum or maximum valid values could overflow and return a timestamp with an incorrect nanosecond value (:issue:`12677`)
 - Bug when iterating over :class:`DatetimeIndex` that was localized with fixed timezone offset that rounded nanosecond precision to microseconds (:issue:`19603`)
 - Bug in :func:`DataFrame.diff` that raised an ``IndexError`` with tz-aware values (:issue:`18578`)
 - Bug in :func:`melt` that converted tz-aware dtypes to tz-naive (:issue:`15785`)
+- Bug in :func:`Dataframe.count` that raised an ``ValueError`` if .dropna() method is invoked for single column timezone-aware values. (:issue:`13407`)
 
 Offsets
 ^^^^^^^

pandas/_libs/properties.pyx

Lines changed: 3 additions & 0 deletions
@@ -37,6 +37,9 @@ cdef class CachedProperty(object):
             PyDict_SetItem(cache, self.name, val)
         return val
 
+    def __set__(self, obj, value):
+        raise AttributeError("Can't set attribute")
+
 
 cache_readonly = CachedProperty
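Defining ``__set__`` turns the descriptor into a data descriptor, so attribute assignment through an instance raises instead of silently shadowing the cached value. A minimal pure-Python sketch of the same idea (the ``ReadOnlyCached`` name and plain-dict cache are illustrative, not pandas' actual Cython implementation):

class ReadOnlyCached(object):
    """Cache a computed attribute and forbid reassignment."""

    def __init__(self, func):
        self.func = func
        self.name = func.__name__

    def __get__(self, obj, typ):
        if obj is None:
            return self
        cache = obj.__dict__.setdefault('_cache', {})
        if self.name not in cache:
            cache[self.name] = self.func(obj)  # compute once, then reuse
        return cache[self.name]

    def __set__(self, obj, value):
        # Same guard the patch adds: a data descriptor's __set__ takes
        # precedence over the instance dict, so this always raises.
        raise AttributeError("Can't set attribute")


class Thing(object):
    @ReadOnlyCached
    def answer(self):
        return 42

t = Thing()
print(t.answer)   # 42 (computed, then cached)
t.answer = 0      # AttributeError: Can't set attribute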

pandas/core/frame.py

Lines changed: 180 additions & 36 deletions
@@ -1102,7 +1102,8 @@ def to_dict(self, orient='dict', into=dict):
                            for k, v in zip(self.columns, np.atleast_1d(row)))
                     for row in self.values]
         elif orient.lower().startswith('i'):
-            return into_c((k, v.to_dict(into)) for k, v in self.iterrows())
+            return into_c((t[0], dict(zip(self.columns, t[1:])))
+                          for t in self.itertuples())
         else:
             raise ValueError("orient '%s' not understood" % orient)
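The switch from ``iterrows`` to ``itertuples`` is what fixes the int-to-float cast noted in the whatsnew entry (GH 18580): ``iterrows`` squeezes each row into a Series with a single common dtype, while ``itertuples`` preserves each column's own type. A small illustration, assuming a pandas build with this change:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]})

# iterrows() coerces each mixed int/float row to float64, so the old
# orient='index' path turned the ints into floats. itertuples() keeps
# 'a' as int and 'b' as float:
print(df.to_dict(orient='index'))
# {0: {'a': 1, 'b': 0.5}, 1: {'a': 2, 'b': 0.75}}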

@@ -2153,7 +2154,7 @@ def _verbose_repr():
             lines.append(_put_str(col, space) + tmpl % (count, dtype))
 
         def _non_verbose_repr():
-            lines.append(self.columns.summary(name='Columns'))
+            lines.append(self.columns._summary(name='Columns'))
 
         def _sizeof_fmt(num, size_qualifier):
             # returns size in human readable format
@@ -5049,11 +5050,14 @@ def pivot(self, index=None, columns=None, values=None):
             existing index.
         columns : string or object
             Column to use to make new frame's columns.
-        values : string or object, optional
-            Column to use for populating new frame's values. If not
+        values : string, object or a list of the previous, optional
+            Column(s) to use for populating new frame's values. If not
             specified, all remaining columns will be used and the result will
             have hierarchically indexed columns.
 
+            .. versionchanged :: 0.23.0
+               Also accept list of column names.
+
         Returns
         -------
         DataFrame

@@ -5082,15 +5086,16 @@ def pivot(self, index=None, columns=None, values=None):
         >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
         ...                            'two'],
         ...                    'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
-        ...                    'baz': [1, 2, 3, 4, 5, 6]})
+        ...                    'baz': [1, 2, 3, 4, 5, 6],
+        ...                    'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
         >>> df
-            foo   bar  baz
-        0   one   A    1
-        1   one   B    2
-        2   one   C    3
-        3   two   A    4
-        4   two   B    5
-        5   two   C    6
+            foo   bar  baz  zoo
+        0   one   A    1    x
+        1   one   B    2    y
+        2   one   C    3    z
+        3   two   A    4    q
+        4   two   B    5    w
+        5   two   C    6    t
 
         >>> df.pivot(index='foo', columns='bar', values='baz')
         bar  A   B   C

@@ -5104,6 +5109,13 @@ def pivot(self, index=None, columns=None, values=None):
         one  1   2   3
         two  4   5   6
 
+        >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
+              baz       zoo
+        bar   A  B  C   A  B  C
+        foo
+        one   1  2  3   x  y  z
+        two   4  5  6   q  w  t
+
         A ValueError is raised if there are any duplicates.
 
         >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
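One subtlety worth noting about the new list form: passing a one-element list for ``values`` is not the same as passing the bare string, since the list form yields hierarchically indexed columns. A quick check, assuming pandas >= 0.23 with this feature:

import pandas as pd

df = pd.DataFrame({'foo': ['one', 'one', 'two', 'two'],
                   'bar': ['A', 'B', 'A', 'B'],
                   'baz': [1, 2, 3, 4]})

# A bare string produces flat columns...
print(df.pivot(index='foo', columns='bar', values='baz').columns)
# Index(['A', 'B'], dtype='object', name='bar')

# ...while a list (even of length 1) produces MultiIndex columns,
# with the value-column name as the outer level.
print(df.pivot(index='foo', columns='bar', values=['baz']).columns)
# MultiIndex of ('baz', 'A') and ('baz', 'B')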
@@ -5238,36 +5250,166 @@ def pivot_table(self, values=None, index=None, columns=None,
 
     def stack(self, level=-1, dropna=True):
         """
-        Pivot a level of the (possibly hierarchical) column labels, returning a
-        DataFrame (or Series in the case of an object with a single level of
-        column labels) having a hierarchical index with a new inner-most level
-        of row labels.
-        The level involved will automatically get sorted.
+        Stack the prescribed level(s) from columns to index.
+
+        Return a reshaped DataFrame or Series having a multi-level
+        index with one or more new inner-most levels compared to the current
+        DataFrame. The new inner-most levels are created by pivoting the
+        columns of the current dataframe:
+
+          - if the columns have a single level, the output is a Series;
+          - if the columns have multiple levels, the new index
+            level(s) is (are) taken from the prescribed level(s) and
+            the output is a DataFrame.
+
+        The new index levels are sorted.
 
         Parameters
         ----------
-        level : int, string, or list of these, default last level
-            Level(s) to stack, can pass level name
-        dropna : boolean, default True
-            Whether to drop rows in the resulting Frame/Series with no valid
-            values
+        level : int, str, list, default -1
+            Level(s) to stack from the column axis onto the index
+            axis, defined as one index or label, or a list of indices
+            or labels.
+        dropna : bool, default True
+            Whether to drop rows in the resulting Frame/Series with
+            missing values. Stacking a column level onto the index
+            axis can create combinations of index and column values
+            that are missing from the original dataframe. See Examples
+            section.
+
+        Returns
+        -------
+        DataFrame or Series
+            Stacked dataframe or series.
+
+        See Also
+        --------
+        DataFrame.unstack : Unstack prescribed level(s) from index axis
+            onto column axis.
+        DataFrame.pivot : Reshape dataframe from long format to wide
+            format.
+        DataFrame.pivot_table : Create a spreadsheet-style pivot table
+            as a DataFrame.
+
+        Notes
+        -----
+        The function is named by analogy with a collection of books
+        being re-organised from being side by side on a horizontal
+        position (the columns of the dataframe) to being stacked
+        vertically on top of each other (in the index of the
+        dataframe).
 
         Examples
-        ----------
-        >>> s
-             a   b
-        one  1.  2.
-        two  3.  4.
+        --------
+        **Single level columns**
+
+        >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
+        ...                                     index=['cat', 'dog'],
+        ...                                     columns=['weight', 'height'])
+
+        Stacking a dataframe with a single level column axis returns a Series:
+
+        >>> df_single_level_cols
+             weight  height
+        cat       0       1
+        dog       2       3
+        >>> df_single_level_cols.stack()
+        cat  weight    0
+             height    1
+        dog  weight    2
+             height    3
+        dtype: int64
 
-        >>> s.stack()
-        one a    1
-            b    2
-        two a    3
-            b    4
+        **Multi level columns: simple case**
+
+        >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
+        ...                                        ('weight', 'pounds')])
+        >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
+        ...                                     index=['cat', 'dog'],
+        ...                                     columns=multicol1)
+
+        Stacking a dataframe with a multi-level column axis:
+
+        >>> df_multi_level_cols1
+             weight
+                 kg  pounds
+        cat       1       2
+        dog       2       4
+        >>> df_multi_level_cols1.stack()
+                    weight
+        cat kg           1
+            pounds       2
+        dog kg           2
+            pounds       4
+
+        **Missing values**
+
+        >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
+        ...                                        ('height', 'm')])
+        >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
+        ...                                     index=['cat', 'dog'],
+        ...                                     columns=multicol2)
+
+        It is common to have missing values when stacking a dataframe
+        with multi-level columns, as the stacked dataframe typically
+        has more values than the original dataframe. Missing values
+        are filled with NaNs:
+
+        >>> df_multi_level_cols2
+            weight height
+                kg      m
+        cat    1.0    2.0
+        dog    3.0    4.0
+        >>> df_multi_level_cols2.stack()
+                height  weight
+        cat kg     NaN     1.0
+            m      2.0     NaN
+        dog kg     NaN     3.0
+            m      4.0     NaN
+
+        **Prescribing the level(s) to be stacked**
+
+        The first parameter controls which level or levels are stacked:
+
+        >>> df_multi_level_cols2.stack(0)
+                     kg    m
+        cat height  NaN  2.0
+            weight  1.0  NaN
+        dog height  NaN  4.0
+            weight  3.0  NaN
+        >>> df_multi_level_cols2.stack([0, 1])
+        cat  height  m     2.0
+             weight  kg    1.0
+        dog  height  m     4.0
+             weight  kg    3.0
+        dtype: float64
 
-        Returns
-        -------
-        stacked : DataFrame or Series
+        **Dropping missing values**
+
+        >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
+        ...                                     index=['cat', 'dog'],
+        ...                                     columns=multicol2)
+
+        Note that rows where all values are missing are dropped by
+        default but this behaviour can be controlled via the dropna
+        keyword parameter:
+
+        >>> df_multi_level_cols3
+            weight height
+                kg      m
+        cat    NaN    1.0
+        dog    2.0    3.0
+        >>> df_multi_level_cols3.stack(dropna=False)
+                height  weight
+        cat kg     NaN     NaN
+            m      1.0     NaN
+        dog kg     NaN     2.0
+            m      3.0     NaN
+        >>> df_multi_level_cols3.stack(dropna=True)
+                height  weight
+        cat m      1.0     NaN
+        dog kg     NaN     2.0
+            m      3.0     NaN
         """
         from pandas.core.reshape.reshape import stack, stack_multiple

@@ -6578,7 +6720,9 @@ def count(self, axis=0, level=None, numeric_only=False):
             # column frames with an extension array
             result = notna(frame).sum(axis=axis)
         else:
-            counts = notna(frame.values).sum(axis=axis)
+            # GH13407
+            series_counts = notna(frame).sum(axis=axis)
+            counts = series_counts.values
             result = Series(counts, index=frame._get_agg_axis(axis))
 
         return result.astype('int64')
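The fix routes the NA check through the DataFrame itself, which dispatches column by column, instead of ``frame.values``, which forces a single consolidated array and choked on tz-aware datetimes. A rough sketch of the behaviour this restores (GH 13407); exact output formatting depends on your pandas version:

import pandas as pd

# A single tz-aware datetime column previously hit a ValueError inside
# count() -- which dropna() calls internally -- because notna() was
# applied to the consolidated .values array rather than per column.
df = pd.DataFrame({'ts': pd.to_datetime(['2018-01-01', None])
                           .tz_localize('US/Eastern')})
print(df.count())   # ts    1
print(df.dropna())  # keeps only the non-NaT row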

pandas/core/groupby.py

Lines changed: 1 addition & 1 deletion
@@ -3860,7 +3860,7 @@ def count(self):
 
         mask = (ids != -1) & ~isna(val)
         ids = _ensure_platform_int(ids)
-        out = np.bincount(ids[mask], minlength=ngroups or None)
+        out = np.bincount(ids[mask], minlength=ngroups or 0)
 
         return Series(out,
                       index=self.grouper.result_index,
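For context: ``np.bincount``'s documented default for ``minlength`` is 0; passing ``None`` was only incidentally accepted and newer NumPy releases deprecate it, so ``ngroups or 0`` is the safe spelling. A quick illustration:

import numpy as np

ids = np.array([0, 0, 2])

# minlength=0 behaves like the default: the result is just long enough
# to cover the largest id present.
print(np.bincount(ids, minlength=0))  # [2 0 1]

# A larger minlength pads with zeros, so every group gets a slot even
# if it received no rows.
print(np.bincount(ids, minlength=5))  # [2 0 1 0 0]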

pandas/core/indexes/base.py

Lines changed: 7 additions & 2 deletions
@@ -99,9 +99,14 @@ def cmp_method(self, other):
             # don't pass MultiIndex
             with np.errstate(all='ignore'):
                 result = ops._comp_method_OBJECT_ARRAY(op, self.values, other)
+
         else:
-            with np.errstate(all='ignore'):
-                result = op(self.values, np.asarray(other))
+
+            # numpy will show a DeprecationWarning on invalid elementwise
+            # comparisons, this will raise in the future
+            with warnings.catch_warnings(record=True):
+                with np.errstate(all='ignore'):
+                    result = op(self.values, np.asarray(other))
 
         # technically we could support bool dtyped Index
         # for now just return the indexing array directly
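``warnings.catch_warnings(record=True)`` captures anything warned inside the block instead of surfacing it, which is how the patch silences NumPy's "elementwise comparison failed" DeprecationWarning. A standalone sketch (whether a warning is actually emitted depends on your NumPy version):

import warnings
import numpy as np

arr = np.array([1, 2, 3])

# On NumPy versions where comparing an int array to an incomparable
# object emits a DeprecationWarning, recording the warnings keeps the
# noise away from the caller; the comparison result is unchanged.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    with np.errstate(all='ignore'):
        result = arr == "not-a-number"

print(result)       # False (or an all-False array, depending on version)
print(len(caught))  # number of warnings swallowed, if any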
