Commit 3616f29

Merge branch 'master' of https://github.com/pandas-dev/pandas into tslibs-conversion11

2 parents: 4b94f3d + 8dac633

File tree

11 files changed: +84 additions, -28 deletions

ci/install_travis.sh

Lines changed: 2 additions & 2 deletions

@@ -34,9 +34,9 @@ fi

 # install miniconda
 if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-    time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1
+    time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -q -O miniconda.sh || exit 1
 else
-    time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1
+    time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1
 fi
 time bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1

ci/requirements-3.6_NUMPY_DEV.build.sh

Lines changed: 4 additions & 1 deletion

@@ -12,7 +12,10 @@ PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf
 pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy

 # install dateutil from master
-pip install -U git+git://github.com/dateutil/dateutil.git
+
+# TODO(jreback), temp disable dateutil master has changed
+# pip install -U git+git://github.com/dateutil/dateutil.git
+pip install python-dateutil

 # cython via pip
 pip install cython

doc/source/advanced.rst

Lines changed: 8 additions & 8 deletions

@@ -174,14 +174,14 @@ on a deeper level.
 Defined Levels
 ~~~~~~~~~~~~~~

-The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even
+The repr of a ``MultiIndex`` shows all the defined levels of an index, even
 if the they are not actually used. When slicing an index, you may notice this.
 For example:

 .. ipython:: python

-   # original multi-index
-   df.columns
+   # original MultiIndex
+   df.columns

    # sliced
    df[['foo','qux']].columns
@@ -264,7 +264,7 @@ Passing a list of labels or tuples works similar to reindexing:
 Using slicers
 ~~~~~~~~~~~~~

-You can slice a multi-index by providing multiple indexers.
+You can slice a ``MultiIndex`` by providing multiple indexers.

 You can provide any of the selectors as if you are indexing by label, see :ref:`Selection by Label <indexing.label>`,
 including slices, lists of labels, labels, and boolean indexers.
@@ -278,16 +278,16 @@ As usual, **both sides** of the slicers are included as this is label indexing.

 You should specify all axes in the ``.loc`` specifier, meaning the indexer for the **index** and
 for the **columns**. There are some ambiguous cases where the passed indexer could be mis-interpreted
-as indexing *both* axes, rather than into say the MuliIndex for the rows.
+as indexing *both* axes, rather than into say the ``MultiIndex`` for the rows.

 You should do this:

 .. code-block:: python

    df.loc[(slice('A1','A3'),.....), :]

-rather than this:
-
+rather than this:
+
 .. code-block:: python

    df.loc[(slice('A1','A3'),.....)]
@@ -494,7 +494,7 @@ are named.
    s.sort_index(level='L2')

 On higher dimensional objects, you can sort any of the other axes by level if
-they have a MultiIndex:
+they have a ``MultiIndex``:

 .. ipython:: python
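The slicer rule this doc change describes can be sketched with a small runnable example (toy data, not taken from the pandas docs):

```python
import pandas as pd

# Minimal sketch of the slicer rule described above: in .loc, pass an
# indexer for *both* axes, otherwise a tuple of slicers can be misread
# as indexing both axes instead of just the row MultiIndex.
idx = pd.MultiIndex.from_product([['A1', 'A2', 'A3'], ['x', 'y']])
df = pd.DataFrame({'val': range(6)}, index=idx)

# recommended form: explicit column indexer (":" selects all columns)
sub = df.loc[(slice('A1', 'A2'), slice(None)), :]
print(len(sub))  # 4 rows: A1/x, A1/y, A2/x, A2/y
```

As the doc says, both endpoints of the slice are included because this is label indexing.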

doc/source/io.rst

Lines changed: 10 additions & 0 deletions

@@ -4538,6 +4538,16 @@ Read from a parquet file.

    result.dtypes

+Read only certain columns of a parquet file.
+
+.. ipython:: python
+
+   result = pd.read_parquet('example_pa.parquet', engine='pyarrow', columns=['a', 'b'])
+   result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b'])
+
+   result.dtypes
+
+
 .. ipython:: python
    :suppress:
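The new ``columns`` argument restricts which columns are materialized when reading. As a runnable stand-in that needs no parquet engine installed, the same projection idea is shown here with ``read_csv``'s ``usecols``:

```python
import io
import pandas as pd

# Same column-projection idea as read_parquet(columns=...), demonstrated
# with read_csv's usecols so no pyarrow/fastparquet install is required.
buf = io.StringIO("a,b,c\n1,4,7\n2,5,8\n3,6,9\n")
result = pd.read_csv(buf, usecols=['a', 'b'])
print(list(result.columns))  # ['a', 'b'] -- column 'c' was never loaded
```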

doc/source/whatsnew/v0.21.1.txt

Lines changed: 2 additions & 0 deletions

@@ -60,6 +60,7 @@ Bug Fixes
 - Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`)
 - Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`)
 - Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values has floating issue (:issue:`18044`)
+- Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`)

 Conversion
 ^^^^^^^^^^
@@ -82,6 +83,7 @@ I/O
 - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
 - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`)
 - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`)
+- :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`)

 Plotting
 ^^^^^^^^

pandas/compat/__init__.py

Lines changed: 8 additions & 5 deletions

@@ -381,17 +381,20 @@ def raise_with_traceback(exc, traceback=Ellipsis):
 # http://stackoverflow.com/questions/4126348
 # Thanks to @martineau at SO

-from dateutil import parser as _date_parser
 import dateutil
+
+if PY2 and LooseVersion(dateutil.__version__) == '2.0':
+    # dateutil brokenness
+    raise Exception('dateutil 2.0 incompatible with Python 2.x, you must '
+                    'install version 1.5 or 2.1+!')
+
+from dateutil import parser as _date_parser
 if LooseVersion(dateutil.__version__) < '2.0':
+
     @functools.wraps(_date_parser.parse)
     def parse_date(timestr, *args, **kwargs):
         timestr = bytes(timestr)
         return _date_parser.parse(timestr, *args, **kwargs)
-elif PY2 and LooseVersion(dateutil.__version__) == '2.0':
-    # dateutil brokenness
-    raise Exception('dateutil 2.0 incompatible with Python 2.x, you must '
-                    'install version 1.5 or 2.1+!')
 else:
     parse_date = _date_parser.parse
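The reordering above moves the broken-version check ahead of the parser import, so the error is raised up front. A pure-stdlib sketch of that gate (``parse_version`` and ``check_dateutil`` are simplified stand-ins, not pandas code):

```python
# Toy stand-in for the LooseVersion gate above; parse_version only
# handles dotted numeric versions, unlike the real LooseVersion.
def parse_version(v):
    return tuple(int(p) for p in v.split('.'))

def check_dateutil(version, is_py2):
    # raise early for the known-broken combination, as the diff now does
    if is_py2 and parse_version(version) == (2, 0):
        raise Exception('dateutil 2.0 incompatible with Python 2.x, you must '
                        'install version 1.5 or 2.1+!')
    # True means the pre-2.0 bytes() workaround for parse_date is needed
    return parse_version(version) < (2, 0)
```

For example, ``check_dateutil('2.6.1', False)`` returns ``False`` (no workaround needed), while ``('2.0', True)`` raises immediately.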

pandas/core/groupby.py

Lines changed: 2 additions & 1 deletion

@@ -4365,7 +4365,8 @@ def count(self):
         ids, _, ngroups = self.grouper.group_info
         mask = ids != -1

-        val = ((mask & ~isna(blk.get_values())) for blk in data.blocks)
+        val = ((mask & ~isna(np.atleast_2d(blk.get_values())))
+               for blk in data.blocks)
         loc = (blk.mgr_locs for blk in data.blocks)

         counter = partial(count_level_2d, labels=ids, max_bin=ngroups, axis=1)
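Why the fix wraps the block values in ``np.atleast_2d`` can be illustrated with toy arrays (a simplified sketch, not pandas internals):

```python
import numpy as np
import pandas as pd

# Datetime-like blocks can hand back a 1-D array where the counting
# code expects 2-D; np.atleast_2d normalizes the shape so the group
# mask combines with the non-null mask the same way for every block.
vals = pd.to_datetime(['2016-05-01', pd.NaT, '2016-05-03']).values  # 1-D
mask = np.array([True, True, True])  # rows that belong to some group

counts = (mask & ~pd.isna(np.atleast_2d(vals))).sum(axis=1)
print(counts)  # [2] -- the NaT is excluded from the count
```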

pandas/io/parquet.py

Lines changed: 10 additions & 6 deletions

@@ -76,9 +76,9 @@ def write(self, df, path, compression='snappy',
             table, path, compression=compression,
             coerce_timestamps=coerce_timestamps, **kwargs)

-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path).to_pandas()
+        return self.api.parquet.read_table(path, columns=columns).to_pandas()


 class FastParquetImpl(object):
@@ -115,9 +115,9 @@ def write(self, df, path, compression='snappy', **kwargs):
         self.api.write(path, df,
                        compression=compression, **kwargs)

-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.ParquetFile(path).to_pandas()
+        return self.api.ParquetFile(path).to_pandas(columns=columns)


 def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
@@ -178,7 +178,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
     return impl.write(df, path, compression=compression)


-def read_parquet(path, engine='auto', **kwargs):
+def read_parquet(path, engine='auto', columns=None, **kwargs):
     """
     Load a parquet object from the file path, returning a DataFrame.

@@ -188,6 +188,10 @@ def read_parquet(path, engine='auto', **kwargs):
     ----------
     path : string
         File path
+    columns: list, default=None
+        If not None, only these columns will be read from the file.
+
+        .. versionadded 0.21.1
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet reader library to use. If 'auto', then the option
         'io.parquet.engine' is used. If 'auto', then the first
@@ -201,4 +205,4 @@ def read_parquet(path, engine='auto', **kwargs):
     """

     impl = get_engine(engine)
-    return impl.read(path)
+    return impl.read(path, columns=columns)
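The pattern above -- a top-level reader forwarding an optional column projection to whichever engine implementation is selected -- can be sketched without pyarrow or fastparquet installed (``CsvImpl`` and ``read_projected`` are hypothetical names for this sketch):

```python
import io
import pandas as pd

class CsvImpl(object):
    """Hypothetical stand-in engine mirroring the read(path, columns=None)
    shape of PyArrowImpl/FastParquetImpl, backed by CSV so it runs
    without any parquet library."""
    def read(self, path_or_buf, columns=None):
        df = pd.read_csv(path_or_buf)
        return df if columns is None else df[columns]

def read_projected(path_or_buf, impl, columns=None):
    # like read_parquet: forward the projection down to the engine's read()
    return impl.read(path_or_buf, columns=columns)

buf = io.StringIO("string,int\na,1\nb,2\nc,3\n")
result = read_projected(buf, CsvImpl(), columns=['string'])
print(list(result.columns))  # ['string']
```

Pushing the projection into the engine (rather than slicing afterwards) is what lets a real parquet reader skip deserializing the unwanted columns entirely.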

pandas/tests/frame/test_dtypes.py

Lines changed: 8 additions & 1 deletion

@@ -10,6 +10,8 @@
 from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp,
                     compat, concat, option_context)
 from pandas.compat import u
+from pandas import _np_version_under1p14
+
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
 from pandas.tests.frame.common import TestData
 from pandas.util.testing import (assert_series_equal,
@@ -531,7 +533,12 @@ def test_astype_str(self):
             assert_frame_equal(result, expected)

             result = DataFrame([1.12345678901234567890]).astype(tt)
-            expected = DataFrame(['1.12345678901'])
+            if _np_version_under1p14:
+                # < 1.14 truncates
+                expected = DataFrame(['1.12345678901'])
+            else:
+                # >= 1.14 preserves the full repr
+                expected = DataFrame(['1.1234567890123457'])
             assert_frame_equal(result, expected)

     @pytest.mark.parametrize("dtype_class", [dict, Series])
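The branch added above tracks NumPy 1.14's switch to shortest round-trip float formatting, which matches plain Python ``repr`` and is where the new expected string comes from:

```python
# The literal below cannot be stored exactly as a double, so the
# shortest round-trip repr settles on 17 significant digits -- the
# same string NumPy >= 1.14 produces when stringifying the value.
value = 1.12345678901234567890
print(repr(value))  # 1.1234567890123457
```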

pandas/tests/groupby/test_counting.py

Lines changed: 19 additions & 2 deletions

@@ -2,9 +2,11 @@
 from __future__ import print_function

 import numpy as np
+import pytest

-from pandas import (DataFrame, Series, MultiIndex)
-from pandas.util.testing import assert_series_equal
+from pandas import (DataFrame, Series, MultiIndex, Timestamp, Timedelta,
+                    Period)
+from pandas.util.testing import (assert_series_equal, assert_frame_equal)
 from pandas.compat import (range, product as cart_product)


@@ -195,3 +197,18 @@ def test_ngroup_respects_groupby_order(self):
                             g.ngroup())
         assert_series_equal(Series(df['group_index'].values),
                             g.cumcount())
+
+    @pytest.mark.parametrize('datetimelike', [
+        [Timestamp('2016-05-%02d 20:09:25+00:00' % i) for i in range(1, 4)],
+        [Timestamp('2016-05-%02d 20:09:25' % i) for i in range(1, 4)],
+        [Timedelta(x, unit="h") for x in range(1, 4)],
+        [Period(freq="2W", year=2017, month=x) for x in range(1, 4)]])
+    def test_count_with_datetimelike(self, datetimelike):
+        # test for #13393, where DataframeGroupBy.count() fails
+        # when counting a datetimelike column.
+
+        df = DataFrame({'x': ['a', 'a', 'b'], 'y': datetimelike})
+        res = df.groupby('x').count()
+        expected = DataFrame({'y': [2, 1]}, index=['a', 'b'])
+        expected.index.name = "x"
+        assert_frame_equal(expected, res)
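With the ``groupby.py`` fix in place, the behavior the new test pins down can be checked directly; this is a minimal, single-case version of the parametrized cases above:

```python
import pandas as pd

# count() over a datetime column must group correctly instead of
# raising (the GH 13393 regression the test above guards against).
df = pd.DataFrame({'x': ['a', 'a', 'b'],
                   'y': pd.to_datetime(['2016-05-01', '2016-05-02',
                                        '2016-05-03'])})
res = df.groupby('x').count()
print(res['y'].tolist())  # [2, 1]
```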

pandas/tests/io/test_parquet.py

Lines changed: 11 additions & 2 deletions

@@ -192,15 +192,15 @@ def check_round_trip(self, df, engine, expected=None, **kwargs):

         with tm.ensure_clean() as path:
             df.to_parquet(path, engine, **kwargs)
-            result = read_parquet(path, engine)
+            result = read_parquet(path, engine, **kwargs)

             if expected is None:
                 expected = df
             tm.assert_frame_equal(result, expected)

             # repeat
             to_parquet(df, path, engine, **kwargs)
-            result = pd.read_parquet(path, engine)
+            result = pd.read_parquet(path, engine, **kwargs)

             if expected is None:
                 expected = df
@@ -282,6 +282,15 @@ def test_compression(self, engine, compression):
         df = pd.DataFrame({'A': [1, 2, 3]})
         self.check_round_trip(df, engine, compression=compression)

+    def test_read_columns(self, engine):
+        # GH18154
+        df = pd.DataFrame({'string': list('abc'),
+                           'int': list(range(1, 4))})
+
+        expected = pd.DataFrame({'string': list('abc')})
+        self.check_round_trip(df, engine, expected=expected,
+                              compression=None, columns=["string"])
+

 class TestParquetPyArrow(Base):
