Merge remote-tracking branch 'upstream/master' into 26814-optional-fu

TomAugspurger · TomAugspurger · commit f0b1d972e0ce · 2019-06-12T14:32:55.000-05:00
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
@@ -761,34 +761,6 @@ regularity will result in a ``DatetimeIndex``, although frequency is lost:
 
    ts2[[0, 2, 6]].index
 
-.. _timeseries.iterating-label:
-
-Iterating through groups
-------------------------
-
-With the ``Resampler`` object in hand, iterating through the grouped data is very
-natural and functions similarly to :py:func:`itertools.groupby`:
-
-.. ipython:: python
-
-   small = pd.Series(
-       range(6),
-       index=pd.to_datetime(['2017-01-01T00:00:00',
-                             '2017-01-01T00:30:00',
-                             '2017-01-01T00:31:00',
-                             '2017-01-01T01:00:00',
-                             '2017-01-01T03:00:00',
-                             '2017-01-01T03:05:00'])
-   )
-   resampled = small.resample('H')
-
-   for name, group in resampled:
-       print("Group: ", name)
-       print("-" * 27)
-       print(group, end="\n\n")
-
-See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more.
-
 .. _timeseries.components:
 
 Time/Date Components
@@ -1628,24 +1600,32 @@ labels.
 
    ts.resample('5Min', label='left', loffset='1s').mean()
 
-.. note::
+.. warning::
 
-    The default values for ``label`` and ``closed`` is 'left' for all
+    The default values for ``label`` and ``closed`` are '**left**' for all
     frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W'
     which all have a default of 'right'.
 
+    This might unintentionally lead to looking ahead, where the value for a
+    later time is pulled back to a previous time as in the following example
+    with the :class:`~pandas.tseries.offsets.BusinessDay` frequency:
+
     .. ipython:: python
 
-       rng2 = pd.date_range('1/1/2012', end='3/31/2012', freq='D')
-       ts2 = pd.Series(range(len(rng2)), index=rng2)
+        s = pd.date_range('2000-01-01', '2000-01-05').to_series()
+        s.iloc[2] = pd.NaT
+        s.dt.weekday_name
 
-       # default: label='right', closed='right'
-       ts2.resample('M').max()
+        # default: label='left', closed='left'
+        s.resample('B').last().dt.weekday_name
 
-       # default: label='left', closed='left'
-       ts2.resample('SM').max()
+    Notice how the value for Sunday got pulled back to the previous Friday.
+    To get the behavior where the value for Sunday is pushed to Monday,
+    instead use:
 
-       ts2.resample('SM', label='right', closed='right').max()
+    .. ipython:: python
+
+        s.resample('B', label='right', closed='right').last().dt.weekday_name
 
 The ``axis`` parameter can be set to 0 or 1 and allows you to resample the
 specified axis for a ``DataFrame``.
@@ -1796,6 +1776,34 @@ level of ``MultiIndex``, its name or location can be passed to the
 
    df.resample('M', level='d').sum()
 
+.. _timeseries.iterating-label:
+
+Iterating through groups
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+With the ``Resampler`` object in hand, iterating through the grouped data is very
+natural and functions similarly to :py:func:`itertools.groupby`:
+
+.. ipython:: python
+
+   small = pd.Series(
+       range(6),
+       index=pd.to_datetime(['2017-01-01T00:00:00',
+                             '2017-01-01T00:30:00',
+                             '2017-01-01T00:31:00',
+                             '2017-01-01T01:00:00',
+                             '2017-01-01T03:00:00',
+                             '2017-01-01T03:05:00'])
+   )
+   resampled = small.resample('H')
+
+   for name, group in resampled:
+       print("Group: ", name)
+       print("-" * 27)
+       print(group, end="\n\n")
+
+See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more.
+
 
 .. _timeseries.periods:
 
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -667,6 +667,7 @@ I/O
 - Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`)
 - Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`)
 - :meth:`DataFrame.to_excel` now raises a ``ValueError`` when the caller's dimensions exceed the limitations of Excel (:issue:`26051`)
+- Fixed bug in :func:`pandas.read_csv` where a BOM would result in incorrect parsing when using ``engine='python'`` (:issue:`26545`)
 - :func:`read_excel` now raises a ``ValueError`` when input is of type :class:`pandas.io.excel.ExcelFile` and ``engine`` param is passed since :class:`pandas.io.excel.ExcelFile` has an engine defined (:issue:`26566`)
 - Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`).
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2755,23 +2755,24 @@ def _check_for_bom(self, first_row):
         if first_elt != _BOM:
             return first_row
 
-        first_row = first_row[0]
+        first_row_bom = first_row[0]
 
-        if len(first_row) > 1 and first_row[1] == self.quotechar:
+        if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
             start = 2
-            quote = first_row[1]
-            end = first_row[2:].index(quote) + 2
+            quote = first_row_bom[1]
+            end = first_row_bom[2:].index(quote) + 2
 
             # Extract the data between the quotation marks
-            new_row = first_row[start:end]
+            new_row = first_row_bom[start:end]
 
             # Extract any remaining data after the second
             # quotation mark.
-            if len(first_row) > end + 1:
-                new_row += first_row[end + 1:]
-            return [new_row]
-        elif len(first_row) > 1:
-            return [first_row[1:]]
+            if len(first_row_bom) > end + 1:
+                new_row += first_row_bom[end + 1:]
+            return [new_row] + first_row[1:]
+
+        elif len(first_row_bom) > 1:
+            return [first_row_bom[1:]]
         else:
             # First row is just the BOM, so we
             # return an empty string.
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
@@ -1927,3 +1927,13 @@ def test_read_table_deprecated(all_parsers):
                                     check_stacklevel=False):
         result = parser.read_table(StringIO(data))
         tm.assert_frame_equal(result, expected)
+
+
+def test_first_row_bom(all_parsers):
+    # see gh-26545
+    parser = all_parsers
+    data = '''\ufeff"Head1"	"Head2"	"Head3"'''
+
+    result = parser.read_csv(StringIO(data), delimiter='\t')
+    expected = DataFrame(columns=["Head1", "Head2", "Head3"])
+    tm.assert_frame_equal(result, expected)