read_excel: add `comment` as a named argument, plus test_comment_* tests

JanLauGe · JanLauGe · commit 6755f081d97a · 2017-12-13T12:40:20.000Z
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -154,6 +154,9 @@
     convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
     data will be read in as floats: Excel stores all numbers as floats
     internally
+comment : str, default None
+    Character(s) that mark the start of a comment in the input file. Any data
+    between the comment marker and the end of the line is ignored.
 
 Returns
 -------
@@ -294,6 +297,7 @@ def read_excel(io,
                thousands=None,
                skip_footer=0,
                convert_float=True,
+               comment=None,
                **kwds):
 
     # Can't use _deprecate_kwarg since sheetname=None has a special meaning
@@ -327,6 +331,7 @@ def read_excel(io,
         thousands=thousands,
         skip_footer=skip_footer,
         convert_float=convert_float,
+        comment=comment,
         **kwds)
 
 
@@ -409,6 +414,7 @@ def parse(self,
               thousands=None,
               skip_footer=0,
               convert_float=True,
+              comment=None,
               **kwds):
         """
         Parse specified sheet(s) into a DataFrame
@@ -434,6 +440,7 @@ def parse(self,
                                  thousands=thousands,
                                  skip_footer=skip_footer,
                                  convert_float=convert_float,
+                                 comment=comment,
                                  **kwds)
 
     def _should_parse(self, i, usecols):
@@ -488,6 +495,7 @@ def _parse_excel(self,
                      thousands=None,
                      skip_footer=0,
                      convert_float=True,
+                     comment=None,
                      **kwds):
 
         skipfooter = kwds.pop('skipfooter', None)
@@ -665,6 +673,7 @@ def _parse_cell(cell_contents, cell_typ):
                                     date_parser=date_parser,
                                     thousands=thousands,
                                     skipfooter=skip_footer,
+                                    comment=comment,
                                     **kwds)
 
                 output[asheetname] = parser.read(nrows=nrows)
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -1862,6 +1862,62 @@ def test_invalid_columns(self):
             with pytest.raises(KeyError):
                 write_frame.to_excel(path, 'test1', columns=['C', 'D'])
 
+    def test_comment_arg(self):
+        # Test that passing comment='#' changes the values read_excel returns
+        with ensure_clean(self.ext) as path:
+            write_frame = DataFrame({'A': ['one', '#one', 'one'],
+                                     'B': ['two', 'two', '#two']})
+            write_frame.to_excel(path, 'test_c')
+
+            # Expected: the plain read with each commented-out cell blanked
+            expected = read_excel(path, 'test_c')
+            for row, col in [(1, 0), (1, 1), (2, 1)]:
+                expected.iloc[row, col] = None
+            read_frame_commented = read_excel(path, 'test_c', comment='#')
+            tm.assert_frame_equal(read_frame_commented, expected)
+
+    def test_comment_default(self):
+        # Omitting ``comment`` must behave exactly like passing comment=None
+        sheet = 'test_c'
+        source = DataFrame({'A': ['one', '#one', 'one'],
+                            'B': ['two', 'two', '#two']})
+
+        with ensure_clean(self.ext) as path:
+            source.to_excel(path, sheet)
+
+            # Read once with the default and once with an explicit None
+            implicit = read_excel(path, sheet)
+            explicit = read_excel(path, sheet, comment=None)
+            tm.assert_frame_equal(implicit, explicit)
+
+    def test_comment_used(self):
+        # With comment='#', the commented-out cells must come back as
+        # missing values while every other cell is left untouched
+        sheet = 'test_c'
+        source = DataFrame({'A': ['one', '#one', 'one'],
+                            'B': ['two', 'two', '#two']})
+        with ensure_clean(self.ext) as path:
+            source.to_excel(path, sheet)
+            result = read_excel(path, sheet, comment='#')
+
+            # Build the expectation from a plain read by blanking, by hand,
+            # the positions that are expected to be commented out
+            expected = read_excel(path, sheet)
+            for row, col in [(1, 0), (1, 1), (2, 1)]:
+                expected.iloc[row, col] = None
+            tm.assert_frame_equal(result, expected)
+
+    def test_comment_emptyline(self):
+        # A trailing row that starts with the comment marker must be dropped
+        source = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']})
+        with ensure_clean(self.ext) as path:
+            source.to_excel(path, index=False)
+            result = read_excel(path, comment='#')
+
+        # The result must have the shape of the first written row alone
+        first_row_only = source.iloc[0:1, :]
+        assert result.shape == first_row_only.shape
+
     def test_datetimes(self):
 
         # Test writing and reading datetimes. For issue #9139. (xref #9185)