-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
Read excel nrows #16672
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Read excel nrows #16672
Changes from 2 commits
9788cdb
53bdb62
72cd981
f1a6740
ef52114
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -82,6 +82,8 @@ | |
Rows to skip at the beginning (0-indexed) | ||
skip_footer : int, default 0 | ||
Rows at the end to skip (0-indexed) | ||
nrows : int, default None | ||
Number of rows to parse | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Needs a versionadded 0.21.0 tag. |
||
index_col : int, list of ints, default None | ||
Column (0-indexed) to use as the row labels of the DataFrame. | ||
Pass None if there is no such column. If a list is passed, | ||
|
@@ -191,12 +193,12 @@ def get_writer(engine_name): | |
|
||
|
||
@Appender(_read_excel_doc) | ||
def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, | ||
index_col=None, names=None, parse_cols=None, parse_dates=False, | ||
date_parser=None, na_values=None, thousands=None, | ||
convert_float=True, converters=None, dtype=None, | ||
true_values=None, false_values=None, engine=None, | ||
squeeze=False, **kwds): | ||
def read_excel(io, sheet_name=0, header=0, skiprows=None, nrows=None, | ||
skip_footer=0, index_col=None, names=None, parse_cols=None, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We normally don't like to shuffle parameters around in kwargs. Please align the ordering of these params as much as possible with how read_csv does it (obviously only include the current parameters). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure, not a problem. I was using read_csv as a guide and saw that I will go ahead and line up the read_excel kwargs as close as possible to the read_csv kwargs. I see 3 or 4 out of order with just a quick glance. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, great. We probably need a slightly expanded whatsnew note to describe this. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I rearranged the kwargs and also updated the docstrings to match the parameter order (one of the things that always bugs me). Should I also update all internal function kwargs in I'm adding the following note to the Other Enhancements section (just wanted to make sure it was the right spot...): Thanks! There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I would have the internal API match the external one. |
||
parse_dates=False, date_parser=None, na_values=None, | ||
thousands=None, convert_float=True, converters=None, | ||
dtype=None, true_values=None, false_values=None, | ||
engine=None, squeeze=False, **kwds): | ||
|
||
# Can't use _deprecate_kwarg since sheetname=None has a special meaning | ||
if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: | ||
|
@@ -211,12 +213,13 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, | |
io = ExcelFile(io, engine=engine) | ||
|
||
return io._parse_excel( | ||
sheetname=sheet_name, header=header, skiprows=skiprows, names=names, | ||
index_col=index_col, parse_cols=parse_cols, parse_dates=parse_dates, | ||
date_parser=date_parser, na_values=na_values, thousands=thousands, | ||
convert_float=convert_float, skip_footer=skip_footer, | ||
converters=converters, dtype=dtype, true_values=true_values, | ||
false_values=false_values, squeeze=squeeze, **kwds) | ||
sheetname=sheet_name, header=header, skiprows=skiprows, nrows=nrows, | ||
names=names, index_col=index_col, parse_cols=parse_cols, | ||
parse_dates=parse_dates, date_parser=date_parser, na_values=na_values, | ||
thousands=thousands, convert_float=convert_float, | ||
skip_footer=skip_footer, converters=converters, dtype=dtype, | ||
true_values=true_values, false_values=false_values, | ||
squeeze=squeeze, **kwds) | ||
|
||
|
||
class ExcelFile(object): | ||
|
@@ -275,11 +278,11 @@ def __init__(self, io, **kwds): | |
def __fspath__(self): | ||
return self._io | ||
|
||
def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, | ||
names=None, index_col=None, parse_cols=None, parse_dates=False, | ||
date_parser=None, na_values=None, thousands=None, | ||
convert_float=True, converters=None, true_values=None, | ||
false_values=None, squeeze=False, **kwds): | ||
def parse(self, sheet_name=0, header=0, skiprows=None, nrows=None, | ||
skip_footer=0, names=None, index_col=None, parse_cols=None, | ||
parse_dates=False, date_parser=None, na_values=None, | ||
thousands=None, convert_float=True, converters=None, | ||
true_values=None, false_values=None, squeeze=False, **kwds): | ||
""" | ||
Parse specified sheet(s) into a DataFrame | ||
|
||
|
@@ -288,7 +291,9 @@ def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, | |
""" | ||
|
||
return self._parse_excel(sheetname=sheet_name, header=header, | ||
skiprows=skiprows, names=names, | ||
skiprows=skiprows, | ||
nrow=nrows, | ||
names=names, | ||
index_col=index_col, | ||
parse_cols=parse_cols, | ||
parse_dates=parse_dates, | ||
|
@@ -335,12 +340,12 @@ def _excel2num(x): | |
else: | ||
return i in parse_cols | ||
|
||
def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, | ||
skip_footer=0, index_col=None, parse_cols=None, | ||
parse_dates=False, date_parser=None, na_values=None, | ||
thousands=None, convert_float=True, true_values=None, | ||
false_values=None, verbose=False, dtype=None, | ||
squeeze=False, **kwds): | ||
def _parse_excel(self, sheetname=0, header=0, skiprows=None, nrows=None, | ||
names=None, skip_footer=0, index_col=None, | ||
parse_cols=None, parse_dates=False, date_parser=None, | ||
na_values=None, thousands=None, convert_float=True, | ||
true_values=None, false_values=None, verbose=False, | ||
dtype=None, squeeze=False, **kwds): | ||
|
||
skipfooter = kwds.pop('skipfooter', None) | ||
if skipfooter is not None: | ||
|
@@ -511,12 +516,13 @@ def _parse_cell(cell_contents, cell_typ): | |
true_values=true_values, | ||
false_values=false_values, | ||
skiprows=skiprows, | ||
nrows=nrows, | ||
skipfooter=skip_footer, | ||
squeeze=squeeze, | ||
dtype=dtype, | ||
**kwds) | ||
|
||
output[asheetname] = parser.read() | ||
output[asheetname] = parser.read(nrows=nrows) | ||
if names is not None: | ||
output[asheetname].columns = names | ||
if not squeeze or isinstance(output[asheetname], DataFrame): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -999,6 +999,8 @@ def _failover_to_python(self): | |
|
||
def read(self, nrows=None): | ||
if nrows is not None: | ||
nrows = _validate_integer('nrows', nrows) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good catch (see my comment here). However, instead of littering our code with duplicate checks, here's what I think is best: In your modified Locate the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I originally didn't have this line there, but then I was getting the following error: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, this change should be done in the parser itself. See if you can come up with an example that ONLY uses pd.read_csv directly. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Where should I put this test? There are a bunch in `tests/io/parser`, but nothing for read_csv directly. |
||
|
||
if self.options.get('skipfooter'): | ||
raise ValueError('skipfooter not supported for iteration') | ||
|
||
|
@@ -1893,6 +1895,8 @@ def TextParser(*args, **kwds): | |
date_parser : function, default None | ||
skiprows : list of integers | ||
Row numbers to skip | ||
nrows : int, default None | ||
Number of rows to parse | ||
skipfooter : int | ||
Number of lines at bottom of file to skip | ||
converters : dict, default None | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1000,6 +1000,20 @@ def test_read_excel_skiprows_list(self): | |
'skiprows_list', skiprows=np.array([0, 2])) | ||
tm.assert_frame_equal(actual, expected) | ||
|
||
def test_read_excel_nrows(self): | ||
# GH 16645 | ||
num_rows_to_pull = 5 | ||
actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), | ||
nrows=num_rows_to_pull) | ||
expected = pd.read_excel(os.path.join(self.dirpath, | ||
'test1' + self.ext)) | ||
expected = expected[:num_rows_to_pull] | ||
tm.assert_frame_equal(actual, expected) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Try to pull more rows than exist in the file as well. |
||
|
||
with pytest.raises(ValueError): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @gfyoung don't we have an issue about There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We do, but I think you can circumvent the check via non There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @alysivji : Put this as a separate test, and use |
||
pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), | ||
nrows='5') | ||
|
||
def test_read_excel_squeeze(self): | ||
# GH 12157 | ||
f = os.path.join(self.dirpath, 'test_squeeze' + self.ext) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about:
pd.read_excel()
has gained the nrows
parameter (...)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can make another entry (with this PR number) in the api_breaking section noting that the kwargs are rearranged to match pd.read_csv.
use
:func:`read_excel`