Skip to content

ENH: Better error message if usecols doesn't match columns #17310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,6 +1065,20 @@ def _evaluate_usecols(usecols, names):
return usecols


def _validate_usecols(usecols, names):
"""
Validates that all usecols are present in a given
list of names, if not, raise a ValueError that
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: let's remove the comma splice in this doc-string i.e.

"...names. If not, raise a ValueError that..."

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, let's add a parameter description + return usecols (just in case).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done & done!

I've used the doc format of the existing functions that were below, if this needs to change let me know and I'll do so.

shows what usecols are missing.
"""
missing = [c for c in usecols if c not in names]
if len(missing) > 0:
raise ValueError(
"Usecols do not match columns, "
"columns expected but not found: {missing}".format(missing=missing)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you might need to ', '.join(missing) here, not sure.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Python 3.6.3 |Anaconda, Inc.| (default, Nov  8 2017, 18:10:31) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> l = [1, 'foo', [2,3]]
>>> 'Formatted l: {}'.format(l)
"Formatted l: [1, 'foo', [2, 3]]"

if you'd prefer a different error message, I'd be happy to use join though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that’s fine i guess format is pretty smart about this ok!

)


def _validate_skipfooter_arg(skipfooter):
"""
Validate the 'skipfooter' parameter.
Expand Down Expand Up @@ -1662,14 +1676,14 @@ def __init__(self, src, **kwds):
# GH 14671
if (self.usecols_dtype == 'string' and
not set(usecols).issubset(self.orig_names)):
raise ValueError("Usecols do not match names.")
_validate_usecols(usecols, self.orig_names)

if len(self.names) > len(usecols):
self.names = [n for i, n in enumerate(self.names)
if (i in usecols or n in usecols)]

if len(self.names) < len(usecols):
raise ValueError("Usecols do not match names.")
_validate_usecols(usecols, self.names)

self._set_noconvert_columns()

Expand Down Expand Up @@ -2442,9 +2456,13 @@ def _handle_usecols(self, columns, usecols_key):
raise ValueError("If using multiple headers, usecols must "
"be integers.")
col_indices = []

for col in self.usecols:
if isinstance(col, string_types):
col_indices.append(usecols_key.index(col))
try:
col_indices.append(usecols_key.index(col))
except ValueError:
_validate_usecols(self.usecols, usecols_key)
else:
col_indices.append(col)
else:
Expand Down
16 changes: 8 additions & 8 deletions pandas/tests/io/parser/usecols.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,10 @@ def test_raise_on_usecols_names_mismatch(self):
# GH 14671
data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'

if self.engine == 'c':
msg = 'Usecols do not match names'
else:
msg = 'is not in list'
msg = (
"Usecols do not match columns, "
"columns expected but not found: {missing}"
)

usecols = ['a', 'b', 'c', 'd']
df = self.read_csv(StringIO(data), usecols=usecols)
Expand All @@ -492,11 +492,11 @@ def test_raise_on_usecols_names_mismatch(self):
tm.assert_frame_equal(df, expected)

usecols = ['a', 'b', 'c', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a test where you have 2 missing cols?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, will do this and add the whatsnew entry tomorrow, thanks!

self.read_csv(StringIO(data), usecols=usecols)

usecols = ['a', 'b', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
self.read_csv(StringIO(data), usecols=usecols)

names = ['A', 'B', 'C', 'D']
Expand All @@ -520,9 +520,9 @@ def test_raise_on_usecols_names_mismatch(self):
# tm.assert_frame_equal(df, expected)

usecols = ['A', 'B', 'C', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
self.read_csv(StringIO(data), header=0, names=names,
usecols=usecols)
usecols = ['A', 'B', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
self.read_csv(StringIO(data), names=names, usecols=usecols)