-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
ENH: Allow compression in NDFrame.to_csv to be a dict with optional arguments (#26023) #26024
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 22 commits
4e73dc4
ab7620d
2e782f9
83e8834
d238878
b41be54
60ea58c
8ba9082
0a3a9fd
a1cb3f7
af2a96c
5853a28
789751f
5b09e6f
68a2b4d
c856f50
8df6c81
40d0252
18a735d
103c877
b6c34bc
969d387
abfbc0f
04ae25d
9c22652
56a75c2
bbfea34
7717f16
779511e
780eb04
6c4e679
1b567c9
9324b63
7cf65ee
29374f3
6701aa4
0f5489d
e04138e
6f2bf00
865aa81
8d1deee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,7 @@ | |
import operator | ||
import pickle | ||
from textwrap import dedent | ||
from typing import Callable, FrozenSet, List, Set | ||
from typing import Any, Callable, Dict, FrozenSet, List, Optional, Set, Union | ||
import warnings | ||
import weakref | ||
|
||
|
@@ -2942,10 +2942,11 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, | |
|
||
def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, | ||
columns=None, header=True, index=True, index_label=None, | ||
mode='w', encoding=None, compression='infer', quoting=None, | ||
quotechar='"', line_terminator=None, chunksize=None, | ||
tupleize_cols=None, date_format=None, doublequote=True, | ||
escapechar=None, decimal='.'): | ||
mode='w', encoding=None, | ||
compression: Optional[Union[str, Dict[str, Any]]] = 'infer', | ||
quoting=None, quotechar='"', line_terminator=None, | ||
chunksize=None, tupleize_cols=None, date_format=None, | ||
doublequote=True, escapechar=None, decimal='.'): | ||
r""" | ||
Write object to a comma-separated values (csv) file. | ||
|
||
|
@@ -2992,16 +2993,21 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, | |
encoding : str, optional | ||
A string representing the encoding to use in the output file, | ||
defaults to 'utf-8'. | ||
compression : str, default 'infer' | ||
Compression mode among the following possible values: {'infer', | ||
'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf` | ||
is path-like, then detect compression from the following | ||
extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no | ||
compression). | ||
|
||
.. versionchanged:: 0.24.0 | ||
|
||
'infer' option added and set to default. | ||
compression : str or dict, default 'infer' | ||
If str, represents compression mode. If dict, value at 'method' is | ||
the compression mode. Compression mode may be any of the following | ||
possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If | ||
compression mode is 'infer' and `path_or_buf` is path-like, then | ||
detect compression mode from the following extensions: '.gz', | ||
'.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given | ||
and mode is 'zip' or inferred as 'zip', other entries passed as | ||
additional compression options. | ||
|
||
.. versionchanged:: 0.25.0 | ||
|
||
May now be a dict with key 'method' as compression mode | ||
and other entries as additional compression options if | ||
compression mode is 'zip'. | ||
|
||
quoting : optional constant from csv module | ||
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` | ||
|
@@ -3054,6 +3060,10 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, | |
... 'weapon': ['sai', 'bo staff']}) | ||
>>> df.to_csv(index=False) | ||
'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' | ||
|
||
# create 'out.zip' containing 'out.csv' | ||
>>> compression_opts = dict(method='zip', archive_name='out.csv') | ||
Review comment: Can you put the comment before the example (and put a blank line between cases)? This might also need a `# doctest: +SKIP` directive here.
Follow-up review comment: Can you do this? |
||
>>> df.to_csv('out.zip', index=False, compression=compression_opts) | ||
""" | ||
|
||
df = self if isinstance(self, ABCDataFrame) else self.to_frame() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ | |
import mmap | ||
import os | ||
import pathlib | ||
from typing import Any, Dict, Optional, Tuple, Union | ||
from urllib.error import URLError # noqa | ||
from urllib.parse import ( # noqa | ||
urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params, | ||
|
@@ -219,13 +220,46 @@ def file_path_to_url(path): | |
} | ||
|
||
|
||
def _get_compression_method(compression: Optional[Union[str, Dict[str, Any]]]): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the Dict can be typed as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you do this |
||
""" | ||
Simplifies a compression argument to a compression method string and | ||
a dict containing additional arguments. | ||
|
||
Parameters | ||
---------- | ||
compression : str or dict | ||
If string, specifies the compression method. If dict, value at key | ||
'method' specifies compression method. | ||
|
||
Returns | ||
------- | ||
tuple of ({compression method}, any | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any -> Optional[str] |
||
{compression arguments}, dict) | ||
|
||
Raises | ||
------ | ||
ValueError on dict missing 'method' key | ||
""" | ||
# Handle dict | ||
if isinstance(compression, dict): | ||
compression_args = compression.copy() | ||
try: | ||
compression = compression['method'] | ||
compression_args.pop('method') | ||
except KeyError: | ||
raise ValueError("If dict, compression " | ||
"must have key 'method'") | ||
else: | ||
compression_args = {} | ||
return compression, compression_args | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def _infer_compression(filepath_or_buffer, compression): | ||
""" | ||
Get the compression method for filepath_or_buffer. If compression='infer', | ||
the inferred compression method is returned. Otherwise, the input | ||
compression method is returned unchanged, unless it's invalid, in which | ||
case an error is raised. | ||
|
||
gfyoung marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Parameters | ||
---------- | ||
filepath_or_buffer : | ||
|
@@ -234,12 +268,10 @@ def _infer_compression(filepath_or_buffer, compression): | |
If 'infer' and `filepath_or_buffer` is path-like, then detect | ||
compression from the following extensions: '.gz', '.bz2', '.zip', | ||
or '.xz' (otherwise no compression). | ||
|
||
Returns | ||
------- | ||
string or None : | ||
compression method | ||
|
||
Raises | ||
------ | ||
ValueError on invalid compression specified | ||
|
@@ -273,7 +305,8 @@ def _infer_compression(filepath_or_buffer, compression): | |
raise ValueError(msg) | ||
|
||
|
||
def _get_handle(path_or_buf, mode, encoding=None, compression=None, | ||
def _get_handle(path_or_buf, mode, encoding=None, | ||
compression: Optional[Union[str, Dict[str, Any]]] = None, | ||
memory_map=False, is_text=True): | ||
""" | ||
Get file handle for given path/buffer and mode. | ||
|
@@ -285,10 +318,21 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, | |
mode : str | ||
mode to open path_or_buf with | ||
encoding : str or None | ||
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None | ||
If 'infer' and `filepath_or_buffer` is path-like, then detect | ||
compression from the following extensions: '.gz', '.bz2', '.zip', | ||
or '.xz' (otherwise no compression). | ||
compression : str or dict, default None | ||
If string, specifies compression mode. If dict, value at key 'method' | ||
specifies compression mode. Compression mode must be one of {'infer', | ||
'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' | ||
and `filepath_or_buffer` is path-like, then detect compression from | ||
the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise | ||
no compression). If dict and compression mode is 'zip' or inferred as | ||
'zip', other entries passed as additional compression options. | ||
|
||
.. versionchanged:: 0.25.0 | ||
|
||
May now be a dict with key 'method' as compression mode | ||
and other keys as compression options if compression | ||
mode is 'zip'. | ||
|
||
memory_map : boolean, default False | ||
See parsers._parser_params for more information. | ||
is_text : boolean, default True | ||
|
@@ -304,7 +348,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, | |
""" | ||
try: | ||
from s3fs import S3File | ||
need_text_wrapping = (BytesIO, S3File) | ||
need_text_wrapping = (BytesIO, S3File) # type: Tuple | ||
Review thread:
- Was this throwing a typing error? I think MyPy should be able to infer the type here.
- @WillAyd Yeah, it was. I couldn't tell why, but MyPy couldn't infer the type once I added types to the function definition. It also occurred on [reference lost in extraction].
- What error was it giving you?
- It was giving [error text lost in extraction] |
||
except ImportError: | ||
need_text_wrapping = (BytesIO,) | ||
|
||
|
@@ -315,6 +359,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, | |
path_or_buf = _stringify_path(path_or_buf) | ||
is_path = isinstance(path_or_buf, str) | ||
|
||
compression, compression_args = _get_compression_method(compression) | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if is_path: | ||
compression = _infer_compression(path_or_buf, compression) | ||
|
||
|
@@ -336,7 +381,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, | |
|
||
# ZIP Compression | ||
elif compression == 'zip': | ||
zf = BytesZipFile(path_or_buf, mode) | ||
zf = BytesZipFile(path_or_buf, mode, **compression_args) | ||
# Ensure the container is closed as well. | ||
handles.append(zf) | ||
if zf.mode == 'w': | ||
|
@@ -406,13 +451,19 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore | |
bytes strings into a member of the archive. | ||
""" | ||
# GH 17778 | ||
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs): | ||
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, | ||
archive_name: Optional[str] = None, | ||
**kwargs): | ||
if mode in ['wb', 'rb']: | ||
mode = mode.replace('b', '') | ||
self.archive_name = archive_name | ||
super().__init__(file, mode, compression, **kwargs) | ||
|
||
def write(self, data): | ||
super().writestr(self.filename, data) | ||
archive_name = self.filename | ||
if self.archive_name is not None: | ||
archive_name = self.archive_name | ||
super().writestr(archive_name, data) | ||
|
||
@property | ||
def closed(self): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
import csv as csvlib | ||
from io import StringIO | ||
import os | ||
from typing import Any, Dict, Optional, Union | ||
import warnings | ||
from zipfile import ZipFile | ||
|
||
|
@@ -17,26 +18,32 @@ | |
from pandas.core.dtypes.missing import notna | ||
|
||
from pandas.io.common import ( | ||
UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer) | ||
UnicodeWriter, _get_compression_method, _get_handle, _infer_compression, | ||
get_filepath_or_buffer) | ||
|
||
|
||
class CSVFormatter: | ||
|
||
def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', | ||
float_format=None, cols=None, header=True, index=True, | ||
index_label=None, mode='w', nanRep=None, encoding=None, | ||
compression='infer', quoting=None, line_terminator='\n', | ||
chunksize=None, tupleize_cols=False, quotechar='"', | ||
date_format=None, doublequote=True, escapechar=None, | ||
decimal='.'): | ||
compression: Optional[Union[str, Dict[str, Any]]] = 'infer', | ||
quoting=None, line_terminator='\n', chunksize=None, | ||
tupleize_cols=False, quotechar='"', date_format=None, | ||
doublequote=True, escapechar=None, decimal='.'): | ||
|
||
self.obj = obj | ||
|
||
if path_or_buf is None: | ||
path_or_buf = StringIO() | ||
|
||
# Extract compression mode as given, if dict | ||
compression, self.compression_args = _get_compression_method( | ||
compression) | ||
|
||
self.path_or_buf, _, _, _ = get_filepath_or_buffer( | ||
path_or_buf, encoding=encoding, compression=compression, mode=mode | ||
path_or_buf, encoding=encoding, | ||
compression=compression, mode=mode | ||
) | ||
self.sep = sep | ||
self.na_rep = na_rep | ||
|
@@ -114,7 +121,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', | |
self.data_index = obj.index | ||
if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and | ||
date_format is not None): | ||
from pandas import Index | ||
from pandas import Index # type: ignore | ||
Review thread:
- What error was this giving you?
- MyPy was giving [error text lost in extraction]
- Make sure you try again on master - I think a separate PR should have resolved this already.
- The same error appears on master if any type annotations are added.
- @drew-heenan can you remove this and push as a new commit? Again, I thought we resolved this in a separate PR, so I would like to validate it's not a mypy versioning difference between your local environment and what we have on CI.
- @WillAyd Just did that - the error still appears.
- Hmm, OK, thanks for confirming. @ryankarlos not sure if you have any insight - I thought this would be resolved by #26019. @drew-heenan this isn't a blocker, so it's OK to add the ignore back in, I think; we can review it separately from this.
- @WillAyd Got it, thanks for checking! |
||
self.data_index = Index([x.strftime(date_format) if notna(x) else | ||
'' for x in self.data_index]) | ||
|
||
|
@@ -149,7 +156,8 @@ def save(self): | |
else: | ||
f, handles = _get_handle(self.path_or_buf, self.mode, | ||
encoding=self.encoding, | ||
compression=self.compression) | ||
compression=dict(self.compression_args, | ||
method=self.compression)) | ||
close = True | ||
|
||
try: | ||
|
@@ -173,9 +181,11 @@ def save(self): | |
if hasattr(self.path_or_buf, 'write'): | ||
self.path_or_buf.write(buf) | ||
else: | ||
compression = dict(self.compression_args, | ||
method=self.compression) | ||
f, handles = _get_handle(self.path_or_buf, self.mode, | ||
encoding=self.encoding, | ||
compression=self.compression) | ||
compression=compression) | ||
f.write(buf) | ||
close = True | ||
if close: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -534,3 +534,39 @@ def test_to_csv_compression(self, compression_only, | |
result = pd.read_csv(path, index_col=0, | ||
compression=read_compression) | ||
tm.assert_frame_equal(result, df) | ||
|
||
def test_to_csv_compression_dict(self, compression_only):
    """Round-trip a frame via ``to_csv`` with a dict ``compression``."""
    # GH 26023
    method = compression_only
    # 'gzip' is the only method here whose file extension ("gz") differs
    # from the method name itself.
    extension = "gz" if method == "gzip" else method
    frame = DataFrame({"ABC": [1]})
    with tm.ensure_clean("to_csv_compress_as_dict." + extension) as path:
        frame.to_csv(path, compression={"method": method})
        # No explicit compression is passed when reading back.
        round_tripped = pd.read_csv(path, index_col=0)
        tm.assert_frame_equal(round_tripped, frame)
|
||
def test_to_csv_compression_dict_no_method(self):
    """A ``compression`` dict without a 'method' key must raise ValueError."""
    # GH 26023
    df = DataFrame({"ABC": [1]})
    compression = {"some_option": True}
    # Pin the expectation to the specific error about the missing key,
    # not just any ValueError.
    msg = "must have key 'method'"

    with tm.ensure_clean("out.zip") as path:
        # Keep ``raises`` as the innermost context so that only the
        # ``to_csv`` call (not the temporary-file setup) can satisfy it.
        with pytest.raises(ValueError, match=msg):
            df.to_csv(path, compression=compression)
|
||
@pytest.mark.parametrize("compression", ["zip", "infer"])
@pytest.mark.parametrize("archive_name", [None, "test_to_csv.csv",
                                          "test_to_csv.zip"])
def test_to_csv_zip_arguments(self, compression, archive_name):
    """Check the member name written into a zip archive by ``to_csv``."""
    # GH 26023
    from zipfile import ZipFile

    df = DataFrame({"ABC": [1]})
    with tm.ensure_clean("to_csv_archive_name.zip") as path:
        df.to_csv(path, compression={"method": compression,
                                     "archive_name": archive_name})

        # Without an explicit archive_name the expected member name is
        # derived from the output path; only base names are compared.
        expected_arcname = path if archive_name is None else archive_name
        expected_arcname = os.path.basename(expected_arcname)

        # Open the archive in a context manager so the handle is closed
        # before ensure_clean removes the temporary file.
        with ZipFile(path) as zp:
            assert len(zp.filelist) == 1
            archived_file = os.path.basename(zp.filelist[0].filename)
        assert archived_file == expected_arcname
Uh oh!
There was an error while loading. Please reload this page.