
ENH: add use_nullable_dtypes option in read_parquet #31242

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged (9 commits) on Nov 29, 2020
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
@@ -120,6 +120,11 @@ Other enhancements
- ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
- :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use
  nullable dtypes with ``pd.NA`` as the missing value indicator where possible
  in the resulting DataFrame (default is ``False``, and only applicable for
  ``engine="pyarrow"``) (:issue:`31242`)
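To illustrate the entry above, a minimal usage sketch (the file name here is hypothetical):

    import pandas as pd

    # with use_nullable_dtypes=True, integer columns containing missing values
    # come back as Int64 (with pd.NA) instead of float64 (with NaN), strings
    # as "string" instead of object, and booleans as "boolean" instead of object
    df = pd.read_parquet("data.parquet", use_nullable_dtypes=True)
    print(df.dtypes)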


.. _whatsnew_120.api_breaking.python:

61 changes: 57 additions & 4 deletions pandas/io/parquet.py
@@ -1,5 +1,6 @@
""" parquet compat """

from distutils.version import LooseVersion
from typing import Any, AnyStr, Dict, List, Optional
from warnings import catch_warnings

@@ -128,7 +129,12 @@ def write(
        self.api.parquet.write_table(table, path, compression=compression, **kwargs)

    def read(
-        self, path, columns=None, storage_options: StorageOptions = None, **kwargs
+        self,
+        path,
+        columns=None,
+        use_nullable_dtypes=False,
+        storage_options: StorageOptions = None,
+        **kwargs,
    ):
        if is_fsspec_url(path) and "filesystem" not in kwargs:
            import_optional_dependency("fsspec")
@@ -151,9 +157,33 @@ def read(
        should_close = ioargs.should_close

        kwargs["use_pandas_metadata"] = True
        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            if LooseVersion(self.api.__version__) > "0.15.1.dev":
                import pandas as pd

Review thread on the mapping lines below:
Contributor: can you instead import from the arrays locations.
Member (Author): We also import e.g. DataFrame from the main namespace in this file.

                mapping = {
                    self.api.int8(): pd.Int8Dtype(),
                    self.api.int16(): pd.Int16Dtype(),
                    self.api.int32(): pd.Int32Dtype(),
                    self.api.int64(): pd.Int64Dtype(),
                    self.api.uint8(): pd.UInt8Dtype(),
                    self.api.uint16(): pd.UInt16Dtype(),
                    self.api.uint32(): pd.UInt32Dtype(),
                    self.api.uint64(): pd.UInt64Dtype(),
                    self.api.bool_(): pd.BooleanDtype(),
                    self.api.string(): pd.StringDtype(),
                }
                to_pandas_kwargs["types_mapper"] = mapping.get
            else:
                raise ValueError(
                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                    f"({self.api.__version__} is installed)"
                )
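A note on the version gate: comparing against "0.15.1.dev" lets both 0.16 releases and the dev snapshots leading up to 0.16 pass, while the released 0.15.1 does not. An illustration of how LooseVersion orders these strings:

    from distutils.version import LooseVersion

    print(LooseVersion("0.15.1") > "0.15.1.dev")         # False: 0.15.1 sorts first
    print(LooseVersion("0.16.0.dev123") > "0.15.1.dev")  # True
    print(LooseVersion("0.16.0") > "0.15.1.dev")         # True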

        result = self.api.parquet.read_table(
            path, columns=columns, filesystem=fs, **kwargs
-        ).to_pandas()
+        ).to_pandas(**to_pandas_kwargs)
        if should_close:
            path.close()
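For context on the to_pandas(**to_pandas_kwargs) call above: pyarrow's Table.to_pandas (pyarrow >= 0.16) accepts a types_mapper callable that maps each pyarrow DataType to a pandas extension dtype, returning None to fall back to the default conversion, which is why a plain dict.get works. A minimal standalone sketch:

    import pandas as pd
    import pyarrow as pa

    table = pa.table({"x": pa.array([1, None, 3], type=pa.int64())})

    # dict.get returns None for unmapped types -> default conversion
    df = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
    print(df.dtypes)  # x    Int64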

@@ -222,6 +252,12 @@ def write(
    def read(
        self, path, columns=None, storage_options: StorageOptions = None, **kwargs
    ):
Review thread on the kwargs.pop line below:
Contributor: we should have a global option to turn this on (pls add an issue for this)
Contributor: I think this is generally worth it, if you can add an issue for this / PR welcome too! (not blocking for this PR)

        use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
        if use_nullable_dtypes:
            raise ValueError(
                "The 'use_nullable_dtypes' argument is not supported for the "
                "fastparquet engine"
            )
        if is_fsspec_url(path):
            fsspec = import_optional_dependency("fsspec")
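No such global option exists at this point; purely as a sketch of the reviewer's suggestion (the option name is hypothetical), registration via pandas' internal config machinery could look like:

    import pandas._config.config as cf

    with cf.config_prefix("mode"):
        # hypothetical option -- does not exist in pandas; sketching the review idea
        cf.register_option(
            "use_nullable_dtypes",
            False,
            "Whether parquet readers default to nullable dtypes.",
            validator=cf.is_bool,
        )

    # engines could then fall back to the global default, e.g.:
    # use_nullable_dtypes = kwargs.pop(
    #     "use_nullable_dtypes", pd.get_option("mode.use_nullable_dtypes")
    # )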

@@ -313,7 +349,13 @@ def to_parquet(
)


-def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
+def read_parquet(
+    path,
+    engine: str = "auto",
+    columns=None,
+    use_nullable_dtypes: bool = False,
+    **kwargs,
+):
    """
    Load a parquet object from the file path, returning a DataFrame.

@@ -342,6 +384,15 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
        'pyarrow' is unavailable.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    use_nullable_dtypes : bool, default False
        If True, use dtypes that use ``pd.NA`` as missing value indicator
        for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
        As new dtypes that support ``pd.NA`` are added in the future, the
        output with this option will change to use those dtypes.
        Note: this is an experimental option, and behaviour (e.g. additional
        supported dtypes) may change without notice.

        .. versionadded:: 1.2.0

    **kwargs
        Any additional kwargs are passed to the engine.

@@ -350,4 +401,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
    DataFrame
    """
    impl = get_engine(engine)
-    return impl.read(path, columns=columns, **kwargs)
+    return impl.read(
+        path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
+    )
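A hedged end-to-end sketch of the dispatch (file path hypothetical): the keyword is forwarded to whichever engine get_engine selects, and the fastparquet engine rejects it as shown above.

    import pandas as pd

    # pyarrow engine (needs pyarrow >= 0.16): nullable dtypes where possible
    df = pd.read_parquet("data.parquet", engine="pyarrow", use_nullable_dtypes=True)

    # fastparquet engine: the keyword is forwarded by impl.read and rejected
    # ValueError: The 'use_nullable_dtypes' argument is not supported for the
    # fastparquet engine
    pd.read_parquet("data.parquet", engine="fastparquet", use_nullable_dtypes=True)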
29 changes: 29 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -732,6 +732,35 @@ def test_additional_extension_types(self, pa):
        )
        check_round_trip(df, pa)

Review comment on the skip_if_no decorator below:
Contributor: same comment as above

    @td.skip_if_no("pyarrow", min_version="0.15.1.dev")
    def test_use_nullable_dtypes(self, pa):
        import pyarrow.parquet as pq

        table = pyarrow.table(
            {
                "a": pyarrow.array([1, 2, 3, None], "int64"),
                "b": pyarrow.array([1, 2, 3, None], "uint8"),
                "c": pyarrow.array(["a", "b", "c", None]),
                "d": pyarrow.array([True, False, True, None]),
            }
        )
        with tm.ensure_clean() as path:
            # write manually with pyarrow to get integer columns containing nulls
            pq.write_table(table, path)
            result1 = read_parquet(path)
            result2 = read_parquet(path, use_nullable_dtypes=True)

        assert result1["a"].dtype == np.dtype("float64")
        expected = pd.DataFrame(
            {
                "a": pd.array([1, 2, 3, None], dtype="Int64"),
                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
                "c": pd.array(["a", "b", "c", None], dtype="string"),
                "d": pd.array([True, False, True, None], dtype="boolean"),
            }
        )
        tm.assert_frame_equal(result2, expected)
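For reference on why result1["a"] is float64: without a types_mapper, pyarrow's default conversion of an int64 column containing nulls has no NumPy integer representation for the missing values, so it falls back to float64 with NaN. A standalone check (sketch):

    import pyarrow as pa

    arr = pa.array([1, 2, 3, None], type=pa.int64())
    # nulls force the default conversion to float64 with NaN
    print(pa.table({"a": arr}).to_pandas().dtypes)  # a    float64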

    @td.skip_if_no("pyarrow", min_version="0.14")
    def test_timestamp_nanoseconds(self, pa):
        # with version 2.0, pyarrow defaults to writing the nanoseconds, so