
ENH: add use_nullable_dtypes option in read_parquet #31242

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged (9 commits) on Nov 29, 2020
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
@@ -120,6 +120,11 @@ Other enhancements
- ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
- :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use
  nullable dtypes with ``pd.NA`` as the missing value indicator where possible
  in the resulting DataFrame (default is ``False``, and only applicable for
  ``engine="pyarrow"``) (:issue:`31242`)
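To illustrate the entry above, a minimal usage sketch (the file name here is hypothetical):

    import pandas as pd

    # with use_nullable_dtypes=True, integer columns containing missing values
    # come back as Int64 (with pd.NA) instead of float64 (with NaN), strings
    # as "string" instead of object, and booleans as "boolean" instead of object
    df = pd.read_parquet("data.parquet", use_nullable_dtypes=True)
    print(df.dtypes)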


.. _whatsnew_120.api_breaking.python:

61 changes: 57 additions & 4 deletions pandas/io/parquet.py
@@ -1,5 +1,6 @@
""" parquet compat """

from distutils.version import LooseVersion
from typing import Any, AnyStr, Dict, List, Optional
from warnings import catch_warnings

@@ -128,7 +129,12 @@ def write(
        self.api.parquet.write_table(table, path, compression=compression, **kwargs)

    def read(
-        self, path, columns=None, storage_options: StorageOptions = None, **kwargs
+        self,
+        path,
+        columns=None,
+        use_nullable_dtypes=False,
+        storage_options: StorageOptions = None,
+        **kwargs,
    ):
        if is_fsspec_url(path) and "filesystem" not in kwargs:
            import_optional_dependency("fsspec")
@@ -151,9 +157,33 @@ def read(
        should_close = ioargs.should_close

        kwargs["use_pandas_metadata"] = True
        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            if LooseVersion(self.api.__version__) > "0.15.1.dev":
                import pandas as pd

Review thread on the mapping lines below:
Contributor: can you instead import from the arrays locations.
Member (Author): We also import e.g. DataFrame from the main namespace in this file.

                mapping = {
                    self.api.int8(): pd.Int8Dtype(),
                    self.api.int16(): pd.Int16Dtype(),
                    self.api.int32(): pd.Int32Dtype(),
                    self.api.int64(): pd.Int64Dtype(),
                    self.api.uint8(): pd.UInt8Dtype(),
                    self.api.uint16(): pd.UInt16Dtype(),
                    self.api.uint32(): pd.UInt32Dtype(),
                    self.api.uint64(): pd.UInt64Dtype(),
                    self.api.bool_(): pd.BooleanDtype(),
                    self.api.string(): pd.StringDtype(),
                }
                to_pandas_kwargs["types_mapper"] = mapping.get
            else:
                raise ValueError(
                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                    f"({self.api.__version__} is installed)"
                )
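A note on the version gate: comparing against "0.15.1.dev" lets both 0.16 releases and the dev snapshots leading up to 0.16 pass, while the released 0.15.1 does not. An illustration of how LooseVersion orders these strings:

    from distutils.version import LooseVersion

    print(LooseVersion("0.15.1") > "0.15.1.dev")         # False: 0.15.1 sorts first
    print(LooseVersion("0.16.0.dev123") > "0.15.1.dev")  # True
    print(LooseVersion("0.16.0") > "0.15.1.dev")         # True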

        result = self.api.parquet.read_table(
            path, columns=columns, filesystem=fs, **kwargs
-        ).to_pandas()
+        ).to_pandas(**to_pandas_kwargs)
        if should_close:
            path.close()
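For context on the to_pandas(**to_pandas_kwargs) call above: pyarrow's Table.to_pandas (pyarrow >= 0.16) accepts a types_mapper callable that maps each pyarrow DataType to a pandas extension dtype, returning None to fall back to the default conversion, which is why a plain dict.get works. A minimal standalone sketch:

    import pandas as pd
    import pyarrow as pa

    table = pa.table({"x": pa.array([1, None, 3], type=pa.int64())})

    # dict.get returns None for unmapped types -> default conversion
    df = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
    print(df.dtypes)  # x    Int64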

@@ -222,6 +252,12 @@ def write(
    def read(
        self, path, columns=None, storage_options: StorageOptions = None, **kwargs
    ):
Review thread on the kwargs.pop line below:
Contributor: we should have a global option to turn this on (pls add an issue for this)
Contributor: I think this is generally worth it, if you can add an issue for this / PR welcome too! (not blocking for this PR)

        use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
        if use_nullable_dtypes:
            raise ValueError(
                "The 'use_nullable_dtypes' argument is not supported for the "
                "fastparquet engine"
            )
        if is_fsspec_url(path):
            fsspec = import_optional_dependency("fsspec")
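No such global option exists at this point; purely as a sketch of the reviewer's suggestion (the option name is hypothetical), registration via pandas' internal config machinery could look like:

    import pandas._config.config as cf

    with cf.config_prefix("mode"):
        # hypothetical option -- does not exist in pandas; sketching the review idea
        cf.register_option(
            "use_nullable_dtypes",
            False,
            "Whether parquet readers default to nullable dtypes.",
            validator=cf.is_bool,
        )

    # engines could then fall back to the global default, e.g.:
    # use_nullable_dtypes = kwargs.pop(
    #     "use_nullable_dtypes", pd.get_option("mode.use_nullable_dtypes")
    # )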

@@ -313,7 +349,13 @@ def to_parquet(
)


-def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
+def read_parquet(
+    path,
+    engine: str = "auto",
+    columns=None,
+    use_nullable_dtypes: bool = False,
+    **kwargs,
+):
    """
    Load a parquet object from the file path, returning a DataFrame.

@@ -342,6 +384,15 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
        'pyarrow' is unavailable.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    use_nullable_dtypes : bool, default False
        If True, use dtypes that use ``pd.NA`` as missing value indicator
        for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
        As new dtypes that support ``pd.NA`` are added in the future, the
        output with this option will change to use those dtypes.
        Note: this is an experimental option, and behaviour (e.g. additional
        supported dtypes) may change without notice.

        .. versionadded:: 1.2.0

    **kwargs
        Any additional kwargs are passed to the engine.

@@ -350,4 +401,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
    DataFrame
    """
    impl = get_engine(engine)
-    return impl.read(path, columns=columns, **kwargs)
+    return impl.read(
+        path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
+    )
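A hedged end-to-end sketch of the dispatch (file path hypothetical): the keyword is forwarded to whichever engine get_engine selects, and the fastparquet engine rejects it as shown above.

    import pandas as pd

    # pyarrow engine (needs pyarrow >= 0.16): nullable dtypes where possible
    df = pd.read_parquet("data.parquet", engine="pyarrow", use_nullable_dtypes=True)

    # fastparquet engine: the keyword is forwarded by impl.read and rejected
    # ValueError: The 'use_nullable_dtypes' argument is not supported for the
    # fastparquet engine
    pd.read_parquet("data.parquet", engine="fastparquet", use_nullable_dtypes=True)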
29 changes: 29 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -732,6 +732,35 @@ def test_additional_extension_types(self, pa):
        )
        check_round_trip(df, pa)

Review comment on the skip_if_no decorator below:
Contributor: same comment as above

    @td.skip_if_no("pyarrow", min_version="0.15.1.dev")
    def test_use_nullable_dtypes(self, pa):
        import pyarrow.parquet as pq

        table = pyarrow.table(
            {
                "a": pyarrow.array([1, 2, 3, None], "int64"),
                "b": pyarrow.array([1, 2, 3, None], "uint8"),
                "c": pyarrow.array(["a", "b", "c", None]),
                "d": pyarrow.array([True, False, True, None]),
            }
        )
        with tm.ensure_clean() as path:
            # write manually with pyarrow to get integer columns containing nulls
            pq.write_table(table, path)
            result1 = read_parquet(path)
            result2 = read_parquet(path, use_nullable_dtypes=True)

        assert result1["a"].dtype == np.dtype("float64")
        expected = pd.DataFrame(
            {
                "a": pd.array([1, 2, 3, None], dtype="Int64"),
                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
                "c": pd.array(["a", "b", "c", None], dtype="string"),
                "d": pd.array([True, False, True, None], dtype="boolean"),
            }
        )
        tm.assert_frame_equal(result2, expected)
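For reference on why result1["a"] is float64: without a types_mapper, pyarrow's default conversion of an int64 column containing nulls has no NumPy integer representation for the missing values, so it falls back to float64 with NaN. A standalone check (sketch):

    import pyarrow as pa

    arr = pa.array([1, 2, 3, None], type=pa.int64())
    # nulls force the default conversion to float64 with NaN
    print(pa.table({"a": arr}).to_pandas().dtypes)  # a    float64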

    @td.skip_if_no("pyarrow", min_version="0.14")
    def test_timestamp_nanoseconds(self, pa):
        # with version 2.0, pyarrow defaults to writing the nanoseconds, so