PERF: Slow performance of to_dict (#46470)

Roger Thomas · Roger Thomas · commit 8d93fecb0175 · 2022-04-22T10:33:02.000+01:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -453,6 +453,7 @@ Performance improvements
 - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
 - Performance improvement in :func:`factorize` (:issue:`46109`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
+- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict`, especially when using non-mixed dtypes (:issue:`46470`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.bug_fixes:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1771,6 +1771,139 @@ def to_numpy(
 
         return result
 
+    def _to_dict_helper(self, orient, into_c, into):
+        """Helper that does the main work of converting the frame to a dict (or,
+        for ``orient="records"``, a list of dicts) based on `orient` and `into`.
+
+        As part of GH46470, maybe_box_native is applied only to values coming from
+        object-dtype columns, as it can perform badly and is unnecessary otherwise.
+        """
+        object_dtype_cols = {
+            col for col, dtype in self.dtypes.items() if is_object_dtype(dtype)
+        }
+        are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes)
+        if orient == "dict":
+            return into_c((k, v.to_dict(into)) for k, v in self.items())
+        elif orient == "list":
+            return into_c(
+                (
+                    k,
+                    list(map(maybe_box_native, v.tolist()))
+                    if k in object_dtype_cols
+                    else v.tolist(),
+                )
+                for k, v in self.items()
+            )
+        elif orient == "split":
+            if are_all_object_dtype_cols:  # every column is object: box all values
+                data = [
+                    list(map(maybe_box_native, t))
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            elif object_dtype_cols:  # mixed dtypes: box only object-column values
+                is_object_dtype_by_index = [
+                    col in object_dtype_cols for col in self.columns
+                ]
+                data = [
+                    [
+                        maybe_box_native(v) if is_object_dtype_by_index[i] else v
+                        for i, v in enumerate(t)
+                    ]
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            else:  # no object columns: rows are used as yielded
+                data = [list(t) for t in self.itertuples(index=False, name=None)]
+            return into_c(
+                (
+                    ("index", self.index.tolist()),
+                    ("columns", self.columns.tolist()),
+                    ("data", data),
+                )
+            )
+        elif orient == "series":  # columns are returned as-is, not converted
+            return into_c((k, v) for k, v in self.items())
+        elif orient == "records":
+            columns = self.columns.tolist()
+            if object_dtype_cols:  # box only object-column values
+                is_object_dtype_by_index = [col in object_dtype_cols for col in columns]
+                return [
+                    into_c(
+                        zip(
+                            columns,
+                            [
+                                maybe_box_native(v)
+                                if is_object_dtype_by_index[i]
+                                else v
+                                for i, v in enumerate(t)
+                            ],
+                        )
+                    )
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            else:  # no object columns: rows are zipped as yielded
+                return [
+                    into_c(zip(columns, t))
+                    for t in self.itertuples(index=False, name=None)
+                ]
+        elif orient == "index":
+            if not self.index.is_unique:
+                raise ValueError("DataFrame index must be unique for orient='index'.")
+            columns = self.columns.tolist()
+            if object_dtype_cols:  # box only object-column values
+                is_object_dtype_by_index = [
+                    col in object_dtype_cols for col in self.columns
+                ]
+                return into_c(
+                    (
+                        t[0],
+                        {
+                            columns[i]: maybe_box_native(v)
+                            if is_object_dtype_by_index[i]
+                            else v
+                            for i, v in enumerate(t[1:])
+                        },
+                    )
+                    for t in self.itertuples(name=None)
+                )
+            else:  # no object columns: row values are used as yielded
+                return into_c(
+                    (
+                        t[0],
+                        {columns[i]: v for i, v in enumerate(t[1:])},
+                    )
+                    for t in self.itertuples(name=None)
+                )
+        elif orient == "tight":
+            if are_all_object_dtype_cols:  # every column is object: box all values
+                data = [
+                    list(map(maybe_box_native, t))
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            elif object_dtype_cols:  # mixed dtypes: box only object-column values
+                is_object_dtype_by_index = [
+                    col in object_dtype_cols for col in self.columns
+                ]
+                data = [
+                    [
+                        maybe_box_native(v) if is_object_dtype_by_index[i] else v
+                        for i, v in enumerate(t)
+                    ]
+                    for t in self.itertuples(index=False, name=None)
+                ]
+            else:  # no object columns: rows are used as yielded
+                data = [list(t) for t in self.itertuples(index=False, name=None)]
+            return into_c(
+                (
+                    ("index", self.index.tolist()),
+                    ("columns", self.columns.tolist()),
+                    ("data", data),
+                    ("index_names", list(self.index.names)),
+                    ("column_names", list(self.columns.names)),
+                )
+            )
+        else:
+            raise ValueError(f"orient '{orient}' not understood")
+
     def to_dict(self, orient: str = "dict", into=dict):
         """
         Convert the DataFrame to a dictionary.
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1771,7 +1771,13 @@ def to_dict(self, into=dict):
         """
         # GH16122
         into_c = com.standardize_mapping(into)
-        return into_c((k, maybe_box_native(v)) for k, v in self.items())
+
+        if is_object_dtype(self):
+            return into_c((k, maybe_box_native(v)) for k, v in self.items())
+        else:
+            # Not an object dtype => all types will be the same so let the default
+            # indexer return native python type
+            return into_c((k, v) for k, v in self.items())
 
     def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
         """