Skip to content

Commit 8d93fec

Browse files
author
Roger Thomas
committed
PERF: Slow performance of to_dict (#46470)
1 parent f9762d8 commit 8d93fec

File tree

3 files changed

+141
-1
lines changed

3 files changed

+141
-1
lines changed

doc/source/whatsnew/v1.5.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,7 @@ Performance improvements
453453
- Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
454454
- Performance improvement in :func:`factorize` (:issue:`46109`)
455455
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
456+
- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` especially when using non-mixed dtypes (:issue:`46470`)
456457

457458
.. ---------------------------------------------------------------------------
458459
.. _whatsnew_150.bug_fixes:

pandas/core/frame.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1771,6 +1771,139 @@ def to_numpy(
17711771

17721772
return result
17731773

1774+
def _to_dict_helper(self, orient: str, into_c, into):
    """
    Do the main work of converting the frame into a dict based on
    `orient` and `into`.

    As part of GH46470 this also takes care of when to use
    ``maybe_box_native``: that function can perform badly and is not
    necessary for non-object columns, so it is only applied to values
    coming from object-dtype columns.

    Parameters
    ----------
    orient : str
        One of ``"dict"``, ``"list"``, ``"split"``, ``"series"``,
        ``"records"``, ``"index"`` or ``"tight"``.
    into_c : callable
        Called with an iterable of ``(key, value)`` pairs to build every
        mapping returned by this helper.
    into
        Forwarded unchanged to ``Series.to_dict`` for ``orient="dict"``.

    Returns
    -------
    The result of ``into_c(...)``, except for ``orient="records"`` which
    returns a list with one ``into_c(...)`` result per row.

    Raises
    ------
    ValueError
        If `orient` is not one of the supported values, or if
        ``orient="index"`` is requested and the index is not unique.
    """
    # Positional mask: True where the column at that position is object dtype.
    # Working by position (rather than by label) keeps the fast paths correct
    # when column labels are duplicated.
    needs_boxing = [is_object_dtype(dtype) for dtype in self.dtypes]
    any_object = any(needs_boxing)
    all_object = all(needs_boxing)

    def box_row(row):
        # Box only the values that sit in object-dtype columns.
        return [
            maybe_box_native(value) if box else value
            for box, value in zip(needs_boxing, row)
        ]

    def rows_as_lists():
        # Row-wise data shared by the "split" and "tight" orients.
        rows = self.itertuples(index=False, name=None)
        if all_object:
            return [list(map(maybe_box_native, row)) for row in rows]
        if any_object:
            return [box_row(row) for row in rows]
        return [list(row) for row in rows]

    if orient == "dict":
        return into_c((k, v.to_dict(into=into)) for k, v in self.items())

    if orient == "list":
        return into_c(
            (
                k,
                list(map(maybe_box_native, v.tolist())) if box else v.tolist(),
            )
            for box, (k, v) in zip(needs_boxing, self.items())
        )

    if orient in ("split", "tight"):
        pairs = (
            ("index", self.index.tolist()),
            ("columns", self.columns.tolist()),
            ("data", rows_as_lists()),
        )
        if orient == "tight":
            # "tight" is "split" plus the names of both axes.
            pairs += (
                ("index_names", list(self.index.names)),
                ("column_names", list(self.columns.names)),
            )
        return into_c(pairs)

    if orient == "series":
        return into_c((k, v) for k, v in self.items())

    if orient == "records":
        columns = self.columns.tolist()
        rows = self.itertuples(index=False, name=None)
        if any_object:
            return [into_c(zip(columns, box_row(row))) for row in rows]
        return [into_c(zip(columns, row)) for row in rows]

    if orient == "index":
        if not self.index.is_unique:
            raise ValueError("DataFrame index must be unique for orient='index'.")
        columns = self.columns.tolist()
        # With the index included, element 0 of each tuple is the row label.
        if any_object:
            return into_c(
                (t[0], dict(zip(columns, box_row(t[1:]))))
                for t in self.itertuples(name=None)
            )
        return into_c(
            (t[0], dict(zip(columns, t[1:]))) for t in self.itertuples(name=None)
        )

    raise ValueError(f"orient '{orient}' not understood")
1906+
17741907
def to_dict(self, orient: str = "dict", into=dict):
17751908
"""
17761909
Convert the DataFrame to a dictionary.

pandas/core/series.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1771,7 +1771,13 @@ def to_dict(self, into=dict):
17711771
"""
17721772
# GH16122
17731773
into_c = com.standardize_mapping(into)
1774-
return into_c((k, maybe_box_native(v)) for k, v in self.items())
1774+
1775+
if is_object_dtype(self):
1776+
return into_c((k, maybe_box_native(v)) for k, v in self.items())
1777+
else:
1778+
# Not an object dtype => all types will be the same so let the default
1779+
# indexer return native python type
1780+
return into_c((k, v) for k, v in self.items())
17751781

17761782
def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
17771783
"""

0 commit comments

Comments
 (0)