Skip to content

Commit cca4ba9

Browse files
RomainL972
and
Derekt2
committed
ENH: Add an option to prevent stripping extra whitespaces in pd.read_html
Co-authored-by: Derekt2 <[email protected]>
1 parent d41884b commit cca4ba9

File tree

3 files changed

+48
-1
lines changed

3 files changed

+48
-1
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ Other enhancements
4646
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
4747
- :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`)
4848
- :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
49+
- :func:`read_html` now accepts a ``strip_whitespace`` argument to control whether extra whitespace in HTML table cells is collapsed to a single space (:issue:`24766`)
4950
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
5051
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
5152
- :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)

pandas/io/html.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,11 @@ class _HtmlFrameParser:
172172
173173
.. versionadded:: 1.5.0
174174
175+
strip_whitespace : bool, default True
176+
Whether table row values should have all extra whitespaces stripped to
177+
a single space.
178+
.. versionadded:: 3.0.0
179+
175180
Attributes
176181
----------
177182
io : str or file-like
@@ -196,6 +201,11 @@ class _HtmlFrameParser:
196201
197202
.. versionadded:: 1.5.0
198203
204+
strip_whitespace : bool
205+
Whether table row values should have all extra whitespaces stripped to
206+
a single space.
207+
.. versionadded:: 3.0.0
208+
199209
Notes
200210
-----
201211
To subclass this class effectively you must override the following methods:
@@ -222,6 +232,7 @@ def __init__(
222232
displayed_only: bool,
223233
extract_links: Literal[None, "header", "footer", "body", "all"],
224234
storage_options: StorageOptions = None,
235+
strip_whitespace: bool = True,
225236
) -> None:
226237
self.io = io
227238
self.match = match
@@ -230,6 +241,7 @@ def __init__(
230241
self.displayed_only = displayed_only
231242
self.extract_links = extract_links
232243
self.storage_options = storage_options
244+
self.strip_whitespace = strip_whitespace
233245

234246
def parse_tables(self):
235247
"""
@@ -523,10 +535,15 @@ def _expand_colspan_rowspan(
523535
index += 1
524536

525537
# Append the text from this <td>, colspan times
526-
text = _remove_whitespace(self._text_getter(td))
538+
if self.strip_whitespace:
539+
text = _remove_whitespace(self._text_getter(td))
540+
else:
541+
text = self._text_getter(td)
542+
527543
if self.extract_links in ("all", section):
528544
href = self._href_getter(td)
529545
text = (text, href)
546+
530547
rowspan = int(self._attr_getter(td, "rowspan") or 1)
531548
colspan = int(self._attr_getter(td, "colspan") or 1)
532549

@@ -962,6 +979,7 @@ def _parse(
962979
displayed_only,
963980
extract_links,
964981
storage_options,
982+
strip_whitespace,
965983
**kwargs,
966984
):
967985
flavor = _validate_flavor(flavor)
@@ -978,6 +996,7 @@ def _parse(
978996
displayed_only,
979997
extract_links,
980998
storage_options,
999+
strip_whitespace,
9811000
)
9821001

9831002
try:
@@ -1045,6 +1064,7 @@ def read_html(
10451064
extract_links: Literal[None, "header", "footer", "body", "all"] = None,
10461065
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
10471066
storage_options: StorageOptions = None,
1067+
strip_whitespace: bool = True,
10481068
) -> list[DataFrame]:
10491069
r"""
10501070
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1165,6 +1185,11 @@ def read_html(
11651185
11661186
.. versionadded:: 2.1.0
11671187
1188+
strip_whitespace : bool, default True
1189+
Whether runs of extra whitespace in table cell values should be collapsed to
1190+
a single space.
1191+
.. versionadded:: 3.0.0
1192+
11681193
Returns
11691194
-------
11701195
dfs
@@ -1245,4 +1270,5 @@ def read_html(
12451270
extract_links=extract_links,
12461271
dtype_backend=dtype_backend,
12471272
storage_options=storage_options,
1273+
strip_whitespace=strip_whitespace,
12481274
)

pandas/tests/io/test_html.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1657,3 +1657,23 @@ def test_style_tag(self, flavor_read_html):
16571657
result = flavor_read_html(StringIO(data))[0]
16581658
expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"])
16591659
tm.assert_frame_equal(result, expected)
1660+
1661+
def test_strip_whitespace(self, flavor_read_html):
1662+
# GH 24766
1663+
data = """
1664+
<table>
1665+
<tr>
1666+
<td>Field 1
1667+
Field 2</td>
1668+
<td>Value 1
1669+
Value 2</td>
1670+
</tr>
1671+
</table>
1672+
"""
1673+
result_strip = flavor_read_html(StringIO(data))[0]
1674+
expected_strip = DataFrame([["Field 1 Field 2", "Value 1 Value 2"]])
1675+
tm.assert_frame_equal(result_strip, expected_strip)
1676+
1677+
result_nostrip = flavor_read_html(StringIO(data), strip_whitespace=False)[0]
1678+
expected_nostrip = DataFrame([["Field 1\nField 2", "Value 1\nValue 2"]])
1679+
tm.assert_frame_equal(result_nostrip, expected_nostrip)

0 commit comments

Comments
 (0)