ENH: Add an option to prevent stripping extra whitespaces in pd.read_html

RomainL972 · Derekt2 · RomainL972 · commit cca4ba921b4e · 2024-12-15T20:35:25.000-05:00
Co-authored-by: Derekt2 <derek.e.thomas@biola.edu>
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -46,6 +46,7 @@ Other enhancements
 - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
 - :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`)
 - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`)
+- :func:`read_html` now accepts a ``strip_whitespace`` argument to control whether extra whitespace in HTML table cells is collapsed to a single space (:issue:`24766`)
 - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
 - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
 - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -172,6 +172,11 @@ class _HtmlFrameParser:
 
         .. versionadded:: 1.5.0
 
+    strip_whitespace : bool, default True
+        Whether to collapse extra whitespace in table cell text to a single space.
+
+        .. versionadded:: 3.0.0
+
     Attributes
     ----------
     io : str or file-like
@@ -196,6 +201,11 @@ class _HtmlFrameParser:
 
         .. versionadded:: 1.5.0
 
+    strip_whitespace : bool
+        Whether to collapse extra whitespace in table cell text to a single space.
+
+        .. versionadded:: 3.0.0
+
     Notes
     -----
     To subclass this class effectively you must override the following methods:
@@ -222,6 +232,7 @@ def __init__(
         displayed_only: bool,
         extract_links: Literal[None, "header", "footer", "body", "all"],
         storage_options: StorageOptions = None,
+        strip_whitespace: bool = True,
     ) -> None:
         self.io = io
         self.match = match
@@ -230,6 +241,7 @@ def __init__(
         self.displayed_only = displayed_only
         self.extract_links = extract_links
         self.storage_options = storage_options
+        self.strip_whitespace = strip_whitespace
 
     def parse_tables(self):
         """
@@ -523,10 +535,15 @@ def _expand_colspan_rowspan(
                     index += 1
 
                 # Append the text from this <td>, colspan times
-                text = _remove_whitespace(self._text_getter(td))
+                if self.strip_whitespace:
+                    text = _remove_whitespace(self._text_getter(td))
+                else:
+                    text = self._text_getter(td)
+
                 if self.extract_links in ("all", section):
                     href = self._href_getter(td)
                     text = (text, href)
+
                 rowspan = int(self._attr_getter(td, "rowspan") or 1)
                 colspan = int(self._attr_getter(td, "colspan") or 1)
 
@@ -962,6 +979,7 @@ def _parse(
     displayed_only,
     extract_links,
     storage_options,
+    strip_whitespace,
     **kwargs,
 ):
     flavor = _validate_flavor(flavor)
@@ -978,6 +996,7 @@ def _parse(
             displayed_only,
             extract_links,
             storage_options,
+            strip_whitespace,
         )
 
         try:
@@ -1045,6 +1064,7 @@ def read_html(
     extract_links: Literal[None, "header", "footer", "body", "all"] = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
     storage_options: StorageOptions = None,
+    strip_whitespace: bool = True,
 ) -> list[DataFrame]:
     r"""
     Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1165,6 +1185,11 @@ def read_html(
 
         .. versionadded:: 2.1.0
 
+    strip_whitespace : bool, default True
+        Whether to collapse extra whitespace in table cell text to a single space.
+
+        .. versionadded:: 3.0.0
+
     Returns
     -------
     dfs
@@ -1245,4 +1270,5 @@ def read_html(
         extract_links=extract_links,
         dtype_backend=dtype_backend,
         storage_options=storage_options,
+        strip_whitespace=strip_whitespace,
     )
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1657,3 +1657,23 @@ def test_style_tag(self, flavor_read_html):
         result = flavor_read_html(StringIO(data))[0]
         expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"])
         tm.assert_frame_equal(result, expected)
+
+    def test_strip_whitespace(self, flavor_read_html):
+        # GH 24766: strip_whitespace=False must preserve in-cell newlines
+        data = """
+        <table>
+            <tr>
+                <td>Field 1
+Field 2</td>
+                <td>Value 1
+Value 2</td>
+            </tr>
+        </table>
+        """
+        result_strip = flavor_read_html(StringIO(data))[0]
+        expected_strip = DataFrame([["Field 1 Field 2", "Value 1 Value 2"]])
+        tm.assert_frame_equal(result_strip, expected_strip)
+
+        result_nostrip = flavor_read_html(StringIO(data), strip_whitespace=False)[0]
+        expected_nostrip = DataFrame([["Field 1\nField 2", "Value 1\nValue 2"]])
+        tm.assert_frame_equal(result_nostrip, expected_nostrip)