chore: change table extraction defaults (Unstructured-IO#370)

ds-filipknefel · Filip Knefel · pawel-kmiecik · web-flow · commit 21bb99c9a00b · 2024-04-11T20:00:16.000+02:00
Changed default value of `pdf_infer_table_structure` to `True`.
Changed default value of `skip_infer_table_types` to `[]`.
Marked `pdf_infer_table_structure` as deprecated and removed from documented usage examples.
Updated tests in line with above changes.

---------

Co-authored-by: Filip Knefel <filip@unstructured.io>
Co-authored-by: Paweł Kmiecik <pawel.kmiecik@deepsense.ai>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,8 +1,9 @@
-## 0.0.66-dev2
+## 0.0.66-dev3
 
 * Add support for `unique_element_ids` parameter.
 * Add max lifetime, via MAX_LIFETIME_SECONDS env-var, to API containers
 * Bump unstructured to 0.13.2
+* Change default values for `pdf_infer_table_structure` and `skip_infer_table_types`. Mark `pdf_infer_table_structure` deprecated.
 
 ## 0.0.65
 
diff --git a/README.md b/README.md
@@ -141,25 +141,10 @@ When elements are extracted from PDFs or images, it may be useful to get their b
   | jq -C . | less -R
 ```
 
-#### PDF Table Extraction
-
-To extract the table structure from PDF files using the `hi_res` strategy, ensure that the `pdf_infer_table_structure` parameter is set to `true`. This setting includes the table's text content in the response. By default, this parameter is set to `false` to avoid the expensive reading process.
-
-```
- curl -X 'POST' \
-  'https://api.unstructured.io/general/v0/general' \
-  -H 'accept: application/json' \
-  -H 'Content-Type: multipart/form-data' \
-  -F 'files=@sample-docs/layout-parser-paper.pdf' \
-  -F 'strategy=hi_res' \
-  -F 'pdf_infer_table_structure=true' \
-  | jq -C . | less -R
-```
-
 #### Skip Table Extraction
 
-Currently, we provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter `skip_infer_table_types` to specify the document types that you want to skip table extraction with. By default, we skip table extraction
-for PDFs and Images, which are `pdf`, `jpg` and `png`. Again, please note that table extraction only works with `hi_res` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to `skip_infer_table_types` with:
+Table extraction can be enabled or disabled for any file type. Use the `skip_infer_table_types` parameter to list the document types for which table extraction should be skipped. By default, table extraction is enabled
+for all file types (`skip_infer_table_types=[]`). Please note that table extraction only works with the `hi_res` strategy. For example, if you want to skip table extraction for images, you can pass a list with the matching image file types:
 
 ```
  curl -X 'POST' \
@@ -168,7 +153,7 @@ for PDFs and Images, which are `pdf`, `jpg` and `png`. Again, please note that t
   -H 'Content-Type: multipart/form-data' \
   -F 'files=@sample-docs/layout-parser-paper-with-table.jpg' \
   -F 'strategy=hi_res' \
-  -F 'skip_infer_table_types=[]' \
+  -F 'skip_infer_table_types=["jpg"]' \
   | jq -C . | less -R
 ```
 
diff --git a/openapi.json b/openapi.json
@@ -169,7 +169,7 @@
                     "pdf_infer_table_structure": {
                         "type": "boolean",
                         "title": "Pdf Infer Table Structure",
-                        "description": "If True and strategy=hi_res, any Table Elements extracted from a PDF will include an additional metadata field, 'text_as_html', where the value (string) is a just a transformation of the data into an HTML <table>."
+                        "description": "Deprecated! Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents."
                     },
                     "skip_infer_table_types": {
                         "items": {
@@ -178,7 +178,7 @@
                         },
                         "type": "array",
                         "title": "Skip Infer Table Types",
-                        "description": "The document types that you want to skip table extraction with. Default: ['pdf', 'jpg', 'png']"
+                        "description": "The document types that you want to skip table extraction with. Default: []"
                     },
                     "xml_keep_tags": {
                         "type": "boolean",
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -293,7 +293,7 @@ def pipeline_api(
     hi_res_model_name: Optional[str] = None,
     include_page_breaks: bool = False,
     ocr_languages: Optional[List[str]] = None,
-    pdf_infer_table_structure: bool = False,
+    pdf_infer_table_structure: bool = True,
     skip_infer_table_types: Optional[List[str]] = None,
     strategy: str = "auto",
     xml_keep_tags: bool = False,
diff --git a/prepline_general/api/models/form_params.py b/prepline_general/api/models/form_params.py
@@ -69,15 +69,13 @@ def as_form(
             List[str],
             Form(
                 title="Skip Infer Table Types",
-                description="The document types that you want to skip table extraction with. Default: ['pdf', 'jpg', 'png']",
+                description=(
+                    "The document types that you want to skip table extraction with. Default: []"
+                ),
                 example="['pdf', 'jpg', 'png']",
             ),
             BeforeValidator(SmartValueParser[List[str]]().value_or_first_element),
-        ] = [
-            "pdf",
-            "jpg",
-            "png",
-        ],  # noqa
+        ] = [],  # noqa
         gz_uncompressed_content_type: Annotated[
             Optional[str],
             Form(
@@ -132,10 +130,14 @@ def as_form(
             bool,
             Form(
                 title="Pdf Infer Table Structure",
-                description="If True and strategy=hi_res, any Table Elements extracted from a PDF will include an additional metadata field, 'text_as_html', where the value (string) is a just a transformation of the data into an HTML <table>.",
+                description=(
+                    "Deprecated! Use skip_infer_table_types to opt out of table extraction for any "
+                    "file type. If False and strategy=hi_res, no Table Elements will be extracted "
+                    "from pdf files regardless of skip_infer_table_types contents."
+                ),
             ),
             BeforeValidator(SmartValueParser[bool]().value_or_first_element),
-        ] = False,
+        ] = True,
         strategy: Annotated[
             Literal["fast", "hi_res", "auto", "ocr_only"],
             Form(
diff --git a/scripts/smoketest.py b/scripts/smoketest.py
@@ -22,9 +22,8 @@ def send_document(
     content_type: str = "",
     strategy: str = "auto",
     output_format: str = "application/json",
-    pdf_infer_table_structure: str = "false",
+    skip_infer_table_types: list[str] = [],
     uncompressed_content_type: str = "",
-    skip_infer_table_types: str = "['pdf', 'jpg', 'png']",
 ):
     if filenames_gzipped is None:
         filenames_gzipped = []
@@ -37,7 +36,6 @@ def send_document(
     options = {
         "strategy": strategy,
         "output_format": output_format,
-        "pdf_infer_table_structure": pdf_infer_table_structure,
         "skip_infer_table_types": skip_infer_table_types,
     }
     if uncompressed_content_type:
@@ -226,15 +224,15 @@ def test_strategy_performance():
 
 @pytest.mark.skipif(skip_inference_tests, reason="emulated architecture")
 @pytest.mark.parametrize(
-    "strategy, pdf_infer_table_structure, expected_table_num",
+    "strategy, skip_infer_table_types, expected_table_num",
     [
-        ("fast", "True", 0),
-        ("fast", "False", 0),
-        ("hi_res", "True", 2),
-        ("hi_res", "False", 0),
+        ("fast", [], 0),
+        ("fast", ["pdf"], 0),
+        ("hi_res", [], 2),
+        ("hi_res", ["pdf"], 0),
     ],
 )
-def test_table_support(strategy: str, pdf_infer_table_structure: str, expected_table_num: int):
+def test_table_support(strategy: str, skip_infer_table_types: list[str], expected_table_num: int):
     """
     Test that table extraction works on hi_res strategy
     """
@@ -243,8 +241,7 @@ def test_table_support(strategy: str, pdf_infer_table_structure: str, expected_t
         filenames=[test_file],
         content_type="application/pdf",
         strategy=strategy,
-        pdf_infer_table_structure=pdf_infer_table_structure,
-        skip_infer_table_types="[]",
+        skip_infer_table_types=skip_infer_table_types,
     )
 
     assert response.status_code == 200
diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
@@ -177,9 +177,7 @@ def test_languages_param():
 
 
 def test_skip_infer_table_types_param():
-    """
-    Verify that we skip table instruction unless specified
-    """
+    """Verify that we extract table unless excluded by skip_infer_table_types"""
     client = TestClient(app)
     test_file = Path("sample-docs") / "layout-parser-paper-with-table.jpg"
     response = client.post(
@@ -191,19 +189,19 @@ def test_skip_infer_table_types_param():
-    # test we skip table extraction by default
+    # test we extract tables by default (skip_infer_table_types defaults to [])
     elements = response.json()
     table = [el["metadata"]["text_as_html"] for el in elements if "text_as_html" in el["metadata"]]
-    assert len(table) == 0
+    assert len(table) == 1
 
     response = client.post(
         MAIN_API_ROUTE,
         files=[("files", (str(test_file), open(test_file, "rb")))],
-        data={"skip_infer_table_types": "['pdf']"},
+        data={"skip_infer_table_types": ["jpg"]},
     )
 
     assert response.status_code == 200
-    # test we didn't specify to skip table extration with image
+    # test we specified to skip extraction for jpg
     elements = response.json()
     table = [el["metadata"]["text_as_html"] for el in elements if "text_as_html" in el["metadata"]]
-    assert len(table) == 1
+    assert len(table) == 0
     # This text is not currently picked up
     # assert "Layouts of history Japanese documents" in table[0]