Skip to content

Commit 386df41

Browse files
Fix pdf_infer_table_structure being silently changed in API (#411)
Changes the logic that disabled pdf_infer_table_structure for the auto strategy.
1 parent aca9663 commit 386df41

File tree

6 files changed

+15
-15
lines changed

6 files changed

+15
-15
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.0.67-dev1
2+
3+
* Stop disabling the `pdf_infer_table_structure` parameter when the `auto` strategy is used.
4+
15
## 0.0.66
26

37
* Add support for `unique_element_ids` parameter.

prepline_general/api/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
app = FastAPI(
1313
title="Unstructured Pipeline API",
1414
summary="Partition documents with the Unstructured library",
15-
version="0.0.66",
15+
version="0.0.67",
1616
docs_url="/general/docs",
1717
openapi_url="/general/openapi.json",
1818
servers=[

prepline_general/api/general.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -596,7 +596,8 @@ def _validate_chunking_strategy(chunking_strategy: Optional[str]) -> Optional[st
596596

597597

598598
def _set_pdf_infer_table_structure(pdf_infer_table_structure: bool, strategy: str) -> bool:
599-
return strategy == "hi_res" and pdf_infer_table_structure
599+
"""Avoids table inference in "fast" and "ocr_only" runs."""
600+
return strategy in ("hi_res", "auto") and pdf_infer_table_structure
600601

601602

602603
def get_validated_mimetype(file: UploadFile) -> Optional[str]:
@@ -703,7 +704,7 @@ def return_content_type(filename: str):
703704

704705

705706
@router.get("/general/v0/general", include_in_schema=False)
706-
@router.get("/general/v0.0.66/general", include_in_schema=False)
707+
@router.get("/general/v0.0.67/general", include_in_schema=False)
707708
async def handle_invalid_get_request():
708709
raise HTTPException(
709710
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
@@ -718,7 +719,7 @@ async def handle_invalid_get_request():
718719
description="Description",
719720
operation_id="partition_parameters",
720721
)
721-
@router.post("/general/v0.0.66/general", include_in_schema=False)
722+
@router.post("/general/v0.0.67/general", include_in_schema=False)
722723
def general_partition(
723724
request: Request,
724725
# cannot use annotated type here because of a bug described here:

preprocessing-pipeline-family.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.66
2+
version: 0.0.67

scripts/smoketest.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -207,18 +207,10 @@ def test_strategy_performance():
207207
hi_res_time = time.monotonic() - start_time
208208
assert response.status_code == 200
209209

210-
start_time = time.monotonic()
211-
response = send_document(filenames=[test_file], content_type="application/pdf", strategy="auto")
212-
auto_time = time.monotonic() - start_time
213-
assert response.status_code == 200
214-
215-
assert hi_res_time > performance_ratio * auto_time
216-
217210
start_time = time.monotonic()
218211
response = send_document(filenames=[test_file], content_type="application/pdf", strategy="fast")
219212
fast_time = time.monotonic() - start_time
220213
assert response.status_code == 200
221-
222214
assert hi_res_time > performance_ratio * fast_time
223215

224216

test_general/api/test_app.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -860,14 +860,17 @@ def test_chunking_strategy_param():
860860
@pytest.mark.parametrize(
861861
("multipage_sections", "combine_under_n_chars", "new_after_n_chars", "max_characters"),
862862
[
863-
(False, None, None, 500), # test multipage_sections
863+
(False, None, None, 600), # test multipage_sections
864864
(True, 1000, None, 5000), # test combine_under_n_chars
865865
(True, None, 10, 500), # test new_after_n_chars
866866
(True, None, None, 100), # test max__characters
867867
],
868868
)
869869
def test_chunking_strategy_additional_params(
870-
multipage_sections, combine_under_n_chars, new_after_n_chars, max_characters
870+
multipage_sections: bool,
871+
combine_under_n_chars: int,
872+
new_after_n_chars: int,
873+
max_characters: int,
871874
):
872875
client = TestClient(app)
873876
test_file = Path("sample-docs") / "layout-parser-paper-fast.pdf"

0 commit comments

Comments
 (0)