Skip to content

Fix pdf_infer_table_structure being silently changed in API #411

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.0.67-dev1

* Stop disabling the `pdf_infer_table_structure` parameter when the `auto` strategy is used.

## 0.0.66

* Add support for `unique_element_ids` parameter.
Expand Down
2 changes: 1 addition & 1 deletion prepline_general/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
app = FastAPI(
title="Unstructured Pipeline API",
summary="Partition documents with the Unstructured library",
version="0.0.66",
version="0.0.67",
docs_url="/general/docs",
openapi_url="/general/openapi.json",
servers=[
Expand Down
7 changes: 4 additions & 3 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,8 @@ def _validate_chunking_strategy(chunking_strategy: Optional[str]) -> Optional[st


def _set_pdf_infer_table_structure(pdf_infer_table_structure: bool, strategy: str) -> bool:
return strategy == "hi_res" and pdf_infer_table_structure
"""Avoids table inference in "fast" and "ocr_only" runs."""
return strategy in ("hi_res", "auto") and pdf_infer_table_structure


def get_validated_mimetype(file: UploadFile) -> Optional[str]:
Expand Down Expand Up @@ -703,7 +704,7 @@ def return_content_type(filename: str):


@router.get("/general/v0/general", include_in_schema=False)
@router.get("/general/v0.0.66/general", include_in_schema=False)
@router.get("/general/v0.0.67/general", include_in_schema=False)
async def handle_invalid_get_request():
raise HTTPException(
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
Expand All @@ -718,7 +719,7 @@ async def handle_invalid_get_request():
description="Description",
operation_id="partition_parameters",
)
@router.post("/general/v0.0.66/general", include_in_schema=False)
@router.post("/general/v0.0.67/general", include_in_schema=False)
def general_partition(
request: Request,
# cannot use annotated type here because of a bug described here:
Expand Down
2 changes: 1 addition & 1 deletion preprocessing-pipeline-family.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name: general
version: 0.0.66
version: 0.0.67
8 changes: 0 additions & 8 deletions scripts/smoketest.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,18 +207,10 @@ def test_strategy_performance():
hi_res_time = time.monotonic() - start_time
assert response.status_code == 200

start_time = time.monotonic()
response = send_document(filenames=[test_file], content_type="application/pdf", strategy="auto")
auto_time = time.monotonic() - start_time
assert response.status_code == 200

assert hi_res_time > performance_ratio * auto_time

start_time = time.monotonic()
response = send_document(filenames=[test_file], content_type="application/pdf", strategy="fast")
fast_time = time.monotonic() - start_time
assert response.status_code == 200

assert hi_res_time > performance_ratio * fast_time


Expand Down
7 changes: 5 additions & 2 deletions test_general/api/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,14 +860,17 @@ def test_chunking_strategy_param():
@pytest.mark.parametrize(
("multipage_sections", "combine_under_n_chars", "new_after_n_chars", "max_characters"),
[
(False, None, None, 500), # test multipage_sections
(False, None, None, 600), # test multipage_sections
(True, 1000, None, 5000), # test combine_under_n_chars
(True, None, 10, 500), # test new_after_n_chars
(True, None, None, 100), # test max_characters
],
)
def test_chunking_strategy_additional_params(
multipage_sections, combine_under_n_chars, new_after_n_chars, max_characters
multipage_sections: bool,
combine_under_n_chars: int,
new_after_n_chars: int,
max_characters: int,
):
client = TestClient(app)
test_file = Path("sample-docs") / "layout-parser-paper-fast.pdf"
Expand Down