Skip to content

Commit 8bdb1de

Browse files
authored
build(deps): version bumps for maintenance (Unstructured-IO#420)
### Summary

Version bumps for regular maintenance and to address moderate CVEs from security scans. Also updates the `unstructured` extra from `local-inference` to `all-docs` to keep up with the latest best practices for the `unstructured` library. Additionally, `pdf_infer_table_structure` is now set according to the value of `skip_infer_table_types` (table inference is turned off when `"pdf"` is listed there), and a test is added for this behavior.
1 parent 65a344d commit 8bdb1de

File tree

8 files changed

+328
-138
lines changed

8 files changed

+328
-138
lines changed

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.0.69
2+
3+
* Bump to `unstructured` 0.14.4
4+
* Add handling for `pdf_infer_table_structure` to reflect the "tables off by default" behavior in `unstructured`.
5+
16
## 0.0.68
27

38
* Fix list params such as `extract_image_block_types` not working via the python/js clients
@@ -20,7 +25,7 @@
2025
* Bump unstructured to 0.12.4
2126
* Add support for both `list[str]` and `str` input formats for `ocr_languages` parameter
2227
* Adds support for additional MIME types from `unstructured`
23-
* Document the support for gzip files and add additional testing
28+
* Document the support for gzip files and add additional testing
2429

2530
## 0.0.64
2631

prepline_general/api/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
app = FastAPI(
1414
title="Unstructured Pipeline API",
1515
summary="Partition documents with the Unstructured library",
16-
version="0.0.68",
16+
version="0.0.69",
1717
docs_url="/general/docs",
1818
openapi_url="/general/openapi.json",
1919
servers=[

prepline_general/api/general.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,11 @@ def pipeline_api(
361361

362362
hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates)
363363
strategy = _validate_strategy(strategy)
364-
pdf_infer_table_structure = _set_pdf_infer_table_structure(pdf_infer_table_structure, strategy)
364+
pdf_infer_table_structure = _set_pdf_infer_table_structure(
365+
pdf_infer_table_structure,
366+
strategy,
367+
skip_infer_table_types,
368+
)
365369

366370
# Parallel mode is set by env variable
367371
enable_parallel_mode = os.environ.get("UNSTRUCTURED_PARALLEL_MODE_ENABLED", "false")
@@ -441,9 +445,9 @@ def pipeline_api(
441445
)
442446
elif hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES:
443447
with ChipperMemoryProtection():
444-
elements = partition(**partition_kwargs) # pyright: ignore[reportGeneralTypeIssues]
448+
elements = partition(**partition_kwargs) # type: ignore # pyright: ignore[reportGeneralTypeIssues]
445449
else:
446-
elements = partition(**partition_kwargs) # pyright: ignore[reportGeneralTypeIssues]
450+
elements = partition(**partition_kwargs) # type: ignore # pyright: ignore[reportGeneralTypeIssues]
447451

448452
except OSError as e:
449453
if isinstance(e.args[0], str) and (
@@ -595,8 +599,13 @@ def _validate_chunking_strategy(chunking_strategy: Optional[str]) -> Optional[st
595599
return chunking_strategy
596600

597601

598-
def _set_pdf_infer_table_structure(pdf_infer_table_structure: bool, strategy: str) -> bool:
602+
def _set_pdf_infer_table_structure(
603+
pdf_infer_table_structure: bool, strategy: str, skip_infer_table_types: Optional[List[str]]
604+
) -> bool:
599605
"""Avoids table inference in "fast" and "ocr_only" runs."""
606+
# NOTE(robinson) - line below is for type checking
607+
skip_infer_table_types = [] if skip_infer_table_types is None else skip_infer_table_types
608+
pdf_infer_table_structure = pdf_infer_table_structure and ("pdf" not in skip_infer_table_types)
600609
return strategy in ("hi_res", "auto") and pdf_infer_table_structure
601610

602611

@@ -704,7 +713,7 @@ def return_content_type(filename: str):
704713

705714

706715
@router.get("/general/v0/general", include_in_schema=False)
707-
@router.get("/general/v0.0.68/general", include_in_schema=False)
716+
@router.get("/general/v0.0.69/general", include_in_schema=False)
708717
async def handle_invalid_get_request():
709718
raise HTTPException(
710719
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
@@ -719,7 +728,7 @@ async def handle_invalid_get_request():
719728
description="Description",
720729
operation_id="partition_parameters",
721730
)
722-
@router.post("/general/v0.0.68/general", include_in_schema=False)
731+
@router.post("/general/v0.0.69/general", include_in_schema=False)
723732
def general_partition(
724733
request: Request,
725734
# cannot use annotated type here because of a bug described here:

preprocessing-pipeline-family.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.68
2+
version: 0.0.69

requirements/base.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
-c constraints.in
2-
unstructured[local-inference]>=0.8.1
2+
unstructured[all-docs]>=0.8.1
33
# Pinning click due to a unicode issue in black
44
# can remove after black drops support for Python 3.6
55
# ref: https://github.com/psf/black/issues/2964

0 commit comments

Comments
 (0)