Skip to content

Commit f54ad53

Browse files
committed
Add get_page_range form util helper
1 parent 1d63200 commit f54ad53

File tree

2 files changed

+61
-0
lines changed

2 files changed

+61
-0
lines changed

_test_unstructured_client/unit/test_split_pdf_hook.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from unstructured_client._hooks.custom.form_utils import (
1010
PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
1111
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
12+
PARTITION_FORM_PAGE_RANGE_KEY,
1213
)
1314
from unstructured_client._hooks.custom.split_pdf_hook import (
1415
DEFAULT_CONCURRENCY_LEVEL,
@@ -398,3 +399,25 @@ def test_unit_get_starting_page_number(starting_page_number, expected_result):
398399
fallback_value=DEFAULT_STARTING_PAGE_NUMBER,
399400
)
400401
assert result == expected_result
402+
403+
404+
@pytest.mark.parametrize(
    "page_range, expected_result",
    [
        (["2", "5"], (2, 5)),  # Valid range
        (["2", "100"], (2, 20)),  # End too high
        (["-50", "5"], (1, 5)),  # Start too low
        (None, (1, 20)),  # Range not specified
        (["foo", "foo"], (1, 20)),  # Parse error
        (["5"], (1, 20)),  # End value missing
    ],
)
def test_unit_get_page_range_returns_valid_range(page_range, expected_result):
    """Check get_page_range against valid, out-of-bounds and malformed inputs.

    The document is assumed to have 20 pages: ranges that stick out of
    [1, 20] are expected to be adjusted, and missing or unparseable ranges
    are expected to fall back to the full range (1, 20).
    """
    # Build the form with the same constant that is used for the lookup so
    # the test cannot drift away from the key the helper actually reads.
    form_data = {PARTITION_FORM_PAGE_RANGE_KEY: page_range}

    result = form_utils.get_page_range(
        form_data,
        key=PARTITION_FORM_PAGE_RANGE_KEY,
        max_pages=20,
    )

    assert result == expected_result

src/unstructured_client/_hooks/custom/form_utils.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,48 @@
1313

1414
PARTITION_FORM_FILES_KEY = "files"
1515
PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page"
16+
# Form-data key for the page range to split; the value is read by
# get_page_range as a two-item sequence of the form [start, end].
PARTITION_FORM_PAGE_RANGE_KEY = "split_pdf_page_range[]"
1617
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number"
1718
PARTITION_FORM_CONCURRENCY_LEVEL_KEY = "split_pdf_concurrency_level"
1819

1920

21+
def get_page_range(form_data: FormData, key: str, max_pages: int) -> tuple[int, int]:
    """Retrieves the split page range from the given form data.

    The value stored under `key` is expected to be a two-item list (or
    tuple) of the form [start, end] whose items can be converted to int.

    - If the key is missing (or its value is None), the full range
      (1, max_pages) is returned.
    - If the value cannot be parsed as two integers, a warning is logged
      and the full range is returned.
    - If the parsed range sticks out of the document, it is clamped to
      [1, max_pages] and a warning is logged.
    - If the range is empty after clamping (start > end, e.g. a reversed
      range or one that lies entirely past the last page), a warning is
      logged and the full range is returned.

    Args:
        form_data: The form data containing the page range.
        key: The key to look for in the form data.
        max_pages: The number of pages in the document, used as the upper
            bound of the range.

    Returns:
        The range of pages to send in the request in the form (start, end).
        When max_pages >= 1 the result satisfies
        1 <= start <= end <= max_pages.
    """
    full_range = (1, max_pages)

    raw_range = form_data.get(key)
    if raw_range is None:
        return full_range

    parsed = None
    # Only a real list/tuple is accepted: a bare string is indexable too, so
    # "25" would otherwise be silently read as the range (2, 5).
    if isinstance(raw_range, (list, tuple)):
        try:
            parsed = (int(raw_range[0]), int(raw_range[1]))
        except (ValueError, TypeError, IndexError):
            # Non-numeric item, None item, or fewer than two items.
            parsed = None

    if parsed is None:
        logger.warning(
            "'%s' is not a valid page range. Selecting default range (1 to %d).",
            raw_range,
            max_pages,
        )
        return full_range

    start, end = parsed
    clamped = (max(start, 1), min(end, max_pages))

    if clamped != parsed:
        logger.warning(
            "Page range %s is out of bounds, setting to %s.",
            parsed,
            clamped,
        )

    # A reversed range, or one lying completely outside the document, has no
    # pages left after clamping; never hand an inverted range to the caller.
    if clamped[0] > clamped[1]:
        logger.warning(
            "Page range %s selects no pages. Selecting default range (1 to %d).",
            parsed,
            max_pages,
        )
        return full_range

    return clamped
56+
57+
2058
def get_starting_page_number(form_data: FormData, key: str, fallback_value: int) -> int:
2159
"""Retrieves the starting page number from the given form data.
2260

0 commit comments

Comments
 (0)