Skip to content

Commit 80902b5

Browse files
committed
Add split_pdf_page_range parameter, update unit tests
1 parent f98193c commit 80902b5

File tree

6 files changed

+54
-10
lines changed

6 files changed

+54
-10
lines changed

README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ res = s.general.partition(request=operations.PartitionRequest(
5555
content='0x2cC94b2FEF'.encode(),
5656
file_name='your_file_here',
5757
),
58+
split_pdf_page_range=[
59+
1,
60+
10,
61+
],
5862
strategy=shared.Strategy.AUTO,
5963
),
6064
))
@@ -110,6 +114,10 @@ res = s.general.partition(request=operations.PartitionRequest(
110114
content='0x2cC94b2FEF'.encode(),
111115
file_name='your_file_here',
112116
),
117+
split_pdf_page_range=[
118+
1,
119+
10,
120+
],
113121
strategy=shared.Strategy.AUTO,
114122
),
115123
),
@@ -139,6 +147,10 @@ res = s.general.partition(request=operations.PartitionRequest(
139147
content='0x2cC94b2FEF'.encode(),
140148
file_name='your_file_here',
141149
),
150+
split_pdf_page_range=[
151+
1,
152+
10,
153+
],
142154
strategy=shared.Strategy.AUTO,
143155
),
144156
))

USAGE.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ res = s.general.partition(request=operations.PartitionRequest(
1414
content='0x2cC94b2FEF'.encode(),
1515
file_name='your_file_here',
1616
),
17+
split_pdf_page_range=[
18+
1,
19+
10,
20+
],
1721
strategy=shared.Strategy.AUTO,
1822
),
1923
))

_test_unstructured_client/unit/test_split_pdf_hook.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -404,20 +404,33 @@ def test_unit_get_starting_page_number(starting_page_number, expected_result):
404404
@pytest.mark.parametrize(
405405
"page_range, expected_result",
406406
[
407-
(["2", "5"], (2, 5)), # Valid range
408-
(["2", "100"], (2, 20)), # End too high
409-
(["-50", "5"], (1, 5)), # Start too low
410-
(None, (1, 20)), # Range not specified
411-
(["foo", "foo"], (1, 20)), # Parse error
407+
(["1", "14"], (1, 14)), # Valid range, start on boundary
408+
(["4", "16"], (4, 16)), # Valid range, end on boundary
409+
(None, (1, 20)), # Range not specified, defaults to full range
410+
(["2", "5"], (2, 5)), # Valid range within boundary
411+
(["2", "100"], None), # End page too high
412+
(["50", "100"], None), # Range too high
413+
(["-50", "5"], None), # Start page too low
414+
(["-50", "-2"], None), # Range too low
415+
(["10", "2"], None), # Backwards range
416+
(["foo", "foo"], None), # Parse error
412417
],
413418
)
414419
def test_unit_get_page_range_returns_valid_range(page_range, expected_result):
415420
"""Test get_page_range method with different inputs.
416421
Ranges that are out of bounds or malformed for a 20 page doc are expected to raise a ValueError."""
417422
form_data = {"split_pdf_page_range[]": page_range}
418-
result = form_utils.get_page_range(
419-
form_data,
420-
key=PARTITION_FORM_PAGE_RANGE_KEY,
421-
max_pages=20,
422-
)
423+
try:
424+
result = form_utils.get_page_range(
425+
form_data,
426+
key=PARTITION_FORM_PAGE_RANGE_KEY,
427+
max_pages=20,
428+
)
429+
except ValueError as exc:
430+
if not expected_result:
431+
assert "is out of bounds." in str(exc) or "is not a valid page range." in str(exc)
432+
return
433+
else:
434+
assert exc is None
435+
423436
assert result == expected_result

docs/models/shared/partitionparameters.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
| `skip_infer_table_types` | List[*str*] | :heavy_minus_sign: | The document types that you want to skip table extraction with. Default: [] | |
2929
| `split_pdf_concurrency_level` | *Optional[int]* | :heavy_minus_sign: | When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. It's an internal parameter for the Python client and is not sent to the backend. | |
3030
| `split_pdf_page` | *Optional[bool]* | :heavy_minus_sign: | This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend. | |
31+
| `split_pdf_page_range` | List[*int*] | :heavy_minus_sign: | When `split_pdf_page` is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. It's an internal parameter for the Python client and is not sent to the backend. | [<br/>1,<br/>10<br/>] |
3132
| `starting_page_number` | *Optional[int]* | :heavy_minus_sign: | When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27. | |
3233
| `strategy` | [Optional[shared.Strategy]](../../models/shared/strategy.md) | :heavy_minus_sign: | The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto | auto |
3334
| `unique_element_ids` | *Optional[bool]* | :heavy_minus_sign: | When `True`, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: `False` | |

overlay_client.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,18 @@ actions:
1212
"description": "This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend.",
1313
"default": true,
1414
}
15+
- target: $["components"]["schemas"]["partition_parameters"]["properties"]
16+
update:
17+
"split_pdf_page_range":
18+
{
19+
"type": "array",
20+
"title": "Split Pdf Page Range",
21+
"description": "When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. It's an internal parameter for the Python client and is not sent to the backend.",
22+
"items": {"type": "integer"},
23+
"minItems": 2,
24+
"maxItems": 2,
25+
"example": [1, 10],
26+
}
1527
- target: $["components"]["schemas"]["partition_parameters"]["properties"]
1628
update:
1729
"split_pdf_concurrency_level":

src/unstructured_client/models/shared/partition_parameters.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ class PartitionParameters:
8585
r"""When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. It's an internal parameter for the Python client and is not sent to the backend."""
8686
split_pdf_page: Optional[bool] = dataclasses.field(default=True, metadata={'multipart_form': { 'field_name': 'split_pdf_page' }})
8787
r"""This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend."""
88+
split_pdf_page_range: Optional[List[int]] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'split_pdf_page_range' }})
89+
r"""When `split_pdf_page` is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. It's an internal parameter for the Python client and is not sent to the backend."""
8890
starting_page_number: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'starting_page_number' }})
8991
r"""When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27."""
9092
strategy: Optional[Strategy] = dataclasses.field(default=Strategy.AUTO, metadata={'multipart_form': { 'field_name': 'strategy' }})

0 commit comments

Comments
 (0)