Skip to content

Commit f98193c

Browse files
committed
Add support for page ranges in pdf split hook
1 parent f54ad53 commit f98193c

File tree

5 files changed

+112
-26
lines changed

5 files changed

+112
-26
lines changed

_test_unstructured_client/integration/test_decorators.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,75 @@ def test_integration_split_pdf_for_file_with_no_name():
110110
)
111111

112112
pytest.raises(ValueError, client.general.partition, req)
113+
114+
115+
@pytest.mark.parametrize("starting_page_number", [1, 100])
@pytest.mark.parametrize(
    "page_range, expected_ok, expected_pages",
    [
        (["1", "14"], True, (1, 14)),  # Valid range, start on boundary
        (["4", "16"], True, (4, 16)),  # Valid range, end on boundary
        (["2", "5"], True, (2, 5)),  # Valid range within boundary
        # A 1 page doc wouldn't normally be split,
        # but this code still needs to return the page range
        (["6", "6"], True, (6, 6)),
        (["2", "100"], False, None),  # End page too high
        (["50", "100"], False, None),  # Range too high
        (["-50", "5"], False, None),  # Start page too low
        (["-50", "-2"], False, None),  # Range too low
        (["10", "2"], False, None),  # Backwards range
    ],
)
def test_integration_split_pdf_with_page_range(
    starting_page_number: int,
    page_range: list[str],
    expected_ok: bool,
    expected_pages: "tuple[int, int] | None",
    caplog,
):
    """
    Test that we can split pdfs with an arbitrary page range.

    Send the selected range to the API and assert that the metadata page
    numbers are correct. We should also be able to offset the metadata with
    starting_page_number.

    Requires unstructured-api running in bg. See Makefile for how to run it.

    Args:
        starting_page_number: Offset applied to the page numbers in the
            returned element metadata (1 means no offset).
        page_range: Two-element [start, end] range, passed as strings in the
            same form the request parameter receives them.
        expected_ok: Whether the range is expected to be accepted.
        expected_pages: The (first, last) page expected in the result before
            the starting_page_number offset; None when expected_ok is False.
        caplog: pytest fixture capturing log output.
    """
    try:
        response = requests.get("http://localhost:8000/general/docs")
    except requests.exceptions.ConnectionError:
        # pytest.fail is explicit and, unlike `assert False`, survives `-O`.
        pytest.fail("The unstructured-api is not running on localhost:8000")
    assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    filename = "_sample_docs/layout-parser-paper.pdf"
    with open(filename, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=filename,
        )

    req = shared.PartitionParameters(
        files=files,
        strategy="fast",
        split_pdf_page=True,
        split_pdf_page_range=page_range,
        starting_page_number=starting_page_number,
    )

    if not expected_ok:
        # An invalid range must be rejected. pytest.raises also fails the test
        # cleanly if the call unexpectedly succeeds, instead of falling through
        # to the page-number checks with expected_pages=None.
        with pytest.raises(ValueError) as exc_info:
            client.general.partition(req)
        assert "is out of bounds." in caplog.text
        assert "is out of bounds." in str(exc_info.value)
        return

    # For a valid range any ValueError propagates with its original traceback.
    resp = client.general.partition(req)

    page_numbers = {e["metadata"]["page_number"] for e in resp.elements}

    # Metadata page numbers are shifted by starting_page_number (1-based).
    min_page_number = expected_pages[0] + starting_page_number - 1
    max_page_number = expected_pages[1] + starting_page_number - 1

    assert min(page_numbers) == min_page_number, f"Result should start at page {min_page_number}"
    assert max(page_numbers) == max_page_number, f"Result should end at page {max_page_number}"

src/unstructured_client/_hooks/custom/form_utils.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,16 @@ def get_page_range(form_data: FormData, key: str, max_pages: int) -> tuple[int,
4040
page_range = (1, max_pages)
4141

4242
except (ValueError, IndexError):
43-
logger.warning(
44-
"'%s' is not a valid page range. Selecting default range (1 to %d).",
45-
_page_range,
46-
max_pages,
47-
)
48-
page_range = (1, max_pages)
43+
msg = f"{_page_range} is not a valid page range."
44+
logger.error(msg)
45+
raise ValueError(msg)
46+
47+
start, end = page_range
4948

50-
if page_range[0] < 1 or page_range[1] > max_pages:
51-
new_page_range = (max(page_range[0], 1), min(page_range[1], max_pages))
52-
logger.warning(f"Page range {page_range} is out of bounds, setting to {new_page_range}.")
53-
page_range = new_page_range
49+
if not (0 < start <= max_pages) or not (0 < end <= max_pages) or not (start <= end):
50+
msg = f"Page range {page_range} is out of bounds. Valid range is (1 - {max_pages})."
51+
logger.error(msg)
52+
raise ValueError(msg)
5453

5554
return page_range
5655

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import io
22
import logging
3-
from typing import Generator, Tuple
3+
from typing import Generator, Tuple, Optional
44

55
from pypdf import PdfReader, PdfWriter
66
from pypdf.errors import PdfReadError
@@ -12,7 +12,7 @@
1212

1313

1414
def get_pdf_pages(
15-
pdf: PdfReader, split_size: int = 1
15+
pdf: PdfReader, split_size: int = 1, page_start: int = 1, page_end: Optional[int] = None
1616
) -> Generator[Tuple[io.BytesIO, int, int], None, None]:
1717
"""Reads given bytes of a pdf file and splits it into n file-like objects, each
1818
with `split_size` pages.
@@ -22,13 +22,15 @@ def get_pdf_pages(
2222
split_size: Split size, e.g. if the given file has 10 pages
2323
and this value is set to 2 it will yield 5 documents, each containing 2 pages
2424
of the original document. By default it will split each page to a separate file.
25+
page_start: Begin splitting at this page number
26+
page_end: If provided, split up to and including this page number
2527
2628
Yields:
2729
The file contents with their page number and the total number of pages in the original document.
2830
"""
2931

30-
offset = 0
31-
offset_end = len(pdf.pages)
32+
offset = page_start - 1
33+
offset_end = page_end or len(pdf.pages)
3234

3335
while offset < offset_end:
3436
new_pdf = PdfWriter()

src/unstructured_client/_hooks/custom/request_utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from unstructured_client._hooks.custom.form_utils import (
1717
PARTITION_FORM_FILES_KEY,
1818
PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
19+
PARTITION_FORM_PAGE_RANGE_KEY,
1920
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
2021
FormData,
2122
)
@@ -145,6 +146,7 @@ def prepare_request_payload(form_data: FormData) -> FormData:
145146
payload = copy.deepcopy(form_data)
146147
payload.pop(PARTITION_FORM_SPLIT_PDF_PAGE_KEY, None)
147148
payload.pop(PARTITION_FORM_FILES_KEY, None)
149+
payload.pop(PARTITION_FORM_PAGE_RANGE_KEY, None)
148150
payload.pop(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, None)
149151
updated_parameters = {
150152
PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false",

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from unstructured_client._hooks.custom.form_utils import (
1919
PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
2020
PARTITION_FORM_FILES_KEY,
21+
PARTITION_FORM_PAGE_RANGE_KEY,
2122
PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
2223
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
2324
)
@@ -143,7 +144,10 @@ def before_request(
143144
key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
144145
fallback_value=DEFAULT_STARTING_PAGE_NUMBER,
145146
)
146-
logger.info("Starting page number set to %d", starting_page_number)
147+
148+
if starting_page_number > 1:
149+
logger.info("Starting page number set to %d", starting_page_number)
150+
147151
concurrency_level = form_utils.get_split_pdf_concurrency_level_param(
148152
form_data,
149153
key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
@@ -154,27 +158,34 @@ def before_request(
154158
limiter = asyncio.Semaphore(concurrency_level)
155159

156160
pdf = PdfReader(io.BytesIO(file.content))
161+
162+
page_range_start, page_range_end = form_utils.get_page_range(
163+
form_data,
164+
key=PARTITION_FORM_PAGE_RANGE_KEY,
165+
max_pages=len(pdf.pages),
166+
)
167+
168+
page_count = min(len(pdf.pages), page_range_end - page_range_start + 1)
169+
logger.info(f"Splitting pages {page_range_start} to {page_range_end} ({page_count} total)")
170+
157171
split_size = get_optimal_split_size(
158-
num_pages=len(pdf.pages), concurrency_level=concurrency_level
172+
num_pages=page_count, concurrency_level=concurrency_level
159173
)
160174
logger.info("Determined optimal split size of %d pages.", split_size)
161175

162-
if split_size >= len(pdf.pages):
176+
# If the doc is small enough, and we aren't slicing it with a page range:
177+
# do not split, just continue with the original request
178+
if split_size >= page_count and page_count == len(pdf.pages):
163179
logger.info(
164180
"Document has too few pages (%d) to be split efficiently. Partitioning without split.",
165-
len(pdf.pages),
181+
page_count,
166182
)
167183
return request
168184

169-
pages = pdf_utils.get_pdf_pages(pdf, split_size)
170-
logger.info(
171-
"Document split into %d, %d-paged sets.",
172-
math.ceil(len(pdf.pages) / split_size),
173-
split_size,
174-
)
185+
pages = pdf_utils.get_pdf_pages(pdf, split_size=split_size, page_start=page_range_start, page_end=page_range_end)
175186
logger.info(
176187
"Partitioning %d, %d-paged sets.",
177-
math.ceil(len(pdf.pages) / split_size),
188+
math.ceil(page_count / split_size),
178189
split_size,
179190
)
180191

@@ -209,7 +220,7 @@ async def call_api_partial(page):
209220
"Partitioning set #%d (pages %d-%d).",
210221
set_index,
211222
page_number,
212-
min(page_number + split_size, all_pages_number),
223+
min(page_number + split_size - 1, all_pages_number),
213224
)
214225
# Check if this set of pages is the last one
215226
if page_index + split_size >= all_pages_number:

0 commit comments

Comments
 (0)