Skip to content

Commit b441902

Browse files
committed
Add support for page ranges in pdf split hook
1 parent f54ad53 commit b441902

File tree

4 files changed

+85
-16
lines changed

4 files changed

+85
-16
lines changed

_test_unstructured_client/integration/test_decorators.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,57 @@ def test_integration_split_pdf_for_file_with_no_name():
110110
)
111111

112112
pytest.raises(ValueError, client.general.partition, req)
113+
114+
115+
@pytest.mark.parametrize("starting_page_number", [1, 100])
@pytest.mark.parametrize(
    "page_range, expected_pages",
    [
        (["2", "5"], (2, 5)),  # Valid range
        (["2", "100"], (2, 16)),  # End too high
        (["-50", "5"], (1, 5)),  # Start too low
    ],
)
def test_integration_split_pdf_with_page_range(
    starting_page_number: int,
    page_range: list[str],
    expected_pages: tuple[int, int],
):
    """
    Test that we can split pdfs with an arbitrary page range. Send the selected range to the API
    and assert that the metadata page numbers are correct.
    We should also be able to offset the metadata with starting_page_number.

    Args:
        starting_page_number: Page number assigned to the first page of the document;
            the returned metadata page numbers are expected to be shifted by
            ``starting_page_number - 1``.
        page_range: Requested ``[start, end]`` range, passed as strings in
            ``split_pdf_page_range``.
        expected_pages: The ``(first, last)`` page numbers (before the starting page
            offset is applied) that the response is expected to cover. Out-of-bounds
            requests are expected to be clamped to the document
            (presumably 16 pages for the sample file -- confirm if the fixture changes).

    Requires unstructured-api running in bg. See Makefile for how to run it.
    """
    try:
        response = requests.get("http://localhost:8000/general/docs")
        assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"
    except requests.exceptions.ConnectionError:
        # pytest.fail (rather than `assert False`) still fails the test when
        # assertions are stripped with `python -O`.
        pytest.fail("The unstructured-api is not running on localhost:8000")

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    filename = "_sample_docs/layout-parser-paper.pdf"
    with open(filename, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=filename,
        )

    req = shared.PartitionParameters(
        files=files,
        strategy="fast",
        split_pdf_page=True,
        split_pdf_page_range=page_range,
        starting_page_number=starting_page_number,
    )

    resp = client.general.partition(req)

    page_numbers = {e["metadata"]["page_number"] for e in resp.elements}

    # The API numbers pages starting from `starting_page_number`, so shift the
    # expected 1-based range by the same offset before comparing.
    min_page_number = expected_pages[0] + starting_page_number - 1
    max_page_number = expected_pages[1] + starting_page_number - 1

    assert min(page_numbers) == min_page_number
    assert max(page_numbers) == max_page_number

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import io
22
import logging
3-
from typing import Generator, Tuple
3+
from typing import Generator, Tuple, Optional
44

55
from pypdf import PdfReader, PdfWriter
66
from pypdf.errors import PdfReadError
@@ -12,7 +12,7 @@
1212

1313

1414
def get_pdf_pages(
15-
pdf: PdfReader, split_size: int = 1
15+
pdf: PdfReader, split_size: int = 1, page_start: int = 1, page_end: Optional[int] = None
1616
) -> Generator[Tuple[io.BytesIO, int, int], None, None]:
1717
"""Reads given bytes of a pdf file and split it into n file-like objects, each
1818
with `split_size` pages.
@@ -22,13 +22,15 @@ def get_pdf_pages(
2222
split_size: Split size, e.g. if the given file has 10 pages
2323
and this value is set to 2 it will yield 5 documents, each containing 2 pages
2424
of the original document. By default it will split each page to a separate file.
25+
page_start: Begin splitting at this page number
26+
page_end: If provided, split up to and including this page number
2527
2628
Yields:
2729
The file contents with their page number and overall pages number of the original document.
2830
"""
2931

30-
offset = 0
31-
offset_end = len(pdf.pages)
32+
offset = page_start - 1
33+
offset_end = page_end or len(pdf.pages)
3234

3335
while offset < offset_end:
3436
new_pdf = PdfWriter()

src/unstructured_client/_hooks/custom/request_utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from unstructured_client._hooks.custom.form_utils import (
1717
PARTITION_FORM_FILES_KEY,
1818
PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
19+
PARTITION_FORM_PAGE_RANGE_KEY,
1920
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
2021
FormData,
2122
)
@@ -145,6 +146,7 @@ def prepare_request_payload(form_data: FormData) -> FormData:
145146
payload = copy.deepcopy(form_data)
146147
payload.pop(PARTITION_FORM_SPLIT_PDF_PAGE_KEY, None)
147148
payload.pop(PARTITION_FORM_FILES_KEY, None)
149+
payload.pop(PARTITION_FORM_PAGE_RANGE_KEY, None)
148150
payload.pop(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, None)
149151
updated_parameters = {
150152
PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false",

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from unstructured_client._hooks.custom.form_utils import (
1919
PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
2020
PARTITION_FORM_FILES_KEY,
21+
PARTITION_FORM_PAGE_RANGE_KEY,
2122
PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
2223
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
2324
)
@@ -143,7 +144,10 @@ def before_request(
143144
key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
144145
fallback_value=DEFAULT_STARTING_PAGE_NUMBER,
145146
)
146-
logger.info("Starting page number set to %d", starting_page_number)
147+
148+
if starting_page_number > 1:
149+
logger.info("Starting page number set to %d", starting_page_number)
150+
147151
concurrency_level = form_utils.get_split_pdf_concurrency_level_param(
148152
form_data,
149153
key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
@@ -154,27 +158,34 @@ def before_request(
154158
limiter = asyncio.Semaphore(concurrency_level)
155159

156160
pdf = PdfReader(io.BytesIO(file.content))
161+
162+
page_range_start, page_range_end = form_utils.get_page_range(
163+
form_data,
164+
key=PARTITION_FORM_PAGE_RANGE_KEY,
165+
max_pages=len(pdf.pages),
166+
)
167+
168+
page_count = min(len(pdf.pages), page_range_end - page_range_start + 1)
169+
logger.info(f"Splitting pages {page_range_start} to {page_range_end} ({page_count} total)")
170+
157171
split_size = get_optimal_split_size(
158-
num_pages=len(pdf.pages), concurrency_level=concurrency_level
172+
num_pages=page_count, concurrency_level=concurrency_level
159173
)
160174
logger.info("Determined optimal split size of %d pages.", split_size)
161175

162-
if split_size >= len(pdf.pages):
176+
# If the doc is small enough, and we aren't slicing it with a page range:
177+
# do not split, just continue with the original request
178+
if split_size >= page_count and page_count == len(pdf.pages):
163179
logger.info(
164180
"Document has too few pages (%d) to be split efficiently. Partitioning without split.",
165-
len(pdf.pages),
181+
page_count,
166182
)
167183
return request
168184

169-
pages = pdf_utils.get_pdf_pages(pdf, split_size)
170-
logger.info(
171-
"Document split into %d, %d-paged sets.",
172-
math.ceil(len(pdf.pages) / split_size),
173-
split_size,
174-
)
185+
pages = pdf_utils.get_pdf_pages(pdf, split_size=split_size, page_start=page_range_start, page_end=page_range_end)
175186
logger.info(
176187
"Partitioning %d, %d-paged sets.",
177-
math.ceil(len(pdf.pages) / split_size),
188+
math.ceil(page_count / split_size),
178189
split_size,
179190
)
180191

@@ -209,7 +220,7 @@ async def call_api_partial(page):
209220
"Partitioning set #%d (pages %d-%d).",
210221
set_index,
211222
page_number,
212-
min(page_number + split_size, all_pages_number),
223+
min(page_number + split_size - 1, all_pages_number),
213224
)
214225
# Check if this set of pages is the last one
215226
if page_index + split_size >= all_pages_number:

0 commit comments

Comments
 (0)