Skip to content

Commit ec16a2d

Browse files
Starting page number (#72)
**Only the `test__decorators.py`, `test_split_pdf_hook.py`, `split_pdf_hook.py`, and `overlay_client.yaml` files were modified by a human; the rest were auto-generated.** To run the integration tests, first start `unstructured-api` on port 8000.
1 parent f173b57 commit ec16a2d

File tree

10 files changed

+119
-25
lines changed

10 files changed

+119
-25
lines changed

.speakeasy/gen.lock

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
lockVersion: 2.0.0
22
id: 8b5fa338-9106-4734-abf0-e30d67044a90
33
management:
4-
docChecksum: b35264eb5f2ce89c808012333367cf1c
4+
docChecksum: 666d45deb8d9066b8e19e04a305ca734
55
docVersion: 0.0.1
6-
speakeasyVersion: 1.267.1
6+
speakeasyVersion: 1.272.0
77
generationVersion: 2.312.1
8-
releaseVersion: 0.23.2
9-
configChecksum: c0ddfa44eb8fbd51d397d36253d1d68f
8+
releaseVersion: 0.23.3
9+
configChecksum: 7aae705f3e8a728a15a7177fbca343ad
1010
repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
1111
repoSubDirectory: .
1212
installationURL: https://github.com/Unstructured-IO/unstructured-python-client.git

_test_unstructured_client/test__decorators.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,6 @@ def test_integration_split_pdf_has_same_output_as_non_split(
8484
t2=resp_single.elements,
8585
exclude_regex_paths=[
8686
r"root\[\d+\]\['metadata'\]\['parent_id'\]",
87-
# TODO: (Marek Połom) - Remove page number pattern after page numbering parameter is added
88-
r"root\[\d+\]\['metadata'\]\['page_number'\]",
8987
],
9088
)
9189
assert len(diff) == 0

_test_unstructured_client/test_split_pdf_hook.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import requests
88
from requests_toolbelt import MultipartDecoder, MultipartEncoder
99

10-
1110
from unstructured_client._hooks.custom import SplitPdfHook
1211
from unstructured_client.models import shared
1312

@@ -150,6 +149,7 @@ def test_unit_create_request(self):
150149
"parameter_1": "value_1",
151150
"parameter_2": "value_2",
152151
"split_pdf_page": "false",
152+
"starting_page_number": "7",
153153
}
154154
expected_page_filename = "test_file.pdf"
155155
expected_body = MultipartEncoder(
@@ -165,7 +165,7 @@ def test_unit_create_request(self):
165165
expected_url = ""
166166

167167
# Create request
168-
request_obj = hook._create_request(request, form_data, page[0], filename)
168+
request_obj = hook._create_request(request, form_data, page[0], filename, 7)
169169
request_content_type: str = request_obj.headers.get("Content-Type")
170170
# Assert the request object
171171
self.assertEqual(request_obj.method, "POST")
@@ -306,3 +306,39 @@ def test_unit_is_pdf_invalid_pdf(self):
306306

307307
self.assertFalse(result)
308308
self.assertIn("Attempted to interpret file as pdf", cm.output[1])
309+
310+
def test_unit_get_starting_page_number_valid_integer(self):
    """The form value "5" must be returned as the integer 5."""
    split_hook = SplitPdfHook()

    parsed = split_hook._get_starting_page_number({"starting_page_number": "5"})

    self.assertEqual(parsed, 5)
318+
319+
def test_unit_get_starting_page_number_invalid_integer(self):
    """The non-numeric form value "abc" must yield 1."""
    split_hook = SplitPdfHook()

    parsed = split_hook._get_starting_page_number({"starting_page_number": "abc"})

    self.assertEqual(parsed, 1)
327+
328+
def test_unit_get_starting_page_number_less_than_one(self):
    """The form value "0" (below 1) must yield 1."""
    split_hook = SplitPdfHook()

    parsed = split_hook._get_starting_page_number({"starting_page_number": "0"})

    self.assertEqual(parsed, 1)
336+
337+
def test_unit_get_starting_page_number_missing_key(self):
    """Form data without a "starting_page_number" entry must yield 1."""
    split_hook = SplitPdfHook()

    parsed = split_hook._get_starting_page_number({})

    self.assertEqual(parsed, 1)

docs/models/shared/partitionparameters.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
| `pdf_infer_table_structure` | *Optional[bool]* | :heavy_minus_sign: | Deprecated! Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents. | |
2626
| `skip_infer_table_types` | List[*str*] | :heavy_minus_sign: | The document types that you want to skip table extraction with. Default: [] | |
2727
| `split_pdf_page` | *Optional[bool]* | :heavy_minus_sign: | Should the pdf file be split at client. Ignored on backend. | |
28+
| `starting_page_number` | *Optional[int]* | :heavy_minus_sign: | The real number of the first PDF page. | |
2829
| `strategy` | *Optional[str]* | :heavy_minus_sign: | The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto | hi_res |
2930
| `unique_element_ids` | *Optional[bool]* | :heavy_minus_sign: | When True, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: False | |
3031
| `xml_keep_tags` | *Optional[bool]* | :heavy_minus_sign: | If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to partition_xml. | |

gen.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ generation:
1010
auth:
1111
oAuth2ClientCredentialsEnabled: false
1212
python:
13-
version: 0.23.2
13+
version: 0.23.3
1414
additionalDependencies:
1515
dependencies:
1616
deepdiff: '>=6.0'

overlay_client.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,6 @@ actions:
66
- target: $["components"]["schemas"]["partition_parameters"]["properties"]
77
update:
88
"split_pdf_page": {"type": "boolean", "title": "Split Pdf Page", "description": "Should the pdf file be split at client. Ignored on backend."}
9+
- target: $["components"]["schemas"]["partition_parameters"]["properties"]
10+
update:
11+
"starting_page_number": {"type": "integer", "title": "Starting Page Number", "description": "The real number of the first PDF page."}

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
setuptools.setup(
2121
name='unstructured-client',
22-
version='0.23.2',
22+
version='0.23.3',
2323
author='Unstructured',
2424
description='Python Client SDK for Unstructured API',
2525
license = 'MIT',

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 67 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
from pypdf import PdfReader, PdfWriter
1717
from pypdf.errors import PdfReadError
1818

19-
2019
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
2120
from unstructured_client._hooks.types import (
2221
BeforeRequestContext,
@@ -33,6 +32,10 @@
3332

3433
PARTITION_FORM_FILES_KEY = "files"
3534
PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page"
35+
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number"
36+
37+
DEFAULT_STARTING_PAGE_NUMBER = 1
38+
3639

3740
FormData = dict[str, Union[str, shared.Files]]
3841

@@ -85,6 +88,10 @@ def before_request(
8588
Union[requests.PreparedRequest, Exception]: If `splitPdfPage` is set to `true`,
8689
the last page request; otherwise, the original request.
8790
"""
91+
if self.client is None:
92+
logger.warning("HTTP client not accessible! Continuing without splitting.")
93+
return request
94+
8895
operation_id = hook_ctx.operation_id
8996
content_type = request.headers.get("Content-Type")
9097
body = request.body
@@ -101,9 +108,7 @@ def before_request(
101108
if file is None or not isinstance(file, shared.Files) or not self._is_pdf(file):
102109
return request
103110

104-
if self.client is None:
105-
logger.warning("HTTP client not accessible! Continuing without splitting.")
106-
return request
111+
starting_page_number = self._get_starting_page_number(form_data)
107112

108113
pages = self._get_pdf_pages(file.content)
109114
call_api_partial = functools.partial(
@@ -115,11 +120,14 @@ def before_request(
115120
call_threads = self._get_split_pdf_call_threads()
116121
self.partition_requests[operation_id] = []
117122
last_page_content = io.BytesIO()
123+
last_page_number = 0
118124
with ThreadPoolExecutor(max_workers=call_threads) as executor:
119-
for page_content, page_number, all_pages_number in pages:
120-
# Check if the next page will be the last one
121-
if page_number == all_pages_number:
125+
for page_content, page_index, all_pages_number in pages:
126+
page_number = page_index + starting_page_number
127+
# Check if this page is the last one
128+
if page_index == all_pages_number - 1:
122129
last_page_content = page_content
130+
last_page_number = page_number
123131
break
124132
self.partition_requests[operation_id].append(
125133
executor.submit(call_api_partial, (page_content, page_number))
@@ -128,7 +136,7 @@ def before_request(
128136
# `before_request` method needs to return a request so we skip sending the last page in parallel
129137
# and return that last page at the end of this method
130138
last_page_request = self._create_request(
131-
request, form_data, last_page_content, file.file_name
139+
request, form_data, last_page_content, file.file_name, last_page_number
132140
)
133141
last_page_prepared_request = self.client.prepare_request(last_page_request)
134142
return last_page_prepared_request
@@ -217,7 +225,9 @@ def _is_pdf(self, file: shared.Files) -> bool:
217225
bool: True if the file is a PDF, False otherwise.
218226
"""
219227
if not file.file_name.endswith(".pdf"):
220-
logger.warning("Given file doesn't have '.pdf' extension. Continuing without splitting.")
228+
logger.warning(
229+
"Given file doesn't have '.pdf' extension. Continuing without splitting."
230+
)
221231
return False
222232

223233
try:
@@ -267,8 +277,7 @@ def _get_pdf_pages(
267277
new_pdf.write(pdf_buffer)
268278
pdf_buffer.seek(0)
269279

270-
# 1-index the page numbers
271-
yield pdf_buffer, offset + 1, offset_end
280+
yield pdf_buffer, offset, offset_end
272281
offset += split_size
273282

274283
def _parse_form_data(self, decoded_data: MultipartDecoder) -> FormData:
@@ -349,7 +358,9 @@ def _call_api(
349358
raise RuntimeError("HTTP client not accessible!")
350359
page_content, page_number = page
351360

352-
new_request = self._create_request(request, form_data, page_content, filename)
361+
new_request = self._create_request(
362+
request, form_data, page_content, filename, page_number
363+
)
353364
prepared_request = self.client.prepare_request(new_request)
354365

355366
try:
@@ -364,6 +375,7 @@ def _create_request(
364375
form_data: FormData,
365376
page_content: io.BytesIO,
366377
filename: str,
378+
page_number: int,
367379
) -> requests.Request:
368380
"""
369381
Creates a request object for a part of a splitted PDF file.
@@ -373,6 +385,7 @@ def _create_request(
373385
form_data (FormData): The form data for the request.
374386
page_content (io.BytesIO): Page content in bytes.
375387
filename (str): The original filename of the PDF file.
388+
page_number (int): Number of the page in the original PDF file.
376389
377390
Returns:
378391
requests.Request: The request object for a splitted part of the
@@ -388,6 +401,7 @@ def _create_request(
388401
page_content,
389402
"application/pdf",
390403
),
404+
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY: str(page_number),
391405
}
392406
)
393407
return requests.Request(
@@ -429,7 +443,11 @@ def _prepare_request_payload(self, form_data: FormData) -> FormData:
429443
payload = copy.deepcopy(form_data)
430444
payload.pop(PARTITION_FORM_SPLIT_PDF_PAGE_KEY, None)
431445
payload.pop(PARTITION_FORM_FILES_KEY, None)
432-
payload.update({PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false"})
446+
payload.pop(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, None)
447+
updated_parameters = {
448+
PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false",
449+
}
450+
payload.update(updated_parameters)
433451
return payload
434452

435453
def _create_response(
@@ -527,3 +545,39 @@ def _clear_operation(self, operation_id: str) -> None:
527545
"""
528546
self.partition_responses.pop(operation_id, None)
529547
self.partition_requests.pop(operation_id, None)
548+
549+
def _get_starting_page_number(self, form_data: FormData) -> int:
    """
    Retrieves the starting page number from the given form data. If the value
    is missing or empty, cannot be converted to an integer, or is less than 1,
    the default value is used instead.

    Args:
        form_data (FormData): The form data containing the starting page number.

    Returns:
        int: The starting page number; never less than 1.
    """
    # A missing key or an empty value falls back to the default before parsing.
    raw_value = (
        form_data.get(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY)
        or DEFAULT_STARTING_PAGE_NUMBER
    )

    try:
        starting_page_number = int(raw_value)  # type: ignore
    except (ValueError, TypeError):
        # ValueError: the string is not an integer literal.
        # TypeError: the form value is not a string/number at all (FormData
        # values may also be `shared.Files`), which int() rejects outright.
        logger.warning(
            "'%s' is not a valid integer. Using default value '%d'.",
            PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
            DEFAULT_STARTING_PAGE_NUMBER,
        )
        return DEFAULT_STARTING_PAGE_NUMBER

    if starting_page_number < 1:
        logger.warning(
            "'%s' is less than 1. Using default value '%d'.",
            PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
            DEFAULT_STARTING_PAGE_NUMBER,
        )
        return DEFAULT_STARTING_PAGE_NUMBER

    return starting_page_number

0 commit comments

Comments
 (0)