29
29
)
30
30
from unstructured_client .models import shared
31
31
32
+ # TODO: (Marek Połom) - Update documentation before merging
33
+
32
34
logger = logging .getLogger (UNSTRUCTURED_CLIENT_LOGGER_NAME )
33
35
34
36
PARTITION_FORM_FILES_KEY = "files"
35
37
PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page"
38
+ PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number"
36
39
37
40
SUBSTITUTE_FILENAME = "file_for_partition.pdf"
41
+ DEFAULT_STARTING_PAGE_NUMBER = 1
38
42
39
43
40
44
FormData = dict [str , Union [str , shared .Files ]]
@@ -88,6 +92,10 @@ def before_request(
88
92
Union[requests.PreparedRequest, Exception]: If `splitPdfPage` is set to `true`,
89
93
the last page request; otherwise, the original request.
90
94
"""
95
+ if self .client is None :
96
+ logger .warning ("HTTP client not accessible! Continuing without splitting." )
97
+ return request
98
+
91
99
operation_id = hook_ctx .operation_id
92
100
content_type = request .headers .get ("Content-Type" )
93
101
body = request .body
@@ -104,9 +112,7 @@ def before_request(
104
112
if file is None or not isinstance (file , shared .Files ) or not self ._is_pdf (file ):
105
113
return request
106
114
107
- if self .client is None :
108
- logger .warning ("HTTP client not accessible! Continuing without splitting." )
109
- return request
115
+ starting_page_number = self ._get_starting_page_number (form_data )
110
116
111
117
pages = self ._get_pdf_pages (file .content )
112
118
call_api_partial = functools .partial (
@@ -119,10 +125,12 @@ def before_request(
119
125
self .partition_requests [operation_id ] = []
120
126
last_page_content = io .BytesIO ()
121
127
with ThreadPoolExecutor (max_workers = call_threads ) as executor :
122
- for page_content , page_number , all_pages_number in pages :
128
+ for page_content , page_index , all_pages_number in pages :
129
+ page_number = page_index + starting_page_number
123
130
# Check if this page is the last one
124
- if page_number == all_pages_number :
131
+ if page_index == all_pages_number :
125
132
last_page_content = page_content
133
+ last_page_number = page_number
126
134
break
127
135
self .partition_requests [operation_id ].append (
128
136
executor .submit (call_api_partial , (page_content , page_number ))
@@ -131,7 +139,7 @@ def before_request(
131
139
# `before_request` method needs to return a request so we skip sending the last page in parallel
132
140
# and return that last page at the end of this method
133
141
last_page_request = self ._create_request (
134
- request , form_data , last_page_content , file .file_name
142
+ request , form_data , last_page_content , file .file_name , last_page_number
135
143
)
136
144
last_page_prepared_request = self .client .prepare_request (last_page_request )
137
145
return last_page_prepared_request
@@ -270,8 +278,7 @@ def _get_pdf_pages(
270
278
new_pdf .write (pdf_buffer )
271
279
pdf_buffer .seek (0 )
272
280
273
- # 1-index the page numbers
274
- yield pdf_buffer , offset + 1 , offset_end
281
+ yield pdf_buffer , offset , offset_end
275
282
offset += split_size
276
283
277
284
def _parse_form_data (self , decoded_data : MultipartDecoder ) -> FormData :
@@ -362,7 +369,9 @@ def _call_api(
362
369
raise RuntimeError ("HTTP client not accessible!" )
363
370
page_content , page_number = page
364
371
365
- new_request = self ._create_request (request , form_data , page_content , filename )
372
+ new_request = self ._create_request (
373
+ request , form_data , page_content , filename , page_number
374
+ )
366
375
prepared_request = self .client .prepare_request (new_request )
367
376
368
377
try :
@@ -377,6 +386,7 @@ def _create_request(
377
386
form_data : FormData ,
378
387
page_content : io .BytesIO ,
379
388
filename : str ,
389
+ page_number : int ,
380
390
) -> requests .Request :
381
391
"""
382
392
Creates a request object for one part of a split PDF file.
@@ -392,7 +402,7 @@ def _create_request(
392
402
original file.
393
403
"""
394
404
headers = self ._prepare_request_headers (request .headers )
395
- payload = self ._prepare_request_payload (form_data )
405
+ payload = self ._prepare_request_payload (form_data , page_number )
396
406
body = MultipartEncoder (
397
407
fields = {
398
408
** payload ,
@@ -428,7 +438,9 @@ def _prepare_request_headers(
428
438
headers .pop ("Content-Length" , None )
429
439
return headers
430
440
431
- def _prepare_request_payload (self , form_data : FormData ) -> FormData :
441
+ def _prepare_request_payload (
442
+ self , form_data : FormData , page_number : int
443
+ ) -> FormData :
432
444
"""
433
445
Prepares the request payload by removing unnecessary keys and updating the
434
446
file.
@@ -442,7 +454,12 @@ def _prepare_request_payload(self, form_data: FormData) -> FormData:
442
454
payload = copy .deepcopy (form_data )
443
455
payload .pop (PARTITION_FORM_SPLIT_PDF_PAGE_KEY , None )
444
456
payload .pop (PARTITION_FORM_FILES_KEY , None )
445
- payload .update ({PARTITION_FORM_SPLIT_PDF_PAGE_KEY : "false" })
457
+ payload .pop (PARTITION_FORM_STARTING_PAGE_NUMBER_KEY , None )
458
+ updated_parameters = {
459
+ PARTITION_FORM_SPLIT_PDF_PAGE_KEY : "false" ,
460
+ PARTITION_FORM_STARTING_PAGE_NUMBER_KEY : str (page_number ),
461
+ }
462
+ payload .update (updated_parameters )
446
463
return payload
447
464
448
465
def _create_response (
@@ -540,3 +557,28 @@ def _clear_operation(self, operation_id: str) -> None:
540
557
"""
541
558
self .partition_responses .pop (operation_id , None )
542
559
self .partition_requests .pop (operation_id , None )
560
+
561
def _get_starting_page_number(self, form_data: FormData) -> int:
    """
    Retrieves the starting page number from the given form data.

    A missing or empty value silently yields `DEFAULT_STARTING_PAGE_NUMBER`.
    A value that cannot be converted to an integer, or that is less than 1,
    is replaced by `DEFAULT_STARTING_PAGE_NUMBER` and a warning is logged.

    Args:
        form_data: The form data of the request, looked up under
            `PARTITION_FORM_STARTING_PAGE_NUMBER_KEY`.

    Returns:
        int: The parsed starting page number, or the default when the value
        is missing, not an integer, or less than 1.
    """
    raw_value = form_data.get(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY)
    if not raw_value:
        # Key absent or empty: nothing to validate, so no warning is emitted.
        return DEFAULT_STARTING_PAGE_NUMBER

    try:
        starting_page_number = int(raw_value)  # type: ignore
    except (ValueError, TypeError):
        # ValueError: a string that is not an integer literal.
        # TypeError: a non-string form value (FormData values may also be
        # file objects), which int() refuses outright.
        logger.warning(
            "'%s' is not a valid integer. Using default value '%d'.",
            PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
            DEFAULT_STARTING_PAGE_NUMBER,
        )
        return DEFAULT_STARTING_PAGE_NUMBER

    if starting_page_number < 1:
        logger.warning(
            "'%s' is less than 1. Using default value '%d'.",
            PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
            DEFAULT_STARTING_PAGE_NUMBER,
        )
        return DEFAULT_STARTING_PAGE_NUMBER

    return starting_page_number
0 commit comments