Skip to content

Commit 61c3f67

Browse files
micmarty-deepsense, cragwolfe, and awalker4
authored
Support for starting_page_number parameter when doing PDF page splitting (Unstructured-IO#400)
This PR enables the Python and JS clients to partition PDF pages independently after splitting them on their side (`split_pdf_page=True`). Splitting is also supported by the API itself - this makes sense when users send their requests without using our dedicated clients.

Related to:
* Unstructured-IO/unstructured#2842
* Unstructured-IO/unstructured#2673

It should be merged before these:
* Unstructured-IO/unstructured-js-client#55
* Unstructured-IO/unstructured-python-client#72

**The tests for this PR won't pass until the related PRs are both merged.**

## How to test it locally

Unfortunately, the `pytest` test is not fully implemented and currently fails - see [this comment](Unstructured-IO#400 (comment)).

1. Clone the Python client and check out this PR: Unstructured-IO/unstructured-python-client#72
2. `cd unstructured-client; pip install --editable .`
3. `make run-web-app`
4. `python <script-below>.py`

```python
import os

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

s = UnstructuredClient(api_key_auth=os.environ["UNS_API_KEY"], server_url="http://localhost:8000")

# -- this file is included in this PR --
filename = "sample-docs/DA-1p-with-duplicate-pages.pdf"

with open(filename, "rb") as f:
    files = shared.Files(content=f.read(), file_name=filename)

req = shared.PartitionParameters(
    files=files,
    strategy="fast",
    languages=["eng"],
    split_pdf_page=False,  # this forces splitting on API side (if parallelization is enabled)
    # split_pdf_page=True,  # forces client-side splitting, implemented here: Unstructured-IO/unstructured-python-client#72
)

resp = s.general.partition(req)
ids = [e["element_id"] for e in resp.elements]
page_numbers = [e["metadata"]["page_number"] for e in resp.elements]

# this PDF contains 3 identical pages, 13 elements each
assert page_numbers == [1,1,1,1,1,1,1,1,1,1,1,1,1,
                        2,2,2,2,2,2,2,2,2,2,2,2,2,
                        3,3,3,3,3,3,3,3,3,3,3,3,3]
assert len(ids) == len(set(ids)), "Element IDs are not unique"
```

---------

Co-authored-by: cragwolfe <[email protected]>
Co-authored-by: Austin Walker <[email protected]>
1 parent 21bb99c commit 61c3f67

File tree

7 files changed

+324
-108
lines changed

7 files changed

+324
-108
lines changed

prepline_general/api/general.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
from pypdf.errors import FileNotDecryptedError, PdfReadError
3333
from starlette.datastructures import Headers
3434
from starlette.types import Send
35+
36+
from prepline_general.api.models.form_params import GeneralFormParams
3537
from unstructured.documents.elements import Element
3638
from unstructured.partition.auto import partition
3739
from unstructured.staging.base import (
@@ -42,8 +44,6 @@
4244
from unstructured_inference.models.base import UnknownModelException
4345
from unstructured_inference.models.chipper import MODEL_TYPES as CHIPPER_MODEL_TYPES
4446

45-
from prepline_general.api.models.form_params import GeneralFormParams
46-
4747
app = FastAPI()
4848
router = APIRouter()
4949

@@ -178,17 +178,19 @@ def partition_file_via_api(
178178
raise HTTPException(status_code=500, detail="Parallel mode enabled but no url set!")
179179

180180
api_key = request.headers.get("unstructured-api-key", default="")
181+
partition_kwargs["starting_page_number"] = (
182+
partition_kwargs.get("starting_page_number", 1) + page_offset
183+
)
181184

182-
result = call_api(request_url, api_key, filename, file, content_type, **partition_kwargs)
183-
elements = elements_from_json(text=result)
184-
185-
# We need to account for the original page numbers
186-
for element in elements:
187-
if element.metadata.page_number:
188-
# Page number could be None if we include page breaks
189-
element.metadata.page_number += page_offset
190-
191-
return elements
185+
result = call_api(
186+
request_url,
187+
api_key,
188+
filename,
189+
file,
190+
content_type,
191+
**partition_kwargs,
192+
)
193+
return elements_from_json(text=result)
192194

193195

194196
def partition_pdf_splits(
@@ -300,6 +302,7 @@ def pipeline_api(
300302
languages: Optional[List[str]] = None,
301303
extract_image_block_types: Optional[List[str]] = None,
302304
unique_element_ids: Optional[bool] = False,
305+
starting_page_number: Optional[int] = None,
303306
) -> List[Dict[str, Any]] | str:
304307
if filename.endswith(".msg"):
305308
# Note(yuming): convert file type for msg files
@@ -342,6 +345,7 @@ def pipeline_api(
342345
"new_after_n_chars": new_after_n_chars,
343346
"overlap": overlap,
344347
"overlap_all": overlap_all,
348+
"starting_page_number": starting_page_number,
345349
},
346350
default=str,
347351
)
@@ -362,6 +366,8 @@ def pipeline_api(
362366
# Parallel mode is set by env variable
363367
enable_parallel_mode = os.environ.get("UNSTRUCTURED_PARALLEL_MODE_ENABLED", "false")
364368
pdf_parallel_mode_enabled = enable_parallel_mode == "true"
369+
if starting_page_number is None:
370+
starting_page_number = 1
365371

366372
ocr_languages_str = "+".join(ocr_languages) if ocr_languages and len(ocr_languages) else None
367373

@@ -422,6 +428,7 @@ def pipeline_api(
422428
"extract_image_block_types": extract_image_block_types,
423429
"extract_image_block_to_payload": extract_image_block_to_payload,
424430
"unique_element_ids": unique_element_ids,
431+
"starting_page_number": starting_page_number,
425432
}
426433

427434
if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
@@ -791,6 +798,7 @@ def response_generator(is_multipart: bool):
791798
new_after_n_chars=form_params.new_after_n_chars,
792799
overlap=form_params.overlap,
793800
overlap_all=form_params.overlap_all,
801+
starting_page_number=form_params.starting_page_number,
794802
)
795803

796804
yield (

prepline_general/api/models/form_params.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
from typing import Annotated, Optional, List, Literal
1+
from typing import Annotated, List, Literal, Optional
22

33
from fastapi import Form
4-
54
from pydantic import BaseModel, BeforeValidator
65

76
from prepline_general.api.utils import SmartValueParser
@@ -35,6 +34,7 @@ class GeneralFormParams(BaseModel):
3534
new_after_n_chars: Optional[int]
3635
overlap: int
3736
overlap_all: bool
37+
starting_page_number: Optional[int] = None
3838

3939
@classmethod
4040
def as_form(
@@ -225,6 +225,17 @@ def as_form(
225225
example=True,
226226
),
227227
] = False,
228+
starting_page_number: Annotated[
229+
Optional[int],
230+
Form(
231+
title="PDF Starting Page Number",
232+
description=(
233+
"When PDF is split into pages before sending it into the API, providing "
234+
"this information will allow the page number to be assigned correctly."
235+
),
236+
example=3,
237+
),
238+
] = None,
228239
) -> "GeneralFormParams":
229240
return cls(
230241
xml_keep_tags=xml_keep_tags,
@@ -250,4 +261,5 @@ def as_form(
250261
overlap=overlap,
251262
overlap_all=overlap_all,
252263
unique_element_ids=unique_element_ids,
264+
starting_page_number=starting_page_number,
253265
)

requirements/base.txt

Lines changed: 64 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ backoff==2.2.1
1616
# unstructured
1717
beautifulsoup4==4.12.3
1818
# via unstructured
19+
cachetools==5.3.3
20+
# via google-auth
1921
certifi==2024.2.2
2022
# via
2123
# requests
@@ -42,23 +44,25 @@ cryptography==42.0.5
4244
# via pdfminer-six
4345
cycler==0.12.1
4446
# via matplotlib
45-
dataclasses-json==0.6.4
46-
# via unstructured
47-
dataclasses-json-speakeasy==0.5.11
47+
dataclasses-json==0.6.5
48+
# via
49+
# unstructured
50+
# unstructured-client
51+
deepdiff==7.0.1
4852
# via unstructured-client
4953
deprecated==1.2.14
5054
# via pikepdf
5155
effdet==0.4.1
5256
# via layoutparser
53-
emoji==2.11.0
57+
emoji==2.11.1
5458
# via unstructured
5559
et-xmlfile==1.1.0
5660
# via openpyxl
57-
exceptiongroup==1.2.0
61+
exceptiongroup==1.2.1
5862
# via anyio
59-
fastapi==0.110.1
63+
fastapi==0.110.2
6064
# via -r requirements/base.in
61-
filelock==3.13.4
65+
filelock==3.14.0
6266
# via
6367
# huggingface-hub
6468
# torch
@@ -73,6 +77,24 @@ fsspec==2024.3.1
7377
# via
7478
# huggingface-hub
7579
# torch
80+
google-api-core[grpc]==2.18.0
81+
# via google-cloud-vision
82+
google-auth==2.29.0
83+
# via
84+
# google-api-core
85+
# google-cloud-vision
86+
google-cloud-vision==3.7.2
87+
# via unstructured
88+
googleapis-common-protos==1.63.0
89+
# via
90+
# google-api-core
91+
# grpcio-status
92+
grpcio==1.62.2
93+
# via
94+
# google-api-core
95+
# grpcio-status
96+
grpcio-status==1.62.2
97+
# via google-api-core
7698
h11==0.14.0
7799
# via uvicorn
78100
huggingface-hub==0.22.2
@@ -83,7 +105,7 @@ huggingface-hub==0.22.2
83105
# unstructured-inference
84106
humanfriendly==10.0
85107
# via coloredlogs
86-
idna==3.6
108+
idna==3.7
87109
# via
88110
# anyio
89111
# requests
@@ -115,7 +137,6 @@ markupsafe==2.1.5
115137
marshmallow==3.21.1
116138
# via
117139
# dataclasses-json
118-
# dataclasses-json-speakeasy
119140
# unstructured-client
120141
matplotlib==3.8.4
121142
# via pycocotools
@@ -155,14 +176,16 @@ onnx==1.16.0
155176
# via
156177
# unstructured
157178
# unstructured-inference
158-
onnxruntime==1.15.1
179+
onnxruntime==1.17.3
159180
# via unstructured-inference
160181
opencv-python==4.9.0.80
161182
# via
162183
# layoutparser
163184
# unstructured-inference
164185
openpyxl==3.1.2
165186
# via unstructured
187+
ordered-set==4.1.0
188+
# via deepdiff
166189
packaging==24.0
167190
# via
168191
# huggingface-hub
@@ -174,7 +197,7 @@ packaging==24.0
174197
# transformers
175198
# unstructured-client
176199
# unstructured-pytesseract
177-
pandas==2.2.1
200+
pandas==2.2.2
178201
# via
179202
# layoutparser
180203
# unstructured
@@ -188,7 +211,7 @@ pdfminer-six==20231228
188211
# unstructured
189212
pdfplumber==0.11.0
190213
# via layoutparser
191-
pikepdf==8.15.0
214+
pikepdf==8.15.1
192215
# via unstructured
193216
pillow==10.3.0
194217
# via
@@ -206,21 +229,36 @@ pillow-heif==0.16.0
206229
# via unstructured
207230
portalocker==2.8.2
208231
# via iopath
209-
protobuf==5.26.1
232+
proto-plus==1.23.0
233+
# via
234+
# google-api-core
235+
# google-cloud-vision
236+
protobuf==4.25.3
210237
# via
238+
# google-api-core
239+
# google-cloud-vision
240+
# googleapis-common-protos
241+
# grpcio-status
211242
# onnx
212243
# onnxruntime
244+
# proto-plus
213245
psutil==5.9.8
214246
# via -r requirements/base.in
247+
pyasn1==0.6.0
248+
# via
249+
# pyasn1-modules
250+
# rsa
251+
pyasn1-modules==0.4.0
252+
# via google-auth
215253
pycocotools==2.0.7
216254
# via effdet
217255
pycparser==2.22
218256
# via cffi
219257
pycryptodome==3.20.0
220258
# via -r requirements/base.in
221-
pydantic==2.6.4
259+
pydantic==2.7.1
222260
# via fastapi
223-
pydantic-core==2.16.3
261+
pydantic-core==2.18.2
224262
# via pydantic
225263
pypandoc==1.13
226264
# via unstructured
@@ -230,6 +268,7 @@ pypdf==4.2.0
230268
# via
231269
# -r requirements/base.in
232270
# unstructured
271+
# unstructured-client
233272
pypdfium2==4.29.0
234273
# via pdfplumber
235274
pytesseract==0.3.10
@@ -241,7 +280,7 @@ python-dateutil==2.9.0.post0
241280
# unstructured-client
242281
python-docx==1.1.0
243282
# via unstructured
244-
python-iso639==2024.2.7
283+
python-iso639==2024.4.27
245284
# via unstructured
246285
python-magic==0.4.27
247286
# via unstructured
@@ -264,17 +303,20 @@ rapidfuzz==3.8.1
264303
# unstructured-inference
265304
ratelimit==2.2.1
266305
# via -r requirements/base.in
267-
regex==2023.12.25
306+
regex==2024.4.28
268307
# via
269308
# nltk
270309
# transformers
271310
requests==2.31.0
272311
# via
273312
# -r requirements/base.in
313+
# google-api-core
274314
# huggingface-hub
275315
# transformers
276316
# unstructured
277317
# unstructured-client
318+
rsa==4.9
319+
# via google-auth
278320
safetensors==0.3.2
279321
# via
280322
# -c requirements/constraints.in
@@ -303,13 +345,13 @@ timm==0.9.16
303345
# via effdet
304346
tokenizers==0.15.2
305347
# via transformers
306-
torch==2.2.2
348+
torch==2.3.0
307349
# via
308350
# effdet
309351
# layoutparser
310352
# timm
311353
# torchvision
312-
torchvision==0.17.2
354+
torchvision==0.18.0
313355
# via
314356
# effdet
315357
# layoutparser
@@ -340,15 +382,14 @@ typing-extensions==4.11.0
340382
typing-inspect==0.9.0
341383
# via
342384
# dataclasses-json
343-
# dataclasses-json-speakeasy
344385
# unstructured-client
345386
tzdata==2024.1
346387
# via pandas
347-
unstructured[local-inference]==0.13.2
388+
unstructured[local-inference]==0.13.5
348389
# via -r requirements/base.in
349-
unstructured-client==0.18.0
390+
unstructured-client==0.22.0
350391
# via unstructured
351-
unstructured-inference==0.7.25
392+
unstructured-inference==0.7.29
352393
# via unstructured
353394
unstructured-pytesseract==0.3.12
354395
# via unstructured

0 commit comments

Comments
 (0)