Skip to content

Starting page number #72

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 39 commits into from
May 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b78ff6f
requests-toolbelt dependency
mpolomdeepsense Apr 22, 2024
5dd920e
Move hooks to separate subdirectory and refactor hooks registration
mpolomdeepsense Apr 22, 2024
74ffbd7
Split pdf hook
mpolomdeepsense Apr 22, 2024
0207dfd
Remove previous implementation of split pdf page functionality
mpolomdeepsense Apr 22, 2024
b4829f0
Add missing new line at the end of a file
mpolomdeepsense Apr 22, 2024
daf7f5e
Fix _clear_operation function indentation
mpolomdeepsense Apr 22, 2024
6ca1451
Speakeasy generate client output
mpolomdeepsense Apr 22, 2024
a73fbe3
Remove TypeAlias import; fixes python 3.9 build
mpolomdeepsense Apr 22, 2024
d15518b
Support default type annotations in older python versions
mpolomdeepsense Apr 22, 2024
8c7ced7
Fix logging initialization
mpolomdeepsense Apr 22, 2024
c038dfd
Fix parallel api calls
mpolomdeepsense Apr 22, 2024
e3ce00c
Added logging in case of an error
mpolomdeepsense Apr 22, 2024
a182d19
Replace relative imports
mpolomdeepsense Apr 23, 2024
846941f
Reason to removing file extension from file name
mpolomdeepsense Apr 23, 2024
9d5d532
Code cleanup
mpolomdeepsense Apr 23, 2024
2e438e8
Explained why last page is skipped when sending parallel requests
mpolomdeepsense Apr 23, 2024
25f1a96
split pdf hook tests
mpolomdeepsense Apr 23, 2024
9042e99
Improve pdf file validation before splitting
mpolomdeepsense Apr 24, 2024
2b850fb
Rollback to port 8000
mpolomdeepsense Apr 24, 2024
b017cf4
Use Speakeasy Files instead of a custom File class
mpolomdeepsense Apr 24, 2024
7263f0e
Unit tests for _is_pdf method of split_pdf_hook
mpolomdeepsense Apr 24, 2024
a2ba3ae
Changed after error hooks order. We don't want to log error in case l…
mpolomdeepsense Apr 24, 2024
9f920dc
Fix for when split_pdf_page is True and one paged pdf is sent. Docume…
mpolomdeepsense Apr 24, 2024
24eae11
Test comment for more code understanding
mpolomdeepsense Apr 24, 2024
6ddac3e
Handling an edge case for when a filename would be an empty string
mpolomdeepsense Apr 24, 2024
4610465
Made empty filename edge case more robust
mpolomdeepsense Apr 24, 2024
1db74d3
Handle edge case for when user inputs empty filename
mpolomdeepsense Apr 25, 2024
1f14e24
Split pdf hook unit tests description
mpolomdeepsense Apr 25, 2024
edbed11
Replace relative imports
mpolomdeepsense Apr 25, 2024
6145375
Fixed comment
mpolomdeepsense Apr 24, 2024
4942a2d
Starting page number
mpolomdeepsense Apr 25, 2024
5d1fe19
Payload starting page number assignment refactor
mpolomdeepsense Apr 25, 2024
70093db
_get_starting_page_number unit tests
mpolomdeepsense Apr 25, 2024
5b24777
Formatting
mpolomdeepsense Apr 26, 2024
6ccf5d2
Fix page numbering
mpolomdeepsense Apr 26, 2024
e0de8d7
temporary starting_page_number overlay addition
mpolomdeepsense Apr 26, 2024
e215f0c
Auto-generated code for starting_page_number parameter
mpolomdeepsense Apr 26, 2024
45cdb2b
Merge branch 'main' into starting_page_num
mpolomdeepsense Apr 26, 2024
252dba7
Remove page_number exclude from split_pdf_page integration test
mpolomdeepsense Apr 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .speakeasy/gen.lock
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
lockVersion: 2.0.0
id: 8b5fa338-9106-4734-abf0-e30d67044a90
management:
docChecksum: b35264eb5f2ce89c808012333367cf1c
docChecksum: 666d45deb8d9066b8e19e04a305ca734
docVersion: 0.0.1
speakeasyVersion: 1.267.1
speakeasyVersion: 1.272.0
generationVersion: 2.312.1
releaseVersion: 0.23.2
configChecksum: c0ddfa44eb8fbd51d397d36253d1d68f
releaseVersion: 0.23.3
configChecksum: 7aae705f3e8a728a15a7177fbca343ad
repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
repoSubDirectory: .
installationURL: https://github.com/Unstructured-IO/unstructured-python-client.git
Expand Down
2 changes: 0 additions & 2 deletions _test_unstructured_client/test__decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,6 @@ def test_integration_split_pdf_has_same_output_as_non_split(
t2=resp_single.elements,
exclude_regex_paths=[
r"root\[\d+\]\['metadata'\]\['parent_id'\]",
# TODO: (Marek Połom) - Remove page number pattern after page numbering parameter is added
r"root\[\d+\]\['metadata'\]\['page_number'\]",
],
)
assert len(diff) == 0
Expand Down
40 changes: 38 additions & 2 deletions _test_unstructured_client/test_split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import requests
from requests_toolbelt import MultipartDecoder, MultipartEncoder


from unstructured_client._hooks.custom import SplitPdfHook
from unstructured_client.models import shared

Expand Down Expand Up @@ -150,6 +149,7 @@ def test_unit_create_request(self):
"parameter_1": "value_1",
"parameter_2": "value_2",
"split_pdf_page": "false",
"starting_page_number": "7",
}
expected_page_filename = "test_file.pdf"
expected_body = MultipartEncoder(
Expand All @@ -165,7 +165,7 @@ def test_unit_create_request(self):
expected_url = ""

# Create request
request_obj = hook._create_request(request, form_data, page[0], filename)
request_obj = hook._create_request(request, form_data, page[0], filename, 7)
request_content_type: str = request_obj.headers.get("Content-Type")
# Assert the request object
self.assertEqual(request_obj.method, "POST")
Expand Down Expand Up @@ -306,3 +306,39 @@ def test_unit_is_pdf_invalid_pdf(self):

self.assertFalse(result)
self.assertIn("Attempted to interpret file as pdf", cm.output[1])

def test_unit_get_starting_page_number_valid_integer(self):
    """A numeric string under `starting_page_number` is returned as that integer."""
    payload = {"starting_page_number": "5"}

    starting_page = SplitPdfHook()._get_starting_page_number(payload)

    self.assertEqual(starting_page, 5)

def test_unit_get_starting_page_number_invalid_integer(self):
    """A non-numeric `starting_page_number` value makes the hook return 1."""
    payload = {"starting_page_number": "abc"}

    starting_page = SplitPdfHook()._get_starting_page_number(payload)

    self.assertEqual(starting_page, 1)

def test_unit_get_starting_page_number_less_than_one(self):
    """A `starting_page_number` of "0" (below 1) makes the hook return 1."""
    payload = {"starting_page_number": "0"}

    starting_page = SplitPdfHook()._get_starting_page_number(payload)

    self.assertEqual(starting_page, 1)

def test_unit_get_starting_page_number_missing_key(self):
    """Form data without a `starting_page_number` key makes the hook return 1."""
    payload = {}

    starting_page = SplitPdfHook()._get_starting_page_number(payload)

    self.assertEqual(starting_page, 1)
1 change: 1 addition & 0 deletions docs/models/shared/partitionparameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
| `pdf_infer_table_structure` | *Optional[bool]* | :heavy_minus_sign: | Deprecated! Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents. | |
| `skip_infer_table_types` | List[*str*] | :heavy_minus_sign: | The document types that you want to skip table extraction with. Default: [] | |
| `split_pdf_page` | *Optional[bool]* | :heavy_minus_sign: | Should the pdf file be split at client. Ignored on backend. | |
| `starting_page_number` | *Optional[int]* | :heavy_minus_sign: | The real number of the first PDF page. | |
| `strategy` | *Optional[str]* | :heavy_minus_sign: | The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto | hi_res |
| `unique_element_ids` | *Optional[bool]* | :heavy_minus_sign: | When True, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: False | |
| `xml_keep_tags` | *Optional[bool]* | :heavy_minus_sign: | If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to partition_xml. | |
2 changes: 1 addition & 1 deletion gen.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ generation:
auth:
oAuth2ClientCredentialsEnabled: false
python:
version: 0.23.2
version: 0.23.3
additionalDependencies:
dependencies:
deepdiff: '>=6.0'
Expand Down
3 changes: 3 additions & 0 deletions overlay_client.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ actions:
- target: $["components"]["schemas"]["partition_parameters"]["properties"]
update:
"split_pdf_page": {"type": "boolean", "title": "Split Pdf Page", "description": "Should the pdf file be split at client. Ignored on backend."}
- target: $["components"]["schemas"]["partition_parameters"]["properties"]
update:
"starting_page_number": {"type": "integer", "title": "Starting Page Number", "description": "The real number of the first PDF page."}
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

setuptools.setup(
name='unstructured-client',
version='0.23.2',
version='0.23.3',
author='Unstructured',
description='Python Client SDK for Unstructured API',
license = 'MIT',
Expand Down
80 changes: 67 additions & 13 deletions src/unstructured_client/_hooks/custom/split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError


from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
from unstructured_client._hooks.types import (
BeforeRequestContext,
Expand All @@ -33,6 +32,10 @@

PARTITION_FORM_FILES_KEY = "files"
PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page"
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number"

DEFAULT_STARTING_PAGE_NUMBER = 1


FormData = dict[str, Union[str, shared.Files]]

Expand Down Expand Up @@ -85,6 +88,10 @@ def before_request(
Union[requests.PreparedRequest, Exception]: If `splitPdfPage` is set to `true`,
the last page request; otherwise, the original request.
"""
if self.client is None:
logger.warning("HTTP client not accessible! Continuing without splitting.")
return request

operation_id = hook_ctx.operation_id
content_type = request.headers.get("Content-Type")
body = request.body
Expand All @@ -101,9 +108,7 @@ def before_request(
if file is None or not isinstance(file, shared.Files) or not self._is_pdf(file):
return request

if self.client is None:
logger.warning("HTTP client not accessible! Continuing without splitting.")
return request
starting_page_number = self._get_starting_page_number(form_data)

pages = self._get_pdf_pages(file.content)
call_api_partial = functools.partial(
Expand All @@ -115,11 +120,14 @@ def before_request(
call_threads = self._get_split_pdf_call_threads()
self.partition_requests[operation_id] = []
last_page_content = io.BytesIO()
last_page_number = 0
with ThreadPoolExecutor(max_workers=call_threads) as executor:
for page_content, page_number, all_pages_number in pages:
# Check if the next page will be the last one
if page_number == all_pages_number:
for page_content, page_index, all_pages_number in pages:
page_number = page_index + starting_page_number
# Check if this page is the last one
if page_index == all_pages_number - 1:
last_page_content = page_content
last_page_number = page_number
break
self.partition_requests[operation_id].append(
executor.submit(call_api_partial, (page_content, page_number))
Expand All @@ -128,7 +136,7 @@ def before_request(
# `before_request` method needs to return a request so we skip sending the last page in parallel
# and return that last page at the end of this method
last_page_request = self._create_request(
request, form_data, last_page_content, file.file_name
request, form_data, last_page_content, file.file_name, last_page_number
)
last_page_prepared_request = self.client.prepare_request(last_page_request)
return last_page_prepared_request
Expand Down Expand Up @@ -217,7 +225,9 @@ def _is_pdf(self, file: shared.Files) -> bool:
bool: True if the file is a PDF, False otherwise.
"""
if not file.file_name.endswith(".pdf"):
logger.warning("Given file doesn't have '.pdf' extension. Continuing without splitting.")
logger.warning(
"Given file doesn't have '.pdf' extension. Continuing without splitting."
)
return False

try:
Expand Down Expand Up @@ -267,8 +277,7 @@ def _get_pdf_pages(
new_pdf.write(pdf_buffer)
pdf_buffer.seek(0)

# 1-index the page numbers
yield pdf_buffer, offset + 1, offset_end
yield pdf_buffer, offset, offset_end
offset += split_size

def _parse_form_data(self, decoded_data: MultipartDecoder) -> FormData:
Expand Down Expand Up @@ -349,7 +358,9 @@ def _call_api(
raise RuntimeError("HTTP client not accessible!")
page_content, page_number = page

new_request = self._create_request(request, form_data, page_content, filename)
new_request = self._create_request(
request, form_data, page_content, filename, page_number
)
prepared_request = self.client.prepare_request(new_request)

try:
Expand All @@ -364,6 +375,7 @@ def _create_request(
form_data: FormData,
page_content: io.BytesIO,
filename: str,
page_number: int,
) -> requests.Request:
"""
Creates a request object for a part of a splitted PDF file.
Expand All @@ -373,6 +385,7 @@ def _create_request(
form_data (FormData): The form data for the request.
page_content (io.BytesIO): Page content in bytes.
filename (str): The original filename of the PDF file.
page_number (int): Number of the page in the original PDF file.

Returns:
requests.Request: The request object for a splitted part of the
Expand All @@ -388,6 +401,7 @@ def _create_request(
page_content,
"application/pdf",
),
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY: str(page_number),
}
)
return requests.Request(
Expand Down Expand Up @@ -429,7 +443,11 @@ def _prepare_request_payload(self, form_data: FormData) -> FormData:
payload = copy.deepcopy(form_data)
payload.pop(PARTITION_FORM_SPLIT_PDF_PAGE_KEY, None)
payload.pop(PARTITION_FORM_FILES_KEY, None)
payload.update({PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false"})
payload.pop(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, None)
updated_parameters = {
PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false",
}
payload.update(updated_parameters)
return payload

def _create_response(
Expand Down Expand Up @@ -527,3 +545,39 @@ def _clear_operation(self, operation_id: str) -> None:
"""
self.partition_responses.pop(operation_id, None)
self.partition_requests.pop(operation_id, None)

def _get_starting_page_number(self, form_data: FormData) -> int:
    """
    Retrieves the starting page number from the given form data.

    Falls back to `DEFAULT_STARTING_PAGE_NUMBER` when the value is missing or
    empty (silently), or when it cannot be converted to an integer or is less
    than 1 (with a logged warning).

    Args:
        form_data (FormData): The form data containing the starting page number.

    Returns:
        int: The starting page number.
    """
    raw_value = form_data.get(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY)
    if not raw_value:
        # Missing key or empty value: nothing to validate, use the default.
        return DEFAULT_STARTING_PAGE_NUMBER

    try:
        starting_page_number = int(raw_value)  # type: ignore
    except (ValueError, TypeError):
        # ValueError: a string that is not an integer literal.
        # TypeError: a non-string form value (FormData values may also be
        # file objects), which int() rejects with TypeError.
        logger.warning(
            "'%s' is not a valid integer. Using default value '%d'.",
            PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
            DEFAULT_STARTING_PAGE_NUMBER,
        )
        return DEFAULT_STARTING_PAGE_NUMBER

    if starting_page_number < 1:
        logger.warning(
            "'%s' is less than 1. Using default value '%d'.",
            PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
            DEFAULT_STARTING_PAGE_NUMBER,
        )
        return DEFAULT_STARTING_PAGE_NUMBER

    return starting_page_number
Loading