Skip to content

feat: Parameter to send custom page range when splitting pdf #125

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ res = s.general.partition(request=operations.PartitionRequest(
content='0x2cC94b2FEF'.encode(),
file_name='your_file_here',
),
split_pdf_page_range=[
1,
10,
],
strategy=shared.Strategy.AUTO,
),
))
Expand Down Expand Up @@ -110,6 +114,10 @@ res = s.general.partition(request=operations.PartitionRequest(
content='0x2cC94b2FEF'.encode(),
file_name='your_file_here',
),
split_pdf_page_range=[
1,
10,
],
strategy=shared.Strategy.AUTO,
),
),
Expand Down Expand Up @@ -139,6 +147,10 @@ res = s.general.partition(request=operations.PartitionRequest(
content='0x2cC94b2FEF'.encode(),
file_name='your_file_here',
),
split_pdf_page_range=[
1,
10,
],
strategy=shared.Strategy.AUTO,
),
))
Expand Down
4 changes: 4 additions & 0 deletions USAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ res = s.general.partition(request=operations.PartitionRequest(
content='0x2cC94b2FEF'.encode(),
file_name='your_file_here',
),
split_pdf_page_range=[
1,
10,
],
strategy=shared.Strategy.AUTO,
),
))
Expand Down
70 changes: 70 additions & 0 deletions _test_unstructured_client/integration/test_decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,73 @@ def test_integration_split_pdf_for_file_with_no_name():
)

pytest.raises(ValueError, client.general.partition, req)


@pytest.mark.parametrize("starting_page_number", [1, 100])
@pytest.mark.parametrize(
    "page_range, expected_ok, expected_pages",
    [
        (["1", "14"], True, (1, 14)),  # Valid range, start on boundary
        (["4", "16"], True, (4, 16)),  # Valid range, end on boundary
        (["2", "5"], True, (2, 5)),  # Valid range within boundary
        # A 1 page doc wouldn't normally be split,
        # but this code still needs to return the page range
        (["6", "6"], True, (6, 6)),
        (["2", "100"], False, None),  # End page too high
        (["50", "100"], False, None),  # Range too high
        (["-50", "5"], False, None),  # Start page too low
        (["-50", "-2"], False, None),  # Range too low
        (["10", "2"], False, None),  # Backwards range
    ],
)
def test_integration_split_pdf_with_page_range(
    starting_page_number: int,
    page_range: list[str],
    expected_ok: bool,
    expected_pages: "tuple[int, int] | None",
    caplog,
):
    """
    Test that we can split pdfs with an arbitrary page range. Send the selected range
    to the API and assert that the metadata page numbers are correct.
    We should also be able to offset the metadata with starting_page_number.

    Cases with expected_ok=False must raise a ValueError whose message (and the
    captured log) contains "is out of bounds."; for those cases expected_pages is None.

    Requires unstructured-api running in bg. See Makefile for how to run it.
    """
    try:
        response = requests.get("http://localhost:8000/general/docs")
    except requests.exceptions.ConnectionError:
        # pytest.fail (unlike `assert False`) is not stripped when running under -O.
        pytest.fail("The unstructured-api is not running on localhost:8000")
    assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    filename = "_sample_docs/layout-parser-paper.pdf"
    with open(filename, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=filename,
        )

    req = shared.PartitionParameters(
        files=files,
        strategy="fast",
        split_pdf_page=True,
        split_pdf_page_range=page_range,
        starting_page_number=starting_page_number,
    )

    try:
        resp = client.general.partition(req)
    except ValueError as exc:
        assert not expected_ok
        assert "is out of bounds." in caplog.text
        assert "is out of bounds." in str(exc)
        return

    # Without this guard, an invalid range that is silently accepted would surface
    # as a TypeError on `expected_pages[0]` (None) instead of a readable failure.
    assert expected_ok, f"Expected page range {page_range} to be rejected with a ValueError"

    page_numbers = {e["metadata"]["page_number"] for e in resp.elements}

    # starting_page_number is 1-based, so a value of 1 leaves page numbers unchanged.
    min_page_number = expected_pages[0] + starting_page_number - 1
    max_page_number = expected_pages[1] + starting_page_number - 1

    assert min(page_numbers) == min_page_number, f"Result should start at page {min_page_number}"
    assert max(page_numbers) == max_page_number, f"Result should end at page {max_page_number}"
96 changes: 81 additions & 15 deletions _test_unstructured_client/unit/test_split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from unstructured_client._hooks.custom.form_utils import (
PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
PARTITION_FORM_PAGE_RANGE_KEY,
)
from unstructured_client._hooks.custom.split_pdf_hook import (
DEFAULT_CONCURRENCY_LEVEL,
Expand Down Expand Up @@ -122,7 +123,8 @@ def test_unit_create_response():


def test_unit_create_request():
"""Test create request method properly sets file, Content-Type and Content-Length headers."""
"""Test create request method properly sets file, Content-Type and Content-Length headers.
List parameters should be flattened in the body."""

# Prepare test data
request = requests.PreparedRequest()
Expand All @@ -133,27 +135,27 @@ def test_unit_create_request():
form_data = {
"parameter_1": "value_1",
"parameter_2": "value_2",
"list_parameter": ["value_1", "value_2"],
}
page = (io.BytesIO(b"page_content"), 1)
filename = "test_file.pdf"

# Expected results
expected_payload = {
"parameter_1": "value_1",
"parameter_2": "value_2",
"split_pdf_page": "false",
"starting_page_number": "7",
}
expected_page_filename = "test_file.pdf"
expected_body = MultipartEncoder(
fields={
**expected_payload,
"files": (
fields=[
("parameter_1", "value_1"),
("parameter_2", "value_2"),
("list_parameter", "value_1"),
("list_parameter", "value_2"),
("split_pdf_page", "false"),
("starting_page_number", "7"),
("files", (
expected_page_filename,
page[0],
"application/pdf",
),
}
)),
]
)
expected_url = ""

Expand All @@ -164,7 +166,10 @@ def test_unit_create_request():
# Assert the request object
assert request_obj.method == "POST"
assert request_obj.url == expected_url
assert request_obj.data.fields == expected_body.fields

# Validate fields ignoring order
assert set(request_obj.data.fields) == set(expected_body.fields)

assert request_content_type.startswith("multipart/form-data")


Expand All @@ -191,11 +196,37 @@ def test_unit_decode_content_disposition():


def test_unit_parse_form_data():
"""Test parse form data method properly parses the form data and returns dictionary."""
"""Test parse form data method properly parses the form data and returns dictionary.
Parameters with the same key should be consolidated to a list."""

# Prepare test data
test_form_data = (
b"--boundary\r\n"
b"Content-Disposition: form-data; name=\"files\"; filename=\"test_file.pdf\"\r\n"
b"\r\n"
b"file_content\r\n"
b"--boundary\r\n"
b"Content-Disposition: form-data; name=\"parameter_1\"\r\n"
b"\r\n"
b"value_1\r\n"
b"--boundary\r\n"
b"Content-Disposition: form-data; name=\"parameter_2\"\r\n"
b"\r\n"
b"value_2\r\n"
b"--boundary\r\n"
b"Content-Disposition: form-data; name=\"list_parameter\"\r\n"
b"\r\n"
b"value_1\r\n"
b"--boundary\r\n"
b"Content-Disposition: form-data; name=\"list_parameter\"\r\n"
b"\r\n"
b"value_2\r\n"
b"--boundary--\r\n"
)


decoded_data = MultipartDecoder(
b'--boundary\r\nContent-Disposition: form-data; name="files"; filename="test_file.pdf"\r\n\r\nfile_content\r\n--boundary\r\nContent-Disposition: form-data; name="parameter_1"\r\n\r\nvalue_1\r\n--boundary\r\nContent-Disposition: form-data; name="parameter_2"\r\n\r\nvalue_2\r\n--boundary--\r\n',
test_form_data,
"multipart/form-data; boundary=boundary",
)

Expand All @@ -204,6 +235,7 @@ def test_unit_parse_form_data():
"files": shared.Files(b"file_content", "test_file.pdf"),
"parameter_1": "value_1",
"parameter_2": "value_2",
"list_parameter": ["value_1", "value_2"],
}

# Parse form data
Expand All @@ -212,6 +244,7 @@ def test_unit_parse_form_data():
# Assert the parsed form data
assert form_data.get("parameter_1") == expected_form_data.get("parameter_1")
assert form_data.get("parameter_2") == expected_form_data.get("parameter_2")
assert form_data.get("list_parameter") == expected_form_data.get("list_parameter")
assert form_data.get("files").file_name == expected_form_data.get("files").file_name

assert form_data.get("files").content == expected_form_data.get("files").content
Expand Down Expand Up @@ -366,3 +399,36 @@ def test_unit_get_starting_page_number(starting_page_number, expected_result):
fallback_value=DEFAULT_STARTING_PAGE_NUMBER,
)
assert result == expected_result


@pytest.mark.parametrize(
    "page_range, expected_result",
    [
        (["1", "14"], (1, 14)),  # Valid range, start on boundary
        (["4", "16"], (4, 16)),  # Valid range, end on boundary
        (None, (1, 20)),  # Range not specified, defaults to full range
        (["2", "5"], (2, 5)),  # Valid range within boundary
        (["2", "100"], None),  # End page too high
        (["50", "100"], None),  # Range too high
        (["-50", "5"], None),  # Start page too low
        (["-50", "-2"], None),  # Range too low
        (["10", "2"], None),  # Backwards range
        (["foo", "foo"], None),  # Parse error
    ],
)
def test_unit_get_page_range_returns_valid_range(page_range, expected_result):
    """Exercise form_utils.get_page_range with max_pages=20.

    Valid inputs must come back as the expected (start, end) tuple. When a
    ValueError is raised, the case must be one with no expected result, and the
    message must report either an out-of-bounds or an invalid page range.
    """
    form_data = {"split_pdf_page_range[]": page_range}

    try:
        actual_range = form_utils.get_page_range(
            form_data,
            key=PARTITION_FORM_PAGE_RANGE_KEY,
            max_pages=20,
        )
    except ValueError as err:
        assert not expected_result
        message = str(err)
        assert "is out of bounds." in message or "is not a valid page range." in message
    else:
        assert actual_range == expected_result
Loading