Add support for page ranges in pdf split hook

awalker4 · awalker4 · commit b4419021a51c · 2024-07-10T11:24:05.000-04:00
diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py
@@ -110,3 +110,57 @@ def test_integration_split_pdf_for_file_with_no_name():
     )
 
     pytest.raises(ValueError, client.general.partition, req)
+
+
+@pytest.mark.parametrize("starting_page_number", [1, 100])
+@pytest.mark.parametrize(
+    "page_range, expected_pages",
+    [
+        (["2", "5"], (2, 5)),  # Valid range
+        (["2", "100"], (2, 16)),  # End too high
+        (["-50", "5"], (1, 5)),  # Start too low
+    ],
+)
+def test_integration_split_pdf_with_page_range(
+    starting_page_number: int,
+    page_range: list[str],
+    expected_pages: tuple[int, int],
+):
+    """
+    Split a pdf over an arbitrary page range and check the min/max metadata page numbers.
+    Out-of-range bounds should be clamped; starting_page_number offsets the page numbers.
+
+    Requires unstructured-api running in bg. See Makefile for how to run it.
+    """
+    try:
+        response = requests.get("http://localhost:8000/general/docs")
+        assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"
+    except requests.exceptions.ConnectionError:
+        pytest.fail("The unstructured-api is not running on localhost:8000")
+
+    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")
+
+    filename = "_sample_docs/layout-parser-paper.pdf"
+    with open(filename, "rb") as f:
+        files = shared.Files(
+            content=f.read(),
+            file_name=filename,
+        )
+
+    req = shared.PartitionParameters(
+        files=files,
+        strategy="fast",
+        split_pdf_page=True,
+        split_pdf_page_range=page_range,
+        starting_page_number=starting_page_number,
+    )
+
+    resp = client.general.partition(req)
+
+    page_numbers = {e["metadata"]["page_number"] for e in resp.elements}
+
+    min_page_number = expected_pages[0] + starting_page_number - 1
+    max_page_number = expected_pages[1] + starting_page_number - 1
+
+    assert min(page_numbers) == min_page_number
+    assert max(page_numbers) == max_page_number
diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py
@@ -1,6 +1,6 @@
 import io
 import logging
-from typing import Generator, Tuple
+from typing import Generator, Tuple, Optional
 
 from pypdf import PdfReader, PdfWriter
 from pypdf.errors import PdfReadError
@@ -12,7 +12,7 @@
 
 
 def get_pdf_pages(
-    pdf: PdfReader, split_size: int = 1
+    pdf: PdfReader, split_size: int = 1, page_start: int = 1, page_end: Optional[int] = None
 ) -> Generator[Tuple[io.BytesIO, int, int], None, None]:
     """Reads given bytes of a pdf file and split it into n file-like objects, each
     with `split_size` pages.
@@ -22,13 +22,15 @@ def get_pdf_pages(
         split_size: Split size, e.g. if the given file has 10 pages
             and this value is set to 2 it will yield 5 documents, each containing 2 pages
             of the original document. By default it will split each page to a separate file.
+        page_start: Begin splitting at this page number (1-indexed)
+        page_end: If provided, split up to and including this page number (1-indexed)
 
     Yields:
         The file contents with their page number and overall pages number of the original document.
     """
 
-    offset = 0
-    offset_end = len(pdf.pages)
+    offset = page_start - 1
+    offset_end = page_end or len(pdf.pages)
 
     while offset < offset_end:
         new_pdf = PdfWriter()
diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py
@@ -16,6 +16,7 @@
 from unstructured_client._hooks.custom.form_utils import (
     PARTITION_FORM_FILES_KEY,
     PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
+    PARTITION_FORM_PAGE_RANGE_KEY,
     PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
     FormData,
 )
@@ -145,6 +146,7 @@ def prepare_request_payload(form_data: FormData) -> FormData:
     payload = copy.deepcopy(form_data)
     payload.pop(PARTITION_FORM_SPLIT_PDF_PAGE_KEY, None)
     payload.pop(PARTITION_FORM_FILES_KEY, None)
+    payload.pop(PARTITION_FORM_PAGE_RANGE_KEY, None)
     payload.pop(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, None)
     updated_parameters = {
         PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false",
diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -18,6 +18,7 @@
 from unstructured_client._hooks.custom.form_utils import (
     PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
     PARTITION_FORM_FILES_KEY,
+    PARTITION_FORM_PAGE_RANGE_KEY,
     PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
     PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
 )
@@ -143,7 +144,10 @@ def before_request(
             key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
             fallback_value=DEFAULT_STARTING_PAGE_NUMBER,
         )
-        logger.info("Starting page number set to %d", starting_page_number)
+
+        if starting_page_number > 1:
+            logger.info("Starting page number set to %d", starting_page_number)
+
         concurrency_level = form_utils.get_split_pdf_concurrency_level_param(
             form_data,
             key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
@@ -154,27 +158,34 @@ def before_request(
         limiter = asyncio.Semaphore(concurrency_level)
 
         pdf = PdfReader(io.BytesIO(file.content))
+
+        page_range_start, page_range_end = form_utils.get_page_range(
+            form_data,
+            key=PARTITION_FORM_PAGE_RANGE_KEY,
+            max_pages=len(pdf.pages),
+        )
+
+        page_count = min(len(pdf.pages), page_range_end - page_range_start + 1)
+        logger.info(f"Splitting pages {page_range_start} to {page_range_end} ({page_count} total)")
+
         split_size = get_optimal_split_size(
-            num_pages=len(pdf.pages), concurrency_level=concurrency_level
+            num_pages=page_count, concurrency_level=concurrency_level
         )
         logger.info("Determined optimal split size of %d pages.", split_size)
 
-        if split_size >= len(pdf.pages):
+        # If the doc is small enough, and we aren't slicing it with a page range:
+        # do not split, just continue with the original request
+        if split_size >= page_count and page_count == len(pdf.pages):
             logger.info(
                 "Document has too few pages (%d) to be split efficiently. Partitioning without split.",
-                len(pdf.pages),
+                page_count,
             )
             return request
 
-        pages = pdf_utils.get_pdf_pages(pdf, split_size)
-        logger.info(
-            "Document split into %d, %d-paged sets.",
-            math.ceil(len(pdf.pages) / split_size),
-            split_size,
-        )
+        pages = pdf_utils.get_pdf_pages(pdf, split_size=split_size, page_start=page_range_start, page_end=page_range_end)
         logger.info(
             "Partitioning %d, %d-paged sets.",
-            math.ceil(len(pdf.pages) / split_size),
+            math.ceil(page_count / split_size),
             split_size,
         )
 
@@ -209,7 +220,7 @@ async def call_api_partial(page):
                 "Partitioning set #%d (pages %d-%d).",
                 set_index,
                 page_number,
-                min(page_number + split_size, all_pages_number),
+                min(page_number + split_size - 1, all_pages_number),
             )
             # Check if this set of pages is the last one
             if page_index + split_size >= all_pages_number: