Starting page number (#72)

mpolomdeepsense · web-flow · commit ec16a2d11e2c · 2024-05-01T17:47:22.000-04:00
**Only the `test__decorators.py`, `test_split_pdf_hook.py`,
`split_pdf_hook.py` and `overlay_client.yaml` files were modified by
a human. The rest were auto-generated.**

To run the integration tests, first start `unstructured-api` on port 8000.
diff --git a/.speakeasy/gen.lock b/.speakeasy/gen.lock
@@ -1,12 +1,12 @@
 lockVersion: 2.0.0
 id: 8b5fa338-9106-4734-abf0-e30d67044a90
 management:
-  docChecksum: b35264eb5f2ce89c808012333367cf1c
+  docChecksum: 666d45deb8d9066b8e19e04a305ca734
   docVersion: 0.0.1
-  speakeasyVersion: 1.267.1
+  speakeasyVersion: 1.272.0
   generationVersion: 2.312.1
-  releaseVersion: 0.23.2
-  configChecksum: c0ddfa44eb8fbd51d397d36253d1d68f
+  releaseVersion: 0.23.3
+  configChecksum: 7aae705f3e8a728a15a7177fbca343ad
   repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
   repoSubDirectory: .
   installationURL: https://github.com/Unstructured-IO/unstructured-python-client.git
diff --git a/_test_unstructured_client/test__decorators.py b/_test_unstructured_client/test__decorators.py
@@ -84,8 +84,6 @@ def test_integration_split_pdf_has_same_output_as_non_split(
         t2=resp_single.elements,
         exclude_regex_paths=[
             r"root\[\d+\]\['metadata'\]\['parent_id'\]",
-            # TODO: (Marek Połom) - Remove page number pattern after page numbering parameter is added
-            r"root\[\d+\]\['metadata'\]\['page_number'\]",
         ],
     )
     assert len(diff) == 0
diff --git a/_test_unstructured_client/test_split_pdf_hook.py b/_test_unstructured_client/test_split_pdf_hook.py
@@ -7,7 +7,6 @@
 import requests
 from requests_toolbelt import MultipartDecoder, MultipartEncoder
 
-
 from unstructured_client._hooks.custom import SplitPdfHook
 from unstructured_client.models import shared
 
@@ -150,6 +149,7 @@ def test_unit_create_request(self):
             "parameter_1": "value_1",
             "parameter_2": "value_2",
             "split_pdf_page": "false",
+            "starting_page_number": "7",
         }
         expected_page_filename = "test_file.pdf"
         expected_body = MultipartEncoder(
@@ -165,7 +165,7 @@ def test_unit_create_request(self):
         expected_url = ""
 
         # Create request
-        request_obj = hook._create_request(request, form_data, page[0], filename)
+        request_obj = hook._create_request(request, form_data, page[0], filename, 7)
         request_content_type: str = request_obj.headers.get("Content-Type")
         # Assert the request object
         self.assertEqual(request_obj.method, "POST")
@@ -306,3 +306,39 @@ def test_unit_is_pdf_invalid_pdf(self):
 
         self.assertFalse(result)
         self.assertIn("Attempted to interpret file as pdf", cm.output[1])
+
+    def test_unit_get_starting_page_number_valid_integer(self):
+        """Test _get_starting_page_number method with valid integer."""
+        hook = SplitPdfHook()
+        form_data = {"starting_page_number": "5"}
+
+        result = hook._get_starting_page_number(form_data)
+
+        self.assertEqual(result, 5)
+
+    def test_unit_get_starting_page_number_invalid_integer(self):
+        """Test _get_starting_page_number method with invalid integer."""
+        hook = SplitPdfHook()
+        form_data = {"starting_page_number": "abc"}
+
+        result = hook._get_starting_page_number(form_data)
+
+        self.assertEqual(result, 1)
+
+    def test_unit_get_starting_page_number_less_than_one(self):
+        """Test _get_starting_page_number method with value less than 1."""
+        hook = SplitPdfHook()
+        form_data = {"starting_page_number": "0"}
+
+        result = hook._get_starting_page_number(form_data)
+
+        self.assertEqual(result, 1)
+
+    def test_unit_get_starting_page_number_missing_key(self):
+        """Test _get_starting_page_number method with missing key."""
+        hook = SplitPdfHook()
+        form_data = {}
+
+        result = hook._get_starting_page_number(form_data)
+
+        self.assertEqual(result, 1)
diff --git a/docs/models/shared/partitionparameters.md b/docs/models/shared/partitionparameters.md
@@ -25,6 +25,7 @@
 | `pdf_infer_table_structure`                                                                                                                                                                                              | *Optional[bool]*                                                                                                                                                                                                         | :heavy_minus_sign:                                                                                                                                                                                                       | Deprecated! Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents. |                                                                                                                                                                                                                          |
 | `skip_infer_table_types`                                                                                                                                                                                                 | List[*str*]                                                                                                                                                                                                              | :heavy_minus_sign:                                                                                                                                                                                                       | The document types that you want to skip table extraction with. Default: []                                                                                                                                              |                                                                                                                                                                                                                          |
 | `split_pdf_page`                                                                                                                                                                                                         | *Optional[bool]*                                                                                                                                                                                                         | :heavy_minus_sign:                                                                                                                                                                                                       | Should the pdf file be split at client. Ignored on backend.                                                                                                                                                              |                                                                                                                                                                                                                          |
+| `starting_page_number`                                                                                                                                                                                                   | *Optional[int]*                                                                                                                                                                                                          | :heavy_minus_sign:                                                                                                                                                                                                       | The real number of the first PDF page.                                                                                                                                                                                   |                                                                                                                                                                                                                          |
 | `strategy`                                                                                                                                                                                                               | *Optional[str]*                                                                                                                                                                                                          | :heavy_minus_sign:                                                                                                                                                                                                       | The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto                                                                                                                            | hi_res                                                                                                                                                                                                                   |
 | `unique_element_ids`                                                                                                                                                                                                     | *Optional[bool]*                                                                                                                                                                                                         | :heavy_minus_sign:                                                                                                                                                                                                       | When True, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: False                              |                                                                                                                                                                                                                          |
 | `xml_keep_tags`                                                                                                                                                                                                          | *Optional[bool]*                                                                                                                                                                                                         | :heavy_minus_sign:                                                                                                                                                                                                       | If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to partition_xml.                                                                          |                                                                                                                                                                                                                          |
diff --git a/gen.yaml b/gen.yaml
@@ -10,7 +10,7 @@ generation:
   auth:
     oAuth2ClientCredentialsEnabled: false
 python:
-  version: 0.23.2
+  version: 0.23.3
   additionalDependencies:
     dependencies:
       deepdiff: '>=6.0'
diff --git a/overlay_client.yaml b/overlay_client.yaml
@@ -6,3 +6,6 @@ actions:
   - target: $["components"]["schemas"]["partition_parameters"]["properties"]
     update:
       "split_pdf_page": {"type": "boolean", "title": "Split Pdf Page", "description": "Should the pdf file be split at client. Ignored on backend."}
+  - target: $["components"]["schemas"]["partition_parameters"]["properties"]
+    update:
+      "starting_page_number": {"type": "integer", "title": "Starting Page Number", "description": "The real number of the first PDF page."}
diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
 
 setuptools.setup(
     name='unstructured-client',
-    version='0.23.2',
+    version='0.23.3',
     author='Unstructured',
     description='Python Client SDK for Unstructured API',
     license = 'MIT',
diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -16,7 +16,6 @@
 from pypdf import PdfReader, PdfWriter
 from pypdf.errors import PdfReadError
 
-
 from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
 from unstructured_client._hooks.types import (
     BeforeRequestContext,
@@ -33,6 +32,10 @@
 
 PARTITION_FORM_FILES_KEY = "files"
 PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page"
+PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number"
+
+DEFAULT_STARTING_PAGE_NUMBER = 1
+
 
 FormData = dict[str, Union[str, shared.Files]]
 
@@ -85,6 +88,10 @@ def before_request(
             Union[requests.PreparedRequest, Exception]: If `splitPdfPage` is set to `true`,
             the last page request; otherwise, the original request.
         """
+        if self.client is None:
+            logger.warning("HTTP client not accessible! Continuing without splitting.")
+            return request
+
         operation_id = hook_ctx.operation_id
         content_type = request.headers.get("Content-Type")
         body = request.body
@@ -101,9 +108,7 @@ def before_request(
         if file is None or not isinstance(file, shared.Files) or not self._is_pdf(file):
             return request
 
-        if self.client is None:
-            logger.warning("HTTP client not accessible! Continuing without splitting.")
-            return request
+        starting_page_number = self._get_starting_page_number(form_data)
 
         pages = self._get_pdf_pages(file.content)
         call_api_partial = functools.partial(
@@ -115,11 +120,14 @@ def before_request(
         call_threads = self._get_split_pdf_call_threads()
         self.partition_requests[operation_id] = []
         last_page_content = io.BytesIO()
+        last_page_number = 0
         with ThreadPoolExecutor(max_workers=call_threads) as executor:
-            for page_content, page_number, all_pages_number in pages:
-                # Check if the next page will be the last one
-                if page_number == all_pages_number:
+            for page_content, page_index, all_pages_number in pages:
+                page_number = page_index + starting_page_number
+                # Check if this page is the last one
+                if page_index == all_pages_number - 1:
                     last_page_content = page_content
+                    last_page_number = page_number
                     break
                 self.partition_requests[operation_id].append(
                     executor.submit(call_api_partial, (page_content, page_number))
@@ -128,7 +136,7 @@ def before_request(
         # `before_request` method needs to return a request so we skip sending the last page in parallel
         # and return that last page at the end of this method
         last_page_request = self._create_request(
-            request, form_data, last_page_content, file.file_name
+            request, form_data, last_page_content, file.file_name, last_page_number
         )
         last_page_prepared_request = self.client.prepare_request(last_page_request)
         return last_page_prepared_request
@@ -217,7 +225,9 @@ def _is_pdf(self, file: shared.Files) -> bool:
             bool: True if the file is a PDF, False otherwise.
         """
         if not file.file_name.endswith(".pdf"):
-            logger.warning("Given file doesn't have '.pdf' extension. Continuing without splitting.")
+            logger.warning(
+                "Given file doesn't have '.pdf' extension. Continuing without splitting."
+            )
             return False
 
         try:
@@ -267,8 +277,7 @@ def _get_pdf_pages(
             new_pdf.write(pdf_buffer)
             pdf_buffer.seek(0)
 
-            # 1-index the page numbers
-            yield pdf_buffer, offset + 1, offset_end
+            yield pdf_buffer, offset, offset_end
             offset += split_size
 
     def _parse_form_data(self, decoded_data: MultipartDecoder) -> FormData:
@@ -349,7 +358,9 @@ def _call_api(
             raise RuntimeError("HTTP client not accessible!")
         page_content, page_number = page
 
-        new_request = self._create_request(request, form_data, page_content, filename)
+        new_request = self._create_request(
+            request, form_data, page_content, filename, page_number
+        )
         prepared_request = self.client.prepare_request(new_request)
 
         try:
@@ -364,6 +375,7 @@ def _create_request(
         form_data: FormData,
         page_content: io.BytesIO,
         filename: str,
+        page_number: int,
     ) -> requests.Request:
         """
         Creates a request object for a part of a splitted PDF file.
@@ -373,6 +385,7 @@ def _create_request(
             form_data (FormData): The form data for the request.
             page_content (io.BytesIO): Page content in bytes.
             filename (str): The original filename of the PDF file.
+            page_number (int): Number of the page in the original PDF file.
 
         Returns:
             requests.Request: The request object for a splitted part of the
@@ -388,6 +401,7 @@ def _create_request(
                     page_content,
                     "application/pdf",
                 ),
+                PARTITION_FORM_STARTING_PAGE_NUMBER_KEY: str(page_number),
             }
         )
         return requests.Request(
@@ -429,7 +443,11 @@ def _prepare_request_payload(self, form_data: FormData) -> FormData:
         payload = copy.deepcopy(form_data)
         payload.pop(PARTITION_FORM_SPLIT_PDF_PAGE_KEY, None)
         payload.pop(PARTITION_FORM_FILES_KEY, None)
-        payload.update({PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false"})
+        payload.pop(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, None)
+        updated_parameters = {
+            PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false",
+        }
+        payload.update(updated_parameters)
         return payload
 
     def _create_response(
@@ -527,3 +545,39 @@ def _clear_operation(self, operation_id: str) -> None:
         """
         self.partition_responses.pop(operation_id, None)
         self.partition_requests.pop(operation_id, None)
+
+    def _get_starting_page_number(self, form_data: FormData) -> int:
+        """
+        Retrieves the starting page number from the given form data. In case given
+        starting page number is not a valid integer or less than 1, it will use the
+        default value.
+
+        Args:
+            form_data (FormData): The form data containing the starting page number.
+
+        Returns:
+            int: The starting page number.
+        """
+        starting_page_number = DEFAULT_STARTING_PAGE_NUMBER
+        try:
+            _starting_page_number = (
+                form_data.get(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY)
+                or DEFAULT_STARTING_PAGE_NUMBER
+            )
+            starting_page_number = int(_starting_page_number)  # type: ignore
+        except ValueError:
+            logger.warning(
+                "'%s' is not a valid integer. Using default value '%d'.",
+                PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
+                DEFAULT_STARTING_PAGE_NUMBER,
+            )
+
+        if starting_page_number < 1:
+            logger.warning(
+                "'%s' is less than 1. Using default value '%d'.",
+                PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
+                DEFAULT_STARTING_PAGE_NUMBER,
+            )
+            starting_page_number = DEFAULT_STARTING_PAGE_NUMBER
+
+        return starting_page_number
diff --git a/src/unstructured_client/models/shared/partition_parameters.py b/src/unstructured_client/models/shared/partition_parameters.py
diff --git a/src/unstructured_client/sdkconfiguration.py b/src/unstructured_client/sdkconfiguration.py

Original file line number	Diff line number	Diff line change
`@@ -84,8 +84,6 @@ def test_integration_split_pdf_has_same_output_as_non_split(`
`84`	`84`	`t2=resp_single.elements,`
`85`	`85`	`exclude_regex_paths=[`
`86`	`86`	`r"root\[\d+\]\['metadata'\]\['parent_id'\]",`
`87`		`- # TODO: (Marek Połom) - Remove page number pattern after page numbering parameter is added`
`88`		`- r"root\[\d+\]\['metadata'\]\['page_number'\]",`
`89`	`87`	`],`
`90`	`88`	`)`
`91`	`89`	`assert len(diff) == 0`