Skip to content

Commit 3855197

Browse files
committed
feat: Parameter to send custom page range when splitting pdf
To match the python feature: Unstructured-IO/unstructured-python-client#125 Add a client-side param called `splitPdfPageRange` which takes a list of two integers, `[start, end]`. If `splitPdfPage` is `true` and a range is set, slice the doc from `start` up to and including `end`. Only this page range will be sent to the API. The subset of pages is still split up as needed. If `[start, end]` is out of bounds, throw an error to the user.
1 parent ae9c2e2 commit 3855197

File tree

8 files changed

+163
-10
lines changed

8 files changed

+163
-10
lines changed

overlay_client.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,15 @@ actions:
2121
"description": "Number of maximum concurrent requests made when splitting PDF. Ignored on backend.",
2222
"default": 5,
2323
}
24+
- target: $["components"]["schemas"]["partition_parameters"]["properties"]
25+
update:
26+
"split_pdf_page_range":
27+
{
28+
"type": "array",
29+
"title": "Split Pdf Page Range",
30+
"description": "When `split_pdf_page` is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. An error is thrown if the given range is invalid. It's an internal parameter for the client and is not sent to the backend.",
31+
"items": {"type": "integer"},
32+
"minItems": 2,
33+
"maxItems": 2,
34+
"example": [1, 10],
35+
}

src/hooks/custom/SplitPdfHook.ts

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import {
1515
getOptimalSplitSize,
1616
getSplitPdfConcurrencyLevel,
1717
getStartingPageNumber,
18+
getSplitPdfPageRange,
1819
loadPdf,
1920
prepareRequestBody,
2021
prepareRequestHeaders,
@@ -96,12 +97,16 @@ export class SplitPdfHook
9697
return request;
9798
}
9899

99-
const [error, pdf, pagesCount] = await loadPdf(file);
100+
const [error, pdf, totalPages] = await loadPdf(file);
100101
if (file === null || pdf === null || error) {
101102
console.info("Partitioning without split.")
102103
return request;
103104
}
104105

106+
let [pageRangeStart, pageRangeEnd] = getSplitPdfPageRange(formData, totalPages);
107+
108+
let pagesCount = pageRangeEnd - pageRangeStart + 1;
109+
105110
if (pagesCount < MIN_PAGES_PER_THREAD) {
106111
console.info(
107112
`PDF has less than ${MIN_PAGES_PER_THREAD} pages. Partitioning without split.`
@@ -118,15 +123,17 @@ export class SplitPdfHook
118123
const splitSize = await getOptimalSplitSize(pagesCount, concurrencyLevel);
119124
console.info("Determined optimal split size of %d pages.", splitSize)
120125

121-
if (splitSize >= pagesCount) {
126+
// If the doc is small enough, and we aren't slicing it with a page range:
127+
// do not split, just continue with the original request
128+
if (splitSize >= pagesCount && pagesCount == totalPages) {
122129
console.info(
123130
"Document has too few pages (%d) to be split efficiently. Partitioning without split.",
124131
pagesCount,
125132
)
126133
return request;
127134
}
128135

129-
const splits = await splitPdf(pdf, splitSize);
136+
const splits = await splitPdf(pdf, splitSize, pageRangeStart, pageRangeEnd);
130137
const numberOfSplits = splits.length
131138
console.info(
132139
"Document split into %d, %d-paged sets.",

src/hooks/custom/common.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ export const BASE_PROTOCOL = "https:";
1111
export const PARTITION_FORM_FILES_KEY = "files";
1212
export const PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page";
1313
export const PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number";
14+
export const PARTITION_FORM_SPLIT_PDF_PAGE_RANGE_KEY = "split_pdf_page_range";
1415
export const PARTITION_FORM_SPLIT_PDF_CONCURRENCY_LEVEL =
1516
"split_pdf_concurrency_level";
1617

src/hooks/custom/utils/form.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import {
33
DEFAULT_STARTING_PAGE_NUMBER,
44
MAX_NUMBER_OF_PARALLEL_REQUESTS,
55
PARTITION_FORM_SPLIT_PDF_CONCURRENCY_LEVEL,
6+
PARTITION_FORM_SPLIT_PDF_PAGE_RANGE_KEY,
67
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
78
} from "../common.js";
89

@@ -41,6 +42,31 @@ function getIntegerParameter(
4142
return numberParameter;
4243
}
4344

45+
/**
 * Retrieves and validates the page range to split from FormData.
 *
 * The parameter is read as a comma-separated string (for example `"2,5"`, which
 * is what `String([2, 5])` produces). A missing parameter, or a missing/empty
 * bound, falls back to the start (1) or end (`maxPages`) of the document.
 * A bound that is supplied must be an integer between 1 and `maxPages`; values
 * such as `0`, `1.5` or `"abc"` are rejected rather than silently replaced by
 * the default.
 *
 * @param formData - The FormData object containing the page range parameter.
 * @param maxPages - The number of pages in the document.
 * @returns A tuple containing the validated, 1-based, inclusive start and end page numbers.
 *
 * @throws Will throw an error if a bound is not an integer, is out of bounds,
 * or if the start is greater than the end. The message is also logged via `console.error`.
 */
export function getSplitPdfPageRange(formData: FormData, maxPages: number): [number, number] {
  const formDataParameter = formData.get(PARTITION_FORM_SPLIT_PDF_PAGE_RANGE_KEY);

  // A missing (or non-string) entry means no range was requested.
  const [rawStart, rawEnd] =
    typeof formDataParameter === "string" ? formDataParameter.split(",") : [];

  // Only a genuinely absent bound gets the default. Using `||` here would also
  // swallow an explicit 0 or an unparsable value, hiding invalid input.
  const parseBound = (raw: string | undefined, fallback: number): number => {
    const trimmed = raw?.trim() ?? "";
    return trimmed === "" ? fallback : Number(trimmed);
  };

  const start = parseBound(rawStart, 1);
  const end = parseBound(rawEnd, maxPages);

  // Number.isInteger is false for NaN and fractions, so those fail validation too.
  const isValidPage = (page: number): boolean =>
    Number.isInteger(page) && page >= 1 && page <= maxPages;

  if (!isValidPage(start) || !isValidPage(end) || start > end) {
    const msg = `Page range (${start} to ${end}) is out of bounds. Values should be between 1 and ${maxPages}.`;
    console.error(msg);
    throw new Error(msg);
  }

  return [start, end];
}
69+
4470
/**
4571
* Gets the number of maximum requests that can be made when splitting PDF.
4672
* - The number of maximum requests is determined by the value of the request parameter

src/hooks/custom/utils/pdf.ts

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -66,24 +66,32 @@ export async function getOptimalSplitSize(
6666
*
6767
* @param pdf - The PDF file to extract pages from.
6868
* @param splitSize - The number of pages per split.
69+
* @param [pageRangeStart=1] - The starting page of the range to be split (1-based index). Defaults to the first page of the document.
70+
* @param [pageRangeEnd=pdf.getPageCount()] - The ending page of the range to be split (1-based index). Defaults to the last page of the document.
6971
* @returns A promise that resolves to an array of objects containing Blob files and
7072
* start and end page numbers from the original document.
7173
*/
7274
export async function splitPdf(
7375
pdf: PDFDocument,
74-
splitSize: number
76+
splitSize: number,
77+
pageRangeStart?: number,
78+
pageRangeEnd?: number
7579
): Promise<PdfSplit[]> {
7680
const pdfSplits: PdfSplit[] = [];
77-
const pagesCount = pdf.getPages().length;
81+
82+
const startPage = pageRangeStart || 1;
83+
const endPage = pageRangeEnd || pdf.getPageCount();
84+
const pagesCount = endPage - startPage + 1
85+
7886
const numberOfSplits = Math.ceil(pagesCount / splitSize);
7987

8088
for (let i = 0; i < numberOfSplits; ++i) {
8189
const offset = i * splitSize;
82-
const startPage = offset + 1;
83-
// If it's the last split, take the rest of the pages
84-
const endPage = Math.min(pagesCount, offset + splitSize);
85-
const pdfSplit = await pdfPagesToBlob(pdf, startPage, endPage);
86-
pdfSplits.push({ content: pdfSplit, startPage, endPage });
90+
const splitStartPage = offset + startPage;
91+
const splitEndPage = Math.min(endPage, splitStartPage + splitSize - 1);
92+
93+
const pdfSplit = await pdfPagesToBlob(pdf, splitStartPage, splitEndPage);
94+
pdfSplits.push({ content: pdfSplit, startPage: splitStartPage, endPage: splitEndPage });
8795
}
8896

8997
return pdfSplits;

src/sdk/general.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,12 @@ export class General extends ClientSDK {
174174
if (payload$.partition_parameters.split_pdf_page !== undefined) {
175175
body$.append("split_pdf_page", String(payload$.partition_parameters.split_pdf_page));
176176
}
177+
if (payload$.partition_parameters.split_pdf_page_range !== undefined) {
178+
body$.append(
179+
"split_pdf_page_range",
180+
String(payload$.partition_parameters.split_pdf_page_range)
181+
);
182+
}
177183
if (payload$.partition_parameters.starting_page_number !== undefined) {
178184
body$.append(
179185
"starting_page_number",

src/sdk/models/shared/partitionparameters.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,10 @@ export type PartitionParameters = {
140140
* Should the pdf file be split at client. Ignored on backend.
141141
*/
142142
splitPdfPage?: boolean | undefined;
143+
/**
144+
* When `split_pdf_page` is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. An error is thrown if the given range is invalid. It's an internal parameter for the client and is not sent to the backend.
145+
*/
146+
splitPdfPageRange?: Array<number> | undefined;
143147
/**
144148
* When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27.
145149
*/
@@ -288,6 +292,7 @@ export const PartitionParameters$inboundSchema: z.ZodType<
288292
skip_infer_table_types: z.array(z.string()).optional(),
289293
split_pdf_concurrency_level: z.number().int().default(5),
290294
split_pdf_page: z.boolean().default(true),
295+
split_pdf_page_range: z.array(z.number().int()).optional(),
291296
starting_page_number: z.nullable(z.number().int()).optional(),
292297
strategy: Strategy$inboundSchema.default(Strategy.Auto),
293298
unique_element_ids: z.boolean().default(false),
@@ -313,6 +318,7 @@ export const PartitionParameters$inboundSchema: z.ZodType<
313318
skip_infer_table_types: "skipInferTableTypes",
314319
split_pdf_concurrency_level: "splitPdfConcurrencyLevel",
315320
split_pdf_page: "splitPdfPage",
321+
split_pdf_page_range: "splitPdfPageRange",
316322
starting_page_number: "startingPageNumber",
317323
unique_element_ids: "uniqueElementIds",
318324
xml_keep_tags: "xmlKeepTags",
@@ -344,6 +350,7 @@ export type PartitionParameters$Outbound = {
344350
skip_infer_table_types?: Array<string> | undefined;
345351
split_pdf_concurrency_level: number;
346352
split_pdf_page: boolean;
353+
split_pdf_page_range?: Array<number> | undefined;
347354
starting_page_number?: number | null | undefined;
348355
strategy: string;
349356
unique_element_ids: boolean;
@@ -380,6 +387,7 @@ export const PartitionParameters$outboundSchema: z.ZodType<
380387
skipInferTableTypes: z.array(z.string()).optional(),
381388
splitPdfConcurrencyLevel: z.number().int().default(5),
382389
splitPdfPage: z.boolean().default(true),
390+
splitPdfPageRange: z.array(z.number().int()).optional(),
383391
startingPageNumber: z.nullable(z.number().int()).optional(),
384392
strategy: Strategy$outboundSchema.default(Strategy.Auto),
385393
uniqueElementIds: z.boolean().default(false),
@@ -405,6 +413,7 @@ export const PartitionParameters$outboundSchema: z.ZodType<
405413
skipInferTableTypes: "skip_infer_table_types",
406414
splitPdfConcurrencyLevel: "split_pdf_concurrency_level",
407415
splitPdfPage: "split_pdf_page",
416+
splitPdfPageRange: "split_pdf_page_range",
408417
startingPageNumber: "starting_page_number",
409418
uniqueElementIds: "unique_element_ids",
410419
xmlKeepTags: "xml_keep_tags",

test/integration/SplitPdfHook.test.ts

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,3 +243,87 @@ describe("SplitPdfHook integration tests check splitted file is same as not spli
243243
}).rejects.toThrow(/.*File type None is not supported.*/);
244244
});
245245
});
246+
247+
/**
 * Integration tests for the client-side `splitPdfPageRange` parameter.
 *
 * Each case partitions the same sample PDF with a different page range.
 * Valid ranges must yield elements whose page numbers span exactly the
 * requested range; invalid ranges must fail with an "is out of bounds"
 * message logged through `console.error`.
 *
 * Requires the unstructured-api to be reachable at `localServer`.
 */
describe("SplitPdfHook integration tests page range parameter", () => {
  const FAKE_API_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
  const consoleInfoSpy = jest.spyOn(console, "info");
  const consoleWarnSpy = jest.spyOn(console, "warn");
  const consoleErrorSpy = jest.spyOn(console, "error");

  let client = new UnstructuredClient({
    serverURL: localServer,
    security: {
      apiKeyAuth: FAKE_API_KEY,
    },
  });

  beforeEach(async () => {
    // Fail fast with a readable message when the local API is not running.
    try {
      const res = await fetch(`${localServer}/general/docs`);
      expect(res.status).toEqual(200);
    } catch {
      throw Error(`The unstructured-api is not running on ${localServer}`);
    }

    client = new UnstructuredClient({
      serverURL: localServer,
      security: {
        apiKeyAuth: FAKE_API_KEY,
      },
    });
  });

  afterEach(() => {
    consoleInfoSpy.mockClear();
    consoleWarnSpy.mockClear();
    consoleErrorSpy.mockClear();
  });

  it.each([
    { pageRange: [1, 14], expectedOk: true, expectedPages: [1, 14] }, // Valid range, start on boundary
    { pageRange: [4, 16], expectedOk: true, expectedPages: [4, 16] }, // Valid range, end on boundary
    { pageRange: [2, 5], expectedOk: true, expectedPages: [2, 5] }, // Valid range within boundary
    { pageRange: [6, 6], expectedOk: true, expectedPages: [6, 6] }, // Single page range
    { pageRange: [2, 100], expectedOk: false, expectedPages: null }, // End page too high
    { pageRange: [50, 100], expectedOk: false, expectedPages: null }, // Range too high
    { pageRange: [-50, 5], expectedOk: false, expectedPages: null }, // Start page too low
    { pageRange: [-50, -2], expectedOk: false, expectedPages: null }, // Range too low
    { pageRange: [10, 2], expectedOk: false, expectedPages: null }, // Backwards range
  ])(
    "for page range $pageRange",
    async ({ pageRange, expectedOk, expectedPages }) => {
      const filename = "test/data/layout-parser-paper.pdf";
      const file = { content: readFileSync(filename), fileName: filename };

      const startingPageNumber = 1;

      // Only the request itself is wrapped in try/catch, so that a failing
      // assertion below can never be mistaken for the expected rejection.
      let response;
      try {
        response = await client.general.partition({
          partitionParameters: {
            files: file,
            strategy: Strategy.Fast,
            splitPdfPage: true,
            splitPdfPageRange: pageRange,
          },
        });
      } catch (e) {
        if (expectedOk) {
          throw e;
        }
        expect(consoleErrorSpy).toHaveBeenCalledWith(expect.stringContaining("is out of bounds"));
        return;
      }

      // Reaching this point with an invalid range means validation did not reject it.
      expect(expectedOk).toBe(true);

      // Grab the set of page numbers in the result
      // Assert that all returned elements are in the expected page range
      const pageNumbers = new Set(response?.elements?.map((element: any) => element.metadata.page_number));

      // Parenthesised on purpose: `+` binds tighter than `??`, so without the
      // parentheses the starting-page offset would only apply to the fallback.
      const minPageNumber = (expectedPages?.[0] ?? 0) + startingPageNumber - 1;
      const maxPageNumber = (expectedPages?.[1] ?? 0) + startingPageNumber - 1;

      expect(Math.min(...pageNumbers)).toBe(minPageNumber);
      expect(Math.max(...pageNumbers)).toBe(maxPageNumber);
    },
    300000
  );
});

0 commit comments

Comments
 (0)