Skip to content

Commit 3c4a608

Browse files
authored
feat: Parameter to send custom page range when splitting pdf (#101)
To match the Python feature: Unstructured-IO/unstructured-python-client#125

# New parameter

Add a client-side param called `splitPdfPageRange` which takes a list of two integers, `[start, end]`. If `splitPdfPage` is `true` and a range is set, slice the doc from `start` up to and including `end`. Only this page range will be sent to the API. The subset of pages is still split up as needed. If `[start, end]` is out of bounds, throw an error to the user.

# Testing

Check out this branch and set up a request to your local API:

```
const client = new UnstructuredClient({
    serverURL: "http://localhost:8000",
    security: {
        apiKeyAuth: key,
    },
});

const filename = "layout-parser-paper.pdf";
const data = fs.readFileSync(filename);

client.general.partition({
    partitionParameters: {
        files: {
            content: data,
            fileName: filename,
        },
        strategy: Strategy.Fast,
        splitPdfPage: true,
        splitPdfPageRange: [4, 8],
    }
}).then((res: PartitionResponse) => {
    if (res.statusCode == 200) {
        console.log(res.elements);
    }
}).catch((e) => {
    if (e.statusCode) {
        console.log(e.statusCode);
        console.log(e.body);
    } else {
        console.log(e);
    }
});
```

Test out various page ranges and confirm that the returned elements are within the range. Invalid ranges should throw a useful Error (pages are out of bounds, or end_page < start_page).
1 parent 0fdd5e9 commit 3c4a608

File tree

10 files changed

+245
-98
lines changed

10 files changed

+245
-98
lines changed

docs/sdk/models/shared/partitionparameters.md

Lines changed: 31 additions & 31 deletions
Large diffs are not rendered by default.

overlay_client.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,18 @@ actions:
2121
"description": "Number of maximum concurrent requests made when splitting PDF. Ignored on backend.",
2222
"default": 5,
2323
}
24+
- target: $["components"]["schemas"]["partition_parameters"]["properties"]
25+
update:
26+
"split_pdf_page_range":
27+
{
28+
"type": "array",
29+
"title": "Split Pdf Page Range",
30+
"description": "When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. An Error is thrown if the given range is invalid. Ignored on backend.",
31+
"items": {"type": "integer"},
32+
"minItems": 2,
33+
"maxItems": 2,
34+
"example": [1, 10],
35+
}
2436
- target: $["components"]["schemas"]["partition_parameters"]["properties"]
2537
update:
2638
"split_pdf_allow_failed":

src/hooks/custom/SplitPdfHook.ts

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import {
1515
getOptimalSplitSize, getSplitPdfAllowFailed,
1616
getSplitPdfConcurrencyLevel,
1717
getStartingPageNumber,
18+
getSplitPdfPageRange,
1819
loadPdf,
1920
prepareRequestBody,
2021
prepareRequestHeaders,
@@ -107,18 +108,14 @@ export class SplitPdfHook
107108
return request;
108109
}
109110

110-
const [error, pdf, pagesCount] = await loadPdf(file);
111+
const [error, pdf, totalPages] = await loadPdf(file);
111112
if (file === null || pdf === null || error) {
112113
console.info("Partitioning without split.")
113114
return request;
114115
}
115116

116-
if (pagesCount < MIN_PAGES_PER_THREAD) {
117-
console.info(
118-
`PDF has less than ${MIN_PAGES_PER_THREAD} pages. Partitioning without split.`
119-
);
120-
return request;
121-
}
117+
const [pageRangeStart, pageRangeEnd] = getSplitPdfPageRange(formData, totalPages);
118+
const pagesCount = pageRangeEnd - pageRangeStart + 1;
122119

123120
const startingPageNumber = getStartingPageNumber(formData);
124121
console.info("Starting page number set to %d", startingPageNumber);
@@ -132,15 +129,22 @@ export class SplitPdfHook
132129
const splitSize = await getOptimalSplitSize(pagesCount, concurrencyLevel);
133130
console.info("Determined optimal split size of %d pages.", splitSize)
134131

135-
if (splitSize >= pagesCount) {
136-
console.info(
132+
// If user wants a specific page range, we need to call splitPdf,
133+
// even if this page count is too small to be split normally
134+
const isPageRangeRequested = pagesCount < totalPages;
135+
136+
// Otherwise, if there are not enough pages, return the original request without splitting
137+
if (!isPageRangeRequested) {
138+
if (splitSize >= pagesCount || pagesCount < MIN_PAGES_PER_THREAD) {
139+
console.info(
137140
"Document has too few pages (%d) to be split efficiently. Partitioning without split.",
138141
pagesCount,
139-
)
140-
return request;
142+
)
143+
return request;
144+
}
141145
}
142146

143-
const splits = await splitPdf(pdf, splitSize);
147+
const splits = await splitPdf(pdf, splitSize, pageRangeStart, pageRangeEnd);
144148
const numberOfSplits = splits.length
145149
console.info(
146150
"Document split into %d, %d-paged sets.",

src/hooks/custom/common.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ export const PARTITION_FORM_FILES_KEY = "files";
1212
export const PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page";
1313
export const PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY = "split_pdf_allow_failed";
1414
export const PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number";
15+
export const PARTITION_FORM_SPLIT_PDF_PAGE_RANGE_KEY = "split_pdf_page_range";
1516
export const PARTITION_FORM_SPLIT_PDF_CONCURRENCY_LEVEL =
1617
"split_pdf_concurrency_level";
1718

src/hooks/custom/utils/form.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import {
33
DEFAULT_STARTING_PAGE_NUMBER,
44
MAX_NUMBER_OF_PARALLEL_REQUESTS, PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
55
PARTITION_FORM_SPLIT_PDF_CONCURRENCY_LEVEL,
6+
PARTITION_FORM_SPLIT_PDF_PAGE_RANGE_KEY,
67
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
78
} from "../common.js";
89

@@ -78,6 +79,31 @@ function getBooleanParameter(
7879
return booleanParameter;
7980
}
8081

82+
/**
83+
* Retrieves and validates a page range from FormData, ensuring that the start and end values are defined and within bounds.
84+
*
85+
* @param formData - The FormData object containing the page range parameter.
86+
* @param maxPages - The maximum number of pages in the document.
87+
* @returns {[number, number]} - A tuple containing the validated start and end page numbers.
88+
*
89+
* @throws Will throw an error if the page range is invalid or out of bounds.
90+
*/
91+
export function getSplitPdfPageRange(formData: FormData, maxPages: number): [number, number] {
92+
const formDataParameter = formData.get(PARTITION_FORM_SPLIT_PDF_PAGE_RANGE_KEY);
93+
const pageRange = String(formDataParameter).split(",").map(Number)
94+
95+
let start = pageRange[0] || 1;
96+
let end = pageRange[1] || maxPages;
97+
98+
if (!(start > 0 && start <= maxPages) || !(end > 0 && end <= maxPages) || !(start <= end)) {
99+
const msg = `Page range (${start} to ${end}) is out of bounds. Values should be between 1 and ${maxPages}.`;
100+
console.error(msg);
101+
throw new Error(msg);
102+
}
103+
104+
return [start, end];
105+
}
106+
81107
/**
82108
* Gets the number of maximum requests that can be made when splitting PDF.
83109
* - The number of maximum requests is determined by the value of the request parameter

src/hooks/custom/utils/pdf.ts

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -66,24 +66,32 @@ export async function getOptimalSplitSize(
6666
*
6767
* @param pdf - The PDF file to extract pages from.
6868
* @param splitSize - The number of pages per split.
69+
* @param [pageRangeStart=1] - The starting page of the range to be split (1-based index). Defaults to the first page of the document.
70+
* @param [pageRangeEnd=pdf.getPageCount()] - The ending page of the range to be split (1-based index). Defaults to the last page of the document.
6971
* @returns A promise that resolves to an array of objects containing Blob files and
7072
* start and end page numbers from the original document.
7173
*/
7274
export async function splitPdf(
7375
pdf: PDFDocument,
74-
splitSize: number
76+
splitSize: number,
77+
pageRangeStart?: number,
78+
pageRangeEnd?: number
7579
): Promise<PdfSplit[]> {
7680
const pdfSplits: PdfSplit[] = [];
77-
const pagesCount = pdf.getPages().length;
81+
82+
const startPage = pageRangeStart || 1;
83+
const endPage = pageRangeEnd || pdf.getPageCount();
84+
const pagesCount = endPage - startPage + 1
85+
7886
const numberOfSplits = Math.ceil(pagesCount / splitSize);
7987

8088
for (let i = 0; i < numberOfSplits; ++i) {
8189
const offset = i * splitSize;
82-
const startPage = offset + 1;
83-
// If it's the last split, take the rest of the pages
84-
const endPage = Math.min(pagesCount, offset + splitSize);
85-
const pdfSplit = await pdfPagesToBlob(pdf, startPage, endPage);
86-
pdfSplits.push({ content: pdfSplit, startPage, endPage });
90+
const splitStartPage = offset + startPage;
91+
const splitEndPage = Math.min(endPage, splitStartPage + splitSize - 1);
92+
93+
const pdfSplit = await pdfPagesToBlob(pdf, splitStartPage, splitEndPage);
94+
pdfSplits.push({ content: pdfSplit, startPage: splitStartPage, endPage: splitEndPage });
8795
}
8896

8997
return pdfSplits;

src/sdk/general.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,12 @@ export class General extends ClientSDK {
161161
if (payload$.partition_parameters.split_pdf_page !== undefined) {
162162
body$.append("split_pdf_page", String(payload$.partition_parameters.split_pdf_page));
163163
}
164+
if (payload$.partition_parameters.split_pdf_page_range !== undefined) {
165+
body$.append(
166+
"split_pdf_page_range",
167+
String(payload$.partition_parameters.split_pdf_page_range)
168+
);
169+
}
164170
if (payload$.partition_parameters.starting_page_number !== undefined) {
165171
body$.append(
166172
"starting_page_number",

src/sdk/models/shared/partitionparameters.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,10 @@ export type PartitionParameters = {
148148
* Should the pdf file be split at client. Ignored on backend.
149149
*/
150150
splitPdfPage?: boolean | undefined;
151+
/**
152+
* When `split_pdf_page` is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. An Error is thrown if the given range is invalid. Ignored on backend.
153+
*/
154+
splitPdfPageRange?: Array<number> | undefined;
151155
/**
152156
* When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27.
153157
*/
@@ -298,6 +302,7 @@ export const PartitionParameters$inboundSchema: z.ZodType<
298302
split_pdf_allow_failed: z.boolean().default(false),
299303
split_pdf_concurrency_level: z.number().int().default(5),
300304
split_pdf_page: z.boolean().default(true),
305+
split_pdf_page_range: z.array(z.number().int()).optional(),
301306
starting_page_number: z.nullable(z.number().int()).optional(),
302307
strategy: Strategy$inboundSchema.default(Strategy.Auto),
303308
unique_element_ids: z.boolean().default(false),
@@ -325,6 +330,7 @@ export const PartitionParameters$inboundSchema: z.ZodType<
325330
split_pdf_allow_failed: "splitPdfAllowFailed",
326331
split_pdf_concurrency_level: "splitPdfConcurrencyLevel",
327332
split_pdf_page: "splitPdfPage",
333+
split_pdf_page_range: "splitPdfPageRange",
328334
starting_page_number: "startingPageNumber",
329335
unique_element_ids: "uniqueElementIds",
330336
xml_keep_tags: "xmlKeepTags",
@@ -358,6 +364,7 @@ export type PartitionParameters$Outbound = {
358364
split_pdf_allow_failed: boolean;
359365
split_pdf_concurrency_level: number;
360366
split_pdf_page: boolean;
367+
split_pdf_page_range?: Array<number> | undefined;
361368
starting_page_number?: number | null | undefined;
362369
strategy: string;
363370
unique_element_ids: boolean;
@@ -396,6 +403,7 @@ export const PartitionParameters$outboundSchema: z.ZodType<
396403
splitPdfAllowFailed: z.boolean().default(false),
397404
splitPdfConcurrencyLevel: z.number().int().default(5),
398405
splitPdfPage: z.boolean().default(true),
406+
splitPdfPageRange: z.array(z.number().int()).optional(),
399407
startingPageNumber: z.nullable(z.number().int()).optional(),
400408
strategy: Strategy$outboundSchema.default(Strategy.Auto),
401409
uniqueElementIds: z.boolean().default(false),
@@ -423,6 +431,7 @@ export const PartitionParameters$outboundSchema: z.ZodType<
423431
splitPdfAllowFailed: "split_pdf_allow_failed",
424432
splitPdfConcurrencyLevel: "split_pdf_concurrency_level",
425433
splitPdfPage: "split_pdf_page",
434+
splitPdfPageRange: "split_pdf_page_range",
426435
startingPageNumber: "starting_page_number",
427436
uniqueElementIds: "unique_element_ids",
428437
xmlKeepTags: "xml_keep_tags",

test/integration/HttpsCheckHook.test.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@ import { UnstructuredClient } from "../../src";
44
import { PartitionResponse } from "../../src/sdk/models/operations";
55
import { PartitionParameters, Strategy } from "../../src/sdk/models/shared";
66

7+
const localServer = "http://localhost:8000"
8+
79
describe("HttpsCheckHook integration tests", () => {
810
const FAKE_API_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
911

1012
it.each([
11-
"http://localhost:8000",
12-
"http://localhost:8000/general/v0/general",
13+
localServer,
14+
`${localServer}/general/v0/general`,
1315
])("should throw error when given filename is empty", async (serverURL) => {
1416
const client = new UnstructuredClient({
1517
serverURL: serverURL,

0 commit comments

Comments
 (0)