Skip to content

Commit cdaa42c

Browse files
feat: added allow_failed parameter (#93)
This PR adds the `allowFailed` parameter, which controls strict mode when multiple page chunks are sent while `splitPdf` mode is used.
1 parent f12b10c commit cdaa42c

File tree

7 files changed

+261
-61
lines changed

7 files changed

+261
-61
lines changed

docs/sdk/models/shared/partitionparameters.md

Lines changed: 30 additions & 29 deletions
Large diffs are not rendered by default.

overlay_client.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,12 @@ actions:
2121
"description": "Number of maximum concurrent requests made when splitting PDF. Ignored on backend.",
2222
"default": 5,
2323
}
24+
- target: $["components"]["schemas"]["partition_parameters"]["properties"]
25+
update:
26+
"split_pdf_allow_failed":
27+
{
28+
"title": "Split Pdf Allow Failed",
29+
"description": "When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request sent to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resulting list of Elements will miss the data from errored pages.",
30+
"type": "boolean",
31+
"default": false,
32+
}

src/hooks/custom/SplitPdfHook.ts

Lines changed: 101 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import {
1212
SDKInitOptions,
1313
} from "../types.js";
1414
import {
15-
getOptimalSplitSize,
15+
getOptimalSplitSize, getSplitPdfAllowFailed,
1616
getSplitPdfConcurrencyLevel,
1717
getStartingPageNumber,
1818
loadPdf,
@@ -40,10 +40,21 @@ export class SplitPdfHook
4040
*/
4141
client: HTTPClient | undefined;
4242

43+
44+
/**
45+
* Keeps the strict-mode setting for splitPdfPage feature.
46+
*/
47+
allowFailed: boolean | undefined;
48+
4349
/**
4450
* Maps each client operation to its list of successful responses.
4551
*/
46-
partitionResponses: Record<string, Response[]> = {};
52+
partitionSuccessfulResponses: Record<string, Response[]> = {};
53+
54+
/**
55+
* Maps each client operation to its list of failed responses.
56+
*/
57+
partitionFailedResponses: Record<string, Response[]> = {};
4758

4859
/**
4960
* Maps parallel requests to client operation.
@@ -115,6 +126,9 @@ export class SplitPdfHook
115126
const concurrencyLevel = getSplitPdfConcurrencyLevel(formData);
116127
console.info("Concurrency level set to %d", concurrencyLevel)
117128

129+
this.allowFailed = getSplitPdfAllowFailed(formData);
130+
console.info("Allow failed set to %s", this.allowFailed)
131+
118132
const splitSize = await getOptimalSplitSize(pagesCount, concurrencyLevel);
119133
console.info("Determined optimal split size of %d pages.", splitSize)
120134

@@ -168,19 +182,30 @@ export class SplitPdfHook
168182
setIndex+=1;
169183
}
170184

171-
this.partitionResponses[operationID] = new Array(requests.length);
185+
this.partitionSuccessfulResponses[operationID] = new Array(requests.length);
186+
187+
const allowFailed = this.allowFailed;
172188

173189
this.partitionRequests[operationID] = async.parallelLimit(
174190
requests.slice(0, -1).map((req, pageIndex) => async () => {
175191
const pageNumber = pageIndex + startingPageNumber;
176192
try {
177193
const response = await this.client!.request(req);
178194
if (response.status === 200) {
179-
(this.partitionResponses[operationID] as Response[])[pageIndex] =
195+
(this.partitionSuccessfulResponses[operationID] as Response[])[pageIndex] =
196+
response.clone();
197+
} else {
198+
(this.partitionFailedResponses[operationID] as Response[])[pageIndex] =
180199
response.clone();
200+
if (!allowFailed) {
201+
throw new Error(`Failed to send request for page ${pageNumber}.`);
202+
}
181203
}
182204
} catch (e) {
183205
console.error(`Failed to send request for page ${pageNumber}.`);
206+
if (!allowFailed) {
207+
throw e;
208+
}
184209
}
185210
}),
186211
concurrencyLevel
@@ -189,6 +214,54 @@ export class SplitPdfHook
189214
return requests.at(-1) as Request;
190215
}
191216

/**
 * Forms the final response object based on the successful and failed responses.
 *
 * - In strict mode (`allowFailed` is falsy) with at least one failed response, the
 *   body, status and status text of the first failed response are returned. If that
 *   entry is missing, a generic 503 "Unknown error" response is produced.
 * - Otherwise the successful responses followed by `response` are passed to
 *   `prepareResponseBody`, and the status and status text of `response` are used.
 *
 * In both cases the headers are produced by `prepareResponseHeaders(response)`.
 *
 * @param response - The response object returned from the API request.
 *                   Expected to be a successful response.
 * @param successfulResponses - The list of successful responses.
 * @param failedResponses - The list of failed responses.
 * @returns The final response object.
 */
async formFinalResponse(
  response: Response,
  successfulResponses: Response[],
  failedResponses: Response[]
): Promise<Response> {
  let responseBody, responseStatus, responseStatusText;
  const numFailedResponses = failedResponses?.length ?? 0;
  const headers = prepareResponseHeaders(response);

  if (!this.allowFailed && numFailedResponses > 0) {
    // Clone so that reading the body does not consume the stored response.
    const failedResponse = failedResponses[0]?.clone();
    if (failedResponse) {
      responseBody = await failedResponse.text();
      responseStatus = failedResponse.status;
      responseStatusText = failedResponse.statusText;
    } else {
      // The key previously contained a stray colon ("details:").
      responseBody = JSON.stringify({ details: "Unknown error" });
      responseStatus = 503;
      responseStatusText = "Unknown error";
    }
    console.warn(
      `${numFailedResponses} requests failed. The partition operation is cancelled.`
    );
  } else {
    responseBody = await prepareResponseBody([...successfulResponses, response]);
    responseStatus = response.status;
    responseStatusText = response.statusText;
    if (numFailedResponses > 0) {
      console.warn(
        `${numFailedResponses} requests failed. The results might miss some pages.`
      );
    }
  }

  return new Response(responseBody, {
    headers: headers,
    status: responseStatus,
    statusText: responseStatusText,
  });
}
264+
192265
/**
193266
* Executes after a successful API request. Awaits all parallel requests and combines
194267
* the responses into a single response object.
@@ -203,22 +276,19 @@ export class SplitPdfHook
203276
): Promise<Response> {
204277
const { operationID } = hookCtx;
205278
const responses = await this.awaitAllRequests(operationID);
206-
207-
if (!responses) {
279+
const successfulResponses = responses?.get("success") ?? [];
280+
const failedResponses = responses?.get("failed") ?? [];
281+
if (!successfulResponses) {
208282
return response;
209283
}
210284

211-
const headers = prepareResponseHeaders(response);
212-
const body = await prepareResponseBody([...responses, response]);
285+
const finalResponse = await this.formFinalResponse(response, successfulResponses, failedResponses);
213286

214287
this.clearOperation(operationID);
215288

216-
return new Response(body, {
217-
headers: headers,
218-
status: response.status,
219-
statusText: response.statusText,
220-
});
221-
}
289+
return finalResponse;
290+
}
291+
222292

223293
/**
224294
* Executes after an unsuccessful API request. Awaits all parallel requests, if at least one
@@ -237,21 +307,19 @@ export class SplitPdfHook
237307
): Promise<{ response: Response | null; error: unknown }> {
238308
const { operationID } = hookCtx;
239309
const responses = await this.awaitAllRequests(operationID);
240-
241-
if (!responses?.length) {
310+
const successfulResponses = responses?.get("success") ?? [];
311+
const failedResponses = responses?.get("failed") ?? [];
312+
if (!successfulResponses?.length) {
242313
this.clearOperation(operationID);
243314
return { response, error };
244315
}
245316

246-
const okResponse = responses[0] as Response;
247-
const headers = prepareResponseHeaders(okResponse);
248-
const body = await prepareResponseBody(responses);
249-
250-
const finalResponse = new Response(body, {
251-
headers: headers,
252-
status: okResponse.status,
253-
statusText: okResponse.statusText,
254-
});
317+
const okResponse = successfulResponses[0] as Response;
318+
const finalResponse = await this.formFinalResponse(
319+
okResponse,
320+
successfulResponses.slice(1),
321+
failedResponses
322+
);
255323

256324
this.clearOperation(operationID);
257325

@@ -265,7 +333,8 @@ export class SplitPdfHook
265333
* @param operationID - The ID of the operation to clear.
266334
*/
267335
clearOperation(operationID: string) {
268-
delete this.partitionResponses[operationID];
336+
delete this.partitionSuccessfulResponses[operationID];
337+
delete this.partitionFailedResponses[operationID];
269338
delete this.partitionRequests[operationID];
270339
}
271340

@@ -276,15 +345,17 @@ export class SplitPdfHook
276345
* @returns A promise that resolves to a map holding the "success" and "failed"
277346
* response lists, or to an empty map if there are no requests for the given operation ID.
278347
*/
279-
async awaitAllRequests(operationID: string): Promise<Response[] | undefined> {
348+
async awaitAllRequests(operationID: string): Promise<Map<string, Response[]>> {
280349
const requests = this.partitionRequests[operationID];
350+
const responseMap = new Map<string, Response[]>();
281351

282352
if (!requests) {
283-
return;
353+
return responseMap;
284354
}
285-
286355
await requests;
287356

288-
return this.partitionResponses[operationID]?.filter((e) => e) ?? [];
357+
responseMap.set("success", this.partitionSuccessfulResponses[operationID]?.filter((e) => e) ?? []);
358+
responseMap.set("failed", this.partitionFailedResponses[operationID]?.filter((e) => e) ?? []);
359+
return responseMap
289360
}
290361
}

src/hooks/custom/common.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@ export const BASE_PROTOCOL = "https:";
1010

1111
export const PARTITION_FORM_FILES_KEY = "files";
1212
export const PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page";
13+
export const PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY = "split_pdf_allow_failed";
1314
export const PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number";
1415
export const PARTITION_FORM_SPLIT_PDF_CONCURRENCY_LEVEL =
1516
"split_pdf_concurrency_level";
1617

1718
export const DEFAULT_STARTING_PAGE_NUMBER = 1;
1819
export const DEFAULT_NUMBER_OF_PARALLEL_REQUESTS = 8;
20+
export const DEFAULT_SPLIT_PDF_ALLOW_FAILED_KEY = false;
1921
export const MAX_NUMBER_OF_PARALLEL_REQUESTS = 15;
2022

2123
export const MIN_PAGES_PER_THREAD = 2;

src/hooks/custom/utils/form.ts

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import {
2-
DEFAULT_NUMBER_OF_PARALLEL_REQUESTS,
2+
DEFAULT_NUMBER_OF_PARALLEL_REQUESTS, DEFAULT_SPLIT_PDF_ALLOW_FAILED_KEY,
33
DEFAULT_STARTING_PAGE_NUMBER,
4-
MAX_NUMBER_OF_PARALLEL_REQUESTS,
4+
MAX_NUMBER_OF_PARALLEL_REQUESTS, PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
55
PARTITION_FORM_SPLIT_PDF_CONCURRENCY_LEVEL,
66
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
77
} from "../common.js";
@@ -41,6 +41,43 @@ function getIntegerParameter(
4141
return numberParameter;
4242
}
4343

/**
 * Retrieves a boolean parameter from the given form data.
 *
 * The entry is compared case-insensitively against the strings "true" and "false".
 * If the parameter is not found, is not a string entry (for example a file), or
 * holds any other text, the default value is returned.
 *
 * @param formData - The form data object.
 * @param parameterName - The name of the parameter to retrieve.
 * @param defaultValue - The default value to use if the parameter is not found or is not
 *                       a true/false string.
 * @returns The boolean value of the parameter.
 */
function getBooleanParameter(
  formData: FormData,
  parameterName: string,
  defaultValue: boolean
): boolean {
  const formDataParameter = formData.get(parameterName);

  if (formDataParameter === null) {
    return defaultValue;
  }

  // FormData entries can be files; only string entries can be parsed as booleans.
  if (typeof formDataParameter === "string") {
    const normalizedValue = formDataParameter.toLowerCase();
    if (normalizedValue === "true") {
      return true;
    }
    if (normalizedValue === "false") {
      return false;
    }
  }

  console.warn(
    `'${parameterName}' is not a valid boolean. Using default value '${defaultValue}'.`
  );
  return defaultValue;
}
80+
4481
/**
4582
* Gets the number of maximum requests that can be made when splitting PDF.
4683
* - The number of maximum requests is determined by the value of the request parameter
@@ -82,6 +119,48 @@ export function getSplitPdfConcurrencyLevel(formData: FormData): number {
82119
return splitPdfConcurrencyLevel;
83120
}
84121

/**
 * Gets the allowFailed parameter which decides whether the partial requests can fail or not
 * when using splitPdfPage parameter.
 * - The value is read from the request parameter `split_pdf_allow_failed`.
 * - If the parameter is not set or is not a true/false string, the default value
 *   (false, i.e. strict mode) is used.
 * - Logs which mode is active and which parameter changes it.
 *
 * @param formData - The form data object of the partition request.
 * @returns True if failed partial requests are allowed (no-strict mode), false otherwise.
 */
export function getSplitPdfAllowFailed(formData: FormData): boolean {
  const splitPdfAllowFailed = getBooleanParameter(
    formData,
    PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
    DEFAULT_SPLIT_PDF_ALLOW_FAILED_KEY
  );

  if (splitPdfAllowFailed) {
    console.info(
      "Running split PDF requests in parallel with no-strict mode -\n" +
        "the failed requests will not stop the process, and the resulting elements might miss\n" +
        "some pages in case of failure."
    );
  } else {
    console.info(
      "Running split PDF requests in parallel with strict mode -\n" +
        "the failed requests will stop the process, and the resulting elements will have all pages\n" +
        "or error out."
    );
  }

  // Point the user at the parameter that controls this mode; the original message
  // named the concurrency-level parameter by mistake.
  console.info(
    `Set ${PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY} parameter if you want to change that.`
  );
  return splitPdfAllowFailed;
}
163+
85164
/**
86165
* Retrieves the starting page number from the provided form data.
87166
* If the starting page number is not a valid integer or less than 1,

src/sdk/models/shared/partitionparameters.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,10 @@ export type PartitionParameters = {
136136
* Number of maximum concurrent requests made when splitting PDF. Ignored on backend.
137137
*/
138138
splitPdfConcurrencyLevel?: number | undefined;
139+
/**
140+
* Controls the split pdf strict mode. If set to true, partial requests with page batches may fail without cancelling the whole operation (the result may miss those pages). If set to false, the first failed partial request will cancel the process and error out.
141+
*/
142+
splitPdfAllowFailed?: boolean | undefined;
139143
/**
140144
* Should the pdf file be split at client. Ignored on backend.
141145
*/

0 commit comments

Comments
 (0)