Skip to content

Commit 279fc39

Browse files
committed
fix: Improve retry mechanisms for split pdf logic
1 parent f8a3dbe commit 279fc39

File tree

2 files changed

+56
-5
lines changed

2 files changed

+56
-5
lines changed

src/hooks/custom/SplitPdfHook.ts

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import {
2929
PARTITION_FORM_FILES_KEY,
3030
PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
3131
} from "./common.js";
32+
import {retry, RetryConfig} from "../../lib/retries";
3233

3334
/**
3435
* Represents a hook for splitting PDF files and sending them as per-page requests.
@@ -178,9 +179,11 @@ export class SplitPdfHook
178179
file.name,
179180
firstPageNumber
180181
);
182+
const timeoutInMs = 60 * 10 * 1000;
181183
const req = new Request(requestClone, {
182184
headers,
183185
body,
186+
signal: AbortSignal.timeout(timeoutInMs)
184187
});
185188
requests.push(req);
186189
setIndex+=1;
@@ -190,11 +193,19 @@ export class SplitPdfHook
190193

191194
const allowFailed = this.allowFailed;
192195

196+
const retryConfig = { strategy: "backoff" } as RetryConfig;
197+
const retryCodes = ["502", "503", "504"];
198+
193199
this.partitionRequests[operationID] = async.parallelLimit(
194200
requests.slice(0, -1).map((req, pageIndex) => async () => {
195201
const pageNumber = pageIndex + startingPageNumber;
196202
try {
197-
const response = await this.client!.request(req);
203+
const response = await retry(
204+
async () => {
205+
return await this.client!.request(req.clone());
206+
},
207+
{ config: retryConfig, statusCodes: retryCodes }
208+
);
198209
if (response.status === 200) {
199210
(this.partitionSuccessfulResponses[operationID] as Response[])[pageIndex] =
200211
response.clone();
@@ -206,7 +217,7 @@ export class SplitPdfHook
206217
}
207218
}
208219
} catch (e) {
209-
console.error(`Failed to send request for page ${pageNumber}.`);
220+
console.error(`Failed to send request for page ${pageNumber}.`, e);
210221
if (!allowFailed) {
211222
throw e;
212223
}
@@ -238,13 +249,13 @@ export class SplitPdfHook
238249
const failedResponse = failedResponses[0]?.clone();
239250
if (failedResponse) {
240251
responseBody = await failedResponse.text();
241-
responseStatus = failedResponse.status;
242252
responseStatusText = failedResponse.statusText;
243253
} else {
244254
responseBody = JSON.stringify({"details:": "Unknown error"});
245-
responseStatus = 503
246255
responseStatusText = "Unknown error"
247256
}
257+
// Whether the failed status is unknown or one of the retryable codes (502, 503, 504), report 500 so that the caller does not trigger further retries.
258+
responseStatus = 500;
248259
console.warn(
249260
`${numFailedResponses} requests failed. The partition operation is cancelled.`
250261
);

test/integration/SplitPdfHook.test.ts

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ import { UnstructuredClient } from "../../src";
44
import { PartitionResponse } from "../../src/sdk/models/operations";
55
import { PartitionParameters, Strategy } from "../../src/sdk/models/shared";
66

7-
const localServer = "http://localhost:8000"
7+
const localServer = "http://localhost:8000";
8+
const SECONDS = 1000;
89

910
describe("SplitPdfHook integration tests check splitted file is same as not splitted", () => {
1011
const FAKE_API_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
@@ -354,3 +355,42 @@ describe("SplitPdfHook integration tests page range parameter", () => {
354355
300000
355356
);
356357
});
358+
359+
360+
describe("SplitPDF succeeds for large PDF with high concurrency", () => {
361+
const FAKE_API_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
362+
363+
it.each([
364+
`${localServer}/general/v0/general`,
365+
])("succeed", async (serverURL) => {
366+
const client = new UnstructuredClient({
367+
serverURL: serverURL,
368+
security: {
369+
apiKeyAuth: FAKE_API_KEY,
370+
},
371+
});
372+
373+
const file = {
374+
content: readFileSync("test/data/layout-parser-paper.pdf"),
375+
fileName: "test/data/layout-parser-paper.pdf"
376+
};
377+
378+
const requestParams: PartitionParameters = {
379+
files: file,
380+
splitPdfPage: true,
381+
strategy: Strategy.HiRes,
382+
splitPdfAllowFailed: false,
383+
splitPdfConcurrencyLevel: 15
384+
};
385+
386+
const res: PartitionResponse = await client.general.partition({
387+
partitionParameters: {
388+
...requestParams
389+
},
390+
});
391+
392+
expect(res.statusCode).toEqual(200);
393+
expect(res.elements?.length).toBeGreaterThan(0);
394+
},
395+
300000);
396+
});

0 commit comments

Comments
 (0)