Skip to content

Commit 04c2406

Browse files
committed
fix: Improve retry mechanisms for split pdf logic
1 parent f8a3dbe commit 04c2406

File tree

3 files changed

+84
-11
lines changed

3 files changed

+84
-11
lines changed

src/hooks/custom/SplitPdfHook.ts

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import {
2929
PARTITION_FORM_FILES_KEY,
3030
PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
3131
} from "./common.js";
32+
import {retry, RetryConfig} from "../../lib/retries";
3233

3334
/**
3435
* Represents a hook for splitting and sending PDF files as per page requests.
@@ -178,9 +179,11 @@ export class SplitPdfHook
178179
file.name,
179180
firstPageNumber
180181
);
182+
const timeoutInMs = 60 * 10 * 1000;
181183
const req = new Request(requestClone, {
182184
headers,
183185
body,
186+
signal: AbortSignal.timeout(timeoutInMs)
184187
});
185188
requests.push(req);
186189
setIndex+=1;
@@ -190,11 +193,19 @@ export class SplitPdfHook
190193

191194
const allowFailed = this.allowFailed;
192195

196+
const retryConfig = { strategy: "backoff" } as RetryConfig;
197+
const retryCodes = ["502", "503", "504"];
198+
193199
this.partitionRequests[operationID] = async.parallelLimit(
194-
requests.slice(0, -1).map((req, pageIndex) => async () => {
200+
requests.map((req, pageIndex) => async () => {
195201
const pageNumber = pageIndex + startingPageNumber;
196202
try {
197-
const response = await this.client!.request(req);
203+
const response = await retry(
204+
async () => {
205+
return await this.client!.request(req.clone());
206+
},
207+
{ config: retryConfig, statusCodes: retryCodes }
208+
);
198209
if (response.status === 200) {
199210
(this.partitionSuccessfulResponses[operationID] as Response[])[pageIndex] =
200211
response.clone();
@@ -206,7 +217,7 @@ export class SplitPdfHook
206217
}
207218
}
208219
} catch (e) {
209-
console.error(`Failed to send request for page ${pageNumber}.`);
220+
console.error(`Failed to send request for page ${pageNumber}.`, e);
210221
if (!allowFailed) {
211222
throw e;
212223
}
@@ -215,7 +226,9 @@ export class SplitPdfHook
215226
concurrencyLevel
216227
);
217228

218-
return requests.at(-1) as Request;
229+
const dummyRequest = new Request("https://no-op/");
230+
await this.client!.request(dummyRequest);
231+
return dummyRequest;
219232
}
220233

221234
/**
@@ -230,28 +243,39 @@ export class SplitPdfHook
230243
successfulResponses: Response[],
231244
failedResponses: Response[]
232245
): Promise<Response> {
246+
let realResponse = response.clone();
247+
const firstSuccessfulResponse = successfulResponses.at(0);
248+
const isFakeResponse = response.headers.has("fake-response");
249+
if (firstSuccessfulResponse !== undefined && isFakeResponse) {
250+
realResponse = firstSuccessfulResponse.clone();
251+
}
252+
233253
let responseBody, responseStatus, responseStatusText;
234254
const numFailedResponses = failedResponses?.length ?? 0;
235-
const headers = prepareResponseHeaders(response);
255+
const headers = prepareResponseHeaders(realResponse);
236256

237257
if (!this.allowFailed && failedResponses && failedResponses.length > 0) {
238258
const failedResponse = failedResponses[0]?.clone();
239259
if (failedResponse) {
240260
responseBody = await failedResponse.text();
241-
responseStatus = failedResponse.status;
242261
responseStatusText = failedResponse.statusText;
243262
} else {
244263
responseBody = JSON.stringify({"details:": "Unknown error"});
245-
responseStatus = 503
246264
responseStatusText = "Unknown error"
247265
}
266+
// Always report 500 on this failure path, whether the failed response's status was unknown or was 502, 503, or 504, to ensure we don't cause more retries
267+
responseStatus = 500;
248268
console.warn(
249269
`${numFailedResponses} requests failed. The partition operation is cancelled.`
250270
);
251271
} else {
252-
responseBody = await prepareResponseBody([...successfulResponses, response]);
253-
responseStatus = response.status
254-
responseStatusText = response.statusText
272+
if (isFakeResponse) {
273+
responseBody = await prepareResponseBody([...successfulResponses]);
274+
} else {
275+
responseBody = await prepareResponseBody([...successfulResponses, response]);
276+
}
277+
responseStatus = realResponse.status
278+
responseStatusText = realResponse.statusText
255279
if (numFailedResponses > 0) {
256280
console.warn(
257281
`${numFailedResponses} requests failed. The results might miss some pages.`

src/lib/http.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,16 @@ export class HTTPClient {
5252

5353
async request(request: Request): Promise<Response> {
5454
let req = request;
55+
if (req.url === "https://no-op/") {
56+
return new Response('{}', {
57+
headers: [
58+
["fake-response", "fake-response"]
59+
],
60+
status: 200,
61+
statusText: 'OK_NO_OP'
62+
});
63+
}
64+
5565
for (const hook of this.requestHooks) {
5666
const nextRequest = await hook(req);
5767
if (nextRequest) {

test/integration/SplitPdfHook.test.ts

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { UnstructuredClient } from "../../src";
44
import { PartitionResponse } from "../../src/sdk/models/operations";
55
import { PartitionParameters, Strategy } from "../../src/sdk/models/shared";
66

7-
const localServer = "http://localhost:8000"
7+
const localServer = "http://localhost:8000";
88

99
describe("SplitPdfHook integration tests check splitted file is same as not splitted", () => {
1010
const FAKE_API_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
@@ -354,3 +354,42 @@ describe("SplitPdfHook integration tests page range parameter", () => {
354354
300000
355355
);
356356
});
357+
358+
359+
describe("SplitPDF succeeds for large PDF with high concurrency", () => {
360+
const FAKE_API_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
361+
362+
it.each([
363+
`${localServer}/general/v0/general`,
364+
])("succeed", async (serverURL) => {
365+
const client = new UnstructuredClient({
366+
serverURL: serverURL,
367+
security: {
368+
apiKeyAuth: FAKE_API_KEY,
369+
},
370+
});
371+
372+
const file = {
373+
content: readFileSync("test/data/layout-parser-paper.pdf"),
374+
fileName: "test/data/layout-parser-paper.pdf"
375+
};
376+
377+
const requestParams: PartitionParameters = {
378+
files: file,
379+
splitPdfPage: true,
380+
strategy: Strategy.HiRes,
381+
splitPdfAllowFailed: false,
382+
splitPdfConcurrencyLevel: 15
383+
};
384+
385+
const res: PartitionResponse = await client.general.partition({
386+
partitionParameters: {
387+
...requestParams
388+
},
389+
});
390+
391+
expect(res.statusCode).toEqual(200);
392+
expect(res.elements?.length).toBeGreaterThan(0);
393+
},
394+
300000);
395+
});

0 commit comments

Comments
 (0)