Skip to content

Commit dbfb84c

Browse files
authored
fix: Allow split page logic to process files concurrently (#121)
### Summary Copy Unstructured-IO/unstructured-python-client#175 into JS/TS. ### Test Added an integration test that passes on CI. Local integration test: * Start the core product API in Docker, but with `-p 8000:5000` (change the corresponding line in the Makefile for `make docker-start-api`) * `make build` * `npx jest --verbose --detectOpenHandles --config jest.config.js test/integration --forceExit -t "SplitPDF async can be used to send multiple files concurrently"` You can also move the test to main and run `make build` and `npx jest test...` again; you will see the test fail there, but not on this branch.
1 parent 4b38b39 commit dbfb84c

File tree

4 files changed

+116
-4
lines changed

4 files changed

+116
-4
lines changed

package-lock.json

Lines changed: 32 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"devDependencies": {
2121
"@types/async": "^3.2.24",
2222
"@types/jest": "^29.5.12",
23+
"@types/uuid": "^10.0.0",
2324
"@typescript-eslint/eslint-plugin": "^7.7.1",
2425
"@typescript-eslint/parser": "^7.7.1",
2526
"eslint": "^8.57.0",
@@ -32,6 +33,7 @@
3233
},
3334
"dependencies": {
3435
"async": "^3.2.5",
35-
"pdf-lib": "^1.17.1"
36+
"pdf-lib": "^1.17.1",
37+
"uuid": "^10.0.0"
3638
}
3739
}

src/hooks/custom/SplitPdfHook.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import async from "async";
2+
import { v4 as uuidv4 } from 'uuid';
23

34
import {
45
AfterErrorContext,
@@ -96,7 +97,11 @@ export class SplitPdfHook
9697
hookCtx: BeforeRequestContext,
9798
request: Request
9899
): Promise<Request> {
99-
const { operationID } = hookCtx;
100+
101+
// setting the current operationID to be unique
102+
const operationID = "partition-" + uuidv4();
103+
hookCtx.operationID = operationID;
104+
100105
const requestClone = request.clone();
101106
const formData = await requestClone.formData();
102107
const splitPdfPage = stringToBoolean(

test/integration/SplitPdfHook.test.ts

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -392,4 +392,78 @@ describe("SplitPDF succeeds for large PDF with high concurrency", () => {
392392
expect(res.elements?.length).toBeGreaterThan(0);
393393
},
394394
300000);
395-
});
395+
});
396+
397+
398+
describe("SplitPDF async can be used to send multiple files concurrently", () => {
399+
const FAKE_API_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
400+
401+
it.each([
402+
`${localServer}/general/v0/general`,
403+
])("succeed", async (serverURL) => {
404+
const client = new UnstructuredClient({
405+
serverURL: serverURL,
406+
security: {
407+
apiKeyAuth: FAKE_API_KEY,
408+
},
409+
});
410+
411+
const file = {
412+
content: readFileSync("test/data/layout-parser-paper.pdf"),
413+
fileName: "test/data/layout-parser-paper.pdf"
414+
};
415+
416+
const RequestsParams = [
417+
{
418+
files: file,
419+
splitPdfPage: true,
420+
strategy: Strategy.Fast,
421+
splitPdfPageRange: [1, 3],
422+
languages: ["eng"],
423+
},
424+
{
425+
files: file,
426+
splitPdfPage: true,
427+
strategy: Strategy.Fast,
428+
splitPdfPageRange: [10, 12],
429+
languages: ["eng"],
430+
}
431+
];
432+
433+
// Process requests serially
434+
const serialElements: any[][] = [];
435+
for (const requestParams of RequestsParams) {
436+
const res: PartitionResponse = await client.general.partition({
437+
partitionParameters: {
438+
...requestParams
439+
},
440+
});
441+
expect(res.statusCode).toEqual(200);
442+
expect(res.elements?.length).toBeGreaterThan(0);
443+
if (res.elements) {
444+
serialElements.push(res.elements);
445+
}
446+
}
447+
448+
// Process requests concurrently
449+
const concurrentElements: any[][] = [];
450+
const concurrentResponses = await Promise.all(RequestsParams.map(req =>
451+
client.general.partition({
452+
partitionParameters: req
453+
})
454+
));
455+
456+
for (const res of concurrentResponses) {
457+
expect(res.statusCode).toEqual(200);
458+
expect(res.elements?.length).toBeGreaterThan(0);
459+
if (res.elements) {
460+
concurrentElements.push(res.elements);
461+
}
462+
}
463+
464+
const isEqual = JSON.stringify(serialElements) === JSON.stringify(concurrentElements);
465+
expect(isEqual).toBe(true);
466+
467+
},
468+
300000);
469+
});

0 commit comments

Comments
 (0)