Skip to content

Commit f173b57

Browse files
Move Split PDF page functionality to hooks (#69)
<b>Only the `_hooks` directory and the `gen.yaml` file are relevant for review. The rest of the changes were generated by Speakeasy.</b>
1 parent 7d55f6e commit f173b57

File tree

18 files changed

+1478
-565
lines changed

18 files changed

+1478
-565
lines changed

.speakeasy/gen.lock

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,25 @@
11
lockVersion: 2.0.0
22
id: 8b5fa338-9106-4734-abf0-e30d67044a90
33
management:
4-
docChecksum: 33cfd4e27a32bf67fdb56996b6bd7a7a
4+
docChecksum: b35264eb5f2ce89c808012333367cf1c
55
docVersion: 0.0.1
6-
speakeasyVersion: 1.241.0
7-
generationVersion: 2.300.0
8-
releaseVersion: 0.23.1
9-
configChecksum: fabd3c02a49e5b9a6a10fa261a46c3c9
6+
speakeasyVersion: 1.267.1
7+
generationVersion: 2.312.1
8+
releaseVersion: 0.23.2
9+
configChecksum: c0ddfa44eb8fbd51d397d36253d1d68f
1010
repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
1111
repoSubDirectory: .
1212
installationURL: https://github.com/Unstructured-IO/unstructured-python-client.git
1313
published: true
1414
features:
1515
python:
16-
core: 4.6.3
16+
core: 4.6.5
1717
examples: 2.81.3
1818
globalSecurity: 2.83.5
1919
globalServerURLs: 2.82.2
2020
nameOverrides: 2.81.2
2121
responseFormat: 0.1.0
22-
retries: 2.82.1
22+
retries: 2.82.2
2323
serverIDs: 2.81.1
2424
unions: 2.82.6
2525
generatedFiles:

_test_unstructured_client/test__decorators.py

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import os
2-
import pypdf
32
import pytest
43
import requests
54
from deepdiff import DeepDiff
@@ -15,18 +14,18 @@
1514
@pytest.mark.parametrize(
1615
"filename, expected_ok",
1716
[
18-
("_sample_docs/list-item-example-1.pdf", True), # 1 page
17+
("_sample_docs/list-item-example-1.pdf", True), # 1 page
1918
("_sample_docs/layout-parser-paper-fast.pdf", True), # 2 pages
20-
("_sample_docs/layout-parser-paper.pdf", True), # 16 pages
19+
("_sample_docs/layout-parser-paper.pdf", True), # 16 pages
2120
("_sample_docs/fake.doc", True),
22-
("_sample_docs/fake.doc", False), # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
21+
(
22+
"_sample_docs/fake.doc",
23+
False,
24+
), # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
2325
],
2426
)
2527
def test_integration_split_pdf_has_same_output_as_non_split(
26-
call_threads: int,
27-
filename: str,
28-
expected_ok: bool,
29-
caplog
28+
call_threads: int, filename: str, expected_ok: bool, caplog
3029
):
3130
"""
3231
Tests that output that we get from the split-by-page pdf is the same as from non-split.
@@ -36,14 +35,13 @@ def test_integration_split_pdf_has_same_output_as_non_split(
3635
"""
3736
try:
3837
response = requests.get("http://localhost:8000/general/docs")
39-
assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"
38+
assert (
39+
response.status_code == 200
40+
), "The unstructured-api is not running on localhost:8000"
4041
except requests.exceptions.ConnectionError:
4142
assert False, "The unstructured-api is not running on localhost:8000"
4243

43-
client = UnstructuredClient(
44-
api_key_auth=FAKE_KEY,
45-
server_url="localhost:8000"
46-
)
44+
client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")
4745

4846
with open(filename, "rb") as f:
4947
files = shared.Files(
@@ -56,7 +54,7 @@ def test_integration_split_pdf_has_same_output_as_non_split(
5654

5755
req = shared.PartitionParameters(
5856
files=files,
59-
strategy='fast',
57+
strategy="fast",
6058
languages=["eng"],
6159
split_pdf_page=True,
6260
)
@@ -81,6 +79,43 @@ def test_integration_split_pdf_has_same_output_as_non_split(
8179
assert resp_split.status_code == resp_single.status_code
8280

8381
# Difference in the parent_id is expected, because parent_ids are assigned when element crosses page boundary
84-
diff = DeepDiff(t1=resp_split.elements, t2=resp_single.elements,
85-
exclude_regex_paths=r"root\[\d+\]\['metadata'\]\['parent_id'\]")
82+
diff = DeepDiff(
83+
t1=resp_split.elements,
84+
t2=resp_single.elements,
85+
exclude_regex_paths=[
86+
r"root\[\d+\]\['metadata'\]\['parent_id'\]",
87+
# TODO: (Marek Połom) - Remove page number pattern after page numbering parameter is added
88+
r"root\[\d+\]\['metadata'\]\['page_number'\]",
89+
],
90+
)
8691
assert len(diff) == 0
92+
93+
94+
def test_integration_split_pdf_for_file_with_no_name():
95+
"""
96+
Tests that the client raises a ValueError when the file_name is blank (a single space).
97+
"""
98+
try:
99+
response = requests.get("http://localhost:8000/general/docs")
100+
assert (
101+
response.status_code == 200
102+
), "The unstructured-api is not running on localhost:8000"
103+
except requests.exceptions.ConnectionError:
104+
assert False, "The unstructured-api is not running on localhost:8000"
105+
106+
client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")
107+
108+
with open("_sample_docs/layout-parser-paper-fast.pdf", "rb") as f:
109+
files = shared.Files(
110+
content=f.read(),
111+
file_name=" ",
112+
)
113+
114+
req = shared.PartitionParameters(
115+
files=files,
116+
strategy="fast",
117+
languages=["eng"],
118+
split_pdf_page=True,
119+
)
120+
121+
pytest.raises(ValueError, client.general.partition, req)

0 commit comments

Comments
 (0)