1
1
import os
2
- import pypdf
3
2
import pytest
4
3
import requests
5
4
from deepdiff import DeepDiff
15
14
@pytest .mark .parametrize (
16
15
"filename, expected_ok" ,
17
16
[
18
- ("_sample_docs/list-item-example-1.pdf" , True ), # 1 page
17
+ ("_sample_docs/list-item-example-1.pdf" , True ), # 1 page
19
18
("_sample_docs/layout-parser-paper-fast.pdf" , True ), # 2 pages
20
- ("_sample_docs/layout-parser-paper.pdf" , True ), # 16 pages
19
+ ("_sample_docs/layout-parser-paper.pdf" , True ), # 16 pages
21
20
("_sample_docs/fake.doc" , True ),
22
- ("_sample_docs/fake.doc" , False ), # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
21
+ (
22
+ "_sample_docs/fake.doc" ,
23
+ False ,
24
+ ), # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
23
25
],
24
26
)
25
27
def test_integration_split_pdf_has_same_output_as_non_split (
26
- call_threads : int ,
27
- filename : str ,
28
- expected_ok : bool ,
29
- caplog
28
+ call_threads : int , filename : str , expected_ok : bool , caplog
30
29
):
31
30
"""
32
31
Tests that output that we get from the split-by-page pdf is the same as from non-split.
@@ -36,14 +35,13 @@ def test_integration_split_pdf_has_same_output_as_non_split(
36
35
"""
37
36
try :
38
37
response = requests .get ("http://localhost:8000/general/docs" )
39
- assert response .status_code == 200 , "The unstructured-api is not running on localhost:8000"
38
+ assert (
39
+ response .status_code == 200
40
+ ), "The unstructured-api is not running on localhost:8000"
40
41
except requests .exceptions .ConnectionError :
41
42
assert False , "The unstructured-api is not running on localhost:8000"
42
43
43
- client = UnstructuredClient (
44
- api_key_auth = FAKE_KEY ,
45
- server_url = "localhost:8000"
46
- )
44
+ client = UnstructuredClient (api_key_auth = FAKE_KEY , server_url = "localhost:8000" )
47
45
48
46
with open (filename , "rb" ) as f :
49
47
files = shared .Files (
@@ -56,7 +54,7 @@ def test_integration_split_pdf_has_same_output_as_non_split(
56
54
57
55
req = shared .PartitionParameters (
58
56
files = files ,
59
- strategy = ' fast' ,
57
+ strategy = " fast" ,
60
58
languages = ["eng" ],
61
59
split_pdf_page = True ,
62
60
)
@@ -81,6 +79,43 @@ def test_integration_split_pdf_has_same_output_as_non_split(
81
79
assert resp_split .status_code == resp_single .status_code
82
80
83
81
# Difference in the parent_id is expected, because parent_ids are assigned when element crosses page boundary
84
- diff = DeepDiff (t1 = resp_split .elements , t2 = resp_single .elements ,
85
- exclude_regex_paths = r"root\[\d+\]\['metadata'\]\['parent_id'\]" )
82
+ diff = DeepDiff (
83
+ t1 = resp_split .elements ,
84
+ t2 = resp_single .elements ,
85
+ exclude_regex_paths = [
86
+ r"root\[\d+\]\['metadata'\]\['parent_id'\]" ,
87
+ # TODO: (Marek Połom) - Remove page number pattern after page numbering parameter is added
88
+ r"root\[\d+\]\['metadata'\]\['page_number'\]" ,
89
+ ],
90
+ )
86
91
assert len (diff ) == 0
92
+
93
+
94
def test_integration_split_pdf_for_file_with_no_name():
    """
    Tests that the client raises an error when the file_name is blank.

    The test first checks that a local unstructured-api instance answers on
    localhost:8000 and fails with an explicit message when it does not. It then
    submits a sample PDF with a blank file name and ``split_pdf_page=True`` and
    expects ``client.general.partition`` to raise ``ValueError``.
    """
    api_unavailable = "The unstructured-api is not running on localhost:8000"

    # Only the request itself can raise ConnectionError, so keep the try body
    # minimal and report the failure explicitly (a bare `assert False` would be
    # stripped when Python runs with -O).
    try:
        response = requests.get("http://localhost:8000/general/docs")
    except requests.exceptions.ConnectionError:
        pytest.fail(api_unavailable)
    assert response.status_code == 200, api_unavailable

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    with open("_sample_docs/layout-parser-paper-fast.pdf", "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=" ",
        )

    req = shared.PartitionParameters(
        files=files,
        strategy="fast",
        languages=["eng"],
        split_pdf_page=True,
    )

    # Context-manager form keeps the call under test readable and scopes the
    # expected exception to exactly this one call.
    with pytest.raises(ValueError):
        client.general.partition(req)
0 commit comments