@@ -49,72 +49,96 @@ def send_document(
49
49
50
50
51
51
@pytest .mark .parametrize (
52
- " example_filename, content_type" ,
52
+ ( "extension" , " example_filename" , " content_type") ,
53
53
[
54
- # Note(yuming): Please sort filetypes alphabetically according to
55
- # https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/auto.py#L14
56
- ("stanley-cups.csv" , "application/csv" ),
57
- ("fake.doc" , "application/msword" ),
58
- ("fake.docx" , "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ),
59
- ("alert.eml" , "message/rfc822" ),
60
- ("announcement.eml" , "message/rfc822" ),
61
- ("fake-email-attachment.eml" , "message/rfc822" ),
62
- ("fake-email-image-embedded.eml" , "message/rfc822" ),
63
- ("fake-email.eml" , "message/rfc822" ),
64
- ("family-day.eml" , "message/rfc822" ),
65
- ("winter-sports.epub" , "application/epub" ),
66
- ("fake-html.html" , "text/html" ),
67
- pytest .param (
68
- "layout-parser-paper-fast.jpg" ,
69
- "image/jpeg" ,
70
- marks = pytest .mark .skipif (skip_inference_tests , reason = "emulated architecture" ),
71
- ),
72
- ("spring-weather.html.json" , "application/json" ),
73
- ("README.md" , "text/markdown" ),
74
- ("fake-email.msg" , "application/x-ole-storage" ),
75
- ("fake.odt" , "application/vnd.oasis.opendocument.text" ),
76
- # Note(austin) The two inference calls will hang on mac with unsupported hardware error
77
- # Skip these with SKIP_INFERENCE_TESTS=true make docker-test
78
- pytest .param (
79
- "layout-parser-paper.pdf.gz" ,
80
- "application/gzip" ,
81
- marks = pytest .mark .skipif (skip_inference_tests , reason = "emulated architecture" ),
82
- ),
83
- pytest .param (
84
- "layout-parser-paper.pdf" ,
85
- "application/pdf" ,
86
- marks = pytest .mark .skipif (skip_inference_tests , reason = "emulated architecture" ),
54
+ (".bmp" , "DA-1p.bmp" , "image/bmp" ),
55
+ (".csv" , "stanley-cups.csv" , "application/csv" ),
56
+ (".doc" , "fake.doc" , "application/msword" ),
57
+ (
58
+ ".docx" ,
59
+ "fake.docx" ,
60
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ,
87
61
),
88
- ("fake-power-point.ppt" , "application/vnd.ms-powerpoint" ),
62
+ (".eml" , "fake-email-attachment.eml" , "message/rfc822" ),
63
+ (".epub" , "winter-sports.epub" , "application/epub" ),
64
+ (".heic" , "DA-1p.heic" , "image/heic" ),
65
+ (".html" , "fake-html.html" , "text/html" ),
66
+ (".jpeg" , "layout-parser-paper-fast.jpg" , "image/jpeg" ),
67
+ (".md" , "README.md" , "text/markdown" ),
68
+ (".msg" , "fake-email.msg" , "application/x-ole-storage" ),
69
+ (".odt" , "fake.odt" , "application/vnd.oasis.opendocument.text" ),
70
+ (".pdf" , "layout-parser-paper.pdf" , "application/pdf" ),
71
+ (".png" , "english-and-korean.png" , "image/png" ),
72
+ (".ppt" , "fake-power-point.ppt" , "application/vnd.ms-powerpoint" ),
89
73
(
74
+ ".pptx" ,
90
75
"fake-power-point.pptx" ,
91
76
"application/vnd.openxmlformats-officedocument.presentationml.presentation" ,
92
77
),
93
- ("README.rst" , "text/prs.fallenstein.rst" ),
94
- ("fake-doc.rtf" , "application/rtf" ),
95
- ("fake-text.txt" , "text/plain" ),
96
- ("stanley-cups.tsv" , "text/tab-separated-values" ),
78
+ (".rst" , "README.rst" , "text/prs.fallenstein.rst" ),
79
+ (".rtf" , "fake-doc.rtf" , "application/rtf" ),
80
+ (".tiff" , "layout-parser-paper-fast.tiff" , "image/tiff" ),
81
+ (".tsv" , "stanley-cups.tsv" , "text/tab-separated-values" ),
82
+ (".txt" , "fake-text.txt" , "text/plain" ),
97
83
(
84
+ ".xlsx" ,
98
85
"stanley-cups.xlsx" ,
99
86
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ,
100
87
),
101
- ("fake-xml.xml" , "text/xml" ),
88
+ (".xml" , "fake-xml.xml" , "text/xml" ),
89
+ (".json" , "spring-weather.html.json" , "application/json" ),
90
+ (
91
+ ".gz" ,
92
+ "layout-parser-paper.pdf.gz" ,
93
+ "application/gzip" ,
94
+ ),
102
95
],
103
96
)
104
- def test_happy_path ( example_filename : str , content_type : str ):
97
+ def test_happy_path_all_types ( extension , example_filename : str , content_type : str ):
105
98
"""
106
99
For the files in sample-docs, verify that we get a 200
107
100
and some structured response
108
101
"""
102
+ # The auto strategy will run ocr on these files
103
+ # This doesn't always work on our macs
104
+ if skip_inference_tests and extension in [
105
+ ".bmp" ,
106
+ ".heic" ,
107
+ ".jpeg" ,
108
+ ".pdf" ,
109
+ ".png" ,
110
+ ".tiff" ,
111
+ ".gz" , # Since we're using a gzipped pdf...
112
+ ]:
113
+ pytest .skip ("emulated hardware" )
114
+
109
115
test_file = str (Path ("sample-docs" ) / example_filename )
110
- print (f"sending { content_type } " )
111
- json_response = send_document (filenames = [test_file ], content_type = content_type )
112
- assert json_response .status_code == 200
113
- assert len (json_response .json ()) > 0
114
- assert len ("" .join (elem ["text" ] for elem in json_response .json ())) > 20
115
116
117
+ # Verify we can send with explicit content type
118
+ response = send_document (filenames = [test_file ], content_type = content_type )
119
+
120
+ if response .status_code != 200 :
121
+ assert False , response .text
122
+
123
+ assert len (response .json ()) > 0
124
+ assert len ("" .join (elem ["text" ] for elem in response .json ())) > 20
125
+
126
+ # Verify we can infer the filetype on the server
127
+ response = send_document (filenames = [test_file ], content_type = None )
128
+
129
+ if response .status_code != 200 :
130
+ assert False , response .text
131
+
132
+ assert len (response .json ()) > 0
133
+ assert len ("" .join (elem ["text" ] for elem in response .json ())) > 20
134
+
135
+ json_response = response
136
+
137
+ # Verify we can set output type to csv
116
138
csv_response = send_document (
117
- filenames = [test_file ], content_type = content_type , output_format = "text/csv"
139
+ filenames = [test_file ],
140
+ content_type = content_type ,
141
+ output_format = "text/csv" ,
118
142
)
119
143
assert csv_response .status_code == 200
120
144
assert len (csv_response .text ) > 0
0 commit comments