Skip to content

Commit 25016ef

Browse files
committed
Update docker smoketest with all supported filetypes
1 parent d5a878f commit 25016ef

File tree

4 files changed

+71
-47
lines changed

4 files changed

+71
-47
lines changed

sample-docs/DA-1p.bmp

1.44 MB
Binary file not shown.

sample-docs/DA-1p.heic

94.2 KB
Binary file not shown.
1.85 MB
Binary file not shown.

scripts/smoketest.py

Lines changed: 71 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -49,72 +49,96 @@ def send_document(
4949

5050

5151
@pytest.mark.parametrize(
52-
"example_filename, content_type",
52+
("extension", "example_filename", "content_type"),
5353
[
54-
# Note(yuming): Please sort filetypes alphabetically according to
55-
# https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/auto.py#L14
56-
("stanley-cups.csv", "application/csv"),
57-
("fake.doc", "application/msword"),
58-
("fake.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
59-
("alert.eml", "message/rfc822"),
60-
("announcement.eml", "message/rfc822"),
61-
("fake-email-attachment.eml", "message/rfc822"),
62-
("fake-email-image-embedded.eml", "message/rfc822"),
63-
("fake-email.eml", "message/rfc822"),
64-
("family-day.eml", "message/rfc822"),
65-
("winter-sports.epub", "application/epub"),
66-
("fake-html.html", "text/html"),
67-
pytest.param(
68-
"layout-parser-paper-fast.jpg",
69-
"image/jpeg",
70-
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
71-
),
72-
("spring-weather.html.json", "application/json"),
73-
("README.md", "text/markdown"),
74-
("fake-email.msg", "application/x-ole-storage"),
75-
("fake.odt", "application/vnd.oasis.opendocument.text"),
76-
# Note(austin) The two inference calls will hang on mac with unsupported hardware error
77-
# Skip these with SKIP_INFERENCE_TESTS=true make docker-test
78-
pytest.param(
79-
"layout-parser-paper.pdf.gz",
80-
"application/gzip",
81-
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
82-
),
83-
pytest.param(
84-
"layout-parser-paper.pdf",
85-
"application/pdf",
86-
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
54+
(".bmp", "DA-1p.bmp", "image/bmp"),
55+
(".csv", "stanley-cups.csv", "application/csv"),
56+
(".doc", "fake.doc", "application/msword"),
57+
(
58+
".docx",
59+
"fake.docx",
60+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
8761
),
88-
("fake-power-point.ppt", "application/vnd.ms-powerpoint"),
62+
(".eml", "fake-email-attachment.eml", "message/rfc822"),
63+
(".epub", "winter-sports.epub", "application/epub"),
64+
(".heic", "DA-1p.heic", "image/heic"),
65+
(".html", "fake-html.html", "text/html"),
66+
(".jpeg", "layout-parser-paper-fast.jpg", "image/jpeg"),
67+
(".md", "README.md", "text/markdown"),
68+
(".msg", "fake-email.msg", "application/x-ole-storage"),
69+
(".odt", "fake.odt", "application/vnd.oasis.opendocument.text"),
70+
(".pdf", "layout-parser-paper.pdf", "application/pdf"),
71+
(".png", "english-and-korean.png", "image/png"),
72+
(".ppt", "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
8973
(
74+
".pptx",
9075
"fake-power-point.pptx",
9176
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
9277
),
93-
("README.rst", "text/prs.fallenstein.rst"),
94-
("fake-doc.rtf", "application/rtf"),
95-
("fake-text.txt", "text/plain"),
96-
("stanley-cups.tsv", "text/tab-separated-values"),
78+
(".rst", "README.rst", "text/prs.fallenstein.rst"),
79+
(".rtf", "fake-doc.rtf", "application/rtf"),
80+
(".tiff", "layout-parser-paper-fast.tiff", "image/tiff"),
81+
(".tsv", "stanley-cups.tsv", "text/tab-separated-values"),
82+
(".txt", "fake-text.txt", "text/plain"),
9783
(
84+
".xlsx",
9885
"stanley-cups.xlsx",
9986
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
10087
),
101-
("fake-xml.xml", "text/xml"),
88+
(".xml", "fake-xml.xml", "text/xml"),
89+
(".json", "spring-weather.html.json", "application/json"),
90+
(
91+
".gz",
92+
"layout-parser-paper.pdf.gz",
93+
"application/gzip",
94+
),
10295
],
10396
)
104-
def test_happy_path(example_filename: str, content_type: str):
97+
def test_happy_path_all_types(extension, example_filename: str, content_type: str):
10598
"""
10699
For the files in sample-docs, verify that we get a 200
107100
and some structured response
108101
"""
102+
# The auto strategy will run ocr on these files
103+
# This doesn't always work on our macs
104+
if skip_inference_tests and extension in [
105+
".bmp",
106+
".heic",
107+
".jpeg",
108+
".pdf",
109+
".png",
110+
".tiff",
111+
".gz", # Since we're using a gzipped pdf...
112+
]:
113+
pytest.skip("emulated hardware")
114+
109115
test_file = str(Path("sample-docs") / example_filename)
110-
print(f"sending {content_type}")
111-
json_response = send_document(filenames=[test_file], content_type=content_type)
112-
assert json_response.status_code == 200
113-
assert len(json_response.json()) > 0
114-
assert len("".join(elem["text"] for elem in json_response.json())) > 20
115116

117+
# Verify we can send with explicit content type
118+
response = send_document(filenames=[test_file], content_type=content_type)
119+
120+
if response.status_code != 200:
121+
assert False, response.text
122+
123+
assert len(response.json()) > 0
124+
assert len("".join(elem["text"] for elem in response.json())) > 20
125+
126+
# Verify we can infer the filetype on the server
127+
response = send_document(filenames=[test_file], content_type=None)
128+
129+
if response.status_code != 200:
130+
assert False, response.text
131+
132+
assert len(response.json()) > 0
133+
assert len("".join(elem["text"] for elem in response.json())) > 20
134+
135+
json_response = response
136+
137+
# Verify we can set output type to csv
116138
csv_response = send_document(
117-
filenames=[test_file], content_type=content_type, output_format="text/csv"
139+
filenames=[test_file],
140+
content_type=content_type,
141+
output_format="text/csv",
118142
)
119143
assert csv_response.status_code == 200
120144
assert len(csv_response.text) > 0

0 commit comments

Comments
 (0)