Skip to content

Commit ef170c9

Browse files
committed
Add explicit lookup for all supported mimetypes
1 parent 25016ef commit ef170c9

File tree

2 files changed

+108
-63
lines changed

2 files changed

+108
-63
lines changed

prepline_general/api/filetypes.py

Lines changed: 107 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,107 @@
1+
import mimetypes
2+
import os
3+
from fastapi import UploadFile
4+
from typing import Optional
5+
6+
# -- Comma-separated MIME types used when UNSTRUCTURED_ALLOWED_MIMETYPES is unset or empty.
# -- Every entry is followed by a comma, so the string ends with one. Entry order and the
# -- repeated "text/html" / "image/png" entries are kept so the resulting value is unchanged.
DEFAULT_MIMETYPES = "".join(
    f"{mimetype},"
    for mimetype in (
        "application/pdf",
        "application/msword",
        "image/jpeg",
        "image/png",
        "text/markdown",
        "text/x-markdown",
        "text/html",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/json",
        "application/vnd.ms-powerpoint",
        "text/html",
        "message/rfc822",
        "text/plain",
        "image/png",
        "application/epub",
        "application/epub+zip",
        "application/rtf",
        "text/rtf",
        "application/vnd.oasis.opendocument.text",
        "text/csv",
        "text/x-csv",
        "application/csv",
        "application/x-csv",
        "text/comma-separated-values",
        "text/x-comma-separated-values",
        "application/xml",
        "text/xml",
        "text/x-rst",
        "text/prs.fallenstein.rst",
        "text/tsv",
        "text/tab-separated-values",
        "application/x-ole-storage",
        "application/vnd.ms-outlook",
        "application/yaml",
        "application/x-yaml",
        "text/x-yaml",
        "text/yaml",
        "image/bmp",
        "image/heic",
        "image/tiff",
        "text/org",
    )
)

# -- An unset or empty environment variable falls back to the defaults above --
if not os.getenv("UNSTRUCTURED_ALLOWED_MIMETYPES"):
    os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES
36+
37+
38+
def _load_mimetypes() -> None:
    """Register every expected file extension with the mimetypes lib.

    Call this on startup so that each extension listed below has an explicit entry in the
    mimetypes lib.
    """
    mimetype_by_extension = {
        ".bmp": "image/bmp",
        ".csv": "application/csv",
        ".doc": "application/msword",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".eml": "message/rfc822",
        ".epub": "application/epub",
        ".gz": "application/gzip",
        ".heic": "image/heic",
        ".html": "text/html",
        ".jpeg": "image/jpeg",
        ".jpg": "image/jpeg",
        ".json": "application/json",
        ".md": "text/markdown",
        ".msg": "application/x-ole-storage",
        ".odt": "application/vnd.oasis.opendocument.text",
        ".org": "text/org",
        ".pdf": "application/pdf",
        ".png": "image/png",
        ".ppt": "application/vnd.ms-powerpoint",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ".rst": "text/prs.fallenstein.rst",
        ".rtf": "application/rtf",
        ".tiff": "image/tiff",
        ".tsv": "text/tab-separated-values",
        ".txt": "text/plain",
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xml": "text/xml",
    }

    # -- add_type() takes the MIME type first and the extension second --
    for extension, mimetype in mimetype_by_extension.items():
        mimetypes.add_type(mimetype, extension)


# -- Run the registration once, at import time --
_load_mimetypes()
76+
77+
78+
def get_validated_mimetype(file: UploadFile) -> Optional[str]:
    """Return the MIME-type of `file`, rejecting types that are not allowed.

    The mimetype is taken from `file.content_type`. When that is missing or is the generic
    "application/octet-stream", it is guessed from `file.filename` using the mimetypes lib.

    If UNSTRUCTURED_ALLOWED_MIMETYPES is set, it is read as a comma-separated list. Whitespace
    around entries and empty entries are ignored. A mimetype that is not on the list, including
    one that could not be determined (`None`), is rejected.

    Raises:
        HTTPException: with status code 400 when the mimetype is not in the allowed list.
    """
    content_type = file.content_type
    filename = str(file.filename)  # -- "None" when file.filename is None --
    if not content_type or content_type == "application/octet-stream":
        content_type = mimetypes.guess_type(filename)[0]

        # -- Some filetypes may be missing from the mimetypes lib, so hardcode them for now.
        # -- NOTE(review): _load_mimetypes() also registers ".md" and ".msg" (the latter as
        # -- "application/x-ole-storage", not "message/rfc822"), so this fallback is probably
        # -- never reached -- confirm before removing or reconciling the two mappings.
        if not content_type:
            if filename.endswith(".md"):
                content_type = "text/markdown"
            elif filename.endswith(".msg"):
                content_type = "message/rfc822"

    allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
    if allowed_mimetypes_str is not None:
        # -- Tolerate "a/b, c/d" and trailing commas in a user-supplied value --
        allowed_mimetypes = {
            entry.strip() for entry in allowed_mimetypes_str.split(",") if entry.strip()
        }

        if content_type not in allowed_mimetypes:
            # -- Imported here because this module's top-level fastapi import only brings in
            # -- UploadFile; without it this branch fails with NameError instead of HTTP 400.
            from fastapi import HTTPException

            raise HTTPException(
                status_code=400,
                detail=(f"File type {content_type} is not supported."),
            )

    return content_type

prepline_general/api/general.py

Lines changed: 1 addition & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from starlette.types import Send
3535

3636
from prepline_general.api.models.form_params import GeneralFormParams
37+
from prepline_general.api.filetypes import get_validated_mimetype
3738
from unstructured.documents.elements import Element
3839
from unstructured.partition.auto import partition
3940
from unstructured.staging.base import (
@@ -59,37 +60,6 @@ def is_compatible_response_type(media_type: str, response_type: type) -> bool:
5960

6061
logger = logging.getLogger("unstructured_api")
6162

62-
DEFAULT_MIMETYPES = (
63-
"application/pdf,application/msword,image/jpeg,image/png,text/markdown,"
64-
"text/x-markdown,text/html,"
65-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document,"
66-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,"
67-
"application/vnd.ms-excel,application/vnd.openxmlformats-officedocument."
68-
"presentationml.presentation,"
69-
"application/json,"
70-
"application/vnd.ms-powerpoint,"
71-
"text/html,message/rfc822,text/plain,image/png,"
72-
"application/epub,application/epub+zip,"
73-
"application/rtf,text/rtf,"
74-
"application/vnd.oasis.opendocument.text,"
75-
"text/csv,text/x-csv,application/csv,application/x-csv,"
76-
"text/comma-separated-values,text/x-comma-separated-values,"
77-
"application/xml,text/xml,text/x-rst,text/prs.fallenstein.rst,"
78-
"text/tsv,text/tab-separated-values,"
79-
"application/x-ole-storage,application/vnd.ms-outlook,"
80-
"application/yaml,"
81-
"application/x-yaml,"
82-
"text/x-yaml,"
83-
"text/yaml,"
84-
"image/bmp,"
85-
"image/heic,"
86-
"image/tiff,"
87-
"text/org,"
88-
)
89-
90-
if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES", None):
91-
os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES
92-
9363

9464
def get_pdf_splits(pdf_pages: Sequence[PageObject], split_size: int = 1):
9565
"""Given a pdf (PdfReader) with n pages, split it into pdfs each with split_size # of pages.
@@ -609,38 +579,6 @@ def _set_pdf_infer_table_structure(
609579
return strategy in ("hi_res", "auto") and pdf_infer_table_structure
610580

611581

612-
def get_validated_mimetype(file: UploadFile) -> Optional[str]:
613-
"""The MIME-type of `file`.
614-
615-
The mimetype is computed based on `file.content_type`, or the mimetypes lib if that's too
616-
generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
617-
return HTTP 400 for an invalid type.
618-
"""
619-
content_type = file.content_type
620-
filename = str(file.filename) # -- "None" when file.filename is None --
621-
if not content_type or content_type == "application/octet-stream":
622-
content_type = mimetypes.guess_type(filename)[0]
623-
624-
# Some filetypes missing for this library, just hardcode them for now
625-
if not content_type:
626-
if filename.endswith(".md"):
627-
content_type = "text/markdown"
628-
elif filename.endswith(".msg"):
629-
content_type = "message/rfc822"
630-
631-
allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
632-
if allowed_mimetypes_str is not None:
633-
allowed_mimetypes = allowed_mimetypes_str.split(",")
634-
635-
if content_type not in allowed_mimetypes:
636-
raise HTTPException(
637-
status_code=400,
638-
detail=(f"File type {content_type} is not supported."),
639-
)
640-
641-
return content_type
642-
643-
644582
class MultipartMixedResponse(StreamingResponse):
645583
CRLF = b"\r\n"
646584

0 commit comments

Comments
 (0)