|
| 1 | +import mimetypes |
| 2 | +import os |
| 3 | +from fastapi import UploadFile |
| 4 | +from typing import Optional |
| 5 | + |
# -- MIME-types accepted when UNSTRUCTURED_ALLOWED_MIMETYPES is not configured. The value is a
# -- single comma-separated string because that is the format read back from the environment
# -- variable. Every entry is followed by a comma, so the string ends with a trailing comma. --
DEFAULT_MIMETYPES = "".join(
    f"{mimetype},"
    for mimetype in (
        "application/pdf",
        "application/msword",
        "image/jpeg",
        "image/png",
        "text/markdown",
        "text/x-markdown",
        "text/html",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/json",
        "application/vnd.ms-powerpoint",
        "text/html",
        "message/rfc822",
        "text/plain",
        "image/png",
        "application/epub",
        "application/epub+zip",
        "application/rtf",
        "text/rtf",
        "application/vnd.oasis.opendocument.text",
        "text/csv",
        "text/x-csv",
        "application/csv",
        "application/x-csv",
        "text/comma-separated-values",
        "text/x-comma-separated-values",
        "application/xml",
        "text/xml",
        "text/x-rst",
        "text/prs.fallenstein.rst",
        "text/tsv",
        "text/tab-separated-values",
        "application/x-ole-storage",
        "application/vnd.ms-outlook",
        "application/yaml",
        "application/x-yaml",
        "text/x-yaml",
        "text/yaml",
        "image/bmp",
        "image/heic",
        "image/tiff",
        "text/org",
    )
)

# -- an unset or empty variable falls back to the defaults above --
if not os.getenv("UNSTRUCTURED_ALLOWED_MIMETYPES"):
    os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES
| 36 | + |
| 37 | + |
def _load_mimetypes() -> None:
    """Register every file extension this service expects with the `mimetypes` lib.

    Meant to run once at startup so `mimetypes.guess_type()` resolves these extensions to the
    MIME-types listed here.
    """
    mimetype_by_extension = {
        ".bmp": "image/bmp",
        ".csv": "application/csv",
        ".doc": "application/msword",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".eml": "message/rfc822",
        ".epub": "application/epub",
        ".gz": "application/gzip",
        ".heic": "image/heic",
        ".html": "text/html",
        ".jpeg": "image/jpeg",
        ".jpg": "image/jpeg",
        ".json": "application/json",
        ".md": "text/markdown",
        ".msg": "application/x-ole-storage",
        ".odt": "application/vnd.oasis.opendocument.text",
        ".org": "text/org",
        ".pdf": "application/pdf",
        ".png": "image/png",
        ".ppt": "application/vnd.ms-powerpoint",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ".rst": "text/prs.fallenstein.rst",
        ".rtf": "application/rtf",
        ".tiff": "image/tiff",
        ".tsv": "text/tab-separated-values",
        ".txt": "text/plain",
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xml": "text/xml",
    }

    # -- dicts iterate in insertion order, so registration happens in the order written above --
    for ext, mime in mimetype_by_extension.items():
        mimetypes.add_type(mime, ext)


_load_mimetypes()
| 76 | + |
| 77 | + |
def get_validated_mimetype(file: UploadFile) -> Optional[str]:
    """Return the MIME-type of `file`, validated against the allowed list.

    The MIME-type is taken from `file.content_type`. When that is missing or is the generic
    "application/octet-stream", it is guessed from the filename with the mimetypes lib, with a
    hardcoded fallback for the ".md" and ".msg" extensions.

    When the UNSTRUCTURED_ALLOWED_MIMETYPES environment variable is set, it is read as a
    comma-separated list and the resolved type must be one of its entries. Whitespace around
    entries and empty entries (e.g. from a trailing comma) are ignored.

    Args:
        file: The uploaded file. Only its `content_type` and `filename` attributes are read.

    Returns:
        The resolved MIME-type. This is None only when no type could be determined and the
        environment variable is not set.

    Raises:
        HTTPException: With status code 400 when the resolved type is not in the allowed list.
    """
    content_type = file.content_type
    filename = str(file.filename)  # -- "None" when file.filename is None --

    if not content_type or content_type == "application/octet-stream":
        content_type = mimetypes.guess_type(filename)[0]

        # -- some filetypes may be missing from the mimetypes lib, so hardcode them for now --
        if not content_type:
            if filename.endswith(".md"):
                content_type = "text/markdown"
            elif filename.endswith(".msg"):
                content_type = "message/rfc822"

    allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
    if allowed_mimetypes_str is None:
        return content_type

    # -- tolerate "a, b" style lists and trailing commas in the configured value --
    allowed_mimetypes = {
        entry.strip() for entry in allowed_mimetypes_str.split(",") if entry.strip()
    }

    if content_type not in allowed_mimetypes:
        # -- imported here because the module-level fastapi import only brings in UploadFile;
        # -- without it this raise fails with NameError instead of producing an HTTP 400 --
        from fastapi import HTTPException

        raise HTTPException(
            status_code=400,
            detail=(f"File type {content_type} is not supported."),
        )

    return content_type
0 commit comments