Skip to content

fix/Fix MS Office filetype errors and harden docker smoketest #436

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.0.72

* Fix certain filetypes failing mimetype lookup in the new base image

## 0.0.71

* replace rockylinux with chainguard/wolfi as a base image for `amd64`
Expand Down
2 changes: 1 addition & 1 deletion prepline_general/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
app = FastAPI(
title="Unstructured Pipeline API",
summary="Partition documents with the Unstructured library",
version="0.0.71",
version="0.0.72",
docs_url="/general/docs",
openapi_url="/general/openapi.json",
servers=[
Expand Down
107 changes: 107 additions & 0 deletions prepline_general/api/filetypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import mimetypes
import os
from fastapi import UploadFile, HTTPException
from typing import Optional

# MIME types accepted when the deployment does not configure its own list. The repeated
# entries (text/html, image/png) and the final empty entry are kept deliberately so that the
# joined string stays byte-for-byte identical to the value this module has always exported
# (the empty entry reproduces the trailing comma).
_DEFAULT_MIMETYPE_ENTRIES = (
    "application/pdf",
    "application/msword",
    "image/jpeg",
    "image/png",
    "text/markdown",
    "text/x-markdown",
    "text/html",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/json",
    "application/vnd.ms-powerpoint",
    "text/html",
    "message/rfc822",
    "text/plain",
    "image/png",
    "application/epub",
    "application/epub+zip",
    "application/rtf",
    "text/rtf",
    "application/vnd.oasis.opendocument.text",
    "text/csv",
    "text/x-csv",
    "application/csv",
    "application/x-csv",
    "text/comma-separated-values",
    "text/x-comma-separated-values",
    "application/xml",
    "text/xml",
    "text/x-rst",
    "text/prs.fallenstein.rst",
    "text/tsv",
    "text/tab-separated-values",
    "application/x-ole-storage",
    "application/vnd.ms-outlook",
    "application/yaml",
    "application/x-yaml",
    "text/x-yaml",
    "text/yaml",
    "image/bmp",
    "image/heic",
    "image/tiff",
    "text/org",
    "",
)

DEFAULT_MIMETYPES = ",".join(_DEFAULT_MIMETYPE_ENTRIES)

# Install the defaults when the variable is unset or set to an empty string.
if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES"):
    os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES


def _load_mimetypes() -> None:
    """Register every file extension the API expects with the stdlib `mimetypes` registry.

    Meant to run once at startup so that lookups for these extensions succeed even when the
    registry does not already know them.
    """
    # -- insertion order is the registration order --
    extension_to_mimetype = {
        ".bmp": "image/bmp",
        ".csv": "application/csv",
        ".doc": "application/msword",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".eml": "message/rfc822",
        ".epub": "application/epub",
        ".gz": "application/gzip",
        ".heic": "image/heic",
        ".html": "text/html",
        ".jpeg": "image/jpeg",
        ".jpg": "image/jpeg",
        ".json": "application/json",
        ".md": "text/markdown",
        ".msg": "application/x-ole-storage",
        ".odt": "application/vnd.oasis.opendocument.text",
        ".org": "text/org",
        ".pdf": "application/pdf",
        ".png": "image/png",
        ".ppt": "application/vnd.ms-powerpoint",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ".rst": "text/prs.fallenstein.rst",
        ".rtf": "application/rtf",
        ".tiff": "image/tiff",
        ".tsv": "text/tab-separated-values",
        ".txt": "text/plain",
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xml": "text/xml",
    }

    for file_extension, mime_type in extension_to_mimetype.items():
        mimetypes.add_type(mime_type, file_extension)


# -- populate the registry as soon as this module is imported --
_load_mimetypes()


def get_validated_mimetype(file: UploadFile) -> Optional[str]:
    """Return the MIME-type of `file`, rejecting types that are not allowed.

    The type is taken from `file.content_type`. When that is missing or is the generic
    "application/octet-stream", it is guessed from the filename extension using the mimetypes
    lib, with a hard-coded fallback for `.md` and `.msg`.

    When UNSTRUCTURED_ALLOWED_MIMETYPES is set, the resolved type is validated against that
    comma-separated list. Whitespace around entries is ignored, as are empty entries such as
    the one produced by a trailing comma.

    Args:
        file: The uploaded file. Only its `content_type` and `filename` are read.

    Returns:
        The resolved MIME-type. It can be None only when no type could be determined and
        UNSTRUCTURED_ALLOWED_MIMETYPES is not set.

    Raises:
        HTTPException: status 400 when the resolved type is not in the allowed list.
    """
    content_type = file.content_type
    filename = str(file.filename)  # -- "None" when file.filename is None --
    if not content_type or content_type == "application/octet-stream":
        content_type = mimetypes.guess_type(filename)[0]

        # -- fallback for extensions the mimetypes registry may be missing; compare on the
        # -- lower-cased name because the registry lookup above is case-insensitive too
        if not content_type:
            lowered_filename = filename.lower()
            if lowered_filename.endswith(".md"):
                content_type = "text/markdown"
            elif lowered_filename.endswith(".msg"):
                # -- same type `_load_mimetypes` registers for `.msg`; message/rfc822 is
                # -- the `.eml` type
                content_type = "application/x-ole-storage"

    allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
    if allowed_mimetypes_str is not None:
        # -- tolerate "a, b" style values and stray/trailing commas --
        allowed_mimetypes = {
            entry.strip() for entry in allowed_mimetypes_str.split(",") if entry.strip()
        }

        if content_type not in allowed_mimetypes:
            raise HTTPException(
                status_code=400,
                detail=(f"File type {content_type} is not supported."),
            )

    return content_type
68 changes: 3 additions & 65 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from starlette.types import Send

from prepline_general.api.models.form_params import GeneralFormParams
from prepline_general.api.filetypes import get_validated_mimetype
from unstructured.documents.elements import Element
from unstructured.partition.auto import partition
from unstructured.staging.base import (
Expand All @@ -59,37 +60,6 @@ def is_compatible_response_type(media_type: str, response_type: type) -> bool:

logger = logging.getLogger("unstructured_api")

# MIME types accepted when the deployment does not configure its own list. The repeated
# entries (text/html, image/png) and the final empty entry are kept deliberately so that the
# joined string stays byte-for-byte identical to the value this module has always exported
# (the empty entry reproduces the trailing comma).
_DEFAULT_MIMETYPE_ENTRIES = (
    "application/pdf",
    "application/msword",
    "image/jpeg",
    "image/png",
    "text/markdown",
    "text/x-markdown",
    "text/html",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/json",
    "application/vnd.ms-powerpoint",
    "text/html",
    "message/rfc822",
    "text/plain",
    "image/png",
    "application/epub",
    "application/epub+zip",
    "application/rtf",
    "text/rtf",
    "application/vnd.oasis.opendocument.text",
    "text/csv",
    "text/x-csv",
    "application/csv",
    "application/x-csv",
    "text/comma-separated-values",
    "text/x-comma-separated-values",
    "application/xml",
    "text/xml",
    "text/x-rst",
    "text/prs.fallenstein.rst",
    "text/tsv",
    "text/tab-separated-values",
    "application/x-ole-storage",
    "application/vnd.ms-outlook",
    "application/yaml",
    "application/x-yaml",
    "text/x-yaml",
    "text/yaml",
    "image/bmp",
    "image/heic",
    "image/tiff",
    "text/org",
    "",
)

DEFAULT_MIMETYPES = ",".join(_DEFAULT_MIMETYPE_ENTRIES)

# Install the defaults when the variable is unset or set to an empty string.
if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES"):
    os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES


def get_pdf_splits(pdf_pages: Sequence[PageObject], split_size: int = 1):
"""Given a pdf (PdfReader) with n pages, split it into pdfs each with split_size # of pages.
Expand Down Expand Up @@ -609,38 +579,6 @@ def _set_pdf_infer_table_structure(
return strategy in ("hi_res", "auto") and pdf_infer_table_structure


def get_validated_mimetype(file: UploadFile) -> Optional[str]:
    """Return the MIME-type of `file`, rejecting types that are not allowed.

    The type is taken from `file.content_type`. When that is missing or is the generic
    "application/octet-stream", it is guessed from the filename extension using the mimetypes
    lib, with a hard-coded fallback for `.md` and `.msg`.

    When UNSTRUCTURED_ALLOWED_MIMETYPES is set, the resolved type is validated against that
    comma-separated list. Whitespace around entries is ignored, as are empty entries such as
    the one produced by a trailing comma.

    Args:
        file: The uploaded file. Only its `content_type` and `filename` are read.

    Returns:
        The resolved MIME-type. It can be None only when no type could be determined and
        UNSTRUCTURED_ALLOWED_MIMETYPES is not set.

    Raises:
        HTTPException: status 400 when the resolved type is not in the allowed list.
    """
    content_type = file.content_type
    filename = str(file.filename)  # -- "None" when file.filename is None --
    if not content_type or content_type == "application/octet-stream":
        content_type = mimetypes.guess_type(filename)[0]

        # -- fallback for extensions the mimetypes registry may be missing; compare on the
        # -- lower-cased name because the registry lookup above is case-insensitive too
        if not content_type:
            lowered_filename = filename.lower()
            if lowered_filename.endswith(".md"):
                content_type = "text/markdown"
            elif lowered_filename.endswith(".msg"):
                # -- application/x-ole-storage is in the default allowed list and is the
                # -- type the smoketest sends for `.msg`; message/rfc822 is the `.eml` type
                content_type = "application/x-ole-storage"

    allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
    if allowed_mimetypes_str is not None:
        # -- tolerate "a, b" style values and stray/trailing commas --
        allowed_mimetypes = {
            entry.strip() for entry in allowed_mimetypes_str.split(",") if entry.strip()
        }

        if content_type not in allowed_mimetypes:
            raise HTTPException(
                status_code=400,
                detail=(f"File type {content_type} is not supported."),
            )

    return content_type


class MultipartMixedResponse(StreamingResponse):
CRLF = b"\r\n"

Expand Down Expand Up @@ -713,7 +651,7 @@ def return_content_type(filename: str):


@router.get("/general/v0/general", include_in_schema=False)
@router.get("/general/v0.0.71/general", include_in_schema=False)
@router.get("/general/v0.0.72/general", include_in_schema=False)
async def handle_invalid_get_request():
raise HTTPException(
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
Expand All @@ -728,7 +666,7 @@ async def handle_invalid_get_request():
description="Description",
operation_id="partition_parameters",
)
@router.post("/general/v0.0.71/general", include_in_schema=False)
@router.post("/general/v0.0.72/general", include_in_schema=False)
def general_partition(
request: Request,
# cannot use annotated type here because of a bug described here:
Expand Down
2 changes: 1 addition & 1 deletion preprocessing-pipeline-family.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name: general
version: 0.0.71
version: 0.0.72
Binary file added sample-docs/DA-1p.bmp
Binary file not shown.
Binary file added sample-docs/DA-1p.heic
Binary file not shown.
Binary file added sample-docs/layout-parser-paper-fast.tiff
Binary file not shown.
118 changes: 71 additions & 47 deletions scripts/smoketest.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,72 +49,96 @@ def send_document(


@pytest.mark.parametrize(
"example_filename, content_type",
("extension", "example_filename", "content_type"),
[
# Note(yuming): Please sort filetypes alphabetically according to
# https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/auto.py#L14
("stanley-cups.csv", "application/csv"),
("fake.doc", "application/msword"),
("fake.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
("alert.eml", "message/rfc822"),
("announcement.eml", "message/rfc822"),
("fake-email-attachment.eml", "message/rfc822"),
("fake-email-image-embedded.eml", "message/rfc822"),
("fake-email.eml", "message/rfc822"),
("family-day.eml", "message/rfc822"),
("winter-sports.epub", "application/epub"),
("fake-html.html", "text/html"),
pytest.param(
"layout-parser-paper-fast.jpg",
"image/jpeg",
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
),
("spring-weather.html.json", "application/json"),
("README.md", "text/markdown"),
("fake-email.msg", "application/x-ole-storage"),
("fake.odt", "application/vnd.oasis.opendocument.text"),
# Note(austin) The two inference calls will hang on mac with unsupported hardware error
# Skip these with SKIP_INFERENCE_TESTS=true make docker-test
pytest.param(
"layout-parser-paper.pdf.gz",
"application/gzip",
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
),
pytest.param(
"layout-parser-paper.pdf",
"application/pdf",
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
(".bmp", "DA-1p.bmp", "image/bmp"),
(".csv", "stanley-cups.csv", "application/csv"),
(".doc", "fake.doc", "application/msword"),
(
".docx",
"fake.docx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
),
("fake-power-point.ppt", "application/vnd.ms-powerpoint"),
(".eml", "fake-email-attachment.eml", "message/rfc822"),
(".epub", "winter-sports.epub", "application/epub"),
(".heic", "DA-1p.heic", "image/heic"),
(".html", "fake-html.html", "text/html"),
(".jpeg", "layout-parser-paper-fast.jpg", "image/jpeg"),
(".md", "README.md", "text/markdown"),
(".msg", "fake-email.msg", "application/x-ole-storage"),
(".odt", "fake.odt", "application/vnd.oasis.opendocument.text"),
(".pdf", "layout-parser-paper.pdf", "application/pdf"),
(".png", "english-and-korean.png", "image/png"),
(".ppt", "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
(
".pptx",
"fake-power-point.pptx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
),
("README.rst", "text/prs.fallenstein.rst"),
("fake-doc.rtf", "application/rtf"),
("fake-text.txt", "text/plain"),
("stanley-cups.tsv", "text/tab-separated-values"),
(".rst", "README.rst", "text/prs.fallenstein.rst"),
(".rtf", "fake-doc.rtf", "application/rtf"),
(".tiff", "layout-parser-paper-fast.tiff", "image/tiff"),
(".tsv", "stanley-cups.tsv", "text/tab-separated-values"),
(".txt", "fake-text.txt", "text/plain"),
(
".xlsx",
"stanley-cups.xlsx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
),
("fake-xml.xml", "text/xml"),
(".xml", "fake-xml.xml", "text/xml"),
(".json", "spring-weather.html.json", "application/json"),
(
".gz",
"layout-parser-paper.pdf.gz",
"application/gzip",
),
],
)
def test_happy_path(example_filename: str, content_type: str):
def test_happy_path_all_types(extension, example_filename: str, content_type: str):
"""
For the files in sample-docs, verify that we get a 200
and some structured response
"""
# The auto strategy will run ocr on these files
# This doesn't always work on our macs
if skip_inference_tests and extension in [
".bmp",
".heic",
".jpeg",
".pdf",
".png",
".tiff",
".gz", # Since we're using a gzipped pdf...
]:
pytest.skip("emulated hardware")

test_file = str(Path("sample-docs") / example_filename)
print(f"sending {content_type}")
json_response = send_document(filenames=[test_file], content_type=content_type)
assert json_response.status_code == 200
assert len(json_response.json()) > 0
assert len("".join(elem["text"] for elem in json_response.json())) > 20

# Verify we can send with explicit content type
response = send_document(filenames=[test_file], content_type=content_type)

if response.status_code != 200:
assert False, response.text

assert len(response.json()) > 0
assert len("".join(elem["text"] for elem in response.json())) > 20

# Verify we can infer the filetype on the server
response = send_document(filenames=[test_file], content_type=None)

if response.status_code != 200:
assert False, response.text

assert len(response.json()) > 0
assert len("".join(elem["text"] for elem in response.json())) > 20

json_response = response

# Verify we can set output type to csv
csv_response = send_document(
filenames=[test_file], content_type=content_type, output_format="text/csv"
filenames=[test_file],
content_type=content_type,
output_format="text/csv",
)
assert csv_response.status_code == 200
assert len(csv_response.text) > 0
Expand Down