Skip to content

Commit 46c7551

Browse files
authored
fix/Fix python/js client sending unrecognized list params (Unstructured-IO#412)
# Changes

## FormData fix

FastAPI expects list params to look a certain way in form data. The Speakeasy clients use a different, more explicit format that isn't getting parsed by the API. Therefore, when client users send `skip_infer_table_types` or `extract_image_block_types`, the server just sets them to `None`.

The fix is to transform the form data params before FastAPI parses them into a list type. I tried adding middleware for this, but it turns out `Request._form` isn't loaded at this point. Instead, we can just monkeypatch the `Request` class to return the right thing when it's asked for.

I updated an old parallel mode unit test to more generally assert that all params received in the endpoint make it down to partition. By adding square brackets to the list params, we can see the test pass once the fix is applied.

# Testing

Run the server at port 8000 with `make run-web-app`. Load a pyenv that has the latest python client, and in `ipython`, run the following snippet. In the server log, verify that we see a warning about `extract_image_block_types` having an invalid value: `foo`.

```
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

s = UnstructuredClient(
    api_key_auth=None,
    server_url="http://localhost:8000",
)

filename = "/path/to/any/file"

with open(filename, "rb") as f:
    files=shared.Files(
        content=f.read(),
        file_name=filename,
    )

req = shared.PartitionParameters(
    files=files,
    # Other partition params
    strategy='fast',
    extract_image_block_types=["Foo", "Foo"],
)

try:
    resp = s.general.partition(req)
    print(resp.elements[0])
except SDKError as e:
    print(e)
```
1 parent 7d32cea commit 46c7551

File tree

6 files changed

+61
-12
lines changed

6 files changed

+61
-12
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.0.68
2+
3+
* Fix list params such as `extract_image_block_types` not working via the python/js clients
4+
15
## 0.0.67
26

37
* Allow for a different server port with the PORT variable

prepline_general/api/app.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from fastapi import FastAPI, Request, status, HTTPException
2+
from fastapi.datastructures import FormData
23
from fastapi.responses import JSONResponse
34
from fastapi.security import APIKeyHeader
45
import logging
@@ -12,7 +13,7 @@
1213
app = FastAPI(
1314
title="Unstructured Pipeline API",
1415
summary="Partition documents with the Unstructured library",
15-
version="0.0.67",
16+
version="0.0.68",
1617
docs_url="/general/docs",
1718
openapi_url="/general/openapi.json",
1819
servers=[
@@ -66,6 +67,47 @@ async def error_handler(request: Request, e: Exception):
6667
set_custom_openapi(app)
6768

6869

70+
# Note(austin) - When FastAPI parses our FormData params,
71+
# it builds lists out of duplicate keys, like so:
72+
# FormData([('key', 'value1'), ('key', 'value2')])
73+
#
74+
# The Speakeasy clients send a more explicit form:
75+
# FormData([('key[]', 'value1'), ('key[]', 'value2')])
76+
#
77+
# FastAPI doesn't understand these, so we need to transform them.
78+
# Can't do this in middleware before the data stream is read, nor in the endpoint
79+
# after the fields are parsed. Thus, we have to patch Request._get_form (the private method behind Request.form()) on startup.
80+
# Capture the original implementation before it is replaced below, so the
# wrapper can delegate to it instead of calling itself.
get_form = Request._get_form
81+
82+
83+
async def patched_get_form(
    self,
    *,
    max_files: int | float = 1000,
    max_fields: int | float = 1000,
) -> FormData:
    """
    Parse the form with the original get_form and normalize list-style keys.

    Keys that end in "[]" (e.g. "key[]") have that suffix removed, so repeated
    "key[]" entries come back as repeated "key" entries in the returned FormData.
    All other keys, and all values, are passed through unchanged and in order.

    max_files and max_fields mirror the signature of the method being replaced
    and are forwarded to it, so limits supplied by the caller are honored
    instead of silently falling back to the defaults.
    """
    form_params = await get_form(self, max_files=max_files, max_fields=max_fields)

    # Transform key[] into key; falsy keys are left exactly as received
    fixed_params = [
        (key.removesuffix("[]") if key else key, value)
        for key, value in form_params.multi_items()
    ]

    return FormData(fixed_params)
105+
106+
107+
# Replace the private method with our wrapper
108+
Request._get_form = patched_get_form # type: ignore[assignment]
109+
110+
69111
# Filter out /healthcheck noise
70112
class HealthCheckFilter(logging.Filter):
71113
def filter(self, record: logging.LogRecord) -> bool:

prepline_general/api/general.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -704,7 +704,7 @@ def return_content_type(filename: str):
704704

705705

706706
@router.get("/general/v0/general", include_in_schema=False)
707-
@router.get("/general/v0.0.67/general", include_in_schema=False)
707+
@router.get("/general/v0.0.68/general", include_in_schema=False)
708708
async def handle_invalid_get_request():
709709
raise HTTPException(
710710
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
@@ -719,7 +719,7 @@ async def handle_invalid_get_request():
719719
description="Description",
720720
operation_id="partition_parameters",
721721
)
722-
@router.post("/general/v0.0.67/general", include_in_schema=False)
722+
@router.post("/general/v0.0.68/general", include_in_schema=False)
723723
def general_partition(
724724
request: Request,
725725
# cannot use annotated type here because of a bug described here:

preprocessing-pipeline-family.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.67
2+
version: 0.0.68

scripts/docker-smoke-test.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@ await_server_ready 8000
7878
#######################
7979
# Smoke Tests
8080
#######################
81-
echo Running smoke tests
82-
PYTHONPATH=. SKIP_INFERENCE_TESTS=$SKIP_INFERENCE_TESTS pytest scripts/smoketest.py
81+
echo Running smoke tests with SKIP_INFERENCE_TESTS: "$SKIP_INFERENCE_TESTS"
82+
PYTHONPATH=. SKIP_INFERENCE_TESTS=$SKIP_INFERENCE_TESTS pytest -vv scripts/smoketest.py
8383

8484
#######################
8585
# Test parallel vs single mode

test_general/api/test_app.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -664,18 +664,21 @@ def test_parallel_mode_passes_params(monkeypatch):
664664
client = TestClient(app)
665665
test_file = Path("sample-docs") / "layout-parser-paper.pdf"
666666

667+
# For list params, send the formdata keys with brackets
668+
# This is how Speakeasy sends them
667669
response = client.post(
668670
MAIN_API_ROUTE,
669671
files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))],
670672
data={
671673
"encoding": "foo",
672674
"hi_res_model_name": "yolox",
673675
"include_page_breaks": "True",
674-
"languages": "foo",
676+
"languages": "eng",
675677
"pdf_infer_table_structure": "True",
676678
"strategy": "hi_res",
677679
"xml_keep_tags": "True",
678-
"skip_infer_table_types": "foo",
680+
"skip_infer_table_types[]": ["pdf"],
681+
"extract_image_block_types[]": ["Image", "Table"],
679682
"unique_element_ids": "True",
680683
"starting_page_number": 1,
681684
# -- chunking options --
@@ -699,13 +702,13 @@ def test_parallel_mode_passes_params(monkeypatch):
699702
encoding="foo",
700703
include_page_breaks=True,
701704
ocr_languages=None,
702-
languages=["foo"],
705+
languages=["eng"],
703706
pdf_infer_table_structure=True,
704707
strategy="hi_res",
705708
xml_keep_tags=True,
706-
skip_infer_table_types=["foo"],
707-
extract_image_block_types=None,
708-
extract_image_block_to_payload=False,
709+
skip_infer_table_types=["pdf"],
710+
extract_image_block_types=["Image", "Table"],
711+
extract_image_block_to_payload=True, # Set to true because block_types is non empty
709712
unique_element_ids=True,
710713
starting_page_number=1,
711714
# -- chunking options --

0 commit comments

Comments
 (0)